Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 81 additions & 5 deletions src/adapters/telegram/bot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { Bot as GrammyBot, InputFile } from "grammy";
import type { Bot, BotEvent, BotHandler, PlatformInfo } from "../../adapter.js";
import * as log from "../../log.js";
import { createTelegramAdapters } from "./context.js";
import { type SttConfig, transcribeAudio } from "./transcribe.js";

// ============================================================================
// Types
Expand Down Expand Up @@ -68,15 +69,20 @@ export class TelegramBot implements Bot {
private handler: BotHandler;
private botToken: string;
private workingDir: string;
private sttConfig: SttConfig | null;
private botUserId: string | null = null;
private botUsername: string | null = null;
private queues = new Map<string, ChannelQueue>();
private startupTime: number = 0;

constructor(handler: BotHandler, config: { token: string; workingDir: string }) {
constructor(
handler: BotHandler,
config: { token: string; workingDir: string; sttConfig?: SttConfig | null },
) {
this.handler = handler;
this.botToken = config.token;
this.workingDir = config.workingDir;
this.sttConfig = config.sttConfig ?? null;
this.client = new GrammyBot(config.token);
this.client.catch((err) => {
log.logWarning("Telegram error", err instanceof Error ? err.message : String(err));
Expand Down Expand Up @@ -232,6 +238,27 @@ export class TelegramBot implements Bot {
downloads.push(this.processTelegramFile(chatId, fileId, fileName));
}

// Handle voice messages (OGG Opus)
if (message.voice) {
const fileId = message.voice.file_id;
downloads.push(this.processTelegramFile(chatId, fileId, `voice_${message.message_id}.ogg`));
}

// Handle audio files
if (message.audio) {
const fileId = message.audio.file_id;
const fileName = message.audio.file_name ?? `audio_${message.message_id}.mp3`;
downloads.push(this.processTelegramFile(chatId, fileId, fileName));
}

// Handle video notes (circular video messages)
if (message.video_note) {
const fileId = message.video_note.file_id;
downloads.push(
this.processTelegramFile(chatId, fileId, `video_note_${message.message_id}.mp4`),
);
}

const attachments = await Promise.all(downloads);
return attachments.filter(
(attachment): attachment is { name: string; localPath: string } => attachment !== null,
Expand Down Expand Up @@ -285,6 +312,43 @@ export class TelegramBot implements Bot {
}
}

/**
 * Transcribe the voice/audio attachment of a message using the configured STT provider.
 *
 * Never throws: every failure mode is reported as a bracketed placeholder string
 * so the caller can still forward something meaningful as the message text.
 *
 * @param chatId - Chat identifier, used only for logging.
 * @param attachments - Downloaded attachments of the message being processed.
 * @returns The transcription text, or a bracketed placeholder describing why
 *   transcription was not possible.
 */
private async transcribeVoiceMessage(
  chatId: string,
  attachments: { name: string; localPath: string }[],
): Promise<string> {
  if (!this.sttConfig) {
    log.logWarning("Voice message received but STT is not configured (sttProvider/sttModel)");
    return "[Voice message received but transcription is not configured]";
  }

  // 1) Names synthesized by processAttachments for voice / audio / video-note files.
  const byPrefix = attachments.find(
    (a) =>
      a.name.startsWith("voice_") ||
      a.name.startsWith("audio_") ||
      a.name.startsWith("video_note_"),
  );
  // 2) Audio files keep Telegram's own `file_name` when one is provided (e.g. "song.mp3"),
  //    so the prefixes above do not match; fall back to a known audio extension.
  //    NOTE(review): assumes `name` preserves the requested file name — confirm in processTelegramFile.
  const byExtension = attachments.find((a) =>
    /\.(ogg|oga|mp3|wav|flac|m4a|aac|aiff|mp4)$/i.test(a.name),
  );
  // 3) Last resort: this method is only called for voice/audio/video-note messages,
  //    so a lone attachment is taken to be the audio even if its name is unrecognizable.
  const soleAttachment = attachments.length === 1 ? attachments[0] : undefined;

  const voiceAttachment = byPrefix ?? byExtension ?? soleAttachment;

  if (!voiceAttachment) {
    log.logWarning("Voice attachment not found after download");
    return "[Voice message could not be processed]";
  }

  // localPath is joined onto the bot's working directory to get the on-disk path.
  const fullPath = join(this.workingDir, voiceAttachment.localPath);

  try {
    const text = await transcribeAudio(fullPath, this.sttConfig);
    // Log only a short preview of the transcription.
    log.logInfo(`Voice transcription (${chatId}): ${text.substring(0, 100)}`);
    return text;
  } catch (err) {
    const errMsg = err instanceof Error ? err.message : String(err);
    log.logWarning("Voice transcription failed", errMsg);
    return `[Voice transcription failed: ${errMsg}]`;
  }
}

// ==========================================================================
// Private - Event Handlers
// ==========================================================================
Expand All @@ -304,7 +368,8 @@ export class TelegramBot implements Bot {
if (msg.from?.is_bot) return null;

const text = msg.text ?? msg.caption ?? "";
if (!text && !msg.document && !msg.photo) return null;
if (!text && !msg.document && !msg.photo && !msg.voice && !msg.audio && !msg.video_note)
return null;

const chatId = String(msg.chat.id);
const chatType = msg.chat.type;
Expand Down Expand Up @@ -400,6 +465,17 @@ export class TelegramBot implements Bot {
// Process attachments
const processedAttachments = await this.processAttachments(mc.chatId, mc.msg);

// Transcribe voice/audio if present
let finalText = cleanedText;
if (mc.msg.voice || mc.msg.audio || mc.msg.video_note) {
const transcription = await this.transcribeVoiceMessage(mc.chatId, processedAttachments);
if (transcription) {
finalText = finalText
? `${finalText}\n\n[Voice transcription]: ${transcription}`
: transcription;
}
}

const event: TelegramEvent = {
type: "message",
channel: mc.chatId,
Expand All @@ -408,7 +484,7 @@ export class TelegramBot implements Bot {
sessionKey: mc.sessionKey,
user: mc.userId,
userName: mc.userName,
text: cleanedText,
text: finalText,
attachments: processedAttachments,
};

Expand All @@ -418,13 +494,13 @@ export class TelegramBot implements Bot {
ts: mc.msgId,
user: mc.userId,
userName: mc.userName,
text: cleanedText,
text: finalText,
attachments: processedAttachments,
isBot: false,
});

// Handle bare "stop" text (backward compat)
if (cleanedText.toLowerCase() === "stop") {
if (finalText.toLowerCase() === "stop") {
if (this.handler.isRunning(mc.sessionKey)) {
await this.handler.handleStop(mc.sessionKey, mc.chatId, this);
} else {
Expand Down
1 change: 1 addition & 0 deletions src/adapters/telegram/index.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
export * from "./bot.js";
export * from "./context.js";
export * from "./transcribe.js";
101 changes: 101 additions & 0 deletions src/adapters/telegram/transcribe.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import { readFileSync } from "fs";

/**
 * Settings for speech-to-text transcription via a chat-completions API.
 */
export interface SttConfig {
/** Provider name; selects the API base URL (unrecognized names fall back to the OpenRouter URL). */
provider: string;
/** Model identifier, sent as `model` in the chat-completions request body. */
model: string;
/** API key, sent as a Bearer token in the `Authorization` header. */
apiKey: string;
}

/**
 * Base URLs of the OpenAI-compatible chat-completions APIs, keyed by STT provider name.
 *
 * Every provider that has an API-key mapping in PROVIDER_API_KEY_ENV needs an entry
 * here; otherwise its key would be sent to the fallback provider's endpoint.
 */
const PROVIDER_BASE_URLS: Record<string, string> = {
  openrouter: "https://openrouter.ai/api/v1",
  openai: "https://api.openai.com/v1",
  // Gemini's OpenAI-compatibility endpoint; accepts the Gemini API key as a Bearer token.
  google: "https://generativelanguage.googleapis.com/v1beta/openai",
};

/** Name of the environment variable that holds the API key for each known STT provider. */
const PROVIDER_API_KEY_ENV: Record<string, string> = {
  openrouter: "OPENROUTER_API_KEY",
  openai: "OPENAI_API_KEY",
  google: "GEMINI_API_KEY",
};

/**
 * Resolve the STT API key from environment variables based on provider name.
 *
 * Lookup order:
 *  1. the provider-specific variable (see PROVIDER_API_KEY_ENV), if set and non-empty;
 *  2. the generic `MOM_STT_API_KEY`.
 *
 * @param provider - STT provider name (e.g. "openrouter", "openai", "google").
 * @returns The API key, or `undefined` when no usable key is set.
 */
export function resolveSttApiKey(provider: string): string | undefined {
  // Own-property check so names such as "constructor" or "__proto__" are not
  // resolved through Object.prototype.
  const envVar = Object.prototype.hasOwnProperty.call(PROVIDER_API_KEY_ENV, provider)
    ? PROVIDER_API_KEY_ENV[provider]
    : undefined;
  const providerKey = envVar ? process.env[envVar] : undefined;

  // `||` on purpose: an empty variable (e.g. `OPENAI_API_KEY=`) counts as unset,
  // so the generic key still applies to known providers.
  return providerKey || process.env.MOM_STT_API_KEY || undefined;
}

/** Audio format identifier for each recognized (lower-case) file extension. */
const AUDIO_FORMAT_BY_EXTENSION: ReadonlyMap<string, string> = new Map([
  ["ogg", "ogg"],
  ["oga", "ogg"],
  ["mp3", "mp3"],
  ["wav", "wav"],
  ["flac", "flac"],
  ["m4a", "m4a"],
  ["aac", "aac"],
  ["aiff", "aiff"],
  ["mp4", "mp4"],
]);

/**
 * Get the audio format string from a filename extension.
 *
 * The extension is matched case-insensitively. A Map is used instead of a plain
 * object so extensions such as "constructor" or "__proto__" cannot resolve to
 * inherited Object.prototype members.
 *
 * @param filePath - Path or file name of the audio file.
 * @returns The format identifier, or "ogg" when the extension is missing or unrecognized.
 */
function getAudioFormat(filePath: string): string {
  const ext = filePath.toLowerCase().split(".").pop() ?? "";
  return AUDIO_FORMAT_BY_EXTENSION.get(ext) ?? "ogg";
}

/**
 * Transcribe an audio file using a chat completions API with multimodal audio input.
 *
 * The file is read from disk, base64-encoded and sent as an `input_audio` content
 * part together with a text instruction asking for a plain transcription.
 *
 * @param filePath - Path of the audio file on disk.
 * @param config - STT provider, model and API key.
 * @returns The trimmed, non-empty transcription text.
 * @throws Error when the file cannot be read, the request fails, the API responds
 *   with a non-2xx status, or the response contains no transcription text.
 */
export async function transcribeAudio(filePath: string, config: SttConfig): Promise<string> {
  const base64Data = readFileSync(filePath).toString("base64");
  const format = getAudioFormat(filePath);

  // Own-property check so a provider name such as "constructor" is not resolved
  // through Object.prototype. Unrecognized providers use the OpenRouter URL.
  const knownBaseUrl = Object.prototype.hasOwnProperty.call(PROVIDER_BASE_URLS, config.provider)
    ? PROVIDER_BASE_URLS[config.provider]
    : undefined;
  const baseUrl = knownBaseUrl || PROVIDER_BASE_URLS.openrouter;

  const body = {
    model: config.model,
    messages: [
      {
        role: "user",
        content: [
          {
            type: "input_audio",
            input_audio: { data: base64Data, format },
          },
          {
            type: "text",
            text: "Transcribe this audio faithfully. Output only the transcription text, nothing else.",
          },
        ],
      },
    ],
  };

  const response = await fetch(`${baseUrl}/chat/completions`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${config.apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(body),
  });

  if (!response.ok) {
    // Cap the echoed body: the error message is logged and may be forwarded into
    // the chat text, so an arbitrarily large error page must not be included whole.
    const maxErrorBodyLength = 500;
    const errBody = await response.text();
    const shownBody =
      errBody.length > maxErrorBodyLength ? `${errBody.slice(0, maxErrorBodyLength)}…` : errBody;
    throw new Error(`STT API error ${response.status}: ${shownBody}`);
  }

  const result = (await response.json()) as {
    choices?: { message?: { content?: unknown } }[];
  };
  const content = result.choices?.[0]?.message?.content;
  // Trim before the emptiness check so a whitespace-only reply is reported as an
  // empty transcription instead of being returned as "".
  const text = typeof content === "string" ? content.trim() : "";
  if (!text) {
    throw new Error("STT API returned empty transcription");
  }

  return text;
}
15 changes: 14 additions & 1 deletion src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ export interface AgentConfig {
sessionScope?: "thread" | "channel";
logFormat?: "console" | "json";
logLevel?: "trace" | "debug" | "info" | "warn" | "error";
sttProvider?: string;
sttModel?: string;
}

const DEFAULTS: AgentConfig = {
Expand Down Expand Up @@ -41,8 +43,19 @@ export function loadAgentConfig(workspaceDir: string): AgentConfig {
const sessionScope = fromFile.sessionScope ?? DEFAULTS.sessionScope;
const logFormat = fromFile.logFormat ?? DEFAULTS.logFormat;
const logLevel = fromFile.logLevel ?? DEFAULTS.logLevel;
const sttProvider = fromFile.sttProvider || process.env.MOM_STT_PROVIDER || undefined;
const sttModel = fromFile.sttModel || process.env.MOM_STT_MODEL || undefined;

return { provider, model, thinkingLevel, sessionScope, logFormat, logLevel };
return {
provider,
model,
thinkingLevel,
sessionScope,
logFormat,
logLevel,
sttProvider,
sttModel,
};
}

export function saveAgentConfig(workspaceDir: string, config: Partial<AgentConfig>): void {
Expand Down
13 changes: 13 additions & 0 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import { DiscordBot } from "./adapters/discord/index.js";
import { TelegramBot } from "./adapters/telegram/index.js";
import { SlackBot as SlackBotClass } from "./adapters/slack/index.js";
import { type AgentRunner, createRunner } from "./agent.js";
import { loadAgentConfig } from "./config.js";
import { resolveSttApiKey } from "./adapters/telegram/transcribe.js";
import { downloadChannel } from "./download.js";
import { createEventsWatcher } from "./events.js";
import * as log from "./log.js";
Expand Down Expand Up @@ -395,13 +397,24 @@ if (hasSlack) {
log.logInfo("Platform: Slack");
}
if (hasTelegram) {
  // Speech-to-text is optional: it is enabled only when provider, model and an
  // API key are all available; otherwise the bot is created with sttConfig = null.
  const agentCfg = loadAgentConfig(workingDir);
  const sttApiKey = agentCfg.sttProvider ? resolveSttApiKey(agentCfg.sttProvider) : undefined;
  const sttConfig =
    agentCfg.sttProvider && agentCfg.sttModel && sttApiKey
      ? { provider: agentCfg.sttProvider, model: agentCfg.sttModel, apiKey: sttApiKey }
      : null;

  // Surface half-finished STT setups at startup instead of disabling them silently.
  if (!sttConfig && (agentCfg.sttProvider || agentCfg.sttModel)) {
    const missing: string[] = [];
    if (!agentCfg.sttProvider) missing.push("sttProvider");
    if (!agentCfg.sttModel) missing.push("sttModel");
    if (agentCfg.sttProvider && !sttApiKey) missing.push("API key");
    log.logWarning("STT is partially configured and will be disabled", `missing: ${missing.join(", ")}`);
  }

  const telegramBot = new TelegramBot(handler, {
    token: MOM_TELEGRAM_BOT_TOKEN!,
    workingDir,
    sttConfig,
  });
  bots.push(telegramBot);
  botsByPlatform.telegram = telegramBot;
  log.logInfo("Platform: Telegram");
  if (sttConfig) {
    log.logInfo(`STT: ${sttConfig.provider}/${sttConfig.model}`);
  }
}
if (hasDiscord) {
const discordBot = new DiscordBot(handler, {
Expand Down
Loading
Loading