From 6fe7b5a37c9b1655390009b2814d58370baeaac0 Mon Sep 17 00:00:00 2001 From: Ying Xiang Date: Fri, 3 Apr 2026 21:27:28 +0800 Subject: [PATCH] feat(telegram): add voice message transcription support Receive voice notes, audio files, and video notes from Telegram and transcribe them to text via OpenRouter (or other providers) using a multimodal chat completions API. Original audio files are preserved as attachments. STT provider/model are configurable via settings.json (sttProvider/sttModel) or the MOM_STT_PROVIDER/MOM_STT_MODEL environment variables; the API key is read from the provider's environment variable (e.g. OPENROUTER_API_KEY), or from MOM_STT_API_KEY for providers without a dedicated variable. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/adapters/telegram/bot.ts | 86 ++++++++++++++++- src/adapters/telegram/index.ts | 1 + src/adapters/telegram/transcribe.ts | 101 ++++++++++++++++++++ src/config.ts | 15 ++- src/main.ts | 13 +++ test/telegram-bot.test.ts | 115 +++++++++++++++++++++++ test/transcribe.test.ts | 137 ++++++++++++++++++++++++++++ 7 files changed, 462 insertions(+), 6 deletions(-) create mode 100644 src/adapters/telegram/transcribe.ts create mode 100644 test/transcribe.test.ts diff --git a/src/adapters/telegram/bot.ts b/src/adapters/telegram/bot.ts index fe0d4fe..1012034 100644 --- a/src/adapters/telegram/bot.ts +++ b/src/adapters/telegram/bot.ts @@ -4,6 +4,7 @@ import { Bot as GrammyBot, InputFile } from "grammy"; import type { Bot, BotEvent, BotHandler, PlatformInfo } from "../../adapter.js"; import * as log from "../../log.js"; import { createTelegramAdapters } from "./context.js"; +import { type SttConfig, transcribeAudio } from "./transcribe.js"; // ============================================================================ // Types @@ -68,15 +69,20 @@ export class TelegramBot implements Bot { private handler: BotHandler; private botToken: string; private workingDir: string; + private sttConfig: SttConfig | null; private botUserId: string | null = null; private botUsername: string | null = null; private queues = new Map(); private startupTime: number = 0; - constructor(handler: BotHandler, config: { token: string; workingDir:
string }) { + constructor( + handler: BotHandler, + config: { token: string; workingDir: string; sttConfig?: SttConfig | null }, + ) { this.handler = handler; this.botToken = config.token; this.workingDir = config.workingDir; + this.sttConfig = config.sttConfig ?? null; this.client = new GrammyBot(config.token); this.client.catch((err) => { log.logWarning("Telegram error", err instanceof Error ? err.message : String(err)); @@ -232,6 +238,27 @@ export class TelegramBot implements Bot { downloads.push(this.processTelegramFile(chatId, fileId, fileName)); } + // Handle voice messages (OGG Opus) + if (message.voice) { + const fileId = message.voice.file_id; + downloads.push(this.processTelegramFile(chatId, fileId, `voice_${message.message_id}.ogg`)); + } + + // Handle audio files + if (message.audio) { + const fileId = message.audio.file_id; + const fileName = message.audio.file_name ?? `audio_${message.message_id}.mp3`; + downloads.push(this.processTelegramFile(chatId, fileId, fileName)); + } + + // Handle video notes (circular video messages) + if (message.video_note) { + const fileId = message.video_note.file_id; + downloads.push( + this.processTelegramFile(chatId, fileId, `video_note_${message.message_id}.mp4`), + ); + } + const attachments = await Promise.all(downloads); return attachments.filter( (attachment): attachment is { name: string; localPath: string } => attachment !== null, @@ -285,6 +312,43 @@ export class TelegramBot implements Bot { } } + /** + * Transcribe a voice/audio attachment using the configured STT provider. 
+ */ + private async transcribeVoiceMessage( + chatId: string, + attachments: { name: string; localPath: string }[], + ): Promise { + if (!this.sttConfig) { + log.logWarning("Voice message received but STT is not configured (sttProvider/sttModel)"); + return "[Voice message received but transcription is not configured]"; + } + + const voiceAttachment = attachments.find( + (a) => + a.name.startsWith("voice_") || + a.name.startsWith("audio_") || + a.name.startsWith("video_note_"), + ); + + if (!voiceAttachment) { + log.logWarning("Voice attachment not found after download"); + return "[Voice message could not be processed]"; + } + + const fullPath = join(this.workingDir, voiceAttachment.localPath); + + try { + const text = await transcribeAudio(fullPath, this.sttConfig); + log.logInfo(`Voice transcription (${chatId}): ${text.substring(0, 100)}`); + return text; + } catch (err) { + const errMsg = err instanceof Error ? err.message : String(err); + log.logWarning("Voice transcription failed", errMsg); + return `[Voice transcription failed: ${errMsg}]`; + } + } + // ========================================================================== // Private - Event Handlers // ========================================================================== @@ -304,7 +368,8 @@ export class TelegramBot implements Bot { if (msg.from?.is_bot) return null; const text = msg.text ?? msg.caption ?? 
""; - if (!text && !msg.document && !msg.photo) return null; + if (!text && !msg.document && !msg.photo && !msg.voice && !msg.audio && !msg.video_note) + return null; const chatId = String(msg.chat.id); const chatType = msg.chat.type; @@ -400,6 +465,17 @@ export class TelegramBot implements Bot { // Process attachments const processedAttachments = await this.processAttachments(mc.chatId, mc.msg); + // Transcribe voice/audio if present + let finalText = cleanedText; + if (mc.msg.voice || mc.msg.audio || mc.msg.video_note) { + const transcription = await this.transcribeVoiceMessage(mc.chatId, processedAttachments); + if (transcription) { + finalText = finalText + ? `${finalText}\n\n[Voice transcription]: ${transcription}` + : transcription; + } + } + const event: TelegramEvent = { type: "message", channel: mc.chatId, @@ -408,7 +484,7 @@ export class TelegramBot implements Bot { sessionKey: mc.sessionKey, user: mc.userId, userName: mc.userName, - text: cleanedText, + text: finalText, attachments: processedAttachments, }; @@ -418,13 +494,13 @@ export class TelegramBot implements Bot { ts: mc.msgId, user: mc.userId, userName: mc.userName, - text: cleanedText, + text: finalText, attachments: processedAttachments, isBot: false, }); // Handle bare "stop" text (backward compat) - if (cleanedText.toLowerCase() === "stop") { + if (finalText.toLowerCase() === "stop") { if (this.handler.isRunning(mc.sessionKey)) { await this.handler.handleStop(mc.sessionKey, mc.chatId, this); } else { diff --git a/src/adapters/telegram/index.ts b/src/adapters/telegram/index.ts index 691e69a..a14d26e 100644 --- a/src/adapters/telegram/index.ts +++ b/src/adapters/telegram/index.ts @@ -1,2 +1,3 @@ export * from "./bot.js"; export * from "./context.js"; +export * from "./transcribe.js"; diff --git a/src/adapters/telegram/transcribe.ts b/src/adapters/telegram/transcribe.ts new file mode 100644 index 0000000..32c6c5b --- /dev/null +++ b/src/adapters/telegram/transcribe.ts @@ -0,0 +1,101 @@ +import { 
readFileSync } from "fs"; + +export interface SttConfig { + provider: string; + model: string; + apiKey: string; +} + +const PROVIDER_BASE_URLS: Record = { + openrouter: "https://openrouter.ai/api/v1", + openai: "https://api.openai.com/v1", +}; + +const PROVIDER_API_KEY_ENV: Record = { + openrouter: "OPENROUTER_API_KEY", + openai: "OPENAI_API_KEY", + google: "GEMINI_API_KEY", +}; + +/** + * Resolve STT API key from environment variables based on provider name. + */ +export function resolveSttApiKey(provider: string): string | undefined { + const envVar = PROVIDER_API_KEY_ENV[provider]; + if (envVar) return process.env[envVar]; + // Fallback: try MOM_STT_API_KEY + return process.env.MOM_STT_API_KEY; +} + +/** + * Get the audio format string from a filename extension. + */ +function getAudioFormat(filePath: string): string { + const ext = filePath.toLowerCase().split(".").pop() || ""; + const FORMAT_MAP: Record = { + ogg: "ogg", + oga: "ogg", + mp3: "mp3", + wav: "wav", + flac: "flac", + m4a: "m4a", + aac: "aac", + aiff: "aiff", + mp4: "mp4", + }; + return FORMAT_MAP[ext] || "ogg"; +} + +/** + * Transcribe an audio file using a chat completions API with multimodal audio input. + */ +export async function transcribeAudio(filePath: string, config: SttConfig): Promise { + const fileBuffer = readFileSync(filePath); + const base64Data = fileBuffer.toString("base64"); + const format = getAudioFormat(filePath); + + const baseUrl = PROVIDER_BASE_URLS[config.provider] || PROVIDER_BASE_URLS.openrouter; + + const body = { + model: config.model, + messages: [ + { + role: "user", + content: [ + { + type: "input_audio", + input_audio: { data: base64Data, format }, + }, + { + type: "text", + text: "Transcribe this audio faithfully. 
Output only the transcription text, nothing else.", + }, + ], + }, + ], + }; + + const response = await fetch(`${baseUrl}/chat/completions`, { + method: "POST", + headers: { + Authorization: `Bearer ${config.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errBody = await response.text(); + throw new Error(`STT API error ${response.status}: ${errBody}`); + } + + const result = (await response.json()) as { + choices?: { message?: { content?: string } }[]; + }; + const text = result.choices?.[0]?.message?.content; + if (!text) { + throw new Error("STT API returned empty transcription"); + } + + return text.trim(); +} diff --git a/src/config.ts b/src/config.ts index 58ecd3e..f1d9a81 100644 --- a/src/config.ts +++ b/src/config.ts @@ -8,6 +8,8 @@ export interface AgentConfig { sessionScope?: "thread" | "channel"; logFormat?: "console" | "json"; logLevel?: "trace" | "debug" | "info" | "warn" | "error"; + sttProvider?: string; + sttModel?: string; } const DEFAULTS: AgentConfig = { @@ -41,8 +43,19 @@ export function loadAgentConfig(workspaceDir: string): AgentConfig { const sessionScope = fromFile.sessionScope ?? DEFAULTS.sessionScope; const logFormat = fromFile.logFormat ?? DEFAULTS.logFormat; const logLevel = fromFile.logLevel ?? 
DEFAULTS.logLevel; + const sttProvider = fromFile.sttProvider || process.env.MOM_STT_PROVIDER || undefined; + const sttModel = fromFile.sttModel || process.env.MOM_STT_MODEL || undefined; - return { provider, model, thinkingLevel, sessionScope, logFormat, logLevel }; + return { + provider, + model, + thinkingLevel, + sessionScope, + logFormat, + logLevel, + sttProvider, + sttModel, + }; } export function saveAgentConfig(workspaceDir: string, config: Partial): void { diff --git a/src/main.ts b/src/main.ts index 0085792..ec5691a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -9,6 +9,8 @@ import { DiscordBot } from "./adapters/discord/index.js"; import { TelegramBot } from "./adapters/telegram/index.js"; import { SlackBot as SlackBotClass } from "./adapters/slack/index.js"; import { type AgentRunner, createRunner } from "./agent.js"; +import { loadAgentConfig } from "./config.js"; +import { resolveSttApiKey } from "./adapters/telegram/transcribe.js"; import { downloadChannel } from "./download.js"; import { createEventsWatcher } from "./events.js"; import * as log from "./log.js"; @@ -395,13 +397,24 @@ if (hasSlack) { log.logInfo("Platform: Slack"); } if (hasTelegram) { + const agentCfg = loadAgentConfig(workingDir); + const sttApiKey = agentCfg.sttProvider ? resolveSttApiKey(agentCfg.sttProvider) : undefined; + const sttConfig = + agentCfg.sttProvider && agentCfg.sttModel && sttApiKey + ? 
{ provider: agentCfg.sttProvider, model: agentCfg.sttModel, apiKey: sttApiKey } + : null; + const telegramBot = new TelegramBot(handler, { token: MOM_TELEGRAM_BOT_TOKEN!, workingDir, + sttConfig, }); bots.push(telegramBot); botsByPlatform.telegram = telegramBot; log.logInfo("Platform: Telegram"); + if (sttConfig) { + log.logInfo(`STT: ${sttConfig.provider}/${sttConfig.model}`); + } } if (hasDiscord) { const discordBot = new DiscordBot(handler, { diff --git a/test/telegram-bot.test.ts b/test/telegram-bot.test.ts index 521fb91..d1d2aa5 100644 --- a/test/telegram-bot.test.ts +++ b/test/telegram-bot.test.ts @@ -112,6 +112,65 @@ describe("TelegramBot extractMessageContext", () => { }); }); +describe("TelegramBot voice message context", () => { + let workingDir: string; + + beforeEach(() => { + workingDir = join(tmpdir(), `mama-telegram-voice-${Date.now()}`); + mkdirSync(workingDir, { recursive: true }); + }); + + afterEach(() => { + if (existsSync(workingDir)) rmSync(workingDir, { recursive: true, force: true }); + }); + + test("accepts voice messages (not null)", () => { + const bot = new TelegramBot(makeHandler(), { token: "T", workingDir }); + (bot as any).startupTime = 0; + const extract = (bot as any).extractMessageContext.bind(bot); + const msg = makeMessage({ text: undefined, voice: { file_id: "v1", duration: 5 } }); + const result = extract(msg); + expect(result).not.toBeNull(); + expect(result.text).toBe(""); + }); + + test("accepts audio messages (not null)", () => { + const bot = new TelegramBot(makeHandler(), { token: "T", workingDir }); + (bot as any).startupTime = 0; + const extract = (bot as any).extractMessageContext.bind(bot); + const msg = makeMessage({ + text: undefined, + audio: { file_id: "a1", duration: 10, file_name: "song.mp3" }, + }); + const result = extract(msg); + expect(result).not.toBeNull(); + expect(result.text).toBe(""); + }); + + test("accepts video_note messages (not null)", () => { + const bot = new TelegramBot(makeHandler(), { token: 
"T", workingDir }); + (bot as any).startupTime = 0; + const extract = (bot as any).extractMessageContext.bind(bot); + const msg = makeMessage({ text: undefined, video_note: { file_id: "vn1", duration: 3 } }); + const result = extract(msg); + expect(result).not.toBeNull(); + }); + + test("voice message with caption preserves caption as text", () => { + const bot = new TelegramBot(makeHandler(), { token: "T", workingDir }); + (bot as any).startupTime = 0; + const extract = (bot as any).extractMessageContext.bind(bot); + const msg = makeMessage({ + text: undefined, + caption: "listen to this", + voice: { file_id: "v2", duration: 5 }, + }); + const result = extract(msg); + expect(result).not.toBeNull(); + expect(result.text).toBe("listen to this"); + }); +}); + describe("TelegramBot attachments", () => { let workingDir: string; const originalFetch = globalThis.fetch; @@ -149,6 +208,62 @@ describe("TelegramBot attachments", () => { ]); }); + test("processAttachments handles voice messages", async () => { + const bot = new TelegramBot(makeHandler(), { token: "TEST_TOKEN", workingDir }); + const processTelegramFile = vi + .fn() + .mockResolvedValueOnce({ name: "voice_42.ogg", localPath: "123/attachments/1_voice.ogg" }); + + (bot as any).processTelegramFile = processTelegramFile; + + const attachments = await bot.processAttachments("123", { + message_id: 42, + voice: { file_id: "voice-file-id", duration: 5 }, + }); + + expect(processTelegramFile).toHaveBeenCalledWith("123", "voice-file-id", "voice_42.ogg"); + expect(attachments).toEqual([ + { name: "voice_42.ogg", localPath: "123/attachments/1_voice.ogg" }, + ]); + }); + + test("processAttachments handles audio messages with file_name", async () => { + const bot = new TelegramBot(makeHandler(), { token: "TEST_TOKEN", workingDir }); + const processTelegramFile = vi + .fn() + .mockResolvedValueOnce({ name: "song.mp3", localPath: "123/attachments/1_song.mp3" }); + + (bot as any).processTelegramFile = processTelegramFile; + + 
const attachments = await bot.processAttachments("123", { + message_id: 42, + audio: { file_id: "audio-file-id", duration: 120, file_name: "song.mp3" }, + }); + + expect(processTelegramFile).toHaveBeenCalledWith("123", "audio-file-id", "song.mp3"); + expect(attachments).toEqual([{ name: "song.mp3", localPath: "123/attachments/1_song.mp3" }]); + }); + + test("processAttachments handles video_note messages", async () => { + const bot = new TelegramBot(makeHandler(), { token: "TEST_TOKEN", workingDir }); + const processTelegramFile = vi.fn().mockResolvedValueOnce({ + name: "video_note_42.mp4", + localPath: "123/attachments/1_video_note.mp4", + }); + + (bot as any).processTelegramFile = processTelegramFile; + + const attachments = await bot.processAttachments("123", { + message_id: 42, + video_note: { file_id: "vn-file-id", duration: 3 }, + }); + + expect(processTelegramFile).toHaveBeenCalledWith("123", "vn-file-id", "video_note_42.mp4"); + expect(attachments).toEqual([ + { name: "video_note_42.mp4", localPath: "123/attachments/1_video_note.mp4" }, + ]); + }); + test("processTelegramFile downloads via bot token and writes the attachment", async () => { const bot = new TelegramBot(makeHandler(), { token: "TEST_TOKEN", workingDir }); const getFile = vi.fn().mockResolvedValue({ file_path: "photos/file_123.jpg" }); diff --git a/test/transcribe.test.ts b/test/transcribe.test.ts new file mode 100644 index 0000000..327b887 --- /dev/null +++ b/test/transcribe.test.ts @@ -0,0 +1,137 @@ +import { existsSync, mkdirSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; +import { resolveSttApiKey, transcribeAudio } from "../src/adapters/telegram/transcribe.js"; + +describe("transcribeAudio", () => { + let workingDir: string; + let audioFile: string; + const originalFetch = globalThis.fetch; + + beforeEach(() => { + workingDir = join(tmpdir(), 
`mama-transcribe-${Date.now()}`); + mkdirSync(workingDir, { recursive: true }); + audioFile = join(workingDir, "voice.ogg"); + writeFileSync(audioFile, Buffer.from([0x4f, 0x67, 0x67, 0x53])); // fake OGG header + }); + + afterEach(() => { + globalThis.fetch = originalFetch; + if (existsSync(workingDir)) rmSync(workingDir, { recursive: true, force: true }); + }); + + test("sends correct request and returns transcription", async () => { + const fetchMock = vi.fn().mockResolvedValue({ + ok: true, + json: async () => ({ + choices: [{ message: { content: "Hello world" } }], + }), + }); + globalThis.fetch = fetchMock as typeof fetch; + + const result = await transcribeAudio(audioFile, { + provider: "openrouter", + model: "google/gemini-2.5-flash", + apiKey: "test-key", + }); + + expect(result).toBe("Hello world"); + expect(fetchMock).toHaveBeenCalledOnce(); + + const [url, opts] = fetchMock.mock.calls[0]; + expect(url).toBe("https://openrouter.ai/api/v1/chat/completions"); + expect(opts.method).toBe("POST"); + expect(opts.headers.Authorization).toBe("Bearer test-key"); + + const body = JSON.parse(opts.body); + expect(body.model).toBe("google/gemini-2.5-flash"); + expect(body.messages[0].content[0].type).toBe("input_audio"); + expect(body.messages[0].content[0].input_audio.format).toBe("ogg"); + expect(body.messages[0].content[1].type).toBe("text"); + }); + + test("throws on API error response", async () => { + const fetchMock = vi.fn().mockResolvedValue({ + ok: false, + status: 401, + text: async () => "Unauthorized", + }); + globalThis.fetch = fetchMock as typeof fetch; + + await expect( + transcribeAudio(audioFile, { + provider: "openrouter", + model: "google/gemini-2.5-flash", + apiKey: "bad-key", + }), + ).rejects.toThrow("STT API error 401: Unauthorized"); + }); + + test("throws on empty transcription", async () => { + const fetchMock = vi.fn().mockResolvedValue({ + ok: true, + json: async () => ({ choices: [{ message: { content: "" } }] }), + }); + globalThis.fetch 
= fetchMock as typeof fetch; + + await expect( + transcribeAudio(audioFile, { + provider: "openrouter", + model: "google/gemini-2.5-flash", + apiKey: "test-key", + }), + ).rejects.toThrow("STT API returned empty transcription"); + }); + + test("trims whitespace from transcription", async () => { + const fetchMock = vi.fn().mockResolvedValue({ + ok: true, + json: async () => ({ + choices: [{ message: { content: " hello world \n" } }], + }), + }); + globalThis.fetch = fetchMock as typeof fetch; + + const result = await transcribeAudio(audioFile, { + provider: "openrouter", + model: "google/gemini-2.5-flash", + apiKey: "test-key", + }); + expect(result).toBe("hello world"); + }); +}); + +describe("resolveSttApiKey", () => { + const originalEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + test("resolves OPENROUTER_API_KEY for openrouter provider", () => { + process.env.OPENROUTER_API_KEY = "or-key-123"; + expect(resolveSttApiKey("openrouter")).toBe("or-key-123"); + }); + + test("resolves OPENAI_API_KEY for openai provider", () => { + process.env.OPENAI_API_KEY = "sk-key-123"; + expect(resolveSttApiKey("openai")).toBe("sk-key-123"); + }); + + test("resolves GEMINI_API_KEY for google provider", () => { + process.env.GEMINI_API_KEY = "gem-key-123"; + expect(resolveSttApiKey("google")).toBe("gem-key-123"); + }); + + test("falls back to MOM_STT_API_KEY for unknown provider", () => { + process.env.MOM_STT_API_KEY = "custom-key"; + expect(resolveSttApiKey("custom-provider")).toBe("custom-key"); + }); + + test("returns undefined when no key is set", () => { + delete process.env.OPENROUTER_API_KEY; + delete process.env.MOM_STT_API_KEY; + expect(resolveSttApiKey("openrouter")).toBeUndefined(); + }); +});