/**
 * POST /api/coach/transcribe
 *
 * Receives base64-encoded audio (webm/mp4/aac), forwards it to Deepgram and
 * returns the transcript as `{ text }`.
 * iOS sends raw AAC (ADTS), which is remuxed into an M4A container via ffmpeg
 * before it is uploaded.
 */
import { execFile, execSync } from "node:child_process";
import { writeFileSync, readFileSync, unlinkSync, existsSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
import { randomUUID } from "node:crypto";
import { promisify } from "node:util";

/** Promise-based `execFile`: no shell involved and the event loop stays free. */
const execFileAsync = promisify(execFile);

/** Upper bound for the decoded audio payload (25 MB, API limit). */
const MAX_AUDIO_BYTES = 25 * 1024 * 1024;

/** Abort a hanging ffmpeg process instead of blocking the request forever. */
const FFMPEG_TIMEOUT_MS = 30 * 1000;

// As of 2026-06-01: nova-3 supports all languages incl. ar/tr.
// nova-2-general dropped ar/tr support ("No such model/language/tier
// combination found") — hence nova-3 is used uniformly for all languages.
const DEEPGRAM_MODEL = "nova-3";

/** Language sent to Deepgram when the client sends none or an unsupported one. */
const DEFAULT_LANGUAGE = "de";

/** Language codes that are passed through to Deepgram unchanged. */
const SUPPORTED_LANGUAGES = new Set<string>([
  "de",
  "en",
  "tr",
  "ar",
  "fr",
  "es",
  "pt",
  "it",
]);

/** Request body as sent by the client; every field is untrusted input. */
interface TranscribeBody {
  audio?: unknown;
  mimeType?: unknown;
  language?: unknown;
}

/** Subset of the Deepgram response that this handler reads. */
interface DeepgramListenResponse {
  results?: {
    channels?: Array<{
      alternatives?: Array<{ transcript?: unknown }>;
    }>;
  };
}

/**
 * Remuxes raw AAC (ADTS) into an M4A container with `ffmpeg -c:a copy`.
 *
 * @param aac Raw AAC bytes.
 * @returns The bytes of the generated M4A file.
 * @throws If writing the temp file, running ffmpeg or reading the result fails.
 */
async function remuxAacToM4a(aac: Buffer): Promise<Buffer> {
  const id = randomUUID();
  const inPath = join(tmpdir(), `${id}.aac`);
  const outPath = join(tmpdir(), `${id}.m4a`);

  try {
    writeFileSync(inPath, aac);
    // Argument array instead of a shell string: no quoting issues with the
    // temp directory path and no dependency on POSIX shell redirection.
    await execFileAsync(
      "ffmpeg",
      [
        "-nostdin",
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-i",
        inPath,
        "-c:a",
        "copy",
        outPath,
      ],
      { timeout: FFMPEG_TIMEOUT_MS },
    );
    return readFileSync(outPath);
  } finally {
    // Best-effort cleanup: a failed unlink must not replace the real outcome.
    for (const path of [inPath, outPath]) {
      try {
        if (existsSync(path)) unlinkSync(path);
      } catch (cleanupError) {
        console.warn("[transcribe] temp file cleanup failed:", cleanupError);
      }
    }
  }
}

/** True when `err` carries a truthy `statusCode`, i.e. it was built via `createError`. */
function hasStatusCode(err: unknown): boolean {
  return (
    typeof err === "object" &&
    err !== null &&
    Boolean((err as { statusCode?: unknown }).statusCode)
  );
}

/** Extracts a non-empty `message` from an unknown error, or the generic fallback. */
function errorMessage(err: unknown): string {
  if (typeof err === "object" && err !== null) {
    const message = (err as { message?: unknown }).message;
    if (typeof message === "string" && message) return message;
  }
  return "Transcribe fehlgeschlagen";
}

/**
 * Transcribes the uploaded audio with Deepgram.
 *
 * Responds with `{ text }`; `text` is an empty string when Deepgram returns
 * no transcript.
 *
 * @throws 400 when `audio` is missing, not a string, decodes to nothing, or
 *   exceeds 25 MB.
 * @throws 503 when no Deepgram API key is configured.
 * @throws Deepgram's own status code when the upstream call is rejected.
 * @throws 500 on any other unexpected failure.
 */
export default defineEventHandler(async (event) => {
  await requireUser(event);

  // The body is untrusted: it may be missing entirely or carry wrong types.
  const body = ((await readBody(event)) ?? {}) as TranscribeBody;
  const { audio, language } = body;
  const mimeType =
    typeof body.mimeType === "string" ? body.mimeType : undefined;

  if (typeof audio !== "string" || audio.length === 0) {
    throw createError({ statusCode: 400, message: "audio fehlt" });
  }

  const config = useRuntimeConfig();
  if (!config.deepgramApiKey) {
    throw createError({
      statusCode: 503,
      message: "Deepgram nicht konfiguriert",
    });
  }

  // Base64 → Buffer (strip a leading "data:…;base64," prefix if present).
  const base64Data = audio.includes(",") ? (audio.split(",")[1] ?? "") : audio;
  const decoded = Buffer.from(base64Data, "base64");

  if (decoded.length === 0) {
    throw createError({ statusCode: 400, message: "audio fehlt" });
  }
  if (decoded.length > MAX_AUDIO_BYTES) {
    throw createError({
      statusCode: 400,
      message: "Audio zu groß (max 25 MB)",
    });
  }

  let buffer: Buffer = decoded;
  let ext = "webm";
  let blobType = "audio/webm";
  let converted = false;

  if (mimeType?.includes("aac")) {
    // iOS capacitor-voice-recorder delivers raw AAC (ADTS). Per the original
    // note Deepgram accepts that, but M4A is preferred for compatibility.
    try {
      buffer = await remuxAacToM4a(decoded);
      ext = "m4a";
      blobType = "audio/mp4";
      converted = true;
    } catch (e) {
      console.error("[transcribe] ffmpeg convert failed:", e);
      // The bytes are still raw AAC, so label them as such rather than as MP4.
      ext = "aac";
      blobType = "audio/aac";
    }
  } else if (mimeType?.includes("mp4") || mimeType?.includes("m4a")) {
    ext = "m4a";
    blobType = "audio/mp4";
  }

  console.log(
    "[transcribe] mimeType:",
    mimeType,
    "→ ext:",
    ext,
    "converted:",
    converted,
    "bytes:",
    buffer.length,
  );

  // Only allow-listed language codes reach the query string.
  const deepgramLang =
    typeof language === "string" && SUPPORTED_LANGUAGES.has(language)
      ? language
      : DEFAULT_LANGUAGE;
  const query = new URLSearchParams({
    language: deepgramLang,
    model: DEEPGRAM_MODEL,
  });
  const deepgramUrl = `https://api.deepgram.com/v1/listen?${query.toString()}`;
  console.log("[transcribe] language:", deepgramLang, `model: ${DEEPGRAM_MODEL}`);

  try {
    const response = await fetch(deepgramUrl, {
      method: "POST",
      headers: {
        Authorization: `Token ${config.deepgramApiKey}`,
        "Content-Type": blobType,
      },
      body: buffer,
    });

    // Read as text first so a non-JSON error page keeps its upstream status
    // code instead of failing inside the JSON parser.
    const rawBody = await response.text();

    if (!response.ok) {
      let detail = rawBody;
      try {
        detail = JSON.stringify(JSON.parse(rawBody));
      } catch {
        // Not JSON — pass the raw upstream body through unchanged.
      }
      console.error("[transcribe] Deepgram error:", detail);
      throw createError({
        statusCode: response.status,
        message: detail,
      });
    }

    const result = JSON.parse(rawBody) as DeepgramListenResponse;
    const transcript = result.results?.channels?.[0]?.alternatives?.[0]?.transcript;
    return { text: typeof transcript === "string" ? transcript : "" };
  } catch (err: unknown) {
    // Errors built via createError already carry the intended status code.
    if (hasStatusCode(err)) throw err;
    console.error("[transcribe] Unexpected error:", err);
    throw createError({
      statusCode: 500,
      message: errorMessage(err),
    });
  }
});