rebreak-monorepo/backend/server/api/coach/transcribe.post.ts
chahinebrini 55e3cdfb26 fix(transcribe): pass language=ar/tr to nova-2-general so Lyra answers in correct language
Without an explicit language param, nova-2-general falls back to multilingual
auto-detect and often misdetects Arabic audio as English (phonetic transcript
'salam alaikum' instead of 'السلام عليكم'). detectLang() then sees only
Latin chars and answers in English.

Confirmed via Deepgram docs: nova-2-general accepts language=ar and language=tr
(only nova-3 rejects them with HTTP 400).
2026-05-31 01:37:11 +02:00

148 lines
4.5 KiB
TypeScript

/**
* POST /api/coach/transcribe
* Empfängt Audio (base64 webm/mp4/aac) → Deepgram → gibt Text zurück
* iOS sendet rohes AAC (ADTS) → wird via ffmpeg in M4A konvertiert
*/
import { execFile, execSync } from "node:child_process";
import { randomUUID } from "node:crypto";
import { existsSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
/** Upper bound on the decoded audio payload (25 MB, the upstream API limit). */
const MAX_AUDIO_BYTES = 25 * 1024 * 1024;

/** Kill a hanging ffmpeg remux instead of stalling the request indefinitely. */
const FFMPEG_TIMEOUT_MS = 15000;

/** Language codes forwarded to Deepgram; anything else falls back to the default. */
const SUPPORTED_LANGUAGES = new Set<string>([
  "de",
  "en",
  "tr",
  "ar",
  "fr",
  "es",
  "pt",
  "it",
]);

/** Language used when the client sends no (or an unsupported) language code. */
const DEFAULT_LANGUAGE = "de";

/** Languages that are routed to nova-2-general instead of nova-3. */
const GENERAL_MODEL_LANGUAGES = new Set<string>(["ar", "tr"]);

/** Shape of the JSON request body; every field is untrusted until validated. */
interface TranscribeRequestBody {
  audio?: unknown;
  mimeType?: unknown;
  language?: unknown;
}

/** The subset of the Deepgram response that this handler reads. */
interface DeepgramListenResponse {
  results?: {
    channels?: Array<{
      alternatives?: Array<{ transcript?: string }>;
    }>;
  };
}

/** True when `err` carries a truthy `statusCode`, i.e. it was raised via createError. */
function hasStatusCode(err: unknown): err is { statusCode: number } {
  return (
    typeof err === "object" &&
    err !== null &&
    Boolean((err as { statusCode?: unknown }).statusCode)
  );
}

/** Best-effort removal of a temp file; a cleanup failure must not fail the request. */
function removeTempFile(path: string): void {
  try {
    if (existsSync(path)) unlinkSync(path);
  } catch (cleanupError) {
    console.error("[transcribe] temp file cleanup failed:", cleanupError);
  }
}

/**
 * Remuxes raw AAC into an M4A container by copying the audio stream with ffmpeg.
 *
 * ffmpeg is started without a shell (argument array) and asynchronously, so
 * the event loop stays free while it runs and paths containing spaces are safe.
 *
 * @param rawAac - The raw AAC bytes received from the client.
 * @returns The M4A bytes, or `null` when the conversion failed (the failure is logged).
 */
async function remuxAacToM4a(rawAac: Buffer): Promise<Buffer | null> {
  const id = randomUUID();
  const inPath = join(tmpdir(), `${id}.aac`);
  const outPath = join(tmpdir(), `${id}.m4a`);
  try {
    writeFileSync(inPath, rawAac);
    await new Promise<void>((resolve, reject) => {
      execFile(
        "ffmpeg",
        [
          "-nostdin",
          "-loglevel",
          "error",
          "-y",
          "-i",
          inPath,
          "-c:a",
          "copy",
          outPath,
        ],
        { timeout: FFMPEG_TIMEOUT_MS },
        (error) => (error ? reject(error) : resolve()),
      );
    });
    return readFileSync(outPath);
  } catch (e) {
    console.error("[transcribe] ffmpeg convert failed:", e);
    return null;
  } finally {
    removeTempFile(inPath);
    removeTempFile(outPath);
  }
}

/**
 * POST /api/coach/transcribe
 *
 * Accepts base64-encoded audio (optionally as a data URL), forwards it to
 * Deepgram and returns the transcript.
 *
 * Request body: `{ audio: string; mimeType?: string; language?: string }`.
 *
 * @returns `{ text }` — the first alternative of the first channel, or `""`
 *   when the response contains no transcript.
 * @throws 400 when `audio` is missing, not a string, decodes to zero bytes,
 *   or exceeds 25 MB.
 * @throws 503 when no Deepgram API key is configured.
 * @throws 502 when Deepgram answers successfully but with a non-JSON body.
 * @throws The upstream status code when Deepgram rejects the request.
 * @throws 500 on any other unexpected failure.
 */
export default defineEventHandler(async (event) => {
  await requireUser(event);

  // A missing body must yield a clean 400 below rather than a TypeError.
  const body = ((await readBody(event)) ?? {}) as TranscribeRequestBody;
  const audio = body.audio;
  const mimeType =
    typeof body.mimeType === "string" ? body.mimeType : undefined;
  const language =
    typeof body.language === "string" ? body.language : undefined;

  if (typeof audio !== "string" || !audio) {
    throw createError({ statusCode: 400, message: "audio fehlt" });
  }

  const config = useRuntimeConfig();
  if (!config.deepgramApiKey) {
    throw createError({
      statusCode: 503,
      message: "Deepgram nicht konfiguriert",
    });
  }

  // Base64 → Buffer. Strip a data-URL prefix ("data:...;base64,") if present.
  const base64Data = audio.includes(",") ? (audio.split(",")[1] ?? "") : audio;
  let buffer = Buffer.from(base64Data, "base64");

  // Input that is not valid base64 decodes to nothing; reject it here
  // instead of sending an empty request upstream.
  if (buffer.length === 0) {
    throw createError({ statusCode: 400, message: "audio fehlt" });
  }
  if (buffer.length > MAX_AUDIO_BYTES) {
    throw createError({
      statusCode: 400,
      message: "Audio zu groß (max 25 MB)",
    });
  }

  // The iOS capacitor-voice-recorder delivers raw AAC (ADTS). Deepgram accepts
  // that, but it is remuxed to M4A for better compatibility.
  const isRawAac = Boolean(mimeType?.includes("aac"));
  let ext = "webm";
  let blobType = "audio/webm";
  let converted = false;

  if (isRawAac) {
    const m4a = await remuxAacToM4a(buffer);
    if (m4a) {
      buffer = m4a;
      ext = "m4a";
      blobType = "audio/mp4";
      converted = true;
    } else {
      // Conversion failed: the payload is still raw AAC, so label it as such
      // instead of announcing an MP4 container that is not there.
      ext = "aac";
      blobType = "audio/aac";
    }
  } else if (mimeType?.includes("mp4") || mimeType?.includes("m4a")) {
    ext = "m4a";
    blobType = "audio/mp4";
  }

  console.log(
    "[transcribe] mimeType:",
    mimeType,
    "→ ext:",
    ext,
    "converted:",
    converted,
    "bytes:",
    buffer.length,
  );

  // Deepgram language/model mapping.
  // nova-3 rejects language=ar and language=tr with HTTP 400 ("No such
  // model/language/tier combination found"), so those two languages go to
  // nova-2-general. The language is always passed explicitly: without it,
  // nova-2-general auto-detects and often transcribes Arabic audio phonetically
  // in Latin script, which makes the downstream reply come back in English.
  // All other languages stay on nova-3.
  const deepgramLang =
    language && SUPPORTED_LANGUAGES.has(language) ? language : DEFAULT_LANGUAGE;
  const model = GENERAL_MODEL_LANGUAGES.has(deepgramLang)
    ? "nova-2-general"
    : "nova-3";

  const deepgramUrl = new URL("https://api.deepgram.com/v1/listen");
  deepgramUrl.searchParams.set("language", deepgramLang);
  deepgramUrl.searchParams.set("model", model);

  console.log("[transcribe] language:", deepgramLang, "model:", model);

  try {
    const response = await fetch(deepgramUrl, {
      method: "POST",
      headers: {
        Authorization: `Token ${config.deepgramApiKey}`,
        "Content-Type": blobType,
      },
      body: buffer,
    });

    // Read as text first: an upstream error page may not be JSON, and the
    // status code must still be reported correctly in that case.
    const rawBody = await response.text();
    let parsed: unknown;
    try {
      parsed = JSON.parse(rawBody);
    } catch {
      parsed = undefined;
    }

    if (!response.ok) {
      const detail = parsed === undefined ? rawBody : JSON.stringify(parsed);
      console.error("[transcribe] Deepgram error:", detail);
      throw createError({
        statusCode: response.status,
        message: detail,
      });
    }

    if (parsed === undefined) {
      console.error("[transcribe] Deepgram returned a non-JSON body");
      throw createError({
        statusCode: 502,
        message: "Ungültige Antwort von Deepgram",
      });
    }

    const transcript =
      (parsed as DeepgramListenResponse | null)?.results?.channels?.[0]
        ?.alternatives?.[0]?.transcript || "";
    return { text: transcript };
  } catch (err: unknown) {
    // Errors raised via createError already carry the intended status code.
    if (hasStatusCode(err)) throw err;
    console.error("[transcribe] Unexpected error:", err);
    throw createError({
      statusCode: 500,
      message:
        (err instanceof Error ? err.message : "") ||
        "Transcribe fehlgeschlagen",
    });
  }
});