rebreak-monorepo/backend/server/api/coach/speak-gemini.post.ts
chahinebrini f2e822be95 feat(sos): llmProvider toggle + sort:latency + bench scaffolding
- backend/coach: routing zu Sonnet (default) / Haiku / Groq Llama je nach
  sessionData.llmProvider. sort:latency für Anthropic-Modelle (-30..58% TTFB).
- frontend: LlmProviderToggle (Sonnet/Haiku/Groq pills), llmProvider.ts
  Storage-Helper. sosStream.ts schickt llmProvider im /sos-session-Body.
- bench: SosTtsBenchmark sammelt Marker (req->session, lyra-ttfb, lyra-done,
  tts-fired/headers/body/file, audio-loaded, first-audio); Output als console.table.
- ops: backend/scripts/llm-bench.sh + Python-Variante für realistic SOS-Prompt.
- speak-cartesia + speak-elevenlabs Endpoints (waren ungetracked, jetzt mit drin).
2026-05-06 13:58:07 +02:00

119 lines
3.7 KiB
TypeScript

/**
* POST /api/coach/speak-gemini
* Gemini 2.5 Flash Preview TTS — voice: Kore (warm female).
*
* Returns audio/wav. Gemini liefert 24kHz 16-bit mono PCM via
* inlineData.data (Base64) — wir prependen den 44-byte WAV-Header.
*
* Kein `instructions`-Feld → keine wahrgenommene Stimm-Drift zwischen Calls.
* Voice ist deterministisch konstant (im Gegensatz zu gpt-4o-mini-tts).
*/
/** Sample rate in Hz written into the WAV `fmt ` chunk. */
const SAMPLE_RATE = 24000;
/** Channel count written into the WAV `fmt ` chunk. */
const NUM_CHANNELS = 1;
/** Bit depth per sample written into the WAV `fmt ` chunk. */
const BITS_PER_SAMPLE = 16;

/**
 * Wraps raw PCM bytes in a 44-byte RIFF/WAVE header.
 *
 * The header advertises uncompressed PCM (format tag 1) using the module
 * constants SAMPLE_RATE, NUM_CHANNELS and BITS_PER_SAMPLE; the input bytes are
 * copied unchanged directly after the header.
 *
 * @param pcm Raw sample data to wrap.
 * @returns A newly allocated buffer of `44 + pcm.length` bytes.
 */
function pcmToWav(pcm: Buffer): Buffer {
  const headerBytes = 44;
  const payloadBytes = pcm.length;
  const bytesPerSecond = (SAMPLE_RATE * NUM_CHANNELS * BITS_PER_SAMPLE) / 8;
  const bytesPerFrame = (NUM_CHANNELS * BITS_PER_SAMPLE) / 8;

  const wav = Buffer.alloc(headerBytes + payloadBytes);

  // Four-character chunk tags, as [text, byte offset].
  const tags: ReadonlyArray<readonly [string, number]> = [
    ["RIFF", 0],
    ["WAVE", 8],
    ["fmt ", 12],
    ["data", 36],
  ];
  for (const [tag, at] of tags) {
    wav.write(tag, at);
  }

  // Little-endian 32-bit fields, as [value, byte offset].
  const dwords: ReadonlyArray<readonly [number, number]> = [
    [36 + payloadBytes, 4], // RIFF chunk size: everything after the first 8 bytes
    [16, 16], // size of the fmt chunk body
    [SAMPLE_RATE, 24],
    [bytesPerSecond, 28],
    [payloadBytes, 40], // size of the data chunk body
  ];
  for (const [value, at] of dwords) {
    wav.writeUInt32LE(value, at);
  }

  // Little-endian 16-bit fields, as [value, byte offset].
  const words: ReadonlyArray<readonly [number, number]> = [
    [1, 20], // audio format tag 1 = PCM
    [NUM_CHANNELS, 22],
    [bytesPerFrame, 32],
    [BITS_PER_SAMPLE, 34],
  ];
  for (const [value, at] of words) {
    wav.writeUInt16LE(value, at);
  }

  wav.set(pcm, headerBytes);
  return wav;
}
/**
 * Handler for POST /api/coach/speak-gemini.
 *
 * Reads `{ text }` from the request body, asks the Gemini TTS model to read it
 * aloud with the prebuilt voice "Kore", wraps the returned PCM via `pcmToWav`
 * and responds with `audio/wav` (`Cache-Control: no-store`).
 *
 * Errors raised via `createError`:
 * - 400 when the body is missing or `text` is not a non-blank string.
 * - 503 when `googleAiApiKey` is not set in the runtime config.
 * - 502 when the upstream call cannot be made, answers non-2xx, returns an
 *   unparsable body, or returns no audio payload.
 */
export default defineEventHandler(async (event) => {
  // Awaited before any other work; presumably rejects unauthenticated
  // requests (helper is defined elsewhere — confirm).
  await requireUser(event);

  // The body is client-controlled: it may be absent, null, or carry a
  // non-string `text`. Validate explicitly so those cases yield a 400 rather
  // than a TypeError from destructuring or from calling `.trim()`.
  const body: unknown = await readBody(event);
  const text =
    typeof body === "object" && body !== null
      ? (body as { text?: unknown }).text
      : undefined;
  if (typeof text !== "string" || !text.trim()) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  const config = useRuntimeConfig();
  const key = config.googleAiApiKey as string | undefined;
  if (!key) {
    throw createError({
      statusCode: 503,
      message: "GOOGLE_AI_API_KEY nicht konfiguriert",
    });
  }

  // Gemini TTS sometimes interprets the raw `text` part as a prompt rather
  // than as something to read aloud (e.g. when the Lyra reply ends with `?`
  // the model tries to answer it → 400 INVALID_ARGUMENT). The instruction
  // prefix forces strict read-aloud behavior and sets the warm, empathic tone
  // wanted for SOS. Only the first 4096 characters of `text` are sent.
  const promptText =
    "Read the following German text aloud, verbatim, in a warm, gentle, " +
    "empathic voice — like a calm friend on the phone. Speak slowly with " +
    "natural pauses. Soft delivery, low energy, no fake-cheerfulness. " +
    "Do not respond to or comment on the text — just read it.\n\n" +
    text.slice(0, 4096);

  // A rejected fetch (DNS, connection reset, …) is mapped to the same 502 as
  // any other upstream failure instead of surfacing as an unhandled 500.
  const upstream = await fetch(
    "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent",
    {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-goog-api-key": key,
      },
      body: JSON.stringify({
        contents: [{ parts: [{ text: promptText }] }],
        generationConfig: {
          responseModalities: ["AUDIO"],
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: { voiceName: "Kore" },
            },
          },
        },
      }),
    },
  ).catch((cause: unknown) => {
    console.error("[speak-gemini] request failed:", cause);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  });

  if (!upstream.ok) {
    // Best effort: the error body is only used for logging.
    const err = await upstream.text().catch(() => "");
    console.error("[speak-gemini] error:", upstream.status, err);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  // An unparsable body is treated like a response without audio (→ 502 below).
  const json = (await upstream.json().catch(() => null)) as {
    candidates?: Array<{
      content?: { parts?: Array<{ inlineData?: { data?: string } }> };
    }>;
  } | null;

  // Take the first part of the first candidate that carries inline data, so a
  // leading part without audio does not hide the payload.
  const base64Pcm = json?.candidates?.[0]?.content?.parts?.find(
    (part) => part.inlineData?.data,
  )?.inlineData?.data;
  if (!base64Pcm) {
    console.error("[speak-gemini] no audio in response:", JSON.stringify(json).slice(0, 500));
    throw createError({ statusCode: 502, message: "Gemini TTS: kein Audio zurückgegeben" });
  }

  const pcm = Buffer.from(base64Pcm, "base64");
  const wav = pcmToWav(pcm);
  setHeader(event, "Content-Type", "audio/wav");
  setHeader(event, "Cache-Control", "no-store");
  return wav;
});