- backend/coach: routing zu Sonnet (default) / Haiku / Groq Llama je nach sessionData.llmProvider. sort:latency für Anthropic-Modelle (-30..58% TTFB). - frontend: LlmProviderToggle (Sonnet/Haiku/Groq pills), llmProvider.ts Storage-Helper. sosStream.ts schickt llmProvider im /sos-session-Body. - bench: SosTtsBenchmark sammelt Marker (req->session, lyra-ttfb, lyra-done, tts-fired/headers/body/file, audio-loaded, first-audio); Output als console.table. - ops: backend/scripts/llm-bench.sh + Python-Variante für realistic SOS-Prompt. - speak-cartesia + speak-elevenlabs Endpoints (waren ungetracked, jetzt mit drin).
119 lines
3.7 KiB
TypeScript
119 lines
3.7 KiB
TypeScript
/**
 * POST /api/coach/speak-gemini
 * Gemini 2.5 Flash Preview TTS — voice: Kore (warm female).
 *
 * Returns audio/wav. Gemini delivers 24 kHz 16-bit mono PCM via
 * inlineData.data (Base64) — we prepend the 44-byte WAV header.
 *
 * No `instructions` field → no perceived voice drift between calls.
 * The voice is deterministically constant (unlike gpt-4o-mini-tts).
 */
|
|
/** Sample rate in Hz assumed for the PCM payload when no format is given. */
const SAMPLE_RATE = 24000;

/** Channel count assumed for the PCM payload (1 = mono). */
const NUM_CHANNELS = 1;

/** Bit depth per sample assumed for the PCM payload. */
const BITS_PER_SAMPLE = 16;

/** Optional overrides for the PCM layout described in the WAV header. */
interface PcmFormat {
  sampleRate?: number;
  numChannels?: number;
  bitsPerSample?: number;
}

/**
 * Wraps raw little-endian PCM samples in a canonical 44-byte RIFF/WAVE header.
 *
 * @param pcm - Raw PCM sample bytes (no container).
 * @param format - Optional layout overrides; every field defaults to the
 *   module constants (24000 Hz, 1 channel, 16 bit).
 * @returns A new buffer holding the header followed by the sample data.
 *   A trailing partial frame (length not a multiple of the frame size) is
 *   dropped so the data chunk always contains whole frames.
 * @throws RangeError if the format does not yield a positive whole number of
 *   bytes per frame, or the sample rate is not a positive integer.
 */
function pcmToWav(pcm: Buffer, format: PcmFormat = {}): Buffer {
  const {
    sampleRate = SAMPLE_RATE,
    numChannels = NUM_CHANNELS,
    bitsPerSample = BITS_PER_SAMPLE,
  } = format;

  const headerSize = 44;
  // Bytes per frame (one sample for every channel).
  const blockAlign = (numChannels * bitsPerSample) / 8;
  if (!Number.isInteger(blockAlign) || blockAlign <= 0) {
    throw new RangeError(
      `invalid PCM format: ${numChannels} channel(s) x ${bitsPerSample} bit`,
    );
  }
  if (!Number.isInteger(sampleRate) || sampleRate <= 0) {
    throw new RangeError(`invalid PCM sample rate: ${sampleRate}`);
  }
  const byteRate = sampleRate * blockAlign;

  // Only whole frames go into the data chunk; a dangling partial sample is
  // not playable and would leave the chunk size inconsistent with the format.
  const dataSize = pcm.length - (pcm.length % blockAlign);

  const out = Buffer.alloc(headerSize + dataSize);

  // RIFF container header.
  out.write("RIFF", 0, "ascii");
  out.writeUInt32LE(36 + dataSize, 4); // bytes following this field
  out.write("WAVE", 8, "ascii");

  // "fmt " sub-chunk.
  out.write("fmt ", 12, "ascii");
  out.writeUInt32LE(16, 16); // size of the fmt payload
  out.writeUInt16LE(1, 20); // audio format tag 1 (uncompressed PCM)
  out.writeUInt16LE(numChannels, 22);
  out.writeUInt32LE(sampleRate, 24);
  out.writeUInt32LE(byteRate, 28);
  out.writeUInt16LE(blockAlign, 32);
  out.writeUInt16LE(bitsPerSample, 34);

  // "data" sub-chunk followed by the samples.
  out.write("data", 36, "ascii");
  out.writeUInt32LE(dataSize, 40);
  pcm.copy(out, headerSize, 0, dataSize);

  return out;
}
|
|
|
|
/**
 * Request handler: turns the posted `text` into speech via the Gemini TTS
 * model and responds with a WAV file.
 *
 * Flow: await `requireUser(event)` (defined elsewhere — presumably rejects
 * unauthenticated callers), validate the body, call Gemini, decode the
 * Base64 PCM from the response and wrap it with `pcmToWav`.
 *
 * Errors raised via `createError`:
 * - 400 when `text` is missing, not a string, or blank
 * - 503 when `googleAiApiKey` is not set in the runtime config
 * - 502 when the upstream request fails, returns a non-2xx status, or
 *   contains no audio
 */
export default defineEventHandler(async (event) => {
  await requireUser(event);

  // The body is untrusted: it may be absent, null, or not an object, and
  // `text` may be any JSON type. Read it defensively so bad input produces
  // the 400 below instead of a TypeError.
  const body: unknown = await readBody(event);
  const text =
    typeof body === "object" && body !== null
      ? (body as { text?: unknown }).text
      : undefined;

  if (typeof text !== "string" || !text.trim()) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  const config = useRuntimeConfig();
  const key = config.googleAiApiKey as string | undefined;

  if (!key) {
    throw createError({
      statusCode: 503,
      message: "GOOGLE_AI_API_KEY nicht konfiguriert",
    });
  }

  // Gemini TTS sometimes treats the raw `text` part as a prompt instead of a
  // read-aloud request (e.g. when the Lyra reply ends with `?` the model
  // tries to answer → 400 INVALID_ARGUMENT). The instruction prefix forces
  // strict TTS mode and sets a warm, empathic tone for SOS.
  const promptText =
    "Read the following German text aloud, verbatim, in a warm, gentle, " +
    "empathic voice — like a calm friend on the phone. Speak slowly with " +
    "natural pauses. Soft delivery, low energy, no fake-cheerfulness. " +
    "Do not respond to or comment on the text — just read it.\n\n" +
    text.slice(0, 4096);

  let upstream: Response;
  try {
    upstream = await fetch(
      "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-goog-api-key": key,
        },
        body: JSON.stringify({
          contents: [{ parts: [{ text: promptText }] }],
          generationConfig: {
            responseModalities: ["AUDIO"],
            speechConfig: {
              voiceConfig: {
                prebuiltVoiceConfig: { voiceName: "Kore" },
              },
            },
          },
        }),
      },
    );
  } catch (err: unknown) {
    // Network-level failure (DNS, connection reset, ...): report it as the
    // same upstream error as a non-2xx response instead of an unhandled 500.
    console.error("[speak-gemini] error:", err);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  if (!upstream.ok) {
    const err = await upstream.text().catch(() => "");
    console.error("[speak-gemini] error:", upstream.status, err);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  // A body that is not valid JSON is treated like a response without audio.
  const json = (await upstream.json().catch(() => null)) as {
    candidates?: Array<{
      content?: { parts?: Array<{ inlineData?: { data?: string } }> };
    }>;
  } | null;

  // Take the first part that actually carries inline data rather than
  // assuming the audio is always at index 0.
  const base64Pcm = json?.candidates?.[0]?.content?.parts?.find(
    (part) => part.inlineData?.data,
  )?.inlineData?.data;

  if (!base64Pcm) {
    console.error("[speak-gemini] no audio in response:", JSON.stringify(json).slice(0, 500));
    throw createError({ statusCode: 502, message: "Gemini TTS: kein Audio zurückgegeben" });
  }

  const pcm = Buffer.from(base64Pcm, "base64");
  const wav = pcmToWav(pcm);

  setHeader(event, "Content-Type", "audio/wav");
  setHeader(event, "Cache-Control", "no-store");
  return wav;
});
|