/**
 * POST /api/coach/speak-gemini
 * Gemini 2.5 Flash Preview TTS — voice: Kore (warm female).
 *
 * Returns audio/wav. Gemini delivers 24 kHz 16-bit mono PCM via
 * inlineData.data (Base64) — we prepend the 44-byte WAV header.
 *
 * No `instructions` field → no perceived voice drift between calls.
 * The voice is deterministically constant (unlike gpt-4o-mini-tts).
 */

const SAMPLE_RATE = 24000;
const NUM_CHANNELS = 1;
const BITS_PER_SAMPLE = 16;

/** Maximum number of UTF-16 code units of user text forwarded to Gemini. */
const MAX_TEXT_LENGTH = 4096;

const GEMINI_TTS_URL =
  "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent";

// Gemini TTS sometimes interprets the raw `text` part as a prompt instead of
// a read-aloud request (e.g. when the Lyra answer ends with `?` → the model
// tries to answer → 400 INVALID_ARGUMENT). The instruction prefix forces
// strict TTS mode and sets a warm, empathic tone for SOS.
const READ_ALOUD_INSTRUCTION =
  "Read the following German text aloud, verbatim, in a warm, gentle, " +
  "empathic voice — like a calm friend on the phone. Speak slowly with " +
  "natural pauses. Soft delivery, low energy, no fake-cheerfulness. " +
  "Do not respond to or comment on the text — just read it.\n\n";

/** Subset of the Gemini `generateContent` response that this route reads. */
interface GeminiTtsResponse {
  candidates?: Array<{
    content?: { parts?: Array<{ inlineData?: { data?: string } }> };
  }>;
}

/**
 * Wraps raw PCM samples in a canonical 44-byte RIFF/WAVE header.
 *
 * @param pcm Raw little-endian PCM matching SAMPLE_RATE / NUM_CHANNELS /
 *   BITS_PER_SAMPLE.
 * @returns A new buffer holding the header followed by a copy of `pcm`.
 */
function pcmToWav(pcm: Buffer): Buffer {
  const byteRate = (SAMPLE_RATE * NUM_CHANNELS * BITS_PER_SAMPLE) / 8;
  const blockAlign = (NUM_CHANNELS * BITS_PER_SAMPLE) / 8;
  const dataSize = pcm.length;
  const out = Buffer.alloc(44 + dataSize);

  out.write("RIFF", 0);
  out.writeUInt32LE(36 + dataSize, 4); // RIFF chunk size = file size - 8
  out.write("WAVE", 8);
  out.write("fmt ", 12);
  out.writeUInt32LE(16, 16); // size of the fmt chunk
  out.writeUInt16LE(1, 20); // audio format 1 = uncompressed PCM
  out.writeUInt16LE(NUM_CHANNELS, 22);
  out.writeUInt32LE(SAMPLE_RATE, 24);
  out.writeUInt32LE(byteRate, 28);
  out.writeUInt16LE(blockAlign, 32);
  out.writeUInt16LE(BITS_PER_SAMPLE, 34);
  out.write("data", 36);
  out.writeUInt32LE(dataSize, 40);
  pcm.copy(out, 44);

  return out;
}

/**
 * Pulls a usable `text` out of an untrusted request body.
 *
 * @returns The original (untrimmed) text, or `undefined` when the body is not
 *   an object or `text` is missing, not a string, or only whitespace.
 */
function extractText(body: unknown): string | undefined {
  if (typeof body !== "object" || body === null) {
    return undefined;
  }
  const text = (body as { text?: unknown }).text;
  return typeof text === "string" && text.trim() !== "" ? text : undefined;
}

/**
 * Truncates `text` to at most `maxLength` UTF-16 code units without leaving
 * a dangling high surrogate (half of an emoji etc.) at the end.
 */
function clampText(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  const cut = text.slice(0, maxLength);
  const last = cut.charCodeAt(cut.length - 1);
  const endsWithHighSurrogate = last >= 0xd800 && last <= 0xdbff;
  return endsWithHighSurrogate ? cut.slice(0, -1) : cut;
}

/** Builds the prompt: read-aloud instruction followed by the clamped text. */
function buildPrompt(text: string): string {
  return READ_ALOUD_INSTRUCTION + clampText(text, MAX_TEXT_LENGTH);
}

/**
 * Finds the Base64 audio payload in a Gemini response.
 *
 * Looks through the parts of the first candidate and returns the first
 * non-empty string found at `inlineData.data`; tolerates arbitrary shapes.
 */
function findAudioData(response: unknown): string | undefined {
  const candidates = (response as GeminiTtsResponse | null | undefined)
    ?.candidates;
  const parts = Array.isArray(candidates)
    ? candidates[0]?.content?.parts
    : undefined;
  if (!Array.isArray(parts)) {
    return undefined;
  }
  for (const part of parts) {
    const data: unknown = part?.inlineData?.data;
    if (typeof data === "string" && data !== "") {
      return data;
    }
  }
  return undefined;
}

/**
 * Route handler: authenticates the caller, sends the text to Gemini TTS and
 * answers with a WAV file.
 *
 * Errors raised via `createError`:
 * - 400 when `text` is missing, not a string, or blank
 * - 503 when no Google AI API key is configured
 * - 502 when the upstream request fails or returns no audio
 */
export default defineEventHandler(async (event) => {
  await requireUser(event);

  const text = extractText(await readBody(event));
  if (text === undefined) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  const config = useRuntimeConfig();
  const key = config.googleAiApiKey as string | undefined;
  if (!key) {
    throw createError({
      statusCode: 503,
      message: "GOOGLE_AI_API_KEY nicht konfiguriert",
    });
  }

  // Network-level failures (DNS, reset, …) reject the fetch promise; report
  // them as a bad gateway instead of leaking an unhandled 500.
  const upstream = await fetch(GEMINI_TTS_URL, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-goog-api-key": key,
    },
    body: JSON.stringify({
      contents: [{ parts: [{ text: buildPrompt(text) }] }],
      generationConfig: {
        responseModalities: ["AUDIO"],
        speechConfig: {
          voiceConfig: {
            prebuiltVoiceConfig: { voiceName: "Kore" },
          },
        },
      },
    }),
  }).catch((err: unknown) => {
    console.error("[speak-gemini] request failed:", err);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  });

  if (!upstream.ok) {
    const err = await upstream.text().catch(() => "");
    console.error("[speak-gemini] error:", upstream.status, err);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  const json: unknown = await upstream.json().catch((err: unknown) => {
    console.error("[speak-gemini] invalid JSON in response:", err);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  });

  const base64Pcm = findAudioData(json);
  if (!base64Pcm) {
    console.error(
      "[speak-gemini] no audio in response:",
      JSON.stringify(json).slice(0, 500),
    );
    throw createError({
      statusCode: 502,
      message: "Gemini TTS: kein Audio zurückgegeben",
    });
  }

  const wav = pcmToWav(Buffer.from(base64Pcm, "base64"));

  setHeader(event, "Content-Type", "audio/wav");
  setHeader(event, "Cache-Control", "no-store");
  return wav;
});