rebreak-monorepo/backend/server/api/coach/speak-gemini.post.ts

108 lines
3.1 KiB
TypeScript

/**
* POST /api/coach/speak-gemini
* Gemini 2.5 Flash Preview TTS — voice: Kore (warm female).
*
* Returns audio/wav. Gemini delivers 24 kHz 16-bit mono PCM via
* inlineData.data (Base64) — we prepend the 44-byte WAV header.
*
* No `instructions` field → no perceived voice drift between calls.
* The voice is deterministically constant (unlike gpt-4o-mini-tts).
*/
/** Sample rate written into the WAV header, in Hz. */
const SAMPLE_RATE = 24000;
/** Channel count written into the WAV header (1 = mono). */
const NUM_CHANNELS = 1;
/** Bit depth of a single PCM sample. */
const BITS_PER_SAMPLE = 16;

/**
 * Wraps raw PCM bytes in a 44-byte RIFF/WAVE container.
 *
 * The header describes uncompressed PCM using the module-level
 * SAMPLE_RATE, NUM_CHANNELS and BITS_PER_SAMPLE constants; the sample bytes
 * themselves are appended unchanged.
 *
 * @param pcm Raw PCM sample bytes.
 * @returns A new buffer holding the header followed by the PCM bytes.
 */
function pcmToWav(pcm: Buffer): Buffer {
  const headerSize = 44;
  const bytesPerFrame = (NUM_CHANNELS * BITS_PER_SAMPLE) / 8;
  const bytesPerSecond = SAMPLE_RATE * bytesPerFrame;
  const payloadSize = pcm.length;

  const header = Buffer.alloc(headerSize);

  // RIFF chunk descriptor. The size field counts everything after itself,
  // i.e. the remaining 36 header bytes plus the audio payload.
  header.write("RIFF", 0);
  header.writeUInt32LE(headerSize - 8 + payloadSize, 4);
  header.write("WAVE", 8);

  // "fmt " sub-chunk: 16 bytes of format data, audio format 1 (PCM).
  header.write("fmt ", 12);
  header.writeUInt32LE(16, 16);
  header.writeUInt16LE(1, 20);
  header.writeUInt16LE(NUM_CHANNELS, 22);
  header.writeUInt32LE(SAMPLE_RATE, 24);
  header.writeUInt32LE(bytesPerSecond, 28);
  header.writeUInt16LE(bytesPerFrame, 32);
  header.writeUInt16LE(BITS_PER_SAMPLE, 34);

  // "data" sub-chunk header; the samples follow immediately after it.
  header.write("data", 36);
  header.writeUInt32LE(payloadSize, 40);

  return Buffer.concat([header, pcm]);
}
/**
 * Request handler: sends the posted `text` to the Gemini TTS model and
 * responds with the synthesized speech as a WAV file.
 *
 * Steps: require an authenticated user, validate the body, call the Gemini
 * `generateContent` endpoint with the "Kore" prebuilt voice, decode the
 * Base64 audio payload and wrap it with `pcmToWav`.
 *
 * Errors raised via `createError`:
 * - 400 when the body does not contain a non-blank string `text`
 * - 503 when `googleAiApiKey` is missing from the runtime config
 * - 502 when the upstream request fails, returns a non-2xx status, or
 *   returns no audio payload
 *
 * @returns The WAV bytes, sent with `Content-Type: audio/wav` and
 *   `Cache-Control: no-store`.
 */
export default defineEventHandler(async (event) => {
  await requireUser(event);

  // The body is client-controlled: it may be missing, not an object, or
  // carry a non-string `text`. Narrow it explicitly so malformed requests
  // get a 400 instead of crashing on destructuring or `.trim()`.
  const body: unknown = await readBody(event);
  const text =
    typeof body === "object" && body !== null
      ? (body as { text?: unknown }).text
      : undefined;
  if (typeof text !== "string" || !text.trim()) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  const config = useRuntimeConfig();
  const key = config.googleAiApiKey as string | undefined;
  if (!key) {
    throw createError({
      statusCode: 503,
      message: "GOOGLE_AI_API_KEY nicht konfiguriert",
    });
  }

  let upstream: Response;
  try {
    upstream = await fetch(
      "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-goog-api-key": key,
        },
        body: JSON.stringify({
          // Only the first 4096 characters of the text are forwarded.
          contents: [{ parts: [{ text: text.slice(0, 4096) }] }],
          generationConfig: {
            responseModalities: ["AUDIO"],
            speechConfig: {
              voiceConfig: {
                prebuiltVoiceConfig: { voiceName: "Kore" },
              },
            },
          },
        }),
      },
    );
  } catch (cause: unknown) {
    // Network-level failure (DNS, connection reset, ...): report it like any
    // other upstream failure instead of leaking an unhandled 500.
    console.error("[speak-gemini] request failed:", cause);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  if (!upstream.ok) {
    // Best effort: the error body is only used for logging.
    const err = await upstream.text().catch(() => "");
    console.error("[speak-gemini] error:", upstream.status, err);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  const json = (await upstream.json()) as {
    candidates?: Array<{
      content?: { parts?: Array<{ inlineData?: { data?: string } }> };
    }>;
  };

  // Take the first part that actually carries inline data rather than
  // assuming the audio is always at index 0.
  const base64Pcm = json.candidates?.[0]?.content?.parts?.find(
    (part) => part.inlineData?.data,
  )?.inlineData?.data;
  if (!base64Pcm) {
    console.error(
      "[speak-gemini] no audio in response:",
      JSON.stringify(json).slice(0, 500),
    );
    throw createError({
      statusCode: 502,
      message: "Gemini TTS: kein Audio zurückgegeben",
    });
  }

  const pcm = Buffer.from(base64Pcm, "base64");
  const wav = pcmToWav(pcm);

  setHeader(event, "Content-Type", "audio/wav");
  setHeader(event, "Cache-Control", "no-store");
  return wav;
});