108 lines
3.1 KiB
TypeScript
108 lines
3.1 KiB
TypeScript
/**
 * POST /api/coach/speak-gemini
 * Gemini 2.5 Flash Preview TTS — voice: Kore (warm female).
 *
 * Returns audio/wav. Gemini delivers 24 kHz 16-bit mono PCM via
 * inlineData.data (Base64) — we prepend the 44-byte WAV header.
 *
 * No `instructions` field → no perceived voice drift between calls.
 * The voice is deterministically constant (unlike gpt-4o-mini-tts).
 */
|
|
/** Sample rate in Hz that is written into the WAV header. */
const SAMPLE_RATE = 24000;
/** Number of channels written into the WAV header (1 = mono). */
const NUM_CHANNELS = 1;
/** Bit depth of one sample written into the WAV header. */
const BITS_PER_SAMPLE = 16;

/**
 * Wraps raw PCM bytes in a canonical 44-byte RIFF/WAVE header.
 *
 * The header describes the audio using the module constants
 * (SAMPLE_RATE, NUM_CHANNELS, BITS_PER_SAMPLE); the samples themselves are
 * copied unchanged. If the input ends with an incomplete sample frame
 * (its length is not a multiple of the block alignment), the dangling bytes
 * are dropped so that the declared chunk sizes stay frame-aligned.
 *
 * @param pcm Raw little-endian PCM sample data. The buffer is not modified.
 * @returns A new buffer holding the WAV header followed by the PCM data.
 */
function pcmToWav(pcm: Buffer): Buffer {
  const headerSize = 44;
  const blockAlign = (NUM_CHANNELS * BITS_PER_SAMPLE) / 8;
  const byteRate = SAMPLE_RATE * blockAlign;

  // Only whole sample frames are valid audio; a trailing partial frame would
  // also yield an odd-sized data chunk without the pad byte RIFF requires.
  const dataSize = pcm.length - (pcm.length % blockAlign);
  const out = Buffer.alloc(headerSize + dataSize);

  // RIFF chunk descriptor.
  out.write("RIFF", 0);
  out.writeUInt32LE(36 + dataSize, 4); // bytes following this field
  out.write("WAVE", 8);

  // "fmt " sub-chunk.
  out.write("fmt ", 12);
  out.writeUInt32LE(16, 16); // size of the fmt payload for plain PCM
  out.writeUInt16LE(1, 20); // audio format 1 = uncompressed PCM
  out.writeUInt16LE(NUM_CHANNELS, 22);
  out.writeUInt32LE(SAMPLE_RATE, 24);
  out.writeUInt32LE(byteRate, 28);
  out.writeUInt16LE(blockAlign, 32);
  out.writeUInt16LE(BITS_PER_SAMPLE, 34);

  // "data" sub-chunk: size field followed by the samples.
  out.write("data", 36);
  out.writeUInt32LE(dataSize, 40);
  pcm.copy(out, headerSize, 0, dataSize);

  return out;
}
|
|
|
|
/**
 * Event handler: turns the `text` field of the request body into speech via
 * the Gemini TTS model and responds with a WAV file.
 *
 * Flow: authenticate → validate `text` → call Gemini `generateContent` with
 * the fixed prebuilt voice "Kore" → decode the Base64 PCM from the response →
 * wrap it with `pcmToWav` → return it as `audio/wav` with caching disabled.
 *
 * @returns The WAV audio as a Buffer.
 * @throws 400 when `text` is missing, not a string, or only whitespace.
 * @throws 503 when `googleAiApiKey` is absent from the runtime config.
 * @throws 502 when the Gemini request fails, answers with a non-2xx status,
 *   or its response carries no audio data.
 */
export default defineEventHandler(async (event) => {
  // NOTE(review): presumably rejects unauthenticated callers — confirm in requireUser.
  await requireUser(event);

  // The body is client-controlled: it may be absent or not an object, and
  // `text` may be any JSON type. Narrow it before touching string methods so
  // malformed input gets the 400 below rather than a TypeError.
  const body: unknown = await readBody(event);
  const rawText =
    typeof body === "object" && body !== null
      ? (body as { text?: unknown }).text
      : undefined;

  if (typeof rawText !== "string" || !rawText.trim()) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }
  const text = rawText;

  const config = useRuntimeConfig();
  const key = config.googleAiApiKey as string | undefined;

  if (!key) {
    throw createError({
      statusCode: 503,
      message: "GOOGLE_AI_API_KEY nicht konfiguriert",
    });
  }

  const upstream = await fetch(
    "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent",
    {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-goog-api-key": key,
      },
      body: JSON.stringify({
        // Input is capped at 4096 characters before being sent upstream.
        contents: [{ parts: [{ text: text.slice(0, 4096) }] }],
        generationConfig: {
          responseModalities: ["AUDIO"],
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: { voiceName: "Kore" },
            },
          },
        },
      }),
    },
  ).catch((cause: unknown) => {
    // A rejected fetch (network-level failure) is an upstream problem too;
    // report it with the same 502 as a non-2xx answer.
    console.error("[speak-gemini] request failed:", cause);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  });

  if (!upstream.ok) {
    const err = await upstream.text().catch(() => "");
    console.error("[speak-gemini] error:", upstream.status, err);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  // An unparseable body is treated like a response without audio (null).
  const json = (await upstream.json().catch(() => null)) as {
    candidates?: Array<{
      content?: { parts?: Array<{ inlineData?: { data?: string } } | null> };
    }>;
  } | null;

  // Take the first part of the first candidate that actually carries audio,
  // instead of assuming it is always at index 0.
  const base64Pcm = json?.candidates?.[0]?.content?.parts?.find(
    (part) => part?.inlineData?.data,
  )?.inlineData?.data;

  if (!base64Pcm) {
    console.error("[speak-gemini] no audio in response:", JSON.stringify(json).slice(0, 500));
    throw createError({ statusCode: 502, message: "Gemini TTS: kein Audio zurückgegeben" });
  }

  const pcm = Buffer.from(base64Pcm, "base64");
  const wav = pcmToWav(pcm);

  setHeader(event, "Content-Type", "audio/wav");
  setHeader(event, "Cache-Control", "no-store");
  return wav;
});
|