rebreak-monorepo/backend/server/api/coach/speak-gemini.post.ts

/**
 * POST /api/coach/speak-gemini
 * Gemini 2.5 Flash Preview TTS — voice: Kore (warm female).
 *
 * Returns audio/wav. Gemini liefert 24kHz 16-bit mono PCM via
 * inlineData.data (Base64) — wir prependen den 44-byte WAV-Header.
 *
 * Kein `instructions`-Feld → keine wahrgenommene Stimm-Drift zwischen Calls.
 * Voice ist deterministisch konstant (im Gegensatz zu gpt-4o-mini-tts).
 */
const SAMPLE_RATE = 24000; // samples per second (Hz)
const NUM_CHANNELS = 1; // mono
const BITS_PER_SAMPLE = 16; // bit depth of one sample

/**
 * Wraps raw PCM audio in a 44-byte RIFF/WAVE header.
 *
 * The header describes the stream using the module constants above
 * (sample rate, channel count, bit depth); the PCM bytes are copied
 * unchanged behind the header.
 *
 * @param pcm Raw PCM sample bytes.
 * @returns A new buffer holding the header followed by the PCM payload.
 */
function pcmToWav(pcm: Buffer): Buffer {
  const HEADER_BYTES = 44;
  const payloadBytes = pcm.length;
  const bytesPerFrame = (NUM_CHANNELS * BITS_PER_SAMPLE) / 8;
  const bytesPerSecond = (SAMPLE_RATE * NUM_CHANNELS * BITS_PER_SAMPLE) / 8;

  const wav = Buffer.alloc(HEADER_BYTES + payloadBytes);

  // Fields are written sequentially; `cursor` always points at the next
  // free header byte, so no absolute offsets have to be spelled out.
  let cursor = 0;
  const tag = (fourCc: string): void => {
    // Buffer#write returns the number of bytes written.
    cursor += wav.write(fourCc, cursor);
  };
  const u16 = (value: number): void => {
    // Buffer#writeUInt16LE returns the offset just past the written value.
    cursor = wav.writeUInt16LE(value, cursor);
  };
  const u32 = (value: number): void => {
    cursor = wav.writeUInt32LE(value, cursor);
  };

  // RIFF container header
  tag("RIFF");
  u32(36 + payloadBytes); // size of everything after this field
  tag("WAVE");

  // "fmt " sub-chunk
  tag("fmt ");
  u32(16); // size of the fmt sub-chunk body
  u16(1); // audio format tag: 1 = PCM
  u16(NUM_CHANNELS);
  u32(SAMPLE_RATE);
  u32(bytesPerSecond);
  u16(bytesPerFrame);
  u16(BITS_PER_SAMPLE);

  // "data" sub-chunk
  tag("data");
  u32(payloadBytes);

  pcm.copy(wav, HEADER_BYTES);
  return wav;
}

/**
 * POST handler: turns the submitted `text` into speech via Gemini TTS and
 * responds with a WAV file.
 *
 * Flow: authenticate the caller, validate `text`, call Gemini
 * `generateContent` with audio output and the "Kore" voice, decode the
 * Base64 PCM from the response and prepend a WAV header.
 *
 * @returns WAV bytes, sent with `Content-Type: audio/wav` and
 *   `Cache-Control: no-store`.
 * @throws 400 when the request body carries no non-blank string `text`.
 * @throws 503 when `googleAiApiKey` is missing from the runtime config.
 * @throws 502 when Gemini cannot be reached, answers with a non-2xx status,
 *   sends an unparsable body, or returns no audio data.
 */
export default defineEventHandler(async (event) => {
  await requireUser(event);

  // The body is untrusted: it can be missing entirely, a non-object, or
  // carry a non-string `text`. All of these must be a 400, not a crash.
  const body: unknown = await readBody(event);
  const text =
    typeof body === "object" && body !== null
      ? (body as { text?: unknown }).text
      : undefined;

  if (typeof text !== "string" || !text.trim()) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  const config = useRuntimeConfig();
  const key = config.googleAiApiKey as string | undefined;

  if (!key) {
    throw createError({
      statusCode: 503,
      message: "GOOGLE_AI_API_KEY nicht konfiguriert",
    });
  }

  // Cap the input length. If the cut lands between the two halves of a
  // surrogate pair, drop the dangling high surrogate so no broken
  // character is sent upstream.
  const MAX_TEXT_CHARS = 4096;
  let excerpt = text.slice(0, MAX_TEXT_CHARS);
  if (text.length > MAX_TEXT_CHARS) {
    const lastUnit = excerpt.charCodeAt(excerpt.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      excerpt = excerpt.slice(0, -1);
    }
  }

  // Gemini TTS sometimes treats the raw `text` part as a prompt instead of
  // a read-aloud request (e.g. when the text ends with `?`, the model tries
  // to answer and the call fails with 400 INVALID_ARGUMENT). The instruction
  // prefix forces strict read-aloud behavior and sets a warm, empathic tone.
  const promptText =
    "Read the following German text aloud, verbatim, in a warm, gentle, " +
    "empathic voice — like a calm friend on the phone. Speak slowly with " +
    "natural pauses. Soft delivery, low energy, no fake-cheerfulness. " +
    "Do not respond to or comment on the text — just read it.\n\n" +
    excerpt;

  // A rejected fetch (DNS failure, connection reset, ...) is an upstream
  // failure like any other and is reported as 502.
  let upstream: Response;
  try {
    upstream = await fetch(
      "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-goog-api-key": key,
        },
        body: JSON.stringify({
          contents: [{ parts: [{ text: promptText }] }],
          generationConfig: {
            responseModalities: ["AUDIO"],
            speechConfig: {
              voiceConfig: {
                prebuiltVoiceConfig: { voiceName: "Kore" },
              },
            },
          },
        }),
      },
    );
  } catch (cause: unknown) {
    console.error("[speak-gemini] request failed:", cause);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  if (!upstream.ok) {
    const err = await upstream.text().catch(() => "");
    console.error("[speak-gemini] error:", upstream.status, err);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  type GeminiAudioResponse = {
    candidates?: Array<{
      content?: { parts?: Array<{ inlineData?: { data?: string } }> };
    }>;
  } | null;

  let json: GeminiAudioResponse;
  try {
    json = (await upstream.json()) as GeminiAudioResponse;
  } catch (cause: unknown) {
    console.error("[speak-gemini] invalid JSON in response:", cause);
    throw createError({
      statusCode: 502,
      message: "Gemini TTS fehlgeschlagen",
    });
  }

  // Take the first part that actually carries inline audio rather than
  // assuming it is always at index 0.
  const parts = json?.candidates?.[0]?.content?.parts;
  const base64Pcm = Array.isArray(parts)
    ? parts.find((part) => {
        const data = part?.inlineData?.data;
        return typeof data === "string" && data.length > 0;
      })?.inlineData?.data
    : undefined;

  if (!base64Pcm) {
    console.error("[speak-gemini] no audio in response:", JSON.stringify(json).slice(0, 500));
    throw createError({ statusCode: 502, message: "Gemini TTS: kein Audio zurückgegeben" });
  }

  const pcm = Buffer.from(base64Pcm, "base64");
  if (pcm.length === 0) {
    // The payload decoded to nothing; a header-only WAV would be useless.
    console.error("[speak-gemini] audio payload decoded to 0 bytes");
    throw createError({ statusCode: 502, message: "Gemini TTS: kein Audio zurückgegeben" });
  }

  const wav = pcmToWav(pcm);

  setHeader(event, "Content-Type", "audio/wav");
  setHeader(event, "Cache-Control", "no-store");
  return wav;
});