rebreak-monorepo/backend/server/api/coach/speak-openai.post.ts

/**
 * POST /api/coach/speak-openai — v5
 * OpenAI TTS — gpt-4o-mini-tts (Mar 2025); voices: nova (chat) / shimmer (sos).
 *
 * Request body (JSON): `{ text: string; mode?: "chat" | "sos" | "sos-continuation" }`
 *
 * Modes:
 *  - "chat"             → nova, no `instructions` sent
 *  - "sos"              → shimmer, single warm-empathic instruction set
 *  - "sos-continuation" → shimmer, **identical** instructions to "sos"
 *  - missing / any other value is handled like "chat"
 *
 * Why identical: gpt-4o-mini-tts interprets `instructions` so freely that
 * different strings within one SOS flow are perceived as "the voice changed".
 * Using a single instruction set removes that audible voice boundary.
 *
 * Responses:
 *  - 200: MP3 audio stream (`Content-Type: audio/mpeg`, `Cache-Control: no-store`)
 *  - 400: `text` is missing, not a string, or only whitespace
 *  - 503: no OpenAI API key in the runtime config
 *  - 502: the OpenAI request failed (network error, non-2xx status, or no body)
 */
export default defineEventHandler(async (event) => {
  // Awaited before anything else; presumably rejects unauthenticated callers
  // (helper is defined outside this file — confirm).
  await requireUser(event);

  // The body is client-controlled, so narrow it at runtime instead of trusting
  // a cast: a missing/non-object body or a non-string `text` must produce the
  // 400 below rather than a TypeError that would surface as a 500.
  const body: unknown = await readBody(event);
  const { text, mode } = (
    typeof body === "object" && body !== null ? body : {}
  ) as { text?: unknown; mode?: unknown };

  if (typeof text !== "string" || !text.trim()) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  // Both SOS modes share voice and instructions; everything else is chat.
  const isSos = mode === "sos" || mode === "sos-continuation";

  const config = useRuntimeConfig();
  const key = config.openaiApiKey as string | undefined;

  if (!key) {
    throw createError({
      statusCode: 503,
      message: "OpenAI API Key nicht konfiguriert",
    });
  }

  // Identical instructions for "sos" and "sos-continuation" → no perceived
  // voice drift between consecutive TTS calls within the same SOS session.
  const instructions = isSos
    ? "Warm, gentle, empathic — like a calm friend on the phone in a difficult moment. " +
      "Speak slowly with natural pauses between sentences. " +
      "Soft delivery, lower energy than chat-mode. " +
      "German native pronunciation. No fake-cheerful intonation."
    : undefined;

  let upstream: Response;
  try {
    upstream = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${key}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model: "gpt-4o-mini-tts",
        // Hard cap on the text sent upstream; anything beyond is dropped.
        input: text.slice(0, 4096),
        voice: isSos ? "shimmer" : "nova",
        response_format: "mp3",
        speed: 1.08,
        // Only include the key when there is something to send (chat sends none).
        ...(instructions ? { instructions } : {}),
      }),
    });
  } catch (cause: unknown) {
    // fetch rejects on transport-level failures (DNS, reset, TLS, …). Report
    // them exactly like any other upstream failure instead of leaking a 500.
    console.error("[speak-openai] request failed:", cause);
    throw createError({
      statusCode: 502,
      message: "OpenAI TTS fehlgeschlagen",
    });
  }

  if (!upstream.ok || !upstream.body) {
    // Best-effort read of the upstream error payload for the server log only;
    // the client just gets a generic 502.
    const err = await upstream.text().catch(() => "");
    console.error("[speak-openai] error:", upstream.status, err);
    throw createError({
      statusCode: 502,
      message: "OpenAI TTS fehlgeschlagen",
    });
  }

  setHeader(event, "Content-Type", "audio/mpeg");
  setHeader(event, "Cache-Control", "no-store");

  // Pipe the upstream web stream through as a Node stream so the audio is
  // forwarded chunk by chunk instead of being buffered in memory.
  const { Readable } = await import("node:stream");
  // NOTE(review): the `as never` cast presumably papers over the mismatch
  // between the global and the node:stream/web ReadableStream typings — confirm.
  const nodeStream = Readable.fromWeb(upstream.body as never);
  return sendStream(event, nodeStream);
});