rebreak-monorepo/backend/server/api/coach/speak-openai.post.ts

80 lines
2.6 KiB
TypeScript

/**
 * POST /api/coach/speak-openai — v5
 *
 * Proxies text to OpenAI TTS (`gpt-4o-mini-tts`, Mar 2025) and streams the
 * resulting MP3 back to the caller.
 *
 * Request body (JSON):
 * - `text` (string, required) — text to speak; must contain non-whitespace.
 * - `mode` (optional) — selects voice and delivery:
 *   - "chat"             → voice `nova`, no `instructions` (neutral). Any
 *                          missing or unrecognised mode is treated the same.
 *   - "sos"              → voice `shimmer`, one warm/empathic instruction set.
 *   - "sos-continuation" → voice `shimmer`, **identical** instructions to "sos".
 *
 * Why identical: gpt-4o-mini-tts interprets `instructions` so creatively that
 * different strings within the same SOS flow are perceived as "the voice
 * changed". A single instruction set removes that audible voice boundary.
 *
 * Responses:
 * - 200 `audio/mpeg` stream with `Cache-Control: no-store`.
 * - 400 when `text` is missing, not a string, or blank.
 * - 503 when no OpenAI API key is configured.
 * - 502 when the upstream request fails, is rejected, or has no body.
 * Errors thrown by `requireUser` propagate unchanged (presumably an auth
 * failure — the helper is defined elsewhere; confirm there).
 */
export default defineEventHandler(async (event) => {
  await requireUser(event);

  // The body is untrusted input: it may be absent, a primitive, or carry
  // fields of the wrong type. Normalise to an object before reading fields so
  // malformed requests end in the 400 below instead of a TypeError (500).
  const rawBody: unknown = await readBody(event);
  const { text, mode } = (
    rawBody !== null && typeof rawBody === "object" ? rawBody : {}
  ) as { text?: unknown; mode?: unknown };

  const spokenText = typeof text === "string" ? text.trim() : "";
  if (!spokenText) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  const isSos = mode === "sos" || mode === "sos-continuation";

  const config = useRuntimeConfig();
  const key = config.openaiApiKey as string | undefined;
  if (!key) {
    throw createError({
      statusCode: 503,
      message: "OpenAI API Key nicht konfiguriert",
    });
  }

  // Identical instructions for "sos" and "sos-continuation" → no perceived
  // voice drift between consecutive TTS calls within the same SOS session.
  const instructions = isSos
    ? "Warm, gentle, empathic — like a calm friend on the phone in a difficult moment. " +
      "Speak slowly with natural pauses between sentences. " +
      "Soft delivery, lower energy than chat-mode. " +
      "German native pronunciation. No fake-cheerful intonation."
    : undefined;

  // Cap the input at 4096 UTF-16 code units (OpenAI documents a 4096-character
  // limit for `input`). Trimming first keeps surrounding whitespace from
  // eating into that budget.
  const maxInputLength = 4096;
  let input = spokenText.slice(0, maxInputLength);
  // If the cut landed in the middle of a surrogate pair, drop the dangling
  // high surrogate rather than sending an unpaired code unit upstream.
  const lastUnit = input.charCodeAt(input.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    input = input.slice(0, -1);
  }

  let upstream: Response;
  try {
    upstream = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${key}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model: "gpt-4o-mini-tts",
        input,
        voice: isSos ? "shimmer" : "nova",
        response_format: "mp3",
        speed: 1.08,
        ...(instructions ? { instructions } : {}),
      }),
    });
  } catch (cause: unknown) {
    // fetch rejects on network-level failures (DNS, connection reset, …).
    // Report these the same way as an upstream error response.
    console.error("[speak-openai] request failed:", cause);
    throw createError({
      statusCode: 502,
      message: "OpenAI TTS fehlgeschlagen",
    });
  }

  if (!upstream.ok || !upstream.body) {
    // Best effort: include the upstream error payload in the server log only;
    // the client receives a generic 502.
    const err = await upstream.text().catch(() => "");
    console.error("[speak-openai] error:", upstream.status, err);
    throw createError({
      statusCode: 502,
      message: "OpenAI TTS fehlgeschlagen",
    });
  }

  setHeader(event, "Content-Type", "audio/mpeg");
  setHeader(event, "Cache-Control", "no-store");

  // Bridge the fetch (web) stream to a Node stream so audio is piped through
  // as it arrives instead of being buffered. The `never` cast sidesteps the
  // mismatch between the DOM and `node:stream/web` ReadableStream typings.
  const { Readable } = await import("node:stream");
  const nodeStream = Readable.fromWeb(upstream.body as never);
  return sendStream(event, nodeStream);
});