rebreak-monorepo/backend/server/api/coach/speak-elevenlabs.post.ts
chahinebrini f2e822be95 feat(sos): llmProvider toggle + sort:latency + bench scaffolding
- backend/coach: routing zu Sonnet (default) / Haiku / Groq Llama je nach
  sessionData.llmProvider. sort:latency für Anthropic-Modelle (-30..58% TTFB).
- frontend: LlmProviderToggle (Sonnet/Haiku/Groq pills), llmProvider.ts
  Storage-Helper. sosStream.ts schickt llmProvider im /sos-session-Body.
- bench: SosTtsBenchmark sammelt Marker (req->session, lyra-ttfb, lyra-done,
  tts-fired/headers/body/file, audio-loaded, first-audio); Output als console.table.
- ops: backend/scripts/llm-bench.sh + Python-Variante für realistic SOS-Prompt.
- speak-cartesia + speak-elevenlabs Endpoints (waren ungetracked, jetzt mit drin).
2026-05-06 13:58:07 +02:00

96 lines
3.0 KiB
TypeScript

/**
* POST /api/coach/speak-elevenlabs
* ElevenLabs text-to-speech using the eleven_turbo_v2_5 model. The voice is
* taken from runtimeConfig.elevenlabsVoiceId, then ELEVENLABS_VOICE_ID, and
* finally falls back to Alexandra (`kdmDKE6EkgrWrrykO9Qt`) when both are unset.
*
* Returns audio/mpeg. The voice stays deterministically constant across calls
* — same as the Gemini behaviour, no mode switch as with gpt-4o-mini-tts.
*/
const FALLBACK_VOICE_ID = "kdmDKE6EkgrWrrykO9Qt"; // Alexandra

/** Maximum number of characters forwarded to ElevenLabs per request. */
const MAX_TTS_CHARS = 4096;

/**
 * Extracts a usable `text` string from an untrusted request body.
 *
 * @param body - Whatever `readBody` produced; may be undefined, a primitive,
 *   or an object without a string `text` property.
 * @returns The original (untrimmed) text, or `undefined` when the body carries
 *   no non-blank string under `text`.
 */
function extractText(body: unknown): string | undefined {
  if (typeof body !== "object" || body === null) {
    return undefined;
  }
  const candidate = (body as { text?: unknown }).text;
  if (typeof candidate !== "string" || candidate.trim() === "") {
    return undefined;
  }
  return candidate;
}

/**
 * Builds the ElevenLabs streaming text-to-speech URL for a voice.
 *
 * `output_format` is sent as a query parameter because that is where the
 * ElevenLabs API defines it; it is not part of the JSON body.
 *
 * @param voiceId - ElevenLabs voice identifier; escaped before being placed
 *   into the URL path.
 */
function buildStreamUrl(voiceId: string): string {
  const query = new URLSearchParams({
    // 4 = maximum latency optimisation, at marginally lower quality.
    optimize_streaming_latency: "4",
    output_format: "mp3_22050_32",
  });
  return `https://api.elevenlabs.io/v1/text-to-speech/${encodeURIComponent(voiceId)}/stream?${query.toString()}`;
}

/**
 * Streams ElevenLabs speech for the posted `text` back to the client.
 *
 * Request body: `{ text: string }`. Only the first 4096 characters are
 * forwarded upstream.
 *
 * Responds with `audio/mpeg` (not cached). Errors:
 * - 400 when `text` is missing, not a string, or blank.
 * - 503 when no ElevenLabs API key is configured.
 * - 502 when ElevenLabs is unreachable or answers with a non-OK status.
 */
export default defineEventHandler(async (event) => {
  await requireUser(event);

  // Treat the body as untrusted: a missing body or a non-string `text` must
  // produce a 400, not a TypeError that surfaces as a 500.
  const body: unknown = await readBody(event);
  const text = extractText(body);
  if (text === undefined) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  const config = useRuntimeConfig();
  // Fallback chain: runtimeConfig (Nuxt build-time) -> process.env (runtime
  // injection via Infisical at pm2 start). Ensures a key is available even
  // when the runtimeConfig inflate did not bundle the process.env value.
  // `||` rather than `??` on purpose: an empty string must fall through too.
  const key =
    (config.elevenlabsApiKey as string) || process.env.ELEVENLABS_API_KEY || "";
  const voiceId =
    (config.elevenlabsVoiceId as string) ||
    process.env.ELEVENLABS_VOICE_ID ||
    FALLBACK_VOICE_ID;

  console.log(
    "[speak-elevenlabs] cfg-key:",
    !!config.elevenlabsApiKey,
    "env-key:",
    !!process.env.ELEVENLABS_API_KEY,
    "key-len:",
    key.length,
    "voice:",
    voiceId,
  );

  if (!key) {
    throw createError({
      statusCode: 503,
      message: "ELEVENLABS_API_KEY nicht konfiguriert",
    });
  }

  console.log("[speak-elevenlabs] CALL recv, text-len=", text.length, "voice=", voiceId);

  // The /stream endpoint sends first bytes after ~200-300ms instead of the
  // 600-1000ms of the non-stream endpoint.
  let upstream: Response;
  try {
    upstream = await fetch(buildStreamUrl(voiceId), {
      method: "POST",
      headers: {
        "xi-api-key": key,
        "Content-Type": "application/json",
        Accept: "audio/mpeg",
      },
      body: JSON.stringify({
        text: text.slice(0, MAX_TTS_CHARS),
        // Turbo v2.5: ~50% faster than multilingual_v2 at marginally lower
        // quality — worth it for SOS, where latency beats studio polish.
        model_id: "eleven_turbo_v2_5",
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.75,
          style: 0.3,
          use_speaker_boost: true,
        },
      }),
    });
  } catch (cause: unknown) {
    // Network-level failure (DNS, TLS, connection reset): report it as a bad
    // gateway like any other upstream failure instead of an unhandled 500.
    console.error("[speak-elevenlabs] fetch failed:", cause);
    throw createError({
      statusCode: 502,
      message: "ElevenLabs TTS fehlgeschlagen",
    });
  }

  if (!upstream.ok || !upstream.body) {
    // Best effort: the error payload is only used for the server log.
    const detail = await upstream.text().catch(() => "");
    console.error("[speak-elevenlabs] error:", upstream.status, detail);
    throw createError({
      statusCode: 502,
      message: "ElevenLabs TTS fehlgeschlagen",
    });
  }

  setHeader(event, "Content-Type", "audio/mpeg");
  setHeader(event, "Cache-Control", "no-store");

  // Convert the web ReadableStream from fetch into a Node stream for
  // sendStream. The `never` cast is kept from the original; presumably it
  // bridges the DOM vs. node:stream/web ReadableStream typings.
  const { Readable } = await import("node:stream");
  const nodeStream = Readable.fromWeb(upstream.body as never);
  return sendStream(event, nodeStream);
});