LLM-Prompt (message.post + sos-stream):
- LANG_INSTRUCTIONS Map raus, ersetzt durch dynamische Instruktion
'Reply in {detectedFromUser} ... fallback: {appLang}'
- Lyra matcht jetzt die Sprache der letzten User-Message (per
detectLang Unicode-Detection); App-Locale ist nur noch Fallback
- Instruktion doppelt eingehängt (Anfang + Ende des System-Prompts)
gegen recency bias bei langen deutschen Prompts
TTS (speak dispatcher + speak-cartesia + speak-elevenlabs):
- Kein 'de'-Default mehr für language. detectLang(text, locale) leitet
Sprache primär aus dem Antwort-Text ab (Arabic/Cyrillic/CJK/Turkish-
Letters), Locale als Fallback
- Cartesia + ElevenLabs: language/language_code nur senden wenn
ableitbar, sonst Provider auto-detect statt erzwungenem 'de'
- speak-cartesia: sonic-2 → sonic-3 (Multi-Lang, war beim Dispatcher-
Fix gestern vergessen worden)
- Google: en-US neutraler Fallback statt de-DE-Bias
Neu: server/utils/detect-lang.ts
105 lines
3.5 KiB
TypeScript
105 lines
3.5 KiB
TypeScript
/**
|
|
* POST /api/coach/speak-elevenlabs
|
|
* ElevenLabs eleven_turbo_v2_5 — voice via runtimeConfig.elevenlabsVoiceId
|
|
* (default: Alexandra `kdmDKE6EkgrWrrykO9Qt` als Fallback wenn unset).
|
|
*
|
|
* Returns audio/mpeg. Voice ist deterministisch konstant über mehrere Calls
|
|
* — identisch zu Gemini-Verhalten, kein Mode-Switch wie bei gpt-4o-mini-tts.
|
|
*/
|
|
import { detectLang } from "../../utils/detect-lang";
|
|
|
|
// Voice ID used when neither runtimeConfig.elevenlabsVoiceId nor
// process.env.ELEVENLABS_VOICE_ID provides a value.
const FALLBACK_VOICE_ID = "kdmDKE6EkgrWrrykO9Qt"; // Alexandra
|
|
|
|
/**
 * Text-to-speech route backed by the ElevenLabs streaming endpoint.
 *
 * Request body: `{ text: string; locale?: string }`.
 *
 * Responses:
 * - 400 when `text` is missing, not a string, or blank.
 * - 503 when no ElevenLabs API key is configured.
 * - 502 when ElevenLabs answers with a non-2xx status or without a body.
 * - Otherwise the upstream MP3 stream, sent as `audio/mpeg` with
 *   `Cache-Control: no-store`.
 */
export default defineEventHandler(async (event) => {
  await requireUser(event);

  // readBody() yields undefined for an empty request and arbitrary JSON
  // otherwise, so check the shape at runtime instead of trusting a type
  // assertion. A malformed payload must end in the 400 below, not in a
  // TypeError (-> 500) from destructuring or from calling .trim().
  const body: unknown = await readBody(event);
  const payload =
    typeof body === "object" && body !== null
      ? (body as { text?: unknown; locale?: unknown })
      : {};
  const text = typeof payload.text === "string" ? payload.text : "";
  const locale =
    typeof payload.locale === "string" ? payload.locale : undefined;

  if (!text.trim()) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  // Derive the language dynamically (text script > locale hint > nothing)
  // instead of hard-coding "de" — otherwise Arabic text would be spoken with
  // German phonetics.
  const lang = detectLang(text, locale);

  const config = useRuntimeConfig();
  // Fallback chain: runtimeConfig (Nuxt build time) -> process.env (injected at
  // runtime via Infisical at pm2 start). Ensures a key is still found when
  // Nuxt's runtimeConfig inflation did not bundle the process.env value.
  // `||` is deliberate: an empty string must fall through to the next source.
  const key =
    (config.elevenlabsApiKey as string) || process.env.ELEVENLABS_API_KEY || "";
  const voiceId =
    (config.elevenlabsVoiceId as string) ||
    process.env.ELEVENLABS_VOICE_ID ||
    FALLBACK_VOICE_ID;

  // Diagnostic only: logs whether a key is present and its length, never the
  // key itself.
  console.log(
    "[speak-elevenlabs] cfg-key:",
    !!config.elevenlabsApiKey,
    "env-key:",
    !!process.env.ELEVENLABS_API_KEY,
    "key-len:",
    key.length,
    "voice:",
    voiceId,
  );

  if (!key) {
    throw createError({
      statusCode: 503,
      message: "ELEVENLABS_API_KEY nicht konfiguriert",
    });
  }

  console.log("[speak-elevenlabs] CALL recv, text-len=", text.length, "voice=", voiceId);

  // /stream endpoint + optimize_streaming_latency=4 (maximum latency
  // optimisation, marginally lower quality). ElevenLabs sends the first bytes
  // after ~200-300ms instead of 600-1000ms on the non-stream endpoint.
  //
  // output_format is a query parameter of the ElevenLabs API; placed in the
  // JSON body it is not applied, so it is sent in the query string here.
  const query = new URLSearchParams({
    optimize_streaming_latency: "4",
    output_format: "mp3_22050_32",
  });
  // The voice ID comes from configuration; escape it so an unexpected
  // character cannot alter the request path.
  const url = `https://api.elevenlabs.io/v1/text-to-speech/${encodeURIComponent(
    voiceId,
  )}/stream?${query.toString()}`;

  const upstream = await fetch(url, {
    method: "POST",
    headers: {
      "xi-api-key": key,
      "Content-Type": "application/json",
      Accept: "audio/mpeg",
    },
    body: JSON.stringify({
      // Cap the amount of text forwarded to the provider.
      text: text.slice(0, 4096),
      // Turbo v2.5: ~50% faster than multilingual_v2 at marginally lower
      // quality — the trade-off is worth it for SOS (latency > studio polish).
      model_id: "eleven_turbo_v2_5",
      // Send language_code only when it could be derived from detection or
      // locale — otherwise let ElevenLabs auto-detect.
      ...(lang ? { language_code: lang } : {}),
      voice_settings: {
        stability: 0.5,
        similarity_boost: 0.75,
        style: 0.3,
        use_speaker_boost: true,
      },
    }),
  });

  if (!upstream.ok || !upstream.body) {
    // Best effort: the error body is only used for logging.
    const err = await upstream.text().catch(() => "");
    console.error("[speak-elevenlabs] error:", upstream.status, err);
    throw createError({
      statusCode: 502,
      message: "ElevenLabs TTS fehlgeschlagen",
    });
  }

  setHeader(event, "Content-Type", "audio/mpeg");
  setHeader(event, "Cache-Control", "no-store");

  // Convert the WHATWG ReadableStream from fetch into a Node stream so it can
  // be handed to sendStream.
  const { Readable } = await import("node:stream");
  const nodeStream = Readable.fromWeb(upstream.body as never);
  return sendStream(event, nodeStream);
});
|