diff --git a/backend/server/api/coach/speak.post.ts b/backend/server/api/coach/speak.post.ts index 0ee89c9..d2b3430 100644 --- a/backend/server/api/coach/speak.post.ts +++ b/backend/server/api/coach/speak.post.ts @@ -31,9 +31,10 @@ export default defineEventHandler(async (event) => { const user = await requireUser(event); const body = await readBody(event); - const { text, mode } = body as { + const { text, mode, locale } = body as { text?: string; mode?: "chat" | "sos" | "sos-continuation"; + locale?: string; }; if (!text?.trim()) { @@ -41,6 +42,9 @@ export default defineEventHandler(async (event) => { } const trimmed = text.slice(0, 4096); + // i18n-Locale (z.B. "ar", "de-DE") → 2-Buchstaben-Basis-Sprachcode für die + // Provider. Ohne das sprach Lyra arabischen Text mit deutscher Stimme/Phonetik. + const lang = (locale ?? "de").split("-")[0].toLowerCase(); // ─── Load profile + plan ──────────────────────────────────────────────── const db = usePrisma(); @@ -76,15 +80,15 @@ export default defineEventHandler(async (event) => { // ─── Dispatch per provider ─────────────────────────────────────────────── switch (voiceCfg.provider) { case "google": - return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan); + return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan, lang); case "cartesia": - return await speakCartesia(event, trimmed, config, voiceCfg, user.id, plan); + return await speakCartesia(event, trimmed, config, voiceCfg, user.id, plan, lang); case "elevenlabs": - return await speakElevenLabs(event, trimmed, mode, config, voiceCfg, user.id, plan, userLyraVoiceId); + return await speakElevenLabs(event, trimmed, mode, config, voiceCfg, user.id, plan, userLyraVoiceId, lang); default: { // Unknown provider in config — fallback to Google with warning console.warn("[speak] unknown provider in plan-features:", voiceCfg.provider, "→ falling back to google"); - return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan); + return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan, lang); } } }); @@ -98,13 +102,23 @@ async function speakGoogle( voiceCfg: VoiceConfig, userId: string, plan: string, + lang: string, ) { const key = (config.googleApiKey as string) || process.env.GOOGLE_API_KEY || ""; if (!key) { throw createError({ statusCode: 503, message: "Google TTS API Key nicht konfiguriert" }); } - const voiceName = voiceCfg.model ?? "de-DE-Neural2-F"; + // Google-Stimmen sind sprachgebunden — eine de-DE-Stimme kann kein Arabisch. + // BCP-47-LanguageCode aus dem Basis-Locale ableiten; den konfigurierten + // Voice-Namen nur für Deutsch verwenden, sonst Google eine Default-Stimme + // der Zielsprache wählen lassen (name weglassen + ssmlGender). + const GOOGLE_LANG: Record = { + de: "de-DE", en: "en-US", fr: "fr-FR", ar: "ar-XA", + tr: "tr-TR", es: "es-ES", pt: "pt-PT", it: "it-IT", + }; + const languageCode = GOOGLE_LANG[lang] ?? "de-DE"; + const voiceName = lang === "de" ? (voiceCfg.model ?? "de-DE-Neural2-F") : undefined; const response = await fetch( `https://texttospeech.googleapis.com/v1/text:synthesize?key=${key}`, @@ -114,8 +128,8 @@ async function speakGoogle( body: JSON.stringify({ input: { text }, voice: { - languageCode: "de-DE", - name: voiceName, + languageCode, + ...(voiceName ? { name: voiceName } : {}), ssmlGender: "FEMALE", }, audioConfig: { @@ -157,6 +171,7 @@ async function speakCartesia( voiceCfg: VoiceConfig, userId: string, plan: string, + lang: string, ) { const key = (config.cartesiaApiKey as string) || process.env.CARTESIA_API_KEY || ""; if (!key) { @@ -178,7 +193,7 @@ async function speakCartesia( "Content-Type": "application/json", }, body: JSON.stringify({ - model_id: voiceCfg.model ?? "sonic-2", + model_id: voiceCfg.model ?? "sonic-3", transcript: text, voice: { mode: "id", id: voiceId }, output_format: { @@ -186,7 +201,9 @@ async function speakCartesia( sample_rate: 22050, bit_rate: 64000, }, - language: "de", + // Sonic-3 unterstützt 42 Sprachen inkl. ar — language aus User-Locale + // statt hardcoded "de", sonst klingt arabischer Text deutsch-phonetisch. + language: lang, }), }); @@ -214,6 +231,7 @@ async function speakElevenLabs( userId: string, plan: string, userLyraVoiceId: string | null = null, + lang: string = "de", ) { const key = (config.elevenlabsApiKey as string) || process.env.ELEVENLABS_API_KEY || ""; @@ -244,6 +262,10 @@ async function speakElevenLabs( body: JSON.stringify({ text, model_id: modelId, + // Turbo v2.5 ist multilingual (32 Sprachen inkl. ar) — dieselbe Stimme + // spricht die Zielsprache. language_code explizit setzen statt nur + // Auto-Detect, damit kurze Texte zuverlässig in der User-Sprache landen. + language_code: lang, voice_settings: { stability: 0.5, similarity_boost: 0.75, diff --git a/backend/server/utils/plan-features.ts b/backend/server/utils/plan-features.ts index 855ee3b..22f1a04 100644 --- a/backend/server/utils/plan-features.ts +++ b/backend/server/utils/plan-features.ts @@ -98,7 +98,7 @@ export const PLAN_LIMITS: Record, PlanLimits> = { aiProvider: "groq", voice: { provider: "cartesia", - model: "sonic-2", // Cartesia Sonic-2 — ~75ms TTFT, native German, ~$4/1M chars + model: "sonic-3", // Cartesia Sonic-3 — 42 Sprachen inkl. Arabisch (sonic-2 konnte nur 15, kein ar) dailyQuotaSeconds: 300, // 5 Minuten/Tag }, },