feat(coach): mehrsprachiges TTS — locale durchreichen + Cartesia Sonic-3
TTS-Sprache war provider-übergreifend hart auf "de" verdrahtet, locale aus dem Request wurde ignoriert → arabischer Text wurde deutsch-phonetisch gesprochen.

- locale aus Body auslesen → Basis-Sprachcode an alle Provider
- Pro: Cartesia sonic-2 → sonic-3 (sonic-2 kann kein Arabisch; sonic-3 = 42 Sprachen)
- Legend: ElevenLabs language_code gesetzt (turbo_v2_5 multilingual, ar dabei)
- Google-Fallback: BCP-47-Map (ar→ar-XA etc.), de-Voice nur noch für de

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
b0315fd177
commit
f9d44a6754
@ -31,9 +31,10 @@ export default defineEventHandler(async (event) => {
|
|||||||
const user = await requireUser(event);
|
const user = await requireUser(event);
|
||||||
|
|
||||||
const body = await readBody(event);
|
const body = await readBody(event);
|
||||||
const { text, mode } = body as {
|
const { text, mode, locale } = body as {
|
||||||
text?: string;
|
text?: string;
|
||||||
mode?: "chat" | "sos" | "sos-continuation";
|
mode?: "chat" | "sos" | "sos-continuation";
|
||||||
|
locale?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!text?.trim()) {
|
if (!text?.trim()) {
|
||||||
@ -41,6 +42,9 @@ export default defineEventHandler(async (event) => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const trimmed = text.slice(0, 4096);
|
const trimmed = text.slice(0, 4096);
|
||||||
|
// i18n-Locale (z.B. "ar", "de-DE") → 2-Buchstaben-Basis-Sprachcode für die
|
||||||
|
// Provider. Ohne das sprach Lyra arabischen Text mit deutscher Stimme/Phonetik.
|
||||||
|
const lang = (locale ?? "de").split("-")[0].toLowerCase();
|
||||||
|
|
||||||
// ─── Load profile + plan ────────────────────────────────────────────────
|
// ─── Load profile + plan ────────────────────────────────────────────────
|
||||||
const db = usePrisma();
|
const db = usePrisma();
|
||||||
@ -76,15 +80,15 @@ export default defineEventHandler(async (event) => {
|
|||||||
// ─── Dispatch per provider ───────────────────────────────────────────────
|
// ─── Dispatch per provider ───────────────────────────────────────────────
|
||||||
switch (voiceCfg.provider) {
|
switch (voiceCfg.provider) {
|
||||||
case "google":
|
case "google":
|
||||||
return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan);
|
return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan, lang);
|
||||||
case "cartesia":
|
case "cartesia":
|
||||||
return await speakCartesia(event, trimmed, config, voiceCfg, user.id, plan);
|
return await speakCartesia(event, trimmed, config, voiceCfg, user.id, plan, lang);
|
||||||
case "elevenlabs":
|
case "elevenlabs":
|
||||||
return await speakElevenLabs(event, trimmed, mode, config, voiceCfg, user.id, plan, userLyraVoiceId);
|
return await speakElevenLabs(event, trimmed, mode, config, voiceCfg, user.id, plan, userLyraVoiceId, lang);
|
||||||
default: {
|
default: {
|
||||||
// Unknown provider in config — fallback to Google with warning
|
// Unknown provider in config — fallback to Google with warning
|
||||||
console.warn("[speak] unknown provider in plan-features:", voiceCfg.provider, "→ falling back to google");
|
console.warn("[speak] unknown provider in plan-features:", voiceCfg.provider, "→ falling back to google");
|
||||||
return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan);
|
return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan, lang);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -98,13 +102,23 @@ async function speakGoogle(
|
|||||||
voiceCfg: VoiceConfig,
|
voiceCfg: VoiceConfig,
|
||||||
userId: string,
|
userId: string,
|
||||||
plan: string,
|
plan: string,
|
||||||
|
lang: string,
|
||||||
) {
|
) {
|
||||||
const key = (config.googleApiKey as string) || process.env.GOOGLE_API_KEY || "";
|
const key = (config.googleApiKey as string) || process.env.GOOGLE_API_KEY || "";
|
||||||
if (!key) {
|
if (!key) {
|
||||||
throw createError({ statusCode: 503, message: "Google TTS API Key nicht konfiguriert" });
|
throw createError({ statusCode: 503, message: "Google TTS API Key nicht konfiguriert" });
|
||||||
}
|
}
|
||||||
|
|
||||||
const voiceName = voiceCfg.model ?? "de-DE-Neural2-F";
|
// Google-Stimmen sind sprachgebunden — eine de-DE-Stimme kann kein Arabisch.
|
||||||
|
// BCP-47-LanguageCode aus dem Basis-Locale ableiten; den konfigurierten
|
||||||
|
// Voice-Namen nur für Deutsch verwenden, sonst Google eine Default-Stimme
|
||||||
|
// der Zielsprache wählen lassen (name weglassen + ssmlGender).
|
||||||
|
const GOOGLE_LANG: Record<string, string> = {
|
||||||
|
de: "de-DE", en: "en-US", fr: "fr-FR", ar: "ar-XA",
|
||||||
|
tr: "tr-TR", es: "es-ES", pt: "pt-PT", it: "it-IT",
|
||||||
|
};
|
||||||
|
const languageCode = GOOGLE_LANG[lang] ?? "de-DE";
|
||||||
|
const voiceName = lang === "de" ? (voiceCfg.model ?? "de-DE-Neural2-F") : undefined;
|
||||||
|
|
||||||
const response = await fetch(
|
const response = await fetch(
|
||||||
`https://texttospeech.googleapis.com/v1/text:synthesize?key=${key}`,
|
`https://texttospeech.googleapis.com/v1/text:synthesize?key=${key}`,
|
||||||
@ -114,8 +128,8 @@ async function speakGoogle(
|
|||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
input: { text },
|
input: { text },
|
||||||
voice: {
|
voice: {
|
||||||
languageCode: "de-DE",
|
languageCode,
|
||||||
name: voiceName,
|
...(voiceName ? { name: voiceName } : {}),
|
||||||
ssmlGender: "FEMALE",
|
ssmlGender: "FEMALE",
|
||||||
},
|
},
|
||||||
audioConfig: {
|
audioConfig: {
|
||||||
@ -157,6 +171,7 @@ async function speakCartesia(
|
|||||||
voiceCfg: VoiceConfig,
|
voiceCfg: VoiceConfig,
|
||||||
userId: string,
|
userId: string,
|
||||||
plan: string,
|
plan: string,
|
||||||
|
lang: string,
|
||||||
) {
|
) {
|
||||||
const key = (config.cartesiaApiKey as string) || process.env.CARTESIA_API_KEY || "";
|
const key = (config.cartesiaApiKey as string) || process.env.CARTESIA_API_KEY || "";
|
||||||
if (!key) {
|
if (!key) {
|
||||||
@ -178,7 +193,7 @@ async function speakCartesia(
|
|||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
},
|
},
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
model_id: voiceCfg.model ?? "sonic-2",
|
model_id: voiceCfg.model ?? "sonic-3",
|
||||||
transcript: text,
|
transcript: text,
|
||||||
voice: { mode: "id", id: voiceId },
|
voice: { mode: "id", id: voiceId },
|
||||||
output_format: {
|
output_format: {
|
||||||
@ -186,7 +201,9 @@ async function speakCartesia(
|
|||||||
sample_rate: 22050,
|
sample_rate: 22050,
|
||||||
bit_rate: 64000,
|
bit_rate: 64000,
|
||||||
},
|
},
|
||||||
language: "de",
|
// Sonic-3 unterstützt 42 Sprachen inkl. ar — language aus User-Locale
|
||||||
|
// statt hardcoded "de", sonst klingt arabischer Text deutsch-phonetisch.
|
||||||
|
language: lang,
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -214,6 +231,7 @@ async function speakElevenLabs(
|
|||||||
userId: string,
|
userId: string,
|
||||||
plan: string,
|
plan: string,
|
||||||
userLyraVoiceId: string | null = null,
|
userLyraVoiceId: string | null = null,
|
||||||
|
lang: string = "de",
|
||||||
) {
|
) {
|
||||||
const key =
|
const key =
|
||||||
(config.elevenlabsApiKey as string) || process.env.ELEVENLABS_API_KEY || "";
|
(config.elevenlabsApiKey as string) || process.env.ELEVENLABS_API_KEY || "";
|
||||||
@ -244,6 +262,10 @@ async function speakElevenLabs(
|
|||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
text,
|
text,
|
||||||
model_id: modelId,
|
model_id: modelId,
|
||||||
|
// Turbo v2.5 ist multilingual (32 Sprachen inkl. ar) — dieselbe Stimme
|
||||||
|
// spricht die Zielsprache. language_code explizit setzen statt nur
|
||||||
|
// Auto-Detect, damit kurze Texte zuverlässig in der User-Sprache landen.
|
||||||
|
language_code: lang,
|
||||||
voice_settings: {
|
voice_settings: {
|
||||||
stability: 0.5,
|
stability: 0.5,
|
||||||
similarity_boost: 0.75,
|
similarity_boost: 0.75,
|
||||||
|
|||||||
@ -98,7 +98,7 @@ export const PLAN_LIMITS: Record<Exclude<Plan, "free">, PlanLimits> = {
|
|||||||
aiProvider: "groq",
|
aiProvider: "groq",
|
||||||
voice: {
|
voice: {
|
||||||
provider: "cartesia",
|
provider: "cartesia",
|
||||||
model: "sonic-2", // Cartesia Sonic-2 — ~75ms TTFT, native German, ~$4/1M chars
|
model: "sonic-3", // Cartesia Sonic-3 — 42 Sprachen inkl. Arabisch (sonic-2 konnte nur 15, kein ar)
|
||||||
dailyQuotaSeconds: 300, // 5 Minuten/Tag
|
dailyQuotaSeconds: 300, // 5 Minuten/Tag
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user