chahinebrini 685782b538 fix(coach): dynamische Sprache (Text-Detection + App-Locale-Fallback)
LLM-Prompt (message.post + sos-stream):
- LANG_INSTRUCTIONS Map raus, ersetzt durch dynamische Instruktion
  'Reply in {detectedFromUser} ... fallback: {appLang}'
- Lyra matcht jetzt die Sprache der letzten User-Message (per
  detectLang Unicode-Detection); App-Locale ist nur noch Fallback
- Instruktion doppelt eingehängt (Anfang + Ende des System-Prompts)
  gegen recency bias bei langen deutschen Prompts

TTS (speak dispatcher + speak-cartesia + speak-elevenlabs):
- Kein 'de'-Default mehr für language. detectLang(text, locale) leitet
  Sprache primär aus dem Antwort-Text ab (Arabic/Cyrillic/CJK/Turkish-
  Letters), Locale als Fallback
- Cartesia + ElevenLabs: language/language_code nur senden wenn
  ableitbar, sonst Provider auto-detect statt erzwungenem 'de'
- speak-cartesia: sonic-2 → sonic-3 (Multi-Lang, war beim Dispatcher-
  Fix gestern vergessen worden)
- Google: en-US neutraler Fallback statt de-DE-Bias

Neu: server/utils/detect-lang.ts
2026-05-31 00:12:40 +02:00

300 lines
11 KiB
TypeScript

import type { H3Event } from "h3";
import type { VoiceConfig } from "../../utils/plan-features";
import { getPlanLimits } from "../../utils/plan-features";
import { detectLang } from "../../utils/detect-lang";
import {
getRemainingVoiceQuota,
consumeVoiceQuota,
estimateAudioSeconds,
} from "../../db/voiceQuota";
/**
 * POST /api/coach/speak
 *
 * Plan-aware TTS dispatcher. The provider is read from the plan's voice
 * config (`getPlanLimits(plan).voice.provider`):
 *   Free   → Google Cloud TTS Neural2                          (60 s/day quota)
 *   Pro    → Cartesia Sonic (model from plan config, default
 *            "sonic-3")                                        (300 s/day quota)
 *   Legend → ElevenLabs Turbo v2.5                             (unlimited)
 *
 * Request body:
 *   {
 *     text: string;                                   // required, non-blank
 *     mode?: "chat" | "sos" | "sos-continuation";
 *     locale?: string;                                // app locale, language fallback
 *   }
 *
 * Response:
 *   audio/mpeg — on success
 *   { error: "voice_quota_exceeded", resetAt: string, plan: string } — 429
 *
 * Throws 400 when `text` is missing, not a string, or blank.
 *
 * Quota logic lives in server/db/voiceQuota.ts.
 * Provider implementations also live in server/api/coach/speak-*.post.ts but
 * are NOT called via HTTP redirect — the logic is inlined here to avoid
 * double-auth overhead and to keep the quota consumption next to the actual
 * provider call.
 */
export default defineEventHandler(async (event) => {
  const user = await requireUser(event);

  // The body is untrusted client input: it may be absent, a primitive, or
  // carry fields of the wrong type. Narrow everything explicitly so malformed
  // requests end in the 400 below instead of a TypeError (→ 500).
  const rawBody: unknown = await readBody(event);
  const body: Record<string, unknown> =
    rawBody !== null && typeof rawBody === "object"
      ? (rawBody as Record<string, unknown>)
      : {};
  const text = typeof body.text === "string" ? body.text : "";
  const locale = typeof body.locale === "string" ? body.locale : undefined;
  const rawMode = body.mode;
  const mode =
    rawMode === "chat" || rawMode === "sos" || rawMode === "sos-continuation"
      ? rawMode
      : undefined;

  if (!text.trim()) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }

  // Cap the input length. slice() counts UTF-16 code units, so the cut can
  // land between the two halves of a surrogate pair (e.g. an emoji); drop a
  // dangling high surrogate rather than sending malformed text upstream.
  const MAX_TEXT_LENGTH = 4096;
  let trimmed = text.slice(0, MAX_TEXT_LENGTH);
  if (text.length > MAX_TEXT_LENGTH) {
    const lastUnit = trimmed.charCodeAt(trimmed.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      trimmed = trimmed.slice(0, -1);
    }
  }

  // Dynamic language: detectLang derives it from the text first and falls
  // back to the app locale from the body. There is no "de" default any more —
  // when neither yields a language, the provider is left to auto-detect.
  const lang = detectLang(trimmed, locale);

  // ─── Load profile + plan ────────────────────────────────────────────────
  const db = usePrisma();
  const profile = await db.profile.findUnique({
    where: { id: user.id },
    select: { plan: true, lyraVoiceId: true },
  });
  const plan = (profile?.plan ?? "free").toLowerCase();
  // A custom Lyra voice is only honoured on the legend plan; every other plan
  // passes null so the provider falls back to its configured voice.
  const userLyraVoiceId = plan === "legend" ? (profile?.lyraVoiceId ?? null) : null;
  const limits = getPlanLimits(plan);
  const voiceCfg = limits.voice;

  // ─── Quota check ────────────────────────────────────────────────────────
  // NOTE(review): only an exact 0 blocks the request. How unlimited plans or
  // an over-consumed quota are represented is defined in db/voiceQuota.ts —
  // confirm before changing this comparison.
  const remaining = await getRemainingVoiceQuota(user.id, plan);
  if (remaining === 0) {
    // Reset timestamp reported to the client: the next UTC midnight.
    const resetAt = new Date();
    resetAt.setUTCDate(resetAt.getUTCDate() + 1);
    resetAt.setUTCHours(0, 0, 0, 0);
    setResponseStatus(event, 429);
    return {
      error: "voice_quota_exceeded",
      resetAt: resetAt.toISOString(),
      plan,
    };
  }

  const config = useRuntimeConfig();

  // ─── Dispatch per provider ───────────────────────────────────────────────
  switch (voiceCfg.provider) {
    case "google":
      return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan, lang);
    case "cartesia":
      return await speakCartesia(event, trimmed, config, voiceCfg, user.id, plan, lang);
    case "elevenlabs":
      return await speakElevenLabs(event, trimmed, mode, config, voiceCfg, user.id, plan, userLyraVoiceId, lang);
    default: {
      // Unknown provider in config — fall back to Google with a warning.
      console.warn("[speak] unknown provider in plan-features:", voiceCfg.provider, "→ falling back to google");
      return await speakGoogle(event, trimmed, config, voiceCfg, user.id, plan, lang);
    }
  }
});
// ─── Provider implementations ────────────────────────────────────────────────
/** Google Cloud TTS rejects requests whose input exceeds 5000 bytes. */
const GOOGLE_TTS_MAX_INPUT_BYTES = 5000;

/**
 * Base language (ISO 639-1) → BCP-47 code accepted by Google Cloud TTS.
 * A Map is used instead of a plain object so that inherited keys such as
 * "constructor" can never be mistaken for a configured language.
 */
const GOOGLE_TTS_LANGUAGE_CODES: ReadonlyMap<string, string> = new Map([
  ["de", "de-DE"], ["en", "en-US"], ["fr", "fr-FR"], ["ar", "ar-XA"],
  ["tr", "tr-TR"], ["es", "es-ES"], ["pt", "pt-PT"], ["it", "it-IT"],
  // Cyrillic / CJK languages. Without an entry these fall back to an en-US
  // voice, which cannot read the script. Entries are inert unless the
  // detector actually emits these codes.
  ["ru", "ru-RU"], ["uk", "uk-UA"], ["ja", "ja-JP"], ["ko", "ko-KR"],
  ["zh", "cmn-CN"],
]);

/**
 * Truncates `text` so that its UTF-8 encoding fits into `maxBytes`, cutting
 * only on code-point boundaries (never inside a surrogate pair).
 */
function clampToUtf8Bytes(text: string, maxBytes: number): string {
  if (Buffer.byteLength(text, "utf8") <= maxBytes) return text;
  let usedBytes = 0;
  let end = 0;
  // Iterating a string yields whole code points, so `end` always lands on a
  // boundary that is safe to slice at.
  for (const codePoint of text) {
    const size = Buffer.byteLength(codePoint, "utf8");
    if (usedBytes + size > maxBytes) break;
    usedBytes += size;
    end += codePoint.length;
  }
  return text.slice(0, end);
}

/**
 * Synthesizes `text` with Google Cloud Text-to-Speech and returns the MP3
 * bytes as the response body.
 *
 * @param event    Current request event; response headers are set on it.
 * @param text     Text to speak. Clamped to Google's 5000-byte input limit.
 * @param config   Runtime config; `googleApiKey` is read from it, with
 *                 `process.env.GOOGLE_API_KEY` as fallback.
 * @param voiceCfg Plan voice config; `model` is used as the Google voice
 *                 name, but only for German.
 * @param userId   User whose voice quota is charged.
 * @param plan     Plan name passed through to the quota helper.
 * @param lang     Detected language code or null. Region subtags and casing
 *                 are ignored ("de-DE" is treated as "de").
 * @returns Buffer containing the MP3 audio.
 * @throws 503 when no API key is configured; 502 when Google responds with an
 *         error or without audio content.
 */
async function speakGoogle(
  event: H3Event,
  text: string,
  config: ReturnType<typeof useRuntimeConfig>,
  voiceCfg: VoiceConfig,
  userId: string,
  plan: string,
  lang: string | null,
) {
  const key = (config.googleApiKey as string) || process.env.GOOGLE_API_KEY || "";
  if (!key) {
    throw createError({ statusCode: 503, message: "Google TTS API Key nicht konfiguriert" });
  }

  // Google voices are bound to a language — a de-DE voice cannot speak Arabic.
  // Derive the BCP-47 languageCode from the base language and use the
  // configured voice name only for German; for every other language the name
  // is omitted so Google picks a default voice of the target language
  // (selected via ssmlGender).
  const baseLang = lang ? (lang.trim().toLowerCase().split(/[-_]/)[0] ?? "") : "";
  // Google requires a languageCode — when neither detection nor locale yields
  // a known language, use a neutral en-US fallback instead of a de-DE bias.
  const languageCode = GOOGLE_TTS_LANGUAGE_CODES.get(baseLang) ?? "en-US";
  const voiceName = baseLang === "de" ? (voiceCfg.model ?? "de-DE-Neural2-F") : undefined;

  // The caller caps text by UTF-16 length, but Google's limit is in bytes;
  // non-Latin scripts need 2–4 bytes per character and would otherwise be
  // rejected outright.
  const synthText = clampToUtf8Bytes(text, GOOGLE_TTS_MAX_INPUT_BYTES);

  const response = await fetch(
    `https://texttospeech.googleapis.com/v1/text:synthesize?key=${encodeURIComponent(key)}`,
    {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        input: { text: synthText },
        voice: {
          languageCode,
          ...(voiceName ? { name: voiceName } : {}),
          ssmlGender: "FEMALE",
        },
        audioConfig: {
          audioEncoding: "MP3",
          speakingRate: 1.0,
          pitch: 0,
        },
      }),
    },
  );
  if (!response.ok) {
    // The error payload is best-effort diagnostics only; ignore parse failures.
    const err = await response.json().catch(() => ({}));
    console.error("[speak/google] error:", response.status, err);
    throw createError({ statusCode: 502, message: "Google TTS fehlgeschlagen" });
  }

  const result = (await response.json()) as { audioContent?: unknown } | null;
  const audioContent = result?.audioContent;
  if (typeof audioContent !== "string" || !audioContent) {
    throw createError({ statusCode: 502, message: "Google TTS: kein Audio zurückgegeben" });
  }

  // Charge the quota only after a successful synthesis, and only for the text
  // that was actually synthesized.
  await consumeVoiceQuota(userId, plan, estimateAudioSeconds(synthText));

  // Google returns the audio base64-encoded — decode it and send raw bytes.
  const audioBuffer = Buffer.from(audioContent, "base64");
  setHeader(event, "Content-Type", "audio/mpeg");
  setHeader(event, "Cache-Control", "no-store");
  setHeader(event, "Content-Length", String(audioBuffer.length));
  return audioBuffer;
}
/**
 * Synthesizes `text` via the Cartesia `/tts/bytes` endpoint and streams the
 * MP3 response straight through to the client.
 *
 * @param event    Current request event; headers and the stream are written to it.
 * @param text     Text to speak (sent as `transcript`).
 * @param config   Runtime config; `cartesiaApiKey` / `cartesiaVoiceId` are read
 *                 from it, with the matching environment variables as fallback.
 * @param voiceCfg Plan voice config; supplies `voiceId` and `model`.
 * @param userId   User whose voice quota is charged.
 * @param plan     Plan name passed through to the quota helper.
 * @param lang     Language code or null; `language` is only sent when set.
 * @throws 503 when no API key is configured; 502 when Cartesia responds with
 *         an error or without a body.
 */
async function speakCartesia(
  event: H3Event,
  text: string,
  config: ReturnType<typeof useRuntimeConfig>,
  voiceCfg: VoiceConfig,
  userId: string,
  plan: string,
  lang: string | null,
) {
  const apiKey = (config.cartesiaApiKey as string) || process.env.CARTESIA_API_KEY || "";
  if (!apiKey) {
    throw createError({ statusCode: 503, message: "Cartesia API Key nicht konfiguriert" });
  }

  // Voice precedence: plan config → runtime config → environment → built-in id.
  const voiceCandidates = [
    voiceCfg.voiceId,
    config.cartesiaVoiceId as string,
    process.env.CARTESIA_VOICE_ID,
  ];
  const resolvedVoice =
    voiceCandidates.find((candidate) => Boolean(candidate)) ||
    "b9de4a89-2257-424b-94c2-db18ba68c81a";

  const payload: Record<string, unknown> = {
    model_id: voiceCfg.model ?? "sonic-3",
    transcript: text,
    voice: { mode: "id", id: resolvedVoice },
    output_format: {
      container: "mp3",
      sample_rate: 22050,
      bit_rate: 64000,
    },
  };
  // Set the language explicitly only when one was derived; otherwise the
  // field is left out so the provider can auto-detect.
  if (lang) {
    payload.language = lang;
  }

  const cartesiaRes = await fetch("https://api.cartesia.ai/tts/bytes", {
    method: "POST",
    headers: {
      "X-API-Key": apiKey,
      "Cartesia-Version": "2024-11-13",
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  const usable = cartesiaRes.ok && cartesiaRes.body;
  if (!usable) {
    // Response text is diagnostics only; a failed read must not mask the 502.
    const detail = await cartesiaRes.text().catch(() => "");
    console.error("[speak/cartesia] error:", cartesiaRes.status, detail);
    throw createError({ statusCode: 502, message: "Cartesia TTS fehlgeschlagen" });
  }

  // Charge the quota once the provider has accepted the request.
  const seconds = estimateAudioSeconds(text);
  await consumeVoiceQuota(userId, plan, seconds);

  setHeader(event, "Content-Type", "audio/mpeg");
  setHeader(event, "Cache-Control", "no-store");

  // Bridge the web ReadableStream from fetch into a Node stream for h3.
  const nodeStream = await import("node:stream");
  return sendStream(event, nodeStream.Readable.fromWeb(cartesiaRes.body as never));
}
/**
 * Synthesizes `text` via the ElevenLabs streaming endpoint and streams the
 * MP3 response straight through to the client.
 *
 * @param event           Current request event; headers and the stream are written to it.
 * @param text            Text to speak.
 * @param _mode           Conversation mode; currently unused, kept so the
 *                        call signature stays stable.
 * @param config          Runtime config; `elevenlabsApiKey` / `elevenlabsVoiceId`
 *                        are read from it, with the matching environment
 *                        variables as fallback.
 * @param voiceCfg        Plan voice config; supplies `voiceId` and `model`.
 * @param userId          User whose voice quota is charged.
 * @param plan            Plan name passed through to the quota helper.
 * @param userLyraVoiceId User-selected voice; takes precedence over every
 *                        configured voice when set.
 * @param lang            Language code or null; `language_code` is only sent
 *                        when set.
 * @throws 503 when no API key is configured; 502 when ElevenLabs responds
 *         with an error or without a body.
 */
async function speakElevenLabs(
  event: H3Event,
  text: string,
  _mode: "chat" | "sos" | "sos-continuation" | undefined,
  config: ReturnType<typeof useRuntimeConfig>,
  voiceCfg: VoiceConfig,
  userId: string,
  plan: string,
  userLyraVoiceId: string | null = null,
  lang: string | null = null,
) {
  const key =
    (config.elevenlabsApiKey as string) || process.env.ELEVENLABS_API_KEY || "";
  if (!key) {
    throw createError({ statusCode: 503, message: "ElevenLabs API Key nicht konfiguriert" });
  }

  const ELEVENLABS_FALLBACK_VOICE = "kdmDKE6EkgrWrrykO9Qt"; // Alexandra
  // The user voice has highest priority (already plan-filtered by the caller).
  const voiceId =
    userLyraVoiceId ||
    voiceCfg.voiceId ||
    (config.elevenlabsVoiceId as string) ||
    process.env.ELEVENLABS_VOICE_ID ||
    ELEVENLABS_FALLBACK_VOICE;
  const modelId = voiceCfg.model ?? "eleven_turbo_v2_5";

  // The voice id may originate from the user profile, so it is percent-encoded
  // before being placed in the path — a stray "/" or "?" must not be able to
  // redirect the request to a different endpoint.
  const endpoint = new URL(
    `https://api.elevenlabs.io/v1/text-to-speech/${encodeURIComponent(voiceId)}/stream`,
  );
  endpoint.searchParams.set("optimize_streaming_latency", "4");
  // output_format is a QUERY parameter of this endpoint. Sent inside the JSON
  // body it is not applied and the API falls back to its default format.
  endpoint.searchParams.set("output_format", "mp3_22050_32");

  const upstream = await fetch(endpoint.toString(), {
    method: "POST",
    headers: {
      "xi-api-key": key,
      "Content-Type": "application/json",
      Accept: "audio/mpeg",
    },
    body: JSON.stringify({
      text,
      model_id: modelId,
      // Turbo v2.5 is multilingual — the same voice speaks the target
      // language. language_code is only set when it could be derived from
      // detection/locale; otherwise ElevenLabs is left to auto-detect rather
      // than being forced onto a wrong code.
      ...(lang ? { language_code: lang } : {}),
      voice_settings: {
        stability: 0.5,
        similarity_boost: 0.75,
        style: 0.3,
        use_speaker_boost: true,
      },
    }),
  });
  if (!upstream.ok || !upstream.body) {
    // Response text is diagnostics only; a failed read must not mask the 502.
    const err = await upstream.text().catch(() => "");
    console.error("[speak/elevenlabs] error:", upstream.status, err);
    throw createError({ statusCode: 502, message: "ElevenLabs TTS fehlgeschlagen" });
  }

  // Called for every plan for symmetry with the other providers.
  // NOTE(review): for the unlimited legend plan this is expected to be a
  // no-op — confirm in db/voiceQuota.ts.
  await consumeVoiceQuota(userId, plan, estimateAudioSeconds(text));

  setHeader(event, "Content-Type", "audio/mpeg");
  setHeader(event, "Cache-Control", "no-store");
  // Bridge the web ReadableStream from fetch into a Node stream for h3.
  const { Readable } = await import("node:stream");
  return sendStream(event, Readable.fromWeb(upstream.body as never));
}