import type { H3Event } from "h3";
import type { VoiceConfig } from "../../utils/plan-features";
import { getPlanLimits } from "../../utils/plan-features";
import {
  getRemainingVoiceQuota,
  consumeVoiceQuota,
  estimateAudioSeconds,
} from "../../db/voiceQuota";

/** Shape of the object returned by `useRuntimeConfig()`. */
type RuntimeConfig = ReturnType<typeof useRuntimeConfig>;

/** Values accepted for the optional `mode` field of the request body. */
type SpeakMode = "chat" | "sos" | "sos-continuation";

/** Upper bound on the number of UTF-16 code units forwarded to a TTS provider. */
const MAX_TEXT_LENGTH = 4096;

/**
 * POST /api/coach/speak
 *
 * Plan-aware TTS dispatcher:
 *   Free   → Google Cloud TTS Neural2   (60 s/day quota)
 *   Pro    → Cartesia Sonic-2           (300 s/day quota)
 *   Legend → ElevenLabs Turbo v2.5      (unlimited)
 *
 * Request body:
 *   { text: string; mode?: "chat" | "sos" | "sos-continuation" }
 *
 * Response:
 *   audio/mpeg stream — on success
 *   { error: "voice_quota_exceeded", resetAt: string, plan: string } — 429
 *
 * Errors:
 *   400 — `text` is missing, not a string, or whitespace-only
 *   502 — the upstream TTS provider failed or returned no audio
 *   503 — the API key for the selected provider is not configured
 *
 * Quota logic lives in server/db/voiceQuota.ts.
 * Provider implementations live in server/api/coach/speak-*.post.ts but are
 * NOT called via HTTP redirect — logic is inlined here to avoid double-auth
 * overhead and to keep the quota consumption next to the actual provider call.
 */
export default defineEventHandler(async (event) => {
  const user = await requireUser(event);

  // Treat a missing or non-object body as empty so the request is rejected
  // with a 400 below instead of a TypeError while destructuring.
  const rawBody: unknown = await readBody(event);
  const { text, mode } = (
    rawBody !== null && typeof rawBody === "object" ? rawBody : {}
  ) as { text?: unknown; mode?: SpeakMode };

  if (typeof text !== "string" || !text.trim()) {
    throw createError({ statusCode: 400, message: "text fehlt" });
  }
  // Only the length is capped; surrounding whitespace is forwarded unchanged.
  const clipped = text.slice(0, MAX_TEXT_LENGTH);

  // ─── Load profile + plan ────────────────────────────────────────────────
  const db = usePrisma();
  const profile = await db.profile.findUnique({
    where: { id: user.id },
    select: { plan: true, lyraVoiceId: true },
  });

  const plan = (profile?.plan ?? "free").toLowerCase();
  // The user's custom voice only takes effect on the legend plan; the plan
  // filter is applied here, so speakElevenLabs receives null for other plans.
  const userLyraVoiceId =
    plan === "legend" ? (profile?.lyraVoiceId ?? null) : null;

  const limits = getPlanLimits(plan);
  const voiceCfg = limits.voice;

  // ─── Quota check ────────────────────────────────────────────────────────
  // NOTE(review): only an exact 0 blocks the request. The value returned for
  // unlimited plans is defined in db/voiceQuota.ts (not visible here), so the
  // comparison is intentionally left as-is — confirm negatives cannot occur.
  const remaining = await getRemainingVoiceQuota(user.id, plan);
  if (remaining === 0) {
    setResponseStatus(event, 429);
    return {
      error: "voice_quota_exceeded",
      resetAt: nextUtcMidnight().toISOString(),
      plan,
    };
  }

  const config = useRuntimeConfig();

  // ─── Dispatch per provider ──────────────────────────────────────────────
  switch (voiceCfg.provider) {
    case "google":
      return await speakGoogle(event, clipped, config, voiceCfg, user.id, plan);
    case "cartesia":
      return await speakCartesia(event, clipped, config, voiceCfg, user.id, plan);
    case "elevenlabs":
      return await speakElevenLabs(
        event,
        clipped,
        mode,
        config,
        voiceCfg,
        user.id,
        plan,
        userLyraVoiceId,
      );
    default: {
      // Unknown provider in config — fall back to Google with a warning.
      // NOTE(review): voiceCfg.model of the unknown provider is forwarded and
      // used by speakGoogle as the Google voice name — confirm this is intended.
      console.warn(
        "[speak] unknown provider in plan-features:",
        voiceCfg.provider,
        "→ falling back to google",
      );
      return await speakGoogle(event, clipped, config, voiceCfg, user.id, plan);
    }
  }
});

// ─── Shared helpers ──────────────────────────────────────────────────────────

/** Returns the first UTC midnight strictly after `now`. */
function nextUtcMidnight(now: Date = new Date()): Date {
  const reset = new Date(now.getTime());
  reset.setUTCDate(reset.getUTCDate() + 1);
  reset.setUTCHours(0, 0, 0, 0);
  return reset;
}

/**
 * Picks the provider API key from runtime config, falling back to the given
 * environment value.
 *
 * @throws 503 error carrying `missingMessage` when neither source has a key.
 */
function requireApiKey(
  configured: unknown,
  fromEnv: string | undefined,
  missingMessage: string,
): string {
  const key = (typeof configured === "string" ? configured : "") || fromEnv || "";
  if (!key) {
    throw createError({ statusCode: 503, message: missingMessage });
  }
  return key;
}

/** Sets the audio response headers and pipes a web stream to the client. */
async function sendMp3Stream(event: H3Event, body: ReadableStream<Uint8Array>) {
  setHeader(event, "Content-Type", "audio/mpeg");
  setHeader(event, "Cache-Control", "no-store");

  const { Readable } = await import("node:stream");
  // The cast bridges the global ReadableStream type and the node:stream/web
  // type expected by Readable.fromWeb.
  return sendStream(event, Readable.fromWeb(body as never));
}

// ─── Provider implementations ────────────────────────────────────────────────

/**
 * Synthesizes `text` with Google Cloud TTS and returns the MP3 bytes.
 *
 * Quota is consumed only after Google returned audio.
 *
 * @param event    Current request event; response headers are set on it.
 * @param text     Text to synthesize (already length-capped by the caller).
 * @param config   Runtime config; `googleApiKey` is read from it.
 * @param voiceCfg Plan voice config; `model` is used as the Google voice name.
 * @param userId   User whose quota is charged.
 * @param plan     Lower-cased plan name, forwarded to the quota helper.
 * @returns The decoded MP3 audio as a Buffer.
 * @throws 503 when no API key is configured, 502 when Google fails or
 *         returns no audio content.
 */
async function speakGoogle(
  event: H3Event,
  text: string,
  config: RuntimeConfig,
  voiceCfg: VoiceConfig,
  userId: string,
  plan: string,
) {
  const key = requireApiKey(
    config.googleApiKey,
    process.env.GOOGLE_API_KEY,
    "Google TTS API Key nicht konfiguriert",
  );

  const voiceName = voiceCfg.model ?? "de-DE-Neural2-F";

  // The key travels in a header rather than the query string so it does not
  // end up in URLs that get logged along the way.
  const response = await fetch(
    "https://texttospeech.googleapis.com/v1/text:synthesize",
    {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": key,
      },
      body: JSON.stringify({
        input: { text },
        voice: {
          languageCode: "de-DE",
          name: voiceName,
          ssmlGender: "FEMALE",
        },
        audioConfig: {
          audioEncoding: "MP3",
          speakingRate: 1.0,
          pitch: 0,
        },
      }),
    },
  );

  if (!response.ok) {
    const err = await response.json().catch(() => ({}));
    console.error("[speak/google] error:", response.status, err);
    throw createError({ statusCode: 502, message: "Google TTS fehlgeschlagen" });
  }

  const result = (await response.json()) as { audioContent?: unknown } | null;
  const audioContent = result?.audioContent;
  if (typeof audioContent !== "string" || !audioContent) {
    throw createError({
      statusCode: 502,
      message: "Google TTS: kein Audio zurückgegeben",
    });
  }

  await consumeVoiceQuota(userId, plan, estimateAudioSeconds(text));

  // Google returns base64 — decode it and send the raw bytes.
  const audioBuffer = Buffer.from(audioContent, "base64");

  setHeader(event, "Content-Type", "audio/mpeg");
  setHeader(event, "Cache-Control", "no-store");
  setHeader(event, "Content-Length", String(audioBuffer.length));

  return audioBuffer;
}

/**
 * Synthesizes `text` with Cartesia and streams the MP3 response to the client.
 *
 * Voice id precedence: plan config → runtime config → CARTESIA_VOICE_ID →
 * built-in fallback. Quota is consumed once Cartesia answered successfully,
 * before the stream is forwarded.
 *
 * @param event    Current request event; the audio is streamed on it.
 * @param text     Text to synthesize (already length-capped by the caller).
 * @param config   Runtime config; `cartesiaApiKey` / `cartesiaVoiceId` are read.
 * @param voiceCfg Plan voice config; supplies `voiceId` and `model`.
 * @param userId   User whose quota is charged.
 * @param plan     Lower-cased plan name, forwarded to the quota helper.
 * @throws 503 when no API key is configured, 502 when Cartesia fails or
 *         returns no body.
 */
async function speakCartesia(
  event: H3Event,
  text: string,
  config: RuntimeConfig,
  voiceCfg: VoiceConfig,
  userId: string,
  plan: string,
) {
  const key = requireApiKey(
    config.cartesiaApiKey,
    process.env.CARTESIA_API_KEY,
    "Cartesia API Key nicht konfiguriert",
  );

  const CARTESIA_FALLBACK_VOICE = "b9de4a89-2257-424b-94c2-db18ba68c81a";
  const voiceId =
    voiceCfg.voiceId ||
    (config.cartesiaVoiceId as string) ||
    process.env.CARTESIA_VOICE_ID ||
    CARTESIA_FALLBACK_VOICE;

  const upstream = await fetch("https://api.cartesia.ai/tts/bytes", {
    method: "POST",
    headers: {
      "X-API-Key": key,
      "Cartesia-Version": "2024-11-13",
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model_id: voiceCfg.model ?? "sonic-2",
      transcript: text,
      voice: { mode: "id", id: voiceId },
      output_format: {
        container: "mp3",
        sample_rate: 22050,
        bit_rate: 64000,
      },
      language: "de",
    }),
  });

  if (!upstream.ok || !upstream.body) {
    const err = await upstream.text().catch(() => "");
    console.error("[speak/cartesia] error:", upstream.status, err);
    throw createError({ statusCode: 502, message: "Cartesia TTS fehlgeschlagen" });
  }

  await consumeVoiceQuota(userId, plan, estimateAudioSeconds(text));

  return sendMp3Stream(event, upstream.body);
}

/**
 * Synthesizes `text` with ElevenLabs and streams the MP3 response to the client.
 *
 * Voice id precedence: user voice → plan config → runtime config →
 * ELEVENLABS_VOICE_ID → built-in fallback.
 *
 * @param event           Current request event; the audio is streamed on it.
 * @param text            Text to synthesize (already length-capped by the caller).
 * @param _mode           Request mode; accepted for interface parity, currently unused.
 * @param config          Runtime config; `elevenlabsApiKey` / `elevenlabsVoiceId` are read.
 * @param voiceCfg        Plan voice config; supplies `voiceId` and `model`.
 * @param userId          User whose quota is charged.
 * @param plan            Lower-cased plan name, forwarded to the quota helper.
 * @param userLyraVoiceId User-specific voice id, or null when none applies.
 * @throws 503 when no API key is configured, 502 when ElevenLabs fails or
 *         returns no body.
 */
async function speakElevenLabs(
  event: H3Event,
  text: string,
  _mode: SpeakMode | undefined,
  config: RuntimeConfig,
  voiceCfg: VoiceConfig,
  userId: string,
  plan: string,
  userLyraVoiceId: string | null = null,
) {
  const key = requireApiKey(
    config.elevenlabsApiKey,
    process.env.ELEVENLABS_API_KEY,
    "ElevenLabs API Key nicht konfiguriert",
  );

  const ELEVENLABS_FALLBACK_VOICE = "kdmDKE6EkgrWrrykO9Qt"; // Alexandra
  // The user's voice has the highest priority (already plan-filtered by the caller).
  const voiceId =
    userLyraVoiceId ||
    voiceCfg.voiceId ||
    (config.elevenlabsVoiceId as string) ||
    process.env.ELEVENLABS_VOICE_ID ||
    ELEVENLABS_FALLBACK_VOICE;

  const modelId = voiceCfg.model ?? "eleven_turbo_v2_5";

  // The voice id can originate from a stored profile value, so it is encoded
  // before being placed into the URL path.
  const upstream = await fetch(
    `https://api.elevenlabs.io/v1/text-to-speech/${encodeURIComponent(voiceId)}/stream?optimize_streaming_latency=4`,
    {
      method: "POST",
      headers: {
        "xi-api-key": key,
        "Content-Type": "application/json",
        Accept: "audio/mpeg",
      },
      body: JSON.stringify({
        text,
        model_id: modelId,
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.75,
          style: 0.3,
          use_speaker_boost: true,
        },
        output_format: "mp3_22050_32",
      }),
    },
  );

  if (!upstream.ok || !upstream.body) {
    const err = await upstream.text().catch(() => "");
    console.error("[speak/elevenlabs] error:", upstream.status, err);
    throw createError({ statusCode: 502, message: "ElevenLabs TTS fehlgeschlagen" });
  }

  // Legend is documented as unlimited; consumeVoiceQuota is expected to be a
  // no-op for it (implemented in db/voiceQuota.ts, not visible here).
  await consumeVoiceQuota(userId, plan, estimateAudioSeconds(text));

  return sendMp3Stream(event, upstream.body);
}