210 lines
7.1 KiB
TypeScript
210 lines
7.1 KiB
TypeScript
// Sentence-level TTS queue for SOS streaming.
//
// The caller (urge.tsx) creates a new queue per sendToLyra call and feeds it
// via `enqueue(sentence)` from the `onSentence` callback of streamSosLyra.
// Audio for a sentence is fetched as soon as it is enqueued, but playback is
// strictly sequential: if sentence n+1 arrives while n is still playing, it
// only starts once n's audio has finished (no overlapping speech).
//
// Lifecycle:
// - new SosTtsQueue({...}) → ready, nothing is playing
// - enqueue(s1) → fetch + play s1
// - enqueue(s2) while s1 is playing → s2 is pre-fetched and waits in the queue; it plays as soon as s1 is done
// - abort() → in-flight fetch cancelled, current sound stopped + unloaded, queue cleared
//
// State reporting via callbacks: onStart (the first sentence starts playing),
// onIdle (queue fully drained and nothing is playing any more). The UI layer
// can use these to trigger `setIsSpeaking`.
|
|
import { Audio } from 'expo-av';
|
|
import * as FileSystem from 'expo-file-system';
|
|
|
|
/** Connection settings used for every TTS request the queue issues. */
export type SosTtsFetchOpts = {
  /** Base URL; the endpoint path is appended to it verbatim. */
  apiBase: string;
  /** Token sent as `Authorization: Bearer <accessToken>`. */
  accessToken: string;
  /** Forwarded to the server as `locale` in the JSON request body. */
  locale: string;
  /**
   * Server path of the TTS endpoint; the OpenAI endpoint is used when omitted.
   * Allows A/B testing between /api/coach/speak-openai,
   * /api/coach/speak-gemini and /api/coach/speak-google.
   */
  endpoint?: string;
};
|
|
|
|
/** Fetch settings plus the optional state callbacks of a queue. */
export type SosTtsQueueOpts = SosTtsFetchOpts & {
  /** Called once, when the first sentence is picked up for playback. */
  onStart?: () => void;
  /** Called when the queue is empty and nothing is playing any more. */
  onIdle?: () => void;
  /**
   * Called when fetching or playing a single sentence failed. The queue keeps
   * running with the remaining sentences.
   */
  onError?: (err: unknown, sentence: string) => void;
};
|
|
|
|
/**
 * Matches emoji code points that should not be sent to the TTS engine.
 *
 * `Emoji_Component` also contains the ASCII keycap bases `0`-`9`, `#` and `*`.
 * The negative lookahead keeps those, otherwise every number in a sentence
 * ("breathe in for 4 seconds") would be removed before it is spoken. The
 * non-ASCII components (ZWJ, variation selector, enclosing keycap, skin tones,
 * regional indicators, tags) are still stripped.
 */
const EMOJI_RE = /(?![#*0-9])[\p{Extended_Pictographic}\p{Emoji_Component}]/gu;

/**
 * Prepares a text segment for speech synthesis.
 *
 * Removes emoji, collapses every whitespace run to a single space and trims
 * the result.
 *
 * @param text Raw sentence as produced by the streaming response.
 * @returns The cleaned sentence; an empty string when nothing speakable is left.
 */
function cleanForTts(text: string): string {
  const withoutEmoji = text.replace(EMOJI_RE, '');
  return withoutEmoji.replace(/\s+/g, ' ').trim();
}
|
|
|
|
/**
 * Voice mode sent to the server with every TTS request: `'sos'` for the
 * opening segment, `'sos-continuation'` for follow-up blocks.
 */
export type SosTtsMode =
  | 'sos'
  | 'sos-continuation';
|
|
|
|
/** One enqueued text segment together with its in-progress audio download. */
type QueueItem = {
  /** Cleaned text that is sent to the TTS endpoint. */
  text: string;
  /** Voice mode forwarded to the server for this segment. */
  mode: SosTtsMode;
  /** Cancels this item's download. */
  controller: AbortController;
  /**
   * The download is started at enqueue time, so by the time the item is due
   * for playback its audio is usually ready or nearly ready. This removes the
   * gap between items in hybrid mode. Resolves to `null` when no audio is
   * available for the item.
   */
  audioPromise: Promise<{ uri: string } | null>;
};
|
|
|
|
/**
 * Sequential text-to-speech player.
 *
 * Each enqueued segment starts downloading immediately, but segments are
 * played strictly one after another in enqueue order. Downloaded audio is
 * written to a temporary file in the cache directory, which is deleted again
 * once the segment has been played, skipped or aborted.
 *
 * A queue is single-use: after `abort()` it ignores all further `enqueue`
 * calls.
 *
 * NOTE(review): `onStart` fires only once per queue, whereas `onIdle` fires
 * every time the queue drains. If the next sentence is enqueued after the
 * previous one has already finished, `onIdle` is reported in between and no
 * second `onStart` follows. Confirm that the UI layer can cope with this.
 */
export class SosTtsQueue {
  private queue: QueueItem[] = [];
  private playing = false;
  private currentSound: Audio.Sound | null = null;
  /** Item taken off the queue by `tick()` that is still being fetched or played. */
  private currentItem: QueueItem | null = null;
  /** Settles the playback wait of the current sound so `abort()` can unblock `tick()`. */
  private finishPlayback: (() => void) | null = null;
  private aborted = false;
  private startedOnce = false;
  private readonly opts: SosTtsQueueOpts;

  constructor(opts: SosTtsQueueOpts) {
    this.opts = opts;
  }

  /**
   * Enqueue a text segment for TTS playback.
   *
   * The audio download starts immediately and runs in parallel to the playback
   * of earlier items, so the audio is usually ready when the item's turn comes.
   * Segments that are empty after cleaning, and calls made after `abort()`,
   * are ignored.
   *
   * @param sentence Raw text; emoji and redundant whitespace are removed first.
   * @param mode Default 'sos' (warm, empathic opening). Use 'sos-continuation'
   *             for follow-up blocks in hybrid mode so the server adapts the
   *             voice instructions and the boundary between blocks sounds softer.
   */
  enqueue(sentence: string, mode: SosTtsMode = 'sos'): void {
    if (this.aborted) return;
    const cleaned = cleanForTts(sentence);
    if (!cleaned) return;

    const controller = new AbortController();
    const audioPromise = this.fetchAudio(cleaned, mode, controller.signal).catch(
      (err: unknown) => {
        // A download cancelled via abort() is not a failure; only report real errors.
        if (!controller.signal.aborted) {
          this.opts.onError?.(err, cleaned);
        }
        return null;
      },
    );
    this.queue.push({ text: cleaned, mode, controller, audioPromise });
    void this.tick();
  }

  /**
   * Stops everything: cancels all downloads (including the one of the item
   * currently being processed), drops queued items, removes their temp files
   * and stops the sound that is playing. No callbacks fire for cancelled work.
   */
  abort(): void {
    this.aborted = true;

    // The current item has already been shifted out of `queue`, so it has to be
    // cancelled separately. Its temp file is cleaned up by tick().
    this.currentItem?.controller.abort();

    for (const item of this.queue) {
      item.controller.abort();
      // A pre-fetch that already completed has left a file behind.
      void item.audioPromise.then((audio) => {
        if (audio) {
          void SosTtsQueue.release(null, audio.uri);
        }
      });
    }
    this.queue = [];

    const sound = this.currentSound;
    this.currentSound = null;
    if (sound) {
      sound.setOnPlaybackStatusUpdate(null);
      sound.stopAsync().catch(() => {});
    }

    // Wake up tick(), which is waiting for the end of playback; it unloads the
    // sound and deletes the temp file. Without this the wait could hang forever
    // because the status callback has just been removed.
    this.finishPlayback?.();
    this.finishPlayback = null;
  }

  /** True while something is still pending (queued or currently playing). */
  isActive(): boolean {
    return !this.aborted && (this.playing || this.queue.length > 0);
  }

  /**
   * Processes the next queued item unless one is already being processed.
   * Re-invokes itself until the queue is empty, then reports `onIdle`.
   */
  private async tick(): Promise<void> {
    if (this.aborted || this.playing) return;
    const item = this.queue.shift();
    if (!item) return;
    this.playing = true;
    this.currentItem = item;

    if (!this.startedOnce) {
      this.startedOnce = true;
      this.opts.onStart?.();
    }

    let uri: string | null = null;
    let sound: Audio.Sound | null = null;
    try {
      const audio = await item.audioPromise;
      uri = audio?.uri ?? null;
      if (uri !== null && !this.aborted) {
        ({ sound } = await Audio.Sound.createAsync({ uri }, { shouldPlay: true }));
        // abort() may have been called while the sound was loading.
        if (!this.aborted) {
          this.currentSound = sound;
          await this.waitForPlaybackEnd(sound);
        }
      }
    } catch (err: unknown) {
      this.opts.onError?.(err, item.text);
    } finally {
      this.currentSound = null;
      this.currentItem = null;
      this.finishPlayback = null;
      this.playing = false;
      // Fire-and-forget so that cleanup does not delay the next sentence.
      void SosTtsQueue.release(sound, uri);

      if (!this.aborted) {
        if (this.queue.length > 0) {
          void this.tick();
        } else {
          this.opts.onIdle?.();
        }
      }
    }
  }

  /**
   * Resolves when `sound` has finished playing or the queue is aborted;
   * rejects when the player reports a playback error.
   */
  private waitForPlaybackEnd(sound: Audio.Sound): Promise<void> {
    return new Promise<void>((resolve, reject) => {
      this.finishPlayback = resolve;
      sound.setOnPlaybackStatusUpdate((status) => {
        if (this.aborted) {
          resolve();
          return;
        }
        if (!status.isLoaded) {
          // An error status would otherwise never settle the wait and stall the queue.
          if (status.error) {
            reject(new Error(`TTS playback failed: ${status.error}`));
          }
          return;
        }
        if (status.didJustFinish) {
          resolve();
        }
      });
    });
  }

  /** Unloads `sound` (if any) and afterwards deletes the temp file (if any). Never throws. */
  private static async release(sound: Audio.Sound | null, uri: string | null): Promise<void> {
    if (sound) {
      sound.setOnPlaybackStatusUpdate(null);
      await sound.unloadAsync().catch(() => {});
    }
    if (uri !== null) {
      await FileSystem.deleteAsync(uri, { idempotent: true }).catch(() => {});
    }
  }

  /** Base64-encodes `bytes`, converting in chunks to stay below argument-count limits. */
  private static toBase64(bytes: Uint8Array): string {
    const chunkSize = 0x8000;
    const chunks: string[] = [];
    for (let i = 0; i < bytes.length; i += chunkSize) {
      chunks.push(String.fromCharCode(...bytes.subarray(i, Math.min(i + chunkSize, bytes.length))));
    }
    return btoa(chunks.join(''));
  }

  /**
   * Requests the audio for `text` and stores it in a temp file.
   *
   * @returns The file URI, or `null` when `signal` was aborted along the way.
   * @throws Error when the server answers with a non-2xx status, the response
   *         contains no audio, or no cache directory is available.
   */
  private async fetchAudio(
    text: string,
    mode: SosTtsMode,
    signal: AbortSignal,
  ): Promise<{ uri: string } | null> {
    const endpoint = this.opts.endpoint ?? '/api/coach/speak-openai';
    const res = await fetch(`${this.opts.apiBase}${endpoint}`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${this.opts.accessToken}`,
      },
      body: JSON.stringify({ text, locale: this.opts.locale, mode }),
      signal,
    });
    if (signal.aborted) return null;
    if (!res.ok) {
      throw new Error(`TTS request failed with HTTP ${res.status} (${endpoint})`);
    }

    // /speak-google returns JSON { audio: "data:audio/mp3;base64,..." }.
    // /speak-openai (audio/mpeg) and /speak-gemini (audio/wav) return the body
    // as raw bytes, so one pipeline serves both.
    let base64: string;
    let ext: 'mp3' | 'wav';
    if (endpoint.endsWith('/speak-google')) {
      const payload = (await res.json()) as { audio?: unknown } | null;
      const dataUri = payload?.audio;
      const comma = typeof dataUri === 'string' ? dataUri.indexOf(',') : -1;
      if (typeof dataUri !== 'string' || comma === -1) {
        throw new Error('TTS response did not contain an audio data URI');
      }
      base64 = dataUri.slice(comma + 1);
      ext = 'mp3';
    } else {
      const buffer = await res.arrayBuffer();
      if (signal.aborted) return null;
      if (buffer.byteLength === 0) {
        throw new Error('TTS response body was empty');
      }
      base64 = SosTtsQueue.toBase64(new Uint8Array(buffer));
      ext = endpoint.endsWith('/speak-gemini') ? 'wav' : 'mp3';
    }
    if (signal.aborted) return null;

    const cacheDir = FileSystem.cacheDirectory;
    if (!cacheDir) {
      throw new Error('No cache directory available for TTS audio');
    }
    const tmpPath = `${cacheDir}sos-tts-q-${Date.now()}-${Math.random().toString(36).slice(2, 8)}.${ext}`;
    await FileSystem.writeAsStringAsync(tmpPath, base64, {
      encoding: FileSystem.EncodingType.Base64,
    });
    if (signal.aborted) {
      // Nobody will play this file any more, so do not leave it in the cache.
      await FileSystem.deleteAsync(tmpPath, { idempotent: true }).catch(() => {});
      return null;
    }
    return { uri: tmpPath };
  }
}
|