// rebreak-monorepo/apps/rebreak-native/lib/sosTtsQueue.ts

// Sentence-level TTS queue for SOS streaming.
//
// The caller (urge.tsx) creates a new queue per sendToLyra call and feeds it
// via `enqueue(sentence)` from the `onSentence` callback of streamSosLyra.
// Audio for each sentence is fetched as soon as it is enqueued; playback is
// strictly sequential — if n+1 arrives while n is still playing, n+1 is only
// played once n's audio has finished (no overlapping speech).
//
// Lifecycle:
//  - new SosTtsQueue({...}) → ready, nothing is playing
//  - enqueue(s1) → fetch + play s1
//  - enqueue(s2) while s1 is playing → s2 is fetched right away and waits in
//    the queue; it is played as soon as s1 has finished
//  - abort() → fetches of queued items are cancelled, the current sound is
//    stopped + unloaded, the queue is cleared
//
// State reporting via callbacks: onStart (the first item is taken off the
// queue for playback), onIdle (queue fully drained + nothing playing anymore).
// The UI layer can use these to drive `setIsSpeaking`.
import { Audio } from 'expo-av';
import * as FileSystem from 'expo-file-system';
import type { BenchOnMetric } from './sosTtsBenchmark';

/** Connection settings used for every TTS request issued by the queue. */
export type SosTtsFetchOpts = {
  /** Base URL of the API; the endpoint path is appended to it verbatim. */
  apiBase: string;
  /** Sent as `Authorization: Bearer <accessToken>` with each request. */
  accessToken: string;
  /** Passed through to the server in the JSON request body. */
  locale: string;
  /** Server path of the TTS endpoint; defaults to the OpenAI one. Allows A/B
   *  testing between /api/coach/speak-openai, /api/coach/speak-gemini and
   *  /api/coach/speak-google. */
  endpoint?: string;
};

/** Fetch settings plus the optional callbacks a queue reports through. */
export type SosTtsQueueOpts = SosTtsFetchOpts & {
  /** Fired once per queue, when the first item is taken off the queue for
   *  playback (its audio may still be loading at that moment). */
  onStart?: () => void;
  /** The queue is empty and nothing is playing anymore. */
  onIdle?: () => void;
  /** Fetching or playing a single sentence failed. The queue keeps going. */
  onError?: (err: unknown, sentence: string) => void;
  /** Latency benchmark: only fires for the FIRST enqueued item, because that
   *  one determines the first-audio latency the user perceives. Markers:
   *  tts-fetch-start, tts-fetch-headers, tts-body-done, tts-file-written,
   *  audio-loaded, first-audio. Follow-up items (e.g. sos-continuation) are
   *  not instrumented — they overlap with the first item and would only
   *  distort the numbers. */
  onMetric?: BenchOnMetric;
};

// Matches emoji code points so they are not sent to the TTS engine.
// `\p{Emoji_Component}` on its own also matches the ASCII characters `0-9`,
// `#` and `*` (they are the base characters of keycap emoji). The negative
// lookahead excludes them — otherwise plain digits, such as the "4" in
// "4 Sekunden", would be silently removed from the spoken text.
const EMOJI_RE = /(?![0-9#*])[\p{Extended_Pictographic}\p{Emoji_Component}]/gu;

/**
 * Prepares a sentence for speech synthesis.
 *
 * Removes emoji (including modifiers, joiners and variation selectors),
 * collapses every run of whitespace into a single space and trims the result.
 * ASCII digits, `#` and `*` are preserved.
 *
 * @param text Raw sentence as received from the stream.
 * @returns The cleaned sentence; an empty string if nothing speakable is left.
 */
function cleanForTts(text: string): string {
  return text.replace(EMOJI_RE, '').replace(/\s+/g, ' ').trim();
}

/** Voice mode sent to the server in the request body of each TTS call:
 *  `'sos'` is the default, `'sos-continuation'` marks follow-up blocks. */
export type SosTtsMode = 'sos' | 'sos-continuation';

/** One cleaned text segment together with its in-flight audio request. */
type QueueItem = {
  /** Cleaned text that is sent to the TTS endpoint. */
  text: string;
  /** Voice mode forwarded to the server for this item. */
  mode: SosTtsMode;
  /** Cancels this item's audio request. */
  controller: AbortController;
  /** Only the first enqueued item gets the metric callback — it determines
   *  the first-audio latency the user perceives. Follow-up items
   *  (sos-continuation) are not tracked. */
  metric?: BenchOnMetric;
  /** The pre-fetch starts on enqueue, so by the time the item is up for
   *  playback its audio is usually ready or nearly ready. Removes the gap
   *  between items in hybrid mode. Resolves to `null` when no audio is
   *  available for the item. */
  audioPromise: Promise<{ uri: string } | null>;
};

/**
 * Sequential text-to-speech playback queue for one streamed response.
 *
 * Every enqueued sentence is fetched immediately (pre-fetch) but played
 * strictly one after another. `abort()` cancels all outstanding requests —
 * including the one belonging to the item that is currently being awaited or
 * played — stops the current sound and removes the temporary audio files.
 * An aborted queue stays aborted; create a new instance to speak again.
 */
export class SosTtsQueue {
  /** Items waiting for playback, in enqueue order. */
  private queue: QueueItem[] = [];
  /** True while `tick` owns an item (awaiting its audio or playing it). */
  private playing = false;
  /** The item `tick` currently owns; kept so `abort` can cancel its fetch too. */
  private currentItem: QueueItem | null = null;
  /** The sound that is currently playing, if any. */
  private currentSound: Audio.Sound | null = null;
  /** Settles the pending playback wait; only set while a sound is playing. */
  private releasePlayback: (() => void) | null = null;
  private aborted = false;
  /** Guards `onStart` so it fires at most once per queue instance. */
  private startedOnce = false;
  private opts: SosTtsQueueOpts;
  // Dedup: in dev mode (React StrictMode) effects fire twice, so identical
  // sentences would be enqueued, fetched and played twice. We track the texts
  // this queue instance has already seen.
  // NOTE(review): this also drops a sentence that is intentionally repeated
  // within the same response — confirm that this is acceptable.
  private seenTexts = new Set<string>();
  // Bench: only the FIRST enqueued item gets metric tracking.
  private metricGiven = false;

  constructor(opts: SosTtsQueueOpts) {
    this.opts = opts;
  }

  /**
   * Enqueue a text segment for TTS playback.
   *
   * The text is cleaned first; empty results, texts this instance has already
   * seen, and calls after `abort()` are ignored.
   *
   * @param sentence Raw text segment to speak.
   * @param mode  Default 'sos' (warm, empathic opening). Use 'sos-continuation'
   *              for follow-up blocks in hybrid mode → the server adapts
   *              OpenAI's `instructions` field so the voice boundary sounds
   *              smoother.
   */
  enqueue(sentence: string, mode: SosTtsMode = 'sos'): void {
    if (this.aborted) return;
    const cleaned = cleanForTts(sentence);
    if (!cleaned) return;
    // Dedup against StrictMode double effects: the same text is requested and
    // played only once per queue instance.
    if (this.seenTexts.has(cleaned)) return;
    this.seenTexts.add(cleaned);
    // Pre-fetch IMMEDIATELY on enqueue → runs in parallel with playback of the
    // previous items, so the next item's audio is usually ready when it is up.
    const controller = new AbortController();
    let metric: BenchOnMetric | undefined;
    if (!this.metricGiven && this.opts.onMetric) {
      this.metricGiven = true;
      metric = this.opts.onMetric;
    }
    const audioPromise = this.fetchAudio(cleaned, mode, controller.signal, metric).catch(
      (err: unknown) => {
        // A request cancelled via abort() rejects with an AbortError; that is
        // expected and must not be reported as a failure.
        if (!controller.signal.aborted) {
          this.opts.onError?.(err, cleaned);
        }
        return null;
      },
    );
    this.queue.push({ text: cleaned, mode, controller, metric, audioPromise });
    void this.tick();
  }

  /**
   * Stops everything: cancels all outstanding audio requests, stops and
   * unloads the current sound, clears the queue and removes temporary audio
   * files. `onIdle` is not fired for an aborted queue.
   */
  abort(): void {
    this.aborted = true;
    // The item owned by `tick` is no longer in `this.queue`, so cancel it
    // explicitly. Its audio file (if any) is removed when `tick` unwinds.
    this.currentItem?.controller.abort();
    for (const item of this.queue) {
      item.controller.abort();
      // A pre-fetch may already have written its file; remove it once known.
      // `audioPromise` never rejects (see `enqueue`).
      void item.audioPromise.then((audio) => {
        if (audio) void this.removeFile(audio.uri);
      });
    }
    this.queue = [];
    if (this.currentSound) {
      const s = this.currentSound;
      this.currentSound = null;
      s.setOnPlaybackStatusUpdate(null);
      s.stopAsync().catch(() => {});
      s.unloadAsync().catch(() => {});
    }
    // Status updates may stop arriving once the sound is unloaded, so settle
    // the playback wait directly instead of relying on another callback.
    this.releasePlayback?.();
  }

  /** True while something is still pending (queued or currently playing). */
  isActive(): boolean {
    return !this.aborted && (this.playing || this.queue.length > 0);
  }

  /**
   * Takes the next item off the queue, waits for its audio and plays it to
   * the end. Re-invokes itself for the following item, or fires `onIdle` when
   * the queue is drained. No-op while another item is being processed.
   */
  private async tick(): Promise<void> {
    if (this.aborted || this.playing) return;
    const item = this.queue.shift();
    if (!item) return;
    this.playing = true;
    this.currentItem = item;

    if (!this.startedOnce) {
      this.startedOnce = true;
      this.opts.onStart?.();
    }

    let audio: { uri: string } | null = null;
    let sound: Audio.Sound | null = null;
    try {
      audio = await item.audioPromise;
      if (this.aborted || !audio) return;

      const created = await Audio.Sound.createAsync(
        { uri: audio.uri },
        { shouldPlay: true },
      );
      sound = created.sound;
      item.metric?.('audio-loaded');
      // abort() may have been called while the sound was loading; the sound
      // is unloaded in `finally`.
      if (this.aborted) return;
      this.currentSound = sound;
      await this.waitForPlaybackEnd(sound, item.metric);
    } catch (err) {
      this.opts.onError?.(err, item.text);
    } finally {
      if (sound) {
        sound.setOnPlaybackStatusUpdate(null);
        if (this.currentSound === sound) this.currentSound = null;
        // Unloading an already unloaded sound is harmless; errors are ignored.
        sound.unloadAsync().catch(() => {});
      }
      // The temp file is only needed for this one playback.
      if (audio) void this.removeFile(audio.uri);
      this.releasePlayback = null;
      this.currentItem = null;
      this.playing = false;
      if (!this.aborted) {
        if (this.queue.length > 0) {
          void this.tick();
        } else {
          this.opts.onIdle?.();
        }
      }
    }
  }

  /**
   * Waits until `sound` has finished playing.
   *
   * Resolves when playback finished, when the queue was aborted, or when the
   * sound reports that it is no longer loaded. Rejects when the player
   * reports an error, so that a broken item cannot stall the queue.
   *
   * @param sound  A loaded sound that has been started.
   * @param metric Optional benchmark callback; receives 'first-audio' once.
   */
  private waitForPlaybackEnd(sound: Audio.Sound, metric?: BenchOnMetric): Promise<void> {
    return new Promise<void>((resolve, reject) => {
      let firstAudioReported = false;
      this.releasePlayback = resolve;
      sound.setOnPlaybackStatusUpdate((status) => {
        if (this.aborted) {
          resolve();
          return;
        }
        if (!status.isLoaded) {
          // The sound can never reach `didJustFinish` from here.
          if (status.error) {
            reject(new Error(`TTS playback failed: ${status.error}`));
          } else {
            resolve();
          }
          return;
        }
        if (!firstAudioReported && status.isPlaying) {
          firstAudioReported = true;
          metric?.('first-audio');
        }
        if (status.didJustFinish) {
          resolve();
        }
      });
    });
  }

  /** Best-effort removal of a temporary audio file; failures are ignored. */
  private removeFile(uri: string): Promise<void> {
    return FileSystem.deleteAsync(uri, { idempotent: true }).catch(() => {});
  }

  /**
   * Requests the audio for `text` and stores it as a temp file in the cache
   * directory.
   *
   * @param text   Cleaned text to synthesize.
   * @param mode   Voice mode forwarded to the server.
   * @param signal Cancels the request; checked again after each async step.
   * @param metric Optional benchmark callback for the fetch/write markers.
   * @returns The file URI, or `null` when the response was not OK, the body
   *          was empty or malformed, or the request was aborted.
   * @throws Whatever `fetch` or the file system rejects with (including the
   *         AbortError of a cancelled request).
   */
  private async fetchAudio(
    text: string,
    mode: SosTtsMode,
    signal: AbortSignal,
    metric?: BenchOnMetric,
  ): Promise<{ uri: string } | null> {
    const endpoint = this.opts.endpoint ?? '/api/coach/speak-openai';
    const isGoogleCloud = endpoint.endsWith('/speak-google');
    metric?.('tts-fetch-start', { endpoint });
    const res = await fetch(`${this.opts.apiBase}${endpoint}`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${this.opts.accessToken}`,
      },
      body: JSON.stringify({ text, locale: this.opts.locale, mode }),
      signal,
    });
    metric?.('tts-fetch-headers', { status: res.status });
    if (!res.ok || signal.aborted) return null;

    // /speak-google returns JSON { audio: "data:audio/mp3;base64,..." }.
    // /speak-openai (audio/mpeg) and /speak-gemini (audio/wav) return the
    // body as raw bytes — the same pipeline works for both.
    let base64: string;
    let ext: 'mp3' | 'wav';
    if (isGoogleCloud) {
      const json = (await res.json()) as { audio?: string };
      metric?.('tts-body-done');
      const dataUri = json.audio ?? '';
      const comma = dataUri.indexOf(',');
      if (comma === -1) return null;
      base64 = dataUri.slice(comma + 1);
      ext = 'mp3';
    } else {
      const buffer = await res.arrayBuffer();
      metric?.('tts-body-done', { bytes: buffer.byteLength });
      if (signal.aborted || buffer.byteLength === 0) return null;
      const bytes = new Uint8Array(buffer);
      const chunks: string[] = [];
      // Convert in slices: spreading the whole buffer into one
      // String.fromCharCode call could exceed the engine's argument limit.
      const cs = 0x8000;
      for (let i = 0; i < bytes.length; i += cs) {
        chunks.push(
          String.fromCharCode(...bytes.subarray(i, Math.min(i + cs, bytes.length))),
        );
      }
      base64 = btoa(chunks.join(''));
      ext = endpoint.endsWith('/speak-gemini') ? 'wav' : 'mp3';
    }

    const tmpPath = `${FileSystem.cacheDirectory}sos-tts-q-${Date.now()}-${Math.random().toString(36).slice(2, 8)}.${ext}`;
    await FileSystem.writeAsStringAsync(tmpPath, base64, {
      encoding: FileSystem.EncodingType.Base64,
    });
    metric?.('tts-file-written');
    if (signal.aborted) {
      // Nobody will play this file anymore; do not leave it in the cache.
      void this.removeFile(tmpPath);
      return null;
    }
    return { uri: tmpPath };
  }
}