chahinebrini f2e822be95 feat(sos): llmProvider toggle + sort:latency + bench scaffolding
- backend/coach: routing zu Sonnet (default) / Haiku / Groq Llama je nach
  sessionData.llmProvider. sort:latency für Anthropic-Modelle (-30..58% TTFB).
- frontend: LlmProviderToggle (Sonnet/Haiku/Groq pills), llmProvider.ts
  Storage-Helper. sosStream.ts schickt llmProvider im /sos-session-Body.
- bench: SosTtsBenchmark sammelt Marker (req->session, lyra-ttfb, lyra-done,
  tts-fired/headers/body/file, audio-loaded, first-audio); Output als console.table.
- ops: backend/scripts/llm-bench.sh + Python-Variante für realistic SOS-Prompt.
- speak-cartesia + speak-elevenlabs Endpoints (waren ungetracked, jetzt mit drin).
2026-05-06 13:58:07 +02:00

220 lines
8.8 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# llm-bench.sh — TTFB benchmark across LLM providers for SOS-style requests.
#
# Measures curl's time_starttransfer for streaming chat completions and
# reports it in milliseconds. Original author's note: this is the first byte
# of the body, and for SSE endpoints it corresponds practically 1:1 to the
# first token seen by the client.
# NOTE(review): curl documents time_starttransfer as the time until the first
# response byte — confirm per provider that headers are not flushed before
# the first token, otherwise the numbers understate real first-token latency.
#
# Usage:
#   bash llm-bench.sh [RUNS]     # default RUNS=3, must be a positive integer
#
# On the staging server (with Infisical):
#   infisical run --projectId="$INFISICAL_PROJECT_ID" --env=staging --token="$TOKEN" -- \
#     bash llm-bench.sh
#
# ENV vars (all optional; a missing key skips that provider):
#   OPENROUTER_API_KEY — all models via OpenRouter
#   ANTHROPIC_API_KEY  — Haiku/Sonnet direct
#   GROQ_API_KEY       — Llama via Groq direct
#   OPENAI_API_KEY     — GPT-4o-mini / GPT-4o direct
#   GEMINI_API_KEY     — Gemini direct (or GOOGLE_GENERATIVE_AI_API_KEY)
#
# Exit status: 2 when RUNS is not a positive integer.
set -u

# Succeeds only when $1 is a decimal integer >= 1 without leading zeros.
# Leading zeros are rejected because bash arithmetic would read them as octal.
is_positive_int() {
  [[ "${1:-}" =~ ^[1-9][0-9]*$ ]]
}

# Number of requests per model. Validated up front: the benchmark loop compares
# against RUNS in an arithmetic context, where a non-numeric value is treated
# as a variable name (an error under `set -u`) and 0 would make every row
# report "ALL FAILED".
RUNS="${1:-3}"
if ! is_positive_int "$RUNS"; then
  printf 'llm-bench: RUNS must be a positive integer, got: %s\n' "$RUNS" >&2
  exit 2
fi

# Prompt pair sent to every provider, and the output-token cap per request.
SYSTEM_PROMPT='Du bist Lyra, eine warme empathische Begleiterin für Menschen mit Glücksspielsucht. Antworte in maximal 2-3 deutschen Sätzen, warm und ohne Belehrung. Am Ende JSON-Chips: [{"label":"...","action":"..."}]'
USER_MSG='Ich bin gerade unter starkem Druck und denke daran, einen großen Einsatz zu machen.'
MAX_TOK=80
# jq builds every JSON request body in this script, so abort early when it is
# missing. The diagnostic goes to stderr so it never mixes into the report
# that is written to stdout.
command -v jq >/dev/null || { echo "jq fehlt — apt install jq" >&2; exit 1; }
# ── helpers ──────────────────────────────────────────────────────────────────
# Run curl once with the given request arguments and print the time to first
# byte in whole milliseconds.
# Arguments: "$@" - forwarded unchanged to curl (method, URL, headers, body)
# Outputs:   exactly one token on stdout:
#              <integer>     time_starttransfer in ms, rounded to nearest
#              FAIL(<code>)  the HTTP status was neither 200 nor 206
#              ERR           curl itself exited non-zero
ttfb_ms() {
  local report
  # -N turns off curl's output buffering, -o /dev/null discards the body, and
  # -w prints the timing and the status code on two separate lines.
  report=$(curl -s -N -o /dev/null --max-time 30 \
    -w '%{time_starttransfer}\n%{http_code}' "$@" 2>/dev/null) || {
    echo "ERR"
    return
  }

  # First line = seconds, last line = HTTP status.
  local seconds=${report%%$'\n'*}
  local status=${report##*$'\n'}

  if [[ "$status" != "200" && "$status" != "206" ]]; then
    echo "FAIL($status)"
    return
  fi

  # Round instead of truncating: the product is binary floating point, so a
  # value such as 1.001 s becomes 1000.9999999... and "%d" would report 1000.
  # LC_ALL=C keeps awk's number parsing independent of the caller's locale.
  LC_ALL=C awk -v s="$seconds" 'BEGIN { printf "%.0f", s * 1000 }'
}
# Benchmark one endpoint: call ttfb_ms RUNS times, print every result on a
# single row, then the minimum and the median of the successful runs.
# Globals:   RUNS (read) - number of iterations
# Arguments: $1   - row label
#            $2.. - curl arguments, forwarded unchanged to ttfb_ms
# Outputs:   one report row on stdout; the summary reads "ALL FAILED" when no
#            run produced a numeric result.
benchN() {
  local label="$1"
  shift
  # Everything is local so repeated calls never leak loop state into the
  # script's global scope.
  local -a ok=() sorted=()
  local run result

  printf " %-50s " "$label"
  for ((run = 1; run <= RUNS; run++)); do
    result=$(ttfb_ms "$@")
    printf "%-9s" "$result"
    # Only plain integers are measurements; ERR / FAIL(...) are shown in the
    # row but excluded from the statistics.
    if [[ "$result" =~ ^[0-9]+$ ]]; then
      ok+=("$result")
    fi
    sleep 0.4 # short pause between consecutive requests
  done

  if ((${#ok[@]} == 0)); then
    printf " │ ALL FAILED\n"
    return
  fi

  mapfile -t sorted < <(printf '%s\n' "${ok[@]}" | sort -n)
  local count=${#sorted[@]}
  local upper=$((count / 2))
  local median
  if ((count % 2 == 1)); then
    median=${sorted[upper]}
  else
    # Even sample count: a true median is the mean of the two middle values
    # (integer division). Picking sorted[count/2] alone would report the
    # maximum whenever exactly two runs succeeded.
    median=$(((sorted[upper - 1] + sorted[upper]) / 2))
  fi
  printf " │ min=%sms p50=%sms\n" "${sorted[0]}" "$median"
}
# Build a streaming request body in the OpenAI chat-completions format
# (per the original note, OpenAI, OpenRouter, Groq, Cerebras and Mistral all
# use this format).
# Globals:   SYSTEM_PROMPT, USER_MSG, MAX_TOK (read)
# Arguments: $1 - model identifier
#            $2 - optional JSON object merged on top of the body
# Outputs:   compact JSON on stdout
openai_body() {
  local model_id="$1"
  local overrides="${2:-}"

  # No overrides given: merge an empty object so the "+ $extra" below is a
  # no-op.
  if [[ -z "$overrides" ]]; then
    overrides='{}'
  fi

  local -a vars=(
    --arg model "$model_id"
    --arg system "$SYSTEM_PROMPT"
    --arg user "$USER_MSG"
    --argjson maxtok "$MAX_TOK"
    --argjson extra "$overrides"
  )
  local filter='
    {model:$model, stream:true, max_tokens:$maxtok,
     messages:[{role:"system",content:$system},{role:"user",content:$user}]}
    + $extra'

  jq -nc "${vars[@]}" "$filter"
}
# Build a streaming request body for the Anthropic messages endpoint: the
# system prompt is a top-level field and the message list holds only the
# user turn.
# Globals:   SYSTEM_PROMPT, USER_MSG, MAX_TOK (read)
# Arguments: $1 - model identifier
# Outputs:   compact JSON on stdout
anthropic_body() {
  local model_id="$1"
  local -a vars=(
    --arg model "$model_id"
    --arg system "$SYSTEM_PROMPT"
    --arg user "$USER_MSG"
    --argjson maxtok "$MAX_TOK"
  )
  local filter='
    {model:$model, stream:true, max_tokens:$maxtok, system:$system,
     messages:[{role:"user",content:$user}]}'

  jq -nc "${vars[@]}" "$filter"
}
# Build a request body in the Gemini generateContent shape: the user text
# goes into contents, the system prompt into systemInstruction, and the token
# cap into generationConfig.maxOutputTokens.
# Globals:   SYSTEM_PROMPT, USER_MSG, MAX_TOK (read)
# Arguments: none
# Outputs:   compact JSON on stdout
gemini_body() {
  local -a vars=(
    --arg system "$SYSTEM_PROMPT"
    --arg user "$USER_MSG"
    --argjson maxtok "$MAX_TOK"
  )
  local filter='
    {contents:[{parts:[{text:$user}]}],
     systemInstruction:{parts:[{text:$system}]},
     generationConfig:{maxOutputTokens:$maxtok}}'

  jq -nc "${vars[@]}" "$filter"
}
# ── header ───────────────────────────────────────────────────────────────────
# Report banner: run count, start timestamp, and where the requests originate
# from. Both lookups are best-effort (3 s cap); "?" is shown when curl fails.
printf '\n'
printf '%s\n' "═══════════════════════════════════════════════════════════════════"
printf ' LLM TTFB Benchmark — %s runs each — %s\n' "$RUNS" "$(date '+%Y-%m-%d %H:%M:%S')"
country=$(curl -s --max-time 3 ipinfo.io/country 2>/dev/null || echo "?")
ip=$(curl -s --max-time 3 ifconfig.co 2>/dev/null || echo "?")
printf ' Source: %s / %s\n' "$country" "$ip"
printf '%s\n' "═══════════════════════════════════════════════════════════════════"
# ── via OpenRouter (default load-balanced) ───────────────────────────────────
# OpenRouter: one pass with its default load balancing over all listed
# models, then a second pass over the Anthropic models with
# provider:{sort:latency} merged into the body. Skipped when
# OPENROUTER_API_KEY is unset or empty.
if [[ -z "${OPENROUTER_API_KEY:-}" ]]; then
  echo
  echo "(skip OpenRouter — OPENROUTER_API_KEY nicht gesetzt)"
else
  or_url="https://openrouter.ai/api/v1/chat/completions"
  # Headers shared by both passes.
  or_headers=(
    -H "Content-Type: application/json"
    -H "Authorization: Bearer $OPENROUTER_API_KEY"
    -H "HTTP-Referer: https://rebreak.org"
    -H "X-Title: ReBreak-Bench"
  )
  or_default_models=(
    "anthropic/claude-haiku-4.5"
    "anthropic/claude-sonnet-4.5"
    "anthropic/claude-3.5-haiku"
    "openai/gpt-4o-mini"
    "google/gemini-2.0-flash-001"
    "meta-llama/llama-3.3-70b-instruct"
  )
  or_latency_models=(
    "anthropic/claude-haiku-4.5"
    "anthropic/claude-sonnet-4.5"
  )

  echo
  echo "── via OpenRouter (default load-balancing) ──"
  for m in "${or_default_models[@]}"; do
    benchN "OR $m" -X POST "$or_url" "${or_headers[@]}" \
      -d "$(openai_body "$m")"
  done

  echo
  echo "── via OpenRouter + provider:{sort:latency} ──"
  for m in "${or_latency_models[@]}"; do
    benchN "ORL $m" -X POST "$or_url" "${or_headers[@]}" \
      -d "$(openai_body "$m" '{"provider":{"sort":"latency"}}')"
  done
fi
# ── Anthropic direct ─────────────────────────────────────────────────────────
# Anthropic direct: benchmark each model against the messages endpoint.
# Skipped when ANTHROPIC_API_KEY is unset or empty.
if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
  echo
  echo "(skip Anthropic direkt — ANTHROPIC_API_KEY nicht gesetzt)"
else
  ant_models=(claude-haiku-4-5 claude-sonnet-4-5)
  ant_headers=(
    -H "Content-Type: application/json"
    -H "x-api-key: $ANTHROPIC_API_KEY"
    -H "anthropic-version: 2023-06-01"
  )

  echo
  echo "── via Anthropic direkt ──"
  for m in "${ant_models[@]}"; do
    benchN "ANT $m" -X POST https://api.anthropic.com/v1/messages \
      "${ant_headers[@]}" -d "$(anthropic_body "$m")"
  done
fi
# ── OpenAI direct ────────────────────────────────────────────────────────────
# OpenAI direct: benchmark each model against the chat-completions endpoint.
# Skipped when OPENAI_API_KEY is unset or empty.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo
  echo "(skip OpenAI direkt — OPENAI_API_KEY nicht gesetzt)"
else
  oai_models=(gpt-4o-mini gpt-4o)
  oai_headers=(
    -H "Content-Type: application/json"
    -H "Authorization: Bearer $OPENAI_API_KEY"
  )

  echo
  echo "── via OpenAI direkt ──"
  for m in "${oai_models[@]}"; do
    benchN "OAI $m" -X POST https://api.openai.com/v1/chat/completions \
      "${oai_headers[@]}" -d "$(openai_body "$m")"
  done
fi
# ── Groq direct ──────────────────────────────────────────────────────────────
# Groq direct: benchmark each Llama model against Groq's OpenAI-compatible
# chat-completions endpoint. Skipped when GROQ_API_KEY is unset or empty.
if [[ -z "${GROQ_API_KEY:-}" ]]; then
  echo
  echo "(skip Groq — GROQ_API_KEY nicht gesetzt)"
else
  groq_models=(llama-3.3-70b-versatile llama-3.1-8b-instant)
  groq_headers=(
    -H "Content-Type: application/json"
    -H "Authorization: Bearer $GROQ_API_KEY"
  )

  echo
  echo "── via Groq direkt (LPU hardware) ──"
  for m in "${groq_models[@]}"; do
    benchN "GRQ $m" -X POST https://api.groq.com/openai/v1/chat/completions \
      "${groq_headers[@]}" -d "$(openai_body "$m")"
  done
fi
# ── Gemini direct ────────────────────────────────────────────────────────────
# Gemini direct. The key is taken from GEMINI_API_KEY, falling back to
# GOOGLE_GENERATIVE_AI_API_KEY; the section is skipped when both are empty.
GEM_KEY="${GEMINI_API_KEY:-${GOOGLE_GENERATIVE_AI_API_KEY:-}}"
if [[ -n "$GEM_KEY" ]]; then
  echo
  echo "── via Gemini direkt ──"
  for m in gemini-2.0-flash gemini-1.5-flash; do
    # The key is sent in the x-goog-api-key header instead of a ?key= query
    # parameter so the secret is not part of the request URL.
    benchN "GEM $m" \
      -X POST "https://generativelanguage.googleapis.com/v1beta/models/${m}:streamGenerateContent?alt=sse" \
      -H "Content-Type: application/json" \
      -H "x-goog-api-key: $GEM_KEY" \
      -d "$(gemini_body)"
  done
else
  echo
  echo "(skip Gemini direkt — GEMINI_API_KEY nicht gesetzt)"
fi
# Closing banner with a legend for the min / p50 columns of each row.
printf '\n'
printf '%s\n' "═══════════════════════════════════════════════════════════════════"
printf ' done. min = bestes TTFB, p50 = median über %s Runs\n' "$RUNS"
printf '%s\n' "═══════════════════════════════════════════════════════════════════"