- backend/coach: routing zu Sonnet (default) / Haiku / Groq Llama je nach sessionData.llmProvider. sort:latency für Anthropic-Modelle (-30..58% TTFB). - frontend: LlmProviderToggle (Sonnet/Haiku/Groq pills), llmProvider.ts Storage-Helper. sosStream.ts schickt llmProvider im /sos-session-Body. - bench: SosTtsBenchmark sammelt Marker (req->session, lyra-ttfb, lyra-done, tts-fired/headers/body/file, audio-loaded, first-audio); Output als console.table. - ops: backend/scripts/llm-bench.sh + Python-Variante für realistic SOS-Prompt. - speak-cartesia + speak-elevenlabs Endpoints (waren ungetracked, jetzt mit drin).
220 lines
8.8 KiB
Bash
220 lines
8.8 KiB
Bash
#!/usr/bin/env bash
# llm-bench.sh — TTFB benchmark across LLM providers for SOS-style requests.
#
# Measures curl's time_starttransfer (time to first byte) for streaming chat
# completions. For SSE endpoints this is, in practice, very close to the
# moment the first token reaches the client.
#
# Usage:
#   bash llm-bench.sh [RUNS]        # default RUNS=3
#
# On the staging server (with Infisical):
#   infisical run --projectId="$INFISICAL_PROJECT_ID" --env=staging --token="$TOKEN" -- \
#     bash llm-bench.sh
#
# ENV vars (all optional; a missing key skips that provider):
#   OPENROUTER_API_KEY — all models via OpenRouter
#   ANTHROPIC_API_KEY  — Haiku/Sonnet direct
#   GROQ_API_KEY       — Llama via Groq direct
#   OPENAI_API_KEY     — GPT-4o-mini direct
#   GEMINI_API_KEY     — Gemini direct (or GOOGLE_GENERATIVE_AI_API_KEY)

# Only -u: a failed request is reported per run (ERR / FAIL(code)) and must not
# abort the whole benchmark, so errexit is intentionally not enabled.
set -u

# Number of requests per model. Validated up front so that a typo fails here
# with a clear message instead of inside the benchmark loop's arithmetic.
RUNS="${1:-3}"
if ! [[ "$RUNS" =~ ^[1-9][0-9]*$ ]]; then
  echo "RUNS muss eine positive ganze Zahl sein (erhalten: '$RUNS')" >&2
  exit 2
fi

# Fixed prompt pair and token cap sent to every provider so results are comparable.
SYSTEM_PROMPT='Du bist Lyra, eine warme empathische Begleiterin für Menschen mit Glücksspielsucht. Antworte in maximal 2-3 deutschen Sätzen, warm und ohne Belehrung. Am Ende JSON-Chips: [{"label":"...","action":"..."}]'
USER_MSG='Ich bin gerade unter starkem Druck und denke daran, einen großen Einsatz zu machen.'
MAX_TOK=80

# Hard dependencies: curl performs the requests, jq builds the JSON bodies.
# Diagnostics go to stderr so they never mix with benchmark output.
for dep in curl jq; do
  command -v "$dep" >/dev/null || { echo "$dep fehlt — apt install $dep" >&2; exit 1; }
done
unset dep

# ── helpers ──────────────────────────────────────────────────────────────────

#######################################
# Run curl once and print the time to first byte in milliseconds.
# Arguments: all arguments are passed through verbatim to curl
#            (URL, -X, -H, -d, ...).
# Outputs:   integer milliseconds on HTTP 200/206,
#            "FAIL(<http_code>)" for any other status,
#            "ERR" when curl itself exits non-zero.
#######################################
ttfb_ms() {
  local out secs code
  # The response body is discarded (-o /dev/null); only the -w report is
  # captured: time_starttransfer on the first line, the HTTP status on the last.
  out=$(curl -s -N -o /dev/null --max-time 30 \
    -w '%{time_starttransfer}\n%{http_code}' "$@" 2>/dev/null) || {
    echo "ERR"
    return
  }
  secs=${out%%$'\n'*}
  code=${out##*$'\n'}
  case "$code" in
    200 | 206) ;;
    *)
      echo "FAIL($code)"
      return
      ;;
  esac
  # Decimal seconds -> whole milliseconds. LC_ALL=C keeps awk's number parsing
  # on "." as the decimal separator regardless of the caller's locale.
  LC_ALL=C awk -v s="$secs" 'BEGIN { printf "%d", s * 1000 }'
}

#######################################
# Benchmark one endpoint: call ttfb_ms RUNS times and print a single row with
# every individual result, followed by min and median of the successful runs.
# Globals:   RUNS (read)
# Arguments: $1 - row label; all remaining arguments are forwarded to ttfb_ms
# Outputs:   one result row on stdout ("ALL FAILED" if no run produced a timing)
#######################################
benchN() {
  local label="$1"
  shift
  local run result
  local -a valid=()

  printf " %-50s " "$label"
  for ((run = 1; run <= RUNS; run++)); do
    result=$(ttfb_ms "$@")
    printf "%-9s" "$result"
    # Only plain integers are timings; "ERR"/"FAIL(...)" are displayed but
    # excluded from the statistics.
    [[ "$result" =~ ^[0-9]+$ ]] && valid+=("$result")
    # Short pause between requests.
    sleep 0.4
  done

  if ((${#valid[@]} == 0)); then
    printf " │ ALL FAILED\n"
    return
  fi

  local -a sorted
  mapfile -t sorted < <(printf '%s\n' "${valid[@]}" | sort -n)
  local n=${#sorted[@]} p50
  if ((n % 2)); then
    p50=${sorted[n/2]}
  else
    # Even sample count: the median is the mean of the two middle values
    # (taking the upper one alone would report the max when n == 2).
    p50=$(((sorted[n/2-1] + sorted[n/2]) / 2))
  fi
  printf " │ min=%sms p50=%sms\n" "${sorted[0]}" "$p50"
}

#######################################
# Build an OpenAI-style streaming chat-completion request body. According to
# the original author, OpenAI/OpenRouter/Groq/Cerebras/Mistral all use this
# format.
# Globals:   SYSTEM_PROMPT, USER_MSG, MAX_TOK (read)
# Arguments: $1 - model id
#            $2 - optional JSON object merged into the top level of the body
#                 (e.g. '{"provider":{"sort":"latency"}}')
# Outputs:   compact JSON on stdout
#######################################
openai_body() {
  local model="$1"
  local extra="${2:-}"
  # No extras requested: merge an empty object, which leaves the body unchanged.
  [[ -n "$extra" ]] || extra='{}'
  jq -nc \
    --arg model "$model" \
    --arg system "$SYSTEM_PROMPT" \
    --arg user "$USER_MSG" \
    --argjson maxtok "$MAX_TOK" \
    --argjson extra "$extra" \
    '{model:$model, stream:true, max_tokens:$maxtok,
      messages:[{role:"system",content:$system},{role:"user",content:$user}]} + $extra'
}

#######################################
# Build a streaming request body in the shape this script posts to the
# Anthropic /v1/messages endpoint: the system prompt is a top-level field and
# the messages array contains only the user turn.
# Globals:   SYSTEM_PROMPT, USER_MSG, MAX_TOK (read)
# Arguments: $1 - model id
# Outputs:   compact JSON on stdout
#######################################
anthropic_body() {
  local model="$1"
  jq -nc \
    --arg model "$model" \
    --arg system "$SYSTEM_PROMPT" \
    --arg user "$USER_MSG" \
    --argjson maxtok "$MAX_TOK" \
    '{model:$model, stream:true, max_tokens:$maxtok, system:$system,
      messages:[{role:"user",content:$user}]}'
}

#######################################
# Build the request body this script posts to Gemini's streamGenerateContent
# endpoint. The model is not part of the body; the caller puts it in the URL.
# Globals:   SYSTEM_PROMPT, USER_MSG, MAX_TOK (read)
# Arguments: none
# Outputs:   compact JSON on stdout
#######################################
gemini_body() {
  jq -nc \
    --arg system "$SYSTEM_PROMPT" \
    --arg user "$USER_MSG" \
    --argjson maxtok "$MAX_TOK" \
    '{contents:[{parts:[{text:$user}]}],
      systemInstruction:{parts:[{text:$system}]},
      generationConfig:{maxOutputTokens:$maxtok}}'
}

# ── header ───────────────────────────────────────────────────────────────────
echo
echo "═══════════════════════════════════════════════════════════════════"
echo " LLM TTFB Benchmark — $RUNS runs each — $(date '+%Y-%m-%d %H:%M:%S')"
# Best-effort lookup of where the benchmark is running from. -f makes curl fail
# on an HTTP error status, so an error page is never printed as country/IP and
# the "?" fallback is used instead.
country=$(curl -fs --max-time 3 ipinfo.io/country 2>/dev/null) || country="?"
ip=$(curl -fs --max-time 3 ifconfig.co 2>/dev/null) || ip="?"
echo " Source: $country / $ip"
echo "═══════════════════════════════════════════════════════════════════"

# ── via OpenRouter (default load-balanced) ───────────────────────────────────
# Two passes over OpenRouter: first with its default provider selection, then
# with provider:{sort:latency} for the Anthropic models. Skipped entirely when
# OPENROUTER_API_KEY is unset or empty.
if [[ -n "${OPENROUTER_API_KEY:-}" ]]; then
  # curl options shared by both passes; only the request body differs.
  or_args=(
    -X POST https://openrouter.ai/api/v1/chat/completions
    -H "Content-Type: application/json"
    -H "Authorization: Bearer $OPENROUTER_API_KEY"
    -H "HTTP-Referer: https://rebreak.org"
    -H "X-Title: ReBreak-Bench"
  )

  echo
  echo "── via OpenRouter (default load-balancing) ──"
  for m in \
    "anthropic/claude-haiku-4.5" \
    "anthropic/claude-sonnet-4.5" \
    "anthropic/claude-3.5-haiku" \
    "openai/gpt-4o-mini" \
    "google/gemini-2.0-flash-001" \
    "meta-llama/llama-3.3-70b-instruct"
  do
    benchN "OR $m" "${or_args[@]}" -d "$(openai_body "$m")"
  done

  echo
  echo "── via OpenRouter + provider:{sort:latency} ──"
  for m in \
    "anthropic/claude-haiku-4.5" \
    "anthropic/claude-sonnet-4.5"
  do
    benchN "ORL $m" "${or_args[@]}" \
      -d "$(openai_body "$m" '{"provider":{"sort":"latency"}}')"
  done
else
  echo; echo "(skip OpenRouter — OPENROUTER_API_KEY nicht gesetzt)"
fi

# ── Anthropic direct ─────────────────────────────────────────────────────────
# Benchmarks the Anthropic Messages endpoint directly; skipped when
# ANTHROPIC_API_KEY is unset or empty.
if [[ -n "${ANTHROPIC_API_KEY:-}" ]]; then
  echo
  echo "── via Anthropic direkt ──"
  for m in claude-haiku-4-5 claude-sonnet-4-5; do
    benchN "ANT $m" \
      -X POST https://api.anthropic.com/v1/messages \
      -H "Content-Type: application/json" \
      -H "x-api-key: $ANTHROPIC_API_KEY" \
      -H "anthropic-version: 2023-06-01" \
      -d "$(anthropic_body "$m")"
  done
else
  echo; echo "(skip Anthropic direkt — ANTHROPIC_API_KEY nicht gesetzt)"
fi

# ── OpenAI direct ────────────────────────────────────────────────────────────
# Benchmarks the OpenAI chat-completions endpoint directly; skipped when
# OPENAI_API_KEY is unset or empty.
if [[ -n "${OPENAI_API_KEY:-}" ]]; then
  echo
  echo "── via OpenAI direkt ──"
  for m in gpt-4o-mini gpt-4o; do
    benchN "OAI $m" \
      -X POST https://api.openai.com/v1/chat/completions \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer $OPENAI_API_KEY" \
      -d "$(openai_body "$m")"
  done
else
  echo; echo "(skip OpenAI direkt — OPENAI_API_KEY nicht gesetzt)"
fi

# ── Groq direct ──────────────────────────────────────────────────────────────
# Benchmarks Groq's OpenAI-compatible endpoint directly; skipped when
# GROQ_API_KEY is unset or empty.
if [[ -n "${GROQ_API_KEY:-}" ]]; then
  echo
  echo "── via Groq direkt (LPU hardware) ──"
  for m in llama-3.3-70b-versatile llama-3.1-8b-instant; do
    benchN "GRQ $m" \
      -X POST https://api.groq.com/openai/v1/chat/completions \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer $GROQ_API_KEY" \
      -d "$(openai_body "$m")"
  done
else
  echo; echo "(skip Groq — GROQ_API_KEY nicht gesetzt)"
fi

# ── Gemini direct ────────────────────────────────────────────────────────────
# Benchmarks Gemini's streamGenerateContent endpoint (SSE mode via alt=sse).
# GEMINI_API_KEY takes precedence; GOOGLE_GENERATIVE_AI_API_KEY is the fallback.
# Skipped when neither is set.
# NOTE(review): the key is sent as a URL query parameter, so it is visible in
# the process list while curl runs — acceptable for an ops script, but confirm.
GEM_KEY="${GEMINI_API_KEY:-${GOOGLE_GENERATIVE_AI_API_KEY:-}}"
if [[ -n "$GEM_KEY" ]]; then
  echo
  echo "── via Gemini direkt ──"
  for m in gemini-2.0-flash gemini-1.5-flash; do
    benchN "GEM $m" \
      -X POST "https://generativelanguage.googleapis.com/v1beta/models/${m}:streamGenerateContent?key=${GEM_KEY}&alt=sse" \
      -H "Content-Type: application/json" \
      -d "$(gemini_body)"
  done
else
  echo; echo "(skip Gemini direkt — GEMINI_API_KEY nicht gesetzt)"
fi

# Closing banner with a legend for the statistics columns of each row.
echo
echo "═══════════════════════════════════════════════════════════════════"
echo " done. min = bestes TTFB, p50 = median über $RUNS Runs"
echo "═══════════════════════════════════════════════════════════════════"