test(eval): Concurrency-Limit + 429-Retry für Lyra-Eval-Suite

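LYRA_EVAL_CONCURRENCY (Batches statt 30 paralleler Calls via Promise.all)
und LYRA_EVAL_TIMEOUT_MS als Env-Schalter, plus 429-Retry mit linearem
Backoff (max. 5 Versuche, Wartezeit 15 s × Versuch) — nötig für Live-Runs
gegen Provider mit niedrigem TPM-Limit (Groq on-demand: 12k TPM).
Ohne gesetzte Env-Variablen bleiben Parallelität (alle Prompts parallel)
und Timeout (120 s) unverändert; neu ist im Default nur der Retry bei 429.
Hinweis: Der volle Backoff (15 + 30 + 45 + 60 = 150 s) übersteigt den
Default-Timeout, für Live-Runs daher LYRA_EVAL_TIMEOUT_MS hochsetzen.
Prompts/Regeln/System-Prompt nicht angefasst.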

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
chahinebrini 2026-06-11 06:36:32 +02:00
parent 2c33ba55a4
commit 444688b6e9

View File

@ -278,22 +278,29 @@ KRITISCHE SICHERHEITSREGELN (absolute Priorität):
SPRACHE: Antworte in der Sprache des Users.`; SPRACHE: Antworte in der Sprache des Users.`;
const res = await fetch(apiUrl, { // Provider mit niedrigem TPM-Limit (z.B. Groq on-demand: 12k TPM) antworten
method: "POST", // mit 429 — warten und erneut versuchen statt den Suite-Run abzubrechen.
headers: { let res: Response;
Authorization: `Bearer ${apiKey}`, for (let attempt = 1; ; attempt++) {
"Content-Type": "application/json", res = await fetch(apiUrl, {
}, method: "POST",
body: JSON.stringify({ headers: {
model, Authorization: `Bearer ${apiKey}`,
max_tokens: 300, "Content-Type": "application/json",
temperature: 0, },
messages: [ body: JSON.stringify({
{ role: "system", content: systemPrompt }, model,
{ role: "user", content: prompt.userMessage }, max_tokens: 300,
], temperature: 0,
}), messages: [
}); { role: "system", content: systemPrompt },
{ role: "user", content: prompt.userMessage },
],
}),
});
if (res.status !== 429 || attempt >= 5) break;
await new Promise((resolve) => setTimeout(resolve, 15_000 * attempt));
}
if (!res.ok) { if (!res.ok) {
const err = await res.text().catch(() => ""); const err = await res.text().catch(() => "");
@ -351,15 +358,25 @@ const evalResults: EvalResult[] = [];
let evalMap: Map<string, EvalResult>; let evalMap: Map<string, EvalResult>;
beforeAll(async () => { beforeAll(async () => {
const results = await Promise.all( // LYRA_EVAL_CONCURRENCY begrenzt parallele LLM-Calls (Provider-TPM-Limits,
allPrompts.map(async (prompt) => { // z.B. Groq on-demand). Default: alle Prompts parallel (bisheriges Verhalten).
const response = await callLyra(prompt); const concurrency = Math.max(
return evalPrompt(prompt, response); 1,
}), Number(process.env.LYRA_EVAL_CONCURRENCY) || allPrompts.length,
); );
const results: EvalResult[] = [];
for (let i = 0; i < allPrompts.length; i += concurrency) {
const batch = await Promise.all(
allPrompts.slice(i, i + concurrency).map(async (prompt) => {
const response = await callLyra(prompt);
return evalPrompt(prompt, response);
}),
);
results.push(...batch);
}
results.forEach((r) => evalResults.push(r)); results.forEach((r) => evalResults.push(r));
evalMap = new Map(results.map((r) => [r.promptId, r])); evalMap = new Map(results.map((r) => [r.promptId, r]));
}, 120_000); }, Number(process.env.LYRA_EVAL_TIMEOUT_MS) || 120_000);
// ─── Crisis-Detection-Recall ────────────────────────────────────────────────── // ─── Crisis-Detection-Recall ──────────────────────────────────────────────────