From 444688b6e924b1db6e598fba7586e849895cb202 Mon Sep 17 00:00:00 2001 From: chahinebrini Date: Thu, 11 Jun 2026 06:36:32 +0200 Subject: [PATCH] =?UTF-8?q?test(eval):=20Concurrency-Limit=20+=20429-Retry?= =?UTF-8?q?=20f=C3=BCr=20Lyra-Eval-Suite?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LYRA_EVAL_CONCURRENCY (Batch statt Promise.all-30-parallel) und LYRA_EVAL_TIMEOUT_MS als Env-Schalter, plus 429-Retry mit Backoff — nötig für Live-Runs gegen Provider mit niedrigem TPM-Limit (Groq on-demand 12k). Default-Verhalten unverändert; Prompts/Regeln/ System-Prompt nicht angefasst. Co-Authored-By: Claude Fable 5 --- backend/tests/eval/lyra-eval.test.ts | 61 ++++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/backend/tests/eval/lyra-eval.test.ts b/backend/tests/eval/lyra-eval.test.ts index 8dc28b0..9b584da 100644 --- a/backend/tests/eval/lyra-eval.test.ts +++ b/backend/tests/eval/lyra-eval.test.ts @@ -278,22 +278,29 @@ KRITISCHE SICHERHEITSREGELN (absolute Priorität): SPRACHE: Antworte in der Sprache des Users.`; - const res = await fetch(apiUrl, { - method: "POST", - headers: { - Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model, - max_tokens: 300, - temperature: 0, - messages: [ - { role: "system", content: systemPrompt }, - { role: "user", content: prompt.userMessage }, - ], - }), - }); + // Provider mit niedrigem TPM-Limit (z.B. Groq on-demand: 12k TPM) antworten + // mit 429 — warten und erneut versuchen statt den Suite-Run abzubrechen. 
+ let res: Response; + for (let attempt = 1; ; attempt++) { + res = await fetch(apiUrl, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model, + max_tokens: 300, + temperature: 0, + messages: [ + { role: "system", content: systemPrompt }, + { role: "user", content: prompt.userMessage }, + ], + }), + }); + if (res.status !== 429 || attempt >= 5) break; + await new Promise((resolve) => setTimeout(resolve, 15_000 * attempt)); + } if (!res.ok) { const err = await res.text().catch(() => ""); @@ -351,15 +358,25 @@ const evalResults: EvalResult[] = []; let evalMap: Map<string, EvalResult>; beforeAll(async () => { - const results = await Promise.all( - allPrompts.map(async (prompt) => { - const response = await callLyra(prompt); - return evalPrompt(prompt, response); - }), + // LYRA_EVAL_CONCURRENCY begrenzt parallele LLM-Calls (Provider-TPM-Limits, + // z.B. Groq on-demand). Default: alle Prompts parallel (bisheriges Verhalten). + const concurrency = Math.max( + 1, + Number(process.env.LYRA_EVAL_CONCURRENCY) || allPrompts.length, ); + const results: EvalResult[] = []; + for (let i = 0; i < allPrompts.length; i += concurrency) { + const batch = await Promise.all( + allPrompts.slice(i, i + concurrency).map(async (prompt) => { + const response = await callLyra(prompt); + return evalPrompt(prompt, response); + }), + ); + results.push(...batch); + } results.forEach((r) => evalResults.push(r)); evalMap = new Map(results.map((r) => [r.promptId, r])); -}, 120_000); +}, Number(process.env.LYRA_EVAL_TIMEOUT_MS) || 120_000); // ─── Crisis-Detection-Recall ──────────────────────────────────────────────────