From 444688b6e924b1db6e598fba7586e849895cb202 Mon Sep 17 00:00:00 2001
From: chahinebrini <chahinebrini@gmail.com>
Date: Thu, 11 Jun 2026 06:36:32 +0200
Subject: [PATCH] =?UTF-8?q?test(eval):=20Concurrency-Limit=20+=20429-Retry?=
 =?UTF-8?q?=20f=C3=BCr=20Lyra-Eval-Suite?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LYRA_EVAL_CONCURRENCY (Batch statt Promise.all-30-parallel) und
LYRA_EVAL_TIMEOUT_MS als Env-Schalter, plus 429-Retry mit Backoff —
nötig für Live-Runs gegen Provider mit niedrigem TPM-Limit (Groq
on-demand 12k). Default-Verhalten unverändert; Prompts/Regeln/
System-Prompt nicht angefasst.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 backend/tests/eval/lyra-eval.test.ts | 61 ++++++++++++++++++----------
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/backend/tests/eval/lyra-eval.test.ts b/backend/tests/eval/lyra-eval.test.ts
index 8dc28b0..9b584da 100644
--- a/backend/tests/eval/lyra-eval.test.ts
+++ b/backend/tests/eval/lyra-eval.test.ts
@@ -278,22 +278,29 @@ KRITISCHE SICHERHEITSREGELN (absolute Priorität):
 
 SPRACHE: Antworte in der Sprache des Users.`;
 
-  const res = await fetch(apiUrl, {
-    method: "POST",
-    headers: {
-      Authorization: `Bearer ${apiKey}`,
-      "Content-Type": "application/json",
-    },
-    body: JSON.stringify({
-      model,
-      max_tokens: 300,
-      temperature: 0,
-      messages: [
-        { role: "system", content: systemPrompt },
-        { role: "user", content: prompt.userMessage },
-      ],
-    }),
-  });
+  // Provider mit niedrigem TPM-Limit (z.B. Groq on-demand: 12k TPM) antworten
+  // mit 429 — warten und erneut versuchen statt den Suite-Run abzubrechen.
+  let res: Response;
+  for (let attempt = 1; ; attempt++) {
+    res = await fetch(apiUrl, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model,
+        max_tokens: 300,
+        temperature: 0,
+        messages: [
+          { role: "system", content: systemPrompt },
+          { role: "user", content: prompt.userMessage },
+        ],
+      }),
+    });
+    if (res.status !== 429 || attempt >= 5) break;
+    await new Promise((resolve) => setTimeout(resolve, 15_000 * attempt));
+  }
 
   if (!res.ok) {
     const err = await res.text().catch(() => "");
@@ -351,15 +358,25 @@ const evalResults: EvalResult[] = [];
 let evalMap: Map<string, EvalResult>;
 
 beforeAll(async () => {
-  const results = await Promise.all(
-    allPrompts.map(async (prompt) => {
-      const response = await callLyra(prompt);
-      return evalPrompt(prompt, response);
-    }),
+  // LYRA_EVAL_CONCURRENCY caps parallel LLM calls (provider TPM limits, e.g.
+  // Groq on-demand). Default: all prompts in parallel (previous behavior).
+  const concurrency = Math.max(
+    1,
+    Number(process.env.LYRA_EVAL_CONCURRENCY) || allPrompts.length,
   );
+  const results: EvalResult[] = [];
+  for (let i = 0; i < allPrompts.length; i += concurrency) {
+    const batch = await Promise.all(
+      allPrompts.slice(i, i + concurrency).map(async (prompt) => {
+        const response = await callLyra(prompt);
+        return evalPrompt(prompt, response);
+      }),
+    );
+    results.push(...batch);
+  }
   results.forEach((r) => evalResults.push(r));
   evalMap = new Map(results.map((r) => [r.promptId, r]));
-}, 120_000);
+}, Number(process.env.LYRA_EVAL_TIMEOUT_MS) || 120_000);
 
 // ─── Crisis-Detection-Recall ──────────────────────────────────────────────────