/**
 * Mail classification pipeline (Layer 0–4 + sample capture).
 *
 * Architecture:
 *   Layer 0   — Skip guard (already blocked / no consent)
 *   Layer 1   — Whitelist (e.g. "wetter", "wettkampf" …) → PASS
 *   Layer 2   — Domain hard block (blocklist)
 *   Layer 2.5 — Brand + random-token detection (hard block without LLM)
 *   Layer 3   — Score 0–100 (deterministic)
 *   Layer 4   — Groq borderline check (score 25–75, with local-part redaction)
 *   Layer 5   — MailClassificationSample insert (always, except Layer 0)
 *
 * All layer logic is written as pure functions so it can be unit-tested
 * without DB mocks. Database writes are left to the caller of `classifyMail`.
 *
 * GDPR notes:
 *   - Mail contents (body) are never persisted (Art. 9).
 *   - The local part of the sender address is redacted before the Groq call
 *     (unless it contains casino keywords itself — then it is a detection signal).
 *   - userId appears in logs only when absolutely necessary (data minimisation, Art. 5).
 *   - MailClassificationSample: cascade delete via the userId relation (Art. 17).
 */

// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore — .mjs without types, exports are string[]
import { GAMBLING_KEYWORDS, GAMBLING_WHITELIST } from "./gambling-keywords.mjs";

// ─── Types ─────────────────────────────────────────────────────────────────────

/** Final decision for a mail. */
export type ClassificationAction = "blocked" | "passed";

/** Which layer / signal produced the decision. */
export type TriggerSource =
  | "domain"
  | "relay-decoded"
  | "brand+random"
  | `score:${number}`
  | `llm:${string}`
  | "whitelist"
  | "no-signal";

/** Header-level metadata of a mail; the body is deliberately not part of the input. */
export interface MailInput {
  /** Sender e-mail address (lowercase, as delivered by IMAP) */
  senderEmail: string;
  /** Display name of the sender (may be empty) */
  senderName: string | null;
  /** Subject line */
  subject: string;
}

/** Outcome of running a mail through the pipeline. */
export interface ClassificationResult {
  action: ClassificationAction;
  triggerSource: TriggerSource;
  score: number;
  /** Real domain extracted from a relay address (e.g. gamblezen.com) */
  relayDecodedDomain: string | null;
  /** Groq verdict (only present when Layer 4 ran) */
  groq?: {
    isGambling: boolean;
    confidence: number;
    reason: string;
  };
  /** Score components for MailClassificationSample.features */
  features: ClassificationFeatures;
}

/** Individual signals that contributed to a classification. */
export interface ClassificationFeatures {
  score: number;
  domainBlocked: boolean;
  relayDecoded: boolean;
  brandMatch: boolean;
  randomTokens: boolean;
  keywordHitsSubject: string[];
  keywordHitsDomain: string[];
  keywordHitsName: string[];
  styleFlags: string[];
  whitelistHit: boolean;
}

// ─── Score weights (TS constants, no config-file overhead) ────────────────────

export const SCORE_WEIGHTS = {
  // Domain indicators
  DOMAIN_GAMBLING_KEYWORD: 40, // domain contains a gambling term (bet, casino, slots …)
  DOMAIN_SHORT_RANDOM: 15, // domain root < 6 chars and random-looking (betx, 1win)

  // Subject indicators
  SUBJECT_GAMBLING_KEYWORD: 35, // keyword in the subject (casino, jackpot, freispiel …)
  SUBJECT_MONEY_PATTERN: 20, // currency symbol + number (e.g. "100€ Bonus")
  SUBJECT_URGENCY: 15, // "Nur heute", "Letzte Chance", "Ablaufdatum"
  SUBJECT_ALL_CAPS_WORD: 5, // a single ALL-CAPS word in the subject

  // Display-name indicators
  SENDER_NAME_GAMBLING_KEYWORD: 30, // gambling term in the sender name
  // NOTE(review): not applied by computeScore in this file — confirm whether it is still needed.
  SENDER_NAME_BRAND_MATCH: 20, // name matches a known gambling brand (normalised)

  // Layer 2.5 score additions (when no hard block was triggered)
  BRAND_MATCH_NO_RANDOM: 35, // brand match without random tokens (no hard block)
  RANDOM_TOKENS_NO_BRAND: 10, // random tokens without a brand match
} as const;

// Hard-block threshold: score >= 80 → BLOCK without LLM
const SCORE_HARD_BLOCK_THRESHOLD = 80;
// Borderline range: 25–75 → Groq call
const SCORE_BORDERLINE_LOW = 25;
const SCORE_BORDERLINE_HIGH = 75;

// ─── Known gambling brands (for brand-match normalisation) ────────────────────
// Derived from GAMBLING_KEYWORDS + typical blocklist domains.
// Normalisation rule: lowercase, with whitespace, "-", "." and "_" removed (see normalizeBrand).
const GAMBLING_BRANDS: string[] = [
  "casino", "bet365", "bwin", "tipico", "unibet", "betway", "888casino",
  "pokerstars", "interwetten", "netbet", "leovegas", "mrgreen", "betsson",
  "neobet", "mybet", "lottoland", "betano", "williamhill", "paddypower",
  "betfair", "stake", "rolletto", "vbet", "1xbet", "melbet", "mostbet",
  "luckyvibe", "spinz", "casinoly", "rabona", "justcasino", "getslots",
  "rocketplay", "freshcasino", "nomnomcasino", "gamblezen", "betandplay",
];

// ─── Address helpers ──────────────────────────────────────────────────────────

/**
 * Splits an e-mail address into local part and domain at the LAST "@".
 *
 * A quoted local part may itself contain "@" (`"a@b"@example.com`); splitting at
 * the first "@" would then yield a wrong domain (evading the blocklist) and an
 * incomplete redaction. Without any "@" the whole input is treated as local part.
 */
function splitAddress(email: string): { localPart: string; domain: string } {
  const atIdx = email.lastIndexOf("@");
  if (atIdx === -1) return { localPart: email, domain: "" };
  return { localPart: email.slice(0, atIdx), domain: email.slice(atIdx + 1) };
}

/**
 * Returns true when `domain` itself or one of its parent domains is in `blocked`.
 *
 * Marketing mail is usually sent from subdomains (news.example.com), so an
 * exact-match lookup alone would miss a blocked registrable domain. The bare
 * TLD (last label on its own) is never looked up.
 */
function isDomainBlocked(domain: string, blocked: ReadonlySet<string>): boolean {
  if (!domain) return false;
  if (blocked.has(domain)) return true;
  const labels = domain.split(".");
  for (let i = 1; i <= labels.length - 2; i++) {
    if (blocked.has(labels.slice(i).join("."))) return true;
  }
  return false;
}

// ─── Relay decoder ────────────────────────────────────────────────────────────

// Pattern 1: user=domain.tld (SendGrid, Mailchimp, SES bounces)
const RELAY_EQ_PATTERN = /=([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[+_&]|$)/i;
// Pattern 2: _at_domain.tld (less common, some custom relay setups)
const RELAY_AT_PATTERN = /_at_([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[_+]|$)/i;

/**
 * Extracts the real target domain from an e-mail relay address.
 *
 * Known shapes:
 *   bounces+user=example.com@sendgrid.net    → example.com
 *   track.user=gamblezen.com@mailchimp.com   → gamblezen.com
 *   a1b2c3_user_at_betandplay.com@em.em.xyz  → betandplay.com
 *   user=betandplay.com@bounce.em.example    → betandplay.com
 *
 * Looks for `=domain.tld` first, then `_at_domain.tld`, inside the local part.
 *
 * @param senderEmail Full sender address.
 * @returns The lowercased embedded domain, or null when the address has no "@"
 *          or no relay pattern matches.
 */
export function extractRelayedDomain(senderEmail: string): string | null {
  if (!senderEmail.includes("@")) return null;
  const { localPart } = splitAddress(senderEmail);

  const embedded = RELAY_EQ_PATTERN.exec(localPart)?.[1] ?? RELAY_AT_PATTERN.exec(localPart)?.[1];
  return embedded === undefined ? null : embedded.toLowerCase();
}

// ─── Brand normalisation ──────────────────────────────────────────────────────

/**
 * Normalises a string for brand comparisons: lowercases it and removes
 * whitespace, "-", "." and "_".
 *
 *   "BetandPlay"       → "betandplay"
 *   "bet-and-play.com" → "betandplaycom"
 */
export function normalizeBrand(s: string): string {
  return s.toLowerCase().replace(/[\s\-._]/g, "");
}

/**
 * Checks whether an already normalised string contains a known gambling brand.
 * Inputs shorter than 4 characters never match, to avoid false positives
 * ("bet" alone is too short).
 */
export function matchesGamblingBrand(normalized: string): boolean {
  if (normalized.length < 4) return false;
  // Substring match also covers the exact-match case.
  return GAMBLING_BRANDS.some((brand) => normalized.includes(brand));
}

/**
 * Builds brand candidates from a domain: the normalised first label and the
 * normalised full domain.
 *
 *   "betand-play.com" → ["betandplay", "betandplaycom"]
 */
function domainToBrandCandidates(domain: string): string[] {
  const root = domain.split(".")[0] ?? "";
  return [normalizeBrand(root), normalizeBrand(domain)];
}

// ─── Random-token detection ───────────────────────────────────────────────────

// Functional mailbox words that must never count as random tokens.
// With the current rule (token needs letters AND digits) none of these purely
// alphabetic words can qualify anyway; the set is kept as a safety net in case
// the token definition is relaxed.
const FUNCTION_WORDS: ReadonlySet<string> = new Set([
  "info", "admin", "noreply", "no-reply", "support", "hello", "news",
  "marketing", "sales", "contact", "newsletter", "service", "offers",
  "promotions", "promo", "team", "mail", "email", "reply", "bounce",
  "return", "postmaster", "mailer",
]);

/**
 * Detects random-looking tokens in the local part of an e-mail address.
 *
 * The local part is split on runs of `_`, `-`, `.` and `+`. A token counts as
 * random when it has at least 6 characters, contains both a letter and a digit,
 * and is not a known functional word.
 *
 * @returns true when at least two such tokens are present — typical for bulk
 *          mailers that embed trackable user IDs.
 */
export function hasRandomTokens(localPart: string): boolean {
  let randomLooking = 0;
  for (const token of localPart.split(/[_\-.+]+/)) {
    if (token.length < 6) continue;
    if (!/[a-z]/i.test(token) || !/[0-9]/.test(token)) continue; // needs letters + digits
    if (FUNCTION_WORDS.has(token.toLowerCase())) continue;
    randomLooking += 1;
  }
  return randomLooking >= 2;
}

// ─── Local-part redaction ─────────────────────────────────────────────────────

/**
 * Redacts the local part of an e-mail address before the Groq call (GDPR).
 *
 * EXCEPTION: when the local part itself contains gambling keywords
 * (e.g. "casino_offers_abc123@mailer.com") it is kept — in that case it is a
 * classification signal, not PII.
 *
 * @param senderEmail Full sender address.
 * @param localPartHasKeyword Whether the caller found a gambling keyword in the local part.
 * @returns `***@domain`, or the unchanged input when the keyword exception
 *          applies or the input has no "@".
 */
export function redactLocalPartForLLM(
  senderEmail: string,
  localPartHasKeyword: boolean,
): string {
  if (localPartHasKeyword) return senderEmail;
  // Last "@": everything before it is local part, even if it is quoted and contains "@".
  const atIdx = senderEmail.lastIndexOf("@");
  if (atIdx === -1) return senderEmail;
  return `***${senderEmail.slice(atIdx)}`;
}

// ─── Score computation (Layer 3) ──────────────────────────────────────────────

interface ScoreResult {
  score: number;
  keywordHitsSubject: string[];
  keywordHitsDomain: string[];
  keywordHitsName: string[];
  styleFlags: string[];
  whitelistHit: boolean;
}

// Currency symbol directly before or after a digit (optional whitespace between).
const MONEY_PATTERN = /[€$£]\s*\d|\d\s*[€$£]/;
// A standalone word of at least four ASCII capitals.
const ALL_CAPS_WORD_PATTERN = /\b[A-Z]{4,}\b/;
// Matched as substrings against the lowercased subject.
const URGENCY_PATTERNS: readonly string[] = [
  "nur heute", "letzte chance", "läuft ab", "ablaufdatum", "expires",
  "last chance", "limited time", "jetzt einlösen", "sofort", "nur noch",
  "endet heute",
];

/** Returns the first gambling keyword (in list order) contained in `haystack`, if any. */
function firstKeywordHit(haystack: string): string | undefined {
  return (GAMBLING_KEYWORDS as string[]).find((kw) => haystack.includes(kw));
}

/**
 * Computes the deterministic 0–100 gambling score for a mail.
 *
 * A whitelist term in the subject, sender address or sender name short-circuits
 * to score 0 with `whitelistHit: true`. Otherwise the weights from SCORE_WEIGHTS
 * are summed (each keyword category counts at most once) and capped at 100.
 *
 * @param senderEmail Sender address (any casing).
 * @param senderName Display name, or null.
 * @param subject Subject line.
 * @param brandMatchFound Result of the Layer 2.5 brand check.
 * @param randomTokensFound Result of the Layer 2.5 random-token check.
 */
export function computeScore(
  senderEmail: string,
  senderName: string | null,
  subject: string,
  brandMatchFound: boolean,
  randomTokensFound: boolean,
): ScoreResult {
  const subjectLower = subject.toLowerCase();
  const senderEmailLower = senderEmail.toLowerCase();
  const senderNameLower = (senderName ?? "").toLowerCase();
  const { domain } = splitAddress(senderEmailLower);
  const domainRoot = domain.split(".")[0] ?? "";

  // ── Whitelist check (Layer 1) ──
  const whitelisted = (GAMBLING_WHITELIST as string[]).some(
    (w) => subjectLower.includes(w) || senderEmailLower.includes(w) || senderNameLower.includes(w),
  );
  if (whitelisted) {
    return {
      score: 0,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: true,
    };
  }

  let score = 0;
  const keywordHitsSubject: string[] = [];
  const keywordHitsDomain: string[] = [];
  const keywordHitsName: string[] = [];
  const styleFlags: string[] = [];

  // ── Keyword hits: one hit per category is enough ──
  // (domainRoot is a substring of domain, so checking domain covers both.)
  const domainHit = firstKeywordHit(domain);
  if (domainHit !== undefined) {
    keywordHitsDomain.push(domainHit);
    score += SCORE_WEIGHTS.DOMAIN_GAMBLING_KEYWORD;
  }

  const subjectHit = firstKeywordHit(subjectLower);
  if (subjectHit !== undefined) {
    keywordHitsSubject.push(subjectHit);
    score += SCORE_WEIGHTS.SUBJECT_GAMBLING_KEYWORD;
  }

  const nameHit = firstKeywordHit(senderNameLower);
  if (nameHit !== undefined) {
    keywordHitsName.push(nameHit);
    score += SCORE_WEIGHTS.SENDER_NAME_GAMBLING_KEYWORD;
  }

  // ── Money pattern in the subject ──
  if (MONEY_PATTERN.test(subject)) {
    styleFlags.push("money-pattern");
    score += SCORE_WEIGHTS.SUBJECT_MONEY_PATTERN;
  }

  // ── Urgency wording in the subject ──
  if (URGENCY_PATTERNS.some((p) => subjectLower.includes(p))) {
    styleFlags.push("urgency");
    score += SCORE_WEIGHTS.SUBJECT_URGENCY;
  }

  // ── ALL-CAPS word in the subject (original casing) ──
  if (ALL_CAPS_WORD_PATTERN.test(subject)) {
    styleFlags.push("all-caps");
    score += SCORE_WEIGHTS.SUBJECT_ALL_CAPS_WORD;
  }

  // ── Short random-looking domain root (≤ 5 chars, letters + digits) ──
  if (
    domainRoot.length > 0 &&
    domainRoot.length <= 5 &&
    /[a-z]/.test(domainRoot) &&
    /[0-9]/.test(domainRoot)
  ) {
    styleFlags.push("short-random-domain");
    score += SCORE_WEIGHTS.DOMAIN_SHORT_RANDOM;
  }

  // ── Layer 2.5 score additions (both together is a hard block upstream) ──
  if (brandMatchFound && !randomTokensFound) {
    score += SCORE_WEIGHTS.BRAND_MATCH_NO_RANDOM;
  }
  if (!brandMatchFound && randomTokensFound) {
    score += SCORE_WEIGHTS.RANDOM_TOKENS_NO_BRAND;
  }

  return {
    score: Math.min(score, 100),
    keywordHitsSubject,
    keywordHitsDomain,
    keywordHitsName,
    styleFlags,
    whitelistHit: false,
  };
}

// ─── Groq LLM call (Layer 4) ──────────────────────────────────────────────────

interface GroqVerdict {
  isGambling: boolean;
  confidence: number;
  reason: string;
}

// Upper bound for the whole Groq round trip so a stalled request cannot hang the pipeline.
const GROQ_REQUEST_TIMEOUT_MS = 10000;
// Maximum length of any attacker-controlled field embedded in the prompt.
const PROMPT_FIELD_MAX_LENGTH = 320;

/**
 * Makes an untrusted header value safe to embed as a single prompt line:
 * control characters and line separators become spaces (so a value cannot
 * forge additional "Sender email:" / "Subject:" lines), then the value is
 * trimmed and length-capped.
 */
function sanitizePromptField(value: string, maxLength: number = PROMPT_FIELD_MAX_LENGTH): string {
  return value.replace(/[\p{Cc}\u2028\u2029]+/gu, " ").trim().slice(0, maxLength);
}

/**
 * Parses the model's JSON answer into a verdict.
 *
 * Anything that is not clearly a positive verdict is treated as "not gambling"
 * (conservative PASS): `isGambling` must be the boolean `true` or the string
 * "true"; confidence is clamped to [0, 1]; unparsable or non-object JSON yields
 * the "parse-error" verdict.
 */
function parseGroqVerdict(raw: string): GroqVerdict {
  const parseError: GroqVerdict = { isGambling: false, confidence: 0, reason: "parse-error" };
  try {
    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed !== "object" || parsed === null) return parseError;

    const { isGambling, confidence, reason } = parsed as Record<string, unknown>;
    // Boolean("false") would be true — compare explicitly instead of coercing.
    const positive =
      isGambling === true ||
      (typeof isGambling === "string" && isGambling.trim().toLowerCase() === "true");

    return {
      isGambling: positive,
      confidence:
        typeof confidence === "number" && !Number.isNaN(confidence)
          ? Math.min(1, Math.max(0, confidence))
          : 0,
      reason: typeof reason === "string" ? reason.slice(0, 300) : "",
    };
  } catch {
    // JSON parse failure → conservative PASS (no false positive caused by an LLM error)
    return parseError;
  }
}

/**
 * Calls Groq Llama 3.3 70B for borderline classification.
 * Sends ONLY: senderName, senderEmail (local part redacted where applicable), subject.
 * NO mail body, NO further PII.
 *
 * @returns The parsed verdict; malformed model output yields a non-gambling "parse-error" verdict.
 * @throws Error on a non-2xx response; also rejects on network failure or when
 *         the request exceeds GROQ_REQUEST_TIMEOUT_MS (aborted).
 */
export async function callGroqClassifier(params: {
  senderName: string | null;
  senderEmailRedacted: string;
  subject: string;
  groqApiKey: string;
}): Promise<GroqVerdict> {
  const prompt = `You are a spam classifier for a gambling addiction recovery app.
Classify whether this email is from a gambling/betting operator.

Sender name: ${sanitizePromptField(params.senderName ?? "(none)")}
Sender email: ${sanitizePromptField(params.senderEmailRedacted)}
Subject: ${sanitizePromptField(params.subject)}

Respond with ONLY valid JSON in this exact format:
{"isGambling": true/false, "confidence": 0.0-1.0, "reason": "one sentence"}

Do not include any other text.`;

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), GROQ_REQUEST_TIMEOUT_MS);

  let raw: string;
  try {
    const response = await fetch("https://api.groq.com/openai/v1/chat/completions", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${params.groqApiKey}`,
      },
      body: JSON.stringify({
        model: "llama-3.3-70b-versatile",
        messages: [{ role: "user", content: prompt }],
        temperature: 0,
        max_tokens: 100,
        response_format: { type: "json_object" },
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      const errText = await response.text().catch(() => "");
      throw new Error(`Groq API error ${response.status}: ${errText.slice(0, 200)}`);
    }

    const data = (await response.json()) as {
      choices?: { message?: { content?: unknown } }[];
    };
    const content = data.choices?.[0]?.message?.content;
    raw = typeof content === "string" ? content : "{}";
  } finally {
    // Keep the timer armed until the body has been read, then always release it.
    clearTimeout(timer);
  }

  return parseGroqVerdict(raw);
}

// ─── Main pipeline ────────────────────────────────────────────────────────────

export interface ClassifyMailParams {
  mail: MailInput;
  /** Set of blocked domains (from getBlocklistedDomainsSet) */
  blockedDomainSet: Set<string>;
  /** Groq API key (from runtimeConfig) — when empty, Layer 4 is skipped */
  groqApiKey: string;
}

/** Builds a features record with neutral defaults, overridden by `overrides`. */
function baseFeatures(overrides: Partial<ClassificationFeatures> = {}): ClassificationFeatures {
  return {
    score: 0,
    domainBlocked: false,
    relayDecoded: false,
    brandMatch: false,
    randomTokens: false,
    keywordHitsSubject: [],
    keywordHitsDomain: [],
    keywordHitsName: [],
    styleFlags: [],
    whitelistHit: false,
    ...overrides,
  };
}

/**
 * Classifies a single mail by running it through Layers 1–4.
 * The Groq call is the only external dependency; DB writes
 * (MailBlocked, MailClassificationSample) are the caller's responsibility.
 *
 * Decision order:
 *   1. Whitelist term anywhere in sender/subject/name        → passed ("whitelist")
 *   2. Sender domain (or a parent domain) on the blocklist   → blocked ("domain")
 *      Relay-decoded domain on the blocklist                 → blocked ("relay-decoded")
 *   2.5 Brand match AND random tokens in the local part      → blocked ("brand+random")
 *   3. Score ≥ 80 → blocked; score < 25 → passed ("no-signal")
 *   4. Otherwise Groq decides (if a key is set and score ≤ 75); when Groq is
 *      unavailable or fails, score ≥ 50 → blocked, else passed.
 *
 * NOTE(review): the whitelist runs before the blocklist, so a whitelist term in
 * the subject lets a blocklisted sender through — confirm this is intended.
 */
export async function classifyMail(params: ClassifyMailParams): Promise<ClassificationResult> {
  const { mail, blockedDomainSet, groqApiKey } = params;
  const { senderEmail, senderName, subject } = mail;

  const senderEmailLower = senderEmail.toLowerCase();
  const { localPart, domain } = splitAddress(senderEmailLower);

  // ── Layer 1: Whitelist ──────────────────────────────────────────────────────
  const haystack = `${senderEmailLower} ${subject} ${senderName ?? ""}`.toLowerCase();
  if ((GAMBLING_WHITELIST as string[]).some((w) => haystack.includes(w))) {
    return {
      action: "passed",
      triggerSource: "whitelist",
      score: 0,
      relayDecodedDomain: null,
      features: baseFeatures({ whitelistHit: true }),
    };
  }

  // ── Layer 2: Domain hard block ──────────────────────────────────────────────
  if (isDomainBlocked(domain, blockedDomainSet)) {
    return {
      action: "blocked",
      triggerSource: "domain",
      score: 100,
      relayDecodedDomain: null,
      features: baseFeatures({ score: 100, domainBlocked: true }),
    };
  }

  // ── Layer 2: Relay-decoded domain block ─────────────────────────────────────
  const relayDecodedDomain = extractRelayedDomain(senderEmailLower);
  const relayDecoded = relayDecodedDomain !== null;
  if (relayDecodedDomain !== null && isDomainBlocked(relayDecodedDomain, blockedDomainSet)) {
    return {
      action: "blocked",
      triggerSource: "relay-decoded",
      score: 100,
      relayDecodedDomain,
      features: baseFeatures({ score: 100, relayDecoded: true }),
    };
  }

  // ── Layer 2.5: Brand + random-token hard block ──────────────────────────────
  // Candidates: normalised display name, sender domain and (if any) relay-decoded domain.
  const brandCandidates = [
    normalizeBrand(senderName ?? ""),
    ...domainToBrandCandidates(domain),
    ...(relayDecodedDomain !== null ? domainToBrandCandidates(relayDecodedDomain) : []),
  ];
  const brandMatch = brandCandidates.some((candidate) => matchesGamblingBrand(candidate));
  const randomTokens = hasRandomTokens(localPart);

  if (brandMatch && randomTokens) {
    return {
      action: "blocked",
      triggerSource: "brand+random",
      score: 100,
      relayDecodedDomain,
      features: baseFeatures({ score: 100, relayDecoded, brandMatch: true, randomTokens: true }),
    };
  }

  // ── Layer 3: Score ──────────────────────────────────────────────────────────
  const scoreResult = computeScore(senderEmailLower, senderName, subject, brandMatch, randomTokens);
  const score = scoreResult.score;

  // Shared shape of every score-based outcome; `groq` is only set when Layer 4 answered.
  const scored = (
    action: ClassificationAction,
    triggerSource: TriggerSource,
    groq?: GroqVerdict,
  ): ClassificationResult => ({
    action,
    triggerSource,
    score,
    relayDecodedDomain,
    ...(groq ? { groq } : {}),
    features: { ...scoreResult, domainBlocked: false, relayDecoded, brandMatch, randomTokens },
  });

  if (scoreResult.whitelistHit) {
    return scored("passed", "whitelist");
  }

  // Score >= 80 → hard block, no LLM
  if (score >= SCORE_HARD_BLOCK_THRESHOLD) {
    return scored("blocked", `score:${score}`);
  }

  // Score < 25 → PASS, no LLM
  if (score < SCORE_BORDERLINE_LOW) {
    return scored("passed", "no-signal");
  }

  // ── Layer 4: Groq borderline (25–75) ────────────────────────────────────────
  // The lower bound is already guaranteed by the early return above.
  if (groqApiKey && score <= SCORE_BORDERLINE_HIGH) {
    // Local-part redaction: keep it only when it contains gambling keywords itself.
    const localPartHasKeyword = (GAMBLING_KEYWORDS as string[]).some((kw) =>
      localPart.includes(kw),
    );
    const senderEmailRedacted = redactLocalPartForLLM(senderEmailLower, localPartHasKeyword);

    let groqVerdict: GroqVerdict | null = null;
    try {
      groqVerdict = await callGroqClassifier({
        senderName,
        senderEmailRedacted,
        subject,
        groqApiKey,
      });
    } catch (err) {
      // LLM failure must not decide on its own → fall through to the score-based fallback.
      console.warn("[mail-classifier] Groq call failed, falling back to score-based decision:", err);
    }

    if (groqVerdict) {
      return scored(
        groqVerdict.isGambling ? "blocked" : "passed",
        `llm:${groqVerdict.confidence.toFixed(2)}`,
        groqVerdict,
      );
    }
  }

  // Fallback for borderline scores without a Groq verdict (API error, no key,
  // or score above the borderline range): PASS below 50, BLOCK from 50 upwards.
  return scored(score >= 50 ? "blocked" : "passed", `score:${score}`);
}