rebreak-monorepo/backend/server/utils/mail-classifier.ts
chahinebrini f2e3c00943 refactor(mail): remove groq llm layer — deterministic pipeline only
User-Direktive: Mail-Filter bleibt auf dem deterministischen Score+Layer-2.5-Stack.
Groq-LLM Borderline-Call (Layer 4) entfernt. Layer 2.5 Brand+Random fängt den
Apple Hide-My-Email Fall (icloud.com-Adressen mit kryptischen Local-Parts +
Brand-DisplayName) weiterhin sauber via Hard-Block.

Score-Mid-Range 25-79 entscheidet jetzt deterministisch: ≥50 → BLOCK, sonst PASS.

Damit auch DSGVO-P0-Items aus dem Hans-Müller-Review obsolet
(AVV-Annex Groq, Drittland-USA-Consent-Toggle, Datenschutzerklärung-Absatz).

- mail-classifier.ts: callGroqClassifier + redactLocalPartForLLM + groq-Feld raus
- scan.post.ts + scan-internal.post.ts: groqApiKey-Param raus, groq*-Sample-Felder raus
- mail-classifier.test.ts: Groq-Tests + redactLocalPart-Tests entfernt, 46 Tests grün

DB-Spalten in mail_classification_samples (groq_*) bleiben als legacy nullable —
Cleanup-Migration optional in späterem Sprint.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-14 22:15:32 +02:00

519 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Mail-Klassifikations-Pipeline (deterministisch, ohne LLM).
*
* Architektur:
* Layer 0 — Skip-Guard (bereits geblockt / kein Consent)
* Layer 1 — Whitelist (wetter, wettkampf …) → PASS
* Layer 2 — Domain-Hard-Block (Blocklist)
* Layer 2.5 — Brand+Random-Token-Detection (Hard-Block, fängt Apple Hide-My-Email)
* Layer 3 — Score 0–100 (deterministisch); ≥50 → BLOCK, sonst PASS
* Layer 5 — MailClassificationSample-Insert (immer, außer Layer 0)
*
* Alle Layer-Logiken sind pure Funktionen → vollständig unit-testbar ohne DB-Mocks.
*
* DSGVO-Hinweise:
* - Mail-Inhalte (Body) werden nie persistiert (Art. 9).
* - Keine Daten verlassen mehr den Server (kein LLM-Drittland-Transfer).
* - userId in Logs nur wenn absolut nötig (Datenminimierung Art. 5).
* - MailClassificationSample: Cascade-Delete via userId-Relation (Art. 17).
*/
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore — .mjs ohne types, Exports sind string[]
import { GAMBLING_KEYWORDS, GAMBLING_WHITELIST } from "./gambling-keywords.mjs";
// ─── Typen ─────────────────────────────────────────────────────────────────────
/** Verdict of the classification pipeline for a single mail. */
export type ClassificationAction = "blocked" | "passed";
/**
 * Names the rule that produced the verdict, as assigned by classifyMail.
 */
export type TriggerSource =
// sender domain is contained in the blocked-domain set
| "domain"
// a domain decoded from the relay local-part is contained in the blocked-domain set
| "relay-decoded"
// brand match and random-looking local-part tokens occurred together (Layer 2.5)
| "brand+random"
// verdict derived from the numeric score (used for scores of 25 and above)
| `score:${number}`
// a whitelist term matched
| "whitelist"
// score stayed below the pass threshold
| "no-signal";
/** Header fields of one mail that the classifier inspects. */
export interface MailInput {
/** Sender e-mail address (lowercase, as delivered by IMAP). */
senderEmail: string;
/** Display name of the sender (may be empty or null). */
senderName: string | null;
/** Subject line. */
subject: string;
}
/** Outcome of classifyMail for a single mail. */
export interface ClassificationResult {
/** Whether the mail was blocked or passed. */
action: ClassificationAction;
/** Rule that decided the verdict. */
triggerSource: TriggerSource;
/** 100 for rule-based hard blocks, 0 for whitelist passes, otherwise the computed score. */
score: number;
/** Real domain extracted from a relay address (e.g. gamblezen.com), or null. */
relayDecodedDomain: string | null;
/** Score components for MailClassificationSample.features. */
features: ClassificationFeatures;
}
/** Per-mail signals recorded alongside the verdict. */
export interface ClassificationFeatures {
/** Same value as ClassificationResult.score. */
score: number;
/** True only when the sender's own domain was found in the blocked-domain set. */
domainBlocked: boolean;
/** True when a domain was extracted from the local part; false for verdicts reached before relay decoding runs. */
relayDecoded: boolean;
/** Layer 2.5: display name or a domain candidate matched a known gambling brand. */
brandMatch: boolean;
/** Layer 2.5: the local part contained at least two random-looking tokens. */
randomTokens: boolean;
/** First gambling keyword found in the subject (at most one entry). */
keywordHitsSubject: string[];
/** First gambling keyword found in the sender domain (at most one entry). */
keywordHitsDomain: string[];
/** First gambling keyword found in the sender display name (at most one entry). */
keywordHitsName: string[];
/** Subset of "money-pattern", "urgency", "all-caps", "short-random-domain". */
styleFlags: string[];
/** True when a whitelist term matched and the mail was passed for that reason. */
whitelistHit: boolean;
}
// ─── Score-Weights (TS-Constants, kein Config-File-Overhead) ──────────────────
/**
 * Points added to the score per detected signal. computeScore sums the
 * applicable weights and caps the total at 100.
 */
export const SCORE_WEIGHTS = {
// Domain indicators
DOMAIN_GAMBLING_KEYWORD: 40, // domain contains a gambling term (bet, casino, slots …)
DOMAIN_SHORT_RANDOM: 15, // domain root shorter than 6 chars mixing letters and digits (e.g. 1win)
// Subject indicators
SUBJECT_GAMBLING_KEYWORD: 35, // keyword in the subject (casino, jackpot, freispiel …)
SUBJECT_MONEY_PATTERN: 20, // currency sign (€, $ or £) next to a digit (e.g. "100€ Bonus")
SUBJECT_URGENCY: 15, // urgency phrases such as "nur heute", "letzte chance", "ablaufdatum"
SUBJECT_ALL_CAPS_WORD: 5, // a word of at least four upper-case letters in the subject
// Display-name indicators
SENDER_NAME_GAMBLING_KEYWORD: 30, // gambling term in the sender display name
// NOTE(review): SENDER_NAME_BRAND_MATCH is not read by computeScore in this file — confirm whether it is still needed.
SENDER_NAME_BRAND_MATCH: 20, // name matches a known gambling brand (normalized)
// Layer 2.5 score additions (applied when no hard block was triggered)
BRAND_MATCH_NO_RANDOM: 35, // brand match without random tokens (no hard block)
RANDOM_TOKENS_NO_BRAND: 10, // random tokens without brand match
} as const;
// Hard-block threshold: score >= 80 → BLOCK
const SCORE_HARD_BLOCK_THRESHOLD = 80;
// Pass-below threshold: score < 25 → PASS (trigger "no-signal")
const SCORE_PASS_BELOW = 25;
// Mid-range block threshold: for scores in [25, 80), BLOCK from 50 upwards, otherwise PASS
const SCORE_BLOCK_MIDRANGE = 50;
// ─── Bekannte Gambling-Brands (für Brand-Match-Normalisierung) ─────────────────
// Derived from GAMBLING_KEYWORDS plus typical blocklist domains.
// Normalization rule: lowercase, all special characters and whitespace removed.
// NOTE(review): matchesGamblingBrand tests by substring, so the plain "casino"
// entry already covers every other entry that contains "casino".
const GAMBLING_BRANDS: string[] = [
"casino", "bet365", "bwin", "tipico", "unibet", "betway", "888casino",
"pokerstars", "interwetten", "netbet", "leovegas", "mrgreen",
"betsson", "neobet", "mybet", "lottoland", "betano", "williamhill",
"paddypower", "betfair", "stake", "rolletto", "vbet", "1xbet", "melbet",
"mostbet", "luckyvibe", "spinz", "casinoly", "rabona",
"justcasino", "getslots", "rocketplay", "freshcasino",
"nomnomcasino", "gamblezen", "betandplay",
];
// ─── Relay-Decoder ─────────────────────────────────────────────────────────────
/**
 * Local-part patterns that embed the real sender domain, tried in order.
 *  1. `user=domain.tld`   (SendGrid, Mailchimp, SES bounces)
 *  2. `_at_domain.tld`    (less common, some custom relay setups)
 */
const RELAY_DOMAIN_PATTERNS: readonly RegExp[] = [
  /=([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[+_&]|$)/i,
  /_at_([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[_+]|$)/i,
];

/**
 * Recovers the real target domain hidden inside a relay sender address.
 *
 * Known shapes:
 *   bounces+user=example.com@sendgrid.net     → example.com
 *   track.user=gamblezen.com@mailchimp.com    → gamblezen.com
 *   a1b2c3_user_at_betandplay.com@em.em.xyz   → betandplay.com
 *   user=betandplay.com@bounce.em.example     → betandplay.com
 *
 * Only the part before the first "@" is searched. The embedded domain must be
 * followed by a separator character or the end of the local part.
 *
 * @param senderEmail - Full sender address.
 * @returns The embedded domain in lowercase, or null when the address has no
 *          "@" or no pattern matches.
 */
export function extractRelayedDomain(senderEmail: string): string | null {
  const atIndex = senderEmail.indexOf("@");
  if (atIndex === -1) {
    return null;
  }
  const beforeAt = senderEmail.slice(0, atIndex);

  for (const pattern of RELAY_DOMAIN_PATTERNS) {
    const found = beforeAt.match(pattern);
    if (found) {
      return found[1].toLowerCase();
    }
  }
  return null;
}
// ─── Brand-Normalisierung ──────────────────────────────────────────────────────
/** Characters dropped during brand normalization: whitespace, "-", "." and "_". */
const BRAND_SEPARATOR_CHARS = /[\s\-._]/g;

/**
 * Normalizes a string for brand comparisons: lowercases it and removes all
 * whitespace, hyphens, dots and underscores.
 *
 *   "BetandPlay"        → "betandplay"
 *   "bet-and-play.com"  → "betandplaycom"  (a TLD is not stripped here)
 *
 * @param s - Raw display name, domain or domain label.
 * @returns The normalized string.
 */
export function normalizeBrand(s: string): string {
  const lowered = s.toLowerCase();
  const compact = lowered.replace(BRAND_SEPARATOR_CHARS, "");
  return compact;
}
/**
 * Tells whether an already-normalized string contains a known gambling brand.
 *
 * Matching is by substring, so an exact match is covered as well. Inputs
 * shorter than 4 characters never match, to avoid false positives ("bet" on
 * its own is too short).
 *
 * @param normalized - String produced by normalizeBrand.
 * @returns True when any entry of GAMBLING_BRANDS occurs inside `normalized`.
 */
export function matchesGamblingBrand(normalized: string): boolean {
  if (normalized.length < 4) {
    return false;
  }
  for (const brand of GAMBLING_BRANDS) {
    if (normalized.includes(brand)) {
      return true;
    }
  }
  return false;
}
/**
 * Builds the brand-match candidates for a domain: the normalized first label
 * followed by the normalized full domain.
 *
 *   "betand-play.com" → ["betandplay", "betandplaycom"]
 *
 * @param domain - Domain name (may be empty).
 * @returns Two-element array `[normalizedFirstLabel, normalizedDomain]`.
 */
function domainToBrandCandidates(domain: string): string[] {
  const firstDot = domain.indexOf(".");
  const firstLabel = firstDot === -1 ? domain : domain.slice(0, firstDot);
  const candidates = [firstLabel, domain].map((part) => normalizeBrand(part));
  return candidates;
}
// ─── Random-Token-Detection ───────────────────────────────────────────────────
/** Separators that split an e-mail local part into tokens. */
const LOCAL_PART_TOKEN_SEPARATORS = /[_\-.+]+/;
/** Matches any ASCII letter. */
const HAS_ASCII_LETTER = /[a-z]/i;
/** Matches any ASCII digit. */
const HAS_ASCII_DIGIT = /[0-9]/;
/** Minimum length for a token to count as random-looking. */
const RANDOM_TOKEN_MIN_LENGTH = 6;
/** Number of random-looking tokens that makes the whole local part suspicious. */
const RANDOM_TOKEN_MIN_COUNT = 2;

/**
 * Detects random-looking tokens in the local part of an e-mail address.
 *
 * The local part is split on "_", "-", "." and "+". A token is random-looking
 * when it has at least 6 characters and contains both an ASCII letter and an
 * ASCII digit. Plain function words (info, admin, noreply, support …) never
 * qualify because they contain no digit, so no separate exclusion list is
 * needed.
 *
 * A local part with two or more such tokens counts as random-looking, which
 * is typical for bulk mailers that embed trackable user ids.
 *
 * @param localPart - Part of the address before the "@".
 * @returns True when at least two tokens look random.
 */
export function hasRandomTokens(localPart: string): boolean {
  let randomLooking = 0;
  for (const token of localPart.split(LOCAL_PART_TOKEN_SEPARATORS)) {
    if (token.length < RANDOM_TOKEN_MIN_LENGTH) continue;
    if (!HAS_ASCII_LETTER.test(token) || !HAS_ASCII_DIGIT.test(token)) continue;
    randomLooking += 1;
    // The answer is settled once the threshold is reached; skip remaining tokens.
    if (randomLooking >= RANDOM_TOKEN_MIN_COUNT) return true;
  }
  return false;
}
// ─── Score-Berechnung (Layer 3) ───────────────────────────────────────────────
/** Return shape of computeScore: the capped score plus the signals that produced it. */
interface ScoreResult {
/** Sum of the applicable weights, capped at 100; 0 on a whitelist hit. */
score: number;
/** First gambling keyword found in the subject (at most one entry). */
keywordHitsSubject: string[];
/** First gambling keyword found in the sender domain (at most one entry). */
keywordHitsDomain: string[];
/** First gambling keyword found in the sender display name (at most one entry). */
keywordHitsName: string[];
/** Subset of "money-pattern", "urgency", "all-caps", "short-random-domain". */
styleFlags: string[];
/** True when a whitelist term matched; all other fields are then empty / 0. */
whitelistHit: boolean;
}
/**
 * Layer 3: computes the deterministic gambling score for one mail.
 *
 * A whitelist term in the subject, sender address or sender name
 * short-circuits to score 0 with `whitelistHit: true`. Otherwise the weights
 * from SCORE_WEIGHTS are summed for:
 *  - the first gambling keyword in the domain, the subject and the sender name
 *    (each counted at most once),
 *  - the style signals money pattern, urgency phrase, all-caps word and short
 *    letter+digit domain root,
 *  - the Layer 2.5 result when exactly one of brand match / random tokens fired.
 *
 * @param senderEmail - Sender address; lowercased internally.
 * @param senderName - Display name, or null.
 * @param subject - Subject line; case-sensitive checks use it unchanged.
 * @param brandMatchFound - Layer 2.5 brand-match result.
 * @param randomTokensFound - Layer 2.5 random-token result.
 * @returns The score capped at 100 together with the signals that contributed.
 */
export function computeScore(
  senderEmail: string,
  senderName: string | null,
  subject: string,
  brandMatchFound: boolean,
  randomTokensFound: boolean,
): ScoreResult {
  const lowerSubject = subject.toLowerCase();
  const lowerEmail = senderEmail.toLowerCase();
  const lowerName = (senderName ?? "").toLowerCase();
  const domain = lowerEmail.split("@")[1] ?? "";
  const domainRoot = domain.split(".")[0] ?? "";

  // ── Whitelist check (Layer 1) ──
  const whitelisted = (GAMBLING_WHITELIST as string[]).some(
    (term) => lowerSubject.includes(term) || lowerEmail.includes(term) || lowerName.includes(term),
  );
  if (whitelisted) {
    return {
      score: 0,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: true,
    };
  }

  let total = 0;

  // Finds the first keyword contained in `text`; one hit is enough, so the
  // weight is added at most once per field.
  const firstKeywordIn = (text: string, weight: number): string[] => {
    const hit = (GAMBLING_KEYWORDS as string[]).find((kw) => text.includes(kw));
    if (hit === undefined) {
      return [];
    }
    total += weight;
    return [hit];
  };

  // The domain root is a prefix of the domain, so searching the full domain
  // also covers the root.
  const keywordHitsDomain = firstKeywordIn(domain, SCORE_WEIGHTS.DOMAIN_GAMBLING_KEYWORD);
  const keywordHitsSubject = firstKeywordIn(lowerSubject, SCORE_WEIGHTS.SUBJECT_GAMBLING_KEYWORD);
  const keywordHitsName = firstKeywordIn(lowerName, SCORE_WEIGHTS.SENDER_NAME_GAMBLING_KEYWORD);

  const styleFlags: string[] = [];
  const raise = (flag: string, weight: number): void => {
    styleFlags.push(flag);
    total += weight;
  };

  // ── Money pattern in the subject (currency sign next to a digit) ──
  if (/[€$£]\s*\d|\d\s*[€$£]/.test(subject)) {
    raise("money-pattern", SCORE_WEIGHTS.SUBJECT_MONEY_PATTERN);
  }

  // ── Urgency phrases in the subject ──
  const urgencyPhrases = [
    "nur heute", "letzte chance", "läuft ab", "ablaufdatum",
    "expires", "last chance", "limited time", "jetzt einlösen",
    "sofort", "nur noch", "endet heute",
  ];
  if (urgencyPhrases.some((phrase) => lowerSubject.includes(phrase))) {
    raise("urgency", SCORE_WEIGHTS.SUBJECT_URGENCY);
  }

  // ── All-caps word in the subject ──
  if (/\b[A-Z]{4,}\b/.test(subject)) {
    raise("all-caps", SCORE_WEIGHTS.SUBJECT_ALL_CAPS_WORD);
  }

  // ── Short domain root mixing letters and digits ──
  const rootIsShort = domainRoot.length > 0 && domainRoot.length <= 5;
  if (rootIsShort && /[a-z]/.test(domainRoot) && /[0-9]/.test(domainRoot)) {
    raise("short-random-domain", SCORE_WEIGHTS.DOMAIN_SHORT_RANDOM);
  }

  // ── Layer 2.5 additions: only when exactly one of the two signals fired ──
  if (brandMatchFound !== randomTokensFound) {
    total += brandMatchFound
      ? SCORE_WEIGHTS.BRAND_MATCH_NO_RANDOM
      : SCORE_WEIGHTS.RANDOM_TOKENS_NO_BRAND;
  }

  return {
    score: Math.min(total, 100),
    keywordHitsSubject,
    keywordHitsDomain,
    keywordHitsName,
    styleFlags,
    whitelistHit: false,
  };
}
// ─── Haupt-Pipeline ───────────────────────────────────────────────────────────
/** Arguments of classifyMail. */
export interface ClassifyMailParams {
/** Header fields of the mail to classify. */
mail: MailInput;
/** Set of blocked domains (from getBlocklistedDomainsSet). */
blockedDomainSet: Set<string>;
}
/**
 * Runs a single mail through all classification layers and returns the verdict.
 *
 * Order of evaluation:
 *  1. Whitelist term anywhere in address, subject or display name → passed.
 *  2. Sender domain in the blocked-domain set → blocked ("domain").
 *  3. Relay-decoded domain in the blocked-domain set → blocked ("relay-decoded").
 *  4. Brand match together with random local-part tokens → blocked ("brand+random").
 *  5. Score: >= 80 blocked, < 25 passed ("no-signal"), otherwise blocked from
 *     50 upwards and passed below.
 *
 * Fully deterministic: the function performs no awaits and no external calls;
 * it is `async` only so callers receive a Promise. DB writes (MailBlocked,
 * MailClassificationSample) are left to the caller.
 *
 * @param params - Mail header fields plus the set of blocked domains.
 * @returns Verdict, trigger, score, decoded relay domain and feature record.
 */
export async function classifyMail(params: ClassifyMailParams): Promise<ClassificationResult> {
  const { senderEmail, senderName, subject } = params.mail;
  const blocklist = params.blockedDomainSet;

  const address = senderEmail.toLowerCase();
  const [localPart = "", domain = ""] = address.split("@");

  // Feature record for verdicts reached before any score is computed: all
  // signals start out empty/false and the caller overrides what applies.
  const ruleFeatures = (
    score: number,
    overrides: Partial<ClassificationFeatures>,
  ): ClassificationFeatures => ({
    score,
    domainBlocked: false,
    relayDecoded: false,
    brandMatch: false,
    randomTokens: false,
    keywordHitsSubject: [],
    keywordHitsDomain: [],
    keywordHitsName: [],
    styleFlags: [],
    whitelistHit: false,
    ...overrides,
  });

  // ── Layer 1: whitelist ──────────────────────────────────────────────────────
  const haystack = `${address} ${subject} ${senderName ?? ""}`.toLowerCase();
  if ((GAMBLING_WHITELIST as string[]).some((term) => haystack.includes(term))) {
    return {
      action: "passed",
      triggerSource: "whitelist",
      score: 0,
      relayDecodedDomain: null,
      features: ruleFeatures(0, { whitelistHit: true }),
    };
  }

  // ── Layer 2: domain hard block ──────────────────────────────────────────────
  if (domain && blocklist.has(domain)) {
    return {
      action: "blocked",
      triggerSource: "domain",
      score: 100,
      relayDecodedDomain: null,
      features: ruleFeatures(100, { domainBlocked: true }),
    };
  }

  // ── Layer 2: relay-decoded domain block ─────────────────────────────────────
  const relayDecodedDomain = extractRelayedDomain(address);
  if (relayDecodedDomain && blocklist.has(relayDecodedDomain)) {
    return {
      action: "blocked",
      triggerSource: "relay-decoded",
      score: 100,
      relayDecodedDomain,
      features: ruleFeatures(100, { relayDecoded: true }),
    };
  }
  const relayDecoded = !!relayDecodedDomain;

  // ── Layer 2.5: brand + random-token hard block ──────────────────────────────
  // Candidates: normalized display name, sender domain and (if any) relay domain.
  const brandCandidates = [
    normalizeBrand(senderName ?? ""),
    ...domainToBrandCandidates(domain),
    ...(relayDecodedDomain ? domainToBrandCandidates(relayDecodedDomain) : []),
  ];
  const brandMatch = brandCandidates.some(
    (candidate) => candidate.length >= 4 && matchesGamblingBrand(candidate),
  );
  const randomTokens = hasRandomTokens(localPart);

  if (brandMatch && randomTokens) {
    return {
      action: "blocked",
      triggerSource: "brand+random",
      score: 100,
      relayDecodedDomain,
      features: ruleFeatures(100, { relayDecoded, brandMatch: true, randomTokens: true }),
    };
  }

  // ── Layer 3: score ──────────────────────────────────────────────────────────
  const scored = computeScore(address, senderName, subject, brandMatch, randomTokens);
  const scoredFeatures: ClassificationFeatures = {
    ...scored,
    domainBlocked: false,
    relayDecoded,
    brandMatch,
    randomTokens,
  };

  // computeScore repeats the whitelist check; Layer 1 above inspects the same
  // fields, so this branch is a defensive duplicate.
  if (scored.whitelistHit) {
    return {
      action: "passed",
      triggerSource: "whitelist",
      score: 0,
      relayDecodedDomain,
      features: { ...scoredFeatures, score: 0 },
    };
  }

  const score = scored.score;
  let action: ClassificationAction;
  let triggerSource: TriggerSource;
  if (score >= SCORE_HARD_BLOCK_THRESHOLD) {
    // Score >= 80 → hard block
    action = "blocked";
    triggerSource = `score:${score}`;
  } else if (score < SCORE_PASS_BELOW) {
    // Score < 25 → pass
    action = "passed";
    triggerSource = "no-signal";
  } else {
    // Score 25-79 → pass below 50, block from 50 upwards
    action = score >= SCORE_BLOCK_MIDRANGE ? "blocked" : "passed";
    triggerSource = `score:${score}`;
  }

  return {
    action,
    triggerSource,
    score,
    relayDecodedDomain,
    features: scoredFeatures,
  };
}