rebreak-monorepo/backend/server/utils/mail-classifier.ts
chahinebrini 4573d16e1a refactor(mail-classifier): display-name aus Score-Pfad entfernen (v1.0)
SENDER_NAME_GAMBLING_KEYWORD (+30) und SENDER_NAME_BRAND_MATCH (+20) aus
SCORE_WEIGHTS entfernt. Layer-2.5-Brand-Match prüft nur noch Domain-Root
und Relay-Domain, nicht mehr displayNameNorm. Sender-Name-Keywords-Block
in computeScore() entfernt. keywordHitsName bleibt im Interface für v1.1.

Tests: Brand+Random-Tests die Display-Name als einzige Brand-Source hatten
auf neues v1.0-Verhalten (PASS) umgeschrieben. Zwei neue Tests: Display-Name-
only Casino-Signal → Score=0 → PASS verifiziert.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 05:18:00 +02:00

563 lines
20 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Mail-Klassifikations-Pipeline (deterministisch, ohne LLM).
*
* Architektur:
* Layer 0 — Skip-Guard (bereits geblockt / kein Consent)
* Layer 1 — Whitelist (wetter, wettkampf …) → PASS
* Layer 2 — Domain-Hard-Block (Blocklist)
* Layer 2.5 — Brand+Random-Token-Detection (Hard-Block, fängt Apple Hide-My-Email)
* Layer 3 — Score 0–100 (deterministisch); ≥50 → BLOCK, sonst PASS
* Layer 5 — MailClassificationSample-Insert (immer, außer Layer 0)
*
* Alle Layer-Logiken sind pure Funktionen → vollständig unit-testbar ohne DB-Mocks.
*
* DSGVO-Hinweise:
* - Mail-Inhalte (Body) werden nie persistiert (Art. 9).
* - Keine Daten verlassen mehr den Server (kein LLM-Drittland-Transfer).
* - userId in Logs nur wenn absolut nötig (Datenminimierung Art. 5).
* - MailClassificationSample: Cascade-Delete via userId-Relation (Art. 17).
*/
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore — .mjs ohne types, Exports sind string[]
import { GAMBLING_KEYWORDS, GAMBLING_WHITELIST } from "./gambling-keywords.mjs";
// ─── Typen ─────────────────────────────────────────────────────────────────────
/** Final verdict of the pipeline for one mail: blocked or let through. */
export type ClassificationAction = "blocked" | "passed";
/**
 * Identifies which pipeline stage produced the verdict of `classifyMail`.
 */
export type TriggerSource =
// Sender domain is contained in the blocked-domain set.
| "domain"
// Domain decoded from a relay local-part is contained in the blocked-domain set.
| "relay-decoded"
// A gambling brand matched AND the local-part contains random-looking tokens.
| "brand+random"
// A user-defined display-name pattern matched the sender name.
| "custom-display-name"
// Score-based verdict; the number is the computed score (used for scores >= 25).
| `score:${number}`
// A whitelist term matched; the mail passes.
| "whitelist"
// Score below the pass threshold; the mail passes.
| "no-signal";
/** Header fields of one incoming mail that the classifier inspects. */
export interface MailInput {
/** Sender e-mail address (lowercase, as delivered by IMAP). */
senderEmail: string;
/** Display name of the sender (may be empty or null). */
senderName: string | null;
/** Subject line. */
subject: string;
}
/** Outcome of `classifyMail` for a single mail. */
export interface ClassificationResult {
/** Whether the mail is blocked or passed. */
action: ClassificationAction;
/** Pipeline stage that produced the verdict. */
triggerSource: TriggerSource;
/** 0 for whitelist passes, 100 for hard blocks, otherwise the computed Layer-3 score. */
score: number;
/**
 * Real domain extracted from a relay address (e.g. gamblezen.com).
 * null when nothing was extracted or the verdict was reached before decoding ran.
 */
relayDecodedDomain: string | null;
/** Score components for MailClassificationSample.features. */
features: ClassificationFeatures;
}
/** Per-mail signal breakdown that accompanies every classification result. */
export interface ClassificationFeatures {
/** Same value as ClassificationResult.score. */
score: number;
/** True when the sender domain itself was found in the blocked-domain set. */
domainBlocked: boolean;
/** True when a relay domain was decoded from the local-part (false on verdicts reached before decoding). */
relayDecoded: boolean;
/** True when a domain or relay-domain candidate matched a known gambling brand. */
brandMatch: boolean;
/** True when the local-part contains at least two random-looking tokens. */
randomTokens: boolean;
/** Gambling keyword found in the subject (at most one entry: the first match). */
keywordHitsSubject: string[];
/** Gambling keyword found in the sender domain (at most one entry: the first match). */
keywordHitsDomain: string[];
/** Always empty in v1.0; kept in the shape for a later display-name reactivation. */
keywordHitsName: string[];
/** Style signals that fired: "money-pattern", "urgency", "all-caps", "short-random-domain". */
styleFlags: string[];
/** True when a whitelist term matched. */
whitelistHit: boolean;
}
// ─── Score-Weights (TS-Constants, kein Config-File-Overhead) ──────────────────
/**
 * Points that `computeScore` adds per detected signal. Each weight is applied
 * at most once per mail; the summed score is capped at 100.
 */
export const SCORE_WEIGHTS = {
// Domain indicators
DOMAIN_GAMBLING_KEYWORD: 40, // domain contains a gambling term (bet, casino, slots …)
DOMAIN_SHORT_RANDOM: 15, // domain root shorter than 6 chars with letters and digits (betx, 1win)
// Subject indicators
SUBJECT_GAMBLING_KEYWORD: 50, // keyword in the subject (casino, jackpot, freispiel …)
SUBJECT_MONEY_PATTERN: 20, // currency symbol next to a number (e.g. "100€ Bonus")
SUBJECT_URGENCY: 15, // urgency phrases ("Nur heute", "Letzte Chance", "Ablaufdatum")
SUBJECT_ALL_CAPS_WORD: 5, // a single ALL-CAPS word in the subject
// Display-name indicators: removed in v1.0 (too prone to false positives).
// v1.1: re-enable SENDER_NAME_GAMBLING_KEYWORD and SENDER_NAME_BRAND_MATCH
// once display-name blocking UX and testing are complete.
// Layer 2.5 score additions (when no hard block was triggered)
BRAND_MATCH_NO_RANDOM: 35, // brand match without random tokens (no hard block)
RANDOM_TOKENS_NO_BRAND: 10, // random tokens without brand match
} as const;
// Score bands used by classifyMail. With the current values the three bands
// collapse to one effective rule: score >= 50 blocks, anything lower passes.
// Only the reported triggerSource differs ("no-signal" below 25, "score:N" otherwise).
// Hard-block threshold: score >= 80 → BLOCK
const SCORE_HARD_BLOCK_THRESHOLD = 80;
// Pass-below: score < 25 → PASS (no-signal)
const SCORE_PASS_BELOW = 25;
// Mid-range block threshold: score in [25, 80) → BLOCK from 50 upwards, otherwise PASS
const SCORE_BLOCK_MIDRANGE = 50;
// ─── Bekannte Gambling-Brands (für Brand-Match-Normalisierung) ─────────────────
// Known gambling brand names, compared by substring in matchesGamblingBrand.
// Derived from GAMBLING_KEYWORDS plus typical blocklist domains.
// Entries are stored in normalised form: lowercase, with no whitespace,
// hyphens, dots or underscores (the characters normalizeBrand strips).
const GAMBLING_BRANDS: string[] = [
"casino", "bet365", "bwin", "tipico", "unibet", "betway", "888casino",
"pokerstars", "interwetten", "netbet", "leovegas", "mrgreen",
"betsson", "neobet", "mybet", "lottoland", "betano", "williamhill",
"paddypower", "betfair", "stake", "rolletto", "vbet", "1xbet", "melbet",
"mostbet", "luckyvibe", "spinz", "casinoly", "rabona",
"justcasino", "getslots", "rocketplay", "freshcasino",
"nomnomcasino", "gamblezen", "betandplay",
];
// ─── Relay-Decoder ─────────────────────────────────────────────────────────────
/**
 * Recovers the real target domain that a mail relay encoded into the
 * local-part of its envelope address.
 *
 * Recognised shapes:
 *   bounces+user=example.com@sendgrid.net   → example.com
 *   track.user=gamblezen.com@mailchimp.com  → gamblezen.com
 *   a1b2c3_user_at_betandplay.com@em.em.xyz → betandplay.com
 *   user=betandplay.com@bounce.em.example   → betandplay.com
 *
 * Only the text before the first "@" is inspected. The `=domain.tld` form is
 * tried first, then `_at_domain.tld`. A domain must be followed by a
 * separator or the end of the local-part, and may carry one extra label
 * (e.g. "example.co.uk").
 *
 * @param senderEmail Full sender address.
 * @returns The decoded domain in lowercase, or null when the address has no
 *          "@" or no relay pattern matches.
 */
export function extractRelayedDomain(senderEmail: string): string | null {
  const atIndex = senderEmail.indexOf("@");
  if (atIndex === -1) {
    return null;
  }
  const local = senderEmail.slice(0, atIndex);

  // Ordered by priority: the first pattern that matches wins.
  const relayPatterns: RegExp[] = [
    // user=domain.tld (SendGrid, Mailchimp, SES bounces)
    /=([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[+_&]|$)/i,
    // _at_domain.tld (less common, some custom relay setups)
    /_at_([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[_+]|$)/i,
  ];

  for (const pattern of relayPatterns) {
    const hit = pattern.exec(local);
    if (hit !== null) {
      return hit[1].toLowerCase();
    }
  }
  return null;
}
// ─── Brand-Normalisierung ──────────────────────────────────────────────────────
/**
 * Normalises a string for brand comparison: lowercases it and removes all
 * whitespace, hyphens, dots and underscores.
 *
 *   "BetandPlay"       → "betandplay"
 *   "bet-and-play"     → "betandplay"
 *   "bet-and-play.com" → "betandplaycom"  (the TLD is NOT stripped; pass the
 *                                          domain root to get a TLD-free form)
 *
 * @param s Raw brand, domain or name fragment.
 * @returns The normalised string (possibly empty).
 */
export function normalizeBrand(s: string): string {
  const lowered = s.toLowerCase();
  // Splitting on every separator and re-joining drops the separators.
  return lowered.split(/[\s\-._]/).join("");
}
/**
 * Tells whether an already-normalised string contains a known gambling brand
 * from GAMBLING_BRANDS.
 *
 * Inputs shorter than 4 characters never match, which avoids false positives
 * on very short strings. Matching is by substring, so an exact match is
 * covered as well.
 *
 * NOTE(review): substring matching also fires on unrelated words that merely
 * contain a brand (e.g. "mistakes" contains "stake") — confirm this is intended.
 *
 * @param normalized Candidate string, expected to be normalizeBrand output.
 * @returns true when any known brand occurs inside the candidate.
 */
export function matchesGamblingBrand(normalized: string): boolean {
  if (normalized.length < 4) {
    return false;
  }
  for (const brand of GAMBLING_BRANDS) {
    if (normalized.includes(brand)) {
      return true;
    }
  }
  return false;
}
/**
 * Builds the brand-match candidates for a domain: the normalised first label
 * followed by the normalised full domain.
 *
 *   "betand-play.com" → ["betandplay", "betandplaycom"]
 *
 * @param domain Domain name (may be empty).
 * @returns Two-element array: [normalised root, normalised full domain].
 */
function domainToBrandCandidates(domain: string): string[] {
  const firstDot = domain.indexOf(".");
  const root = firstDot === -1 ? domain : domain.slice(0, firstDot);
  return [root, domain].map((part) => normalizeBrand(part));
}
// ─── Random-Token-Detection ───────────────────────────────────────────────────
/**
 * Functional mailbox words that must never count as random tokens.
 * Hoisted to module scope so the set is built once, not on every call.
 *
 * NOTE(review): with the current rules this check is redundant. A random
 * token must contain a digit and none of these words does, and "no-reply"
 * can never appear as a single token because "-" is a separator. It is kept
 * as a safety net in case the letter+digit rule is relaxed later.
 */
const RANDOM_TOKEN_FUNCTION_WORDS: ReadonlySet<string> = new Set([
  "info", "admin", "noreply", "no-reply", "support", "hello",
  "news", "marketing", "sales", "contact", "newsletter", "service",
  "offers", "promotions", "promo", "team", "mail", "email",
  "reply", "bounce", "return", "postmaster", "mailer",
]);

/** Separators that split a local-part into tokens. */
const LOCAL_PART_SEPARATORS = /[_\-.+]+/;
/** At least one ASCII letter (case-insensitive). */
const HAS_LETTER = /[a-z]/i;
/** At least one ASCII digit. */
const HAS_DIGIT = /[0-9]/;

/**
 * Detects random-looking tokens in the local-part of an e-mail address.
 *
 * A token is "random" when it is at least 6 characters long, mixes ASCII
 * letters with digits, and is not a known functional word (info, admin,
 * noreply, support …). Tokens are separated by "_", "-", "." or "+".
 *
 * A local-part with two or more such tokens counts as random-looking —
 * typical for bulk mailers that embed trackable user ids.
 *
 * @param localPart Text before the "@" of the sender address.
 * @returns true when at least two random tokens are present.
 */
export function hasRandomTokens(localPart: string): boolean {
  let randomCount = 0;
  for (const token of localPart.split(LOCAL_PART_SEPARATORS)) {
    if (token.length < 6) continue;
    // Must contain both letters and digits.
    if (!HAS_LETTER.test(token) || !HAS_DIGIT.test(token)) continue;
    if (RANDOM_TOKEN_FUNCTION_WORDS.has(token.toLowerCase())) continue;
    randomCount += 1;
    // Two hits decide the result; remaining tokens need not be scanned.
    if (randomCount >= 2) return true;
  }
  return false;
}
// ─── Score-Berechnung (Layer 3) ───────────────────────────────────────────────
/** Return shape of `computeScore`: the capped score plus the signals that produced it. */
interface ScoreResult {
/** Sum of all triggered weights, capped at 100; 0 on a whitelist hit. */
score: number;
/** First gambling keyword found in the subject, if any (at most one entry). */
keywordHitsSubject: string[];
/** First gambling keyword found in the sender domain, if any (at most one entry). */
keywordHitsDomain: string[];
/** Always empty in v1.0; kept for a later display-name reactivation. */
keywordHitsName: string[];
/** Style signals that fired, in evaluation order. */
styleFlags: string[];
/** True when a whitelist term matched and scoring was skipped. */
whitelistHit: boolean;
}
/**
 * Layer 3: computes the deterministic gambling score (0–100) for one mail.
 *
 * A whitelist term in the subject, sender address or sender name
 * short-circuits to score 0 with `whitelistHit: true`. Otherwise each signal
 * below adds its SCORE_WEIGHTS value at most once:
 *   - gambling keyword in the sender domain
 *   - gambling keyword in the subject
 *   - currency symbol next to a digit in the subject
 *   - urgency phrase in the subject
 *   - an ASCII all-caps word of 4+ letters in the subject
 *   - domain root of at most 5 characters mixing letters and digits
 *   - brand match without random tokens, or random tokens without brand match
 *
 * The display name contributes nothing to the score in v1.0 (too prone to
 * false positives); `keywordHitsName` is therefore always empty.
 *
 * @param senderEmail Sender address; lowercased internally.
 * @param senderName Display name; only used for the whitelist check.
 * @param subject Subject line.
 * @param brandMatchFound Whether Layer 2.5 found a gambling brand.
 * @param randomTokensFound Whether Layer 2.5 found random local-part tokens.
 * @returns The capped score together with the signals that fired.
 */
export function computeScore(
  senderEmail: string,
  senderName: string | null,
  subject: string,
  brandMatchFound: boolean,
  randomTokensFound: boolean,
): ScoreResult {
  const lowerSubject = subject.toLowerCase();
  const lowerEmail = senderEmail.toLowerCase();
  const lowerName = (senderName ?? "").toLowerCase();
  const senderDomain = lowerEmail.split("@")[1] ?? "";
  const senderDomainRoot = senderDomain.split(".")[0] ?? "";

  const whitelistTerms = GAMBLING_WHITELIST as string[];
  const gamblingTerms = GAMBLING_KEYWORDS as string[];

  // Whitelist check (Layer 1): any hit ends scoring immediately.
  const whitelisted = whitelistTerms.some(
    (term) =>
      lowerSubject.includes(term) || lowerEmail.includes(term) || lowerName.includes(term),
  );
  if (whitelisted) {
    return {
      score: 0,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: true,
    };
  }

  let total = 0;
  const keywordHitsSubject: string[] = [];
  const keywordHitsDomain: string[] = [];
  // Sender-name keywords were removed in v1.0; the list stays empty and is
  // only kept in the result for a v1.1 reactivation.
  const keywordHitsName: string[] = [];
  const styleFlags: string[] = [];

  // Records a style flag and adds its weight.
  const raiseFlag = (flag: string, weight: number): void => {
    styleFlags.push(flag);
    total += weight;
  };

  // Domain keywords: the first match is enough.
  const domainKeyword = gamblingTerms.find(
    (kw) => senderDomain.includes(kw) || senderDomainRoot.includes(kw),
  );
  if (domainKeyword !== undefined) {
    keywordHitsDomain.push(domainKeyword);
    total += SCORE_WEIGHTS.DOMAIN_GAMBLING_KEYWORD;
  }

  // Subject keywords: the first match is enough.
  const subjectKeyword = gamblingTerms.find((kw) => lowerSubject.includes(kw));
  if (subjectKeyword !== undefined) {
    keywordHitsSubject.push(subjectKeyword);
    total += SCORE_WEIGHTS.SUBJECT_GAMBLING_KEYWORD;
  }

  // Money pattern in the subject (currency symbol next to a digit).
  if (/[€$£]\s*\d|\d\s*[€$£]/.test(subject)) {
    raiseFlag("money-pattern", SCORE_WEIGHTS.SUBJECT_MONEY_PATTERN);
  }

  // Urgency phrases in the subject.
  const urgencyPhrases = [
    "nur heute", "letzte chance", "läuft ab", "ablaufdatum",
    "expires", "last chance", "limited time", "jetzt einlösen",
    "sofort", "nur noch", "endet heute",
  ];
  if (urgencyPhrases.some((phrase) => lowerSubject.includes(phrase))) {
    raiseFlag("urgency", SCORE_WEIGHTS.SUBJECT_URGENCY);
  }

  // All-caps word (4+ ASCII letters) in the original-case subject.
  if (/\b[A-Z]{4,}\b/.test(subject)) {
    raiseFlag("all-caps", SCORE_WEIGHTS.SUBJECT_ALL_CAPS_WORD);
  }

  // Short domain root that mixes letters and digits.
  const rootIsShort = senderDomainRoot.length > 0 && senderDomainRoot.length <= 5;
  if (rootIsShort && /[a-z]/.test(senderDomainRoot) && /[0-9]/.test(senderDomainRoot)) {
    raiseFlag("short-random-domain", SCORE_WEIGHTS.DOMAIN_SHORT_RANDOM);
  }

  // Layer 2.5 additions: exactly one of the two signals present.
  if (brandMatchFound !== randomTokensFound) {
    total += brandMatchFound
      ? SCORE_WEIGHTS.BRAND_MATCH_NO_RANDOM
      : SCORE_WEIGHTS.RANDOM_TOKENS_NO_BRAND;
  }

  return {
    score: Math.min(total, 100),
    keywordHitsSubject,
    keywordHitsDomain,
    keywordHitsName,
    styleFlags,
    whitelistHit: false,
  };
}
// ─── Haupt-Pipeline ───────────────────────────────────────────────────────────
/** Input bundle for `classifyMail`. */
export interface ClassifyMailParams {
/** Header fields of the mail to classify. */
mail: MailInput;
/** Set of blocked domains (from getBlocklistedDomainsSet). */
blockedDomainSet: Set<string>;
/**
 * User-specific display-name patterns (from getCustomMailDisplayNames).
 * Layer 2.6: case-insensitive substring match against senderName.
 * Empty array when the user has not set any display-name patterns.
 *
 * GDPR: no PII — pure heuristic patterns (e.g. ["EXTRASPIN"]).
 */
customDisplayNames?: string[];
}
/**
 * Builds the result for a verdict reached before scoring (Layers 1 to 2.6).
 * Such results carry no keyword hits or style flags; only the boolean
 * signals differ between layers.
 */
function buildShortCircuitResult(
  action: ClassificationAction,
  triggerSource: TriggerSource,
  score: number,
  relayDecodedDomain: string | null,
  signals: {
    domainBlocked: boolean;
    relayDecoded: boolean;
    brandMatch: boolean;
    randomTokens: boolean;
    whitelistHit: boolean;
  },
): ClassificationResult {
  return {
    action,
    triggerSource,
    score,
    relayDecodedDomain,
    features: {
      score,
      domainBlocked: signals.domainBlocked,
      relayDecoded: signals.relayDecoded,
      brandMatch: signals.brandMatch,
      randomTokens: signals.randomTokens,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: signals.whitelistHit,
    },
  };
}

/**
 * Classifies a single mail by running it through every layer in order.
 *
 * Order of evaluation (first verdict wins):
 *   Layer 1   — whitelist term anywhere in address, subject or name → PASS
 *   Layer 2   — sender domain in the blocked set → BLOCK
 *   Layer 2   — relay-decoded domain in the blocked set → BLOCK
 *   Layer 2.5 — gambling brand AND random local-part tokens → BLOCK
 *   Layer 2.6 — user-defined display-name pattern matches → BLOCK
 *   Layer 3   — deterministic score; >= 50 → BLOCK, otherwise PASS
 *
 * Fully deterministic — no external calls, no PII leaves the server.
 * DB writes (MailBlocked, MailClassificationSample) are the caller's job.
 *
 * @param params Mail headers, blocked-domain set and optional user patterns.
 * @returns The verdict, its trigger, the score and the signal breakdown.
 */
export async function classifyMail(params: ClassifyMailParams): Promise<ClassificationResult> {
  const { mail, blockedDomainSet, customDisplayNames } = params;
  const { senderEmail, senderName, subject } = mail;

  const lowerEmail = senderEmail.toLowerCase();
  const addressParts = lowerEmail.split("@");
  const localPart = addressParts[0] ?? "";
  const domain = addressParts[1] ?? "";

  // ── Layer 1: whitelist ──────────────────────────────────────────────────
  const haystack = `${lowerEmail} ${subject} ${senderName ?? ""}`.toLowerCase();
  const whitelisted = (GAMBLING_WHITELIST as string[]).some((term) => haystack.includes(term));
  if (whitelisted) {
    return buildShortCircuitResult("passed", "whitelist", 0, null, {
      domainBlocked: false,
      relayDecoded: false,
      brandMatch: false,
      randomTokens: false,
      whitelistHit: true,
    });
  }

  // ── Layer 2: domain hard block ──────────────────────────────────────────
  if (domain && blockedDomainSet.has(domain)) {
    return buildShortCircuitResult("blocked", "domain", 100, null, {
      domainBlocked: true,
      relayDecoded: false,
      brandMatch: false,
      randomTokens: false,
      whitelistHit: false,
    });
  }

  // ── Layer 2: relay-decoded domain block ─────────────────────────────────
  const relayDecodedDomain = extractRelayedDomain(lowerEmail);
  if (relayDecodedDomain && blockedDomainSet.has(relayDecodedDomain)) {
    return buildShortCircuitResult("blocked", "relay-decoded", 100, relayDecodedDomain, {
      domainBlocked: false,
      relayDecoded: true,
      brandMatch: false,
      randomTokens: false,
      whitelistHit: false,
    });
  }

  const relayDecoded = Boolean(relayDecodedDomain);

  // ── Layer 2.5: brand + random-token hard block ──────────────────────────
  // The brand match only looks at the domain root and the relay domain, never
  // the display name: display-name brand matching was removed in v1.0 (too
  // prone to false positives). v1.1: add the normalised display name back to
  // the candidates once UX and testing are done.
  const brandCandidates = [
    ...domainToBrandCandidates(domain),
    ...(relayDecodedDomain ? domainToBrandCandidates(relayDecodedDomain) : []),
  ];
  const brandMatch = brandCandidates.some(
    (candidate) => candidate.length >= 4 && matchesGamblingBrand(candidate),
  );
  const randomTokens = hasRandomTokens(localPart);
  if (brandMatch && randomTokens) {
    return buildShortCircuitResult("blocked", "brand+random", 100, relayDecodedDomain, {
      domainBlocked: false,
      relayDecoded,
      brandMatch: true,
      randomTokens: true,
      whitelistHit: false,
    });
  }

  // ── Layer 2.6: user custom display-name hard block ──────────────────────
  // Display-name patterns are disabled in v1.0 — re-enable when the
  // display-name input UX ships (v1.1). getCustomMailDisplayNames() returns []
  // until mail_display_name rows exist, so this branch is dead code in
  // practice. The logic is kept intact for trivial re-activation.
  //
  // User patterns (e.g. "EXTRASPIN") match case-insensitively as a substring
  // of the sender display name, so "EXTRASPIN Casino" and "ExtraSpin Bonus"
  // are both caught. No score is involved: a match is a direct hard block.
  // Gambling brands actively rotate capitalisation, hence case-insensitive.
  if (senderName && customDisplayNames && customDisplayNames.length > 0) {
    const lowerName = senderName.toLowerCase();
    const patternMatched = customDisplayNames.some(
      (pattern) => pattern.length > 0 && lowerName.includes(pattern.toLowerCase()),
    );
    if (patternMatched) {
      return buildShortCircuitResult("blocked", "custom-display-name", 100, relayDecodedDomain, {
        domainBlocked: false,
        relayDecoded,
        brandMatch,
        randomTokens,
        whitelistHit: false,
      });
    }
  }

  // ── Layer 3: score ──────────────────────────────────────────────────────
  const scoreResult = computeScore(lowerEmail, senderName, subject, brandMatch, randomTokens);

  // Shared by every score-based verdict: score components plus Layer 2.x signals.
  const scoredFeatures: ClassificationFeatures = {
    ...scoreResult,
    domainBlocked: false,
    relayDecoded,
    brandMatch,
    randomTokens,
  };

  if (scoreResult.whitelistHit) {
    return {
      action: "passed",
      triggerSource: "whitelist",
      score: 0,
      relayDecodedDomain,
      features: { ...scoredFeatures, score: 0 },
    };
  }

  const score = scoreResult.score;
  const scoreTrigger: TriggerSource = `score:${score}`;

  let action: ClassificationAction;
  let triggerSource: TriggerSource = scoreTrigger;
  if (score >= SCORE_HARD_BLOCK_THRESHOLD) {
    // Score >= 80 → hard block.
    action = "blocked";
  } else if (score < SCORE_PASS_BELOW) {
    // Score < 25 → pass without a notable signal.
    action = "passed";
    triggerSource = "no-signal";
  } else {
    // Score 25–79 → pass below 50, block from 50 upwards.
    action = score >= SCORE_BLOCK_MIDRANGE ? "blocked" : "passed";
  }

  return {
    action,
    triggerSource,
    score,
    relayDecodedDomain,
    features: scoredFeatures,
  };
}