rebreak-monorepo/backend/server/utils/mail-classifier.ts
chahinebrini c3de7055a5 feat(mail): Sucht-Compound-Regel + Phase-1-Training-Foundation
Task B — linguistische FP-Fix:
- mail-classifier.ts: Subject-Keyword-Loop überspringt Keyword-Score wenn
  Subject das Keyword als Sucht-Compound enthält (z.B. "glücksspiel" in
  "Glücksspielsucht" → kein +50 Score). Globale linguistische Invariante
  Deutsch — Gambling-Marketer schreiben nie "Glücksspielsucht-Bonus".
- gambling-keywords.mjs: GAMBLING_WHITELIST erweitert um Stamm-Varianten
  (wettsucht, spielsucht, suchtberatung, suchthilfe) als Fallback für
  Compounds wo keyword ≠ exakter Stamm.
- 4 neue Tests: Forum Glücksspielsucht → PASS, Hilfe bei Spielsucht → PASS,
  Wettsucht-Selbsthilfe → PASS, Glücksspiel-Bonus 100€ → BLOCK.

Task C — Phase-1-Data-Foundation:
- mail-training-utils.ts: sanitizeSubjectForTraining() (PII-Stripping via
  Regex: EMAIL/URL/NUM/Greeting/ALL-CAPS) + detectSubjectLanguage() via
  franc (iso639-3). 26 Unit-Tests.
- franc@6.2.0 installiert (~50KB ESM).
- mail.ts insertMailClassificationSample(): ruft sanitizeSubjectForTraining()
  auf, schreibt detectedLang + subjectSanitized in features-JSON
  (Interim bis Schema-Migration).
- mail-retention-cron.ts: Subject-Nullification nach 30 Tagen (täglich) +
  Sample-Purge nach 12 Monaten (monatlich). DSGVO Art. 5 Abs. 1e.

105 Tests grün (58 classifier + 26 training-utils + 11 display-name + 10 gmail).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 08:14:57 +02:00

571 lines
20 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Mail-Klassifikations-Pipeline (deterministisch, ohne LLM).
*
* Architektur:
* Layer 0 — Skip-Guard (bereits geblockt / kein Consent)
* Layer 1 — Whitelist (wetter, wettkampf …) → PASS
* Layer 2 — Domain-Hard-Block (Blocklist)
* Layer 2.5 — Brand+Random-Token-Detection (Hard-Block, fängt Apple Hide-My-Email)
* Layer 3 — Score 0-100 (deterministisch); ≥50 → BLOCK, sonst PASS
* Layer 5 — MailClassificationSample-Insert (immer, außer Layer 0)
*
* Alle Layer-Logiken sind pure Funktionen → vollständig unit-testbar ohne DB-Mocks.
*
* DSGVO-Hinweise:
* - Mail-Inhalte (Body) werden nie persistiert (Art. 9).
* - Keine Daten verlassen mehr den Server (kein LLM-Drittland-Transfer).
* - userId in Logs nur wenn absolut nötig (Datenminimierung Art. 5).
* - MailClassificationSample: Cascade-Delete via userId-Relation (Art. 17).
*/
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore — .mjs ohne types, Exports sind string[]
import { GAMBLING_KEYWORDS, GAMBLING_WHITELIST } from "./gambling-keywords.mjs";
// ─── Types ─────────────────────────────────────────────────────────────────────
/** Final verdict of the pipeline for a single mail. */
export type ClassificationAction = "blocked" | "passed";
/**
 * Names the layer or signal that produced the verdict:
 * - "domain": sender domain is on the blocklist
 * - "relay-decoded": domain decoded from a relay address is on the blocklist
 * - "brand+random": gambling brand in the domain plus random tokens in the local part
 * - "custom-display-name": display name contains a configured pattern
 * - `score:<n>`: decided by the score, with the score value embedded
 * - "whitelist": a whitelist term matched
 * - "no-signal": the score stayed below the pass threshold
 */
export type TriggerSource =
| "domain"
| "relay-decoded"
| "brand+random"
| "custom-display-name"
| `score:${number}`
| "whitelist"
| "no-signal";
/** Header fields of a mail that the classifier inspects (the body is never used). */
export interface MailInput {
/** Sender e-mail address (lowercase, as delivered by IMAP) */
senderEmail: string;
/** Display name of the sender (may be empty) */
senderName: string | null;
/** Subject line */
subject: string;
}
/** Verdict for one mail together with the signals that led to it. */
export interface ClassificationResult {
/** Whether the mail is blocked or passed. */
action: ClassificationAction;
/** Layer or signal that decided the verdict. */
triggerSource: TriggerSource;
/** Score of the mail; hard-block layers report 100, whitelist hits report 0. */
score: number;
/** Real domain extracted from a relay address (e.g. gamblezen.com), or null */
relayDecodedDomain: string | null;
/** Score components intended for MailClassificationSample.features */
features: ClassificationFeatures;
}
/** Flat record of every signal evaluated for a mail. */
export interface ClassificationFeatures {
/** Same value as the score reported on the result. */
score: number;
/** True only when the sender domain itself was found in the blocklist. */
domainBlocked: boolean;
/** True when a domain was decoded from a relay address; paths that return before decoding report false. */
relayDecoded: boolean;
/** A known gambling brand was found in the sender domain or relay-decoded domain. */
brandMatch: boolean;
/** The local part of the address contains at least two random-looking tokens. */
randomTokens: boolean;
/** Gambling keyword that matched the subject (at most one entry; scanning stops at the first hit). */
keywordHitsSubject: string[];
/** Gambling keyword that matched the sender domain (at most one entry). */
keywordHitsDomain: string[];
/** Always empty in this version; display-name keyword scoring is disabled. */
keywordHitsName: string[];
/** Style signals found: "money-pattern", "urgency", "all-caps", "short-random-domain". */
styleFlags: string[];
/** A whitelist term matched, which forces a pass with score 0. */
whitelistHit: boolean;
}
// ─── Score weights (TS constants, no config-file overhead) ────────────────────
/** Points each signal adds to the Layer-3 score; the sum is capped at 100 by computeScore. */
export const SCORE_WEIGHTS = {
// Domain indicators
DOMAIN_GAMBLING_KEYWORD: 40, // sender domain contains a gambling keyword (e.g. bet, casino, slots)
DOMAIN_SHORT_RANDOM: 15, // domain root shorter than 6 characters that mixes letters and digits (e.g. 1win)
// Subject indicators
SUBJECT_GAMBLING_KEYWORD: 50, // gambling keyword in the subject (e.g. casino, jackpot, freispiel)
SUBJECT_MONEY_PATTERN: 20, // currency sign (€, $, £) next to a digit, e.g. "100€ Bonus"
SUBJECT_URGENCY: 15, // urgency phrase such as "nur heute", "letzte chance", "ablaufdatum"
SUBJECT_ALL_CAPS_WORD: 5, // at least one ALL-CAPS word in the subject
// Display-name indicators: removed in v1.0 (too prone to false positives).
// v1.1: re-enable SENDER_NAME_GAMBLING_KEYWORD and SENDER_NAME_BRAND_MATCH
// once display-name blocking UX and testing are complete.
// Layer 2.5 score contributions (applied when no hard block was triggered)
BRAND_MATCH_NO_RANDOM: 35, // brand match without random tokens (no hard block)
RANDOM_TOKENS_NO_BRAND: 10, // random tokens without a brand match
} as const;
// Hard-block threshold: score >= 80 → BLOCK
const SCORE_HARD_BLOCK_THRESHOLD = 80;
// Pass-below threshold: score < 25 → PASS with trigger "no-signal"
const SCORE_PASS_BELOW = 25;
// Mid-range threshold: for scores in [25, 80), BLOCK from 50 upwards, otherwise PASS
const SCORE_BLOCK_MIDRANGE = 50;
// ─── Known gambling brands (for brand matching after normalization) ───────────
// Derived from GAMBLING_KEYWORDS plus typical blocklist domains.
// Entries are stored in normalized form: lowercase, no special characters or
// spaces, so they can be compared against the output of normalizeBrand().
const GAMBLING_BRANDS: string[] = [
"casino", "bet365", "bwin", "tipico", "unibet", "betway", "888casino",
"pokerstars", "interwetten", "netbet", "leovegas", "mrgreen",
"betsson", "neobet", "mybet", "lottoland", "betano", "williamhill",
"paddypower", "betfair", "stake", "rolletto", "vbet", "1xbet", "melbet",
"mostbet", "luckyvibe", "spinz", "casinoly", "rabona",
"justcasino", "getslots", "rocketplay", "freshcasino",
"nomnomcasino", "gamblezen", "betandplay",
];
// ─── Relay decoder ─────────────────────────────────────────────────────────────
/**
 * Recovers the real target domain that a mail relay encoded into the local
 * part of the sender address.
 *
 * Recognized shapes:
 *   bounces+user=example.com@sendgrid.net      → example.com
 *   track.user=gamblezen.com@mailchimp.com     → gamblezen.com
 *   a1b2c3_user_at_betandplay.com@em.em.xyz    → betandplay.com
 *   user=betandplay.com@bounce.em.example      → betandplay.com
 *
 * Only the text before the first "@" is searched. The "=domain.tld" form is
 * tried first, then "_at_domain.tld"; the first hit wins.
 *
 * @param senderEmail Full sender address.
 * @returns The decoded domain in lowercase, or null when the address has no
 *          "@" or carries no encoded domain.
 */
export function extractRelayedDomain(senderEmail: string): string | null {
  const atIndex = senderEmail.indexOf("@");
  if (atIndex === -1) return null;
  const local = senderEmail.slice(0, atIndex);

  const relayPatterns = [
    // user=domain.tld (SendGrid, Mailchimp, SES bounces)
    /=([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[+_&]|$)/i,
    // _at_domain.tld (less common, some custom relay setups)
    /_at_([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[_+]|$)/i,
  ];

  for (const pattern of relayPatterns) {
    const decoded = pattern.exec(local)?.[1];
    if (decoded !== undefined) return decoded.toLowerCase();
  }
  return null;
}
// ─── Brand normalization ───────────────────────────────────────────────────────
/**
 * Reduces a string to a canonical form for brand comparisons: lowercase with
 * all whitespace, hyphens, dots and underscores removed.
 *
 * Examples: "BetandPlay" → "betandplay", "bet-and-play.com" → "betandplaycom".
 *
 * @param s Raw brand, domain or label.
 * @returns The normalized string (possibly empty).
 */
export function normalizeBrand(s: string): string {
  const pieces = s.toLowerCase().split(/[\s\-._]/);
  return pieces.join("");
}
/**
 * Tells whether an already-normalized string contains a known gambling brand.
 *
 * Inputs shorter than 4 characters are rejected up front to avoid false
 * positives from very short strings such as "bet".
 *
 * @param normalized String in the form produced by normalizeBrand().
 * @returns True when any entry of GAMBLING_BRANDS occurs in the input
 *          (an exact match is the special case of containment).
 */
export function matchesGamblingBrand(normalized: string): boolean {
  if (normalized.length < 4) {
    return false;
  }
  for (const knownBrand of GAMBLING_BRANDS) {
    if (normalized.includes(knownBrand)) {
      return true;
    }
  }
  return false;
}
/**
 * Derives the strings from a domain that are checked against the brand list.
 *
 * Two candidates are produced, both normalized: the first label of the domain
 * and the complete domain.
 * "betand-play.com" → ["betandplay", "betandplaycom"]
 *
 * @param domain Domain part of an address (may be empty).
 * @returns Exactly two normalized candidates: [first label, whole domain].
 */
function domainToBrandCandidates(domain: string): string[] {
  const firstDot = domain.indexOf(".");
  const firstLabel = firstDot === -1 ? domain : domain.slice(0, firstDot);
  return [firstLabel, domain].map((candidate) => normalizeBrand(candidate));
}
// ─── Random-token detection ───────────────────────────────────────────────────
/**
 * Detects random-looking tokens in the local part of an e-mail address.
 *
 * The local part is split on runs of "_", "-", "." and "+". A token counts as
 * random-looking when it has at least 6 characters and contains both a letter
 * and a digit. Plain function words (info, admin, noreply, support, …) never
 * qualify because they contain no digit, so no separate exclusion list is
 * needed.
 *
 * A local part with two or more such tokens is considered random-looking,
 * which is typical for bulk mailers that embed trackable user IDs.
 *
 * @param localPart Text before the "@" of the sender address.
 * @returns True when at least two tokens look random.
 */
export function hasRandomTokens(localPart: string): boolean {
  let randomLooking = 0;
  for (const token of localPart.split(/[_\-.+]+/)) {
    const mixesLettersAndDigits = /[a-z]/i.test(token) && /[0-9]/.test(token);
    if (token.length >= 6 && mixesLettersAndDigits) {
      randomLooking += 1;
      // Two hits settle the answer; no need to scan the remaining tokens.
      if (randomLooking >= 2) return true;
    }
  }
  return false;
}
// ─── Score computation (Layer 3) ──────────────────────────────────────────────
/** Return value of computeScore: the capped score plus the signals behind it. */
interface ScoreResult {
/** Sum of all triggered weights, capped at 100; 0 on a whitelist hit. */
score: number;
/** Gambling keyword that matched the subject (at most one entry). */
keywordHitsSubject: string[];
/** Gambling keyword that matched the sender domain (at most one entry). */
keywordHitsDomain: string[];
/** Always empty in this version; kept for a later display-name scoring. */
keywordHitsName: string[];
/** Style signals found: "money-pattern", "urgency", "all-caps", "short-random-domain". */
styleFlags: string[];
/** True when a whitelist term matched and scoring was skipped. */
whitelistHit: boolean;
}
/**
 * Computes the deterministic Layer-3 score of a mail from its sender address
 * and subject.
 *
 * Evaluation order:
 *  1. Whitelist: if any whitelist term occurs in the subject, the sender
 *     address or the display name, scoring is skipped (score 0, whitelistHit).
 *  2. Sender domain contains a gambling keyword → DOMAIN_GAMBLING_KEYWORD (once).
 *  3. Subject contains a gambling keyword → SUBJECT_GAMBLING_KEYWORD (once),
 *     unless the subject contains that keyword as a "<keyword>sucht" compound.
 *  4. Currency sign next to a digit in the subject → SUBJECT_MONEY_PATTERN.
 *  5. Urgency phrase in the subject → SUBJECT_URGENCY.
 *  6. ALL-CAPS word (4+ uppercase letters) in the subject → SUBJECT_ALL_CAPS_WORD.
 *  7. Domain root of at most 5 characters mixing letters and digits →
 *     DOMAIN_SHORT_RANDOM.
 *  8. Exactly one of brand match / random tokens → BRAND_MATCH_NO_RANDOM or
 *     RANDOM_TOKENS_NO_BRAND.
 *
 * The display name contributes to the whitelist check only; `keywordHitsName`
 * is always returned empty.
 *
 * @param senderEmail       Sender address; lowercased internally.
 * @param senderName        Display name, or null.
 * @param subject           Subject line in its original casing.
 * @param brandMatchFound   Whether Layer 2.5 found a gambling brand.
 * @param randomTokensFound Whether Layer 2.5 found random tokens.
 * @returns Score capped at 100 together with the signals that contributed.
 */
export function computeScore(
  senderEmail: string,
  senderName: string | null,
  subject: string,
  brandMatchFound: boolean,
  randomTokensFound: boolean,
): ScoreResult {
  let score = 0;
  const keywordHitsSubject: string[] = [];
  const keywordHitsDomain: string[] = [];
  const keywordHitsName: string[] = [];
  const styleFlags: string[] = [];

  const subjectLower = subject.toLowerCase();
  const senderEmailLower = senderEmail.toLowerCase();
  const senderNameLower = (senderName ?? "").toLowerCase();
  const domain = senderEmailLower.split("@")[1] ?? "";
  const domainRoot = domain.split(".")[0] ?? "";

  // ── Whitelist check (Layer 1) ──
  const whitelisted = (GAMBLING_WHITELIST as string[]).some(
    (w) => subjectLower.includes(w) || senderEmailLower.includes(w) || senderNameLower.includes(w),
  );
  if (whitelisted) {
    return {
      score: 0,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: true,
    };
  }

  // ── Domain keywords ── (the domain root is part of the domain, so one
  // containment check covers both; the first hit is enough)
  const domainKeyword = (GAMBLING_KEYWORDS as string[]).find((kw) => domain.includes(kw));
  if (domainKeyword !== undefined) {
    keywordHitsDomain.push(domainKeyword);
    score += SCORE_WEIGHTS.DOMAIN_GAMBLING_KEYWORD;
  }

  // ── Subject keywords ──
  // Linguistic invariant (German): compound nouns ending in "-sucht"
  // (Glücksspielsucht, Spielsucht, Wettsucht) signal a recovery / anti-gambling
  // context. A keyword is therefore skipped when the subject contains it as
  // "<keyword>sucht"; the first remaining keyword scores once.
  const subjectKeyword = (GAMBLING_KEYWORDS as string[]).find(
    (kw) => subjectLower.includes(kw) && !subjectLower.includes(`${kw}sucht`),
  );
  if (subjectKeyword !== undefined) {
    keywordHitsSubject.push(subjectKeyword);
    score += SCORE_WEIGHTS.SUBJECT_GAMBLING_KEYWORD;
  }

  // ── Sender-name keywords: removed in v1.0 (too prone to false positives).
  // keywordHitsName stays in the result for a later reactivation (always empty).

  // ── Money pattern in the subject (currency sign next to a digit) ──
  if (/[€$£]\s*\d|\d\s*[€$£]/.test(subject)) {
    styleFlags.push("money-pattern");
    score += SCORE_WEIGHTS.SUBJECT_MONEY_PATTERN;
  }

  // ── Urgency phrases in the subject ──
  const URGENCY_PATTERNS = [
    "nur heute", "letzte chance", "läuft ab", "ablaufdatum",
    "expires", "last chance", "limited time", "jetzt einlösen",
    "sofort", "nur noch", "endet heute",
  ];
  if (URGENCY_PATTERNS.some((p) => subjectLower.includes(p))) {
    styleFlags.push("urgency");
    score += SCORE_WEIGHTS.SUBJECT_URGENCY;
  }

  // ── ALL-CAPS word in the subject ──
  // Unicode-aware on purpose: with the ASCII-only \b[A-Z]{4,}\b, umlauts act as
  // word breaks, so "GLÜCK" or "PRÄMIE" went undetected while the mixed-case
  // "FüHRUNG" was flagged. For pure-ASCII subjects this matches exactly the
  // same words as the ASCII pattern.
  if (/(?<![\p{L}\p{N}_])\p{Lu}{4,}(?![\p{L}\p{N}_])/u.test(subject)) {
    styleFlags.push("all-caps");
    score += SCORE_WEIGHTS.SUBJECT_ALL_CAPS_WORD;
  }

  // ── Short random-looking domain root ──
  if (domainRoot.length > 0 && domainRoot.length <= 5 && /[a-z]/.test(domainRoot) && /[0-9]/.test(domainRoot)) {
    styleFlags.push("short-random-domain");
    score += SCORE_WEIGHTS.DOMAIN_SHORT_RANDOM;
  }

  // ── Layer 2.5 score contributions (only one of the two signals present) ──
  if (brandMatchFound && !randomTokensFound) {
    score += SCORE_WEIGHTS.BRAND_MATCH_NO_RANDOM;
  }
  if (!brandMatchFound && randomTokensFound) {
    score += SCORE_WEIGHTS.RANDOM_TOKENS_NO_BRAND;
  }

  return {
    score: Math.min(score, 100),
    keywordHitsSubject,
    keywordHitsDomain,
    keywordHitsName,
    styleFlags,
    whitelistHit: false,
  };
}
// ─── Main pipeline ────────────────────────────────────────────────────────────
/** Input of classifyMail: the mail plus the lists it is checked against. */
export interface ClassifyMailParams {
/** Header fields of the mail to classify. */
mail: MailInput;
/** Set of blocked domains (from getBlocklistedDomainsSet) */
blockedDomainSet: Set<string>;
/**
* Display-name patterns (globally curated plus optional user scope) from getMailDisplayNamePatterns().
* Layer 2.6: case-insensitive substring match against senderName.
* Omitted or empty when no patterns have been loaded; the layer is then skipped.
*
* GDPR: no PII, purely heuristic patterns (e.g. ["Tipico", "Bet365"]).
*/
customDisplayNames?: string[];
}
/**
 * Runs a single mail through every classification layer and returns the verdict.
 *
 * Fully deterministic: no external calls are made and no data leaves the
 * server. No database writes happen here; persisting the result
 * (MailBlocked, MailClassificationSample) is left to the caller.
 *
 * Layers, in order (the first one that matches decides):
 *   1    whitelist term in sender address, subject or display name → PASS
 *   2    sender domain in `blockedDomainSet`                        → BLOCK
 *   2    relay-decoded domain in `blockedDomainSet`                 → BLOCK
 *   2.5  gambling brand in a domain AND random tokens in local part → BLOCK
 *   2.6  display name contains one of `customDisplayNames`          → BLOCK
 *   3    score: < 25 → PASS ("no-signal"), 25–49 → PASS, ≥ 50 → BLOCK
 *
 * @param params Mail plus blocklist and display-name patterns.
 * @returns Verdict, trigger, score and the feature record for the mail.
 */
export async function classifyMail(params: ClassifyMailParams): Promise<ClassificationResult> {
  const { mail, blockedDomainSet, customDisplayNames } = params;
  const { senderEmail, senderName, subject } = mail;
  const senderEmailLower = senderEmail.toLowerCase();
  const [localPart = "", domain = ""] = senderEmailLower.split("@");

  // Feature record for every path that returns before scoring: all-clear
  // defaults, with only the differing flags supplied by the caller.
  const preScoreFeatures = (
    overrides: Partial<ClassificationFeatures>,
  ): ClassificationFeatures => ({
    score: 0,
    domainBlocked: false,
    relayDecoded: false,
    brandMatch: false,
    randomTokens: false,
    keywordHitsSubject: [],
    keywordHitsDomain: [],
    keywordHitsName: [],
    styleFlags: [],
    whitelistHit: false,
    ...overrides,
  });

  // Result of any hard-block layer: always "blocked" with score 100.
  const hardBlock = (
    triggerSource: TriggerSource,
    decodedDomain: string | null,
    overrides: Partial<ClassificationFeatures>,
  ): ClassificationResult => ({
    action: "blocked",
    triggerSource,
    score: 100,
    relayDecodedDomain: decodedDomain,
    features: preScoreFeatures({ score: 100, ...overrides }),
  });

  // ── Layer 1: whitelist ──────────────────────────────────────────────────────
  const haystack = `${senderEmailLower} ${subject} ${senderName ?? ""}`.toLowerCase();
  if ((GAMBLING_WHITELIST as string[]).some((term) => haystack.includes(term))) {
    return {
      action: "passed",
      triggerSource: "whitelist",
      score: 0,
      relayDecodedDomain: null,
      features: preScoreFeatures({ whitelistHit: true }),
    };
  }

  // ── Layer 2: domain hard block ──────────────────────────────────────────────
  if (domain && blockedDomainSet.has(domain)) {
    return hardBlock("domain", null, { domainBlocked: true });
  }

  // ── Layer 2: relay-decoded domain block ─────────────────────────────────────
  const relayDecodedDomain = extractRelayedDomain(senderEmailLower);
  if (relayDecodedDomain && blockedDomainSet.has(relayDecodedDomain)) {
    return hardBlock("relay-decoded", relayDecodedDomain, { relayDecoded: true });
  }
  const relayDecoded = !!relayDecodedDomain;

  // ── Layer 2.5: brand + random-token hard block ──────────────────────────────
  // Brand matching looks only at the sender domain and the relay-decoded
  // domain, never at the display name (removed in v1.0 as too prone to false
  // positives).
  const brandCandidates = [
    ...domainToBrandCandidates(domain),
    ...(relayDecodedDomain ? domainToBrandCandidates(relayDecodedDomain) : []),
  ];
  const brandMatch = brandCandidates.some(
    (candidate) => candidate.length >= 4 && matchesGamblingBrand(candidate),
  );
  const randomTokens = hasRandomTokens(localPart);
  if (brandMatch && randomTokens) {
    return hardBlock("brand+random", relayDecodedDomain, {
      relayDecoded,
      brandMatch: true,
      randomTokens: true,
    });
  }

  // ── Layer 2.6: display-name hard block (globally curated + user scope) ──────
  // Substring match (not exact) so that "Tipico Casino" and "TIPICO Bonus" are
  // both caught by the pattern "Tipico"; comparison is case-insensitive because
  // senders vary capitalization. Empty patterns are ignored.
  if (senderName && customDisplayNames && customDisplayNames.length > 0) {
    const senderNameLower = senderName.toLowerCase();
    const nameIsListed = customDisplayNames.some(
      (pattern) => pattern.length > 0 && senderNameLower.includes(pattern.toLowerCase()),
    );
    if (nameIsListed) {
      return hardBlock("custom-display-name", relayDecodedDomain, {
        relayDecoded,
        brandMatch,
        randomTokens,
      });
    }
  }

  // ── Layer 3: score ──────────────────────────────────────────────────────────
  const scoreResult = computeScore(senderEmailLower, senderName, subject, brandMatch, randomTokens);
  const scoredFeatures: ClassificationFeatures = {
    ...scoreResult,
    domainBlocked: false,
    relayDecoded,
    brandMatch,
    randomTokens,
  };

  // computeScore runs its own whitelist check; honour a hit reported there.
  if (scoreResult.whitelistHit) {
    return {
      action: "passed",
      triggerSource: "whitelist",
      score: 0,
      relayDecodedDomain,
      features: { ...scoredFeatures, score: 0 },
    };
  }

  const score = scoreResult.score;

  // Below the pass threshold the mail carries no usable signal.
  if (score < SCORE_PASS_BELOW) {
    return {
      action: "passed",
      triggerSource: "no-signal",
      score,
      relayDecodedDomain,
      features: scoredFeatures,
    };
  }

  // From here the trigger always carries the score: hard block at >= 80,
  // mid-range block at >= 50, pass for anything in between 25 and 49.
  const scoreTrigger: TriggerSource = `score:${score}`;
  const scoreAction: ClassificationAction =
    score >= SCORE_HARD_BLOCK_THRESHOLD || score >= SCORE_BLOCK_MIDRANGE ? "blocked" : "passed";
  return {
    action: scoreAction,
    triggerSource: scoreTrigger,
    score,
    relayDecodedDomain,
    features: scoredFeatures,
  };
}