rebreak-monorepo/backend/server/utils/mail-classifier.ts
chahinebrini c1250836a3 fix(backend): remove display-name pattern support for v1.0
User explicitly chose to drop display-name matching from v1.0 after
the UX trap surfaced — a user typing "EXTRASPIN" without a domain got
a 400 INVALID_DOMAIN back, which is a confusing dead-end. v1.1 will
ship a dedicated display-name UI; until then mail input is domain-only.

- resolveTypeAndValue returns a discriminated union — kind='mail' with
  no dot or @ now resolves to { ok: false, error: 'INVALID_MAIL_DOMAIN' }
  instead of silently turning into a mail_display_name row.
- Full-address mail input (local@domain.tld) still gets its local-part
  stripped server-side so the stored value is always a clean domain.
- Variant-B body { type: 'mail_display_name' } returns 400
  DISPLAY_NAME_NOT_SUPPORTED for direct API consumers.
- The DISPLAY_NAME_PATTERN regex is gone — the path that used it can
  no longer be reached.
- classifyMail's Layer 2.6 (the display-name substring match) is
  intentionally left in place as dead code with a v1.1 marker, so
  re-enabling later is just wiring the input field back up and feeding
  the customDisplayNames array.
- Tests rewritten: the two pre-existing display-name tests now assert
  the 400 INVALID_MAIL_DOMAIN path, plus a new positive case for the
  full-address local-part strip. 217 vitest passes, 4 pre-existing skips.

Staging DB clean — the type column hasn't been deployed yet so no
mail_display_name rows exist to backfill.
2026-05-16 02:17:50 +02:00

567 lines
20 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Mail-Klassifikations-Pipeline (deterministisch, ohne LLM).
*
* Architektur:
* Layer 0 — Skip-Guard (bereits geblockt / kein Consent)
* Layer 1 — Whitelist (wetter, wettkampf …) → PASS
* Layer 2 — Domain-Hard-Block (Blocklist)
* Layer 2.5 — Brand+Random-Token-Detection (Hard-Block, fängt Apple Hide-My-Email)
* Layer 3 — Score 0–100 (deterministisch); ≥50 → BLOCK, sonst PASS
* Layer 5 — MailClassificationSample-Insert (immer, außer Layer 0)
*
* Alle Layer-Logiken sind pure Funktionen → vollständig unit-testbar ohne DB-Mocks.
*
* DSGVO-Hinweise:
* - Mail-Inhalte (Body) werden nie persistiert (Art. 9).
* - Keine Daten verlassen mehr den Server (kein LLM-Drittland-Transfer).
* - userId in Logs nur wenn absolut nötig (Datenminimierung Art. 5).
* - MailClassificationSample: Cascade-Delete via userId-Relation (Art. 17).
*/
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore — .mjs ohne types, Exports sind string[]
import { GAMBLING_KEYWORDS, GAMBLING_WHITELIST } from "./gambling-keywords.mjs";
// ─── Typen ─────────────────────────────────────────────────────────────────────
/** Final verdict of the classification pipeline for a single mail. */
export type ClassificationAction = "blocked" | "passed";
/**
 * Identifies which pipeline stage produced the verdict (as set by classifyMail):
 *  - "domain"              — sender domain is contained in the blocked-domain set
 *  - "relay-decoded"       — a domain decoded from a relay address is in the blocked-domain set
 *  - "brand+random"        — gambling-brand match combined with random-looking local-part tokens
 *  - "custom-display-name" — a user-defined display-name pattern matched the sender name
 *  - `score:<n>`           — decided by the numeric score; <n> is the computed score
 *  - "whitelist"           — a whitelist term was found, mail passes
 *  - "no-signal"           — score stayed below the pass threshold
 */
export type TriggerSource =
| "domain"
| "relay-decoded"
| "brand+random"
| "custom-display-name"
| `score:${number}`
| "whitelist"
| "no-signal";
/** Header fields of a single mail that the classifier inspects. */
export interface MailInput {
/** Sender e-mail address (lowercase, as delivered by IMAP) */
senderEmail: string;
/** Display name of the sender (may be empty) */
senderName: string | null;
/** Subject line */
subject: string;
}
/** Result returned by classifyMail for one mail. */
export interface ClassificationResult {
/** Whether the mail was blocked or passed. */
action: ClassificationAction;
/** Pipeline stage that produced the verdict. */
triggerSource: TriggerSource;
/** Score attached to the verdict: 100 for hard blocks, 0 for whitelist passes, otherwise the computed score. */
score: number;
/** Real domain extracted from a relay address (e.g. gamblezen.com), or null if none was found */
relayDecodedDomain: string | null;
/** Score components for MailClassificationSample.features */
features: ClassificationFeatures;
}
/** Individual signals recorded alongside a classification verdict. */
export interface ClassificationFeatures {
/** Score attached to the verdict (same value as ClassificationResult.score). */
score: number;
/** True only when the sender domain itself was found in the blocked-domain set. */
domainBlocked: boolean;
/** Whether a relay-embedded domain was decoded from the sender address (false for the early whitelist and direct domain-block results). */
relayDecoded: boolean;
/** Whether the sender name, sender domain or relay-decoded domain matched a known gambling brand. */
brandMatch: boolean;
/** Whether the local part of the sender address contained random-looking tokens. */
randomTokens: boolean;
/** Gambling keyword found in the subject (computeScore records at most one). */
keywordHitsSubject: string[];
/** Gambling keyword found in the sender domain (computeScore records at most one). */
keywordHitsDomain: string[];
/** Gambling keyword found in the sender display name (computeScore records at most one). */
keywordHitsName: string[];
/** Style signals set by computeScore: "money-pattern", "urgency", "all-caps", "short-random-domain". */
styleFlags: string[];
/** True when a whitelist term caused the mail to pass. */
whitelistHit: boolean;
}
// ─── Score-Weights (TS-Constants, kein Config-File-Overhead) ──────────────────
/** Points added to the score by computeScore for each detected signal. */
export const SCORE_WEIGHTS = {
// Domain indicators
DOMAIN_GAMBLING_KEYWORD: 40, // Domain contains a gambling term (bet, casino, slots …)
DOMAIN_SHORT_RANDOM: 15, // Domain root shorter than 6 characters that mixes letters and digits (e.g. 1win)
// Subject indicators
SUBJECT_GAMBLING_KEYWORD: 35, // Keyword in the subject (casino, jackpot, freispiel …)
SUBJECT_MONEY_PATTERN: 20, // Currency symbol (€, $, £) next to a digit (e.g. "100€ Bonus")
SUBJECT_URGENCY: 15, // Urgency phrase such as "Nur heute", "Letzte Chance", "Ablaufdatum"
SUBJECT_ALL_CAPS_WORD: 5, // An ALL-CAPS word (4+ letters) in the subject
// Display-name indicators
SENDER_NAME_GAMBLING_KEYWORD: 30, // Gambling term in the sender display name
SENDER_NAME_BRAND_MATCH: 20, // Name matches a known gambling brand (normalized) — NOTE(review): not referenced by the scoring code visible in this file; confirm whether it is still needed
// Layer 2.5 score additions (applied when no hard block was triggered)
BRAND_MATCH_NO_RANDOM: 35, // Brand match without random tokens (no hard block)
RANDOM_TOKENS_NO_BRAND: 10, // Random tokens without a brand match
} as const;
// Hard-block threshold: score >= 80 → BLOCK
const SCORE_HARD_BLOCK_THRESHOLD = 80;
// Pass-below threshold: score < 25 → PASS (reported as "no-signal")
const SCORE_PASS_BELOW = 25;
// Mid-range block threshold: for scores in [25, 80), BLOCK from 50 upwards, otherwise PASS
const SCORE_BLOCK_MIDRANGE = 50;
// ─── Bekannte Gambling-Brands (für Brand-Match-Normalisierung) ─────────────────
// Derived from GAMBLING_KEYWORDS plus typical blocklist domains.
// Normalization rule: lowercase, with all special characters and whitespace removed.
// matchesGamblingBrand treats every entry as a substring to look for.
const GAMBLING_BRANDS: string[] = [
"casino", "bet365", "bwin", "tipico", "unibet", "betway", "888casino",
"pokerstars", "interwetten", "netbet", "leovegas", "mrgreen",
"betsson", "neobet", "mybet", "lottoland", "betano", "williamhill",
"paddypower", "betfair", "stake", "rolletto", "vbet", "1xbet", "melbet",
"mostbet", "luckyvibe", "spinz", "casinoly", "rabona",
"justcasino", "getslots", "rocketplay", "freshcasino",
"nomnomcasino", "gamblezen", "betandplay",
];
// ─── Relay-Decoder ─────────────────────────────────────────────────────────────
/**
 * Recovers the real target domain hidden in the local part of a relay / bounce address.
 *
 * Known shapes (the local part is everything before the first "@"):
 *   bounces+user=example.com@sendgrid.net    → example.com
 *   track.user=gamblezen.com@mailchimp.com   → gamblezen.com
 *   a1b2c3_user_at_betandplay.com@em.em.xyz  → betandplay.com
 *   user=betandplay.com@bounce.em.example    → betandplay.com
 *
 * The `=domain.tld` form is tried first, then `_at_domain.tld`; the first hit wins.
 *
 * @param senderEmail Full sender address.
 * @returns The embedded domain in lowercase, or `null` when the address has no "@"
 *          or its local part contains neither pattern.
 */
export function extractRelayedDomain(senderEmail: string): string | null {
  const atIndex = senderEmail.indexOf("@");
  if (atIndex === -1) return null;
  const beforeAt = senderEmail.slice(0, atIndex);

  // Ordered by priority; capture group 1 is always the embedded domain.
  const relayPatterns = [
    // user=domain.tld (SendGrid, Mailchimp, SES bounces)
    /=([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[+_&]|$)/i,
    // _at_domain.tld (less common, some custom relay setups)
    /_at_([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[_+]|$)/i,
  ];

  for (const pattern of relayPatterns) {
    const found = pattern.exec(beforeAt);
    if (found) return found[1].toLowerCase();
  }
  return null;
}
// ─── Brand-Normalisierung ──────────────────────────────────────────────────────
/**
 * Canonicalizes a string for brand comparisons: lowercases it and drops every
 * whitespace character, hyphen, dot and underscore.
 *
 * "BetandPlay" → "betandplay". Because dots are removed too, a full domain keeps
 * its TLD letters ("bet-and-play.com" → "betandplaycom"); pass only the domain
 * root to obtain "betandplay".
 *
 * @param s Raw display name, domain or domain root.
 * @returns The normalized form (may be empty).
 */
export function normalizeBrand(s: string): string {
  const lowered = s.toLowerCase();
  const withoutSeparators = lowered.replace(/[\s\-._]/g, "");
  return withoutSeparators;
}
/**
 * Tells whether an already-normalized string contains one of the known gambling brands.
 *
 * Inputs shorter than 4 characters are rejected up front to avoid false positives
 * ("bet" alone is too short).
 *
 * @param normalized String that has been passed through normalizeBrand.
 * @returns `true` if any entry of GAMBLING_BRANDS occurs in `normalized`.
 */
export function matchesGamblingBrand(normalized: string): boolean {
  if (normalized.length >= 4) {
    for (const brand of GAMBLING_BRANDS) {
      // A substring test also covers the exact-equality case.
      if (normalized.includes(brand)) return true;
    }
  }
  return false;
}
/**
 * Derives the brand candidates of a domain for the brand-match check.
 *
 * Returns two normalized strings: the first label of the domain and the whole domain.
 * "betand-play.com" → ["betandplay", "betandplaycom"].
 *
 * @param domain Sender or relay-decoded domain (may be empty).
 * @returns `[normalized first label, normalized full domain]`.
 */
function domainToBrandCandidates(domain: string): string[] {
  const [firstLabel = ""] = domain.split(".");
  return [firstLabel, domain].map((part) => normalizeBrand(part));
}
// ─── Random-Token-Detection ───────────────────────────────────────────────────
/**
 * Detects random-looking tokens in the local part of an e-mail address.
 *
 * The local part is split on runs of `_`, `-`, `.` and `+`. A token counts as
 * "random" when it is at least 6 characters long, contains both a letter and a
 * digit, and is not a known function word (info, admin, noreply, support …).
 *
 * A local part with two or more such tokens is considered random-looking —
 * typical for bulk mailers that embed trackable user IDs.
 *
 * NOTE(review): every listed function word is letters-only, while a candidate
 * token must contain a digit, so the function-word filter currently cannot
 * exclude anything; it is kept as a safeguard.
 *
 * @param localPart Part of the address before the "@".
 * @returns `true` when at least two random-looking tokens are present.
 */
export function hasRandomTokens(localPart: string): boolean {
  const FUNCTION_WORDS = new Set([
    "info", "admin", "noreply", "no-reply", "support", "hello",
    "news", "marketing", "sales", "contact", "newsletter", "service",
    "offers", "promotions", "promo", "team", "mail", "email",
    "reply", "bounce", "return", "postmaster", "mailer",
  ]);

  let randomCount = 0;
  for (const token of localPart.split(/[_\-.+]+/)) {
    const longEnough = token.length >= 6;
    // Must mix letters and digits.
    const mixesLettersAndDigits = /[a-z]/i.test(token) && /[0-9]/.test(token);
    if (longEnough && mixesLettersAndDigits && !FUNCTION_WORDS.has(token.toLowerCase())) {
      randomCount += 1;
    }
  }
  return randomCount >= 2;
}
// ─── Score-Berechnung (Layer 3) ───────────────────────────────────────────────
/** Outcome of computeScore: the capped score plus the signals that contributed to it. */
interface ScoreResult {
/** Sum of the triggered weights, capped at 100 (0 on a whitelist hit). */
score: number;
/** First gambling keyword found in the subject, if any. */
keywordHitsSubject: string[];
/** First gambling keyword found in the sender domain, if any. */
keywordHitsDomain: string[];
/** First gambling keyword found in the sender display name, if any. */
keywordHitsName: string[];
/** Style signals that fired: "money-pattern", "urgency", "all-caps", "short-random-domain". */
styleFlags: string[];
/** True when a whitelist term short-circuited the scoring. */
whitelistHit: boolean;
}
/**
 * Layer 3: computes the deterministic gambling score (0–100) for one mail.
 *
 * A whitelist term anywhere in the subject, sender address or sender name
 * short-circuits with score 0 and `whitelistHit: true`. Otherwise the weights
 * from SCORE_WEIGHTS are summed for every signal that fires and the total is
 * capped at 100. Each keyword category (domain, subject, sender name) counts
 * at most once and records only the first matching keyword in list order.
 *
 * @param senderEmail Sender address; lowercased internally.
 * @param senderName Sender display name, or null.
 * @param subject Subject line; the money and ALL-CAPS checks use its original casing.
 * @param brandMatchFound Whether Layer 2.5 found a gambling-brand match.
 * @param randomTokensFound Whether Layer 2.5 found random-looking local-part tokens.
 * @returns The capped score together with the contributing keyword hits and style flags.
 */
export function computeScore(
  senderEmail: string,
  senderName: string | null,
  subject: string,
  brandMatchFound: boolean,
  randomTokensFound: boolean,
): ScoreResult {
  const whitelistTerms = GAMBLING_WHITELIST as string[];
  const gamblingTerms = GAMBLING_KEYWORDS as string[];

  const loweredSubject = subject.toLowerCase();
  const loweredEmail = senderEmail.toLowerCase();
  const loweredName = (senderName ?? "").toLowerCase();
  const senderDomain = loweredEmail.split("@")[1] ?? "";
  const firstLabel = senderDomain.split(".")[0] ?? "";

  // ── Whitelist check (Layer 1) ──
  const whitelisted = whitelistTerms.some(
    (term) =>
      loweredSubject.includes(term) || loweredEmail.includes(term) || loweredName.includes(term),
  );
  if (whitelisted) {
    return {
      score: 0,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: true,
    };
  }

  let total = 0;
  const styleFlags: string[] = [];

  // ── Keyword hits: only the first match per category counts ──
  const domainHit = gamblingTerms.find(
    (kw) => senderDomain.includes(kw) || firstLabel.includes(kw),
  );
  const subjectHit = gamblingTerms.find((kw) => loweredSubject.includes(kw));
  const nameHit = gamblingTerms.find((kw) => loweredName.includes(kw));

  if (domainHit !== undefined) total += SCORE_WEIGHTS.DOMAIN_GAMBLING_KEYWORD;
  if (subjectHit !== undefined) total += SCORE_WEIGHTS.SUBJECT_GAMBLING_KEYWORD;
  if (nameHit !== undefined) total += SCORE_WEIGHTS.SENDER_NAME_GAMBLING_KEYWORD;

  // ── Money pattern in the subject (currency symbol next to a digit) ──
  const mentionsMoney = /[€$£]\s*\d|\d\s*[€$£]/.test(subject);
  if (mentionsMoney) {
    styleFlags.push("money-pattern");
    total += SCORE_WEIGHTS.SUBJECT_MONEY_PATTERN;
  }

  // ── Urgency phrases in the subject ──
  const URGENCY_PATTERNS = [
    "nur heute", "letzte chance", "läuft ab", "ablaufdatum",
    "expires", "last chance", "limited time", "jetzt einlösen",
    "sofort", "nur noch", "endet heute",
  ];
  const soundsUrgent = URGENCY_PATTERNS.some((phrase) => loweredSubject.includes(phrase));
  if (soundsUrgent) {
    styleFlags.push("urgency");
    total += SCORE_WEIGHTS.SUBJECT_URGENCY;
  }

  // ── ALL-CAPS word (4+ ASCII capitals) in the subject ──
  const shouts = /\b[A-Z]{4,}\b/.test(subject);
  if (shouts) {
    styleFlags.push("all-caps");
    total += SCORE_WEIGHTS.SUBJECT_ALL_CAPS_WORD;
  }

  // ── Short domain root that mixes letters and digits ──
  const shortRandomDomain =
    firstLabel.length > 0 &&
    firstLabel.length <= 5 &&
    /[a-z]/.test(firstLabel) &&
    /[0-9]/.test(firstLabel);
  if (shortRandomDomain) {
    styleFlags.push("short-random-domain");
    total += SCORE_WEIGHTS.DOMAIN_SHORT_RANDOM;
  }

  // ── Layer 2.5 additions: exactly one of the two signals fired (both together is a hard block upstream) ──
  if (brandMatchFound !== randomTokensFound) {
    total += brandMatchFound
      ? SCORE_WEIGHTS.BRAND_MATCH_NO_RANDOM
      : SCORE_WEIGHTS.RANDOM_TOKENS_NO_BRAND;
  }

  return {
    score: Math.min(total, 100),
    keywordHitsSubject: subjectHit === undefined ? [] : [subjectHit],
    keywordHitsDomain: domainHit === undefined ? [] : [domainHit],
    keywordHitsName: nameHit === undefined ? [] : [nameHit],
    styleFlags,
    whitelistHit: false,
  };
}
// ─── Haupt-Pipeline ───────────────────────────────────────────────────────────
/** Input bundle for classifyMail. */
export interface ClassifyMailParams {
/** Header fields of the mail to classify. */
mail: MailInput;
/** Set of blocked domains (from getBlocklistedDomainsSet) */
blockedDomainSet: Set<string>;
/**
 * User-specific display-name patterns (from getCustomMailDisplayNames).
 * Layer 2.6: case-insensitive substring match against senderName.
 * Empty array when the user has not configured any display-name patterns.
 *
 * GDPR: no PII — purely heuristic patterns (e.g. ["EXTRASPIN"]).
 */
customDisplayNames?: string[];
}
/**
 * Runs a single mail through every classification layer and returns the verdict.
 *
 * Order of evaluation (first decisive layer wins):
 *   Layer 1   — whitelist term in address, subject or sender name → passed
 *   Layer 2   — sender domain, then relay-decoded domain, in the blocked set → blocked
 *   Layer 2.5 — gambling-brand match plus random local-part tokens → blocked
 *   Layer 2.6 — user-defined display-name pattern matches the sender name → blocked
 *   Layer 3   — numeric score: >= 80 blocked, < 25 passed ("no-signal"),
 *               otherwise blocked from 50 upwards and passed below
 *
 * Fully deterministic — no external calls, no PII leaves the server. The function
 * is declared `async` but awaits nothing. DB writes (MailBlocked,
 * MailClassificationSample) are the caller's responsibility.
 *
 * @param params Mail headers, blocked-domain set and optional display-name patterns.
 * @returns Verdict, trigger source, score and the recorded feature set.
 */
export async function classifyMail(params: ClassifyMailParams): Promise<ClassificationResult> {
  const { mail, blockedDomainSet, customDisplayNames } = params;
  const { senderEmail, senderName, subject } = mail;
  const emailLower = senderEmail.toLowerCase();
  const [localPart = "", domain = ""] = emailLower.split("@");

  // Result builder for layers that decide without keyword scoring:
  // keyword and style lists stay empty, unspecified flags default to false.
  const decideWithoutScoring = (
    action: ClassificationAction,
    triggerSource: TriggerSource,
    fixedScore: number,
    relayDomain: string | null,
    flags: Partial<
      Pick<
        ClassificationFeatures,
        "domainBlocked" | "relayDecoded" | "brandMatch" | "randomTokens" | "whitelistHit"
      >
    >,
  ): ClassificationResult => ({
    action,
    triggerSource,
    score: fixedScore,
    relayDecodedDomain: relayDomain,
    features: {
      score: fixedScore,
      domainBlocked: flags.domainBlocked ?? false,
      relayDecoded: flags.relayDecoded ?? false,
      brandMatch: flags.brandMatch ?? false,
      randomTokens: flags.randomTokens ?? false,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: flags.whitelistHit ?? false,
    },
  });

  // ── Layer 1: whitelist ──────────────────────────────────────────────────────
  const haystack = `${emailLower} ${subject} ${senderName ?? ""}`.toLowerCase();
  const whitelisted = (GAMBLING_WHITELIST as string[]).some((term) => haystack.includes(term));
  if (whitelisted) {
    return decideWithoutScoring("passed", "whitelist", 0, null, { whitelistHit: true });
  }

  // ── Layer 2: domain hard block ──────────────────────────────────────────────
  if (domain && blockedDomainSet.has(domain)) {
    return decideWithoutScoring("blocked", "domain", 100, null, { domainBlocked: true });
  }

  // ── Layer 2: relay-decoded domain block ─────────────────────────────────────
  const relayDecodedDomain = extractRelayedDomain(emailLower);
  if (relayDecodedDomain && blockedDomainSet.has(relayDecodedDomain)) {
    return decideWithoutScoring("blocked", "relay-decoded", 100, relayDecodedDomain, {
      relayDecoded: true,
    });
  }
  const relayDecoded = !!relayDecodedDomain;

  // ── Layer 2.5: brand + random-token hard block ──────────────────────────────
  // Candidates: normalized sender name, sender domain, and relay-decoded domain (if any).
  const brandCandidates = [
    normalizeBrand(senderName ?? ""),
    ...domainToBrandCandidates(domain),
    ...(relayDecodedDomain ? domainToBrandCandidates(relayDecodedDomain) : []),
  ];
  const brandMatch = brandCandidates.some(
    (candidate) => candidate.length >= 4 && matchesGamblingBrand(candidate),
  );
  const randomTokens = hasRandomTokens(localPart);
  if (brandMatch && randomTokens) {
    return decideWithoutScoring("blocked", "brand+random", 100, relayDecodedDomain, {
      relayDecoded,
      brandMatch: true,
      randomTokens: true,
    });
  }

  // ── Layer 2.6: user-defined display-name hard block ─────────────────────────
  // Display-name patterns are disabled in v1.0 — re-enable when the display-name
  // input UX ships (v1.1). getCustomMailDisplayNames() returns [] until
  // mail_display_name rows exist, so this branch is dead code in practice; the
  // logic is kept intact for trivial re-activation.
  //
  // User patterns (e.g. "EXTRASPIN") match case-insensitively as a substring of
  // the sender display name, so "EXTRASPIN Casino" and "ExtraSpin Bonus" are both
  // caught. No scoring — a match is a direct hard block. Gambling brands actively
  // rotate capitalization, hence case-insensitivity is mandatory.
  if (customDisplayNames && customDisplayNames.length > 0 && senderName) {
    const nameLower = senderName.toLowerCase();
    const patternMatched = customDisplayNames.some(
      (pattern) => pattern.length > 0 && nameLower.includes(pattern.toLowerCase()),
    );
    if (patternMatched) {
      return decideWithoutScoring("blocked", "custom-display-name", 100, relayDecodedDomain, {
        relayDecoded,
        brandMatch,
        randomTokens,
      });
    }
  }

  // ── Layer 3: score ──────────────────────────────────────────────────────────
  const scoreResult = computeScore(emailLower, senderName, subject, brandMatch, randomTokens);

  // Result builder for score-based verdicts: carries the keyword hits and style flags along.
  const decideFromScore = (
    action: ClassificationAction,
    triggerSource: TriggerSource,
    finalScore: number,
  ): ClassificationResult => ({
    action,
    triggerSource,
    score: finalScore,
    relayDecodedDomain,
    features: {
      ...scoreResult,
      score: finalScore,
      domainBlocked: false,
      relayDecoded,
      brandMatch,
      randomTokens,
    },
  });

  if (scoreResult.whitelistHit) {
    return decideFromScore("passed", "whitelist", 0);
  }

  const score = scoreResult.score;
  const scoreTrigger: TriggerSource = `score:${score}`;

  // Score >= 80 → hard block
  if (score >= SCORE_HARD_BLOCK_THRESHOLD) {
    return decideFromScore("blocked", scoreTrigger, score);
  }
  // Score < 25 → pass
  if (score < SCORE_PASS_BELOW) {
    return decideFromScore("passed", "no-signal", score);
  }
  // Score 25–79 → pass below 50, block from 50 upwards (deterministic)
  const midAction: ClassificationAction = score >= SCORE_BLOCK_MIDRANGE ? "blocked" : "passed";
  return decideFromScore(midAction, scoreTrigger, score);
}