diff --git a/backend/prisma/migrations/20260514_add_mail_blocked_trigger_source/migration.sql b/backend/prisma/migrations/20260514_add_mail_blocked_trigger_source/migration.sql
new file mode 100644
index 0000000..c6c728d
--- /dev/null
+++ b/backend/prisma/migrations/20260514_add_mail_blocked_trigger_source/migration.sql
@@ -0,0 +1,19 @@
+-- Migration: add_mail_blocked_trigger_source
+-- Adds trigger_source to mail_blocked — records which classification layer
+-- triggered the block (layer 2 "domain", layer 2.5 "brand+random",
+-- layer 3 "score:NN", layer 4 "llm:0.XX").
+--
+-- Breaking-change status: NONE.
+-- The column is nullable — all existing rows get NULL (= "unknown", pre-migration).
+-- No backfill needed: historical rows without trigger_source stay NULL.
+--
+-- Deploy: automatic via GitHub Actions (pnpm prisma migrate deploy)
+
+ALTER TABLE "rebreak"."mail_blocked"
+  ADD COLUMN "trigger_source" VARCHAR(64);
+
+-- Index for later analyses (e.g. "how many LLM blocks vs. domain blocks per user?")
+-- NOTE(review): this diff adds no matching @@index to model MailBlocked — confirm Prisma Migrate does not report the index as drift.
+CREATE INDEX "mail_blocked_trigger_source_idx"
+  ON "rebreak"."mail_blocked" ("trigger_source")
+  WHERE "trigger_source" IS NOT NULL;
diff --git a/backend/prisma/migrations/20260514_add_mail_classification_sample/migration.sql b/backend/prisma/migrations/20260514_add_mail_classification_sample/migration.sql
new file mode 100644
index 0000000..28800cc
--- /dev/null
+++ b/backend/prisma/migrations/20260514_add_mail_classification_sample/migration.sql
@@ -0,0 +1,81 @@
+-- Migration: add_mail_classification_sample
+-- New table for ML phase 3: one classification sample per analysed mail.
+-- Stores features + outcomes for future fine-tuning and model evaluation.
+--
+-- GDPR compliance:
+--   - NO mail body (data minimisation, Art. 5(1)(c) GDPR).
+--   - subject + sender_name: short-lived detection signals, no narrative
+--     content. In mail_blocked they are purged by the 24h mail flush; samples
+--     are kept longer (research purpose).
+--     NOTE(review): nothing in this migration purges or reduces subject /
+--     sender_name on samples — confirm the intended retention policy.
+--   - Erasure on account deletion (Art. 17): user_id holds profiles.id as a
+--     detached UUID. This migration creates NO foreign key, so the database
+--     does not cascade deletes. The account-deletion routine has to delete
+--     the user's samples explicitly (deleteAllMailConnections equivalent).
+--     TODO: verify that the account-deletion flow removes samples.
+--
+-- Breaking-change status: NONE.
+-- New table, no impact on existing queries.
+--
+-- Deploy: automatic via GitHub Actions (pnpm prisma migrate deploy)
+
+CREATE TABLE "rebreak"."mail_classification_samples" (
+  "id" TEXT NOT NULL,
+  "user_id" UUID NOT NULL,
+  "connection_id" UUID,
+
+  -- Raw features (detection signals, no content-level PII)
+  "sender_name" VARCHAR(255),
+  "sender_domain" VARCHAR(255),
+  "relay_decoded_domain" VARCHAR(255),
+  "subject" VARCHAR(998), -- 998 = RFC 5322 max line length; writers truncate to fit
+
+  -- Computed features (score components as JSON)
+  "features" JSONB NOT NULL DEFAULT '{}',
+
+  -- Outcome
+  "final_action" TEXT NOT NULL, -- "blocked" | "passed"
+  "trigger_source" TEXT NOT NULL, -- "domain" | "relay-decoded" | "brand+random" | "score:NN" | "llm:0.XX" | "whitelist" | "no-signal"
+
+  -- Groq verdict (NULL when layer 4 did not run)
+  "groq_is_gambling" BOOLEAN,
+  "groq_confidence" DOUBLE PRECISION,
+  "groq_reason" TEXT,
+
+  -- User feedback for later active learning (initially NULL)
+  "user_feedback" TEXT, -- NULL | "correct" | "false-positive" | "false-negative"
+  "feedback_at" TIMESTAMPTZ,
+
+  "created_at" TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+
+  CONSTRAINT "mail_classification_samples_pkey" PRIMARY KEY ("id")
+);
+
+-- Core indexes. Names follow Prisma's default <table>_<columns>_idx convention
+-- so they match the @@index declarations in schema.prisma.
+CREATE INDEX "mail_classification_samples_user_id_idx"
+  ON "rebreak"."mail_classification_samples" ("user_id");
+
+CREATE INDEX "mail_classification_samples_created_at_idx"
+  ON "rebreak"."mail_classification_samples" ("created_at");
+
+-- Compound index for later analysis queries,
+-- e.g. "all false positives of the last 30 days" or "LLM block rate".
+CREATE INDEX "mail_classification_samples_final_action_trigger_source_idx"
+  ON "rebreak"."mail_classification_samples" ("final_action", "trigger_source");
+
+-- CHECK constraints for data quality
+ALTER TABLE "rebreak"."mail_classification_samples"
+  ADD CONSTRAINT "mail_classification_samples_action_check"
+  CHECK ("final_action" IN ('blocked', 'passed'));
+
+ALTER TABLE "rebreak"."mail_classification_samples"
+  ADD CONSTRAINT "mail_classification_samples_user_feedback_check"
+  CHECK (
+    "user_feedback" IS NULL
+    OR "user_feedback" IN ('correct', 'false-positive', 'false-negative')
+  );
+
+-- Note: connection_id has no FOREIGN KEY to mail_connections because the connection
+-- can be deleted before the sample (e.g. user disconnect). Nullable + orphan-safe.
diff --git a/backend/prisma/schema.prisma b/backend/prisma/schema.prisma
index 84090bf..841a922 100644
--- a/backend/prisma/schema.prisma
+++ b/backend/prisma/schema.prisma
@@ -638,6 +638,9 @@ model MailBlocked {
   subject String
   receivedAt DateTime @map("received_at")
   action String
+  /// Which layer triggered the block (e.g. "domain", "brand+random", "score:85", "llm:0.92").
+  /// NULL for older rows (before migration 20260514).
+  triggerSource String? @map("trigger_source") @db.VarChar(64)
   createdAt DateTime @default(now()) @map("created_at")
 
   connection MailConnection @relation(fields: [connectionId], references: [id], onDelete: Cascade)
@@ -647,6 +650,46 @@ model MailBlocked {
   @@schema("rebreak")
 }
 
+/// Classification samples for ML phase 3 (future fine-tuning / model evaluation).
+/// Holds the features + outcome of every mail classification.
+/// NO mail body — metadata only (sender domain, subject, score components).
+/// No relation on userId: account deletion must delete samples explicitly (Art. 17 GDPR).
+model MailClassificationSample {
+  id           String  @id @default(cuid())
+  userId       String  @map("user_id") @db.Uuid
+  connectionId String? @map("connection_id") @db.Uuid
+
+  // Raw features (what was analysed):
+  senderName         String? @map("sender_name") @db.VarChar(255)
+  senderDomain       String? @map("sender_domain") @db.VarChar(255)
+  relayDecodedDomain String? @map("relay_decoded_domain") @db.VarChar(255)
+  subject            String? @db.VarChar(998) // RFC 5322 max line length
+
+  // Computed features (score components as JSON); default mirrors the migration's DEFAULT '{}':
+  features Json @default("{}") // { score, brandMatch, randomTokens, keywordHits, styleFlags, … }
+
+  // Outcome:
+  finalAction   String @map("final_action") // "blocked" | "passed"
+  triggerSource String @map("trigger_source") // "domain", "relay-decoded", "brand+random", "score:NN", "llm:0.XX", "whitelist", "no-signal"
+
+  // Groq verdict (only when layer 4 ran):
+  groqIsGambling Boolean? @map("groq_is_gambling")
+  groqConfidence Float?   @map("groq_confidence")
+  groqReason     String?  @map("groq_reason") @db.Text
+
+  // User feedback (for later); timestamps are TIMESTAMPTZ in the migration:
+  userFeedback String?   @map("user_feedback") // null | "correct" | "false-positive" | "false-negative"
+  feedbackAt   DateTime? @map("feedback_at") @db.Timestamptz(6)
+
+  createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(6)
+
+  @@index([userId])
+  @@index([createdAt])
+  @@index([finalAction, triggerSource])
+  @@map("mail_classification_samples")
+  @@schema("rebreak")
+}
+
 /// Permanente Aggregat-Statistiken blockierter Mails pro Tag + Connection.
 /// Befüllt live beim Scan (vor dem 24h-Cleanup von mail_blocked).
 /// Enthält KEINE Mail-Inhalte — nur counts/dates (Datenminimierung Art. 5 DSGVO).
diff --git a/backend/server/api/mail/scan-internal.post.ts b/backend/server/api/mail/scan-internal.post.ts index afacc6e..57fdea5 100644 --- a/backend/server/api/mail/scan-internal.post.ts +++ b/backend/server/api/mail/scan-internal.post.ts @@ -6,21 +6,22 @@ import { insertMailBlocked, upsertMailBlockedStat, updateMailConnectionScanStats, + insertMailClassificationSample, } from "../../db/mail"; import { getBlocklistedDomainsSet } from "../../db/domains"; import { getProfile } from "../../db/profile"; import { getPlanLimits } from "../../utils/plan-features"; import { resolveProviderMeta } from "../../utils/imap-providers"; import { resolveImapAuth } from "../../utils/mail-auth"; -// Single-Source-of-Truth (Mo's Finding #4) -// @ts-expect-error — .mjs ohne types, GAMBLING_KEYWORDS ist string[] -import { GAMBLING_KEYWORDS } from "../../utils/gambling-keywords.mjs"; - +import { classifyMail } from "../../utils/mail-classifier"; /** * POST /api/mail/scan-internal * Called by cron or IMAP proxy. Scans ALL mailbox folders. * Free: only custom domains + keywords. Pro/Legend: global blocklist + custom. + * + * Klassifikations-Pipeline: Layer 0–4 via mail-classifier.ts. + * Layer 5 (Sample-Capture): nach jeder Klassifikation. */ export default defineEventHandler(async (event) => { const secret = getHeader(event, "x-admin-secret"); @@ -44,7 +45,7 @@ export default defineEventHandler(async (event) => { if (skippedNoConsent > 0) { console.log( - `[scan-internal] skipping ${skippedNoConsent} connections for userId=${userId} — no consent_at (pending re-consent)`, + `[scan-internal] skipping ${skippedNoConsent} connections — no consent_at (pending re-consent)`, ); } @@ -53,8 +54,6 @@ export default defineEventHandler(async (event) => { } // Plan-aware blocklist - // Grace-Period: wenn globalBlocklistGraceUntil noch in der Zukunft liegt, - // behandeln wir den User als 'full' auch wenn sein Plan 'curated' sagt. 
const profile = await getProfile(userId); const limits = getPlanLimits(profile?.plan ?? "free"); const inGrace = @@ -64,20 +63,15 @@ export default defineEventHandler(async (event) => { await deleteOldMailBlocked(userId); + // Groq API Key aus runtimeConfig (Infisical-injiziert) + const config = useRuntimeConfig(event); + const groqApiKey: string = (config.groqApiKey as string) || process.env.GROQ_API_KEY || ""; + const msClientId: string = (config.msOauthClientId as string) || process.env.MS_OAUTH_CLIENT_ID || ""; + let totalScanned = 0; let totalBlocked = 0; - // scan-internal läuft im Cron-Context (kein User-Event). useRuntimeConfig(event) - // funktioniert hier weil event die Admin-Auth-Request-Referenz ist. Falls der - // Daemon triggerScan() direkt ohne echten HTTP-Request aufruft, fällt der - // process.env-Fallback ein — beide Quellen zeigen auf dieselbe Azure Client-ID. - const config = useRuntimeConfig(event); - const msClientId: string = config.msOauthClientId as string || process.env.MS_OAUTH_CLIENT_ID || ""; - for (const connection of eligibleConnections) { - // resolveImapAuth() wählt automatisch den richtigen Auth-Pfad: - // oauth2_microsoft → Access-Token (mit proaktivem Refresh falls abgelaufen) - // alle anderen → App-Password decrypt let imapAuth: { user: string; accessToken: string } | { user: string; pass: string }; try { imapAuth = await resolveImapAuth(connection, msClientId); @@ -85,8 +79,6 @@ export default defineEventHandler(async (event) => { continue; } - // useStarttls=true → STARTTLS (secure=false + requireTLS=true) - // rejectUnauthorized=false → self-signed Certs zulassen (nur Custom-IMAP) const useImplicitTls = !connection.useStarttls; const imap = new ImapFlow({ host: connection.imapHost, @@ -104,7 +96,6 @@ export default defineEventHandler(async (event) => { try { await imap.connect(); - // Scan ALL mailbox folders (not just hardcoded list) const mailboxes = await imap.list(); const scannable = mailboxes.filter( (mb: any) => 
!mb.flags?.has("\\Noselect"), @@ -137,24 +128,22 @@ export default defineEventHandler(async (event) => { const allUids = allMessages.map( (m: any) => `${mb.path}:${String(m.uid ?? m.seq)}`, ); + + // Alle Sender-Domains sammeln für Blocklist-Lookup + const senderDomains = allMessages + .map((m: any) => + ((m.envelope?.from?.[0]?.address ?? "").toLowerCase().split("@")[1] ?? ""), + ) + .filter(Boolean); + const [blockedDomainSet, alreadyBlockedSet] = await Promise.all([ - getBlocklistedDomainsSet( - allMessages - .map( - (m: any) => - (m.envelope?.from?.[0]?.address ?? "") - .toLowerCase() - .split("@")[1] ?? "", - ) - .filter(Boolean), - userId, - includeGlobal, - ), + getBlocklistedDomainsSet(senderDomains, userId, includeGlobal), getAlreadyBlockedUidSet(allUids, userId), ]); const toInsert: Parameters[0] = []; const uidsToDelete: string[] = []; + const sampleInserts: Parameters[0][] = []; for (const msg of allMessages) { const from = msg.envelope?.from?.[0]; @@ -164,18 +153,34 @@ export default defineEventHandler(async (event) => { const msgDate = msg.envelope?.date ?? new Date(); const uid = `${mb.path}:${String(msg.uid ?? msg.seq)}`; - const haystack = `${senderEmail} ${subject}`.toLowerCase(); - const isGamblingKeyword = GAMBLING_KEYWORDS.some((kw) => - haystack.includes(kw), - ); - const senderDomain = senderEmail.split("@")[1] ?? ""; - const isBlocklisted = senderDomain - ? blockedDomainSet.has(senderDomain) - : false; - - if (!isGamblingKeyword && !isBlocklisted) continue; + // Layer 0: Already blocked → skip, kein Sample if (alreadyBlockedSet.has(uid)) continue; + const result = await classifyMail({ + mail: { senderEmail, senderName, subject }, + blockedDomainSet, + groqApiKey, + }); + + // Layer 5: Sample-Capture (immer, außer Layer 0) + const senderDomain = senderEmail.split("@")[1] ?? null; + sampleInserts.push({ + userId, + connectionId: connection.id, + senderName: senderName?.slice(0, 255) ?? null, + senderDomain: senderDomain?.slice(0, 255) ?? 
null, + relayDecodedDomain: result.relayDecodedDomain?.slice(0, 255) ?? null, + subject: subject.slice(0, 998) || null, + features: result.features as unknown as Record, + finalAction: result.action, + triggerSource: result.triggerSource, + groqIsGambling: result.groq?.isGambling ?? null, + groqConfidence: result.groq?.confidence ?? null, + groqReason: result.groq?.reason ?? null, + }); + + if (result.action !== "blocked") continue; + uidsToDelete.push(String(msg.uid)); toInsert.push({ userId, @@ -186,6 +191,7 @@ export default defineEventHandler(async (event) => { subject: subject.slice(0, 200) || "(kein Betreff)", receivedAt: msgDate, action: "deleted", + triggerSource: result.triggerSource, }); newlyBlocked++; } @@ -212,7 +218,13 @@ export default defineEventHandler(async (event) => { await insertMailBlocked(toInsert); - // Aggregat-Stats aktualisieren (vor 24h-Cleanup resistent) + // Samples fire-and-forget (kein Scan-Result abhängig davon) + if (sampleInserts.length > 0) { + Promise.all(sampleInserts.map((s) => insertMailClassificationSample(s))).catch((err) => { + console.warn("[scan-internal] sample insert failed (non-fatal):", err); + }); + } + if (toInsert.length > 0) { const providerMeta = resolveProviderMeta(connection.imapHost); await upsertMailBlockedStat({ diff --git a/backend/server/api/mail/scan.post.ts b/backend/server/api/mail/scan.post.ts index b031f23..f5bade4 100644 --- a/backend/server/api/mail/scan.post.ts +++ b/backend/server/api/mail/scan.post.ts @@ -6,21 +6,22 @@ import { insertMailBlocked, upsertMailBlockedStat, updateMailConnectionScanStats, + insertMailClassificationSample, } from "../../db/mail"; import { getBlocklistedDomainsSet } from "../../db/domains"; import { getProfile } from "../../db/profile"; import { getPlanLimits } from "../../utils/plan-features"; import { resolveProviderMeta } from "../../utils/imap-providers"; import { resolveImapAuth } from "../../utils/mail-auth"; -// Single-Source-of-Truth (Mo's Finding #4) -// 
@ts-expect-error — .mjs ohne types, GAMBLING_KEYWORDS ist string[] -import { GAMBLING_KEYWORDS } from "../../utils/gambling-keywords.mjs"; - +import { classifyMail } from "../../utils/mail-classifier"; /** * POST /api/mail/scan * Scannt ALLE Ordner (INBOX, Spam, Papierkorb, All Mail …) nach Gambling-Mails. * Free-User: nur eigene Domains + Keywords. Pro/Legend: globale Blocklist + eigene. + * + * Klassifikations-Pipeline: Layer 0–4 via mail-classifier.ts. + * Layer 5 (Sample-Capture): nach jeder Klassifikation. */ export default defineEventHandler(async (event) => { const user = await requireUser(event); @@ -46,7 +47,6 @@ export default defineEventHandler(async (event) => { // Plan-aware: Free users get only custom domains, Pro/Legend get global blocklist const profile = await getProfile(user.id); const limits = getPlanLimits(profile?.plan ?? "free"); - // Grace-Period berücksichtigen const inGrace = profile?.globalBlocklistGraceUntil != null && new Date(profile.globalBlocklistGraceUntil) > new Date(); @@ -54,16 +54,14 @@ export default defineEventHandler(async (event) => { await deleteOldMailBlocked(user.id); + const config = useRuntimeConfig(event); + const groqApiKey: string = (config.groqApiKey as string) || process.env.GROQ_API_KEY || ""; + const msClientId: string = (config.msOauthClientId as string) || process.env.MS_OAUTH_CLIENT_ID || ""; + let totalScanned = 0; let totalBlocked = 0; - const config = useRuntimeConfig(event); - const msClientId: string = config.msOauthClientId as string || process.env.MS_OAUTH_CLIENT_ID || ""; - for (const connection of eligibleConnections) { - // resolveImapAuth() wählt automatisch den richtigen Auth-Pfad: - // oauth2_microsoft → Access-Token (mit proaktivem Refresh falls abgelaufen) - // alle anderen → App-Password decrypt let imapAuth: { user: string; accessToken: string } | { user: string; pass: string }; try { imapAuth = await resolveImapAuth(connection, msClientId); @@ -71,8 +69,6 @@ export default 
defineEventHandler(async (event) => { continue; } - // useStarttls=true → STARTTLS (secure=false + requireTLS=true) - // rejectUnauthorized=false → self-signed Certs zulassen (nur Custom-IMAP) const useImplicitTls = !connection.useStarttls; const imap = new ImapFlow({ host: connection.imapHost, @@ -90,7 +86,6 @@ export default defineEventHandler(async (event) => { try { await imap.connect(); - // Scan ALL mailbox folders (not just hardcoded list) const mailboxes = await imap.list(); const scannable = mailboxes.filter( (mb: any) => !mb.flags?.has("\\Noselect"), @@ -120,24 +115,21 @@ export default defineEventHandler(async (event) => { const allUids = allMessages.map( (m: any) => `${mb.path}:${String(m.uid ?? m.seq)}`, ); + + const senderDomains = allMessages + .map((m: any) => + ((m.envelope?.from?.[0]?.address ?? "").toLowerCase().split("@")[1] ?? ""), + ) + .filter(Boolean); + const [blockedDomainSet, alreadyBlockedSet] = await Promise.all([ - getBlocklistedDomainsSet( - allMessages - .map( - (m: any) => - (m.envelope?.from?.[0]?.address ?? "") - .toLowerCase() - .split("@")[1] ?? "", - ) - .filter(Boolean), - user.id, - includeGlobal, - ), + getBlocklistedDomainsSet(senderDomains, user.id, includeGlobal), getAlreadyBlockedUidSet(allUids, user.id), ]); const toInsert: Parameters[0] = []; const uidsToDelete: string[] = []; + const sampleInserts: Parameters[0][] = []; for (const msg of allMessages) { const from = msg.envelope?.from?.[0]; @@ -147,18 +139,34 @@ export default defineEventHandler(async (event) => { const msgDate = msg.envelope?.date ?? new Date(); const uid = `${mb.path}:${String(msg.uid ?? msg.seq)}`; - const haystack = `${senderEmail} ${subject}`.toLowerCase(); - const isGamblingKeyword = GAMBLING_KEYWORDS.some((kw) => - haystack.includes(kw), - ); - const senderDomain = senderEmail.split("@")[1] ?? ""; - const isBlocklisted = senderDomain - ? 
blockedDomainSet.has(senderDomain) - : false; - - if (!isGamblingKeyword && !isBlocklisted) continue; + // Layer 0: Already blocked → skip, kein Sample if (alreadyBlockedSet.has(uid)) continue; + const result = await classifyMail({ + mail: { senderEmail, senderName, subject }, + blockedDomainSet, + groqApiKey, + }); + + // Layer 5: Sample-Capture (immer, außer Layer 0) + const senderDomain = senderEmail.split("@")[1] ?? null; + sampleInserts.push({ + userId: user.id, + connectionId: connection.id, + senderName: senderName?.slice(0, 255) ?? null, + senderDomain: senderDomain?.slice(0, 255) ?? null, + relayDecodedDomain: result.relayDecodedDomain?.slice(0, 255) ?? null, + subject: subject.slice(0, 998) || null, + features: result.features as unknown as Record, + finalAction: result.action, + triggerSource: result.triggerSource, + groqIsGambling: result.groq?.isGambling ?? null, + groqConfidence: result.groq?.confidence ?? null, + groqReason: result.groq?.reason ?? null, + }); + + if (result.action !== "blocked") continue; + uidsToDelete.push(String(msg.uid)); toInsert.push({ userId: user.id, @@ -169,11 +177,11 @@ export default defineEventHandler(async (event) => { subject: subject.slice(0, 200) || "(kein Betreff)", receivedAt: msgDate, action: "deleted", + triggerSource: result.triggerSource, }); newlyBlocked++; } - // Permanently delete gambling mails from this folder if (uidsToDelete.length > 0) { try { await imap.messageDelete(uidsToDelete.join(","), { uid: true }); @@ -193,7 +201,13 @@ export default defineEventHandler(async (event) => { await insertMailBlocked(toInsert); - // Aggregat-Stats aktualisieren (vor 24h-Cleanup resistent) + // Samples fire-and-forget + if (sampleInserts.length > 0) { + Promise.all(sampleInserts.map((s) => insertMailClassificationSample(s))).catch((err) => { + console.warn("[scan] sample insert failed (non-fatal):", err); + }); + } + if (toInsert.length > 0) { const providerMeta = resolveProviderMeta(connection.imapHost); await 
upsertMailBlockedStat({
diff --git a/backend/server/db/mail.ts b/backend/server/db/mail.ts
index 3b9b3a4..c2d9004 100644
--- a/backend/server/db/mail.ts
+++ b/backend/server/db/mail.ts
@@ -183,6 +183,7 @@ export async function insertMailBlocked(
     subject: string;
     receivedAt: Date;
     action: string;
+    triggerSource?: string | null;
   }[],
 ) {
   if (entries.length === 0) return;
@@ -190,6 +191,42 @@ export async function insertMailBlocked(
   await db.mailBlocked.createMany({ data: entries, skipDuplicates: true });
 }
 
+// ─── MailClassificationSample ─────────────────────────────────────────────────
+
+/**
+ * Writes one classification sample row for ML phase 3.
+ * Called after EVERY classification (except layer 0 / already-blocked skips).
+ *
+ * GDPR: features only, no mail content (no body). Subject + sender are stored
+ * as detection signals. The model defines no relation to the user, so samples
+ * must be deleted explicitly when an account is deleted (Art. 17).
+ */
+export async function insertMailClassificationSample(entry: {
+  userId: string;
+  connectionId: string | null;
+  senderName: string | null;
+  senderDomain: string | null;
+  relayDecodedDomain: string | null;
+  subject: string | null;
+  // features is a Prisma Json field — InputJsonValue does not accept a plain Record.
+  // It is round-tripped through JSON.parse(JSON.stringify(...)) below to satisfy TS.
+  features: Record<string, unknown>;
+  finalAction: string;
+  triggerSource: string;
+  groqIsGambling?: boolean | null;
+  groqConfidence?: number | null;
+  groqReason?: string | null;
+}) {
+  const db = usePrisma();
+  // JSON.parse(JSON.stringify(features)) yields a plain JSON value that Prisma accepts.
+  // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
+  const featuresJson = JSON.parse(JSON.stringify(entry.features));
+  await db.mailClassificationSample.create({
+    // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
+    data: { ...entry, features: featuresJson },
+  });
+}
+
 /**
  * Gibt alle MailConnections eines Users zurück bei denen consent_at noch NULL ist.
  * Wird vom pending-consent.get.ts Endpoint für den Re-Consent-Modal-Trigger genutzt.
diff --git a/backend/server/utils/mail-classifier.ts b/backend/server/utils/mail-classifier.ts
new file mode 100644
index 0000000..b4f21af
--- /dev/null
+++ b/backend/server/utils/mail-classifier.ts
@@ -0,0 +1,657 @@
+/**
+ * Mail classification pipeline (layers 0–4 + sample capture).
+ *
+ * Architecture:
+ *   Layer 0   — skip guard (already blocked / no consent)
+ *   Layer 1   — whitelist (wetter, wettkampf …) → PASS
+ *   Layer 2   — domain hard block (blocklist)
+ *   Layer 2.5 — brand + random-token detection (hard block without LLM)
+ *   Layer 3   — score 0–100 (deterministic)
+ *   Layer 4   — Groq borderline check (score 25–75, with local-part redaction)
+ *   Layer 5   — MailClassificationSample insert (always, except layer 0)
+ *
+ * The layer logic is written as pure functions → unit-testable without DB mocks.
+ *
+ * GDPR notes:
+ *   - Mail content (body) is never persisted (data minimisation, Art. 5).
+ *   - The local part of the sender address is redacted before the Groq call
+ *     (unless it contains casino keywords itself — then it is a detection signal).
+ *   - userId in logs only when strictly necessary (data minimisation, Art. 5).
+ *   - MailClassificationSample: the schema defines no userId relation, so samples must be deleted explicitly on account deletion (Art. 17).
+ */
+
+// eslint-disable-next-line @typescript-eslint/ban-ts-comment
+// @ts-ignore — .mjs without types, exports are string[]
+import { GAMBLING_KEYWORDS, GAMBLING_WHITELIST } from "./gambling-keywords.mjs";
+
+// ─── Types ─────────────────────────────────────────────────────────────────────
+
+export type ClassificationAction = "blocked" | "passed";
+
+export type TriggerSource =
+  | "domain"
+  | "relay-decoded"
+  | "brand+random"
+  | `score:${number}`
+  | `llm:${string}`
+  | "whitelist"
+  | "no-signal";
+
+export interface MailInput {
+  /** Sender e-mail address (lowercase, as delivered by IMAP) */
+  senderEmail: string;
+  /** Display name of the sender (may be empty) */
+  senderName: string | null;
+  /** Subject line */
+  subject: string;
+}
+
+export interface ClassificationResult {
+  action: ClassificationAction;
+  triggerSource: TriggerSource;
+  score: number;
+  /** Real domain extracted from a relay address (e.g. gamblezen.com) */
+  relayDecodedDomain: string | null;
+  /** Groq verdict (only when layer 4 ran) */
+  groq?: {
+    isGambling: boolean;
+    confidence: number;
+    reason: string;
+  };
+  /** Score components for MailClassificationSample.features */
+  features: ClassificationFeatures;
+}
+
+export interface ClassificationFeatures {
+  score: number;
+  domainBlocked: boolean;
+  relayDecoded: boolean;
+  brandMatch: boolean;
+  randomTokens: boolean;
+  keywordHitsSubject: string[];
+  keywordHitsDomain: string[];
+  keywordHitsName: string[];
+  styleFlags: string[];
+  whitelistHit: boolean;
+}
+
+// ─── Score weights (TS constants, no config-file overhead) ────────────────────
+
+export const SCORE_WEIGHTS = {
+  // Domain indicators
+  DOMAIN_GAMBLING_KEYWORD: 40, // domain contains a gambling term (bet, casino, slots …)
+  DOMAIN_SHORT_RANDOM: 15, // domain root of at most 5 chars mixing letters and digits (e.g. 1win)
+
+  // Subject indicators
+  SUBJECT_GAMBLING_KEYWORD: 35, // keyword in the subject (casino, jackpot, freispiel …)
+  SUBJECT_MONEY_PATTERN: 20, // currency symbol + number (e.g. "100€ Bonus")
+  SUBJECT_URGENCY: 15, // "Nur heute", "Letzte Chance", "Ablaufdatum"
+  SUBJECT_ALL_CAPS_WORD: 5, // at least one ALL-CAPS word in the subject
+
+  // Display-name indicators
+  SENDER_NAME_GAMBLING_KEYWORD: 30, // gambling term in the sender name
+  SENDER_NAME_BRAND_MATCH: 20, // name matches a known gambling brand (normalised)
+
+  // Layer 2.5 score additions (when no hard block was triggered)
+  BRAND_MATCH_NO_RANDOM: 35, // brand match without random tokens (no hard block)
+  RANDOM_TOKENS_NO_BRAND: 10, // random tokens without a brand match
+} as const;
+
+// Hard-block threshold: score >= 80 → BLOCK without LLM
+const SCORE_HARD_BLOCK_THRESHOLD = 80;
+// Borderline range: 25–75 → Groq call. NOTE(review): scores 76–79 sit between this range and the hard-block threshold — confirm how the pipeline treats them.
+const SCORE_BORDERLINE_LOW = 25;
+const SCORE_BORDERLINE_HIGH = 75;
+
+// ─── Known gambling brands (for brand-match normalisation) ─────────────────────
+// Derived from GAMBLING_KEYWORDS + typical blocklist domains.
+// Normalisation rule: lowercase, whitespace and the separators - . _ removed.
+const GAMBLING_BRANDS: string[] = [
+  "casino", "bet365", "bwin", "tipico", "unibet", "betway", "888casino",
+  "pokerstars", "interwetten", "netbet", "leovegas", "mrgreen",
+  "betsson", "neobet", "mybet", "lottoland", "betano", "williamhill",
+  "paddypower", "betfair", "stake", "rolletto", "vbet", "1xbet", "melbet",
+  "mostbet", "luckyvibe", "spinz", "casinoly", "rabona",
+  "justcasino", "getslots", "rocketplay", "freshcasino",
+  "nomnomcasino", "gamblezen", "betandplay",
+];
+
+// ─── Relay decoder ─────────────────────────────────────────────────────────────
+
+/**
+ * Extracts the real target domain from an e-mail relay address.
+ *
+ * Known patterns:
+ *   bounces+user=example.com@sendgrid.net → example.com
+ *   track.user=gamblezen.com@mailchimp.com → gamblezen.com
+ *   a1b2c3_user_at_betandplay.com@em.em.xyz → betandplay.com
+ *   user=betandplay.com@bounce.em.example → betandplay.com
+ *
+ * Looks for `=domain.tld` or `_at_domain.tld` in the local part; returns the lowercased domain or null.
+ */
+export function extractRelayedDomain(senderEmail: string): string | null {
+  if (!senderEmail.includes("@")) return null;
+  const [localPart] = senderEmail.split("@");
+
+  // Pattern 1: user=domain.tld (SendGrid, Mailchimp, SES bounces)
+  const eqMatch = localPart.match(/=([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[+_&]|$)/i);
+  if (eqMatch) return eqMatch[1].toLowerCase();
+
+  // Pattern 2: _at_domain.tld (less common, some custom relay setups)
+  const atMatch = localPart.match(/_at_([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[_+]|$)/i);
+  if (atMatch) return atMatch[1].toLowerCase();
+
+  return null;
+}
+
+// ─── Brand normalisation ───────────────────────────────────────────────────────
+
+/**
+ * Normalises a string for brand comparison: lowercase, with whitespace and - . _ removed.
+ * "BetandPlay" → "betandplay"; a TLD is kept: "bet-and-play.com" → "betandplaycom".
+ */
+export function normalizeBrand(s: string): string {
+  return s.toLowerCase().replace(/[\s\-._]/g, "");
+}
+
+/**
+ * Checks whether a normalised string contains a known gambling brand; inputs shorter than 4 characters never match.
+ * NOTE(review): substring matching lets short brands hit unrelated words ("stake" in "mistake") — confirm this is acceptable.
+ */
+export function matchesGamblingBrand(normalized: string): boolean {
+  if (normalized.length < 4) return false;
+  return GAMBLING_BRANDS.some((brand) => normalized.includes(brand));
+}
+
+/**
+ * Derives brand candidates from a domain for the match check.
+ * "betand-play.com" → ["betandplay", "betandplaycom"] (normalised root, normalised full domain).
+ */
+function domainToBrandCandidates(domain: string): string[] {
+  const root = domain.split(".")[0] ?? "";
+  return [normalizeBrand(root), normalizeBrand(domain)];
+}
+
+// ─── Random-token detection ───────────────────────────────────────────────────
+
+/**
+ * Detects random-looking tokens in the local part of an e-mail address.
+ *
+ * A "random token" is at least 6 characters long and mixes ASCII letters with
+ * digits. Tokens are separated by `_`, `-`, `.` or `+`.
+ *
+ * A local part with 2 or more such tokens counts as random-looking —
+ * typical for bulk mailers that embed trackable user IDs.
+ */
+export function hasRandomTokens(localPart: string): boolean {
+  const MIN_TOKEN_LENGTH = 6;
+  const MIN_RANDOM_TOKENS = 2;
+
+  // A token must mix letters and digits. Function words such as "info",
+  // "noreply" or "support" contain no digit and are rejected by this test
+  // alone, so no separate stop-word list is required.
+  const isRandomLooking = (token: string): boolean =>
+    token.length >= MIN_TOKEN_LENGTH &&
+    /[a-z]/i.test(token) &&
+    /[0-9]/.test(token);
+
+  // Split the local part on its separators: _ - . +
+  const randomTokenCount = localPart
+    .split(/[_\-.+]+/)
+    .filter(isRandomLooking).length;
+
+  return randomTokenCount >= MIN_RANDOM_TOKENS;
+}
+
+// ─── Local-part redaction ─────────────────────────────────────────────────────
+
+/**
+ * Redacts the local part of an e-mail address before the Groq call (GDPR).
+ *
+ * EXCEPTION: when the local part itself contains gambling keywords
+ * (e.g. "casino_offers_abc123@mailer.com") it is kept —
+ * in that case it is a classification signal, not PII.
// ─── Local-part redaction ────────────────────────────────────────────────────

/**
 * Redacts the local part of an e-mail address before it is sent to the LLM
 * (data minimisation).
 *
 * Exception: when the caller reports that the local part itself contains a
 * gambling keyword (e.g. "casino_offers_abc123@mailer.com") the address is
 * returned unchanged, because the local part is then a classification signal.
 *
 * The cut is made at the LAST "@": a quoted local part may itself contain an
 * "@", and cutting at the first one would leak the rest of the local part.
 *
 * @param senderEmail Full sender address.
 * @param localPartHasKeyword Whether the local part contains a gambling keyword.
 * @returns The address with its local part replaced by "***", or the input
 *          unchanged when the keyword flag is set or no "@" is present.
 */
export function redactLocalPartForLLM(
  senderEmail: string,
  localPartHasKeyword: boolean,
): string {
  if (localPartHasKeyword) return senderEmail;
  const atIdx = senderEmail.lastIndexOf("@");
  if (atIdx === -1) return senderEmail;
  return `***${senderEmail.slice(atIdx)}`;
}

// ─── Score computation (Layer 3) ─────────────────────────────────────────────

/** Result of the heuristic scoring layer. */
interface ScoreResult {
  /** Sum of all matched weights, capped at 100. */
  score: number;
  /** First gambling keyword found in the subject (at most one entry). */
  keywordHitsSubject: string[];
  /** First gambling keyword found in the sender domain (at most one entry). */
  keywordHitsDomain: string[];
  /** First gambling keyword found in the sender display name (at most one entry). */
  keywordHitsName: string[];
  /** Style signals: "money-pattern", "urgency", "all-caps", "short-random-domain". */
  styleFlags: string[];
  /** True when a whitelist term matched; the score is then 0. */
  whitelistHit: boolean;
}

/**
 * Computes the heuristic gambling score for one mail.
 *
 * A whitelist term in subject, sender address or sender name short-circuits
 * to score 0. Otherwise the weights from SCORE_WEIGHTS are added for: a
 * gambling keyword in the domain / subject / sender name (first hit only,
 * counted once each), a currency amount in the subject, an urgency phrase, an
 * all-caps word of four or more ASCII letters, a short letter+digit first
 * domain label, and the brand/random-token combination passed in by the caller.
 *
 * @param senderEmail Sender address (compared case-insensitively).
 * @param senderName Display name, or null when absent.
 * @param subject Mail subject.
 * @param brandMatchFound Whether the caller detected a gambling brand.
 * @param randomTokensFound Whether the caller detected random local-part tokens.
 * @returns Score (0–100) plus the signals that contributed to it.
 */
export function computeScore(
  senderEmail: string,
  senderName: string | null,
  subject: string,
  brandMatchFound: boolean,
  randomTokensFound: boolean,
): ScoreResult {
  const subjectLower = subject.toLowerCase();
  const senderEmailLower = senderEmail.toLowerCase();
  const senderNameLower = (senderName ?? "").toLowerCase();

  // The domain is whatever follows the LAST "@" (a quoted local part may contain "@").
  const atIdx = senderEmailLower.lastIndexOf("@");
  const domain = atIdx === -1 ? "" : senderEmailLower.slice(atIdx + 1);
  const domainRoot = domain.split(".")[0] ?? "";

  // ── Whitelist check (Layer 1) ──
  const whitelisted = (GAMBLING_WHITELIST as string[]).some(
    (w) => subjectLower.includes(w) || senderEmailLower.includes(w) || senderNameLower.includes(w),
  );
  if (whitelisted) {
    return {
      score: 0,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: true,
    };
  }

  let score = 0;
  const keywordHitsSubject: string[] = [];
  const keywordHitsDomain: string[] = [];
  const keywordHitsName: string[] = [];
  const styleFlags: string[] = [];

  // Only the first matching keyword per field is recorded and scored.
  const keywords = GAMBLING_KEYWORDS as string[];
  const firstKeywordIn = (text: string): string | undefined =>
    keywords.find((kw) => text.includes(kw));

  // ── Domain keywords ── (the first label is a substring of the domain, so one check suffices)
  const domainHit = firstKeywordIn(domain);
  if (domainHit !== undefined) {
    keywordHitsDomain.push(domainHit);
    score += SCORE_WEIGHTS.DOMAIN_GAMBLING_KEYWORD;
  }

  // ── Subject keywords ──
  const subjectHit = firstKeywordIn(subjectLower);
  if (subjectHit !== undefined) {
    keywordHitsSubject.push(subjectHit);
    score += SCORE_WEIGHTS.SUBJECT_GAMBLING_KEYWORD;
  }

  // ── Sender-name keywords ──
  const nameHit = firstKeywordIn(senderNameLower);
  if (nameHit !== undefined) {
    keywordHitsName.push(nameHit);
    score += SCORE_WEIGHTS.SENDER_NAME_GAMBLING_KEYWORD;
  }

  // ── Money pattern in the subject (€/$/£ next to a digit) ──
  if (/[€$£]\s*\d|\d\s*[€$£]/.test(subject)) {
    styleFlags.push("money-pattern");
    score += SCORE_WEIGHTS.SUBJECT_MONEY_PATTERN;
  }

  // ── Urgency phrases in the subject ──
  const URGENCY_PATTERNS = [
    "nur heute", "letzte chance", "läuft ab", "ablaufdatum",
    "expires", "last chance", "limited time", "jetzt einlösen",
    "sofort", "nur noch", "endet heute",
  ];
  if (URGENCY_PATTERNS.some((p) => subjectLower.includes(p))) {
    styleFlags.push("urgency");
    score += SCORE_WEIGHTS.SUBJECT_URGENCY;
  }

  // ── ALL-CAPS word in the subject (original casing, ASCII letters only) ──
  if (/\b[A-Z]{4,}\b/.test(subject)) {
    styleFlags.push("all-caps");
    score += SCORE_WEIGHTS.SUBJECT_ALL_CAPS_WORD;
  }

  // ── Short random-looking first domain label (<= 5 chars, letters and digits) ──
  if (
    domainRoot.length > 0 &&
    domainRoot.length <= 5 &&
    /[a-z]/.test(domainRoot) &&
    /[0-9]/.test(domainRoot)
  ) {
    styleFlags.push("short-random-domain");
    score += SCORE_WEIGHTS.DOMAIN_SHORT_RANDOM;
  }

  // ── Layer 2.5 score additions (exactly one of the two signals present) ──
  if (brandMatchFound && !randomTokensFound) {
    score += SCORE_WEIGHTS.BRAND_MATCH_NO_RANDOM;
  }
  if (!brandMatchFound && randomTokensFound) {
    score += SCORE_WEIGHTS.RANDOM_TOKENS_NO_BRAND;
  }

  return {
    score: Math.min(score, 100),
    keywordHitsSubject,
    keywordHitsDomain,
    keywordHitsName,
    styleFlags,
    whitelistHit: false,
  };
}

// ─── Groq LLM call (Layer 4) ─────────────────────────────────────────────────

/** Normalised verdict returned by the LLM classifier. */
interface GroqVerdict {
  isGambling: boolean;
  /** Always within [0, 1]. */
  confidence: number;
  /** Model-provided reason, truncated to 300 characters; "parse-error" when the reply was unusable. */
  reason: string;
}

/**
 * Asks the Groq-hosted model (llama-3.3-70b-versatile) to classify a
 * borderline mail.
 *
 * Only the sender name, the (possibly local-part-redacted) sender address and
 * the subject are sent — no mail body.
 *
 * @param params.senderName Display name or null.
 * @param params.senderEmailRedacted Sender address as it may be shown to the LLM.
 * @param params.subject Mail subject.
 * @param params.groqApiKey Bearer token for the Groq API.
 * @param params.timeoutMs Optional request timeout in milliseconds (default 10000).
 * @returns The parsed verdict. A reply that is not a JSON object yields
 *          `{ isGambling: false, confidence: 0, reason: "parse-error" }`.
 * @throws Error on a non-2xx response; the fetch rejection on network failure
 *         or when the timeout elapses.
 */
export async function callGroqClassifier(params: {
  senderName: string | null;
  senderEmailRedacted: string;
  subject: string;
  groqApiKey: string;
  timeoutMs?: number;
}): Promise<GroqVerdict> {
  const prompt = `You are a spam classifier for a gambling addiction recovery app.
Classify whether this email is from a gambling/betting operator.

Sender name: ${params.senderName ?? "(none)"}
Sender email: ${params.senderEmailRedacted}
Subject: ${params.subject}

Respond with ONLY valid JSON in this exact format:
{"isGambling": true/false, "confidence": 0.0-1.0, "reason": "one sentence"}

Do not include any other text.`;

  const response = await fetch("https://api.groq.com/openai/v1/chat/completions", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${params.groqApiKey}`,
    },
    body: JSON.stringify({
      model: "llama-3.3-70b-versatile",
      messages: [{ role: "user", content: prompt }],
      temperature: 0,
      max_tokens: 100,
      response_format: { type: "json_object" },
    }),
    // Without a deadline a stalled connection would block the whole pipeline.
    signal: AbortSignal.timeout(params.timeoutMs ?? 10000),
  });

  if (!response.ok) {
    const errText = await response.text().catch(() => "");
    throw new Error(`Groq API error ${response.status}: ${errText.slice(0, 200)}`);
  }

  const data = (await response.json()) as {
    choices?: { message?: { content?: string } }[];
  };
  const raw = data.choices?.[0]?.message?.content ?? "{}";

  // An unusable reply is treated as "not gambling" so that an LLM glitch never blocks a mail.
  const parseError: GroqVerdict = { isGambling: false, confidence: 0, reason: "parse-error" };

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return parseError;
  }
  if (typeof parsed !== "object" || parsed === null) {
    return parseError;
  }

  const fields = parsed as Record<string, unknown>;

  // Boolean("false") is true — accept only a real `true` or the string "true".
  const flag = fields.isGambling;
  const isGambling =
    flag === true || (typeof flag === "string" && flag.trim().toLowerCase() === "true");

  // The model may answer outside the requested 0.0–1.0 range (e.g. 85); clamp it.
  const rawConfidence = fields.confidence;
  const confidence =
    typeof rawConfidence === "number" && Number.isFinite(rawConfidence)
      ? Math.min(1, Math.max(0, rawConfidence))
      : 0;

  const reason = typeof fields.reason === "string" ? fields.reason.slice(0, 300) : "";

  return { isGambling, confidence, reason };
}

// ─── Main pipeline ───────────────────────────────────────────────────────────

/** Input for {@link classifyMail}. */
export interface ClassifyMailParams {
  mail: MailInput;
  /** Set of blocked domains; looked up with the lower-cased sender domain. */
  blockedDomainSet: Set<string>;
  /** Groq API key — when empty, Layer 4 is skipped. */
  groqApiKey: string;
}
/** Feature snapshot attached to every classification result. */
type ClassifyMailFeatures = ScoreResult & {
  domainBlocked: boolean;
  relayDecoded: boolean;
  brandMatch: boolean;
  randomTokens: boolean;
};

/**
 * Shape of the object returned by classifyMail.
 *
 * NOTE(review): the generic argument of the original `Promise<…>` return type
 * is not recoverable from this chunk, so the shape is spelled out here from
 * the returned object literals. If the module already declares a result type,
 * use that instead — TODO confirm.
 */
interface ClassifyMailOutcome {
  action: ClassificationAction;
  triggerSource: TriggerSource;
  score: number;
  relayDecodedDomain: string | null;
  /** Present only when the LLM layer produced the decision. */
  groq?: GroqVerdict;
  features: ClassifyMailFeatures;
}

/** Builds the feature snapshot for decisions taken before Layer 3 (no keyword/style signals computed). */
function featuresWithoutScoring(
  score: number,
  overrides: Partial<ClassifyMailFeatures> = {},
): ClassifyMailFeatures {
  return {
    score,
    domainBlocked: false,
    relayDecoded: false,
    brandMatch: false,
    randomTokens: false,
    keywordHitsSubject: [],
    keywordHitsDomain: [],
    keywordHitsName: [],
    styleFlags: [],
    whitelistHit: false,
    ...overrides,
  };
}

/**
 * Classifies a single mail by running it through the layers in order:
 *
 *   1. Whitelist term anywhere in address / subject / name → passed ("whitelist")
 *   2. Sender domain or relay-decoded domain in the block set → blocked
 *      ("domain" / "relay-decoded")
 *   2.5 Gambling brand AND random local-part tokens → blocked ("brand+random")
 *   3. Heuristic score: >= SCORE_HARD_BLOCK_THRESHOLD → blocked ("score:NN"),
 *      < SCORE_BORDERLINE_LOW → passed ("no-signal")
 *   4. Score up to SCORE_BORDERLINE_HIGH with an API key → LLM verdict ("llm:0.XX")
 *   Fallback (no key, LLM call failed, or score between the borderline and
 *   hard-block thresholds): blocked when score >= 50, otherwise passed ("score:NN").
 *
 * The Groq call is the only I/O performed here; nothing is written to the
 * database by this function.
 *
 * @param params Mail, block set and Groq API key.
 * @returns The decision together with the feature snapshot that led to it.
 */
export async function classifyMail(params: ClassifyMailParams): Promise<ClassifyMailOutcome> {
  const { mail, blockedDomainSet, groqApiKey } = params;
  const { senderEmail, senderName, subject } = mail;

  const senderEmailLower = senderEmail.toLowerCase();
  // Split at the LAST "@": a quoted local part may itself contain "@".
  const atIdx = senderEmailLower.lastIndexOf("@");
  const domain = atIdx === -1 ? "" : senderEmailLower.slice(atIdx + 1);
  const localPart = atIdx === -1 ? senderEmailLower : senderEmailLower.slice(0, atIdx);

  // ── Layer 1: whitelist ──────────────────────────────────────────────────────
  // NOTE(review): this substring whitelist runs before the domain block list, so
  // a block-listed sender passes when any whitelisted term appears in its
  // subject or name. Confirm that this priority is intended.
  const haystack = `${senderEmailLower} ${subject} ${senderName ?? ""}`.toLowerCase();
  if ((GAMBLING_WHITELIST as string[]).some((w) => haystack.includes(w))) {
    return {
      action: "passed",
      triggerSource: "whitelist",
      score: 0,
      relayDecodedDomain: null,
      features: featuresWithoutScoring(0, { whitelistHit: true }),
    };
  }

  // ── Layer 2: domain hard block ──────────────────────────────────────────────
  if (domain && blockedDomainSet.has(domain)) {
    return {
      action: "blocked",
      triggerSource: "domain",
      score: 100,
      relayDecodedDomain: null,
      features: featuresWithoutScoring(100, { domainBlocked: true }),
    };
  }

  // ── Layer 2: relay-decoded domain block ─────────────────────────────────────
  const relayDecodedDomain = extractRelayedDomain(senderEmailLower);
  if (relayDecodedDomain && blockedDomainSet.has(relayDecodedDomain)) {
    return {
      action: "blocked",
      triggerSource: "relay-decoded",
      score: 100,
      relayDecodedDomain,
      features: featuresWithoutScoring(100, { relayDecoded: true }),
    };
  }

  // ── Layer 2.5: brand + random-token hard block ──────────────────────────────
  // Brand candidates: normalised display name plus the candidates derived from
  // the sender domain and, if present, the relay-decoded domain.
  const brandCandidates = [
    normalizeBrand(senderName ?? ""),
    ...domainToBrandCandidates(domain),
    ...(relayDecodedDomain ? domainToBrandCandidates(relayDecodedDomain) : []),
  ];
  const brandMatch = brandCandidates.some((c) => c.length >= 4 && matchesGamblingBrand(c));
  const randomTokens = hasRandomTokens(localPart);

  if (brandMatch && randomTokens) {
    return {
      action: "blocked",
      triggerSource: "brand+random",
      score: 100,
      relayDecodedDomain,
      features: featuresWithoutScoring(100, {
        relayDecoded: !!relayDecodedDomain,
        brandMatch: true,
        randomTokens: true,
      }),
    };
  }

  // ── Layer 3: score ──────────────────────────────────────────────────────────
  // computeScore repeats the whitelist test on the same three strings that
  // Layer 1 already searched, so `whitelistHit` cannot be true at this point
  // and needs no separate branch.
  const scoreResult = computeScore(senderEmailLower, senderName, subject, brandMatch, randomTokens);
  const score = scoreResult.score;

  // Exactly one of the returns below executes, so they can share this object.
  const features: ClassifyMailFeatures = {
    ...scoreResult,
    domainBlocked: false,
    relayDecoded: !!relayDecodedDomain,
    brandMatch,
    randomTokens,
  };

  // At or above the hard-block threshold → block without consulting the LLM.
  if (score >= SCORE_HARD_BLOCK_THRESHOLD) {
    const triggerSource: TriggerSource = `score:${score}`;
    return { action: "blocked", triggerSource, score, relayDecodedDomain, features };
  }

  // Below the borderline range → pass without consulting the LLM.
  if (score < SCORE_BORDERLINE_LOW) {
    return { action: "passed", triggerSource: "no-signal", score, relayDecodedDomain, features };
  }

  // ── Layer 4: Groq for borderline scores ─────────────────────────────────────
  // score >= SCORE_BORDERLINE_LOW is already guaranteed by the return above.
  if (score <= SCORE_BORDERLINE_HIGH && groqApiKey) {
    // Keep the local part only when it carries a gambling keyword itself.
    // localPart is derived from the lower-cased address, so no further lowering is needed.
    const localPartHasKeyword = (GAMBLING_KEYWORDS as string[]).some((kw: string) =>
      localPart.includes(kw),
    );
    const senderEmailRedacted = redactLocalPartForLLM(senderEmailLower, localPartHasKeyword);

    let groqVerdict: GroqVerdict | null = null;
    try {
      groqVerdict = await callGroqClassifier({
        senderName,
        senderEmailRedacted,
        subject,
        groqApiKey,
      });
    } catch (err) {
      // An LLM/API failure must not decide the outcome: fall through to the
      // score-based fallback below.
      console.warn("[mail-classifier] Groq call failed, falling back to score-based decision:", err);
    }

    if (groqVerdict) {
      const action: ClassificationAction = groqVerdict.isGambling ? "blocked" : "passed";
      const triggerSource: TriggerSource = `llm:${groqVerdict.confidence.toFixed(2)}`;
      return { action, triggerSource, score, relayDecodedDomain, groq: groqVerdict, features };
    }
  }

  // Fallback without an LLM verdict (no key, call failed, or score above the
  // LLM range): block at 50 or more, pass below.
  const fallbackAction: ClassificationAction = score >= 50 ? "blocked" : "passed";
  const fallbackTrigger: TriggerSource = `score:${score}`;
  return {
    action: fallbackAction,
    triggerSource: fallbackTrigger,
    score,
    relayDecodedDomain,
    features,
  };
}
/**
 * Tests for mail-classifier.ts — the mail classification pipeline.
 *
 * All layers are exercised as plain functions (no DB mock). The Groq layer is
 * never reached on purpose; the one test that passes an API key stubs the
 * global `fetch` and asserts it was not called.
 *
 * Covered:
 *   - extractRelayedDomain()   — relay patterns
 *   - normalizeBrand()         — normalisation
 *   - matchesGamblingBrand()   — brand matching
 *   - hasRandomTokens()        — true/false cases
 *   - redactLocalPartForLLM()  — keep vs. redact
 *   - computeScore()           — score computation
 *   - classifyMail()           — end-to-end: whitelist, domain block,
 *                                relay-decoded block, brand+random block,
 *                                score block, no-signal pass
 */
import { describe, it, expect, vi } from "vitest";

// gambling-keywords.mjs is plain ESM without types — mock it before the import below.
vi.mock("../../server/utils/gambling-keywords.mjs", () => ({
  GAMBLING_KEYWORDS: [
    "casino", "bet365", "bwin", "tipico", "unibet", "betway",
    "pokerstars", "jackpot", "freispiel", "free spin", "bonus code",
    "auszahlung", "glücksspiel", "slots", "roulette", "wette",
    "stake", "rolletto", "vbet", "1xbet", "melbet", "mostbet",
    "luckyvibe", "spinz", "casinoly", "rabona", "justcasino",
    "getslots", "rocketplay", "freshcasino", "betano", "leovegas",
  ],
  GAMBLING_WHITELIST: [
    "wettervorhersage",
    "wetter",
    "wetterbericht",
    "wettkampf",
    "wettbewerb",
  ],
}));

import {
  extractRelayedDomain,
  normalizeBrand,
  hasRandomTokens,
  redactLocalPartForLLM,
  computeScore,
  classifyMail,
  matchesGamblingBrand,
} from "../../server/utils/mail-classifier";

// ─── extractRelayedDomain ────────────────────────────────────────────────────

describe("extractRelayedDomain()", () => {
  it("extrahiert Domain aus SendGrid-bounce-Pattern (user=domain@sendgrid)", () => {
    expect(extractRelayedDomain("bounces+user=gamblezen.com@sendgrid.net"))
      .toBe("gamblezen.com");
  });

  it("extrahiert Domain aus Mailchimp-Track-Pattern (track.user=domain@mc)", () => {
    expect(extractRelayedDomain("track.user=betandplay.com@mailchimp.com"))
      .toBe("betandplay.com");
  });

  it("extrahiert Domain aus _at_-Pattern", () => {
    expect(extractRelayedDomain("a1b2c3_user_at_betandplay.com@em.example.com"))
      .toBe("betandplay.com");
  });

  it("gibt null zurück wenn kein Relay-Pattern erkannt", () => {
    expect(extractRelayedDomain("info@betandplay.com")).toBeNull();
  });

  it("gibt null zurück für direkte Adressen ohne @", () => {
    expect(extractRelayedDomain("noatsign")).toBeNull();
  });

  it("normalisiert extrahierte Domain auf lowercase", () => {
    expect(extractRelayedDomain("bounce=GambleZen.COM@delivery.net"))
      .toBe("gamblezen.com");
  });

  it("gibt null zurück für normale Adressen ohne Relay-Muster", () => {
    expect(extractRelayedDomain("newsletter@example.org")).toBeNull();
  });
});

// ─── normalizeBrand ──────────────────────────────────────────────────────────

describe("normalizeBrand()", () => {
  it("BetandPlay → betandplay", () => {
    expect(normalizeBrand("BetandPlay")).toBe("betandplay");
  });

  it("bet-and-play → betandplay", () => {
    expect(normalizeBrand("bet-and-play")).toBe("betandplay");
  });

  it("Gamble Zen → gamblezen", () => {
    expect(normalizeBrand("Gamble Zen")).toBe("gamblezen");
  });

  it("Mr. Green → mrgreen", () => {
    expect(normalizeBrand("Mr. Green")).toBe("mrgreen");
  });

  it("lucky_vibe → luckyvibe", () => {
    expect(normalizeBrand("lucky_vibe")).toBe("luckyvibe");
  });

  it("unveränderte Kleinbuchstaben bleiben gleich", () => {
    expect(normalizeBrand("casino")).toBe("casino");
  });
});

// ─── matchesGamblingBrand ────────────────────────────────────────────────────

describe("matchesGamblingBrand()", () => {
  it("'gamblezen' matcht", () => {
    expect(matchesGamblingBrand("gamblezen")).toBe(true);
  });

  it("'betandplay' matcht", () => {
    expect(matchesGamblingBrand("betandplay")).toBe(true);
  });

  it("'casino' matcht (exact)", () => {
    expect(matchesGamblingBrand("casino")).toBe(true);
  });

  it("'mrgreen' matcht", () => {
    expect(matchesGamblingBrand("mrgreen")).toBe(true);
  });

  it("'example' matcht nicht", () => {
    expect(matchesGamblingBrand("example")).toBe(false);
  });

  it("zu kurze Strings (< 4 Zeichen) matchen nie", () => {
    expect(matchesGamblingBrand("bet")).toBe(false);
  });

  it("'googlemail' matcht nicht", () => {
    expect(matchesGamblingBrand("googlemail")).toBe(false);
  });
});

// ─── hasRandomTokens ─────────────────────────────────────────────────────────

describe("hasRandomTokens()", () => {
  it("local-part mit 2+ zufälligen Tokens → true", () => {
    // Two random-looking tokens: "hq3a91" and "7xmpl2".
    expect(hasRandomTokens("hq3a91_7xmpl2")).toBe(true);
  });

  it("local-part mit User-ID + Token → true", () => {
    expect(hasRandomTokens("user123abc_ref456xyz")).toBe(true);
  });

  it("'info' → false (Funktionswort)", () => {
    expect(hasRandomTokens("info")).toBe(false);
  });

  it("'noreply' → false (Funktionswort)", () => {
    expect(hasRandomTokens("noreply")).toBe(false);
  });

  it("'newsletter' → false (Funktionswort, kein Digit-Mix)", () => {
    expect(hasRandomTokens("newsletter")).toBe(false);
  });

  it("normaler Local-Part ohne Zufalls-Tokens → false", () => {
    expect(hasRandomTokens("john.doe")).toBe(false);
  });

  it("nur ein random Token (Grenzfall) → false", () => {
    // A single qualifying token stays below the threshold of two.
    expect(hasRandomTokens("abc123")).toBe(false);
  });

  it("echter BetandPlay-typischer Local-Part → true", () => {
    // One function word ("offers") plus two random tokens.
    expect(hasRandomTokens("u7a2b1_offers_ref9x2z")).toBe(true);
  });
});

// ─── redactLocalPartForLLM ───────────────────────────────────────────────────

describe("redactLocalPartForLLM()", () => {
  it("normale Adresse → local-part wird redacted", () => {
    expect(redactLocalPartForLLM("user123@example.com", false))
      .toBe("***@example.com");
  });

  it("Adresse mit Casino-Keyword im local-part → NICHT redacted", () => {
    expect(redactLocalPartForLLM("casino_offers@mailer.net", true))
      .toBe("casino_offers@mailer.net");
  });

  it("normal ohne Keyword-Flag → redacted", () => {
    expect(redactLocalPartForLLM("a1b2c3_track@sendgrid.net", false))
      .toBe("***@sendgrid.net");
  });

  it("Adresse ohne @ → unverändert zurückgegeben", () => {
    expect(redactLocalPartForLLM("noatsign", false)).toBe("noatsign");
  });
});

// ─── computeScore ────────────────────────────────────────────────────────────

describe("computeScore()", () => {
  it("Whitelist-Hit → score=0, whitelistHit=true", () => {
    const result = computeScore(
      "info@wetter.de",
      "Wetter Service",
      "Wettervorhersage für morgen",
      false,
      false,
    );
    expect(result.whitelistHit).toBe(true);
    expect(result.score).toBe(0);
  });

  it("Casino im Betreff → SUBJECT_GAMBLING_KEYWORD += 35", () => {
    const result = computeScore(
      "info@example.com",
      null,
      "Dein Casino-Bonus wartet",
      false,
      false,
    );
    expect(result.keywordHitsSubject).toContain("casino");
    expect(result.score).toBeGreaterThanOrEqual(35);
  });

  it("Geld-Pattern (100€) im Betreff → SUBJECT_MONEY_PATTERN += 20", () => {
    const result = computeScore(
      "info@example.com",
      null,
      "100€ Willkommensbonus jetzt sichern",
      false,
      false,
    );
    expect(result.styleFlags).toContain("money-pattern");
    expect(result.score).toBeGreaterThanOrEqual(20);
  });

  it("Brand-Match ohne Random → BRAND_MATCH_NO_RANDOM += 35", () => {
    const result = computeScore(
      "info@example.com",
      null,
      "Normaler Betreff",
      true, // brandMatch
      false, // randomTokens
    );
    expect(result.score).toBeGreaterThanOrEqual(35);
  });

  it("Random-Tokens ohne Brand → RANDOM_TOKENS_NO_BRAND += 10", () => {
    const result = computeScore(
      "info@example.com",
      null,
      "Newsletter vom Tag",
      false, // brandMatch
      true, // randomTokens
    );
    expect(result.score).toBeGreaterThanOrEqual(10);
  });

  it("Score wird auf max 100 gecapped", () => {
    // Many signals at once: the uncapped sum would exceed 100.
    const result = computeScore(
      "slots@casinobonus.bet",
      "Casino Jackpot",
      "JACKPOT Casino 500€ Freispiele Nur heute Letzte chance",
      true,
      true,
    );
    expect(result.score).toBeLessThanOrEqual(100);
  });
});

// ─── classifyMail() — end-to-end pipeline ────────────────────────────────────

describe("classifyMail() — End-to-End Pipeline", () => {
  // Empty block set for most tests (no domain hard block).
  const emptyDomainSet = new Set<string>();

  // ─── Example 1: Gamblezen via relay ───────────────────────────────────────
  it("Gamblezen-Beispiel: bounces+user=gamblezen.com@em.sendgrid.net → Layer 2 Relay-Decoded-Block", async () => {
    // The envelope domain "em.sendgrid.net" is not blocked, but the relay-decoded
    // domain "gamblezen.com" is — Layer 2 (relay-decoded) fires before Layer 2.5.
    const domainSetWithGamblezen = new Set(["gamblezen.com"]);

    const result = await classifyMail({
      mail: {
        senderEmail: "bounces+user=gamblezen.com@em.sendgrid.net",
        senderName: "Gamble Zen",
        subject: "Dein exklusives Angebot wartet",
      },
      blockedDomainSet: domainSetWithGamblezen,
      groqApiKey: "", // Layer 4 disabled
    });

    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("relay-decoded");
    expect(result.relayDecodedDomain).toBe("gamblezen.com");
  });

  it("Gamblezen-Beispiel ohne Blocklist-Entry → Layer 2.5 Hard-Block via Brand+Random", async () => {
    // Without a block-list entry the brand + random-token rule still blocks.
    const result = await classifyMail({
      mail: {
        senderEmail: "hq3a91_7xmpl2@em.sendgrid.net",
        senderName: "Gamble Zen", // brand match via display name
        subject: "Dein exklusives Angebot wartet",
      },
      blockedDomainSet: emptyDomainSet,
      groqApiKey: "", // Layer 4 disabled
    });

    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("brand+random");
    expect(result.features.brandMatch).toBe(true);
    expect(result.features.randomTokens).toBe(true);
  });

  // ─── Example 2: BetandPlay via relay ──────────────────────────────────────
  it("BetandPlay-Beispiel: track.user=betandplay.com@mailchimp.com → Layer 2 Relay-Decoded-Block", async () => {
    const domainSetWithBetandPlay = new Set(["betandplay.com"]);

    const result = await classifyMail({
      mail: {
        senderEmail: "track.user=betandplay.com@mailchimp.com",
        senderName: "BetandPlay",
        subject: "100€ Willkommensbonus — Nur heute!",
      },
      blockedDomainSet: domainSetWithBetandPlay,
      groqApiKey: "",
    });

    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("relay-decoded");
    expect(result.relayDecodedDomain).toBe("betandplay.com");
  });

  it("BetandPlay-Beispiel ohne Blocklist-Entry → Layer 2.5 Hard-Block via Brand+Random", async () => {
    const result = await classifyMail({
      mail: {
        senderEmail: "u7a2b1_offers_ref9x2z@mailchimp.com",
        senderName: "BetandPlay", // brand match via display name
        subject: "100€ Willkommensbonus",
      },
      blockedDomainSet: emptyDomainSet,
      groqApiKey: "",
    });

    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("brand+random");
    expect(result.features.brandMatch).toBe(true);
    expect(result.features.randomTokens).toBe(true);
  });

  // ─── Layer 1: whitelist ───────────────────────────────────────────────────
  it("Whitelist-Treffer: 'wettervorhersage' im Betreff → PASS", async () => {
    const result = await classifyMail({
      mail: {
        senderEmail: "service@wetter.de",
        senderName: "Wetter.de",
        subject: "Wettervorhersage für morgen",
      },
      blockedDomainSet: emptyDomainSet,
      groqApiKey: "",
    });

    expect(result.action).toBe("passed");
    expect(result.triggerSource).toBe("whitelist");
  });

  it("'wettkampf' in Betreff → PASS (kein Gambling trotz 'wette')", async () => {
    const result = await classifyMail({
      mail: {
        senderEmail: "info@sport.de",
        senderName: null,
        subject: "Wettkampf-Ergebnisse dieser Woche",
      },
      blockedDomainSet: emptyDomainSet,
      groqApiKey: "",
    });

    expect(result.action).toBe("passed");
    expect(result.triggerSource).toBe("whitelist");
  });

  // ─── Layer 2: domain hard block ───────────────────────────────────────────
  it("Domain in Blocklist → Layer 2 Hard-Block", async () => {
    const domainSet = new Set(["casinoly.com"]);

    const result = await classifyMail({
      mail: {
        senderEmail: "promo@casinoly.com",
        senderName: "Casinoly",
        subject: "Dein Bonus wartet",
      },
      blockedDomainSet: domainSet,
      groqApiKey: "",
    });

    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("domain");
    expect(result.features.domainBlocked).toBe(true);
  });

  // ─── Relay-decoded block ──────────────────────────────────────────────────
  it("Relay-Decoded: =domain.com in local-part und Domain in Blocklist → relay-decoded Block", async () => {
    const domainSet = new Set(["rabona.com"]);

    const result = await classifyMail({
      mail: {
        senderEmail: "bounce+track=rabona.com@em.sendgrid.net",
        senderName: "Rabona Casino",
        subject: "Exklusiv für dich",
      },
      blockedDomainSet: domainSet,
      groqApiKey: "",
    });

    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("relay-decoded");
    expect(result.relayDecodedDomain).toBe("rabona.com");
  });

  // ─── Layer 3: score block (no LLM) ────────────────────────────────────────
  it("Viele Signale → Score >= 80 → Hard-Block ohne LLM", async () => {
    // Casino in the sender name + jackpot in the subject + urgency + money pattern.
    // fetch is stubbed so the test can never reach the network, and so that an
    // unexpected Layer-4 call is detected instead of silently performed.
    const fetchSpy = vi.fn();
    vi.stubGlobal("fetch", fetchSpy);

    try {
      const result = await classifyMail({
        mail: {
          senderEmail: "info@spinz-casino.example",
          senderName: "Casino Jackpot Club",
          subject: "JACKPOT 500€ Freispiele — Nur heute!",
        },
        blockedDomainSet: emptyDomainSet,
        groqApiKey: "should-not-be-called",
      });

      expect(result.action).toBe("blocked");
      expect(result.triggerSource).toMatch(/^score:/);
      expect(result.score).toBeGreaterThanOrEqual(80);
      expect(fetchSpy).not.toHaveBeenCalled();
    } finally {
      vi.unstubAllGlobals();
    }
  });

  // ─── No signal → PASS ─────────────────────────────────────────────────────
  it("unauffällige Mail → PASS mit triggerSource 'no-signal'", async () => {
    const result = await classifyMail({
      mail: {
        senderEmail: "newsletter@amazon.de",
        senderName: "Amazon",
        subject: "Deine Bestellung wurde versandt",
      },
      blockedDomainSet: emptyDomainSet,
      groqApiKey: "",
    });

    expect(result.action).toBe("passed");
    expect(result.triggerSource).toBe("no-signal");
    expect(result.score).toBeLessThan(25);
  });

  // ─── Brand match without random tokens → no hard block, higher score ──────
  it("Brand-Match ohne Random-Tokens → kein Layer-2.5-Block, aber Score-Erhöhung", async () => {
    const result = await classifyMail({
      mail: {
        senderEmail: "info@betandplay.com", // plain info@, nothing random
        senderName: "BetandPlay",
        subject: "Willkommen",
      },
      blockedDomainSet: emptyDomainSet,
      groqApiKey: "",
    });

    // No Layer-2.5 hard block (no random tokens), but the brand match raises the score.
    expect(result.triggerSource).not.toBe("brand+random");
    expect(result.features.brandMatch).toBe(true);
    expect(result.features.randomTokens).toBe(false);
    expect(result.features.score).toBeGreaterThanOrEqual(35);
  });

  // ─── Feature structure of the result ──────────────────────────────────────
  it("Result-Features enthalten alle erwarteten Keys", async () => {
    const result = await classifyMail({
      mail: {
        senderEmail: "promo@example.com",
        senderName: null,
        subject: "Test",
      },
      blockedDomainSet: emptyDomainSet,
      groqApiKey: "",
    });

    const expectedKeys = [
      "score",
      "domainBlocked",
      "relayDecoded",
      "brandMatch",
      "randomTokens",
      "keywordHitsSubject",
      "keywordHitsDomain",
      "keywordHitsName",
      "styleFlags",
      "whitelistHit",
    ];
    for (const key of expectedKeys) {
      expect(result.features).toHaveProperty(key);
    }
  });
});