feat(mail): multi-layer classifier — Brand+Random, Relay-Decoder, Score, Groq + ML-Sampling

Layer 0–4 Klassifikations-Pipeline in mail-classifier.ts:
- Layer 2: Domain-Hard-Block + Relay-Decoder (=domain.tld aus SendGrid/Mailchimp-Bounces)
- Layer 2.5: Brand+Random-Token-Hard-Block (Gambling-Brand-Normalisierung + Random-Token-Detection)
  verhindert LLM-Call für bekannte Gambling-Relayer (Gamblezen, BetandPlay etc.)
- Layer 3: Score 0–100 (TS-Gewichte: Domain-Keywords, Subject-Keywords, Name-Match,
  Geld-Pattern, Urgency, All-Caps, Short-Random-Domain, Brand/Random-Ergänzungen)
- Layer 4: Groq Llama 3.3 70B Borderline-Klassifikation (Score 25–75)
  mit Local-Part-Redaction (DSGVO: nur behalten wenn local-part selbst Keyword enthält)
- Layer 5: MailClassificationSample-Insert nach jeder Klassifikation (ML-Phase 3)

Migrations:
- 20260514_add_mail_blocked_trigger_source: ADD COLUMN trigger_source auf mail_blocked
- 20260514_add_mail_classification_sample: CREATE TABLE mail_classification_samples

50 neue Tests (mail-classifier.test.ts): alle Layer, beide Screenshot-Beispiele (Gamblezen +
BetandPlay) bestätigt als Layer-2.5-Hard-Block ohne LLM-Call, Whitelist, Score, Redaction.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
chahinebrini 2026-05-14 08:19:51 +02:00
parent c218287c5e
commit bdd93668ae
8 changed files with 1451 additions and 81 deletions

View File

@ -0,0 +1,18 @@
-- Migration: add_mail_blocked_trigger_source
-- Adds the column trigger_source to mail_blocked. It records which
-- classification layer caused the block (Layer 2 "domain", Layer 2.5
-- "brand+random", Layer 3 "score:NN", Layer 4 "llm:0.XX").
--
-- Breaking-change status: NONE.
-- The column is nullable: all existing rows get NULL (= "unknown", written
-- before this migration). No backfill is required; historical rows stay NULL.
--
-- Deploy: automatically via GitHub Actions (pnpm prisma migrate deploy)
ALTER TABLE "rebreak"."mail_blocked"
    ADD COLUMN "trigger_source" VARCHAR(64);
-- Partial index for later analysis (e.g. "how many LLM blocks vs. domain
-- blocks?"). Rows with NULL trigger_source are left out of the index.
-- NOTE(review): the Prisma model hunk for MailBlocked shows no matching index
-- declaration for this column; confirm this does not show up as schema drift.
CREATE INDEX "mail_blocked_trigger_source_idx"
    ON "rebreak"."mail_blocked" ("trigger_source")
    WHERE "trigger_source" IS NOT NULL;

View File

@ -0,0 +1,72 @@
-- Migration: add_mail_classification_sample
-- New table for ML phase 3: one classification sample per analysed mail.
-- Stores features + outcomes for future fine-tuning and model evaluation.
--
-- GDPR notes:
--   - NO mail body is stored (data minimisation, Art. 5(1)(c) GDPR).
--   - subject + sender_name are detection signals, not narrative content.
--     The corresponding mail_blocked rows are flushed after 24h; samples are
--     kept longer (research purpose).
--     NOTE(review): the table still stores subject and sender_name verbatim,
--     so it is not reduced to domain/score features only. Confirm the
--     retention policy for these two columns.
--   - Erasure on user deletion (Art. 17 GDPR): this migration declares NO
--     foreign key on user_id, so the database does not cascade deletes on its
--     own. Samples have to be removed by the account-deletion routine (or by a
--     trigger).
--     TODO: verify that the account-deletion flow deletes the user's samples.
--
-- Breaking-change status: NONE.
-- New table, no impact on existing queries.
--
-- Deploy: automatically via GitHub Actions (pnpm prisma migrate deploy)
CREATE TABLE "rebreak"."mail_classification_samples" (
    "id" TEXT NOT NULL,
    "user_id" UUID NOT NULL,
    "connection_id" UUID,
    -- Raw features (detection signals; no mail body)
    "sender_name" VARCHAR(255),
    "sender_domain" VARCHAR(255),
    "relay_decoded_domain" VARCHAR(255),
    "subject" VARCHAR(998), -- 998 = RFC 5322 max line length (the RFC defines no dedicated subject limit)
    -- Computed features (score components as JSON)
    "features" JSONB NOT NULL DEFAULT '{}',
    -- Outcome
    "final_action" TEXT NOT NULL, -- "blocked" | "passed"
    "trigger_source" TEXT NOT NULL, -- "domain" | "relay-decoded" | "brand+random" | "score:NN" | "llm:0.XX" | "whitelist" | "no-signal"
    -- Groq verdict (NULL when Layer 4 did not run)
    "groq_is_gambling" BOOLEAN,
    "groq_confidence" DOUBLE PRECISION,
    "groq_reason" TEXT,
    -- User feedback for later active learning (initially NULL)
    "user_feedback" TEXT, -- NULL | "correct" | "false-positive" | "false-negative"
    "feedback_at" TIMESTAMPTZ,
    "created_at" TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    CONSTRAINT "mail_classification_samples_pkey" PRIMARY KEY ("id")
);
-- Core indexes
CREATE INDEX "mail_classification_samples_user_idx"
    ON "rebreak"."mail_classification_samples" ("user_id");
CREATE INDEX "mail_classification_samples_created_idx"
    ON "rebreak"."mail_classification_samples" ("created_at");
-- Compound index for later analysis queries on outcome + trigger
-- (e.g. "LLM block rate"). Queries that filter on user_feedback or on a date
-- range are not covered by this index.
CREATE INDEX "mail_classification_samples_action_trigger_idx"
    ON "rebreak"."mail_classification_samples" ("final_action", "trigger_source");
-- CHECK constraints for data quality
ALTER TABLE "rebreak"."mail_classification_samples"
    ADD CONSTRAINT "mail_classification_samples_action_check"
    CHECK ("final_action" IN ('blocked', 'passed'));
-- user_feedback stays optional, but when set it must be one of the documented
-- labels so that later active-learning queries can rely on the value set.
ALTER TABLE "rebreak"."mail_classification_samples"
    ADD CONSTRAINT "mail_classification_samples_user_feedback_check"
    CHECK (
        "user_feedback" IS NULL
        OR "user_feedback" IN ('correct', 'false-positive', 'false-negative')
    );
-- Note: connection_id deliberately has no FOREIGN KEY to mail_connections,
-- because the connection can be deleted before the sample (e.g. user
-- disconnect). The column is nullable and orphan-safe.

View File

@ -638,6 +638,9 @@ model MailBlocked {
subject String
receivedAt DateTime @map("received_at")
action String
/// Welcher Layer die Blockierung ausgelöst hat (z.B. "domain", "brand+random", "score:85", "llm:0.92").
/// NULL für ältere Einträge (vor Migration 20260514).
triggerSource String? @map("trigger_source") @db.VarChar(64)
createdAt DateTime @default(now()) @map("created_at")
connection MailConnection @relation(fields: [connectionId], references: [id], onDelete: Cascade)
@ -647,6 +650,46 @@ model MailBlocked {
@@schema("rebreak")
}
/// Classification samples for ML phase 3 (future fine-tuning / model evaluation).
/// Holds the features + outcome of each mail classification.
/// NO mail body — only metadata (sender domain, subject, score components).
/// NOTE(review): this model declares no relation on userId, so neither Prisma nor
/// the database cascades deletes when a user is removed. Erasure (Art. 17 GDPR)
/// has to be done by application code — confirm the account-deletion flow covers it.
model MailClassificationSample {
  id String @id @default(cuid())
  userId String @map("user_id") @db.Uuid
  connectionId String? @map("connection_id") @db.Uuid
  // Raw features (what was analysed):
  senderName String? @map("sender_name") @db.VarChar(255)
  senderDomain String? @map("sender_domain") @db.VarChar(255)
  relayDecodedDomain String? @map("relay_decoded_domain") @db.VarChar(255)
  subject String? @db.VarChar(998) // 998 = RFC 5322 max line length
  // Computed features (score components as JSON):
  features Json // { score, brandMatch, randomTokens, keywordHits, styleFlags, … }
  // Outcome:
  finalAction String @map("final_action") // "blocked" | "passed"
  triggerSource String @map("trigger_source") // "domain", "relay-decoded", "brand+random", "score:NN", "llm:0.XX", "whitelist", "no-signal"
  // Groq verdict (only set when Layer 4 ran):
  groqIsGambling Boolean? @map("groq_is_gambling")
  groqConfidence Float? @map("groq_confidence")
  groqReason String? @map("groq_reason") @db.Text
  // User feedback (for later use):
  userFeedback String? @map("user_feedback") // null | "correct" | "false-positive" | "false-negative"
  feedbackAt DateTime? @map("feedback_at")
  createdAt DateTime @default(now()) @map("created_at")
  @@index([userId])
  @@index([createdAt])
  @@index([finalAction, triggerSource])
  @@map("mail_classification_samples")
  @@schema("rebreak")
}
/// Permanente Aggregat-Statistiken blockierter Mails pro Tag + Connection.
/// Befüllt live beim Scan (vor dem 24h-Cleanup von mail_blocked).
/// Enthält KEINE Mail-Inhalte — nur counts/dates (Datenminimierung Art. 5 DSGVO).

View File

@ -6,21 +6,22 @@ import {
insertMailBlocked,
upsertMailBlockedStat,
updateMailConnectionScanStats,
insertMailClassificationSample,
} from "../../db/mail";
import { getBlocklistedDomainsSet } from "../../db/domains";
import { getProfile } from "../../db/profile";
import { getPlanLimits } from "../../utils/plan-features";
import { resolveProviderMeta } from "../../utils/imap-providers";
import { resolveImapAuth } from "../../utils/mail-auth";
// Single-Source-of-Truth (Mo's Finding #4)
// @ts-expect-error — .mjs ohne types, GAMBLING_KEYWORDS ist string[]
import { GAMBLING_KEYWORDS } from "../../utils/gambling-keywords.mjs";
import { classifyMail } from "../../utils/mail-classifier";
/**
* POST /api/mail/scan-internal
* Called by cron or IMAP proxy. Scans ALL mailbox folders.
* Free: only custom domains + keywords. Pro/Legend: global blocklist + custom.
*
* Klassifikations-Pipeline: Layer 0–4 via mail-classifier.ts.
* Layer 5 (Sample-Capture): nach jeder Klassifikation.
*/
export default defineEventHandler(async (event) => {
const secret = getHeader(event, "x-admin-secret");
@ -44,7 +45,7 @@ export default defineEventHandler(async (event) => {
if (skippedNoConsent > 0) {
console.log(
`[scan-internal] skipping ${skippedNoConsent} connections for userId=${userId} — no consent_at (pending re-consent)`,
`[scan-internal] skipping ${skippedNoConsent} connections — no consent_at (pending re-consent)`,
);
}
@ -53,8 +54,6 @@ export default defineEventHandler(async (event) => {
}
// Plan-aware blocklist
// Grace-Period: wenn globalBlocklistGraceUntil noch in der Zukunft liegt,
// behandeln wir den User als 'full' auch wenn sein Plan 'curated' sagt.
const profile = await getProfile(userId);
const limits = getPlanLimits(profile?.plan ?? "free");
const inGrace =
@ -64,20 +63,15 @@ export default defineEventHandler(async (event) => {
await deleteOldMailBlocked(userId);
// Groq API Key aus runtimeConfig (Infisical-injiziert)
const config = useRuntimeConfig(event);
const groqApiKey: string = (config.groqApiKey as string) || process.env.GROQ_API_KEY || "";
const msClientId: string = (config.msOauthClientId as string) || process.env.MS_OAUTH_CLIENT_ID || "";
let totalScanned = 0;
let totalBlocked = 0;
// scan-internal läuft im Cron-Context (kein User-Event). useRuntimeConfig(event)
// funktioniert hier weil event die Admin-Auth-Request-Referenz ist. Falls der
// Daemon triggerScan() direkt ohne echten HTTP-Request aufruft, fällt der
// process.env-Fallback ein — beide Quellen zeigen auf dieselbe Azure Client-ID.
const config = useRuntimeConfig(event);
const msClientId: string = config.msOauthClientId as string || process.env.MS_OAUTH_CLIENT_ID || "";
for (const connection of eligibleConnections) {
// resolveImapAuth() wählt automatisch den richtigen Auth-Pfad:
// oauth2_microsoft → Access-Token (mit proaktivem Refresh falls abgelaufen)
// alle anderen → App-Password decrypt
let imapAuth: { user: string; accessToken: string } | { user: string; pass: string };
try {
imapAuth = await resolveImapAuth(connection, msClientId);
@ -85,8 +79,6 @@ export default defineEventHandler(async (event) => {
continue;
}
// useStarttls=true → STARTTLS (secure=false + requireTLS=true)
// rejectUnauthorized=false → self-signed Certs zulassen (nur Custom-IMAP)
const useImplicitTls = !connection.useStarttls;
const imap = new ImapFlow({
host: connection.imapHost,
@ -104,7 +96,6 @@ export default defineEventHandler(async (event) => {
try {
await imap.connect();
// Scan ALL mailbox folders (not just hardcoded list)
const mailboxes = await imap.list();
const scannable = mailboxes.filter(
(mb: any) => !mb.flags?.has("\\Noselect"),
@ -137,24 +128,22 @@ export default defineEventHandler(async (event) => {
const allUids = allMessages.map(
(m: any) => `${mb.path}:${String(m.uid ?? m.seq)}`,
);
const [blockedDomainSet, alreadyBlockedSet] = await Promise.all([
getBlocklistedDomainsSet(
allMessages
.map(
(m: any) =>
(m.envelope?.from?.[0]?.address ?? "")
.toLowerCase()
.split("@")[1] ?? "",
// Alle Sender-Domains sammeln für Blocklist-Lookup
const senderDomains = allMessages
.map((m: any) =>
((m.envelope?.from?.[0]?.address ?? "").toLowerCase().split("@")[1] ?? ""),
)
.filter(Boolean),
userId,
includeGlobal,
),
.filter(Boolean);
const [blockedDomainSet, alreadyBlockedSet] = await Promise.all([
getBlocklistedDomainsSet(senderDomains, userId, includeGlobal),
getAlreadyBlockedUidSet(allUids, userId),
]);
const toInsert: Parameters<typeof insertMailBlocked>[0] = [];
const uidsToDelete: string[] = [];
const sampleInserts: Parameters<typeof insertMailClassificationSample>[0][] = [];
for (const msg of allMessages) {
const from = msg.envelope?.from?.[0];
@ -164,18 +153,34 @@ export default defineEventHandler(async (event) => {
const msgDate = msg.envelope?.date ?? new Date();
const uid = `${mb.path}:${String(msg.uid ?? msg.seq)}`;
const haystack = `${senderEmail} ${subject}`.toLowerCase();
const isGamblingKeyword = GAMBLING_KEYWORDS.some((kw) =>
haystack.includes(kw),
);
const senderDomain = senderEmail.split("@")[1] ?? "";
const isBlocklisted = senderDomain
? blockedDomainSet.has(senderDomain)
: false;
if (!isGamblingKeyword && !isBlocklisted) continue;
// Layer 0: Already blocked → skip, kein Sample
if (alreadyBlockedSet.has(uid)) continue;
const result = await classifyMail({
mail: { senderEmail, senderName, subject },
blockedDomainSet,
groqApiKey,
});
// Layer 5: Sample-Capture (immer, außer Layer 0)
const senderDomain = senderEmail.split("@")[1] ?? null;
sampleInserts.push({
userId,
connectionId: connection.id,
senderName: senderName?.slice(0, 255) ?? null,
senderDomain: senderDomain?.slice(0, 255) ?? null,
relayDecodedDomain: result.relayDecodedDomain?.slice(0, 255) ?? null,
subject: subject.slice(0, 998) || null,
features: result.features as unknown as Record<string, unknown>,
finalAction: result.action,
triggerSource: result.triggerSource,
groqIsGambling: result.groq?.isGambling ?? null,
groqConfidence: result.groq?.confidence ?? null,
groqReason: result.groq?.reason ?? null,
});
if (result.action !== "blocked") continue;
uidsToDelete.push(String(msg.uid));
toInsert.push({
userId,
@ -186,6 +191,7 @@ export default defineEventHandler(async (event) => {
subject: subject.slice(0, 200) || "(kein Betreff)",
receivedAt: msgDate,
action: "deleted",
triggerSource: result.triggerSource,
});
newlyBlocked++;
}
@ -212,7 +218,13 @@ export default defineEventHandler(async (event) => {
await insertMailBlocked(toInsert);
// Aggregat-Stats aktualisieren (vor 24h-Cleanup resistent)
// Samples fire-and-forget (kein Scan-Result abhängig davon)
if (sampleInserts.length > 0) {
Promise.all(sampleInserts.map((s) => insertMailClassificationSample(s))).catch((err) => {
console.warn("[scan-internal] sample insert failed (non-fatal):", err);
});
}
if (toInsert.length > 0) {
const providerMeta = resolveProviderMeta(connection.imapHost);
await upsertMailBlockedStat({

View File

@ -6,21 +6,22 @@ import {
insertMailBlocked,
upsertMailBlockedStat,
updateMailConnectionScanStats,
insertMailClassificationSample,
} from "../../db/mail";
import { getBlocklistedDomainsSet } from "../../db/domains";
import { getProfile } from "../../db/profile";
import { getPlanLimits } from "../../utils/plan-features";
import { resolveProviderMeta } from "../../utils/imap-providers";
import { resolveImapAuth } from "../../utils/mail-auth";
// Single-Source-of-Truth (Mo's Finding #4)
// @ts-expect-error — .mjs ohne types, GAMBLING_KEYWORDS ist string[]
import { GAMBLING_KEYWORDS } from "../../utils/gambling-keywords.mjs";
import { classifyMail } from "../../utils/mail-classifier";
/**
* POST /api/mail/scan
* Scannt ALLE Ordner (INBOX, Spam, Papierkorb, All Mail …) nach Gambling-Mails.
* Free-User: nur eigene Domains + Keywords. Pro/Legend: globale Blocklist + eigene.
*
* Klassifikations-Pipeline: Layer 0–4 via mail-classifier.ts.
* Layer 5 (Sample-Capture): nach jeder Klassifikation.
*/
export default defineEventHandler(async (event) => {
const user = await requireUser(event);
@ -46,7 +47,6 @@ export default defineEventHandler(async (event) => {
// Plan-aware: Free users get only custom domains, Pro/Legend get global blocklist
const profile = await getProfile(user.id);
const limits = getPlanLimits(profile?.plan ?? "free");
// Grace-Period berücksichtigen
const inGrace =
profile?.globalBlocklistGraceUntil != null &&
new Date(profile.globalBlocklistGraceUntil) > new Date();
@ -54,16 +54,14 @@ export default defineEventHandler(async (event) => {
await deleteOldMailBlocked(user.id);
const config = useRuntimeConfig(event);
const groqApiKey: string = (config.groqApiKey as string) || process.env.GROQ_API_KEY || "";
const msClientId: string = (config.msOauthClientId as string) || process.env.MS_OAUTH_CLIENT_ID || "";
let totalScanned = 0;
let totalBlocked = 0;
const config = useRuntimeConfig(event);
const msClientId: string = config.msOauthClientId as string || process.env.MS_OAUTH_CLIENT_ID || "";
for (const connection of eligibleConnections) {
// resolveImapAuth() wählt automatisch den richtigen Auth-Pfad:
// oauth2_microsoft → Access-Token (mit proaktivem Refresh falls abgelaufen)
// alle anderen → App-Password decrypt
let imapAuth: { user: string; accessToken: string } | { user: string; pass: string };
try {
imapAuth = await resolveImapAuth(connection, msClientId);
@ -71,8 +69,6 @@ export default defineEventHandler(async (event) => {
continue;
}
// useStarttls=true → STARTTLS (secure=false + requireTLS=true)
// rejectUnauthorized=false → self-signed Certs zulassen (nur Custom-IMAP)
const useImplicitTls = !connection.useStarttls;
const imap = new ImapFlow({
host: connection.imapHost,
@ -90,7 +86,6 @@ export default defineEventHandler(async (event) => {
try {
await imap.connect();
// Scan ALL mailbox folders (not just hardcoded list)
const mailboxes = await imap.list();
const scannable = mailboxes.filter(
(mb: any) => !mb.flags?.has("\\Noselect"),
@ -120,24 +115,21 @@ export default defineEventHandler(async (event) => {
const allUids = allMessages.map(
(m: any) => `${mb.path}:${String(m.uid ?? m.seq)}`,
);
const [blockedDomainSet, alreadyBlockedSet] = await Promise.all([
getBlocklistedDomainsSet(
allMessages
.map(
(m: any) =>
(m.envelope?.from?.[0]?.address ?? "")
.toLowerCase()
.split("@")[1] ?? "",
const senderDomains = allMessages
.map((m: any) =>
((m.envelope?.from?.[0]?.address ?? "").toLowerCase().split("@")[1] ?? ""),
)
.filter(Boolean),
user.id,
includeGlobal,
),
.filter(Boolean);
const [blockedDomainSet, alreadyBlockedSet] = await Promise.all([
getBlocklistedDomainsSet(senderDomains, user.id, includeGlobal),
getAlreadyBlockedUidSet(allUids, user.id),
]);
const toInsert: Parameters<typeof insertMailBlocked>[0] = [];
const uidsToDelete: string[] = [];
const sampleInserts: Parameters<typeof insertMailClassificationSample>[0][] = [];
for (const msg of allMessages) {
const from = msg.envelope?.from?.[0];
@ -147,18 +139,34 @@ export default defineEventHandler(async (event) => {
const msgDate = msg.envelope?.date ?? new Date();
const uid = `${mb.path}:${String(msg.uid ?? msg.seq)}`;
const haystack = `${senderEmail} ${subject}`.toLowerCase();
const isGamblingKeyword = GAMBLING_KEYWORDS.some((kw) =>
haystack.includes(kw),
);
const senderDomain = senderEmail.split("@")[1] ?? "";
const isBlocklisted = senderDomain
? blockedDomainSet.has(senderDomain)
: false;
if (!isGamblingKeyword && !isBlocklisted) continue;
// Layer 0: Already blocked → skip, kein Sample
if (alreadyBlockedSet.has(uid)) continue;
const result = await classifyMail({
mail: { senderEmail, senderName, subject },
blockedDomainSet,
groqApiKey,
});
// Layer 5: Sample-Capture (immer, außer Layer 0)
const senderDomain = senderEmail.split("@")[1] ?? null;
sampleInserts.push({
userId: user.id,
connectionId: connection.id,
senderName: senderName?.slice(0, 255) ?? null,
senderDomain: senderDomain?.slice(0, 255) ?? null,
relayDecodedDomain: result.relayDecodedDomain?.slice(0, 255) ?? null,
subject: subject.slice(0, 998) || null,
features: result.features as unknown as Record<string, unknown>,
finalAction: result.action,
triggerSource: result.triggerSource,
groqIsGambling: result.groq?.isGambling ?? null,
groqConfidence: result.groq?.confidence ?? null,
groqReason: result.groq?.reason ?? null,
});
if (result.action !== "blocked") continue;
uidsToDelete.push(String(msg.uid));
toInsert.push({
userId: user.id,
@ -169,11 +177,11 @@ export default defineEventHandler(async (event) => {
subject: subject.slice(0, 200) || "(kein Betreff)",
receivedAt: msgDate,
action: "deleted",
triggerSource: result.triggerSource,
});
newlyBlocked++;
}
// Permanently delete gambling mails from this folder
if (uidsToDelete.length > 0) {
try {
await imap.messageDelete(uidsToDelete.join(","), { uid: true });
@ -193,7 +201,13 @@ export default defineEventHandler(async (event) => {
await insertMailBlocked(toInsert);
// Aggregat-Stats aktualisieren (vor 24h-Cleanup resistent)
// Samples fire-and-forget
if (sampleInserts.length > 0) {
Promise.all(sampleInserts.map((s) => insertMailClassificationSample(s))).catch((err) => {
console.warn("[scan] sample insert failed (non-fatal):", err);
});
}
if (toInsert.length > 0) {
const providerMeta = resolveProviderMeta(connection.imapHost);
await upsertMailBlockedStat({

View File

@ -183,6 +183,7 @@ export async function insertMailBlocked(
subject: string;
receivedAt: Date;
action: string;
triggerSource?: string | null;
}[],
) {
if (entries.length === 0) return;
@ -190,6 +191,42 @@ export async function insertMailBlocked(
await db.mailBlocked.createMany({ data: entries, skipDuplicates: true });
}
// ─── MailClassificationSample ─────────────────────────────────────────────────
/**
 * Persists one classification sample row (ML phase 3) through Prisma.
 *
 * The scan handlers call this for every classified mail; mails skipped as
 * "already blocked" (Layer 0) produce no sample.
 *
 * GDPR: only features are stored, never the mail body. Subject and sender are
 * kept as detection signals.
 * NOTE(review): no relation/foreign key on userId is visible for this table, so
 * deletion on user erasure (Art. 17) presumably has to happen in application
 * code — confirm.
 *
 * @param entry Column values for the new row. `features` is any JSON-serialisable
 *              object; it is normalised to a plain JSON value before the insert.
 * @returns Resolves once the row has been created; rejects if Prisma rejects.
 */
export async function insertMailClassificationSample(entry: {
  userId: string;
  connectionId: string | null;
  senderName: string | null;
  senderDomain: string | null;
  relayDecodedDomain: string | null;
  subject: string | null;
  // `features` targets a Prisma Json column, whose input type does not accept a
  // plain Record; it is therefore round-tripped through JSON below.
  features: Record<string, unknown>;
  finalAction: string;
  triggerSource: string;
  groqIsGambling?: boolean | null;
  groqConfidence?: number | null;
  groqReason?: string | null;
}) {
  const prisma = usePrisma();
  const { features, ...columns } = entry;
  // Serialise + parse yields a plain JSON value that Prisma accepts for Json fields.
  // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
  const plainFeatures = JSON.parse(JSON.stringify(features));
  await prisma.mailClassificationSample.create({
    // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
    data: { ...columns, features: plainFeatures },
  });
}
/**
* Gibt alle MailConnections eines Users zurück bei denen consent_at noch NULL ist.
* Wird vom pending-consent.get.ts Endpoint für den Re-Consent-Modal-Trigger genutzt.

View File

@ -0,0 +1,657 @@
/**
* Mail-Klassifikations-Pipeline (Layer 0–4 + Sample-Capture).
*
* Architektur:
* Layer 0 Skip-Guard (bereits geblockt / kein Consent)
* Layer 1 Whitelist (wetter, wettkampf …) → PASS
* Layer 2 Domain-Hard-Block (Blocklist)
* Layer 2.5 Brand+Random-Token-Detection (Hard-Block ohne LLM)
* Layer 3 Score 0–100 (deterministisch)
* Layer 4 Groq-Borderline (Score 25–75, mit Local-Part-Redact)
* Layer 5 MailClassificationSample-Insert (immer, außer Layer 0)
*
* Alle Layer-Logiken sind pure Funktionen — vollständig unit-testbar ohne DB-Mocks.
*
* DSGVO-Hinweise:
* - Mail-Inhalte (Body) werden nie persistiert (Art. 9).
* - Local-Part der Sender-Adresse wird vor dem Groq-Call redacted
* (es sei denn, er enthält selbst Casino-Keywords — dann ist er Detection-Signal).
* - userId in Logs nur wenn absolut nötig (Datenminimierung Art. 5).
* - MailClassificationSample: Cascade-Delete via userId-Relation (Art. 17).
*/
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore — .mjs ohne types, Exports sind string[]
import { GAMBLING_KEYWORDS, GAMBLING_WHITELIST } from "./gambling-keywords.mjs";
// ─── Typen ─────────────────────────────────────────────────────────────────────
/** Final outcome of a classification: the mail is either blocked or let through. */
export type ClassificationAction = "blocked" | "passed";
/**
 * Label for what decided the outcome.
 * `score:<n>` embeds a number and `llm:<text>` a free-form string; presumably the
 * score value and the LLM confidence respectively — confirm against the code that
 * builds these labels.
 */
export type TriggerSource =
  | "domain"
  | "relay-decoded"
  | "brand+random"
  | `score:${number}`
  | `llm:${string}`
  | "whitelist"
  | "no-signal";
/** The mail metadata a classification works on (no body). */
export interface MailInput {
  /** Sender e-mail address (expected lowercase, as delivered by IMAP) */
  senderEmail: string;
  /** Display name of the sender (may be empty or null) */
  senderName: string | null;
  /** Subject line */
  subject: string;
}
/** Result of classifying one mail. */
export interface ClassificationResult {
  /** Whether the mail was blocked or passed. */
  action: ClassificationAction;
  /** What decided the outcome. */
  triggerSource: TriggerSource;
  /** Numeric score; presumably the Layer 3 score in the range 0–100 — confirm. */
  score: number;
  /** Real domain extracted from a relay address (e.g. gamblezen.com), or null */
  relayDecodedDomain: string | null;
  /** Groq verdict (only present when Layer 4 ran) */
  groq?: {
    isGambling: boolean;
    confidence: number;
    reason: string;
  };
  /** Score components, stored as MailClassificationSample.features */
  features: ClassificationFeatures;
}
/**
 * Feature record of one classification (persisted as JSON in the sample table).
 * Field meanings below follow the field names and the score computation in this
 * file; how the non-score flags are filled is not visible here — confirm.
 */
export interface ClassificationFeatures {
  score: number;
  domainBlocked: boolean;
  relayDecoded: boolean;
  brandMatch: boolean;
  randomTokens: boolean;
  // Matched gambling keywords per source field (subject / sender domain / display name).
  keywordHitsSubject: string[];
  keywordHitsDomain: string[];
  keywordHitsName: string[];
  // Style signals such as "money-pattern", "urgency", "all-caps", "short-random-domain".
  styleFlags: string[];
  whitelistHit: boolean;
}
// ─── Score-Weights (TS-Constants, kein Config-File-Overhead) ──────────────────
// Points added to the score per matched signal. The total is capped at 100 by computeScore.
export const SCORE_WEIGHTS = {
  // Domain indicators
  DOMAIN_GAMBLING_KEYWORD: 40, // domain contains a gambling term (bet, casino, slots …)
  DOMAIN_SHORT_RANDOM: 15, // domain root < 6 characters and random-looking (betx, 1win)
  // Subject indicators
  SUBJECT_GAMBLING_KEYWORD: 35, // keyword in the subject (casino, jackpot, freispiel …)
  SUBJECT_MONEY_PATTERN: 20, // currency sign + number (e.g. "100€ Bonus")
  SUBJECT_URGENCY: 15, // urgency phrases such as "Nur heute", "Letzte Chance", "Ablaufdatum"
  SUBJECT_ALL_CAPS_WORD: 5, // a single ALL-CAPS word in the subject
  // Display-name indicators
  SENDER_NAME_GAMBLING_KEYWORD: 30, // gambling term in the sender display name
  SENDER_NAME_BRAND_MATCH: 20, // name matches a known gambling brand (normalised); NOTE(review): not read by computeScore — confirm where it is used
  // Layer 2.5 score additions (when no hard block was triggered)
  BRAND_MATCH_NO_RANDOM: 35, // brand match without random tokens (no hard block)
  RANDOM_TOKENS_NO_BRAND: 10, // random tokens without brand match
} as const;
// Hard-block threshold: score >= 80 → BLOCK without an LLM call.
const SCORE_HARD_BLOCK_THRESHOLD = 80;
// Borderline range: 25–75 → Groq call.
// NOTE(review): scores 76–79 are above the borderline range but below the
// hard-block threshold; how they are handled is not visible here — confirm.
const SCORE_BORDERLINE_LOW = 25;
const SCORE_BORDERLINE_HIGH = 75;
// ─── Bekannte Gambling-Brands (für Brand-Match-Normalisierung) ─────────────────
// Derived from GAMBLING_KEYWORDS + typical blocklist domains.
// Entries are stored in normalised form: lowercase, with all separator
// characters and whitespace removed (see normalizeBrand).
const GAMBLING_BRANDS: string[] = [
  "casino", "bet365", "bwin", "tipico", "unibet", "betway", "888casino",
  "pokerstars", "interwetten", "netbet", "leovegas", "mrgreen",
  "betsson", "neobet", "mybet", "lottoland", "betano", "williamhill",
  "paddypower", "betfair", "stake", "rolletto", "vbet", "1xbet", "melbet",
  "mostbet", "luckyvibe", "spinz", "casinoly", "rabona",
  "justcasino", "getslots", "rocketplay", "freshcasino",
  "nomnomcasino", "gamblezen", "betandplay",
];
// ─── Relay-Decoder ─────────────────────────────────────────────────────────────
/**
 * Pulls the real target domain out of a relay/bounce sender address.
 *
 * Known shapes (input → result):
 *   bounces+user=example.com@sendgrid.net    → example.com
 *   track.user=gamblezen.com@mailchimp.com   → gamblezen.com
 *   a1b2c3_user_at_betandplay.com@em.em.xyz  → betandplay.com
 *   user=betandplay.com@bounce.em.example    → betandplay.com
 *
 * Only the part before the first "@" is inspected. The `=domain.tld` form is
 * tried first, then `_at_domain.tld`; the domain must be followed by a
 * separator character or the end of the local part.
 *
 * @param senderEmail Full sender address.
 * @returns The embedded domain in lowercase, or null when the address has no
 *          "@" or no embedded domain is found.
 */
export function extractRelayedDomain(senderEmail: string): string | null {
  const atIdx = senderEmail.indexOf("@");
  if (atIdx === -1) return null;
  const localPart = senderEmail.slice(0, atIdx);

  // Tried in order; the first pattern that matches wins.
  const relayPatterns: RegExp[] = [
    // user=domain.tld (SendGrid, Mailchimp, SES bounces)
    /=([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[+_&]|$)/i,
    // _at_domain.tld (less common, some custom relay setups)
    /_at_([a-z0-9][\w-]*\.[a-z]{2,}(?:\.[a-z]{2,})?)(?:[_+]|$)/i,
  ];

  for (const pattern of relayPatterns) {
    const hit = pattern.exec(localPart);
    if (hit) return hit[1].toLowerCase();
  }
  return null;
}
// ─── Brand-Normalisierung ──────────────────────────────────────────────────────
/**
 * Normalises a string for brand comparisons: lowercases it and removes all
 * whitespace, hyphens, dots and underscores.
 *
 *   "BetandPlay"       → "betandplay"
 *   "bet-and-play.com" → "betandplaycom"  (the TLD is NOT stripped here; pass
 *                                          only the domain root to get "betandplay")
 *
 * @param s Raw name or domain.
 * @returns The normalised string (may be empty).
 */
export function normalizeBrand(s: string): string {
  const separators = /[\s\-._]/g;
  const lowered = s.toLowerCase();
  return lowered.replace(separators, "");
}
/**
 * Tells whether an already-normalised string contains one of the known
 * gambling brands from GAMBLING_BRANDS.
 *
 * Inputs shorter than 4 characters never match, to avoid false positives
 * ("bet" alone is too short). Matching is by substring, so an exact match is
 * covered as well.
 * NOTE(review): substring matching also hits unrelated words that contain a
 * brand (e.g. "mistake" contains "stake") — confirm this is acceptable.
 *
 * @param normalized String in the form produced by normalizeBrand.
 * @returns true when any brand occurs inside `normalized`.
 */
export function matchesGamblingBrand(normalized: string): boolean {
  if (normalized.length < 4) return false;
  for (const brand of GAMBLING_BRANDS) {
    if (normalized.includes(brand)) return true;
  }
  return false;
}
/**
 * Builds the brand candidates of a domain for the brand-match check.
 *
 * Returns two normalised strings: the first label of the domain and the
 * whole domain.
 *   "bet-and-play.com" → ["betandplay", "betandplaycom"]
 *
 * @param domain Sender domain (e.g. "bet-and-play.com").
 * @returns `[normalised first label, normalised full domain]`.
 */
function domainToBrandCandidates(domain: string): string[] {
  const [firstLabel = ""] = domain.split(".");
  return [firstLabel, domain].map((part) => normalizeBrand(part));
}
// ─── Random-Token-Detection ───────────────────────────────────────────────────
/**
 * Detects random-looking tokens in the local part of an e-mail address.
 *
 * The local part is split on runs of `_`, `-`, `.` and `+`. A token counts as
 * "random" when it is at least 6 characters long and contains both a letter
 * and a digit. A local part with two or more such tokens is considered
 * random-looking — typical for bulk mailers with trackable user IDs.
 *
 * Plain function words (info, admin, noreply, support …) need no separate
 * stop list: they contain no digit and therefore never pass the letter+digit
 * test. The previous per-call stop-list Set could never exclude anything and
 * was removed.
 *
 * @param localPart Part of the address before the "@".
 * @returns true when at least two random-looking tokens are present.
 */
export function hasRandomTokens(localPart: string): boolean {
  let randomLooking = 0;
  for (const token of localPart.split(/[_\-.+]+/)) {
    if (token.length < 6) continue;
    // Must mix letters and digits.
    if (!/[a-z]/i.test(token) || !/[0-9]/.test(token)) continue;
    randomLooking += 1;
    // Two hits are enough; no need to look at the remaining tokens.
    if (randomLooking >= 2) return true;
  }
  return false;
}
// ─── Local-Part-Redaction ─────────────────────────────────────────────────────
/**
 * Masks the local part of a sender address before it is sent to the LLM (GDPR).
 *
 * Everything before the first "@" is replaced by "***". The address is
 * returned unchanged when it has no "@" or when the caller signals that the
 * local part itself carries a gambling keyword (e.g.
 * "casino_offers_abc123@mailer.com") — in that case it is a classification
 * signal rather than personal data.
 *
 * @param senderEmail Full sender address.
 * @param localPartHasKeyword true to keep the local part as is.
 * @returns The (possibly masked) address.
 */
export function redactLocalPartForLLM(
  senderEmail: string,
  localPartHasKeyword: boolean,
): string {
  const atIdx = senderEmail.indexOf("@");
  const keepUnchanged = localPartHasKeyword || atIdx === -1;
  return keepUnchanged ? senderEmail : `***${senderEmail.slice(atIdx)}`;
}
// ─── Score-Berechnung (Layer 3) ───────────────────────────────────────────────
/** Return value of computeScore: the capped score plus the signals that produced it. */
interface ScoreResult {
  // Total score, capped at 100 (0 on a whitelist hit).
  score: number;
  // At most one entry each: the first matching gambling keyword per field.
  keywordHitsSubject: string[];
  keywordHitsDomain: string[];
  keywordHitsName: string[];
  // Style signals found in subject/domain ("money-pattern", "urgency", "all-caps", "short-random-domain").
  styleFlags: string[];
  // true when a whitelist term matched; all other fields are then empty/zero.
  whitelistHit: boolean;
}
/**
 * Computes the deterministic gambling score (0–100) of one mail.
 *
 * Order of evaluation:
 *  1. Whitelist: if any whitelist term occurs in the lowercased subject,
 *     sender address or display name, an all-empty result with
 *     `whitelistHit: true` and score 0 is returned immediately.
 *  2. Keywords: for domain, subject and display name, the first matching
 *     gambling keyword is recorded and its weight added (one hit per field).
 *  3. Style signals on the subject/domain: money pattern, urgency phrase,
 *     all-caps word, short random-looking domain root.
 *  4. Layer 2.5 additions: points when exactly one of brand match / random
 *     tokens was found (both together add nothing here).
 *
 * @param senderEmail Sender address; the domain is the part after the first "@".
 * @param senderName Display name, may be null.
 * @param subject Subject line (case is preserved for the all-caps check).
 * @param brandMatchFound Result of the brand-match check done by the caller.
 * @param randomTokensFound Result of the random-token check done by the caller.
 * @returns Score capped at 100 together with the recorded hits and flags.
 */
export function computeScore(
  senderEmail: string,
  senderName: string | null,
  subject: string,
  brandMatchFound: boolean,
  randomTokensFound: boolean,
): ScoreResult {
  const subjectLower = subject.toLowerCase();
  const senderEmailLower = senderEmail.toLowerCase();
  const senderNameLower = (senderName ?? "").toLowerCase();
  const domain = senderEmailLower.split("@")[1] ?? "";
  const domainRoot = domain.split(".")[0] ?? "";

  // ── Whitelist check (Layer 1) ──
  const haystacks = [subjectLower, senderEmailLower, senderNameLower];
  for (const term of GAMBLING_WHITELIST as string[]) {
    if (haystacks.some((text) => text.includes(term))) {
      return {
        score: 0,
        keywordHitsSubject: [],
        keywordHitsDomain: [],
        keywordHitsName: [],
        styleFlags: [],
        whitelistHit: true,
      };
    }
  }

  // First gambling keyword contained in `text`, wrapped in a list (empty when none).
  const firstKeywordIn = (text: string): string[] => {
    for (const kw of GAMBLING_KEYWORDS as string[]) {
      if (text.includes(kw)) return [kw];
    }
    return [];
  };

  // ── Keyword hits (one per field is enough) ──
  // The domain root is a prefix of the domain, so testing the domain covers both.
  const keywordHitsDomain = firstKeywordIn(domain);
  const keywordHitsSubject = firstKeywordIn(subjectLower);
  const keywordHitsName = firstKeywordIn(senderNameLower);

  let total = 0;
  if (keywordHitsDomain.length > 0) total += SCORE_WEIGHTS.DOMAIN_GAMBLING_KEYWORD;
  if (keywordHitsSubject.length > 0) total += SCORE_WEIGHTS.SUBJECT_GAMBLING_KEYWORD;
  if (keywordHitsName.length > 0) total += SCORE_WEIGHTS.SENDER_NAME_GAMBLING_KEYWORD;

  // ── Style signals ──
  const styleFlags: string[] = [];
  const raise = (flag: string, weight: number): void => {
    styleFlags.push(flag);
    total += weight;
  };

  // Currency sign (€, $ or £) next to a digit in the subject.
  if (/[€$£]\s*\d|\d\s*[€$£]/.test(subject)) {
    raise("money-pattern", SCORE_WEIGHTS.SUBJECT_MONEY_PATTERN);
  }

  // Urgency phrases in the subject.
  const URGENCY_PATTERNS = [
    "nur heute", "letzte chance", "läuft ab", "ablaufdatum",
    "expires", "last chance", "limited time", "jetzt einlösen",
    "sofort", "nur noch", "endet heute",
  ];
  if (URGENCY_PATTERNS.some((phrase) => subjectLower.includes(phrase))) {
    raise("urgency", SCORE_WEIGHTS.SUBJECT_URGENCY);
  }

  // A word of four or more capital letters in the subject.
  if (/\b[A-Z]{4,}\b/.test(subject)) {
    raise("all-caps", SCORE_WEIGHTS.SUBJECT_ALL_CAPS_WORD);
  }

  // Domain root of at most 5 characters that mixes letters and digits.
  const rootIsShort = domainRoot.length > 0 && domainRoot.length <= 5;
  if (rootIsShort && /[a-z]/.test(domainRoot) && /[0-9]/.test(domainRoot)) {
    raise("short-random-domain", SCORE_WEIGHTS.DOMAIN_SHORT_RANDOM);
  }

  // ── Layer 2.5 score additions: exactly one of the two signals present ──
  if (brandMatchFound !== randomTokensFound) {
    total += brandMatchFound
      ? SCORE_WEIGHTS.BRAND_MATCH_NO_RANDOM
      : SCORE_WEIGHTS.RANDOM_TOKENS_NO_BRAND;
  }

  return {
    score: Math.min(total, 100),
    keywordHitsSubject,
    keywordHitsDomain,
    keywordHitsName,
    styleFlags,
    whitelistHit: false,
  };
}
// ─── Groq-LLM-Call (Layer 4) ─────────────────────────────────────────────────
/**
 * Verdict produced by the Groq LLM for a borderline mail.
 */
interface GroqVerdict {
  /** True only when the model answered with the JSON boolean `true`. */
  isGambling: boolean;
  /** Model-reported confidence, normalised to the range 0..1 (0 when missing or invalid). */
  confidence: number;
  /** Model-provided justification, truncated to 300 characters. */
  reason: string;
}

/**
 * Calls Groq Llama 3.3 70B for the borderline classification (Layer 4).
 *
 * Sends ONLY the sender name, the sender address (local part possibly
 * redacted by the caller) and the subject. No mail body, no further PII.
 *
 * The model output is treated as untrusted:
 * - `isGambling` must be the JSON boolean `true`; anything else (including
 *   the strings "true" / "false") counts as "not gambling", so a sloppy
 *   answer can never cause a false-positive block.
 * - `confidence` is clamped to 0..1; non-numeric or non-finite values become 0.
 * - Unparseable output yields a PASS verdict with reason "parse-error".
 *
 * @param params.senderName - Display name, or null when absent.
 * @param params.senderEmailRedacted - Sender address as it may be shown to the LLM.
 * @param params.subject - Subject line.
 * @param params.groqApiKey - Bearer token for the Groq API.
 * @returns The normalised verdict.
 * @throws Error when the HTTP response status is not OK; network errors raised
 *         by fetch propagate unchanged.
 */
export async function callGroqClassifier(params: {
  senderName: string | null;
  senderEmailRedacted: string;
  subject: string;
  groqApiKey: string;
}): Promise<GroqVerdict> {
  const prompt = `You are a spam classifier for a gambling addiction recovery app.
Classify whether this email is from a gambling/betting operator.
Sender name: ${params.senderName ?? "(none)"}
Sender email: ${params.senderEmailRedacted}
Subject: ${params.subject}
Respond with ONLY valid JSON in this exact format:
{"isGambling": true/false, "confidence": 0.0-1.0, "reason": "one sentence"}
Do not include any other text.`;

  const response = await fetch("https://api.groq.com/openai/v1/chat/completions", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${params.groqApiKey}`,
    },
    body: JSON.stringify({
      model: "llama-3.3-70b-versatile",
      messages: [{ role: "user", content: prompt }],
      temperature: 0,
      max_tokens: 100,
      response_format: { type: "json_object" },
    }),
  });

  if (!response.ok) {
    const errText = await response.text().catch(() => "");
    throw new Error(`Groq API error ${response.status}: ${errText.slice(0, 200)}`);
  }

  const data = await response.json() as {
    choices: { message: { content: string } }[];
  };
  const raw = data.choices?.[0]?.message?.content ?? "{}";

  try {
    const parsed = JSON.parse(raw) as Partial<Record<keyof GroqVerdict, unknown>>;
    const rawConfidence = parsed.confidence;
    const rawReason = parsed.reason;
    const confidence =
      typeof rawConfidence === "number" && Number.isFinite(rawConfidence)
        ? Math.min(1, Math.max(0, rawConfidence))
        : 0;
    return {
      // Strict check: Boolean("false") would be true and block a harmless mail.
      isGambling: parsed.isGambling === true,
      confidence,
      reason: typeof rawReason === "string" ? rawReason.slice(0, 300) : "",
    };
  } catch {
    // Unparseable model output → conservative PASS (no false positive caused by an LLM glitch).
    return { isGambling: false, confidence: 0, reason: "parse-error" };
  }
}
// ─── Haupt-Pipeline ───────────────────────────────────────────────────────────
/**
 * Input of classifyMail().
 */
export interface ClassifyMailParams {
/** The mail to classify; classifyMail() reads senderEmail, senderName and subject from it. */
mail: MailInput;
/** Set of blocked domains (from getBlocklistedDomainsSet) */
blockedDomainSet: Set<string>;
/** Groq API key (from runtimeConfig) — when empty, Layer 4 is skipped */
groqApiKey: string;
}
/**
 * Splits a lower-cased sender address into local part and domain at the LAST "@".
 * A domain can never contain "@" while a quoted local part can, so splitting at
 * the first "@" would hand the wrong domain to the blocklist lookup.
 * Without any "@" the whole input is the local part and the domain is "".
 */
function splitSenderAtLastAt(address: string): { localPart: string; domain: string } {
  const atIdx = address.lastIndexOf("@");
  if (atIdx === -1) return { localPart: address, domain: "" };
  return { localPart: address.slice(0, atIdx), domain: address.slice(atIdx + 1) };
}

/**
 * Builds the feature record for decisions taken before scoring (Layers 1–2.5):
 * no keyword hits or style flags, all boolean flags false unless overridden.
 */
function hardDecisionFeatures(
  score: number,
  flags: Partial<
    Pick<
      ClassificationResult["features"],
      "domainBlocked" | "relayDecoded" | "brandMatch" | "randomTokens" | "whitelistHit"
    >
  >,
): ClassificationResult["features"] {
  return {
    score,
    domainBlocked: false,
    relayDecoded: false,
    brandMatch: false,
    randomTokens: false,
    keywordHitsSubject: [],
    keywordHitsDomain: [],
    keywordHitsName: [],
    styleFlags: [],
    whitelistHit: false,
    ...flags,
  };
}

/**
 * Classifies a single mail by running it through all layers, in this order:
 *
 * 1.   Whitelist term in address, subject or display name → PASS.
 * 2.   Sender domain, or relay-decoded domain, in the blocklist → BLOCK.
 * 2.5  Gambling brand AND random-looking local-part tokens → BLOCK.
 * 3.   Score at or above SCORE_HARD_BLOCK_THRESHOLD → BLOCK;
 *      score below SCORE_BORDERLINE_LOW → PASS ("no-signal").
 * 4.   Score up to SCORE_BORDERLINE_HIGH and a Groq key present → LLM verdict.
 *
 * Fallback (no key, LLM failure, or a score between SCORE_BORDERLINE_HIGH and
 * the hard-block threshold): BLOCK when score >= 50, otherwise PASS, reported
 * as `score:NN`.
 *
 * The Groq call is the only external dependency. DB writes (MailBlocked,
 * MailClassificationSample) are the caller's responsibility.
 *
 * @param params - Mail, blocklist and Groq API key.
 * @returns Action, trigger source, score and the features that led to it.
 */
export async function classifyMail(params: ClassifyMailParams): Promise<ClassificationResult> {
  const { mail, blockedDomainSet, groqApiKey } = params;
  const { senderEmail, senderName, subject } = mail;
  const senderEmailLower = senderEmail.toLowerCase();
  const { localPart, domain } = splitSenderAtLastAt(senderEmailLower);

  // ── Layer 1: whitelist ──────────────────────────────────────────────────────
  const haystack = `${senderEmailLower} ${subject} ${senderName ?? ""}`.toLowerCase();
  if ((GAMBLING_WHITELIST as string[]).some((w) => haystack.includes(w))) {
    return {
      action: "passed",
      triggerSource: "whitelist",
      score: 0,
      relayDecodedDomain: null,
      features: hardDecisionFeatures(0, { whitelistHit: true }),
    };
  }

  // ── Layer 2: domain hard block ──────────────────────────────────────────────
  if (domain && blockedDomainSet.has(domain)) {
    return {
      action: "blocked",
      triggerSource: "domain",
      score: 100,
      relayDecodedDomain: null,
      features: hardDecisionFeatures(100, { domainBlocked: true }),
    };
  }

  // ── Layer 2: relay-decoded domain block ─────────────────────────────────────
  const relayDecodedDomain = extractRelayedDomain(senderEmailLower);
  if (relayDecodedDomain && blockedDomainSet.has(relayDecodedDomain)) {
    return {
      action: "blocked",
      triggerSource: "relay-decoded",
      score: 100,
      relayDecodedDomain,
      features: hardDecisionFeatures(100, { relayDecoded: true }),
    };
  }

  // ── Layer 2.5: brand + random-token hard block ──────────────────────────────
  // Brand candidates: normalised display name plus the candidates derived from
  // the sender domain and, if present, the relay-decoded domain.
  const brandCandidates = [
    normalizeBrand(senderName ?? ""),
    ...domainToBrandCandidates(domain),
    ...(relayDecodedDomain ? domainToBrandCandidates(relayDecodedDomain) : []),
  ];
  const brandMatch = brandCandidates.some((c) => c.length >= 4 && matchesGamblingBrand(c));
  const randomTokens = hasRandomTokens(localPart);
  if (brandMatch && randomTokens) {
    return {
      action: "blocked",
      triggerSource: "brand+random",
      score: 100,
      relayDecodedDomain,
      features: hardDecisionFeatures(100, {
        relayDecoded: !!relayDecodedDomain,
        brandMatch: true,
        randomTokens: true,
      }),
    };
  }

  // ── Layer 3: score ──────────────────────────────────────────────────────────
  const scoreResult = computeScore(senderEmailLower, senderName, subject, brandMatch, randomTokens);
  // Feature record shared by every decision taken from here on.
  const scoredFeatures: ClassificationResult["features"] = {
    ...scoreResult,
    domainBlocked: false,
    relayDecoded: !!relayDecodedDomain,
    brandMatch,
    randomTokens,
  };

  // Defensive: Layer 1 normally catches whitelist terms before scoring runs.
  if (scoreResult.whitelistHit) {
    return {
      action: "passed",
      triggerSource: "whitelist",
      score: 0,
      relayDecodedDomain,
      features: { ...scoredFeatures, score: 0 },
    };
  }

  const score = scoreResult.score;

  // At or above the hard-block threshold → BLOCK, no LLM
  if (score >= SCORE_HARD_BLOCK_THRESHOLD) {
    const triggerSource: TriggerSource = `score:${score}`;
    return { action: "blocked", triggerSource, score, relayDecodedDomain, features: scoredFeatures };
  }

  // Below the borderline range → PASS, no LLM
  if (score < SCORE_BORDERLINE_LOW) {
    return {
      action: "passed",
      triggerSource: "no-signal",
      score,
      relayDecodedDomain,
      features: scoredFeatures,
    };
  }

  // ── Layer 4: Groq for borderline scores ─────────────────────────────────────
  // The lower bound is already guaranteed by the early return above.
  if (score <= SCORE_BORDERLINE_HIGH && groqApiKey) {
    // Keep the local part only when it carries a gambling keyword itself
    // (localPart is already lower-cased).
    const localPartHasKeyword = (GAMBLING_KEYWORDS as string[]).some((kw: string) =>
      localPart.includes(kw),
    );
    const senderEmailRedacted = redactLocalPartForLLM(senderEmailLower, localPartHasKeyword);

    let groqVerdict: GroqVerdict | null = null;
    try {
      groqVerdict = await callGroqClassifier({
        senderName,
        senderEmailRedacted,
        subject,
        groqApiKey,
      });
    } catch (err) {
      // LLM failure must not fail the classification: continue with the score-based fallback.
      console.warn("[mail-classifier] Groq call failed, falling back to score-based decision:", err);
    }

    if (groqVerdict) {
      const action: ClassificationAction = groqVerdict.isGambling ? "blocked" : "passed";
      const triggerSource: TriggerSource = `llm:${groqVerdict.confidence.toFixed(2)}`;
      return {
        action,
        triggerSource,
        score,
        relayDecodedDomain,
        groq: groqVerdict,
        features: scoredFeatures,
      };
    }
  }

  // Fallback without an LLM verdict → PASS below 50, BLOCK at 50 and above
  const fallbackAction: ClassificationAction = score >= 50 ? "blocked" : "passed";
  const fallbackTrigger: TriggerSource = `score:${score}`;
  return {
    action: fallbackAction,
    triggerSource: fallbackTrigger,
    score,
    relayDecodedDomain,
    features: scoredFeatures,
  };
}

View File

@ -0,0 +1,517 @@
/**
 * Tests for mail-classifier.ts — the mail classification pipeline.
 *
 * Exercises the logic of every layer as pure functions (no DB mock, no Groq mock).
 *
 * Covered:
 * - extractRelayedDomain() — various relay patterns
 * - normalizeBrand() — normalisation logic
 * - hasRandomTokens() — true/false cases
 * - redactLocalPartForLLM() — keep vs. redact
 * - computeScore() — score calculation with weights
 * - classifyMail() — end-to-end pipeline:
 *   - Gamblezen example → Layer 2.5 hard block (no LLM call)
 *   - BetandPlay example → Layer 2.5 hard block (no LLM call)
 *   - Whitelist case (wettervorhersage)
 *   - Domain block (Layer 2)
 *   - Relay-decoded block (Layer 2)
 *   - No-signal PASS
 */
import { describe, it, expect, vi } from "vitest";
// gambling-keywords.mjs is plain ESM without TypeScript types — it is mocked
// before the module under test is imported. The lists below are a fixed
// fixture for these tests.
vi.mock("../../server/utils/gambling-keywords.mjs", () => ({
GAMBLING_KEYWORDS: [
"casino", "bet365", "bwin", "tipico", "unibet", "betway",
"pokerstars", "jackpot", "freispiel", "free spin", "bonus code",
"auszahlung", "glücksspiel", "slots", "roulette", "wette",
"stake", "rolletto", "vbet", "1xbet", "melbet", "mostbet",
"luckyvibe", "spinz", "casinoly", "rabona", "justcasino",
"getslots", "rocketplay", "freshcasino", "betano", "leovegas",
],
// Harmless terms that contain the keyword "wette" as a substring.
GAMBLING_WHITELIST: [
"wettervorhersage",
"wetter",
"wetterbericht",
"wettkampf",
"wettbewerb",
],
}));
import {
extractRelayedDomain,
normalizeBrand,
hasRandomTokens,
redactLocalPartForLLM,
computeScore,
classifyMail,
matchesGamblingBrand,
} from "../../server/utils/mail-classifier";
// ─── extractRelayedDomain ────────────────────────────────────────────────────
describe("extractRelayedDomain()", () => {
  // [test title, sender address, expected decoded domain (null = no relay pattern)]
  const cases: Array<[string, string, string | null]> = [
    [
      "extrahiert Domain aus SendGrid-bounce-Pattern (user=domain@sendgrid)",
      "bounces+user=gamblezen.com@sendgrid.net",
      "gamblezen.com",
    ],
    [
      "extrahiert Domain aus Mailchimp-Track-Pattern (track.user=domain@mc)",
      "track.user=betandplay.com@mailchimp.com",
      "betandplay.com",
    ],
    [
      "extrahiert Domain aus _at_-Pattern",
      "a1b2c3_user_at_betandplay.com@em.example.com",
      "betandplay.com",
    ],
    ["gibt null zurück wenn kein Relay-Pattern erkannt", "info@betandplay.com", null],
    ["gibt null zurück für direkte Adressen ohne @", "noatsign", null],
    [
      "normalisiert extrahierte Domain auf lowercase",
      "bounce=GambleZen.COM@delivery.net",
      "gamblezen.com",
    ],
    ["gibt null zurück für normale Adressen ohne Relay-Muster", "newsletter@example.org", null],
  ];

  for (const [title, address, expected] of cases) {
    it(title, () => {
      expect(extractRelayedDomain(address)).toBe(expected);
    });
  }
});
// ─── normalizeBrand ──────────────────────────────────────────────────────────
describe("normalizeBrand()", () => {
  // [test title, raw brand spelling, expected normalised form]
  const cases: Array<[string, string, string]> = [
    ["BetandPlay → betandplay", "BetandPlay", "betandplay"],
    ["bet-and-play → betandplay", "bet-and-play", "betandplay"],
    ["Gamble Zen → gamblezen", "Gamble Zen", "gamblezen"],
    ["Mr. Green → mrgreen", "Mr. Green", "mrgreen"],
    ["lucky_vibe → luckyvibe", "lucky_vibe", "luckyvibe"],
    ["unveränderte Kleinbuchstaben bleiben gleich", "casino", "casino"],
  ];

  for (const [title, input, expected] of cases) {
    it(title, () => {
      expect(normalizeBrand(input)).toBe(expected);
    });
  }
});
// ─── matchesGamblingBrand ────────────────────────────────────────────────────
describe("matchesGamblingBrand()", () => {
  // [test title, normalised candidate, expected match result]
  const cases: Array<[string, string, boolean]> = [
    ["'gamblezen' matcht", "gamblezen", true],
    ["'betandplay' matcht", "betandplay", true],
    ["'casino' matcht (exact)", "casino", true],
    ["'mrgreen' matcht", "mrgreen", true],
    ["'example' matcht nicht", "example", false],
    ["zu kurze Strings (< 4 Zeichen) matchen nie", "bet", false],
    ["'googlemail' matcht nicht", "googlemail", false],
  ];

  for (const [title, candidate, expected] of cases) {
    it(title, () => {
      expect(matchesGamblingBrand(candidate)).toBe(expected);
    });
  }
});
// ─── hasRandomTokens ─────────────────────────────────────────────────────────
describe("hasRandomTokens()", () => {
  // [test title, local part, expected result]
  const cases: Array<[string, string, boolean]> = [
    // Typical for Gamblezen: two random-looking tokens
    ["local-part mit 2+ zufälligen Tokens → true", "hq3a91_7xmpl2", true],
    ["local-part mit User-ID + Token → true", "user123abc_ref456xyz", true],
    ["'info' → false (Funktionswort)", "info", false],
    ["'noreply' → false (Funktionswort)", "noreply", false],
    ["'newsletter' → false (Funktionswort, kein Digit-Mix)", "newsletter", false],
    ["normaler Local-Part ohne Zufalls-Tokens → false", "john.doe", false],
    // A single letter+digit token of length >= 6 stays below the threshold of two
    ["nur ein random Token (Grenzfall) → false", "abc123", false],
    // One function word plus two random-looking tokens
    ["echter BetandPlay-typischer Local-Part → true", "u7a2b1_offers_ref9x2z", true],
  ];

  for (const [title, localPart, expected] of cases) {
    it(title, () => {
      expect(hasRandomTokens(localPart)).toBe(expected);
    });
  }
});
// ─── redactLocalPartForLLM ───────────────────────────────────────────────────
describe("redactLocalPartForLLM()", () => {
  // [test title, sender address, localPartHasKeyword flag, expected output]
  const cases: Array<[string, string, boolean, string]> = [
    ["normale Adresse → local-part wird redacted", "user123@example.com", false, "***@example.com"],
    [
      "Adresse mit Casino-Keyword im local-part → NICHT redacted",
      "casino_offers@mailer.net",
      true,
      "casino_offers@mailer.net",
    ],
    ["normal ohne Keyword-Flag → redacted", "a1b2c3_track@sendgrid.net", false, "***@sendgrid.net"],
    ["Adresse ohne @ → unverändert zurückgegeben", "noatsign", false, "noatsign"],
  ];

  for (const [title, address, keepLocalPart, expected] of cases) {
    it(title, () => {
      expect(redactLocalPartForLLM(address, keepLocalPart)).toBe(expected);
    });
  }
});
// ─── computeScore ────────────────────────────────────────────────────────────
describe("computeScore()", () => {
  const NO_BRAND = false;
  const NO_RANDOM = false;

  it("Whitelist-Hit → score=0, whitelistHit=true", () => {
    const { whitelistHit, score } = computeScore(
      "info@wetter.de", "Wetter Service", "Wettervorhersage für morgen", NO_BRAND, NO_RANDOM,
    );
    expect(whitelistHit).toBe(true);
    expect(score).toBe(0);
  });

  it("Casino im Betreff → SUBJECT_GAMBLING_KEYWORD += 35", () => {
    const { keywordHitsSubject, score } = computeScore(
      "info@example.com", null, "Dein Casino-Bonus wartet", NO_BRAND, NO_RANDOM,
    );
    expect(keywordHitsSubject).toContain("casino");
    expect(score).toBeGreaterThanOrEqual(35);
  });

  it("Geld-Pattern (100€) im Betreff → SUBJECT_MONEY_PATTERN += 20", () => {
    const { styleFlags, score } = computeScore(
      "info@example.com", null, "100€ Willkommensbonus jetzt sichern", NO_BRAND, NO_RANDOM,
    );
    expect(styleFlags).toContain("money-pattern");
    expect(score).toBeGreaterThanOrEqual(20);
  });

  it("Brand-Match ohne Random → BRAND_MATCH_NO_RANDOM += 35", () => {
    // brand matched, no random tokens
    const { score } = computeScore("info@example.com", null, "Normaler Betreff", true, NO_RANDOM);
    expect(score).toBeGreaterThanOrEqual(35);
  });

  it("Random-Tokens ohne Brand → RANDOM_TOKENS_NO_BRAND += 10", () => {
    // random tokens present, no brand match
    const { score } = computeScore("info@example.com", null, "Newsletter vom Tag", NO_BRAND, true);
    expect(score).toBeGreaterThanOrEqual(10);
  });

  it("Score wird auf max 100 gecapped", () => {
    // Every signal at once — the uncapped sum would exceed 100
    const { score } = computeScore(
      "slots@casinobonus.bet",
      "Casino Jackpot",
      "JACKPOT Casino 500€ Freispiele Nur heute Letzte chance",
      true,
      true,
    );
    expect(score).toBeLessThanOrEqual(100);
  });
});
// ─── classifyMail() — Pipeline End-to-End ────────────────────────────────────
describe("classifyMail() — End-to-End Pipeline", () => {
  /**
   * Runs the pipeline for one mail. Defaults: empty blocklist (no domain hard
   * block) and an empty Groq key (Layer 4 is skipped).
   */
  const classify = (
    mail: Parameters<typeof classifyMail>[0]["mail"],
    blockedDomains: string[] = [],
    groqApiKey = "",
  ) => classifyMail({ mail, blockedDomainSet: new Set<string>(blockedDomains), groqApiKey });

  // ─── Screenshot example 1: Gamblezen via relay ────────────────────────────
  it("Gamblezen-Beispiel: bounces+user=gamblezen.com@em.sendgrid.net → Layer 2 Hard-Block (relay-decoded)", async () => {
    // The sending domain "em.sendgrid.net" is not blocked, but the relay-decoded
    // domain "gamblezen.com" is on the blocklist → Layer 2 fires before Layer 2.5.
    const result = await classify(
      {
        senderEmail: "bounces+user=gamblezen.com@em.sendgrid.net",
        senderName: "Gamble Zen",
        subject: "Dein exklusives Angebot wartet",
      },
      ["gamblezen.com"],
    );
    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("relay-decoded");
    expect(result.relayDecodedDomain).toBe("gamblezen.com");
  });

  it("Gamblezen-Beispiel ohne Blocklist-Entry → Layer 2.5 Hard-Block via Brand+Random", async () => {
    // Without a blocklist entry the brand (display name) + random local part still blocks.
    const result = await classify({
      senderEmail: "hq3a91_7xmpl2@em.sendgrid.net",
      senderName: "Gamble Zen",
      subject: "Dein exklusives Angebot wartet",
    });
    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("brand+random");
    expect(result.features.brandMatch).toBe(true);
    expect(result.features.randomTokens).toBe(true);
  });

  // ─── Screenshot example 2: BetandPlay via relay ───────────────────────────
  it("BetandPlay-Beispiel: track.user=betandplay.com@mailchimp.com → Layer 2 Hard-Block (relay-decoded)", async () => {
    const result = await classify(
      {
        senderEmail: "track.user=betandplay.com@mailchimp.com",
        senderName: "BetandPlay",
        subject: "100€ Willkommensbonus — Nur heute!",
      },
      ["betandplay.com"],
    );
    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("relay-decoded");
    expect(result.relayDecodedDomain).toBe("betandplay.com");
  });

  it("BetandPlay-Beispiel ohne Blocklist-Entry → Layer 2.5 Hard-Block via Brand+Random", async () => {
    const result = await classify({
      senderEmail: "u7a2b1_offers_ref9x2z@mailchimp.com",
      senderName: "BetandPlay", // brand match via display name
      subject: "100€ Willkommensbonus",
    });
    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("brand+random");
    expect(result.features.brandMatch).toBe(true);
    expect(result.features.randomTokens).toBe(true);
  });

  // ─── Layer 1: whitelist ───────────────────────────────────────────────────
  it("Whitelist-Treffer: 'wettervorhersage' im Betreff → PASS", async () => {
    const result = await classify({
      senderEmail: "service@wetter.de",
      senderName: "Wetter.de",
      subject: "Wettervorhersage für morgen",
    });
    expect(result.action).toBe("passed");
    expect(result.triggerSource).toBe("whitelist");
  });

  it("'wettkampf' in Betreff → PASS (kein Gambling trotz 'wette')", async () => {
    const result = await classify({
      senderEmail: "info@sport.de",
      senderName: null,
      subject: "Wettkampf-Ergebnisse dieser Woche",
    });
    expect(result.action).toBe("passed");
    expect(result.triggerSource).toBe("whitelist");
  });

  // ─── Layer 2: domain hard block ───────────────────────────────────────────
  it("Domain in Blocklist → Layer 2 Hard-Block", async () => {
    const result = await classify(
      {
        senderEmail: "promo@casinoly.com",
        senderName: "Casinoly",
        subject: "Dein Bonus wartet",
      },
      ["casinoly.com"],
    );
    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("domain");
    expect(result.features.domainBlocked).toBe(true);
  });

  // ─── Layer 2: relay-decoded block ─────────────────────────────────────────
  it("Relay-Decoded: =domain.com in local-part und Domain in Blocklist → relay-decoded Block", async () => {
    const result = await classify(
      {
        senderEmail: "bounce+track=rabona.com@em.sendgrid.net",
        senderName: "Rabona Casino",
        subject: "Exklusiv für dich",
      },
      ["rabona.com"],
    );
    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toBe("relay-decoded");
    expect(result.relayDecodedDomain).toBe("rabona.com");
  });

  // ─── Layer 3: score block (no LLM) ────────────────────────────────────────
  it("Viele Signale → Score >= 80 → Hard-Block ohne LLM", async () => {
    // Casino in the display name + jackpot in the subject + urgency + money pattern.
    // fetch is stubbed so that an unexpected Groq call is detected instead of
    // hitting the network.
    const fetchStub = vi.fn().mockRejectedValue(new Error("unexpected Groq call"));
    vi.stubGlobal("fetch", fetchStub);
    try {
      const result = await classify(
        {
          senderEmail: "info@spinz-casino.example",
          senderName: "Casino Jackpot Club",
          subject: "JACKPOT 500€ Freispiele — Nur heute!",
        },
        [],
        "should-not-be-called",
      );
      expect(result.action).toBe("blocked");
      expect(result.triggerSource).toMatch(/^score:/);
      expect(result.score).toBeGreaterThanOrEqual(80);
      // Score >= 80 means Layer 4 must never be reached.
      expect(fetchStub).not.toHaveBeenCalled();
    } finally {
      vi.unstubAllGlobals();
    }
  });

  // ─── No signal → PASS ─────────────────────────────────────────────────────
  it("unauffällige Mail → PASS mit triggerSource 'no-signal'", async () => {
    const result = await classify({
      senderEmail: "newsletter@amazon.de",
      senderName: "Amazon",
      subject: "Deine Bestellung wurde versandt",
    });
    expect(result.action).toBe("passed");
    expect(result.triggerSource).toBe("no-signal");
    expect(result.score).toBeLessThan(25);
  });

  // ─── Brand match without random tokens → no hard block, higher score ──────
  it("Brand-Match ohne Random-Tokens → kein Layer-2.5-Block, aber Score-Erhöhung", async () => {
    const result = await classify({
      senderEmail: "info@betandplay.com", // plain info@, nothing random
      senderName: "BetandPlay",
      subject: "Willkommen",
    });
    // No Layer 2.5 hard block (no random tokens), but the brand match raises the score.
    expect(result.triggerSource).not.toBe("brand+random");
    expect(result.features.brandMatch).toBe(true);
    expect(result.features.randomTokens).toBe(false);
    // At least BRAND_MATCH_NO_RANDOM (35); the final action depends on the other signals.
    expect(result.features.score).toBeGreaterThanOrEqual(35);
  });

  // ─── Shape of the feature record ──────────────────────────────────────────
  it("Result-Features enthalten alle erwarteten Keys", async () => {
    const result = await classify({
      senderEmail: "promo@example.com",
      senderName: null,
      subject: "Test",
    });
    const expectedKeys = [
      "score",
      "domainBlocked",
      "relayDecoded",
      "brandMatch",
      "randomTokens",
      "keywordHitsSubject",
      "keywordHitsDomain",
      "keywordHitsName",
      "styleFlags",
      "whitelistHit",
    ];
    for (const key of expectedKeys) {
      expect(result.features).toHaveProperty(key);
    }
  });
});