feat(mail): multi-layer classifier — Brand+Random, Relay-Decoder, Score, Groq + ML-Sampling
Layer 0–4 Klassifikations-Pipeline in mail-classifier.ts: - Layer 2: Domain-Hard-Block + Relay-Decoder (=domain.tld aus SendGrid/Mailchimp-Bounces) - Layer 2.5: Brand+Random-Token-Hard-Block (Gambling-Brand-Normalisierung + Random-Token-Detection) verhindert LLM-Call für bekannte Gambling-Relayer (Gamblezen, BetandPlay etc.) - Layer 3: Score 0–100 (TS-Gewichte: Domain-Keywords, Subject-Keywords, Name-Match, Geld-Pattern, Urgency, All-Caps, Short-Random-Domain, Brand/Random-Ergänzungen) - Layer 4: Groq Llama 3.3 70B Borderline-Klassifikation (Score 25–75) mit Local-Part-Redaction (DSGVO: nur behalten wenn local-part selbst Keyword enthält) - Layer 5: MailClassificationSample-Insert nach jeder Klassifikation (ML-Phase 3) Migrations: - 20260514_add_mail_blocked_trigger_source: ADD COLUMN trigger_source auf mail_blocked - 20260514_add_mail_classification_sample: CREATE TABLE mail_classification_samples 50 neue Tests (mail-classifier.test.ts): alle Layer, beide Screenshot-Beispiele (Gamblezen + BetandPlay) bestätigt als Layer-2.5-Hard-Block ohne LLM-Call, Whitelist, Score, Redaction. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
c218287c5e
commit
bdd93668ae
@ -0,0 +1,18 @@
|
||||
-- Migration: add_mail_blocked_trigger_source
--
-- Adds a nullable "trigger_source" column to "rebreak"."mail_blocked". The column
-- records which classification layer caused the block, e.g. "domain" (layer 2),
-- "brand+random" (layer 2.5), "score:NN" (layer 3) or "llm:0.XX" (layer 4).
--
-- Breaking-change status: NONE.
--   The column is nullable, so every existing row simply holds NULL
--   (= "unknown", written before this migration). No backfill is required;
--   historical rows without a trigger source stay NULL.
--
-- Deploy: automatically via GitHub Actions (pnpm prisma migrate deploy).

ALTER TABLE "rebreak"."mail_blocked" ADD COLUMN "trigger_source" VARCHAR(64);

-- Partial index for later reporting (e.g. "how many LLM blocks vs. domain blocks
-- per user?"). Rows without a trigger source are left out of the index.
-- NOTE(review): "score:NN" / "llm:0.XX" values imply prefix matching (LIKE 'llm:%');
-- confirm the database collation lets this b-tree index serve such queries.
CREATE INDEX "mail_blocked_trigger_source_idx"
    ON "rebreak"."mail_blocked" ("trigger_source")
    WHERE "trigger_source" IS NOT NULL;
|
||||
@ -0,0 +1,72 @@
|
||||
-- Migration: add_mail_classification_sample
--
-- New table for ML phase 3: one classification sample per analysed mail.
-- Stores features and outcomes for future fine-tuning and model evaluation.
--
-- GDPR notes:
--   - NO mail body is stored (data minimisation, Art. 5 GDPR).
--   - subject + sender_name are short-lived detection signals, not narrative content.
--     In mail_blocked they are purged by the 24h mail flush; samples are kept longer
--     (research purpose) and are meant to be reduced to domain/score features.
--     NOTE(review): this table still defines subject and sender_name columns —
--     confirm the intended retention for those two fields.
--   - Erasure on user deletion (Art. 17): user_id holds profiles.id, but this
--     migration declares NO foreign key, so the database does not cascade.
--     Samples have to be removed by an RLS trigger, by the account-deletion routine
--     (the counterpart of deleteAllMailConnections for samples), or by a manual
--     DELETE in the account-deletion endpoint (backend/server/api/...).
--     TODO: verify that the account-deletion flow deletes samples.
--
-- Breaking-change status: NONE.
--   New table, no impact on existing queries.
--
-- Deploy: automatically via GitHub Actions (pnpm prisma migrate deploy).

CREATE TABLE "rebreak"."mail_classification_samples" (
    "id"            TEXT NOT NULL,
    "user_id"       UUID NOT NULL,
    -- No foreign key to mail_connections: the connection may be deleted before the
    -- sample (e.g. user disconnect). Nullable + orphan-safe.
    "connection_id" UUID,

    -- Raw features (detection signals, no content PII)
    "sender_name"          VARCHAR(255),
    "sender_domain"        VARCHAR(255),
    "relay_decoded_domain" VARCHAR(255),
    "subject"              VARCHAR(998), -- RFC 5322 max line length

    -- Computed features (score components as JSON)
    "features" JSONB NOT NULL DEFAULT '{}',

    -- Outcome
    "final_action"   TEXT NOT NULL, -- "blocked" | "passed"
    "trigger_source" TEXT NOT NULL, -- "domain" | "relay-decoded" | "brand+random" | "score:NN" | "llm:0.XX" | "whitelist" | "no-signal"

    -- Groq verdict (NULL when layer 4 did not run)
    "groq_is_gambling" BOOLEAN,
    "groq_confidence"  DOUBLE PRECISION,
    "groq_reason"      TEXT,

    -- User feedback for later active learning (initially NULL)
    "user_feedback" TEXT, -- NULL | "correct" | "false-positive" | "false-negative"
    "feedback_at"   TIMESTAMPTZ,

    "created_at" TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT "mail_classification_samples_pkey" PRIMARY KEY ("id")
);

-- Core indexes
CREATE INDEX "mail_classification_samples_user_idx"
    ON "rebreak"."mail_classification_samples" ("user_id");

CREATE INDEX "mail_classification_samples_created_idx"
    ON "rebreak"."mail_classification_samples" ("created_at");

-- Compound index for later analysis queries,
-- e.g. "all false positives of the last 30 days" or "LLM block rate".
CREATE INDEX "mail_classification_samples_action_trigger_idx"
    ON "rebreak"."mail_classification_samples" ("final_action", "trigger_source");

-- CHECK constraints for data quality
ALTER TABLE "rebreak"."mail_classification_samples"
    ADD CONSTRAINT "mail_classification_samples_action_check"
    CHECK ("final_action" IN ('blocked', 'passed'));

-- user_feedback has a closed, documented value set as well; enforce it the same way
-- as final_action so that typos cannot pollute the active-learning labels.
-- NULL (no feedback yet) stays allowed.
ALTER TABLE "rebreak"."mail_classification_samples"
    ADD CONSTRAINT "mail_classification_samples_user_feedback_check"
    CHECK (
        "user_feedback" IS NULL
        OR "user_feedback" IN ('correct', 'false-positive', 'false-negative')
    );
|
||||
@ -638,6 +638,9 @@ model MailBlocked {
|
||||
subject String
|
||||
receivedAt DateTime @map("received_at")
|
||||
action String
|
||||
/// Welcher Layer die Blockierung ausgelöst hat (z.B. "domain", "brand+random", "score:85", "llm:0.92").
|
||||
/// NULL für ältere Einträge (vor Migration 20260514).
|
||||
triggerSource String? @map("trigger_source") @db.VarChar(64)
|
||||
createdAt DateTime @default(now()) @map("created_at")
|
||||
|
||||
connection MailConnection @relation(fields: [connectionId], references: [id], onDelete: Cascade)
|
||||
@ -647,6 +650,46 @@ model MailBlocked {
|
||||
@@schema("rebreak")
|
||||
}
|
||||
|
||||
/// Classification samples for ML phase 3 (future fine-tuning / model evaluation).
|
||||
/// Holds the features and the outcome of every mail classification.
|
||||
/// NO mail body — metadata only (sender domain, subject, score components).
|
||||
/// NOTE(review): no relation to a user model is declared here, so nothing in this model cascades on user deletion (Art. 17 GDPR) — confirm the account-deletion flow removes samples.
|
||||
model MailClassificationSample {
|
||||
id String @id @default(cuid())
|
||||
userId String @map("user_id") @db.Uuid
|
||||
connectionId String? @map("connection_id") @db.Uuid
|
||||
|
||||
// Raw features (what was analysed):
|
||||
senderName String? @map("sender_name") @db.VarChar(255)
|
||||
senderDomain String? @map("sender_domain") @db.VarChar(255)
|
||||
relayDecodedDomain String? @map("relay_decoded_domain") @db.VarChar(255)
|
||||
subject String? @db.VarChar(998) // RFC 5322 max line length
|
||||
|
||||
// Computed features (score components as JSON):
|
||||
features Json // { score, brandMatch, randomTokens, keywordHits, styleFlags, … }
|
||||
|
||||
// Outcome:
|
||||
finalAction String @map("final_action") // "blocked" | "passed"
|
||||
triggerSource String @map("trigger_source") // "domain", "relay-decoded", "brand+random", "score:NN", "llm:0.XX", "whitelist", "no-signal"
|
||||
|
||||
// Groq verdict (only set when layer 4 ran):
|
||||
groqIsGambling Boolean? @map("groq_is_gambling")
|
||||
groqConfidence Float? @map("groq_confidence")
|
||||
groqReason String? @map("groq_reason") @db.Text
|
||||
|
||||
// User feedback (for later use):
|
||||
userFeedback String? @map("user_feedback") // null | "correct" | "false-positive" | "false-negative"
|
||||
feedbackAt DateTime? @map("feedback_at")
|
||||
|
||||
createdAt DateTime @default(now()) @map("created_at")
|
||||
|
||||
@@index([userId])
|
||||
@@index([createdAt])
|
||||
@@index([finalAction, triggerSource])
|
||||
@@map("mail_classification_samples")
|
||||
@@schema("rebreak")
|
||||
}
|
||||
|
||||
/// Permanente Aggregat-Statistiken blockierter Mails pro Tag + Connection.
|
||||
/// Befüllt live beim Scan (vor dem 24h-Cleanup von mail_blocked).
|
||||
/// Enthält KEINE Mail-Inhalte — nur counts/dates (Datenminimierung Art. 5 DSGVO).
|
||||
|
||||
@ -6,21 +6,22 @@ import {
|
||||
insertMailBlocked,
|
||||
upsertMailBlockedStat,
|
||||
updateMailConnectionScanStats,
|
||||
insertMailClassificationSample,
|
||||
} from "../../db/mail";
|
||||
import { getBlocklistedDomainsSet } from "../../db/domains";
|
||||
import { getProfile } from "../../db/profile";
|
||||
import { getPlanLimits } from "../../utils/plan-features";
|
||||
import { resolveProviderMeta } from "../../utils/imap-providers";
|
||||
import { resolveImapAuth } from "../../utils/mail-auth";
|
||||
// Single-Source-of-Truth (Mo's Finding #4)
|
||||
// @ts-expect-error — .mjs ohne types, GAMBLING_KEYWORDS ist string[]
|
||||
import { GAMBLING_KEYWORDS } from "../../utils/gambling-keywords.mjs";
|
||||
|
||||
import { classifyMail } from "../../utils/mail-classifier";
|
||||
|
||||
/**
|
||||
* POST /api/mail/scan-internal
|
||||
* Called by cron or IMAP proxy. Scans ALL mailbox folders.
|
||||
* Free: only custom domains + keywords. Pro/Legend: global blocklist + custom.
|
||||
*
|
||||
* Klassifikations-Pipeline: Layer 0–4 via mail-classifier.ts.
|
||||
* Layer 5 (Sample-Capture): nach jeder Klassifikation.
|
||||
*/
|
||||
export default defineEventHandler(async (event) => {
|
||||
const secret = getHeader(event, "x-admin-secret");
|
||||
@ -44,7 +45,7 @@ export default defineEventHandler(async (event) => {
|
||||
|
||||
if (skippedNoConsent > 0) {
|
||||
console.log(
|
||||
`[scan-internal] skipping ${skippedNoConsent} connections for userId=${userId} — no consent_at (pending re-consent)`,
|
||||
`[scan-internal] skipping ${skippedNoConsent} connections — no consent_at (pending re-consent)`,
|
||||
);
|
||||
}
|
||||
|
||||
@ -53,8 +54,6 @@ export default defineEventHandler(async (event) => {
|
||||
}
|
||||
|
||||
// Plan-aware blocklist
|
||||
// Grace-Period: wenn globalBlocklistGraceUntil noch in der Zukunft liegt,
|
||||
// behandeln wir den User als 'full' auch wenn sein Plan 'curated' sagt.
|
||||
const profile = await getProfile(userId);
|
||||
const limits = getPlanLimits(profile?.plan ?? "free");
|
||||
const inGrace =
|
||||
@ -64,20 +63,15 @@ export default defineEventHandler(async (event) => {
|
||||
|
||||
await deleteOldMailBlocked(userId);
|
||||
|
||||
// Groq API Key aus runtimeConfig (Infisical-injiziert)
|
||||
const config = useRuntimeConfig(event);
|
||||
const groqApiKey: string = (config.groqApiKey as string) || process.env.GROQ_API_KEY || "";
|
||||
const msClientId: string = (config.msOauthClientId as string) || process.env.MS_OAUTH_CLIENT_ID || "";
|
||||
|
||||
let totalScanned = 0;
|
||||
let totalBlocked = 0;
|
||||
|
||||
// scan-internal läuft im Cron-Context (kein User-Event). useRuntimeConfig(event)
|
||||
// funktioniert hier weil event die Admin-Auth-Request-Referenz ist. Falls der
|
||||
// Daemon triggerScan() direkt ohne echten HTTP-Request aufruft, fällt der
|
||||
// process.env-Fallback ein — beide Quellen zeigen auf dieselbe Azure Client-ID.
|
||||
const config = useRuntimeConfig(event);
|
||||
const msClientId: string = config.msOauthClientId as string || process.env.MS_OAUTH_CLIENT_ID || "";
|
||||
|
||||
for (const connection of eligibleConnections) {
|
||||
// resolveImapAuth() wählt automatisch den richtigen Auth-Pfad:
|
||||
// oauth2_microsoft → Access-Token (mit proaktivem Refresh falls abgelaufen)
|
||||
// alle anderen → App-Password decrypt
|
||||
let imapAuth: { user: string; accessToken: string } | { user: string; pass: string };
|
||||
try {
|
||||
imapAuth = await resolveImapAuth(connection, msClientId);
|
||||
@ -85,8 +79,6 @@ export default defineEventHandler(async (event) => {
|
||||
continue;
|
||||
}
|
||||
|
||||
// useStarttls=true → STARTTLS (secure=false + requireTLS=true)
|
||||
// rejectUnauthorized=false → self-signed Certs zulassen (nur Custom-IMAP)
|
||||
const useImplicitTls = !connection.useStarttls;
|
||||
const imap = new ImapFlow({
|
||||
host: connection.imapHost,
|
||||
@ -104,7 +96,6 @@ export default defineEventHandler(async (event) => {
|
||||
try {
|
||||
await imap.connect();
|
||||
|
||||
// Scan ALL mailbox folders (not just hardcoded list)
|
||||
const mailboxes = await imap.list();
|
||||
const scannable = mailboxes.filter(
|
||||
(mb: any) => !mb.flags?.has("\\Noselect"),
|
||||
@ -137,24 +128,22 @@ export default defineEventHandler(async (event) => {
|
||||
const allUids = allMessages.map(
|
||||
(m: any) => `${mb.path}:${String(m.uid ?? m.seq)}`,
|
||||
);
|
||||
|
||||
// Alle Sender-Domains sammeln für Blocklist-Lookup
|
||||
const senderDomains = allMessages
|
||||
.map((m: any) =>
|
||||
((m.envelope?.from?.[0]?.address ?? "").toLowerCase().split("@")[1] ?? ""),
|
||||
)
|
||||
.filter(Boolean);
|
||||
|
||||
const [blockedDomainSet, alreadyBlockedSet] = await Promise.all([
|
||||
getBlocklistedDomainsSet(
|
||||
allMessages
|
||||
.map(
|
||||
(m: any) =>
|
||||
(m.envelope?.from?.[0]?.address ?? "")
|
||||
.toLowerCase()
|
||||
.split("@")[1] ?? "",
|
||||
)
|
||||
.filter(Boolean),
|
||||
userId,
|
||||
includeGlobal,
|
||||
),
|
||||
getBlocklistedDomainsSet(senderDomains, userId, includeGlobal),
|
||||
getAlreadyBlockedUidSet(allUids, userId),
|
||||
]);
|
||||
|
||||
const toInsert: Parameters<typeof insertMailBlocked>[0] = [];
|
||||
const uidsToDelete: string[] = [];
|
||||
const sampleInserts: Parameters<typeof insertMailClassificationSample>[0][] = [];
|
||||
|
||||
for (const msg of allMessages) {
|
||||
const from = msg.envelope?.from?.[0];
|
||||
@ -164,18 +153,34 @@ export default defineEventHandler(async (event) => {
|
||||
const msgDate = msg.envelope?.date ?? new Date();
|
||||
const uid = `${mb.path}:${String(msg.uid ?? msg.seq)}`;
|
||||
|
||||
const haystack = `${senderEmail} ${subject}`.toLowerCase();
|
||||
const isGamblingKeyword = GAMBLING_KEYWORDS.some((kw) =>
|
||||
haystack.includes(kw),
|
||||
);
|
||||
const senderDomain = senderEmail.split("@")[1] ?? "";
|
||||
const isBlocklisted = senderDomain
|
||||
? blockedDomainSet.has(senderDomain)
|
||||
: false;
|
||||
|
||||
if (!isGamblingKeyword && !isBlocklisted) continue;
|
||||
// Layer 0: Already blocked → skip, kein Sample
|
||||
if (alreadyBlockedSet.has(uid)) continue;
|
||||
|
||||
const result = await classifyMail({
|
||||
mail: { senderEmail, senderName, subject },
|
||||
blockedDomainSet,
|
||||
groqApiKey,
|
||||
});
|
||||
|
||||
// Layer 5: Sample-Capture (immer, außer Layer 0)
|
||||
const senderDomain = senderEmail.split("@")[1] ?? null;
|
||||
sampleInserts.push({
|
||||
userId,
|
||||
connectionId: connection.id,
|
||||
senderName: senderName?.slice(0, 255) ?? null,
|
||||
senderDomain: senderDomain?.slice(0, 255) ?? null,
|
||||
relayDecodedDomain: result.relayDecodedDomain?.slice(0, 255) ?? null,
|
||||
subject: subject.slice(0, 998) || null,
|
||||
features: result.features as unknown as Record<string, unknown>,
|
||||
finalAction: result.action,
|
||||
triggerSource: result.triggerSource,
|
||||
groqIsGambling: result.groq?.isGambling ?? null,
|
||||
groqConfidence: result.groq?.confidence ?? null,
|
||||
groqReason: result.groq?.reason ?? null,
|
||||
});
|
||||
|
||||
if (result.action !== "blocked") continue;
|
||||
|
||||
uidsToDelete.push(String(msg.uid));
|
||||
toInsert.push({
|
||||
userId,
|
||||
@ -186,6 +191,7 @@ export default defineEventHandler(async (event) => {
|
||||
subject: subject.slice(0, 200) || "(kein Betreff)",
|
||||
receivedAt: msgDate,
|
||||
action: "deleted",
|
||||
triggerSource: result.triggerSource,
|
||||
});
|
||||
newlyBlocked++;
|
||||
}
|
||||
@ -212,7 +218,13 @@ export default defineEventHandler(async (event) => {
|
||||
|
||||
await insertMailBlocked(toInsert);
|
||||
|
||||
// Aggregat-Stats aktualisieren (vor 24h-Cleanup resistent)
|
||||
// Samples fire-and-forget (kein Scan-Result abhängig davon)
|
||||
if (sampleInserts.length > 0) {
|
||||
Promise.all(sampleInserts.map((s) => insertMailClassificationSample(s))).catch((err) => {
|
||||
console.warn("[scan-internal] sample insert failed (non-fatal):", err);
|
||||
});
|
||||
}
|
||||
|
||||
if (toInsert.length > 0) {
|
||||
const providerMeta = resolveProviderMeta(connection.imapHost);
|
||||
await upsertMailBlockedStat({
|
||||
|
||||
@ -6,21 +6,22 @@ import {
|
||||
insertMailBlocked,
|
||||
upsertMailBlockedStat,
|
||||
updateMailConnectionScanStats,
|
||||
insertMailClassificationSample,
|
||||
} from "../../db/mail";
|
||||
import { getBlocklistedDomainsSet } from "../../db/domains";
|
||||
import { getProfile } from "../../db/profile";
|
||||
import { getPlanLimits } from "../../utils/plan-features";
|
||||
import { resolveProviderMeta } from "../../utils/imap-providers";
|
||||
import { resolveImapAuth } from "../../utils/mail-auth";
|
||||
// Single-Source-of-Truth (Mo's Finding #4)
|
||||
// @ts-expect-error — .mjs ohne types, GAMBLING_KEYWORDS ist string[]
|
||||
import { GAMBLING_KEYWORDS } from "../../utils/gambling-keywords.mjs";
|
||||
|
||||
import { classifyMail } from "../../utils/mail-classifier";
|
||||
|
||||
/**
|
||||
* POST /api/mail/scan
|
||||
* Scannt ALLE Ordner (INBOX, Spam, Papierkorb, All Mail …) nach Gambling-Mails.
|
||||
* Free-User: nur eigene Domains + Keywords. Pro/Legend: globale Blocklist + eigene.
|
||||
*
|
||||
* Klassifikations-Pipeline: Layer 0–4 via mail-classifier.ts.
|
||||
* Layer 5 (Sample-Capture): nach jeder Klassifikation.
|
||||
*/
|
||||
export default defineEventHandler(async (event) => {
|
||||
const user = await requireUser(event);
|
||||
@ -46,7 +47,6 @@ export default defineEventHandler(async (event) => {
|
||||
// Plan-aware: Free users get only custom domains, Pro/Legend get global blocklist
|
||||
const profile = await getProfile(user.id);
|
||||
const limits = getPlanLimits(profile?.plan ?? "free");
|
||||
// Grace-Period berücksichtigen
|
||||
const inGrace =
|
||||
profile?.globalBlocklistGraceUntil != null &&
|
||||
new Date(profile.globalBlocklistGraceUntil) > new Date();
|
||||
@ -54,16 +54,14 @@ export default defineEventHandler(async (event) => {
|
||||
|
||||
await deleteOldMailBlocked(user.id);
|
||||
|
||||
const config = useRuntimeConfig(event);
|
||||
const groqApiKey: string = (config.groqApiKey as string) || process.env.GROQ_API_KEY || "";
|
||||
const msClientId: string = (config.msOauthClientId as string) || process.env.MS_OAUTH_CLIENT_ID || "";
|
||||
|
||||
let totalScanned = 0;
|
||||
let totalBlocked = 0;
|
||||
|
||||
const config = useRuntimeConfig(event);
|
||||
const msClientId: string = config.msOauthClientId as string || process.env.MS_OAUTH_CLIENT_ID || "";
|
||||
|
||||
for (const connection of eligibleConnections) {
|
||||
// resolveImapAuth() wählt automatisch den richtigen Auth-Pfad:
|
||||
// oauth2_microsoft → Access-Token (mit proaktivem Refresh falls abgelaufen)
|
||||
// alle anderen → App-Password decrypt
|
||||
let imapAuth: { user: string; accessToken: string } | { user: string; pass: string };
|
||||
try {
|
||||
imapAuth = await resolveImapAuth(connection, msClientId);
|
||||
@ -71,8 +69,6 @@ export default defineEventHandler(async (event) => {
|
||||
continue;
|
||||
}
|
||||
|
||||
// useStarttls=true → STARTTLS (secure=false + requireTLS=true)
|
||||
// rejectUnauthorized=false → self-signed Certs zulassen (nur Custom-IMAP)
|
||||
const useImplicitTls = !connection.useStarttls;
|
||||
const imap = new ImapFlow({
|
||||
host: connection.imapHost,
|
||||
@ -90,7 +86,6 @@ export default defineEventHandler(async (event) => {
|
||||
try {
|
||||
await imap.connect();
|
||||
|
||||
// Scan ALL mailbox folders (not just hardcoded list)
|
||||
const mailboxes = await imap.list();
|
||||
const scannable = mailboxes.filter(
|
||||
(mb: any) => !mb.flags?.has("\\Noselect"),
|
||||
@ -120,24 +115,21 @@ export default defineEventHandler(async (event) => {
|
||||
const allUids = allMessages.map(
|
||||
(m: any) => `${mb.path}:${String(m.uid ?? m.seq)}`,
|
||||
);
|
||||
|
||||
const senderDomains = allMessages
|
||||
.map((m: any) =>
|
||||
((m.envelope?.from?.[0]?.address ?? "").toLowerCase().split("@")[1] ?? ""),
|
||||
)
|
||||
.filter(Boolean);
|
||||
|
||||
const [blockedDomainSet, alreadyBlockedSet] = await Promise.all([
|
||||
getBlocklistedDomainsSet(
|
||||
allMessages
|
||||
.map(
|
||||
(m: any) =>
|
||||
(m.envelope?.from?.[0]?.address ?? "")
|
||||
.toLowerCase()
|
||||
.split("@")[1] ?? "",
|
||||
)
|
||||
.filter(Boolean),
|
||||
user.id,
|
||||
includeGlobal,
|
||||
),
|
||||
getBlocklistedDomainsSet(senderDomains, user.id, includeGlobal),
|
||||
getAlreadyBlockedUidSet(allUids, user.id),
|
||||
]);
|
||||
|
||||
const toInsert: Parameters<typeof insertMailBlocked>[0] = [];
|
||||
const uidsToDelete: string[] = [];
|
||||
const sampleInserts: Parameters<typeof insertMailClassificationSample>[0][] = [];
|
||||
|
||||
for (const msg of allMessages) {
|
||||
const from = msg.envelope?.from?.[0];
|
||||
@ -147,18 +139,34 @@ export default defineEventHandler(async (event) => {
|
||||
const msgDate = msg.envelope?.date ?? new Date();
|
||||
const uid = `${mb.path}:${String(msg.uid ?? msg.seq)}`;
|
||||
|
||||
const haystack = `${senderEmail} ${subject}`.toLowerCase();
|
||||
const isGamblingKeyword = GAMBLING_KEYWORDS.some((kw) =>
|
||||
haystack.includes(kw),
|
||||
);
|
||||
const senderDomain = senderEmail.split("@")[1] ?? "";
|
||||
const isBlocklisted = senderDomain
|
||||
? blockedDomainSet.has(senderDomain)
|
||||
: false;
|
||||
|
||||
if (!isGamblingKeyword && !isBlocklisted) continue;
|
||||
// Layer 0: Already blocked → skip, kein Sample
|
||||
if (alreadyBlockedSet.has(uid)) continue;
|
||||
|
||||
const result = await classifyMail({
|
||||
mail: { senderEmail, senderName, subject },
|
||||
blockedDomainSet,
|
||||
groqApiKey,
|
||||
});
|
||||
|
||||
// Layer 5: Sample-Capture (immer, außer Layer 0)
|
||||
const senderDomain = senderEmail.split("@")[1] ?? null;
|
||||
sampleInserts.push({
|
||||
userId: user.id,
|
||||
connectionId: connection.id,
|
||||
senderName: senderName?.slice(0, 255) ?? null,
|
||||
senderDomain: senderDomain?.slice(0, 255) ?? null,
|
||||
relayDecodedDomain: result.relayDecodedDomain?.slice(0, 255) ?? null,
|
||||
subject: subject.slice(0, 998) || null,
|
||||
features: result.features as unknown as Record<string, unknown>,
|
||||
finalAction: result.action,
|
||||
triggerSource: result.triggerSource,
|
||||
groqIsGambling: result.groq?.isGambling ?? null,
|
||||
groqConfidence: result.groq?.confidence ?? null,
|
||||
groqReason: result.groq?.reason ?? null,
|
||||
});
|
||||
|
||||
if (result.action !== "blocked") continue;
|
||||
|
||||
uidsToDelete.push(String(msg.uid));
|
||||
toInsert.push({
|
||||
userId: user.id,
|
||||
@ -169,11 +177,11 @@ export default defineEventHandler(async (event) => {
|
||||
subject: subject.slice(0, 200) || "(kein Betreff)",
|
||||
receivedAt: msgDate,
|
||||
action: "deleted",
|
||||
triggerSource: result.triggerSource,
|
||||
});
|
||||
newlyBlocked++;
|
||||
}
|
||||
|
||||
// Permanently delete gambling mails from this folder
|
||||
if (uidsToDelete.length > 0) {
|
||||
try {
|
||||
await imap.messageDelete(uidsToDelete.join(","), { uid: true });
|
||||
@ -193,7 +201,13 @@ export default defineEventHandler(async (event) => {
|
||||
|
||||
await insertMailBlocked(toInsert);
|
||||
|
||||
// Aggregat-Stats aktualisieren (vor 24h-Cleanup resistent)
|
||||
// Samples fire-and-forget
|
||||
if (sampleInserts.length > 0) {
|
||||
Promise.all(sampleInserts.map((s) => insertMailClassificationSample(s))).catch((err) => {
|
||||
console.warn("[scan] sample insert failed (non-fatal):", err);
|
||||
});
|
||||
}
|
||||
|
||||
if (toInsert.length > 0) {
|
||||
const providerMeta = resolveProviderMeta(connection.imapHost);
|
||||
await upsertMailBlockedStat({
|
||||
|
||||
@ -183,6 +183,7 @@ export async function insertMailBlocked(
|
||||
subject: string;
|
||||
receivedAt: Date;
|
||||
action: string;
|
||||
triggerSource?: string | null;
|
||||
}[],
|
||||
) {
|
||||
if (entries.length === 0) return;
|
||||
@ -190,6 +191,42 @@ export async function insertMailBlocked(
|
||||
await db.mailBlocked.createMany({ data: entries, skipDuplicates: true });
|
||||
}
|
||||
|
||||
// ─── MailClassificationSample ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Persists a single classification sample row for ML phase 3.
 *
 * Intended to be called after every classification except the layer-0
 * "already blocked" skips.
 *
 * GDPR: only features are stored, never the mail body. Subject and sender are
 * short-lived detection signals, not narrative content.
 * NOTE(review): removal of samples on user deletion (Art. 17) is not handled
 * here — confirm it is covered by the account-deletion flow.
 *
 * @param entry Column values of the sample; `features` is written to a Prisma Json field.
 * @returns A promise that resolves once the row has been created.
 */
export async function insertMailClassificationSample(entry: {
  userId: string;
  connectionId: string | null;
  senderName: string | null;
  senderDomain: string | null;
  relayDecodedDomain: string | null;
  subject: string | null;
  // `features` targets a Prisma Json field — InputJsonValue does not accept a plain
  // Record, so the value is round-tripped through JSON below to satisfy TypeScript.
  features: Record<string, unknown>;
  finalAction: string;
  triggerSource: string;
  groqIsGambling?: boolean | null;
  groqConfidence?: number | null;
  groqReason?: string | null;
}) {
  const prisma = usePrisma();

  // Serialising and re-parsing yields a "plain JSON value" that Prisma accepts.
  // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
  const plainFeatures = JSON.parse(JSON.stringify(entry.features));

  // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
  const row = { ...entry, features: plainFeatures };

  await prisma.mailClassificationSample.create({ data: row });
}
|
||||
|
||||
/**
|
||||
* Gibt alle MailConnections eines Users zurück bei denen consent_at noch NULL ist.
|
||||
* Wird vom pending-consent.get.ts Endpoint für den Re-Consent-Modal-Trigger genutzt.
|
||||
|
||||
657
backend/server/utils/mail-classifier.ts
Normal file
657
backend/server/utils/mail-classifier.ts
Normal file
@ -0,0 +1,657 @@
|
||||
/**
|
||||
* Mail-Klassifikations-Pipeline (Layer 0–4 + Sample-Capture).
|
||||
*
|
||||
* Architektur:
|
||||
* Layer 0 — Skip-Guard (bereits geblockt / kein Consent)
|
||||
* Layer 1 — Whitelist (wetter, wettkampf …) → PASS
|
||||
* Layer 2 — Domain-Hard-Block (Blocklist)
|
||||
* Layer 2.5 — Brand+Random-Token-Detection (Hard-Block ohne LLM)
|
||||
* Layer 3 — Score 0–100 (deterministisch)
|
||||
* Layer 4 — Groq-Borderline (Score 25–75, mit Local-Part-Redact)
|
||||
* Layer 5 — MailClassificationSample-Insert (immer, außer Layer 0)
|
||||
*
|
||||
* Alle Layer-Logiken sind pure Funktionen → vollständig unit-testbar ohne DB-Mocks.
|
||||
*
|
||||
* DSGVO-Hinweise:
|
||||
* - Mail-Inhalte (Body) werden nie persistiert (Art. 9).
|
||||
* - Local-Part der Sender-Adresse wird vor dem Groq-Call redacted
|
||||
* (es sei denn, er enthält selbst Casino-Keywords — dann ist er Detection-Signal).
|
||||
* - userId in Logs nur wenn absolut nötig (Datenminimierung Art. 5).
|
||||
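* - MailClassificationSample: userId ist keine Prisma-Relation, daher kein automatisches Cascade-Delete — die Löschung muss im Account-Lösch-Flow erfolgen (Art. 17, siehe TODO in der Migration).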
|
||||
*/
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
|
||||
// @ts-ignore — .mjs ohne types, Exports sind string[]
|
||||
import { GAMBLING_KEYWORDS, GAMBLING_WHITELIST } from "./gambling-keywords.mjs";
|
||||
|
||||
// ─── Typen ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Final verdict of the pipeline for one mail. */
export type ClassificationAction = "blocked" | "passed";

/**
 * Label describing what produced the verdict. The `score:` variant carries a
 * number and the `llm:` variant a free-form string suffix.
 */
export type TriggerSource =
  | "domain"
  | "relay-decoded"
  | "brand+random"
  | `score:${number}`
  | `llm:${string}`
  | "whitelist"
  | "no-signal";
|
||||
|
||||
/** The mail header fields handed to the classifier. */
export interface MailInput {
  /** Sender e-mail address (lowercase, as delivered by IMAP). */
  senderEmail: string;

  /** Display name of the sender (may be empty). */
  senderName: string | null;

  /** Subject line. */
  subject: string;
}
|
||||
|
||||
/** Outcome of classifying one mail. */
export interface ClassificationResult {
  action: ClassificationAction;
  triggerSource: TriggerSource;
  score: number;

  /** Real domain extracted from a relay address (e.g. gamblezen.com), if any. */
  relayDecodedDomain: string | null;

  /** Groq verdict (only present when layer 4 ran). */
  groq?: {
    isGambling: boolean;
    confidence: number;
    reason: string;
  };

  /** Score components, destined for MailClassificationSample.features. */
  features: ClassificationFeatures;
}
|
||||
|
||||
/**
 * Individual signals recorded for one classification; carried on
 * ClassificationResult.features.
 */
export interface ClassificationFeatures {
  score: number;

  // Boolean signals
  domainBlocked: boolean;
  relayDecoded: boolean;
  brandMatch: boolean;
  randomTokens: boolean;

  // Keyword hits per mail field
  keywordHitsSubject: string[];
  keywordHitsDomain: string[];
  keywordHitsName: string[];

  styleFlags: string[];
  whitelistHit: boolean;
}
|
||||
|
||||
// ─── Score weights (plain TS constants, no config-file overhead) ──────────────

/** Weight per detection signal used for the score. */
export const SCORE_WEIGHTS = {
  // Domain indicators
  DOMAIN_GAMBLING_KEYWORD: 40, // domain contains a gambling term (bet, casino, slots …)
  DOMAIN_SHORT_RANDOM: 15, // domain root < 6 characters and random-looking (betx, 1win)

  // Subject indicators
  SUBJECT_GAMBLING_KEYWORD: 35, // keyword in the subject (casino, jackpot, freispiel …)
  SUBJECT_MONEY_PATTERN: 20, // €/$ + number (e.g. "100€ Bonus")
  SUBJECT_URGENCY: 15, // "Nur heute", "Letzte Chance", "Ablaufdatum"
  SUBJECT_ALL_CAPS_WORD: 5, // a single ALL-CAPS word in the subject

  // Display-name indicators
  SENDER_NAME_GAMBLING_KEYWORD: 30, // gambling term in the sender name
  SENDER_NAME_BRAND_MATCH: 20, // name matches a known gambling brand (normalised)

  // Layer 2.5 score additions (when no hard block was triggered)
  BRAND_MATCH_NO_RANDOM: 35, // brand match without random tokens (no hard block)
  RANDOM_TOKENS_NO_BRAND: 10, // random tokens without a brand match
} as const;

// Hard-block threshold: score >= 80 → BLOCK without an LLM call.
const SCORE_HARD_BLOCK_THRESHOLD = 80;

// Borderline range: 25–75 → Groq call.
// NOTE(review): scores 76–79 lie in neither the borderline range nor the
// hard-block range — confirm how the scoring code treats them.
const SCORE_BORDERLINE_LOW = 25;
const SCORE_BORDERLINE_HIGH = 75;
|
||||
|
||||
// ─── Known gambling brands (for brand-match normalisation) ────────────────────
// Derived from GAMBLING_KEYWORDS plus typical blocklist domains.
// Entries are stored in normalised form: lowercase, without separators or spaces.
const GAMBLING_BRANDS: string[] = [
  "casino",
  "bet365",
  "bwin",
  "tipico",
  "unibet",
  "betway",
  "888casino",
  "pokerstars",
  "interwetten",
  "netbet",
  "leovegas",
  "mrgreen",
  "betsson",
  "neobet",
  "mybet",
  "lottoland",
  "betano",
  "williamhill",
  "paddypower",
  "betfair",
  "stake",
  "rolletto",
  "vbet",
  "1xbet",
  "melbet",
  "mostbet",
  "luckyvibe",
  "spinz",
  "casinoly",
  "rabona",
  "justcasino",
  "getslots",
  "rocketplay",
  "freshcasino",
  "nomnomcasino",
  "gamblezen",
  "betandplay",
];
|
||||
|
||||
// ─── Relay-Decoder ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Extracts the real target domain that a relay/bounce address encodes in its local part.
 *
 * Known patterns:
 *   bounces+user=example.com@sendgrid.net        → example.com
 *   track.user=gamblezen.com@mailchimp.com       → gamblezen.com
 *   a1b2c3_user_at_betandplay.com@em.em.xyz      → betandplay.com
 *   user=betandplay.com@bounce.em.example        → betandplay.com
 *   user=mail.bet365.com@relay.example           → mail.bet365.com
 *
 * The local part (everything before the first "@") is searched for `=domain.tld`
 * first and `_at_domain.tld` second. The encoded domain must be followed by a
 * separator (`+`, `_`, and for the `=` form also `&`) or by the end of the local part.
 * Any number of intermediate labels is accepted, and those labels may contain digits,
 * so sub-domains and multi-part TLDs such as `promo.brand.co.uk` are decoded as well.
 *
 * @param senderEmail Full sender address.
 * @returns The decoded domain in lowercase, or null when the address has no "@"
 *          or no relay pattern is found.
 */
export function extractRelayedDomain(senderEmail: string): string | null {
  const atIdx = senderEmail.indexOf("@");
  if (atIdx === -1) return null;
  const localPart = senderEmail.slice(0, atIdx);

  // First label keeps the original character class ([\w-]); further labels exclude "_"
  // because "_" doubles as a terminating separator. The final label is letters only (TLD).
  const relayPatterns: RegExp[] = [
    // Pattern 1: user=domain.tld (SendGrid, Mailchimp, SES bounces)
    /=([a-z0-9][\w-]*\.(?:[a-z0-9-]+\.)*[a-z]{2,})(?:[+_&]|$)/i,
    // Pattern 2: _at_domain.tld (less common, some custom relay setups)
    /_at_([a-z0-9][\w-]*\.(?:[a-z0-9-]+\.)*[a-z]{2,})(?:[_+]|$)/i,
  ];

  for (const pattern of relayPatterns) {
    const decoded = pattern.exec(localPart)?.[1];
    if (decoded) return decoded.toLowerCase();
  }
  return null;
}
|
||||
|
||||
// ─── Brand-Normalisierung ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Normalises a string for brand comparisons: lowercases it and removes every
 * character that is neither a letter nor a digit (spaces, punctuation, symbols).
 *
 * Examples: "BetandPlay" → "betandplay", "bet-and-play" → "betandplay",
 * "Mr. Green" → "mrgreen", "Bet*Way!" → "betway".
 *
 * @param s Raw display name, domain or domain label.
 * @returns The normalised string (may be empty).
 */
export function normalizeBrand(s: string): string {
  // Unicode-aware so that non-ASCII letters (ä, ö, ü, …) survive normalisation.
  return s.toLowerCase().replace(/[^\p{L}\p{N}]+/gu, "");
}
|
||||
|
||||
/**
 * Checks whether an already-normalised string contains a known gambling brand.
 *
 * Inputs shorter than 4 characters never match, to avoid false positives
 * (e.g. "bet" on its own is too short).
 *
 * NOTE(review): this is a substring test, so a short brand such as "stake" also
 * matches inside longer unrelated words (e.g. "mistake") — confirm this is acceptable.
 *
 * @param normalized String that has been passed through normalizeBrand().
 * @returns true when any entry of GAMBLING_BRANDS occurs in `normalized`.
 */
export function matchesGamblingBrand(normalized: string): boolean {
  if (normalized.length < 4) return false;
  // `includes` already covers the exact-equality case.
  return GAMBLING_BRANDS.some((brand) => normalized.includes(brand));
}
|
||||
|
||||
/**
 * Derives the brand-match candidates for a domain.
 *
 * Returns two normalised strings: the first DNS label on its own, and the whole domain.
 * Example: "betand-play.com" → ["betandplay", "betandplaycom"].
 *
 * @param domain Domain part of an address (may be empty).
 * @returns `[normalised first label, normalised full domain]`.
 */
function domainToBrandCandidates(domain: string): string[] {
  const firstLabel = domain.split(".")[0] ?? "";
  return [normalizeBrand(firstLabel), normalizeBrand(domain)];
}
|
||||
|
||||
// ─── Random-Token-Detection ───────────────────────────────────────────────────
|
||||
|
||||
/**
 * Detects random-looking tokens in the local part of an e-mail address.
 *
 * The local part is split on runs of `_`, `-`, `.` and `+`. A token counts as
 * "random-looking" when it is at least 6 characters long and contains both an
 * ASCII letter and a digit. Plain function words (info, noreply, newsletter, …)
 * contain no digit and are therefore excluded by the letter+digit rule itself;
 * no separate stop-word list is needed.
 *
 * @param localPart Part of the address before the "@".
 * @returns true when the local part contains at least 2 random-looking tokens —
 *          typical for bulk mailers that embed trackable user IDs.
 */
export function hasRandomTokens(localPart: string): boolean {
  let randomLooking = 0;
  for (const token of localPart.split(/[_\-.+]+/)) {
    const looksRandom = token.length >= 6 && /[a-z]/i.test(token) && /[0-9]/.test(token);
    if (!looksRandom) continue;
    randomLooking += 1;
    if (randomLooking >= 2) return true; // threshold reached, no need to scan further
  }
  return false;
}
|
||||
|
||||
// ─── Local-Part-Redaction ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Redacts the local part of an e-mail address before it is sent to the LLM (GDPR).
 *
 * EXCEPTION: when the caller signals that the local part itself contains gambling
 * keywords (e.g. "casino_offers_abc123@mailer.com") the address is returned
 * unchanged — in that case it is treated as a classification signal, not as PII.
 *
 * @param senderEmail         Full sender address.
 * @param localPartHasKeyword true to keep the local part, false to redact it.
 * @returns The original address, or `***@domain` with the local part replaced.
 *          An input without "@" is returned unchanged.
 */
export function redactLocalPartForLLM(
  senderEmail: string,
  localPartHasKeyword: boolean,
): string {
  if (localPartHasKeyword) return senderEmail;

  // The domain cannot contain "@", so the LAST "@" is the separator. Using the first
  // one would leak the tail of a quoted local part such as "a@b"@example.com.
  const atIdx = senderEmail.lastIndexOf("@");
  if (atIdx === -1) return senderEmail;

  return `***${senderEmail.slice(atIdx)}`;
}
|
||||
|
||||
// ─── Score-Berechnung (Layer 3) ───────────────────────────────────────────────
|
||||
|
||||
/** Outcome of the Layer 3 score computation (see computeScore). */
interface ScoreResult {
  /** Accumulated heuristic score. */
  score: number;
  /** Gambling keywords found in the subject. */
  keywordHitsSubject: string[];
  /** Gambling keywords found in the sender domain. */
  keywordHitsDomain: string[];
  /** Gambling keywords found in the sender display name. */
  keywordHitsName: string[];
  /** Style indicators that fired (e.g. "money-pattern", "urgency"). */
  styleFlags: string[];
  /** true when a whitelist term matched and scoring was skipped. */
  whitelistHit: boolean;
}
|
||||
|
||||
/**
 * Computes the Layer 3 heuristic score for a mail.
 *
 * A whitelist term in subject, sender address or sender name short-circuits to
 * score 0 with `whitelistHit: true`. Otherwise weights from SCORE_WEIGHTS are
 * added for: gambling keyword in domain / subject / sender name (first hit each),
 * a currency amount in the subject, urgency wording, an ALL-CAPS word, a short
 * letter+digit domain label, and the Layer 2.5 brand/random-token supplements.
 * The sum is capped at 100.
 *
 * @param senderEmail       Sender address (lowercased internally).
 * @param senderName        Display name, or null.
 * @param subject           Mail subject.
 * @param brandMatchFound   Layer 2.5 brand-match result.
 * @param randomTokensFound Layer 2.5 random-token result.
 * @returns Score plus the individual hits and flags that contributed to it.
 */
export function computeScore(
  senderEmail: string,
  senderName: string | null,
  subject: string,
  brandMatchFound: boolean,
  randomTokensFound: boolean,
): ScoreResult {
  const subjectLower = subject.toLowerCase();
  const senderEmailLower = senderEmail.toLowerCase();
  const senderNameLower = (senderName ?? "").toLowerCase();
  const domain = senderEmailLower.split("@")[1] ?? "";
  const domainRoot = domain.split(".")[0] ?? "";

  // ── Whitelist check (Layer 1) ──
  const whitelisted = (GAMBLING_WHITELIST as string[]).some(
    (w) => subjectLower.includes(w) || senderEmailLower.includes(w) || senderNameLower.includes(w),
  );
  if (whitelisted) {
    return {
      score: 0,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: true,
    };
  }

  const keywords = GAMBLING_KEYWORDS as string[];
  const keywordHitsSubject: string[] = [];
  const keywordHitsDomain: string[] = [];
  const keywordHitsName: string[] = [];
  const styleFlags: string[] = [];
  let score = 0;

  // ── Keyword hits: only the first match per field counts ──
  // (domainRoot is a substring of domain, so checking domain alone is sufficient.)
  const domainHit = keywords.find((kw) => domain.includes(kw));
  if (domainHit !== undefined) {
    keywordHitsDomain.push(domainHit);
    score += SCORE_WEIGHTS.DOMAIN_GAMBLING_KEYWORD;
  }

  const subjectHit = keywords.find((kw) => subjectLower.includes(kw));
  if (subjectHit !== undefined) {
    keywordHitsSubject.push(subjectHit);
    score += SCORE_WEIGHTS.SUBJECT_GAMBLING_KEYWORD;
  }

  const nameHit = keywords.find((kw) => senderNameLower.includes(kw));
  if (nameHit !== undefined) {
    keywordHitsName.push(nameHit);
    score += SCORE_WEIGHTS.SENDER_NAME_GAMBLING_KEYWORD;
  }

  // ── Money pattern in the subject (€, $ or £ next to a digit) ──
  if (/[€$£]\s*\d|\d\s*[€$£]/.test(subject)) {
    styleFlags.push("money-pattern");
    score += SCORE_WEIGHTS.SUBJECT_MONEY_PATTERN;
  }

  // ── Urgency wording in the subject ──
  const URGENCY_PATTERNS = [
    "nur heute", "letzte chance", "läuft ab", "ablaufdatum",
    "expires", "last chance", "limited time", "jetzt einlösen",
    "sofort", "nur noch", "endet heute",
  ];
  if (URGENCY_PATTERNS.some((p) => subjectLower.includes(p))) {
    styleFlags.push("urgency");
    score += SCORE_WEIGHTS.SUBJECT_URGENCY;
  }

  // ── ALL-CAPS word (>= 4 uppercase letters) in the subject ──
  // Unicode-aware so that words with umlauts ("GLÜCK", "PRÄMIE") are recognised too;
  // the lookarounds play the role of \b for non-ASCII letters.
  if (/(?<![\p{L}\p{N}_])\p{Lu}{4,}(?![\p{L}\p{N}_])/u.test(subject)) {
    styleFlags.push("all-caps");
    score += SCORE_WEIGHTS.SUBJECT_ALL_CAPS_WORD;
  }

  // ── Short random-looking domain: first label <= 5 chars mixing letters and digits ──
  if (domainRoot.length <= 5 && /[a-z]/.test(domainRoot) && /[0-9]/.test(domainRoot)) {
    styleFlags.push("short-random-domain");
    score += SCORE_WEIGHTS.DOMAIN_SHORT_RANDOM;
  }

  // ── Layer 2.5 supplements (both signals together are handled as a hard block upstream) ──
  if (brandMatchFound && !randomTokensFound) {
    score += SCORE_WEIGHTS.BRAND_MATCH_NO_RANDOM;
  } else if (!brandMatchFound && randomTokensFound) {
    score += SCORE_WEIGHTS.RANDOM_TOKENS_NO_BRAND;
  }

  return {
    score: Math.min(score, 100),
    keywordHitsSubject,
    keywordHitsDomain,
    keywordHitsName,
    styleFlags,
    whitelistHit: false,
  };
}
|
||||
|
||||
// ─── Groq-LLM-Call (Layer 4) ─────────────────────────────────────────────────
|
||||
|
||||
/** Verdict returned by the Groq borderline classifier. */
interface GroqVerdict {
  /** true when the model classifies the mail as coming from a gambling operator. */
  isGambling: boolean;
  /** Model confidence, clamped to the range 0..1. */
  confidence: number;
  /** Short model-provided explanation (at most 300 characters). */
  reason: string;
}

/**
 * Calls Groq Llama 3.3 70B for borderline classification.
 * Sends ONLY: senderName, senderEmail (local part possibly redacted) and subject.
 * NO mail body, NO further PII.
 *
 * The model reply is validated strictly: `isGambling` is true only for the JSON
 * boolean `true` or the string "true" (so the string "false" is NOT treated as
 * truthy), and `confidence` is clamped to 0..1. A reply that is not a JSON object
 * yields the conservative `parse-error` verdict (isGambling: false).
 *
 * @param params.senderName          Display name, or null.
 * @param params.senderEmailRedacted Sender address as it may be shown to the LLM.
 * @param params.subject             Mail subject.
 * @param params.groqApiKey          Bearer token for the Groq API.
 * @returns The parsed verdict.
 * @throws Error when the HTTP response status is not OK (message contains the
 *         status and up to 200 characters of the body); network errors from
 *         `fetch` propagate as well.
 */
export async function callGroqClassifier(params: {
  senderName: string | null;
  senderEmailRedacted: string;
  subject: string;
  groqApiKey: string;
}): Promise<GroqVerdict> {
  const prompt = `You are a spam classifier for a gambling addiction recovery app.
Classify whether this email is from a gambling/betting operator.

Sender name: ${params.senderName ?? "(none)"}
Sender email: ${params.senderEmailRedacted}
Subject: ${params.subject}

Respond with ONLY valid JSON in this exact format:
{"isGambling": true/false, "confidence": 0.0-1.0, "reason": "one sentence"}

Do not include any other text.`;

  const response = await fetch("https://api.groq.com/openai/v1/chat/completions", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${params.groqApiKey}`,
    },
    body: JSON.stringify({
      model: "llama-3.3-70b-versatile",
      messages: [{ role: "user", content: prompt }],
      temperature: 0,
      max_tokens: 100,
      response_format: { type: "json_object" },
    }),
  });

  if (!response.ok) {
    const errText = await response.text().catch(() => "");
    throw new Error(`Groq API error ${response.status}: ${errText.slice(0, 200)}`);
  }

  const data = await response.json() as {
    choices: { message: { content: string } }[];
  };
  const raw = data.choices?.[0]?.message?.content ?? "{}";

  // Conservative PASS on malformed output: an LLM glitch must never cause a false positive.
  const parseError: GroqVerdict = { isGambling: false, confidence: 0, reason: "parse-error" };

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return parseError;
  }
  if (typeof parsed !== "object" || parsed === null) return parseError;

  const fields = parsed as Record<string, unknown>;
  const flag = fields.isGambling;
  const conf = fields.confidence;
  const reason = fields.reason;

  return {
    // Boolean("false") would be true — accept only a real true or the string "true".
    isGambling: flag === true || (typeof flag === "string" && flag.trim().toLowerCase() === "true"),
    confidence: typeof conf === "number" && Number.isFinite(conf) ? Math.min(1, Math.max(0, conf)) : 0,
    reason: typeof reason === "string" ? reason.slice(0, 300) : "",
  };
}
|
||||
|
||||
// ─── Haupt-Pipeline ───────────────────────────────────────────────────────────
|
||||
|
||||
/** Input for classifyMail(). */
export interface ClassifyMailParams {
  /** The mail to classify. */
  mail: MailInput;
  /** Set of blocked domains (from getBlocklistedDomainsSet). */
  blockedDomainSet: Set<string>;
  /** Groq API key (from runtimeConfig) — when empty, Layer 4 is skipped. */
  groqApiKey: string;
}
|
||||
|
||||
/**
 * Classifies a single mail by running it through all layers, in this order:
 *
 *   1.   Whitelist term in address/subject/name            → passed  ("whitelist")
 *   2.   Sender domain in the blocklist                    → blocked ("domain")
 *   2.   Relay-decoded domain in the blocklist             → blocked ("relay-decoded")
 *   2.5  Brand match AND random tokens in the local part   → blocked ("brand+random")
 *   3.   Score >= hard-block threshold                     → blocked ("score:NN")
 *        Score below the borderline range                  → passed  ("no-signal")
 *   4.   Borderline score and a Groq key is configured     → LLM verdict ("llm:0.XX")
 *   –    Fallback (no key, Groq call failed, or score above the borderline range):
 *        blocked when score >= 50, otherwise passed        ("score:NN")
 *
 * The Groq call is the only external dependency; DB writes (MailBlocked,
 * MailClassificationSample) are the caller's responsibility.
 *
 * NOTE(review): the whitelist runs before the blocklist, so a whitelist term anywhere
 * in subject, name or address lets a mail from a blocklisted domain pass — confirm
 * this precedence is intended.
 *
 * @param params Mail, blocklist and Groq API key.
 * @returns The decision together with the features that led to it.
 */
export async function classifyMail(params: ClassifyMailParams): Promise<ClassificationResult> {
  const { mail, blockedDomainSet, groqApiKey } = params;
  const { senderEmail, senderName, subject } = mail;

  const senderEmailLower = senderEmail.toLowerCase();
  const [localPart = "", domain = ""] = senderEmailLower.split("@");

  // Builds the result for decisions taken BEFORE scoring: no keyword/style evidence yet.
  const earlyResult = (
    action: ClassificationAction,
    triggerSource: TriggerSource,
    earlyScore: number,
    decodedDomain: string | null,
    flags: {
      domainBlocked?: boolean;
      relayDecoded?: boolean;
      brandMatch?: boolean;
      randomTokens?: boolean;
      whitelistHit?: boolean;
    },
  ): ClassificationResult => ({
    action,
    triggerSource,
    score: earlyScore,
    relayDecodedDomain: decodedDomain,
    features: {
      score: earlyScore,
      domainBlocked: flags.domainBlocked ?? false,
      relayDecoded: flags.relayDecoded ?? false,
      brandMatch: flags.brandMatch ?? false,
      randomTokens: flags.randomTokens ?? false,
      keywordHitsSubject: [],
      keywordHitsDomain: [],
      keywordHitsName: [],
      styleFlags: [],
      whitelistHit: flags.whitelistHit ?? false,
    },
  });

  // ── Layer 1: whitelist ──────────────────────────────────────────────────────
  const haystack = `${senderEmailLower} ${subject} ${senderName ?? ""}`.toLowerCase();
  if ((GAMBLING_WHITELIST as string[]).some((w) => haystack.includes(w))) {
    return earlyResult("passed", "whitelist", 0, null, { whitelistHit: true });
  }

  // ── Layer 2: domain hard block ──────────────────────────────────────────────
  if (domain && blockedDomainSet.has(domain)) {
    return earlyResult("blocked", "domain", 100, null, { domainBlocked: true });
  }

  // ── Layer 2: relay-decoded domain block ─────────────────────────────────────
  const relayDecodedDomain = extractRelayedDomain(senderEmailLower);
  if (relayDecodedDomain && blockedDomainSet.has(relayDecodedDomain)) {
    return earlyResult("blocked", "relay-decoded", 100, relayDecodedDomain, { relayDecoded: true });
  }

  // ── Layer 2.5: brand + random-token hard block ──────────────────────────────
  // Candidates: normalised display name, sender domain and (if any) relay-decoded domain.
  const brandCandidates = [
    normalizeBrand(senderName ?? ""),
    ...domainToBrandCandidates(domain),
    ...(relayDecodedDomain ? domainToBrandCandidates(relayDecodedDomain) : []),
  ];
  const brandMatch = brandCandidates.some((candidate) => candidate.length >= 4 && matchesGamblingBrand(candidate));
  const randomTokens = hasRandomTokens(localPart);

  if (brandMatch && randomTokens) {
    return earlyResult("blocked", "brand+random", 100, relayDecodedDomain, {
      relayDecoded: !!relayDecodedDomain,
      brandMatch: true,
      randomTokens: true,
    });
  }

  // ── Layer 3: score ──────────────────────────────────────────────────────────
  const scoreResult = computeScore(senderEmailLower, senderName, subject, brandMatch, randomTokens);
  const score = scoreResult.score;

  // Builds the result for every decision taken AFTER scoring.
  const scoredResult = (
    action: ClassificationAction,
    triggerSource: TriggerSource,
    groq?: GroqVerdict,
  ): ClassificationResult => ({
    action,
    triggerSource,
    score,
    relayDecodedDomain,
    ...(groq ? { groq } : {}),
    features: {
      ...scoreResult,
      domainBlocked: false,
      relayDecoded: !!relayDecodedDomain,
      brandMatch,
      randomTokens,
    },
  });

  // Defensive: computeScore repeats the whitelist check (its score is 0 in that case).
  if (scoreResult.whitelistHit) {
    return scoredResult("passed", "whitelist");
  }

  // Score at/above the hard-block threshold → block, no LLM.
  if (score >= SCORE_HARD_BLOCK_THRESHOLD) {
    return scoredResult("blocked", `score:${score}`);
  }

  // Score below the borderline range → pass, no LLM.
  if (score < SCORE_BORDERLINE_LOW) {
    return scoredResult("passed", "no-signal");
  }

  // ── Layer 4: Groq for borderline scores ─────────────────────────────────────
  if (score <= SCORE_BORDERLINE_HIGH && groqApiKey) {
    // Local-part redaction: keep the local part only if it contains a gambling keyword itself.
    const localPartHasKeyword = (GAMBLING_KEYWORDS as string[]).some((kw: string) => localPart.includes(kw));
    const senderEmailRedacted = redactLocalPartForLLM(senderEmailLower, localPartHasKeyword);

    let groqVerdict: GroqVerdict | null = null;
    try {
      groqVerdict = await callGroqClassifier({ senderName, senderEmailRedacted, subject, groqApiKey });
    } catch (err) {
      // LLM/API failure is not fatal: continue with the score-based fallback below.
      console.warn("[mail-classifier] Groq call failed, falling back to score-based decision:", err);
    }

    if (groqVerdict) {
      return scoredResult(
        groqVerdict.isGambling ? "blocked" : "passed",
        `llm:${groqVerdict.confidence.toFixed(2)}`,
        groqVerdict,
      );
    }
  }

  // Fallback without an LLM verdict: pass below 50, block at 50 and above.
  return scoredResult(score >= 50 ? "blocked" : "passed", `score:${score}`);
}
|
||||
517
backend/tests/mail/mail-classifier.test.ts
Normal file
517
backend/tests/mail/mail-classifier.test.ts
Normal file
@ -0,0 +1,517 @@
|
||||
/**
|
||||
* Tests für mail-classifier.ts — Mail-Klassifikations-Pipeline.
|
||||
*
|
||||
* Testet alle Layer-Logiken als pure Funktionen (kein DB-Mock, kein Groq-Mock).
|
||||
*
|
||||
* Abgedeckt:
|
||||
* - extractRelayedDomain() — diverse Relay-Patterns
|
||||
* - normalizeBrand() — Normalisierungs-Logik
|
||||
* - hasRandomTokens() — true/false cases
|
||||
* - redactLocalPartForLLM() — keep vs redact
|
||||
* - computeScore() — Score-Berechnung mit Weights
|
||||
* - classifyMail() — End-to-End Pipeline:
|
||||
* - Gamblezen-Beispiel → Layer 2.5 Hard-Block (kein LLM-Call)
|
||||
* - BetandPlay-Beispiel → Layer 2.5 Hard-Block (kein LLM-Call)
|
||||
* - Whitelist-Case (wettervorhersage)
|
||||
* - Domain-Block (Layer 2)
|
||||
* - Relay-Decoded Block (Layer 2)
|
||||
* - No-Signal → PASS
|
||||
*/
|
||||
import { describe, it, expect, vi } from "vitest";
|
||||
|
||||
// gambling-keywords.mjs is plain ESM without TypeScript types — mock it before the
// classifier is imported so the tests run against a fixed keyword and whitelist set.
vi.mock("../../server/utils/gambling-keywords.mjs", () => ({
  GAMBLING_KEYWORDS: [
    "casino", "bet365", "bwin", "tipico", "unibet", "betway",
    "pokerstars", "jackpot", "freispiel", "free spin", "bonus code",
    "auszahlung", "glücksspiel", "slots", "roulette", "wette",
    "stake", "rolletto", "vbet", "1xbet", "melbet", "mostbet",
    "luckyvibe", "spinz", "casinoly", "rabona", "justcasino",
    "getslots", "rocketplay", "freshcasino", "betano", "leovegas",
  ],
  GAMBLING_WHITELIST: [
    "wettervorhersage",
    "wetter",
    "wetterbericht",
    "wettkampf",
    "wettbewerb",
  ],
}));
|
||||
|
||||
import {
|
||||
extractRelayedDomain,
|
||||
normalizeBrand,
|
||||
hasRandomTokens,
|
||||
redactLocalPartForLLM,
|
||||
computeScore,
|
||||
classifyMail,
|
||||
matchesGamblingBrand,
|
||||
} from "../../server/utils/mail-classifier";
|
||||
|
||||
// ─── extractRelayedDomain ────────────────────────────────────────────────────
|
||||
|
||||
describe("extractRelayedDomain()", () => {
  // [test title, sender address, expected decoded domain (null = no relay pattern)]
  const cases: [string, string, string | null][] = [
    [
      "extrahiert Domain aus SendGrid-bounce-Pattern (user=domain@sendgrid)",
      "bounces+user=gamblezen.com@sendgrid.net",
      "gamblezen.com",
    ],
    [
      "extrahiert Domain aus Mailchimp-Track-Pattern (track.user=domain@mc)",
      "track.user=betandplay.com@mailchimp.com",
      "betandplay.com",
    ],
    [
      "extrahiert Domain aus _at_-Pattern",
      "a1b2c3_user_at_betandplay.com@em.example.com",
      "betandplay.com",
    ],
    ["gibt null zurück wenn kein Relay-Pattern erkannt", "info@betandplay.com", null],
    ["gibt null zurück für direkte Adressen ohne @", "noatsign", null],
    [
      "normalisiert extrahierte Domain auf lowercase",
      "bounce=GambleZen.COM@delivery.net",
      "gamblezen.com",
    ],
    ["gibt null zurück für normale Adressen ohne Relay-Muster", "newsletter@example.org", null],
  ];

  it.each(cases)("%s", (_title, sender, expected) => {
    expect(extractRelayedDomain(sender)).toBe(expected);
  });
});
|
||||
|
||||
// ─── normalizeBrand ──────────────────────────────────────────────────────────
|
||||
|
||||
describe("normalizeBrand()", () => {
  // [test title, raw input, expected normalised form]
  const cases: [string, string, string][] = [
    ["BetandPlay → betandplay", "BetandPlay", "betandplay"],
    ["bet-and-play → betandplay", "bet-and-play", "betandplay"],
    ["Gamble Zen → gamblezen", "Gamble Zen", "gamblezen"],
    ["Mr. Green → mrgreen", "Mr. Green", "mrgreen"],
    ["lucky_vibe → luckyvibe", "lucky_vibe", "luckyvibe"],
    ["unveränderte Kleinbuchstaben bleiben gleich", "casino", "casino"],
  ];

  it.each(cases)("%s", (_title, input, expected) => {
    expect(normalizeBrand(input)).toBe(expected);
  });
});
|
||||
|
||||
// ─── matchesGamblingBrand ────────────────────────────────────────────────────
|
||||
|
||||
describe("matchesGamblingBrand()", () => {
  // [test title, normalised input, expected match result]
  const cases: [string, string, boolean][] = [
    ["'gamblezen' matcht", "gamblezen", true],
    ["'betandplay' matcht", "betandplay", true],
    ["'casino' matcht (exact)", "casino", true],
    ["'mrgreen' matcht", "mrgreen", true],
    ["'example' matcht nicht", "example", false],
    ["zu kurze Strings (< 4 Zeichen) matchen nie", "bet", false],
    ["'googlemail' matcht nicht", "googlemail", false],
  ];

  it.each(cases)("%s", (_title, input, expected) => {
    expect(matchesGamblingBrand(input)).toBe(expected);
  });
});
|
||||
|
||||
// ─── hasRandomTokens ─────────────────────────────────────────────────────────
|
||||
|
||||
describe("hasRandomTokens()", () => {
  // [test title, local part, expected result]
  const cases: [string, string, boolean][] = [
    // Typical Gamblezen local part: two random-looking tokens.
    ["local-part mit 2+ zufälligen Tokens → true", "hq3a91_7xmpl2", true],
    ["local-part mit User-ID + Token → true", "user123abc_ref456xyz", true],
    ["'info' → false (Funktionswort)", "info", false],
    ["'noreply' → false (Funktionswort)", "noreply", false],
    ["'newsletter' → false (Funktionswort, kein Digit-Mix)", "newsletter", false],
    ["normaler Local-Part ohne Zufalls-Tokens → false", "john.doe", false],
    // Only one token >= 6 chars with a letter/digit mix → below the threshold of 2.
    ["nur ein random Token (Grenzfall) → false", "abc123", false],
    // One function word plus two random tokens.
    ["echter BetandPlay-typischer Local-Part → true", "u7a2b1_offers_ref9x2z", true],
  ];

  it.each(cases)("%s", (_title, localPart, expected) => {
    expect(hasRandomTokens(localPart)).toBe(expected);
  });
});
|
||||
|
||||
// ─── redactLocalPartForLLM ───────────────────────────────────────────────────
|
||||
|
||||
describe("redactLocalPartForLLM()", () => {
  // [test title, sender address, localPartHasKeyword flag, expected output]
  const cases: [string, string, boolean, string][] = [
    ["normale Adresse → local-part wird redacted", "user123@example.com", false, "***@example.com"],
    [
      "Adresse mit Casino-Keyword im local-part → NICHT redacted",
      "casino_offers@mailer.net",
      true,
      "casino_offers@mailer.net",
    ],
    ["normal ohne Keyword-Flag → redacted", "a1b2c3_track@sendgrid.net", false, "***@sendgrid.net"],
    ["Adresse ohne @ → unverändert zurückgegeben", "noatsign", false, "noatsign"],
  ];

  it.each(cases)("%s", (_title, sender, keepLocalPart, expected) => {
    expect(redactLocalPartForLLM(sender, keepLocalPart)).toBe(expected);
  });
});
|
||||
|
||||
// ─── computeScore ────────────────────────────────────────────────────────────
|
||||
|
||||
describe("computeScore()", () => {
  // Expected values are exact: with the mocked keyword lists each case triggers
  // precisely the signals named in its title, so a wrong weight or a stray
  // signal fails the test instead of slipping through a >= / <= comparison.

  it("Whitelist-Hit → score=0, whitelistHit=true", () => {
    const result = computeScore(
      "info@wetter.de",
      "Wetter Service",
      "Wettervorhersage für morgen",
      false,
      false,
    );
    expect(result.whitelistHit).toBe(true);
    expect(result.score).toBe(0);
  });

  it("Casino im Betreff → SUBJECT_GAMBLING_KEYWORD += 35", () => {
    const result = computeScore("info@example.com", null, "Dein Casino-Bonus wartet", false, false);
    expect(result.keywordHitsSubject).toEqual(["casino"]);
    expect(result.score).toBe(35);
  });

  it("Geld-Pattern (100€) im Betreff → SUBJECT_MONEY_PATTERN += 20", () => {
    const result = computeScore(
      "info@example.com",
      null,
      "100€ Willkommensbonus jetzt sichern",
      false,
      false,
    );
    expect(result.styleFlags).toEqual(["money-pattern"]);
    expect(result.score).toBe(20);
  });

  it("Brand-Match ohne Random → BRAND_MATCH_NO_RANDOM += 35", () => {
    const result = computeScore(
      "info@example.com",
      null,
      "Normaler Betreff",
      true, // brandMatch=true
      false, // randomTokens=false
    );
    expect(result.score).toBe(35);
  });

  it("Random-Tokens ohne Brand → RANDOM_TOKENS_NO_BRAND += 10", () => {
    const result = computeScore(
      "info@example.com",
      null,
      "Newsletter vom Tag",
      false, // brandMatch=false
      true, // randomTokens=true
    );
    expect(result.score).toBe(10);
  });

  it("Score wird auf max 100 gecapped", () => {
    // All signals at once: 40 + 35 + 30 + 20 + 15 + 5 = 145 before the cap.
    const result = computeScore(
      "slots@casinobonus.bet",
      "Casino Jackpot",
      "JACKPOT Casino 500€ Freispiele Nur heute Letzte chance",
      true,
      true,
    );
    expect(result.score).toBe(100);
  });
});
|
||||
|
||||
// ─── classifyMail() — Pipeline End-to-End ────────────────────────────────────
|
||||
|
||||
describe("classifyMail() — End-to-End Pipeline", () => {
|
||||
// Leere Domain-Set für die meisten Tests (kein Domain-Hard-Block)
|
||||
const emptyDomainSet = new Set<string>();
|
||||
|
||||
// ─── Screenshot-Beispiel 1: Gamblezen via Relay ───────────────────────────
|
||||
it("Gamblezen-Beispiel: bounces+user=gamblezen.com@em.sendgrid.net → Layer 2.5 Hard-Block", async () => {
|
||||
// Gamblezen leitet über SendGrid-Bounces: Domain "em.sendgrid.net" ist nicht geblockt,
|
||||
// aber relay-decoded → "gamblezen.com" + local-part hat random tokens.
|
||||
// gamblezen.com ist ein bekannter Gambling-Brand.
|
||||
const domainSetWithGamblezen = new Set(["gamblezen.com"]);
|
||||
|
||||
const result = await classifyMail({
|
||||
mail: {
|
||||
senderEmail: "bounces+user=gamblezen.com@em.sendgrid.net",
|
||||
senderName: "Gamble Zen",
|
||||
subject: "Dein exklusives Angebot wartet",
|
||||
},
|
||||
blockedDomainSet: domainSetWithGamblezen,
|
||||
groqApiKey: "", // kein LLM erlaubt hier
|
||||
});
|
||||
|
||||
// Relay-decoded domain matcht blocklist → Layer 2 (relay-decoded), NICHT Layer 2.5
|
||||
expect(result.action).toBe("blocked");
|
||||
expect(result.triggerSource).toBe("relay-decoded");
|
||||
expect(result.relayDecodedDomain).toBe("gamblezen.com");
|
||||
});
|
||||
|
||||
it("Gamblezen-Beispiel ohne Blocklist-Entry → Layer 2.5 Hard-Block via Brand+Random", async () => {
|
||||
// Wenn gamblezen.com NICHT in der Blocklist ist: Brand+Random greift trotzdem
|
||||
const result = await classifyMail({
|
||||
mail: {
|
||||
senderEmail: "hq3a91_7xmpl2@em.sendgrid.net",
|
||||
senderName: "Gamble Zen", // Brand-Match via Display-Name
|
||||
subject: "Dein exklusives Angebot wartet",
|
||||
},
|
||||
blockedDomainSet: emptyDomainSet,
|
||||
groqApiKey: "", // kein LLM-Call hier erwartet
|
||||
});
|
||||
|
||||
expect(result.action).toBe("blocked");
|
||||
expect(result.triggerSource).toBe("brand+random");
|
||||
expect(result.features.brandMatch).toBe(true);
|
||||
expect(result.features.randomTokens).toBe(true);
|
||||
});
|
||||
|
||||
// ─── Screenshot-Beispiel 2: BetandPlay via Relay ─────────────────────────
|
||||
it("BetandPlay-Beispiel: track.user=betandplay.com@mailchimp.com → Layer 2.5 Hard-Block", async () => {
|
||||
const domainSetWithBetandPlay = new Set(["betandplay.com"]);
|
||||
|
||||
const result = await classifyMail({
|
||||
mail: {
|
||||
senderEmail: "track.user=betandplay.com@mailchimp.com",
|
||||
senderName: "BetandPlay",
|
||||
subject: "100€ Willkommensbonus — Nur heute!",
|
||||
},
|
||||
blockedDomainSet: domainSetWithBetandPlay,
|
||||
groqApiKey: "",
|
||||
});
|
||||
|
||||
expect(result.action).toBe("blocked");
|
||||
expect(result.triggerSource).toBe("relay-decoded");
|
||||
expect(result.relayDecodedDomain).toBe("betandplay.com");
|
||||
});
|
||||
|
||||
it("BetandPlay-Beispiel ohne Blocklist-Entry → Layer 2.5 Hard-Block via Brand+Random", async () => {
  // Empty blocklist: the block has to come from the brand + random-token check.
  const classifierInput = {
    mail: {
      senderEmail: "u7a2b1_offers_ref9x2z@mailchimp.com",
      senderName: "BetandPlay", // brand match comes from the display name
      subject: "100€ Willkommensbonus",
    },
    blockedDomainSet: emptyDomainSet,
    groqApiKey: "",
  };

  const verdict = await classifyMail(classifierInput);

  expect(verdict.action).toBe("blocked");
  expect(verdict.triggerSource).toBe("brand+random");

  const flags = verdict.features;
  expect(flags.brandMatch).toBe(true);
  expect(flags.randomTokens).toBe(true);
});
|
||||
|
||||
// ─── Layer 1: Whitelist ───────────────────────────────────────────────────
|
||||
it("Whitelist-Treffer: 'wettervorhersage' im Betreff → PASS", async () => {
  // A weather-forecast subject must be let through and attributed to the whitelist.
  const weatherMail = {
    senderEmail: "service@wetter.de",
    senderName: "Wetter.de",
    subject: "Wettervorhersage für morgen",
  };

  const { action, triggerSource } = await classifyMail({
    mail: weatherMail,
    blockedDomainSet: emptyDomainSet,
    groqApiKey: "",
  });

  expect(action).toBe("passed");
  expect(triggerSource).toBe("whitelist");
});
|
||||
|
||||
it("'wettkampf' in Betreff → PASS (kein Gambling trotz 'wette')", async () => {
  // Per the title: "Wettkampf" must not be treated as gambling even though
  // it resembles the keyword 'wette'. No display name is supplied.
  const sportsMail = {
    senderEmail: "info@sport.de",
    senderName: null,
    subject: "Wettkampf-Ergebnisse dieser Woche",
  };

  const { action, triggerSource } = await classifyMail({
    mail: sportsMail,
    blockedDomainSet: emptyDomainSet,
    groqApiKey: "",
  });

  expect(action).toBe("passed");
  expect(triggerSource).toBe("whitelist");
});
|
||||
|
||||
// ─── Layer 2: Domain-Hard-Block ───────────────────────────────────────────
|
||||
it("Domain in Blocklist → Layer 2 Hard-Block", async () => {
  // The sender's own domain is on the blocklist passed to the classifier.
  const blocklist = new Set(["casinoly.com"]);
  const promoMail = {
    senderEmail: "promo@casinoly.com",
    senderName: "Casinoly",
    subject: "Dein Bonus wartet",
  };

  const verdict = await classifyMail({
    mail: promoMail,
    blockedDomainSet: blocklist,
    groqApiKey: "",
  });

  expect(verdict.action).toBe("blocked");
  expect(verdict.triggerSource).toBe("domain");
  expect(verdict.features.domainBlocked).toBe(true);
});
|
||||
|
||||
// ─── Relay-Decoded Block ──────────────────────────────────────────────────
|
||||
it("Relay-Decoded: =domain.com in local-part und Domain in Blocklist → relay-decoded Block", async () => {
  // The blocked domain only appears after "=" in the local part of a relay address.
  const blocklist = new Set(["rabona.com"]);
  const bounceMail = {
    senderEmail: "bounce+track=rabona.com@em.sendgrid.net",
    senderName: "Rabona Casino",
    subject: "Exklusiv für dich",
  };

  const verdict = await classifyMail({
    mail: bounceMail,
    blockedDomainSet: blocklist,
    groqApiKey: "",
  });

  expect(verdict.action).toBe("blocked");
  expect(verdict.triggerSource).toBe("relay-decoded");
  expect(verdict.relayDecodedDomain).toBe("rabona.com");
});
|
||||
|
||||
// ─── Layer 3: Score-Block (ohne LLM) ──────────────────────────────────────
|
||||
it("Viele Signale → Score >= 80 → Hard-Block ohne LLM", async () => {
  // Signals stacked in this mail: "Casino" in the sender name, "Jackpot" in
  // the subject, urgency wording, and a money pattern.
  //
  // The previous version created a vi.fn() spy that was never wired in or
  // asserted, so "ohne LLM" was not actually verified. Spy on the global
  // fetch instead and fail loudly if it is used.
  // NOTE(review): assumes the Groq request goes through global fetch — confirm
  // against mail-classifier.ts.
  const fetchSpy = vi
    .spyOn(globalThis, "fetch")
    .mockRejectedValue(new Error("LLM must not be called when score >= 80"));

  try {
    const result = await classifyMail({
      mail: {
        senderEmail: "info@spinz-casino.example",
        senderName: "Casino Jackpot Club",
        subject: "JACKPOT 500€ Freispiele — Nur heute!",
      },
      blockedDomainSet: emptyDomainSet,
      groqApiKey: "should-not-be-called",
    });

    expect(result.action).toBe("blocked");
    expect(result.triggerSource).toMatch(/^score:/);
    expect(result.score).toBeGreaterThanOrEqual(80);
    // A score-based block must be decided before the LLM layer is consulted.
    expect(fetchSpy).not.toHaveBeenCalled();
  } finally {
    // Always restore the real fetch so later tests are unaffected.
    fetchSpy.mockRestore();
  }
});
|
||||
|
||||
// ─── No-Signal → PASS ────────────────────────────────────────────────────
|
||||
it("unauffällige Mail → PASS mit triggerSource 'no-signal'", async () => {
  // An ordinary shipping notification: expected to pass with a score below 25.
  const shippingMail = {
    senderEmail: "newsletter@amazon.de",
    senderName: "Amazon",
    subject: "Deine Bestellung wurde versandt",
  };

  const { action, triggerSource, score } = await classifyMail({
    mail: shippingMail,
    blockedDomainSet: emptyDomainSet,
    groqApiKey: "",
  });

  expect(action).toBe("passed");
  expect(triggerSource).toBe("no-signal");
  expect(score).toBeLessThan(25);
});
|
||||
|
||||
// ─── Brand-Match ohne Random → kein Hard-Block, Score-Erhöhung ───────────
|
||||
it("Brand-Match ohne Random-Tokens → kein Layer-2.5-Block, aber Score-Erhöhung", async () => {
  const plainBrandMail = {
    senderEmail: "info@betandplay.com", // plain info@ address, nothing random
    senderName: "BetandPlay",
    subject: "Willkommen",
  };

  const verdict = await classifyMail({
    mail: plainBrandMail,
    blockedDomainSet: emptyDomainSet,
    groqApiKey: "",
  });

  // Without random tokens the Layer 2.5 hard block must not fire; the brand
  // match is only expected to raise the score.
  expect(verdict.triggerSource).not.toBe("brand+random");

  const flags = verdict.features;
  expect(flags.brandMatch).toBe(true);
  expect(flags.randomTokens).toBe(false);
  // Score >= 35 (BRAND_MATCH_NO_RANDOM); the final outcome depends on other signals.
  expect(flags.score).toBeGreaterThanOrEqual(35);
});
|
||||
|
||||
// ─── Korrekte Feature-Struktur im Result ─────────────────────────────────
|
||||
it("Result-Features enthalten alle erwarteten Keys", async () => {
  // Keys the features object is expected to expose, checked in this order.
  const expectedFeatureKeys = [
    "score",
    "domainBlocked",
    "relayDecoded",
    "brandMatch",
    "randomTokens",
    "keywordHitsSubject",
    "keywordHitsDomain",
    "keywordHitsName",
    "styleFlags",
    "whitelistHit",
  ];

  const verdict = await classifyMail({
    mail: {
      senderEmail: "promo@example.com",
      senderName: null,
      subject: "Test",
    },
    blockedDomainSet: emptyDomainSet,
    groqApiKey: "",
  });

  for (const featureKey of expectedFeatureKeys) {
    expect(verdict.features).toHaveProperty(featureKey);
  }
});
|
||||
});
|
||||
Loading…
x
Reference in New Issue
Block a user