/**
 * Detect the language of a text using Unicode script ranges.
 *
 * Non-Latin scripts are recognised as soon as a single character from the
 * script appears in the sampled prefix. Latin-script languages
 * (de/en/fr/es/it/pt …) cannot be told apart this way without a real NLP
 * library, so for those the function falls back to the supplied locale hint
 * (typically the user's app language). Turkish is the one exception: its
 * distinctive letters are treated as a strong enough signal on their own.
 *
 * Script checks run in a fixed order and the first match wins; in particular
 * kana is checked before CJK ideographs so that Japanese text containing
 * kanji is reported as "ja" rather than "zh".
 *
 * @param text - The text to inspect. Only the first 300 UTF-16 code units
 *   are examined.
 * @param localeHint - Optional locale such as "de", "de-DE", "en_US" or
 *   "en_US.UTF-8". Only its primary language subtag is used, lower-cased.
 * @returns A lower-case language code ("ar", "ru", "ja", "ko", "zh", "he",
 *   "th", "tr", or the primary subtag of the hint), or `null` if neither
 *   detection nor the hint yields a result.
 */
export function detectLang(
  text: string,
  localeHint?: string | null,
): string | null {
  if (text) {
    // Sample a window — the first 300 chars are plenty, and testing a short
    // prefix is cheaper than scanning multi-KB responses.
    const sample = text.slice(0, 300);

    if (/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/.test(sample)) return "ar"; // Arabic
    if (/[\u0400-\u04FF]/.test(sample)) return "ru"; // Cyrillic
    if (/[\u3040-\u309F\u30A0-\u30FF]/.test(sample)) return "ja"; // Hiragana/Katakana
    if (/[\uAC00-\uD7AF]/.test(sample)) return "ko"; // Hangul
    if (/[\u4E00-\u9FFF]/.test(sample)) return "zh"; // CJK Unified Ideographs
    if (/[\u0590-\u05FF]/.test(sample)) return "he"; // Hebrew
    if (/[\u0E00-\u0E7F]/.test(sample)) return "th"; // Thai

    // Turkish-specific Latin letters — strong hint without an NLP lib.
    if (/[ğĞıİşŞ]/.test(sample)) return "tr";
  }

  if (localeHint) {
    // Accept BCP 47 ("de-DE") as well as POSIX-style ("de_DE", "de_DE.UTF-8",
    // "sr_RS@latin") locale names: everything from the first separator on is
    // dropped so only the primary language subtag remains. The destructuring
    // default keeps this type-safe under `noUncheckedIndexedAccess`.
    const [primary = ""] = localeHint.trim().split(/[-_.@]/);
    const base = primary.toLowerCase();
    if (base) return base;
  }

  return null;
}