// rebreak-monorepo/backend/server/utils/detect-lang.ts

/**
 * Script-to-language table, checked in order; the first matching entry wins.
 *
 * Order matters: kana is tested before CJK ideographs so that Japanese text
 * (which mixes kana with kanji) is reported as "ja" rather than "zh".
 * The patterns carry no `g`/`y` flag, so `test` keeps no `lastIndex` state and
 * the shared instances are safe to reuse across calls.
 */
const SCRIPT_LANGS: ReadonlyArray<readonly [RegExp, string]> = [
  [/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/, "ar"], // Arabic
  [/[\u0400-\u04FF]/, "ru"], // Cyrillic
  [/[\u3040-\u309F\u30A0-\u30FF]/, "ja"], // Hiragana/Katakana
  [/[\uAC00-\uD7AF]/, "ko"], // Hangul
  [/[\u4E00-\u9FFF]/, "zh"], // CJK Unified Ideographs
  [/[\u0590-\u05FF]/, "he"], // Hebrew
  [/[\u0E00-\u0E7F]/, "th"], // Thai
  // Turkish-specific Latin letters — strong hint without an NLP lib.
  [/[ğĞıİşŞ]/, "tr"],
];

/** Number of leading characters of the text that are inspected. */
const SAMPLE_LENGTH = 300;

/**
 * Detect language from text using Unicode script ranges.
 *
 * A single character from one of the known non-Latin scripts (or one of the
 * Turkish-specific Latin letters) within the first 300 characters decides the
 * result; scripts are tried in a fixed order and the first hit wins. For all
 * other text (de/en/fr/es/it/pt …) we fall back to the supplied locale hint,
 * since telling Latin-script languages apart needs a real NLP library and the
 * user-facing app language is a perfectly good signal.
 *
 * The hint may be a BCP 47 tag ("de-DE") or a POSIX-style locale
 * ("de_DE", "de_DE.UTF-8", "sr@latin"); only its leading language part is
 * used, trimmed and lower-cased.
 *
 * @param text - Text to inspect; may be empty.
 * @param localeHint - Optional locale used when the script gives no answer.
 * @returns A lower-case language code ("ar", "ru", "ja", "ko", "zh", "he",
 *   "th", "tr", or the base language of the hint), or null if neither
 *   detection nor hint apply.
 */
export function detectLang(
  text: string,
  localeHint?: string | null,
): string | null {
  if (text) {
    // Only look at a leading window — enough to spot the script, and cheaper
    // than scanning multi-KB responses.
    const sample = text.slice(0, SAMPLE_LENGTH);
    for (const [pattern, lang] of SCRIPT_LANGS) {
      if (pattern.test(sample)) return lang;
    }
  }

  if (localeHint) {
    // Accept both "-" (BCP 47) and "_" / "." / "@" (POSIX, Java, Android)
    // as the end of the language part. `?? ""` keeps this valid under
    // `noUncheckedIndexedAccess`.
    const base = localeHint.trim().split(/[-_.@]/)[0] ?? "";
    if (base) return base.toLowerCase();
  }

  return null;
}