From 2f2cb2c4fcb2b4c5a6cefb9277c2eabb4f282c96 Mon Sep 17 00:00:00 2001 From: Alex Gleason Date: Tue, 4 Feb 2025 15:05:52 -0600 Subject: [PATCH] detectLanguage: check the text's script for definitive language categorization for some languages --- src/utils/language.test.ts | 15 +++++++++++++++ src/utils/language.ts | 30 ++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/utils/language.test.ts b/src/utils/language.test.ts index 255f6b58..f4025290 100644 --- a/src/utils/language.test.ts +++ b/src/utils/language.test.ts @@ -26,3 +26,18 @@ Deno.test('Detect English language', () => { 'en', ); }); + +Deno.test('Detects definitive texts', () => { + // NOTE: pass `1` as min confidence to test only the definitive patterns + + // unambiguous + assertEquals(detectLanguage('안녕하세요.', 1), 'ko'); + assertEquals(detectLanguage('Γειά σου!', 1), 'el'); + assertEquals(detectLanguage('שלום!', 1), 'he'); + assertEquals(detectLanguage('こんにちは。', 1), 'ja'); + + // ambiguous + assertEquals(detectLanguage('你好', 1), undefined); + assertEquals(detectLanguage('Привет', 1), undefined); + assertEquals(detectLanguage('Hello', 1), undefined); +}); diff --git a/src/utils/language.ts b/src/utils/language.ts index 8af8ddf9..4b6e3807 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -4,8 +4,9 @@ import linkify from 'linkifyjs'; linkify.registerCustomProtocol('nostr', true); -/** Returns the detected language if the confidence is greater or equal than 'minConfidence' - * 'minConfidence' must be a number between 0 and 1, such as 0.95 +/** + * Returns the detected language if the confidence is greater than or equal to 'minConfidence'. + * 'minConfidence' must be a number between 0 and 1, such as 0.95.
*/ export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined { // It's better to remove the emojis first @@ -15,13 +16,31 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod .replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '), ).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim(); + // Definite patterns for some languages. + // Text which matches MUST unambiguously be in the given language. + // This is only possible for some languages. + // All patterns match the full text, so mixed scripts would fail these tests. + const languagePatterns: Partial<Record<LanguageCode, RegExp>> = { + ko: /^[\p{Script=Hangul}\s]+$/u, // Korean (Hangul only) + el: /^[\p{Script=Greek}\s]+$/u, // Greek + he: /^[\p{Script=Hebrew}\s]+$/u, // Hebrew + ja: /^(?=.*[\p{Script=Hiragana}\p{Script=Katakana}])[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}\s]+$/u, // Japanese (requires at least one Kana) + // zh: not possible to detect unambiguously + }; + + // If any pattern matches, the language is known. + for (const [lang, pattern] of Object.entries(languagePatterns) as [LanguageCode, RegExp][]) { + if (pattern.test(text.replace(/[\p{P}\p{S}]/gu, ''))) { // strip punctuation and symbols before checking + return lang; + } + } + if (sanitizedText.length < 10) { // heuristics return; } - const [topResult] = lande( - sanitizedText, - ); + const [topResult] = lande(sanitizedText); + if (topResult) { const [iso6393, confidence] = topResult; const locale = new Intl.Locale(iso6393); @@ -30,5 +49,4 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod return locale.language as LanguageCode; } } - return; }