diff --git a/src/utils/language.test.ts b/src/utils/language.test.ts index 60b844f0..66a26edd 100644 --- a/src/utils/language.test.ts +++ b/src/utils/language.test.ts @@ -35,6 +35,13 @@ Deno.test('Detects definitive texts', () => { assertEquals(detectLanguage('Γειά σου!', 1), 'el'); assertEquals(detectLanguage('שלום!', 1), 'he'); assertEquals(detectLanguage('こんにちは。', 1), 'ja'); + assertEquals( + detectLanguage( + '最近、長女から「中学生男子全員クソ」という話を良く聞き中学生女子側の視点が分かってよかった。父からは「中学生男子は自分がクソだということを3年間かかって学習するんだよ」と言っておいた', + 1, + ), + 'ja', + ); // ambiguous assertEquals(detectLanguage('你好', 1), undefined); diff --git a/src/utils/language.ts b/src/utils/language.ts index b95e3e78..9a713122 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -12,9 +12,10 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod // It's better to remove the emojis first const sanitizedText = linkify.tokenize( text - .replaceAll(/\p{Extended_Pictographic}/gu, '') - .replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '), - ).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim(); + .replaceAll(/\p{Extended_Pictographic}/gu, '') // strip emojis + .replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '), // strip invisible characters + ) + .reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim(); // Definite patterns for some languages. // Text which matches MUST unambiguously be in the given language. @@ -30,7 +31,11 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod // If any pattern matches, the language is known. for (const [lang, pattern] of Object.entries(languagePatterns) as [LanguageCode, RegExp][]) { - if (pattern.test(text.replace(/[\p{P}\p{S}]/gu, ''))) { // strip punctuation and symbols before checking + const text = sanitizedText + .replaceAll(/[\p{P}\p{S}]/gu, '') // strip punctuation and symbols + .replaceAll(/\p{N}/gu, ''); // strip numbers + + if (pattern.test(text)) { return lang; } }