detectLanguage: strip numbers from text before matching language patterns

This commit is contained in:
Alex Gleason 2025-02-06 18:42:29 -06:00
parent 86ffa7f0cc
commit 00e10eb19f
No known key found for this signature in database
GPG key ID: 7211D1F99744FBB7
2 changed files with 16 additions and 4 deletions

View file

@@ -35,6 +35,13 @@ Deno.test('Detects definitive texts', () => {
assertEquals(detectLanguage('Γειά σου!', 1), 'el');
assertEquals(detectLanguage('שלום!', 1), 'he');
assertEquals(detectLanguage('こんにちは。', 1), 'ja');
assertEquals(
detectLanguage(
'最近、長女から「中学生男子全員クソ」という話を良く聞き中学生女子側の視点が分かってよかった。父からは「中学生男子は自分がクソだということを3年間かかって学習するんだよ」と言っておいた',
1,
),
'ja',
);
// ambiguous
assertEquals(detectLanguage('你好', 1), undefined);

View file

@@ -12,9 +12,10 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
// It's better to remove the emojis first
const sanitizedText = linkify.tokenize(
text
.replaceAll(/\p{Extended_Pictographic}/gu, '')
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '),
).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
.replaceAll(/\p{Extended_Pictographic}/gu, '') // strip emojis
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '), // strip invisible characters
)
.reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
// Definite patterns for some languages.
// Text which matches MUST unambiguously be in the given language.
@@ -30,7 +31,11 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
// If any pattern matches, the language is known.
for (const [lang, pattern] of Object.entries(languagePatterns) as [LanguageCode, RegExp][]) {
if (pattern.test(text.replace(/[\p{P}\p{S}]/gu, ''))) { // strip punctuation and symbols before checking
const text = sanitizedText
.replaceAll(/[\p{P}\p{S}]/gu, '') // strip punctuation and symbols
.replaceAll(/\p{N}/gu, ''); // strip numbers
if (pattern.test(text)) {
return lang;
}
}