detectLanguage: check the text's script for definitive language categorization for some languages

This commit is contained in:
Alex Gleason 2025-02-04 15:05:52 -06:00
parent 47d004d92a
commit 2f2cb2c4fc
No known key found for this signature in database
GPG key ID: 7211D1F99744FBB7
2 changed files with 39 additions and 6 deletions

View file

@ -26,3 +26,18 @@ Deno.test('Detect English language', () => {
'en', 'en',
); );
}); });
Deno.test('Detects definitive texts', () => {
  // NOTE: every call passes `1` as the min confidence to test only the definitive patterns.
  // Each entry is [input text, expected detection result].
  const cases: [string, ReturnType<typeof detectLanguage>][] = [
    // unambiguous: the script alone identifies the language
    ['안녕하세요.', 'ko'],
    ['Γειά σου!', 'el'],
    ['שלום!', 'he'],
    ['こんにちは。', 'ja'],
    // ambiguous: no language is expected to be reported
    ['你好', undefined],
    ['Привет', undefined],
    ['Hello', undefined],
  ];

  for (const [input, expected] of cases) {
    assertEquals(detectLanguage(input, 1), expected);
  }
});

View file

@ -4,8 +4,9 @@ import linkify from 'linkifyjs';
linkify.registerCustomProtocol('nostr', true); linkify.registerCustomProtocol('nostr', true);
/** Returns the detected language if the confidence is greater or equal than 'minConfidence' /**
* 'minConfidence' must be a number between 0 and 1, such as 0.95 * Returns the detected language if the confidence is greater or equal than 'minConfidence'.
* 'minConfidence' must be a number between 0 and 1, such as 0.95.
*/ */
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined { export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
// It's better to remove the emojis first // It's better to remove the emojis first
@ -15,13 +16,31 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '), .replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '),
).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim(); ).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
// Definitive script-based patterns for some languages.
// Text which matches MUST unambiguously be in the given language.
// This is only possible for some languages.
// All patterns match the full text, so mixed scripts would fail these tests.
const languagePatterns: Partial<Record<LanguageCode, RegExp>> = {
  ko: /^[\p{Script=Hangul}\s]+$/u, // Korean (Hangul only)
  el: /^[\p{Script=Greek}\s]+$/u, // Greek
  he: /^[\p{Script=Hebrew}\s]+$/u, // Hebrew
  ja: /^(?=.*[\p{Script=Hiragana}\p{Script=Katakana}])[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}\s]+$/u, // Japanese (requires at least one Kana)
  // zh: not possible to detect unambiguously
};
// Match against the sanitized text, not the raw input: the patterns must match
// the whole string, so anything the sanitization step drops (non-'text' linkify
// tokens, whitespace runs collapsed to a single space) would otherwise make an
// unambiguous text fail. Collapsed whitespace also keeps the `ja` lookahead,
// whose `.` does not cross newlines, working for multi-line input.
// Punctuation and symbols are stripped once here instead of once per pattern.
const scriptOnlyText = sanitizedText.replace(/[\p{P}\p{S}]/gu, '');
// If any pattern matches, the language is known.
for (const [lang, pattern] of Object.entries(languagePatterns) as [LanguageCode, RegExp][]) {
  if (pattern.test(scriptOnlyText)) {
    return lang;
  }
}
if (sanitizedText.length < 10) { // heuristics if (sanitizedText.length < 10) { // heuristics
return; return;
} }
const [topResult] = lande( const [topResult] = lande(sanitizedText);
sanitizedText,
);
if (topResult) { if (topResult) {
const [iso6393, confidence] = topResult; const [iso6393, confidence] = topResult;
const locale = new Intl.Locale(iso6393); const locale = new Intl.Locale(iso6393);
@ -30,5 +49,4 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
return locale.language as LanguageCode; return locale.language as LanguageCode;
} }
} }
return;
} }