mirror of
https://gitlab.com/soapbox-pub/ditto.git
synced 2025-12-06 03:19:46 +00:00
Merge branch 'definitive-languages' into 'main'
detectLanguage: check the text's script for definitive language categorization for some languages. See merge request soapbox-pub/ditto!635
This commit is contained in:
commit
e58d0af691
2 changed files with 39 additions and 6 deletions
|
|
@ -26,3 +26,18 @@ Deno.test('Detect English language', () => {
|
|||
'en',
|
||||
);
|
||||
});
|
||||
|
||||
Deno.test('Detects definitive texts', () => {
|
||||
// NOTE: pass `1` as min confidence to test only the definitive patterns
|
||||
|
||||
// unambiguous
|
||||
assertEquals(detectLanguage('안녕하세요.', 1), 'ko');
|
||||
assertEquals(detectLanguage('Γειά σου!', 1), 'el');
|
||||
assertEquals(detectLanguage('שלום!', 1), 'he');
|
||||
assertEquals(detectLanguage('こんにちは。', 1), 'ja');
|
||||
|
||||
// ambiguous
|
||||
assertEquals(detectLanguage('你好', 1), undefined);
|
||||
assertEquals(detectLanguage('Привет', 1), undefined);
|
||||
assertEquals(detectLanguage('Hello', 1), undefined);
|
||||
});
|
||||
|
|
|
|||
|
|
@ -4,8 +4,9 @@ import linkify from 'linkifyjs';
|
|||
|
||||
linkify.registerCustomProtocol('nostr', true);
|
||||
|
||||
/** Returns the detected language if the confidence is greater or equal than 'minConfidence'
|
||||
* 'minConfidence' must be a number between 0 and 1, such as 0.95
|
||||
/**
|
||||
* Returns the detected language if the confidence is greater than or equal to 'minConfidence'.
|
||||
* 'minConfidence' must be a number between 0 and 1, such as 0.95.
|
||||
*/
|
||||
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
|
||||
// It's better to remove the emojis first
|
||||
|
|
@ -15,13 +16,31 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
|
|||
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '),
|
||||
).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
|
||||
|
||||
// Definite patterns for some languages.
|
||||
// Text which matches MUST unambiguously be in the given language.
|
||||
// This is only possible for some languages.
|
||||
// All patterns match the full text, so mixed scripts would fail these tests.
|
||||
const languagePatterns: Partial<Record<LanguageCode, RegExp>> = {
|
||||
ko: /^[\p{Script=Hangul}\s]+$/u, // Korean (Hangul only)
|
||||
el: /^[\p{Script=Greek}\s]+$/u, // Greek
|
||||
he: /^[\p{Script=Hebrew}\s]+$/u, // Hebrew
|
||||
ja: /^(?=.*[\p{Script=Hiragana}\p{Script=Katakana}])[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}\s]+$/u, // Japanese (requires at least one Kana)
|
||||
// zh: not possible to detect unambiguously
|
||||
};
|
||||
|
||||
// If any pattern matches, the language is known.
|
||||
for (const [lang, pattern] of Object.entries(languagePatterns) as [LanguageCode, RegExp][]) {
|
||||
if (pattern.test(text.replace(/[\p{P}\p{S}]/gu, ''))) { // strip punctuation and symbols before checking
|
||||
return lang;
|
||||
}
|
||||
}
|
||||
|
||||
if (sanitizedText.length < 10) { // heuristics
|
||||
return;
|
||||
}
|
||||
|
||||
const [topResult] = lande(
|
||||
sanitizedText,
|
||||
);
|
||||
const [topResult] = lande(sanitizedText);
|
||||
|
||||
if (topResult) {
|
||||
const [iso6393, confidence] = topResult;
|
||||
const locale = new Intl.Locale(iso6393);
|
||||
|
|
@ -30,5 +49,4 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
|
|||
return locale.language as LanguageCode;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue