mirror of
https://gitlab.com/soapbox-pub/ditto.git
synced 2025-12-06 11:29:46 +00:00
Merge branch 'definitive-languages' into 'main'
detectLanguage: check the text's script for definitive language categorization for some languages. See merge request soapbox-pub/ditto!635
This commit is contained in:
commit
e58d0af691
2 changed files with 39 additions and 6 deletions
|
|
@ -26,3 +26,18 @@ Deno.test('Detect English language', () => {
|
||||||
'en',
|
'en',
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Deno.test('Detects definitive texts', () => {
|
||||||
|
// NOTE: pass `1` as min confidence to test only the definitive patterns
|
||||||
|
|
||||||
|
// unambiguous
|
||||||
|
assertEquals(detectLanguage('안녕하세요.', 1), 'ko');
|
||||||
|
assertEquals(detectLanguage('Γειά σου!', 1), 'el');
|
||||||
|
assertEquals(detectLanguage('שלום!', 1), 'he');
|
||||||
|
assertEquals(detectLanguage('こんにちは。', 1), 'ja');
|
||||||
|
|
||||||
|
// ambiguous
|
||||||
|
assertEquals(detectLanguage('你好', 1), undefined);
|
||||||
|
assertEquals(detectLanguage('Привет', 1), undefined);
|
||||||
|
assertEquals(detectLanguage('Hello', 1), undefined);
|
||||||
|
});
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,9 @@ import linkify from 'linkifyjs';
|
||||||
|
|
||||||
linkify.registerCustomProtocol('nostr', true);
|
linkify.registerCustomProtocol('nostr', true);
|
||||||
|
|
||||||
/** Returns the detected language if the confidence is greater or equal than 'minConfidence'
|
/**
|
||||||
* 'minConfidence' must be a number between 0 and 1, such as 0.95
|
* Returns the detected language if the confidence is greater than or equal to 'minConfidence'.
|
||||||
|
* 'minConfidence' must be a number between 0 and 1, such as 0.95.
|
||||||
*/
|
*/
|
||||||
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
|
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
|
||||||
// It's better to remove the emojis first
|
// It's better to remove the emojis first
|
||||||
|
|
@ -15,13 +16,31 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
|
||||||
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '),
|
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '),
|
||||||
).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
|
).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
|
||||||
|
|
||||||
|
// Definite patterns for some languages.
|
||||||
|
// Text which matches MUST unambiguously be in the given language.
|
||||||
|
// This is only possible for some languages.
|
||||||
|
// All patterns match the full text, so mixed scripts would fail these tests.
|
||||||
|
const languagePatterns: Partial<Record<LanguageCode, RegExp>> = {
|
||||||
|
ko: /^[\p{Script=Hangul}\s]+$/u, // Korean (Hangul only)
|
||||||
|
el: /^[\p{Script=Greek}\s]+$/u, // Greek
|
||||||
|
he: /^[\p{Script=Hebrew}\s]+$/u, // Hebrew
|
||||||
|
ja: /^(?=.*[\p{Script=Hiragana}\p{Script=Katakana}])[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}\s]+$/u, // Japanese (requires at least one Kana)
|
||||||
|
// zh: not possible to detect unambiguously
|
||||||
|
};
|
||||||
|
|
||||||
|
// If any pattern matches, the language is known.
|
||||||
|
for (const [lang, pattern] of Object.entries(languagePatterns) as [LanguageCode, RegExp][]) {
|
||||||
|
if (pattern.test(text.replace(/[\p{P}\p{S}]/gu, ''))) { // strip punctuation and symbols before checking
|
||||||
|
return lang;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (sanitizedText.length < 10) { // heuristics
|
if (sanitizedText.length < 10) { // heuristics
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const [topResult] = lande(
|
const [topResult] = lande(sanitizedText);
|
||||||
sanitizedText,
|
|
||||||
);
|
|
||||||
if (topResult) {
|
if (topResult) {
|
||||||
const [iso6393, confidence] = topResult;
|
const [iso6393, confidence] = topResult;
|
||||||
const locale = new Intl.Locale(iso6393);
|
const locale = new Intl.Locale(iso6393);
|
||||||
|
|
@ -30,5 +49,4 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
|
||||||
return locale.language as LanguageCode;
|
return locale.language as LanguageCode;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue