mirror of
https://gitlab.com/soapbox-pub/ditto.git
synced 2025-12-06 11:29:46 +00:00
detectLanguage: strip numbers from text before matching language patterns
This commit is contained in:
parent
86ffa7f0cc
commit
00e10eb19f
2 changed files with 16 additions and 4 deletions
|
|
@@ -35,6 +35,13 @@ Deno.test('Detects definitive texts', () => {
|
|||
assertEquals(detectLanguage('Γειά σου!', 1), 'el');
|
||||
assertEquals(detectLanguage('שלום!', 1), 'he');
|
||||
assertEquals(detectLanguage('こんにちは。', 1), 'ja');
|
||||
assertEquals(
|
||||
detectLanguage(
|
||||
'最近、長女から「中学生男子全員クソ」という話を良く聞き中学生女子側の視点が分かってよかった。父からは「中学生男子は自分がクソだということを3年間かかって学習するんだよ」と言っておいた',
|
||||
1,
|
||||
),
|
||||
'ja',
|
||||
);
|
||||
|
||||
// ambiguous
|
||||
assertEquals(detectLanguage('你好', 1), undefined);
|
||||
|
|
|
|||
|
|
@@ -12,9 +12,10 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
|
|||
// It's better to remove the emojis first
|
||||
const sanitizedText = linkify.tokenize(
|
||||
text
|
||||
.replaceAll(/\p{Extended_Pictographic}/gu, '')
|
||||
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '),
|
||||
).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
|
||||
.replaceAll(/\p{Extended_Pictographic}/gu, '') // strip emojis
|
||||
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '), // strip invisible characters
|
||||
)
|
||||
.reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
|
||||
|
||||
// Definite patterns for some languages.
|
||||
// Text which matches MUST unambiguously be in the given language.
|
||||
|
|
@@ -30,7 +31,11 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod
|
|||
|
||||
// If any pattern matches, the language is known.
|
||||
for (const [lang, pattern] of Object.entries(languagePatterns) as [LanguageCode, RegExp][]) {
|
||||
if (pattern.test(text.replace(/[\p{P}\p{S}]/gu, ''))) { // strip punctuation and symbols before checking
|
||||
const text = sanitizedText
|
||||
.replaceAll(/[\p{P}\p{S}]/gu, '') // strip punctuation and symbols
|
||||
.replaceAll(/\p{N}/gu, ''); // strip numbers
|
||||
|
||||
if (pattern.test(text)) {
|
||||
return lang;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue