From 12e8eaf340477f50fb1c14827f22169425b65a79 Mon Sep 17 00:00:00 2001 From: Siddharth Singh Date: Fri, 27 Dec 2024 04:55:51 +0530 Subject: [PATCH 1/5] add task to download latest lingua-wasm blob --- deno.json | 1 + 1 file changed, 1 insertion(+) diff --git a/deno.json b/deno.json index b37f7e8c..b5818a5e 100644 --- a/deno.json +++ b/deno.json @@ -19,6 +19,7 @@ "setup:kind0": "deno run -A --env-file --deny-read=.env scripts/setup-kind0.ts", "stats:recompute": "deno run -A --env-file --deny-read=.env scripts/stats-recompute.ts", "soapbox": "curl -O https://dl.soapbox.pub/main/soapbox.zip && mkdir -p public && mv soapbox.zip public/ && cd public/ && unzip -o soapbox.zip && rm soapbox.zip", + "lingua": "mkdir -p data/lingua && cd data/lingua && curl -# -o lingua-wasm.zip -L https://github.com/xyzshantaram/lingua-wasm/releases/latest/download/lingua-wasm.zip && unzip -o lingua-wasm.zip && rm lingua-wasm.zip", "trends": "deno run -A --env-file --deny-read=.env scripts/trends.ts", "clean:deps": "deno cache --reload src/app.ts", "db:populate-search": "deno run -A --env-file --deny-read=.env scripts/db-populate-search.ts", From 3812c1d3e6287dca7f0a0ed6236b07db9a33e9a7 Mon Sep 17 00:00:00 2001 From: Siddharth Singh Date: Fri, 27 Dec 2024 04:56:03 +0530 Subject: [PATCH 2/5] add language detection conf option --- src/config.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/config.ts b/src/config.ts index 68bf3ed8..73173a8c 100644 --- a/src/config.ts +++ b/src/config.ts @@ -303,6 +303,9 @@ class Conf { static get translationProvider(): string | undefined { return Deno.env.get('TRANSLATION_PROVIDER'); } + static get languageDetector(): string { + return Deno.env.get('DITTO_LANG_DETECTOR') || 'lande'; + } /** DeepL URL endpoint. */ static get deeplBaseUrl(): string | undefined { return Deno.env.get('DEEPL_BASE_URL'); From 86e816ae671ad6ae9e79d3b87e54c5d69c02395c Mon Sep 17 00:00:00 2001 From: Siddharth Singh Date: Fri, 27 Dec 2024 04:56:28 +0530 Subject: [PATCH 3/5] add lingua lang detection --- src/pipeline.ts | 2 +- src/utils/language.ts | 92 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 79 insertions(+), 15 deletions(-) diff --git a/src/pipeline.ts b/src/pipeline.ts index 5becff20..14f684ff 100644 --- a/src/pipeline.ts +++ b/src/pipeline.ts @@ -234,7 +234,7 @@ async function parseMetadata(event: NostrEvent, signal: AbortSignal): Promise { if (event.kind !== 1) return; - const language = detectLanguage(event.content, 0.90); + const language = await detectLanguage(event.content, 0.90); if (!language) return; const kysely = await Storages.kysely(); diff --git a/src/utils/language.ts b/src/utils/language.ts index 8af8ddf9..e317a263 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -1,34 +1,98 @@ import ISO6391, { type LanguageCode } from 'iso-639-1'; import lande from 'lande'; import linkify from 'linkifyjs'; +import { Conf } from '@/config.ts'; + +declare class LinguaDetector { + private worker: Worker; + private pending: Map>>; + destroyed: boolean; + + /** + * Instantiate a new LanguageDetector. + * @param workerPath Path to the worker file. If not supplied, it defaults to using the lingua-wasm build present with this package. + * The worker is simply posted messages with a detection id and the string to detect. It must then respond with the same detection id and the detected language (or undefined.) + */ + constructor(workerPath?: string); + + /** + * Detect the language of a string. + * @param str The string to detect for. + * @returns A Promise that resolves to an ISO-639-3 language code. + */ + detect(str: string): Promise; + + /** + * Checks if the LanguageDetector has been destroyed. + * @throws Error if the detector has been destroyed. + */ + private checkDestroyed(): void; + + /** + * Destroys the LanguageDetector instance and terminates the associated Worker. + */ + destroy(): void; +} + +const toLanguageCode = (iso6393: string) => { + const locale = new Intl.Locale(iso6393); + if (ISO6391.validate(locale.language)) return locale.language as LanguageCode; +}; + +function detectWithLande(sanitized: string, threshold: number) { + const [topResult] = lande(sanitized); + if (topResult) { + const [iso6393, confidence] = topResult; + + if (confidence >= threshold) { + return toLanguageCode(iso6393); + } + } +} + +const linguaDirUrl = (file: string) => { + const normalized = new URL('../../data/lingua/' + file, import.meta.url); + return normalized.toString(); +}; + +let linguaInstance: LinguaDetector | undefined = undefined; + +async function detectWithLingua(text: string) { + try { + if (!linguaInstance) { + const { LanguageDetector } = await import(linguaDirUrl('mod.ts')); + linguaInstance = new LanguageDetector(linguaDirUrl('src/worker.js')) as unknown as LinguaDetector; + globalThis.addEventListener('unload', () => linguaInstance?.destroy()); + } + const result = await linguaInstance.detect(text); + return toLanguageCode(result); + } catch (e) { + console.error(e); + return null; + } +} linkify.registerCustomProtocol('nostr', true); /** Returns the detected language if the confidence is greater or equal than 'minConfidence' * 'minConfidence' must be a number between 0 and 1, such as 0.95 */ -export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined { +export async function detectLanguage(text: string, threshold: number): Promise { // It's better to remove the emojis first - const sanitizedText = linkify.tokenize( + const sanitized = linkify.tokenize( text .replaceAll(/\p{Extended_Pictographic}/gu, '') .replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '), ).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim(); - if (sanitizedText.length < 10) { // heuristics + if (sanitized.length < 10) { // heuristics return; } - const [topResult] = lande( - sanitizedText, - ); - if (topResult) { - const [iso6393, confidence] = topResult; - const locale = new Intl.Locale(iso6393); - - if (confidence >= minConfidence && ISO6391.validate(locale.language)) { - return locale.language as LanguageCode; - } + if (Conf.languageDetector === 'lingua') { + const detected = await detectWithLingua(sanitized); + if (detected) return detected; } - return; + + return detectWithLande(sanitized, threshold); } From b6e41f4eaf33149b3e74580fcea617ecb8b95968 Mon Sep 17 00:00:00 2001 From: Siddharth Singh Date: Fri, 27 Dec 2024 04:56:38 +0530 Subject: [PATCH 4/5] rewrite and expand tests --- src/utils/language.test.ts | 91 +++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 25 deletions(-) diff --git a/src/utils/language.test.ts b/src/utils/language.test.ts index 255f6b58..b9563bf0 100644 --- a/src/utils/language.test.ts +++ b/src/utils/language.test.ts @@ -1,28 +1,69 @@ import { detectLanguage } from '@/utils/language.ts'; -import { assertEquals } from '@std/assert'; +import { assertEquals, assertNotEquals } from '@std/assert'; +import { Conf } from '@/config.ts'; -Deno.test('Detect English language', () => { - assertEquals(detectLanguage(``, 0.90), undefined); - assertEquals(detectLanguage(`Good morning my fellow friends`, 0.90), 'en'); - assertEquals( - detectLanguage( - `Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`, - 0.90, - ), - 'en', - ); - assertEquals( - detectLanguage( - `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`, - 0.90, - ), - 'en', - ); - assertEquals( - detectLanguage( - `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u đŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸Ž https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`, - 0.90, - ), - 'en', - ); +Deno.test('Tests for language detection', async (t) => { + await t.step('Empty string should return undefined', async () => { + assertEquals(await detectLanguage(``, 0.90), undefined); + }); + + await t.step('Regular English string should be detected', async () => { + assertEquals(await detectLanguage(`Good morning my fellow friends`, 0.90), 'en'); + }); + + await t.step('nostr event id should be ignored', async () => { + assertEquals( + await detectLanguage( + `Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`, + 0.90, + ), + 'en', + ); + }); + + await t.step('URLs should be ignored', async () => { + assertEquals( + await detectLanguage( + `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`, + 0.90, + ), + 'en', + ); + }); + + await t.step('Emoji should be ignored', async () => { + assertEquals( + await detectLanguage( + `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u đŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸ŽđŸ˜‚đŸ’¯â™ĄâŒ¨ī¸Ž https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`, + 0.90, + ), + 'en', + ); + }); + + await t.step('The horrific problem sentence', async () => { + switch (Conf.languageDetector) { + case 'lingua': + assertEquals( + await detectLanguage(`It may die when I die, and that's okay. It's my earnings.`, 0.90), + 'en', + ); + break; + default: + assertNotEquals( + await detectLanguage(`It may die when I die, and that's okay. It's my earnings.`, 0.90), + 'en', + ); + break; + } + }); + + await t.step('The horrific problem sentence', async () => { + const tester = Conf.languageDetector === 'lingua' ? assertEquals : assertNotEquals; + tester(await detectLanguage(`It may die when I die, and that's okay. It's my earnings.`, 0.90), 'en'); + }); + // + await t.step('Should detect Hindi sentences', async () => { + assertEquals(await detectLanguage(`ā¤ŽāĨˆ ā¤Ąā¤ŋ⤟āĨā¤ŸāĨ‹ ⤕āĨ€ ⤍⤝āĨ€ ⤅⤍āĨā¤ĩā¤žā¤Ļ ⤏āĨā¤ĩā¤ŋā¤§ā¤ž ⤕āĨ‹ ⤆āĨ›ā¤Žā¤ž ā¤°ā¤šā¤ž ā¤šāĨ‚⤁`, 0.80), 'hi'); + }); }); From 8a2ac8f7f67e0f91c0b248fa6009666959948de5 Mon Sep 17 00:00:00 2001 From: Siddharth Singh Date: Fri, 27 Dec 2024 05:09:29 +0530 Subject: [PATCH 5/5] fix copied test --- src/utils/language.test.ts | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/utils/language.test.ts b/src/utils/language.test.ts index b9563bf0..cffd3d50 100644 --- a/src/utils/language.test.ts +++ b/src/utils/language.test.ts @@ -41,23 +41,6 @@ Deno.test('Tests for language detection', async (t) => { ); }); - await t.step('The horrific problem sentence', async () => { - switch (Conf.languageDetector) { - case 'lingua': - assertEquals( - await detectLanguage(`It may die when I die, and that's okay. It's my earnings.`, 0.90), - 'en', - ); - break; - default: - assertNotEquals( - await detectLanguage(`It may die when I die, and that's okay. It's my earnings.`, 0.90), - 'en', - ); - break; - } - }); - await t.step('The horrific problem sentence', async () => { const tester = Conf.languageDetector === 'lingua' ? assertEquals : assertNotEquals; tester(await detectLanguage(`It may die when I die, and that's okay. It's my earnings.`, 0.90), 'en');