mirror of
https://gitlab.com/soapbox-pub/ditto.git
synced 2025-12-06 11:29:46 +00:00
Merge branch 'languagedetector-lingua' into 'main'
Add lingua backend for language detection. Closes #250. See merge request soapbox-pub/ditto!610
This commit is contained in:
commit
2bdf100537
5 changed files with 132 additions and 40 deletions
|
|
@ -19,6 +19,7 @@
|
||||||
"setup:kind0": "deno run -A --env-file --deny-read=.env scripts/setup-kind0.ts",
|
"setup:kind0": "deno run -A --env-file --deny-read=.env scripts/setup-kind0.ts",
|
||||||
"stats:recompute": "deno run -A --env-file --deny-read=.env scripts/stats-recompute.ts",
|
"stats:recompute": "deno run -A --env-file --deny-read=.env scripts/stats-recompute.ts",
|
||||||
"soapbox": "curl -O https://dl.soapbox.pub/main/soapbox.zip && mkdir -p public && mv soapbox.zip public/ && cd public/ && unzip -o soapbox.zip && rm soapbox.zip",
|
"soapbox": "curl -O https://dl.soapbox.pub/main/soapbox.zip && mkdir -p public && mv soapbox.zip public/ && cd public/ && unzip -o soapbox.zip && rm soapbox.zip",
|
||||||
|
"lingua": "mkdir -p data/lingua && cd data/lingua && curl -# -o lingua-wasm.zip -L https://github.com/xyzshantaram/lingua-wasm/releases/latest/download/lingua-wasm.zip && unzip -o lingua-wasm.zip && rm lingua-wasm.zip",
|
||||||
"trends": "deno run -A --env-file --deny-read=.env scripts/trends.ts",
|
"trends": "deno run -A --env-file --deny-read=.env scripts/trends.ts",
|
||||||
"clean:deps": "deno cache --reload src/app.ts",
|
"clean:deps": "deno cache --reload src/app.ts",
|
||||||
"db:populate-search": "deno run -A --env-file --deny-read=.env scripts/db-populate-search.ts",
|
"db:populate-search": "deno run -A --env-file --deny-read=.env scripts/db-populate-search.ts",
|
||||||
|
|
|
||||||
|
|
@ -303,6 +303,9 @@ class Conf {
|
||||||
static get translationProvider(): string | undefined {
|
static get translationProvider(): string | undefined {
|
||||||
return Deno.env.get('TRANSLATION_PROVIDER');
|
return Deno.env.get('TRANSLATION_PROVIDER');
|
||||||
}
|
}
|
||||||
|
static get languageDetector(): string {
|
||||||
|
return Deno.env.get('DITTO_LANG_DETECTOR') || 'lande';
|
||||||
|
}
|
||||||
/** DeepL URL endpoint. */
|
/** DeepL URL endpoint. */
|
||||||
static get deeplBaseUrl(): string | undefined {
|
static get deeplBaseUrl(): string | undefined {
|
||||||
return Deno.env.get('DEEPL_BASE_URL');
|
return Deno.env.get('DEEPL_BASE_URL');
|
||||||
|
|
|
||||||
|
|
@ -234,7 +234,7 @@ async function parseMetadata(event: NostrEvent, signal: AbortSignal): Promise<vo
|
||||||
async function setLanguage(event: NostrEvent): Promise<void> {
|
async function setLanguage(event: NostrEvent): Promise<void> {
|
||||||
if (event.kind !== 1) return;
|
if (event.kind !== 1) return;
|
||||||
|
|
||||||
const language = detectLanguage(event.content, 0.90);
|
const language = await detectLanguage(event.content, 0.90);
|
||||||
if (!language) return;
|
if (!language) return;
|
||||||
|
|
||||||
const kysely = await Storages.kysely();
|
const kysely = await Storages.kysely();
|
||||||
|
|
|
||||||
|
|
@ -1,28 +1,52 @@
|
||||||
import { detectLanguage } from '@/utils/language.ts';
|
import { detectLanguage } from '@/utils/language.ts';
|
||||||
import { assertEquals } from '@std/assert';
|
import { assertEquals, assertNotEquals } from '@std/assert';
|
||||||
|
import { Conf } from '@/config.ts';
|
||||||
|
|
||||||
Deno.test('Detect English language', () => {
|
Deno.test('Tests for language detection', async (t) => {
|
||||||
assertEquals(detectLanguage(``, 0.90), undefined);
|
await t.step('Empty string should return undefined', async () => {
|
||||||
assertEquals(detectLanguage(`Good morning my fellow friends`, 0.90), 'en');
|
assertEquals(await detectLanguage(``, 0.90), undefined);
|
||||||
assertEquals(
|
});
|
||||||
detectLanguage(
|
|
||||||
`Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
|
await t.step('Regular English string should be detected', async () => {
|
||||||
0.90,
|
assertEquals(await detectLanguage(`Good morning my fellow friends`, 0.90), 'en');
|
||||||
),
|
});
|
||||||
'en',
|
|
||||||
);
|
await t.step('nostr event id should be ignored', async () => {
|
||||||
assertEquals(
|
assertEquals(
|
||||||
detectLanguage(
|
await detectLanguage(
|
||||||
`https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
|
`Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
|
||||||
0.90,
|
0.90,
|
||||||
),
|
),
|
||||||
'en',
|
'en',
|
||||||
);
|
);
|
||||||
assertEquals(
|
});
|
||||||
detectLanguage(
|
|
||||||
`https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u 😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎ https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
|
await t.step('URLs should be ignored', async () => {
|
||||||
0.90,
|
assertEquals(
|
||||||
),
|
await detectLanguage(
|
||||||
'en',
|
`https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
|
||||||
);
|
0.90,
|
||||||
|
),
|
||||||
|
'en',
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
await t.step('Emoji should be ignored', async () => {
|
||||||
|
assertEquals(
|
||||||
|
await detectLanguage(
|
||||||
|
`https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u 😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎ https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
|
||||||
|
0.90,
|
||||||
|
),
|
||||||
|
'en',
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
await t.step('The horrific problem sentence', async () => {
|
||||||
|
const tester = Conf.languageDetector === 'lingua' ? assertEquals : assertNotEquals;
|
||||||
|
tester(await detectLanguage(`It may die when I die, and that's okay. It's my earnings.`, 0.90), 'en');
|
||||||
|
});
|
||||||
|
//
|
||||||
|
await t.step('Should detect Hindi sentences', async () => {
|
||||||
|
assertEquals(await detectLanguage(`मै डिट्टो की नयी अनुवाद सुविधा को आज़मा रहा हूँ`, 0.80), 'hi');
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -1,34 +1,98 @@
|
||||||
import ISO6391, { type LanguageCode } from 'iso-639-1';
|
import ISO6391, { type LanguageCode } from 'iso-639-1';
|
||||||
import lande from 'lande';
|
import lande from 'lande';
|
||||||
import linkify from 'linkifyjs';
|
import linkify from 'linkifyjs';
|
||||||
|
import { Conf } from '@/config.ts';
|
||||||
|
|
||||||
|
declare class LinguaDetector {
|
||||||
|
private worker: Worker;
|
||||||
|
private pending: Map<string, PromiseWithResolvers<Map<string, number>>>;
|
||||||
|
destroyed: boolean;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Instantiate a new LanguageDetector.
|
||||||
|
* @param workerPath Path to the worker file. If not supplied, it defaults to using the lingua-wasm build present with this package.
|
||||||
|
* The worker is sent messages containing a detection id and the string to detect. It must then respond with the same detection id and the detected language (or undefined).
|
||||||
|
*/
|
||||||
|
constructor(workerPath?: string);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Detect the language of a string.
|
||||||
|
* @param str The string to detect for.
|
||||||
|
* @returns A Promise that resolves to an ISO-639-3 language code.
|
||||||
|
*/
|
||||||
|
detect(str: string): Promise<string>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if the LanguageDetector has been destroyed.
|
||||||
|
* @throws Error if the detector has been destroyed.
|
||||||
|
*/
|
||||||
|
private checkDestroyed(): void;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Destroys the LanguageDetector instance and terminates the associated Worker.
|
||||||
|
*/
|
||||||
|
destroy(): void;
|
||||||
|
}
|
||||||
|
|
||||||
|
const toLanguageCode = (iso6393: string) => {
|
||||||
|
const locale = new Intl.Locale(iso6393);
|
||||||
|
if (ISO6391.validate(locale.language)) return locale.language as LanguageCode;
|
||||||
|
};
|
||||||
|
|
||||||
|
function detectWithLande(sanitized: string, threshold: number) {
|
||||||
|
const [topResult] = lande(sanitized);
|
||||||
|
if (topResult) {
|
||||||
|
const [iso6393, confidence] = topResult;
|
||||||
|
|
||||||
|
if (confidence >= threshold) {
|
||||||
|
return toLanguageCode(iso6393);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const linguaDirUrl = (file: string) => {
|
||||||
|
const normalized = new URL('../../data/lingua/' + file, import.meta.url);
|
||||||
|
return normalized.toString();
|
||||||
|
};
|
||||||
|
|
||||||
|
let linguaInstance: LinguaDetector | undefined = undefined;
|
||||||
|
|
||||||
|
async function detectWithLingua(text: string) {
|
||||||
|
try {
|
||||||
|
if (!linguaInstance) {
|
||||||
|
const { LanguageDetector } = await import(linguaDirUrl('mod.ts'));
|
||||||
|
linguaInstance = new LanguageDetector(linguaDirUrl('src/worker.js')) as unknown as LinguaDetector;
|
||||||
|
globalThis.addEventListener('unload', () => linguaInstance?.destroy());
|
||||||
|
}
|
||||||
|
const result = await linguaInstance.detect(text);
|
||||||
|
return toLanguageCode(result);
|
||||||
|
} catch (e) {
|
||||||
|
console.error(e);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
linkify.registerCustomProtocol('nostr', true);
|
linkify.registerCustomProtocol('nostr', true);
|
||||||
|
|
||||||
/** Returns the detected language if the confidence is greater or equal than 'minConfidence'
|
/** Returns the detected language if the confidence is greater than or equal to 'threshold'.
|
||||||
* 'minConfidence' must be a number between 0 and 1, such as 0.95
|
* 'threshold' must be a number between 0 and 1, such as 0.95; it is checked only by the lande backend, not by lingua.
|
||||||
*/
|
*/
|
||||||
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
|
export async function detectLanguage(text: string, threshold: number): Promise<LanguageCode | undefined> {
|
||||||
// It's better to remove the emojis first
|
// It's better to remove the emojis first
|
||||||
const sanitizedText = linkify.tokenize(
|
const sanitized = linkify.tokenize(
|
||||||
text
|
text
|
||||||
.replaceAll(/\p{Extended_Pictographic}/gu, '')
|
.replaceAll(/\p{Extended_Pictographic}/gu, '')
|
||||||
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '),
|
.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '),
|
||||||
).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
|
).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
|
||||||
|
|
||||||
if (sanitizedText.length < 10) { // heuristics
|
if (sanitized.length < 10) { // heuristics
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const [topResult] = lande(
|
if (Conf.languageDetector === 'lingua') {
|
||||||
sanitizedText,
|
const detected = await detectWithLingua(sanitized);
|
||||||
);
|
if (detected) return detected;
|
||||||
if (topResult) {
|
|
||||||
const [iso6393, confidence] = topResult;
|
|
||||||
const locale = new Intl.Locale(iso6393);
|
|
||||||
|
|
||||||
if (confidence >= minConfidence && ISO6391.validate(locale.language)) {
|
|
||||||
return locale.language as LanguageCode;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return;
|
|
||||||
|
return detectWithLande(sanitized, threshold);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue