diff --git a/src/pipeline.ts b/src/pipeline.ts index a00456a9..7c81a484 100644 --- a/src/pipeline.ts +++ b/src/pipeline.ts @@ -1,8 +1,6 @@ import { NKinds, NostrEvent, NSchema as n } from '@nostrify/nostrify'; import { Stickynotes } from '@soapbox/stickynotes'; -import ISO6391 from 'iso-639-1'; import { Kysely, sql } from 'kysely'; -import lande from 'lande'; import { LRUCache } from 'lru-cache'; import { z } from 'zod'; @@ -22,6 +20,7 @@ import { nip05Cache } from '@/utils/nip05.ts'; import { purifyEvent } from '@/utils/purify.ts'; import { updateStats } from '@/utils/stats.ts'; import { getTagSet } from '@/utils/tags.ts'; +import { detectLanguage } from '@/utils/language.ts'; const console = new Stickynotes('ditto:pipeline'); @@ -200,23 +199,19 @@ async function parseMetadata(event: NostrEvent, signal: AbortSignal): Promise { - const [topResult] = lande(event.content); + if (event.kind !== 1) return; - if (topResult) { - const [iso6393, confidence] = topResult; - const locale = new Intl.Locale(iso6393); + const language = detectLanguage(event.content, 0.90); + if (!language) return; - if (confidence >= 0.95 && ISO6391.validate(locale.language)) { - const kysely = await Storages.kysely(); - try { - await kysely.updateTable('nostr_events') - .set('language', locale.language) - .where('id', '=', event.id) - .execute(); - } catch { - // do nothing - } - } + const kysely = await Storages.kysely(); + try { + await kysely.updateTable('nostr_events') + .set('language', language) + .where('id', '=', event.id) + .execute(); + } catch { + // do nothing } } diff --git a/src/utils/language.test.ts b/src/utils/language.test.ts new file mode 100644 index 00000000..255f6b58 --- /dev/null +++ b/src/utils/language.test.ts @@ -0,0 +1,28 @@ +import { detectLanguage } from '@/utils/language.ts'; +import { assertEquals } from '@std/assert'; + +Deno.test('Detect English language', () => { + assertEquals(detectLanguage(``, 0.90), undefined); + assertEquals(detectLanguage(`Good morning 
my fellow friends`, 0.90), 'en'); + assertEquals( + detectLanguage( + `Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`, + 0.90, + ), + 'en', + ); + assertEquals( + detectLanguage( + `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`, + 0.90, + ), + 'en', + ); + assertEquals( + detectLanguage( + `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ๐Ÿ˜‚๐Ÿ’ฏโ™กโŒจ๏ธŽ https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`, + 0.90, + ), + 'en', + ); +}); diff --git a/src/utils/language.ts b/src/utils/language.ts new file mode 100644 index 00000000..8af8ddf9 --- /dev/null +++ b/src/utils/language.ts @@ -0,0 +1,34 @@ +import ISO6391, { type 
LanguageCode } from 'iso-639-1';
import lande from 'lande';
import linkify from 'linkifyjs';

// Register the `nostr` scheme with linkify.
// NOTE(review): presumably this makes `nostr:` URIs tokenize as links (and thus
// get dropped by detectLanguage below) — confirm against the linkify docs.
linkify.registerCustomProtocol('nostr', true);

/**
 * Detects the language of `text` and returns it as an ISO 639-1 code.
 *
 * Before detection the text is sanitized: emoji-like pictographs are removed,
 * runs of whitespace / zero-width characters are collapsed to a single space,
 * and only the pieces that linkify classifies as plain `text` are kept.
 *
 * @param text - Raw text to analyse.
 * @param minConfidence - Minimum confidence, a number between 0 and 1 (such as
 *   0.95), that the first detection result must reach to be accepted.
 * @returns The detected language code, or `undefined` when the sanitized text
 *   is shorter than 10 characters, nothing is detected, the confidence is
 *   below `minConfidence`, or the code does not validate as ISO 639-1.
 */
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
  // Strip pictographs first, then normalise whitespace and invisible characters.
  const withoutPictographs = text.replaceAll(/\p{Extended_Pictographic}/gu, '');
  const normalized = withoutPictographs.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' ');

  // Keep only the tokens linkify labels as plain text.
  let plainText = '';
  for (const { t, v } of linkify.tokenize(normalized)) {
    if (t === 'text') {
      plainText += v;
    }
  }
  plainText = plainText.trim();

  // Heuristic: do not attempt detection on fewer than 10 characters.
  if (plainText.length < 10) {
    return undefined;
  }

  const [best] = lande(plainText);
  if (!best) {
    return undefined;
  }

  const [iso6393, confidence] = best;
  // Let Intl.Locale turn the detector's code into its `language` subtag.
  const { language } = new Intl.Locale(iso6393);

  if (!(confidence >= minConfidence) || !ISO6391.validate(language)) {
    return undefined;
  }
  return language as LanguageCode;
}