feat: make lande great again

Create a detectLanguage() function that strips emojis, links, and invisible characters (zero-width and variation-selector code points) from the text before running language detection.
This commit is contained in:
P. Reis 2024-10-11 14:28:01 -03:00
parent 1cb13b141a
commit 36d09af467
3 changed files with 74 additions and 27 deletions

View file

@ -1,9 +1,6 @@
import { NKinds, NostrEvent, NSchema as n } from '@nostrify/nostrify'; import { NKinds, NostrEvent, NSchema as n } from '@nostrify/nostrify';
import { Stickynotes } from '@soapbox/stickynotes'; import { Stickynotes } from '@soapbox/stickynotes';
import ISO6391 from 'iso-639-1';
import { Kysely, sql } from 'kysely'; import { Kysely, sql } from 'kysely';
import lande from 'lande';
import linkify from 'linkifyjs';
import { LRUCache } from 'lru-cache'; import { LRUCache } from 'lru-cache';
import { z } from 'zod'; import { z } from 'zod';
@ -23,6 +20,7 @@ import { nip05Cache } from '@/utils/nip05.ts';
import { purifyEvent } from '@/utils/purify.ts'; import { purifyEvent } from '@/utils/purify.ts';
import { updateStats } from '@/utils/stats.ts'; import { updateStats } from '@/utils/stats.ts';
import { getTagSet } from '@/utils/tags.ts'; import { getTagSet } from '@/utils/tags.ts';
import { detectLanguage } from '@/utils/language.ts';
const console = new Stickynotes('ditto:pipeline'); const console = new Stickynotes('ditto:pipeline');
@ -201,32 +199,19 @@ async function parseMetadata(event: NostrEvent, signal: AbortSignal): Promise<vo
/** Update the event in the database and set its language. */ /** Update the event in the database and set its language. */
async function setLanguage(event: NostrEvent): Promise<void> { async function setLanguage(event: NostrEvent): Promise<void> {
const contentWithoutEmoji = event.content.replace( if (event.kind !== 1) return;
/[\p{Emoji}\p{Emoji_Modifier}\p{Emoji_Component}\p{Emoji_Modifier_Base}\p{Emoji_Presentation}]/gu,
'',
);
const contentWithoutLinks = linkify.tokenize(contentWithoutEmoji).reduce((accumulator, current) => {
if (current.t === 'text') return accumulator + current.v;
return accumulator;
}, '');
const parsedContent = contentWithoutLinks;
const [topResult] = lande(parsedContent);
if (topResult) { const language = detectLanguage(event.content, 0.90);
const [iso6393, confidence] = topResult; if (!language) return;
const locale = new Intl.Locale(iso6393);
if (confidence >= 0.95 && ISO6391.validate(locale.language)) { const kysely = await Storages.kysely();
const kysely = await Storages.kysely(); try {
try { await kysely.updateTable('nostr_events')
await kysely.updateTable('nostr_events') .set('language', language)
.set('language', locale.language) .where('id', '=', event.id)
.where('id', '=', event.id) .execute();
.execute(); } catch {
} catch { // do nothing
// do nothing
}
}
} }
} }

View file

@ -0,0 +1,28 @@
import { detectLanguage } from '@/utils/language.ts';
import { assertEquals } from '@std/assert';
/**
 * Checks `detectLanguage` against an empty string (no result expected) and
 * against English sentences padded with noise: `nostr:` references, URLs glued
 * to each other and to the text, and long runs of emoji.
 */
Deno.test('Detect English language', () => {
  // Confidence threshold shared by every case below.
  const minConfidence = 0.90;

  // Each entry is [input text, expected detection result], checked in order.
  const cases: [string, ReturnType<typeof detectLanguage>][] = [
    // Nothing to analyse at all.
    [``, undefined],
    // Plain sentence without any noise.
    [`Good morning my fellow friends`, 'en'],
    // Sentence followed by a nostr: reference.
    [
      `Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
    // Several URLs concatenated without separators, directly followed by the sentence.
    [
      `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
    // URLs mixed with a long run of emoji before the sentence.
    [
      `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u 😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎ https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
  ];

  for (const [content, expected] of cases) {
    assertEquals(detectLanguage(content, minConfidence), expected);
  }
});

34
src/utils/language.ts Normal file
View file

@ -0,0 +1,34 @@
import ISO6391, { type LanguageCode } from 'iso-639-1';
import lande from 'lande';
import linkify from 'linkifyjs';
// Teach linkify about the `nostr:` scheme at module load, so that `nostr:...`
// references are tokenized as non-text tokens and get dropped by
// detectLanguage() below, which keeps only tokens whose type is 'text'.
// NOTE(review): the second argument (`true`) is presumably linkify's
// "optional slash-slash" flag, allowing `nostr:` without `//` — confirm
// against the installed linkifyjs version.
linkify.registerCustomProtocol('nostr', true);
/**
 * Emoji code points, plus the code points that only exist to build emoji
 * sequences: skin-tone modifiers, regional-indicator (flag) letters and the
 * combining enclosing keycap (U+20E3). `Extended_Pictographic` alone does not
 * cover those, so they would otherwise reach the language detector.
 */
const EMOJI_PATTERN = /[\p{Extended_Pictographic}\p{Emoji_Modifier}\p{Regional_Indicator}\u20E3]/gu;

/**
 * Runs of whitespace and invisible formatting characters: BOM (U+FEFF),
 * no-break space (U+00A0), zero-width space/non-joiner/joiner (U+200B-U+200D)
 * and both the text (U+FE0E) and emoji (U+FE0F) variation selectors.
 */
const INVISIBLE_PATTERN = /[\s\uFEFF\u00A0\u200B-\u200D\uFE0E\uFE0F]+/gu;

/** Sanitized texts shorter than this are not analysed at all (heuristic). */
const MIN_TEXT_LENGTH = 10;

/**
 * Detects the language of `text` after stripping emojis, invisible characters
 * and every token that linkify does not classify as plain text (links,
 * `nostr:` references, ...).
 *
 * @param text Raw text to analyse.
 * @param minConfidence Minimum confidence required to accept the detection.
 *   Must be a number between 0 and 1, such as 0.95.
 * @returns The detected language code when the confidence of the first result
 *   is greater than or equal to `minConfidence` and the code passes ISO 639-1
 *   validation; `undefined` otherwise (including when the sanitized text is
 *   shorter than 10 characters).
 */
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
  // Emojis are removed first, so that the variation selectors and joiners they
  // leave behind collapse into a single space together with real whitespace.
  const withoutEmoji = text
    .replaceAll(EMOJI_PATTERN, '')
    .replaceAll(INVISIBLE_PATTERN, ' ');

  // Keep only what linkify considers plain text.
  let sanitizedText = '';
  for (const { t, v } of linkify.tokenize(withoutEmoji)) {
    if (t === 'text') sanitizedText += v;
  }
  sanitizedText = sanitizedText.trim();

  if (sanitizedText.length < MIN_TEXT_LENGTH) return undefined; // heuristics

  const [topResult] = lande(sanitizedText);
  if (!topResult) return undefined;

  const [iso6393, confidence] = topResult;
  // Intl.Locale maps the detector's code to its `language` subtag, which is
  // then checked against the ISO 639-1 list.
  const locale = new Intl.Locale(iso6393);

  if (confidence >= minConfidence && ISO6391.validate(locale.language)) {
    // The cast is safe: the value has just been validated as an ISO 639-1 code.
    return locale.language as LanguageCode;
  }

  return undefined;
}