mirror of
https://gitlab.com/soapbox-pub/ditto.git
synced 2025-12-06 11:29:46 +00:00
feat: make lande great again
create detectLanguage() function that removes emojis, links and other weird invisible characters
This commit is contained in:
parent
1cb13b141a
commit
36d09af467
3 changed files with 74 additions and 27 deletions
|
|
@ -1,9 +1,6 @@
|
||||||
import { NKinds, NostrEvent, NSchema as n } from '@nostrify/nostrify';
|
import { NKinds, NostrEvent, NSchema as n } from '@nostrify/nostrify';
|
||||||
import { Stickynotes } from '@soapbox/stickynotes';
|
import { Stickynotes } from '@soapbox/stickynotes';
|
||||||
import ISO6391 from 'iso-639-1';
|
|
||||||
import { Kysely, sql } from 'kysely';
|
import { Kysely, sql } from 'kysely';
|
||||||
import lande from 'lande';
|
|
||||||
import linkify from 'linkifyjs';
|
|
||||||
import { LRUCache } from 'lru-cache';
|
import { LRUCache } from 'lru-cache';
|
||||||
import { z } from 'zod';
|
import { z } from 'zod';
|
||||||
|
|
||||||
|
|
@ -23,6 +20,7 @@ import { nip05Cache } from '@/utils/nip05.ts';
|
||||||
import { purifyEvent } from '@/utils/purify.ts';
|
import { purifyEvent } from '@/utils/purify.ts';
|
||||||
import { updateStats } from '@/utils/stats.ts';
|
import { updateStats } from '@/utils/stats.ts';
|
||||||
import { getTagSet } from '@/utils/tags.ts';
|
import { getTagSet } from '@/utils/tags.ts';
|
||||||
|
import { detectLanguage } from '@/utils/language.ts';
|
||||||
|
|
||||||
const console = new Stickynotes('ditto:pipeline');
|
const console = new Stickynotes('ditto:pipeline');
|
||||||
|
|
||||||
|
|
@ -201,32 +199,19 @@ async function parseMetadata(event: NostrEvent, signal: AbortSignal): Promise<vo
|
||||||
|
|
||||||
/** Update the event in the database and set its language. */
|
/** Update the event in the database and set its language. */
|
||||||
async function setLanguage(event: NostrEvent): Promise<void> {
|
async function setLanguage(event: NostrEvent): Promise<void> {
|
||||||
const contentWithoutEmoji = event.content.replace(
|
if (event.kind !== 1) return;
|
||||||
/[\p{Emoji}\p{Emoji_Modifier}\p{Emoji_Component}\p{Emoji_Modifier_Base}\p{Emoji_Presentation}]/gu,
|
|
||||||
'',
|
|
||||||
);
|
|
||||||
const contentWithoutLinks = linkify.tokenize(contentWithoutEmoji).reduce((accumulator, current) => {
|
|
||||||
if (current.t === 'text') return accumulator + current.v;
|
|
||||||
return accumulator;
|
|
||||||
}, '');
|
|
||||||
const parsedContent = contentWithoutLinks;
|
|
||||||
const [topResult] = lande(parsedContent);
|
|
||||||
|
|
||||||
if (topResult) {
|
const language = detectLanguage(event.content, 0.90);
|
||||||
const [iso6393, confidence] = topResult;
|
if (!language) return;
|
||||||
const locale = new Intl.Locale(iso6393);
|
|
||||||
|
|
||||||
if (confidence >= 0.95 && ISO6391.validate(locale.language)) {
|
const kysely = await Storages.kysely();
|
||||||
const kysely = await Storages.kysely();
|
try {
|
||||||
try {
|
await kysely.updateTable('nostr_events')
|
||||||
await kysely.updateTable('nostr_events')
|
.set('language', language)
|
||||||
.set('language', locale.language)
|
.where('id', '=', event.id)
|
||||||
.where('id', '=', event.id)
|
.execute();
|
||||||
.execute();
|
} catch {
|
||||||
} catch {
|
// do nothing
|
||||||
// do nothing
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
28
src/utils/language.test.ts
Normal file
28
src/utils/language.test.ts
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
import { detectLanguage } from '@/utils/language.ts';
|
||||||
|
import { assertEquals } from '@std/assert';
|
||||||
|
|
||||||
|
Deno.test('Detect English language', () => {
  // Table of [input, expected language]; every case is checked with a 0.90 confidence threshold.
  const cases: [string, string | undefined][] = [
    // Empty input yields no language.
    [``, undefined],
    // Plain English sentence.
    [`Good morning my fellow friends`, 'en'],
    // English sentence followed by a `nostr:` URI.
    [
      `Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
    // Several URLs glued together in front of the sentence.
    [
      `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
    // URLs mixed with a long run of emojis and symbols around the sentence.
    [
      `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u 😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎ https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
  ];

  for (const [input, expected] of cases) {
    assertEquals(detectLanguage(input, 0.90), expected);
  }
});
|
||||||
34
src/utils/language.ts
Normal file
34
src/utils/language.ts
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
import ISO6391, { type LanguageCode } from 'iso-639-1';
|
||||||
|
import lande from 'lande';
|
||||||
|
import linkify from 'linkifyjs';
|
||||||
|
|
||||||
|
// Register `nostr` as a custom protocol with linkify. `detectLanguage` keeps only the
// tokens linkify classifies as 'text', so the intent here appears to be that `nostr:` URIs
// are tokenized as links and dropped before language detection.
// NOTE(review): per the linkifyjs docs the second argument makes the `//` after the scheme
// optional (so `nostr:nevent1…` matches) — confirm against the installed linkifyjs version.
linkify.registerCustomProtocol('nostr', true);
|
||||||
|
|
||||||
|
/**
 * Detects the language of `text` after stripping emojis, links and invisible characters.
 *
 * @param text - Raw text to analyse.
 * @param minConfidence - Minimum confidence the top result must reach to be accepted.
 *   Must be a number between 0 and 1, such as 0.95.
 * @returns The two-letter ISO 639-1 code of the detected language, or `undefined` when
 *   fewer than 10 characters remain after sanitizing, when the top result's confidence is
 *   below `minConfidence`, or when its language has no valid ISO 639-1 code.
 */
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
  // It's better to remove the emojis first, before the text is tokenized for links.
  const withoutEmoji = text.replace(/\p{Extended_Pictographic}/gu, '');

  // Collapse whitespace and invisible leftovers into single spaces. Variation selectors
  // (U+FE0E text / U+FE0F emoji presentation) and skin-tone modifiers are not
  // Extended_Pictographic, so they survive the step above; left in place they would
  // inflate the length heuristic below without contributing any detectable text.
  const normalized = withoutEmoji.replace(
    /[\s\uFEFF\u00A0\u200B-\u200D\uFE0E\uFE0F\p{Emoji_Modifier}]+/gu,
    ' ',
  );

  // Keep only the tokens linkify classifies as plain text; everything else (links) is dropped.
  const sanitizedText = linkify.tokenize(normalized)
    .reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '')
    .trim();

  if (sanitizedText.length < 10) return; // heuristics: too little text to guess from

  // Only the first entry returned by lande is considered.
  const [topResult] = lande(sanitizedText);
  if (!topResult) return;

  const [iso6393, confidence] = topResult;
  // Intl.Locale canonicalizes the tag; `.language` is then validated as an ISO 639-1 code.
  const locale = new Intl.Locale(iso6393);

  if (confidence >= minConfidence && ISO6391.validate(locale.language)) {
    return locale.language as LanguageCode;
  }

  return;
}
|
||||||
Loading…
Add table
Reference in a new issue