Merge branch 'improve-lande-use' into 'main'

Improve setLanguage() function for better language detection

See merge request soapbox-pub/ditto!543
This commit is contained in:
Alex Gleason 2024-10-11 21:52:11 +00:00
commit 81a15ac82d
3 changed files with 74 additions and 17 deletions

View file

@ -1,8 +1,6 @@
import { NKinds, NostrEvent, NSchema as n } from '@nostrify/nostrify'; import { NKinds, NostrEvent, NSchema as n } from '@nostrify/nostrify';
import { Stickynotes } from '@soapbox/stickynotes'; import { Stickynotes } from '@soapbox/stickynotes';
import ISO6391 from 'iso-639-1';
import { Kysely, sql } from 'kysely'; import { Kysely, sql } from 'kysely';
import lande from 'lande';
import { LRUCache } from 'lru-cache'; import { LRUCache } from 'lru-cache';
import { z } from 'zod'; import { z } from 'zod';
@ -22,6 +20,7 @@ import { nip05Cache } from '@/utils/nip05.ts';
import { purifyEvent } from '@/utils/purify.ts'; import { purifyEvent } from '@/utils/purify.ts';
import { updateStats } from '@/utils/stats.ts'; import { updateStats } from '@/utils/stats.ts';
import { getTagSet } from '@/utils/tags.ts'; import { getTagSet } from '@/utils/tags.ts';
import { detectLanguage } from '@/utils/language.ts';
const console = new Stickynotes('ditto:pipeline'); const console = new Stickynotes('ditto:pipeline');
@ -200,25 +199,21 @@ async function parseMetadata(event: NostrEvent, signal: AbortSignal): Promise<vo
/** Update the event in the database and set its language. */ /** Update the event in the database and set its language. */
async function setLanguage(event: NostrEvent): Promise<void> { async function setLanguage(event: NostrEvent): Promise<void> {
const [topResult] = lande(event.content); if (event.kind !== 1) return;
if (topResult) { const language = detectLanguage(event.content, 0.90);
const [iso6393, confidence] = topResult; if (!language) return;
const locale = new Intl.Locale(iso6393);
if (confidence >= 0.95 && ISO6391.validate(locale.language)) {
const kysely = await Storages.kysely(); const kysely = await Storages.kysely();
try { try {
await kysely.updateTable('nostr_events') await kysely.updateTable('nostr_events')
.set('language', locale.language) .set('language', language)
.where('id', '=', event.id) .where('id', '=', event.id)
.execute(); .execute();
} catch { } catch {
// do nothing // do nothing
} }
} }
}
}
/** Determine if the event is being received in a timely manner. */ /** Determine if the event is being received in a timely manner. */
function isFresh(event: NostrEvent): boolean { function isFresh(event: NostrEvent): boolean {

View file

@ -0,0 +1,28 @@
import { detectLanguage } from '@/utils/language.ts';
import { assertEquals } from '@std/assert';
// Checks that English prose is still recognized when it is surrounded by
// YouTube links, `nostr:` URIs and long runs of emojis, and that an empty
// string yields no language at all.
Deno.test('Detect English language', () => {
  const minConfidence = 0.90;

  // Nothing to analyze: no language is expected.
  assertEquals(detectLanguage(``, minConfidence), undefined);

  // Every sample below is expected to be detected as English.
  const englishSamples = [
    // Plain sentence.
    `Good morning my fellow friends`,
    // Sentence followed by a `nostr:` URI.
    `Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
    // Several URLs glued together in front of the sentence.
    `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
    // URLs plus a long run of emojis before the sentence.
    `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u 😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎ https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
  ];

  for (const sample of englishSamples) {
    assertEquals(detectLanguage(sample, minConfidence), 'en');
  }
});

34
src/utils/language.ts Normal file
View file

@ -0,0 +1,34 @@
import ISO6391, { type LanguageCode } from 'iso-639-1';
import lande from 'lande';
import linkify from 'linkifyjs';
// Register `nostr` as a custom protocol with linkify at module load, so that
// `nostr:` URIs are presumably tokenized as links instead of plain text.
// NOTE(review): the second argument (`true`) looks like linkify's
// "optional slash-slash" flag, allowing `nostr:...` without `//` — confirm
// against the linkifyjs documentation.
linkify.registerCustomProtocol('nostr', true);
/**
 * Detects the language of `text`, ignoring emojis, links and invisible characters.
 *
 * The text is sanitized before detection:
 * 1. every `Extended_Pictographic` character (emojis) is removed;
 * 2. runs of whitespace, zero-width characters (U+200B–U+200D, U+FEFF),
 *    no-break spaces and emoji variation selectors (U+FE0E, U+FE0F) are
 *    collapsed into a single space;
 * 3. only the tokens linkify classifies as `text` are kept, which drops links.
 *
 * @param text Raw text to analyze.
 * @param minConfidence Minimum confidence required to accept the result.
 *   Must be a number between 0 and 1, such as 0.95.
 * @returns The language code of the first result reported by `lande` when its
 *   confidence is greater than or equal to `minConfidence` and the code passes
 *   `ISO6391.validate`; `undefined` when the sanitized text is shorter than
 *   10 characters, when there is no result, or when either check fails.
 */
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
  // It's better to remove the emojis first. U+FE0F (variation selector-16) is
  // stripped together with U+FE0E: removing the pictograph of an emoji such as
  // "❤️" would otherwise leave an invisible selector behind in the text.
  const withoutEmojis = text
    .replaceAll(/\p{Extended_Pictographic}/gu, '')
    .replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\uFE0E\uFE0F]+/gu, ' ');

  // Keep only the plain-text tokens; everything linkify recognizes as a link is dropped.
  const sanitizedText = linkify.tokenize(withoutEmojis)
    .reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '')
    .trim();

  // Heuristic: very short snippets are not considered reliable enough to classify.
  if (sanitizedText.length < 10) {
    return;
  }

  const [topResult] = lande(sanitizedText);
  if (!topResult) {
    return;
  }

  const [iso6393, confidence] = topResult;
  // `Intl.Locale` canonicalizes the tag; `.language` is then checked against ISO 639-1.
  const locale = new Intl.Locale(iso6393);

  if (confidence >= minConfidence && ISO6391.validate(locale.language)) {
    return locale.language as LanguageCode;
  }

  return;
}