mirror of
https://gitlab.com/soapbox-pub/ditto.git
synced 2025-12-06 03:19:46 +00:00
Merge branch 'improve-lande-use' into 'main'
Improve setLanguage() function for better language detection See merge request soapbox-pub/ditto!543
This commit is contained in:
commit
81a15ac82d
3 changed files with 74 additions and 17 deletions
|
|
@ -1,8 +1,6 @@
|
|||
import { NKinds, NostrEvent, NSchema as n } from '@nostrify/nostrify';
|
||||
import { Stickynotes } from '@soapbox/stickynotes';
|
||||
import ISO6391 from 'iso-639-1';
|
||||
import { Kysely, sql } from 'kysely';
|
||||
import lande from 'lande';
|
||||
import { LRUCache } from 'lru-cache';
|
||||
import { z } from 'zod';
|
||||
|
||||
|
|
@ -22,6 +20,7 @@ import { nip05Cache } from '@/utils/nip05.ts';
|
|||
import { purifyEvent } from '@/utils/purify.ts';
|
||||
import { updateStats } from '@/utils/stats.ts';
|
||||
import { getTagSet } from '@/utils/tags.ts';
|
||||
import { detectLanguage } from '@/utils/language.ts';
|
||||
|
||||
// Module-local `console` binding backed by a Stickynotes instance created with the
// 'ditto:pipeline' name; it shadows the global `console` for the rest of this module.
const console = new Stickynotes('ditto:pipeline');
|
||||
|
||||
|
|
@ -200,23 +199,19 @@ async function parseMetadata(event: NostrEvent, signal: AbortSignal): Promise<vo
|
|||
|
||||
/**
 * Detect the language of a text note and store it on the event's database row.
 *
 * Only kind 1 events are considered. Detection is delegated to `detectLanguage`
 * with a minimum confidence of 0.90; when no language is returned the row is
 * left untouched. A failed database write is ignored (best-effort).
 */
async function setLanguage(event: NostrEvent): Promise<void> {
  if (event.kind !== 1) return;

  const language = detectLanguage(event.content, 0.90);
  if (!language) return;

  const kysely = await Storages.kysely();
  try {
    await kysely.updateTable('nostr_events')
      .set('language', language)
      .where('id', '=', event.id)
      .execute();
  } catch {
    // Best-effort: a failed language update is deliberately ignored.
  }
}
|
||||
|
||||
|
|
|
|||
28
src/utils/language.test.ts
Normal file
28
src/utils/language.test.ts
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
import { detectLanguage } from '@/utils/language.ts';
|
||||
import { assertEquals } from '@std/assert';
|
||||
|
||||
Deno.test('Detect English language', () => {
  // Each entry is [input text, expected result of detectLanguage(input, 0.90)].
  const cases: [string, string | undefined][] = [
    // Empty input yields no detection.
    [``, undefined],
    // Plain English sentence.
    [`Good morning my fellow friends`, 'en'],
    // English sentence followed by a nostr: URI.
    [
      `Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
    // Several concatenated URLs in front of the same sentence.
    [
      `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uWould you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
    // URLs plus a long run of emoji surrounding the same sentence.
    [
      `https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u 😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎😂💯♡⌨︎ https://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_uhttps://youtu.be/FxppefYTA2I?si=grgEpbEhFu_-3V_u Would you listen to Michael Jackson's songs?\n\nnostr:nevent1qvzqqqqqqypzqprpljlvcnpnw3pejvkkhrc3y6wvmd7vjuad0fg2ud3dky66gaxaqyvhwumn8ghj7cm0vfexzen4d4sjucm0d5hhyetvv9usqg8htx8xcjq7ffrzxu7nrhlr8vljcv6gpmet0auy87mpj6djxk4myqha02kp`,
      'en',
    ],
  ];

  // Run the cases in declaration order so the first failing input is reported first.
  for (const [input, expected] of cases) {
    assertEquals(detectLanguage(input, 0.90), expected);
  }
});
|
||||
34
src/utils/language.ts
Normal file
34
src/utils/language.ts
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
import ISO6391, { type LanguageCode } from 'iso-639-1';
|
||||
import lande from 'lande';
|
||||
import linkify from 'linkifyjs';
|
||||
|
||||
// Register `nostr` as a custom scheme with linkify at module load.
// NOTE(review): the second argument is presumably linkify's `optionalSlashSlash` flag,
// so that `nostr:...` URIs without `//` are recognized — confirm against the linkifyjs docs.
linkify.registerCustomProtocol('nostr', true);
|
||||
|
||||
/**
 * Returns the detected language of `text` when the detection confidence is greater than
 * or equal to `minConfidence`.
 *
 * Before detection the text is sanitized: emoji (including skin-tone modifiers, flag
 * letters and variation selectors) are removed, runs of whitespace and zero-width
 * characters are collapsed to a single space, and only the tokens linkify classifies
 * as plain text are kept.
 *
 * @param text Raw text to analyse.
 * @param minConfidence Must be a number between 0 and 1, such as 0.95.
 * @returns The language code of the top guess when it reaches `minConfidence` and
 * `ISO6391.validate` accepts it; otherwise `undefined`. Sanitized text shorter than
 * 10 characters always yields `undefined`.
 */
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
  // Remove emoji first. `Extended_Pictographic` alone leaves skin-tone modifiers and
  // regional-indicator (flag) letters behind, so those are stripped explicitly too.
  const withoutEmoji = text.replaceAll(
    /[\p{Extended_Pictographic}\p{Emoji_Modifier}\p{Regional_Indicator}]/gu,
    '',
  );

  // Collapse whitespace, zero-width characters and both emoji variation selectors
  // (U+FE0E text style, U+FE0F emoji style) so invisible residue of removed emoji
  // cannot count towards the minimum-length heuristic below.
  const collapsed = withoutEmoji.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\uFE0E\uFE0F]+/gu, ' ');

  // Keep only the tokens linkify reports with type 'text'; every other token type is dropped.
  const sanitizedText = linkify.tokenize(collapsed)
    .reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '')
    .trim();

  if (sanitizedText.length < 10) { // heuristics
    return;
  }

  const [topResult] = lande(sanitizedText);
  if (topResult) {
    const [iso6393, confidence] = topResult;
    // The code is returned via `Intl.Locale#language` and must pass ISO6391 validation.
    const locale = new Intl.Locale(iso6393);

    if (confidence >= minConfidence && ISO6391.validate(locale.language)) {
      return locale.language as LanguageCode;
    }
  }
  return;
}
|
||||
Loading…
Add table
Reference in a new issue