From cfa684892770f73da2013d90d8de48d9936afd9d Mon Sep 17 00:00:00 2001 From: Alex Gleason Date: Mon, 3 Feb 2025 14:53:38 -0600 Subject: [PATCH 1/4] Only parse mentions with valid pubkeys Fixes https://gitlab.com/soapbox-pub/ditto/-/issues/290 --- src/views/mastodon/statuses.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/views/mastodon/statuses.ts b/src/views/mastodon/statuses.ts index 3f8f1c96..265cf442 100644 --- a/src/views/mastodon/statuses.ts +++ b/src/views/mastodon/statuses.ts @@ -7,7 +7,7 @@ import { MastodonMention } from '@/entities/MastodonMention.ts'; import { MastodonStatus } from '@/entities/MastodonStatus.ts'; import { type DittoEvent } from '@/interfaces/DittoEvent.ts'; import { Storages } from '@/storages.ts'; -import { nostrDate } from '@/utils.ts'; +import { isNostrId, nostrDate } from '@/utils.ts'; import { getMediaLinks, parseNoteContent, stripimeta } from '@/utils/note.ts'; import { findReplyTag } from '@/utils/tags.ts'; import { unfurlCardCached } from '@/utils/unfurl.ts'; @@ -41,8 +41,8 @@ async function renderStatus(event: DittoEvent, opts: RenderStatusOpts): Promise< const mentionedPubkeys = [ ...new Set( event.tags - .filter((tag) => tag[0] === 'p') - .map((tag) => tag[1]), + .filter(([name, value]) => name === 'p' && isNostrId(value)) + .map(([, value]) => value), ), ]; From ec5a000265963dd50c57ae88efb409b1356161a9 Mon Sep 17 00:00:00 2001 From: Alex Gleason Date: Mon, 3 Feb 2025 16:58:26 -0600 Subject: [PATCH 2/4] Upgrade gleasonator-policy, only index lowercase t-tags in EventsDB --- deno.lock | 8 ++++++++ src/storages/EventsDB.ts | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/deno.lock b/deno.lock index a3b02cbb..5e4134da 100644 --- a/deno.lock +++ b/deno.lock @@ -26,6 +26,7 @@ "jsr:@gleasonator/policy@0.9.1": "0.9.1", "jsr:@gleasonator/policy@0.9.2": "0.9.2", "jsr:@gleasonator/policy@0.9.3": "0.9.3", + "jsr:@gleasonator/policy@0.9.4": "0.9.4", "jsr:@hono/hono@^4.4.6": "4.6.15", 
"jsr:@lambdalisue/async@^2.1.1": "2.1.1", "jsr:@negrel/http-ece@0.6.0": "0.6.0", @@ -298,6 +299,13 @@ "jsr:@nostrify/policies@~0.36.1" ] }, + "@gleasonator/policy@0.9.4": { + "integrity": "5d5b8a585b8e3cd6e6b7daed2cfa61cd1a3e5945691f092eb98f8671384c3657", + "dependencies": [ + "jsr:@nostrify/nostrify@0.36", + "jsr:@nostrify/policies@~0.36.1" + ] + }, "@hono/hono@4.4.6": { "integrity": "aa557ca9930787ee86b9ca1730691f1ce1c379174c2cb244d5934db2b6314453" }, diff --git a/src/storages/EventsDB.ts b/src/storages/EventsDB.ts index f79c0180..b22cd32a 100644 --- a/src/storages/EventsDB.ts +++ b/src/storages/EventsDB.ts @@ -59,7 +59,8 @@ class EventsDB extends NPostgres { 'proxy': ({ count, value }) => count === 0 && value.length < 256, 'q': ({ event, count, value }) => count === 0 && event.kind === 1 && isNostrId(value), 'r': ({ event, count }) => (event.kind === 1985 ? count < 20 : count < 3), - 't': ({ event, count, value }) => (event.kind === 1985 ? count < 20 : count < 5) && value.length < 50, + 't': ({ event, count, value }) => + (value === value.toLowerCase()) && (event.kind === 1985 ? count < 20 : count < 5) && value.length < 50, }; static indexExtensions(event: NostrEvent): Record { From 7beb2d594a82d8b91e00f3a36ee955ab120444ad Mon Sep 17 00:00:00 2001 From: "P. 
Reis" Date: Mon, 3 Feb 2025 23:08:20 +0000 Subject: [PATCH 3/4] feat: populate extensions --- deno.json | 1 + scripts/db-populate-extensions.ts | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 scripts/db-populate-extensions.ts diff --git a/deno.json b/deno.json index f64b0a1f..80c58382 100644 --- a/deno.json +++ b/deno.json @@ -22,6 +22,7 @@ "trends": "deno run -A --env-file --deny-read=.env scripts/trends.ts", "clean:deps": "deno cache --reload src/app.ts", "db:populate-search": "deno run -A --env-file --deny-read=.env scripts/db-populate-search.ts", + "db:populate-extensions": "deno run -A --env-file --deny-read=.env scripts/db-populate-extensions.ts", "vapid": "deno run scripts/vapid.ts" }, "unstable": [ diff --git a/scripts/db-populate-extensions.ts b/scripts/db-populate-extensions.ts new file mode 100644 index 00000000..428b591f --- /dev/null +++ b/scripts/db-populate-extensions.ts @@ -0,0 +1,26 @@ +import { Storages } from '@/storages.ts'; +import { EventsDB } from '@/storages/EventsDB.ts'; + +const store = await Storages.db(); +const kysely = await Storages.kysely(); + +for await (const msg of store.req([{}])) { + if (msg[0] === 'EVENT') { + const event = msg[2]; + + const ext = EventsDB.indexExtensions(event); + + try { + await kysely.updateTable('nostr_events') + .set('search_ext', ext) + .where('id', '=', event.id) + .execute(); + } catch { + // do nothing + } + } else { + break; + } +} + +Deno.exit(); From 2f2cb2c4fcb2b4c5a6cefb9277c2eabb4f282c96 Mon Sep 17 00:00:00 2001 From: Alex Gleason Date: Tue, 4 Feb 2025 15:05:52 -0600 Subject: [PATCH 4/4] detectLanguage: check the text's script for definitive language categorization for some languages --- src/utils/language.test.ts | 15 +++++++++++++++ src/utils/language.ts | 30 ++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/utils/language.test.ts b/src/utils/language.test.ts index 255f6b58..f4025290 100644 --- 
a/src/utils/language.test.ts +++ b/src/utils/language.test.ts @@ -26,3 +26,18 @@ Deno.test('Detect English language', () => { 'en', ); }); + +Deno.test('Detects definitive texts', () => { + // NOTE: pass `1` as min confidence to test only the definitive patterns + + // unambiguous + assertEquals(detectLanguage('안녕하세요.', 1), 'ko'); + assertEquals(detectLanguage('Γειά σου!', 1), 'el'); + assertEquals(detectLanguage('שלום!', 1), 'he'); + assertEquals(detectLanguage('こんにちは。', 1), 'ja'); + + // ambiguous + assertEquals(detectLanguage('你好', 1), undefined); + assertEquals(detectLanguage('Привет', 1), undefined); + assertEquals(detectLanguage('Hello', 1), undefined); +}); diff --git a/src/utils/language.ts b/src/utils/language.ts index 8af8ddf9..4b6e3807 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -4,8 +4,9 @@ import linkify from 'linkifyjs'; linkify.registerCustomProtocol('nostr', true); -/** Returns the detected language if the confidence is greater or equal than 'minConfidence' - * 'minConfidence' must be a number between 0 and 1, such as 0.95 +/** + * Returns the detected language if the confidence is greater or equal than 'minConfidence'. + * 'minConfidence' must be a number between 0 and 1, such as 0.95. */ export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined { // It's better to remove the emojis first @@ -15,13 +16,31 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod .replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '), ).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim(); + // Definite patterns for some languages. + // Text which matches MUST unambiguously be in the given language. + // This is only possible for some languages. + // All patterns match the full text, so mixed scripts would fail these tests. 
+ const languagePatterns: Partial<Record<LanguageCode, RegExp>> = { + ko: /^[\p{Script=Hangul}\s]+$/u, // Korean (Hangul only) + el: /^[\p{Script=Greek}\s]+$/u, // Greek + he: /^[\p{Script=Hebrew}\s]+$/u, // Hebrew + ja: /^(?=.*[\p{Script=Hiragana}\p{Script=Katakana}])[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}\s]+$/u, // Japanese (requires at least one Kana) + // zh: not possible to detect unambiguously + }; + + // If any pattern matches, the language is known. + for (const [lang, pattern] of Object.entries(languagePatterns) as [LanguageCode, RegExp][]) { + if (pattern.test(text.replace(/[\p{P}\p{S}]/gu, ''))) { // strip punctuation and symbols before checking + return lang; + } + } + if (sanitizedText.length < 10) { // heuristics return; } - const [topResult] = lande( - sanitizedText, - ); + const [topResult] = lande(sanitizedText); + if (topResult) { const [iso6393, confidence] = topResult; const locale = new Intl.Locale(iso6393); @@ -30,5 +49,4 @@ export function detectLanguage(text: string, minConfidence: number): LanguageCod return locale.language as LanguageCode; } } - return; }