diff --git a/src/pipeline.ts b/src/pipeline.ts
index 5becff20..14f684ff 100644
--- a/src/pipeline.ts
+++ b/src/pipeline.ts
@@ -234,7 +234,7 @@
 async function parseMetadata(event: NostrEvent, signal: AbortSignal): Promise<void> {
   if (event.kind !== 1) return;
 
-  const language = detectLanguage(event.content, 0.90);
+  const language = await detectLanguage(event.content, 0.90);
   if (!language) return;
 
   const kysely = await Storages.kysely();
diff --git a/src/utils/language.ts b/src/utils/language.ts
index 8af8ddf9..e317a263 100644
--- a/src/utils/language.ts
+++ b/src/utils/language.ts
@@ -1,34 +1,127 @@
 import ISO6391, { type LanguageCode } from 'iso-639-1';
 import lande from 'lande';
 import linkify from 'linkifyjs';
+import { Conf } from '@/config.ts';
+
+/**
+ * Ambient declaration of the `LanguageDetector` class that is imported at runtime from `data/lingua/mod.ts`.
+ * NOTE(review): the type arguments of `pending` and `detect` could not be recovered here; confirm them against that module.
+ */
+declare class LinguaDetector {
+  private worker: Worker;
+  private pending: Map<unknown, unknown>;
+  destroyed: boolean;
+
+  /**
+   * Instantiate a new LanguageDetector.
+   * @param workerPath Path to the worker file. If not supplied, it defaults to using the lingua-wasm build present with this package.
+   * The worker is simply posted messages with a detection id and the string to detect. It must then respond with the same detection id and the detected language (or undefined.)
+   */
+  constructor(workerPath?: string);
+
+  /**
+   * Detect the language of a string.
+   * @param str The string to detect for.
+   * @returns A Promise that resolves to an ISO-639-3 language code.
+   */
+  detect(str: string): Promise<string | undefined>;
+
+  /**
+   * Checks if the LanguageDetector has been destroyed.
+   * @throws Error if the detector has been destroyed.
+   */
+  private checkDestroyed(): void;
+
+  /**
+   * Destroys the LanguageDetector instance and terminates the associated Worker.
+   */
+  destroy(): void;
+}
+
+/**
+ * Maps a language tag (such as an ISO 639-3 code) to an ISO 639-1 code via the `language` subtag of `Intl.Locale`.
+ * Returns `undefined` when the tag is empty, malformed, or its language is not a valid ISO 639-1 code.
+ */
+function toLanguageCode(tag: string | undefined): LanguageCode | undefined {
+  if (!tag) return undefined;
+
+  try {
+    // `Intl.Locale` throws a RangeError on a malformed tag; report that as "no language" instead.
+    const { language } = new Intl.Locale(tag);
+    return ISO6391.validate(language) ? language as LanguageCode : undefined;
+  } catch {
+    return undefined;
+  }
+}
+
+/** Detects the language with lande, accepting its top result only when the confidence reaches `threshold`. */
+function detectWithLande(sanitized: string, threshold: number): LanguageCode | undefined {
+  const [topResult] = lande(sanitized);
+  if (!topResult) return undefined;
+
+  const [iso6393, confidence] = topResult;
+  return confidence >= threshold ? toLanguageCode(iso6393) : undefined;
+}
+
+/** Resolves `file` against the `data/lingua` directory two levels above this module and returns the URL as a string. */
+const linguaDirUrl = (file: string): string => {
+  return new URL('../../data/lingua/' + file, import.meta.url).toString();
+};
+
+/** Pending or completed detector initialization, shared so that concurrent callers create a single detector. */
+let linguaLoading: Promise<LinguaDetector> | undefined;
+
+/** Imports the lingua module and creates the detector once; a failed attempt is forgotten so a later call can retry. */
+function loadLingua(): Promise<LinguaDetector> {
+  linguaLoading ??= (async () => {
+    const { LanguageDetector } = await import(linguaDirUrl('mod.ts'));
+    const detector: LinguaDetector = new LanguageDetector(linguaDirUrl('src/worker.js'));
+    globalThis.addEventListener('unload', () => detector.destroy());
+    return detector;
+  })().catch((error: unknown) => {
+    linguaLoading = undefined;
+    throw error;
+  });
+  return linguaLoading;
+}
+
+/** Detects the language with lingua; any failure is logged and reported as `undefined`. */
+async function detectWithLingua(text: string): Promise<LanguageCode | undefined> {
+  try {
+    const detector = await loadLingua();
+    return toLanguageCode(await detector.detect(text));
+  } catch (e) {
+    console.error(e);
+    return undefined;
+  }
+}
 
 linkify.registerCustomProtocol('nostr', true);
 
-/** Returns the detected language if the confidence is greater or equal than 'minConfidence'
- * 'minConfidence' must be a number between 0 and 1, such as 0.95
- */
-export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
+/**
+ * Detects the language of `text` after removing emojis, non-text tokens (such as links) and redundant whitespace.
+ * Sanitized text shorter than 10 characters is not classified.
+ * When `Conf.languageDetector` is `'lingua'`, lingua is tried first and lande is used as the fallback.
+ * @param text The raw text to classify.
+ * @param threshold Minimum confidence, a number between 0 and 1 such as 0.95, for a lande result to be accepted.
+ * @returns The detected ISO 639-1 language code, or `undefined` when no language was detected.
+ */
+export async function detectLanguage(text: string, threshold: number): Promise<LanguageCode | undefined> {
   // It's better to remove the emojis first
-  const sanitizedText = linkify.tokenize(
+  const sanitized = linkify.tokenize(
     text
       .replaceAll(/\p{Extended_Pictographic}/gu, '')
       .replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' '),
   ).reduce((acc, { t, v }) => t === 'text' ? acc + v : acc, '').trim();
 
-  if (sanitizedText.length < 10) { // heuristics
+  if (sanitized.length < 10) { // heuristics
     return;
   }
 
-  const [topResult] = lande(
-    sanitizedText,
-  );
-  if (topResult) {
-    const [iso6393, confidence] = topResult;
-    const locale = new Intl.Locale(iso6393);
-
-    if (confidence >= minConfidence && ISO6391.validate(locale.language)) {
-      return locale.language as LanguageCode;
-    }
+  if (Conf.languageDetector === 'lingua') {
+    const detected = await detectWithLingua(sanitized);
+    if (detected) return detected;
   }
-  return;
+
+  return detectWithLande(sanitized, threshold);
 }