mirror of
https://gitlab.com/soapbox-pub/ditto.git
synced 2025-12-06 03:19:46 +00:00
add lingua lang detection
This commit is contained in:
parent
3812c1d3e6
commit
86e816ae67
2 changed files with 79 additions and 15 deletions
|
|
@ -234,7 +234,7 @@ async function parseMetadata(event: NostrEvent, signal: AbortSignal): Promise<vo
|
|||
async function setLanguage(event: NostrEvent): Promise<void> {
|
||||
if (event.kind !== 1) return;
|
||||
|
||||
const language = detectLanguage(event.content, 0.90);
|
||||
const language = await detectLanguage(event.content, 0.90);
|
||||
if (!language) return;
|
||||
|
||||
const kysely = await Storages.kysely();
|
||||
|
|
|
|||
|
|
@ -1,34 +1,98 @@
|
|||
import ISO6391, { type LanguageCode } from 'iso-639-1';
|
||||
import lande from 'lande';
|
||||
import linkify from 'linkifyjs';
|
||||
import { Conf } from '@/config.ts';
|
||||
|
||||
/**
 * Ambient type declaration describing the shape of the lingua language detector.
 * Being a `declare class`, it only informs the type checker and emits no runtime code.
 */
declare class LinguaDetector {
  /** Flag recording whether this detector has been destroyed. */
  destroyed: boolean;

  private worker: Worker;
  private pending: Map<string, PromiseWithResolvers<Map<string, number>>>;

  /**
   * Create a detector.
   * @param workerPath Location of the worker file. When omitted, the lingua-wasm build shipped with the package is used.
   * The worker receives messages carrying a detection id plus the string to analyse, and must answer with the same
   * detection id and the detected language (or undefined).
   */
  constructor(workerPath?: string);

  /**
   * Detect which language a string is written in.
   * @param str The text to analyse.
   * @returns A Promise resolving to an ISO-639-3 language code.
   */
  detect(str: string): Promise<string>;

  /**
   * Destroy this detector and terminate the Worker associated with it.
   */
  destroy(): void;

  /**
   * Verify that this detector has not been destroyed.
   * @throws Error if the detector has been destroyed.
   */
  private checkDestroyed(): void;
}
|
||||
|
||||
/**
 * Convert a language tag (e.g. an ISO 639-3 code) into an ISO 639-1 `LanguageCode`.
 *
 * The tag is parsed with `Intl.Locale` and the resulting `language` subtag is checked
 * against the ISO 639-1 list.
 *
 * @param iso6393 Language tag to convert.
 * @returns The ISO 639-1 code, or `undefined` when the parsed language is not a valid ISO 639-1 code.
 * @throws Whatever `Intl.Locale` throws when the tag cannot be parsed.
 */
function toLanguageCode(iso6393: string): LanguageCode | undefined {
  const { language } = new Intl.Locale(iso6393);
  return ISO6391.validate(language) ? language as LanguageCode : undefined;
}
|
||||
|
||||
/**
 * Detect the language of already-sanitized text using `lande`.
 *
 * Only the first (top) result returned by `lande` is considered.
 *
 * @param sanitized Text to analyse.
 * @param threshold Minimum confidence the top result must reach to be accepted.
 * @returns The ISO 639-1 code of the top result, or `undefined` when there is no result,
 * the confidence is below `threshold`, or the code has no valid ISO 639-1 equivalent.
 */
function detectWithLande(sanitized: string, threshold: number) {
  const best = lande(sanitized)[0];
  if (!best) return;

  const [code, score] = best;
  if (score < threshold) return;

  return toLanguageCode(code);
}
|
||||
|
||||
/**
 * Resolve a file inside the `data/lingua/` directory (two levels above this module)
 * to an absolute URL string.
 *
 * @param file Path relative to the lingua data directory, e.g. `mod.ts`.
 * @returns The resolved URL, serialized as a string.
 */
const linguaDirUrl = (file: string) => new URL(`../../data/lingua/${file}`, import.meta.url).href;
|
||||
|
||||
/** The shared lingua detector, set once initialization has completed. */
let linguaInstance: LinguaDetector | undefined = undefined;

/** Pending or completed initialization, shared so concurrent callers never create more than one detector. */
let linguaLoading: Promise<LinguaDetector> | undefined = undefined;

/**
 * Lazily create the shared lingua detector.
 *
 * The initialization promise is cached *before* the first `await`, so callers that arrive
 * while the module is still being imported reuse the same promise instead of each spawning
 * their own worker and `unload` listener. If initialization fails, the cache is cleared so a
 * later call can try again.
 */
function loadLingua(): Promise<LinguaDetector> {
  linguaLoading ??= (async () => {
    const { LanguageDetector } = await import(linguaDirUrl('mod.ts'));
    const detector: LinguaDetector = new LanguageDetector(linguaDirUrl('src/worker.js'));
    linguaInstance = detector;
    // Terminate the worker when the runtime unloads.
    globalThis.addEventListener('unload', () => linguaInstance?.destroy());
    return detector;
  })().catch((e: unknown) => {
    linguaLoading = undefined;
    throw e;
  });

  return linguaLoading;
}

/**
 * Detect the language of a text with the lingua detector.
 *
 * @param text Text to analyse.
 * @returns The ISO 639-1 code of the detected language; `undefined` when the detected language has
 * no valid ISO 639-1 equivalent; `null` when nothing was detected or when loading/detection failed
 * (failures are logged, never thrown).
 */
async function detectWithLingua(text: string) {
  try {
    const detector = await loadLingua();
    const result = await detector.detect(text);

    // The worker protocol allows an empty detection; passing that to `Intl.Locale` would only
    // throw and be logged as an error below, so treat it as "nothing detected" directly.
    if (!result) return null;

    return toLanguageCode(result);
  } catch (e) {
    console.error(e);
    return null;
  }
}
|
||||
|
||||
// Teach linkify the `nostr` scheme so `nostr:` URIs are tokenized as links rather than plain text
// (`true`: the `//` after the scheme is optional, per the linkifyjs docs). detectLanguage keeps
// only 'text' tokens, so such URIs are excluded from language detection.
linkify.registerCustomProtocol('nostr', true);
|
||||
|
||||
/**
 * Detect the language of a piece of text.
 *
 * The text is first sanitized: emojis are stripped, runs of whitespace and invisible characters
 * are collapsed to a single space, and only linkify 'text' tokens are kept. Sanitized text
 * shorter than 10 characters is not analysed.
 *
 * When `Conf.languageDetector` is `'lingua'`, lingua is tried first and its result is returned
 * if it produced one; otherwise (or when lingua yields nothing) `lande` is used.
 *
 * @param text Raw text to analyse.
 * @param threshold Minimum confidence for the `lande` result; a number between 0 and 1, such as 0.95.
 * @returns The detected ISO 639-1 language code, or `undefined` if none could be determined.
 */
export async function detectLanguage(text: string, threshold: number): Promise<LanguageCode | undefined> {
  // It's better to remove the emojis first
  const stripped = text
    .replaceAll(/\p{Extended_Pictographic}/gu, '')
    .replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' ');

  // Keep only plain-text tokens, dropping everything linkify recognizes as something else.
  const sanitized = linkify.tokenize(stripped)
    .filter(({ t }) => t === 'text')
    .map(({ v }) => v)
    .join('')
    .trim();

  // heuristics: too little text to analyse
  if (sanitized.length < 10) return undefined;

  if (Conf.languageDetector === 'lingua') {
    const viaLingua = await detectWithLingua(sanitized);
    if (viaLingua) return viaLingua;
  }

  return detectWithLande(sanitized, threshold);
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue