mirror of
https://gitlab.com/soapbox-pub/ditto.git
synced 2025-12-06 03:19:46 +00:00
34 lines
1.1 KiB
TypeScript
34 lines
1.1 KiB
TypeScript
import ISO6391, { type LanguageCode } from 'iso-639-1';
|
|
import lande from 'lande';
|
|
import linkify from 'linkifyjs';
|
|
|
|
// Module-level side effect: register the `nostr` scheme with linkify as a custom protocol.
// Presumably this lets `linkify.tokenize` classify `nostr:` URIs as links rather than plain
// text, so `detectLanguage` (which keeps only 'text' tokens) drops them — verify.
// NOTE(review): the second argument looks like linkify's `optionalSlashSlash` flag
// (scheme accepted without a following `//`) — confirm against the linkifyjs docs.
linkify.registerCustomProtocol('nostr', true);
|
|
|
|
/**
 * Detects the language of `text` and returns it only when the detector's confidence
 * is greater than or equal to `minConfidence`.
 *
 * Before detection the text is cleaned up:
 * 1. characters matching `\p{Extended_Pictographic}` (emojis) are removed;
 * 2. runs of whitespace and a set of invisible characters are collapsed to one space;
 * 3. the result is tokenized with linkify and only tokens of type `'text'` are kept.
 *
 * If fewer than 10 characters remain after trimming, no detection is attempted.
 *
 * @param text - The raw text to analyse.
 * @param minConfidence - Minimum accepted confidence. Expected to be a number between
 *   0 and 1, such as 0.95; the value is not validated here.
 * @returns The `language` subtag of the top detection result when it meets the confidence
 *   threshold and `ISO6391.validate` accepts it; otherwise `undefined`.
 */
export function detectLanguage(text: string, minConfidence: number): LanguageCode | undefined {
  // It's better to remove the emojis first, then normalise whitespace/invisible characters.
  const withoutEmojis = text.replaceAll(/\p{Extended_Pictographic}/gu, '');
  const normalized = withoutEmojis.replaceAll(/[\s\uFEFF\u00A0\u200B-\u200D\u{0FE0E}]+/gu, ' ');

  // Keep only the plain-text tokens; everything else linkify recognises is discarded.
  let plainText = '';
  for (const { t, v } of linkify.tokenize(normalized)) {
    if (t === 'text') {
      plainText += v;
    }
  }
  plainText = plainText.trim();

  // Heuristic: very short inputs are not worth running through the detector.
  if (plainText.length < 10) {
    return undefined;
  }

  // Only the first (top) entry of the detector output is considered.
  const [best] = lande(plainText);
  if (!best) {
    return undefined;
  }

  // Each result is a [code, confidence] pair. The code is parsed with Intl.Locale so that
  // its `language` subtag can be checked against ISO 639-1.
  // NOTE(review): the detector presumably emits ISO 639-3 codes — confirm against lande's docs.
  const [detectedCode, confidence] = best;
  const { language } = new Intl.Locale(detectedCode);

  if (confidence >= minConfidence && ISO6391.validate(language)) {
    return language as LanguageCode;
  }

  return undefined;
}
|