diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts index cbf98ead..d516765e 100644 --- a/pipeline/aggregate/blocks/language/LanguageStats.ts +++ b/pipeline/aggregate/blocks/language/LanguageStats.ts @@ -43,7 +43,13 @@ const fn: BlockFn = (database, filters, common, args) => { .filter((lang) => lang.value < langThreshold) .reduce((sum, lang) => sum + lang.value, 0); const languageList = allLanguages.filter((lang) => lang.value >= langThreshold); - languageList.push({ index: 0, value: totalUnreliable }); + + // since langIndex can be 0 now, it can appear in languagesCount + // so we sum to the existing value or push it to the list if it doesn't exist + const utdIndex = languageList.findIndex((item) => item.index == 0); + if (utdIndex < 0) languageList.push({ index: 0, value: totalUnreliable }); + else languageList[utdIndex].value += totalUnreliable; + languageList.sort((a, b) => b.value - a.value); return { diff --git a/pipeline/process/MessageProcessor.ts b/pipeline/process/MessageProcessor.ts index 4521ba45..06e38665 100644 --- a/pipeline/process/MessageProcessor.ts +++ b/pipeline/process/MessageProcessor.ts @@ -49,16 +49,28 @@ export class MessageProcessor { // detect language in the whole group text // this yields better accuracy - let langIndex: number | undefined; - if (allText.length > 0) { - langIndex = this.langPredictModel!.identifyLanguage(allText).iso639index; + let langIndex: number = 0; // 0 = "Unreliable to detect" + + // See https://github.com/mlomb/chat-analytics/pull/110 + const accuracy_threshold = 0; // language model accuracy must be at least this high (0-1) + const word_count_threshold = 1; // must have at least this many words for language detection + const word_count: number = tokenizations + .map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message + .reduce((sum, len) => sum + len, 0); // sum the number of words in all messages in the group + + if (word_count >= word_count_threshold) { + const result = this.langPredictModel!.identifyLanguage(allText); + + if (result.accuracy >= accuracy_threshold) { + langIndex = result.iso639index; + } } return group.map((message, index) => this.processMessage(message, tokenizations[index], langIndex)); } /** Process the given message. Also takes the tokens for the message, and other information. */ - private processMessage(msg: PMessage, tokens: Token[], langIndex: number | undefined): Message { + private processMessage(msg: PMessage, tokens: Token[], langIndex: number): Message { const wordsCount = new IndexCountsBuilder(); const emojisCount = new IndexCountsBuilder(); const mentionsCount = new IndexCountsBuilder(); @@ -114,10 +126,8 @@ export class MessageProcessor { } // sentiment analysis - let sentiment = 0; - if (langIndex) { - sentiment = this.sentiment?.calculate(tokens, langIndex) || 0; - } + // note that if langIndex is 0 (no language), the following line will return undefined (as expected) + const sentiment = this.sentiment?.calculate(tokens, langIndex); let replyOffset: number | undefined = undefined; if (msg.replyTo) { @@ -147,7 +157,7 @@ export class MessageProcessor { authorIndex: this.builder.authors.getIndex(msg.authorId)!, replyOffset, langIndex, - sentiment: langIndex !== undefined ? sentiment : undefined, + sentiment, words: wordsCount.toArray(), emojis: emojisCount.toArray(), mentions: mentionsCount.toArray(), diff --git a/report/components/cards/language/LanguageStatsTable.tsx b/report/components/cards/language/LanguageStatsTable.tsx index ed6816aa..85c39c71 100644 --- a/report/components/cards/language/LanguageStatsTable.tsx +++ b/report/components/cards/language/LanguageStatsTable.tsx @@ -40,7 +40,7 @@ const LanguageStatsTable = () => { depth: 1, tooltip: language.index === 0 - ? "Messages that did not have enough text to reliable detect the language" + ? "Messages that did not have enough text to reliably detect the language" : undefined, } as Line) ) ?? []),