pipeline/aggregate/blocks/language/LanguageStats.ts

-Original file line number
+Diff line change
@@ Expand Up @@
             .filter((lang) => lang.value < langThreshold)
             .reduce((sum, lang) => sum + lang.value, 0);
         const languageList = allLanguages.filter((lang) => lang.value >= langThreshold);
-        languageList.push({ index: 0, value: totalUnreliable });
+        // since langIndex can be 0 now, index 0 may already appear in languagesCount,
+        // so we add to its existing value, or push a new entry if it doesn't exist
+        const utdIndex = languageList.findIndex((item) => item.index == 0);
+        if (utdIndex < 0) languageList.push({ index: 0, value: totalUnreliable });
+        else languageList[utdIndex].value += totalUnreliable;
         languageList.sort((a, b) => b.value - a.value);
         return {
@@ Expand Down @@

pipeline/process/MessageProcessor.ts

-Original file line number
+Diff line change
@@ Expand Up / @@ -49,16 +49,28 @@ export class MessageProcessor { @@
             // detect language in the whole group text
             // this yields better accuracy
-            let langIndex: number | undefined;
-            if (allText.length > 0) {
-                langIndex = this.langPredictModel!.identifyLanguage(allText).iso639index;
+            let langIndex: number = 0; // 0 = "Unreliable to detect"
+            // See https://github.com/mlomb/chat-analytics/pull/110
+            const accuracy_threshold = 0; // language model accuracy must be at least this high (0-1)
+            const word_count_threshold = 1; // must have at least this many words for language detection
+            const word_count: number = tokenizations
+                .map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message
+                .reduce((sum, len) => sum + len, 0); // sum the number of words in all messages in the group
+            if (word_count >= word_count_threshold) {
+                const result = this.langPredictModel!.identifyLanguage(allText);
+                if (result.accuracy >= accuracy_threshold) {
+                    langIndex = result.iso639index;
+                }
             }
             return group.map((message, index) => this.processMessage(message, tokenizations[index], langIndex));
         }
         /** Process the given message. Also takes the tokens for the message, and other information. */
-        private processMessage(msg: PMessage, tokens: Token[], langIndex: number | undefined): Message {
+        private processMessage(msg: PMessage, tokens: Token[], langIndex: number): Message {
             const wordsCount = new IndexCountsBuilder();
             const emojisCount = new IndexCountsBuilder();
             const mentionsCount = new IndexCountsBuilder();
@@ Expand Down Expand Up / @@ -114,10 +126,8 @@ export class MessageProcessor { @@
             }
             // sentiment analysis
-            let sentiment = 0;
-            if (langIndex) {
-                sentiment = this.sentiment?.calculate(tokens, langIndex) || 0;
-            }
+            // note that if langIndex is 0 (no language), the following line will return undefined (as expected)
+            const sentiment = this.sentiment?.calculate(tokens, langIndex);
             let replyOffset: number | undefined = undefined;
             if (msg.replyTo) {
@@ Expand Down Expand Up / @@ -147,7 +157,7 @@ export class MessageProcessor { @@
                 authorIndex: this.builder.authors.getIndex(msg.authorId)!,
                 replyOffset,
                 langIndex,
-                sentiment: langIndex !== undefined ? sentiment : undefined,
+                sentiment,
                 words: wordsCount.toArray(),
                 emojis: emojisCount.toArray(),
                 mentions: mentionsCount.toArray(),
@@ Expand Down @@

report/components/cards/language/LanguageStatsTable.tsx

-Original file line number
+Diff line change
@@ Expand Up / @@ -40,7 +40,7 @@ const LanguageStatsTable = () => { @@
                         depth: 1,
                         tooltip:
                             language.index === 0
-                                ? "Messages that did not have enough text to reliable detect the language"
+                                ? "Messages that did not have enough text to reliably detect the language"
                                 : undefined,
                     } as Line)
             ) ?? []),
@@ Expand Down @@

Improve Language Processing Logic #110

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

mlomb merged 14 commits into mlomb:main from Cedric-Boucher:main

Mar 28, 2025

-Original file line number
+Diff line change
@@ Expand Up @@
             .filter((lang) => lang.value < langThreshold)
             .reduce((sum, lang) => sum + lang.value, 0);
         const languageList = allLanguages.filter((lang) => lang.value >= langThreshold);
-        languageList.push({ index: 0, value: totalUnreliable });
+        // since langIndex can be 0 now, index 0 may already appear in languagesCount,
+        // so we add to its existing value, or push a new entry if it doesn't exist
+        const utdIndex = languageList.findIndex((item) => item.index == 0);
+        if (utdIndex < 0) languageList.push({ index: 0, value: totalUnreliable });
+        else languageList[utdIndex].value += totalUnreliable;
         languageList.sort((a, b) => b.value - a.value);
         return {
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -49,16 +49,28 @@ export class MessageProcessor { @@
             // detect language in the whole group text
             // this yields better accuracy
-            let langIndex: number | undefined;
-            if (allText.length > 0) {
-                langIndex = this.langPredictModel!.identifyLanguage(allText).iso639index;
+            let langIndex: number = 0; // 0 = "Unreliable to detect"
+            // See https://github.com/mlomb/chat-analytics/pull/110
+            const accuracy_threshold = 0; // language model accuracy must be at least this high (0-1)
+            const word_count_threshold = 1; // must have at least this many words for language detection
+            const word_count: number = tokenizations
+                .map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message
+                .reduce((sum, len) => sum + len, 0); // sum the number of words in all messages in the group
+            if (word_count >= word_count_threshold) {
+                const result = this.langPredictModel!.identifyLanguage(allText);
+                if (result.accuracy >= accuracy_threshold) {
+                    langIndex = result.iso639index;
+                }
             }
             return group.map((message, index) => this.processMessage(message, tokenizations[index], langIndex));
         }
         /** Process the given message. Also takes the tokens for the message, and other information. */
-        private processMessage(msg: PMessage, tokens: Token[], langIndex: number | undefined): Message {
+        private processMessage(msg: PMessage, tokens: Token[], langIndex: number): Message {
             const wordsCount = new IndexCountsBuilder();
             const emojisCount = new IndexCountsBuilder();
             const mentionsCount = new IndexCountsBuilder();
@@ Expand Down Expand Up / @@ -114,10 +126,8 @@ export class MessageProcessor { @@
             }
             // sentiment analysis
-            let sentiment = 0;
-            if (langIndex) {
-                sentiment = this.sentiment?.calculate(tokens, langIndex) || 0;
-            }
+            // note that if langIndex is 0 (no language), the following line will return undefined (as expected)
+            const sentiment = this.sentiment?.calculate(tokens, langIndex);
             let replyOffset: number | undefined = undefined;
             if (msg.replyTo) {
@@ Expand Down Expand Up / @@ -147,7 +157,7 @@ export class MessageProcessor { @@
                 authorIndex: this.builder.authors.getIndex(msg.authorId)!,
                 replyOffset,
                 langIndex,
-                sentiment: langIndex !== undefined ? sentiment : undefined,
+                sentiment,
                 words: wordsCount.toArray(),
                 emojis: emojisCount.toArray(),
                 mentions: mentionsCount.toArray(),
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -40,7 +40,7 @@ const LanguageStatsTable = () => { @@
                         depth: 1,
                         tooltip:
                             language.index === 0
-                                ? "Messages that did not have enough text to reliable detect the language"
+                                ? "Messages that did not have enough text to reliably detect the language"
                                 : undefined,
                     } as Line)
             ) ?? []),
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Improve Language Processing Logic #110

Uh oh!

Diff view

Diff view

There are no files selected for viewing