From 98a242b12c17939bd5abd92f8b0b4b74bb2319b7 Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Sun, 26 May 2024 16:11:20 -0600
Subject: [PATCH 01/12] fix typo in user-facing tooltip text

---
 report/components/cards/language/LanguageStatsTable.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/report/components/cards/language/LanguageStatsTable.tsx b/report/components/cards/language/LanguageStatsTable.tsx
index ed6816aa..85c39c71 100644
--- a/report/components/cards/language/LanguageStatsTable.tsx
+++ b/report/components/cards/language/LanguageStatsTable.tsx
@@ -40,7 +40,7 @@ const LanguageStatsTable = () => {
                     depth: 1,
                     tooltip:
                         language.index === 0
-                            ? "Messages that did not have enough text to reliable detect the language"
+                            ? "Messages that did not have enough text to reliably detect the language"
                             : undefined,
                 } as Line)
         ) ?? []),

From cf66bc2ec964e68e76cfaae268f72c4e585bbdb4 Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Sun, 26 May 2024 16:12:07 -0600
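Subject: [PATCH 02/12] improve language processing logic

- Lower the minimum share for a language to count as reliable from 3% to
  0.1% (LanguageStats and DatabaseBuilder).
- Add the messages of languages below the cutoff to the existing
  index 0 entry instead of always pushing a second one.
- Default langIndex to 0 and only run language detection when the group
  has enough words; only accept the result when the model's accuracy
  reaches the threshold.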

---
 pipeline/aggregate/blocks/language/LanguageStats.ts | 10 ++++++++--
 pipeline/process/DatabaseBuilder.ts                 |  7 +++++--
 pipeline/process/MessageProcessor.ts                | 13 ++++++++++---
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts
index cbf98ead..11d4be52 100644
--- a/pipeline/aggregate/blocks/language/LanguageStats.ts
+++ b/pipeline/aggregate/blocks/language/LanguageStats.ts
@@ -37,13 +37,19 @@ const fn: BlockFn<LanguageStats> = (database, filters, common, args) => {
     filterMessages(processMessage, database, filters);
 
     // lang
-    const langThreshold = Math.max(1, totalWithLang * 0.03); // at least 3% to be reliable
+    const langThreshold = Math.max(1, totalWithLang * 0.001); // at least 0.1% to be reliable
     const allLanguages = languagesCount.map((count, index) => ({ index, value: count }));
     const totalUnreliable = allLanguages
         .filter((lang) => lang.value < langThreshold)
         .reduce((sum, lang) => sum + lang.value, 0);
     const languageList = allLanguages.filter((lang) => lang.value >= langThreshold);
-    languageList.push({ index: 0, value: totalUnreliable });
+    try {
+        languageList[0].value += totalUnreliable; // append cutoff languages to unreliable languages count
+    }
+    catch {
+        // if the index didn't exist, push the value instead
+        languageList.push({ index: 0, value: totalUnreliable });
+    }
     languageList.sort((a, b) => b.value - a.value);
 
     return {
diff --git a/pipeline/process/DatabaseBuilder.ts b/pipeline/process/DatabaseBuilder.ts
index 318cd15c..2948ec9d 100644
--- a/pipeline/process/DatabaseBuilder.ts
+++ b/pipeline/process/DatabaseBuilder.ts
@@ -204,11 +204,13 @@ export class DatabaseBuilder {
         // we determine which languages we have to correctly filter stopwords
 
         const totalWithLang = this.langCounts.reduce((a, b) => a + b, 0);
+        console.log(totalWithLang)
+        console.log(this.langCounts)
         return (
             this.langCounts
                 .map((count, index) => ({ code: LanguageCodes[index], ratio: count / totalWithLang }))
-                // we need AT LEAST 3% to consider reliable
-                .filter((l) => l.ratio >= 0.03)
+                // we need AT LEAST 0.1% to consider reliable
+                .filter((l) => l.ratio >= 0.001)
                 // sort most used
                 .sort((a, b) => b.ratio - a.ratio)
                 // only keep the code
@@ -470,6 +472,7 @@ export class DatabaseBuilder {
         const { finalMessages } = this.compactMessagesData(channels, dateKeys);
         const { finalCalls } = this.processCalls(dateKeys);
 
+        console.log(langs)
         return {
             config: this.config,
             generatedAt: new Date().toISOString(),
diff --git a/pipeline/process/MessageProcessor.ts b/pipeline/process/MessageProcessor.ts
index 4521ba45..d6d698de 100644
--- a/pipeline/process/MessageProcessor.ts
+++ b/pipeline/process/MessageProcessor.ts
@@ -49,9 +49,16 @@ export class MessageProcessor {
 
         // detect language in the whole group text
         // this yields better accuracy
-        let langIndex: number | undefined;
-        if (allText.length > 0) {
-            langIndex = this.langPredictModel!.identifyLanguage(allText).iso639index;
+        let langIndex: number = 0;
+        let result;
+        const accuracy_threshold = 0.7; // language model accuracy must be at least this high (0-1)
+        const word_count_threshold = 6; // must have at least this many words for language detection
+        let word_count: number = allText.split(" ").length - 1;
+        if (word_count >= word_count_threshold) {
+            result = this.langPredictModel!.identifyLanguage(allText);
+            if (result.accuracy >= accuracy_threshold) {
+                langIndex = result.iso639index;
+            }
         }
 
         return group.map((message, index) => this.processMessage(message, tokenizations[index], langIndex));

From b67bf2c23ba77de0bf0df2fb413bdf771118b2e9 Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Sun, 26 May 2024 17:01:14 -0600
Subject: [PATCH 03/12] remove log commands I forgot about

---
 pipeline/process/DatabaseBuilder.ts | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pipeline/process/DatabaseBuilder.ts b/pipeline/process/DatabaseBuilder.ts
index 2948ec9d..aa7dc958 100644
--- a/pipeline/process/DatabaseBuilder.ts
+++ b/pipeline/process/DatabaseBuilder.ts
@@ -204,8 +204,6 @@ export class DatabaseBuilder {
         // we determine which languages we have to correctly filter stopwords
 
         const totalWithLang = this.langCounts.reduce((a, b) => a + b, 0);
-        console.log(totalWithLang)
-        console.log(this.langCounts)
         return (
             this.langCounts
                 .map((count, index) => ({ code: LanguageCodes[index], ratio: count / totalWithLang }))
@@ -472,7 +470,6 @@ export class DatabaseBuilder {
         const { finalMessages } = this.compactMessagesData(channels, dateKeys);
         const { finalCalls } = this.processCalls(dateKeys);
 
-        console.log(langs)
         return {
             config: this.config,
             generatedAt: new Date().toISOString(),

From 51bd0d3b29b82a6203e4fdd6831238c2e40eea4d Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Sun, 26 May 2024 17:02:03 -0600
Subject: [PATCH 04/12] fix format with npm run format

---
 pipeline/aggregate/blocks/language/LanguageStats.ts | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts
index 11d4be52..61fb39ea 100644
--- a/pipeline/aggregate/blocks/language/LanguageStats.ts
+++ b/pipeline/aggregate/blocks/language/LanguageStats.ts
@@ -45,8 +45,7 @@ const fn: BlockFn<LanguageStats> = (database, filters, common, args) => {
     const languageList = allLanguages.filter((lang) => lang.value >= langThreshold);
     try {
         languageList[0].value += totalUnreliable; // append cutoff languages to unreliable languages count
-    }
-    catch {
+    } catch {
         // if the index didn't exist, push the value instead
         languageList.push({ index: 0, value: totalUnreliable });
     }

From c015b1498151805e8cd59569b4b5a0f92c23411b Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Sat, 1 Jun 2024 12:16:28 -0600
Subject: [PATCH 05/12] correct word count for language detection cutoff

---
 pipeline/process/MessageProcessor.ts | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pipeline/process/MessageProcessor.ts b/pipeline/process/MessageProcessor.ts
index d6d698de..95d6ad66 100644
--- a/pipeline/process/MessageProcessor.ts
+++ b/pipeline/process/MessageProcessor.ts
@@ -52,8 +52,10 @@ export class MessageProcessor {
         let langIndex: number = 0;
         let result;
         const accuracy_threshold = 0.7; // language model accuracy must be at least this high (0-1)
-        const word_count_threshold = 6; // must have at least this many words for language detection
-        let word_count: number = allText.split(" ").length - 1;
+        const word_count_threshold = 1; // must have at least this many words for language detection
+        let word_count: number = tokenizations
+            .map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message
+            .reduce((sum, len) => sum + len, 0); // sum the number of words in all messages in the group
         if (word_count >= word_count_threshold) {
             result = this.langPredictModel!.identifyLanguage(allText);
             if (result.accuracy >= accuracy_threshold) {

From 1842711f09ea51ca700ac130da572c54a83c2674 Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Sun, 2 Jun 2024 18:41:52 -0600
Subject: [PATCH 06/12] fix bug in Unreliable to Detect counter

---
 pipeline/aggregate/blocks/language/LanguageStats.ts | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts
index 61fb39ea..2415dfaa 100644
--- a/pipeline/aggregate/blocks/language/LanguageStats.ts
+++ b/pipeline/aggregate/blocks/language/LanguageStats.ts
@@ -1,3 +1,5 @@
+import { isUndefined } from "util";
+
 import { BlockDescription, BlockFn } from "@pipeline/aggregate/Blocks";
 import { IndexEntry } from "@pipeline/aggregate/Common";
 import { filterMessages } from "@pipeline/aggregate/Helpers";
@@ -43,11 +45,11 @@ const fn: BlockFn<LanguageStats> = (database, filters, common, args) => {
         .filter((lang) => lang.value < langThreshold)
         .reduce((sum, lang) => sum + lang.value, 0);
     const languageList = allLanguages.filter((lang) => lang.value >= langThreshold);
-    try {
-        languageList[0].value += totalUnreliable; // append cutoff languages to unreliable languages count
-    } catch {
-        // if the index didn't exist, push the value instead
-        languageList.push({ index: 0, value: totalUnreliable });
+    const UnreliableToDetectIndex = languageList.findIndex((item) => item.index == 0);
+    if (UnreliableToDetectIndex < 0) {
+        languageList.push({ index: 0, value: totalUnreliable }); // if the index didn't exist, push the value
+    } else {
+        languageList[UnreliableToDetectIndex].value += totalUnreliable; // append cutoff languages to unreliable languages count
     }
     languageList.sort((a, b) => b.value - a.value);
 

From e8331386b7738a6d72fbc914198329c23c0e8b5e Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Thu, 9 Jan 2025 13:38:59 -0700
Subject: [PATCH 07/12] reset minimum language model accuracy to 0

---
 pipeline/process/MessageProcessor.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline/process/MessageProcessor.ts b/pipeline/process/MessageProcessor.ts
index 95d6ad66..0eec29c4 100644
--- a/pipeline/process/MessageProcessor.ts
+++ b/pipeline/process/MessageProcessor.ts
@@ -51,7 +51,7 @@ export class MessageProcessor {
         // this yields better accuracy
         let langIndex: number = 0;
         let result;
-        const accuracy_threshold = 0.7; // language model accuracy must be at least this high (0-1)
+        const accuracy_threshold = 0; // language model accuracy must be at least this high (0-1)
         const word_count_threshold = 1; // must have at least this many words for language detection
         let word_count: number = tokenizations
             .map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message

From 1f97e1d4716499507b52d7b497b20c89ce1d823f Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Thu, 9 Jan 2025 13:45:39 -0700
Subject: [PATCH 08/12] set minimum language proportion to 1%

---
 pipeline/aggregate/blocks/language/LanguageStats.ts | 2 +-
 pipeline/process/DatabaseBuilder.ts                 | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts
index 2415dfaa..5e564524 100644
--- a/pipeline/aggregate/blocks/language/LanguageStats.ts
+++ b/pipeline/aggregate/blocks/language/LanguageStats.ts
@@ -39,7 +39,7 @@ const fn: BlockFn<LanguageStats> = (database, filters, common, args) => {
     filterMessages(processMessage, database, filters);
 
     // lang
-    const langThreshold = Math.max(1, totalWithLang * 0.001); // at least 0.1% to be reliable
+    const langThreshold = Math.max(1, totalWithLang * 0.01); // at least 1% to be reliable
     const allLanguages = languagesCount.map((count, index) => ({ index, value: count }));
     const totalUnreliable = allLanguages
         .filter((lang) => lang.value < langThreshold)
diff --git a/pipeline/process/DatabaseBuilder.ts b/pipeline/process/DatabaseBuilder.ts
index be497e33..1ff6a414 100644
--- a/pipeline/process/DatabaseBuilder.ts
+++ b/pipeline/process/DatabaseBuilder.ts
@@ -207,8 +207,8 @@ export class DatabaseBuilder {
         return (
             this.langCounts
                 .map((count, index) => ({ code: LanguageCodes[index], ratio: count / totalWithLang }))
-                // we need AT LEAST 0.1% to consider reliable
-                .filter((l) => l.ratio >= 0.001)
+                // we need AT LEAST 1% to consider reliable
+                .filter((l) => l.ratio >= 0.01)
                 // sort most used
                 .sort((a, b) => b.ratio - a.ratio)
                 // only keep the code

From 66926e278dccd4bc4038735849a8de2e6773e354 Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Thu, 9 Jan 2025 13:47:32 -0700
Subject: [PATCH 09/12] reset minimum language proportion to original 3%

---
 pipeline/aggregate/blocks/language/LanguageStats.ts | 2 +-
 pipeline/process/DatabaseBuilder.ts                 | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts
index 5e564524..29bf0462 100644
--- a/pipeline/aggregate/blocks/language/LanguageStats.ts
+++ b/pipeline/aggregate/blocks/language/LanguageStats.ts
@@ -39,7 +39,7 @@ const fn: BlockFn<LanguageStats> = (database, filters, common, args) => {
     filterMessages(processMessage, database, filters);
 
     // lang
-    const langThreshold = Math.max(1, totalWithLang * 0.01); // at least 1% to be reliable
+    const langThreshold = Math.max(1, totalWithLang * 0.03); // at least 3% to be reliable
     const allLanguages = languagesCount.map((count, index) => ({ index, value: count }));
     const totalUnreliable = allLanguages
         .filter((lang) => lang.value < langThreshold)
diff --git a/pipeline/process/DatabaseBuilder.ts b/pipeline/process/DatabaseBuilder.ts
index 1ff6a414..314b563f 100644
--- a/pipeline/process/DatabaseBuilder.ts
+++ b/pipeline/process/DatabaseBuilder.ts
@@ -207,8 +207,8 @@ export class DatabaseBuilder {
         return (
             this.langCounts
                 .map((count, index) => ({ code: LanguageCodes[index], ratio: count / totalWithLang }))
-                // we need AT LEAST 1% to consider reliable
-                .filter((l) => l.ratio >= 0.01)
+                // we need AT LEAST 3% to consider reliable
+                .filter((l) => l.ratio >= 0.03)
                 // sort most used
                 .sort((a, b) => b.ratio - a.ratio)
                 // only keep the code

From a404a2a51d0d83e3f80935aae124d49216d209fe Mon Sep 17 00:00:00 2001
From: Cedric Boucher <onebitalpha@gmail.com>
Date: Thu, 9 Jan 2025 13:48:49 -0700
Subject: [PATCH 10/12] remove unused accidental import

---
 pipeline/aggregate/blocks/language/LanguageStats.ts | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts
index 29bf0462..26f8b572 100644
--- a/pipeline/aggregate/blocks/language/LanguageStats.ts
+++ b/pipeline/aggregate/blocks/language/LanguageStats.ts
@@ -1,5 +1,3 @@
-import { isUndefined } from "util";
-
 import { BlockDescription, BlockFn } from "@pipeline/aggregate/Blocks";
 import { IndexEntry } from "@pipeline/aggregate/Common";
 import { filterMessages } from "@pipeline/aggregate/Helpers";

From 128134f73d5fd6d312e05607aafb8c7243e68248 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mart=C3=ADn=20Lombardo?= <to@mlomb.me>
Date: Fri, 28 Mar 2025 15:15:51 -0300
Subject: [PATCH 11/12] Make `UnreliableToDetectIndex` shorter and add comment

---
 pipeline/aggregate/blocks/language/LanguageStats.ts | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts
index 26f8b572..d516765e 100644
--- a/pipeline/aggregate/blocks/language/LanguageStats.ts
+++ b/pipeline/aggregate/blocks/language/LanguageStats.ts
@@ -43,12 +43,13 @@ const fn: BlockFn<LanguageStats> = (database, filters, common, args) => {
         .filter((lang) => lang.value < langThreshold)
         .reduce((sum, lang) => sum + lang.value, 0);
     const languageList = allLanguages.filter((lang) => lang.value >= langThreshold);
-    const UnreliableToDetectIndex = languageList.findIndex((item) => item.index == 0);
-    if (UnreliableToDetectIndex < 0) {
-        languageList.push({ index: 0, value: totalUnreliable }); // if the index didn't exist, push the value
-    } else {
-        languageList[UnreliableToDetectIndex].value += totalUnreliable; // append cutoff languages to unreliable languages count
-    }
+
+    // since langIndex can be 0 now, it can appear in languagesCount
+    // so we sum to the existing value or push it to the list if it doesn't exist
+    const utdIndex = languageList.findIndex((item) => item.index == 0);
+    if (utdIndex < 0) languageList.push({ index: 0, value: totalUnreliable });
+    else languageList[utdIndex].value += totalUnreliable;
+
     languageList.sort((a, b) => b.value - a.value);
 
     return {

From e2a55675e657467cd043cf6a1803439c79f6d310 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mart=C3=ADn=20Lombardo?= <to@mlomb.me>
Date: Fri, 28 Mar 2025 15:40:07 -0300
Subject: [PATCH 12/12] Cleaning up PR

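Since langIndex can no longer be undefined (0 now means "Unreliable to
detect"), processMessage takes a plain `number`, the sentiment is computed
without the langIndex guard, and the unused mutable locals are replaced
with `const` declarations.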
---
 pipeline/process/MessageProcessor.ts | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/pipeline/process/MessageProcessor.ts b/pipeline/process/MessageProcessor.ts
index 0eec29c4..06e38665 100644
--- a/pipeline/process/MessageProcessor.ts
+++ b/pipeline/process/MessageProcessor.ts
@@ -49,15 +49,18 @@ export class MessageProcessor {
 
         // detect language in the whole group text
         // this yields better accuracy
-        let langIndex: number = 0;
-        let result;
+        let langIndex: number = 0; // 0 = "Unreliable to detect"
+
+        // See https://github.com/mlomb/chat-analytics/pull/110
         const accuracy_threshold = 0; // language model accuracy must be at least this high (0-1)
         const word_count_threshold = 1; // must have at least this many words for language detection
-        let word_count: number = tokenizations
+        const word_count: number = tokenizations
             .map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message
             .reduce((sum, len) => sum + len, 0); // sum the number of words in all messages in the group
+
         if (word_count >= word_count_threshold) {
-            result = this.langPredictModel!.identifyLanguage(allText);
+            const result = this.langPredictModel!.identifyLanguage(allText);
+
             if (result.accuracy >= accuracy_threshold) {
                 langIndex = result.iso639index;
             }
@@ -67,7 +70,7 @@ export class MessageProcessor {
     }
 
     /** Process the given message. Also takes the tokens for the message, and other information. */
-    private processMessage(msg: PMessage, tokens: Token[], langIndex: number | undefined): Message {
+    private processMessage(msg: PMessage, tokens: Token[], langIndex: number): Message {
         const wordsCount = new IndexCountsBuilder();
         const emojisCount = new IndexCountsBuilder();
         const mentionsCount = new IndexCountsBuilder();
@@ -123,10 +126,8 @@ export class MessageProcessor {
         }
 
         // sentiment analysis
-        let sentiment = 0;
-        if (langIndex) {
-            sentiment = this.sentiment?.calculate(tokens, langIndex) || 0;
-        }
+        // note that if langIndex is 0 (no language), the following line will return undefined (as expected)
+        const sentiment = this.sentiment?.calculate(tokens, langIndex);
 
         let replyOffset: number | undefined = undefined;
         if (msg.replyTo) {
@@ -156,7 +157,7 @@ export class MessageProcessor {
             authorIndex: this.builder.authors.getIndex(msg.authorId)!,
             replyOffset,
             langIndex,
-            sentiment: langIndex !== undefined ? sentiment : undefined,
+            sentiment,
             words: wordsCount.toArray(),
             emojis: emojisCount.toArray(),
             mentions: mentionsCount.toArray(),