From 98a242b12c17939bd5abd92f8b0b4b74bb2319b7 Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Sun, 26 May 2024 16:11:20 -0600 Subject: [PATCH 01/12] fix typo in user-facing tooltip text --- report/components/cards/language/LanguageStatsTable.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/report/components/cards/language/LanguageStatsTable.tsx b/report/components/cards/language/LanguageStatsTable.tsx index ed6816aa..85c39c71 100644 --- a/report/components/cards/language/LanguageStatsTable.tsx +++ b/report/components/cards/language/LanguageStatsTable.tsx @@ -40,7 +40,7 @@ const LanguageStatsTable = () => { depth: 1, tooltip: language.index === 0 - ? "Messages that did not have enough text to reliable detect the language" + ? "Messages that did not have enough text to reliably detect the language" : undefined, } as Line) ) ?? []), From cf66bc2ec964e68e76cfaae268f72c4e585bbdb4 Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Sun, 26 May 2024 16:12:07 -0600 Subject: [PATCH 02/12] improve language processing logic --- pipeline/aggregate/blocks/language/LanguageStats.ts | 10 ++++++++-- pipeline/process/DatabaseBuilder.ts | 7 +++++-- pipeline/process/MessageProcessor.ts | 13 ++++++++++--- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts index cbf98ead..11d4be52 100644 --- a/pipeline/aggregate/blocks/language/LanguageStats.ts +++ b/pipeline/aggregate/blocks/language/LanguageStats.ts @@ -37,13 +37,19 @@ const fn: BlockFn = (database, filters, common, args) => { filterMessages(processMessage, database, filters); // lang - const langThreshold = Math.max(1, totalWithLang * 0.03); // at least 3% to be reliable + const langThreshold = Math.max(1, totalWithLang * 0.001); // at least 0.1% to be reliable const allLanguages = languagesCount.map((count, index) => ({ index, value: count })); const totalUnreliable = allLanguages .filter((lang) => lang.value < langThreshold) .reduce((sum, lang) => sum + lang.value, 0); const languageList = allLanguages.filter((lang) => lang.value >= langThreshold); - languageList.push({ index: 0, value: totalUnreliable }); + try { + languageList[0].value += totalUnreliable; // append cutoff languages to unreliable languages count + } + catch { + // if the index didn't exist, push the value instead + languageList.push({ index: 0, value: totalUnreliable }); + } languageList.sort((a, b) => b.value - a.value); return { diff --git a/pipeline/process/DatabaseBuilder.ts b/pipeline/process/DatabaseBuilder.ts index 318cd15c..2948ec9d 100644 --- a/pipeline/process/DatabaseBuilder.ts +++ b/pipeline/process/DatabaseBuilder.ts @@ -204,11 +204,13 @@ export class DatabaseBuilder { // we determine which languages we have to correctly filter stopwords const totalWithLang = this.langCounts.reduce((a, b) => a + b, 0); + console.log(totalWithLang) + console.log(this.langCounts) return ( this.langCounts .map((count, index) => ({ code: LanguageCodes[index], ratio: count / totalWithLang })) - // we need AT LEAST 3% to consider reliable - .filter((l) => l.ratio >= 0.03) + // we need AT LEAST 0.1% to consider reliable + .filter((l) => l.ratio >= 0.001) // sort most used .sort((a, b) => b.ratio - a.ratio) // only keep the code @@ -470,6 +472,7 @@ export class DatabaseBuilder { const { finalMessages } = this.compactMessagesData(channels, dateKeys); const { finalCalls } = this.processCalls(dateKeys); + console.log(langs) return { config: this.config, generatedAt: new Date().toISOString(), diff --git a/pipeline/process/MessageProcessor.ts b/pipeline/process/MessageProcessor.ts index 4521ba45..d6d698de 100644 --- a/pipeline/process/MessageProcessor.ts +++ b/pipeline/process/MessageProcessor.ts @@ -49,9 +49,16 @@ export class MessageProcessor { // detect language in the whole group text // this yields better accuracy - let langIndex: number | undefined; - if (allText.length > 0) { - langIndex = this.langPredictModel!.identifyLanguage(allText).iso639index; + let langIndex: number = 0; + let result; + const accuracy_threshold = 0.7; // language model accuracy must be at least this high (0-1) + const word_count_threshold = 6; // must have at least this many words for language detection + let word_count: number = allText.split(" ").length - 1; + if (word_count >= word_count_threshold) { + result = this.langPredictModel!.identifyLanguage(allText); + if (result.accuracy >= accuracy_threshold) { + langIndex = result.iso639index; + } } return group.map((message, index) => this.processMessage(message, tokenizations[index], langIndex)); From b67bf2c23ba77de0bf0df2fb413bdf771118b2e9 Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Sun, 26 May 2024 17:01:14 -0600 Subject: [PATCH 03/12] remove log commands I forgot about --- pipeline/process/DatabaseBuilder.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/pipeline/process/DatabaseBuilder.ts b/pipeline/process/DatabaseBuilder.ts index 2948ec9d..aa7dc958 100644 --- a/pipeline/process/DatabaseBuilder.ts +++ b/pipeline/process/DatabaseBuilder.ts @@ -204,8 +204,6 @@ export class DatabaseBuilder { // we determine which languages we have to correctly filter stopwords const totalWithLang = this.langCounts.reduce((a, b) => a + b, 0); - console.log(totalWithLang) - console.log(this.langCounts) return ( this.langCounts .map((count, index) => ({ code: LanguageCodes[index], ratio: count / totalWithLang })) @@ -472,7 +470,6 @@ export class DatabaseBuilder { const { finalMessages } = this.compactMessagesData(channels, dateKeys); const { finalCalls } = this.processCalls(dateKeys); - console.log(langs) return { config: this.config, generatedAt: new Date().toISOString(), From 51bd0d3b29b82a6203e4fdd6831238c2e40eea4d Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Sun, 26 May 2024 17:02:03 -0600 Subject: [PATCH 04/12] fix format with npm run format --- pipeline/aggregate/blocks/language/LanguageStats.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts index 11d4be52..61fb39ea 100644 --- a/pipeline/aggregate/blocks/language/LanguageStats.ts +++ b/pipeline/aggregate/blocks/language/LanguageStats.ts @@ -45,8 +45,7 @@ const fn: BlockFn = (database, filters, common, args) => { const languageList = allLanguages.filter((lang) => lang.value >= langThreshold); try { languageList[0].value += totalUnreliable; // append cutoff languages to unreliable languages count - } - catch { + } catch { // if the index didn't exist, push the value instead languageList.push({ index: 0, value: totalUnreliable }); } From c015b1498151805e8cd59569b4b5a0f92c23411b Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Sat, 1 Jun 2024 12:16:28 -0600 Subject: [PATCH 05/12] correct word count for language detection cutoff --- pipeline/process/MessageProcessor.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipeline/process/MessageProcessor.ts b/pipeline/process/MessageProcessor.ts index d6d698de..95d6ad66 100644 --- a/pipeline/process/MessageProcessor.ts +++ b/pipeline/process/MessageProcessor.ts @@ -52,8 +52,10 @@ export class MessageProcessor { let langIndex: number = 0; let result; const accuracy_threshold = 0.7; // language model accuracy must be at least this high (0-1) - const word_count_threshold = 6; // must have at least this many words for language detection - let word_count: number = allText.split(" ").length - 1; + const word_count_threshold = 1; // must have at least this many words for language detection + let word_count: number = tokenizations + .map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message + .reduce((sum, len) => sum + len, 0); // sum the number of words in all messages in the group if (word_count >= word_count_threshold) { result = this.langPredictModel!.identifyLanguage(allText); if (result.accuracy >= accuracy_threshold) { From 1842711f09ea51ca700ac130da572c54a83c2674 Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Sun, 2 Jun 2024 18:41:52 -0600 Subject: [PATCH 06/12] fix bug in Unreliable to Detect counter --- pipeline/aggregate/blocks/language/LanguageStats.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts index 61fb39ea..2415dfaa 100644 --- a/pipeline/aggregate/blocks/language/LanguageStats.ts +++ b/pipeline/aggregate/blocks/language/LanguageStats.ts @@ -1,3 +1,5 @@ +import { isUndefined } from "util"; + import { BlockDescription, BlockFn } from "@pipeline/aggregate/Blocks"; import { IndexEntry } from "@pipeline/aggregate/Common"; import { filterMessages } from "@pipeline/aggregate/Helpers"; @@ -43,11 +45,11 @@ const fn: BlockFn = (database, filters, common, args) => { .filter((lang) => lang.value < langThreshold) .reduce((sum, lang) => sum + lang.value, 0); const languageList = allLanguages.filter((lang) => lang.value >= langThreshold); - try { - languageList[0].value += totalUnreliable; // append cutoff languages to unreliable languages count - } catch { - // if the index didn't exist, push the value instead - languageList.push({ index: 0, value: totalUnreliable }); + const UnreliableToDetectIndex = languageList.findIndex((item) => item.index == 0); + if (UnreliableToDetectIndex < 0) { + languageList.push({ index: 0, value: totalUnreliable }); // if the index didn't exist, push the value + } else { + languageList[UnreliableToDetectIndex].value += totalUnreliable; // append cutoff languages to unreliable languages count } languageList.sort((a, b) => b.value - a.value); From e8331386b7738a6d72fbc914198329c23c0e8b5e Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Thu, 9 Jan 2025 13:38:59 -0700 Subject: [PATCH 07/12] reset minimum language model accuracy to 0 --- pipeline/process/MessageProcessor.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/process/MessageProcessor.ts b/pipeline/process/MessageProcessor.ts index 95d6ad66..0eec29c4 100644 --- a/pipeline/process/MessageProcessor.ts +++ b/pipeline/process/MessageProcessor.ts @@ -51,7 +51,7 @@ export class MessageProcessor { // this yields better accuracy let langIndex: number = 0; let result; - const accuracy_threshold = 0.7; // language model accuracy must be at least this high (0-1) + const accuracy_threshold = 0; // language model accuracy must be at least this high (0-1) const word_count_threshold = 1; // must have at least this many words for language detection let word_count: number = tokenizations .map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message From 1f97e1d4716499507b52d7b497b20c89ce1d823f Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Thu, 9 Jan 2025 13:45:39 -0700 Subject: [PATCH 08/12] set minimum language proportion to 1% --- pipeline/aggregate/blocks/language/LanguageStats.ts | 2 +- pipeline/process/DatabaseBuilder.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts index 2415dfaa..5e564524 100644 --- a/pipeline/aggregate/blocks/language/LanguageStats.ts +++ b/pipeline/aggregate/blocks/language/LanguageStats.ts @@ -39,7 +39,7 @@ const fn: BlockFn = (database, filters, common, args) => { filterMessages(processMessage, database, filters); // lang - const langThreshold = Math.max(1, totalWithLang * 0.001); // at least 0.1% to be reliable + const langThreshold = Math.max(1, totalWithLang * 0.01); // at least 1% to be reliable const allLanguages = languagesCount.map((count, index) => ({ index, value: count })); const totalUnreliable = allLanguages .filter((lang) => lang.value < langThreshold) diff --git a/pipeline/process/DatabaseBuilder.ts b/pipeline/process/DatabaseBuilder.ts index be497e33..1ff6a414 100644 --- a/pipeline/process/DatabaseBuilder.ts +++ b/pipeline/process/DatabaseBuilder.ts @@ -207,8 +207,8 @@ export class DatabaseBuilder { return ( this.langCounts .map((count, index) => ({ code: LanguageCodes[index], ratio: count / totalWithLang })) - // we need AT LEAST 0.1% to consider reliable - .filter((l) => l.ratio >= 0.001) + // we need AT LEAST 1% to consider reliable + .filter((l) => l.ratio >= 0.01) // sort most used .sort((a, b) => b.ratio - a.ratio) // only keep the code From 66926e278dccd4bc4038735849a8de2e6773e354 Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Thu, 9 Jan 2025 13:47:32 -0700 Subject: [PATCH 09/12] reset minimum language proportion to original 3% --- pipeline/aggregate/blocks/language/LanguageStats.ts | 2 +- pipeline/process/DatabaseBuilder.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts index 5e564524..29bf0462 100644 --- a/pipeline/aggregate/blocks/language/LanguageStats.ts +++ b/pipeline/aggregate/blocks/language/LanguageStats.ts @@ -39,7 +39,7 @@ const fn: BlockFn = (database, filters, common, args) => { filterMessages(processMessage, database, filters); // lang - const langThreshold = Math.max(1, totalWithLang * 0.01); // at least 1% to be reliable + const langThreshold = Math.max(1, totalWithLang * 0.03); // at least 3% to be reliable const allLanguages = languagesCount.map((count, index) => ({ index, value: count })); const totalUnreliable = allLanguages .filter((lang) => lang.value < langThreshold) diff --git a/pipeline/process/DatabaseBuilder.ts b/pipeline/process/DatabaseBuilder.ts index 1ff6a414..314b563f 100644 --- a/pipeline/process/DatabaseBuilder.ts +++ b/pipeline/process/DatabaseBuilder.ts @@ -207,8 +207,8 @@ export class DatabaseBuilder { return ( this.langCounts .map((count, index) => ({ code: LanguageCodes[index], ratio: count / totalWithLang })) - // we need AT LEAST 1% to consider reliable - .filter((l) => l.ratio >= 0.01) + // we need AT LEAST 3% to consider reliable + .filter((l) => l.ratio >= 0.03) // sort most used .sort((a, b) => b.ratio - a.ratio) // only keep the code From a404a2a51d0d83e3f80935aae124d49216d209fe Mon Sep 17 00:00:00 2001 From: Cedric Boucher Date: Thu, 9 Jan 2025 13:48:49 -0700 Subject: [PATCH 10/12] remove unused accidental import --- pipeline/aggregate/blocks/language/LanguageStats.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts index 29bf0462..26f8b572 100644 --- a/pipeline/aggregate/blocks/language/LanguageStats.ts +++ b/pipeline/aggregate/blocks/language/LanguageStats.ts @@ -1,5 +1,3 @@ -import { isUndefined } from "util"; - import { BlockDescription, BlockFn } from "@pipeline/aggregate/Blocks"; import { IndexEntry } from "@pipeline/aggregate/Common"; import { filterMessages } from "@pipeline/aggregate/Helpers"; From 128134f73d5fd6d312e05607aafb8c7243e68248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Lombardo?= Date: Fri, 28 Mar 2025 15:15:51 -0300 Subject: [PATCH 11/12] Make `UnreliableToDetectIndex` shorter and add comment --- pipeline/aggregate/blocks/language/LanguageStats.ts | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pipeline/aggregate/blocks/language/LanguageStats.ts b/pipeline/aggregate/blocks/language/LanguageStats.ts index 26f8b572..d516765e 100644 --- a/pipeline/aggregate/blocks/language/LanguageStats.ts +++ b/pipeline/aggregate/blocks/language/LanguageStats.ts @@ -43,12 +43,13 @@ const fn: BlockFn = (database, filters, common, args) => { .filter((lang) => lang.value < langThreshold) .reduce((sum, lang) => sum + lang.value, 0); const languageList = allLanguages.filter((lang) => lang.value >= langThreshold); - const UnreliableToDetectIndex = languageList.findIndex((item) => item.index == 0); - if (UnreliableToDetectIndex < 0) { - languageList.push({ index: 0, value: totalUnreliable }); // if the index didn't exist, push the value - } else { - languageList[UnreliableToDetectIndex].value += totalUnreliable; // append cutoff languages to unreliable languages count - } + + // since langIndex can be 0 now, it can appear in languagesCount + // so we sum to the existing value or push it to the list if it doesn't exist + const utdIndex = languageList.findIndex((item) => item.index == 0); + if (utdIndex < 0) languageList.push({ index: 0, value: totalUnreliable }); + else languageList[utdIndex].value += totalUnreliable; + languageList.sort((a, b) => b.value - a.value); return { From e2a55675e657467cd043cf6a1803439c79f6d310 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Lombardo?= Date: Fri, 28 Mar 2025 15:40:07 -0300 Subject: [PATCH 12/12] Cleaning up PR Made some changes since langIndex can no longer be undefined --- pipeline/process/MessageProcessor.ts | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pipeline/process/MessageProcessor.ts b/pipeline/process/MessageProcessor.ts index 0eec29c4..06e38665 100644 --- a/pipeline/process/MessageProcessor.ts +++ b/pipeline/process/MessageProcessor.ts @@ -49,15 +49,18 @@ export class MessageProcessor { // detect language in the whole group text // this yields better accuracy - let langIndex: number = 0; - let result; + let langIndex: number = 0; // 0 = "Unreliable to detect" + + // See https://github.com/mlomb/chat-analytics/pull/110 const accuracy_threshold = 0; // language model accuracy must be at least this high (0-1) const word_count_threshold = 1; // must have at least this many words for language detection - let word_count: number = tokenizations + const word_count: number = tokenizations .map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message .reduce((sum, len) => sum + len, 0); // sum the number of words in all messages in the group + if (word_count >= word_count_threshold) { - result = this.langPredictModel!.identifyLanguage(allText); + const result = this.langPredictModel!.identifyLanguage(allText); + if (result.accuracy >= accuracy_threshold) { langIndex = result.iso639index; } @@ -67,7 +70,7 @@ export class MessageProcessor { } /** Process the given message. Also takes the tokens for the message, and other information. */ - private processMessage(msg: PMessage, tokens: Token[], langIndex: number | undefined): Message { + private processMessage(msg: PMessage, tokens: Token[], langIndex: number): Message { const wordsCount = new IndexCountsBuilder(); const emojisCount = new IndexCountsBuilder(); const mentionsCount = new IndexCountsBuilder(); @@ -123,10 +126,8 @@ export class MessageProcessor { } // sentiment analysis - let sentiment = 0; - if (langIndex) { - sentiment = this.sentiment?.calculate(tokens, langIndex) || 0; - } + // note that if langIndex is 0 (no language), the following line will return undefined (as expected) + const sentiment = this.sentiment?.calculate(tokens, langIndex); let replyOffset: number | undefined = undefined; if (msg.replyTo) { @@ -156,7 +157,7 @@ export class MessageProcessor { authorIndex: this.builder.authors.getIndex(msg.authorId)!, replyOffset, langIndex, - sentiment: langIndex !== undefined ? sentiment : undefined, + sentiment, words: wordsCount.toArray(), emojis: emojisCount.toArray(), mentions: mentionsCount.toArray(),