Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pipeline/aggregate/blocks/language/LanguageStats.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,13 @@ const fn: BlockFn<LanguageStats> = (database, filters, common, args) => {
.filter((lang) => lang.value < langThreshold)
.reduce((sum, lang) => sum + lang.value, 0);
const languageList = allLanguages.filter((lang) => lang.value >= langThreshold);
languageList.push({ index: 0, value: totalUnreliable });

// since langIndex can be 0 now, index 0 may already appear in languagesCount,
// so we add to the existing entry's value, or push a new entry if it doesn't exist
const utdIndex = languageList.findIndex((item) => item.index == 0);
if (utdIndex < 0) languageList.push({ index: 0, value: totalUnreliable });
else languageList[utdIndex].value += totalUnreliable;

languageList.sort((a, b) => b.value - a.value);

return {
Expand Down
28 changes: 19 additions & 9 deletions pipeline/process/MessageProcessor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,28 @@ export class MessageProcessor {

// detect language in the whole group text
// this yields better accuracy
let langIndex: number | undefined;
if (allText.length > 0) {
langIndex = this.langPredictModel!.identifyLanguage(allText).iso639index;
let langIndex: number = 0; // 0 = "Unreliable to detect"

// See https://github.com/mlomb/chat-analytics/pull/110
const accuracy_threshold = 0; // language model accuracy must be at least this high (0-1)
const word_count_threshold = 1; // must have at least this many words for language detection
const word_count: number = tokenizations
.map((msg) => msg.reduce((sum, token) => (token.tag == "word" ? sum + 1 : sum), 0)) // sum the number of words per message
.reduce((sum, len) => sum + len, 0); // sum the number of words in all messages in the group

if (word_count >= word_count_threshold) {
const result = this.langPredictModel!.identifyLanguage(allText);

if (result.accuracy >= accuracy_threshold) {
langIndex = result.iso639index;
}
}

return group.map((message, index) => this.processMessage(message, tokenizations[index], langIndex));
}

/** Process the given message. Also takes the tokens for the message, and other information. */
private processMessage(msg: PMessage, tokens: Token[], langIndex: number | undefined): Message {
private processMessage(msg: PMessage, tokens: Token[], langIndex: number): Message {
const wordsCount = new IndexCountsBuilder();
const emojisCount = new IndexCountsBuilder();
const mentionsCount = new IndexCountsBuilder();
Expand Down Expand Up @@ -114,10 +126,8 @@ export class MessageProcessor {
}

// sentiment analysis
let sentiment = 0;
if (langIndex) {
sentiment = this.sentiment?.calculate(tokens, langIndex) || 0;
}
// note that if langIndex is 0 (no language), the following call returns undefined (as expected)
const sentiment = this.sentiment?.calculate(tokens, langIndex);

let replyOffset: number | undefined = undefined;
if (msg.replyTo) {
Expand Down Expand Up @@ -147,7 +157,7 @@ export class MessageProcessor {
authorIndex: this.builder.authors.getIndex(msg.authorId)!,
replyOffset,
langIndex,
sentiment: langIndex !== undefined ? sentiment : undefined,
sentiment,
words: wordsCount.toArray(),
emojis: emojisCount.toArray(),
mentions: mentionsCount.toArray(),
Expand Down
2 changes: 1 addition & 1 deletion report/components/cards/language/LanguageStatsTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ const LanguageStatsTable = () => {
depth: 1,
tooltip:
language.index === 0
? "Messages that did not have enough text to reliable detect the language"
? "Messages that did not have enough text to reliably detect the language"
: undefined,
} as Line)
) ?? []),
Expand Down