Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# v1.1.3 (YYYY/MM/DD) [IN DEVELOPMENT]

- Added support for full Telegram data exports (all chats in one file) in addition to single-chat exports. The format is detected automatically, so existing single-chat exports keep working unchanged.

# v1.1.2 (2024/08/14)

- Fixed WhatsApp channel type. [Comment](https://github.com/mlomb/chat-analytics/commit/9e25f8bfe3323fc62ce6d7474c3f38d45390358c#r145396724).
Expand Down
23 changes: 23 additions & 0 deletions pipeline/parse/parsers/Telegram.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,26 @@ interface TextArray {
| unknown;
text: string;
}

// A single exported chat.
// In the single-chat export format this is the root object of the file:
// { name, type, id, messages: [...] }
// In the multi-chat format the same shape is used for each entry of TelegramChatsContainer.list.
interface TelegramChat {
// Display name of the chat; optional, so consumers must handle it being absent
name?: string;
// Kind of chat (TelegramChannelType is declared elsewhere)
type: TelegramChannelType;
// Numeric identifier of the chat
id: number;
// Messages that belong to this chat
messages: TelegramMessage[];
}

// Value of the root-level "chats" key in the multi-chat export format:
// { chats: { about, list: [TelegramChat, ...] } }
interface TelegramChatsContainer {
// Free-text description of the section
about: string;
// One entry per exported chat
list: TelegramChat[];
}

/**
 * Root object of a multi-chat ("all chats") Telegram data export.
 *
 * Only `chats` is given a precise type. The other sections are typed as `unknown` instead of `any`
 * so that any future use has to narrow them explicitly rather than silently bypass type checking.
 */
interface TelegramDataExport {
    about?: string;
    personal_information?: unknown;
    profile_pictures?: unknown[];
    stories?: unknown[];
    contacts?: unknown;
    /** Container holding the list of exported chats */
    chats: TelegramChatsContainer;
}
106 changes: 106 additions & 0 deletions pipeline/parse/parsers/TelegramParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,129 @@ export class TelegramParser extends Parser {
/**
 * Regex to find the timestamp of the last message in a Telegram export file.
 * We use the timestamp of the last message as the `at` value (see @Parser).
 *
 * Matches both `"date": "..."` and `"date_unixtime": "..."` pairs (with an optional space after the
 * colon) and captures the quoted value. The match is purely textual, so it does not depend on how
 * deeply the message is nested in the JSON structure.
 */
static readonly TS_MSG_REGEX = /"date(?:_unixtime)?": ?"(.+?)"/gi;

/**
 * Parse a Telegram JSON export, accepting both the single-chat and the multi-chat layout.
 *
 * Yields whatever the underlying streaming helper yields while the file is consumed.
 *
 * @param file export file to read
 * @param progress optional progress tracker, forwarded to the streaming helper
 */
async *parse(file: FileInput, progress?: Progress) {
    // Timestamp of the last message in the file; passed along when guilds/channels are emitted
    this.lastMessageTimestampInFile = await tryToFindTimestampAtEnd(TelegramParser.TS_MSG_REGEX, file);

    // The two layouts need different stream handlers, so the layout is decided up front
    const isMultiChatExport = await this.detectMultiChatFormat(file);

    if (isMultiChatExport) {
        yield* this.parseMultiChatExport(file, progress);
    } else {
        yield* this.parseSingleChatExport(file, progress);
    }

    // Reset per-file state so nothing leaks into the next file parsed by this instance.
    // The channel type is reset too: it is per-chat state just like the name and the ID.
    this.lastChannelName = undefined;
    this.lastChannelType = undefined;
    this.lastChannelID = undefined;
    this.lastEmittedMessageTimestamp = undefined;
}

/**
 * Detect whether `file` is a multi-chat export by sampling its beginning.
 *
 * Multi-chat exports have:  { ..., chats: { about, list: [...] } }
 * Single-chat exports have: { name, type, id, messages: [...] }
 *
 * Only JSON keys are considered (a quoted name followed by `:`). Quotes inside JSON string values are
 * escaped, so a message whose text happens to be `chats` or `list` cannot be mistaken for a key.
 *
 * NOTE(review): only the first 2 MiB are inspected. An export whose sections preceding `chats` exceed
 * that size would be classified as single-chat — confirm whether this can happen in practice.
 *
 * @param file export file to inspect
 * @returns `true` if the sample looks like a multi-chat export; `false` otherwise, including when the
 *          sample cannot be read or decoded (single-chat is the backward-compatible default)
 */
private async detectMultiChatFormat(file: FileInput): Promise<boolean> {
    // Upper bound on how much of the file is sampled
    const MAX_SAMPLE_BYTES = 2 * 1024 * 1024;

    try {
        const sampleSize = Math.min(MAX_SAMPLE_BYTES, file.size || MAX_SAMPLE_BYTES);
        const buffer = await file.slice(0, sampleSize);
        const text = new TextDecoder("utf-8").decode(buffer);

        // A multi-chat export must contain a `chats` key whose value is an object
        const chatsKeyAt = text.search(/"chats"\s*:\s*\{/);
        if (chatsKeyAt === -1) return false;

        // In a single-chat export the root `messages` array opens before anything that could look
        // like a `chats` key; in a multi-chat export every `messages` array is nested inside `chats`
        const messagesKeyAt = text.search(/"messages"\s*:\s*\[/);
        if (messagesKeyAt !== -1 && messagesKeyAt < chatsKeyAt) return false;

        // The chat list has to follow the `chats` key. A `list` key that appears earlier
        // (in a section preceding `chats`) does not count.
        return /"list"\s*:\s*\[/.test(text.slice(chatsKeyAt));
    } catch (err) {
        // If detection fails, assume single-chat format for backward compatibility
        return false;
    }
}

/**
 * Stream a single-chat export, whose root object is the chat itself:
 * { name, type, id, messages: [...] }
 *
 * @param file export file to read
 * @param progress optional progress tracker, forwarded to the streaming helper
 */
private async *parseSingleChatExport(file: FileInput, progress?: Progress) {
    // Handlers are bound once so they keep `this` when invoked by the stream
    const handleName = this.onChannelName.bind(this);
    const handleType = this.onChannelType.bind(this);
    const handleId = this.onChannelId.bind(this);
    const handleMessage = this.parseMessage.bind(this);

    // Chat metadata comes from the root-level keys; messages are handled one array item at a time
    const jsonStream = new JSONStream()
        .onObject<string>("name", handleName)
        .onObject<TelegramChannelType>("type", handleType)
        .onObject<RawID>("id", handleId)
        .onArrayItem<TelegramMessage>("messages", handleMessage);

    yield* streamJSONFromFile(jsonStream, file, progress);
}

/**
 * Stream a multi-chat export: { chats: { list: [TelegramChat, ...] } }
 *
 * NOTE(review): the handler receives the complete value of the `chats` key in one call, so the
 * whole chat list is presumably held in memory at once — confirm against JSONStream.
 *
 * @param file export file to read
 * @param progress optional progress tracker, forwarded to the streaming helper
 */
private async *parseMultiChatExport(file: FileInput, progress?: Progress) {
    const handleChats = this.parseChatsContainer.bind(this);
    const jsonStream = new JSONStream().onObject<TelegramChatsContainer>("chats", handleChats);

    yield* streamJSONFromFile(jsonStream, file, progress);
}

/**
 * Parse the chats container object from multi-chat exports, handling each listed chat in order.
 *
 * A container without a `list` array is ignored instead of throwing, since the value comes
 * straight from the export file and its shape is not validated beforehand.
 *
 * @param chatsContainer value of the root-level `chats` key
 */
private parseChatsContainer(chatsContainer: TelegramChatsContainer) {
    if (!chatsContainer || !Array.isArray(chatsContainer.list)) return;

    for (const chat of chatsContainer.list) {
        this.parseChatObject(chat);
    }
}

/**
 * Process one chat of a multi-chat export: record its metadata, emit its guild and channel,
 * then parse each of its messages.
 *
 * All per-chat state is overwritten up front so nothing from a previously processed chat
 * carries over into this one.
 *
 * @param chat chat entry taken from the export's chat list
 */
private parseChatObject(chat: TelegramChat) {
    // Per-chat state (presumably read by parseMessage); replaces whatever the previous chat left behind
    this.lastChannelName = chat.name;
    this.lastChannelType = chat.type;
    this.lastChannelID = chat.id;
    this.lastEmittedMessageTimestamp = undefined;

    // Every chat is placed under the same guild (id 0)
    const pguild: PGuild = {
        id: 0,
        name: "Telegram Chats",
    };
    const pchannel: PChannel = {
        id: chat.id,
        guildId: 0,
        name: chat.name || "Telegram chat",
        // One-to-one conversations (with a person or a bot) are DMs; everything else is a group
        type: ["personal_chat", "bot_chat"].includes(chat.type || "") ? "dm" : "group",
    };

    this.emit("guild", pguild, this.lastMessageTimestampInFile);
    this.emit("channel", pchannel, this.lastMessageTimestampInFile);

    // A chat without a `messages` array has nothing to parse; skipping avoids throwing on malformed input
    const messages = Array.isArray(chat.messages) ? chat.messages : [];
    for (const message of messages) {
        this.parseMessage(message);
    }
}

private onChannelName(channelName: string) {
Expand Down
2 changes: 2 additions & 0 deletions tests/parse/Parsers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ describe("should parse correctly", () => {
{ parser: WhatsAppParser, inputs: ["whatsapp/4A_11M.zip"] },

{ parser: TelegramParser, inputs: ["telegram/DM_2A_7M.json"] },
{ parser: TelegramParser, inputs: ["telegram/MultiChat_2C_6M.json"] },

{ parser: MessengerParser, inputs: ["messenger/2A_7M.json"] },

Expand Down Expand Up @@ -53,6 +54,7 @@ describe("timestamp of the last message at the end of the file", () => {
{ file: "discord/SV_5A_5M.json", regex: DiscordParser.TS_MSG_REGEX, lastMessageTimestamp: new Date("2018-05-20T16:09:51.118+00:00").getTime() },

{ file: "telegram/DM_2A_7M.json", regex: TelegramParser.TS_MSG_REGEX, lastMessageTimestamp: 1691719862 },
{ file: "telegram/MultiChat_2C_6M.json", regex: TelegramParser.TS_MSG_REGEX, lastMessageTimestamp: 1672747200 },
];

test.each(cases)("$file", async ({ file, regex, lastMessageTimestamp }) => {
Expand Down
112 changes: 112 additions & 0 deletions tests/samples/telegram/MultiChat_2C_6M.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
{
"about": "Test multi-chat export",
"personal_information": {
"user_id": 100000001,
"first_name": "Test",
"last_name": "User"
},
"chats": {
"about": "This page lists all chats from this export.",
"list": [
{
"name": "Chat One",
"type": "personal_chat",
"id": 700000001,
"messages": [
{
"id": 1001,
"type": "message",
"date": "2023-01-01T10:00:00",
"date_unixtime": "1672574400",
"from": "Alice",
"from_id": "user300000000",
"text": "Hello from chat one!",
"text_entities": [
{
"type": "plain",
"text": "Hello from chat one!"
}
]
},
{
"id": 1002,
"type": "message",
"date": "2023-01-01T10:01:00",
"date_unixtime": "1672574460",
"from": "Bob",
"from_id": "user700000000",
"text": "Hi there!",
"text_entities": [
{
"type": "plain",
"text": "Hi there!"
}
]
}
]
},
{
"name": "Chat Two",
"type": "private_group",
"id": 700000002,
"messages": [
{
"id": 2001,
"type": "message",
"date": "2023-01-02T10:00:00",
"date_unixtime": "1672660800",
"from": "Charlie",
"from_id": "user800000000",
"text": "Welcome to chat two!",
"text_entities": [
{
"type": "plain",
"text": "Welcome to chat two!"
}
]
},
{
"id": 2002,
"type": "message",
"date": "2023-01-02T10:02:00",
"date_unixtime": "1672660920",
"from": "David",
"from_id": "user900000000",
"text": "Group chat is fun!",
"text_entities": [
{
"type": "plain",
"text": "Group chat is fun!"
}
]
},
{
"id": 2003,
"type": "service",
"date": "2023-01-02T10:03:00",
"date_unixtime": "1672660980",
"actor": "Charlie",
"actor_id": "user800000000",
"action": "phone_call",
"duration_seconds": 120,
"text": "",
"text_entities": []
},
{
"id": 2004,
"type": "message",
"date": "2023-01-03T10:00:00",
"date_unixtime": "1672747200",
"from": "Eve",
"from_id": "user101000000",
"photo": "(File not included. Change data exporting settings to download.)",
"width": 800,
"height": 600,
"text": "",
"text_entities": []
}
]
}
]
}
}
65 changes: 65 additions & 0 deletions tests/samples/telegram/MultiChat_2C_6M.json.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { AttachmentType } from "@pipeline/Attachments";

import type { ExpectedPartialParseResult } from "@tests/parse/Parse";
import { PGUILD_DEFAULT } from "@tests/samples/telegram/Common";

// IDs of the two chats contained in MultiChat_2C_6M.json
const CHAT_ONE_ID = 700000001;
const CHAT_TWO_ID = 700000002;

/** Convert a Unix timestamp in seconds to milliseconds. */
const toMs = (unixSeconds: number) => unixSeconds * 1000;

/**
 * Expected parse result for the multi-chat sample: two channels under the default guild,
 * five regular messages and one call (the `phone_call` service entry with id 2003).
 */
export const expectedParse: ExpectedPartialParseResult = {
    guilds: [PGUILD_DEFAULT],
    channels: [
        { id: CHAT_ONE_ID, guildId: 0, type: "dm", name: "Chat One" },
        { id: CHAT_TWO_ID, guildId: 0, type: "group", name: "Chat Two" },
    ],
    authors: [
        { id: "user300000000", name: "Alice", bot: false },
        { id: "user700000000", name: "Bob", bot: false },
        { id: "user800000000", name: "Charlie", bot: false },
        { id: "user900000000", name: "David", bot: false },
        { id: "user101000000", name: "Eve", bot: false },
    ],
    messages: [
        // Chat One
        {
            id: "1001",
            channelId: CHAT_ONE_ID,
            authorId: "user300000000",
            timestamp: toMs(1672574400),
            textContent: "Hello from chat one!",
        },
        {
            id: "1002",
            channelId: CHAT_ONE_ID,
            authorId: "user700000000",
            timestamp: toMs(1672574460),
            textContent: "Hi there!",
        },
        // Chat Two
        {
            id: "2001",
            channelId: CHAT_TWO_ID,
            authorId: "user800000000",
            timestamp: toMs(1672660800),
            textContent: "Welcome to chat two!",
        },
        {
            id: "2002",
            channelId: CHAT_TWO_ID,
            authorId: "user900000000",
            timestamp: toMs(1672660920),
            textContent: "Group chat is fun!",
        },
        {
            // Photo-only message: no text content is expected, only the attachment
            id: "2004",
            channelId: CHAT_TWO_ID,
            authorId: "user101000000",
            timestamp: toMs(1672747200),
            attachments: [AttachmentType.Image],
        },
    ],
    calls: [
        {
            // The call ends `duration_seconds` (120) after it starts
            id: "2003",
            channelId: CHAT_TWO_ID,
            authorId: "user800000000",
            timestampStart: toMs(1672660980),
            timestampEnd: toMs(1672660980 + 120),
        },
    ],
};