From 56e153147fcd91d46d2e3fb58d94e3c783b47eed Mon Sep 17 00:00:00 2001 From: Nikolai Berezovskii Date: Fri, 12 Sep 2025 18:40:18 +0400 Subject: [PATCH 1/2] feat: add support for Telegram multi-chat data exports - Add automatic format detection for both single-chat and data exports - Extend TypeScript definitions for multi-chat export structure - Implement parseChatObject method for unified chat processing - Add comprehensive tests for multi-chat format - Update documentation and README with usage information - Maintain full backward compatibility with existing single-chat exports Closes: Support for parsing Telegram Desktop data exports (all chats) in addition to single-chat exports. Both formats are automatically detected and processed consistently. --- README.md | 10 +- docs/CHANGELOG.md | 2 + pipeline/parse/parsers/Telegram.d.ts | 23 ++++ pipeline/parse/parsers/TelegramParser.ts | 106 +++++++++++++++++ tests/parse/Parsers.test.ts | 2 + tests/samples/telegram/MultiChat_2C_6M.json | 112 ++++++++++++++++++ .../samples/telegram/MultiChat_2C_6M.json.ts | 65 ++++++++++ 7 files changed, 319 insertions(+), 1 deletion(-) create mode 100644 tests/samples/telegram/MultiChat_2C_6M.json create mode 100644 tests/samples/telegram/MultiChat_2C_6M.json.ts diff --git a/README.md b/README.md index 99673f3f..1d285db1 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ You can generate reports from the following platforms: |-----------|----------------------------------------------------------------------------------|--------------|------------------|-------------------------------------------------------------------------------------|------------------|------------------------|-------------|-------| | Discord | `json` from [DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter) | ✅ | ✅ | ✅ | ✅ | ✅ (until link expires) | ✅ (as text) | ✅ | | Messenger | `json` from [Facebook DYI export](https://www.facebook.com/dyi) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ (as text) | ❌ | -| 
Telegram | `json` from [Telegram Desktop](https://desktop.telegram.org/) | ✅ | ✅ | ✅ | ❌ (not provided) | ❌ | ✅ (as text) | ✅ | +|| Telegram | `json` from [Telegram Desktop](https://desktop.telegram.org/)
Supports both single-chat exports and full data exports (all chats) | ✅ | ✅ | ✅ | ❌ (not provided) | ❌ | ✅ (as text) | ✅ | | WhatsApp | `txt` or `zip` exported from a phone | ✅ | ❌ (not provided) | ✅* (if exported from iOS)
🟦 (generic if exported from Android) | ❌ (not provided) | ❌ | ✅ (as text) | ❌ | * not all languages are supported, check [WhatsApp.ts](/pipeline/parse/parsers/WhatsApp.ts). @@ -74,6 +74,14 @@ For example: npx chat-analytics -p discord -i "exported/*.json" -o report.html ``` +### Telegram exports + +Telegram Desktop supports two types of exports: +- **Single chat export**: Export messages from one specific chat (use "Export chat history" on a chat) +- **Data export**: Export all your Telegram data including all chats (use "Settings → Privacy & Security → Export Telegram data") + +Both formats are supported and will be automatically detected. + ## Docker Compose You can self-host the app using the official docker image provided at https://hub.docker.com/r/mlomb/chat-analytics. Check out the [Dockerfile](/Dockerfile). diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 5b0b477f..8348377b 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,5 +1,7 @@ # v1.1.3 (YYYY/MM/DD) [IN DEVELOPMENT] +- Added support for Telegram data exports (all chats) in addition to single-chat exports. Both formats are automatically detected and supported with full backwards compatibility. + # v1.1.2 (2024/08/14) - Fixed WhatsApp channel type. [Comment](https://github.com/mlomb/chat-analytics/commit/9e25f8bfe3323fc62ce6d7474c3f38d45390358c#r145396724). diff --git a/pipeline/parse/parsers/Telegram.d.ts b/pipeline/parse/parsers/Telegram.d.ts index 8861d230..484fdb55 100644 --- a/pipeline/parse/parsers/Telegram.d.ts +++ b/pipeline/parse/parsers/Telegram.d.ts @@ -50,3 +50,26 @@ interface TextArray { | unknown; text: string; } + +// Single chat export format: { name, type, id, messages: [...] } +interface TelegramChat { + name?: string; + type: TelegramChannelType; + id: number; + messages: TelegramMessage[]; +} + +// Multi-chat export format: { chats: { about, list: [TelegramChat, ...] 
} } +interface TelegramChatsContainer { + about: string; + list: TelegramChat[]; +} + +interface TelegramDataExport { + about?: string; + personal_information?: unknown; + profile_pictures?: unknown[]; + stories?: unknown[]; + contacts?: unknown; + chats: TelegramChatsContainer; +} diff --git a/pipeline/parse/parsers/TelegramParser.ts b/pipeline/parse/parsers/TelegramParser.ts index 5d452aca..4bd06465 100644 --- a/pipeline/parse/parsers/TelegramParser.ts +++ b/pipeline/parse/parsers/TelegramParser.ts @@ -17,12 +17,66 @@ export class TelegramParser extends Parser { /** * Regex to find the timestamp of the last message in a Telegram export file. * We use the timestamp of the last message as the `at` value (see @Parser) + * This pattern matches messages nested anywhere in the JSON structure. */ static readonly TS_MSG_REGEX = /"date(?:_unixtime)?": ?"(.+?)"/gi; async *parse(file: FileInput, progress?: Progress) { this.lastMessageTimestampInFile = await tryToFindTimestampAtEnd(TelegramParser.TS_MSG_REGEX, file); + // Detect format by reading the beginning of the file + const isMultiChatExport = await this.detectMultiChatFormat(file); + + if (isMultiChatExport) { + yield* this.parseMultiChatExport(file, progress); + } else { + yield* this.parseSingleChatExport(file, progress); + } + + // Reset state + this.lastChannelName = undefined; + this.lastChannelID = undefined; + this.lastEmittedMessageTimestamp = undefined; + } + + /** + * Detect if this is a multi-chat export by examining the structure. + * Multi-chat exports have: { chats: { list: [...] } } + * Single-chat exports have: { name, type, id, messages: [...]
} + * + * @param file export file; only a bounded prefix of it is read + * @returns true when the sampled prefix looks like a multi-chat data export, + * false otherwise (including when sampling the file fails) + */ + private async detectMultiChatFormat(file: FileInput): Promise<boolean> { + try { + // Sample at most the first 2 MiB so detection never has to load the entire file + const maxSampleBytes = 2 * 1024 * 1024; + const sampleSize = Math.min(maxSampleBytes, file.size || maxSampleBytes); + const buffer = await file.slice(0, sampleSize); + const text = new TextDecoder('utf-8').decode(buffer); + + // Heuristic on the sampled prefix only (not a real parse): a data export nests its + // chats as "chats": { ..., "list": [...] }, while a single-chat export keeps + // "name", "type", "id" and "messages" at the root and has no such container. + // Each key is matched together with the ':' and opening bracket that follow it, so a + // string value that merely equals "chats" or "list" cannot cause a false positive. + const hasChatsContainer = /"chats"\s*:\s*\{/.test(text); + const hasChatList = /"list"\s*:\s*\[/.test(text); + + // NOTE(review): a container that only starts after the sampled prefix is not seen, + // so such a file is treated as a single-chat export. + return hasChatsContainer && hasChatList; + } catch (err) { + // If detection fails, assume single-chat format for backward compatibility + return false; + } + } + + /** + * Parse a single-chat export format: { name, type, id, messages: [...] } + */ + private async *parseSingleChatExport(file: FileInput, progress?: Progress) { const stream = new JSONStream() .onObject("name", this.onChannelName.bind(this)) .onObject("type", this.onChannelType.bind(this)) @@ -30,10 +84,62 @@ export class TelegramParser extends Parser { .onArrayItem("messages", this.parseMessage.bind(this)); yield* streamJSONFromFile(stream, file, progress); + } + + /** + * Parse a multi-chat export format: { chats: { list: [TelegramChat, ...] } } + */ + private async *parseMultiChatExport(file: FileInput, progress?: Progress) { + const stream = new JSONStream() + .onObject("chats", this.parseChatsContainer.bind(this)); + + yield* streamJSONFromFile(stream, file, progress); + } + /** + * Parse the chats container object from multi-chat exports.
+ */ + private parseChatsContainer(chatsContainer: TelegramChatsContainer) { + for (const chat of chatsContainer.list) { + this.parseChatObject(chat); + } + } + + /** + * Parse one chat object taken from the "list" array of a multi-chat export. + * Resets state variables to avoid cross-chat contamination. + */ + private parseChatObject(chat: TelegramChat) { + // Reset per-chat state left over from the previously processed chat this.lastChannelName = undefined; + this.lastChannelType = undefined; this.lastChannelID = undefined; this.lastEmittedMessageTimestamp = undefined; + + // Set chat metadata + this.lastChannelName = chat.name; + this.lastChannelType = chat.type; + this.lastChannelID = chat.id; + + // Emit the guild and this chat's channel; personal and bot chats become "dm", everything else "group" + const pguild: PGuild = { + id: 0, + name: "Telegram Chats", + }; + const pchannel: PChannel = { + id: chat.id, + guildId: 0, + name: chat.name || "Telegram chat", + type: ["personal_chat", "bot_chat"].includes(chat.type || "") ? "dm" : "group", + }; + + this.emit("guild", pguild, this.lastMessageTimestampInFile); + this.emit("channel", pchannel, this.lastMessageTimestampInFile); + + // Feed every message of this chat through parseMessage, in array order + for (const message of chat.messages) { + this.parseMessage(message); + } } private onChannelName(channelName: string) { diff --git a/tests/parse/Parsers.test.ts b/tests/parse/Parsers.test.ts index e4eb615b..db7743e3 100644 --- a/tests/parse/Parsers.test.ts +++ b/tests/parse/Parsers.test.ts @@ -24,6 +24,7 @@ describe("should parse correctly", () => { { parser: WhatsAppParser, inputs: ["whatsapp/4A_11M.zip"] }, { parser: TelegramParser, inputs: ["telegram/DM_2A_7M.json"] }, + { parser: TelegramParser, inputs: ["telegram/MultiChat_2C_6M.json"] }, { parser: MessengerParser, inputs: ["messenger/2A_7M.json"] }, @@ -53,6 +54,7 @@ describe("timestamp of the last message at the end of the file", () => { { file: "discord/SV_5A_5M.json", regex: DiscordParser.TS_MSG_REGEX, lastMessageTimestamp: new Date("2018-05-20T16:09:51.118+00:00").getTime() }, { file:
"telegram/DM_2A_7M.json", regex: TelegramParser.TS_MSG_REGEX, lastMessageTimestamp: 1691719862 }, + { file: "telegram/MultiChat_2C_6M.json", regex: TelegramParser.TS_MSG_REGEX, lastMessageTimestamp: 1672747200 }, ]; test.each(cases)("$file", async ({ file, regex, lastMessageTimestamp }) => { diff --git a/tests/samples/telegram/MultiChat_2C_6M.json b/tests/samples/telegram/MultiChat_2C_6M.json new file mode 100644 index 00000000..e1306cc5 --- /dev/null +++ b/tests/samples/telegram/MultiChat_2C_6M.json @@ -0,0 +1,112 @@ +{ + "about": "Test multi-chat export", + "personal_information": { + "user_id": 100000001, + "first_name": "Test", + "last_name": "User" + }, + "chats": { + "about": "This page lists all chats from this export.", + "list": [ + { + "name": "Chat One", + "type": "personal_chat", + "id": 700000001, + "messages": [ + { + "id": 1001, + "type": "message", + "date": "2023-01-01T10:00:00", + "date_unixtime": "1672574400", + "from": "Alice", + "from_id": "user300000000", + "text": "Hello from chat one!", + "text_entities": [ + { + "type": "plain", + "text": "Hello from chat one!" + } + ] + }, + { + "id": 1002, + "type": "message", + "date": "2023-01-01T10:01:00", + "date_unixtime": "1672574460", + "from": "Bob", + "from_id": "user700000000", + "text": "Hi there!", + "text_entities": [ + { + "type": "plain", + "text": "Hi there!" + } + ] + } + ] + }, + { + "name": "Chat Two", + "type": "private_group", + "id": 700000002, + "messages": [ + { + "id": 2001, + "type": "message", + "date": "2023-01-02T10:00:00", + "date_unixtime": "1672660800", + "from": "Charlie", + "from_id": "user800000000", + "text": "Welcome to chat two!", + "text_entities": [ + { + "type": "plain", + "text": "Welcome to chat two!" 
+ } + ] + }, + { + "id": 2002, + "type": "message", + "date": "2023-01-02T10:02:00", + "date_unixtime": "1672660920", + "from": "David", + "from_id": "user900000000", + "text": "Group chat is fun!", + "text_entities": [ + { + "type": "plain", + "text": "Group chat is fun!" + } + ] + }, + { + "id": 2003, + "type": "service", + "date": "2023-01-02T10:03:00", + "date_unixtime": "1672660980", + "actor": "Charlie", + "actor_id": "user800000000", + "action": "phone_call", + "duration_seconds": 120, + "text": "", + "text_entities": [] + }, + { + "id": 2004, + "type": "message", + "date": "2023-01-03T10:00:00", + "date_unixtime": "1672747200", + "from": "Eve", + "from_id": "user101000000", + "photo": "(File not included. Change data exporting settings to download.)", + "width": 800, + "height": 600, + "text": "", + "text_entities": [] + } + ] + } + ] + } +} diff --git a/tests/samples/telegram/MultiChat_2C_6M.json.ts b/tests/samples/telegram/MultiChat_2C_6M.json.ts new file mode 100644 index 00000000..7e96d017 --- /dev/null +++ b/tests/samples/telegram/MultiChat_2C_6M.json.ts @@ -0,0 +1,65 @@ +import { AttachmentType } from "@pipeline/Attachments"; + +import type { ExpectedPartialParseResult } from "@tests/parse/Parse"; +import { PGUILD_DEFAULT } from "@tests/samples/telegram/Common"; + +export const expectedParse: ExpectedPartialParseResult = { + guilds: [PGUILD_DEFAULT], + channels: [ + { id: 700000001, guildId: 0, type: "dm", name: "Chat One" }, + { id: 700000002, guildId: 0, type: "group", name: "Chat Two" }, + ], + authors: [ + { id: "user300000000", name: "Alice", bot: false }, + { id: "user700000000", name: "Bob", bot: false }, + { id: "user800000000", name: "Charlie", bot: false }, + { id: "user900000000", name: "David", bot: false }, + { id: "user101000000", name: "Eve", bot: false }, + ], + messages: [ + { + id: "1001", + authorId: "user300000000", + channelId: 700000001, + textContent: "Hello from chat one!", + timestamp: 1672574400 * 1000, + }, + { + id: "1002", 
+ authorId: "user700000000", + channelId: 700000001, + textContent: "Hi there!", + timestamp: 1672574460 * 1000, + }, + { + id: "2001", + authorId: "user800000000", + channelId: 700000002, + textContent: "Welcome to chat two!", + timestamp: 1672660800 * 1000, + }, + { + id: "2002", + authorId: "user900000000", + channelId: 700000002, + textContent: "Group chat is fun!", + timestamp: 1672660920 * 1000, + }, + { + id: "2004", + attachments: [AttachmentType.Image], + authorId: "user101000000", + channelId: 700000002, + timestamp: 1672747200 * 1000, + }, + ], + calls: [ + { + id: "2003", + authorId: "user800000000", + channelId: 700000002, + timestampStart: 1672660980 * 1000, + timestampEnd: (1672660980 + 120) * 1000, + }, + ], +}; From 49491a51d3c1b6c7687644adf99f7ce1e7292d38 Mon Sep 17 00:00:00 2001 From: Nikolai Berezovskii Date: Fri, 12 Sep 2025 18:44:25 +0400 Subject: [PATCH 2/2] revert: remove README changes to keep PR focused on core functionality --- README.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/README.md b/README.md index 1d285db1..99673f3f 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ You can generate reports from the following platforms: |-----------|----------------------------------------------------------------------------------|--------------|------------------|-------------------------------------------------------------------------------------|------------------|------------------------|-------------|-------| | Discord | `json` from [DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter) | ✅ | ✅ | ✅ | ✅ | ✅ (until link expires) | ✅ (as text) | ✅ | | Messenger | `json` from [Facebook DYI export](https://www.facebook.com/dyi) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ (as text) | ❌ | -|| Telegram | `json` from [Telegram Desktop](https://desktop.telegram.org/)
Supports both single-chat exports and full data exports (all chats) | ✅ | ✅ | ✅ | ❌ (not provided) | ❌ | ✅ (as text) | ✅ | +| Telegram | `json` from [Telegram Desktop](https://desktop.telegram.org/) | ✅ | ✅ | ✅ | ❌ (not provided) | ❌ | ✅ (as text) | ✅ | | WhatsApp | `txt` or `zip` exported from a phone | ✅ | ❌ (not provided) | ✅* (if exported from iOS)
🟦 (generic if exported from Android) | ❌ (not provided) | ❌ | ✅ (as text) | ❌ | * not all languages are supported, check [WhatsApp.ts](/pipeline/parse/parsers/WhatsApp.ts). @@ -74,14 +74,6 @@ For example: npx chat-analytics -p discord -i "exported/*.json" -o report.html ``` -### Telegram exports - -Telegram Desktop supports two types of exports: -- **Single chat export**: Export messages from one specific chat (use "Export chat history" on a chat) -- **Data export**: Export all your Telegram data including all chats (use "Settings → Privacy & Security → Export Telegram data") - -Both formats are supported and will be automatically detected. - ## Docker Compose You can self-host the app using the official docker image provided at https://hub.docker.com/r/mlomb/chat-analytics. Check out the [Dockerfile](/Dockerfile).