diff --git a/README.md b/README.md index 932741c..8d082c9 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,14 @@ Each line in `conversations.jsonl` is one session: "start_time": "2025-06-15T10:00:00+00:00", "end_time": "2025-06-15T10:30:00+00:00", "messages": [ - {"role": "user", "content": "Fix the login bug", "timestamp": "..."}, + { + "role": "user", + "content": "Fix the login bug", + "content_parts": [ + {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": "..."}} + ], + "timestamp": "..." + }, { "role": "assistant", "content": "I'll investigate the login flow.", @@ -201,6 +208,8 @@ Each line in `conversations.jsonl` is one session: } ``` +`messages[].content_parts` is optional and preserves structured user content such as attachments when the source provides them. The canonical human-readable user text remains in `messages[].content`, which may be omitted when a user message consists only of structured parts (for example a lone attachment or a large binary blob). + `tool_uses[].output.raw` is optional and preserves extra structured tool-result fields when the source provides them. The canonical human-readable result text remains in `tool_uses[].output.text`. Each HF repo also includes a `metadata.json` with aggregate stats. 
diff --git a/dataclaw/_cli/exporting.py b/dataclaw/_cli/exporting.py index 16d49ff..d7c6bf8 100644 --- a/dataclaw/_cli/exporting.py +++ b/dataclaw/_cli/exporting.py @@ -1,5 +1,7 @@ """Export and publish helpers for the DataClaw CLI.""" +import hashlib +import json as std_json import sys import urllib.error import urllib.request @@ -12,6 +14,17 @@ from .common import HF_TAG, REPO_URL, SKILL_URL, _format_token_count, _provider_dataset_tags +def _gemini_dedupe_fingerprint(session: dict, source: str) -> str | None: + if source != "gemini": + return None + + canonical = dict(session) + canonical["source"] = source + canonical.pop("project", None) + payload = std_json.dumps(canonical, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(payload.encode()).hexdigest() + + def export_to_jsonl( selected_projects: list[dict], output_path: Path, @@ -28,6 +41,7 @@ def export_to_jsonl( total_input_tokens = 0 total_output_tokens = 0 project_names = [] + seen_fingerprints: set[str] = set() try: fh = open(output_path, "wb") @@ -46,14 +60,22 @@ def export_to_jsonl( ) proj_count = 0 for session in sessions: + source = session.get("source") or project.get("source", default_source) model = session.get("model") if not model or model == "": skipped += 1 continue + fingerprint = _gemini_dedupe_fingerprint(session, source) + if fingerprint is not None and fingerprint in seen_fingerprints: + continue + session, n_redacted = redact_session(session, custom_strings=custom_strings) total_redactions += n_redacted + if fingerprint is not None: + seen_fingerprints.add(fingerprint) + f.write(json.dumps_bytes(session)) f.write(b"\n") total += 1 diff --git a/dataclaw/parsers/gemini.py b/dataclaw/parsers/gemini.py index 436cea1..302472c 100644 --- a/dataclaw/parsers/gemini.py +++ b/dataclaw/parsers/gemini.py @@ -1,12 +1,21 @@ import hashlib import logging import os +from collections import defaultdict, deque from pathlib import Path from typing import Any, Callable from .. 
import _json as json from ..anonymizer import Anonymizer -from .common import collect_project_sessions, make_session_result, make_stats, update_time_bounds +from ..secrets import should_skip_large_binary_string +from .common import ( + anonymize_value, + collect_project_sessions, + make_session_result, + make_stats, + parse_tool_input, + update_time_bounds, +) logger = logging.getLogger(__name__) @@ -282,6 +291,145 @@ def parse_tool_call(tool_call: dict, anonymizer: Anonymizer) -> dict: return {"tool": name, "input": inp, "output": out, "status": status} +def anonymize_text_preserving_blobs( + text: Any, + anonymizer: Anonymizer, + *, + strip: bool = False, + drop_empty: bool = True, +) -> str | None: + if not isinstance(text, str): + return None + if should_skip_large_binary_string(text): + return text + normalized = text.strip() if strip else text + if drop_empty and not normalized.strip(): + return None + return anonymizer.text(normalized) + + +def build_gemini_call_id(name: str, args: Any, counters: dict[str, int]) -> str: + counters[name] += 1 + return f"fc_{name}_{counters[name]}" + + +def anonymize_file_uri(file_uri: Any, anonymizer: Anonymizer) -> str | None: + if not isinstance(file_uri, str): + return None + if file_uri.startswith("file://"): + return f"file://{anonymizer.path(file_uri[7:])}" + return anonymizer.text(file_uri) + + +def parse_gemini_user_part( + part: Any, + anonymizer: Anonymizer, + pending_call_ids: dict[str, deque[str]], + call_counters: dict[str, int], +) -> tuple[str | None, dict[str, Any] | None]: + if isinstance(part, str): + text = anonymize_text_preserving_blobs(part, anonymizer, drop_empty=False) + if text is None: + return None, None + if should_skip_large_binary_string(part): + return None, {"type": "text", "text": text} + return text, None + + if not isinstance(part, dict): + return None, None + + if "text" in part: + text = anonymize_text_preserving_blobs(part.get("text"), anonymizer, drop_empty=False) + if text is None: + 
return None, None + if should_skip_large_binary_string(part.get("text", "")): + return None, {"type": "text", "text": text} + return text, None + + inline = part.get("inlineData") + if isinstance(inline, dict): + mime_type = inline.get("mimeType", "") + return None, { + "type": "image" if isinstance(mime_type, str) and mime_type.startswith("image/") else "document", + "source": { + "type": "base64", + "media_type": mime_type, + "data": inline.get("data", ""), + }, + } + + file_data = part.get("fileData") + if isinstance(file_data, dict): + source: dict[str, Any] = {"type": "url"} + url = anonymize_file_uri(file_data.get("fileUri"), anonymizer) + if url: + source["url"] = url + mime_type = file_data.get("mimeType") + if mime_type: + source["media_type"] = mime_type + return None, {"type": "document", "source": source} + + function_call = part.get("functionCall") + if isinstance(function_call, dict): + name = function_call.get("name", "unknown") + args = function_call.get("args", {}) + call_id = function_call.get("id") or build_gemini_call_id(name, args, call_counters) + pending_call_ids[name].append(call_id) + return None, { + "type": "tool_use", + "id": call_id, + "name": name, + "input": parse_tool_input(name, args, anonymizer), + } + + function_response = part.get("functionResponse") + if isinstance(function_response, dict): + name = function_response.get("name", "unknown") + tool_use_id = function_response.get("id") or ( + pending_call_ids[name].popleft() if pending_call_ids.get(name) else f"fc_{name}" + ) + response = function_response.get("response") + content: Any = None + if isinstance(response, dict) and "output" in response: + content = anonymize_text_preserving_blobs(response.get("output"), anonymizer) + elif response is not None: + content = anonymize_value("response", response, anonymizer) + part_result: dict[str, Any] = {"type": "tool_result", "tool_use_id": tool_use_id} + if content not in (None, "", [], {}): + part_result["content"] = content + 
return None, part_result + + return None, None + + +def parse_gemini_user_content(content: Any, anonymizer: Anonymizer) -> tuple[str | None, list[dict[str, Any]]]: + if isinstance(content, str): + text = anonymize_text_preserving_blobs(content, anonymizer, drop_empty=False) + if text is None: + return None, [] + if should_skip_large_binary_string(content): + return None, [{"type": "text", "text": text}] + return text, [] + + if not isinstance(content, list): + return None, [] + + text_parts: list[str] = [] + content_parts: list[dict[str, Any]] = [] + pending_call_ids: dict[str, deque[str]] = defaultdict(deque) + call_counters: dict[str, int] = defaultdict(int) + + for part in content: + text, content_part = parse_gemini_user_part(part, anonymizer, pending_call_ids, call_counters) + if text is not None: + text_parts.append(text) + if content_part: + content_parts.append(content_part) + + text_content = "\n".join(text_parts) if text_parts else None + return text_content, content_parts + + def parse_session_file( filepath: Path, anonymizer: Anonymizer, @@ -313,23 +461,15 @@ def parse_session_file( timestamp = msg_data.get("timestamp") if msg_type == "user": - content = msg_data.get("content") - if isinstance(content, list): - text_parts = [part.get("text", "") for part in content if isinstance(part, dict) and "text" in part] - text = "\n".join(text_parts) - elif isinstance(content, str): - text = content - else: - continue - if not text.strip(): + text, content_parts = parse_gemini_user_content(msg_data.get("content"), anonymizer) + if text is None and not content_parts: continue - messages.append( - { - "role": "user", - "content": anonymizer.text(text.strip()), - "timestamp": timestamp, - } - ) + message: dict[str, Any] = {"role": "user", "timestamp": timestamp} + if text is not None: + message["content"] = text + if content_parts: + message["content_parts"] = content_parts + messages.append(message) stats["user_messages"] += 1 update_time_bounds(metadata, 
timestamp) diff --git a/dataclaw/secrets.py b/dataclaw/secrets.py index 81d71cd..139ea2b 100644 --- a/dataclaw/secrets.py +++ b/dataclaw/secrets.py @@ -310,6 +310,9 @@ def redact_session(session: dict, custom_strings: list[str] | None = None) -> tu if custom_strings: msg[field], count = redact_custom_strings(msg[field], custom_strings) total += count + if msg.get("content_parts"): + msg["content_parts"], count = _redact_value(msg["content_parts"], custom_strings) + total += count for tool_use in msg.get("tool_uses", []): for field in ("input", "output"): if tool_use.get(field): diff --git a/docs/gemini-vs-cchv-gaps.md b/docs/gemini-vs-cchv-gaps.md new file mode 100644 index 0000000..20c997e --- /dev/null +++ b/docs/gemini-vs-cchv-gaps.md @@ -0,0 +1,280 @@ +# Gemini CLI Gaps vs CCHV + +## Scope + +This note compares Gemini CLI handling in: + +- DataClaw: `~/dataclaw` +- Claude Code History Viewer (CCHV): `~/claude-code-history-viewer` + +The goal is to identify Gemini CLI data that CCHV captures more faithfully than DataClaw today, while also noting important cases where DataClaw retains data that CCHV skips. + +## Summary + +CCHV preserves more Gemini message/event structure than DataClaw in several places. + +The biggest current DataClaw gaps are: + +1. It drops Gemini `info` / `warning` / `error` messages. +2. It drops `resultDisplay` tool UI output, including file-diff previews and tool-status strings. +3. It drops non-text user content parts such as `inlineData` images/documents. +4. It drops part-level `functionResponse` blocks embedded in message content. +5. It only keeps session-level token totals, while CCHV keeps per-message Gemini token usage. +6. It does not preserve top-level Gemini session metadata such as `summary` and `kind`. + +Important counterpoint: + +- CCHV explicitly skips Gemini sessions whose top-level `kind` is `subagent`, while DataClaw currently exports them as normal sessions. + +## Detailed Findings + +### 1. 
CCHV keeps `info` / `warning` / `error` messages; DataClaw drops them + +CCHV converts Gemini message records with types: + +- `user` +- `gemini` +- `info` +- `warning` +- `error` + +References: + +- CCHV Gemini message dispatch: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:464-485` +- CCHV system-message conversion: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:630-653` + +DataClaw only exports: + +- `user` +- `gemini` + +References: + +- DataClaw Gemini parser message handling: `~/dataclaw/dataclaw/parsers/gemini.py:311-375` + +Practical consequence: + +- DataClaw drops Gemini informational and error messages that CCHV exposes as system messages. + +Observed in real Gemini data on this machine: + +- `835` `info` messages +- `29` `error` messages + +Example real file: + +- `~/.gemini/tmp/comfyui-featherops/chats/session-2026-03-24T08-56-51cb7147.json:10,16,1111` + +### 2. CCHV keeps `resultDisplay`; DataClaw ignores it + +CCHV converts Gemini `toolCalls[].resultDisplay` into extra content blocks. + +References: + +- CCHV includes `resultDisplay` during Gemini tool-call conversion: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:592-597` +- CCHV `extract_result_display(...)`: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:893-928` + +DataClaw's Gemini parser never reads `resultDisplay`. 
+ +References: + +- DataClaw Gemini tool-call parsing: `~/dataclaw/dataclaw/parsers/gemini.py:162-282` + +Practical consequence: + +- DataClaw drops user-visible tool UI output, including: + - short status strings such as `Found 4 matching file(s)` + - read-file previews such as `Read lines 50-150 ...` + - file-diff previews stored in `resultDisplay.fileDiff` + - potential subagent-progress markers (`isSubagentProgress`) if they appear + +Observed in real Gemini data on this machine: + +- `4386` string `resultDisplay` values +- `854` object `resultDisplay` values containing: + - `fileDiff` + - `fileName` + - `filePath` + - `originalContent` + - `newContent` + - `diffStat` + - `isNewFile` + +Example real file with file-diff previews: + +- `~/.gemini/tmp/comfyui-featherops/chats/session-2026-03-28T01-43-f9f3aa2a.json:305-306,350-351,395-396,440-441` + +### 3. CCHV keeps non-text Gemini content parts; DataClaw drops them in user messages + +CCHV converts Gemini content parts such as: + +- `inlineData` image/document blocks +- `fileData` URL-backed document blocks +- plain text parts +- `functionCall` +- `functionResponse` +- `executableCode` +- `codeExecutionResult` + +References: + +- CCHV content conversion helpers: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:660-867` + +DataClaw's Gemini parser, for `user` messages, only extracts parts containing `text` and drops the rest. + +References: + +- DataClaw user-message extraction: `~/dataclaw/dataclaw/parsers/gemini.py:315-323` + +Practical consequence: + +- DataClaw drops user attachments and other structured Gemini content parts that CCHV preserves. + +Observed in real Gemini data on this machine: + +- real `inlineData` image attachments exist in user messages, for example: + `~/.gemini/tmp/rocm-systems/chats/session-2026-03-06T03-33-68bc726c.json:4764,4794,5006,5036` + +These include large base64 image payloads that CCHV maps to image/document blocks. + +### 4. 
CCHV keeps part-level `functionResponse` blocks; DataClaw drops them when embedded in content + +CCHV converts `functionResponse` parts in Gemini content into `tool_result` blocks. + +References: + +- CCHV `functionResponse` conversion in `convert_gemini_content_to_claude(...)`: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:691-709` +- CCHV direct `functionResponse` part conversion: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:813-830` + +DataClaw does not preserve these when they appear inside a message `content` array, because its user-message parser keeps only text parts. + +References: + +- DataClaw user-message extraction: `~/dataclaw/dataclaw/parsers/gemini.py:315-323` + +Practical consequence: + +- DataClaw loses some Gemini tool-result structure that is encoded directly in content parts rather than only in `toolCalls[].result`. + +Observed in real Gemini data on this machine: + +- real `functionResponse` content parts exist, for example in: + `~/.gemini/tmp/rocm-systems/chats/session-2026-03-06T03-33-68bc726c.json:122,147,172,231` + +### 5. CCHV keeps per-message token usage; DataClaw only keeps session totals + +CCHV stores Gemini per-message token usage derived directly from each Gemini response record's `tokens` field. + +References: + +- CCHV Gemini usage extraction: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:601-608,616-627` + +DataClaw only aggregates token counts into session-level `stats`. + +References: + +- DataClaw Gemini token aggregation: `~/dataclaw/dataclaw/parsers/gemini.py:340-343` +- DataClaw normalized session shape: `~/dataclaw/dataclaw/parsers/common.py:56-71` + +Practical consequence: + +- DataClaw loses per-message Gemini usage, including cached-input attribution on individual assistant responses. + +### 6. 
CCHV keeps more Gemini session metadata than DataClaw exports + +CCHV extracts Gemini session metadata including: + +- `session_id` +- `kind` +- `start_time` +- `last_updated` +- `message_count` +- `has_tool_use` +- `summary` + +References: + +- CCHV Gemini lightweight metadata extraction: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:388-457` + +DataClaw exports a smaller normalized session shape and does not preserve top-level Gemini `summary` or `kind`. + +References: + +- DataClaw Gemini metadata initialization: `~/dataclaw/dataclaw/parsers/gemini.py:300-308` +- DataClaw normalized session shape: `~/dataclaw/dataclaw/parsers/common.py:56-71` + +Practical consequence: + +- DataClaw loses Gemini session metadata that CCHV surfaces in its session index/browser. + +Observed in real Gemini data on this machine: + +- `122` Gemini session files have top-level `summary` +- top-level `kind` values present include: + - `main` + - `subagent` + +## Important Counterpoint: CCHV skips Gemini `kind == "subagent"` sessions, but DataClaw exports them + +This is an important Gemini difference in the opposite direction. + +CCHV explicitly skips Gemini sessions whose top-level `kind` is `subagent` in both: + +- project/session listing +- search + +References: + +- CCHV session listing skip: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:167-169` +- CCHV project scan skip: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:85-88` +- CCHV search skip: `~/claude-code-history-viewer/src-tauri/src/providers/gemini.rs:283-286` + +DataClaw does not filter by `kind`, so it includes Gemini subagent sessions if they exist as chat files. 
+ +References: + +- DataClaw Gemini discovery and parse paths read all `session-*.json` files: `~/dataclaw/dataclaw/parsers/gemini.py:123-159` + +Observed in real Gemini data on this machine: + +- there is exactly one real Gemini chat file with `kind: "subagent"`: + `~/.gemini/tmp/tmp/chats/session-2026-03-05T03-59-51c63ffc.json:82` +- DataClaw successfully parses and exports that session. + +So, unlike the Claude comparison, Gemini is not a simple one-way story where CCHV always preserves more. + +## Observed In Real Gemini Data On This Machine + +The following real Gemini structures exist on this machine and matter for the comparison: + +- `info` / `error` message types: + `~/.gemini/tmp/comfyui-featherops/chats/session-2026-03-24T08-56-51cb7147.json:10,16,1111` + +- `resultDisplay.fileDiff` edit previews: + `~/.gemini/tmp/comfyui-featherops/chats/session-2026-03-28T01-43-f9f3aa2a.json:305-306,350-351,395-396,440-441` + +- user `inlineData` image attachments: + `~/.gemini/tmp/rocm-systems/chats/session-2026-03-06T03-33-68bc726c.json:4764,4794,5006,5036` + +- content-part `functionResponse` blocks: + `~/.gemini/tmp/rocm-systems/chats/session-2026-03-06T03-33-68bc726c.json:122,147,172,231` + +- a real Gemini subagent session: + `~/.gemini/tmp/tmp/chats/session-2026-03-05T03-59-51c63ffc.json:82` + +These are all real, present data shapes, not just theoretical parser code paths. + +## Bottom Line + +Compared with CCHV, DataClaw currently loses more Gemini fidelity around: + +- `info` / `warning` / `error` messages +- `resultDisplay` tool UI output +- non-text content parts such as `inlineData` +- part-level `functionResponse` blocks +- per-message token usage +- top-level session metadata like `summary` and `kind` + +But CCHV also has one notable Gemini omission that DataClaw does not: + +- CCHV skips Gemini `kind == "subagent"` sessions from its normal session views, while DataClaw exports them. 
def test_dedupes_identical_gemini_sessions_ignoring_project_label(self, tmp_path, mock_anonymizer):
    """Identical Gemini sessions listed under two project labels are exported once."""
    out_path = tmp_path / "out.jsonl"
    upper_session = {
        "session_id": "g1",
        "model": "gemini-2.5-pro",
        "git_branch": None,
        "start_time": "2026-01-01T00:00:00Z",
        "end_time": "2026-01-01T00:01:00Z",
        "messages": [{"role": "user", "content": "hi"}],
        "stats": {"input_tokens": 1, "output_tokens": 2},
        "project": "gemini:ComfyUI",
        "source": "gemini",
    }
    # Same session, differing only in the case of the project label.
    lower_session = dict(upper_session, project="gemini:comfyui")
    project_entries = [
        {"dir_name": "upper", "display_name": "gemini:ComfyUI", "source": "gemini"},
        {"dir_name": "lower", "display_name": "gemini:comfyui", "source": "gemini"},
    ]

    def fake_parse(*args, **kwargs):
        # Dispatches on the first positional argument (presumably the
        # project's dir_name; verify against export_to_jsonl).
        if args[0] == "upper":
            return [upper_session]
        return [lower_session]

    meta = export_to_jsonl(
        project_entries,
        out_path,
        mock_anonymizer,
        parse_project_sessions_fn=fake_parse,
        default_source="gemini",
    )

    exported_lines = out_path.read_text().strip().split("\n")
    assert len(exported_lines) == 1
    assert meta["sessions"] == 1


def test_keeps_distinct_gemini_snapshots(self, tmp_path, mock_anonymizer):
    """Two snapshots of one session id with different content are both exported."""
    out_path = tmp_path / "out.jsonl"
    older_snapshot = {
        "session_id": "g1",
        "model": "gemini-2.5-pro",
        "git_branch": None,
        "start_time": "2026-01-01T00:00:00Z",
        "end_time": "2026-01-01T00:01:00Z",
        "messages": [{"role": "user", "content": "short"}],
        "stats": {"input_tokens": 1, "output_tokens": 2},
        "project": "gemini:comfyui",
        "source": "gemini",
    }
    newer_snapshot = dict(
        older_snapshot,
        end_time="2026-01-01T00:02:00Z",
        messages=[{"role": "user", "content": "longer"}],
        stats={"input_tokens": 3, "output_tokens": 4},
    )
    project_entries = [{"dir_name": "proj", "display_name": "gemini:comfyui", "source": "gemini"}]

    def fake_parse(*args, **kwargs):
        return [older_snapshot, newer_snapshot]

    meta = export_to_jsonl(
        project_entries,
        out_path,
        mock_anonymizer,
        parse_project_sessions_fn=fake_parse,
        default_source="gemini",
    )

    exported_lines = out_path.read_text().strip().split("\n")
    assert len(exported_lines) == 2
    assert meta["sessions"] == 2
class TestParseGeminiUserContentParts:
    """Checks how Gemini user message content is split into text and ``content_parts``."""

    @staticmethod
    def _parse_single_user_message(tmp_path, mock_anonymizer, session_id, content):
        """Write a one-message Gemini session file and return its parsed result."""
        session_file = tmp_path / "session-gemini.json"
        payload = {
            "sessionId": session_id,
            "startTime": "2026-03-24T12:00:00Z",
            "lastUpdated": "2026-03-24T12:00:01Z",
            "messages": [
                {
                    "type": "user",
                    "timestamp": "2026-03-24T12:00:00Z",
                    "content": content,
                }
            ],
        }
        session_file.write_text(json.dumps(payload), encoding="utf-8")
        return parse_session_file(session_file, mock_anonymizer)

    def test_user_text_parts_preserve_whitespace_and_empty_parts(self, tmp_path, mock_anonymizer):
        parts = [
            {"text": "Alpha"},
            {"text": ""},
            {"text": " "},
            {"text": "Beta "},
        ]

        result = self._parse_single_user_message(tmp_path, mock_anonymizer, "gemini-session-0", parts)

        assert result is not None
        message = result["messages"][0]
        assert message["content"] == "Alpha\n\n \nBeta "
        assert "content_parts" not in message

    def test_user_string_content_preserves_outer_whitespace(self, tmp_path, mock_anonymizer):
        result = self._parse_single_user_message(
            tmp_path, mock_anonymizer, "gemini-session-whitespace", " padded request "
        )

        assert result is not None
        assert result["messages"][0]["content"] == " padded request "

    def test_all_whitespace_user_text_parts_are_not_dropped(self, tmp_path, mock_anonymizer):
        parts = [
            {"text": " "},
            {"text": ""},
        ]

        result = self._parse_single_user_message(tmp_path, mock_anonymizer, "gemini-session-blank", parts)

        assert result is not None
        assert result["messages"][0]["content"] == " \n"
        assert result["stats"]["user_messages"] == 1

    def test_user_inline_data_preserved_without_duplicate_text(self, tmp_path, mock_anonymizer):
        parts = [
            {"text": "Please inspect this screenshot."},
            {"inlineData": {"mimeType": "image/png", "data": "QUJDRA=="}},
        ]

        result = self._parse_single_user_message(tmp_path, mock_anonymizer, "gemini-session-1", parts)

        assert result is not None
        message = result["messages"][0]
        assert message["content"] == "Please inspect this screenshot."
        assert message["content_parts"] == [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": "QUJDRA==",
                },
            }
        ]

    def test_user_function_parts_preserved_and_linked(self, tmp_path, mock_anonymizer):
        parts = [
            {"text": "Use the read result below."},
            {
                "functionCall": {
                    "name": "read_file",
                    "args": {"file_path": "/Users/testuser/Documents/myproject/src/app.py"},
                }
            },
            {
                "functionResponse": {
                    "name": "read_file",
                    "response": {"output": "print('hello')"},
                }
            },
        ]

        result = self._parse_single_user_message(tmp_path, mock_anonymizer, "gemini-session-2", parts)

        assert result is not None
        message = result["messages"][0]
        assert message["content"] == "Use the read result below."
        assert len(message["content_parts"]) == 2
        tool_use, tool_result = message["content_parts"]
        assert tool_use["type"] == "tool_use"
        assert tool_use["name"] == "read_file"
        assert "testuser" not in tool_use["input"]["file_path"]
        # The response carries no id, so it must link to the generated call id.
        assert tool_result == {
            "type": "tool_result",
            "tool_use_id": tool_use["id"],
            "content": "print('hello')",
        }

    def test_large_blob_string_content_preserved_in_content_parts(self, tmp_path, mock_anonymizer):
        blob = "data:image/png;base64," + ("A" * 5000)

        result = self._parse_single_user_message(tmp_path, mock_anonymizer, "gemini-session-3", blob)

        assert result is not None
        message = result["messages"][0]
        assert "content" not in message
        assert message["content_parts"] == [{"type": "text", "text": blob}]
        assert result["stats"]["user_messages"] == 1
test_detects_large_base64_blob(self):