From b09e62fbb2856e5b4ffd4436080b7feea06ff333 Mon Sep 17 00:00:00 2001 From: saminur Date: Fri, 19 Dec 2025 11:39:30 -0500 Subject: [PATCH 1/3] adding reconstruction codes for code review --- googledoc_reconstruction/README.md | 201 ++++++++++ .../ReconstructionState.py | 83 ++++ googledoc_reconstruction/command_state.py | 360 ++++++++++++++++++ googledoc_reconstruction/load_event.py | 347 +++++++++++++++++ .../main_reconstruction.py | 231 +++++++++++ googledoc_reconstruction/requirements.txt | 7 + 6 files changed, 1229 insertions(+) create mode 100644 googledoc_reconstruction/README.md create mode 100644 googledoc_reconstruction/ReconstructionState.py create mode 100644 googledoc_reconstruction/command_state.py create mode 100644 googledoc_reconstruction/load_event.py create mode 100644 googledoc_reconstruction/main_reconstruction.py create mode 100644 googledoc_reconstruction/requirements.txt diff --git a/googledoc_reconstruction/README.md b/googledoc_reconstruction/README.md new file mode 100644 index 00000000..b2869f7a --- /dev/null +++ b/googledoc_reconstruction/README.md @@ -0,0 +1,201 @@ +# WritingObserver – Google Docs Reconstruction + +This repository reconstructs a Google Doc’s content (including tabs-as-sections, tables-as-text, and smart-chip dropdowns-as-text) from WritingObserver logs containing `google_docs_save` events. It supports **incremental reconstruction** across multiple sessions by persisting a per-document reconstruction state and updating the same reconstructed Google Doc on subsequent runs. + +**Known limitation:** Images cannot be reconstructed because the logs do not contain image bytes or a retrievable image URL. Only opaque internal identifiers (e.g., `s-blob-v1-IMAGE-...`) are logged. + +--- + +## What this project does + +1. Reads `google_docs_save` events from a WritingObserver log file. +2. Applies events in chronological order to reconstruct document state. +3. Converts special structures (e.g., dropdowns) into readable text. +4. 
Creates a new reconstructed Google Doc on first run. +5. Updates the same reconstructed Google Doc on later runs. +6. Persists reconstruction state and document ID mappings. + +--- + +## Repository structure (recommended) + +``` +. +├── main_reconstruction.py # Entry point +├── ReconstructionState.py # Cross-document reconstruction state +├── command_state.py # DocState / TabState and command application +├── load_event.py # Log parsing → GoogleDocsSaveEvent +├── requirements.txt +├── state/ +│ ├── reconstruction_state.pkl +│ ├── reconstructed_doc_ids.json +│ └── google_docs_save_pointers.json +└── README.md +``` + +--- + +## Requirements + +- Python 3.9 or newer +- A Google account with access to Google Docs API + +Install dependencies: + +```bash +pip install -r requirements.txt +``` + +--- + +## Google Docs API setup + +### 1. Create OAuth credentials + +1. Go to Google Cloud Console. +2. Create or select a project. +3. Enable **Google Docs API**. +4. Configure OAuth consent screen. +5. Create OAuth Client ID: + - Type: **Desktop application** +6. Download the JSON file and rename it to: + +``` +credentials.json +``` + +Place it in the same directory as `main_reconstruction.py`. + +> ❗ Do NOT commit `credentials.json` or `token.json` to GitHub. + +--- + +### 2. Token generation + +On first run, the script will open an authentication flow and generate: + +``` +token.json +``` + +This file stores your OAuth refresh token and will be reused on future runs. + +--- + +## Running the reconstruction (local machine) + +1. Update the log path in `main_reconstruction.py`: + +```python +log_path = "/path/to/study_log_new1.log" +``` + +2. 
Run: + +```bash +python main_reconstruction.py +``` + +The script will: +- Parse new events +- Update reconstruction state +- Create or update the reconstructed Google Doc + +--- + +## Incremental reconstruction across sessions + +The system supports multiple runs over time: + +- Each document stores a `last_timestamp` +- Only events newer than the last run are applied +- The same reconstructed Google Doc is updated + +To reset everything, delete the `state/` directory. + +--- + +## Running on Google Colab + +1. Mount Drive: + +```python +from google.colab import drive +drive.mount('/content/drive') +``` + +2. Navigate to the code directory: + +```python +%cd /content/drive/MyDrive/WritingObserverProject/ReconstructionCode/Code +``` + +3. Install dependencies: + +```python +!pip install -r requirements.txt +``` + +4. Run reconstruction: + +```python +!python main_reconstruction.py +``` + +### Authentication note for Colab + +For Colab, browser-based local server auth (`flow.run_local_server()`) may fail in non-interactive runs. Console-based OAuth (`flow.run_console()`) is only available in `google-auth-oauthlib` releases older than 1.0.0, which the `google-auth-oauthlib>=1.0.0` requirement in `requirements.txt` excludes; with the required version, generate `token.json` on a local machine first and place it next to `main_reconstruction.py` so the script reuses it. + +--- + +## Output format + +- Tabs are rendered as sections: + +``` +First Tab +========= + +``` + +- Dropdowns are rendered as: + +``` +DROPDOWN: Configuration Test – Option 1 +``` + +--- + +## Known limitations + +- **Images:** Not reconstructable (no image source in logs) +- **Tables:** Reconstructed as linearized text, not true Docs tables + +--- + +## Code review guide + +Main execution flow: + +1. `load_event.py` → parse logs +2. `ReconstructionState.py` → manage per-doc state +3. `command_state.py` → apply commands (insert/delete/dropdown) +4. `main_reconstruction.py` → orchestrate reconstruction + Docs API updates + +--- + +## Security & privacy + +Do NOT commit: +- `credentials.json` +- `token.json` +- log files +- `state/` directory + +Add them to `.gitignore`. + +--- + +## License + +Add license information here if distributing publicly. 
diff --git a/googledoc_reconstruction/ReconstructionState.py b/googledoc_reconstruction/ReconstructionState.py new file mode 100644 index 00000000..6886d33e --- /dev/null +++ b/googledoc_reconstruction/ReconstructionState.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Dec 7 20:54:42 2025 + +@author: Saminur Islam +""" + + +from typing import Dict,Tuple,Iterable + +import os +import pickle + +from load_event import GoogleDocsSaveEvent +from command_state import DocState + +STATE_PATH = "state/reconstruction_state.pkl" + +class ReconstructionState: + """ + Holds reconstructed state for all docs. + Key: (user_id, doc_id) + """ + + def __init__(self): + self.docs: Dict[Tuple[str, str], DocState] = {} + + def get_or_create_doc(self, user_id: str, doc_id: str) -> DocState: + key = (user_id, doc_id) + if key not in self.docs: + self.docs[key] = DocState(user_id, doc_id) + return self.docs[key] + + + +def load_reconstruction_state(path: str = STATE_PATH) -> "ReconstructionState": + if not os.path.exists(path): + return ReconstructionState() + with open(path, "rb") as f: + return pickle.load(f) + + +def save_reconstruction_state(state: "ReconstructionState", + path: str = STATE_PATH) -> None: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as f: + pickle.dump(state, f) + + +def reconstruct_from_events( + events: Iterable["GoogleDocsSaveEvent"], + state: "ReconstructionState" = None, +) -> "ReconstructionState": + """ + Given a sequence of GoogleDocsSaveEvent objects, update + all docs (for all users/doc_ids) in memory. + + If `state` is provided, we mutate it in-place; otherwise + we create a new ReconstructionState. 
+ """ + if state is None: + state = ReconstructionState() + + # Sort events chronologically to ensure the right order + sorted_events = sorted( + events, + key=lambda e: (e.server_time, e.timestamp) + ) + + for ev in sorted_events: + doc = state.get_or_create_doc(ev.user_id, ev.doc_id) + + # update per-doc meta + doc.last_server_time = ev.server_time + doc.last_timestamp = ev.timestamp + doc.last_url = ev.url + doc.chrome_identity = ev.chrome_identity + + for bundle in ev.bundles: + # pass event timestamp for tab ordering + doc.apply_bundle(bundle, ev.tab_id, event_timestamp=ev.timestamp) + + return state diff --git a/googledoc_reconstruction/command_state.py b/googledoc_reconstruction/command_state.py new file mode 100644 index 00000000..d208fe57 --- /dev/null +++ b/googledoc_reconstruction/command_state.py @@ -0,0 +1,360 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Dec 7 20:54:05 2025 + +@author: Saminur Islam +""" + +import collections +from dataclasses import dataclass, field +from typing import Dict, Tuple, List, Optional + + +PLACEHOLDER = "\x00" + +def _insert_1based(text: str, ibi: int, s: str) -> str: + """ + Insert string s at 1-based index ibi, using the same logic as the old + google_text.insert: fill gaps with PLACEHOLDER if ibi is beyond the end. + """ + # index of next char after last char (1-based) + nextchar_index = len(text) + 1 + + # If the insert index is greater than nextchar_index, fill the gap + if ibi > nextchar_index: + gap = ibi - nextchar_index + # recursively insert the placeholders first + text = _insert_1based(text, nextchar_index, PLACEHOLDER * gap) + + # now ibi is in [1, len(text)+1] + # slice using 1-based logic (prefix up to ibi-1, then s, then suffix) + return text[: ibi - 1] + s + text[ibi - 1 :] + + +def _delete_1based(text: str, si: int, ei: int) -> str: + """ + Delete positions si..ei (1-based, both ends inclusive) using the same semantics as the old google_text.delete. + It also fills gaps with PLACEHOLDER if si/ei are beyond the current end. 
+ """ + lastchar_index = len(text) # last valid char (1-based index) + + # If si is beyond the end, fill gap first + if si > lastchar_index: + gap = si - lastchar_index + text = _insert_1based(text, lastchar_index + 1, PLACEHOLDER * gap) + lastchar_index = len(text) + + # If ei is beyond the end, fill that gap too + if ei > lastchar_index: + gap = ei - lastchar_index + text = _insert_1based(text, lastchar_index + 1, PLACEHOLDER * gap) + + # Now si, ei are in range. google_text used: + # start = text[0:si-1] + # end = text[ei:] + return text[: si - 1] + text[ei:] + +@dataclass +class TabState: + text: str = "" + elements: Dict[str, dict] = field(default_factory=dict) + name: Optional[str] = None # human-readable tab name ("First Tab", "Second Tab", etc.) + first_timestamp: Optional[int] = None # when this tab first saw edits + # NEW: dropdown metadata + dropdown_defs: Dict[str, dict] = field(default_factory=dict) # def_id -> ae command + dropdown_elems: Dict[str, dict] = field(default_factory=dict) # elem_id -> ae command (et == "dropdown") + dropdown_instances: List[Tuple[int, str]] = field(default_factory=list) # list of (spi, elem_id) + + + +class DocState: + """ + In-memory representation of one Google Doc reconstructed from + google_docs_save bundles. Handles multiple tabs. + """ + + def __init__(self, user_id: str, doc_id: str): + self.user_id = user_id + self.doc_id = doc_id + # tab_id -> TabState + self.tabs: Dict[str, TabState] = collections.defaultdict(TabState) + + # some meta + self.last_server_time: Optional[float] = None + self.last_timestamp: Optional[int] = None + self.last_url: Optional[str] = None + self.chrome_identity: Dict[str, Optional[str]] = {} + + # --- main public entry --- + + @staticmethod + def _extract_name_from_d(data): + """ + The mkch/ac/ucp commands put tab names into deeply nested lists like: + [ '', [[], [1, 'First Tab']] ] + or + [ 't.95y...', [1, 'Tab 2'], [1] ] + + This walks the structure and returns the first string it finds. 
+ """ + def _walk(x): + if isinstance(x, str): + return x + if isinstance(x, list): + for item in x: + r = _walk(item) + if r: + return r + return None + + return _walk(data) + + def expand_dropdowns(self) -> None: + """ + After all events have been applied, replace the single-character + dropdown icon at each recorded position with a readable placeholder + like 'DROPDOWN: Configuration Test – Option 1'. + """ + for tab in self.tabs.values(): + if not tab.dropdown_instances: + continue + + # Process from right to left so earlier replacements do not affect + # positions of later ones. + for spi, elem_id in sorted(tab.dropdown_instances, key=lambda x: x[0], reverse=True): + dropdown_cmd = tab.dropdown_elems.get(elem_id) + if not dropdown_cmd: + continue + + epm = dropdown_cmd.get("epm", {}) + def_id = epm.get("dde_di") + selected_item_id = epm.get("dde-sii") + selected_fallback_value = epm.get("dde-fdv") # text of selected option + + # Look up the definition to get config name and items + def_cmd = tab.dropdown_defs.get(def_id, {}) + ddefe = def_cmd.get("epm", {}).get("ddefe-ddi", {}) + config_name = def_cmd.get("epm", {}).get("ddefe-t", "Dropdown") + items = ddefe.get("cv", {}).get("opValue", []) + + # Try to find the selected item label + selected_label = selected_fallback_value + for item in items: + if item.get("di-id") == selected_item_id: + selected_label = item.get("di-dv") or item.get("di-v") or selected_label + break + + human = f"DROPDOWN: {config_name} – {selected_label}" + + # Replace the single icon character at [spi] (1-based) with `human` + # tab.text[spi-1] should currently be the single-character dropdown icon + if 1 <= spi <= len(tab.text): + tab.text = tab.text[: spi - 1] + human + tab.text[spi:] + + def apply_bundle(self, bundle: dict, default_tab: str, event_timestamp: Optional[int] = None) -> None: + """ + Apply one google_docs_save bundle. + + event_timestamp is used to approximate tab creation / ordering. 
+ """ + commands = bundle.get("commands", []) + for cmd in commands: + self._apply_cmd(cmd, default_tab, event_timestamp) + + # --- command handlers (same idea as before) --- + + # --- command handlers (same idea as before) --- + + def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] = None) -> None: + ty = cmd.get("ty") + if not ty: + return + + # ensure the tab exists and record first edit timestamp + tab = self.tabs[current_tab] + if event_timestamp is not None and tab.first_timestamp is None: + tab.first_timestamp = event_timestamp + + # Multi-command wrapper + if ty == "mlti": + for sub in cmd.get("mts", []): + self._apply_cmd(sub, current_tab, event_timestamp) + return + + # nm: "new mutation" with routing info (often contains tab id) + if ty == "nm": + target_tab = current_tab + nmr = cmd.get("nmr") or [] + # heuristic: LAST string starting with 't.' is the tab id + for x in reversed(nmr): + if isinstance(x, str) and x.startswith("t."): + target_tab = x + break + inner_cmd = cmd.get("nmc", {}) + self._apply_cmd(inner_cmd, target_tab, event_timestamp) + return + + + # ----- TAB METADATA COMMANDS ----- + + # mkch: initial tab name(s) (e.g. 
[[1, "Tab 1"]]) + if ty == "mkch": + data = cmd.get("d") + name = self._extract_name_from_d(data) + if name: + tab = self.tabs[current_tab] + tab.name = name + return + + # ucp: update caption / rename tab + # d looks like: ['t.95y...', [[], [1, 'Second Tab']]] + # or ['', [[], [1, 'First Tab']]] + if ty == "ucp": + data = cmd.get("d") + if not isinstance(data, list) or len(data) < 2: + return + tab_id = data[0] or current_tab + name = self._extract_name_from_d(data[1]) + if name: + tstate = self.tabs[tab_id] + tstate.name = name + if event_timestamp is not None and tstate.first_timestamp is None: + tstate.first_timestamp = event_timestamp + return + + # ac: add child (new tab) + # d looks like: ['t.95y...', [1, 'Tab 2'], [1]] + if ty == "ac": + data = cmd.get("d") + if not isinstance(data, list) or len(data) < 2: + return + tab_id = data[0] + if not isinstance(tab_id, str): + return + name = self._extract_name_from_d(data[1]) + tstate = self.tabs[tab_id] + if name: + tstate.name = name + if event_timestamp is not None and tstate.first_timestamp is None: + tstate.first_timestamp = event_timestamp + return + + # ----- TEXT EDITING COMMANDS ----- + + # Insert string (1-based indices, placeholder-aware) + if ty == "is": + tab = self.tabs[current_tab] + s = cmd.get("s", "") + ibi = cmd.get("ibi") + + # In your old code, ibi is 1-based, and if missing, you don't get an insert. + # Fall back to "append" if ibi missing, using 1-based len+1. 
+ if ibi is None: + ibi = len(tab.text) + 1 + + try: + ibi_int = int(ibi) + except (TypeError, ValueError): + ibi_int = len(tab.text) + 1 + + tab.text = _insert_1based(tab.text, ibi_int, s) + return + + # Delete substring si..ei, both ends inclusive (1-based indices, placeholder-aware) + if ty == "ds": + tab = self.tabs[current_tab] + si = cmd.get("si") + ei = cmd.get("ei") + + # If indexes are missing, nothing to do + if si is None or ei is None: + return + + try: + si_int = int(si) + ei_int = int(ei) + except (TypeError, ValueError): + return + + # DS removes positions si..ei inclusive (see _delete_1based); this matches the old implementation + tab.text = _delete_1based(tab.text, si_int, ei_int) + return + + # Alter substring si..ei (inclusive) -> s + # IMPORTANT: many 'as' commands are *style only* (no 's'). + # If there is no 's', we treat it as style and ignore. + if ty == "as": + if "s" not in cmd: + # style-only 'as' — do not touch text + return + + tab = self.tabs[current_tab] + s = cmd.get("s", "") + si = cmd.get("si") + ei = cmd.get("ei") + + if si is None or ei is None: + return + + try: + si_int = int(si) + ei_int = int(ei) + except (TypeError, ValueError): + return + + # emulate "delete then insert" at the same 1-based location + tab.text = _delete_1based(tab.text, si_int, ei_int) + tab.text = _insert_1based(tab.text, si_int, s) + return + + # ----- ELEMENT COMMANDS (images, inline objects, dropdowns, etc.) ----- + + if ty == "ae": + tab = self.tabs[current_tab] + el_id = cmd.get("id") + if not el_id: + return + + et = cmd.get("et") + + # Dropdown definition (holds the options) + if et == "dropdown-definition": + tab.dropdown_defs[el_id] = cmd + return + + # Actual dropdown element (one instance tied to a definition) + if et == "dropdown": + tab.dropdown_elems[el_id] = cmd + return + + # Fallback: other elements (images, etc.) + tab.elements[el_id] = cmd + return + + # te: tie element into the text at position 'spi' (1-based) + # We now special-case dropdowns to reconstruct a readable token. 
+ if ty == "te": + tab = self.tabs[current_tab] + el_id = cmd.get("id") + spi = cmd.get("spi") + + if not el_id or not isinstance(spi, int): + return + + # Is this tying a dropdown element? + if el_id in tab.dropdown_elems: + # DON'T touch tab.text here. + # Just remember that at logical position `spi` there is this dropdown. + tab.dropdown_instances.append((spi, el_id)) + return + + # For non-dropdown elements (e.g. images) you can keep old behavior: + placeholder = f"[{el_id}]" + tab.text = _insert_1based(tab.text, spi, placeholder) + return + + + + # Other types (headings, document style, etc.) are formatting only. + # We ignore them to keep indices consistent but text intact. + return diff --git a/googledoc_reconstruction/load_event.py b/googledoc_reconstruction/load_event.py new file mode 100644 index 00000000..2704a9c9 --- /dev/null +++ b/googledoc_reconstruction/load_event.py @@ -0,0 +1,347 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Dec 7 20:53:14 2025 + +@author: Saminur Islam +""" + +import json +import os +import re +from dataclasses import dataclass, asdict +from typing import Dict, Tuple, Iterator, Optional, Any + + +# ---------- Helpers ---------- + +def parse_tab_from_url(url: str) -> str: + """ + Extracts the tab id from a docs URL, e.g. + ...&tab=t.4n9p3wa3df6o + Returns 't.0' if nothing is found. + """ + if not url or "tab=" not in url: + return "t.0" + m = re.search(r"tab=([^&]+)", url) + return m.group(1) if m else "t.0" + + +# ---------- Data structures ---------- + +@dataclass +class GoogleDocsSaveEvent: + """ + Flattened view of a google_docs_save event. + This is what you will feed into the reconstruction later. 
+ """ + user_id: str + doc_id: str + url: str + tab_id: str + timestamp: int # client.timestamp in ms + server_time: float # server.time (epoch seconds) + chrome_identity: Dict[str, Optional[str]] + bundles: list # client.bundles (raw) + + +@dataclass +class Pointer: + """ + Per (user, doc, tab) pointer so we only re-read new events. + """ + last_timestamp: int = 0 # max client.timestamp seen + last_server_time: float = 0.0 # max server.time seen + + +# key type for pointer dict +DocKey = Tuple[str, str, str] # (user_id, doc_id, tab_id) + + +# ---------- Pointer persistence ---------- + +def load_pointers(pointer_path: str) -> Dict[DocKey, Pointer]: + """ + Load pointer state from JSON file. + If file doesn't exist, return empty dict. + JSON structure: + { + "user_id|doc_id|tab_id": { + "last_timestamp": ..., + "last_server_time": ... + }, + ... + } + """ + if not os.path.exists(pointer_path): + return {} + + with open(pointer_path, "r", encoding="utf-8") as f: + raw = json.load(f) + + pointers: Dict[DocKey, Pointer] = {} + for key_str, data in raw.items(): + user_id, doc_id, tab_id = key_str.split("|", 2) + pointers[(user_id, doc_id, tab_id)] = Pointer( + last_timestamp=data.get("last_timestamp", 0), + last_server_time=data.get("last_server_time", 0.0), + ) + return pointers + + +def save_pointers(pointer_path: str, pointers: Dict[DocKey, Pointer]) -> None: + """ + Save pointer state to JSON file. + """ + os.makedirs(os.path.dirname(pointer_path), exist_ok=True) + out: Dict[str, Any] = {} + for (user_id, doc_id, tab_id), ptr in pointers.items(): + key_str = "|".join([user_id, doc_id, tab_id]) + out[key_str] = asdict(ptr) + + with open(pointer_path, "w", encoding="utf-8") as f: + json.dump(out, f, indent=2, sort_keys=True) + + +# ---------- Incremental log reader ---------- + +class IncrementalGoogleDocsSaveReader: + """ + Reads a log file (main_log.jsonl or *.study.log) incrementally and yields + only NEW google_docs_save events per (user_id, doc_id, tab_id). 
+ + It uses a pointer file (JSON) to remember the last client.timestamp + processed for each (user, doc, tab). + """ + + def __init__(self, pointer_path: str): + self.pointer_path = pointer_path + self.pointers: Dict[DocKey, Pointer] = load_pointers(pointer_path) + + # --- public API --- + + def iter_new_events( + self, + log_path: str, + only_user: Optional[str] = None, + only_doc: Optional[str] = None, + ) -> Iterator[GoogleDocsSaveEvent]: + """ + Iterate over NEW google_docs_save events in a given log file. + + - If only_user is given, only events for that user_id are yielded. + - If only_doc is given, only events for that doc_id are yielded. + + After iterating, call `save()` to persist updated pointers. + """ + with open(log_path, "r", encoding="utf-8") as f: + for line_no, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + + # Some log lines are: "\t". + # We only want the JSON part. + json_part = line.split("\t", 1)[0] + + try: + ev = json.loads(json_part) + except json.JSONDecodeError: + # You can log or print if you want to debug malformed lines + # print(f"Skipping malformed json on line {line_no}") + continue + + client = ev.get("client", {}) + + # We only care about google_docs_save + event_type = client.get("event") or ev.get("event") + if event_type != "google_docs_save": + continue + + # ---- extract identifiers ---- + auth = client.get("auth", {}) + user_id = auth.get("user_id") or auth.get("safe_user_id") + if not user_id: + # Can't index by user; skip + continue + + doc_id = ( + client.get("doc_id") + or client.get("object", {}).get("id") + or ev.get("doc_id") + ) + if not doc_id: + # Can't index by doc; skip + continue + + if only_user and user_id != only_user: + continue + if only_doc and doc_id != only_doc: + continue + + url = ( + client.get("url") + or client.get("object", {}).get("url") + or ev.get("url", "") + ) + tab_id = parse_tab_from_url(url) + + chrome_identity = ( + client.get("chrome_identity") + or 
ev.get("metadata", {}).get("chrome_identity") + or {} + ) + # Some logs put bundles only under client + bundles = client.get("bundles", []) + + # client.timestamp (ms) is your main ordering for save events + ts = client.get("timestamp") or ev.get("timestamp") or 0 + try: + ts_int = int(ts) + except (TypeError, ValueError): + ts_int = 0 + + server_time = ev.get("server", {}).get("time", 0.0) + try: + server_time_f = float(server_time) + except (TypeError, ValueError): + server_time_f = 0.0 + + key: DocKey = (user_id, doc_id, tab_id) + ptr = self.pointers.get(key) + + # If we have processed this timestamp already, skip + if ptr is not None and ts_int <= ptr.last_timestamp: + continue + + # Construct the flattened event object + gds_event = GoogleDocsSaveEvent( + user_id=user_id, + doc_id=doc_id, + url=url, + tab_id=tab_id, + timestamp=ts_int, + server_time=server_time_f, + chrome_identity={ + "email": chrome_identity.get("email"), + "id": chrome_identity.get("id"), + }, + bundles=bundles, + ) + + # Update pointer in-memory + new_last_ts = ts_int + new_last_server_time = max( + server_time_f, + ptr.last_server_time if ptr is not None else 0.0, + ) + self.pointers[key] = Pointer( + last_timestamp=new_last_ts, + last_server_time=new_last_server_time, + ) + + yield gds_event + + def save(self) -> None: + """ + Persist pointer state to disk. + Call this after you've finished processing events. + """ + save_pointers(self.pointer_path, self.pointers) + + + +def read_google_docs_save_log( + log_path: str, + only_user: Optional[str] = None, + only_doc: Optional[str] = None, +) -> Iterator[GoogleDocsSaveEvent]: + """ + Simple helper to read *all* google_docs_save events from a log file + (no pointer / incremental logic). + + You can optionally filter by user_id and/or doc_id. 
+ """ + with open(log_path, "r", encoding="utf-8") as f: + for line_no, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + + # Lines are often "\\t" + json_part = line.split("\t", 1)[0] + + try: + ev = json.loads(json_part) + except json.JSONDecodeError: + # Skip malformed lines + continue + + client = ev.get("client", {}) + + # Only care about google_docs_save + event_type = client.get("event") or ev.get("event") + if event_type != "google_docs_save": + continue + + # ---- extract identifiers ---- + auth = client.get("auth", {}) + user_id = auth.get("user_id") or auth.get("safe_user_id") + if not user_id: + continue + + if only_user and user_id != only_user: + continue + + doc_id = ( + client.get("doc_id") + or client.get("doc", {}).get("id") + or ev.get("client", {}) + .get("doc_id") # sometimes nested again + ) + if not doc_id: + continue + + if only_doc and doc_id != only_doc: + continue + + url = ( + client.get("url") + or client.get("object", {}).get("url") + or ev.get("url", "") + ) + tab_id = parse_tab_from_url(url) + + chrome_identity = ( + client.get("chrome_identity") + or ev.get("metadata", {}).get("chrome_identity") + or {} + ) + bundles = client.get("bundles", []) + + ts = client.get("timestamp") or ev.get("timestamp") or 0 + try: + ts_int = int(ts) + except (TypeError, ValueError): + ts_int = 0 + + server_time = ev.get("server", {}).get("time", 0.0) + try: + server_time_f = float(server_time) + except (TypeError, ValueError): + server_time_f = 0.0 + + yield GoogleDocsSaveEvent( + user_id=user_id, + doc_id=doc_id, + url=url, + tab_id=tab_id, + timestamp=ts_int, + server_time=server_time_f, + chrome_identity={ + "email": chrome_identity.get("email"), + "id": chrome_identity.get("id"), + }, + bundles=bundles, + ) + + diff --git a/googledoc_reconstruction/main_reconstruction.py b/googledoc_reconstruction/main_reconstruction.py new file mode 100644 index 00000000..56f9ca84 --- /dev/null +++ 
b/googledoc_reconstruction/main_reconstruction.py @@ -0,0 +1,231 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Dec 7 20:58:10 2025 + +@author: Saminur Islam +""" + +from __future__ import print_function +import os.path +from google.oauth2.credentials import Credentials +from google_auth_oauthlib.flow import InstalledAppFlow +from google.auth.transport.requests import Request +from googleapiclient.discovery import build + +from command_state import DocState +from command_state import PLACEHOLDER +from typing import List +from load_event import IncrementalGoogleDocsSaveReader, read_google_docs_save_log + +from ReconstructionState import ( + reconstruct_from_events, + load_reconstruction_state, + save_reconstruction_state, +) +# Dict, Tuple, Iterator, Optional, Any + +SCOPES = ["https://www.googleapis.com/auth/documents"] + +STATE_PATH = "state/reconstruction_state.pkl" +RECON_MAP_PATH = "state/reconstructed_doc_ids.json" + +'''Add small helpers to store a mapping from original doc → reconstructed doc ''' +import json + +def load_recon_map(path: str = RECON_MAP_PATH) -> dict: + if not os.path.exists(path): + return {} + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def save_recon_map(mapping: dict, path: str = RECON_MAP_PATH) -> None: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + json.dump(mapping, f, indent=2, sort_keys=True) + + +def get_docs_service(): + """ + Returns an authenticated Docs API client. + Requires credentials.json (OAuth client) in the working directory. 
+ """ + creds = None + if os.path.exists("token.json"): + creds = Credentials.from_authorized_user_file("token.json", SCOPES) + + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + else: + flow = InstalledAppFlow.from_client_secrets_file( + "credentials.json", SCOPES + ) + creds = flow.run_local_server(port=0) + with open("token.json", "w") as token: + token.write(creds.to_json()) + + return build("docs", "v1", credentials=creds) + + +def build_full_text(doc_state: DocState) -> str: + """ + Build the full plain-text representation for a doc_state, + with one section per tab. + """ + parts: List[str] = [] + + def _tab_sort_key(item): + tab_id, tab_state = item + return tab_state.first_timestamp or 0 + + for tab_id, tab in sorted(doc_state.tabs.items(), key=_tab_sort_key): + display_name = tab.name or tab_id + + parts.append(f"{display_name}\n") + parts.append(f"{'=' * len(display_name)}\n\n") + + cleaned_text = tab.text.replace(PLACEHOLDER, "") + parts.append(cleaned_text) + parts.append("\n\n") + + return "".join(parts) if parts else "" + + +def update_reconstructed_doc(service, doc_id: str, doc_state: DocState) -> None: + """ + Overwrite an existing reconstructed Google Doc with the latest + text from doc_state. + """ + full_text = build_full_text(doc_state) + + # Get current endIndex so we can delete (almost) everything + doc = service.documents().get(documentId=doc_id).execute() + body = doc.get("body", {}) + content = body.get("content", []) + if not content: + end_index = 1 + else: + # endIndex of the last structural element (includes trailing newline) + end_index = content[-1].get("endIndex", 1) + + requests = [] + + # Delete everything except the final newline character. + # Docs API doesn't allow us to delete the last newline in a segment. 
+ if end_index > 2: + requests.append({ + "deleteContentRange": { + "range": { + "startIndex": 1, + "endIndex": end_index - 1 # <-- key change + } + } + }) + + # Insert our new content at index 1 + if full_text: + requests.append({ + "insertText": { + "location": {"index": 1}, + "text": full_text, + } + }) + + if requests: + service.documents().batchUpdate( + documentId=doc_id, + body={"requests": requests}, + ).execute() + + + +def create_reconstructed_doc(service, doc_state: DocState, + title_prefix: str = "Reconstructed") -> str: + title = f"{title_prefix} - {doc_state.doc_id[:20]}" + + created = service.documents().create(body={"title": title}).execute() + new_doc_id = created.get("documentId") + + full_text = build_full_text(doc_state) + + if full_text: + requests = [ + { + "insertText": { + "location": {"index": 1}, + "text": full_text, + } + } + ] + service.documents().batchUpdate( + documentId=new_doc_id, + body={"requests": requests}, + ).execute() + + return new_doc_id + + +# ---------- Example usage ---------- + +log_path = "logs/study_log_new.log" # first log! 
 To see the cursor working properly, check with study_log_new1.log after using the first one +pointer_path = "state/google_docs_save_pointers.json" + +reader = IncrementalGoogleDocsSaveReader(pointer_path) + +# 1) Load previous reconstruction state (if any) +recon_state = load_reconstruction_state(STATE_PATH) + +# # 2) Get only NEW events since last run +# events = list(reader.iter_new_events(log_path)) +all_events = list(read_google_docs_save_log(log_path)) + +print(f"Total events in log: {len(all_events)}") +new_events = [] +for ev in all_events: + key = (ev.user_id, ev.doc_id) + doc_state = recon_state.docs.get(key) + + # If we have already seen this doc and timestamp is not newer, skip + if doc_state is not None and doc_state.last_timestamp is not None: + if ev.timestamp <= doc_state.last_timestamp: + continue + + new_events.append(ev) + +print(f"New events to apply: {len(new_events)}") +# Nothing new? then nothing to do. +if not new_events: + print("No new google_docs_save events found.") +else: + # 3) Apply new events on top of existing state + recon_state = reconstruct_from_events(new_events, recon_state) + + # 4) Persist updated pointers and state + # reader.save() + save_reconstruction_state(recon_state, STATE_PATH) + + # 5) Talk to Docs + service = get_docs_service() + recon_map = load_recon_map() + + for (user_id, doc_id), doc_state in recon_state.docs.items(): + print(f"Updating reconstruction for user={user_id}, original doc={doc_id}") + + # Make dropdowns readable + doc_state.expand_dropdowns() + + # Look up existing reconstructed doc id (if any) + recon_doc_id = recon_map.get(doc_id) + + if recon_doc_id is None: + # First time -> create a new reconstructed doc + recon_doc_id = create_reconstructed_doc(service, doc_state) + recon_map[doc_id] = recon_doc_id + print(f" -> created new reconstructed doc: {recon_doc_id}") + else: + # Subsequent runs -> update the same reconstructed doc + update_reconstructed_doc(service, recon_doc_id, doc_state) + print(f" 
-> updated existing reconstructed doc: {recon_doc_id}") + + save_recon_map(recon_map) diff --git a/googledoc_reconstruction/requirements.txt b/googledoc_reconstruction/requirements.txt new file mode 100644 index 00000000..c9aacd4a --- /dev/null +++ b/googledoc_reconstruction/requirements.txt @@ -0,0 +1,7 @@ +google-api-python-client>=2.110.0 +google-auth>=2.20.0 +google-auth-oauthlib>=1.0.0 +google-auth-httplib2>=0.2.0 +requests>=2.31.0 +python-dateutil>=2.8.2 +tqdm>=4.66.0 From b3c784a9513f43273ba3c3dbc078e9278f994d11 Mon Sep 17 00:00:00 2001 From: saminur Date: Sun, 11 Jan 2026 12:28:38 -0500 Subject: [PATCH 2/3] working offline reducer for the google doc reconstruction --- .../DOCUMENTATION_UPDATES.md | 126 ++++++ googledoc_reconstruction/README.md | 40 ++ .../ReconstructionState.py | 61 ++- googledoc_reconstruction/command_state.py | 360 +++++++++++++++--- googledoc_reconstruction/load_event.py | 63 ++- .../main_reconstruction.py | 37 +- .../writing_observer/writing_analysis.py | 137 ++++--- 7 files changed, 706 insertions(+), 118 deletions(-) create mode 100644 googledoc_reconstruction/DOCUMENTATION_UPDATES.md diff --git a/googledoc_reconstruction/DOCUMENTATION_UPDATES.md b/googledoc_reconstruction/DOCUMENTATION_UPDATES.md new file mode 100644 index 00000000..97e3e1ed --- /dev/null +++ b/googledoc_reconstruction/DOCUMENTATION_UPDATES.md @@ -0,0 +1,126 @@ +# Documentation Updates - Multi-Tab System + +## Overview +Updated documentation in response to reviewer feedback requesting more elaboration on how tabs work and what commands are associated with tabs. The information about how tabs operate is now thoroughly documented. + +## Files Enhanced + +### 1. 
**command_state.py** - Core Tab Management +- **TabState class**: Added comprehensive docstring explaining tab structure, metadata, and embedded elements +- **DocState class**: Added detailed explanation of: + - How tabs function in Google Docs (separate sections/pages) + - Complete command-to-tab mapping (which commands affect which aspects) + - Tab lifecycle and reconstruction workflow + +- **apply_bundle() method**: Added explanation of bundle structure and how default_tab routing works + +- **_apply_cmd() method**: Major enhancement with: + - Complete command routing explanation + - Full list of supported command types + - Documentation of how each command type affects tabs + +#### Tab Metadata Commands (Now Documented) +- **MKCH** ("Make channel"): Initialize tab metadata with names + - Data structure: Deeply nested lists containing tab names + - When used: Document creation or tab addition + - Result: Sets human-readable tab names + +- **UCP** ("Update caption"): Rename existing tabs + - Data structure: Tab ID + new name in nested format + - When used: User renames a tab + - Handles both specific tab ID and current tab context + +- **AC** ("Add child"): Create new tabs + - Data structure: New tab ID + name data + - When used: User adds/duplicates a tab + - Initializes TabState and records creation timestamp + +#### Text Editing Commands (Now Documented) +- **IS** ("Insert string"): Add text at position +- **DS** ("Delete substring"): Remove text in range +- **AS** ("Alter substring"): Replace text (delete + insert) + - All with 1-based indexing and placeholder awareness + - Detailed semantics and fallback behavior + +#### Element Commands (Now Documented) +- **AE** ("Add element"): Register dropdowns, images, etc. 
+ - Dropdown-definition: Holds available options + - Dropdown: Instance with selected value + - Other elements: Images (limited by log constraints) + +- **TE** ("Tie element"): Embed element into text + - Special handling for dropdowns (deferred reconstruction) + - Placeholders for other elements (images, etc.) + +#### Control Flow Commands +- **MLTI** ("Multi-command"): Wrapper containing sub-commands +- **NM** ("New mutation"): Routes commands to specific tabs via 'nmr' field + +### 2. **ReconstructionState.py** - Document-Level Tab Management +- **ReconstructionState class**: Added detailed docstring explaining: + - Multi-document state management + - Key responsibilities (tracking documents, routing events, persistence) + - Complete workflow (loading, processing, saving) + +- **reconstruct_from_events() function**: Enhanced with: + - Multi-tab reconstruction logic explanation + - Event sorting and ordering importance + - Tab routing mechanism + - Command processing pipeline + - Metadata tracking per document + +### 3. **load_event.py** - Event and Tab Extraction +- **parse_tab_from_url() function**: Added comprehensive docstring: + - Multi-tab URL structure explanation + - Tab ID format and examples + - Default behavior when tab not found + +- **GoogleDocsSaveEvent class**: Significantly expanded documentation: + - Multi-tab event structure + - Command bundle format + - Timing information usage + - Detailed attribute descriptions + +### 4. **main_reconstruction.py** - Tab Rendering +- **build_full_text() function**: Enhanced with: + - Multi-tab rendering strategy explanation + - Tab ordering logic (by first_timestamp) + - Element reconstruction approach + - Output format with examples + - Section separation and formatting + +## Key Concepts Now Documented + +### Tab Identification +- Tab IDs start with 't.' 
(e.g., 't.0', 't.95y...', 't.4n9p3wa3df6o') +- 't.0' is the default/initial tab +- Extracted from Google Docs URLs via 'tab=' parameter + +### Command Routing +- **URL routing**: Tab ID in URL's 'tab=' parameter +- **nm routing**: Tab ID in new mutation's 'nmr' field (Last 't.' string wins) +- **Explicit routing**: ucp and ac commands specify target tab +- **Default routing**: Text editing commands use current_tab parameter + +### Reconstruction Workflow +1. Events sorted chronologically (server_time, timestamp) +2. Commands routed to appropriate TabState +3. Text modified, metadata tracked, elements registered +4. After all events: expand_dropdowns() converts metadata to readable text +5. Multi-tab sections rendered with headers and separators + +### Element Handling +- **Dropdowns**: Reconstructed as readable "DROPDOWN: Config - Value" +- **Images**: Shown as placeholders (binary data not in logs) +- **Deferred processing**: Metadata gathered first, then reconstructed + +## Benefits for System Integration + +Developers can now understand: +1. **How tabs work**: Independent sections within a document with separate content +2. **Command mapping**: Which commands affect which tabs and tab properties +3. **Event flow**: How events route to tabs for processing +4. **Rendering**: How multi-tab content is presented in output +5. **State persistence**: How reconstruction state captures tab information + +This documentation provides the "crucial information for determining the best way these will operate within the system" as requested by the reviewer. diff --git a/googledoc_reconstruction/README.md b/googledoc_reconstruction/README.md index b2869f7a..25c24f11 100644 --- a/googledoc_reconstruction/README.md +++ b/googledoc_reconstruction/README.md @@ -17,6 +17,46 @@ This repository reconstructs a Google Doc’s content (including tabs-as-section --- +## Tabs and command routing + +Google Docs tabs are treated as separate sections within a single document. 
Each tab has: +- a unique tab id (e.g., `t.0`, `t.95y...`) +- independent text content +- its own embedded elements (dropdowns, images) + +How commands are routed to tabs: +- URL routing: the event URL may contain `tab=...` and sets the default tab. +- `nm` routing: a "new mutation" command may override the target tab using the `nmr` field. +- Explicit routing: `ucp` and `ac` can name a specific tab id inside their `d` payload. +- Default routing: text edits (`is`, `ds`, `as`) apply to the current/default tab. + +Tab ordering in the output is based on the first time each tab receives edits, so +newer tabs appear later when rendered as sections. + +--- + +## Command types (high level) + +Tab metadata: +- `mkch` initialize tab names from nested list data +- `ucp` rename existing tab +- `ac` create a new tab + +Text editing: +- `is` insert string at 1-based position +- `ds` delete substring range `[si, ei)` +- `as` replace substring range `[si, ei)` with a new string (style-only `as` are ignored) + +Embedded elements: +- `ae` register element metadata (dropdowns, images, etc.) +- `te` tie an element into text at a 1-based position + +Control flow: +- `mlti` batch of sub-commands +- `nm` mutation wrapper with routing + +--- + ## Repository structure (recommended) ``` diff --git a/googledoc_reconstruction/ReconstructionState.py b/googledoc_reconstruction/ReconstructionState.py index 6886d33e..aee80075 100644 --- a/googledoc_reconstruction/ReconstructionState.py +++ b/googledoc_reconstruction/ReconstructionState.py @@ -18,8 +18,30 @@ class ReconstructionState: """ - Holds reconstructed state for all docs. - Key: (user_id, doc_id) + Holds reconstructed state for all documents across all users. + + TAB MANAGEMENT AT THE DOCUMENT LEVEL: + This class manages the reconstruction state for multiple documents, where each document + can have multiple tabs. The docs dictionary uses (user_id, doc_id) as the key and stores + a DocState object for each document. 
+ + Key responsibilities: + 1. Track which documents are being reconstructed (via their user_id, doc_id pairs) + 2. Route events to the correct DocState for processing + 3. Persist state to disk for incremental reconstruction across sessions + + WORKFLOW: + 1. For each GoogleDocsSaveEvent, extract (user_id, doc_id) + 2. Get or create a DocState for that document + 3. Pass the event's bundles to DocState.apply_bundle() with the event's tab_id + 4. After all events are processed, call expand_dropdowns() on each document + 5. Save the entire ReconstructionState for the next session + + USAGE: + - First run: load_reconstruction_state() creates empty ReconstructionState + - Process events: reconstruct_from_events(events, state) populates it + - Save: save_reconstruction_state(state) persists to disk + - Next run: load_reconstruction_state() retrieves previous state, events update it incrementally """ def __init__(self): @@ -52,11 +74,38 @@ def reconstruct_from_events( state: "ReconstructionState" = None, ) -> "ReconstructionState": """ - Given a sequence of GoogleDocsSaveEvent objects, update - all docs (for all users/doc_ids) in memory. + Given a sequence of GoogleDocsSaveEvent objects, update all docs (for all users/doc_ids) in memory. - If `state` is provided, we mutate it in-place; otherwise - we create a new ReconstructionState. + If `state` is provided, we mutate it in-place; otherwise we create a new ReconstructionState. + + MULTI-TAB RECONSTRUCTION LOGIC: + This function handles the core reconstruction workflow: + + 1. SORT EVENTS: Events are sorted by (server_time, timestamp) to ensure correct order. + This is critical for multi-tab documents where edits may arrive out of order. + + 2. ROUTE TO TAB: Each event has: + - tab_id: Extracted from the event's URL (e.g., 't.0', 't.4n9p3wa3df6o') + - bundles: List of command bundles to apply + Each bundle is applied to the specific tab via apply_bundle(bundle, tab_id, timestamp) + + 3. 
COMMAND PROCESSING: Within each bundle, commands like: + - Tab metadata (mkch, ucp, ac) update tab names and create/rename tabs + - Text editing (is, ds, as) modify tab content + - Element commands (ae, te) embed dropdowns, images, etc. + + 4. TRACKING METADATA: Per-document metadata is updated: + - last_server_time: Latest server timestamp seen + - last_timestamp: Latest client timestamp seen + - last_url: Most recent URL (includes tab_id) + - chrome_identity: Extension identity info + + Args: + events: Iterable of GoogleDocsSaveEvent objects from WritingObserver log + state: Existing ReconstructionState to update. If None, creates new one. + + Returns: + The updated ReconstructionState with all events applied """ if state is None: state = ReconstructionState() diff --git a/googledoc_reconstruction/command_state.py b/googledoc_reconstruction/command_state.py index d208fe57..fa931a7e 100644 --- a/googledoc_reconstruction/command_state.py +++ b/googledoc_reconstruction/command_state.py @@ -56,21 +56,86 @@ def _delete_1based(text: str, si: int, ei: int) -> str: @dataclass class TabState: - text: str = "" - elements: Dict[str, dict] = field(default_factory=dict) - name: Optional[str] = None # human-readable tab name ("First Tab", "Second Tab", etc.) - first_timestamp: Optional[int] = None # when this tab first saw edits - # NEW: dropdown metadata - dropdown_defs: Dict[str, dict] = field(default_factory=dict) # def_id -> ae command - dropdown_elems: Dict[str, dict] = field(default_factory=dict) # elem_id -> ae command (et == "dropdown") - dropdown_instances: List[Tuple[int, str]] = field(default_factory=list) # list of (spi, elem_id) + """ + Represents the state of a single tab within a Google Doc. + + Tabs in Google Docs function as separate sections/pages that users can switch between. + Each tab has independent content (text), metadata (name), and embedded elements (dropdowns, images). 
+ + Key concepts: + - Tabs are identified by unique IDs (e.g., 't.95y...') + - Tabs can be renamed via 'ucp' (update caption) or created via 'ac' (add child) commands + - Text editing (insert, delete, alter) operations target the current tab + - Elements like dropdowns are tied to specific positions within a tab's text + """ + text: str = "" # The reconstructed text content of this tab + elements: Dict[str, dict] = field(default_factory=dict) # Embedded elements: elem_id -> metadata (images, etc.) + name: Optional[str] = None # Human-readable tab name (e.g., "First Tab", "Second Tab") + first_timestamp: Optional[int] = None # Client timestamp (ms) when this tab first received edits + + # Dropdown metadata: Smart-chip dropdowns are reconstructed from multiple command types + dropdown_defs: Dict[str, dict] = field(default_factory=dict) # Dropdown definitions: def_id -> 'ae' command (et == "dropdown-definition") + dropdown_elems: Dict[str, dict] = field(default_factory=dict) # Dropdown instances: elem_id -> 'ae' command (et == "dropdown") + dropdown_instances: List[Tuple[int, str]] = field(default_factory=list) # Positions where dropdowns appear: (text_position, elem_id) + + def to_dict(self) -> dict: + return { + "text": self.text, + "elements": self.elements, + "name": self.name, + "first_timestamp": self.first_timestamp, + "dropdown_defs": self.dropdown_defs, + "dropdown_elems": self.dropdown_elems, + "dropdown_instances": self.dropdown_instances, + } + + @staticmethod + def from_dict(data: dict) -> "TabState": + tab = TabState() + if not data: + return tab + tab.text = data.get("text", "") + tab.elements = data.get("elements", {}) or {} + tab.name = data.get("name") + tab.first_timestamp = data.get("first_timestamp") + tab.dropdown_defs = data.get("dropdown_defs", {}) or {} + tab.dropdown_elems = data.get("dropdown_elems", {}) or {} + tab.dropdown_instances = data.get("dropdown_instances", []) or [] + return tab class DocState: """ - In-memory representation of 
one Google Doc reconstructed from - google_docs_save bundles. Handles multiple tabs. + In-memory representation of one Google Doc reconstructed from google_docs_save bundles. + + TABS IN GOOGLE DOCS: + Google Docs supports multiple "tabs" (introduced in 2024), which function as independent + sections/pages within a single document. Each tab has: + - Unique ID (e.g., 't.0', 't.95y...', 't.4n9p3wa3df6o') + - Metadata: name, creation time + - Content: text with embedded elements (dropdowns, images) + + DocState tracks all tabs for a document and reconstructs their content from command bundles. + The 'tab_id' in events (extracted from URL 'tab=' parameter or included in commands) routes + text edits to the appropriate tab. + + COMMANDS AND THEIR RELATIONSHIP TO TABS: + - 'mkch': Initial tab metadata setup (sets tab names) + - 'ucp': Update caption/rename tab (modifies existing tab name) + - 'ac': Add child tab (creates new tab) + - 'nm': New mutation with routing (routes a command to a specific tab via 'nmr' field) + - 'is', 'ds', 'as': Text editing commands (insert, delete, substitute) - affect current tab + - 'ae': Add element (dropdowns, images) - stores metadata tied to current tab + - 'te': Tie element into text (embeds element at specific position in current tab) + - 'mlti': Multi-command wrapper (contains multiple sub-commands) + + The reconstruction process: + 1. Events arrive with a tab_id (from URL or command routing) + 2. Commands are applied to the appropriate TabState + 3. Text editing commands update tab.text + 4. Element commands store metadata in tab.elements/dropdown_defs/dropdown_elems + 5. 
After all events, expand_dropdowns() converts stored dropdown metadata into readable text """ def __init__(self, user_id: str, doc_id: str): @@ -85,6 +150,29 @@ def __init__(self, user_id: str, doc_id: str): self.last_url: Optional[str] = None self.chrome_identity: Dict[str, Optional[str]] = {} + def to_dict(self) -> dict: + return { + "user_id": self.user_id, + "doc_id": self.doc_id, + "tabs": {tab_id: tab.to_dict() for tab_id, tab in self.tabs.items()}, + "last_server_time": self.last_server_time, + "last_timestamp": self.last_timestamp, + "last_url": self.last_url, + "chrome_identity": self.chrome_identity, + } + + @staticmethod + def from_dict(data: dict) -> "DocState": + doc = DocState(data.get("user_id", ""), data.get("doc_id", "")) + doc.tabs = collections.defaultdict(TabState) + for tab_id, tab_data in (data.get("tabs") or {}).items(): + doc.tabs[tab_id] = TabState.from_dict(tab_data) + doc.last_server_time = data.get("last_server_time") + doc.last_timestamp = data.get("last_timestamp") + doc.last_url = data.get("last_url") + doc.chrome_identity = data.get("chrome_identity", {}) or {} + return doc + # --- main public entry --- @staticmethod @@ -144,7 +232,7 @@ def expand_dropdowns(self) -> None: selected_label = item.get("di-dv") or item.get("di-v") or selected_label break - human = f"DROPDOWN: {config_name} – {selected_label}" + human = f"DROPDOWN: {config_name} - {selected_label}" # Replace the single icon character at [spi] (1-based) with `human` # tab.text[spi-1] should currently be '' @@ -153,9 +241,22 @@ def expand_dropdowns(self) -> None: def apply_bundle(self, bundle: dict, default_tab: str, event_timestamp: Optional[int] = None) -> None: """ - Apply one google_docs_save bundle. - - event_timestamp is used to approximate tab creation / ordering. + Apply one google_docs_save bundle to this document. + + A bundle contains multiple commands that were executed together. This method + extracts commands from the bundle and applies each one in order. 
+ + Args: + bundle: dict containing 'commands' list from google_docs_save event + default_tab: The tab_id to use if a command does not specify a different tab + (extracted from event URL or passed through) + event_timestamp: Client timestamp (ms) from the event; used to track when + tabs first received edits (important for tab ordering) + + The default_tab is significant because: + - Most text editing commands (is, ds, as) apply to the current/default tab + - Some commands (nm: new mutation) may override this with their own tab routing (nmr field) + - Tab metadata commands (mkch, ucp, ac) can reference specific tab_ids """ commands = bundle.get("commands", []) for cmd in commands: @@ -163,29 +264,66 @@ def apply_bundle(self, bundle: dict, default_tab: str, event_timestamp: Optional # --- command handlers (same idea as before) --- - # --- command handlers (same idea as before) --- - def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] = None) -> None: + """ + Apply a single command to the appropriate tab. + + COMMAND ROUTING TO TABS: + Commands specify which tab they operate on through several mechanisms: + 1. 'nm' (new mutation): Contains routing info (nmr) that may specify a tab_id + 2. 'ucp' (update caption), 'ac' (add child): First data field may contain tab_id + 3. 
Default: Text editing & element commands use current_tab parameter + + SUPPORTED COMMAND TYPES: + TAB METADATA: + - mkch: Set initial tab name(s) + - ucp: Update caption (rename tab) + - ac: Add child (create new tab) + + TEXT EDITING: + - is: Insert string at 1-based index + - ds: Delete substring [si, ei) + - as: Alter substring (delete then insert) + + ELEMENTS (EMBEDDED OBJECTS): + - ae: Add element (dropdowns, images) - stores metadata + - te: Tie element into text - embeds element at position + + CONTROL FLOW: + - mlti: Multi-command wrapper - contains multiple sub-commands + + Args: + cmd: The command dict with 'ty' (type) and type-specific fields + current_tab: Default tab_id if command doesn't specify otherwise + event_timestamp: For tracking when tabs first received edits + """ ty = cmd.get("ty") if not ty: return - # ensure the tab exists and record first edit timestamp + # Ensure the tab exists and record first edit timestamp + # This helps track when tabs were created/first edited tab = self.tabs[current_tab] if event_timestamp is not None and tab.first_timestamp is None: tab.first_timestamp = event_timestamp - # Multi-command wrapper + # MLTI: Multi-command wrapper + # Contains multiple sub-commands in 'mts' field; recursively apply each one if ty == "mlti": for sub in cmd.get("mts", []): self._apply_cmd(sub, current_tab, event_timestamp) return - # nm: "new mutation" with routing info (often contains tab id) + # NM: "New mutation" with routing info + # This command can specify which tab to route to via the 'nmr' (new mutation routing) field. + # The nmr is a list that may contain a tab_id (string starting with 't.'). + # We use a heuristic: the LAST string starting with 't.' in nmr is the target tab. + # The actual command to execute is in 'nmc' (new mutation command). 
if ty == "nm": - target_tab = current_tab + target_tab = current_tab # Default to current tab if routing not found nmr = cmd.get("nmr") or [] - # heuristic: LAST string starting with 't.' is the tab id + # Heuristic: LAST string starting with 't.' is the tab id + # (handles cases where multiple tab refs might be present) for x in reversed(nmr): if isinstance(x, str) and x.startswith("t."): target_tab = x @@ -195,9 +333,15 @@ def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] return - # ----- TAB METADATA COMMANDS ----- + # ========== TAB METADATA COMMANDS ========== + # These commands manage tab creation, naming, and deletion. + # They enable multi-tab document structure where each tab functions as a separate section. - # mkch: initial tab name(s) (e.g. [[1, "Tab 1"]]) + # MKCH: "Make channel" - Initialize tab metadata (typically tab names) + # Command structure: {'ty': 'mkch', 'd': [nested list containing tab names]} + # Data format: d contains deeply nested lists like [[1, "Tab 1"]] + # Purpose: Set up initial tab name(s) when document is created or tabs are added + # Result: Updates tab.name to human-readable value (e.g., "First Tab") if ty == "mkch": data = cmd.get("d") name = self._extract_name_from_d(data) @@ -206,49 +350,68 @@ def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] tab.name = name return - # ucp: update caption / rename tab - # d looks like: ['t.95y...', [[], [1, 'Second Tab']]] - # or ['', [[], [1, 'First Tab']]] + # UCP: "Update caption" - Rename existing tab + # Command structure: {'ty': 'ucp', 'd': [tab_id_or_empty, nested_data_with_name, ...]} + # Data format: d[0] is tab_id ('t.95y...') or empty string (uses current_tab) + # d[1] contains nested list with new name + # Example: ['t.95y...', [[], [1, 'Second Tab']]] + # Example: ['', [[], [1, 'First Tab']]] + # Purpose: Rename a tab (user changes "Tab 1" to "Overview", etc.) 
+ # Result: Updates specified tab's name; records timestamp if first edit if ty == "ucp": data = cmd.get("d") if not isinstance(data, list) or len(data) < 2: return - tab_id = data[0] or current_tab + tab_id = data[0] or current_tab # If empty string, use current tab name = self._extract_name_from_d(data[1]) if name: tstate = self.tabs[tab_id] tstate.name = name + # Track when this tab first received operations if event_timestamp is not None and tstate.first_timestamp is None: tstate.first_timestamp = event_timestamp return - # ac: add child (new tab) - # d looks like: ['t.95y...', [1, 'Tab 2'], [1]] + # AC: "Add child" - Create new tab + # Command structure: {'ty': 'ac', 'd': [new_tab_id, name_data, ...]} + # Data format: d[0] is unique new tab_id (e.g., 't.95y...') + # d[1] contains nested list with tab name + # Example: ['t.95y...', [1, 'Tab 2'], [1]] + # Purpose: Create a new tab in the document (user clicks "+ Tab" or duplicate) + # Result: Initializes new TabState with name; records creation timestamp if ty == "ac": data = cmd.get("d") if not isinstance(data, list) or len(data) < 2: return - tab_id = data[0] + tab_id = data[0] # The new tab's unique identifier if not isinstance(tab_id, str): return name = self._extract_name_from_d(data[1]) - tstate = self.tabs[tab_id] + tstate = self.tabs[tab_id] # Creates new TabState via defaultdict if name: tstate.name = name + # Record when this tab was created if event_timestamp is not None and tstate.first_timestamp is None: tstate.first_timestamp = event_timestamp return - # ----- TEXT EDITING COMMANDS ----- - - # Insert string (1-based indices, placeholder-aware) + # ========== TEXT EDITING COMMANDS ========== + # These commands modify the text content of a tab. + # All use 1-based indexing and are placeholder-aware (handle gaps filled with \x00). 
+ + # IS: "Insert string" - Add text at specified position + # Command structure: {'ty': 'is', 's': string_to_insert, 'ibi': 1_based_insert_position} + # Semantics: Insert 's' before position 'ibi' (1-based) + # Example: ibi=1 means insert at start, ibi=len(text)+1 means append + # Placeholder handling: Uses _insert_1based which fills gaps with PLACEHOLDER (\x00) + # Purpose: User types, copy-pastes, or undo restores text + # Result: tab.text modified with new string inserted at position if ty == "is": tab = self.tabs[current_tab] s = cmd.get("s", "") ibi = cmd.get("ibi") - # In your old code, ibi is 1-based, and if missing, you don't get an insert. - # Fall back to "append" if ibi missing, using 1-based len+1. + # ibi is 1-based. If missing, fall back to append (len+1) if ibi is None: ibi = len(tab.text) + 1 @@ -260,13 +423,19 @@ def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] tab.text = _insert_1based(tab.text, ibi_int, s) return - # Delete substring [si, ei) (1-based indices, placeholder-aware) + # DS: "Delete substring" - Remove text in range [si, ei) + # Command structure: {'ty': 'ds', 'si': start_1based, 'ei': end_1based} + # Semantics: Delete text from position si (inclusive) to ei (exclusive), 1-based + # Example: si=1, ei=5 deletes characters at positions 1,2,3,4 (not 5) + # Placeholder handling: Preserves PLACEHOLDERs; may fill new gaps + # Purpose: User deletes, backspace, undo, or cut operations + # Result: tab.text modified with substring removed if ty == "ds": tab = self.tabs[current_tab] si = cmd.get("si") ei = cmd.get("ei") - # If indexes are missing, nothing to do + # If indexes are missing, nothing to delete if si is None or ei is None: return @@ -276,16 +445,22 @@ def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] except (TypeError, ValueError): return - # DS uses [si, ei) semantics; this matches your old implementation + # DS uses [si, ei) semantics; si is included, ei is excluded 
tab.text = _delete_1based(tab.text, si_int, ei_int) return - # Alter substring [si, ei) -> s - # IMPORTANT: many 'as' commands are *style only* (no 's'). - # If there is no 's', we treat it as style and ignore. + # AS: "Alter substring" - Replace text in range [si, ei) with new string + # Command structure: {'ty': 'as', 's': replacement_string, 'si': start_1based, 'ei': end_1based} + # Semantics: Delete [si, ei), then insert 's' at si + # Example: Replace "old" with "new" at positions 1-4 + # IMPORTANT: Many 'as' commands are STYLE-ONLY (no 's' field); these are ignored + # Style-only 'as': Only changes formatting/style, doesn't change text content + # Placeholder handling: Combines delete and insert logic + # Purpose: User replaces text, formatting changes, or undo operations + # Result: tab.text modified with substring replaced if ty == "as": if "s" not in cmd: - # style-only 'as' — do not touch text + # Style-only 'as' command — formatting change only, no text modification return tab = self.tabs[current_tab] @@ -302,13 +477,26 @@ def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] except (TypeError, ValueError): return - # emulate "delete then insert" at the same 1-based location + # Emulate "delete then insert" as a single operation at the same 1-based location tab.text = _delete_1based(tab.text, si_int, ei_int) tab.text = _insert_1based(tab.text, si_int, s) return - # ----- ELEMENT COMMANDS (images, inline objects, dropdowns, etc.) ----- - + # ========== ELEMENT COMMANDS (embedded objects in text) ========== + # These commands manage embedded elements like dropdowns, images, and other inline objects. + # Elements are stored separately and tied into the text via 'te' commands. + + # AE: "Add element" - Register an element (dropdown, image, etc.) 
with metadata + # Command structure: {'ty': 'ae', 'id': element_id, 'et': element_type, 'epm': element_metadata} + # Element types (et field): + # - 'dropdown-definition': Defines a dropdown menu (holds available options) + # - 'dropdown': A dropdown instance tied to a definition (holds selected value) + # - Other types: Images, inline objects, etc. + # Metadata (epm): + # - For dropdown-definition: Contains option list (items), config name + # - For dropdown: Contains selected item ID and fallback value + # Purpose: Create dropdown definitions when first added, create dropdown instances + # Result: Stores metadata in appropriate tab dictionary for later reconstruction if ty == "ae": tab = self.tabs[current_tab] el_id = cmd.get("id") @@ -317,22 +505,31 @@ def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] et = cmd.get("et") - # Dropdown definition (holds the options) + # DROPDOWN DEFINITION: Holds the available options and configuration + # Stored separately because multiple dropdown instances may share one definition + # Example: A "Priority" dropdown used in multiple places shares one definition if et == "dropdown-definition": tab.dropdown_defs[el_id] = cmd return - # Actual dropdown element (one instance tied to a definition) + # DROPDOWN ELEMENT: One specific dropdown instance with a selected value + # References a definition and stores which option is currently selected + # Example: This dropdown's value is "High" if et == "dropdown": tab.dropdown_elems[el_id] = cmd return - # Fallback: other elements (images, etc.) + # FALLBACK: Other elements (images, shapes, etc.) + # These are stored but cannot be reconstructed from logs (e.g., images lack binary data) tab.elements[el_id] = cmd return - # te: tie element into the text at position 'spi' (1-based) - # We now special-case dropdowns to reconstruct a readable token. 
+ # TE: "Tie element" - Embed element into text at specific position + # Command structure: {'ty': 'te', 'id': element_id, 'spi': 1_based_position} + # Semantics: Place element 'id' at position 'spi' (1-based) in tab.text + # Purpose: Insert a placeholder for an element (dropdown, image) into document text + # Result: For dropdowns, records position for later reconstruction as readable text + # For other elements, inserts placeholder (e.g., "[image-123]") if ty == "te": tab = self.tabs[current_tab] el_id = cmd.get("id") @@ -341,14 +538,19 @@ def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] if not el_id or not isinstance(spi, int): return - # Is this tying a dropdown element? + # DROPDOWN SPECIAL HANDLING: + # For dropdowns, we don't insert text here. Instead, we record the position + # so expand_dropdowns() can later reconstruct readable text like: + # "DROPDOWN: Priority - High" at the correct position. + # This deferred approach allows us to gather all metadata before reconstruction. if el_id in tab.dropdown_elems: - # DON'T touch tab.text here. - # Just remember that at logical position `spi` there is this dropdown. + # Record this dropdown's position for post-processing tab.dropdown_instances.append((spi, el_id)) return - # For non-dropdown elements (e.g. images) you can keep old behavior: + # NON-DROPDOWN ELEMENTS (e.g., images): + # For now, insert a simple placeholder. Images cannot be fully reconstructed + # since binary data is not included in logs. placeholder = f"[{el_id}]" tab.text = _insert_1based(tab.text, spi, placeholder) return @@ -358,3 +560,51 @@ def _apply_cmd(self, cmd: dict, current_tab: str, event_timestamp: Optional[int] # Other types (headings, document style, etc.) are formatting only. # We ignore them to keep indices consistent but text intact. 
def _render_tab_text(tab: TabState) -> str:
    """Render one tab's text with dropdown anchors expanded into readable tokens.

    Each ``(spi, elem_id)`` pair in ``tab.dropdown_instances`` names a 1-based
    position in ``tab.text``. The single character at that position is swapped
    for ``DROPDOWN: <config name> - <selected label>``. Pairs are handled from
    the highest position down, so a replacement only changes text at or after
    positions that have already been processed. Pairs whose element is not in
    ``tab.dropdown_elems``, or whose position lies outside the current text,
    are skipped. Every PLACEHOLDER occurrence is removed before returning.
    """
    rendered = tab.text
    # A falsy instance list means there is nothing to expand.
    pending = sorted(tab.dropdown_instances or [], key=lambda pair: pair[0], reverse=True)

    for spi, elem_id in pending:
        element = tab.dropdown_elems.get(elem_id)
        if not element:
            continue

        element_meta = element.get("epm", {})
        definition_id = element_meta.get("dde_di")
        chosen_id = element_meta.get("dde-sii")
        # Used as the label when no option of the definition matches chosen_id.
        label = element_meta.get("dde-fdv")

        definition_meta = tab.dropdown_defs.get(definition_id, {}).get("epm", {})
        title = definition_meta.get("ddefe-t", "Dropdown")
        options = definition_meta.get("ddefe-ddi", {}).get("cv", {}).get("opValue", [])

        # Only the first option with a matching id is consulted.
        chosen = next((opt for opt in options if opt.get("di-id") == chosen_id), None)
        if chosen is not None:
            label = chosen.get("di-dv") or chosen.get("di-v") or label

        token = f"DROPDOWN: {title} - {label}"
        if not (1 <= spi <= len(rendered)):
            continue
        # Drop the character at 1-based position `spi` and put the token there.
        rendered = rendered[: spi - 1] + token + rendered[spi:]

    return rendered.replace(PLACEHOLDER, "")


def render_full_text(doc_state: DocState) -> str:
    """Concatenate every tab of ``doc_state`` into one plain-text document.

    Tabs are emitted in ascending ``first_timestamp`` order; a falsy timestamp
    sorts as 0. Each tab contributes, in order: its display name (``tab.name``,
    or the tab id when the name is falsy) followed by a newline, a line of
    ``=`` as long as the display name followed by a blank line, the rendered
    tab text, and a trailing blank line. Returns ``""`` when there are no tabs.
    """
    ordered = sorted(
        doc_state.tabs.items(),
        key=lambda entry: entry[1].first_timestamp or 0,
    )

    chunks = []
    for tab_id, tab in ordered:
        title = tab.name or tab_id
        chunks.append(f"{title}\n")
        chunks.append(f"{'=' * len(title)}\n\n")
        chunks.append(_render_tab_text(tab))
        chunks.append("\n\n")

    # Joining an empty list already yields the empty string.
    return "".join(chunks)
+ + MULTI-TAB URL STRUCTURE: + Google Docs URLs include a 'tab' query parameter that identifies which tab the user is on: + https://docs.google.com/document/d/{doc_id}/edit#gid=0&tab=t.4n9p3wa3df6o + + This function extracts that tab identifier (e.g., 't.4n9p3wa3df6o') so events can be + routed to the correct tab during reconstruction. + + TAB ID FORMAT: + - Tab IDs start with 't.' followed by alphanumeric characters + - Special tab: 't.0' is the default/initial tab + - Examples: 't.0', 't.95y...', 't.4n9p3wa3df6o' + + Args: + url: A Google Docs URL string, may contain 'tab=' parameter + + Returns: + The extracted tab_id (e.g., 't.4n9p3wa3df6o') + If no tab parameter found, returns 't.0' as default tab + + Example: + >>> parse_tab_from_url("https://docs.google.com/.../edit?tab=t.95y...") + 't.95y...' + + >>> parse_tab_from_url("https://docs.google.com/.../edit") + 't.0' """ if not url or "tab=" not in url: return "t.0" @@ -31,8 +55,37 @@ def parse_tab_from_url(url: str) -> str: @dataclass class GoogleDocsSaveEvent: """ - Flattened view of a google_docs_save event. - This is what you will feed into the reconstruction later. + Flattened view of a google_docs_save event from WritingObserver log. + This is the primary data structure fed into the reconstruction pipeline. + + MULTI-TAB EVENT STRUCTURE: + Each event captures a user's save action in a Google Doc, which may involve one or more tabs. 
+ The event includes: + - Document identification (user_id, doc_id) + - Tab information (tab_id extracted from the URL) + - Command bundles (commands that modified the document/tab) + - Timing information (client and server timestamps) + + COMMAND BUNDLES: + The 'bundles' list contains command bundles, where each bundle is a dict with: + - 'commands': list of individual commands to apply to the current tab + - Commands include tab metadata (mkch, ucp, ac), text editing (is, ds, as), + and element operations (ae, te) + + TIMING: + - timestamp: Client-side time (milliseconds) when the event was created + - server_time: Server-side time (epoch seconds) when event was received + These are used to chronologically order events and track when tabs were created/edited. + + Attributes: + user_id: Unique identifier for the user making edits + doc_id: Unique identifier for the Google Doc being edited + url: The full URL of the doc (contains tab_id in query parameter) + tab_id: Extracted tab identifier (e.g., 't.0', 't.95y...'); routes commands to correct tab + timestamp: Client timestamp in milliseconds (when user made changes) + server_time: Server timestamp in epoch seconds (when WritingObserver received event) + chrome_identity: Dict with extension identity info + bundles: List of command bundles to apply to the document """ user_id: str doc_id: str diff --git a/googledoc_reconstruction/main_reconstruction.py b/googledoc_reconstruction/main_reconstruction.py index 56f9ca84..a96a34d6 100644 --- a/googledoc_reconstruction/main_reconstruction.py +++ b/googledoc_reconstruction/main_reconstruction.py @@ -70,8 +70,41 @@ def get_docs_service(): def build_full_text(doc_state: DocState) -> str: """ - Build the full plain-text representation for a doc_state, - with one section per tab. + Build the full plain-text representation for a doc_state, with one section per tab. 
+ + MULTI-TAB RENDERING STRATEGY: + Since the reconstructed Google Doc is a plain text document (not multi-tabbed), + each tab from the original document is rendered as a separate section with: + - Section header: The tab's name (e.g., "First Tab", "Overview") + - Section separator: Line of equals signs (e.g., "==========") + - Content: The reconstructed text from that tab + - Spacing: Blank lines between sections for readability + + TAB ORDERING: + Tabs are sorted by first_timestamp (when they first received edits), ensuring + that the rendering reflects the logical creation order of tabs. + + ELEMENT RECONSTRUCTION: + - Dropdowns: Rendered as readable text like "DROPDOWN: Priority – High" + - Images: Shown as placeholders like "[s-blob-v1-IMAGE-...]" (binary data not in logs) + - Placeholders: Text gaps filled with PLACEHOLDER (\x00) are removed + + OUTPUT FORMAT: + Tab A + ===== + + Content of Tab A... + + Tab B + ===== + + Content of Tab B... + + Args: + doc_state: DocState object with all tabs and their content + + Returns: + Full plain text with all tabs as sections """ parts: List[str] = [] diff --git a/modules/writing_observer/writing_observer/writing_analysis.py b/modules/writing_observer/writing_observer/writing_analysis.py index 67735bd0..13693957 100644 --- a/modules/writing_observer/writing_observer/writing_analysis.py +++ b/modules/writing_observer/writing_observer/writing_analysis.py @@ -7,12 +7,29 @@ ''' # Necessary for the wrapper code below. 
import datetime +import os import pmss import re +import sys import time import writing_observer.reconstruct_doc +GOOGLEDOC_RECON_PATH = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "..", "..", "googledoc_reconstruction") +) +if GOOGLEDOC_RECON_PATH not in sys.path: + sys.path.insert(0, GOOGLEDOC_RECON_PATH) + +try: + import command_state as gdoc_command_state + import load_event as gdoc_load_event + _HAS_GOOGLEDOC_RECON = True +except Exception: + gdoc_command_state = None + gdoc_load_event = None + _HAS_GOOGLEDOC_RECON = False + import learning_observer.adapters import learning_observer.communication_protocol.integration from learning_observer.stream_analytics.helpers import student_event_reducer, kvs_pipeline, KeyField, EventField, Scope @@ -195,22 +212,81 @@ async def reconstruct(event, internal_state): if event['client']['event'] not in ["google_docs_save", "document_history"]: return False, False - internal_state = writing_observer.reconstruct_doc.google_text.from_json( - json_rep=internal_state) - if event['client']['event'] == "google_docs_save": - bundles = event['client']['bundles'] - for bundle in bundles: + if not _HAS_GOOGLEDOC_RECON: + internal_state = writing_observer.reconstruct_doc.google_text.from_json( + json_rep=internal_state) + if event['client']['event'] == "google_docs_save": + bundles = event['client']['bundles'] + for bundle in bundles: + internal_state = writing_observer.reconstruct_doc.command_list( + internal_state, bundle['commands'] + ) + elif event['client']['event'] == "document_history": + change_list = [ + i[0] for i in event['client']['history']['changelog'] + ] internal_state = writing_observer.reconstruct_doc.command_list( - internal_state, bundle['commands'] + writing_observer.reconstruct_doc.google_text(), change_list ) + state = internal_state.json + if learning_observer.settings.module_setting('writing_observer', 'verbose'): + print(state) + return state, state + + if internal_state is None: + 
internal_state = {} + + doc_state_data = internal_state.get("doc_state") if isinstance(internal_state, dict) else None + if doc_state_data: + doc_state = gdoc_command_state.DocState.from_dict(doc_state_data) + else: + user_id = ( + event.get("client", {}).get("auth", {}).get("safe_user_id") + or event.get("client", {}).get("auth", {}).get("user_id") + or "" + ) + doc_id = get_doc_id(event) or "" + doc_state = gdoc_command_state.DocState(user_id, doc_id) + url = event.get("client", {}).get("url") or event.get("client", {}).get("object", {}).get("url") or "" + default_tab = gdoc_load_event.parse_tab_from_url(url) if gdoc_load_event else "t.0" + if internal_state.get("text"): + doc_state.tabs[default_tab].text = internal_state.get("text", "") + + url = event.get("client", {}).get("url") or event.get("client", {}).get("object", {}).get("url") or "" + default_tab = gdoc_load_event.parse_tab_from_url(url) if gdoc_load_event else "t.0" + + ts = event.get("client", {}).get("timestamp") + if ts is None: + ts = event.get("client", {}).get("metadata", {}).get("ts") + try: + ts_int = int(ts) if ts is not None else None + except (TypeError, ValueError): + ts_int = None + + if event['client']['event'] == "google_docs_save": + bundles = event.get("client", {}).get("bundles", []) + for bundle in bundles: + doc_state.apply_bundle(bundle, default_tab, event_timestamp=ts_int) elif event['client']['event'] == "document_history": change_list = [ i[0] for i in event['client']['history']['changelog'] ] - internal_state = writing_observer.reconstruct_doc.command_list( - writing_observer.reconstruct_doc.google_text(), change_list - ) - state = internal_state.json + for cmd in change_list: + doc_state.apply_bundle({"commands": [cmd]}, default_tab, event_timestamp=ts_int) + + doc_state.last_timestamp = ts_int + doc_state.last_url = url + server_time = event.get("server", {}).get("time") + if server_time is not None: + doc_state.last_server_time = server_time + + state = { + "text": 
gdoc_command_state.render_full_text(doc_state), + "position": internal_state.get("position", 0) if isinstance(internal_state, dict) else 0, + "edit_metadata": internal_state.get("edit_metadata", {"cursor": [], "length": []}) + if isinstance(internal_state, dict) else {"cursor": [], "length": []}, + "doc_state": doc_state.to_dict(), + } if learning_observer.settings.module_setting('writing_observer', 'verbose'): print(state) return state, state @@ -350,6 +426,7 @@ async def last_document(event, internal_state): Small bit of data -- the last document accessed. This can be extracted from `document_list`, but we don't need that level of complexity for the 1.0 dashboard. + This code accesses the code below which provides some hackish support functions for the analysis. Over time these may age off with a better model. @@ -363,46 +440,6 @@ async def last_document(event, internal_state): return False, False -# Basic class tests and extraction. -# ------------------------------- -# A big part of this project is wrapping up google doc events. -# In doing that we are reverse-engineering some of the elements -# particularly the event types. This code provides some basic -# wrappers for event types to simplify extraction of key elements -# and to simplify event recognition. -# -# Over time this will likely expand and will need to adapt to keep -# up with any changes in the event structure. For now it is just -# a thin abstraction layer on a few of the pieces. - -def is_visibility_eventp(event): - """ - Given an event return true if it is a visibility - event which indicates changing the doc shown or - active. - - Here we look for an event with 'client' - containing the field 'event_type' of - 'visibility' - """ - Event_Type = event.get('client', {}).get('event', None) - return (Event_Type == 'visibility') - - -def is_keystroke_eventp(event): - """ - Given an event return true if it is a keystroke - event which indicates changing the doc shown or - active. 
- - Here we look for an event with 'client' - containing the field 'event_type' of - 'keystroke' - """ - Event_Type = event.get('client', {}).get('event', None) - return (Event_Type == 'keystroke') - - # Simple hack to match URLs. This should probably be moved as well # but for now it works. # From 6aa5329118822e4b5062b0903d261c30cba06c92 Mon Sep 17 00:00:00 2001 From: saminur Date: Thu, 15 Jan 2026 00:14:34 -0500 Subject: [PATCH 3/3] added the tab sepecific code in extension --- extension/writing-process/src/background.js | 4 +++- extension/writing-process/src/writing_common.js | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/extension/writing-process/src/background.js b/extension/writing-process/src/background.js index 236a91d3..ee9bfc1a 100644 --- a/extension/writing-process/src/background.js +++ b/extension/writing-process/src/background.js @@ -12,7 +12,7 @@ var RAW_DEBUG = false; var WEBSOCKET_SERVER_URL = "wss://learning-observer.org/wsapi/in/"; import { googledocs_id_from_url } from './writing_common'; - +import { tab_id_from_url } from './writing_common'; import * as loEvent from 'lo_event/lo_event/lo_event.js'; import * as loEventDebug from 'lo_event/lo_event/debugLog.js'; import { websocketLogger } from 'lo_event/lo_event/websocketLogger.js'; @@ -210,6 +210,7 @@ chrome.webRequest.onBeforeRequest.addListener( versus GMT. 
export function tab_id_from_url(url) {
    /*
      Given a URL like:
        https://docs.google.com/document/d/<doc_id>/edit?tab=t.95yb7msfl8ul
      extract the associated tab ID:
        t.95yb7msfl8ul

      The `tab` parameter is recognised when it follows `?`, `&` or `#`,
      so it is found both in the query string and in the fragment
      (e.g. `...edit#gid=0&tab=t.0` or `...edit#tab=t.0`). The ID stops
      at the next `&` or `#`, so a trailing fragment such as
      `?tab=t.0#heading=h.abc` is not included in the result.

      Return null if `url` is not a string or has no non-empty tab param.
    */
    // Non-string input (null, undefined, ...) has no `.match`; treat it as
    // "no tab" instead of throwing.
    if (typeof url !== 'string') {
        return null;
    }
    const match = url.match(/[?&#]tab=([^&#]+)/i);
    return match ? match[1] : null;
}