-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtabbed_doc.py
More file actions
209 lines (181 loc) · 7.15 KB
/
tabbed_doc.py
File metadata and controls
209 lines (181 loc) · 7.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
Create a single Google Doc with one tab per page.
Uses the Google Docs API tabs feature: creates the doc, adds named tabs
for all pages via batchUpdate, inserts content per-tab, then deletes
the empty default tab.
"""
import asyncio
import copy
import sys
from auth import get_docs_service, get_drive_service
from docs_converter import convert_markdown_to_doc_requests
from google_drive import sanitize_doc_title
from recursive_crawler import normalize_url
def _truncate_to_word(text: str, max_len: int) -> str:
"""Truncate text to max_len, cutting at the last word boundary."""
if len(text) <= max_len:
return text
truncated = text[:max_len]
# Find last space to avoid cutting mid-word
last_space = truncated.rfind(" ")
if last_space > 0:
return truncated[:last_space].rstrip()
return truncated
def _inject_tab_id(requests: list[dict], tab_id: str) -> list[dict]:
"""Add tabId to all location/range objects in batchUpdate requests.
Walks each request dict recursively and injects 'tabId' into every
'location' and 'range' dict found. Returns a deep copy — does not
mutate the input.
"""
patched = copy.deepcopy(requests)
def _walk(obj):
if isinstance(obj, dict):
if 'index' in obj and 'tabId' not in obj:
# This looks like a location dict: {index: N}
obj['tabId'] = tab_id
if 'startIndex' in obj and 'endIndex' in obj and 'tabId' not in obj:
# This looks like a range dict: {startIndex, endIndex}
obj['tabId'] = tab_id
for value in obj.values():
_walk(value)
elif isinstance(obj, list):
for item in obj:
_walk(item)
_walk(patched)
return patched
def _unique_tab_titles(pages: list[tuple[str, str, str]]) -> list[str]:
    """Build one unique tab title (<= 50 chars) per page.

    Google Docs requires unique tab titles.  A duplicate title is first
    disambiguated with the URL's last path segment; if that still collides
    (identical slugs, or the hard-truncated fallback), a numeric suffix
    guarantees uniqueness — the original code could silently emit
    duplicates here, which the Docs API rejects.
    """
    from urllib.parse import urlparse  # function-scope: keeps module deps unchanged

    seen: set[str] = set()
    titles: list[str] = []
    for url, title, _md in pages:
        tab_title = _truncate_to_word(title, 50)
        if tab_title in seen:
            # Append the last path segment to disambiguate.
            path = urlparse(url).path.rstrip("/")
            slug = path.split("/")[-1] if "/" in path else path
            suffix = f" ({slug})"
            tab_title = _truncate_to_word(title, 50 - len(suffix)) + suffix
            # If the slug itself is too long, hard-truncate the whole thing.
            if len(tab_title) > 50:
                tab_title = title[:47] + "..."
        # Bug fix: the slug (or the truncated fallback) may still collide
        # with an earlier title — append a counter until unique.
        base = tab_title
        n = 2
        while tab_title in seen:
            tail = f" ({n})"
            tab_title = base[: 50 - len(tail)] + tail
            n += 1
        seen.add(tab_title)
        titles.append(tab_title)
    return titles


async def create_tabbed_google_doc(
    pages: list[tuple[str, str, str]],
    doc_title: str,
    folder_id: str,
    base_url: str | None = None,
) -> str:
    """Create a single Google Doc with one tab per page.

    Args:
        pages: List of (url, title, markdown_with_source) tuples,
            ordered by desired tab position.
        doc_title: Title of the overall Google Doc.
        folder_id: Google Drive folder ID to place the doc in.
        base_url: The base URL prefix used for recursive crawl
            (for metadata tagging).

    Returns:
        URL of the created Google Doc (or of an existing doc with the
        same title already present in the folder).
    """
    docs_service = get_docs_service()
    drive_service = get_drive_service()

    # Check for existing doc with the same title (direct query, fast).
    safe_title = doc_title.replace("'", "\\'")
    query = (
        f"name='{safe_title}' and '{folder_id}' in parents "
        f"and mimeType='application/vnd.google-apps.document' and trashed=false"
    )
    existing = await asyncio.to_thread(
        lambda: drive_service.files().list(q=query, fields='files(id)', pageSize=1).execute()
    )
    if existing.get('files'):
        existing_id = existing['files'][0]['id']
        doc_url = f"https://docs.google.com/document/d/{existing_id}/edit"
        print(f" ↺ Existing doc found for '{doc_title}', reusing: {doc_url}", file=sys.stderr)
        return doc_url

    # Step 1: Create blank doc via Drive API.
    file_metadata = {
        'name': doc_title,
        'mimeType': 'application/vnd.google-apps.document',
    }
    doc = await asyncio.to_thread(
        lambda: drive_service.files().create(body=file_metadata, fields='id').execute()
    )
    doc_id = doc['id']

    # Step 2: Move to target folder and tag with metadata.
    update_body = {}
    if base_url:
        update_body['appProperties'] = {
            'doc_mode': 'tabbed',
            # appProperties values are size-limited — clamp the URL.
            'base_url': normalize_url(base_url)[:124],
        }
    await asyncio.to_thread(
        lambda: drive_service.files().update(
            fileId=doc_id,
            addParents=folder_id,
            removeParents='root',
            body=update_body,
            fields='id, parents',
        ).execute()
    )
    print(f" Created doc: {doc_title}", file=sys.stderr)

    # Step 3: Get the default tab ID (we'll delete it later).
    doc_info = await asyncio.to_thread(
        lambda: docs_service.documents().get(
            documentId=doc_id, fields='tabs'
        ).execute()
    )
    default_tab_id = doc_info['tabs'][0]['tabProperties']['tabId']

    # Step 4: Create named tabs for ALL pages in a single batchUpdate.
    tab_titles = _unique_tab_titles(pages)
    tab_requests = [
        {
            'addDocumentTab': {
                'tabProperties': {
                    'title': tab_title,
                    'index': i,
                }
            }
        }
        for i, tab_title in enumerate(tab_titles)
    ]
    tab_response = await asyncio.to_thread(
        lambda: docs_service.documents().batchUpdate(
            documentId=doc_id,
            body={'requests': tab_requests},
        ).execute()
    )

    # Step 5: Extract tab IDs from the batchUpdate replies.
    tab_ids = [
        reply['addDocumentTab']['tabProperties']['tabId']
        for reply in tab_response.get('replies', [])
        if 'addDocumentTab' in reply
    ]
    # Fix: a length mismatch would silently drop pages in the zip()
    # below — surface it instead of losing content without a trace.
    if len(tab_ids) != len(pages):
        print(
            f" WARNING: created {len(tab_ids)} tabs for {len(pages)} pages",
            file=sys.stderr,
        )

    # Step 6: Insert content into each tab.
    for i, (tab_id, (url, title, md)) in enumerate(zip(tab_ids, pages), start=1):
        requests = convert_markdown_to_doc_requests(md, doc_title=title)
        if requests:
            patched = _inject_tab_id(requests, tab_id)
            try:
                # `patched` is bound as a default arg so the lambda does
                # not close over the loop variable (late binding).
                await asyncio.to_thread(
                    lambda patched=patched: docs_service.documents().batchUpdate(
                        documentId=doc_id,
                        body={'requests': patched},
                    ).execute()
                )
            except Exception as e:
                print(f" [{i}/{len(pages)}] FAILED: {title} ({e})", file=sys.stderr)
                continue
        print(f" [{i}/{len(pages)}] {title}", file=sys.stderr)
        # Rate limit between tab insertions.
        await asyncio.sleep(0.5)

    # Step 7: Delete the empty default tab.
    try:
        await asyncio.to_thread(
            lambda: docs_service.documents().batchUpdate(
                documentId=doc_id,
                body={'requests': [{'deleteDocumentTab': {'tabId': default_tab_id}}]},
            ).execute()
        )
    except Exception:
        pass  # Non-critical — just leaves an empty "Tab 1"

    doc_url = f"https://docs.google.com/document/d/{doc_id}/edit"
    return doc_url