In order to move or delete items in a list, we need the index of the item we want to move or delete. Is there a reason not to extract the remaining data as well?
Something like this:
import re
import json
from curl_cffi import requests as ccrequests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from html import unescape
BASE = "https://letterboxd.com"
def make_session():
    # curl_cffi session impersonating a real Chrome build's TLS/HTTP fingerprint
    sess = ccrequests.Session(impersonate="chrome120")
    # lb_cookies.json: session cookies previously exported from a logged-in browser
    with open("lb_cookies.json", "r") as f:
        cookies = json.load(f)
    for c in cookies:
        sess.cookies.set(
            c["name"],
            c["value"],
            domain=c.get("domain"),
            path=c.get("path", "/"),
        )
    return sess
def fetch(sess, url):
    r = sess.get(url, headers={"User-Agent": "Mozilla/5.0"})
    r.raise_for_status()
    return r.text
def get_progress_counts(html):
    soup = BeautifulSoup(html, "lxml")
    sec = soup.select_one("section.progress-panel")
    if not sec:
        return None, None
    return int(sec["data-count"]), int(sec["data-total"])
def get_total_pages(html):
    soup = BeautifulSoup(html, "lxml")
    pages = []
    for a in soup.select(".paginate-pages a[href]"):
        m = re.search(r"/page/(\d+)/", a["href"])
        if m:
            pages.append(int(m.group(1)))
    return max(pages) if pages else 1
def enrich_for_letterboxdpy(rec):
    # Promote the fields a letterboxdpy-style consumer expects (slug, name,
    # year, url) to the top level, keeping the full nested record alongside.
    film = rec["film"]
    raw = film["title_display"] or ""
    m = re.search(r"\((\d{4})\)", raw)
    year = int(m.group(1)) if m else None
    base = {
        "slug": film["slug"],
        "name": film["title_alt"],
        "year": year,
        "url": film["url"],
    }
    return {**base, **rec}
def parse_page(html, page_num, per_page, total_pages, total_items):
    soup = BeautifulSoup(html, "lxml")
    lis = soup.select("li.posteritem")
    records = []
    for i, li in enumerate(lis, start=1):
        # 0-based position across the whole list: the index needed to move
        # or delete the entry later.
        index_global = (page_num - 1) * per_page + (i - 1)
        li_attrs = li.attrs
        react = li.select_one(".react-component")
        r = react.attrs if react else {}

        def decode(attr):
            # Some data-* attributes hold HTML-escaped JSON blobs.
            if attr not in r:
                return None
            try:
                return json.loads(unescape(r[attr]))
            except Exception:
                return None

        def to_int(val):
            # Attributes can be absent; avoid int(None) crashing the parse.
            return int(val) if val is not None else None

        poster_ident = decode("data-postered-identifier")
        resolvable = decode("data-resolvable-poster-path")
        img = li.select_one("img")
        rec = {
            "list_entry": {
                "object_id": li_attrs.get("data-object-id"),
                "object_type": li_attrs.get("data-object-name"),
                "owner_rating": int(li_attrs.get("data-owner-rating", 0)),
                "owner_rating_present": li_attrs.get("data-owner-rating", "0") != "0",
            },
            "film": {
                "film_id_raw": to_int(r.get("data-film-id")),
                "uid": poster_ident.get("uid") if poster_ident else None,
                "lid": poster_ident.get("lid") if poster_ident else None,
                "slug": r.get("data-item-slug"),
                "url": urljoin(BASE, r.get("data-item-link", "")),
                "title_display": r.get("data-item-name"),
                "title_alt": img.get("alt") if img else None,
            },
            "endpoints": {
                "details_json": urljoin(BASE, r.get("data-details-endpoint", "")),
                "poster_resolver": urljoin(BASE, r.get("data-poster-url", "")),
            },
            "poster_state": {
                "width": to_int(r.get("data-image-width")),
                "height": to_int(r.get("data-image-height")),
                "empty_poster": r.get("data-empty-poster-src"),
                "cache_busting_key": resolvable.get("cacheBustingKey") if resolvable else None,
                "has_default": resolvable.get("hasDefaultPoster") if resolvable else None,
                "is_adult": resolvable.get("isAdultThemed") if resolvable else None,
            },
            "capabilities": {
                "likeable": r.get("data-likeable") == "true",
                "watchable": r.get("data-watchable") == "true",
                "rateable": r.get("data-rateable") == "true",
                "request_metadata": r.get("data-request-poster-metadata") == "true",
                "is_linked": r.get("data-is-linked") == "true",
                "show_menu": r.get("data-show-menu") == "true",
            },
            "pagination": {
                "page": page_num,
                "per_page": per_page,
                "total_pages": total_pages,
                "total_items": total_items,
            },
            "position": {
                "page": page_num,
                "index_on_page": i,
                "index_global": index_global,
            },
        }
        records.append(enrich_for_letterboxdpy(rec))
    return records, len(lis)
def extract_list(list_url, out_file="hw_list.jsonl"):
    sess = make_session()
    page1 = fetch(sess, list_url)
    watched, total_items = get_progress_counts(page1)
    if total_items is None:
        raise RuntimeError("Could not find progress-panel data-total")
    total_pages = get_total_pages(page1)
    print(f"TOTAL ITEMS: {total_items}")
    print(f"TOTAL PAGES: {total_pages}")
    # Take the page size from page 1, which is always full; using each page's
    # own item count would make index_global wrong on a short last page.
    per_page = len(BeautifulSoup(page1, "lxml").select("li.posteritem"))
    all_records = []
    html = page1
    for page in range(1, total_pages + 1):
        if page > 1:
            html = fetch(sess, f"{list_url.rstrip('/')}/page/{page}/")
        recs, _ = parse_page(html, page, per_page, total_pages, total_items)
        all_records.extend(recs)
        print(f"page {page}: {len(recs)}")
    with open(out_file, "w", encoding="utf-8") as f:
        for r in all_records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"\nWrote {len(all_records)} records to {out_file}")
    return all_records
if __name__ == "__main__":
    LIST = "https://letterboxd.com/grryboy/list/horror-watched/"
    extract_list(LIST)
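To close the loop on the original question: once the JSONL dump exists, getting the index for a move/delete call is a one-pass scan over position.index_global. A minimal sketch, assuming the record layout above; find_index and the "the-thing" slug are hypothetical names for illustration, not part of letterboxdpy:

import json

def find_index(jsonl_path, slug):
    # Return the 0-based list position recorded at extraction time,
    # or None if the film is not in the dump.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            if rec.get("slug") == slug:
                return rec["position"]["index_global"]
    return None

# e.g. idx = find_index("hw_list.jsonl", "the-thing")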