
User List movie data enrichment #160

@nuclearfall

Description

In order to move or delete an item in a list we need that item's index. Is there a reason not to extract the rest of the available data while we're at it?

Something like this:

import re
import json
from curl_cffi import requests as ccrequests

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from html import unescape


BASE = "https://letterboxd.com"


def make_session():
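    # Impersonate a real Chrome build and preload the cookies exported to
    # lb_cookies.json so every request reuses that browser session's state.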
    sess = ccrequests.Session(impersonate="chrome120")

    with open("lb_cookies.json", "r") as f:
        cookies = json.load(f)

    for c in cookies:
        sess.cookies.set(
            c["name"],
            c["value"],
            domain=c.get("domain"),
            path=c.get("path", "/")
        )

    return sess


def fetch(sess, url):
    # The impersonated session already sends a matching set of Chrome headers;
    # overriding the User-Agent here would only undercut the impersonation.
    r = sess.get(url)
    r.raise_for_status()
    return r.text


def get_progress_counts(html):
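    # The list page exposes watched/total counts as data-count / data-total on
    # section.progress-panel; return (None, None) when the panel is missing.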
    soup = BeautifulSoup(html, "lxml")
    sec = soup.select_one("section.progress-panel")
    if not sec:
        return None, None
    return int(sec["data-count"]), int(sec["data-total"])


def get_total_pages(html):
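    # Largest /page/N/ number found among the paginator links; a single-page
    # list has no paginator, so fall back to 1.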
    soup = BeautifulSoup(html, "lxml")
    pages = []
    for a in soup.select(".paginate-pages a[href]"):
        m = re.search(r"/page/(\d+)/", a["href"])
        if m:
            pages.append(int(m.group(1)))
    return max(pages) if pages else 1


def enrich_for_letterboxdpy(rec):
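    # Pull the year out of the display title (e.g. "Title (1979)") and lift
    # slug/name/year/url to the top level of the record.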
    film = rec["film"]
    raw = film["title_display"] or ""

    m = re.search(r"\((\d{4})\)", raw)
    year = int(m.group(1)) if m else None

    base = {
        "slug": film["slug"],
        "name": film["title_alt"],
        "year": year,
        "url": film["url"],
    }

    return {**base, **rec}


def parse_page(html, page_num, per_page, total_pages, total_items):
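    # Each list entry is an li.posteritem; most of the film metadata lives in
    # data-* attributes on its nested .react-component element.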
    soup = BeautifulSoup(html, "lxml")
    lis = soup.select("li.posteritem")

    records = []

    for i, li in enumerate(lis, start=1):
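        # Zero-based position across the whole list: full pages before this one
        # plus the zero-based offset on the current page.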
        index_global = (page_num - 1) * per_page + (i - 1)

        li_attrs = li.attrs
        react = li.select_one(".react-component")
        r = react.attrs if react else {}

        def decode(attr):
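            # These data-* attributes hold HTML-escaped JSON; return None rather
            # than raising when an attribute is missing or malformed.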
            if attr not in r:
                return None
            try:
                return json.loads(unescape(r[attr]))
            except Exception:
                return None

        poster_ident = decode("data-postered-identifier")
        resolvable = decode("data-resolvable-poster-path")
        img = li.select_one("img")

        rec = {
            "list_entry": {
                "object_id": li_attrs.get("data-object-id"),
                "object_type": li_attrs.get("data-object-name"),
                "owner_rating": int(li_attrs.get("data-owner-rating", 0)),
                "owner_rating_present": li_attrs.get("data-owner-rating", "0") != "0",
            },

            "film": {
                "film_id_raw": int(r.get("data-film-id")),
                "uid": poster_ident.get("uid") if poster_ident else None,
                "lid": poster_ident.get("lid") if poster_ident else None,
                "slug": r.get("data-item-slug"),
                "url": urljoin(BASE, r.get("data-item-link", "")),
                "title_display": r.get("data-item-name"),
                "title_alt": img.get("alt") if img else None,
            },

            "endpoints": {
                "details_json": urljoin(BASE, r.get("data-details-endpoint", "")),
                "poster_resolver": urljoin(BASE, r.get("data-poster-url", "")),
            },

            "poster_state": {
                "width": int(r.get("data-image-width")),
                "height": int(r.get("data-image-height")),
                "empty_poster": r.get("data-empty-poster-src"),
                "cache_busting_key": resolvable.get("cacheBustingKey") if resolvable else None,
                "has_default": resolvable.get("hasDefaultPoster") if resolvable else None,
                "is_adult": resolvable.get("isAdultThemed") if resolvable else None,
            },

            "capabilities": {
                "likeable": r.get("data-likeable") == "true",
                "watchable": r.get("data-watchable") == "true",
                "rateable": r.get("data-rateable") == "true",
                "request_metadata": r.get("data-request-poster-metadata") == "true",
                "is_linked": r.get("data-is-linked") == "true",
                "show_menu": r.get("data-show-menu") == "true",
            },

            "pagination": {
                "page": page_num,
                "per_page": per_page,
                "total_pages": total_pages,
                "total_items": total_items,
            },

            "position": {
                "page": page_num,
                "index_on_page": i,
                "index_global": index_global,
            },
        }

        records.append(enrich_for_letterboxdpy(rec))

    return records, len(lis)


def extract_list(list_url, out_file="hw_list.jsonl"):
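    # Walk every page of the list, flatten each entry into one enriched record,
    # and write the results out as JSON Lines.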
    sess = make_session()
    page1 = fetch(sess, list_url)

    watched, total_items = get_progress_counts(page1)
    if total_items is None:
        raise RuntimeError("Could not find progress-panel data-total")

    total_pages = get_total_pages(page1)

    print(f"TOTAL ITEMS: {total_items}")
    print(f"TOTAL PAGES: {total_pages}")

    all_records = []
    html = page1

    for page in range(1, total_pages + 1):
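        # Page 1 was already fetched above; only later pages need a new request.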
        if page > 1:
            html = fetch(sess, f"{list_url.rstrip('/')}/page/{page}/")

        # Page size must come from a full page (page 1); recomputing it from a
        # shorter final page would skew index_global for that page's items.
        if page == 1:
            per_page = len(BeautifulSoup(html, "lxml").select("li.posteritem"))

        recs, _ = parse_page(html, page, per_page, total_pages, total_items)
        all_records.extend(recs)

        print(f"page {page}: {len(recs)}")

    with open(out_file, "w", encoding="utf-8") as f:
        for r in all_records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    print(f"\nWrote {len(all_records)} records to {out_file}")
    return all_records


if __name__ == "__main__":
    LIST = "https://letterboxd.com/grryboy/list/horror-watched/"
    extract_list(LIST)
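
For the move/delete use case, the piece that matters afterwards is position.index_global. A minimal lookup sketch against the JSONL written above (find_index and the "alien" slug are purely illustrative, not part of any existing API):

import json

def find_index(slug, path="hw_list.jsonl"):
    # Return position.index_global of the first record whose film slug matches,
    # or None if the slug is not in the list.
    with open(path, encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            if rec.get("slug") == slug:
                return rec["position"]["index_global"]
    return None

print(find_index("alien"))  # hypothetical slug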
