-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser_template.py
More file actions
69 lines (57 loc) · 2.48 KB
/
parser_template.py
File metadata and controls
69 lines (57 loc) · 2.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Template: Writing a parser for a non-Wowhead site.
Each parser is a function that takes raw HTML and an entity ID,
extracts structured data, and returns a dict. Register it in
the PARSERS dict at the bottom of parsers.py so the scraper
automatically routes pages to your function.
"""
import json
import re

from bs4 import BeautifulSoup
def parse_mysite_page(html: str, entity_id: int) -> dict:
    """Parse a page from mysite.example.com and return structured data.

    Args:
        html: Full page HTML as a string (already decoded).
        entity_id: The numeric ID being scraped (used for output keying).

    Returns:
        Dict with extracted fields. At minimum, includes 'id' and, when a
        heading is found, 'name'. Add whatever fields the site exposes --
        the scraper stores the entire dict as JSON, so be greedy.
    """
    soup = BeautifulSoup(html, "html.parser")
    result: dict = {"id": entity_id}

    # -- Name: grab from the page title or a heading --------------------------
    heading = soup.select_one("h1.entry-title, h1.page-title, h1")
    if heading:
        result["name"] = heading.get_text(strip=True)

    # -- Sidebar / infobox key-value pairs ------------------------------------
    # Accept both <td>key</td><td>val</td> and the common
    # <th>key</th><td>val</td> layout; previously only two-<td> rows matched.
    infobox: dict = {}
    for row in soup.select("table.infobox tr"):
        cells = row.find_all(["th", "td"])
        if len(cells) == 2:
            key = cells[0].get_text(strip=True).rstrip(":").lower().replace(" ", "_")
            val = cells[1].get_text(strip=True)
            infobox[key] = val
    if infobox:
        result["infobox"] = infobox

    # -- Embedded JSON data (common pattern: JS variable in a <script>) -------
    # Many sites embed data like: var pageData = { ... };
    # Use get_text() rather than .string: .string is None whenever a <script>
    # tag has more than one child node, which silently skipped those scripts.
    # NOTE(review): the non-greedy {.+?} stops at the first '}', so blobs with
    # nested objects are truncated and rejected by json.loads -- acceptable
    # for flat pageData, but confirm against the target site.
    for script in soup.find_all("script"):
        text = script.get_text()
        m = re.search(r"var\s+pageData\s*=\s*(\{.+?\})\s*;", text, re.DOTALL)
        if not m:
            continue
        try:
            result["page_data"] = json.loads(m.group(1))
        except json.JSONDecodeError:
            continue  # skip malformed blobs -- don't crash the scraper
        break  # first well-formed blob wins; don't let later scripts clobber it
    return result
# ---- Registration -----------------------------------------------------------
# In parsers.py, add your parser to the PARSERS dict:
#
# PARSERS = {
# ...
# "mysite_entity": parse_mysite_page, # <-- add this line
# }
#
# The dict key must match the target name used in generate_id_lists.py
# so the scraper knows which parser handles which target's HTML.