-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch_cards.py
More file actions
59 lines (50 loc) · 1.84 KB
/
fetch_cards.py
File metadata and controls
59 lines (50 loc) · 1.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import sys
import json
import asyncio
from playwright.async_api import async_playwright
# Substrings used to decide whether a scraped card is relevant: main() keeps
# a card when its visible text contains at least one of these entries.
KEYWORDS = [
    "火影子时",
    "木叶快报",
    "火影忍者手游",
    "火影手游",
    "丁次烤肉",
    "木叶村广播站",
    "子时小周报",
    "饰品",
    "火影",
]
def _select_cards(cards, keywords, limit=12):
    """Filter scraped link cards down to the relevant, de-duplicated ones.

    Args:
        cards: Iterable of dicts as produced by the page-side script in
            ``main`` (keys ``idx``, ``href``, ``text``).
        keywords: Substrings; a card is kept when its text contains any of
            them, or contains the marker "置顶" ("pinned").
        limit: Maximum number of cards to return.

    Returns:
        A list with at most ``limit`` of the original card dicts, in their
        original order.
    """
    selected = []
    seen = set()
    for card in cards:
        href = card.get("href") or ""
        text = card.get("text") or ""
        if not href or href in seen:
            continue
        # Skip links carrying this query marker (presumably crawler-facing
        # duplicates of the real post links -- confirm against the site).
        if "source=Baiduspider" in href:
            continue
        if "/video/" not in href and "/note/" not in href:
            continue
        if not text:
            continue
        if "置顶" not in text and not any(k in text for k in keywords):
            continue
        seen.add(href)
        selected.append(card)
        if len(selected) >= limit:
            break
    return selected


async def main():
    """Scrape a Douyin user's post tab and print matching cards as JSON.

    The user id is read from the first command-line argument. The profile
    page is opened in headless Chromium, scrolled a few times, and every
    ``/note/`` or ``/video/`` link is collected together with its visible
    text. The links are then filtered by ``KEYWORDS`` and the result (at
    most 12 cards) is written to stdout as a JSON array.

    Raises:
        SystemExit: If no user id was passed on the command line.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        raise SystemExit("usage: fetch_cards.py <user_id>")
    user_id = sys.argv[1]
    page_url = f"https://www.douyin.com/user/{user_id}?from_tab_name=main&showTab=post"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page(viewport={"width": 1440, "height": 2200})
            await page.goto(page_url, wait_until="domcontentloaded", timeout=60000)
            # Fixed pause after DOMContentLoaded (presumably to let the
            # client-side rendered post grid appear).
            await page.wait_for_timeout(8000)
            # Scroll down a few times (presumably to trigger lazy loading
            # of further posts).
            for _ in range(3):
                await page.mouse.wheel(0, 1800)
                await page.wait_for_timeout(1500)
            # Raw string: the JavaScript regex contains ``\s``, which is not
            # a valid escape sequence in a normal Python string literal.
            cards = await page.eval_on_selector_all(
                'a[href*="/note/"], a[href*="/video/"]',
                r'''els => els.map((a, i) => ({
                    idx: i + 1,
                    href: a.href || '',
                    text: (a.innerText || (a.parentElement && a.parentElement.innerText) || '').replace(/\s+/g, ' ').trim().slice(0, 300)
                }))''',
            )
        finally:
            # Close the browser even when navigation or evaluation fails.
            await browser.close()

    result = _select_cards(cards, KEYWORDS, limit=12)
    # ensure_ascii=False keeps the Chinese card text readable in the output.
    print(json.dumps(result, ensure_ascii=False))
# Script entry point: run the async scraper to completion when this file is
# executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())