-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplaywright_fetch.py
More file actions
86 lines (70 loc) · 2.18 KB
/
playwright_fetch.py
File metadata and controls
86 lines (70 loc) · 2.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Playwright Fetching Module
Handles headless browser fetching with smart content waiting strategies.
"""
import asyncio
from playwright.async_api import BrowserContext
# Timeout passed to page.goto() in fetch_with_playwright, in milliseconds
# (45000 ms = 45 seconds).
PLAYWRIGHT_TIMEOUT = 45000
async def smart_wait_for_content(
    page,
    *,
    selector_timeout_ms: float = 3000,
    network_idle_timeout_ms: float = 5000,
    scroll_pause_s: float = 0.5,
) -> None:
    """
    Best-effort wait for dynamically rendered page content.

    The steps are tried in order:

    1. Wait for a common article container to match; if one does, return
       immediately and skip the remaining steps.
    2. Otherwise wait for the page to reach the 'networkidle' load state.
    3. Then scroll to the middle, to the bottom and back to the top of the
       document to trigger lazy loading.

    Every step is optional: any ``Exception`` it raises (timeouts included)
    is deliberately ignored so that a slow or unusual page does not abort
    the caller.

    Args:
        page: Object exposing the async methods ``wait_for_selector``,
            ``wait_for_load_state`` and ``evaluate`` (a Playwright page).
        selector_timeout_ms: Maximum wait for a content container, in
            milliseconds.
        network_idle_timeout_ms: Maximum wait for the 'networkidle' load
            state, in milliseconds.
        scroll_pause_s: Seconds to sleep after each of the two downward
            scrolls.
    """
    # Any one of these containers counts as "content is present".
    combined_selector = (
        'article, main, [role="main"], .article-content, '
        '.post-content, .entry-content, [itemprop="articleBody"]'
    )
    try:
        await page.wait_for_selector(
            combined_selector, timeout=selector_timeout_ms
        )
    except Exception:
        # Best-effort: no container showed up in time, use the fallbacks.
        pass
    else:
        return

    # Fallback: wait for network to settle
    try:
        await page.wait_for_load_state(
            'networkidle', timeout=network_idle_timeout_ms
        )
    except Exception:
        # Best-effort: pages that never go idle must not fail the fetch.
        pass

    # Scroll to trigger lazy loading
    try:
        await page.evaluate("""
            () => {
                // Scroll to middle
                window.scrollTo(0, document.body.scrollHeight / 2);
            }
        """)
        await asyncio.sleep(scroll_pause_s)
        await page.evaluate("""
            () => {
                // Scroll to bottom
                window.scrollTo(0, document.body.scrollHeight);
            }
        """)
        await asyncio.sleep(scroll_pause_s)
        # Scroll back to top
        await page.evaluate("() => window.scrollTo(0, 0)")
    except Exception:
        # Best-effort: a failed scroll script is not worth aborting for.
        pass
async def fetch_with_playwright(
    context: BrowserContext,
    url: str
) -> str:
    """
    Load a URL in a fresh page of the given browser context and return
    the page's HTML after the content-waiting step has run.

    Args:
        context: Playwright browser context in which the page is opened
        url: Address to navigate to

    Returns:
        str: The value of page.content() once waiting has finished

    Exceptions raised by navigation or content retrieval are not caught
    here; the page is closed before they propagate.
    """
    tab = await context.new_page()
    try:
        # Navigation returns once the DOM content has loaded (not on network
        # idle); PLAYWRIGHT_TIMEOUT bounds this step.
        await tab.goto(
            url,
            timeout=PLAYWRIGHT_TIMEOUT,
            wait_until="domcontentloaded",
        )
        # Give dynamically rendered content a chance to appear.
        await smart_wait_for_content(tab)
        # Serialize the page's current HTML.
        return await tab.content()
    finally:
        # The page is closed on success and on failure alike.
        await tab.close()