Skip to content

Commit 39c78b0

Browse files
Igzak
authored and committed
Revert "feat: add resource and file type blocking in browser params"
This reverts commit 90bed45. Intercepting requests slows down the scraper.
1 parent 90bed45 commit 39c78b0

File tree

2 files changed

+10
-51
lines changed

2 files changed

+10
-51
lines changed

app/routers/any_page.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from fastapi import APIRouter, Query, Depends
88
from fastapi.requests import Request
99
from pydantic import BaseModel
10-
from playwright.async_api import Browser, Route
10+
from playwright.async_api import Browser
1111

1212
from internal import cache
1313
from internal.browser import (
@@ -23,8 +23,10 @@
2323
ProxyQueryParams,
2424
)
2525

26+
2627
router = APIRouter(prefix='/api/page', tags=['page'])
2728

29+
2830
class AnyPage(BaseModel):
2931
id: Annotated[str, Query(description='unique result ID')]
3032
url: Annotated[str, Query(description='page URL after redirects, may not match the query URL')]
@@ -38,24 +40,25 @@ class AnyPage(BaseModel):
3840
title: Annotated[str | None, Query(description="page's title")] = None
3941
status_code: Annotated[int, Query(description='HTTP status code of the page')]
4042

43+
4144
@router.get('', summary='Get any page from the given URL', response_model=AnyPage)
4245
async def get_any_page(
4346
request: Request,
4447
url: Annotated[URLParam, Depends()],
4548
common_params: Annotated[CommonQueryParams, Depends()],
4649
browser_params: Annotated[BrowserQueryParams, Depends()],
47-
proxy_params: Annotated[ProxyQueryParams, Depends()]
50+
proxy_params: Annotated[ProxyQueryParams, Depends()],
4851
) -> dict:
4952
"""
5053
Get any page from the given URL.<br><br>
5154
Page is fetched using Playwright, but no additional processing is done.
5255
"""
5356
# pylint: disable=duplicate-code
54-
# Split URL into parts: host with scheme, path with query, query params as a dict
57+
# split URL into parts: host with scheme, path with query, query params as a dict
5558
host_url, full_path, query_dict = split_url(request.url)
5659

57-
# Get cache data if exists
58-
r_id = cache.make_key(full_path) # Unique result ID
60+
# get cache data if exists
61+
r_id = cache.make_key(full_path) # unique result ID
5962
if common_params.cache:
6063
data = cache.load_result(key=r_id)
6164
if data:
@@ -64,19 +67,9 @@ async def get_any_page(
6467
browser: Browser = request.state.browser
6568
semaphore: asyncio.Semaphore = request.state.semaphore
6669

67-
# Create a new browser context
70+
# create a new browser context
6871
async with semaphore:
6972
async with new_context(browser, browser_params, proxy_params) as context:
70-
async def block_unwanted_resources(route: Route):
71-
if route.request.resource_type in (browser_params.block_types or []):
72-
await route.abort()
73-
elif route.request.url.lower().endswith(tuple(browser_params.block_extensions or [])):
74-
await route.abort()
75-
else:
76-
await route.continue_()
77-
78-
await context.route("**/*", block_unwanted_resources)
79-
8073
page = await context.new_page()
8174
status = await page_processing(
8275
page=page,
@@ -106,7 +99,7 @@ async def block_unwanted_resources(route: Route):
10699
if common_params.screenshot:
107100
r['screenshotUri'] = f'{host_url}/screenshot/{r_id}'
108101

109-
# Save result to disk
102+
# save result to disk
110103
if common_params.cache:
111104
cache.dump_result(r, key=r_id, screenshot=screenshot)
112105
return r

app/routers/query_params.py

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -201,28 +201,6 @@ def __init__(
201201
)
202202
),
203203
] = WaitUntilEnum.DOMCONTENTLOADED,
204-
block_types: Annotated[
205-
str | None,
206-
Query(
207-
alias='block-types',
208-
description=(
209-
'List of resource types to block on the page.<br>'
210-
'Examples: `image`, `media`, `font`, etc.<br>'
211-
'By default, all resources are allowed.'
212-
),
213-
),
214-
] = None,
215-
block_extensions: Annotated[
216-
str | None,
217-
Query(
218-
alias='block-extensions',
219-
description=(
220-
'List of file extensions to block on the page.<br>'
221-
'Examples: `.pdf`, `.jpg`, `.zip`, etc.<br>'
222-
'By default, no extensions are blocked.'
223-
),
224-
),
225-
] = None,
226204
sleep: Annotated[
227205
int,
228206
Query(
@@ -382,24 +360,12 @@ def __init__(
382360
self.timezone = timezone
383361
self.http_credentials = None
384362
self.extra_http_headers = None
385-
self.block_types = block_types
386-
self.block_extensions = block_extensions
387363

388364
if resource:
389365
resource = list(filter(None, map(str.strip, resource.split(','))))
390366
if resource:
391367
self.resource = resource
392368

393-
if block_extensions:
394-
block_extensions = list(filter(None, map(str.strip, block_extensions.split(','))))
395-
if block_extensions:
396-
self.block_extensions = block_extensions
397-
398-
if block_types:
399-
block_types = list(filter(None, map(str.strip, block_types.split(','))))
400-
if block_types:
401-
self.block_types = block_types
402-
403369
if device not in DEVICE_REGISTRY:
404370
raise QueryParsingError('device', 'Device not found', device)
405371

0 commit comments

Comments
 (0)