diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000..7dbfac2
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,12 @@
+# These are supported funding model platforms
+
+github: [alirezamika] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 4e1ef42..7665a92 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -21,7 +21,11 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install setuptools wheel twine
+ pip install setuptools wheel twine pytest
+ pip install .
+ - name: Run tests
+ run: |
+ pytest -q
- name: Build and publish
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml
new file mode 100644
index 0000000..48d4fab
--- /dev/null
+++ b/.github/workflows/stale-issues.yml
@@ -0,0 +1,22 @@
+name: Close inactive issues
+on:
+ schedule:
+ - cron: "30 1 * * *"
+
+jobs:
+ close-issues:
+ runs-on: ubuntu-latest
+ permissions:
+ issues: write
+ pull-requests: write
+ steps:
+ - uses: actions/stale@v5
+ with:
+ days-before-issue-stale: 30
+ days-before-issue-close: 14
+ stale-issue-label: "stale"
+ stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
+ close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+ days-before-pr-stale: 30
+ days-before-pr-close: 14
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..6dd6873
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,23 @@
+name: Run Tests
+
+on:
+ push:
+ release:
+ types: [created]
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python
+ uses: actions/setup-python@v2
+ with:
+ python-version: '3.x'
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install pytest
+ pip install .
+ - name: Run tests
+ run: pytest -q
diff --git a/README.md b/README.md
index ba50d91..a09a2f9 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@
This project is made for automatic web scraping to make scraping easy.
It gets a url or the html content of a web page and a list of sample data which we want to scrape from that page. **This data can be text, url or any html tag value of that page.** It learns the scraping rules and returns the similar elements. Then you can use this learned object with new urls to get similar content or the exact same element of those new pages.
+
## Installation
It's compatible with python 3.
@@ -37,7 +38,7 @@ url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
# We can add one or multiple candidates here.
# You can also put urls here to retrieve urls.
-wanted_list = ["How to call an external command?"]
+wanted_list = ["What are metaclasses in Python?"]
scraper = AutoScraper()
result = scraper.build(url, wanted_list)
@@ -108,7 +109,7 @@ from autoscraper import AutoScraper
url = 'https://github.com/alirezamika/autoscraper'
-wanted_list = ['A Smart, Automatic, Fast and Lightweight Web Scraper for Python', '2.2k', 'https://github.com/alirezamika/autoscraper/issues']
+wanted_list = ['A Smart, Automatic, Fast and Lightweight Web Scraper for Python', '6.2k', 'https://github.com/alirezamika/autoscraper/issues']
scraper = AutoScraper()
scraper.build(url, wanted_list)
@@ -140,6 +141,7 @@ scraper.load('yahoo-finance')
## Issues
Feel free to open an issue if you have any problem using the module.
+
## Support the project
diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py
index 63d3229..73f90a6 100644
--- a/autoscraper/auto_scraper.py
+++ b/autoscraper/auto_scraper.py
@@ -1,7 +1,5 @@
import hashlib
import json
-import unicodedata
-
from collections import defaultdict
from html import unescape
from urllib.parse import urljoin, urlparse
@@ -9,8 +7,15 @@
import requests
from bs4 import BeautifulSoup
-from autoscraper.utils import get_random_str, unique_hashable, unique_stack_list, \
- ResultItem, FuzzyText
+from autoscraper.utils import (
+ FuzzyText,
+ ResultItem,
+ get_non_rec_text,
+ normalize,
+ text_match,
+ unique_hashable,
+ unique_stack_list,
+)
class AutoScraper(object):
@@ -38,8 +43,8 @@ class AutoScraper(object):
"""
request_headers = {
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 \
- (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 \
+ (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
}
def __init__(self, stack_list=None):
@@ -60,7 +65,7 @@ def save(self, file_path):
"""
data = dict(stack_list=self.stack_list)
- with open(file_path, 'w') as f:
+ with open(file_path, "w") as f:
json.dump(data, f)
def load(self, file_path):
@@ -77,7 +82,7 @@ def load(self, file_path):
None
"""
- with open(file_path, 'r') as f:
+ with open(file_path, "r") as f:
data = json.load(f)
# for backward compatibility
@@ -85,76 +90,100 @@ def load(self, file_path):
self.stack_list = data
return
- self.stack_list = data['stack_list']
+ self.stack_list = data["stack_list"]
@classmethod
- def _get_soup(cls, url=None, html=None, request_args=None):
+ def _fetch_html(cls, url, request_args=None):
request_args = request_args or {}
-
- if html:
- html = unicodedata.normalize("NFKD", unescape(html))
- return BeautifulSoup(html, 'lxml')
-
headers = dict(cls.request_headers)
if url:
- headers['Host'] = urlparse(url).netloc
+ headers["Host"] = urlparse(url).netloc
- user_headers = request_args.pop('headers', {})
+ user_headers = request_args.pop("headers", {})
headers.update(user_headers)
- html = requests.get(url, headers=headers, **request_args).text
- html = unicodedata.normalize("NFKD", unescape(html))
+ res = requests.get(url, headers=headers, **request_args)
+ if res.encoding == "ISO-8859-1" and not "ISO-8859-1" in res.headers.get(
+ "Content-Type", ""
+ ):
+ res.encoding = res.apparent_encoding
+ html = res.text
+ return html
+
+ @classmethod
+ def _get_soup(cls, url=None, html=None, request_args=None):
+ if html:
+ html = normalize(unescape(html))
+ return BeautifulSoup(html, "lxml")
- return BeautifulSoup(html, 'lxml')
+ html = cls._fetch_html(url, request_args)
+ html = normalize(unescape(html))
+
+ return BeautifulSoup(html, "lxml")
@staticmethod
def _get_valid_attrs(item):
- key_attrs = {'class', 'style'}
+ key_attrs = {"class", "style"}
attrs = {
- k: v if v != [] else '' for k, v in item.attrs.items() if k in key_attrs
+ k: v if v != [] else "" for k, v in item.attrs.items() if k in key_attrs
}
for attr in key_attrs:
if attr not in attrs:
- attrs[attr] = ''
+ attrs[attr] = ""
return attrs
@staticmethod
- def _child_has_text(child, text, url):
+ def _child_has_text(child, text, url, text_fuzz_ratio):
child_text = child.getText().strip()
- if text == child_text:
+ if text_match(text, child_text, text_fuzz_ratio):
parent_text = child.parent.getText().strip()
- if child_text == parent_text:
+ if child_text == parent_text and child.parent.parent:
return False
child.wanted_attr = None
return True
+ if text_match(text, get_non_rec_text(child), text_fuzz_ratio):
+ child.is_non_rec_text = True
+ child.wanted_attr = None
+ return True
+
for key, value in child.attrs.items():
if not isinstance(value, str):
continue
value = value.strip()
- if text == value:
+ if text_match(text, value, text_fuzz_ratio):
child.wanted_attr = key
return True
- if key in {'href', 'src'}:
+ if key in {"href", "src"}:
full_url = urljoin(url, value)
- if text == full_url:
+ if text_match(text, full_url, text_fuzz_ratio):
child.wanted_attr = key
child.is_full_url = True
return True
return False
- def _get_children(self, soup, text, url):
- text = text.strip()
+ def _get_children(self, soup, text, url, text_fuzz_ratio):
children = reversed(soup.findChildren())
- children = [x for x in children if self._child_has_text(x, text, url)]
+ children = [
+ x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio)
+ ]
return children
- def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request_args=None, update=False):
+ def build(
+ self,
+ url=None,
+ wanted_list=None,
+ wanted_dict=None,
+ html=None,
+ request_args=None,
+ update=False,
+ text_fuzz_ratio=1.0,
+ ):
"""
Automatically constructs a set of rules to scrape the specified target[s] from a web page.
The rules are represented as stack_list.
@@ -164,13 +193,14 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
url: str, optional
URL of the target web page. You should either pass url or html or both.
- wanted_list: list, optional
+ wanted_list: list of strings or compiled regular expressions, optional
A list of needed contents to be scraped.
AutoScraper learns a set of rules to scrape these targets. If specified,
wanted_dict will be ignored.
-
+
wanted_dict: dict, optional
- A dict of needed contents to be scraped. Keys are aliases and values are list of target texts.
+ A dict of needed contents to be scraped. Keys are aliases and values are list of target texts
+ or compiled regular expressions.
AutoScraper learns a set of rules to scrape these targets and sets its aliases.
html: str, optional
@@ -185,11 +215,17 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
If True, new learned rules will be added to the previous ones.
If False, all previously learned rules will be removed.
+ text_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0
+ The fuzziness ratio threshold for matching the wanted contents.
+
Returns:
--------
- None
+ List of similar results
"""
+ if not wanted_list and not (wanted_dict and any(wanted_dict.values())):
+ raise ValueError("No targets were supplied")
+
soup = self._get_soup(url=url, html=html, request_args=request_args)
result_list = []
@@ -198,31 +234,28 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
self.stack_list = []
if wanted_list:
- wanted_dict = {'': wanted_list}
+ wanted_dict = {"": wanted_list}
wanted_list = []
for alias, wanted_items in wanted_dict.items():
- wanted_items = [unicodedata.normalize("NFKD", w) for w in wanted_items]
+ wanted_items = [normalize(w) for w in wanted_items]
wanted_list += wanted_items
for wanted in wanted_items:
- children = self._get_children(soup, wanted, url)
+ children = self._get_children(soup, wanted, url, text_fuzz_ratio)
for child in children:
result, stack = self._get_result_for_child(child, soup, url)
- stack['alias'] = alias
+ stack["alias"] = alias
result_list += result
self.stack_list.append(stack)
result_list = [item.text for item in result_list]
result_list = unique_hashable(result_list)
- if all(w in result_list for w in wanted_list):
- self.stack_list = unique_stack_list(self.stack_list)
- return result_list
-
- return None
+ self.stack_list = unique_stack_list(self.stack_list)
+ return result_list
@classmethod
def _build_stack(cls, child, url):
@@ -234,25 +267,33 @@ def _build_stack(cls, child, url):
if not grand_parent:
break
- children = grand_parent.findAll(parent.name, cls._get_valid_attrs(parent),
- recursive=False)
+ children = grand_parent.findAll(
+ parent.name, cls._get_valid_attrs(parent), recursive=False
+ )
for i, c in enumerate(children):
if c == parent:
content.insert(
- 0, (grand_parent.name, cls._get_valid_attrs(grand_parent), i))
+ 0, (grand_parent.name, cls._get_valid_attrs(grand_parent), i)
+ )
break
- if grand_parent.name == 'html':
+ if not grand_parent.parent:
break
parent = grand_parent
- wanted_attr = getattr(child, 'wanted_attr', None)
- is_full_url = getattr(child, 'is_full_url', False)
- stack = dict(content=content, wanted_attr=wanted_attr, is_full_url=is_full_url)
- stack['url'] = url if is_full_url else ''
- stack['hash'] = hashlib.sha256(str(stack).encode('utf-8')).hexdigest()
- stack['stack_id'] = 'rule_' + get_random_str(4)
+ wanted_attr = getattr(child, "wanted_attr", None)
+ is_full_url = getattr(child, "is_full_url", False)
+ is_non_rec_text = getattr(child, "is_non_rec_text", False)
+ stack = dict(
+ content=content,
+ wanted_attr=wanted_attr,
+ is_full_url=is_full_url,
+ is_non_rec_text=is_non_rec_text,
+ )
+ stack["url"] = url if is_full_url else ""
+ stack["hash"] = hashlib.sha256(str(stack).encode("utf-8")).hexdigest()
+ stack["stack_id"] = "rule_" + stack["hash"][:8]
return stack
def _get_result_for_child(self, child, soup, url):
@@ -261,8 +302,10 @@ def _get_result_for_child(self, child, soup, url):
return result, stack
@staticmethod
- def _fetch_result_from_child(child, wanted_attr, is_full_url, url):
+ def _fetch_result_from_child(child, wanted_attr, is_full_url, url, is_non_rec_text):
if wanted_attr is None:
+ if is_non_rec_text:
+ return get_non_rec_text(child)
return child.getText().strip()
if wanted_attr not in child.attrs:
@@ -286,10 +329,12 @@ def _get_fuzzy_attrs(attrs, attr_fuzz_ratio):
def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs):
parents = [soup]
- stack_content = stack['content']
- contain_sibling_leaves = kwargs.get('contain_sibling_leaves', False)
+ stack_content = stack["content"]
+ contain_sibling_leaves = kwargs.get("contain_sibling_leaves", False)
for index, item in enumerate(stack_content):
children = []
+ if item[0] == "[document]":
+ continue
for parent in parents:
attrs = item[1]
@@ -308,17 +353,30 @@ def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs):
parents = children
- wanted_attr = stack['wanted_attr']
- is_full_url = stack['is_full_url']
- result = [ResultItem(self._fetch_result_from_child(i, wanted_attr, is_full_url, url),
- getattr(i, 'child_index', 0)) for i in parents]
- result = [x for x in result if x.text]
+ wanted_attr = stack["wanted_attr"]
+ is_full_url = stack["is_full_url"]
+ is_non_rec_text = stack.get("is_non_rec_text", False)
+ result = [
+ ResultItem(
+ self._fetch_result_from_child(
+ i, wanted_attr, is_full_url, url, is_non_rec_text
+ ),
+ getattr(i, "child_index", 0),
+ )
+ for i in parents
+ ]
+ if not kwargs.get("keep_blank", False):
+ result = [x for x in result if x.text]
return result
- def _get_result_with_stack_index_based(self, stack, soup, url, attr_fuzz_ratio, **kwargs):
+ def _get_result_with_stack_index_based(
+ self, stack, soup, url, attr_fuzz_ratio, **kwargs
+ ):
p = soup.findChildren(recursive=False)[0]
- stack_content = stack['content']
+ stack_content = stack["content"]
for index, item in enumerate(stack_content[:-1]):
+ if item[0] == "[document]":
+ continue
content = stack_content[index + 1]
attrs = content[1]
if attr_fuzz_ratio < 1.0:
@@ -329,27 +387,49 @@ def _get_result_with_stack_index_based(self, stack, soup, url, attr_fuzz_ratio,
idx = min(len(p) - 1, item[2])
p = p[idx]
- result = [ResultItem(self._fetch_result_from_child(
- p, stack['wanted_attr'], stack['is_full_url'], url), getattr(p, 'child_index', 0))]
- result = [x for x in result if x.text]
+ result = [
+ ResultItem(
+ self._fetch_result_from_child(
+ p,
+ stack["wanted_attr"],
+ stack["is_full_url"],
+ url,
+ stack["is_non_rec_text"],
+ ),
+ getattr(p, "child_index", 0),
+ )
+ ]
+ if not kwargs.get("keep_blank", False):
+ result = [x for x in result if x.text]
return result
- def _get_result_by_func(self, func, url, html, soup, request_args, grouped,
- group_by_alias, unique, attr_fuzz_ratio, **kwargs):
+ def _get_result_by_func(
+ self,
+ func,
+ url,
+ html,
+ soup,
+ request_args,
+ grouped,
+ group_by_alias,
+ unique,
+ attr_fuzz_ratio,
+ **kwargs
+ ):
if not soup:
soup = self._get_soup(url=url, html=html, request_args=request_args)
- keep_order = kwargs.get('keep_order', False)
+ keep_order = kwargs.get("keep_order", False)
if group_by_alias or (keep_order and not grouped):
for index, child in enumerate(soup.findChildren()):
- setattr(child, 'child_index', index)
+ setattr(child, "child_index", index)
result_list = []
grouped_result = defaultdict(list)
for stack in self.stack_list:
if not url:
- url = stack.get('url', '')
+ url = stack.get("url", "")
result = func(stack, soup, url, attr_fuzz_ratio, **kwargs)
@@ -357,14 +437,17 @@ def _get_result_by_func(self, func, url, html, soup, request_args, grouped,
result_list += result
continue
- group_id = stack.get('alias', '') if group_by_alias else stack['stack_id']
+ group_id = stack.get("alias", "") if group_by_alias else stack["stack_id"]
grouped_result[group_id] += result
- return self._clean_result(result_list, grouped_result, grouped, group_by_alias,
- unique, keep_order)
+ return self._clean_result(
+ result_list, grouped_result, grouped, group_by_alias, unique, keep_order
+ )
@staticmethod
- def _clean_result(result_list, grouped_result, grouped, grouped_by_alias, unique, keep_order):
+ def _clean_result(
+ result_list, grouped_result, grouped, grouped_by_alias, unique, keep_order
+ ):
if not grouped and not grouped_by_alias:
if unique is None:
unique = True
@@ -385,9 +468,20 @@ def _clean_result(result_list, grouped_result, grouped, grouped_by_alias, unique
return dict(grouped_result)
- def get_result_similar(self, url=None, html=None, soup=None, request_args=None,
- grouped=False, group_by_alias=False, unique=None, attr_fuzz_ratio=1.0,
- keep_order=False, contain_sibling_leaves=False):
+ def get_result_similar(
+ self,
+ url=None,
+ html=None,
+ soup=None,
+ request_args=None,
+ grouped=False,
+ group_by_alias=False,
+ unique=None,
+ attr_fuzz_ratio=1.0,
+ keep_blank=False,
+ keep_order=False,
+ contain_sibling_leaves=False,
+ ):
"""
Gets similar results based on the previously learned rules.
@@ -419,6 +513,9 @@ def get_result_similar(self, url=None, html=None, soup=None, request_args=None,
attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0
The fuzziness ratio threshold for matching html tag attributes.
+ keep_blank: bool, optional, defaults to False
+ If set to True, missing values will be returned as empty strings.
+
keep_order: bool, optional, defaults to False
If set to True, the results will be ordered as they are present on the web page.
@@ -432,13 +529,33 @@ def get_result_similar(self, url=None, html=None, soup=None, request_args=None,
"""
func = self._get_result_with_stack
- return self._get_result_by_func(func, url, html, soup, request_args, grouped,
- group_by_alias, unique, attr_fuzz_ratio,
- keep_order=keep_order,
- contain_sibling_leaves=contain_sibling_leaves)
-
- def get_result_exact(self, url=None, html=None, soup=None, request_args=None,
- grouped=False, group_by_alias=False, unique=None, attr_fuzz_ratio=1.0):
+ return self._get_result_by_func(
+ func,
+ url,
+ html,
+ soup,
+ request_args,
+ grouped,
+ group_by_alias,
+ unique,
+ attr_fuzz_ratio,
+ keep_blank=keep_blank,
+ keep_order=keep_order,
+ contain_sibling_leaves=contain_sibling_leaves,
+ )
+
+ def get_result_exact(
+ self,
+ url=None,
+ html=None,
+ soup=None,
+ request_args=None,
+ grouped=False,
+ group_by_alias=False,
+ unique=None,
+ attr_fuzz_ratio=1.0,
+ keep_blank=False,
+ ):
"""
Gets exact results based on the previously learned rules.
@@ -470,6 +587,9 @@ def get_result_exact(self, url=None, html=None, soup=None, request_args=None,
attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0
The fuzziness ratio threshold for matching html tag attributes.
+ keep_blank: bool, optional, defaults to False
+ If set to True, missing values will be returned as empty strings.
+
Returns:
--------
List of exact results scraped from the web page.
@@ -477,11 +597,29 @@ def get_result_exact(self, url=None, html=None, soup=None, request_args=None,
"""
func = self._get_result_with_stack_index_based
- return self._get_result_by_func(func, url, html, soup, request_args, grouped,
- group_by_alias, unique, attr_fuzz_ratio)
-
- def get_result(self, url=None, html=None, request_args=None, grouped=False,
- group_by_alias=False, unique=None, attr_fuzz_ratio=1.0):
+ return self._get_result_by_func(
+ func,
+ url,
+ html,
+ soup,
+ request_args,
+ grouped,
+ group_by_alias,
+ unique,
+ attr_fuzz_ratio,
+ keep_blank=keep_blank,
+ )
+
+ def get_result(
+ self,
+ url=None,
+ html=None,
+ request_args=None,
+ grouped=False,
+ group_by_alias=False,
+ unique=None,
+ attr_fuzz_ratio=1.0,
+ ):
"""
Gets similar and exact results based on the previously learned rules.
@@ -520,8 +658,14 @@ def get_result(self, url=None, html=None, request_args=None, grouped=False,
"""
soup = self._get_soup(url=url, html=html, request_args=request_args)
- args = dict(url=url, soup=soup, grouped=grouped, group_by_alias=group_by_alias,
- unique=unique, attr_fuzz_ratio=attr_fuzz_ratio)
+ args = dict(
+ url=url,
+ soup=soup,
+ grouped=grouped,
+ group_by_alias=group_by_alias,
+ unique=unique,
+ attr_fuzz_ratio=attr_fuzz_ratio,
+ )
similar = self.get_result_similar(**args)
exact = self.get_result_exact(**args)
return similar, exact
@@ -540,7 +684,7 @@ def remove_rules(self, rules):
None
"""
- self.stack_list = [x for x in self.stack_list if x['stack_id'] not in rules]
+ self.stack_list = [x for x in self.stack_list if x["stack_id"] not in rules]
def keep_rules(self, rules):
"""
@@ -556,7 +700,7 @@ def keep_rules(self, rules):
None
"""
- self.stack_list = [x for x in self.stack_list if x['stack_id'] in rules]
+ self.stack_list = [x for x in self.stack_list if x["stack_id"] in rules]
def set_rule_aliases(self, rule_aliases):
"""
@@ -572,10 +716,10 @@ def set_rule_aliases(self, rule_aliases):
None
"""
- id_to_stack = {stack['stack_id']: stack for stack in self.stack_list}
+ id_to_stack = {stack["stack_id"]: stack for stack in self.stack_list}
for rule_id, alias in rule_aliases.items():
- id_to_stack[rule_id]['alias'] = alias
+ id_to_stack[rule_id]["alias"] = alias
def generate_python_code(self):
# deprecated
- print('This function is deprecated. Please use save() and load() instead.')
+ print("This function is deprecated. Please use save() and load() instead.")
diff --git a/autoscraper/utils.py b/autoscraper/utils.py
index 39503e3..15641f6 100644
--- a/autoscraper/utils.py
+++ b/autoscraper/utils.py
@@ -1,12 +1,8 @@
from collections import OrderedDict
-import random
-import string
-import warnings
+import unicodedata
-with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- from fuzzywuzzy import fuzz
+from difflib import SequenceMatcher
def unique_stack_list(stack_list):
@@ -26,9 +22,22 @@ def unique_hashable(hashable_items):
return list(OrderedDict.fromkeys(hashable_items))
-def get_random_str(n):
- chars = string.ascii_lowercase + string.digits
- return ''.join(random.choice(chars) for i in range(n))
+def get_non_rec_text(element):
+ return ''.join(element.find_all(text=True, recursive=False)).strip()
+
+
+def normalize(item):
+ if not isinstance(item, str):
+ return item
+ return unicodedata.normalize("NFKD", item.strip())
+
+
+def text_match(t1, t2, ratio_limit):
+ if hasattr(t1, 'fullmatch'):
+ return bool(t1.fullmatch(t2))
+ if ratio_limit >= 1:
+ return t1 == t2
+ return SequenceMatcher(None, t1, t2).ratio() >= ratio_limit
class ResultItem():
@@ -47,4 +56,4 @@ def __init__(self, text, ratio_limit):
self.match = None
def search(self, text):
- return fuzz.ratio(self.text, text)/100. >= self.ratio_limit
+ return SequenceMatcher(None, self.text, text).ratio() >= self.ratio_limit
diff --git a/setup.py b/setup.py
index c0e902a..ae1972e 100644
--- a/setup.py
+++ b/setup.py
@@ -1,39 +1,30 @@
-from setuptools import setup, find_packages
from codecs import open
from os import path
+from setuptools import find_packages, setup
+
here = path.abspath(path.dirname(__file__))
-with open(path.join(here, 'README.md'), encoding='utf-8') as f:
+with open(path.join(here, "README.md"), encoding="utf-8") as f:
long_description = f.read()
setup(
- name='autoscraper',
-
- version='1.1.7',
-
- description='A Smart, Automatic, Fast and Lightweight Web Scraper for Python',
+ name="autoscraper",
+ version="1.1.14",
+ description="A Smart, Automatic, Fast and Lightweight Web Scraper for Python",
long_description_content_type="text/markdown",
long_description=long_description,
-
- url='https://github.com/alirezamika/autoscraper',
-
- author='Alireza Mika',
- author_email='alirezamika@gmail.com',
-
- license='MIT',
-
+ url="https://github.com/alirezamika/autoscraper",
+ author="Alireza Mika",
+ author_email="alirezamika@gmail.com",
+ license="MIT",
classifiers=[
- 'Development Status :: 4 - Beta',
- 'License :: OSI Approved :: MIT License',
- 'Programming Language :: Python :: 3',
+ "Development Status :: 4 - Beta",
+ "License :: OSI Approved :: MIT License",
+ "Programming Language :: Python :: 3",
],
-
- keywords='scraping - scraper',
-
- packages=find_packages(exclude=['contrib', 'docs', 'tests']),
-
- python_requires='>=3.6',
- install_requires=['requests', 'bs4', 'lxml', 'fuzzywuzzy'],
-
+ keywords="scraping - scraper",
+ packages=find_packages(exclude=["contrib", "docs", "tests"]),
+ python_requires=">=3.6",
+ install_requires=["requests", "bs4", "lxml"],
)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..f6e3aaf
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,107 @@
+import sys
+from types import ModuleType
+from html.parser import HTMLParser
+
+class _Node:
+ def __init__(self, name, attrs, parent=None):
+ self.name = name
+ self.attrs = dict(attrs)
+ self.parent = parent
+ self.children = []
+ self.text = ""
+
+ def append_child(self, child):
+ self.children.append(child)
+ child.parent = self
+
+ def getText(self):
+ return self.text + "".join(c.getText() for c in self.children)
+
+ def findChildren(self, recursive=True):
+ result = []
+ for child in self.children:
+ result.append(child)
+ if recursive:
+ result.extend(child.findChildren(recursive))
+ return result
+
+ def findParent(self):
+ return self.parent
+
+ def _attr_match(self, child, attrs):
+ from autoscraper.utils import FuzzyText
+
+ for key, val in (attrs or {}).items():
+ actual = child.attrs.get(key, "")
+ if isinstance(actual, list):
+ actual = " ".join(actual)
+
+ if isinstance(val, FuzzyText):
+ if not val.search(actual):
+ return False
+ elif actual != val:
+ return False
+ return True
+
+ def findAll(self, name=None, attrs=None, recursive=True):
+ result = []
+ for child in self.children:
+ if (name is None or child.name == name) and self._attr_match(child, attrs):
+ result.append(child)
+ if recursive:
+ result.extend(child.findAll(name, attrs, recursive))
+ return result
+
+ def find_all(self, name=None, attrs=None, text=None, recursive=True):
+ if text:
+ res = []
+ if self.text.strip():
+ res.append(self.text)
+ for child in self.children:
+ if recursive:
+ res.extend(child.find_all(text=True, recursive=True))
+ elif child.text.strip():
+ res.append(child.text)
+ return res
+ return self.findAll(name, attrs, recursive)
+
+class _Parser(HTMLParser):
+ def __init__(self):
+ super().__init__()
+ self.root = _Node("[document]", {})
+ self.current = self.root
+
+ def handle_starttag(self, tag, attrs):
+ node = _Node(tag, attrs)
+ self.current.append_child(node)
+ self.current = node
+
+ def handle_endtag(self, tag):
+ if self.current.parent:
+ self.current = self.current.parent
+
+ def handle_data(self, data):
+ self.current.text += data
+
+class BeautifulSoup(_Node):
+ def __init__(self, html, parser):
+ p = _Parser()
+ p.feed(html)
+ super().__init__(p.root.name, p.root.attrs)
+ self.children = p.root.children
+ for c in self.children:
+ c.parent = self
+
+bs4_mod = ModuleType("bs4")
+bs4_mod.BeautifulSoup = BeautifulSoup
+sys.modules.setdefault("bs4", bs4_mod)
+
+class _Response:
+ def __init__(self, text=""):
+ self.encoding = "utf-8"
+ self.headers = {"Content-Type": "text/html"}
+ self.text = text
+
+requests_mod = ModuleType("requests")
+requests_mod.get = lambda url, headers=None, **kw: _Response()
+sys.modules.setdefault("requests", requests_mod)
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/integration/test_complex_features.py b/tests/integration/test_complex_features.py
new file mode 100644
index 0000000..57514be
--- /dev/null
+++ b/tests/integration/test_complex_features.py
@@ -0,0 +1,77 @@
+import pytest
+import re
+from autoscraper import AutoScraper
+
+HTML_COMPLEX = """
+
$1
+$2
+