From e53a496a70406e2eeb8d8ececd5f21db922955b3 Mon Sep 17 00:00:00 2001 From: alireza Date: Mon, 26 Oct 2020 12:34:40 +0330 Subject: [PATCH 01/30] Match non recursive element texts --- autoscraper/auto_scraper.py | 22 +++++++++++++++++----- autoscraper/utils.py | 4 ++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 63d3229..946a0d9 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup from autoscraper.utils import get_random_str, unique_hashable, unique_stack_list, \ - ResultItem, FuzzyText + ResultItem, FuzzyText, get_non_rec_text class AutoScraper(object): @@ -130,6 +130,11 @@ def _child_has_text(child, text, url): child.wanted_attr = None return True + if text == get_non_rec_text(child): + child.is_non_rec_text = True + child.wanted_attr = None + return True + for key, value in child.attrs.items(): if not isinstance(value, str): continue @@ -249,7 +254,9 @@ def _build_stack(cls, child, url): wanted_attr = getattr(child, 'wanted_attr', None) is_full_url = getattr(child, 'is_full_url', False) - stack = dict(content=content, wanted_attr=wanted_attr, is_full_url=is_full_url) + is_non_rec_text = getattr(child, 'is_non_rec_text', False) + stack = dict(content=content, wanted_attr=wanted_attr, is_full_url=is_full_url, + is_non_rec_text=is_non_rec_text) stack['url'] = url if is_full_url else '' stack['hash'] = hashlib.sha256(str(stack).encode('utf-8')).hexdigest() stack['stack_id'] = 'rule_' + get_random_str(4) @@ -261,8 +268,10 @@ def _get_result_for_child(self, child, soup, url): return result, stack @staticmethod - def _fetch_result_from_child(child, wanted_attr, is_full_url, url): + def _fetch_result_from_child(child, wanted_attr, is_full_url, url, is_non_rec_text): if wanted_attr is None: + if is_non_rec_text: + return get_non_rec_text(child) return child.getText().strip() if wanted_attr not in child.attrs: @@ -310,7 +319,9 @@ def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs): wanted_attr = stack['wanted_attr'] is_full_url = stack['is_full_url'] - result = [ResultItem(self._fetch_result_from_child(i, wanted_attr, is_full_url, url), + is_non_rec_text = stack['is_non_rec_text'] + result = [ResultItem(self._fetch_result_from_child(i, wanted_attr, + is_full_url, url, is_non_rec_text), getattr(i, 'child_index', 0)) for i in parents] result = [x for x in result if x.text] return result @@ -330,7 +341,8 @@ def _get_result_with_stack_index_based(self, stack, soup, url, attr_fuzz_ratio, p = p[idx] result = [ResultItem(self._fetch_result_from_child( - p, stack['wanted_attr'], stack['is_full_url'], url), getattr(p, 'child_index', 0))] + p, stack['wanted_attr'], stack['is_full_url'], url, stack['is_non_rec_text']), + getattr(p, 'child_index', 0))] result = [x for x in result if x.text] return result diff --git a/autoscraper/utils.py b/autoscraper/utils.py index 39503e3..9054e63 100644 --- a/autoscraper/utils.py +++ b/autoscraper/utils.py @@ -31,6 +31,10 @@ def get_random_str(n): return ''.join(random.choice(chars) for i in range(n)) +def get_non_rec_text(element): + return ''.join(element.find_all(text=True, recursive=False)).strip() + + class ResultItem(): def __init__(self, text, index): self.text = text From dad82b7df844bb5568087d6f6d83d06933ca85ba Mon Sep 17 00:00:00 2001 From: Alireza Mika Date: Mon, 26 Oct 2020 12:36:03 +0330 Subject: [PATCH 02/30] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ba50d91..7f7f872 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ from autoscraper import AutoScraper url = 'https://github.com/alirezamika/autoscraper' -wanted_list = ['A Smart, Automatic, Fast and Lightweight Web Scraper for Python', '2.2k', 'https://github.com/alirezamika/autoscraper/issues'] +wanted_list = ['A Smart, Automatic, Fast and Lightweight Web Scraper for Python', '2.5k', 'https://github.com/alirezamika/autoscraper/issues'] scraper = AutoScraper() scraper.build(url, wanted_list) From 3f953ea36adaee07620a10c87d2a7d2d49dcaefd Mon Sep 17 00:00:00 2001 From: Alireza Mika Date: Mon, 26 Oct 2020 13:13:20 +0330 Subject: [PATCH 03/30] Update version to 1.1.8 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c0e902a..f52eabc 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='autoscraper', - version='1.1.7', + version='1.1.8', description='A Smart, Automatic, Fast and Lightweight Web Scraper for Python', long_description_content_type="text/markdown", From 93af23450c6aa8afa5812636107868f86c73edb8 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Fri, 30 Oct 2020 19:42:45 +0100 Subject: [PATCH 04/30] replace fuzzywuzzy with difflib fuzzywuzzy uses difflib internally anyways, so fuzzywuzzy is not required. --- autoscraper/utils.py | 7 ++----- setup.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/autoscraper/utils.py b/autoscraper/utils.py index 9054e63..f67a95c 100644 --- a/autoscraper/utils.py +++ b/autoscraper/utils.py @@ -2,11 +2,8 @@ import random import string -import warnings -with warnings.catch_warnings(): - warnings.simplefilter("ignore") - from fuzzywuzzy import fuzz +from difflib import SequenceMatcher def unique_stack_list(stack_list): @@ -51,4 +48,4 @@ def __init__(self, text, ratio_limit): self.match = None def search(self, text): - return fuzz.ratio(self.text, text)/100. >= self.ratio_limit + return SequenceMatcher(None, self.text, text).ratio() >= self.ratio_limit diff --git a/setup.py b/setup.py index f52eabc..39c6526 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,6 @@ packages=find_packages(exclude=['contrib', 'docs', 'tests']), python_requires='>=3.6', - install_requires=['requests', 'bs4', 'lxml', 'fuzzywuzzy'], + install_requires=['requests', 'bs4', 'lxml'], ) From 17ea7832157082a35bb082967160acd0db1a29d8 Mon Sep 17 00:00:00 2001 From: alireza Date: Thu, 5 Nov 2020 12:12:51 +0330 Subject: [PATCH 05/30] Resolve a backward compatibility issue --- autoscraper/auto_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 946a0d9..af17616 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -319,7 +319,7 @@ def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs): wanted_attr = stack['wanted_attr'] is_full_url = stack['is_full_url'] - is_non_rec_text = stack['is_non_rec_text'] + is_non_rec_text = stack.get('is_non_rec_text', False) result = [ResultItem(self._fetch_result_from_child(i, wanted_attr, is_full_url, url, is_non_rec_text), getattr(i, 'child_index', 0)) for i in parents] From 10153e896c11e3c6284ad405644891bc7253128b Mon Sep 17 00:00:00 2001 From: alireza Date: Thu, 5 Nov 2020 12:51:03 +0330 Subject: [PATCH 06/30] Add ability to set fuzziness ratio for matching the wanted items --- autoscraper/auto_scraper.py | 31 ++++++++++++++++--------------- autoscraper/utils.py | 6 ++++++ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index af17616..028ce8d 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup from autoscraper.utils import get_random_str, unique_hashable, unique_stack_list, \ - ResultItem, FuzzyText, get_non_rec_text + ResultItem, FuzzyText, get_non_rec_text, text_match class AutoScraper(object): @@ -119,10 +119,10 @@ def _get_valid_attrs(item): return attrs @staticmethod - def _child_has_text(child, text, url): + def _child_has_text(child, text, url, text_fuzz_ratio): child_text = child.getText().strip() - if text == child_text: + if text_match(text, child_text, text_fuzz_ratio): parent_text = child.parent.getText().strip() if child_text == parent_text: return False @@ -130,7 +130,7 @@ def _child_has_text(child, text, url): child.wanted_attr = None return True - if text == get_non_rec_text(child): + if text_match(text, get_non_rec_text(child), text_fuzz_ratio): child.is_non_rec_text = True child.wanted_attr = None return True @@ -140,7 +140,7 @@ def _child_has_text(child, text, url): continue value = value.strip() - if text == value: + if text_match(text, value, text_fuzz_ratio): child.wanted_attr = key return True @@ -153,13 +153,14 @@ def _child_has_text(child, text, url): return False - def _get_children(self, soup, text, url): + def _get_children(self, soup, text, url, text_fuzz_ratio): text = text.strip() children = reversed(soup.findChildren()) - children = [x for x in children if self._child_has_text(x, text, url)] + children = [x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio)] return children - def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request_args=None, update=False): + def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request_args=None, + update=False, text_fuzz_ratio=1.0): """ Automatically constructs a set of rules to scrape the specified target[s] from a web page. The rules are represented as stack_list. @@ -190,9 +191,12 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request If True, new learned rules will be added to the previous ones. If False, all previously learned rules will be removed. + text_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0 + The fuzziness ratio threshold for matching the wanted contents. + Returns: -------- - None + List of similar results """ soup = self._get_soup(url=url, html=html, request_args=request_args) @@ -212,7 +216,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request wanted_list += wanted_items for wanted in wanted_items: - children = self._get_children(soup, wanted, url) + children = self._get_children(soup, wanted, url, text_fuzz_ratio) for child in children: result, stack = self._get_result_for_child(child, soup, url) @@ -223,11 +227,8 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request result_list = [item.text for item in result_list] result_list = unique_hashable(result_list) - if all(w in result_list for w in wanted_list): - self.stack_list = unique_stack_list(self.stack_list) - return result_list - - return None + self.stack_list = unique_stack_list(self.stack_list) + return result_list @classmethod def _build_stack(cls, child, url): diff --git a/autoscraper/utils.py b/autoscraper/utils.py index f67a95c..8c7c028 100644 --- a/autoscraper/utils.py +++ b/autoscraper/utils.py @@ -32,6 +32,12 @@ def get_non_rec_text(element): return ''.join(element.find_all(text=True, recursive=False)).strip() +def text_match(t1, t2, ratio_limit): + if ratio_limit >= 1: + return t1 == t2 + return SequenceMatcher(None, t1, t2).ratio() >= ratio_limit + + class ResultItem(): def __init__(self, text, index): self.text = text From b34c72e8595f17c2e9bcbf3a0114d05af10b6590 Mon Sep 17 00:00:00 2001 From: Alireza Mika Date: Thu, 5 Nov 2020 12:52:41 +0330 Subject: [PATCH 07/30] Update version to 1.1.9 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 39c6526..8151af6 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='autoscraper', - version='1.1.8', + version='1.1.9', description='A Smart, Automatic, Fast and Lightweight Web Scraper for Python', long_description_content_type="text/markdown", From b2e3391e7debaf430c2647e7fcea68a883d57d39 Mon Sep 17 00:00:00 2001 From: Alireza Mika Date: Sat, 28 Nov 2020 00:41:53 +0330 Subject: [PATCH 08/30] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 7f7f872..69e97f9 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,8 @@ scraper.load('yahoo-finance') ## Issues Feel free to open an issue if you have any problem using the module. +[![Contact me on Codementor](https://www.codementor.io/m-badges/alirezamika/contact-me.svg)](https://www.codementor.io/@alirezamika?refer=badge) + ## Support the project Buy Me A Coffee From 5d5f39096238dee21aa46083638374ece58f128e Mon Sep 17 00:00:00 2001 From: alireza Date: Sun, 29 Nov 2020 16:31:28 +0330 Subject: [PATCH 09/30] Add support for regular expressions for wanted items --- autoscraper/auto_scraper.py | 15 +++++++-------- autoscraper/utils.py | 9 +++++++++ setup.py | 2 +- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 028ce8d..7bfcf68 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -1,6 +1,5 @@ import hashlib import json -import unicodedata from collections import defaultdict from html import unescape @@ -10,7 +9,7 @@ from bs4 import BeautifulSoup from autoscraper.utils import get_random_str, unique_hashable, unique_stack_list, \ - ResultItem, FuzzyText, get_non_rec_text, text_match + ResultItem, FuzzyText, get_non_rec_text, text_match, normalize class AutoScraper(object): @@ -92,7 +91,7 @@ def _get_soup(cls, url=None, html=None, request_args=None): request_args = request_args or {} if html: - html = unicodedata.normalize("NFKD", unescape(html)) + html = normalize(unescape(html)) return BeautifulSoup(html, 'lxml') headers = dict(cls.request_headers) @@ -102,7 +101,7 @@ def _get_soup(cls, url=None, html=None, request_args=None): user_headers = request_args.pop('headers', {}) headers.update(user_headers) html = requests.get(url, headers=headers, **request_args).text - html = unicodedata.normalize("NFKD", unescape(html)) + html = normalize(unescape(html)) return BeautifulSoup(html, 'lxml') @@ -154,7 +153,6 @@ def _child_has_text(child, text, url, text_fuzz_ratio): return False def _get_children(self, soup, text, url, text_fuzz_ratio): - text = text.strip() children = reversed(soup.findChildren()) children = [x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio)] return children @@ -170,13 +168,14 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request url: str, optional URL of the target web page. You should either pass url or html or both. - wanted_list: list, optional + wanted_list: list of strings or compiled regular expressions, optional A list of needed contents to be scraped. AutoScraper learns a set of rules to scrape these targets. If specified, wanted_dict will be ignored. wanted_dict: dict, optional - A dict of needed contents to be scraped. Keys are aliases and values are list of target texts. + A dict of needed contents to be scraped. Keys are aliases and values are list of target texts + or compiled regular expressions. AutoScraper learns a set of rules to scrape these targets and sets its aliases. html: str, optional @@ -212,7 +211,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request wanted_list = [] for alias, wanted_items in wanted_dict.items(): - wanted_items = [unicodedata.normalize("NFKD", w) for w in wanted_items] + wanted_items = [normalize(w) for w in wanted_items] wanted_list += wanted_items for wanted in wanted_items: diff --git a/autoscraper/utils.py b/autoscraper/utils.py index 8c7c028..5193708 100644 --- a/autoscraper/utils.py +++ b/autoscraper/utils.py @@ -2,6 +2,7 @@ import random import string +import unicodedata from difflib import SequenceMatcher @@ -32,7 +33,15 @@ def get_non_rec_text(element): return ''.join(element.find_all(text=True, recursive=False)).strip() +def normalize(item): + if not isinstance(item, str): + return item + return unicodedata.normalize("NFKD", item.strip()) + + def text_match(t1, t2, ratio_limit): + if hasattr(t1, 'fullmatch'): + return bool(t1.fullmatch(t2)) if ratio_limit >= 1: return t1 == t2 return SequenceMatcher(None, t1, t2).ratio() >= ratio_limit diff --git a/setup.py b/setup.py index 8151af6..539d021 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='autoscraper', - version='1.1.9', + version='1.1.10', description='A Smart, Automatic, Fast and Lightweight Web Scraper for Python', long_description_content_type="text/markdown", From 64a190e8676401b27d5aa6bf28f7145c20e9a9fe Mon Sep 17 00:00:00 2001 From: Alireza Mika Date: Tue, 15 Dec 2020 12:33:02 +0330 Subject: [PATCH 10/30] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 69e97f9..0575820 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,6 @@ scraper.load('yahoo-finance') ## Issues Feel free to open an issue if you have any problem using the module. -[![Contact me on Codementor](https://www.codementor.io/m-badges/alirezamika/contact-me.svg)](https://www.codementor.io/@alirezamika?refer=badge) ## Support the project From 5b9c0a076b63d64347dc8deabc6afc41e9e93f74 Mon Sep 17 00:00:00 2001 From: alireza Date: Sun, 10 Jan 2021 16:58:57 +0330 Subject: [PATCH 11/30] apply fuzziness ratio on full url matching --- autoscraper/auto_scraper.py | 263 +++++++++++++++++++++++++----------- 1 file changed, 184 insertions(+), 79 deletions(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 7bfcf68..2940ae4 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -1,6 +1,5 @@ import hashlib import json - from collections import defaultdict from html import unescape from urllib.parse import urljoin, urlparse @@ -8,8 +7,16 @@ import requests from bs4 import BeautifulSoup -from autoscraper.utils import get_random_str, unique_hashable, unique_stack_list, \ - ResultItem, FuzzyText, get_non_rec_text, text_match, normalize +from autoscraper.utils import ( + FuzzyText, + ResultItem, + get_non_rec_text, + get_random_str, + normalize, + text_match, + unique_hashable, + unique_stack_list, +) class AutoScraper(object): @@ -37,8 +44,8 @@ class AutoScraper(object): """ request_headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 \ - (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36' + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 \ + (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36" } def __init__(self, stack_list=None): @@ -59,7 +66,7 @@ def save(self, file_path): """ data = dict(stack_list=self.stack_list) - with open(file_path, 'w') as f: + with open(file_path, "w") as f: json.dump(data, f) def load(self, file_path): @@ -76,7 +83,7 @@ def load(self, file_path): None """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: data = json.load(f) # for backward compatibility @@ -84,7 +91,7 @@ def load(self, file_path): self.stack_list = data return - self.stack_list = data['stack_list'] + self.stack_list = data["stack_list"] @classmethod def _get_soup(cls, url=None, html=None, request_args=None): @@ -92,29 +99,29 @@ def _get_soup(cls, url=None, html=None, request_args=None): if html: html = normalize(unescape(html)) - return BeautifulSoup(html, 'lxml') + return BeautifulSoup(html, "lxml") headers = dict(cls.request_headers) if url: - headers['Host'] = urlparse(url).netloc + headers["Host"] = urlparse(url).netloc - user_headers = request_args.pop('headers', {}) + user_headers = request_args.pop("headers", {}) headers.update(user_headers) html = requests.get(url, headers=headers, **request_args).text html = normalize(unescape(html)) - return BeautifulSoup(html, 'lxml') + return BeautifulSoup(html, "lxml") @staticmethod def _get_valid_attrs(item): - key_attrs = {'class', 'style'} + key_attrs = {"class", "style"} attrs = { - k: v if v != [] else '' for k, v in item.attrs.items() if k in key_attrs + k: v if v != [] else "" for k, v in item.attrs.items() if k in key_attrs } for attr in key_attrs: if attr not in attrs: - attrs[attr] = '' + attrs[attr] = "" return attrs @staticmethod @@ -143,9 +150,9 @@ def _child_has_text(child, text, url, text_fuzz_ratio): child.wanted_attr = key return True - if key in {'href', 'src'}: + if key in {"href", "src"}: full_url = urljoin(url, value) - if text == full_url: + if text_match(text, full_url, text_fuzz_ratio): child.wanted_attr = key child.is_full_url = True return True @@ -154,11 +161,21 @@ def _child_has_text(child, text, url, text_fuzz_ratio): def _get_children(self, soup, text, url, text_fuzz_ratio): children = reversed(soup.findChildren()) - children = [x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio)] + children = [ + x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio) + ] return children - def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request_args=None, - update=False, text_fuzz_ratio=1.0): + def build( + self, + url=None, + wanted_list=None, + wanted_dict=None, + html=None, + request_args=None, + update=False, + text_fuzz_ratio=1.0, + ): """ Automatically constructs a set of rules to scrape the specified target[s] from a web page. The rules are represented as stack_list. @@ -172,7 +189,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request A list of needed contents to be scraped. AutoScraper learns a set of rules to scrape these targets. If specified, wanted_dict will be ignored. - + wanted_dict: dict, optional A dict of needed contents to be scraped. Keys are aliases and values are list of target texts or compiled regular expressions. @@ -206,7 +223,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request self.stack_list = [] if wanted_list: - wanted_dict = {'': wanted_list} + wanted_dict = {"": wanted_list} wanted_list = [] @@ -219,7 +236,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request for child in children: result, stack = self._get_result_for_child(child, soup, url) - stack['alias'] = alias + stack["alias"] = alias result_list += result self.stack_list.append(stack) @@ -239,27 +256,33 @@ def _build_stack(cls, child, url): if not grand_parent: break - children = grand_parent.findAll(parent.name, cls._get_valid_attrs(parent), - recursive=False) + children = grand_parent.findAll( + parent.name, cls._get_valid_attrs(parent), recursive=False + ) for i, c in enumerate(children): if c == parent: content.insert( - 0, (grand_parent.name, cls._get_valid_attrs(grand_parent), i)) + 0, (grand_parent.name, cls._get_valid_attrs(grand_parent), i) + ) break - if grand_parent.name == 'html': + if grand_parent.name == "html": break parent = grand_parent - wanted_attr = getattr(child, 'wanted_attr', None) - is_full_url = getattr(child, 'is_full_url', False) - is_non_rec_text = getattr(child, 'is_non_rec_text', False) - stack = dict(content=content, wanted_attr=wanted_attr, is_full_url=is_full_url, - is_non_rec_text=is_non_rec_text) - stack['url'] = url if is_full_url else '' - stack['hash'] = hashlib.sha256(str(stack).encode('utf-8')).hexdigest() - stack['stack_id'] = 'rule_' + get_random_str(4) + wanted_attr = getattr(child, "wanted_attr", None) + is_full_url = getattr(child, "is_full_url", False) + is_non_rec_text = getattr(child, "is_non_rec_text", False) + stack = dict( + content=content, + wanted_attr=wanted_attr, + is_full_url=is_full_url, + is_non_rec_text=is_non_rec_text, + ) + stack["url"] = url if is_full_url else "" + stack["hash"] = hashlib.sha256(str(stack).encode("utf-8")).hexdigest() + stack["stack_id"] = "rule_" + get_random_str(4) return stack def _get_result_for_child(self, child, soup, url): @@ -295,8 +318,8 @@ def _get_fuzzy_attrs(attrs, attr_fuzz_ratio): def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs): parents = [soup] - stack_content = stack['content'] - contain_sibling_leaves = kwargs.get('contain_sibling_leaves', False) + stack_content = stack["content"] + contain_sibling_leaves = kwargs.get("contain_sibling_leaves", False) for index, item in enumerate(stack_content): children = [] for parent in parents: @@ -317,18 +340,26 @@ def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs): parents = children - wanted_attr = stack['wanted_attr'] - is_full_url = stack['is_full_url'] - is_non_rec_text = stack.get('is_non_rec_text', False) - result = [ResultItem(self._fetch_result_from_child(i, wanted_attr, - is_full_url, url, is_non_rec_text), - getattr(i, 'child_index', 0)) for i in parents] + wanted_attr = stack["wanted_attr"] + is_full_url = stack["is_full_url"] + is_non_rec_text = stack.get("is_non_rec_text", False) + result = [ + ResultItem( + self._fetch_result_from_child( + i, wanted_attr, is_full_url, url, is_non_rec_text + ), + getattr(i, "child_index", 0), + ) + for i in parents + ] result = [x for x in result if x.text] return result - def _get_result_with_stack_index_based(self, stack, soup, url, attr_fuzz_ratio, **kwargs): + def _get_result_with_stack_index_based( + self, stack, soup, url, attr_fuzz_ratio, **kwargs + ): p = soup.findChildren(recursive=False)[0] - stack_content = stack['content'] + stack_content = stack["content"] for index, item in enumerate(stack_content[:-1]): content = stack_content[index + 1] attrs = content[1] @@ -340,28 +371,48 @@ def _get_result_with_stack_index_based(self, stack, soup, url, attr_fuzz_ratio, idx = min(len(p) - 1, item[2]) p = p[idx] - result = [ResultItem(self._fetch_result_from_child( - p, stack['wanted_attr'], stack['is_full_url'], url, stack['is_non_rec_text']), - getattr(p, 'child_index', 0))] + result = [ + ResultItem( + self._fetch_result_from_child( + p, + stack["wanted_attr"], + stack["is_full_url"], + url, + stack["is_non_rec_text"], + ), + getattr(p, "child_index", 0), + ) + ] result = [x for x in result if x.text] return result - def _get_result_by_func(self, func, url, html, soup, request_args, grouped, - group_by_alias, unique, attr_fuzz_ratio, **kwargs): + def _get_result_by_func( + self, + func, + url, + html, + soup, + request_args, + grouped, + group_by_alias, + unique, + attr_fuzz_ratio, + **kwargs + ): if not soup: soup = self._get_soup(url=url, html=html, request_args=request_args) - keep_order = kwargs.get('keep_order', False) + keep_order = kwargs.get("keep_order", False) if group_by_alias or (keep_order and not grouped): for index, child in enumerate(soup.findChildren()): - setattr(child, 'child_index', index) + setattr(child, "child_index", index) result_list = [] grouped_result = defaultdict(list) for stack in self.stack_list: if not url: - url = stack.get('url', '') + url = stack.get("url", "") result = func(stack, soup, url, attr_fuzz_ratio, **kwargs) @@ -369,14 +420,17 @@ def _get_result_by_func(self, func, url, html, soup, request_args, grouped, result_list += result continue - group_id = stack.get('alias', '') if group_by_alias else stack['stack_id'] + group_id = stack.get("alias", "") if group_by_alias else stack["stack_id"] grouped_result[group_id] += result - return self._clean_result(result_list, grouped_result, grouped, group_by_alias, - unique, keep_order) + return self._clean_result( + result_list, grouped_result, grouped, group_by_alias, unique, keep_order + ) @staticmethod - def _clean_result(result_list, grouped_result, grouped, grouped_by_alias, unique, keep_order): + def _clean_result( + result_list, grouped_result, grouped, grouped_by_alias, unique, keep_order + ): if not grouped and not grouped_by_alias: if unique is None: unique = True @@ -397,9 +451,19 @@ def _clean_result(result_list, grouped_result, grouped, grouped_by_alias, unique return dict(grouped_result) - def get_result_similar(self, url=None, html=None, soup=None, request_args=None, - grouped=False, group_by_alias=False, unique=None, attr_fuzz_ratio=1.0, - keep_order=False, contain_sibling_leaves=False): + def get_result_similar( + self, + url=None, + html=None, + soup=None, + request_args=None, + grouped=False, + group_by_alias=False, + unique=None, + attr_fuzz_ratio=1.0, + keep_order=False, + contain_sibling_leaves=False, + ): """ Gets similar results based on the previously learned rules. @@ -444,13 +508,31 @@ def get_result_similar(self, url=None, html=None, soup=None, request_args=None, """ func = self._get_result_with_stack - return self._get_result_by_func(func, url, html, soup, request_args, grouped, - group_by_alias, unique, attr_fuzz_ratio, - keep_order=keep_order, - contain_sibling_leaves=contain_sibling_leaves) - - def get_result_exact(self, url=None, html=None, soup=None, request_args=None, - grouped=False, group_by_alias=False, unique=None, attr_fuzz_ratio=1.0): + return self._get_result_by_func( + func, + url, + html, + soup, + request_args, + grouped, + group_by_alias, + unique, + attr_fuzz_ratio, + keep_order=keep_order, + contain_sibling_leaves=contain_sibling_leaves, + ) + + def get_result_exact( + self, + url=None, + html=None, + soup=None, + request_args=None, + grouped=False, + group_by_alias=False, + unique=None, + attr_fuzz_ratio=1.0, + ): """ Gets exact results based on the previously learned rules. @@ -489,11 +571,28 @@ def get_result_exact(self, url=None, html=None, soup=None, request_args=None, """ func = self._get_result_with_stack_index_based - return self._get_result_by_func(func, url, html, soup, request_args, grouped, - group_by_alias, unique, attr_fuzz_ratio) - - def get_result(self, url=None, html=None, request_args=None, grouped=False, - group_by_alias=False, unique=None, attr_fuzz_ratio=1.0): + return self._get_result_by_func( + func, + url, + html, + soup, + request_args, + grouped, + group_by_alias, + unique, + attr_fuzz_ratio, + ) + + def get_result( + self, + url=None, + html=None, + request_args=None, + grouped=False, + group_by_alias=False, + unique=None, + attr_fuzz_ratio=1.0, + ): """ Gets similar and exact results based on the previously learned rules. @@ -532,8 +631,14 @@ def get_result(self, url=None, html=None, request_args=None, grouped=False, """ soup = self._get_soup(url=url, html=html, request_args=request_args) - args = dict(url=url, soup=soup, grouped=grouped, group_by_alias=group_by_alias, - unique=unique, attr_fuzz_ratio=attr_fuzz_ratio) + args = dict( + url=url, + soup=soup, + grouped=grouped, + group_by_alias=group_by_alias, + unique=unique, + attr_fuzz_ratio=attr_fuzz_ratio, + ) similar = self.get_result_similar(**args) exact = self.get_result_exact(**args) return similar, exact @@ -552,7 +657,7 @@ def remove_rules(self, rules): None """ - self.stack_list = [x for x in self.stack_list if x['stack_id'] not in rules] + self.stack_list = [x for x in self.stack_list if x["stack_id"] not in rules] def keep_rules(self, rules): """ @@ -568,7 +673,7 @@ def keep_rules(self, rules): None """ - self.stack_list = [x for x in self.stack_list if x['stack_id'] in rules] + self.stack_list = [x for x in self.stack_list if x["stack_id"] in rules] def set_rule_aliases(self, rule_aliases): """ @@ -584,10 +689,10 @@ def set_rule_aliases(self, rule_aliases): None """ - id_to_stack = {stack['stack_id']: stack for stack in self.stack_list} + id_to_stack = {stack["stack_id"]: stack for stack in self.stack_list} for rule_id, alias in rule_aliases.items(): - id_to_stack[rule_id]['alias'] = alias + id_to_stack[rule_id]["alias"] = alias def generate_python_code(self): # deprecated - print('This function is deprecated. Please use save() and load() instead.') + print("This function is deprecated. Please use save() and load() instead.") From 3a004736815478aeafa0134c6c748b17da7b3335 Mon Sep 17 00:00:00 2001 From: Alireza Mika Date: Sun, 10 Jan 2021 16:59:41 +0330 Subject: [PATCH 12/30] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 539d021..74d69b6 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='autoscraper', - version='1.1.10', + version='1.1.11', description='A Smart, Automatic, Fast and Lightweight Web Scraper for Python', long_description_content_type="text/markdown", From 0c5922c73e1d86f2714cd3ef9e559f4631a4a717 Mon Sep 17 00:00:00 2001 From: Alireza Mika Date: Mon, 11 Jan 2021 03:39:37 +0330 Subject: [PATCH 13/30] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0575820..4b4928d 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python' # We can add one or multiple candidates here. # You can also put urls here to retrieve urls. -wanted_list = ["How to call an external command?"] +wanted_list = ["What are metaclasses in Python?"] scraper = AutoScraper() result = scraper.build(url, wanted_list) From 48048ff646737a2cc3a81116ae4dd421f78d0e6f Mon Sep 17 00:00:00 2001 From: Mika Date: Tue, 12 Jan 2021 11:51:18 +0330 Subject: [PATCH 14/30] Create FUNDING.yml --- .github/FUNDING.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..7dbfac2 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,12 @@ +# These are supported funding model platforms + +github: [alirezamika] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: # Replace with a single Liberapay username +issuehunt: # Replace with a single IssueHunt username +otechie: # Replace with a single Otechie username +custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] From 0b64c8bf8b7ee06e8147d4058916ade1d523001e Mon Sep 17 00:00:00 2001 From: alireza Date: Mon, 18 Jan 2021 21:52:17 +0330 Subject: [PATCH 15/30] Fix fetching result from root element --- autoscraper/auto_scraper.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 2940ae4..ee2a278 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -94,13 +94,7 @@ def load(self, file_path): self.stack_list = data["stack_list"] @classmethod - def _get_soup(cls, url=None, html=None, request_args=None): - request_args = request_args or {} - - if html: - html = normalize(unescape(html)) - return BeautifulSoup(html, "lxml") - + def _fetch_html(cls, url, request_args=None): headers = dict(cls.request_headers) if url: headers["Host"] = urlparse(url).netloc @@ -108,6 +102,17 @@ def _get_soup(cls, url=None, html=None, request_args=None): user_headers = request_args.pop("headers", {}) headers.update(user_headers) html = requests.get(url, headers=headers, **request_args).text + return html + + @classmethod + def _get_soup(cls, url=None, html=None, request_args=None): + request_args = request_args or {} + + if html: + html = normalize(unescape(html)) + return BeautifulSoup(html, "lxml") + + html = cls._fetch_html(url, request_args) html = normalize(unescape(html)) return BeautifulSoup(html, "lxml") @@ -130,7 +135,7 @@ def _child_has_text(child, text, url, text_fuzz_ratio): if text_match(text, child_text, text_fuzz_ratio): parent_text = child.parent.getText().strip() - if child_text == parent_text: + if child_text == parent_text and child.parent.parent: return False child.wanted_attr = None @@ -266,7 +271,7 @@ def _build_stack(cls, child, url): ) break - if grand_parent.name == "html": + if not grand_parent.parent: break parent = grand_parent @@ -322,6 +327,8 @@ def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs): contain_sibling_leaves = kwargs.get("contain_sibling_leaves", False) for index, item in enumerate(stack_content): children = [] + if item[0] == "[document]": + continue for parent in parents: attrs = item[1] @@ -361,6 +368,8 @@ def _get_result_with_stack_index_based( p = soup.findChildren(recursive=False)[0] stack_content = stack["content"] for index, item in enumerate(stack_content[:-1]): + if item[0] == "[document]": + continue content = stack_content[index + 1] attrs = content[1] if attr_fuzz_ratio < 1.0: From a4ab3eda3daabe8bd1cd6ac104c5450b75797566 Mon Sep 17 00:00:00 2001 From: alireza Date: Mon, 18 Jan 2021 22:10:55 +0330 Subject: [PATCH 16/30] refactor fetch html method --- autoscraper/auto_scraper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index ee2a278..2486501 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -95,6 +95,7 @@ def load(self, file_path): @classmethod def _fetch_html(cls, url, request_args=None): + request_args = request_args or {} headers = dict(cls.request_headers) if url: headers["Host"] = urlparse(url).netloc @@ -106,8 +107,6 @@ def _fetch_html(cls, url, request_args=None): @classmethod def _get_soup(cls, url=None, html=None, request_args=None): - request_args = request_args or {} - if html: html = normalize(unescape(html)) return BeautifulSoup(html, "lxml") From 731f62534a6922d598cc2f99dab38043e390315c Mon Sep 17 00:00:00 2001 From: alireza Date: Sat, 23 Jan 2021 20:38:42 +0330 Subject: [PATCH 17/30] Fix requests encoding --- autoscraper/auto_scraper.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 2486501..53662f1 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -102,7 +102,12 @@ def _fetch_html(cls, url, request_args=None): user_headers = request_args.pop("headers", {}) headers.update(user_headers) - html = requests.get(url, headers=headers, **request_args).text + res = requests.get(url, headers=headers, **request_args) + if res.encoding == "ISO-8859-1" and not "ISO-8859-1" in res.headers.get( + "Content-Type", "" + ): + res.encoding = res.apparent_encoding + html = res.text return html @classmethod From 1193d13db26d0681a777b6f626903fa3d107a22b Mon Sep 17 00:00:00 2001 From: alireza Date: Sat, 23 Jan 2021 20:39:40 +0330 Subject: [PATCH 18/30] Update version to 1.1.12 --- setup.py | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index 74d69b6..d0458c6 100644 --- a/setup.py +++ b/setup.py @@ -1,39 +1,30 @@ -from setuptools import setup, find_packages from codecs import open from os import path +from setuptools import find_packages, setup + here = path.abspath(path.dirname(__file__)) -with open(path.join(here, 'README.md'), encoding='utf-8') as f: +with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() setup( - name='autoscraper', - - version='1.1.11', - - description='A Smart, Automatic, Fast and Lightweight Web Scraper for Python', + name="autoscraper", + version="1.1.12", + description="A Smart, Automatic, Fast and Lightweight Web Scraper for Python", long_description_content_type="text/markdown", long_description=long_description, - - url='https://github.com/alirezamika/autoscraper', - - author='Alireza Mika', - author_email='alirezamika@gmail.com', - - license='MIT', - + url="https://github.com/alirezamika/autoscraper", + author="Alireza Mika", + author_email="alirezamika@gmail.com", + license="MIT", classifiers=[ - 'Development Status :: 4 - Beta', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3', + "Development Status :: 4 - Beta", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", ], - - keywords='scraping - scraper', - - packages=find_packages(exclude=['contrib', 'docs', 'tests']), - - python_requires='>=3.6', - install_requires=['requests', 'bs4', 'lxml'], - + keywords="scraping - scraper", + packages=find_packages(exclude=["contrib", "docs", "tests"]), + python_requires=">=3.6", + install_requires=["requests", "bs4", "lxml"], ) From d33dbf0ec846abf492dfc286294cb85667840802 Mon Sep 17 00:00:00 2001 From: Mika Date: Thu, 28 Jan 2021 20:09:22 +0330 Subject: [PATCH 19/30] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 4b4928d..bb84822 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ This project is made for automatic web scraping to make scraping easy. It gets a url or the html content of a web page and a list of sample data which we want to scrape from that page. **This data can be text, url or any html tag value of that page.** It learns the scraping rules and returns the similar elements. Then you can use this learned object with new urls to get similar content or the exact same element of those new pages. +[![Contact me on Codementor](https://www.codementor.io/m-badges/alirezamika/contact-me.svg)](https://www.codementor.io/@alirezamika?refer=badge) + ## Installation It's compatible with python 3. From 973ba6abed840d16907a556bc0192e2bf4806c6d Mon Sep 17 00:00:00 2001 From: Mika Date: Thu, 4 Feb 2021 00:42:16 +0330 Subject: [PATCH 20/30] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index bb84822..e1a3ca4 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,6 @@ This project is made for automatic web scraping to make scraping easy. It gets a url or the html content of a web page and a list of sample data which we want to scrape from that page. **This data can be text, url or any html tag value of that page.** It learns the scraping rules and returns the similar elements. Then you can use this learned object with new urls to get similar content or the exact same element of those new pages. -[![Contact me on Codementor](https://www.codementor.io/m-badges/alirezamika/contact-me.svg)](https://www.codementor.io/@alirezamika?refer=badge) ## Installation From 3901d693282880814113abe0b2c1ca7a7bc336c8 Mon Sep 17 00:00:00 2001 From: George Sakkis Date: Sun, 10 Jul 2022 01:04:32 +0300 Subject: [PATCH 21/30] Add keep_blank option --- autoscraper/auto_scraper.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 53662f1..20d723d 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -363,7 +363,8 @@ def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs): ) for i in parents ] - result = [x for x in result if x.text] + if not kwargs.get("keep_blank", False): + result = [x for x in result if x.text] return result def _get_result_with_stack_index_based( @@ -396,7 +397,8 @@ def _get_result_with_stack_index_based( getattr(p, "child_index", 0), ) ] - result = [x for x in result if x.text] + if not kwargs.get("keep_blank", False): + result = [x for x in result if x.text] return result def _get_result_by_func( @@ -474,6 +476,7 @@ def get_result_similar( group_by_alias=False, unique=None, attr_fuzz_ratio=1.0, + keep_blank=False, keep_order=False, contain_sibling_leaves=False, ): @@ -508,6 +511,9 @@ def get_result_similar( attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0 The fuzziness ratio threshold for matching html tag attributes. + keep_blank: bool, optional, defaults to False + If set to True, missing values will be returned as empty strings. + keep_order: bool, optional, defaults to False If set to True, the results will be ordered as they are present on the web page. @@ -531,6 +537,7 @@ def get_result_similar( group_by_alias, unique, attr_fuzz_ratio, + keep_blank=keep_blank, keep_order=keep_order, contain_sibling_leaves=contain_sibling_leaves, ) @@ -545,6 +552,7 @@ def get_result_exact( group_by_alias=False, unique=None, attr_fuzz_ratio=1.0, + keep_blank=False, ): """ Gets exact results based on the previously learned rules. @@ -577,6 +585,9 @@ def get_result_exact( attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0 The fuzziness ratio threshold for matching html tag attributes. + keep_blank: bool, optional, defaults to False + If set to True, missing values will be returned as empty strings. + Returns: -------- List of exact results scraped from the web page. @@ -594,6 +605,7 @@ def get_result_exact( group_by_alias, unique, attr_fuzz_ratio, + keep_blank=keep_blank, ) def get_result( From 26bc6bf78a14753f5dca6856999d96d5374dc64d Mon Sep 17 00:00:00 2001 From: Alireza Date: Sun, 17 Jul 2022 21:48:27 +0430 Subject: [PATCH 22/30] update version to 1.1.14 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d0458c6..ae1972e 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="autoscraper", - version="1.1.12", + version="1.1.14", description="A Smart, Automatic, Fast and Lightweight Web Scraper for Python", long_description_content_type="text/markdown", long_description=long_description, From f209c3d2a216a2bd427e0e6437629f1f9c5e2152 Mon Sep 17 00:00:00 2001 From: Mika Date: Tue, 24 Sep 2024 11:47:20 +0330 Subject: [PATCH 23/30] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e1a3ca4..a09a2f9 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ from autoscraper import AutoScraper url = 'https://github.com/alirezamika/autoscraper' -wanted_list = ['A Smart, Automatic, Fast and Lightweight Web Scraper for Python', '2.5k', 'https://github.com/alirezamika/autoscraper/issues'] +wanted_list = ['A Smart, Automatic, Fast and Lightweight Web Scraper for Python', '6.2k', 'https://github.com/alirezamika/autoscraper/issues'] scraper = AutoScraper() scraper.build(url, wanted_list) From e261605c325f0f099552f7c27dec5934f94e45a5 Mon Sep 17 00:00:00 2001 From: Mika Date: Tue, 8 Oct 2024 11:24:30 +0330 Subject: [PATCH 24/30] Create stale-issues.yml --- .github/workflows/stale-issues.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/stale-issues.yml diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml new file mode 100644 index 0000000..fe68424 --- /dev/null +++ b/.github/workflows/stale-issues.yml @@ -0,0 +1,22 @@ +name: Close inactive issues +on: + schedule: + - cron: "30 1 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + days-before-issue-stale: 30 + days-before-issue-close: -1 + stale-issue-label: "stale" + stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." + days-before-pr-stale: 30 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} From 4429311e7506b28d37d77b8ff3c68029be664f57 Mon Sep 17 00:00:00 2001 From: Mika Date: Wed, 9 Oct 2024 09:54:11 +0330 Subject: [PATCH 25/30] Update stale-issues.yml --- .github/workflows/stale-issues.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml index fe68424..22c0024 100644 --- a/.github/workflows/stale-issues.yml +++ b/.github/workflows/stale-issues.yml @@ -13,10 +13,10 @@ jobs: - uses: actions/stale@v5 with: days-before-issue-stale: 30 - days-before-issue-close: -1 + days-before-issue-close: 0 stale-issue-label: "stale" stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." days-before-pr-stale: 30 - days-before-pr-close: -1 + days-before-pr-close: 9 repo-token: ${{ secrets.GITHUB_TOKEN }} From 348c355b7b5c17f9412cf40808a7889c0b2bf565 Mon Sep 17 00:00:00 2001 From: Mika Date: Wed, 9 Oct 2024 09:54:22 +0330 Subject: [PATCH 26/30] Update stale-issues.yml --- .github/workflows/stale-issues.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml index 22c0024..d84e6c4 100644 --- a/.github/workflows/stale-issues.yml +++ b/.github/workflows/stale-issues.yml @@ -18,5 +18,5 @@ jobs: stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." days-before-pr-stale: 30 - days-before-pr-close: 9 + days-before-pr-close: 0 repo-token: ${{ secrets.GITHUB_TOKEN }} From e95999cd41c6d3683362f3d531b056b487e4d1ea Mon Sep 17 00:00:00 2001 From: Mika Date: Sat, 12 Oct 2024 12:59:21 +0330 Subject: [PATCH 27/30] Update stale-issues.yml --- .github/workflows/stale-issues.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml index d84e6c4..48d4fab 100644 --- a/.github/workflows/stale-issues.yml +++ b/.github/workflows/stale-issues.yml @@ -13,10 +13,10 @@ jobs: - uses: actions/stale@v5 with: days-before-issue-stale: 30 - days-before-issue-close: 0 + days-before-issue-close: 14 stale-issue-label: "stale" stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." days-before-pr-stale: 30 - days-before-pr-close: 0 + days-before-pr-close: 14 repo-token: ${{ secrets.GITHUB_TOKEN }} From 621779b84e54af209282ad110acb7b632621e8da Mon Sep 17 00:00:00 2001 From: Mika Date: Sun, 8 Jun 2025 15:49:43 +0330 Subject: [PATCH 28/30] Refine complex tests and reorganize suite --- autoscraper/auto_scraper.py | 3 + tests/__init__.py | 0 tests/conftest.py | 107 +++++++++++++++++++++ tests/integration/__init__.py | 0 tests/integration/test_complex_features.py | 77 +++++++++++++++ tests/integration/test_real_world.py | 83 ++++++++++++++++ tests/unit/__init__.py | 0 tests/unit/test_additional_features.py | 41 ++++++++ tests/unit/test_build.py | 18 ++++ tests/unit/test_features.py | 57 +++++++++++ 10 files changed, 386 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/test_complex_features.py create mode 100644 tests/integration/test_real_world.py create mode 100644 tests/unit/__init__.py create mode 100644 tests/unit/test_additional_features.py create mode 100644 tests/unit/test_build.py create mode 100644 tests/unit/test_features.py diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 20d723d..4227e6b 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -224,6 +224,9 @@ def build( List of similar results """ + if not wanted_list and not (wanted_dict and any(wanted_dict.values())): + raise ValueError("No targets were supplied") + soup = self._get_soup(url=url, html=html, request_args=request_args) result_list = [] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..f6e3aaf --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,107 @@ +import sys +from types import ModuleType +from html.parser import HTMLParser + +class _Node: + def __init__(self, name, attrs, parent=None): + self.name = name + self.attrs = dict(attrs) + self.parent = parent + self.children = [] + self.text = "" + + def append_child(self, child): + self.children.append(child) + child.parent = self + + def getText(self): + return self.text + "".join(c.getText() for c in self.children) + + def findChildren(self, recursive=True): + result = [] + for child in self.children: + result.append(child) + if recursive: + result.extend(child.findChildren(recursive)) + return result + + def findParent(self): + return self.parent + + def _attr_match(self, child, attrs): + from autoscraper.utils import FuzzyText + + for key, val in (attrs or {}).items(): + actual = child.attrs.get(key, "") + if isinstance(actual, list): + actual = " ".join(actual) + + if isinstance(val, FuzzyText): + if not val.search(actual): + return False + elif actual != val: + return False + return True + + def findAll(self, name=None, attrs=None, recursive=True): + result = [] + for child in self.children: + if (name is None or child.name == name) and self._attr_match(child, attrs): + result.append(child) + if recursive: + result.extend(child.findAll(name, attrs, recursive)) + return result + + def find_all(self, name=None, attrs=None, text=None, recursive=True): + if text: + res = [] + if self.text.strip(): + res.append(self.text) + for child in self.children: + if recursive: + res.extend(child.find_all(text=True, recursive=True)) + elif child.text.strip(): + res.append(child.text) + return res + return self.findAll(name, attrs, recursive) + +class _Parser(HTMLParser): + def __init__(self): + super().__init__() + self.root = _Node("[document]", {}) + self.current = self.root + + def handle_starttag(self, tag, attrs): + node = _Node(tag, attrs) + self.current.append_child(node) + self.current = node + + def handle_endtag(self, tag): + if self.current.parent: + self.current = self.current.parent + + def handle_data(self, data): + self.current.text += data + +class BeautifulSoup(_Node): + def __init__(self, html, parser): + p = _Parser() + p.feed(html) + super().__init__(p.root.name, p.root.attrs) + self.children = p.root.children + for c in self.children: + c.parent = self + +bs4_mod = ModuleType("bs4") +bs4_mod.BeautifulSoup = BeautifulSoup +sys.modules.setdefault("bs4", bs4_mod) + +class _Response: + def __init__(self, text=""): + self.encoding = "utf-8" + self.headers = {"Content-Type": "text/html"} + self.text = text + +requests_mod = ModuleType("requests") +requests_mod.get = lambda url, headers=None, **kw: _Response() +sys.modules.setdefault("requests", requests_mod) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/test_complex_features.py b/tests/integration/test_complex_features.py new file mode 100644 index 0000000..57514be --- /dev/null +++ b/tests/integration/test_complex_features.py @@ -0,0 +1,77 @@ +import pytest +import re +from autoscraper import AutoScraper + +HTML_COMPLEX = """ +
+ +

Fresh fruits

+ Shop Now +
+""" + + +def test_extract_relative_link(): + scraper = AutoScraper() + url = "https://example.com/index.html" + result = scraper.build(url=url, html=HTML_COMPLEX, wanted_list=["https://example.com/apple"]) + assert "https://example.com/apple" in result + similar = scraper.get_result_similar( + url=url, html=HTML_COMPLEX, contain_sibling_leaves=True, unique=True + ) + assert set(similar) == { + "https://example.com/banana", + "https://example.com/apple", + "https://example.com/orange", + } + exact = scraper.get_result_exact(url=url, html=HTML_COMPLEX) + assert exact == ["https://example.com/apple"] + + +def test_build_with_regex(): + scraper = AutoScraper() + scraper.build(html=HTML_COMPLEX, wanted_list=[re.compile("Ban.*")]) + result = scraper.get_result_exact(html=HTML_COMPLEX) + assert "Banana" in result[0] + + +def test_update_appends_rules(): + scraper = AutoScraper() + scraper.build(html=HTML_COMPLEX, wanted_list=["Banana"]) + count = len(scraper.stack_list) + scraper.build(html=HTML_COMPLEX, wanted_list=["Apple"], update=True) + assert len(scraper.stack_list) == count + 1 + + +def test_remove_rules(): + scraper = AutoScraper() + scraper.build(html=HTML_COMPLEX, wanted_list=["Banana"]) + scraper.build(html=HTML_COMPLEX, wanted_list=["Apple"], update=True) + rule_ids = [s["stack_id"] for s in scraper.stack_list] + to_remove = rule_ids[0] + scraper.remove_rules([to_remove]) + remaining = [s["stack_id"] for s in scraper.stack_list] + assert to_remove not in remaining + assert len(remaining) == len(rule_ids) - 1 + + +def test_keep_blank_returns_empty(): + scraper = AutoScraper() + scraper.build(html=HTML_COMPLEX, wanted_list=["/shop"]) + html_blank = HTML_COMPLEX.replace('href="/shop"', 'href=""') + result = scraper.get_result_exact(html=html_blank, keep_blank=True) + assert result == [""] + + +def test_attr_fuzz_ratio(): + html_base = '' + html_variant = '' + scraper = AutoScraper() + scraper.build(html=html_base, wanted_list=["Buy"]) + res = scraper.get_result_exact(html=html_variant, attr_fuzz_ratio=0.8) + assert res == ["Buy"] diff --git a/tests/integration/test_real_world.py b/tests/integration/test_real_world.py new file mode 100644 index 0000000..a810acf --- /dev/null +++ b/tests/integration/test_real_world.py @@ -0,0 +1,83 @@ +import re +from autoscraper import AutoScraper + +HTML_PAGE_1 = """ +
+

Sony PlayStation 4 PS4 Pro 1TB 4K Console - Black

+ US $349.99 +
4.8
+
See details
+
+""" + +HTML_PAGE_2 = """ +
+

Acer Predator Helios 300 15.6'' 144Hz FHD Laptop i7-9750H 16GB 512GB GTX 1660 Ti

+ US $1,229.49 +
5.0
+
See details
+
+""" + +HTML_WALMART_1 = "
$8.95
" +HTML_WALMART_2 = "
$7.00
" +HTML_ETSY_1 = "$12.50+" +HTML_ETSY_2 = "$60.00" + + +def test_grouping_and_rule_removal(): + scraper = AutoScraper() + wanted = [ + "Sony PlayStation 4 PS4 Pro 1TB 4K Console - Black", + "US $349.99", + "4.8", + "See details", + ] + scraper.build(html=HTML_PAGE_1, wanted_list=wanted) + grouped = scraper.get_result_exact(html=HTML_PAGE_2, grouped=True) + unwanted = [r for r, v in grouped.items() if v == ["See details"]] + scraper.remove_rules(unwanted) + result = scraper.get_result_exact(html=HTML_PAGE_2) + assert result == [ + "Acer Predator Helios 300 15.6'' 144Hz FHD Laptop i7-9750H 16GB 512GB GTX 1660 Ti", + "US $1,229.49", + "5.0", + ] + + +def test_incremental_learning_multiple_sites(): + scraper = AutoScraper() + data = [ + (HTML_PAGE_1, ["US $349.99"]), + (HTML_WALMART_1, ["$8.95"]), + (HTML_ETSY_1, ["$12.50+"]), + ] + for html, wanted in data: + scraper.build(html=html, wanted_list=wanted, update=True) + assert "US $1,229.49" in scraper.get_result_exact(html=HTML_PAGE_2) + assert "$7.00" in scraper.get_result_exact(html=HTML_WALMART_2) + assert "$60.00" in scraper.get_result_exact(html=HTML_ETSY_2) + + +def test_attr_fuzz_ratio_realistic(): + base = "" + variant = "" + scraper = AutoScraper() + scraper.build(html=base, wanted_list=["Buy"]) + assert scraper.get_result_exact(html=variant, attr_fuzz_ratio=0.8) == ["Buy"] + + +def test_regex_name_extraction(): + scraper = AutoScraper() + scraper.build(html=HTML_PAGE_1, wanted_list=[re.compile(r".*PlayStation.*Console.*")]) + result = scraper.get_result_exact(html=HTML_PAGE_1) + assert any("PlayStation" in r for r in result) + + +def test_keep_blank_for_missing_rating(): + scraper = AutoScraper() + scraper.build(html=HTML_PAGE_1, wanted_list=["4.8"]) + html_no_rating = HTML_PAGE_2.replace("5.0", "") + res = scraper.get_result_exact(html=html_no_rating, keep_blank=True) + assert res == [""] + diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_additional_features.py b/tests/unit/test_additional_features.py new file mode 100644 index 0000000..c0fdc98 --- /dev/null +++ b/tests/unit/test_additional_features.py @@ -0,0 +1,41 @@ +from autoscraper import AutoScraper + +HTML = "
  • Banana
  • Apple
  • Orange
" +HTML_DUP = "
  • Banana
  • Banana
" + + +def test_text_fuzz_ratio_partial(): + scraper = AutoScraper() + scraper.build(html="
  • Banana
", wanted_list=["Banan"], text_fuzz_ratio=0.8) + assert scraper.get_result_exact(html="
  • Banana
") == ["Banana"] + + +def test_set_rule_aliases(): + scraper = AutoScraper() + scraper.build(html=HTML, wanted_list=["Banana"]) + rule_id = scraper.stack_list[0]["stack_id"] + scraper.set_rule_aliases({rule_id: "fruit"}) + result = scraper.get_result_similar(html=HTML, group_by_alias=True, contain_sibling_leaves=True) + assert result == {"fruit": ["Banana", "Apple", "Orange"]} + + +def test_grouped_results_by_rule(): + scraper = AutoScraper() + scraper.build(html=HTML, wanted_list=["Banana"]) + rule_id = scraper.stack_list[0]["stack_id"] + result = scraper.get_result_similar(html=HTML, grouped=True, contain_sibling_leaves=True) + assert result == {rule_id: ["Banana", "Apple", "Orange"]} + + +def test_similar_unique_false(): + scraper = AutoScraper() + scraper.build(html=HTML_DUP, wanted_list=["Banana"]) + result = scraper.get_result_similar(html=HTML_DUP, unique=False) + assert result == ["Banana", "Banana"] + + +def test_similar_keep_order(): + scraper = AutoScraper() + scraper.build(html=HTML, wanted_list=["Banana"]) + result = scraper.get_result_similar(html=HTML, contain_sibling_leaves=True, keep_order=True) + assert result == ["Banana", "Apple", "Orange"] diff --git a/tests/unit/test_build.py b/tests/unit/test_build.py new file mode 100644 index 0000000..65d5e05 --- /dev/null +++ b/tests/unit/test_build.py @@ -0,0 +1,18 @@ +import pytest +from autoscraper import AutoScraper + +HTML = "
  • Banana
  • Apple
  • Orange
" + + +def test_build_requires_targets(): + scraper = AutoScraper() + with pytest.raises(ValueError): + scraper.build(html=HTML) + + +def test_build_and_get_result_similar(): + scraper = AutoScraper() + result = scraper.build(html=HTML, wanted_list=["Banana"]) + assert result == ["Banana"] + similar = scraper.get_result_similar(html=HTML, contain_sibling_leaves=True) + assert similar == ["Banana", "Apple", "Orange"] diff --git a/tests/unit/test_features.py b/tests/unit/test_features.py new file mode 100644 index 0000000..414bc27 --- /dev/null +++ b/tests/unit/test_features.py @@ -0,0 +1,57 @@ +import pytest + +from autoscraper import AutoScraper + +HTML = "
  • Banana
  • Apple
  • Orange
" +HTML_COMPLEX_ORDER = """ +
+

Banana

+

$1

+

Apple

+

$2

+
+""" + + +def test_get_result_exact_order(): + scraper = AutoScraper() + scraper.build(html=HTML_COMPLEX_ORDER, wanted_list=["Banana", "$2"]) + assert scraper.get_result_exact(html=HTML_COMPLEX_ORDER) == ["Banana", "$2"] + + +def test_group_by_alias(): + scraper = AutoScraper() + scraper.build(html=HTML, wanted_dict={"fruit": ["Banana"]}) + similar = scraper.get_result_similar( + html=HTML, group_by_alias=True, contain_sibling_leaves=True, unique=True + ) + assert similar == {"fruit": ["Banana", "Apple", "Orange"]} + + +def test_save_and_load(tmp_path): + scraper = AutoScraper() + scraper.build(html=HTML, wanted_list=["Banana"]) + file_path = tmp_path / "model.json" + scraper.save(file_path) + new_scraper = AutoScraper() + new_scraper.load(file_path) + assert new_scraper.get_result_exact(html=HTML) == scraper.get_result_exact(html=HTML) + + +def test_keep_rules(): + scraper = AutoScraper() + scraper.build(html=HTML, wanted_list=["Banana"]) + first_rule = scraper.stack_list[0]["stack_id"] + scraper.build(html=HTML, wanted_list=["Apple"], update=True) + second_rule = scraper.stack_list[1]["stack_id"] + scraper.keep_rules([second_rule]) + assert len(scraper.stack_list) == 1 + assert scraper.stack_list[0]["stack_id"] == second_rule + + +def test_get_result_combined(): + scraper = AutoScraper() + scraper.build(html=HTML, wanted_list=["Banana"]) + similar, exact = scraper.get_result(html=HTML) + assert exact == ["Banana"] + assert similar == ["Banana"] From ea9e90c723c0d0ff4874e6a56839546463887dc0 Mon Sep 17 00:00:00 2001 From: Mika Date: Sun, 8 Jun 2025 15:54:40 +0330 Subject: [PATCH 29/30] Add CI workflows for tests --- .github/workflows/python-publish.yml | 6 +++++- .github/workflows/tests.yml | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4e1ef42..7665a92 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -21,7 +21,11 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine + pip install setuptools wheel twine pytest + pip install . + - name: Run tests + run: | + pytest -q - name: Build and publish env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..6dd6873 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,23 @@ +name: Run Tests + +on: + push: + release: + types: [created] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + pip install . + - name: Run tests + run: pytest -q From eec33398f292bc0f29b35dbcc52b2f4a362ea160 Mon Sep 17 00:00:00 2001 From: Mika Date: Sun, 8 Jun 2025 17:16:08 +0330 Subject: [PATCH 30/30] Remove unused get_random_str --- autoscraper/auto_scraper.py | 3 +-- autoscraper/utils.py | 7 ------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 4227e6b..73f90a6 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -11,7 +11,6 @@ FuzzyText, ResultItem, get_non_rec_text, - get_random_str, normalize, text_match, unique_hashable, @@ -294,7 +293,7 @@ def _build_stack(cls, child, url): ) stack["url"] = url if is_full_url else "" stack["hash"] = hashlib.sha256(str(stack).encode("utf-8")).hexdigest() - stack["stack_id"] = "rule_" + get_random_str(4) + stack["stack_id"] = "rule_" + stack["hash"][:8] return stack def _get_result_for_child(self, child, soup, url): diff --git a/autoscraper/utils.py b/autoscraper/utils.py index 5193708..15641f6 100644 --- a/autoscraper/utils.py +++ b/autoscraper/utils.py @@ -1,7 +1,5 @@ from collections import OrderedDict -import random -import string import unicodedata from difflib import SequenceMatcher @@ -24,11 +22,6 @@ def unique_hashable(hashable_items): return list(OrderedDict.fromkeys(hashable_items)) -def get_random_str(n): - chars = string.ascii_lowercase + string.digits - return ''.join(random.choice(chars) for i in range(n)) - - def get_non_rec_text(element): return ''.join(element.find_all(text=True, recursive=False)).strip()