diff --git a/.travis.yml b/.travis.yml index 2f2c722e..0eefe3a2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,9 +3,11 @@ language: python python: - 2.6 - 2.7 + - 3.4 + - 3.5 install: - - pip install -r requirements.txt --use-mirrors + - pip install jieba - python setup.py install script: python setup.py test diff --git a/README.rst b/README.rst index 5dc8ab0b..bf12c3dd 100644 --- a/README.rst +++ b/README.rst @@ -180,7 +180,7 @@ class. Goose in Korean ----------------- +--------------- In order to use Goose in Korean you have to use the StopWordsKorean class. @@ -197,24 +197,6 @@ class. 14년째 세계 각국의 통신·안전·전파 규격 시험과 인증 한 우물만 파고 있는 이 회사 박채규 대표가 만나기로 한 주인공이다. 그는 전기전자·무선통신·자동차 전장품 분야에 - -Known issues ------------- - -- There are some issues with unicode URLs. -- Cookie handling : Some websites need cookie handling. At the moment the only work around is to use the raw_html extraction. For instance: - - >>> import urllib2 - >>> import goose - >>> url = "http://www.nytimes.com/2013/08/18/world/middleeast/pressure-by-us-failed-to-sway-egypts-leaders.html?hp" - >>> opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) - >>> response = opener.open(url) - >>> raw_html = response.read() - >>> g = goose.Goose() - >>> a = g.extract(raw_html=raw_html) - >>> a.cleaned_text - u'CAIRO \u2014 For a moment, at least, American and European diplomats trying to defuse the volatile standoff in Egypt thought they had a breakthrough.\n\nAs t' - TODO ---- diff --git a/goose/__init__.py b/goose/__init__.py index 409b5732..d1cd6da8 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -21,7 +21,6 @@ limitations under the License. """ import os -import platform from tempfile import mkstemp from goose.version import version_info, __version__ @@ -64,9 +63,12 @@ def crawl(self, crawl_candiate): try: crawler = Crawler(self.config) article = crawler.crawl(crawl_candiate) - except (UnicodeDecodeError, ValueError): - self.config.parser_class = parsers[0] - return self.crawl(crawl_candiate) + except (UnicodeDecodeError, ValueError) as e: + if parsers: + self.config.parser_class = parsers[0] + return self.crawl(crawl_candiate) + else: + raise e return article def initialize(self): diff --git a/goose/cleaners.py b/goose/cleaners.py index c1384ee0..2ad975d0 100644 --- a/goose/cleaners.py +++ b/goose/cleaners.py @@ -20,6 +20,8 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +from __future__ import unicode_literals + from goose.utils import ReplaceSequence @@ -48,7 +50,7 @@ def __init__(self, config, article): "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings" "|date|^print$|popup|author-dropdown|tools|socialtools|byline" "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text" - "|legende|ajoutVideo|timestamp|js_replies" + "|legende|ajoutVideo|timestamp|js_replies|disclaim" ) self.regexp_namespace = "http://exslt.org/regular-expressions" self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re @@ -66,8 +68,7 @@ def __init__(self, config, article): .append("\t")\ .append("^\\s+$") - def clean(self): - doc_to_clean = self.article.doc + def clean(self, doc_to_clean): doc_to_clean = self.clean_body_classes(doc_to_clean) doc_to_clean = self.clean_article_tags(doc_to_clean) doc_to_clean = self.clean_em_tags(doc_to_clean) diff --git a/goose/configuration.py b/goose/configuration.py index fcfa5b9a..7d83a34f 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -22,6 +22,9 @@ """ import os import tempfile + +import six + from goose.text import StopWords from goose.parsers import Parser from goose.parsers import ParserSoup @@ -30,10 +33,20 @@ HTTP_DEFAULT_TIMEOUT = 30 AVAILABLE_PARSERS = { - 'lxml': Parser, - 'soup': ParserSoup, + 'lxml': Parser } +if six.PY2: + AVAILABLE_PARSERS['soup'] = ParserSoup + +KNOWN_ARTICLE_CONTENT_PATTERNS = [ + {'attr': 'class', 'value': 'short-story'}, + {'attr': 'itemprop', 'value': 'articleBody'}, + {'attr': 'class', 'value': 'post-content'}, + {'attr': 'class', 'value': 'g-content'}, + {'tag': 'article'}, +] + class Configuration(object): @@ -99,6 +112,12 @@ def __init__(self): # http timeout self.http_timeout = HTTP_DEFAULT_TIMEOUT + # known context patterns. Goose at first will search context at dom nodes, qualifying these patterns + self.known_context_patterns = KNOWN_ARTICLE_CONTENT_PATTERNS + + # Strict mode. 
Generate exceptions on errors instead of swallowing them + self.strict = True + def get_parser(self): return AVAILABLE_PARSERS[self.parser_class] diff --git a/goose/crawler.py b/goose/crawler.py index 34daf048..e5713b57 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -39,7 +39,7 @@ from goose.cleaners import StandardDocumentCleaner from goose.outputformatters import StandardOutputFormatter -from goose.network import HtmlFetcher +from goose.network import NetworkFetcher class CrawlCandidate(object): @@ -99,11 +99,13 @@ def __init__(self, config): # title extractor self.title_extractor = self.get_title_extractor() + # html fetcher + self.fetcher = NetworkFetcher(self.config) + # image extrator self.image_extractor = self.get_image_extractor() - # html fetcher - self.htmlfetcher = HtmlFetcher(self.config) + # TODO : log prefix self.logPrefix = "crawler:" @@ -161,7 +163,10 @@ def crawl(self, crawl_candidate): self.article.doc = article_body # before we do any calcs on the body itself let's clean up the document - self.article.doc = self.cleaner.clean() + if not isinstance(self.article.doc, list): + self.article.doc = [self.cleaner.clean(self.article.doc)] + else: + self.article.doc = list(map(lambda doc1: self.cleaner.clean(deepcopy(doc1)), self.article.doc)) # big stuff self.article.top_node = self.extractor.calculate_best_node() @@ -212,11 +217,7 @@ def get_html(self, crawl_candidate, parsing_candidate): return crawl_candidate.raw_html # fetch HTML - html = self.htmlfetcher.get_html(parsing_candidate.url) - self.article.additional_data.update({ - 'request': self.htmlfetcher.request, - 'result': self.htmlfetcher.result, - }) + html = self.fetcher.fetch(parsing_candidate.url) return html def get_metas_extractor(self): @@ -244,7 +245,7 @@ def get_title_extractor(self): return TitleExtractor(self.config, self.article) def get_image_extractor(self): - return ImageExtractor(self.config, self.article) + return ImageExtractor(self.fetcher, self.config, self.article) def get_video_extractor(self): return VideoExtractor(self.config, self.article) diff --git a/goose/exceptions.py b/goose/exceptions.py new file mode 100644 index 00000000..b75f3183 --- /dev/null +++ b/goose/exceptions.py @@ -0,0 +1,3 @@ +from .network import NetworkError + +__all__ = ['NetworkError'] diff --git a/goose/extractors/content.py b/goose/extractors/content.py index e0703d55..433ed0c9 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -25,13 +25,6 @@ from goose.extractors import BaseExtractor -KNOWN_ARTICLE_CONTENT_TAGS = [ - {'attr': 'itemprop', 'value': 'articleBody'}, - {'attr': 'class', 'value': 'post-content'}, - {'tag': 'article'}, -] - - class ContentExtractor(BaseExtractor): def get_language(self): @@ -47,16 +40,17 @@ def get_language(self): return self.config.target_language def get_known_article_tags(self): - for item in KNOWN_ARTICLE_CONTENT_TAGS: - nodes = self.parser.getElementsByTag( - self.article.doc, - **item) - if len(nodes): - return nodes[0] + nodes = [] + for item in self.config.known_context_patterns: + nodes.extend(self.parser.getElementsByTag( + self.article.doc, + **item)) + if len(nodes): + return nodes return None def is_articlebody(self, node): - for item in KNOWN_ARTICLE_CONTENT_TAGS: + for item in self.config.known_context_patterns: # attribute if "attr" in item and "value" in item: if self.parser.getAttribute(node, item['attr']) == item['value']: @@ -260,7 +254,7 @@ def update_score(self, node, addToScore): if score_string: current_score = int(score_string) - 
new_score = current_score + addToScore + new_score = current_score + int(addToScore) self.parser.setAttribute(node, "gravityScore", str(new_score)) def update_node_count(self, node, add_to_count): @@ -315,16 +309,17 @@ def get_node_gravity_score(self, node): return None return int(grvScoreString) - def nodes_to_check(self, doc): + def nodes_to_check(self, docs): """\ returns a list of nodes we want to search on like paragraphs and tables """ nodes_to_check = [] - for tag in ['p', 'pre', 'td']: - items = self.parser.getElementsByTag(doc, tag=tag) - nodes_to_check += items + for doc in docs: + for tag in ['p', 'pre', 'td']: + items = self.parser.getElementsByTag(doc, tag=tag) + nodes_to_check += items return nodes_to_check def is_table_and_no_para_exist(self, e): diff --git a/goose/extractors/images.py b/goose/extractors/images.py index 3af44f5f..ebaf6935 100644 --- a/goose/extractors/images.py +++ b/goose/extractors/images.py @@ -23,7 +23,7 @@ import re import os -from urlparse import urlparse, urljoin +from six.moves.urllib.parse import urlparse, urljoin from goose.extractors import BaseExtractor from goose.image import Image @@ -48,9 +48,10 @@ def __init__(self, node, parent_depth, sibling_depth): class ImageExtractor(BaseExtractor): - def __init__(self, config, article): + def __init__(self, fetcher, config, article): super(ImageExtractor, self).__init__(config, article) + self.fetcher = fetcher self.custom_site_mapping = {} self.load_customesite_mapping() @@ -333,9 +334,7 @@ def get_local_image(self, src): """\ returns the bytes of the image file on disk """ - local_image = ImageUtils.store_image(None, - self.link_hash, src, self.config) - return local_image + return ImageUtils.store_image(self.fetcher, self.link_hash, src, self.config) def get_clean_domain(self): if self.article.domain: diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py index 95acadd5..5a65aa16 100644 --- a/goose/extractors/metas.py +++ b/goose/extractors/metas.py @@ -22,8 +22,8 @@ """ import re -from urlparse import urljoin -from urlparse import urlparse + +from six.moves.urllib.parse import urlparse, urljoin from goose.extractors import BaseExtractor diff --git a/goose/image.py b/goose/image.py index 351e3396..58ddd021 100644 --- a/goose/image.py +++ b/goose/image.py @@ -46,7 +46,7 @@ def __init__(self): self.extraction_type = "NA" # stores how many bytes this image is. - self.bytes = long(0) + self.bytes = 0 def get_src(self): return self.src @@ -87,7 +87,7 @@ def set_mime_type(self, mime_type): class LocallyStoredImage(object): def __init__(self, src='', local_filename='', - link_hash='', bytes=long(0), file_extension='', height=0, width=0): + link_hash='', bytes=0, file_extension='', height=0, width=0): self.src = src self.local_filename = local_filename self.link_hash = link_hash diff --git a/goose/network.py b/goose/network.py index 666a7d61..2aca4873 100644 --- a/goose/network.py +++ b/goose/network.py @@ -20,41 +20,41 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -import urllib2 +import six +import requests -class HtmlFetcher(object): +class NetworkError(RuntimeError): + def __init__(self, status_code, reason): + self.reason = reason + self.status_code = status_code + + +class NetworkFetcher(object): def __init__(self, config): self.config = config - # set header - self.headers = {'User-agent': self.config.browser_user_agent} + self._connection = requests.Session() + self._connection.headers['User-agent'] = self.config.browser_user_agent + + self._url = None def get_url(self): - # if we have a result - # get the final_url - if self.result is not None: - return self.result.geturl() - return None + return self._url - def get_html(self, url): + def fetch(self, url): # utf-8 encode unicode url - if isinstance(url, unicode): + if isinstance(url, six.text_type) and six.PY2: url = url.encode('utf-8') - # set request - self.request = urllib2.Request( - url, - headers=self.headers) - # do request - try: - self.result = urllib2.urlopen( - self.request, - timeout=self.config.http_timeout) - except Exception: - self.result = None - - # read the result content - if self.result is not None: - return self.result.read() - return None + response = self._connection.get(url, timeout=self.config.http_timeout) + if response.ok: + self._url = response.url + text = response.content + else: + self._url = None + text = None + if self.config.strict: + raise NetworkError(response.status_code, response.reason) + + return text diff --git a/goose/outputformatters.py b/goose/outputformatters.py index 1f8ba4bd..d2cb5019 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -20,7 +20,8 @@ See the License for the specific language governing permissions and limitations under the License. """ -from HTMLParser import HTMLParser +from six.moves.html_parser import HTMLParser + from goose.text import innerTrim @@ -66,12 +67,14 @@ def get_formatted_text(self): self.remove_fewwords_paragraphs() return self.convert_to_text() + _text_parser = HTMLParser() + def convert_to_text(self): txts = [] for node in list(self.get_top_node()): txt = self.parser.getText(node) if txt: - txt = HTMLParser().unescape(txt) + txt = self._text_parser.unescape(txt) txt_lis = innerTrim(txt).split(r'\n') txts.extend(txt_lis) return '\n\n'.join(txts) diff --git a/goose/parsers.py b/goose/parsers.py index a43e9b47..fab3eb31 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -21,11 +21,12 @@ limitations under the License. 
""" import lxml.html -from lxml.html import soupparser + +import six + from lxml import etree from copy import deepcopy -from goose.text import innerTrim -from goose.text import encodeValue +from goose.text import innerTrim, encodeValue, get_encodings_from_content, smart_str class Parser(object): @@ -50,13 +51,20 @@ def css_select(self, node, selector): @classmethod def fromstring(self, html): - html = encodeValue(html) - self.doc = lxml.html.fromstring(html) + encoding = get_encodings_from_content(html) + encoding = encoding and encoding[0] or None + if not encoding: + html = encodeValue(html) + self.doc = lxml.html.fromstring(html) + else: + html = smart_str(html, encoding=encoding) + parser = lxml.html.HTMLParser(encoding=encoding) + self.doc = lxml.html.fromstring(html, parser=parser) return self.doc @classmethod def nodeToString(self, node): - return etree.tostring(node) + return etree.tostring(node, encoding=six.text_type) @classmethod def replaceTag(self, node, tag): @@ -239,6 +247,7 @@ class ParserSoup(Parser): @classmethod def fromstring(self, html): + from lxml.html import soupparser html = encodeValue(html) self.doc = soupparser.fromstring(html) return self.doc diff --git a/goose/text.py b/goose/text.py index 3ef63d6b..3d67f5fb 100644 --- a/goose/text.py +++ b/goose/text.py @@ -23,16 +23,54 @@ import os import re import string + +import six + from goose.utils import FileHelper from goose.utils.encoding import smart_unicode from goose.utils.encoding import smart_str from goose.utils.encoding import DjangoUnicodeDecodeError +SPACE_SYMBOLS = re.compile(r'[\s\xa0\t]') TABSSPACE = re.compile(r'[\s\t]+') +def get_encodings_from_content(content): + """ + Code from: + https://github.com/sigmavirus24/requests-toolbelt/blob/master/requests_toolbelt/utils/deprecated.py + Return encodings from given content string. + :param content: string to extract encodings from. 
+ """ + if isinstance(content, six.binary_type) and six.PY3: + find_charset = re.compile( + br']', flags=re.I + ).findall + + find_pragma = re.compile( + br']', flags=re.I + ).findall + + find_xml = re.compile( + br'^<\?xml.*?encoding=["\']*(.+?)["\'>]' + ).findall + else: + find_charset = re.compile( + r']', flags=re.I + ).findall + + find_pragma = re.compile( + r']', flags=re.I + ).findall + + find_xml = re.compile( + r'^<\?xml.*?encoding=["\']*(.+?)["\'>]' + ).findall + return find_charset(content) + find_pragma(content) + find_xml(content) + + def innerTrim(value): - if isinstance(value, (unicode, str)): + if isinstance(value, (six.text_type, six.string_types)): # remove tab and white space value = re.sub(TABSSPACE, ' ', value) value = ''.join(value.splitlines()) @@ -87,7 +125,6 @@ def set_word_count(self, cnt): class StopWords(object): PUNCTUATION = re.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]") - TRANS_TABLE = string.maketrans('', '') _cached_stop_words = {} def __init__(self, language='en'): @@ -106,12 +143,13 @@ def __init__(self, language='en'): def remove_punctuation(self, content): # code taken form # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python - if isinstance(content, unicode): - content = content.encode('utf-8') - return content.translate(self.TRANS_TABLE, string.punctuation) + if not isinstance(content, six.text_type): + content = content.decode('utf-8') + tbl = dict.fromkeys(ord(x) for x in string.punctuation) + return content.translate(tbl) def candiate_words(self, stripped_input): - return stripped_input.split(' ') + return re.split(SPACE_SYMBOLS, stripped_input) def get_stopword_count(self, content): if not content: diff --git a/goose/utils/__init__.py b/goose/utils/__init__.py index 5a1de7d4..41cf9c95 100644 --- a/goose/utils/__init__.py +++ b/goose/utils/__init__.py @@ -26,7 +26,13 @@ import os import goose import codecs -import urlparse + +import six + +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse class BuildURL(object): @@ -89,7 +95,7 @@ def __init__(self, urlString, link_hash): class RawHelper(object): @classmethod def get_parsing_candidate(self, url, raw_html): - if isinstance(raw_html, unicode): + if isinstance(raw_html, six.text_type): raw_html = raw_html.encode('utf-8') link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time()) return ParsingCandidate(url, link_hash) @@ -101,7 +107,8 @@ def get_parsing_candidate(self, url_to_crawl): # replace shebang is urls final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \ if '#!' in url_to_crawl else url_to_crawl - link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time()) + url = final_url.encode("utf-8") if isinstance(final_url, six.text_type) else final_url + link_hash = '%s.%s' % (hashlib.md5(url).hexdigest(), time.time()) return ParsingCandidate(final_url, link_hash) diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py index 4dc23ca7..f94f476e 100644 --- a/goose/utils/encoding.py +++ b/goose/utils/encoding.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- -import types import datetime + +import six + from decimal import Decimal @@ -45,8 +47,8 @@ def is_protected_type(obj): force_unicode(strings_only=True). 
""" return isinstance(obj, ( - types.NoneType, - int, long, + type(None), + six.integer_types, datetime.datetime, datetime.date, datetime.time, float, Decimal) ) @@ -62,17 +64,17 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # Handle the common case first, saves 30-40% in performance when s # is an instance of unicode. This function gets called often in that # setting. - if isinstance(s, unicode): + if isinstance(s, six.text_type): return s if strings_only and is_protected_type(s): return s try: - if not isinstance(s, basestring,): + if not isinstance(s, six.string_types,): if hasattr(s, '__unicode__'): - s = unicode(s) + s = s.__unicode__() else: try: - s = unicode(str(s), encoding, errors) + s = six.text_type(s, encoding, errors) except UnicodeEncodeError: if not isinstance(s, Exception): raise @@ -84,12 +86,12 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # output should be. s = u' '.join([force_unicode(arg, encoding, strings_only, errors) for arg in s]) - elif not isinstance(s, unicode): + elif not isinstance(s, six.text_type): # Note: We use .decode() here, instead of unicode(s, encoding, # errors), so that if s is a SafeString, it ends up being a # SafeUnicode at the end. s = s.decode(encoding, errors) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: if not isinstance(s, Exception): raise DjangoUnicodeDecodeError(s, *e.args) else: @@ -109,13 +111,17 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): If strings_only is True, don't convert (some) non-string-like objects. """ - if strings_only and isinstance(s, (types.NoneType, int)): + if strings_only and isinstance(s, (type(None), int)): return s # if isinstance(s, Promise): # return unicode(s).encode(encoding, errors) - if not isinstance(s, basestring): + if isinstance(s, six.text_type): + return s.encode(encoding, errors) + elif not isinstance(s, six.binary_type): try: - return str(s) + if six.PY2: + return str(s) + return str(s).encode(encoding, errors) except UnicodeEncodeError: if isinstance(s, Exception): # An Exception subclass containing non-ASCII data that doesn't @@ -123,10 +129,6 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): # further exception. 
return ' '.join([smart_str(arg, encoding, strings_only, errors) for arg in s]) - return unicode(s).encode(encoding, errors) - elif isinstance(s, unicode): - return s.encode(encoding, errors) - elif s and encoding != 'utf-8': - return s.decode('utf-8', errors).encode(encoding, errors) + return six.text_type(s).encode(encoding, errors) else: return s diff --git a/goose/utils/images.py b/goose/utils/images.py index 388d5c85..9c12a1f8 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -22,8 +22,9 @@ """ import hashlib import os -import urllib2 + from PIL import Image + from goose.utils.encoding import smart_str from goose.image import ImageDetails from goose.image import LocallyStoredImage @@ -35,9 +36,9 @@ class ImageUtils(object): def get_image_dimensions(self, identify_program, path): image_details = ImageDetails() try: - image = Image.open(path) - image_details.set_mime_type(image.format) - width, height = image.size + with Image.open(path) as image: + image_details.set_mime_type(image.format) + width, height = image.size image_details.set_width(width) image_details.set_height(height) except IOError: @@ -115,9 +116,6 @@ def clean_src_string(self, src): @classmethod def fetch(self, http_client, src): try: - req = urllib2.Request(src) - f = urllib2.urlopen(req) - data = f.read() - return data + return http_client.fetch(src) except Exception: return None diff --git a/goose/version.py b/goose/version.py index fedcbb6d..4f2a84c1 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License. """ -version_info = (1, 0, 25) +version_info = (1, 0, 29) __version__ = ".".join(map(str, version_info)) diff --git a/goose/video.py b/goose/video.py index 8509bba0..0691ac96 100644 --- a/goose/video.py +++ b/goose/video.py @@ -21,6 +21,7 @@ limitations under the License. 
""" + class Video(object): """\ Video object diff --git a/requirements.txt b/requirements.txt index 7e6a6c09..8d153935 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ +requests Pillow lxml cssselect jieba -beautifulsoup +beautifulsoup # Only on python2 nltk +six diff --git a/setup.py b/setup.py index ebad2547..c4d1fabf 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,8 @@ """ import os +import sys + from setuptools import setup, find_packages from imp import load_source @@ -40,6 +42,9 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', 'Topic :: Internet', 'Topic :: Utilities', 'Topic :: Software Development :: Libraries :: Python Modules'] @@ -53,19 +58,27 @@ except Exception: long_description = description +requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six', 'requests'] +test_requirements = ['requests_mock'] +if sys.version_info[0] == 2: + requirements.append('beautifulsoup') + if sys.version_info[1] < 7: + test_requirements.append('unittest2') + setup(name='goose-extractor', - version=version.__version__, - description=description, - long_description=long_description, - keywords='scrapping, extractor, web scrapping', - classifiers=CLASSIFIERS, - author='Xavier Grangier', - author_email='grangier@gmail.com', - url='https://github.com/grangier/python-goose', - license='Apache', - packages=find_packages(), - include_package_data=True, - zip_safe=False, - install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'beautifulsoup', 'nltk'], - test_suite="tests" -) + version=version.__version__, + description=description, + long_description=long_description, + keywords='scrapping, extractor, web scrapping', + classifiers=CLASSIFIERS, + author='Xavier Grangier', + author_email='grangier@gmail.com', + url='https://github.com/grangier/python-goose', + license='Apache', + packages=find_packages(), + include_package_data=True, + zip_safe=False, + install_requires=requirements, + test_suite="tests", + tests_require=test_requirements + ) diff --git a/tests/extractors/authors.py b/tests/extractors/authors.py index 709040c1..a21d362e 100644 --- a/tests/extractors/authors.py +++ b/tests/extractors/authors.py @@ -21,12 +21,26 @@ limitations under the License. 
""" -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleAuthor(TestExtractionBase): def test_author_schema(self): article = self.getArticle() - fields = ['authors'] - self.runArticleAssertions(article=article, fields=fields) + field = 'authors' + + # Do not call self.runArticleAssertions because need to sort results, + # because set not save ordering, so test failed; + + expected_value = self.data['expected'][field] + result_value = getattr(article, field, None) + + expected_value.sort() + result_value.sort() + + # default assertion + msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value) + self.assertEqual(expected_value, result_value, msg=msg) diff --git a/tests/extractors/base.py b/tests/extractors/base.py index e19d20e0..cdf6cb32 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -22,11 +22,14 @@ """ import os import json -import urllib2 import unittest import socket +import requests_mock -from StringIO import StringIO +try: + import urllib2 +except ImportError: + import urllib.request as urllib2 from goose import Goose from goose.utils import FileHelper @@ -37,7 +40,7 @@ # Response -class MockResponse(): +class MockResponse: """\ Base mock response class """ @@ -47,45 +50,8 @@ class MockResponse(): def __init__(self, cls): self.cls = cls - def content(self): - return "response" - - def response(self, req): - data = self.content(req) - url = req.get_full_url() - resp = urllib2.addinfourl(StringIO(data), data, url) - resp.code = self.code - resp.msg = self.msg - return resp - - -class MockHTTPHandler(urllib2.HTTPHandler, urllib2.HTTPSHandler): - """\ - Mocked HTTPHandler in order to query APIs locally - """ - cls = None - - def https_open(self, req): - return self.http_open(req) - - def http_open(self, req): - r = self.cls.callback(self.cls) - return r.response(req) - - @staticmethod - def patch(cls): - opener = urllib2.build_opener(MockHTTPHandler) - urllib2.install_opener(opener) - # dirty ! 
- for h in opener.handlers: - if isinstance(h, MockHTTPHandler): - h.cls = cls - return [h for h in opener.handlers if isinstance(h, MockHTTPHandler)][0] - - @staticmethod - def unpatch(): - # urllib2 - urllib2._opener = None + def contents(self): + pass class BaseMockTests(unittest.TestCase): @@ -98,10 +64,8 @@ def setUp(self): # patch DNS self.original_getaddrinfo = socket.getaddrinfo socket.getaddrinfo = self.new_getaddrinfo - MockHTTPHandler.patch(self) def tearDown(self): - MockHTTPHandler.unpatch() # DNS socket.getaddrinfo = self.original_getaddrinfo @@ -113,7 +77,7 @@ def _get_current_testname(self): class MockResponseExtractors(MockResponse): - def content(self, req): + def contents(self): test, suite, module, cls, func = self.cls.id().split('.') path = os.path.join( os.path.dirname(CURRENT_PATH), @@ -123,7 +87,7 @@ def content(self, req): "%s.html" % func) path = os.path.abspath(path) content = FileHelper.loadResourceFile(path) - return content + yield self.cls.data['url'], content.encode('utf-8') class TestExtractionBase(BaseMockTests): @@ -132,6 +96,14 @@ class TestExtractionBase(BaseMockTests): """ callback = MockResponseExtractors + def setUp(self): + # patch DNS + self.original_getaddrinfo = socket.getaddrinfo + socket.getaddrinfo = self.new_getaddrinfo + + def tearDown(self): + socket.getaddrinfo = self.original_getaddrinfo + def getRawHtml(self): test, suite, module, cls, func = self.id().split('.') path = os.path.join( @@ -203,8 +175,12 @@ def runArticleAssertions(self, article, fields): self.assertEqual(expected_value, result_value, msg=msg) def extract(self, instance): - article = instance.extract(url=self.data['url']) - return article + article_url = self.data['url'] + with requests_mock.mock() as m: + for url, content in self.callback(self).contents(): + m.get(url, content=content) + article = instance.extract(url=article_url) + return article def getConfig(self): config = Configuration() diff --git a/tests/extractors/content.py b/tests/extractors/content.py index 30dc2754..854c4bd1 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -20,7 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase from goose.text import StopWordsChinese from goose.text import StopWordsArabic diff --git a/tests/extractors/images.py b/tests/extractors/images.py index e47a1dde..9c089fe2 100644 --- a/tests/extractors/images.py +++ b/tests/extractors/images.py @@ -20,13 +20,15 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +from __future__ import absolute_import + import os import json import hashlib import unittest -from base import MockResponse -from base import TestExtractionBase +from .base import MockResponse +from .base import TestExtractionBase from goose.configuration import Configuration from goose.image import Image @@ -40,8 +42,8 @@ class MockResponseImage(MockResponse): - def image_content(self, req): - md5_hash = hashlib.md5(req.get_full_url()).hexdigest() + def image_content(self, url): + md5_hash = hashlib.md5(url.encode('utf-8')).hexdigest() current_test = self.cls._get_current_testname() path = os.path.join( os.path.dirname(CURRENT_PATH), @@ -51,12 +53,15 @@ def image_content(self, req): current_test, md5_hash) path = os.path.abspath(path) - f = open(path, 'rb') - content = f.read() - f.close() - return content - - def html_content(self, req): + try: + f = open(path, 'rb') + content = f.read() + f.close() + return content + except Exception: + return None + + def html_content(self): current_test = self.cls._get_current_testname() path = os.path.join( os.path.dirname(CURRENT_PATH), @@ -66,12 +71,14 @@ def html_content(self, req): current_test, "%s.html" % current_test) path = os.path.abspath(path) - return FileHelper.loadResourceFile(path) - - def content(self, req): - if self.cls.data['url'] == req.get_full_url(): - return self.html_content(req) - return self.image_content(req) + return FileHelper.loadResourceFile(path).encode('utf-8') + + def contents(self): + yield self.cls.data['url'], self.html_content() + img_url = self.cls.data['expected']['top_image']['src'] + if img_url: + yield img_url, self.image_content(img_url) + # self.image_content() class ImageExtractionTests(TestExtractionBase): diff --git a/tests/extractors/links.py b/tests/extractors/links.py index 8539465e..ea15a459 100644 --- a/tests/extractors/links.py +++ b/tests/extractors/links.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleLinks(TestExtractionBase): diff --git a/tests/extractors/metas.py b/tests/extractors/metas.py index fd45915a..a4eef74c 100644 --- a/tests/extractors/metas.py +++ b/tests/extractors/metas.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestMetas(TestExtractionBase): diff --git a/tests/extractors/opengraph.py b/tests/extractors/opengraph.py index 415a784c..a0616227 100644 --- a/tests/extractors/opengraph.py +++ b/tests/extractors/opengraph.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestOpenGraph(TestExtractionBase): diff --git a/tests/extractors/publishdate.py b/tests/extractors/publishdate.py index 8d2a13b9..355250d5 100644 --- a/tests/extractors/publishdate.py +++ b/tests/extractors/publishdate.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestPublishDate(TestExtractionBase): diff --git a/tests/extractors/tags.py b/tests/extractors/tags.py index 22b17129..2f5562ba 100644 --- a/tests/extractors/tags.py +++ b/tests/extractors/tags.py @@ -21,7 +21,9 @@ limitations under the License. 
""" -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleTags(TestExtractionBase): diff --git a/tests/extractors/title.py b/tests/extractors/title.py index 09170205..c6f7813c 100644 --- a/tests/extractors/title.py +++ b/tests/extractors/title.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestTitle(TestExtractionBase): diff --git a/tests/extractors/tweets.py b/tests/extractors/tweets.py index 50300f43..3f72a604 100644 --- a/tests/extractors/tweets.py +++ b/tests/extractors/tweets.py @@ -20,8 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. """ +from __future__ import absolute_import -from base import TestExtractionBase +from .base import TestExtractionBase class TestArticleTweet(TestExtractionBase): diff --git a/tests/extractors/videos.py b/tests/extractors/videos.py index 10be15ff..0350c8c3 100644 --- a/tests/extractors/videos.py +++ b/tests/extractors/videos.py @@ -20,7 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class ImageExtractionTests(TestExtractionBase): diff --git a/tests/parsers.py b/tests/parsers.py index 6614368d..6e5e1986 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -21,7 +21,12 @@ limitations under the License. """ import os -import unittest +try: + import unittest2 as unittest # Need to support skipIf in python 2.6 +except ImportError: + import unittest + +import six from goose.utils import FileHelper from goose.parsers import Parser @@ -254,11 +259,28 @@ def test_delAttribute(self): # remove an unexistant attribute self.parser.delAttribute(div, attr="bla") + def test_encoding(self): + """ + If pass unicode string to lxml.html.fromstring with encoding set in document will receive: + "ValueError: Unicode strings with encoding declaration are not supported. + Please use bytes input or XML fragments without declaration." + Test for this case. + """ + html = u""" + + """ + html += u'' + html += u'

Я рядочок

' + html += u'' + self.parser.fromstring(html) + class TestParser(ParserBase): pass class TestParserSoup(ParserBase): + + @unittest.skipIf(six.PY3, "supported only in python2") def setUp(self): self.parser = ParserSoup
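
Taken together, the network and configuration changes alter how errors surface: fetching now goes through a requests.Session, and with the new strict flag (default True) a failed fetch raises NetworkError from the new goose.exceptions module instead of silently returning None. A minimal usage sketch of the post-patch API follows, in the doctest style the README already uses; the URL and the appended 'entry-content' pattern are illustrative assumptions, not values from the patch, and it assumes Goose() accepts a Configuration instance as in the existing constructor:

    >>> from goose import Goose
    >>> from goose.configuration import Configuration
    >>> from goose.exceptions import NetworkError
    >>> config = Configuration()
    >>> config.strict = True  # raise NetworkError on HTTP failures instead of returning None
    >>> # site-specific hint consumed by ContentExtractor.get_known_article_tags()
    >>> config.known_context_patterns.append({'attr': 'class', 'value': 'entry-content'})
    >>> g = Goose(config)
    >>> try:
    ...     article = g.extract(url='http://example.com/article.html')
    ...     print(article.cleaned_text)
    ... except NetworkError as err:
    ...     print('fetch failed: %s %s' % (err.status_code, err.reason))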
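
The charset-aware parse path in Parser.fromstring is driven by the new get_encodings_from_content helper, whose meta-tag patterns are reconstructed above from the cited requests-toolbelt source. A small sketch of that detection step, assuming those reconstructed patterns; the HTML byte string is an invented example:

    >>> from goose.text import get_encodings_from_content
    >>> html = b'<html><head><meta charset="windows-1251"></head><body></body></html>'
    >>> get_encodings_from_content(html)  # bytes input on Python 3
    [b'windows-1251']
    >>> # Parser.fromstring takes the first hit (or None) and parses the
    >>> # document with an lxml.html.HTMLParser built for that encoding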