From ea0583c38420d17cb5c35c17be6d6b6dc3a554f8 Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Fri, 18 Jul 2014 16:22:08 +0100 Subject: [PATCH 1/6] try to find images using least costly methods save full image retrieval until last and make configurable (instead of all images on/off) use curated tags (opengraph, twitter card) where possible --- goose/crawler.py | 7 +++---- goose/images/extractors.py | 39 ++++++++++++++++++++++++++++++-------- goose/images/image.py | 2 +- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/goose/crawler.py b/goose/crawler.py index 211d410e..56995998 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -117,12 +117,11 @@ def crawl(self, crawl_candidate): # let's process it if self.article.top_node is not None: - # video handeling + # video handling self.video_extractor.get_videos() - # image handeling - if self.config.enable_image_fetching: - self.get_image() + # image handling + self.get_image() # post cleanup self.article.top_node = self.extractor.post_cleanup() diff --git a/goose/images/extractors.py b/goose/images/extractors.py index 08a207b0..12142549 100644 --- a/goose/images/extractors.py +++ b/goose/images/extractors.py @@ -81,27 +81,37 @@ def __init__(self, config, article): ) def get_best_image(self, doc, topNode): + # first check for known occurrences image = self.check_known_elements() if image: return image - image = self.check_large_images(topNode, 0, 0) - if image: - return image - + # then check for curated tags image = self.check_meta_tag() if image: return image + + # then make best (and most costly) guess + if self.config.enable_image_fetching: + image = self.check_large_images(topNode, 0, 0) + if image: + return image + return Image() def check_meta_tag(self): - # check link tag - image = self.check_link_tag() + # check opengraph tag + image = self.check_opengraph_tag() if image: return image - # check opengraph tag - image = self.check_opengraph_tag() + # check twitter card tag + image = self.check_twitter_card_tag() + if image: + return image + + # check link tag + image = self.check_link_tag() if image: return image @@ -337,6 +347,19 @@ def check_opengraph_tag(self): return self.get_image(item, src, extraction_type='opengraph') return None + def check_twitter_card_tag(self): + """\ + checks to see if we were able to + find twitter card tags on this page + """ + node = self.article.raw_doc + meta = self.parser.getElementsByTag(node, tag='meta', attr='property', value='twitter:image') + for item in meta: + src = self.parser.getAttribute(item, attr='content') + if src: + return self.get_image(item, src, extraction_type='twitter') + return None + def get_local_image(self, src): """\ returns the bytes of the image file on disk diff --git a/goose/images/image.py b/goose/images/image.py index 351e3396..f0da4037 100644 --- a/goose/images/image.py +++ b/goose/images/image.py @@ -42,7 +42,7 @@ def __init__(self): self.width = 0 # what kind of image extraction was used for this? - # bestGuess, linkTag, openGraph tags? + # bestGuess, linkTag, openGraph tags, twitter card? self.extraction_type = "NA" # stores how many bytes this image is. From 3fd05812af4c01dad9eb310c026901bf5f8d230b Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Fri, 18 Jul 2014 16:24:13 +0100 Subject: [PATCH 2/6] ignore .env --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index bea68953..ca768ce2 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ env/ ._* venv/ goose_extractor.egg-info/ +.env \ No newline at end of file From 2ac82364b5b68666bace4f09abf3b5d3bea7fd6e Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Fri, 18 Jul 2014 16:26:45 +0100 Subject: [PATCH 3/6] added socket timeout to image retrieval handles 444 back from nginx a little better --- goose/images/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/images/utils.py b/goose/images/utils.py index 2767416f..2ff69d57 100644 --- a/goose/images/utils.py +++ b/goose/images/utils.py @@ -113,7 +113,7 @@ def clean_src_string(self, src): def fetch(self, http_client, src): try: req = urllib2.Request(src) - f = urllib2.urlopen(req) + f = urllib2.urlopen(req, timeout=30) data = f.read() return data except: From ad5d08727eade88e483dc33c13d3a7165085357e Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Fri, 18 Jul 2014 16:31:48 +0100 Subject: [PATCH 4/6] made timeout 15 seconds --- goose/images/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/images/utils.py b/goose/images/utils.py index 2ff69d57..aa9dba39 100644 --- a/goose/images/utils.py +++ b/goose/images/utils.py @@ -113,7 +113,7 @@ def clean_src_string(self, src): def fetch(self, http_client, src): try: req = urllib2.Request(src) - f = urllib2.urlopen(req, timeout=30) + f = urllib2.urlopen(req, timeout=15) data = f.read() return data except: From d18ed20d21c6a4dbee6ab4d91f32cdc68e242608 Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Thu, 24 Jul 2014 12:57:13 +0100 Subject: [PATCH 5/6] only check image dimensions if image fetching enabled --- goose/images/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/images/extractors.py b/goose/images/extractors.py index 12142549..34e03c19 100644 --- a/goose/images/extractors.py +++ b/goose/images/extractors.py @@ -223,7 +223,7 @@ def get_image(self, element, src, score=100, extraction_type="N/A"): # check if we have a local image # in order to add more information # on the Image object - local_image = self.get_local_image(image.src) + local_image = self.get_local_image(image.src) if self.config.enable_image_fetching else None if local_image: image.bytes = local_image.bytes image.height = local_image.height From ef910678c8fe1d2326ccdd5ae836890730a6c7d3 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Mon, 12 Jan 2015 17:44:43 +0200 Subject: [PATCH 6/6] Fix title extraction if title is same as site_name --- .gitignore | 2 ++ goose/extractors/title.py | 3 +++ .../title/test_title_opengraph_empty.html | 14 ++++++++++++++ .../title/test_title_opengraph_empty.json | 6 ++++++ tests/extractors/title.py | 5 +++++ 5 files changed, 30 insertions(+) create mode 100644 tests/data/extractors/title/test_title_opengraph_empty.html create mode 100644 tests/data/extractors/title/test_title_opengraph_empty.json diff --git a/.gitignore b/.gitignore index 4bfadf57..4a5b33ff 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ env/ *.egg venv/ goose_extractor.egg-info/ +.env +.directory diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 8104c52b..f253f5b8 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -51,6 +51,9 @@ def clean_title(self, title): # my wonderfull article | TechCrunch title_words = title.split() + if not title_words: + return str() + # check if first letter is in TITLE_SPLITTERS # if so remove it if title_words[0] in TITLE_SPLITTERS: diff --git a/tests/data/extractors/title/test_title_opengraph_empty.html b/tests/data/extractors/title/test_title_opengraph_empty.html new file mode 100644 index 00000000..5d270152 --- /dev/null +++ b/tests/data/extractors/title/test_title_opengraph_empty.html @@ -0,0 +1,14 @@ + + + + + Wrong article title - website + + +
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+ + diff --git a/tests/data/extractors/title/test_title_opengraph_empty.json b/tests/data/extractors/title/test_title_opengraph_empty.json new file mode 100644 index 00000000..376fc189 --- /dev/null +++ b/tests/data/extractors/title/test_title_opengraph_empty.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "title": "" + } +} diff --git a/tests/extractors/title.py b/tests/extractors/title.py index 36bee9a2..79d5f9c5 100644 --- a/tests/extractors/title.py +++ b/tests/extractors/title.py @@ -30,3 +30,8 @@ def test_title_opengraph(self): article = self.getArticle() fields = ['title'] self.runArticleAssertions(article=article, fields=fields) + + def test_title_opengraph_empty(self): + article = self.getArticle() + fields = ['title'] + self.runArticleAssertions(article=article, fields=fields)