diff --git a/.gitignore b/.gitignore index 4bfadf57..4a5b33ff 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ env/ *.egg venv/ goose_extractor.egg-info/ +.env +.directory diff --git a/goose/crawler.py b/goose/crawler.py index 34daf048..b3585dc5 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -180,8 +180,7 @@ def crawl(self, crawl_candidate): self.video_extractor.get_videos() # image handling - if self.config.enable_image_fetching: - self.get_image() + self.get_image() # post cleanup self.article.top_node = self.extractor.post_cleanup() diff --git a/goose/extractors/images.py b/goose/extractors/images.py index 3af44f5f..07d69c3c 100644 --- a/goose/extractors/images.py +++ b/goose/extractors/images.py @@ -74,27 +74,37 @@ def __init__(self, config, article): ) def get_best_image(self, doc, topNode): + # first check for known occurrences image = self.check_known_elements() if image: return image - image = self.check_large_images(topNode, 0, 0) - if image: - return image - + # then check for curated tags image = self.check_meta_tag() if image: return image + + # then make best (and most costly) guess + if self.config.enable_image_fetching: + image = self.check_large_images(topNode, 0, 0) + if image: + return image + return Image() def check_meta_tag(self): - # check link tag - image = self.check_link_tag() + # check opengraph tag + image = self.check_opengraph_tag() if image: return image - # check opengraph tag - image = self.check_opengraph_tag() + # check twitter card tag + image = self.check_twitter_card_tag() + if image: + return image + + # check link tag + image = self.check_link_tag() if image: return image @@ -205,7 +215,7 @@ def get_image(self, element, src, score=100, extraction_type="N/A"): # check if we have a local image # in order to add more information # on the Image object - local_image = self.get_local_image(image.src) + local_image = self.get_local_image(image.src) if self.config.enable_image_fetching else None if local_image: image.bytes = local_image.bytes image.height = local_image.height @@ -329,6 +339,19 @@ def check_opengraph_tag(self): return self.get_image(item, src, extraction_type='opengraph') return None + def check_twitter_card_tag(self): + """\ + checks to see if we were able to + find twitter card tags on this page + """ + node = self.article.raw_doc + meta = self.parser.getElementsByTag(node, tag='meta', attr='property', value='twitter:image') + for item in meta: + src = self.parser.getAttribute(item, attr='content') + if src: + return self.get_image(item, src, extraction_type='twitter') + return None + def get_local_image(self, src): """\ returns the bytes of the image file on disk diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 8104c52b..f253f5b8 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -51,6 +51,9 @@ def clean_title(self, title): # my wonderfull article | TechCrunch title_words = title.split() + if not title_words: + return str() + # check if first letter is in TITLE_SPLITTERS # if so remove it if title_words[0] in TITLE_SPLITTERS: diff --git a/goose/image.py b/goose/image.py index 351e3396..f0da4037 100644 --- a/goose/image.py +++ b/goose/image.py @@ -42,7 +42,7 @@ def __init__(self): self.width = 0 # what kind of image extraction was used for this? - # bestGuess, linkTag, openGraph tags? + # bestGuess, linkTag, openGraph tags, twitter card? self.extraction_type = "NA" # stores how many bytes this image is. diff --git a/goose/utils/images.py b/goose/utils/images.py index 388d5c85..0475ab3b 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -116,7 +116,7 @@ def clean_src_string(self, src): def fetch(self, http_client, src): try: req = urllib2.Request(src) - f = urllib2.urlopen(req) + f = urllib2.urlopen(req, timeout=15) data = f.read() return data except Exception: diff --git a/tests/data/extractors/title/test_title_opengraph_empty.html b/tests/data/extractors/title/test_title_opengraph_empty.html new file mode 100644 index 00000000..5d270152 --- /dev/null +++ b/tests/data/extractors/title/test_title_opengraph_empty.html @@ -0,0 +1,14 @@ + + + + + Wrong article title - website + + +
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+ + diff --git a/tests/data/extractors/title/test_title_opengraph_empty.json b/tests/data/extractors/title/test_title_opengraph_empty.json new file mode 100644 index 00000000..376fc189 --- /dev/null +++ b/tests/data/extractors/title/test_title_opengraph_empty.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "title": "" + } +} diff --git a/tests/extractors/title.py b/tests/extractors/title.py index 36bee9a2..79d5f9c5 100644 --- a/tests/extractors/title.py +++ b/tests/extractors/title.py @@ -30,3 +30,8 @@ def test_title_opengraph(self): article = self.getArticle() fields = ['title'] self.runArticleAssertions(article=article, fields=fields) + + def test_title_opengraph_empty(self): + article = self.getArticle() + fields = ['title'] + self.runArticleAssertions(article=article, fields=fields)