Skip to content
Open
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ env/
*.egg
venv/
goose_extractor.egg-info/
.env
.directory
3 changes: 1 addition & 2 deletions goose/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,7 @@ def crawl(self, crawl_candidate):
self.video_extractor.get_videos()

# image handling
if self.config.enable_image_fetching:
self.get_image()
self.get_image()

# post cleanup
self.article.top_node = self.extractor.post_cleanup()
Expand Down
41 changes: 32 additions & 9 deletions goose/extractors/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,27 +74,37 @@ def __init__(self, config, article):
)

def get_best_image(self, doc, topNode):
    """\
    returns the best image candidate for the article,
    trying the cheapest sources first:

    1. known elements (check_known_elements)
    2. curated meta tags (check_meta_tag)
    3. large images found from topNode (check_large_images),
       only when config.enable_image_fetching is set

    doc is accepted for interface compatibility but is not
    used here. When nothing is found an empty Image() is
    returned, so the caller always gets an object back.
    """
    # first check for known occurrences
    image = self.check_known_elements()
    if image:
        return image

    # then check for curated tags
    image = self.check_meta_tag()
    if image:
        return image

    # then make best (and most costly) guess
    # this is the only step gated by the config flag: running it
    # unconditionally before the meta tags would make the flag useless
    if self.config.enable_image_fetching:
        image = self.check_large_images(topNode, 0, 0)
        if image:
            return image

    return Image()

def check_meta_tag(self):
    """\
    looks for an image declared in the page's meta data,
    trying opengraph, then twitter card, then link tags,
    and returns the first truthy result
    """
    # check opengraph tag
    image = self.check_opengraph_tag()
    if image:
        return image

    # check twitter card tag
    image = self.check_twitter_card_tag()
    if image:
        return image

    # check link tag
    image = self.check_link_tag()
    if image:
        return image

Expand Down Expand Up @@ -205,7 +215,7 @@ def get_image(self, element, src, score=100, extraction_type="N/A"):
# check if we have a local image
# in order to add more information
# on the Image object
local_image = self.get_local_image(image.src)
local_image = self.get_local_image(image.src) if self.config.enable_image_fetching else None
if local_image:
image.bytes = local_image.bytes
image.height = local_image.height
Expand Down Expand Up @@ -329,6 +339,19 @@ def check_opengraph_tag(self):
return self.get_image(item, src, extraction_type='opengraph')
return None

def check_twitter_card_tag(self):
    """\
    checks to see if we were able to
    find twitter card tags on this page

    Twitter Card markup declares the image as
    <meta name="twitter:image" content="...">, so the "name"
    attribute is queried first; "property" is kept as a
    fallback for pages that write it opengraph-style.

    returns the result of self.get_image() for the first
    matching tag with a non-empty content attribute,
    or None when there is no such tag
    """
    node = self.article.raw_doc
    for attr in ('name', 'property'):
        meta = self.parser.getElementsByTag(
            node, tag='meta', attr=attr, value='twitter:image')
        for item in meta:
            src = self.parser.getAttribute(item, attr='content')
            # tags with a missing or empty content are skipped
            if src:
                return self.get_image(item, src, extraction_type='twitter')
    return None

def get_local_image(self, src):
"""\
returns the bytes of the image file on disk
Expand Down
3 changes: 3 additions & 0 deletions goose/extractors/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ def clean_title(self, title):
# my wonderfull article | TechCrunch
title_words = title.split()

if not title_words:
return str()

# check if first letter is in TITLE_SPLITTERS
# if so remove it
if title_words[0] in TITLE_SPLITTERS:
Expand Down
2 changes: 1 addition & 1 deletion goose/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self):
self.width = 0

# what kind of image extraction was used for this?
# bestGuess, linkTag, openGraph tags?
# bestGuess, linkTag, openGraph tags, twitter card?
self.extraction_type = "NA"

# stores how many bytes this image is.
Expand Down
2 changes: 1 addition & 1 deletion goose/utils/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def clean_src_string(self, src):
def fetch(self, http_client, src):
try:
req = urllib2.Request(src)
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=15)
data = f.read()
return data
except Exception:
Expand Down
14 changes: 14 additions & 0 deletions tests/data/extractors/title/test_title_opengraph_empty.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<html>
<head>
<meta property="og:site_name" content="TechCrunch"/>
<meta property="og:title" content="TechCrunch"/>
<title>Wrong article title - website</title>
</head>
<body>
<div>
<p>
TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. Still, if you wish to truly absorb Scala into your existing Java environment then you'll soon introduce cross-language dependencies.
</p>
</div>
</body>
</html>
6 changes: 6 additions & 0 deletions tests/data/extractors/title/test_title_opengraph_empty.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"url": "http://exemple.com/test_opengraphcontent",
"expected": {
"title": ""
}
}
5 changes: 5 additions & 0 deletions tests/extractors/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,8 @@ def test_title_opengraph(self):
article = self.getArticle()
fields = ['title']
self.runArticleAssertions(article=article, fields=fields)

def test_title_opengraph_empty(self):
    # only the title is compared against the expected values;
    # the matching fixture expects it to be the empty string
    # (presumably the fixture is picked up from this test's name
    # by getArticle -- confirm against the test base class)
    self.runArticleAssertions(article=self.getArticle(), fields=['title'])