pistolero · vetal4444 · Jul 18, 2014 · Jul 18, 2014 · Jul 18, 2014 · Jul 18, 2014
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,5 @@ env/
 *.egg
 venv/
 goose_extractor.egg-info/
+.env
+.directory
diff --git a/goose/crawler.py b/goose/crawler.py
@@ -180,8 +180,7 @@ def crawl(self, crawl_candidate):
             self.video_extractor.get_videos()
 
             # image handling
-            if self.config.enable_image_fetching:
-                self.get_image()
+            self.get_image()
 
             # post cleanup
             self.article.top_node = self.extractor.post_cleanup()

diff --git a/goose/extractors/images.py b/goose/extractors/images.py
@@ -74,27 +74,37 @@ def __init__(self, config, article):
         )
 
     def get_best_image(self, doc, topNode):
+        # first check for known occurrences
         image = self.check_known_elements()
         if image:
             return image
 
-        image = self.check_large_images(topNode, 0, 0)
-        if image:
-            return image
-
+        # then check for curated tags
         image = self.check_meta_tag()
         if image:
             return image
+
+        # then make best (and most costly) guess
+        if self.config.enable_image_fetching:
+            image = self.check_large_images(topNode, 0, 0)
+            if image:
+                return image
+
         return Image()
 
     def check_meta_tag(self):
-        # check link tag
-        image = self.check_link_tag()
+        # check opengraph tag
+        image = self.check_opengraph_tag()
         if image:
             return image
 
-        # check opengraph tag
-        image = self.check_opengraph_tag()
+        # check twitter card tag
+        image = self.check_twitter_card_tag()
+        if image:
+            return image
+
+        # check link tag
+        image = self.check_link_tag()
         if image:
             return image
 
@@ -205,7 +215,7 @@ def get_image(self, element, src, score=100, extraction_type="N/A"):
         # check if we have a local image
         # in order to add more information
         # on the Image object
-        local_image = self.get_local_image(image.src)
+        local_image = self.get_local_image(image.src) if self.config.enable_image_fetching else None
         if local_image:
             image.bytes = local_image.bytes
             image.height = local_image.height
@@ -329,6 +339,19 @@ def check_opengraph_tag(self):
                 return self.get_image(item, src, extraction_type='opengraph')
         return None
 
+    def check_twitter_card_tag(self):
+        """\
+        checks for a twitter card image meta tag, keyed by
+        `name` (twitter's documented markup) or `property`
+        """
+        for attr in ('name', 'property'):
+            meta = self.parser.getElementsByTag(self.article.raw_doc, tag='meta', attr=attr, value='twitter:image')
+            for item in meta:
+                src = self.parser.getAttribute(item, attr='content')
+                if src:
+                    return self.get_image(item, src, extraction_type='twitter')
+        return None
+
     def get_local_image(self, src):
         """\
         returns the bytes of the image file on disk

diff --git a/goose/extractors/title.py b/goose/extractors/title.py
@@ -51,6 +51,9 @@ def clean_title(self, title):
         # my wonderfull article | TechCrunch
         title_words = title.split()
 
+        if not title_words:
+            return str()
+
         # check if first letter is in TITLE_SPLITTERS
         # if so remove it
         if title_words[0] in TITLE_SPLITTERS:

diff --git a/goose/image.py b/goose/image.py
@@ -42,7 +42,7 @@ def __init__(self):
         self.width = 0
 
         # what kind of image extraction was used for this?
-        # bestGuess, linkTag, openGraph tags?
+        # bestGuess, linkTag, openGraph tags, twitter card?
         self.extraction_type = "NA"
 
         # stores how many bytes this image is.

diff --git a/goose/utils/images.py b/goose/utils/images.py
@@ -116,7 +116,7 @@ def clean_src_string(self, src):
     def fetch(self, http_client, src):
         try:
             req = urllib2.Request(src)
-            f = urllib2.urlopen(req)
+            f = urllib2.urlopen(req, timeout=15)
             data = f.read()
             return data
         except Exception:

diff --git a/tests/data/extractors/title/test_title_opengraph_empty.html b/tests/data/extractors/title/test_title_opengraph_empty.html
@@ -0,0 +1,14 @@
+<html>
+    <head>
+      <meta property="og:site_name" content="TechCrunch"/>
+      <meta property="og:title" content="TechCrunch"/>
+      <title>Wrong article title - website</title>
+    </head>
+    <body>
+        <div>
+            <p>
+              TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
+            </p>
+        </div>
+    </body>
+</html>
diff --git a/tests/data/extractors/title/test_title_opengraph_empty.json b/tests/data/extractors/title/test_title_opengraph_empty.json
@@ -0,0 +1,6 @@
+{
+    "url": "http://exemple.com/test_opengraphcontent",
+    "expected": {
+        "title": ""
+    }
+}
diff --git a/tests/extractors/title.py b/tests/extractors/title.py
@@ -30,3 +30,8 @@ def test_title_opengraph(self):
         article = self.getArticle()
         fields = ['title']
         self.runArticleAssertions(article=article, fields=fields)
+
+    def test_title_opengraph_empty(self):
+        # fixture has og:title equal to og:site_name; expected title is ""
+        parsed = self.getArticle()
+        self.runArticleAssertions(article=parsed, fields=['title'])
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,3 +11,5 @@ env/ @@
     *.egg
     venv/
     goose_extractor.egg-info/
+    .env
+    .directory