From ea0583c38420d17cb5c35c17be6d6b6dc3a554f8 Mon Sep 17 00:00:00 2001
From: Matt Jackson <email@mwjackson.net>
Date: Fri, 18 Jul 2014 16:22:08 +0100
Subject: [PATCH 1/6] try to find images using least costly methods

Defer full image retrieval (the most costly check) until last, and make only
that step configurable instead of switching all image extraction on or off.
Use curated tags (Open Graph, Twitter Card) where possible.
---
 goose/crawler.py           |  7 +++----
 goose/images/extractors.py | 39 ++++++++++++++++++++++++++++++--------
 goose/images/image.py      |  2 +-
 3 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/goose/crawler.py b/goose/crawler.py
index 211d410e..56995998 100644
--- a/goose/crawler.py
+++ b/goose/crawler.py
@@ -117,12 +117,11 @@ def crawl(self, crawl_candidate):
         # let's process it
         if self.article.top_node is not None:
 
-            # video handeling
+            # video handling
             self.video_extractor.get_videos()
 
-            # image handeling
-            if self.config.enable_image_fetching:
-                self.get_image()
+            # image handling
+            self.get_image()
 
             # post cleanup
             self.article.top_node = self.extractor.post_cleanup()
diff --git a/goose/images/extractors.py b/goose/images/extractors.py
index 08a207b0..12142549 100644
--- a/goose/images/extractors.py
+++ b/goose/images/extractors.py
@@ -81,27 +81,37 @@ def __init__(self, config, article):
         )
 
     def get_best_image(self, doc, topNode):
+        # first check for known occurrences
         image = self.check_known_elements()
         if image:
             return image
 
-        image = self.check_large_images(topNode, 0, 0)
-        if image:
-            return image
-
+        # then check for curated tags
         image = self.check_meta_tag()
         if image:
             return image
+
+        # then make best (and most costly) guess
+        if self.config.enable_image_fetching:
+            image = self.check_large_images(topNode, 0, 0)
+            if image:
+                return image
+
         return Image()
 
     def check_meta_tag(self):
-        # check link tag
-        image = self.check_link_tag()
+        # check opengraph tag
+        image = self.check_opengraph_tag()
         if image:
             return image
 
-        # check opengraph tag
-        image = self.check_opengraph_tag()
+        # check twitter card tag
+        image = self.check_twitter_card_tag()
+        if image:
+            return image
+
+        # check link tag
+        image = self.check_link_tag()
         if image:
             return image
 
@@ -337,6 +347,19 @@ def check_opengraph_tag(self):
                 return self.get_image(item, src, extraction_type='opengraph')
         return None
 
+    def check_twitter_card_tag(self):
+        """\
+        checks to see if we were able to
+        find twitter card tags on this page
+        """
+        node = self.article.raw_doc
+        meta = self.parser.getElementsByTag(node, tag='meta', attr='property', value='twitter:image')
+        for item in meta:
+            src = self.parser.getAttribute(item, attr='content')
+            if src:
+                return self.get_image(item, src, extraction_type='twitter')
+        return None
+
     def get_local_image(self, src):
         """\
         returns the bytes of the image file on disk
diff --git a/goose/images/image.py b/goose/images/image.py
index 351e3396..f0da4037 100644
--- a/goose/images/image.py
+++ b/goose/images/image.py
@@ -42,7 +42,7 @@ def __init__(self):
         self.width = 0
 
         # what kind of image extraction was used for this?
-        # bestGuess, linkTag, openGraph tags?
+        # bestGuess, linkTag, openGraph tags, twitter card?
         self.extraction_type = "NA"
 
         # stores how many bytes this image is.

From 3fd05812af4c01dad9eb310c026901bf5f8d230b Mon Sep 17 00:00:00 2001
From: Matt Jackson <email@mwjackson.net>
Date: Fri, 18 Jul 2014 16:24:13 +0100
Subject: [PATCH 2/6] ignore .env

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index bea68953..ca768ce2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ env/
 ._*
 venv/
 goose_extractor.egg-info/
+.env
\ No newline at end of file

From 2ac82364b5b68666bace4f09abf3b5d3bea7fd6e Mon Sep 17 00:00:00 2001
From: Matt Jackson <email@mwjackson.net>
Date: Fri, 18 Jul 2014 16:26:45 +0100
Subject: [PATCH 3/6] added socket timeout to image retrieval

Handles a 444 from nginx (connection closed without sending a response) a little better.
---
 goose/images/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/goose/images/utils.py b/goose/images/utils.py
index 2767416f..2ff69d57 100644
--- a/goose/images/utils.py
+++ b/goose/images/utils.py
@@ -113,7 +113,7 @@ def clean_src_string(self, src):
     def fetch(self, http_client, src):
         try:
             req = urllib2.Request(src)
-            f = urllib2.urlopen(req)
+            f = urllib2.urlopen(req, timeout=30)
             data = f.read()
             return data
         except:

From ad5d08727eade88e483dc33c13d3a7165085357e Mon Sep 17 00:00:00 2001
From: Matt Jackson <email@mwjackson.net>
Date: Fri, 18 Jul 2014 16:31:48 +0100
Subject: [PATCH 4/6] made timeout 15 seconds

---
 goose/images/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/goose/images/utils.py b/goose/images/utils.py
index 2ff69d57..aa9dba39 100644
--- a/goose/images/utils.py
+++ b/goose/images/utils.py
@@ -113,7 +113,7 @@ def clean_src_string(self, src):
     def fetch(self, http_client, src):
         try:
             req = urllib2.Request(src)
-            f = urllib2.urlopen(req, timeout=30)
+            f = urllib2.urlopen(req, timeout=15)
             data = f.read()
             return data
         except:

From d18ed20d21c6a4dbee6ab4d91f32cdc68e242608 Mon Sep 17 00:00:00 2001
From: Matt Jackson <email@mwjackson.net>
Date: Thu, 24 Jul 2014 12:57:13 +0100
Subject: [PATCH 5/6] only check image dimensions if image fetching enabled

---
 goose/images/extractors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/goose/images/extractors.py b/goose/images/extractors.py
index 12142549..34e03c19 100644
--- a/goose/images/extractors.py
+++ b/goose/images/extractors.py
@@ -223,7 +223,7 @@ def get_image(self, element, src, score=100, extraction_type="N/A"):
         # check if we have a local image
         # in order to add more information
         # on the Image object
-        local_image = self.get_local_image(image.src)
+        local_image = self.get_local_image(image.src) if self.config.enable_image_fetching else None
         if local_image:
             image.bytes = local_image.bytes
             image.height = local_image.height

From ef910678c8fe1d2326ccdd5ae836890730a6c7d3 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Mon, 12 Jan 2015 17:44:43 +0200
Subject: [PATCH 6/6] Fix title extraction if title is same as site_name

---
 .gitignore                                         |  2 ++
 goose/extractors/title.py                          |  3 +++
 .../title/test_title_opengraph_empty.html          | 14 ++++++++++++++
 .../title/test_title_opengraph_empty.json          |  6 ++++++
 tests/extractors/title.py                          |  5 +++++
 5 files changed, 30 insertions(+)
 create mode 100644 tests/data/extractors/title/test_title_opengraph_empty.html
 create mode 100644 tests/data/extractors/title/test_title_opengraph_empty.json

diff --git a/.gitignore b/.gitignore
index 4bfadf57..4a5b33ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,5 @@ env/
 *.egg
 venv/
 goose_extractor.egg-info/
+.env
+.directory
diff --git a/goose/extractors/title.py b/goose/extractors/title.py
index 8104c52b..f253f5b8 100644
--- a/goose/extractors/title.py
+++ b/goose/extractors/title.py
@@ -51,6 +51,9 @@ def clean_title(self, title):
         # my wonderfull article | TechCrunch
         title_words = title.split()
 
+        if not title_words:
+            return str()
+
         # check if first letter is in TITLE_SPLITTERS
         # if so remove it
         if title_words[0] in TITLE_SPLITTERS:
diff --git a/tests/data/extractors/title/test_title_opengraph_empty.html b/tests/data/extractors/title/test_title_opengraph_empty.html
new file mode 100644
index 00000000..5d270152
--- /dev/null
+++ b/tests/data/extractors/title/test_title_opengraph_empty.html
@@ -0,0 +1,14 @@
+<html>
+    <head>
+      <meta property="og:site_name" content="TechCrunch"/>
+      <meta property="og:title" content="TechCrunch"/>
+      <title>Wrong article title - website</title>
+    </head>
+    <body>
+        <div>
+            <p>
+              TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
+            </p>
+        </div>
+    </body>
+</html>
diff --git a/tests/data/extractors/title/test_title_opengraph_empty.json b/tests/data/extractors/title/test_title_opengraph_empty.json
new file mode 100644
index 00000000..376fc189
--- /dev/null
+++ b/tests/data/extractors/title/test_title_opengraph_empty.json
@@ -0,0 +1,6 @@
+{
+    "url": "http://exemple.com/test_opengraphcontent",
+    "expected": {
+        "title": ""
+    }
+}
diff --git a/tests/extractors/title.py b/tests/extractors/title.py
index 36bee9a2..79d5f9c5 100644
--- a/tests/extractors/title.py
+++ b/tests/extractors/title.py
@@ -30,3 +30,8 @@ def test_title_opengraph(self):
         article = self.getArticle()
         fields = ['title']
         self.runArticleAssertions(article=article, fields=fields)
+
+    def test_title_opengraph_empty(self):
+        article = self.getArticle()
+        fields = ['title']
+        self.runArticleAssertions(article=article, fields=fields)