From 50981db0040dd18f4707ddb54c358efdd25bc8da Mon Sep 17 00:00:00 2001 From: Rob McDaniel Date: Fri, 13 Mar 2015 19:58:18 -0700 Subject: [PATCH 1/2] fixing new york times content extraction failure --- goose/extractors/content.py | 4 +++- goose/version.py | 2 +- tests/data/extractors/content/test_newyorktimes.html | 0 tests/data/extractors/content/test_newyorktimes.json | 0 4 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 tests/data/extractors/content/test_newyorktimes.html create mode 100644 tests/data/extractors/content/test_newyorktimes.json diff --git a/goose/extractors/content.py b/goose/extractors/content.py index e0703d55..6f16f0ca 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -26,9 +26,11 @@ KNOWN_ARTICLE_CONTENT_TAGS = [ + {'tag': 'article'}, {'attr': 'itemprop', 'value': 'articleBody'}, {'attr': 'class', 'value': 'post-content'}, - {'tag': 'article'}, + {'attr': 'class', 'value': "story-body-text"}, + {'attr': 'class', 'value': "story-content"}, ] diff --git a/goose/version.py b/goose/version.py index fedcbb6d..bfb200a2 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License. """ -version_info = (1, 0, 25) +version_info = (1, 0, 26) __version__ = ".".join(map(str, version_info)) diff --git a/tests/data/extractors/content/test_newyorktimes.html b/tests/data/extractors/content/test_newyorktimes.html new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/extractors/content/test_newyorktimes.json b/tests/data/extractors/content/test_newyorktimes.json new file mode 100644 index 00000000..e69de29b From d3ce67b92f8807c26c991ba29150259c63a8f344 Mon Sep 17 00:00:00 2001 From: Rob McDaniel Date: Wed, 16 Dec 2015 14:30:59 -0800 Subject: [PATCH 2/2] fixing index error in title_words for case where title is single character which is also a splitter --- goose/extractors/title.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 31d69840..13d2f102 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -63,7 +63,7 @@ def clean_title(self, title): # check if last letter is in TITLE_SPLITTERS # if so remove it - if title_words[-1] in TITLE_SPLITTERS: + if len(title_words) != 0 and title_words[-1] in TITLE_SPLITTERS: title_words.pop(-1) # rebuild the title