diff --git a/goose/configuration.py b/goose/configuration.py index fcfa5b9a..94f5dadb 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -39,7 +39,7 @@ class Configuration(object): def __init__(self): # What's the minimum bytes for an image we'd accept is, - # alot of times we want to filter out the author's little images + # a lot of times we want to filter out the author's little images # in the beginning of the article self.images_min_bytes = 4500 diff --git a/goose/extractors/content.py b/goose/extractors/content.py index e0703d55..c2b7941c 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -144,12 +144,12 @@ def calculate_best_node(self): def is_boostable(self, node): """\ - alot of times the first paragraph might be the caption under an image + a lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs so we'll want to make sure that the next sibling is a paragraph and has at - least some substatial weight to it + least some substantial weight to it """ para = "p" steps_away = 0 diff --git a/goose/extractors/title.py b/goose/extractors/title.py index a59dca92..74775c0c 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -47,8 +47,8 @@ def clean_title(self, title): title = pattern.sub("", title).strip() # split the title in words - # TechCrunch | my wonderfull article - # my wonderfull article | TechCrunch + # TechCrunch | my wonderful article + # my wonderful article | TechCrunch title_words = title.split() # check for an empty title