-
Notifications
You must be signed in to change notification settings - Fork 39
Expand file tree
/
Copy pathnewsscraper.py
More file actions
174 lines (142 loc) · 5.06 KB
/
newsscraper.py
File metadata and controls
174 lines (142 loc) · 5.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""A script for scraping news sites and writing latest articles to
json.
"""
import sys
import json
from time import mktime
from datetime import datetime
import feedparser as fp
import newspaper
from newspaper import Article
# Module-level accumulator for all scraped results; run() fills
# data["newspapers"] per company and serializes it to scraped_articles.json.
data = {}
data["newspapers"] = {}
def parse_config(fname):
    """Load and validate the JSON file describing the news sites.

    Args:
        fname: Path to a JSON file mapping company names to their settings.

    Returns:
        dict: The parsed configuration.

    Raises:
        ValueError: If any company entry lacks the mandatory 'link' key.
    """
    # Explicit encoding: without it the config would be decoded with the
    # platform default, which breaks on non-ASCII site names on some OSes.
    with open(fname, "r", encoding="utf-8") as data_file:
        cfg = json.load(data_file)

    # Every site must at least provide a homepage link; 'rss' is optional.
    for company, value in cfg.items():
        if "link" not in value:
            raise ValueError(f"Configuration item {company} missing obligatory 'link'.")
    return cfg
def _handle_rss(company, value, count, limit):
    """Scrape a company's articles via its RSS feed.

    RSS is the preferred source when a feed URL is configured, because
    feeds tend to provide more consistent and correct metadata than
    crawling. Leave the 'rss' attribute empty in the JSON config to
    skip this path.

    Returns the updated running article count and the paper's result dict.
    """
    feed = fp.parse(value["rss"])
    print(f"Downloading articles from {company}")
    paper_data = {
        "rss": value["rss"],
        "link": value["link"],
        "articles": [],
    }
    for entry in feed.entries:
        # Entries without a publish date are skipped to keep the dataset
        # consistent and to avoid crashing on the timestamp conversion.
        if not hasattr(entry, "published"):
            continue
        if count > limit:
            break
        published = datetime.fromtimestamp(mktime(entry.published_parsed)).isoformat()
        try:
            page = Article(entry.link)
            page.download()
            page.parse()
        except Exception as err:
            # Best effort: a failed download (e.g. 404) must not stop the
            # remaining articles from being fetched.
            print(err)
            print("continuing...")
            continue
        paper_data["articles"].append(
            {
                "link": entry.link,
                "published": published,
                "title": page.title,
                "text": page.text,
            }
        )
        print(f"{count} articles downloaded from {company}, url: {entry.link}")
        count += 1
    return count, paper_data
def _handle_fallback(company, value, count, limit):
    """Scrape a company's articles by crawling its site directly.

    Fallback used when no RSS feed is configured; relies on the
    newspaper library to discover and extract articles.

    Returns the updated running article count and the paper's result dict.
    """
    print(f"Building site for {company}")
    site = newspaper.build(value["link"], memoize_articles=False)
    paper_data = {"link": value["link"], "articles": []}
    undated_streak = 0
    for page in site.articles:
        if count > limit:
            break
        try:
            page.download()
            page.parse()
        except Exception as err:
            # Best effort: keep going past articles that fail to download.
            print(err)
            print("continuing...")
            continue
        # For consistency, undated articles are skipped; once more than 10
        # have accumulated without a dated success in between, the company
        # is abandoned entirely.
        if page.publish_date is None:
            print(f"{count} Article has date of type None...")
            undated_streak += 1
            if undated_streak > 10:
                print("Too many noneType dates, aborting...")
                undated_streak = 0
                break
            count += 1
            continue
        paper_data["articles"].append(
            {
                "title": page.title,
                "text": page.text,
                "link": page.url,
                "published": page.publish_date.isoformat(),
            }
        )
        print(
            f"{count} articles downloaded from {company} using newspaper, url: {page.url}"
        )
        count += 1
        undated_streak = 0
    return count, paper_data
def run(config, limit=4):
    """Take a config object of sites and urls, and an upper limit.

    Iterate through each news company, scraping via RSS when a feed is
    configured and by crawling otherwise, then write the accumulated
    result to scraped_articles.json.
    """
    for company, value in config.items():
        count = 1
        handler = _handle_rss if "rss" in value else _handle_fallback
        count, news_paper = handler(company, value, count, limit)
        data["newspapers"][company] = news_paper

    # Finally, persist everything that was scraped as a single JSON file.
    try:
        with open("scraped_articles.json", "w") as outfile:
            json.dump(data, outfile, indent=2)
    except Exception as err:
        print(err)
def main():
    """News site scraper entry point.

    Usage: newsscraper.py NewsPapers.json [--limit N]

    Exits with a usage message on malformed arguments or an unreadable
    configuration file.
    """
    args = list(sys.argv)
    if len(args) < 2:
        sys.exit("Usage: newsscraper.py NewsPapers.json")
    limit = 4
    if "--limit" in args:
        idx = args.index("--limit")
        # Guard against '--limit' being the last token or being followed
        # by a non-integer; previously these crashed with a traceback.
        try:
            limit = int(args[idx + 1])
        except (IndexError, ValueError):
            sys.exit("Usage: newsscraper.py NewsPapers.json [--limit N]")
        args = [args[i] for i in range(len(args)) if i not in (idx, idx + 1)]
    # Removing '--limit N' may have consumed the only positional argument.
    if len(args) < 2:
        sys.exit("Usage: newsscraper.py NewsPapers.json")
    fname = args[1]
    try:
        config = parse_config(fname)
    except Exception as err:
        sys.exit(err)
    run(config, limit=limit)
# Only run the scraper when executed as a script, not when imported.
if __name__ == "__main__":
    main()