-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrsstimemachine.py
More file actions
executable file
·75 lines (54 loc) · 2.07 KB
/
rsstimemachine.py
File metadata and controls
executable file
·75 lines (54 loc) · 2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python
# Example usage:
# ./rsstimemachine.py example.com | wget --force-directories --input-file=-
import re
import sys

import requests
# Endpoint of the Wayback Machine CDX index that is queried for captures.
CDX_API_URL = 'http://web.archive.org/cdx/search/cdx'

# Template for the URL of a single capture. The 'id_' suffix after the
# timestamp is a (possibly undocumented) flag that asks for the original,
# unmodified file.
DOWNLOAD_URL_PATTERN = 'http://web.archive.org/web/{timestamp}id_/{original}'

# Mimetypes treated as feeds. More general plain-text or unknown-format
# mimetypes could be added here.
MIMETYPES = [
    'application/rss+xml',
    'application/rdf+xml',
    'application/atom+xml',
    'application/xml',
    'text/rss+xml',
    'text/xml',
]

# Regex matched against the original URL of a capture. It is unclear how
# restrictive this should be; a stricter variant would additionally require a
# feed-like word (rss, atom, feed, blog, podcast) before the extension.
# The leading (?i) makes the Java-format regex case insensitive.
RSS_ORIGINAL_REGEX = r'(?i).*\.(xml|rss|atom)'
def rss_urls_by_mimetype(domain):
    """Yield download URLs for captures of *domain* with a feed mimetype.

    Queries the CDX API at CDX_API_URL for captures under ``domain`` whose
    mimetype field matches one of MIMETYPES, and yields one URL built from
    DOWNLOAD_URL_PATTERN for every row whose status code is '200'.

    This is a generator: the HTTP request is only sent once iteration starts.

    Raises:
        requests.HTTPError: if the CDX API answers with an error status.
    """
    params = {
        'collapse': 'digest',
        'url': domain,
        'matchType': 'domain',
        # The filter value is a regex, so every mimetype is escaped first.
        # Unescaped, the '+' in e.g. 'application/rss+xml' is a quantifier
        # and can never match the literal '+' of the recorded mimetype.
        'filter': 'mimetype:' + '|'.join(re.escape(m) for m in MIMETYPES),
    }
    req = requests.get(CDX_API_URL, params=params)
    # Fail with a clear HTTP error rather than parsing an error page as rows.
    req.raise_for_status()

    for line in req.content.splitlines():
        if not isinstance(line, str):
            # On Python 3 the body is bytes; decode so that split() and
            # format() operate on text. On Python 2 bytes is str: no-op.
            line = line.decode('utf-8', 'replace')

        # Expected row layout, space separated:
        # urlkey timestamp original mimetype statuscode digest length
        fields = line.split(' ')
        if len(fields) != 7:
            # Blank or malformed row; skip it instead of raising ValueError.
            continue

        timestamp, original, statuscode = fields[1], fields[2], fields[4]
        if statuscode == '200':
            yield DOWNLOAD_URL_PATTERN.format(timestamp=timestamp,
                                              original=original)
def rss_urls_by_original_regex(domain):
    """Yield download URLs for captures of *domain* whose URL looks like a feed.

    Queries the CDX API at CDX_API_URL for captures under ``domain`` whose
    original URL matches RSS_ORIGINAL_REGEX, and yields one URL built from
    DOWNLOAD_URL_PATTERN for every row whose status code is '200'.

    This is a generator: the HTTP request is only sent once iteration starts.

    Raises:
        requests.HTTPError: if the CDX API answers with an error status.
    """
    params = {
        'collapse': 'digest',
        'url': domain,
        'matchType': 'domain',
        'filter': 'original:' + RSS_ORIGINAL_REGEX,
    }
    req = requests.get(CDX_API_URL, params=params)
    # Fail with a clear HTTP error rather than parsing an error page as rows.
    req.raise_for_status()

    for line in req.content.splitlines():
        if not isinstance(line, str):
            # On Python 3 the body is bytes; decode so that split() and
            # format() operate on text. On Python 2 bytes is str: no-op.
            line = line.decode('utf-8', 'replace')

        # Expected row layout, space separated:
        # urlkey timestamp original mimetype statuscode digest length
        fields = line.split(' ')
        if len(fields) != 7:
            # Blank or malformed row; skip it instead of raising ValueError.
            continue

        timestamp, original, statuscode = fields[1], fields[2], fields[4]
        if statuscode == '200':
            yield DOWNLOAD_URL_PATTERN.format(timestamp=timestamp,
                                              original=original)
if __name__ == '__main__':
    # Script entry point: takes exactly one argument (a domain) and prints
    # one download URL per line, suitable for piping into wget.
    if len(sys.argv) != 2:
        sys.exit('USAGE: %s example.com' % sys.argv[0])
    domain = sys.argv[1]

    # Both lookups can return the same capture; a set removes duplicates.
    urls = set()
    urls.update(rss_urls_by_mimetype(domain))
    urls.update(rss_urls_by_original_regex(domain))

    # Sorted so the output is reproducible between runs (set order is not).
    for url in sorted(urls):
        # Parenthesised form works as a statement on Python 2 and as a
        # function call on Python 3.
        print(url)