-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathall_issues.py
More file actions
141 lines (121 loc) · 6.76 KB
/
all_issues.py
File metadata and controls
141 lines (121 loc) · 6.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from urllib.parse import urlparse
import requests
import re
import os
# Debug setting: when True, GetPage prints each URL it fetches and also
# writes the raw response body to a numbered Page_<n>.txt file.
debug = False
# Do you want to check for all back issues as well?
# When True, every issue number below the one found on the listing page is
# tried too, not just the linked issue itself.
doBackIssues = True
# Place to put the files (Yes, I developed it on Windows!)
# CHANGE THIS FOR YOUR OWN PATH
output_dir = "D:\\MagPi\\"
def MagazineLink(page):
    """Return the href of every "Get PDF" issue button found in *page*.

    *page* is converted with str() before matching, so either text or the
    raw bytes of a response body may be passed. The result is a list of the
    captured link strings, empty when nothing matches. Matching ignores case.
    """
    # Markup being matched looks like:
    # <a class="c-button c-button--secondary c-button--block" href="/issues/24/pdf">Get PDF</a>
    buttonPattern = re.compile(
        '<a class=\"c-button c-button--secondary c-button--block\" href=\"(?P<link>.+?pdf)\">Get PDF</a>',
        re.IGNORECASE,
    )
    return buttonPattern.findall(str(page))
def HelloWorldMagazineLink(page):
    """Return the href of every Hello World PDF download link in *page*.

    Two anchor variants are recognised: either of the two class lists in the
    pattern, labelled "Download Free PDF" or "Free Download". *page* is
    converted with str() first; the result is a list of the captured link
    strings, empty when nothing matches. Matching ignores case.
    """
    downloadPattern = re.compile(
        '<a class=\"(?:c-issue-actions__link c-link u-text-bold|c-button c-button--secondary u-mb-x2)\" href=\"(?P<link>.+?pdf)\">(?:Download Free PDF|Free Download)</a>',
        re.IGNORECASE,
    )
    return downloadPattern.findall(str(page))
def BookLinks(page):
    """Return the href of every book "Get PDF" button found in *page*.

    Unlike the magazine patterns, the captured link is not required to end
    in "pdf". *page* is converted with str() first; the result is a list of
    the captured link strings, empty when nothing matches. Matching ignores
    case.
    """
    # Markup being matched looks like:
    # <a class="c-button c-button--secondary c-button--block" href="/books/unity-fps/pdf">Get PDF</a>
    bookPattern = re.compile(
        '<a class="c-button c-button--secondary c-button--block" href="(?P<link>.+?)">Get PDF</a>',
        re.IGNORECASE,
    )
    return bookPattern.findall(str(page))
# URLs for all the Raspberry Pi Foundation free magazines and books.
# Each entry is a 4-tuple:
#   (listing page URL,
#    output path template containing a <NUMB> or <BOOK> placeholder,
#    function that extracts the links from the listing page,
#    extra path segment appended to each link, or "" for none)
urlList = [
    ('https://magpi.raspberrypi.org/issues', os.path.join(output_dir, "MagPi", "MagPi<NUMB>.pdf"), MagazineLink, "download"),
    ('https://hackspace.raspberrypi.org/issues/', os.path.join(output_dir, "HackSpace", "HackSpaceMagazine<NUMB>.pdf"), MagazineLink, "download"),
    ('https://helloworld.raspberrypi.org/issues', os.path.join(output_dir, "HelloWorld", "HelloWorld_<NUMB>.pdf"), HelloWorldMagazineLink, ""),
    ('https://magpi.raspberrypi.org/books', os.path.join(output_dir, "Books", "<BOOK>"), BookLinks, "download"),
    ('https://hackspace.raspberrypi.org/books/', os.path.join(output_dir, "HackSpace_Books", "<BOOK>"), BookLinks, "download"),
]
# Wireframe things need updating
# https://whynowgaming.com/product-category/wireframe-pdf-magazines/
# Seems to be the URL, sadly it is a paged listing, not one nice long list.
# Very sadly no more issues, but good for getting the collection
#urlList.append(('https://wireframe.raspberrypi.org/issues/', os.path.join(output_dir, "WireFrame", "Wireframe<NUMB>.pdf"), MagazineLink, "download"))
#urlList.append(('https://wireframe.raspberrypi.org/books/', os.path.join(output_dir, "Wireframe_Books", "<BOOK>"), BookLinks, "download"))

# Running number used by GetPage to name its debug page dumps
fileNo = 1
def GetPage(url, timeout=None):
    """Fetch *url* and return the response body.

    Parameters:
        url: address to request; redirects are followed.
        timeout: optional timeout in seconds passed to requests.get. The
            default of None keeps the previous behaviour of waiting without
            a limit.

    Returns:
        The response content as bytes, or the empty string '' when the
        request (or the debug dump) raised. The error is printed, not
        re-raised.

    When the module-level ``debug`` flag is set, the URL is printed and the
    body is also written to a numbered Page_<n>.txt file on D:, with <n>
    taken from the module-level ``fileNo`` counter, which is then advanced.
    """
    global fileNo
    if debug:
        print("Downloading " + str(url))
    page = ''
    # query the website and return the html to the variable 'page'
    try:
        r = requests.get(url, allow_redirects=True, timeout=timeout)
        if debug:
            # Context manager so the dump file is flushed and closed at once
            # instead of whenever the handle happens to be collected.
            with open('D:\\Page_' + str(fileNo) + '.txt', 'wb') as dumpFile:
                dumpFile.write(r.content)
            fileNo += 1
        page = r.content
    except Exception as err:
        # Deliberately best effort: report the problem and return the empty
        # page so the caller can carry on with the next URL.
        print("There was an error")
        print(err)
    return page
def CheckAndDownloadFile(url, outfile):
    """Download the PDF reachable from the download page *url* into *outfile*.

    Parameters:
        url: address of the intermediate download page. It is fetched with
            GetPage and searched for the "click here to get your free PDF"
            link that carries the real file URL.
        outfile: path the PDF is written to.

    Returns:
        None. Nothing is fetched or written when *outfile* already exists,
        when the download link cannot be found, or when the file request
        fails or answers with an HTTP error status; in the latter cases a
        message is printed.
    """
    # Check to make sure the file doesn't already exist, don't bother getting
    # the linked page if it exists, less load on servers.
    if os.path.isfile(outfile):
        return
    # Get the download page
    page = GetPage(url)
    # Find the "Click if it didn't start" link with the actual download URL
    #<a class="c-link" href="/downloads/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBa0FUIiwiZXhwIjpudWxsLCJwdXIiOiJibG9iX2lkIn19--d855b2aaf5533eb0d127165dafa32e8cb09fd523/HS%2342_Digital.pdf">click here to get your free PDF</a>.
    linkRegEx = "<a class=\"c-link\".+href=\"(?P<link>.+)\">click here to get your free PDF</a>"
    dllink = re.search(linkRegEx, str(page), flags=re.IGNORECASE)
    if dllink is None:
        print("Download link not found")
        return
    print("Getting :" + outfile)
    fileUrl = dllink.group("link")
    if not fileUrl.startswith("http"):
        # Relative link: rebuild an absolute URL from the scheme and host of
        # the download page.
        urlParts = urlparse(url)
        fileUrl = urlParts.scheme + '://' + urlParts.netloc + fileUrl
    try:
        r = requests.get(fileUrl, allow_redirects=True)
        # Without this an error page would be saved as the PDF and, because
        # the file then exists, never be downloaded again.
        r.raise_for_status()
    except requests.RequestException as err:
        print("There was an error")
        print(err)
        return
    # Context manager so the file is closed (and flushed) straight away
    with open(outfile, 'wb') as pdfFile:
        pdfFile.write(r.content)
# Make sure the output folders are available
for (url, template, _, _) in urlList:
    targetDir = os.path.dirname(template)
    # A template without a folder part has nothing to create
    if targetDir and not os.path.exists(targetDir):
        print("Creating '{0}' now".format(targetDir))
        # exist_ok covers the folder appearing between the check and the call
        os.makedirs(targetDir, exist_ok=True)

# Get the links from all the pages, using the RegEx supplied
for (url, template, processor, link_addition) in urlList:
    if link_addition:
        link_addition = "/" + link_addition
    print("URL: " + url + " - Template:" + template)
    # Scheme and host are the same for every link on this listing page,
    # so work them out once per page rather than once per link.
    urlParts = urlparse(url)
    prependLink = urlParts.scheme + '://' + urlParts.netloc
    # Get the primary page, with all the links
    page = GetPage(url)
    # Process the page; every link the extractor returns is handled below
    mainLinks = processor(page)
    if not mainLinks:
        print("No main link(s) found")
        continue
    for link in mainLinks:
        # Links are expected to look like /issues/<number>/pdf or
        # /books/<name>/pdf, so the third '/'-separated part is the key.
        fileParts = link.split('/')
        if len(fileParts) < 3:
            print("Unexpected link format: " + link)
            continue
        linkKey = fileParts[2]
        fullLink = prependLink + link + link_addition
        # isdecimal() only accepts characters that int() can convert
        if not linkKey.isdecimal():
            # Do a book template replace
            outFile = template.replace("<BOOK>", linkKey + ".pdf")
            # Generate the download link and pass it off
            CheckAndDownloadFile(fullLink, outFile)
            continue
        issueFound = int(linkKey)
        # Generate the output filename from the template
        outFile = template.replace("<NUMB>", "{:02d}".format(issueFound))
        # Check for the link and download
        CheckAndDownloadFile(fullLink, outFile)
        if doBackIssues:
            # Turn the main link into a template
            templateLink = fullLink.replace("/" + linkKey + "/", "/<NUMB>/")
            # Go from the beginning to one before the issue just handled.
            for issueNumb in range(1, issueFound):
                # Generate the output filename from the template
                outFile = template.replace("<NUMB>", "{:02d}".format(issueNumb))
                # Check for the link and download
                CheckAndDownloadFile(templateLink.replace("<NUMB>", str(issueNumb)), outFile)
print("Done...")