forked from mauromarano/tutsplus-downloader
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathTutsplus.py
More file actions
executable file
·129 lines (103 loc) · 4.25 KB
/
Tutsplus.py
File metadata and controls
executable file
·129 lines (103 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#! /usr/pkg/bin/python
#-*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
import re
class Tutsplus:
    """Authenticated downloader for tutsplus.com courses.

    Logs in on construction and keeps a requests session so every later
    request carries the signed-in cookies.
    """

    # Endpoints used for authentication and crawling.
    login_url = 'https://tutsplus.com/sign_in'
    login_post = 'https://tutsplus.com/sessions'
    home_url = 'https://tutsplus.com'

    def __init__(self, username, password):
        """Store the credentials and authenticate immediately."""
        self.username = username
        self.password = password
        self.login()

    # Return the raw html source (bytes) for a specified url.
    def get_source(self, url):
        r = self.s.get(url)
        return r.content

    # Log in and keep the authenticated session for all future requests.
    def login(self):
        self.s = requests.session()
        # Explicit parser: avoids bs4's "no parser specified" warning and
        # keeps parsing consistent across machines.
        soup = BeautifulSoup(self.get_source(self.login_url), "html.parser")
        # The CSRF token scraped from the sign-in page must accompany the POST.
        self.token = soup.find(attrs={"name": "csrf-token"})['content']
        data = {
            "session[login]": self.username,
            "session[password]": self.password,
            "authenticity_token": self.token,
            "utf8": "✓"
        }
        self.s.post(self.login_post, data=data)
        # NOTE(review): the response is never inspected, so this reports
        # success even for bad credentials -- TODO verify the session.
        return True

    # Remove characters that are illegal in filenames (Windows is stricter).
    def sanitize_filename(self, name):
        if os.name == "nt":
            # Raw string so the regex character class really contains a
            # backslash; the previous non-raw pattern escaped '|' instead
            # and silently left backslashes in filenames.
            return re.sub(r'[<>:"/\\|?*]+', '', name)
        else:
            return name.replace('/', '-')

    # Download every video (and the source archive, if any) from a course url.
    def download_course(self, url):
        source = self.get_source(url)
        soup = BeautifulSoup(source, "html.parser")
        # Each course page carries its own csrf token; refresh it so the
        # per-video POSTs in download_video are accepted.
        self.token = soup.find(attrs={"name": "csrf-token"})['content']
        # The course's title (first <h1>) becomes the target directory.
        course_title = self.sanitize_filename(soup.select('h1')[0].string)
        print("######### " + course_title + " #########")
        if not os.path.exists(course_title):
            os.makedirs(course_title)
        # Keep a copy of the course page for reference; 'wb' because
        # get_source returns bytes.
        with open(course_title + '/course.html', 'wb') as fid:
            fid.write(source)
        # If the course includes source files, download them first.
        sourcefile = soup.select('.course-actions__download-button')
        if sourcefile:
            print("[+] Downloading source files")
            self.download_file(sourcefile[0]['href'],
                               course_title + '/sources.zip')
        # Fetch every lesson, numbering the files in course order.
        for video_number, video in enumerate(self.get_info_from_course(soup), 1):
            print("[+] Downloading " + video['titolo'])
            filename = (course_title + '/[' + str(video_number).zfill(2)
                        + '] ' + self.sanitize_filename(video['titolo'])
                        + '.mp4')
            self.download_video(video['link'], filename)

    def download_courses(self, courses):
        """Download each course in the iterable of course urls."""
        for course in courses:
            self.download_course(course)

    def download_video(self, url, filename):
        # The trick for video links is NOT to follow the redirect but to
        # fetch the download link manually; otherwise S3 answers with a
        # SignatureDoesNotMatch error.
        data = {
            "authenticity_token": self.token,
            "_method": 'post'
        }
        response = self.s.post(url, data=data, allow_redirects=False)
        soup = BeautifulSoup(response.content, "html.parser")
        # The redirect page contains the signed S3 url as its first anchor.
        url = soup.find_all('a')[0]['href']
        self.download_file(url, filename)

    # Stream url into filename in 1 KB chunks; existing files are skipped.
    def download_file(self, url, filename):
        # Check before issuing the request so a skipped file costs nothing
        # (the original opened the streaming GET first, then checked).
        if os.path.isfile(filename):
            return
        r = self.s.get(url, stream=True)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    # Return a list of {"titolo": title, "link": url} dicts, one per lesson.
    def get_info_from_course(self, soup):
        lessons = []
        for video in soup.select('.lesson-index__lesson'):
            lessons.append({
                "titolo": video.select('.lesson-index__lesson-title')[0].string,
                "link": video.select('a')[0]['href'],
            })
        return lessons