forked from mauromarano/tutsplus-downloader
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathTutsplus.py
More file actions
executable file
·129 lines (103 loc) · 4.25 KB
/
Tutsplus.py
File metadata and controls
executable file
·129 lines (103 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#! /usr/pkg/bin/python
#-*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
import re
class Tutsplus:
    """Authenticated downloader for tutsplus.com courses.

    Logs in on construction and keeps a requests session so every later
    request carries the signed-in cookies.
    """

    # Endpoints used for authentication and crawling.
    login_url = 'https://tutsplus.com/sign_in'
    login_post = 'https://tutsplus.com/sessions'
    home_url = 'https://tutsplus.com'

    def __init__(self, username, password):
        """Store the credentials and authenticate immediately."""
        self.username = username
        self.password = password
        self.login()

    # Return the raw html source (bytes) for a specified url.
    def get_source(self, url):
        r = self.s.get(url)
        return r.content

    # Log in and keep the authenticated session for all future requests.
    def login(self):
        self.s = requests.session()
        # Explicit parser: avoids bs4's "no parser specified" warning and
        # keeps parsing consistent across machines.
        soup = BeautifulSoup(self.get_source(self.login_url), "html.parser")
        # The CSRF token scraped from the sign-in page must accompany the POST.
        self.token = soup.find(attrs={"name": "csrf-token"})['content']
        data = {
            "session[login]": self.username,
            "session[password]": self.password,
            "authenticity_token": self.token,
            "utf8": "✓"
        }
        self.s.post(self.login_post, data=data)
        # NOTE(review): the response is never inspected, so this reports
        # success even for bad credentials -- TODO verify the session.
        return True

    # Remove characters that are illegal in filenames (Windows is stricter).
    def sanitize_filename(self, name):
        if os.name == "nt":
            # Raw string so the regex character class really contains a
            # backslash; the previous non-raw pattern escaped '|' instead
            # and silently left backslashes in filenames.
            return re.sub(r'[<>:"/\\|?*]+', '', name)
        else:
            return name.replace('/', '-')

    # Download every video (and the source archive, if any) from a course url.
    def download_course(self, url):
        source = self.get_source(url)
        soup = BeautifulSoup(source, "html.parser")
        # Each course page carries its own csrf token; refresh it so the
        # per-video POSTs in download_video are accepted.
        self.token = soup.find(attrs={"name": "csrf-token"})['content']
        # The course's title (first <h1>) becomes the target directory.
        course_title = self.sanitize_filename(soup.select('h1')[0].string)
        print("######### " + course_title + " #########")
        if not os.path.exists(course_title):
            os.makedirs(course_title)
        # Keep a copy of the course page for reference; 'wb' because
        # get_source returns bytes.
        with open(course_title + '/course.html', 'wb') as fid:
            fid.write(source)
        # If the course includes source files, download them first.
        sourcefile = soup.select('.course-actions__download-button')
        if sourcefile:
            print("[+] Downloading source files")
            self.download_file(sourcefile[0]['href'],
                               course_title + '/sources.zip')
        # Fetch every lesson, numbering the files in course order.
        for video_number, video in enumerate(self.get_info_from_course(soup), 1):
            print("[+] Downloading " + video['titolo'])
            filename = (course_title + '/[' + str(video_number).zfill(2)
                        + '] ' + self.sanitize_filename(video['titolo'])
                        + '.mp4')
            self.download_video(video['link'], filename)

    def download_courses(self, courses):
        """Download each course in the iterable of course urls."""
        for course in courses:
            self.download_course(course)

    def download_video(self, url, filename):
        # The trick for video links is NOT to follow the redirect but to
        # fetch the download link manually; otherwise S3 answers with a
        # SignatureDoesNotMatch error.
        data = {
            "authenticity_token": self.token,
            "_method": 'post'
        }
        response = self.s.post(url, data=data, allow_redirects=False)
        soup = BeautifulSoup(response.content, "html.parser")
        # The redirect page contains the signed S3 url as its first anchor.
        url = soup.find_all('a')[0]['href']
        self.download_file(url, filename)

    # Stream url into filename in 1 KB chunks; existing files are skipped.
    def download_file(self, url, filename):
        # Check before issuing the request so a skipped file costs nothing
        # (the original opened the streaming GET first, then checked).
        if os.path.isfile(filename):
            return
        r = self.s.get(url, stream=True)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    # Return a list of {"titolo": title, "link": url} dicts, one per lesson.
    def get_info_from_course(self, soup):
        lessons = []
        for video in soup.select('.lesson-index__lesson'):
            lessons.append({
                "titolo": video.select('.lesson-index__lesson-title')[0].string,
                "link": video.select('a')[0]['href'],
            })
        return lessons