Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions bornProbability.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Born Probability Homework

## 一對夫婦計劃生孩子生到有女兒才停,或生了三個就停止。他們會擁有女兒的機率是多少?
### step 1:機率模型

- 每一個孩子是女孩的機率是0.49 ,是男孩的機率是0.51。

各個孩子的性別是互相獨立的。
### step 2:分配隨機數字。
- 用兩個數字模擬一個孩子的性別: 00, 01, 02, …, 48 = 女孩; 49, 50, 51, …, 99 = 男孩

### step 3:模擬生孩子策略
- 從表A當中讀取一對一對的數字,直到這對夫婦有了女兒,或已有三個孩子。
- 10次重複中,有9次生女孩。會得到女孩的機率的估計是9/10=0.9。
- 如果機率模型正確的話,用數學計算會有女孩的真正機率是0.867。(我們的模擬答案相當接近了。除非這對夫婦運氣很不好,他們應該可以成功擁有一個女兒。)
35 changes: 35 additions & 0 deletions bornProbability.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import random

def bornProbability(n, showDetails = False):
    """Estimate by simulation the probability that a couple gets a daughter.

    Strategy simulated: the couple keeps having children until a girl is
    born, or stops after three children.  Each child is independently a
    girl with probability 0.49 (two-digit draws 00-48) and a boy with
    probability 0.51 (draws 49-99).  The true probability is
    1 - 0.51**3 ≈ 0.867.

    Args:
        n: number of simulated families (must be a positive integer).
        showDetails: when True, print every random draw and outcome.

    Returns:
        The estimated probability girlCount / n (also printed to stdout).
    """
    girlCount = 0

    for trial in range(1, n + 1):
        if showDetails:
            print(f"{trial}: ")
        childCount = 0

        while True:
            rn = random.randint(0, 99)
            if showDetails:
                print(f"({rn})", end='')

            childCount += 1
            if rn <= 48:  # 00-48 encodes a girl (p = 0.49)
                girlCount += 1
                if showDetails:
                    print("女+")
                break
            # Boy: continue unless the three-child limit is reached.
            if showDetails:
                print("男")
            if childCount == 3:
                break

        if showDetails:
            print("\n")

    p = girlCount / n
    print(f'The probability of a girl being born is: {p}')

    # Returning the estimate (instead of the original bare `return`)
    # lets callers use the value; print-only callers are unaffected.
    return p

# Demo: estimate the girl probability from 10 simulated families,
# first with per-draw details printed, then printing only the estimate.
bornProbability(10, showDetails = True)
bornProbability(10)
9 changes: 9 additions & 0 deletions crawler.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Crawler Homework

### 抓取 yahoo!電影的某部
例如 : https://tw.movies.yahoo.com/movieinfo_main.html/id=5644

### 需要抓取的資訊如下 :
- 電影名稱 (中英 )
- 上映日期, 類型, 片長, 導演, 演員, 發行公司, 官方網站, 劇情介紹
- 將擷取出來的資料存檔為dataframe
180 changes: 180 additions & 0 deletions crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import numpy as np
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

class Crawler:
    """Scraper for a single Yahoo! Movies (tw.movies.yahoo.com) detail page.

    Workflow:
        crawler = Crawler()
        crawler.setSoup(url)    # download + parse the page
        crawler.run()           # extract the movie fields
        df = crawler.getDataFrame()

    Every field starts as '' ("not scraped yet") and is set to np.nan when
    the corresponding page element cannot be found, so a partially broken
    page still yields a DataFrame.
    """

    def __init__(self):
        # Parsed HTML document; populated by setSoup().
        self.__soup = {}
        # Scraped fields: '' = not scraped yet, np.nan = scrape failed.
        self.__releaseDate = ''
        self.__runTime = ''
        self.__filmCorporation = ''
        self.__IMDbScore = ''
        self.__titleTw = ''
        self.__titleEn = ''
        self.__filmTypes = ''
        self.__director = ''
        self.__actors = ''
        self.__officialLinks = ''
        self.__storeInfo = ''

    def setSoup(self, url):
        """Download *url* and parse the response with the lxml parser."""
        response = rq.get(str(url))
        self.__soup = BeautifulSoup(response.text, "lxml")

    @property
    def titleTw(self):
        """Traditional-Chinese movie title."""
        return self.__titleTw

    @property
    def titleEn(self):
        """English movie title."""
        return self.__titleEn

    @property
    def runTime(self):
        """Running time, as the raw text after '片長:'."""
        return self.__runTime

    @property
    def filmCorporation(self):
        """Distributing company (發行公司)."""
        return self.__filmCorporation

    @property
    def IMDbScore(self):
        """IMDb score, as the raw text after 'IMDb分數:'."""
        return self.__IMDbScore

    @property
    def filmTypes(self):
        """Comma-joined genre names."""
        return self.__filmTypes

    @property
    def director(self):
        """Director name(s)."""
        return self.__director

    @property
    def actors(self):
        """Comma-joined actor names."""
        return self.__actors

    @property
    def officialLinks(self):
        """List of official-site link texts (or np.nan on failure)."""
        return self.__officialLinks

    @property
    def storeInfo(self):
        """Plot-synopsis block text (劇情介紹)."""
        return self.__storeInfo

    @property
    def releaseDate(self):
        """Release date, as the raw text after '上映日期:'."""
        return self.__releaseDate

    def run(self):
        """Extract all movie fields from the page parsed by setSoup().

        Each field is scraped inside its own try/except so that one
        missing element does not abort the rest; a failure is reported on
        stdout and the field is set to np.nan.
        """
        mainTag = self.__soup.find("div", id = "content_l")
        movieIntroTag = mainTag.find('div', class_ = 'movie_intro_info_r')
        # Each <span> holds a '<label>:<value>' pair, split into a 2-list;
        # U+3000 (ideographic space) padding is stripped first.
        datas = [child.string.strip().replace(u'\u3000', u'').split(':')
                 for child in movieIntroTag.find_all('span')]

        try:
            self.__titleTw = movieIntroTag.h1.string
        except Exception:
            self.__titleTw = np.nan
            print("Error: Couldn't find Traditional Chinese title.")

        try:
            self.__titleEn = movieIntroTag.h3.string
        except Exception:
            self.__titleEn = np.nan
            print("Error: Couldn't find English title.")

        try:
            self.__filmTypes = ', '.join([child.find('a').get_text().strip()
                                          for child in movieIntroTag.find_all('div', class_ = 'level_name')])
        except Exception:
            self.__filmTypes = np.nan
            print("Error: Couldn't find filmTypes.")

        try:
            # First "span + div" sibling block holds the director names.
            self.__director = movieIntroTag.select("span + div")[0].get_text().strip()
        except Exception:
            self.__director = np.nan
            print("Error: Couldn't find director.")

        try:
            self.__officialLinks = [child.string.strip() for child in movieIntroTag.select("span + a")]
        except Exception:
            self.__officialLinks = np.nan
            print("Error: Couldn't find officialLinks.")

        try:
            # Second "span + div" sibling block holds the cast links.
            self.__actors = ', '.join([child.get_text().strip()
                                       for child in movieIntroTag.select("span + div")[1].find_all('a')])
        except Exception:
            self.__actors = np.nan
            print("Error: Couldn't find actors.")

        try:
            self.__storeInfo = mainTag.find('div', class_ = 'storeinfo').get_text()
        except Exception:
            self.__storeInfo = np.nan
            print("Error: Couldn't find storeinfo.")

        # Label-keyed fields: data[1] raises IndexError when the span had
        # no ':'-separated value, which the except branches absorb.
        for data in datas:
            if data[0] == '上映日期':
                try:
                    self.__releaseDate = data[1]
                except Exception:
                    self.__releaseDate = np.nan
                    print("Error: Couldn't find releaseDate.")
            elif data[0] == '片長':
                try:
                    self.__runTime = data[1]
                except Exception:  # was `dnp.nan` — a NameError waiting to fire
                    self.__runTime = np.nan
                    print("Error: Couldn't find runTime.")
            elif data[0] == '發行公司':
                try:
                    self.__filmCorporation = data[1]
                except Exception:
                    self.__filmCorporation = np.nan
                    print("Error: Couldn't find filmCorporation.")
            elif data[0] == 'IMDb分數':
                try:
                    self.__IMDbScore = data[1]
                except Exception:
                    self.__IMDbScore = np.nan
                    print("Error: Couldn't find IMDbScore.")

    def getDataFrame(self):
        """Return the scraped fields as a pandas DataFrame.

        When officialLinks is a non-empty list it supplies the row index
        and the scalar fields are broadcast (original behaviour).  For an
        all-scalar dict pandas raises "If using all scalar values, you
        must pass an index", so a single-row index is supplied explicitly
        — this makes the method safe before run() or after a link-scrape
        failure.
        """
        fields = {
            "電影名稱(中)": self.titleTw,
            "電影名稱(英)": self.titleEn,
            "上映日期": self.releaseDate,
            "類型": self.filmTypes,
            "片長": self.runTime,
            "導演": self.director,
            "演員": self.actors,
            "發行公司": self.filmCorporation,
            "IMDb分數": self.IMDbScore,
            "劇情介紹": self.storeInfo,
            "官方連結": self.officialLinks,
        }

        if isinstance(self.officialLinks, list) and self.officialLinks:
            return pd.DataFrame(fields)
        return pd.DataFrame(fields, index=[0])

# Target: a single movie's detail page on Yahoo! Movies Taiwan.
url = 'https://tw.movies.yahoo.com/movieinfo_main.html/id=5644'

# initialize
crawler = Crawler()
crawler.setSoup(url)  # downloads and parses the page (network I/O)
crawler.run()         # extracts the movie fields from the parsed soup
df = crawler.getDataFrame()

# format dataframe style
result = df.style
result = result.set_properties(**{'text-align': 'left'})
result.set_table_styles(
    # select the table header cells (th) and left-align them
    [dict(selector="th", props=[("text-align", "left")])]
)

# NOTE(review): a bare trailing expression only renders in a
# Jupyter/IPython session; run as a plain script this line has no
# visible effect — confirm the intended execution environment.
result