# bornProbability.py -- Python source recovered from the whitespace-collapsed
# patch "[PATCH 1/3] born probability" (patch envelope and "+" markers dropped).
import random


def bornProbability(n, showDetails=False, *, maxChildren=3, girlPercent=49):
    """Estimate by simulation the chance that a couple ends up with a daughter.

    Each of the ``n`` simulated couples keeps having children until a girl
    is born or ``maxChildren`` children have been born. A child is a girl
    when ``random.randint(0, 99)`` is below ``girlPercent``; with the
    default of 49 the draws 0..48 mean "girl" and 49..99 mean "boy". For
    the defaults the exact answer is ``1 - 0.51 ** 3`` (about 0.867).

    Args:
        n: Number of couples to simulate; must be at least 1.
        showDetails: When true, print every draw and its outcome.
        maxChildren: Largest number of children a couple will have
            (keyword-only, at least 1).
        girlPercent: How many of the 100 possible draws count as a girl
            (keyword-only, 0..100).

    Returns:
        The fraction of simulated couples that had a girl, as a float.
        The same value is also printed.

    Raises:
        ValueError: If ``n`` or ``maxChildren`` is below 1, or
            ``girlPercent`` is outside 0..100.
    """
    if n < 1:
        # Without this guard n == 0 ends in a ZeroDivisionError below.
        raise ValueError("n must be at least 1, got " + repr(n))
    if maxChildren < 1:
        raise ValueError(
            "maxChildren must be at least 1, got " + repr(maxChildren))
    if not 0 <= girlPercent <= 100:
        raise ValueError(
            "girlPercent must be within 0..100, got " + repr(girlPercent))

    girlCount = 0

    for couple in range(1, n + 1):
        if showDetails:
            print(str(couple) + ": ")

        # One iteration per child; stop early as soon as a girl is born.
        for _ in range(maxChildren):
            rn = random.randint(0, 99)
            if showDetails:
                print("(" + str(rn) + ")", end='')

            if rn < girlPercent:
                girlCount += 1
                if showDetails:
                    print("女+")
                break

            if showDetails:
                print("男")

        if showDetails:
            print("\n")

    p = girlCount / n
    print('The probability of a girl being born is: ' + str(p))

    return p


if __name__ == "__main__":
    bornProbability(10, showDetails=True)
    bornProbability(10)
# crawler.py -- Python source recovered from the whitespace-collapsed patch
# "[PATCH 2/3] movie crawler" (patch envelope and "+" markers dropped).
import numpy as np
import pandas as pd


class Crawler:
    """Scrape one Yahoo! Movies detail page into a pandas DataFrame.

    Usage: ``setSoup(url)`` downloads and parses the page, ``run()``
    extracts the fields, ``getDataFrame()`` returns them as a one-row
    DataFrame. The extracted values are also exposed as read-only
    properties. A field that cannot be located is stored as ``np.nan``
    and an ``Error: Couldn't find ...`` line is printed.
    """

    def __init__(self):
        self.__soup = {}
        self.__releaseDate = ''
        self.__runTime = ''
        self.__filmCorporation = ''
        self.__IMDbScore = ''
        self.__titleTw = ''
        self.__titleEn = ''
        self.__filmTypes = ''
        self.__director = ''
        self.__actors = ''
        self.__officialLinks = ''
        self.__storeInfo = ''

    def setSoup(self, url, timeout=10):
        """Download ``url`` and keep the parsed document for ``run()``.

        Args:
            url: Page address; converted with ``str()`` before the request.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: On connection problems, a timeout,
                or (via ``raise_for_status``) an HTTP error status.
        """
        # Imported here rather than at module level so the parsing and
        # DataFrame parts of this module can be imported without the
        # HTTP/HTML packages installed.
        import requests as rq
        from bs4 import BeautifulSoup

        response = rq.get(str(url), timeout=timeout)
        # Fail here instead of parsing an error page and failing in run().
        response.raise_for_status()
        self.__soup = BeautifulSoup(response.text, "lxml")

    @property
    def titleTw(self):
        return self.__titleTw

    @property
    def titleEn(self):
        return self.__titleEn

    @property
    def runTime(self):
        return self.__runTime

    @property
    def filmCorporation(self):
        return self.__filmCorporation

    @property
    def IMDbScore(self):
        return self.__IMDbScore

    @property
    def filmTypes(self):
        return self.__filmTypes

    @property
    def director(self):
        return self.__director

    @property
    def actors(self):
        return self.__actors

    @property
    def officialLinks(self):
        return self.__officialLinks

    @property
    def storeInfo(self):
        return self.__storeInfo

    @property
    def releaseDate(self):
        return self.__releaseDate

    @staticmethod
    def _extract(fieldName, getter):
        """Return ``getter()``, or print an error and return NaN if a tag is missing."""
        try:
            return getter()
        except (AttributeError, IndexError, TypeError):
            print("Error: Couldn't find " + fieldName + ".")
            return np.nan

    @staticmethod
    def _parseInfo(introTag):
        """Map each "label:value" ``<span>`` under ``introTag`` to {label: value}."""
        info = {}
        for span in introTag.find_all('span'):
            # get_text() also works when the span has nested tags, where
            # .string would be None. Ideographic spaces are dropped and a
            # full-width colon is accepted as well as an ASCII one.
            text = span.get_text().replace(u'\u3000', u'')
            text = text.replace(u'\uff1a', u':').strip()
            label, colon, value = text.partition(':')
            if colon:
                info[label.strip()] = value.strip()
        return info

    @staticmethod
    def _lookup(info, label, fieldName):
        """Return ``info[label]``, or print an error and return NaN if absent."""
        if label in info:
            return info[label]
        print("Error: Couldn't find " + fieldName + ".")
        return np.nan

    def run(self):
        """Extract every field from the document loaded by ``setSoup``.

        Missing fields become ``np.nan`` with a printed error message. This
        includes the labelled fields (release date, run time, distributor,
        IMDb score) when their label is not present on the page.
        """
        mainTag = self.__soup.find("div", id="content_l")
        introTag = mainTag.find('div', class_='movie_intro_info_r')

        self.__titleTw = self._extract(
            "Traditional Chinese title", lambda: introTag.h1.string)
        self.__titleEn = self._extract(
            "English title", lambda: introTag.h3.string)
        self.__filmTypes = self._extract("filmTypes", lambda: ', '.join(
            tag.find('a').get_text().strip()
            for tag in introTag.find_all('div', class_='level_name')))
        self.__director = self._extract(
            "director",
            lambda: introTag.select("span + div")[0].get_text().strip())
        self.__officialLinks = self._extract("officialLinks", lambda: [
            link.string.strip() for link in introTag.select("span + a")])
        self.__actors = self._extract("actors", lambda: ', '.join(
            link.get_text().strip()
            for link in introTag.select("span + div")[1].find_all('a')))
        self.__storeInfo = self._extract(
            "storeinfo",
            lambda: mainTag.find('div', class_='storeinfo').get_text())

        info = self._extract("movie info", lambda: self._parseInfo(introTag))
        if not isinstance(info, dict):
            info = {}
        self.__releaseDate = self._lookup(info, '上映日期', 'releaseDate')
        self.__runTime = self._lookup(info, '片長', 'runTime')
        self.__filmCorporation = self._lookup(
            info, '發行公司', 'filmCorporation')
        self.__IMDbScore = self._lookup(info, 'IMDb分數', 'IMDbScore')

    def getDataFrame(self):
        """Return the extracted fields as a DataFrame with exactly one row.

        The official links are joined with ", " (like the film types and
        the actors). Passing the list straight to ``pd.DataFrame`` would
        give one row per link and no rows at all for an empty list.
        """
        links = self.officialLinks
        if isinstance(links, (list, tuple)):
            links = ', '.join(links)

        movieRow = {
            "電影名稱(中)": self.titleTw,
            "電影名稱(英)": self.titleEn,
            "上映日期": self.releaseDate,
            "類型": self.filmTypes,
            "片長": self.runTime,
            "導演": self.director,
            "演員": self.actors,
            "發行公司": self.filmCorporation,
            "IMDb分數": self.IMDbScore,
            "劇情介紹": self.storeInfo,
            "官方連結": links,
        }

        # A list of one dict always builds a single row, even when every
        # value is a scalar.
        return pd.DataFrame([movieRow])


if __name__ == "__main__":
    url = 'https://tw.movies.yahoo.com/movieinfo_main.html/id=5644'

    # initialize
    crawler = Crawler()
    crawler.setSoup(url)
    crawler.run()
    df = crawler.getDataFrame()

    # format dataframe style
    result = df.style
    result = result.set_properties(**{'text-align': 'left'})
    result.set_table_styles(
        # select the table header with th and left-align it
        [dict(selector="th", props=[("text-align", "left")])]
    )
    # `result` is a pandas Styler; it renders when evaluated in a notebook.
insertions(+) create mode 100644 bornProbability.MD create mode 100644 crawler.MD diff --git a/bornProbability.MD b/bornProbability.MD new file mode 100644 index 0000000..dd8bce7 --- /dev/null +++ b/bornProbability.MD @@ -0,0 +1,16 @@ +# Born Probability Homework + +## 一對夫婦計劃生孩子生到有女兒才停,或生了三個就停止。他們會擁有女兒的機率是多少? +### step 1:機率模型 + +- 每一個孩子是女孩的機率是0.49,是男孩的機率是0.51。 + +- 各個孩子的性別是互相獨立的。 +### step 2:分配隨機數字 +- 用兩個數字模擬一個孩子的性別: 00, 01, 02, …, 48 = 女孩; 49, 50, 51, …, 99 = 男孩 + +### step 3:模擬生孩子策略 +- 從表A當中讀取一對一對的數字,直到這對夫婦有了女兒,或已有三個孩子。 +- 10次重複中,有9次生女孩。會得到女孩的機率的估計是9/10=0.9。 +- 如果機率模型正確的話,用數學計算會有女孩的真正機率是0.867。(我們的模擬答案相當接近了。除非這對夫婦運氣很不好,他們應該可以成功擁有一個 +女兒。) \ No newline at end of file diff --git a/crawler.MD b/crawler.MD new file mode 100644 index 0000000..deb55f4 --- /dev/null +++ b/crawler.MD @@ -0,0 +1,9 @@ +# Crawler Homework + +### 抓取 Yahoo!電影的某部電影 +例如 : https://tw.movies.yahoo.com/movieinfo_main.html/id=5644 + +### 需要抓取的資訊如下 : + - 電影名稱 (中英) + - 上映日期, 類型, 片長, 導演, 演員, 發行公司, 官方網站, 劇情介紹 + - 將擷取出來的資料存檔為 DataFrame \ No newline at end of file