From d3bd7d77c6584a265a7ca76c36e0345d94a0dc41 Mon Sep 17 00:00:00 2001 From: "Feng, Wen Lung (David)" Date: Tue, 6 Nov 2018 19:05:54 +0800 Subject: [PATCH 1/6] =?UTF-8?q?homework-=E8=A8=88=E7=AE=97=E7=94=9F?= =?UTF-8?q?=E5=A5=B3=E5=85=92=E6=A9=9F=E7=8E=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- daughters.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 daughters.py diff --git a/daughters.py b/daughters.py new file mode 100644 index 0000000..4c39033 --- /dev/null +++ b/daughters.py @@ -0,0 +1,39 @@ +import numpy as np +import sys + + +def get_opportunities(families, show_detail=False): + daughters = 0 + for family in range(0, families): + children = 0 + is_son = True + if show_detail: + sys.stdout.write(str(family)+":") + while is_son and children < 3: + rn = np.random.randint(0,99) + if show_detail: + sys.stdout.write("("+str(rn)+")") + + children += 1 + + if rn <= 48: + daughters += 1 + is_son = False + + if not is_son: + if show_detail: + sys.stdout.write('女+') + elif children == 3: + if show_detail: + sys.stdout.write("男") + else: + if show_detail: + sys.stdout.write("男") + if show_detail: + print(" ") + + return daughters/families + + +print("Opportunity if 10 families try: ", get_opportunities(10, True)) +print("Opportunity if 10,000 families try: ", get_opportunities(10000)) From 7a73eb317446e089d57938914b236011b4c27c56 Mon Sep 17 00:00:00 2001 From: "Feng, Wen Lung (David)" Date: Tue, 6 Nov 2018 19:10:51 +0800 Subject: [PATCH 2/6] add header --- daughters.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/daughters.py b/daughters.py index 4c39033..b89a9b2 100644 --- a/daughters.py +++ b/daughters.py @@ -1,3 +1,8 @@ +''' +## 馮文龍 2018/11/06 +## License MIT +''' + import numpy as np import sys From a4853d26a0b675b93f98165b14082fee2d84d1ad Mon Sep 17 00:00:00 2001 From: "Feng, Wen Lung (David)" Date: Wed, 7 Nov 2018 08:06:57 +0800 Subject: [PATCH 3/6] =?UTF-8?q?=E4=BF=AE=E6=94=B9Header=20comment=20?= =?UTF-8?q?=E5=8F=8ALicense=20=E6=95=98=E8=BF=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- daughters.py | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/daughters.py b/daughters.py index b89a9b2..b437c05 100644 --- a/daughters.py +++ b/daughters.py @@ -1,6 +1,47 @@ ''' ## 馮文龍 2018/11/06 -## License MIT +## License GPL v2 + +一對夫婦計劃生孩子生到有女兒才停,或生了三個就停止。 +他們會擁有女兒的機率是多少? + * 第l 步:機率模型 + 每一個孩子是女孩的機率是0.49 ,是男孩的機率是0.51。 + 各個孩子的性別是互相獨立的。 + * 第2 步:分配隨機數字。 + 用兩個數字模擬一個孩子的性別: 00, 01, 02, …, 48 = 女孩; 49, 50, 51, …, 99 = 男孩 + * 第3 步:模擬生孩子策略 + 從表A當中讀取一對一對的數字,直到這對夫婦有了女兒,或已有三個孩子。 + 10次重複中,有9次生女孩。會得到女孩的機率的估計是9/10=0.9。 + 如果機率模型正確的話,用數學計算會有女孩的真正機率是0.867。(我們的模 + 擬答案相當接近了。除非這對夫婦運氣很不好,他們應該可以成功擁有一個女 + 兒。) +R : +girl.born <- function(n, show.id = F){ + girl.count <- 0 + for (i in 1:n) { + if (show.id) cat(i,": ") + child.count <- 0 + repeat { + rn <- sample(0:99, 1) # random number + if (show.id) cat(paste0("(", rn, ")")) + is.girl <- ifelse(rn <= 48, TRUE, FALSE) + child.count <- child.count + 1 + if (is.girl){ + girl.count <- girl.count + 1 + if (show.id) cat("女+") + break + } else if (child.count == 3) { + if (show.id) cat("男") + break + } else{ + if (show.id) cat("男") + } + } + if (show.id) cat("\n") + } + p <- girl.count / n + p +} ''' import numpy as np @@ -40,5 +81,5 @@ def get_opportunities(families, show_detail=False): return daughters/families -print("Opportunity if 10 families try: ", get_opportunities(10, True)) -print("Opportunity if 10,000 families try: ", get_opportunities(10000)) +print("Opportunities with detail, if 10 families try: ", get_opportunities(10, True)) +print("Opportunities, if 10,000 families try: ", get_opportunities(10000)) From fc5553d18bd12d39e165e118267f8a37ffe0e241 Mon Sep 17 00:00:00 2001 From: "Feng, Wen Lung (David)" Date: Wed, 7 Nov 2018 17:27:52 +0800 Subject: [PATCH 4/6] Homework - Opportunities & Crawl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 兩個作業 1. 生女兒機率 2. 爬蟲 --- DavidFeng_PythonLab/daughters.py | 39 +++++ DavidFeng_PythonLab/myCrawlLab.py | 176 ++++++++++++++++++++++ DavidFeng_PythonLab/outputs/davidfeng.txt | 9 ++ 3 files changed, 224 insertions(+) create mode 100644 DavidFeng_PythonLab/daughters.py create mode 100644 DavidFeng_PythonLab/myCrawlLab.py create mode 100644 DavidFeng_PythonLab/outputs/davidfeng.txt diff --git a/DavidFeng_PythonLab/daughters.py b/DavidFeng_PythonLab/daughters.py new file mode 100644 index 0000000..4c39033 --- /dev/null +++ b/DavidFeng_PythonLab/daughters.py @@ -0,0 +1,39 @@ +import numpy as np +import sys + + +def get_opportunities(families, show_detail=False): + daughters = 0 + for family in range(0, families): + children = 0 + is_son = True + if show_detail: + sys.stdout.write(str(family)+":") + while is_son and children < 3: + rn = np.random.randint(0,99) + if show_detail: + sys.stdout.write("("+str(rn)+")") + + children += 1 + + if rn <= 48: + daughters += 1 + is_son = False + + if not is_son: + if show_detail: + sys.stdout.write('女+') + elif children == 3: + if show_detail: + sys.stdout.write("男") + else: + if show_detail: + sys.stdout.write("男") + if show_detail: + print(" ") + + return daughters/families + + +print("Opportunity if 10 families try: ", get_opportunities(10, True)) +print("Opportunity if 10,000 families try: ", get_opportunities(10000)) diff --git a/DavidFeng_PythonLab/myCrawlLab.py b/DavidFeng_PythonLab/myCrawlLab.py new file mode 100644 index 0000000..7c0e417 --- /dev/null +++ b/DavidFeng_PythonLab/myCrawlLab.py @@ -0,0 +1,176 @@ +# Filename : davidfengCrawlLab.py +# Created by : 馮文龍 +# License : GPL v2 +# output file : ./outputs/davidfeng.txt + +### +# 抓取yahoo!電影的某部電影, 例如: +# ○ https://tw.movies.yahoo.com/movieinfo_main.html/id=5644 +# ○ 需要抓取的資訊如下: +# ■ 電影名稱 (中英) +# ■ 上映日期 u 類 型 u 片 長 u 導 演 u 演 員 u 發行公司 u 官方網站 u 劇情介紹 +# ■ 將擷取出來的資料存檔,檔名: 編號.txt +### + +from reppy.robots import Robots +import requests +from html.parser import HTMLParser +from html.entities import name2codepoint +import os + + +class MyHTMLParser(HTMLParser): + movie_info_focus = False + chinese_title_data_focus = False + english_title_data_focus = False + movie_category_focus = False + director_focus = False + website_focus = False + actors_focus = False + __title_en = '' + __title_zh = '' + __categories = '' + __info = {} + __actors = '' + __website = '' + __intro = '' + + @property + def title_en(self): + return self.__title_en + + @property + def title_zh(self): + return self.__title_zh + + @property + def categories(self): + return self.__categories + + @property + def info(self): + return self.__info + + @property + def actors(self): + return self.__actors + + @property + def website(self): + return self.__website + + @property + def intro(self): + return self.__intro + + def handle_starttag(self, tag, attrs): + if tag == 'div': + for attr in attrs: + if attr[0] == 'class' and attr[1] == 'movie_intro_info_r': # start parsing movie info + self.movie_info_focus = True + elif self.movie_info_focus and attr[0] == 'class' and attr[1] == 'level_name': + self.movie_category_focus = True + elif self.movie_info_focus and attr[0] == 'class' and attr[1] == 'movie_intro_list': + if not self.director_focus: + self.director_focus = True + else: + self.director_focus = False + self.actors_focus = True + + if self.movie_info_focus: + if tag == 'h1': + self.chinese_title_data_focus = True + if tag == 'h3': + self.english_title_data_focus = True + if tag == 'dl': + for attr in attrs: + if attr[0] == 'class' and attr[1] == 'evaluatebox': # end parsing movie info + self.movie_info_focus = False + + def handle_endtag(self, tag): + if self.movie_category_focus: + if tag == 'div': + self.movie_category_focus = False + elif self.chinese_title_data_focus: + if tag == 'h1': + self.chinese_title_data_focus = False + elif self.english_title_data_focus: + if tag == 'h3': + self.english_title_data_focus = False + + def handle_data(self, data): + if data.strip() == '、': + return + + if self.chinese_title_data_focus: + self.__title_zh = self.__title_zh + data.strip() + elif self.english_title_data_focus: + self.__title_en = self.__title_en + data.strip() + elif self.movie_category_focus and data.strip() != '': + self.__categories = self.__categories + data.strip() + ' ' + elif self.director_focus and data.strip() != '': + if data.strip() == '演員:': + self.__info['演員:'] = ''; + else: + self.__info['導演'] = data.strip() + elif self.actors_focus and data.strip() != '': + if data.strip() == '官方連結:': + self.__info['官方連結:'] = '' + self.actors_focus = False + self.website_focus = True + else: + self.__info['演員:'] = self.__info['演員:'] + data.strip() + ',' + elif self.website_focus and data.strip() != '': + self.__info['官方連結:'] = data.strip() + self.__info['演員:'] = self.__info['演員:'][:-1] # remove the trailing ',' + self.website_focus = False + elif self.movie_info_focus and data.strip() != '': + dict_item = data.split(':') + if dict_item[0] != 'IMDb分數': + self.__info[dict_item[0]] = dict_item[1] + + def handle_decl(self, data): + pass + + +robots_url = "https://tw.movies.yahoo.com/robots.txt" +site_url = "https://tw.movies.yahoo.com/movieinfo_main.html/id=5644" +parser = MyHTMLParser() + +# 讀取 robots.txt 判斷是否允許瀏覽 +robot = Robots.fetch(robots_url) +sitemaps = robot.sitemaps +is_allowed = robot.allowed(site_url, '*') + +print('網頁 url:', site_url) +for sitemap in sitemaps: + print("sitemap :", sitemap) + print('sitemap 可存取? ', robot.allowed(sitemap, '*')) + +print('網頁可存取 ? -> ', is_allowed) +print() + +if is_allowed: + req = requests.get(site_url, timeout=5) + parser.feed(req.text) + try: + if not os.path.exists('outputs'): + os.makedirs('outputs') + except OSError: + print('Error: Creating directory "outputs". ') + f = open("outputs/davidfeng.txt", "w") + print('中文片名:', parser.title_zh) + f.write('中文片名:'+parser.title_zh+"\n") + print('Movie\'s Title:', parser.title_en) + f.write('Movie\'s Title:' + parser.title_en+"\n") + print('類型:', parser.categories) + f.write('類型:' + parser.categories+"\n") + for item in parser.info: + print(item, ':', parser.info[item]) + f.write(item + ':' + parser.info[item]+"\n") + f.close() +else: + print("網頁不允許訪問") + + + diff --git a/DavidFeng_PythonLab/outputs/davidfeng.txt b/DavidFeng_PythonLab/outputs/davidfeng.txt new file mode 100644 index 0000000..f2bee06 --- /dev/null +++ b/DavidFeng_PythonLab/outputs/davidfeng.txt @@ -0,0 +1,9 @@ +中文片名:驚奇4超人 +Movie's Title:The Fantastic Four +類型:動作 科幻 +上映日期:2015-08-06 +片  長:01時40分 +發行公司:福斯影業 +導演:喬許傳克 +演員::傑米貝爾(Jamie Bell),凱特瑪拉(Kate Mara),麥爾斯泰勒(Miles Teller),麥可B喬丹(Michael B. Jordan) +官方連結::https://www.facebook.com/foxmovies.tw From 94510ab80b7789359802f0f1dfc1f42322cf27ae Mon Sep 17 00:00:00 2001 From: "Feng, Wen Lung (David)" Date: Wed, 7 Nov 2018 17:31:16 +0800 Subject: [PATCH 5/6] Delete daughters.py --- daughters.py | 85 ---------------------------------------------------- 1 file changed, 85 deletions(-) delete mode 100644 daughters.py diff --git a/daughters.py b/daughters.py deleted file mode 100644 index b437c05..0000000 --- a/daughters.py +++ /dev/null @@ -1,85 +0,0 @@ -''' -## 馮文龍 2018/11/06 -## License GPL v2 - -一對夫婦計劃生孩子生到有女兒才停,或生了三個就停止。 -他們會擁有女兒的機率是多少? - * 第l 步:機率模型 - 每一個孩子是女孩的機率是0.49 ,是男孩的機率是0.51。 - 各個孩子的性別是互相獨立的。 - * 第2 步:分配隨機數字。 - 用兩個數字模擬一個孩子的性別: 00, 01, 02, …, 48 = 女孩; 49, 50, 51, …, 99 = 男孩 - * 第3 步:模擬生孩子策略 - 從表A當中讀取一對一對的數字,直到這對夫婦有了女兒,或已有三個孩子。 - 10次重複中,有9次生女孩。會得到女孩的機率的估計是9/10=0.9。 - 如果機率模型正確的話,用數學計算會有女孩的真正機率是0.867。(我們的模 - 擬答案相當接近了。除非這對夫婦運氣很不好,他們應該可以成功擁有一個女 - 兒。) -R : -girl.born <- function(n, show.id = F){ - girl.count <- 0 - for (i in 1:n) { - if (show.id) cat(i,": ") - child.count <- 0 - repeat { - rn <- sample(0:99, 1) # random number - if (show.id) cat(paste0("(", rn, ")")) - is.girl <- ifelse(rn <= 48, TRUE, FALSE) - child.count <- child.count + 1 - if (is.girl){ - girl.count <- girl.count + 1 - if (show.id) cat("女+") - break - } else if (child.count == 3) { - if (show.id) cat("男") - break - } else{ - if (show.id) cat("男") - } - } - if (show.id) cat("\n") - } - p <- girl.count / n - p -} -''' - -import numpy as np -import sys - - -def get_opportunities(families, show_detail=False): - daughters = 0 - for family in range(0, families): - children = 0 - is_son = True - if show_detail: - sys.stdout.write(str(family)+":") - while is_son and children < 3: - rn = np.random.randint(0,99) - if show_detail: - sys.stdout.write("("+str(rn)+")") - - children += 1 - - if rn <= 48: - daughters += 1 - is_son = False - - if not is_son: - if show_detail: - sys.stdout.write('女+') - elif children == 3: - if show_detail: - sys.stdout.write("男") - else: - if show_detail: - sys.stdout.write("男") - if show_detail: - print(" ") - - return daughters/families - - -print("Opportunities with detail, if 10 families try: ", get_opportunities(10, True)) -print("Opportunities, if 10,000 families try: ", get_opportunities(10000)) From ab4be1e5e48b5b1798c86d2862019b454fa4225c Mon Sep 17 00:00:00 2001 From: dfeng99 Date: Fri, 9 Nov 2018 15:54:10 +0800 Subject: [PATCH 6/6] =?UTF-8?q?Add=20=E5=8A=87=E6=83=85=E4=BB=8B=E7=B4=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DavidFeng_PythonLab/myCrawlLab.py | 19 ++++++++++++++----- DavidFeng_PythonLab/outputs/davidfeng.txt | 5 +++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/DavidFeng_PythonLab/myCrawlLab.py b/DavidFeng_PythonLab/myCrawlLab.py index 7c0e417..a81cd77 100644 --- a/DavidFeng_PythonLab/myCrawlLab.py +++ b/DavidFeng_PythonLab/myCrawlLab.py @@ -27,6 +27,7 @@ class MyHTMLParser(HTMLParser): director_focus = False website_focus = False actors_focus = False + intro_focus = False __title_en = '' __title_zh = '' __categories = '' @@ -76,6 +77,8 @@ def handle_starttag(self, tag, attrs): else: self.director_focus = False self.actors_focus = True + elif attr[0] == 'class' and attr[1] == 'gray_infobox storeinfo': + self.intro_focus = True if self.movie_info_focus: if tag == 'h1': @@ -97,6 +100,8 @@ def handle_endtag(self, tag): elif self.english_title_data_focus: if tag == 'h3': self.english_title_data_focus = False + elif self.intro_focus and tag == 'span': + self.intro_focus = False def handle_data(self, data): if data.strip() == '、': @@ -110,24 +115,26 @@ def handle_data(self, data): self.__categories = self.__categories + data.strip() + ' ' elif self.director_focus and data.strip() != '': if data.strip() == '演員:': - self.__info['演員:'] = ''; + self.__info['演員'] = ''; else: self.__info['導演'] = data.strip() elif self.actors_focus and data.strip() != '': if data.strip() == '官方連結:': - self.__info['官方連結:'] = '' + self.__info['官方連結'] = '' self.actors_focus = False self.website_focus = True else: - self.__info['演員:'] = self.__info['演員:'] + data.strip() + ',' + self.__info['演員'] = self.__info['演員'] + data.strip() + ',' elif self.website_focus and data.strip() != '': - self.__info['官方連結:'] = data.strip() - self.__info['演員:'] = self.__info['演員:'][:-1] # remove the trailing ',' + self.__info['官方連結'] = data.strip() + self.__info['演員'] = self.__info['演員'][:-1] # remove the trailing ',' self.website_focus = False elif self.movie_info_focus and data.strip() != '': dict_item = data.split(':') if dict_item[0] != 'IMDb分數': self.__info[dict_item[0]] = dict_item[1] + elif self.intro_focus and data.strip() != '': + self.__intro = data.strip() def handle_decl(self, data): pass @@ -168,6 +175,8 @@ def handle_decl(self, data): for item in parser.info: print(item, ':', parser.info[item]) f.write(item + ':' + parser.info[item]+"\n") + print('劇情介紹:', parser.intro) + f.write('劇情介紹:' + parser.intro+"\n") f.close() else: print("網頁不允許訪問") diff --git a/DavidFeng_PythonLab/outputs/davidfeng.txt b/DavidFeng_PythonLab/outputs/davidfeng.txt index f2bee06..403f809 100644 --- a/DavidFeng_PythonLab/outputs/davidfeng.txt +++ b/DavidFeng_PythonLab/outputs/davidfeng.txt @@ -5,5 +5,6 @@ Movie's Title:The Fantastic Four 片  長:01時40分 發行公司:福斯影業 導演:喬許傳克 -演員::傑米貝爾(Jamie Bell),凱特瑪拉(Kate Mara),麥爾斯泰勒(Miles Teller),麥可B喬丹(Michael B. Jordan) -官方連結::https://www.facebook.com/foxmovies.tw +演員:傑米貝爾(Jamie Bell),凱特瑪拉(Kate Mara),麥爾斯泰勒(Miles Teller),麥可B喬丹(Michael B. Jordan) +官方連結:https://www.facebook.com/foxmovies.tw +劇情介紹:福斯影片重金打造全新《驚奇4超人》英雄漫畫鉅作再度搬上大銀幕,改編自漫威漫畫(Marvel)的超人氣經典,描述四位原本生活平凡的年輕人,被傳送至一個處處充滿危機的宇宙時空,從此四人的外貌身形都起了巨大的變化。而這些無法逆轉的身體變化也促使「驚奇4超人」,重新適應學習,並善用他們的超能力,同時併肩合作以抵禦外侮,保護即將遭敵軍破壞殆盡的地球家園,但他們所要對抗的竟是昔日摯友…