From d3bd7d77c6584a265a7ca76c36e0345d94a0dc41 Mon Sep 17 00:00:00 2001
From: "Feng, Wen Lung (David)" <dfeng99@users.noreply.github.com>
Date: Tue, 6 Nov 2018 19:05:54 +0800
Subject: [PATCH 1/6] =?UTF-8?q?homework-=E8=A8=88=E7=AE=97=E7=94=9F?=
 =?UTF-8?q?=E5=A5=B3=E5=85=92=E6=A9=9F=E7=8E=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 daughters.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 daughters.py

diff --git a/daughters.py b/daughters.py
new file mode 100644
index 0000000..4c39033
--- /dev/null
+++ b/daughters.py
@@ -0,0 +1,39 @@
+import numpy as np
+import sys
+
+
+def get_opportunities(families, show_detail=False):
+    daughters = 0
+    for family in range(0, families):
+        children = 0
+        is_son = True
+        if show_detail:
+            sys.stdout.write(str(family)+":")
+        while is_son and children < 3:
+            rn = np.random.randint(0,99)
+            if show_detail:
+                sys.stdout.write("("+str(rn)+")")
+
+            children += 1
+
+            if rn <= 48:
+                daughters += 1
+                is_son = False
+
+            if not is_son:
+                if show_detail:
+                    sys.stdout.write('女+')
+            elif children == 3:
+                if show_detail:
+                    sys.stdout.write("男")
+            else:
+                if show_detail:
+                    sys.stdout.write("男")
+        if show_detail:
+            print(" ")
+
+    return daughters/families
+
+
+print("Opportunity if 10 families try: ", get_opportunities(10, True))
+print("Opportunity if 10,000 families try: ", get_opportunities(10000))

From 7a73eb317446e089d57938914b236011b4c27c56 Mon Sep 17 00:00:00 2001
From: "Feng, Wen Lung (David)" <dfeng99@users.noreply.github.com>
Date: Tue, 6 Nov 2018 19:10:51 +0800
Subject: [PATCH 2/6] add header

---
 daughters.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/daughters.py b/daughters.py
index 4c39033..b89a9b2 100644
--- a/daughters.py
+++ b/daughters.py
@@ -1,3 +1,8 @@
+'''
+## 馮文龍 2018/11/06
+## License MIT
+'''
+
 import numpy as np
 import sys
 

From a4853d26a0b675b93f98165b14082fee2d84d1ad Mon Sep 17 00:00:00 2001
From: "Feng, Wen Lung (David)" <dfeng99@users.noreply.github.com>
Date: Wed, 7 Nov 2018 08:06:57 +0800
Subject: [PATCH 3/6] =?UTF-8?q?=E4=BF=AE=E6=94=B9Header=20comment=20?=
 =?UTF-8?q?=E5=8F=8ALicense=20=E6=95=98=E8=BF=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 daughters.py | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/daughters.py b/daughters.py
index b89a9b2..b437c05 100644
--- a/daughters.py
+++ b/daughters.py
@@ -1,6 +1,47 @@
 '''
 ## 馮文龍 2018/11/06
-## License MIT
+## License GPL v2
+
+一對夫婦計劃生孩子生到有女兒才停，或生了三個就停止。
+他們會擁有女兒的機率是多少？
+    * 第l 步：機率模型
+        每一個孩子是女孩的機率是0.49 ，是男孩的機率是0.51。
+        各個孩子的性別是互相獨立的。
+    * 第2 步：分配隨機數字。
+        用兩個數字模擬一個孩子的性別: 00, 01, 02, …, 48 ＝ 女孩; 49, 50, 51, …, 99 ＝ 男孩
+    * 第3 步：模擬生孩子策略
+        從表A當中讀取一對一對的數字，直到這對夫婦有了女兒，或已有三個孩子。
+        10次重複中，有9次生女孩。會得到女孩的機率的估計是9/10=0.9。
+        如果機率模型正確的話，用數學計算會有女孩的真正機率是0.867。(我們的模
+        擬答案相當接近了。除非這對夫婦運氣很不好，他們應該可以成功擁有一個女
+        兒。)
+R :
+girl.born <- function(n, show.id = F){
+    girl.count <- 0
+    for (i in 1:n) {
+        if (show.id) cat(i,": ")
+        child.count <- 0
+        repeat {
+            rn <- sample(0:99, 1) # random number
+            if (show.id) cat(paste0("(", rn, ")"))
+            is.girl <- ifelse(rn <= 48, TRUE, FALSE)
+            child.count <- child.count + 1
+            if (is.girl){
+                girl.count <- girl.count + 1
+                if (show.id) cat("女+")
+                break
+            } else if (child.count == 3) {
+                if (show.id) cat("男")
+                break
+            } else{
+                if (show.id) cat("男")
+            }
+        }
+    if (show.id) cat("\n")
+    }
+    p <- girl.count / n
+    p
+}
 '''
 
 import numpy as np
@@ -40,5 +81,5 @@ def get_opportunities(families, show_detail=False):
     return daughters/families
 
 
-print("Opportunity if 10 families try: ", get_opportunities(10, True))
-print("Opportunity if 10,000 families try: ", get_opportunities(10000))
+print("Opportunities with detail, if 10 families try: ", get_opportunities(10, True))
+print("Opportunities, if 10,000 families try: ", get_opportunities(10000))

From fc5553d18bd12d39e165e118267f8a37ffe0e241 Mon Sep 17 00:00:00 2001
From: "Feng, Wen Lung (David)" <dfeng99@users.noreply.github.com>
Date: Wed, 7 Nov 2018 17:27:52 +0800
Subject: [PATCH 4/6] Homework - Opportunities & Crawl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

兩個作業
1. 生女兒機率
2. 爬蟲
---
 DavidFeng_PythonLab/daughters.py          |  39 +++++
 DavidFeng_PythonLab/myCrawlLab.py         | 176 ++++++++++++++++++++++
 DavidFeng_PythonLab/outputs/davidfeng.txt |   9 ++
 3 files changed, 224 insertions(+)
 create mode 100644 DavidFeng_PythonLab/daughters.py
 create mode 100644 DavidFeng_PythonLab/myCrawlLab.py
 create mode 100644 DavidFeng_PythonLab/outputs/davidfeng.txt

diff --git a/DavidFeng_PythonLab/daughters.py b/DavidFeng_PythonLab/daughters.py
new file mode 100644
index 0000000..4c39033
--- /dev/null
+++ b/DavidFeng_PythonLab/daughters.py
@@ -0,0 +1,39 @@
+import numpy as np
+import sys
+
+
+def get_opportunities(families, show_detail=False):
+    """Estimate by simulation how often a family ends up with a daughter.
+
+    Each family keeps having children until a girl is born or it has three
+    children. A child is drawn as an integer in 0..99: 48 or less is a girl
+    (probability 0.49), anything above is a boy.
+
+    families: number of families to simulate.
+    show_detail: if true, write each family's draws and outcomes to stdout.
+    Returns daughters/families, the fraction of families with a daughter.
+    Raises ZeroDivisionError when families is 0.
+    """
+    daughters = 0
+    for family in range(families):
+        if show_detail:
+            sys.stdout.write(str(family)+":")
+        for _ in range(3):
+            # randint excludes its upper bound, so 100 is needed to draw 0..99;
+            # a bound of 99 never yields 99 and skews the girl odds to 49/99.
+            rn = np.random.randint(0, 100)
+            is_girl = rn <= 48
+            if show_detail:
+                sys.stdout.write("("+str(rn)+")")
+                sys.stdout.write('女+' if is_girl else "男")
+            if is_girl:
+                daughters += 1
+                break
+        if show_detail:
+            print(" ")
+
+    return daughters/families
+
+
+print("Opportunity if 10 families try: ", get_opportunities(10, True))
+print("Opportunity if 10,000 families try: ", get_opportunities(10000))
diff --git a/DavidFeng_PythonLab/myCrawlLab.py b/DavidFeng_PythonLab/myCrawlLab.py
new file mode 100644
index 0000000..7c0e417
--- /dev/null
+++ b/DavidFeng_PythonLab/myCrawlLab.py
@@ -0,0 +1,176 @@
+# Filename : myCrawlLab.py
+# Created by : 馮文龍
+# License : GPL v2
+# output file : ./outputs/davidfeng.txt
+
+###
+# 抓取yahoo!電影的某部電影, 例如:
+# ○ https://tw.movies.yahoo.com/movieinfo_main.html/id=5644
+# ○ 需要抓取的資訊如下:
+# ■ 電影名稱 (中英)
+# ■ 上映日期、類型、片長、導演、演員、發行公司、官方網站、劇情介紹
+# ■ 將擷取出來的資料存檔，檔名: 編號.txt
+###
+
+from reppy.robots import Robots
+import requests
+from html.parser import HTMLParser
+from html.entities import name2codepoint
+import os
+
+
+class MyHTMLParser(HTMLParser):
+    movie_info_focus = False
+    chinese_title_data_focus = False
+    english_title_data_focus = False
+    movie_category_focus = False
+    director_focus = False
+    website_focus = False
+    actors_focus = False
+    __title_en = ''
+    __title_zh = ''
+    __categories = ''
+    __info = {}
+    __actors = ''
+    __website = ''
+    __intro = ''
+
+    @property
+    def title_en(self):
+        return self.__title_en
+
+    @property
+    def title_zh(self):
+        return self.__title_zh
+
+    @property
+    def categories(self):
+        return self.__categories
+
+    @property
+    def info(self):
+        return self.__info
+
+    @property
+    def actors(self):
+        return self.__actors
+
+    @property
+    def website(self):
+        return self.__website
+
+    @property
+    def intro(self):
+        return self.__intro
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'div':
+            for attr in attrs:
+                if attr[0] == 'class' and attr[1] == 'movie_intro_info_r': # start parsing movie info
+                    self.movie_info_focus = True
+                elif self.movie_info_focus and attr[0] == 'class' and attr[1] == 'level_name':
+                    self.movie_category_focus = True
+                elif self.movie_info_focus and attr[0] == 'class' and attr[1] == 'movie_intro_list':
+                    if not self.director_focus:
+                        self.director_focus = True
+                    else:
+                        self.director_focus = False
+                        self.actors_focus = True
+
+        if self.movie_info_focus:
+            if tag == 'h1':
+                self.chinese_title_data_focus = True
+            if tag == 'h3':
+                self.english_title_data_focus = True
+            if tag == 'dl':
+                for attr in attrs:
+                    if attr[0] == 'class' and attr[1] == 'evaluatebox': # end parsing movie info
+                        self.movie_info_focus = False
+
+    def handle_endtag(self, tag):
+        if self.movie_category_focus:
+            if tag == 'div':
+                self.movie_category_focus = False
+        elif self.chinese_title_data_focus:
+            if tag == 'h1':
+                self.chinese_title_data_focus = False
+        elif self.english_title_data_focus:
+            if tag == 'h3':
+                self.english_title_data_focus = False
+
+    def handle_data(self, data):
+        if data.strip() == '、':
+            return
+
+        if self.chinese_title_data_focus:
+            self.__title_zh = self.__title_zh + data.strip()
+        elif self.english_title_data_focus:
+            self.__title_en = self.__title_en + data.strip()
+        elif self.movie_category_focus and data.strip() != '':
+            self.__categories = self.__categories + data.strip() + ' '
+        elif self.director_focus and data.strip() != '':
+            if data.strip() == '演員：':
+                self.__info['演員：'] = '';
+            else:
+                self.__info['導演'] = data.strip()
+        elif self.actors_focus and data.strip() != '':
+            if data.strip() == '官方連結：':
+                self.__info['官方連結：'] = ''
+                self.actors_focus = False
+                self.website_focus = True
+            else:
+                self.__info['演員：'] = self.__info['演員：'] + data.strip() + ','
+        elif self.website_focus and data.strip() != '':
+            self.__info['官方連結：'] = data.strip()
+            self.__info['演員：'] = self.__info['演員：'][:-1] # remove the trailing ','
+            self.website_focus = False
+        elif self.movie_info_focus and data.strip() != '':
+            dict_item = data.split('：')
+            if dict_item[0] != 'IMDb分數':
+                self.__info[dict_item[0]] = dict_item[1]
+
+    def handle_decl(self, data):
+        pass
+
+
+robots_url = "https://tw.movies.yahoo.com/robots.txt"
+site_url = "https://tw.movies.yahoo.com/movieinfo_main.html/id=5644"
+parser = MyHTMLParser()
+
+# 讀取 robots.txt 判斷是否允許瀏覽
+robot = Robots.fetch(robots_url)
+sitemaps = robot.sitemaps
+is_allowed = robot.allowed(site_url, '*')
+
+print('網頁 url:', site_url)
+for sitemap in sitemaps:
+    print("sitemap :", sitemap)
+    print('sitemap 可存取? ', robot.allowed(sitemap, '*'))
+
+print('網頁可存取 ? -> ', is_allowed)
+print()
+
+if is_allowed:
+    req = requests.get(site_url, timeout=5)
+    parser.feed(req.text)
+    try:
+        if not os.path.exists('outputs'):
+            os.makedirs('outputs')
+    except OSError:
+        print('Error: Creating directory "outputs". ')
+    f = open("outputs/davidfeng.txt", "w")
+    print('中文片名:', parser.title_zh)
+    f.write('中文片名:'+parser.title_zh+"\n")
+    print('Movie\'s Title:', parser.title_en)
+    f.write('Movie\'s Title:' + parser.title_en+"\n")
+    print('類型:', parser.categories)
+    f.write('類型:' + parser.categories+"\n")
+    for item in parser.info:
+        print(item, ':', parser.info[item])
+        f.write(item + ':' + parser.info[item]+"\n")
+    f.close()
+else:
+    print("網頁不允許訪問")
+
+
+
diff --git a/DavidFeng_PythonLab/outputs/davidfeng.txt b/DavidFeng_PythonLab/outputs/davidfeng.txt
new file mode 100644
index 0000000..f2bee06
--- /dev/null
+++ b/DavidFeng_PythonLab/outputs/davidfeng.txt
@@ -0,0 +1,9 @@
+中文片名:驚奇4超人
+Movie's Title:The Fantastic Four
+類型:動作 科幻 
+上映日期:2015-08-06
+片　　長:01時40分
+發行公司:福斯影業
+導演:喬許傳克
+演員：:傑米貝爾(Jamie Bell),凱特瑪拉(Kate Mara),麥爾斯泰勒(Miles Teller),麥可B喬丹(Michael B. Jordan)
+官方連結：:https://www.facebook.com/foxmovies.tw

From 94510ab80b7789359802f0f1dfc1f42322cf27ae Mon Sep 17 00:00:00 2001
From: "Feng, Wen Lung (David)" <dfeng99@users.noreply.github.com>
Date: Wed, 7 Nov 2018 17:31:16 +0800
Subject: [PATCH 5/6] Delete daughters.py

---
 daughters.py | 85 ----------------------------------------------------
 1 file changed, 85 deletions(-)
 delete mode 100644 daughters.py

diff --git a/daughters.py b/daughters.py
deleted file mode 100644
index b437c05..0000000
--- a/daughters.py
+++ /dev/null
@@ -1,85 +0,0 @@
-'''
-## 馮文龍 2018/11/06
-## License GPL v2
-
-一對夫婦計劃生孩子生到有女兒才停，或生了三個就停止。
-他們會擁有女兒的機率是多少？
-    * 第l 步：機率模型
-        每一個孩子是女孩的機率是0.49 ，是男孩的機率是0.51。
-        各個孩子的性別是互相獨立的。
-    * 第2 步：分配隨機數字。
-        用兩個數字模擬一個孩子的性別: 00, 01, 02, …, 48 ＝ 女孩; 49, 50, 51, …, 99 ＝ 男孩
-    * 第3 步：模擬生孩子策略
-        從表A當中讀取一對一對的數字，直到這對夫婦有了女兒，或已有三個孩子。
-        10次重複中，有9次生女孩。會得到女孩的機率的估計是9/10=0.9。
-        如果機率模型正確的話，用數學計算會有女孩的真正機率是0.867。(我們的模
-        擬答案相當接近了。除非這對夫婦運氣很不好，他們應該可以成功擁有一個女
-        兒。)
-R :
-girl.born <- function(n, show.id = F){
-    girl.count <- 0
-    for (i in 1:n) {
-        if (show.id) cat(i,": ")
-        child.count <- 0
-        repeat {
-            rn <- sample(0:99, 1) # random number
-            if (show.id) cat(paste0("(", rn, ")"))
-            is.girl <- ifelse(rn <= 48, TRUE, FALSE)
-            child.count <- child.count + 1
-            if (is.girl){
-                girl.count <- girl.count + 1
-                if (show.id) cat("女+")
-                break
-            } else if (child.count == 3) {
-                if (show.id) cat("男")
-                break
-            } else{
-                if (show.id) cat("男")
-            }
-        }
-    if (show.id) cat("\n")
-    }
-    p <- girl.count / n
-    p
-}
-'''
-
-import numpy as np
-import sys
-
-
-def get_opportunities(families, show_detail=False):
-    daughters = 0
-    for family in range(0, families):
-        children = 0
-        is_son = True
-        if show_detail:
-            sys.stdout.write(str(family)+":")
-        while is_son and children < 3:
-            rn = np.random.randint(0,99)
-            if show_detail:
-                sys.stdout.write("("+str(rn)+")")
-
-            children += 1
-
-            if rn <= 48:
-                daughters += 1
-                is_son = False
-
-            if not is_son:
-                if show_detail:
-                    sys.stdout.write('女+')
-            elif children == 3:
-                if show_detail:
-                    sys.stdout.write("男")
-            else:
-                if show_detail:
-                    sys.stdout.write("男")
-        if show_detail:
-            print(" ")
-
-    return daughters/families
-
-
-print("Opportunities with detail, if 10 families try: ", get_opportunities(10, True))
-print("Opportunities, if 10,000 families try: ", get_opportunities(10000))

From ab4be1e5e48b5b1798c86d2862019b454fa4225c Mon Sep 17 00:00:00 2001
From: dfeng99 <profit99>
Date: Fri, 9 Nov 2018 15:54:10 +0800
Subject: [PATCH 6/6] =?UTF-8?q?Add=20=E5=8A=87=E6=83=85=E4=BB=8B=E7=B4=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 DavidFeng_PythonLab/myCrawlLab.py         | 19 ++++++++++++++-----
 DavidFeng_PythonLab/outputs/davidfeng.txt |  5 +++--
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/DavidFeng_PythonLab/myCrawlLab.py b/DavidFeng_PythonLab/myCrawlLab.py
index 7c0e417..a81cd77 100644
--- a/DavidFeng_PythonLab/myCrawlLab.py
+++ b/DavidFeng_PythonLab/myCrawlLab.py
@@ -27,6 +27,7 @@ class MyHTMLParser(HTMLParser):
     director_focus = False
     website_focus = False
     actors_focus = False
+    intro_focus = False
     __title_en = ''
     __title_zh = ''
     __categories = ''
@@ -76,6 +77,8 @@ def handle_starttag(self, tag, attrs):
                     else:
                         self.director_focus = False
                         self.actors_focus = True
+                elif attr[0] == 'class' and attr[1] == 'gray_infobox storeinfo':
+                    self.intro_focus = True
 
         if self.movie_info_focus:
             if tag == 'h1':
@@ -97,6 +100,8 @@ def handle_endtag(self, tag):
         elif self.english_title_data_focus:
             if tag == 'h3':
                 self.english_title_data_focus = False
+        elif self.intro_focus and tag == 'span':
+            self.intro_focus = False
 
     def handle_data(self, data):
         if data.strip() == '、':
@@ -110,24 +115,26 @@ def handle_data(self, data):
             self.__categories = self.__categories + data.strip() + ' '
         elif self.director_focus and data.strip() != '':
             if data.strip() == '演員：':
-                self.__info['演員：'] = '';
+                self.__info['演員'] = '';
             else:
                 self.__info['導演'] = data.strip()
         elif self.actors_focus and data.strip() != '':
             if data.strip() == '官方連結：':
-                self.__info['官方連結：'] = ''
+                self.__info['官方連結'] = ''
                 self.actors_focus = False
                 self.website_focus = True
             else:
-                self.__info['演員：'] = self.__info['演員：'] + data.strip() + ','
+                self.__info['演員'] = self.__info['演員'] + data.strip() + ','
         elif self.website_focus and data.strip() != '':
-            self.__info['官方連結：'] = data.strip()
-            self.__info['演員：'] = self.__info['演員：'][:-1] # remove the trailing ','
+            self.__info['官方連結'] = data.strip()
+            self.__info['演員'] = self.__info['演員'][:-1] # remove the trailing ','
             self.website_focus = False
         elif self.movie_info_focus and data.strip() != '':
             dict_item = data.split('：')
             if dict_item[0] != 'IMDb分數':
                 self.__info[dict_item[0]] = dict_item[1]
+        elif self.intro_focus and data.strip() != '':
+            self.__intro = data.strip()
 
     def handle_decl(self, data):
         pass
@@ -168,6 +175,8 @@ def handle_decl(self, data):
     for item in parser.info:
         print(item, ':', parser.info[item])
         f.write(item + ':' + parser.info[item]+"\n")
+    print('劇情介紹:', parser.intro)
+    f.write('劇情介紹:' + parser.intro+"\n")
     f.close()
 else:
     print("網頁不允許訪問")
diff --git a/DavidFeng_PythonLab/outputs/davidfeng.txt b/DavidFeng_PythonLab/outputs/davidfeng.txt
index f2bee06..403f809 100644
--- a/DavidFeng_PythonLab/outputs/davidfeng.txt
+++ b/DavidFeng_PythonLab/outputs/davidfeng.txt
@@ -5,5 +5,6 @@ Movie's Title:The Fantastic Four
 片　　長:01時40分
 發行公司:福斯影業
 導演:喬許傳克
-演員：:傑米貝爾(Jamie Bell),凱特瑪拉(Kate Mara),麥爾斯泰勒(Miles Teller),麥可B喬丹(Michael B. Jordan)
-官方連結：:https://www.facebook.com/foxmovies.tw
+演員:傑米貝爾(Jamie Bell),凱特瑪拉(Kate Mara),麥爾斯泰勒(Miles Teller),麥可B喬丹(Michael B. Jordan)
+官方連結:https://www.facebook.com/foxmovies.tw
+劇情介紹:福斯影片重金打造全新《驚奇4超人》英雄漫畫鉅作再度搬上大銀幕，改編自漫威漫畫（Marvel）的超人氣經典，描述四位原本生活平凡的年輕人，被傳送至一個處處充滿危機的宇宙時空，從此四人的外貌身形都起了巨大的變化。而這些無法逆轉的身體變化也促使「驚奇4超人」，重新適應學習，並善用他們的超能力，同時併肩合作以抵禦外侮，保護即將遭敵軍破壞殆盡的地球家園，但他們所要對抗的竟是昔日摯友…