jeffrey-yc · dfeng99 · Nov 6, 2018 · Nov 6, 2018 · Nov 7, 2018 · Nov 7, 2018
diff --git a/DavidFeng_PythonLab/daughters.py b/DavidFeng_PythonLab/daughters.py
@@ -0,0 +1,39 @@
+import numpy as np
+import sys
+
+
+def get_opportunities(families, show_detail=False):
+    """Estimate the share of families that end up with a daughter.
+
+    Every simulated family keeps having children until its first daughter
+    is born or it already has three children, whichever comes first.  Each
+    birth draws an integer from 0 to 99; a draw of 48 or lower counts as a
+    daughter, so every child is a daughter with probability 0.49.
+
+    Args:
+        families: Number of families to simulate.
+        show_detail: When true, write one trace line per family to stdout
+            containing the family index, every number drawn and the
+            outcome of each birth.
+
+    Returns:
+        The fraction of simulated families that had a daughter, or 0.0
+        when ``families`` is not positive.
+    """
+    if families <= 0:
+        # Nothing to simulate; this also avoids dividing by zero below.
+        return 0.0
+
+    max_children = 3
+    daughters = 0
+    for family in range(families):
+        if show_detail:
+            sys.stdout.write(str(family) + ":")
+        for _ in range(max_children):
+            # numpy's upper bound is exclusive, so 100 is needed for 0..99.
+            rn = np.random.randint(0, 100)
+            is_daughter = rn <= 48
+            if show_detail:
+                sys.stdout.write("(" + str(rn) + ")")
+                sys.stdout.write('女+' if is_daughter else "男")
+            if is_daughter:
+                # The family stops having children after its first daughter.
+                daughters += 1
+                break
+        if show_detail:
+            print(" ")
+
+    return daughters / families
+
+
+# Demo runs: a small traced simulation first, then a large silent one.
+for family_count, label, detail in ((10, "10", True), (10000, "10,000", False)):
+    result = get_opportunities(family_count, detail)
+    print("Opportunity if " + label + " families try: ", result)
diff --git a/DavidFeng_PythonLab/myCrawlLab.py b/DavidFeng_PythonLab/myCrawlLab.py
@@ -0,0 +1,185 @@
+# Filename : davidfengCrawlLab.py
+# Created by : 馮文龍
+# License : GPL v2
+# output file : ./outputs/davidfeng.txt
+
+###
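+# Scrape one movie page from Yahoo! Movies (Taiwan), for example:
+# ○ https://tw.movies.yahoo.com/movieinfo_main.html/id=5644
+# ○ The information to extract is:
+# ■ Movie title (Chinese and English)
+# ■ Release date, genre, running time, director, cast, distributor, official website, synopsis
+# ■ Save the extracted data to a file named: <id>.txt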
+###
+
+from reppy.robots import Robots
+import requests
+from html.parser import HTMLParser
+from html.entities import name2codepoint
+import os
+
+
+class MyHTMLParser(HTMLParser):
+    """Collect movie details from the HTML of a movie-info page.
+
+    The parser is a small state machine: ``handle_starttag`` and
+    ``handle_endtag`` switch the ``*_focus`` flags on and off as marker
+    elements go by, and ``handle_data`` stores each text node in the field
+    selected by whichever flag is currently set.  The collected values are
+    exposed through read-only properties.
+
+    All state is created per instance in ``__init__`` so that two parsers
+    never share the same ``info`` dictionary.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Create a parser with empty results; arguments go to HTMLParser."""
+        super().__init__(*args, **kwargs)
+        # Flags describing which part of the page is being read.
+        self.movie_info_focus = False
+        self.chinese_title_data_focus = False
+        self.english_title_data_focus = False
+        self.movie_category_focus = False
+        self.director_focus = False
+        self.website_focus = False
+        self.actors_focus = False
+        self.intro_focus = False
+        # Collected results.
+        self.__title_en = ''
+        self.__title_zh = ''
+        self.__categories = ''
+        self.__info = {}
+        self.__actors = ''
+        self.__website = ''
+        self.__intro = ''
+
+    @property
+    def title_en(self):
+        """Text collected from ``<h3>`` inside the movie-info section."""
+        return self.__title_en
+
+    @property
+    def title_zh(self):
+        """Text collected from ``<h1>`` inside the movie-info section."""
+        return self.__title_zh
+
+    @property
+    def categories(self):
+        """Category names, each followed by a single space."""
+        return self.__categories
+
+    @property
+    def info(self):
+        """Dictionary of labelled fields (director, cast, link, ...)."""
+        return self.__info
+
+    @property
+    def actors(self):
+        """Always the empty string; the cast is stored in ``info['演員']``."""
+        return self.__actors
+
+    @property
+    def website(self):
+        """Always the empty string; the link is stored in ``info['官方連結']``."""
+        return self.__website
+
+    @property
+    def intro(self):
+        """Last non-empty text node read while the intro box was in focus."""
+        return self.__intro
+
+    def handle_starttag(self, tag, attrs):
+        """Switch focus flags according to the opening tag and its class."""
+        if tag == 'div':
+            for name, value in attrs:
+                if name != 'class':
+                    continue
+                if value == 'movie_intro_info_r':  # start parsing movie info
+                    self.movie_info_focus = True
+                elif self.movie_info_focus and value == 'level_name':
+                    self.movie_category_focus = True
+                elif self.movie_info_focus and value == 'movie_intro_list':
+                    # The first such list is treated as the director, the
+                    # next one as the cast.
+                    if not self.director_focus:
+                        self.director_focus = True
+                    else:
+                        self.director_focus = False
+                        self.actors_focus = True
+                elif value == 'gray_infobox storeinfo':
+                    self.intro_focus = True
+
+        if self.movie_info_focus:
+            if tag == 'h1':
+                self.chinese_title_data_focus = True
+            if tag == 'h3':
+                self.english_title_data_focus = True
+            if tag == 'dl':
+                for name, value in attrs:
+                    if name == 'class' and value == 'evaluatebox':  # end parsing movie info
+                        self.movie_info_focus = False
+
+    def handle_endtag(self, tag):
+        """Clear the focus flag that the closing tag terminates."""
+        if self.movie_category_focus:
+            if tag == 'div':
+                self.movie_category_focus = False
+        elif self.chinese_title_data_focus:
+            if tag == 'h1':
+                self.chinese_title_data_focus = False
+        elif self.english_title_data_focus:
+            if tag == 'h3':
+                self.english_title_data_focus = False
+        elif self.intro_focus and tag == 'span':
+            self.intro_focus = False
+
+    def handle_data(self, data):
+        """Store a text node in the field selected by the active flag."""
+        text = data.strip()
+        if text == '、':
+            # Separator between names; carries no information.
+            return
+
+        if self.chinese_title_data_focus:
+            self.__title_zh = self.__title_zh + text
+            return
+        if self.english_title_data_focus:
+            self.__title_en = self.__title_en + text
+            return
+        if not text:
+            # Every remaining field ignores whitespace-only text.
+            return
+
+        if self.movie_category_focus:
+            self.__categories = self.__categories + text + ' '
+        elif self.director_focus:
+            if text == '演員：':
+                self.__info['演員'] = ''
+            else:
+                self.__info['導演'] = text
+        elif self.actors_focus:
+            if text == '官方連結：':
+                self.__info['官方連結'] = ''
+                self.actors_focus = False
+                self.website_focus = True
+            else:
+                # The cast label may be missing, so do not assume the key.
+                self.__info['演員'] = self.__info.get('演員', '') + text + ','
+        elif self.website_focus:
+            self.__info['官方連結'] = text
+            cast = self.__info.get('演員')
+            if cast is not None and cast.endswith(','):
+                self.__info['演員'] = cast[:-1]  # remove the trailing ','
+            self.website_focus = False
+        elif self.movie_info_focus:
+            # Generic "label：value" rows; text without the full-width colon
+            # is skipped instead of raising IndexError.
+            fields = data.split('：', 1)
+            if len(fields) == 2 and fields[0] != 'IMDb分數':
+                self.__info[fields[0]] = fields[1]
+        elif self.intro_focus:
+            self.__intro = text
+
+    def handle_decl(self, data):
+        """Ignore document type declarations."""
+        pass
+
+
+robots_url = "https://tw.movies.yahoo.com/robots.txt"
+site_url = "https://tw.movies.yahoo.com/movieinfo_main.html/id=5644"
+parser = MyHTMLParser()
+
+# Fetch robots.txt and check whether crawling the page is allowed.
+robot = Robots.fetch(robots_url)
+sitemaps = robot.sitemaps
+is_allowed = robot.allowed(site_url, '*')
+
+print('網頁 url:', site_url)
+for sitemap in sitemaps:
+    print("sitemap :", sitemap)
+    print('sitemap 可存取? ', robot.allowed(sitemap, '*'))
+
+print('網頁可存取 ? -> ', is_allowed)
+print()
+
+if is_allowed:
+    req = requests.get(site_url, timeout=5)
+    # Stop on an HTTP error instead of parsing an error page.
+    req.raise_for_status()
+    parser.feed(req.text)
+    try:
+        # exist_ok avoids the race between checking and creating.
+        os.makedirs('outputs', exist_ok=True)
+    except OSError:
+        print('Error: Creating directory "outputs". ')
+        # Without the directory the file below cannot be written.
+        raise
+    # Labels double as the console prefix and the file prefix.
+    headline = [
+        ('中文片名:', parser.title_zh),
+        ('Movie\'s Title:', parser.title_en),
+        ('類型:', parser.categories),
+    ]
+    # The output holds Chinese text, so the encoding is fixed to UTF-8
+    # rather than left to the platform default.
+    with open("outputs/davidfeng.txt", "w", encoding="utf-8") as f:
+        for label, value in headline:
+            print(label, value)
+            f.write(label + value + "\n")
+        for item, value in parser.info.items():
+            print(item, ':', value)
+            f.write(item + ':' + value + "\n")
+        print('劇情介紹:', parser.intro)
+        f.write('劇情介紹:' + parser.intro + "\n")
+else:
+    print("網頁不允許訪問")
+
+
+
diff --git a/DavidFeng_PythonLab/outputs/davidfeng.txt b/DavidFeng_PythonLab/outputs/davidfeng.txt
@@ -0,0 +1,10 @@
+中文片名:驚奇4超人
+Movie's Title:The Fantastic Four
+類型:動作 科幻 
+上映日期:2015-08-06
+片　　長:01時40分
+發行公司:福斯影業
+導演:喬許傳克
+演員:傑米貝爾(Jamie Bell),凱特瑪拉(Kate Mara),麥爾斯泰勒(Miles Teller),麥可B喬丹(Michael B. Jordan)
+官方連結:https://www.facebook.com/foxmovies.tw
+劇情介紹:福斯影片重金打造全新《驚奇4超人》英雄漫畫鉅作再度搬上大銀幕，改編自漫威漫畫（Marvel）的超人氣經典，描述四位原本生活平凡的年輕人，被傳送至一個處處充滿危機的宇宙時空，從此四人的外貌身形都起了巨大的變化。而這些無法逆轉的身體變化也促使「驚奇4超人」，重新適應學習，並善用他們的超能力，同時併肩合作以抵禦外侮，保護即將遭敵軍破壞殆盡的地球家園，但他們所要對抗的竟是昔日摯友…