Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions bornProbability.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Born Probability Homework

## 一對夫婦計劃生孩子生到有女兒才停,或生了三個就停止。他們會擁有女兒的機率是多少?
### step 1:機率模型

- 每一個孩子是女孩的機率是0.49 ,是男孩的機率是0.51。

各個孩子的性別是互相獨立的。
### step 2:分配隨機數字。
- 用兩個數字模擬一個孩子的性別: 00, 01, 02, …, 48 = 女孩; 49, 50, 51, …, 99 = 男孩

### step 3:模擬生孩子策略
- 從表A當中讀取一對一對的數字,直到這對夫婦有了女兒,或已有三個孩子。
- 10次重複中,有9次生女孩。會得到女孩的機率的估計是9/10=0.9。
- 如果機率模型正確的話,用數學計算會有女孩的真正機率是0.867。(我們的模擬答案相當接近了。除非這對夫婦運氣很不好,他們應該可以成功擁有一個女兒。)
35 changes: 35 additions & 0 deletions bornProbability.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import random

def bornProbability(n, showDetails = False):
    """Estimate by simulation the probability that a couple gets a daughter.

    Strategy simulated: the couple keeps having children until a girl is
    born, or stops after three children.  Each child is independently a
    girl with probability 0.49 (two-digit draws 00-48) and a boy with
    probability 0.51 (draws 49-99).  The true probability is
    1 - 0.51**3 ≈ 0.867.

    Args:
        n: number of simulated families (must be a positive integer).
        showDetails: when True, print every random draw and outcome.

    Returns:
        The estimated probability girlCount / n (also printed to stdout).
    """
    girlCount = 0

    for trial in range(1, n + 1):
        if showDetails:
            print(f"{trial}: ")
        childCount = 0

        while True:
            rn = random.randint(0, 99)
            if showDetails:
                print(f"({rn})", end='')

            childCount += 1
            if rn <= 48:  # 00-48 encodes a girl (p = 0.49)
                girlCount += 1
                if showDetails:
                    print("女+")
                break
            # Boy: continue unless the three-child limit is reached.
            if showDetails:
                print("男")
            if childCount == 3:
                break

        if showDetails:
            print("\n")

    p = girlCount / n
    print(f'The probability of a girl being born is: {p}')

    # Returning the estimate (instead of the original bare `return`)
    # lets callers use the value; print-only callers are unaffected.
    return p

# Demo: estimate the girl probability from 10 simulated families,
# first with per-draw details printed, then printing only the estimate.
bornProbability(10, showDetails = True)
bornProbability(10)
9 changes: 9 additions & 0 deletions crawler.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Crawler Homework

### 抓取 yahoo!電影的某部
例如 : https://tw.movies.yahoo.com/movieinfo_main.html/id=5644

### 需要抓取的資訊如下 :
- 電影名稱 (中英 )
- 上映日期, 類型, 片長, 導演, 演員, 發行公司, 官方網站, 劇情介紹
- 將擷取出來的資料存檔為dataframe
180 changes: 180 additions & 0 deletions crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import numpy as np
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

class Crawler:
    """Scraper for a single Yahoo! Movies (tw.movies.yahoo.com) detail page.

    Workflow:
        crawler = Crawler()
        crawler.setSoup(url)    # download + parse the page
        crawler.run()           # extract the movie fields
        df = crawler.getDataFrame()

    Every field starts as '' ("not scraped yet") and is set to np.nan when
    the corresponding page element cannot be found, so a partially broken
    page still yields a DataFrame.
    """

    def __init__(self):
        # Parsed HTML document; populated by setSoup().
        self.__soup = {}
        # Scraped fields: '' = not scraped yet, np.nan = scrape failed.
        self.__releaseDate = ''
        self.__runTime = ''
        self.__filmCorporation = ''
        self.__IMDbScore = ''
        self.__titleTw = ''
        self.__titleEn = ''
        self.__filmTypes = ''
        self.__director = ''
        self.__actors = ''
        self.__officialLinks = ''
        self.__storeInfo = ''

    def setSoup(self, url):
        """Download *url* and parse the response with the lxml parser."""
        response = rq.get(str(url))
        self.__soup = BeautifulSoup(response.text, "lxml")

    @property
    def titleTw(self):
        """Traditional-Chinese movie title."""
        return self.__titleTw

    @property
    def titleEn(self):
        """English movie title."""
        return self.__titleEn

    @property
    def runTime(self):
        """Running time, as the raw text after '片長:'."""
        return self.__runTime

    @property
    def filmCorporation(self):
        """Distributing company (發行公司)."""
        return self.__filmCorporation

    @property
    def IMDbScore(self):
        """IMDb score, as the raw text after 'IMDb分數:'."""
        return self.__IMDbScore

    @property
    def filmTypes(self):
        """Comma-joined genre names."""
        return self.__filmTypes

    @property
    def director(self):
        """Director name(s)."""
        return self.__director

    @property
    def actors(self):
        """Comma-joined actor names."""
        return self.__actors

    @property
    def officialLinks(self):
        """List of official-site link texts (or np.nan on failure)."""
        return self.__officialLinks

    @property
    def storeInfo(self):
        """Plot-synopsis block text (劇情介紹)."""
        return self.__storeInfo

    @property
    def releaseDate(self):
        """Release date, as the raw text after '上映日期:'."""
        return self.__releaseDate

    def run(self):
        """Extract all movie fields from the page parsed by setSoup().

        Each field is scraped inside its own try/except so that one
        missing element does not abort the rest; a failure is reported on
        stdout and the field is set to np.nan.
        """
        mainTag = self.__soup.find("div", id = "content_l")
        movieIntroTag = mainTag.find('div', class_ = 'movie_intro_info_r')
        # Each <span> holds a '<label>:<value>' pair, split into a 2-list;
        # U+3000 (ideographic space) padding is stripped first.
        datas = [child.string.strip().replace(u'\u3000', u'').split(':')
                 for child in movieIntroTag.find_all('span')]

        try:
            self.__titleTw = movieIntroTag.h1.string
        except Exception:
            self.__titleTw = np.nan
            print("Error: Couldn't find Traditional Chinese title.")

        try:
            self.__titleEn = movieIntroTag.h3.string
        except Exception:
            self.__titleEn = np.nan
            print("Error: Couldn't find English title.")

        try:
            self.__filmTypes = ', '.join([child.find('a').get_text().strip()
                                          for child in movieIntroTag.find_all('div', class_ = 'level_name')])
        except Exception:
            self.__filmTypes = np.nan
            print("Error: Couldn't find filmTypes.")

        try:
            # First "span + div" sibling block holds the director names.
            self.__director = movieIntroTag.select("span + div")[0].get_text().strip()
        except Exception:
            self.__director = np.nan
            print("Error: Couldn't find director.")

        try:
            self.__officialLinks = [child.string.strip() for child in movieIntroTag.select("span + a")]
        except Exception:
            self.__officialLinks = np.nan
            print("Error: Couldn't find officialLinks.")

        try:
            # Second "span + div" sibling block holds the cast links.
            self.__actors = ', '.join([child.get_text().strip()
                                       for child in movieIntroTag.select("span + div")[1].find_all('a')])
        except Exception:
            self.__actors = np.nan
            print("Error: Couldn't find actors.")

        try:
            self.__storeInfo = mainTag.find('div', class_ = 'storeinfo').get_text()
        except Exception:
            self.__storeInfo = np.nan
            print("Error: Couldn't find storeinfo.")

        # Label-keyed fields: data[1] raises IndexError when the span had
        # no ':'-separated value, which the except branches absorb.
        for data in datas:
            if data[0] == '上映日期':
                try:
                    self.__releaseDate = data[1]
                except Exception:
                    self.__releaseDate = np.nan
                    print("Error: Couldn't find releaseDate.")
            elif data[0] == '片長':
                try:
                    self.__runTime = data[1]
                except Exception:  # was `dnp.nan` — a NameError waiting to fire
                    self.__runTime = np.nan
                    print("Error: Couldn't find runTime.")
            elif data[0] == '發行公司':
                try:
                    self.__filmCorporation = data[1]
                except Exception:
                    self.__filmCorporation = np.nan
                    print("Error: Couldn't find filmCorporation.")
            elif data[0] == 'IMDb分數':
                try:
                    self.__IMDbScore = data[1]
                except Exception:
                    self.__IMDbScore = np.nan
                    print("Error: Couldn't find IMDbScore.")

    def getDataFrame(self):
        """Return the scraped fields as a pandas DataFrame.

        When officialLinks is a non-empty list it supplies the row index
        and the scalar fields are broadcast (original behaviour).  For an
        all-scalar dict pandas raises "If using all scalar values, you
        must pass an index", so a single-row index is supplied explicitly
        — this makes the method safe before run() or after a link-scrape
        failure.
        """
        fields = {
            "電影名稱(中)": self.titleTw,
            "電影名稱(英)": self.titleEn,
            "上映日期": self.releaseDate,
            "類型": self.filmTypes,
            "片長": self.runTime,
            "導演": self.director,
            "演員": self.actors,
            "發行公司": self.filmCorporation,
            "IMDb分數": self.IMDbScore,
            "劇情介紹": self.storeInfo,
            "官方連結": self.officialLinks,
        }

        if isinstance(self.officialLinks, list) and self.officialLinks:
            return pd.DataFrame(fields)
        return pd.DataFrame(fields, index=[0])

# Target: a single movie's detail page on Yahoo! Movies Taiwan.
url = 'https://tw.movies.yahoo.com/movieinfo_main.html/id=5644'

# initialize
crawler = Crawler()
crawler.setSoup(url)  # downloads and parses the page (network I/O)
crawler.run()         # extracts the movie fields from the parsed soup
df = crawler.getDataFrame()

# format dataframe style
result = df.style
result = result.set_properties(**{'text-align': 'left'})
result.set_table_styles(
    # select the table header cells (th) and left-align them
    [dict(selector="th", props=[("text-align", "left")])]
)

# NOTE(review): a bare trailing expression only renders in a
# Jupyter/IPython session; run as a plain script this line has no
# visible effect — confirm the intended execution environment.
result