-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmlb.py
More file actions
135 lines (115 loc) · 3.37 KB
/
mlb.py
File metadata and controls
135 lines (115 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import datetime
import sys
import time
import typing
import requests
import re
import bs4
import pandas as pd
from loguru import logger
from pydantic import BaseModel
from tqdm import tqdm
sys.path.append('./')
from financialdata_v.router import Router
def crawler_team(
team: str,
) -> pd.DataFrame:
url = f"https://www.mlb.com/{team}/stats/"
print(url)
res = requests.get(url)
soup = bs4.BeautifulSoup(res.text, "html5lib")
# 全部球員的紀錄
players_record = []
try:
# soup 變成tbody
soup = soup.find("tbody", class_= 'notranslate')
# records
records = soup.find_all('tr')
for record in records:
# name 相關的資料
name_relate = record.find_all("span",class_= "full-3fV3c9pF")
name = name_relate[0].text + " " + name_relate[1].text
position = record.find("div", class_ = "position-28TbwVOg").text
# 個別球員的記錄
play_record = [name, position]
bat_records = record.find_all("td", scope = "row")
for bat_record in bat_records:
header = bat_record["headers"][0]
if re.search(r'tb-\d+-header-col[2|3|4|5|6|7|8|9|10|11|12|13|15|16]', header):
play_record.append(bat_record.text)
else:
pass
players_record.append(play_record)
df = pd.DataFrame(players_record, columns = ['Name', 'POS', 'Team', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'SB', 'CS', 'AVG', 'OBP', 'SLG', 'OPS'])
except:
df = "Error"
return df
def clear_data(
df: pd.DataFrame
) -> pd.DataFrame:
for x in ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'SB', 'CS']:
df[x] = df[x].apply(lambda x: int(x))
for x in ['AVG', 'OBP', 'SLG', 'OPS']:
df[x] = df[x].apply(lambda x: float(x))
df = df[df['AB'] != 0]
df = df.drop(columns = ['AVG', 'OPS'])
df = df.rename(columns = {
'2B':'Double',
'3B':'Third'
})
return df
class BaseballRecord(BaseModel):
Name: str
POS: str
Team: str
G: int
AB: int
R: int
H: int
Double: int
Third: int
HR: int
RBI: int
BB: int
SO: int
SB: int
CS: int
OBP: float
SLG: float
def check_schema(
df: pd.DataFrame,
) -> pd.DataFrame:
"""檢查資料型態, 確保每次要上傳資料庫前, 型態正確"""
df_dict = df.to_dict("records")
df_schema = [
BaseballRecord(**dd).__dict__
for dd in df_dict
]
df = pd.DataFrame(df_schema)
return df
def main(
team: typing.List[str]
) -> pd.DataFrame:
all_team = pd.DataFrame({})
for t in team:
team_record = crawler_team(t)
team_record = clear_data(team_record)
team_record = check_schema(team_record)
all_team = all_team.append(team_record)
all_team.reset_index(inplace= True, drop= True)
db_router = Router()
try:
all_team.to_sql(
name = "all_team_data",
con = db_router.mysql_mlb_data_conn,
if_exists = "replace",
index = False,
chunksize = 1000
)
except Exception as e:
logger.info(e)
return all_team
if __name__ == "__main__":
team = sys.argv[1:]
a = main(team)
print(a)