NBA-MVP-Predictor/mavericks.py at main · kai26x/NBA-MVP-Predictor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# -*- coding: utf-8 -*-
"""Mavericks.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JjesiHM2DVV5BE6xS170mjvhIxIer8pg

# AWS Maverick Dataset

Retrieval and Instantiation
"""

import os

import importlib.util
import subprocess
import sys

# boto3 is required for the S3 download below.  Install it only when it is
# actually missing, and go through the running interpreter's own pip
# (`python -m pip`) so the package lands in the environment that will import
# it, rather than whichever `pip` executable happens to be first on PATH.
requisite = 'boto3'
if importlib.util.find_spec(requisite) is None:
    # check=False keeps the install best-effort, as before: a failed install
    # surfaces as an ImportError on the `import boto3` that follows.
    subprocess.run([sys.executable, "-m", "pip", "install", requisite], check=False)

import boto3
import io
import pandas as pd

# Print DataFrames in full: never truncate columns or rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# In-memory buffers that receive objects downloaded from S3.
# (buffer_pbp is allocated here but not used in the visible part of the file.)
buffer_pbp = io.BytesIO()
buffer_players = io.BytesIO()

# Create connection to S3.
# SECURITY: an AWS access key pair used to be hard-coded on this line.  It has
# been removed; credentials are now resolved by boto3's default chain (the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables, the shared
# ~/.aws/credentials file, or an attached IAM role).  The key pair that was
# committed must be treated as compromised and rotated.
s3 = boto3.resource('s3')

# Pull the per-game player table out of S3 into memory and load it
players_object = s3.Object('utd-hackathon', 'game_players.parquet')
players_object.download_fileobj(buffer_players)
df_players = pd.read_parquet(buffer_players)

# Quick look at the raw table: first rows, column names, dimensions
for preview in (df_players.head(5), df_players.columns, df_players.shape):
    print(preview)

# Keep only the columns needed for now
df_players = df_players.loc[:, ["season", "name", "fgm", "fga"]] # TODO: expand this
print(df_players.head(5))

# Collapse the rows to one per (season, player) by summing the counting stats
aggregate_functions = {"fgm": "sum", "fga": "sum"}
players_by_season = df_players.groupby(['season', "name"])
df_players = players_by_season.agg(aggregate_functions).reset_index()
for preview in (df_players.head(5), df_players.tail(5), pd.unique(df_players['season'])):
    print(preview)

"""# Kaggle Dataset

Instantiation and Visualization
"""

# Load the MVP voting table; the first CSV column becomes the row index.
df_mvp = pd.read_csv(
    "mvp_votings.csv",
    index_col=0,
)
print(df_mvp.columns)
# Example of renaming the columns. Do not rename here; rename at the end, once everything else is finished.
#column_order = ['field goal attempts(fga)', 'field goal 3 pointer attempts(fg3a)', 'free throw attempts(fta)', 'player efficiency rating(per)', 'true shooting percentage(ts_pct)', 'usage percentage(usg_pct)', 'box plus-minus(bpm)', 'season', 'player', 'win_pct', 'votes_first', 'points_won', 'points_max', 'award_share','g', 'mp_per_g', 'pts_per_g', 'total rebound percentage(trb_per_g)', 'ast_per_g', 'stl_per_g', 'blk_per_g', 'fg_pct', 'fg3_pct', 'ft_pct', 'win shares(ws)', 'ws_per_48']
#df_mvp.columns = column_order
print(df_mvp.head(n=10))

import matplotlib.pyplot as plt
import seaborn as sns
# Plot from a copy so the plotting-only 'Beat Threshold' flag does not leak
# into df_mvp, which is used for modelling further down.
# No plt.figure(...) call here: sns.lmplot is a figure-level function that
# creates its own figure, so a pre-made figure would only be shown empty.
df_mvp_visual = df_mvp.copy()
# Flag rows whose award share exceeds the highlight threshold.
df_mvp_visual['Beat Threshold'] = df_mvp_visual['award_share'].abs() > 0.5 #maybe adjust threshold
def scatter(attribute):
    """Scatter-plot ``award_share`` against one column of ``df_mvp_visual``.

    Points are coloured by the 'Beat Threshold' column and no regression line
    is fitted.  The plot is displayed immediately with ``plt.show()``.

    Args:
        attribute: Name of the ``df_mvp_visual`` column used for the x-axis.
    """
    sns.lmplot(
        x=attribute,
        y="award_share",
        data=df_mvp_visual,
        hue='Beat Threshold',
        fit_reg=False,
        height=8,
        aspect=4,
    )
    plt.show()


# Attributes plotted on each run, with the trend observed for each.
scatter("fga") # Obvious trend
scatter("fg3a") # No obvious trend
scatter("fta") # Somewhat obvious trend
# Remaining attributes (disabled to keep the run short), with observed trends:
#scatter("per") # Obvious trend
#scatter("ts_pct") # Obvious trend
#scatter("usg_pct") # Obvious trend
#scatter("bpm") # Obvious trend
#scatter("win_pct") # Obvious trend
#scatter("g") # Obvious trend
#scatter("mp_per_g") # Obvious trend
#scatter("pts_per_g") # Obvious trend
#scatter("trb_per_g") # No Obvious trend
#scatter("ast_per_g") # No Obvious trend
#scatter("stl_per_g") # No Obvious trend
#scatter("blk_per_g") # No Obvious trend
#scatter("fg_pct") # Obvious trend
#scatter("fg3_pct") # Obvious trend (?)
#scatter("ft_pct") # Obvious trend
#scatter("ws") # Obvious trend
#scatter("ws_per_48") # Obvious trend

# Label each season's top three award-share earners as "First", "Second" and
# "Third" in a new 'MVP' column; every other row stays "No".
df_mvp['MVP'] = "No"
for season in df_mvp['season'].unique():
    # Isolate the season's dataframe
    season_df = df_mvp.loc[df_mvp['season'] == season]
    # Row labels of the (up to) three largest award shares of that season
    list_MVP = season_df["award_share"].nlargest(3)
    # Write through a single .loc call: chained assignment such as
    # df_mvp['MVP'][label] = ... can silently update a temporary instead of
    # df_mvp.  zip() stops early, so a season with fewer than three rows is
    # labelled as far as it goes instead of raising IndexError.
    for row_label, placement in zip(list_MVP.index, ("First", "Second", "Third")):
        df_mvp.loc[row_label, 'MVP'] = placement

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
# Disabled alternative: split by season with GroupShuffleSplit instead of by
# individual row.  Kept for reference only; nothing below executes it.
#
# features = ['fga', 'fta', 'per', 'ts_pct', 'usg_pct', 'bpm', 'win_pct', 'g', 'mp_per_g', 'pts_per_g', 'fg_pct', 'fg3_pct', 'ft_pct', 'ws', 'ws_per_48']
#
# test_columns = df_mvp.pop('award_share')
# #gs = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=665)
#
# # split the data set into training and test
# train_ix, test_ix = next(gs.split(df_mvp, test_columns, groups=df_mvp.season))
# X_train = df_mvp.loc[train_ix][features]
# y_train = test_columns.loc[train_ix]
#
# X_test = df_mvp.loc[test_ix][features]
# y_test = test_columns.loc[test_ix]

# Model inputs: the numeric stat columns; target: the award share.
feature_columns = [
    'fga', 'fta', 'per', 'ts_pct', 'usg_pct', 'bpm', 'win_pct', 'g',
    'mp_per_g', 'pts_per_g', 'fg_pct', 'fg3_pct', 'ft_pct', 'ws', 'ws_per_48',
]
X = df_mvp[feature_columns]
y = df_mvp['award_share']

# Hold out 30% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=665,
)

# Fit the scaler on the training features only, then apply it to both splits
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Fit a 100-tree random forest (depth capped at 5) on the scaled training set
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=665,
).fit(X_train_std, y_train)

# Predict award shares for the held-out rows
y_pred = rf.predict(X_test_std)

# Evaluate the model (disabled: mean_squared_error and r2_score are not
# imported in the visible part of this file)
# mse = mean_squared_error(y_test, y_pred)
# rmse = np.sqrt(mse)
# r2 = r2_score(y_test, y_pred)

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
# Re-create the scaler from the training features and rescale both splits
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

from sklearn.tree import DecisionTreeRegressor
# Single decision tree; its test-set predictions feed the report below
basic_model = DecisionTreeRegressor(random_state=1).fit(X_train_std, y_train)
predictions = basic_model.predict(X_test_std)

# Build the results table: test features, the model's prediction, the true
# award share and the identifying columns from df_mvp.
# Copy first: X_test is a slice produced by train_test_split, and adding
# columns to an uncopied frame built from it may write into shared data or
# trigger SettingWithCopyWarning.
df = X_test.copy()
df['prediction'] = predictions
df['award_share'] = y_test
# Pull the identifying columns by row label.  This is index-aligned, so it
# does not depend on what reset_index() would name the index column, and it
# avoids a per-row Python lookup.
for column in ('season', 'player', 'MVP'):
    df[column] = df_mvp.loc[df.index, column]
# X is rebound to the list of feature names (it previously held the feature
# frame); X_test's columns are exactly those features, in the same order.
X = list(X_test.columns)
df = df[X + ['player', 'season', 'award_share', 'prediction', 'MVP']]

# Label each season's top three *predicted* award shares as "First", "Second"
# and "Third" in a new 'MVP_prediction' column; every other row stays "No".
df['MVP_prediction'] = "No"
for season in df['season'].unique():
    # Isolate the season's dataframe
    season_df = df.loc[df['season'] == season]
    # Row labels of the (up to) three largest predictions of that season
    list_MVP = season_df["prediction"].nlargest(3)
    # df only holds the rows that landed in the test split, so a season can
    # have fewer than three players here; zip() stops early instead of raising
    # IndexError.  Writing through .loc avoids chained assignment, which can
    # silently update a temporary instead of df.
    for row_label, placement in zip(list_MVP.index, ("First", "Second", "Third")):
        df.loc[row_label, 'MVP_prediction'] = placement

print(df.head(10))

def _print_award_shares(row):
    """Print the predicted and actual award share held in a one-row frame."""
    print('   Predicted award share of ' + str(row['prediction'].item()))
    print('   Actual award share of ' + str(row['award_share'].item()))


# Per-season report: does the player with the highest predicted award share
# carry the actual 'First' label?  `correct` counts the seasons where that
# holds and `total` counts every season present in df.
correct = 0
total = 0
for season in df['season'].unique():
    # str() so a non-string season label cannot break the concatenation
    print('Season ' + str(season) + ' ---------------------------------------- ')
    # Isolate the season's rows, keeping only the columns used in the report
    season_df = df.loc[
        df['season'] == season,
        ['player', 'award_share', 'MVP', 'prediction', 'MVP_prediction'],
    ]
    first_place = season_df.loc[season_df['MVP_prediction'] == 'First']
    total += 1
    print(first_place)
    if first_place['MVP'].item() == first_place['MVP_prediction'].item():
        print(' Correctly determine MVP which is ' + first_place['player'].item())
        _print_award_shares(first_place)
        correct += 1
    else:
        print(' Incorrectly determine MVP which is ' + first_place['player'].item())
        _print_award_shares(first_place)
        true_first_place = season_df.loc[season_df['MVP'] == 'First']
        if true_first_place.empty:
            # The season's actual MVP is not among the rows in df (df only
            # holds the test split), so there is nothing to report; calling
            # .item() on the empty selection would raise ValueError.
            print(' Actual MVP is not in the test set')
        else:
            print(' Actual MVP is ' + true_first_place['player'].item())
            _print_award_shares(true_first_place)
print('====================================')
# Guard the ratio so an empty df reports NaN instead of dividing by zero.
accuracy = correct / total if total else float('nan')
print('Accuracy is ' + str(correct) + ' out of ' + str(total) + ", or " + str(accuracy))