Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
547 changes: 547 additions & 0 deletions Module2/Module1.ipynb

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions Module2/assignment2.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
# %load assignment2.py #automatically load the python notebook into ipython notebook
# %%writefile [-a] assignment2.py #fill the cells back into the python notebook
#%save assignment2.py
import pandas as pd

# TODO: Load up the 'tutorial.csv' dataset
#
# .. your code here ..
# Path is relative to the working directory of the script.
data = pd.read_csv("Datasets/tutorial.csv")


# TODO: Print the results of the .describe() method
#
# .. your code here ..
# Summary statistics (count / mean / std / min / quartiles / max)
# for the numeric columns.
print(data.describe())


# TODO: Figure out which indexing method you need to
# use in order to index your dataframe with: [2:4,'col3']
# And print the results
#
# .. your code here ..
# .loc is label-based indexing; the old .ix accessor was deprecated in
# pandas 0.20 and removed in 1.0, so it no longer works. Note that with
# .loc the row slice 2:4 is inclusive of BOTH endpoints.
index3 = data.loc[2:4, 'col3']
print(index3)
18 changes: 12 additions & 6 deletions Module2/assignment3.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
# %load assignment3.py
import pandas as pd

# TODO: Load up the dataset
# Ensuring you set the appropriate header column names
#
# servo.data ships without a header row, so the column names are
# supplied explicitly at read time.
servoData = pd.read_csv(
    'Datasets/servo.data',
    names=['motor', 'screw', 'pgain', 'vgain', 'class'],
)
print(servoData.head())


# TODO: Create a slice that contains all entries
# having a vgain equal to 5. Then print the
# length of (# of samples in) that slice:
#
# Boolean-mask selection keeps only the rows whose vgain equals 5.
vgain_five = servoData[servoData['vgain'] == 5]
print(len(vgain_five))

# TODO: Create a slice that contains all entries
# having a motor equal to E and screw equal
# to E. Then print the length of (# of
# samples in) that slice:
#
# The two column conditions are combined with & (element-wise AND);
# each side must be parenthesised because of operator precedence.
motor_screw_e = servoData[(servoData['motor'] == 'E') & (servoData['screw'] == 'E')]
print(len(motor_screw_e))


# TODO: Create a slice that contains all entries
Expand All @@ -29,11 +34,12 @@
# you've found it, print it:
#
# .. your code here ..

# Mean vgain over the rows where pgain == 4, computed by chaining the
# boolean filter, the column selection, and .mean() in one expression.
# (The result is a scalar, not a slice.)
mean_vgain_pgain4 = servoData[servoData.pgain == 4].vgain.mean()
print(mean_vgain_pgain4)


# TODO: (Bonus) See what happens when you run
# the .dtypes method on your dataframe!
# .dtypes reports every column's inferred type — presumably object for
# the string-valued motor/screw columns and numeric for the rest.
print(servoData.dtypes)
52 changes: 44 additions & 8 deletions Module2/assignment4.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,37 @@
# %load assignment4.py
import pandas as pd


# TODO: Load up the table, and extract the dataset
# out of it. If you're having issues with this, look
# carefully at the sample code provided in the reading
#
# .. your code here ..


# Use BeautifulSoup to scrape the stats table and save it as a CSV file.
import csv
import urllib.request  # plain `import urllib` does not expose urllib.request in Python 3
from bs4 import BeautifulSoup

# newline='' stops csv.writer from emitting blank rows on Windows.
with open('listing.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    url = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/2015/seasontype/2"
    u = urllib.request.urlopen(url)
    try:
        html = u.read()
    finally:
        u.close()
    # Name the parser explicitly so bs4 does not warn or pick a
    # platform-dependent default.
    soup = BeautifulSoup(html, 'html.parser')
    # The first two <tr> rows are page furniture; the rest carry data.
    for tr in soup.find_all('tr')[2:]:
        tds = tr.find_all('td')
        row = [elem.text for elem in tds]
        writer.writerow(row)


# Read the scraped file back. The repeated G/A names correspond to the
# PP (power play) and SH (short-handed) goal/assist sub-columns of the
# ESPN table; pandas will de-duplicate them on read.
listing = pd.read_csv("listing.csv", names=['RK', 'PLAYER', 'TEAM', 'GP', 'G', 'A', 'PTS', '+/-',
                                            'PIM', 'PTS/G', 'SOG', 'PCT', 'GWG', 'G', 'A', 'G', 'A'],
                      encoding='ISO-8859-1')
# print() so the preview shows when run as a script (a bare .head()
# only displays inside a notebook).
print(listing.head())
# TODO: Rename the columns so that they match the
# column definitions provided to you on the website
#
Expand All @@ -18,6 +42,8 @@
#
# .. your code here ..

# Keep a row only if it has at least 4 non-NaN cells (thresh=4); this
# discards the mostly-empty separator rows the scrape produced.
listing_notNull = listing.dropna(axis = 0, thresh= 4)
print(listing_notNull.head())

# TODO: At this point, look through your dataset by printing
# it. There probably still are some erroneous rows in there.
Expand All @@ -27,23 +53,33 @@
# .. your code here ..


# The scraped table repeats its header mid-page, so some rows carry the
# literal string 'RK' in the RK column; keep every row that is not such
# a repeated header.
listing_notDup = listing_notNull[listing_notNull['RK'] != 'RK']
print(listing_notDup.head())

# TODO: Get rid of the 'RK' column
#
# Reassign rather than using inplace=True on a boolean-filtered slice:
# mutating a slice triggers pandas' SettingWithCopyWarning and may
# silently fail to modify the data.
listing_notDup = listing_notDup.drop('RK', axis=1)

# TODO: Ensure there are no holes in your index by resetting
# it. By the way, don't store the original index
#
# drop=True discards the old index instead of inserting it as a column.
listing_notDup = listing_notDup.reset_index(drop=True)

# TODO: Check the data type of all columns, and ensure those
# that should be numeric are numeric
#
# Every column came back as object (strings) because of how the file was
# read. Convert the columns that parse cleanly as numbers and leave the
# genuinely textual ones untouched. (pd.to_numeric's errors='ignore'
# fallback is deprecated, so the fallback is written out explicitly.)
def _to_numeric_if_possible(column):
    """Return the column converted to a numeric dtype, or unchanged if it
    contains values that cannot be parsed as numbers."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

listing_notDup = listing_notDup.apply(_to_numeric_if_possible)
print(listing_notDup.dtypes)

# TODO: Your dataframe is now ready! Use the appropriate
# commands to answer the questions on the course lab page.

# How many unique rows exist after the cleaning operation?
listing_notDup.info()

# How many unique values are in PCT?  .nunique() counts distinct values.
print(listing_notDup.PCT.nunique())
56 changes: 51 additions & 5 deletions Module2/assignment5.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load assignment5.py
import pandas as pd
import numpy as np

Expand All @@ -7,6 +8,9 @@
# Load up the dataset, setting correct header labels.
#
# .. your code here ..
# Load the census extract. The file has no header row, so the column
# names are supplied explicitly, and the source marks missing values
# with '?', which na_values maps to NaN at read time.
censusData = pd.read_csv("Datasets/census.data",
                         names=['education', 'age', 'capitalGain', 'race', 'capitalLoss',
                                'hoursPerWeek', 'sex', 'classification'],
                         na_values='?')
# print() so the preview shows when run as a script (a bare .head()
# only displays inside a notebook).
print(censusData.head())



Expand All @@ -18,15 +22,26 @@
# a text editor / spread sheet program? If you see 'object' where
# you expect to see 'int32' / 'float64', that is a good indicator
# that there is probably a string or missing value in a column.

# Report whether any NaNs survived the load, then every column's dtype.
print(censusData.isnull().values.any(), '\n', '--------------', '\n', censusData.dtypes)

# An 'object' dtype where a number is expected (e.g. capitalGain) is a
# sign of rogue string/missing values hiding in that column.
# use `your_data_frame['your_column'].unique()` to see the unique
# values of each column and identify the rogue values. If these
# should be represented as nans, you can convert them using
# na_values when loading the dataframe.
#
# Bare .unique() expressions only display inside a notebook; print each
# column's distinct values explicitly so this also works as a script.
for _column in censusData.columns:
    print(_column, censusData[_column].unique())
#
# TODO:
# Look through your data and identify any potential categorical
Expand All @@ -40,12 +55,43 @@
#
# .. your code here ..

# Code to prepare the ordering of column 'education'.
# NOTE(review): the hand-written index list below assumes the exact order
# in which .unique() returns the education levels for this file (and it
# deliberately skips one index, presumably the NaN entry) — confirm
# against the two printed lists if the dataset ever changes.
education = censusData.education.unique()
orderEdu = [7, 3, 0, 5, 1, 12, 2, 9, 6, 8, 10, 11]
neweducation = [education[i] for i in orderEdu]
print(neweducation)
print(education)

# Turn 'education' into an ordered categorical. Passing categories= and
# ordered= straight to .astype() was removed in pandas 1.0; build an
# explicit CategoricalDtype instead. Values not listed in the categories
# become NaN and are dropped below.
educationDtype = pd.api.types.CategoricalDtype(categories=neweducation, ordered=True)
censusData.education = censusData.education.astype(educationDtype)

# Fill holes in capitalGain by linear interpolation between neighbours.
censusData.capitalGain = censusData.capitalGain.interpolate()
print('The data type of column education is:', '%s' % censusData.education.dtype)


# Check which columns still contain NaN values, drop those rows, and
# re-number the index so it has no holes. reset_index returns a new
# frame, so the result must be assigned back (the original discarded it).
print(censusData.isnull().any())
censusData.dropna(inplace = True)
censusData = censusData.reset_index(drop = True)

# One-hot encode the nominal columns with get_dummies.
censusData = pd.get_dummies(censusData, columns= ['race', 'sex', 'classification'])

#
# TODO:
# Print out your dataframe
#
print(censusData.head())


# I have been told that 'classification' should take ordinal values
# rather than nominal values (questionable though).
46 changes: 46 additions & 0 deletions Module2/listing.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
1,"Jamie Benn, LW",DAL,82,35,52,87,1,64,1.06,253,13.8,6,10,13,2,3
2,"John Tavares, C ",NYI,82,38,48,86,5,46,1.05,278,13.7,8,13,18,0,1
3,"Sidney Crosby, C ",PIT,77,28,56,84,5,47,1.09,237,11.8,3,10,21,0,0
4,"Alex Ovechkin, LW",WSH,81,53,28,81,10,58,1.00,395,13.4,11,25,9,0,0
�,"Jakub Voracek, RW",PHI,82,22,59,81,1,78,0.99,221,10.0,3,11,22,0,0
6,"Nicklas Backstrom, C ",WSH,82,18,60,78,5,40,0.95,153,11.8,3,3,30,0,0
7,"Tyler Seguin, C ",DAL,71,37,40,77,-1,20,1.08,280,13.2,5,13,16,0,0
8,"Jiri Hudler, LW",CGY,78,31,45,76,17,14,0.97,158,19.6,5,6,10,0,0
�,"Daniel Sedin, LW",VAN,82,20,56,76,5,18,0.93,226,8.9,5,4,21,0,0
10,"Vladimir Tarasenko, RW",STL,77,37,36,73,27,31,0.95,264,14.0,6,8,10,0,0
,PP,SH
RK,PLAYER,TEAM,GP,G,A,PTS,+/-,PIM,PTS/G,SOG,PCT,GWG,G,A,G,A
�,"Nick Foligno, LW",CBJ,79,31,42,73,16,50,0.92,182,17.0,3,11,15,0,0
�,"Claude Giroux, C ",PHI,81,25,48,73,-3,36,0.90,279,9.0,4,14,23,0,0
�,"Henrik Sedin, C ",VAN,82,18,55,73,11,22,0.89,101,17.8,0,5,20,0,0
14,"Steven Stamkos, C ",TB,82,43,29,72,2,49,0.88,268,16.0,6,13,12,0,0
�,"Tyler Johnson, C ",TB,77,29,43,72,33,24,0.94,203,14.3,6,8,9,0,0
16,"Ryan Johansen, C ",CBJ,82,26,45,71,-6,40,0.87,202,12.9,0,7,19,2,0
17,"Joe Pavelski, C ",SJ,82,37,33,70,12,29,0.85,261,14.2,5,19,12,0,0
�,"Evgeni Malkin, C ",PIT,69,28,42,70,-2,60,1.01,212,13.2,4,9,17,0,0
�,"Ryan Getzlaf, C ",ANA,77,25,45,70,15,62,0.91,191,13.1,6,3,10,0,2
20,"Rick Nash, LW",NYR,79,42,27,69,29,36,0.87,304,13.8,8,6,6,4,1
,PP,SH
RK,PLAYER,TEAM,GP,G,A,PTS,+/-,PIM,PTS/G,SOG,PCT,GWG,G,A,G,A
21,"Max Pacioretty, LW",MTL,80,37,30,67,38,32,0.84,302,12.3,10,7,4,3,2
�,"Logan Couture, C ",SJ,82,27,40,67,-6,12,0.82,263,10.3,4,6,18,2,0
23,"Jonathan Toews, C ",CHI,81,28,38,66,30,36,0.81,192,14.6,7,6,11,2,1
�,"Erik Karlsson, D ",OTT,82,21,45,66,7,42,0.80,292,7.2,3,6,24,0,0
�,"Henrik Zetterberg, LW",DET,77,17,49,66,-6,32,0.86,227,7.5,3,4,24,0,0
26,"Pavel Datsyuk, C ",DET,63,26,39,65,12,8,1.03,165,15.8,5,8,16,0,0
�,"Joe Thornton, C ",SJ,78,16,49,65,-4,30,0.83,131,12.2,0,4,18,0,0
28,"Nikita Kucherov, RW",TB,82,28,36,64,38,37,0.78,190,14.7,2,2,13,0,0
�,"Patrick Kane, RW",CHI,61,27,37,64,10,10,1.05,186,14.5,5,6,16,0,0
�,"Mark Stone, RW",OTT,80,26,38,64,21,14,0.80,157,16.6,6,5,8,1,0
,PP,SH
RK,PLAYER,TEAM,GP,G,A,PTS,+/-,PIM,PTS/G,SOG,PCT,GWG,G,A,G,A
�,"Alexander Steen, LW",STL,74,24,40,64,8,33,0.86,223,10.8,5,8,16,0,0
�,"Kyle Turris, C ",OTT,82,24,40,64,5,36,0.78,215,11.2,6,4,12,1,0
�,"Johnny Gaudreau, LW",CGY,80,24,40,64,11,14,0.80,167,14.4,4,8,13,0,0
�,"Anze Kopitar, C ",LA,79,16,48,64,-2,10,0.81,134,11.9,4,6,18,0,0
35,"Radim Vrbata, RW",VAN,79,31,32,63,6,20,0.80,267,11.6,7,12,11,0,0
�,"Jaden Schwartz, LW",STL,75,28,35,63,13,16,0.84,184,15.2,4,8,8,0,2
�,"Filip Forsberg, C ",NSH,82,26,37,63,15,24,0.77,237,11.0,6,6,13,0,0
�,"Jordan Eberle, RW",EDM,81,24,39,63,-16,24,0.78,183,13.1,2,6,15,0,0
�,"Ondrej Palat, LW",TB,75,16,47,63,31,24,0.84,139,11.5,5,3,8,1,1
40,"Zach Parise, LW",MIN,74,33,29,62,21,41,0.84,259,12.7,3,11,5,0,0
Expand Down
Loading