Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
547 changes: 547 additions & 0 deletions Module2/Module1.ipynb

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions Module2/assignment2.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
# %load assignment2.py #automatically load the python notebook into ipython notebook
# %%writefile [-a] assignment2.py #fill the cells back into the python notebook
#%save assignment2.py
import pandas as pd

# TODO: Load up the 'tutorial.csv' dataset
#
# .. your code here ..
# Path is relative to the working directory of the script.
data = pd.read_csv("Datasets/tutorial.csv")


# TODO: Print the results of the .describe() method
#
# .. your code here ..
# Summary statistics (count / mean / std / min / quartiles / max)
# for the numeric columns.
print(data.describe())


# TODO: Figure out which indexing method you need to
# use in order to index your dataframe with: [2:4,'col3']
# And print the results
#
# .. your code here ..
# .loc is label-based indexing; the old .ix accessor was deprecated in
# pandas 0.20 and removed in 1.0, so it no longer works. Note that with
# .loc the row slice 2:4 is inclusive of BOTH endpoints.
index3 = data.loc[2:4, 'col3']
print(index3)
18 changes: 12 additions & 6 deletions Module2/assignment3.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
# %load assignment3.py
import pandas as pd

# TODO: Load up the dataset
# Ensuring you set the appropriate header column names
#
# servo.data ships without a header row, so the column names are
# supplied explicitly at read time.
servoData = pd.read_csv(
    'Datasets/servo.data',
    names=['motor', 'screw', 'pgain', 'vgain', 'class'],
)
print(servoData.head())


# TODO: Create a slice that contains all entries
# having a vgain equal to 5. Then print the
# length of (# of samples in) that slice:
#
# Boolean-mask selection keeps only the rows whose vgain equals 5.
vgain_five = servoData[servoData['vgain'] == 5]
print(len(vgain_five))

# TODO: Create a slice that contains all entries
# having a motor equal to E and screw equal
# to E. Then print the length of (# of
# samples in) that slice:
#
# The two column conditions are combined with & (element-wise AND);
# each side must be parenthesised because of operator precedence.
motor_screw_e = servoData[(servoData['motor'] == 'E') & (servoData['screw'] == 'E')]
print(len(motor_screw_e))


# TODO: Create a slice that contains all entries
Expand All @@ -29,11 +34,12 @@
# you've found it, print it:
#
# .. your code here ..

# Mean vgain over the rows where pgain == 4, computed by chaining the
# boolean filter, the column selection, and .mean() in one expression.
# (The result is a scalar, not a slice.)
mean_vgain_pgain4 = servoData[servoData.pgain == 4].vgain.mean()
print(mean_vgain_pgain4)


# TODO: (Bonus) See what happens when you run
# the .dtypes method on your dataframe!
# .dtypes reports every column's inferred type — presumably object for
# the string-valued motor/screw columns and numeric for the rest.
print(servoData.dtypes)
52 changes: 44 additions & 8 deletions Module2/assignment4.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,37 @@
# %load assignment4.py
import pandas as pd


# TODO: Load up the table, and extract the dataset
# out of it. If you're having issues with this, look
# carefully at the sample code provided in the reading
#
# .. your code here ..


# Use BeautifulSoup to scrape the stats table and save it as a CSV file.
import csv
import urllib.request  # plain `import urllib` does not expose urllib.request in Python 3
from bs4 import BeautifulSoup

# newline='' stops csv.writer from emitting blank rows on Windows.
with open('listing.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    url = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/2015/seasontype/2"
    u = urllib.request.urlopen(url)
    try:
        html = u.read()
    finally:
        u.close()
    # Name the parser explicitly so bs4 does not warn or pick a
    # platform-dependent default.
    soup = BeautifulSoup(html, 'html.parser')
    # The first two <tr> rows are page furniture; the rest carry data.
    for tr in soup.find_all('tr')[2:]:
        tds = tr.find_all('td')
        row = [elem.text for elem in tds]
        writer.writerow(row)


# Read the scraped file back. The repeated G/A names correspond to the
# PP (power play) and SH (short-handed) goal/assist sub-columns of the
# ESPN table; pandas will de-duplicate them on read.
listing = pd.read_csv("listing.csv", names=['RK', 'PLAYER', 'TEAM', 'GP', 'G', 'A', 'PTS', '+/-',
                                            'PIM', 'PTS/G', 'SOG', 'PCT', 'GWG', 'G', 'A', 'G', 'A'],
                      encoding='ISO-8859-1')
# print() so the preview shows when run as a script (a bare .head()
# only displays inside a notebook).
print(listing.head())
# TODO: Rename the columns so that they match the
# column definitions provided to you on the website
#
Expand All @@ -18,6 +42,8 @@
#
# .. your code here ..

# Keep a row only if it has at least 4 non-NaN cells (thresh=4); this
# discards the mostly-empty separator rows the scrape produced.
listing_notNull = listing.dropna(axis = 0, thresh= 4)
print(listing_notNull.head())

# TODO: At this point, look through your dataset by printing
# it. There probably still are some erroneous rows in there.
Expand All @@ -27,23 +53,33 @@
# .. your code here ..


# The scraped table repeats its header mid-page, so some rows carry the
# literal string 'RK' in the RK column; keep every row that is not such
# a repeated header.
listing_notDup = listing_notNull[listing_notNull['RK'] != 'RK']
print(listing_notDup.head())

# TODO: Get rid of the 'RK' column
#
# Reassign rather than using inplace=True on a boolean-filtered slice:
# mutating a slice triggers pandas' SettingWithCopyWarning and may
# silently fail to modify the data.
listing_notDup = listing_notDup.drop('RK', axis=1)

# TODO: Ensure there are no holes in your index by resetting
# it. By the way, don't store the original index
#
# drop=True discards the old index instead of inserting it as a column.
listing_notDup = listing_notDup.reset_index(drop=True)

# TODO: Check the data type of all columns, and ensure those
# that should be numeric are numeric
#
# Every column came back as object (strings) because of how the file was
# read. Convert the columns that parse cleanly as numbers and leave the
# genuinely textual ones untouched. (pd.to_numeric's errors='ignore'
# fallback is deprecated, so the fallback is written out explicitly.)
def _to_numeric_if_possible(column):
    """Return the column converted to a numeric dtype, or unchanged if it
    contains values that cannot be parsed as numbers."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

listing_notDup = listing_notDup.apply(_to_numeric_if_possible)
print(listing_notDup.dtypes)

# TODO: Your dataframe is now ready! Use the appropriate
# commands to answer the questions on the course lab page.

# How many unique rows exist after the cleaning operation?
listing_notDup.info()

# How many unique values are in PCT?  .nunique() counts distinct values.
print(listing_notDup.PCT.nunique())
56 changes: 51 additions & 5 deletions Module2/assignment5.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load assignment5.py
import pandas as pd
import numpy as np

Expand All @@ -7,6 +8,9 @@
# Load up the dataset, setting correct header labels.
#
# .. your code here ..
# Load the census extract. The file has no header row, so the column
# names are supplied explicitly, and the source marks missing values
# with '?', which na_values maps to NaN at read time.
censusData = pd.read_csv("Datasets/census.data",
                         names=['education', 'age', 'capitalGain', 'race', 'capitalLoss',
                                'hoursPerWeek', 'sex', 'classification'],
                         na_values='?')
# print() so the preview shows when run as a script (a bare .head()
# only displays inside a notebook).
print(censusData.head())



Expand All @@ -18,15 +22,26 @@
# a text editor / spread sheet program? If you see 'object' where
# you expect to see 'int32' / 'float64', that is a good indicator
# that there is probably a string or missing value in a column.

# Report whether any NaNs survived the load, then every column's dtype.
print(censusData.isnull().values.any(), '\n', '--------------', '\n', censusData.dtypes)

# An 'object' dtype where a number is expected (e.g. capitalGain) is a
# sign of rogue string/missing values hiding in that column.
# use `your_data_frame['your_column'].unique()` to see the unique
# values of each column and identify the rogue values. If these
# should be represented as nans, you can convert them using
# na_values when loading the dataframe.
#
# Bare .unique() expressions only display inside a notebook; print each
# column's distinct values explicitly so this also works as a script.
for _column in censusData.columns:
    print(_column, censusData[_column].unique())
#
# TODO:
# Look through your data and identify any potential categorical
Expand All @@ -40,12 +55,43 @@
#
# .. your code here ..

# Code to prepare the ordering of column 'education'.
# NOTE(review): the hand-written index list below assumes the exact order
# in which .unique() returns the education levels for this file (and it
# deliberately skips one index, presumably the NaN entry) — confirm
# against the two printed lists if the dataset ever changes.
education = censusData.education.unique()
orderEdu = [7, 3, 0, 5, 1, 12, 2, 9, 6, 8, 10, 11]
neweducation = [education[i] for i in orderEdu]
print(neweducation)
print(education)

# Turn 'education' into an ordered categorical. Passing categories= and
# ordered= straight to .astype() was removed in pandas 1.0; build an
# explicit CategoricalDtype instead. Values not listed in the categories
# become NaN and are dropped below.
educationDtype = pd.api.types.CategoricalDtype(categories=neweducation, ordered=True)
censusData.education = censusData.education.astype(educationDtype)

# Fill holes in capitalGain by linear interpolation between neighbours.
censusData.capitalGain = censusData.capitalGain.interpolate()
print('The data type of column education is:', '%s' % censusData.education.dtype)


# Check which columns still contain NaN values, drop those rows, and
# re-number the index so it has no holes. reset_index returns a new
# frame, so the result must be assigned back (the original discarded it).
print(censusData.isnull().any())
censusData.dropna(inplace = True)
censusData = censusData.reset_index(drop = True)

# One-hot encode the nominal columns with get_dummies.
censusData = pd.get_dummies(censusData, columns= ['race', 'sex', 'classification'])

#
# TODO:
# Print out your dataframe
#
print(censusData.head())


# I have been told that 'classification' should take ordinal values
# rather than nominal values (questionable though).
46 changes: 46 additions & 0 deletions Module2/listing.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
1,"Jamie Benn, LW",DAL,82,35,52,87,1,64,1.06,253,13.8,6,10,13,2,3
2,"John Tavares, C ",NYI,82,38,48,86,5,46,1.05,278,13.7,8,13,18,0,1
3,"Sidney Crosby, C ",PIT,77,28,56,84,5,47,1.09,237,11.8,3,10,21,0,0
4,"Alex Ovechkin, LW",WSH,81,53,28,81,10,58,1.00,395,13.4,11,25,9,0,0
�,"Jakub Voracek, RW",PHI,82,22,59,81,1,78,0.99,221,10.0,3,11,22,0,0
6,"Nicklas Backstrom, C ",WSH,82,18,60,78,5,40,0.95,153,11.8,3,3,30,0,0
7,"Tyler Seguin, C ",DAL,71,37,40,77,-1,20,1.08,280,13.2,5,13,16,0,0
8,"Jiri Hudler, LW",CGY,78,31,45,76,17,14,0.97,158,19.6,5,6,10,0,0
�,"Daniel Sedin, LW",VAN,82,20,56,76,5,18,0.93,226,8.9,5,4,21,0,0
10,"Vladimir Tarasenko, RW",STL,77,37,36,73,27,31,0.95,264,14.0,6,8,10,0,0
,PP,SH
RK,PLAYER,TEAM,GP,G,A,PTS,+/-,PIM,PTS/G,SOG,PCT,GWG,G,A,G,A
�,"Nick Foligno, LW",CBJ,79,31,42,73,16,50,0.92,182,17.0,3,11,15,0,0
�,"Claude Giroux, C ",PHI,81,25,48,73,-3,36,0.90,279,9.0,4,14,23,0,0
�,"Henrik Sedin, C ",VAN,82,18,55,73,11,22,0.89,101,17.8,0,5,20,0,0
14,"Steven Stamkos, C ",TB,82,43,29,72,2,49,0.88,268,16.0,6,13,12,0,0
�,"Tyler Johnson, C ",TB,77,29,43,72,33,24,0.94,203,14.3,6,8,9,0,0
16,"Ryan Johansen, C ",CBJ,82,26,45,71,-6,40,0.87,202,12.9,0,7,19,2,0
17,"Joe Pavelski, C ",SJ,82,37,33,70,12,29,0.85,261,14.2,5,19,12,0,0
�,"Evgeni Malkin, C ",PIT,69,28,42,70,-2,60,1.01,212,13.2,4,9,17,0,0
�,"Ryan Getzlaf, C ",ANA,77,25,45,70,15,62,0.91,191,13.1,6,3,10,0,2
20,"Rick Nash, LW",NYR,79,42,27,69,29,36,0.87,304,13.8,8,6,6,4,1
,PP,SH
RK,PLAYER,TEAM,GP,G,A,PTS,+/-,PIM,PTS/G,SOG,PCT,GWG,G,A,G,A
21,"Max Pacioretty, LW",MTL,80,37,30,67,38,32,0.84,302,12.3,10,7,4,3,2
�,"Logan Couture, C ",SJ,82,27,40,67,-6,12,0.82,263,10.3,4,6,18,2,0
23,"Jonathan Toews, C ",CHI,81,28,38,66,30,36,0.81,192,14.6,7,6,11,2,1
�,"Erik Karlsson, D ",OTT,82,21,45,66,7,42,0.80,292,7.2,3,6,24,0,0
�,"Henrik Zetterberg, LW",DET,77,17,49,66,-6,32,0.86,227,7.5,3,4,24,0,0
26,"Pavel Datsyuk, C ",DET,63,26,39,65,12,8,1.03,165,15.8,5,8,16,0,0
�,"Joe Thornton, C ",SJ,78,16,49,65,-4,30,0.83,131,12.2,0,4,18,0,0
28,"Nikita Kucherov, RW",TB,82,28,36,64,38,37,0.78,190,14.7,2,2,13,0,0
�,"Patrick Kane, RW",CHI,61,27,37,64,10,10,1.05,186,14.5,5,6,16,0,0
�,"Mark Stone, RW",OTT,80,26,38,64,21,14,0.80,157,16.6,6,5,8,1,0
,PP,SH
RK,PLAYER,TEAM,GP,G,A,PTS,+/-,PIM,PTS/G,SOG,PCT,GWG,G,A,G,A
�,"Alexander Steen, LW",STL,74,24,40,64,8,33,0.86,223,10.8,5,8,16,0,0
�,"Kyle Turris, C ",OTT,82,24,40,64,5,36,0.78,215,11.2,6,4,12,1,0
�,"Johnny Gaudreau, LW",CGY,80,24,40,64,11,14,0.80,167,14.4,4,8,13,0,0
�,"Anze Kopitar, C ",LA,79,16,48,64,-2,10,0.81,134,11.9,4,6,18,0,0
35,"Radim Vrbata, RW",VAN,79,31,32,63,6,20,0.80,267,11.6,7,12,11,0,0
�,"Jaden Schwartz, LW",STL,75,28,35,63,13,16,0.84,184,15.2,4,8,8,0,2
�,"Filip Forsberg, C ",NSH,82,26,37,63,15,24,0.77,237,11.0,6,6,13,0,0
�,"Jordan Eberle, RW",EDM,81,24,39,63,-16,24,0.78,183,13.1,2,6,15,0,0
�,"Ondrej Palat, LW",TB,75,16,47,63,31,24,0.84,139,11.5,5,3,8,1,1
40,"Zach Parise, LW",MIN,74,33,29,62,21,41,0.84,259,12.7,3,11,5,0,0
Expand Down
Loading