ml-utils/utils.py at master · SharathGa/ml-utils · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
'''
This contains all the reusable code that I need for ML
'''


''' importing the necessary packages for the analysis '''
import pandas as pd
import numpy as np

"""
	These are the commonly used functions that we need for
	classification problems and for testing and analysis.
	The following is a list of functions available in this module:
	1.eda - does the preliminary exploratory data analysis.
	2.catfreq - calculates the value counts of categorical variables.

"""

def eda(df):
    '''Print a basic exploratory summary of a DataFrame.

    The report is written to standard output and contains, in order: the
    shape, the column index, the first and last five rows, the output of
    ``DataFrame.info()``, the per-column count of missing values and the
    per-column percentage of missing values relative to the row count.

    Parameters:
        df: the pandas DataFrame to summarise.

    Returns:
        None. The function only prints.
    '''
    # Shape of the data.
    print('The dimensions of the data is:')
    print(df.shape, '\n')

    # Names of the columns.
    print('The column names are:')
    print(df.columns, '\n')

    # First and last rows stacked into one frame so they print together.
    print('The head and tail of a df:')
    head_and_tail = pd.concat([df.head(), df.tail()], axis=0)
    print(head_and_tail, 2 * '\n')

    # Column types and non-null counts.
    print('The the following is the info: ')
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    df.info()
    print(2 * '\n')

    # Missing values per column (computed once, reused for the percentage).
    missing_per_column = df.isnull().sum()
    print('the total column wise missing values are: ')
    print(missing_per_column, 2 * '\n')

    # Missing values per column as a percentage of the number of rows.
    print('The relative percentage of missing values to the dataset: ')
    print((missing_per_column / len(df)) * 100)


 #replacing values in function
def replace_val(df, column , dictionary,fill_na= True):
	'''
	Intended to replace/map the values of ``column`` using the mappings
	given in ``dictionary`` and, when ``fill_na`` is true, to fill the
	remaining missing values: the mode of the column for categorical
	variables, the mean of the column for continuous variables.

	NOTE: not implemented yet. The body holds only this docstring and a
	commented-out draft, so calling the function changes nothing and
	returns None.
	'''
	# Draft kept for reference (incomplete, not valid Python as written):
	# if type(df['column']=='object'):
	# 	if fillna=='True':
	# 		df.column.map(dictionary).fillna(df.column.mean())
	# 	else


#Calculating the frequency counts of categorical columns of the data
def catfreq(df, top=5):
    '''Print the most frequent values of each object-dtype column.

    For every column selected by ``select_dtypes(include=['object'])`` a
    header line is printed, followed by the first ``top`` entries of that
    column's ``value_counts()``. When more than 8 such columns exist the
    per-column report is skipped and only a notice is printed.

    Parameters:
        df: the pandas DataFrame to inspect.
        top: how many of the leading value counts to show per column.

    Returns:
        None. The function only prints.
    '''
    object_columns = df.select_dtypes(include=['object']).columns

    # Too many columns to read comfortably: report that and stop.
    if len(object_columns) > 8:
        print('There are more than 8 columns')
        return

    for name in object_columns:
        print('The Frequency counts for {} column:'.format(name))
        counts = df[name].value_counts()
        print(counts[:top], '\n')


#converting a word to number :
def convert_to_int(word):
    '''Translate a number word into its integer value.

    Only 'one', 'two' and 'three' are known; any other key raises
    ``KeyError`` from the dictionary lookup.
    '''
    number_for_word = dict(one=1, two=2, three=3)
    value = number_for_word[word]
    return value


#Missing value imputation functions
''' The function below fills the missing values in the categorical (object) columns of the dataset
 '''
def mvc(df):
    '''Impute missing values in categorical columns, in place.

    Every column with object or string dtype has its missing entries
    replaced by the column's most frequent value (the first entry of
    ``value_counts()``). Other columns are left untouched, and so is a
    column that holds no non-missing value at all, since it has no most
    frequent value to fill with.

    Parameters:
        df: the pandas DataFrame to modify.

    Returns:
        None. ``df`` is modified in place.
    '''
    for col in df.columns:
        column = df[col]
        # For a single column it is dtype, and for all columns it is dtypes.
        is_categorical = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_categorical:
            continue

        counts = column.value_counts()
        if counts.empty:
            # All values are missing: indexing counts.index[0] would raise.
            continue

        df[col] = column.fillna(counts.index[0])


''' The function below imputes the missing values in numeric columns '''
def mvn(df, impute_type):
    '''Impute missing values in numeric columns, in place.

    Every numeric, non-boolean column has its missing entries replaced by
    the column median when ``impute_type == 'median'`` and by the column
    mean for any other value of ``impute_type``. Non-numeric columns are
    left untouched.

    Parameters:
        df: the pandas DataFrame to modify.
        impute_type: 'median' to fill with the median; anything else
            fills with the mean.

    Returns:
        None. ``df`` is modified in place.
    '''
    for col in df.columns:
        column = df[col]
        # Comparing the dtype with the string 'numeric' never matches a real
        # dtype, so ask pandas whether the column is numeric instead.
        if not pd.api.types.is_numeric_dtype(column):
            continue
        # pandas counts booleans as numeric; a mean/median is not a
        # meaningful fill value for them, so leave them alone.
        if pd.api.types.is_bool_dtype(column):
            continue

        if impute_type == 'median':
            fill_value = column.median()
        else:
            fill_value = column.mean()
        df[col] = column.fillna(fill_value)