Python-for-Data-Engineers/5.SetIndexAndHandleNA.py at main · pjra99/Python-for-Data-Engineers · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
import numpy as np

def p(df):
    """Write the string form of ``df`` to standard output, followed by a newline.

    Thin convenience wrapper so the demo code can show any object
    (typically a DataFrame) with a short call. Returns ``None``.
    """
    rendered = str(df)
    print(rendered)

def main():
    """Demonstrate index setting and missing-value handling in pandas.

    Builds two small in-memory frames (employees and weather readings) and
    prints the result of each demonstrated operation through ``p``:
    ``set_index``, ``fillna``, ``ffill``, ``interpolate``, ``dropna``,
    ``reindex`` and ``replace``. Takes no arguments and returns ``None``.
    """
    data = [(101, 'Rajesh', 'IT', 75000, 'Chennai'),
            (102, 'Priya', 'HR', 60000, 'Mumbai'),
            (103, 'Anil', 'IT', None, 'Hyderabad'),
            (104, 'Sneha', None, 62000, 'Pune'),
            (105, 'Manish', None, 90000, None),
            (106, 'Suresh', 'IT', 78000, None)]

    columns = ["EmpID", "Name", "Department", "Salary", "Location"]

    df = pd.DataFrame(data, columns=columns)

    # set_index returns a NEW frame indexed by the given column when
    # inplace is omitted (or False); the original frame is left untouched.
    df_ = df.set_index("EmpID")
    p(df_)

    # With inplace=True the original frame itself is modified (and the call
    # returns None), so from here on "EmpID" is the index of df, not a column.
    df.set_index("EmpID", inplace=True)

    # fillna with a scalar replaces every missing cell with that value.
    df1 = df.fillna('Not Avl')
    p(df1)
    df2 = df.fillna(0)
    p(df2)

    # fillna also accepts a dict mapping column name -> replacement value,
    # so each column can get its own filler. "EmpID" is listed on purpose:
    # it is now the index rather than a column, so that entry has no effect.
    df3 = df.fillna({
        "EmpID": 0,
        "Department": "Dep to be assigned yet",
        "Salary": 0,
        "Location": "Not Avl"
    })
    p(df3)

    # Forward fill: carry the previous row's value down into missing cells.
    # ffill() is used instead of fillna(method="ffill") because the `method`
    # keyword of fillna is deprecated in recent pandas releases.
    df4 = df.ffill()
    p(df4)
    # Backward fill works the same way in the other direction (df.bfill()):
    # a missing cell takes the value of the next row that has one.

    # `limit` caps how many CONSECUTIVE missing cells are filled from the
    # last valid value in a column. With limit=1 only the first missing cell
    # of each gap is filled; e.g. in "Location" row 105 is filled from row
    # 104 while row 106 stays missing.
    df5 = df.ffill(limit=1)
    p(df5)

    # Interpolate: estimate a missing value from the neighbouring rows
    # (linear by default). It is only meaningful for numeric data, so it is
    # applied to the numeric columns only; text columns are left as they are
    # (interpolating object-dtype columns is deprecated in pandas).
    numeric_cols = df.select_dtypes(include="number").columns
    df6 = df.copy()
    df6[numeric_cols] = df6[numeric_cols].interpolate()
    p(df6)

    # dropna with no arguments drops every row that has at least one
    # missing value.
    df7 = df.dropna()
    p(df7)

    # dropna accepts parameters such as 'how' and 'thresh':
    #   how="all"  drops only rows in which ALL columns are missing
    #   thresh=N   keeps only rows that have at least N non-missing values
    df8 = df.dropna(how="all")
    p(df8)

    df9 = df.dropna(thresh=3)
    p(df9)

    weather_data = [
        ('2017-01-01', 32.0, 6.0, 'Rain'),
        ('2017-01-04', None, 9.0, 'Sunny'),
        ('2017-01-05', 28.0, None, 'Snow'),
        ('2017-01-07', 32.0, None, 'NA'),
        ('2017-01-10', 34.0, 8.0, 'Cloudy'),
        ('2017-01-11', 40.0, 12.0, 'Sunny')
    ]

    df10 = pd.DataFrame(weather_data, columns=['Date', 'Temp', 'Windspeed', 'Event'])
    df10['Date'] = pd.to_datetime(df10['Date'])  # convert the strings to datetime values
    df10.set_index('Date', inplace=True)  # use 'Date' as the index of the frame
    p(df10)

    # date_range builds a DatetimeIndex covering every day from the start
    # date to the end date (ISO strings avoid day/month ambiguity).
    dt = pd.date_range('2017-01-01', '2017-01-11')
    # Reindexing on that range inserts a row for every date in the range;
    # dates absent from df10 get rows filled with missing values.
    df11 = df10.reindex(dt)
    p(df11)

    # replace(target, replacement) swaps the target value for the
    # replacement wherever it occurs, in ANY column.
    df12 = df10.replace('Rain', 'Rainy')
    p(df12)

    # To restrict the replacement per column, pass a dict whose keys are
    # column names and whose values are the target to look for in that
    # column, followed by the single replacement value.
    df13 = df10.replace({
        'Temp': np.nan,
        'Windspeed': np.nan,
        'Event': 'NA'
    }, 'Not Avl')
    p(df13)
    # Targets can also be regular expressions, but then regex=True (the
    # boolean, not a string) must be passed, e.g.:
    #   df10.replace({'Event': '^S.*'}, 'Not Avl', regex=True)

    df14 = pd.DataFrame({
        'score': ['exceptional', 'average', 'good', 'poor', 'average', 'exceptional'],
        'student': ['rob', 'maya', 'parthiv', 'tom', 'julian', 'erica']
    })

    # replace also accepts two lists: the first holds the target values and
    # the second the replacements, matched position by position.
    df15 = df14.replace(['poor', 'average', 'good', 'exceptional'], [1, 2, 3, 4])
    p(df15)

# Run the demo only when this file is executed as a script, not on import.
if __name__ =="__main__":
    main()