-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmalariaCleaningAnalysisDescVis.py
More file actions
720 lines (549 loc) · 20.5 KB
/
malariaCleaningAnalysisDescVis.py
File metadata and controls
720 lines (549 loc) · 20.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
#%% Change working directory from the workspace root to the ipynb file location. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os
try:
    # Best-effort: move into the project folder when launched from the
    # workspace root, and report the resulting working directory.
    os.chdir(os.path.join(os.getcwd(), 'Data-Analysis-Project'))
    print(os.getcwd())
except OSError:
    # The folder is missing or inaccessible; keep the current directory.
    # Only filesystem errors are ignored so that Ctrl-C and genuine bugs
    # are no longer silently swallowed.
    pass
#%%
#import libs
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import numpy.random as nr
import math
#%%
# load datasets
Malaria = pd.read_csv('./NG_2015_MIS_07012019_1354_135943/nmis.csv')
#%%
Malaria.head(20)
#%%
# assign human-readable names to column/variable names in the dataset
#%%
Malaria.columns = [
    'Case Identification', 'Region', 'Type of Place of Residence',
    'Source of Drinking Water', 'Type of Toilet Facility',
    'Has Electricity', 'Main Floor Material', 'Main Wall Material',
    'Main Roof Material', 'Has Bicycle', 'Has Motorcycle/Scooter',
    'Has Car/Truck', 'Has Mosquito Bed Net for Sleeping',
    'Owns Land Suitable for Agriculture', 'Has Bank Account',
    'Wealth Index', 'Cost of Treatment for Fever', 'State',
]
print(Malaria.shape)
#%%
Malaria.head()
#%%
# Some of the column/variable names contain wild/special chars:
# replace every '/' with 'or' (e.g. 'Has Car/Truck' -> 'Has CarorTruck').
# The loop variable is named `name` so the builtin `str` is not shadowed.
Malaria.columns = [name.replace('/', 'or') for name in Malaria.columns]
#%%
Malaria.head()
#%%
Malaria.dtypes
# We are going to check which of our variables have missing values
#%%
# check for missing values
Malaria.isna().sum()
#%% [markdown]
# 'Type of Toilet Facility' and 'Cost of Treatment for Fever' contain missing values, so both columns are removed from the dataset.
#%%
Malaria.drop(columns=['Cost of Treatment for Fever', 'Type of Toilet Facility'], inplace=True)
#%%
Malaria.head()
#%%
# Wrap the cleaned table in a pandas DataFrame for the analysis below
df = pd.DataFrame(Malaria)
#%%
#Descriptive statistics/analysis
#%%
# Frequency tables: number of rows per distinct value of each column.
df['State'].value_counts()
#%%
df['Has Electricity'].value_counts()
#%%
df['Source of Drinking Water'].value_counts()
#%%
df['Wealth Index'].value_counts()
#%%
df['Has Mosquito Bed Net for Sleeping'].value_counts()
#%%
# Summarise 'State' within each wealth group.
# NOTE(review): assumes 'State' holds text labels here, in which case
# describe() reports count/unique/top/freq and `top` is the most frequent
# State *within that group* - confirm before reading it as a ranking.
df.groupby('Wealth Index')['State'].describe()
#%% [markdown]
# From the table above, Lagos State has the top number of richest people and Sokoto State has the top number of poorest people.
#%%
# Same summary, grouped by bed-net ownership.
df.groupby('Has Mosquito Bed Net for Sleeping')['State'].describe()
#%% [markdown]
# From above table Bauchi State has the Highest number of people with Mosquito Bed Net for Sleeping, While Edo State has the least Number.
#%%
# Same summary, grouped by access to electricity.
df.groupby('Has Electricity')['State'].describe()
#%% [markdown]
# From above table Lagos State has the Highest number of people with access to Electricity, While Adamawa State has the least Number.
#%%
# Frequency table of drinking-water sources (repeated for the note below).
df['Source of Drinking Water'].value_counts()
#%% [markdown]
# From the table above, most people source of drinking water is Tube Well or Borehole.
#%%
#APPLICATION OF MACHINE LEARNING MODELS
#%%
Malaria.head()
#%%
Malaria_ML = pd.read_csv("./NG_2015_MIS_07012019_1354_135943/numeric_nmis.csv")
#%%
Malaria_ML.head()
#%%
# Same human-readable column names as used for the descriptive dataset.
Malaria_ML.columns = [
    'Case Identification', 'Region', 'Type of Place of Residence',
    'Source of Drinking Water', 'Type of Toilet Facility',
    'Has Electricity', 'Main Floor Material', 'Main Wall Material',
    'Main Roof Material', 'Has Bicycle', 'Has Motorcycle/Scooter',
    'Has Car/Truck', 'Has Mosquito Bed Net for Sleeping',
    'Owns Land Suitable for Agriculture', 'Has Bank Account',
    'Wealth Index', 'Cost of Treatment for Fever', 'State',
]
print(Malaria_ML.shape)
Malaria_ML.head()
#%%
# Replace '/' with 'or' in the column names; the loop variable is `name`
# so the builtin `str` is not shadowed.
Malaria_ML.columns = [name.replace('/', 'or') for name in Malaria_ML.columns]
#%%
# Count missing values per column.
Malaria_ML.isnull().sum(axis=0)
#%%
# Remove the two columns dropped from the descriptive dataset as well.
Malaria_ML.drop(['Cost of Treatment for Fever','Type of Toilet Facility'], axis=1, inplace=True)
#%%
Malaria_ML.head()
#%%
Malaria_ML.tail()
#%%
def plot_corr(Malaria_ML, size=11):
    """Plot a graphical correlation matrix for each pair of columns.

    Input:
        Malaria_ML: pandas DataFrame; ``Malaria_ML.corr()`` supplies the
            matrix that is drawn.
        size: vertical and horizontal size of the (square) figure.

    Displays:
        One coloured cell per pair of columns, coloured by correlation
        value using matplotlib's active default colormap (no colormap is
        selected here). The diagonal running from top left to bottom right
        is each column against itself.

    Returns:
        None. The figure is created through pyplot as a side effect.
    """
    corr = Malaria_ML.corr()  # data frame correlation function
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)  # color code the rectangles by correlation value

    positions = list(range(len(corr.columns)))
    # Column names are long; rotate the x labels so they do not overlap.
    ax.set_xticks(positions)
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(corr.columns)
#%%
# Correlated Feature Check.
# Correlation by color. Red is most correlated with other variable, Yellow is self to self correlated and Blue is least correlated with other variable.
#%%
plot_corr(Malaria_ML)
#%%
# State and Case Identification appear to be correlated,
# so the State column is dropped.
Malaria_ML.drop(columns='State', inplace=True)
#%%
Malaria_ML.head()
#%%
Malaria_ML.corr()
#%%
plot_corr(Malaria_ML)
#%%
# The correlations look good. There appear to be no correlated columns.
## Next we want to check class distribution
#%%
# Count how many rows fall in each class of the target variable.
bed_net = Malaria_ML['Has Mosquito Bed Net for Sleeping']
num_obs = len(Malaria_ML)
num_true = int((bed_net == 1).sum())
num_false = int((bed_net == 0).sum())
print("Number of True cases: {0} ({1:2.2f}%)".format(num_true, (num_true/num_obs) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/num_obs) * 100))
#%% [markdown]
# Our class distribution is fairly good.
#%% [markdown]
# # Splitting the data
# 70% for training, 30% for testing
#%%
#Let us explore our target variable and visualize it
##Pictorial representation of the target variable
#%%
sb.countplot(data=Malaria_ML, x='Has Mosquito Bed Net for Sleeping', palette='hls')
plt.show()
#%%
# train_test_split is imported from sklearn.model_selection
# (the older sklearn.cross_validation import is no longer used).
from sklearn.model_selection import train_test_split

# independent variables (feature variables)
feature_col_names = [
    'Region',
    'Type of Place of Residence',
    'Source of Drinking Water',
    'Has Electricity',
    'Wealth Index',
    'Has Bicycle',
    'Has MotorcycleorScooter',
    'Has CarorTruck',
    'Owns Land Suitable for Agriculture',
    'Has Bank Account',
    'Main Floor Material',
    'Main Wall Material',
    'Main Roof Material',
]
# dependent variable (target)
predicted_class_names = ['Has Mosquito Bed Net for Sleeping']

# test_size specifies the proportion of the test set: 0.30 is 30%
split_test_size = 0.30

X = Malaria_ML[feature_col_names].values  # predictor feature columns (m rows x 13 features)
y = Malaria_ML[predicted_class_names].values  # predicted class (1=true, 0=false), one column

# random_state=42 fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split_test_size, random_state=42
)
#%%
#Get an idea about the rows and columns we have obtained
for heading, part in (("\nX_train:\n", X_train), ("\nX_test:\n", X_test)):
    print(heading)
    print(part.shape)
#%%
# check we have the desired 70% train, 30% test split of the data.
#%%
total_rows = len(Malaria_ML.index)
print("{0:0.2f}% in training set".format((len(X_train)/total_rows) * 100))
print("{0:0.2f}% in test set".format((len(X_test)/total_rows) * 100))
#%%
# Verifying predicted value was split correctly.
#%%
def _class_share(labels, value):
    """Return (count, percentage) of the entries of `labels` equal to `value`."""
    hits = len(labels[labels == value])
    return hits, (hits / len(labels)) * 100.0


all_labels = Malaria_ML['Has Mosquito Bed Net for Sleeping'].values
print("Original True : {0} ({1:0.2f}%)".format(*_class_share(all_labels, 1)))
print("Original False : {0} ({1:0.2f}%)".format(*_class_share(all_labels, 0)))
print("")
print("Training True : {0} ({1:0.2f}%)".format(*_class_share(y_train, 1)))
print("Training False : {0} ({1:0.2f}%)".format(*_class_share(y_train, 0)))
print("")
print("Test True : {0} ({1:0.2f}%)".format(*_class_share(y_test, 1)))
print("Test False : {0} ({1:0.2f}%)".format(*_class_share(y_test, 0)))
#%%
# # Training Algorithm - Using Naive Bayes Machine Learning Model
from sklearn.naive_bayes import GaussianNB
# import the performance metrics library
from sklearn import metrics

# Build a Gaussian Naive Bayes classifier and fit it on the training split
nb_model = GaussianNB()
nb_model.fit(X_train, y_train.ravel())
#%% [markdown]
# Performance on Training Data
#%%
# predict values using the training data
nb_predict_train = nb_model.predict(X_train)
# Accuracy
nb_train_accuracy = metrics.accuracy_score(y_train, nb_predict_train)
print("Accuracy: {0:.0f}%".format(nb_train_accuracy*100))
print()
#%% [markdown]
# Our accuracy rate is 63% on the training data. This is below the 70% benchmark for our ideal ML Model.
#%% [markdown]
# Performance on Testing Data
#%%
from sklearn import metrics
# predict values using the testing data
nb_predict_test = nb_model.predict(X_test)
# testing metrics
print("nb_predict_test", nb_predict_test)
print("y_test", y_test)
nb_test_accuracy = metrics.accuracy_score(y_test, nb_predict_test)
print("Accuracy: {0:.0f}%".format(nb_test_accuracy*100))
#%%
#Accuracy on testing data is also below our 70% benchmark.
#%%
nb_confusion = metrics.confusion_matrix(y_test, nb_predict_test)
nb_report = metrics.classification_report(y_test, nb_predict_test)
print("Confusion Matrix")
print("{0}".format(nb_confusion))
print("")
print("Classification Report")
print(nb_report)
#%% [markdown]
# Our Recall and Precision rates are 70% and 77% respectively. This is ok; however, we will try other models to see whether they work better.
#%% [markdown]
# # Using Random Forest
#%%
from sklearn.ensemble import RandomForestClassifier
# Create random forest object (10 trees, fixed seed) and fit it
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)
rf_model.fit(X_train, y_train.ravel())
#%%
# Predict Training Data
#%%
rf_predict_train = rf_model.predict(X_train)
# training metrics
rf_train_accuracy = metrics.accuracy_score(y_train, rf_predict_train)
print("Accuracy: {0:.0f}%".format(rf_train_accuracy*100))
#%% [markdown]
# Random Forest Accuracy level looks much better.
#%% [markdown]
# Predict Test Data
#%%
rf_predict_test = rf_model.predict(X_test)
# testing metrics
rf_test_accuracy = metrics.accuracy_score(y_test, rf_predict_test)
print("Accuracy: {0:.0f}%".format(rf_test_accuracy*100))
#%% [markdown]
# But this is slightly below 70% for our test data.
#%%
rf_confusion = metrics.confusion_matrix(y_test, rf_predict_test)
print(rf_confusion)
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))
#%% [markdown]
# Our precision and Recall recorded good values based on true 'Yes' and 'No' for ownership of Mosquito Bed Net for Sleeping though the accuracy level on the test data is slightly less than our 70% benchmark.
#%% [markdown]
# # Using Logistic Regression
#%%
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(
    C=0.7,
    random_state=42,
    solver='liblinear',
    max_iter=10000,
)
lr_model.fit(X_train, y_train.ravel())
lr_predict_test = lr_model.predict(X_test)
## testing metrics
#Confusion Matrix Evaluation Metrics
lr_scores = (
    ("Accuracy: {0:.0f}%", metrics.accuracy_score),
    ("Precision: {0:.0f}%", metrics.precision_score),
    ("Recall: {0:.0f}%", metrics.recall_score),
)
for template, scorer in lr_scores:
    print(template.format(scorer(y_test, lr_predict_test)*100))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test))
print(metrics.confusion_matrix(y_test, lr_predict_test))
#%%
##Visualizing Confusion Matrix using Heatmap
# Rows of the confusion matrix are the actual classes and columns are the
# predicted classes, in the order given by `labels` (0 = No, 1 = Yes).
# Passing `labels` keeps the matrix 2x2 even if one class is absent.
class_labels = ['No', 'Yes']
lr_confusion = pd.DataFrame(
    metrics.confusion_matrix(y_test, lr_predict_test, labels=[0, 1]),
    index=class_labels,
    columns=class_labels,
)
fig, ax = plt.subplots()
# The heatmap sets its own ticks, so the class names are supplied through
# the DataFrame index/columns rather than through earlier tick calls.
sb.heatmap(lr_confusion, annot=True, cmap="viridis", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion Matrix: Has Mosquito Bed Net for Sleeping', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.tight_layout()
#%% [markdown]
# The Logistic Regression model performed best for our prediction, so we will go with the Logistic Regression model.
#%% [markdown]
# # Using our trained Model (Logistic Regression)
#%% [/]
# Save trained model to file
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; the standalone `joblib` package is the replacement.
# Fall back to the old location only for very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(lr_model, "Malaria Model")
#%%
#load trained model
# NOTE: joblib files are pickle-based; only load files you created yourself.
lr_model = joblib.load('Malaria Model')
#%%
#Test prediction on data and once the model is loaded
Malaria_Predic = pd.read_csv("./NG_2015_MIS_07012019_1354_135943/numeric_mtd.csv")
#%%
Malaria_Predic.head()
#%%
#Test data contains a few rows
#%%
#We will do some cleaning as before
Malaria_Predic.columns = [
    'Case Identification', 'Region', 'Type of Place of Residence',
    'Source of Drinking Water', 'Type of Toilet Facility',
    'Has Electricity', 'Main Floor Material', 'Main Wall Material',
    'Main Roof Material', 'Has Bicycle', 'Has Motorcycle/Scooter',
    'Has Car/Truck', 'Has Mosquito Bed Net for Sleeping',
    'Owns Land Suitable for Agriculture', 'Has Bank Account',
    'Wealth Index', 'Cost of Treatment for Fever', 'State',
]
print(Malaria_Predic.shape)
Malaria_Predic.head()
#%%
# Replace '/' with 'or' in the column names; the loop variable is `name`
# so the builtin `str` is not shadowed.
Malaria_Predic.columns = [name.replace('/', 'or') for name in Malaria_Predic.columns]
#%%
# Drop the columns that were not used as model features.
Malaria_Predic.drop(['Type of Toilet Facility', 'Cost of Treatment for Fever', 'Case Identification', 'State'], axis=1, inplace=True)
#%%
Malaria_Predic.head()
#%%
#We need to leave out 'Has Mosquito Bed Net for Sleeping' since that is what we are predicting
#Store the data with prefix X, as we did with X_train and X_test, to indicate that it only contains the predictor columns
#%%
# Select the predictors by name, in exactly the order used to build
# X_train (feature_col_names). The raw column order of this frame differs
# (e.g. 'Wealth Index' is last here but fifth in training), and the model
# was fitted on a positional array, so relying on the frame's own order
# would feed values into the wrong coefficients.
# Selecting also yields a new frame instead of an alias of Malaria_Predic,
# so the target column is excluded without mutating the source table.
X_predic = Malaria_Predic[feature_col_names]
#%%
X_predic
#%% [markdown]
# At this point our data is ready to be used for prediction.
#%% [markdown]
# Predict 'Has Mosquito Bed Net for Sleeping' with the prediction data. Returns 1 if True, 0 if false
#%%
Malaria_Predic.head()
#%%
# The model was fitted on a plain array, so pass a plain array here too.
lr_model.predict(X_predic.values)
#%%
# Our Model predicts well. Mission Accomplished!!
# Reload the numeric dataset for the visualisation section.
Malaria_Visual = pd.read_csv("./NG_2015_MIS_07012019_1354_135943/numeric_nmis.csv")
Malaria_Visual.columns = [
    'Case Identification', 'Region', 'Type of Place of Residence',
    'Source of Drinking Water', 'Type of Toilet Facility',
    'Has Electricity', 'Main Floor Material', 'Main Wall Material',
    'Main Roof Material', 'Has Bicycle', 'Has Motorcycle/Scooter',
    'Has Car/Truck', 'Has Mosquito Bed Net for Sleeping',
    'Owns Land Suitable for Agriculture', 'Has Bank Account',
    'Wealth Index', 'Cost of Treatment for Fever', 'State',
]
print(Malaria_Visual.shape)
Malaria_Visual.head()
#%%
#Check for Missing Values
# The `np.object` alias was deprecated and then removed from NumPy; the
# builtin `object` is the drop-in replacement and behaves identically.
(Malaria_Visual.astype(object).isnull()).any()
#%% [markdown]
# Column 'Cost of Treatment for Fever' containing NaN values is removed.
#%%
Malaria_Visual.drop('Cost of Treatment for Fever', axis = 1, inplace = True)
#%%
Malaria_Visual.head()
#%% [markdown]
# We would put the table in form of Pandas DataFrame.
#%%
df=pd.DataFrame (Malaria_Visual)
#%% [markdown]
# Now we would create and assign a list of dictionaries to recode the numerical values of SOME categorical variables in our dataset with human-readable text.
#%%
# Column name -> {numeric code: human-readable label}.
# Named `recode_maps` so the builtin `dict` is neither shadowed nor rebound
# while it is being iterated.
recode_maps = {
    'Has Electricity': {1: 'yes', 0: 'No'},
    'Type of Place of Residence': {1: 'Urban', 2: 'Rural'},
}
for col, labels in recode_maps.items():
    # Direct indexing: a code missing from the mapping raises KeyError
    # instead of silently producing a blank label.
    df[col] = [labels[code] for code in df[col]]
#%%
# Column name -> {numeric code: human-readable label}.
# Named `recode_maps` so the builtin `dict` is neither shadowed nor rebound
# while it is being iterated.
recode_maps = {
    'Source of Drinking Water': {
        10: 'Piped water',
        11: 'Piped into dwelling',
        12: 'Piped to yard/plot',
        13: 'public tap/standpipe',
        14: 'Piped to Neighbour',
        20: 'Tube well water',
        21: 'Tube well or borehole',
        30: 'Dug well (open/protected)',
        31: 'Protected well',
        32: 'Unprotected well',
        40: 'Surface water',
        41: 'Protected spring',
        42: 'Unprotected spring',
        43: 'River/dam/lake/ponds/stream/canal/irrigation channel',
        51: 'Rain water',
        61: 'Tanker truck',
        62: 'Cart with small tank',
        71: 'Bottled water',
        72: 'Sachet water',
        96: 'Other',
    },
}
for col, labels in recode_maps.items():
    # Direct indexing: a code missing from the mapping raises KeyError
    # instead of silently producing a blank label.
    df[col] = [labels[code] for code in df[col]]
#%%
# Column name -> {numeric code: human-readable label}.
# Named `recode_maps` so the builtin `dict` is neither shadowed nor rebound
# while it is being iterated.
recode_maps = {
    'Region': {
        1: 'North central',
        2: 'North east',
        3: 'North west',
        4: 'South east',
        5: 'South south',
        6: 'South west',
    },
}
for col, labels in recode_maps.items():
    # Direct indexing: a code missing from the mapping raises KeyError
    # instead of silently producing a blank label.
    df[col] = [labels[code] for code in df[col]]
#%%
# Column name -> {numeric code: human-readable label}.
# Named `recode_maps` so the builtin `dict` is neither shadowed nor rebound
# while it is being iterated.
recode_maps = {
    'State': {
        10: 'Sokoto',
        20: 'Zamfara',
        30: 'Katsina',
        40: 'Jigawa',
        50: 'Yobe',
        60: 'Borno-Urban',
        70: 'Adamawa',
        80: 'Gombe',
        90: 'Bauchi',
        100: 'Kano',
        110: 'Kaduna',
        120: 'Kebbi',
        130: 'Niger',
        140: 'FCT Abuja',
        150: 'Nasarawa',
        160: 'Plateau',
        170: 'Taraba',
        180: 'Benue',
        190: 'Kogi',
        200: 'Kwara',
        210: 'Oyo',
        220: 'Osun',
        230: 'Ekiti',
        240: 'Ondo',
        250: 'Edo',
        260: 'Anambra',
        270: 'Enugu',
        280: 'Ebonyi',
        290: 'Cross River',
        300: 'Akwa Ibom',
        310: 'Abia',
        320: 'Imo',
        330: 'Rivers',
        340: 'Bayelsa',
        350: 'Delta',
        360: 'Lagos',
        370: 'Ogun',
    },
}
for col, labels in recode_maps.items():
    # Direct indexing: a code missing from the mapping raises KeyError
    # instead of silently producing a blank label.
    df[col] = [labels[code] for code in df[col]]
#%%
# Column name -> {numeric code: human-readable label}.
# Named `recode_maps` so the builtin `dict` is neither shadowed nor rebound
# while it is being iterated.
recode_maps = {
    'Has Bank Account': {1: 'yes', 0: 'No'},
    'Has Bicycle': {1: 'yes', 0: 'No'},
}
for col, labels in recode_maps.items():
    # Direct indexing: a code missing from the mapping raises KeyError
    # instead of silently producing a blank label.
    df[col] = [labels[code] for code in df[col]]
#%%
# Column name -> {numeric code: human-readable label}.
# Named `recode_maps` so the builtin `dict` is neither shadowed nor rebound
# while it is being iterated.
recode_maps = {
    'Has Mosquito Bed Net for Sleeping': {1: 'yes', 0: 'No'},
    'Has Car/Truck': {1: 'yes', 0: 'No'},
}
for col, labels in recode_maps.items():
    # Direct indexing: a code missing from the mapping raises KeyError
    # instead of silently producing a blank label.
    df[col] = [labels[code] for code in df[col]]
#%%
# Column name -> {numeric code: human-readable label}.
# Named `recode_maps` so the builtin `dict` is neither shadowed nor rebound
# while it is being iterated.
recode_maps = {
    'Wealth Index': {
        1: 'Poorest',
        2: 'Poorer',
        3: 'Middle',
        4: 'Richer',
        5: 'Richest',
    },
}
for col, labels in recode_maps.items():
    # Direct indexing: a code missing from the mapping raises KeyError
    # instead of silently producing a blank label.
    df[col] = [labels[code] for code in df[col]]
#%%
# Column name -> {numeric code: human-readable label}.
# Named `recode_maps` so the builtin `dict` is neither shadowed nor rebound
# while it is being iterated.
recode_maps = {
    'Has Motorcycle/Scooter': {1: 'yes', 0: 'No'},
}
for col, labels in recode_maps.items():
    # Direct indexing: a code missing from the mapping raises KeyError
    # instead of silently producing a blank label.
    df[col] = [labels[code] for code in df[col]]
#%%
df
#%% [markdown]
# Fine with the missing values check and recoding of some categorical variables
# Now on to visualizing the dataset.
#%%
def plot_box(df, cols, col_x = 'Has Mosquito Bed Net for Sleeping'):
    """Draw one box plot per column in `cols`, grouped by `col_x`.

    Args:
        df: DataFrame holding the data to plot.
        cols: iterable of column names; each is plotted on the y axis of
            its own figure.
        col_x: column whose values define the groups on the x axis.

    Each plot is shown with ``plt.show()`` before the next one is drawn.
    """
    sb.set_style("whitegrid")
    for col in cols:
        # x/y are passed by keyword: newer seaborn releases no longer
        # accept them positionally.
        sb.boxplot(x=col_x, y=col, data=df)
        plt.xlabel(col_x)  # Set text for the x axis
        plt.ylabel(col)  # Set text for y axis
        plt.show()


num_cols = ['Case Identification']
plot_box(df, num_cols)
#%% [markdown]
# From the boxplot above, there is obvious gap in the number of people who indicated having no Mosquito Bed Net for Sleeping and those who indicated they have.
#%%
def plot_box(df, col, col_y = 'Case Identification'):
    """Draw a box plot of `col_y` grouped by the categories of `col`.

    Args:
        df: DataFrame holding the data to plot.
        col: column whose values define the groups on the x axis.
        col_y: column plotted on the y axis.

    The plot is shown with ``plt.show()``.
    """
    sb.set_style("whitegrid")
    # x/y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sb.boxplot(x=col, y=col_y, data=df)
    plt.xlabel(col)  # Set text for the x axis
    plt.ylabel(col_y)  # Set text for y axis
    plt.show()


# The helper is defined once above and reused by each of the cells below.
plot_box(df, 'Wealth Index')
#%% [markdown]
# From the box plot, the gap between Richer and Richest is not obvious. While the gap between the Middle, Poorest and Poorer is very obvious.
#%%
plot_box(df, 'Region')
#%% [markdown]
# As expected regions are distinct from each other.
#%%
plot_box(df, 'Has Electricity')
#%% [markdown]
# There is obvious difference in the number of people having and not having electricity.
#%%
plot_box(df, 'Type of Place of Residence')
#%% [markdown]
# As expected type of places of residence is also obviously distinct.
#%% [markdown]