-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtitanic.py
More file actions
104 lines (61 loc) · 2.28 KB
/
titanic.py
File metadata and controls
104 lines (61 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""
Titanic survival exploratory analysis (Kaggle dataset).

Downloads the DataCamp-hosted Kaggle Titanic train/test CSVs, explores
survival rates by passenger attributes, engineers a Title feature from
passenger names, encodes Sex, and bins Age/Fare into bands.

Created on Sat Apr 15 12:12:19 2017
@author: Varun
"""
import pandas as pd
import numpy as np

# --- Load data -------------------------------------------------------------
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)

# --- Initial exploration ---------------------------------------------------
train.head()
train.describe()

# Keep both frames in one list so feature engineering applies to each.
data = [train, test]

# BUG FIX: the original called data.head() on the *list* itself, which
# raises AttributeError; inspect each DataFrame instead.
for df in data:
    df.head()

train.isnull().sum()
test.isnull().sum()
train.columns.values
print(train.info())
train.describe(include=['O'])

# Survival rate broken down by individual passenger attributes.
print(train[['Pclass', 'Survived']].groupby(['Pclass']).mean())
print(pd.crosstab(train['Pclass'], train['Survived']))
print(train[['Sex', 'Survived']].groupby(['Sex']).mean())
print(train[['SibSp', 'Survived']].groupby(['SibSp']).mean())
print(train[['Parch', 'Survived']].groupby(['Parch']).mean())

# --- Feature engineering ---------------------------------------------------
# BUG FIX: the original reassigned train/test via DataFrame.drop, which
# returns *new* frames, so `data` kept pointing at the old ones: every
# later `for row in data:` mutation was applied to stale frames and
# train['Title'] below raised KeyError. Dropping in place keeps `data`,
# `train`, and `test` referring to the same objects.
train.drop(['Cabin', 'Ticket'], axis=1, inplace=True)
test.drop(['Cabin', 'Ticket'], axis=1, inplace=True)
train.info()
train[['Age', 'Survived']]

# Extract the honorific ("Mr", "Miss", ...) preceding the "." in Name.
# Raw string avoids the invalid-escape warning; expand=False keeps the
# original Series (not DataFrame) result under modern pandas.
for df in data:
    df['Title'] = df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
pd.crosstab(train['Title'], train['Sex'])
data
train.head()

# Collapse rare titles and normalise French/abbreviated variants.
for df in data:
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                       'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                       'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

# Ordinal-encode Title; anything outside the mapping becomes 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for df in data:
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)
train.head()

# BUG FIX: the original also listed 'Ticket' and 'Cabin' here, but both
# were already dropped above, which raises KeyError on the second drop.
train.drop(['Name', 'PassengerId'], axis=1, inplace=True)
test.drop(['Name', 'PassengerId'], axis=1, inplace=True)
train.info()

# Encode Sex numerically (female=1, male=0) before one-hot encoding.
for df in data:
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
train.head()
test.head()
# NOTE(review): from here on train/test are reassigned again, so `data`
# goes stale — but no later step iterates `data`, so that is harmless.
train = pd.get_dummies(train, columns=['Sex'])
test = pd.get_dummies(test, columns=['Sex'])
test.head()

# Bin Age into 5 equal-width bands and inspect survival per band.
train['AgeBand'] = pd.cut(train['Age'], 5)
train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)
train = pd.get_dummies(train, columns=['AgeBand'])
train.head()
pd.cut(train['Fare'], 100)