SanFranciscoCrimeClassification/newClassification.py at master · boruiwang/SanFranciscoCrimeClassification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 13:16:51 2016

@author: Kevin
"""

import pandas as pd
import time
import csv
import numpy as np
import os

from sklearn import preprocessing, cross_validation
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sknn.mlp import Classifier, Layer
from sklearn.decomposition import PCA

def clear():
    """Clear the terminal and return the shell command's exit status.

    ``cls`` only exists on Windows; other platforms provide ``clear``, so the
    command is chosen from ``os.name`` instead of being hard-coded.
    """
    return os.system('cls' if os.name == 'nt' else 'clear')


clear()

train = pd.read_csv('newtrain.csv', parse_dates = ['Dates'])
train = train.head(n=100000)

print train.head()


train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
#train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
train['hour'] = train['Dates'].dt.hour
train['evening'] = train['Dates'].dt.hour.isin([18,19,20,21,22,23,0,1,2,3,4,5,6])
train['Year'] = train['Dates'].dt.year
#train = train[train['Year'].isin([2011,2012,2013,2014,2015])]
train['Month'] = train['Dates'].dt.month


start = time.time()
lenth = len(train)
cur = 0

print '  -> processing time:', time.time() - start
#print train.head()
print len(set(train['StreetNo'])), len(set(train['Address']))

# Encode the 'Category' column as integer class labels.
le = LabelEncoder()
crime = le.fit_transform(train['Category'])

# Indicator (dummy) columns for each categorical feature.
hour = pd.get_dummies(train['hour'])
district = pd.get_dummies(train['PdDistrict'])
StreetNo = pd.get_dummies(train['StreetNo'])
evening = pd.get_dummies(train['evening'])
ContainOf = pd.get_dummies(train['AddressContainOf'])
# Year and Month are encoded here but are not part of the feature matrix below.
Year = pd.get_dummies(train['Year'])
Month = pd.get_dummies(train['Month'])

# Feature matrix: the indicator columns side by side with the X and Y columns.
feature_blocks = [hour, district, StreetNo, evening, ContainOf, train['X'], train['Y']]
train_data = pd.concat(feature_blocks, axis=1)

# The label is appended as the last column, so every column before it is a
# feature.
train_data['crime'] = crime
crime_label = train_data['crime']
crime_data = train_data.iloc[:, :-1]

# Models evaluated in the cross-validation loop, in order: a Bernoulli naive
# Bayes model, random forests over a sweep of depths, a KNN model and two
# boosted ensembles.
classifiers = [BernoulliNB()]

# Random forests with max_depth 10, 12, ..., 22.
classifiers += [
    RandomForestClassifier(max_depth=depth, n_estimators=1024, n_jobs=-1)
    for depth in range(10, 24, 2)
]

classifiers += [
    KNeighborsClassifier(n_neighbors=100, weights='distance', algorithm='ball_tree', leaf_size=100, p=10, metric='minkowski'),
    #XGBClassifier(max_depth=16,n_estimators=1024),
    GradientBoostingClassifier(n_estimators=10, learning_rate=1.0,max_depth=5, random_state=0),
    AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=8), algorithm="SAMME.R", n_estimators=128),
]

#print train.head()

# A shorter list of models; nothing in the visible part of the script iterates
# over it.
newClassifiers = [
    BernoulliNB(),
    RandomForestClassifier(
        n_estimators=1024,
        max_depth=16,
        n_jobs=-1,
    ),
    GradientBoostingClassifier(
        n_estimators=1024,
        max_depth=16,
    ),
    #KNeighborsClassifier(n_neighbors=50, weights='distance', algorithm='ball_tree', leaf_size=100, p=10, metric='minkowski', n_jobs=-1),
]


#[train_d, test_d, train_labels, test_labels] = cross_validation.train_test_split(crime_data, crime_label, test_size=0.2, random_state=20160217)
skf = cross_validation.StratifiedKFold(crime_label, n_folds=2, random_state=20160217, shuffle=True)
for train_index, test_index in skf:
    train_d, test_d = crime_data.iloc[train_index,:], crime_data.iloc[test_index,:]
    train_labels, test_labels = crime_label[train_index], crime_label[test_index]
    print train_d.shape, test_d.shape
    for classifier in classifiers:
        print classifier.__class__.__name__
        start = time.time()
        classifier.fit(train_d, train_labels)
        print '  -> Training time:', time.time() - start

        start = time.time()
        #score_result = classifier.score(test_d, test_labels)
        #print '  -> caluclate score time', time.time() - start

        start = time.time()
        predicted = np.array(classifier.predict_proba(test_d))
        print '  -> predict_proba time:', time.time() - start

        start = time.time()
        log_result = log_loss(test_labels, predicted)
        print '  -> calculate log_loss time:', time.time() - start

        #print "score = ", score_result, "log loss = ",log_result
        print "log_loss = ", log_result