# claire/LearningModel1.py at master · daugaard/claire · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import configparser
import logging
from time import gmtime

import couchdb

from clairelib.HomeState import HomeState
from clairelib.NetworkService import NetworkService
import clairelib.couch.ViewDefinitions as ViewDefinitions

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier as ClassificationModel

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.externals import joblib

# Load the runtime configuration from config.cfg in the current working directory.
config = configparser.ConfigParser()
config.read('config.cfg')  # a missing file is silently skipped and leaves the parser empty

# A parser without a "General" section is treated as a missing or unusable config file.
if "General" not in config.sections():
    print("Missing general section in configuration file. Please check config.cfg exists.")
    # Exit with a non-zero status so a calling shell or scheduler can see the failure.
    raise SystemExit(1)

# Configuration
home_name = config.get("General", "home_name")

couchdb_server = config.get("CouchDB", "url")
couchdb_name = config.get("CouchDB", "db")

logfile = config.get("Log","logfile")
# NOTE: "ouput_log_to_console" (sic) is the option name looked up in the [Log]
# section; the spelling has to stay in step with config.cfg.
ouput_log_to_console = config.getboolean("Log","ouput_log_to_console")

# Connect to CouchDB
couch = couchdb.Server(couchdb_server)

try:
    # Keep the database handle under its own name so the imported couchdb
    # module is not shadowed by it.
    db = couch[couchdb_name]
except couchdb.http.ResourceNotFound:
    print("Error database {} does not exist".format(couchdb_name))
    # Nothing below can work without the database, so stop here with a failure status.
    raise SystemExit(1)

# Sync all views
ViewDefinitions.sync(db)

# Load all home states in the database
home_states = HomeState.view(db, "_design/home_state/_view/by_time")

# Build the dataset: one feature row and one output row per stored home state
X, y = [], []
for state in home_states:
    features = state.feature_vector()
    outputs = state.output_vector()
    X.append(features)
    y.append(outputs)

# Feature columns 0, 1 and 2 hold the time values (weekday, hour and minute);
# encode them as categorical features in a one-of-k (aka one-hot) scheme.
one_hot_columns = [0, 1, 2]
encoder = OneHotEncoder(categorical_features=one_hot_columns, sparse=False)
X = encoder.fit_transform(X)

# Hold out a random 20% of the samples for testing. The shuffle is seeded with
# the seconds field of the current UTC time, so the split usually differs
# from one run to the next.
split_seed = gmtime().tm_sec
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=split_seed,
)

# Train the classifier on the training portion, then predict the held-out samples
model = ClassificationModel()
model.fit(X_train, y_train)
y_predictions = model.predict(X_test)

# Extract the predictions for each output variable and report its accuracy.
# (No F1 score is reported: f1_score is not imported by this script.)
for ov in range(len(y_test[0])):
    variable_y_predictions = [prediction[ov] for prediction in y_predictions]
    variable_y_test = [test[ov] for test in y_test]
    # normalize is passed by keyword: newer scikit-learn releases reject it as
    # a positional argument, and the keyword states that a fraction is wanted.
    accuracy = accuracy_score(variable_y_test, variable_y_predictions, normalize=True)
    print("Accuracy Score for output variable {}: {} %".format(ov, round(accuracy * 100, 2)))

# Persist the fitted preprocessor and the trained model, each to its own file
for artifact, target_path in (
    (encoder, "models/feature_vector_encoder.pkl"),
    (model, "models/random_forest_model.pkl"),
):
    joblib.dump(artifact, target_path)