claire/Learning.py at master · daugaard/claire · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import configparser
import logging
from time import gmtime
from datetime import datetime, timedelta

import couchdb

from clairelib.HomeState import HomeState
from clairelib.NetworkService import NetworkService
import clairelib.couch.ViewDefinitions as ViewDefinitions

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier as ClassificationModel
from sklearn.ensemble import RandomForestRegressor as RegressionModel

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from numpy import *

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.externals import joblib

# Load runtime configuration from config.cfg in the working directory.
# ConfigParser.read() silently skips files it cannot open, so a missing
# "General" section is how an absent or empty config.cfg shows up here.
config = configparser.ConfigParser()
config.read('config.cfg')

if "General" not in config.sections():
    print("Missing general section in configuration file. Please check config.cfg exists.")
    # Abort with a non-zero status so a calling shell/cron job sees the failure
    # (SystemExit is a builtin; the bare exit() helper is only provided by `site`).
    raise SystemExit(1)

# Configuration
home_name = config.get("General", "home_name")

couchdb_server = config.get("CouchDB", "url")
couchdb_name = config.get("CouchDB", "db")

logfile = config.get("Log","logfile")
# NOTE: "ouput_log_to_console" (sic) is the option name read from config.cfg;
# the spelling is kept so existing configuration files continue to work.
ouput_log_to_console = config.getboolean("Log","ouput_log_to_console")

# Connect to CouchDB
couch = couchdb.Server(couchdb_server)

try:
    # NOTE: this rebinds the name `couchdb` from the imported module to the
    # database handle; the rest of the script uses `couchdb` as the database.
    # If the lookup raises, the assignment never happens, so the `except`
    # clause below still resolves `couchdb.http` on the module.
    couchdb = couch[couchdb_name]
except couchdb.http.ResourceNotFound:
    print("Error database {} does not exist".format(couchdb_name))
    # Nothing below can work without a database, so stop here instead of
    # continuing with `couchdb` still bound to the module.
    raise SystemExit(1)

# Sync all views
ViewDefinitions.sync(couchdb)

# Load all home states in the database
home_states = HomeState.view(couchdb, "_design/home_state/_view/by_time")

# Fetch a single home state separately; it is only used to discover which
# output devices exist.
first_rows = HomeState.view(couchdb, "_design/home_state/_view/by_time", limit=1).rows
if not first_rows:
    # An empty view would otherwise surface as a bare IndexError on rows[0].
    print("No home states found in database {}. Nothing to train on.".format(couchdb_name))
    raise SystemExit(1)
first_home_state = first_rows[0]

# Use this code if you only want to train on a subset of data
#home_states = home_states[str(datetime.now()-timedelta(days=14)):str(datetime.now())]

# Get all output devices in this home
output_devices = first_home_state.output_devices()

# Build the per-device training data: Xs maps a device id to its list of
# feature vectors, ys maps the same id to the matching list of outputs.
Xs = {}
ys = {}
for device in output_devices:
    print("Generating training set for", device['name'])
    device_id = device['device_id']
    # Start from empty lists for this device and register them up front
    features, targets = [], []
    Xs[device_id] = features
    ys[device_id] = targets
    # One sample per stored home state
    for home in home_states:
        features.append(home.feature_vector_for_output_device(device))
        targets.append(home.output_vector_for_device_id(device_id))

# Now code the time values (weekday, hour and minute) as categorical features in one-of-k (aka one-hot) scheme
# NOTE(review): the encoder is re-fitted inside the loop for every device and
# dumped once after the loop, so the stored encoder is the fit from the last
# device processed - confirm that this is equivalent for all devices.
encoder = OneHotEncoder(categorical_features=[0,1,2], sparse=False) # One-hot encode features 0, 1 and 2

for device in output_devices:
    print("Training model for", device['name'], "with type", device['type'])
    X = Xs[device['device_id']]
    y = ys[device['device_id']]

    # Encode time values using encoder
    X = encoder.fit_transform(X)

    # Split into random training and test set (the seed varies with the wall clock second)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=gmtime().tm_sec) #2)

    # Pick the model: binary switches are classified, everything else is regressed
    is_binary_switch = device['type'] == 'BinaryPowerSwitchDevice'
    if is_binary_switch:
        model = ClassificationModel(n_estimators=100,max_features='auto',n_jobs=-1, class_weight='balanced_subsample') #{0:1,1:2}
    else:
        model = RegressionModel(n_estimators=10,max_features='log2',n_jobs=-1)

    print("Cross Validation Score: ", round(mean(cross_val_score(model, X, y))*100,2))

    # Fit on the training split only, so the score below is measured on samples
    # the model has not seen (fitting on all of X would leak the test rows).
    model.fit(X_train, y_train)

    y_predictions = model.predict(X_test)

    # Score predictions - accuracy for classifiers, mean squared error for regressors
    if is_binary_switch:
        # `normalize` is passed by keyword: newer scikit-learn releases make it keyword-only
        print("Accuracy Score: {} %".format(round(accuracy_score(y_test, y_predictions, normalize=True)*100, 2)))
    else:
        print("Mean Sq. Error Score: {}".format(round(mean_squared_error(y_test, y_predictions),2)))

    # Refit on the complete dataset so the stored model is trained on all
    # available samples, as before.
    model.fit(X, y)

    # Store the model for this device
    joblib.dump(model, "models/random_forest_model_device_{}.pkl".format(device['device_id']))

# Store encoder
joblib.dump(encoder, "models/feature_vector_encoder.pkl")