generative_rf/python/beijing_weather.py at main · rom1mouret/generative_rf · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3

import urllib.request
from tempfile import gettempdir
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from generative_rf import FeatureGenerator


# Download the data into the temp directory unless a cached copy is already there.
data_path = os.path.join(gettempdir(), "beijing_weather.csv")
if not os.path.exists(data_path):
    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv"
    # Fetch into a scratch file and move it into place only on success, so an
    # interrupted download is never mistaken for a complete cached copy by the
    # existence check above on the next run.
    partial_path = data_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, data_path)
    finally:
        # after a successful replace the scratch file is gone; this only
        # cleans up what a failed download left behind
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Columns loaded from the CSV. `target` is the column to predict; every other
# listed column is used as an input feature.
target = "TEMP"
cols = ["month", "hour", "DEWP", "TEMP", "PRES", "Iws", "Is", "Ir", "pm2.5"]
features = [name for name in cols if name != target]


def read_chunks():
    """Yield the CSV at `data_path` as successive DataFrames, with a progress bar.

    Only the columns listed in `cols` are loaded. The progress-bar total is
    derived from a hard-coded row count, not read from the file itself.
    """
    rows_per_chunk = 4096
    total_rows = 43824
    # ceiling division: a trailing partial chunk still counts as one chunk
    expected_chunks = (total_rows + rows_per_chunk - 1) // rows_per_chunk
    reader = pd.read_csv(
        data_path, header=0, usecols=cols, chunksize=rows_per_chunk
    )
    yield from tqdm(reader, total=expected_chunks)

# The data generator under evaluation, plus the accumulated predictions and
# the matching ground-truth values.
gen_rf = FeatureGenerator()
pred, ref = [], []

# Stream the chunks: every chunk after the first is scored with the forest
# built from earlier chunks, then used to build the next forest.
for chunk_idx, chunk in enumerate(read_chunks()):
    # fill missing values with the per-column mean of the current chunk
    column_means = chunk.mean(axis=0, skipna=True)
    chunk.fillna(column_means, inplace=True)
    X_new = chunk[features].values
    y_new = chunk[target].values

    if chunk_idx == 0:
        # first chunk: nothing to predict with yet, just fit the initial forest
        first_rf = RandomForestRegressor()
        gen_rf.register(first_rf.fit(X_new, y_new))
    else:
        # prediction (made before this chunk is used for training)
        pred.extend(gen_rf.predict(X_new).tolist())
        ref.extend(y_new.tolist())

        # generate data from the forest and label it with the forest itself
        X_gen, w_gen = gen_rf.generate(10000)
        y_gen = gen_rf.predict(X_gen)

        # merge the generated data with the current chunk
        n_new, n_gen = X_new.shape[0], X_gen.shape[0]
        X_train = np.concatenate([X_new, X_gen], axis=0)
        y_train = np.concatenate([y_new, y_gen], axis=0)
        weights = np.array([1] * n_new + [w_gen] * n_gen)
        # rescale so the weights sum to the number of training rows
        weights *= len(weights) / weights.sum()

        # train a new forest from all the data
        next_rf = RandomForestRegressor().fit(
            X_train, y_train, sample_weight=weights
        )
        gen_rf.register(next_rf).reinforce(X_gen, w_gen)

    # always call these functions at the end of each iteration
    gen_rf.reinforce(X_new).update_moments(X_new)

# visualization
def rolling_avg(arr, window=1000):
    """Return the moving average of `arr` over `window` consecutive samples.

    The averages are computed from a cumulative sum. Output element j is the
    mean of arr[j + 1] .. arr[j + window]; the window starting at index 0 is
    not included, so the result has max(len(arr) - window, 0) elements.

    Args:
        arr: sequence of numbers (anything `np.cumsum` accepts).
        window: number of samples per average; must be at least 1.
            Defaults to 1000.

    Returns:
        numpy array of averages (empty when `arr` has no more than `window`
        elements).

    Raises:
        ValueError: if `window` is smaller than 1.
    """
    if window < 1:
        # window == 0 would otherwise fail with an obscure shape mismatch,
        # because csum[:-0] is an empty slice
        raise ValueError("window must be a positive integer")
    csum = np.cumsum(arr)
    return (csum[window:] - csum[:-window]) / window

# Plot the smoothed predictions against the smoothed ground truth.
smoothed_ref = rolling_avg(ref)
smoothed_pred = rolling_avg(pred)
time_axis = np.arange(len(smoothed_ref))

plt.plot(time_axis, smoothed_ref, label="ground-truth")
plt.plot(time_axis, smoothed_pred, label="Generative RF")
plt.xlabel("time")
plt.ylabel(target)
plt.legend()
plt.xticks([], [])  # no tick marks or labels on the x axis
plt.title("Beijing Temperature")

# save the same figure at several resolutions
for dpi in (70, 80, 90):
    plt.savefig("vs_ground_truth_%i.png" % dpi, dpi=dpi)