-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLinearReg_MissingData.py
More file actions
197 lines (148 loc) · 5.86 KB
/
LinearReg_MissingData.py
File metadata and controls
197 lines (148 loc) · 5.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import numpy as np
from numpy.linalg import inv
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn
from matplotlib import pyplot as plt
"""
A base class for linear regression with 1D output.
Notes on this implementation are in the 'EM algorithm' section of the
repository.
Note 1: currently, this assumes that each input dimension has the same
independent Gaussian prior over missing values.
Note 2: this implementation assumes that the model is linear in the
inputs and the parameters to be estimated. If nonlinear in the inputs,
we lose closed-form expressions for the posterior over z (the solution
in that case would be to a sampling scheme like MCMC, SMC etc.)
P.L.Green
"""
class LinearReg_MissingData():
    """
    Description
    -----------
    Linear regression with missing input data, trained with the EM
    algorithm. Missing input components are marked as NaN and treated
    as latent variables z with independent Gaussian priors (the same
    prior for every input dimension).

    Parameters
    ----------
    U - inputs, where missing parts are marked as 'nan'
    Y - observed outputs
    N - no. training points
    D - dimension of inputs
    theta_init - initial estimate of model parameters
    mu_z - mean of prior over z
    sigma_z - std of prior over z
    sigma - std of measurement noise

    Methods
    -------
    expectation()
    maximisation()
    train(Nitt)
    """

    def __init__(self, U, Y, N, D, theta_init, mu_z, sigma_z, sigma):
        # Force column-oriented 2D arrays
        self.U = np.vstack(U)
        self.Y = np.vstack(Y)
        self.theta = np.vstack(theta_init)
        self.N = N
        self.D = D

        # Boolean masks: d marks missing entries, d_c marks observed ones
        self.d = np.isnan(self.U)
        self.d_c = np.invert(self.d)

        # Prior over missing values z
        self.mu_z = mu_z
        self.sigma_z = sigma_z

        # Measurement-noise standard deviation
        self.sigma = sigma

    def expectation(self):
        """
        E-step: finds the expected values of u and u * u^T under the
        posterior over the missing components z, holding theta fixed.
        """
        # Stores the expected values of u (observed entries kept as-is)
        self.EU = np.copy(self.U)

        # Stores the expected values of u * u^T (one D x D matrix per point)
        self.EUU = np.zeros([self.N, self.D, self.D])

        # Posterior distributions of z (we don't use these, but we output
        # them in case they are useful)
        self.posterior_z = []

        # Loop over all data points
        for n in range(self.N):

            # If there are missing values in this array
            if np.any(self.d[n]):

                # To simplify notation
                d = self.d[n]
                d_c = self.d_c[n]
                theta_d = self.theta[d]
                theta_d_c = self.theta[d_c]

                # 2D (outer-product) version of d; indexes the
                # missing-missing block of EUU
                D2 = np.array([d]).T @ np.array([d])

                # Create the mean and covariance matrix for our prior
                # over z. Note that, even if there's only one missing
                # component, Mu_z and Sigma_z will always be 2D.
                Mu_z = np.vstack(np.repeat(self.mu_z, np.sum(d)))
                Sigma_z = np.diag(np.repeat(self.sigma_z**2, np.sum(d)))

                # Posterior precision matrix (note that this is the same
                # regardless of the number of known / missing points there
                # are).
                A = (self.sigma**-2 * theta_d @ theta_d.T +
                     inv(Sigma_z))

                # If the data point contains both missing and known values
                if np.any(self.d_c[n]):

                    # Known values
                    x = np.vstack(self.U[n, d_c])

                    # 2D version of d_c, plus the mask for the cross
                    # (known-missing) blocks of EUU
                    D2_c = np.array([d_c]).T @ np.array([d_c])
                    D2_cross = np.invert(D2 + D2_c)

                    # Needed to evaluate posterior mean (see notes); the
                    # contribution of the known inputs is moved onto the
                    # 'data' side of the regression
                    b = (self.sigma**-2 * self.Y[n] * theta_d.T -
                         self.sigma**-2 * theta_d_c.T @ x @ theta_d.T +
                         Mu_z.T @ inv(Sigma_z)).T

                    # Find required expected functions of latent variables
                    EZ = inv(A) @ b
                    EZZ = inv(A) + EZ @ EZ.T

                    # Place E[x * x^T] and the cross terms E[x * z^T] in
                    # E[u * u^T] (the cross block is symmetric, hence the
                    # repeat)
                    self.EUU[n, D2_c] = (x @ x.T).flatten()
                    self.EUU[n, D2_cross] = np.repeat((x @ EZ.T).flatten(),
                                                      2)

                # If the data point only contains missing values
                else:

                    # Needed to evaluate posterior mean (see notes).
                    # FIX: the prior term enters with '+', matching the
                    # mixed known/missing branch above and the standard
                    # Gaussian-posterior result
                    # b = sigma^-2 * y * theta_d + Sigma_z^-1 * mu_z.
                    # (It was previously subtracted, which only agreed
                    # with the correct result when mu_z = 0.)
                    b = (self.sigma**-2 * self.Y[n] * theta_d.T +
                         Mu_z.T @ inv(Sigma_z)).T

                    # Find required expected functions of latent variables
                    EZ = inv(A) @ b
                    EZZ = inv(A) + EZ @ EZ.T

                # Place E[z] in E[u]
                self.EU[n, d] = EZ.T

                # Place E[z * z^T] in E[u * u^T]
                self.EUU[n, D2] = EZZ.flatten()

                # Save posterior over z
                self.posterior_z.append(mvn(mean=EZ.T[0], cov=inv(A)))

            else:
                # Fully observed point: no posterior over z, and
                # E[u * u^T] is the exact outer product
                self.posterior_z.append([])
                self.EUU[n] = np.array([self.U[n]]).T @ np.array([self.U[n]])

    def maximisation(self):
        """
        M-step: maximum-likelihood update of theta given the expected
        sufficient statistics from the E-step:
            theta = (sum_n E[u u^T])^-1 (sum_n y_n E[u])
        """
        num = np.zeros([self.D, 1])
        den_inv = np.zeros([self.D, self.D])
        for n in range(self.N):
            num += self.Y[n] * np.vstack(self.EU[n])
            den_inv += self.EUU[n]
        self.theta = inv(den_inv) @ num

    def train(self, Nitt=10):
        """
        Description
        -----------
        Trains model using the EM algorithm (alternating E and M steps).

        Parameters
        ----------
        Nitt - no. EM iterations used in training
        """
        # theta_store[i] holds the parameter estimate after i iterations
        self.theta_store = np.zeros([Nitt + 1, self.D])
        self.theta_store[0] = self.theta.T
        for i in range(Nitt):
            self.expectation()
            self.maximisation()
            self.theta_store[i + 1] = self.theta.T