gp_minimal.py
# This file provides a small self-contained demonstration of a Gaussian process
# model. The model really is just a Gaussian distribution! A few lines of
# standard linear algebra give the conditional distributions you need to perform
# inference. You will probably need to invest some effort in understanding this
# demonstration, and in working out the correspondence to the maths in the notes.
# No matter what I provide, you'll need to do some hard work. But once you've
# reproduced the ideas for yourself you should find that nothing that
# complicated is going on.
#
# Alternatively, gp_altdemo.py is a simpler demo, in that it requires less
# linear algebra, but I've used a non-standard presentation to achieve that
# trick: it will still require thought!
#
# One messy detail that we do encounter is that large covariance matrices
# constructed from the Gaussian kernel are poorly-conditioned. This script
# applies a standard hack to work around that.
#
# The class notes link to several polished and featureful Gaussian process
# toolboxes. These hide the maths from you behind user-friendly interfaces,
# and come with more extensive demonstrations.
#
# Iain Murray, November 2016
import numpy as np
import matplotlib.pyplot as plt
## The kernel function
######################################################################
# This demo uses a Gaussian kernel function,
# k(x1,x2) = sigma_f**2 * np.exp(-0.5 * np.sum(((x1-x2)/ell)**2))
# We create kernel functions that take NxD and MxD design matrices, to give NxM
# kernel values. rbf_fn is a fixed unit bandwidth RBF kernel, and
# gauss_kernel_fn rescales that using free parameters ell (1xD or 1x1) and sigma_f (1x1).
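# (rbf_fn below looks cryptic because it is vectorized: it uses the identity
#  ||x1-x2||^2 = ||x1||^2 + ||x2||^2 - 2*x1.x2, so all pairwise squared
#  distances come from a single matrix product.)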
rbf_fn = lambda X1, X2: \
        np.exp((np.dot(X1,(2*X2.T))-np.sum(X1*X1,1)[:,None]) - np.sum(X2*X2,1)[None,:])
gauss_kernel_fn = lambda X1, X2, ell, sigma_f: \
        sigma_f**2 * rbf_fn(X1/(np.sqrt(2)*ell), X2/(np.sqrt(2)*ell))
# Pick some particular parameters for this demo:
k_fn = lambda X1, X2: gauss_kernel_fn(X1, X2, 3.0, 10.0)
# You could also try:
# k_fn = lambda X1, X2: gauss_kernel_fn(X1, X2, 0.5, 1.0)
# k_fn = lambda X1, X2: gauss_kernel_fn(X1, X2, 0.5, 1.0) + gauss_kernel_fn(X1, X2, 3.0, 10.0)
# ...
# gauss_kernel_fn is equivalent to the following slower function:
def gauss_kernel_naive(X1, X2, ell, sigma_f):
    N, M = X1.shape[0], X2.shape[0]
    K = np.zeros((N, M))
    for n in range(N):
        for m in range(M):
            K[n,m] = sigma_f**2 * np.exp(
                    -0.5*np.sum(((X1[n,:]-X2[m,:])/ell)**2))
    return K
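# Quick check (added for illustration): the vectorized kernel and the naive loop
# should agree to numerical precision on arbitrary random inputs.
_Xa, _Xb = np.random.randn(5, 2), np.random.randn(4, 2)
assert np.allclose(gauss_kernel_fn(_Xa, _Xb, 3.0, 10.0),
                   gauss_kernel_naive(_Xa, _Xb, 3.0, 10.0))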
## Sampling from the prior
######################################################################
# Pick the input locations that we want to see the function at.
X_grid = np.arange(0, 10, 0.02)[:,None]
# Compute covariance of function values for those points. The Gaussian kernel
# gives positive definite matrices in theory. In practice, we need to add a tiny
# amount of noise to the process (a "jitter" or "nugget" term) if we need a
# matrix that is positive definite given finite numerical precision.
N_grid = X_grid.shape[0]
K_grid = k_fn(X_grid, X_grid) + 1e-9*np.eye(N_grid)
# To sample from a Gaussian with covariance K = L*L', we just multiply L by a
# vector of standard normals: if nu ~ N(0,I), then L*nu has covariance
# L*I*L' = K.
L_grid = np.linalg.cholesky(K_grid)
plt.figure(1)
plt.clf()
for ii in range(3):
    f_grid = np.dot(L_grid, np.random.randn(N_grid))
    plt.plot(X_grid, f_grid, '-')
plt.legend(['prior sample 1', 'prior sample 2', 'prior sample 3'])
plt.xlabel('x')
plt.ylabel('f')
## Sampling from the prior in two stages
######################################################################
# We don't have to sample "the whole" function, or a dense grid.
# Here we pick four of the original grid points, sample function values at just
# those locations, and plot them as crosses:
idx = np.round(N_grid*np.array([0.2,0.4,0.6,0.8])).astype(int)
X_locs1 = X_grid[idx, :]
N_locs1 = idx.size
K_locs1 = k_fn(X_locs1, X_locs1)
L_locs1 = np.linalg.cholesky(K_locs1)
plt.figure(2)
plt.clf()
noise_var = 0.0 # add no noise to the samples, we want to look at function values
#noise_var = 1.0 # add some noise to the samples, we want to simulate data
f_locs1 = np.dot(L_locs1, np.random.randn(N_locs1)) + \
        np.sqrt(noise_var)*np.random.randn(N_locs1)
plt.plot(X_locs1, f_locs1, 'x', markersize=20, markeredgewidth=2)
# We could later choose to sample any of the other function values, conditioned
# on the ones we already picked. We compute all the relevant covariances, and
# apply standard formulae to get the mean and covariance of the Gaussian over
# the remaining function values, conditioned on the ones we've already picked.
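# (For reference, the standard conditioning formulae applied below, with
#  M = K_locs1 + noise_var*I, are:
#      mean = K_rest_locs1 M^{-1} f_locs1
#      cov  = K_rest - K_rest_locs1 M^{-1} K_rest_locs1'  )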
X_rest = np.delete(X_grid, idx, 0)
K_rest = k_fn(X_rest, X_rest)
K_rest_locs1 = k_fn(X_rest, X_locs1)
M = K_locs1 + noise_var*np.eye(N_locs1)
# We could compute cholesky(M) once and use it to solve both the linear systems
# in the next two lines (see the sketch just after them):
rest_cond_mu = np.dot(K_rest_locs1, np.linalg.solve(M, f_locs1))
rest_cond_cov = K_rest - np.dot(K_rest_locs1, np.linalg.solve(M, K_rest_locs1.T))
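# Sketch of that reuse (an addition, not part of the original demo): factor M
# once and reuse the triangular factor for both solves. np.linalg.solve doesn't
# exploit the triangular structure here, but M is only factorized once.
L_M = np.linalg.cholesky(M)
V = np.linalg.solve(L_M, K_rest_locs1.T)                  # L_M V = K_rest_locs1'
alpha = np.linalg.solve(L_M.T, np.linalg.solve(L_M, f_locs1))   # alpha = M^{-1} f_locs1
# np.dot(K_rest_locs1, alpha) matches rest_cond_mu, and
# K_rest - np.dot(V.T, V) matches rest_cond_cov.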
# Then we sample 3 different completions of the f_locs1 function values that we
# sampled above. Each completion by itself is just illustrating a sample of the
# function from the prior. The different completions illustrate different
# samples from the posterior at test locations, given a set of observed function
# values.
N_rest = X_rest.shape[0]
L_rest = np.linalg.cholesky(rest_cond_cov + 1e-9*np.eye(N_rest))
for ii in range(3):
    rest_sample = np.dot(L_rest, np.random.randn(N_rest)) + rest_cond_mu
    plt.plot(X_rest, rest_sample, '-')
# If you change noise_var from 0.0 to 1.0 above, you can simulate first looking
# at noisy values of the function. The conditional distribution is then the
# posterior over some function values given some noisy observations.
# If you'd prefer to plot the mean prediction and error bars rather than samples,
# you can do that too. The thick black solid line shows the mean, and dashed
# thick black lines show +/- 2 standard deviations -- at any particular
# location, we have ~95% belief the function will lie in this range.
plt.plot(X_rest, rest_cond_mu, '-k', linewidth=2)
rest_cond_std = np.sqrt(np.diag(rest_cond_cov))
plt.plot(X_rest, rest_cond_mu + 2*rest_cond_std, '--k', linewidth=2)
plt.plot(X_rest, rest_cond_mu - 2*rest_cond_std, '--k', linewidth=2)
# The mean and error bars can be computed more cheaply if you don't need the
# whole posterior covariance. See the notes; a sketch of that cheaper route follows.
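# A sketch of the cheaper computation (an addition, not part of the original
# demo): only the diagonal of the posterior covariance is needed for error bars,
# so we never form the full N_rest x N_rest matrix.
L_M = np.linalg.cholesky(M)
A = np.linalg.solve(L_M, K_rest_locs1.T)              # shape (N_locs1, N_rest)
cheap_mu = np.dot(A.T, np.linalg.solve(L_M, f_locs1))
prior_var = np.array([k_fn(x[None,:], x[None,:])[0,0] for x in X_rest])
cheap_var = prior_var - np.sum(A*A, 0)                # marginal posterior variances
assert np.allclose(cheap_mu, rest_cond_mu)
assert np.allclose(np.sqrt(np.maximum(cheap_var, 0)), rest_cond_std)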
plt.legend(['Initial Samples', 'completion 1', 'completion 2', 'completion 3',
            'mean completion', 'credible band'])
plt.xlabel('x')
plt.ylabel('f')
plt.show()
# Final remarks:
# --------------
# - None of the points have to be on a grid.
# - The initial training points X_locs1 could be anywhere.
# - The remaining test points X_rest could be anywhere.
# - If the points aren't sampled densely and in order, we wouldn't
# join them with straight lines. We'd just plot individual points.
# - The inputs can be in D dimensions. The kernel function is the
# only part of the code that looks at the original input features.
# None of the rest of the code cares where the scalar function
# values are located in input space.
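# Tiny illustration of that last remark (an addition, not part of the original
# demo): the same kernel function works unchanged on, say, 2-D inputs.
X_2d = np.random.randn(5, 2)                 # five points in 2-D input space
K_2d = k_fn(X_2d, X_2d)                      # 5x5 covariance of their function values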