# ==========================================================================
# Play with Recurrent Neural Nets
# --------------------------------------------------------------------------
# Code Reference: Andrej Karpathy and Yoav Goldberg (links in the code)
#
# Practice RNNs using only NumPy.
#
# Imagine a neural unit as a function Fn(input_data) -> output_data. In a
# typical feed-forward neural net, the output of one unit is fed to
# another unit, so the final output is Fn3(Fn2(Fn1(input_data))).
#
# In an RNN, the output of a unit is fed back to itself, giving
# Fn1(Fn1(Fn1(input_data))). Reusing the same function is reasonable when
# the tasks handled by the units are the same, e.g. reading the first
# sentence of a paragraph should not differ much from reading a middle
# sentence. Reusing the same unit also shares one set of weights across
# all time steps, which keeps the parameter count small.
#
# Specifically, an RNN for character-level text generation takes a
# context (hidden state "h") and the current character, and produces:
#   i)  an output: a probability distribution over the next character
#   ii) a new hidden state that stores the context so far
#
# ==========================================================================
import numpy as np
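# The single recurrence implemented below (shapes in brackets, with
# v = vocab size, h = hidden size):
#
#   h_t = tanh(Wxh @ x_t + Whh @ h_{t-1} + bh)   # [h,1] new hidden state
#   y_t = Why @ h_t + by                         # [v,1] next-char scores
#   p_t = softmax(y_t)                           # [v,1] next-char probs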
def RNN():
    '''
    Minimal character-level Vanilla RNN model.
    From Andrej Karpathy
    http://karpathy.github.io/2015/05/21/rnn-effectiveness/
    https://gist.github.com/karpathy/d4dee566867f8291f086
    '''
    def LossFun(inputs, targets, hprev):
        '''
        inputs, targets are both lists of integers (character indices)
        hprev is an [h,1] array holding the initial hidden state
        returns the loss, gradients on model parameters, and last hidden state
        '''
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)
        loss = 0
        # Forward pass: one step per input character
        for t in range(len(inputs)):
            xs[t] = np.zeros((vocab_size, 1))                                # [v,1]
            xs[t][inputs[t]] = 1                                             # simple one-hot encoding
            hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)  # [h,1] h(t) = tanh(Wx + Wh(t-1) + bh)
            ys[t] = np.dot(Why, hs[t]) + by                                  # [v,1] unnormalized next-char scores
            ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))                    # softmax -> probabilities
            loss += -np.log(ps[t][targets[t], 0])                            # cross-entropy on the target char
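        # Backward pass (backprop through time): walk the steps in reverse.
        # For softmax + cross-entropy, dL/dy = p - one_hot(target), and
        # since h = tanh(raw), dL/draw = (1 - h*h) * dL/dh.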
        dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
        dbh, dby = np.zeros_like(bh), np.zeros_like(by)
        dhnext = np.zeros_like(hs[0])
        for t in reversed(range(len(inputs))):
            dy = np.copy(ps[t])                     # [v,1]
            dy[targets[t]] -= 1                     # derived at http://cs231n.github.io/neural-networks-case-study/#grad
            dWhy += np.dot(dy, hs[t].T)             # [v,h]
            dby += dy                               # [v,1]
            dh = np.dot(Why.T, dy) + dhnext         # dhnext carries the gradient from the next time step
            dhraw = (1 - hs[t] * hs[t]) * dh        # backprop through tanh
            dbh += dhraw
            dWxh += np.dot(dhraw, xs[t].T)
            dWhh += np.dot(dhraw, hs[t-1].T)
            dhnext = np.dot(Whh.T, dhraw)
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam)      # clip to mitigate exploding gradients
        return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
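    # A minimal numerical gradient check (a sketch added here, not part of
    # the original gist code): spot-check the analytic gradients from
    # LossFun against centered differences (f(x+eps) - f(x-eps)) / (2*eps).
    # Call it only after the parameters below are initialized; note that
    # LossFun clips gradients to [-5, 5], so clipped entries may disagree.
    def grad_check(inputs, targets, hprev, eps=1e-5, checks=5):
        _, dWxh, dWhh, dWhy, dbh, dby, _ = LossFun(inputs, targets, hprev)
        for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                       [dWxh, dWhh, dWhy, dbh, dby],
                                       ['Wxh', 'Whh', 'Why', 'bh', 'by']):
            for _ in range(checks):                  # spot-check a few entries
                ix = np.random.randint(param.size)
                old = param.flat[ix]
                param.flat[ix] = old + eps
                loss_hi = LossFun(inputs, targets, hprev)[0]
                param.flat[ix] = old - eps
                loss_lo = LossFun(inputs, targets, hprev)[0]
                param.flat[ix] = old                 # restore the parameter
                num = (loss_hi - loss_lo) / (2 * eps)
                print('{}: analytic {:e}, numerical {:e}'.format(
                    name, dparam.flat[ix], num))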
    def sample(h, seed_ix, n):
        '''
        sample a sequence of n integers from the model
        h is the memory state, seed_ix is the seed letter for the first time step
        '''
        x = np.zeros((vocab_size, 1))
        x[seed_ix] = 1
        ixes = []
        for t in range(n):
            h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
            y = np.dot(Why, h) + by
            p = np.exp(y) / np.sum(np.exp(y))
            ix = np.random.choice(range(vocab_size), p=p.ravel())  # draw the next char from p
            x = np.zeros((vocab_size, 1))                          # feed the drawn char back in
            x[ix] = 1
            ixes.append(ix)
        return ixes
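    # Usage sketch (mirrors the call in the training loop below):
    #   sample_ix = sample(hprev, char_to_ix[data[0]], 200)
    #   print(''.join(ix_to_char[ix] for ix in sample_ix))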
    # data I/O
    data = open('tmp/index.txt', 'r', encoding='utf-8').read()
    chars = list(set(data))
    data_size, vocab_size = len(data), len(chars)
    print('data has {} chars, {} unique'.format(data_size, vocab_size))
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}
    # Hyperparameters
    hidden_size = 100    # size of the hidden state vector
    seq_length = 25      # number of chars unrolled per training step
    learning_rate = 1e-1
    # Model parameters
    Wxh = np.random.randn(hidden_size, vocab_size) * 0.01   # input  -> hidden
    Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
    Why = np.random.randn(vocab_size, hidden_size) * 0.01   # hidden -> output
    bh = np.zeros((hidden_size, 1))
    by = np.zeros((vocab_size, 1))
    n, p = 0, 0                                              # iteration count, data pointer
    # Adagrad memory: per-parameter sums of squared gradients
    mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    mbh, mby = np.zeros_like(bh), np.zeros_like(by)
    # Loss of a uniform predictor: -log(1/v) per char, over seq_length chars
    smooth_loss = -np.log(1.0 / vocab_size) * seq_length
    while True:                                              # train forever; interrupt to stop
        if p + seq_length + 1 >= len(data) or n == 0:
            hprev = np.zeros((hidden_size, 1))               # reset RNN memory
            p = 0                                            # go back to the start of the data
        inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
        targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]  # next char at each position
        if n % 100 == 0:
            sample_ix = sample(hprev, np.random.randint(0, vocab_size), 100)
            txt = ''.join(ix_to_char[ix] for ix in sample_ix)
            print("----\n {} \n----".format(txt))
        loss, dWxh, dWhh, dWhy, dbh, dby, hprev = LossFun(inputs, targets, hprev)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001     # exponential moving average
        if n % 100 == 0:
            print("iter {}, loss {}".format(n, smooth_loss))
        # Adagrad update: mem accumulates the squared gradients, giving each
        # parameter its own shrinking step size lr / sqrt(mem + 1e-8)
        for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                      [dWxh, dWhh, dWhy, dbh, dby],
                                      [mWxh, mWhh, mWhy, mbh, mby]):
            mem += dparam * dparam
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8)
        p += seq_length                                      # move the data pointer
        n += 1
# From http://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139
# Maximum-likelihood character n-gram predictor ("order" = history length)
def char_lm():
    def train_char_lm(fname, order=4):
        from collections import defaultdict, Counter
        data = open(fname, 'r', encoding='utf-8').read()
        lm = defaultdict(Counter)
        data = " " * order + data      # pad the start so the all-space seed
                                       # history used in generate_text exists
        for i in range(len(data) - order):
            history, char = data[i:i+order], data[i+order]
            lm[history][char] += 1     # count chars observed after each history

        def normalize(counter):
            s = float(sum(counter.values()))
            return [(c, cnt / s) for c, cnt in counter.items()]

        return {hist: normalize(chars) for hist, chars in lm.items()}
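    # The trained model maps each history to a normalized distribution over
    # next chars, e.g. (hypothetical strings, for illustration only):
    #   lm["th"] -> [('e', 0.72), ('a', 0.18), ('o', 0.10)]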
    def generate_letter(lm, history, candidates):
        if history not in lm:                    # unseen history: fall back to a uniform draw
            return np.random.choice(candidates)
        pred, prob = zip(*lm[history])
        return np.random.choice(pred, p=prob)
    def generate_text(lm, order, nletters=1000):
        history = " " * order
        candidates = [a[0] for hist in lm.values() for a in hist]  # every char ever observed
        candidates = list(set(candidates))
        out = []
        for i in range(nletters):
            c = generate_letter(lm, history, candidates)
            history = history[1:] + c                              # slide the history window
            out.append(c)
        return "".join(out)

    lm = train_char_lm("tmp/index.txt", order=2)
    print(generate_text(lm, 2))
RNN()
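# char_lm()  # uncomment to run the n-gram baseline instead; note that
#            # RNN() trains in an endless loop and never returns on its own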