2017-12-26 33 views
1

numpy로 LSTM을 구현하고 있으며 몇 번의 교육을 반복 한 후 nan 값을 실행했습니다. 이 가이드를 따르고 있습니다.LSTM ~ 250 훈련 반복 후 Numpy, Loss, Cell State, Gradients, Weights가 발생합니다.

https://wiseodd.github.io/techblog/2016/08/12/lstm-backprop/

모델 정의 :

import numpy as np 


H = 128 # Number of LSTM layer's neurons 
D = ... # Number of input dimension == number of items in vocabulary 
Z = H + D # Because we will concatenate LSTM state with the input 

model = dict(
    Wf=np.random.randn(Z, H)/np.sqrt(Z/2.), 
    Wi=np.random.randn(Z, H)/np.sqrt(Z/2.), 
    Wc=np.random.randn(Z, H)/np.sqrt(Z/2.), 
    Wo=np.random.randn(Z, H)/np.sqrt(Z/2.), 
    Wy=np.random.randn(H, D)/np.sqrt(D/2.), 
    bf=np.zeros((1, H)), 
    bi=np.zeros((1, H)), 
    bc=np.zeros((1, H)), 
    bo=np.zeros((1, H)), 
    by=np.zeros((1, D)) 
) 

내 모델 :

# RNN class 
class RNN: 

    def __init__(self, n, d, RL, LR): 
     """Pass input size (n), number of memory cells (d), recurrence length (RL), and learning rate (LR)""" 
     self.n, self.d, self.z, z = n, d, n + d, n + d 
     self.d = d 
     self.z, z = n + d, n + d 
     self.RL = RL 
     self.LR = LR 

     self.x = [] 

     self.Cells = [Cell(n, d, self)] 

     self.Wi, self.Wf, self.Wo, self.Wc, self.Wy = randn(z, d)/sqrt(z/2), randn(z, d)/sqrt(z/2), randn(z, d)/sqrt(z/2), randn(z, d)/sqrt(z/2), randn(d, n)/sqrt(d/2) 
     self.bi, self.bf, self.bo, self.bc, self.by = randn(d, 1), randn(d, 1), randn(d, 1), randn(d, 1), randn(n, 1) 
     self.dWi, self.dWf, self.dWo, self.dWc, self.dWy = zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((d, n)) 
     self.dbi, self.dbf, self.dbo, self.dbc, self.dby = zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((n, 1)) 

내 세포 :

class Cell: 

def __init__(self, n, d, rnn): 
    """Pass the input size (n) and memory cell size (d), create hidden state of size d, pass rnn (self)""" 
    self.n, self.d, self.h, self.z, z = n, d, zeros((d, 1)), n + d, n + d 
    self.rnn = rnn 

그들의 피드 포워드 :

def lstm_forward(X, state): 
    m = model 
    Wf, Wi, Wc, Wo, Wy = m['Wf'], m['Wi'], m['Wc'], m['Wo'], m['Wy'] 
    bf, bi, bc, bo, by = m['bf'], m['bi'], m['bc'], m['bo'], m['by'] 

    h_old, c_old = state 

    # One-hot encode 
    X_one_hot = np.zeros(D) 
    X_one_hot[X] = 1. 
    X_one_hot = X_one_hot.reshape(1, -1) 

    # Concatenate old state with current input 
    X = np.column_stack((h_old, X_one_hot)) 

    hf = sigmoid(X @ Wf + bf) 
    hi = sigmoid(X @ Wi + bi) 
    ho = sigmoid(X @ Wo + bo) 
    hc = tanh(X @ Wc + bc) 

    c = hf * c_old + hi * hc 
    h = ho * tanh(c) 

    y = h @ Wy + by 
    prob = softmax(y) 

    state = (h, c) # Cache the states of current h & c for next iter 
    cache = ... # Add all intermediate variables to this cache 

    return prob, state, cache 

내 피드 포워드 :

def feedforward(self, x, c_, h_): 
    """Pass an input of size n, the previous hidden state(ht), and the previous cell state(c)""" 
    n, d = self.n, self.d 
    Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy 
    bi, bf, bo, bc, by = self.rnn.bi, self.rnn.bf, self.rnn.bo, self.rnn.bc, self.rnn.by 

    index = x  # one hot encoding 
    x = zeros((n, 1)) 
    x[index] = 1 
    g = concat((x, h_))   # input g is input x + previous hidden state 

    it = sigmoid(dot(Wi.T, g) + bi)  # gate activations 
    ft = sigmoid(dot(Wf.T, g) + bf) 
    ot = sigmoid(dot(Wo.T, g) + bo) 
    ct = tanh(dot(Wc.T, g) + bc)  # non linearity activation 
    c = ft * c_ + it * ct  # cell state 

    ht = ot * tanh(c)  # squashed hidden state 
    yt = dot(Wy.T, ht) + by  # output state 
    p = softmax(yt)  # call softmax, get probability 

    self.c_, self.h_ = c_, h_ 
    self.it, self.ft, self.ot, self.ct = it, ft, ot, ct 
    self.c, self.ht, self.yt, self.p, self.g = c, ht, yt, p, g 

    return ht, c 

그들의 backprop :

def lstm_backward(prob, y_train, d_next, cache): 
    # Unpack the cache variable to get the intermediate variables used in forward step 
    ... = cache 
    dh_next, dc_next = d_next 

    # Softmax loss gradient 
    dy = prob.copy() 
    dy[1, y_train] -= 1. 

    # Hidden to output gradient 
    dWy = h.T @ dy 
    dby = dy 
    # Note we're adding dh_next here 
    dh = dy @ Wy.T + dh_next 

    # Gradient for ho in h = ho * tanh(c) 
    dho = tanh(c) * dh 
    dho = dsigmoid(ho) * dho 

    # Gradient for c in h = ho * tanh(c), note we're adding dc_next here 
    dc = ho * dh * dtanh(c) 
    dc = dc + dc_next 

    # Gradient for hf in c = hf * c_old + hi * hc 
    dhf = c_old * dc 
    dhf = dsigmoid(hf) * dhf 

    # Gradient for hi in c = hf * c_old + hi * hc 
    dhi = hc * dc 
    dhi = dsigmoid(hi) * dhi 

    # Gradient for hc in c = hf * c_old + hi * hc 
    dhc = hi * dc 
    dhc = dtanh(hc) * dhc 

    # Gate gradients, just a normal fully connected layer gradient 
    dWf = X.T @ dhf 
    dbf = dhf 
    dXf = dhf @ Wf.T 

    dWi = X.T @ dhi 
    dbi = dhi 
    dXi = dhi @ Wi.T 

    dWo = X.T @ dho 
    dbo = dho 
    dXo = dho @ Wo.T 

    dWc = X.T @ dhc 
    dbc = dhc 
    dXc = dhc @ Wc.T 

    # As X was used in multiple gates, the gradient must be accumulated here 
    dX = dXo + dXc + dXi + dXf 
    # Split the concatenated X, so that we get our gradient of h_old 
    dh_next = dX[:, :H] 
    # Gradient for c_old in c = hf * c_old + hi * hc 
    dc_next = hf * dc 

    grad = dict(Wf=dWf, Wi=dWi, Wc=dWc, Wo=dWo, Wy=dWy, bf=dbf, bi=dbi, bc=dbc, bo=dbo, by=dby) 
    state = (dh_next, dc_next) 

    return grad, state 

내 backprop :

def backpropagate(self, y, ht1, ct1): 

    n, d = self.n, self.d 
    Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy 
    dWi, dWf, dWo, dWc, dWy = self.rnn.dWi, self.rnn.dWf, self.rnn.dWo, self.rnn.dWc, self.rnn.dWy 
    dbi, dbf, dbo, dbc, dby = self.rnn.dbi, self.rnn.dbf, self.rnn.dbo, self.rnn.dbc, self.rnn.dby 
    c_, h_ = self.c_, self.h_ 
    it, ft, ot, ct = self.it, self.ft, self.ot, self.ct 
    c, ht, yt, p = self.c, self.ht, self.yt, self.p 
    g = self.g 

    dy = copy(p) 
    dy[y] -= 1 

    loss = cross_ent(p, y) 

    dh = dot(Wy, dy) + ht1 

    do = tanh(c) * dh 
    do = dsigmoid(ot) * do 

    dc = ot * dh * dtanh(c) 
    dc = dc + ct1 

    df = c_ * dc 
    df = dsigmoid(ft) * df 

    di = ct * dc 
    di = dsigmoid(it) * di 

    dct = it * dc 
    dct = dtanh(ct) * dct 

    dWf += dot(g, df.T) 
    dWi += dot(g, di.T) 
    dWo += dot(g, do.T) 
    dWc += dot(g, dc.T) 
    dWy += dot(ht, dy.T) 

    dbf += df 
    dbi += di 
    dbo += do 
    dbc += dc 
    dby += dy 

    dxi = dot(Wi, di) 
    dxf = dot(Wf, df) 
    dxo = dot(Wo, do) 
    dxc = dot(Wc, dct) 

    dx = dxf + dxi + dxo + dxc 

    dht1 = dx[n:] 
    dct1 = ft * dc 

    return loss, dht1, dct1 

그들의 교육 단계 :

내 교육 단계 : 전체에서

def FeedForward(self, inputs, ht_, ct_): 

    n, d, rl, Cells = self.n, self.d, self.RL, self.Cells 

    while len(Cells) < rl: 
     Cells.append(Cell(n, d, self)) 

    for cell, x in zip(Cells, range(len(inputs))): 
     ht_, ct_ = cell.feedforward(x, ht_, ct_) 

    return ht_, ct_ 



def BPTT(self, outputs, ht1, ct1): 

    n, d, z, rl = self.n, self.d, self.n + self.d, self.RL 
    Cells = self.Cells 

    avg_loss = 0 

    for i in reversed(range(rl)): 
     loss, ht1, ct1 = Cells[i].backpropagate(outputs[i], ht1, ct1) 
     avg_loss += loss 

    avg_loss /= rl 

    return avg_loss, ht1, ct1 


def train(self, inputs, outputs): 

    n, d, z, rl = self.n, self.d, self.n + self.d, self.RL 
    index = 0 
    LR = 0.1 
    loss = 0 

    ht_, ct_ = zeros((d, 1)), zeros((d, 1)) 
    ht1, ct1 = zeros((d, 1)), zeros((d, 1)) 

    while index < len(outputs): 
     xlist = inputs[index:index + rl] 
     ylist = outputs[index:index + rl] 
     ht_, ct_ = self.FeedForward(xlist, ht_, ct_) 
     loss, ht1, ct1 = self.BPTT(ylist, ht1, ct1) 
     #print(loss) 
     self.update(LR) 
     index += rl 

def update(self, LR): 

    n, d, z = self.n, self.d, self.n + self.d 

    self.Wi -= LR * self.dWi 
    self.Wf -= LR * self.dWf 
    self.Wo -= LR * self.dWo 
    self.Wc -= LR * self.dWc 
    self.Wy -= LR * self.dWy 
    self.bi -= LR * self.dbi 
    self.bf -= LR * self.dbf 
    self.bo -= LR * self.dbo 
    self.bc -= LR * self.dbc 
    self.by -= LR * self.dby 

    self.dWi, self.dWf, self.dWo, self.dWc, self.dWy = zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((d, n)) 
    self.dbi, self.dbf, self.dbo, self.dbc, self.dby = zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((n, 1)) 

내 코드 :

# Import logistic function that doesn't explode outside a 64 bit float 
from scipy.special import expit as sigmoid 
from numpy import zeros, zeros_like, tanh, exp, sum, dot, sqrt, log, argmax, concatenate as concat, copy 
from numpy.random import randn 


# derivative of sigmoid function 
def dsigmoid(z): 
    return sigmoid(z) * (1 - sigmoid(z)) 

# derivative of hyperbolic tangent 
def dtanh(z): 
    return 1 - tanh(z) ** 2 

# probability function 
def softmax(z): 
    return exp(z)/sum(exp(z)) 

# cross entropy loss 
def cross_ent(p, y): 
    return -log(p[y]) 


# RNN class 
class RNN: 
def __init__(self, n, d, RL, LR): 
    """Pass input size (n), number of memory cells (d), recurrence length (RL), and learning rate (LR)""" 
    self.n, self.d, self.z, z = n, d, n + d, n + d 
    self.d = d 
    self.z, z = n + d, n + d 
    self.RL = RL 
    self.LR = LR 

    self.x = [] 

    self.Cells = [Cell(n, d, self)] 

    self.Wi, self.Wf, self.Wo, self.Wc, self.Wy = randn(z, d)/sqrt(z/2), randn(z, d)/sqrt(z/2), randn(z, d)/sqrt(z/2), randn(z, d)/sqrt(z/2), randn(d, n)/sqrt(d/2) 
    self.bi, self.bf, self.bo, self.bc, self.by = randn(d, 1), randn(d, 1), randn(d, 1), randn(d, 1), randn(n, 1) 
    self.dWi, self.dWf, self.dWo, self.dWc, self.dWy = zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((d, n)) 
    self.dbi, self.dbf, self.dbo, self.dbc, self.dby = zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((n, 1)) 

def FeedForward(self, inputs, ht_, ct_): 

    n, d, rl, Cells = self.n, self.d, self.RL, self.Cells 

    while len(Cells) < rl: 
     Cells.append(Cell(n, d, self)) 

    for cell, x in zip(Cells, range(len(inputs))): 
     ht_, ct_ = cell.feedforward(x, ht_, ct_) 

    return ht_, ct_ 



def BPTT(self, outputs, ht1, ct1): 

    n, d, z, rl = self.n, self.d, self.n + self.d, self.RL 
    Cells = self.Cells 

    avg_loss = 0 

    for i in reversed(range(rl)): 
     loss, ht1, ct1 = Cells[i].backpropagate(outputs[i], ht1, ct1) 
     avg_loss += loss 

    avg_loss /= rl 

    return avg_loss, ht1, ct1 


def train(self, inputs, outputs): 

    n, d, z, rl = self.n, self.d, self.n + self.d, self.RL 
    index = 0 
    LR = 0.1 
    loss = 0 

    ht_, ct_ = zeros((d, 1)), zeros((d, 1)) 
    ht1, ct1 = zeros((d, 1)), zeros((d, 1)) 

    while index < len(outputs): 
     xlist = inputs[index:index + rl] 
     ylist = outputs[index:index + rl] 
     ht_, ct_ = self.FeedForward(xlist, ht_, ct_) 
     loss, ht1, ct1 = self.BPTT(ylist, ht1, ct1) 
     #print(loss) 
     self.update(LR) 
     index += rl 

def update(self, LR): 

    n, d, z = self.n, self.d, self.n + self.d 

    self.Wi -= LR * self.dWi 
    self.Wf -= LR * self.dWf 
    self.Wo -= LR * self.dWo 
    self.Wc -= LR * self.dWc 
    self.Wy -= LR * self.dWy 
    self.bi -= LR * self.dbi 
    self.bf -= LR * self.dbf 
    self.bo -= LR * self.dbo 
    self.bc -= LR * self.dbc 
    self.by -= LR * self.dby 

    self.dWi, self.dWf, self.dWo, self.dWc, self.dWy = zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((d, n)) 
    self.dbi, self.dbf, self.dbo, self.dbc, self.dby = zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((n, 1)) 

클래스 셀 : 오류가

dx = dxf + dxi + dxo + dxc 
012입니다 던지는

def __init__(self, n, d, rnn): 
    """Pass the input size (n) and memory cell size (d), create hidden state of size d, pass rnn (self)""" 
    self.n, self.d, self.h, self.z, z = n, d, zeros((d, 1)), n + d, n + d 
    self.rnn = rnn 


def feedforward(self, x, c_, h_): 
    """Pass an input of size n, the previous hidden state(ht), and the previous cell state(c)""" 
    n, d = self.n, self.d 
    Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy 
    bi, bf, bo, bc, by = self.rnn.bi, self.rnn.bf, self.rnn.bo, self.rnn.bc, self.rnn.by 

    index = x  # one hot encoding 
    x = zeros((n, 1)) 
    x[index] = 1 
    g = concat((x, h_))   # input g is input x + previous hidden state 

    it = sigmoid(dot(Wi.T, g) + bi)  # gate activations 
    ft = sigmoid(dot(Wf.T, g) + bf) 
    ot = sigmoid(dot(Wo.T, g) + bo) 
    ct = tanh(dot(Wc.T, g) + bc)  # non linearity activation 
    c = ft * c_ + it * ct  # cell state 

    ht = ot * tanh(c)  # squashed hidden state 
    yt = dot(Wy.T, ht) + by  # output state 
    p = softmax(yt)  # call softmax, get probability 

    self.c_, self.h_ = c_, h_ 
    self.it, self.ft, self.ot, self.ct = it, ft, ot, ct 
    self.c, self.ht, self.yt, self.p, self.g = c, ht, yt, p, g 

    return ht, c 


def backpropagate(self, y, ht1, ct1): 

    n, d = self.n, self.d 
    Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy 
    dWi, dWf, dWo, dWc, dWy = self.rnn.dWi, self.rnn.dWf, self.rnn.dWo, self.rnn.dWc, self.rnn.dWy 
    dbi, dbf, dbo, dbc, dby = self.rnn.dbi, self.rnn.dbf, self.rnn.dbo, self.rnn.dbc, self.rnn.dby 
    c_, h_ = self.c_, self.h_ 
    it, ft, ot, ct = self.it, self.ft, self.ot, self.ct 
    c, ht, yt, p = self.c, self.ht, self.yt, self.p 
    g = self.g 

    dy = copy(p) 
    dy[y] -= 1 

    loss = cross_ent(p, y) 

    dh = dot(Wy, dy) + ht1 

    do = tanh(c) * dh 
    do = dsigmoid(ot) * do 

    dc = ot * dh * dtanh(c) 
    dc = dc + ct1 

    df = c_ * dc 
    df = dsigmoid(ft) * df 

    di = ct * dc 
    di = dsigmoid(it) * di 

    dct = it * dc 
    dct = dtanh(ct) * dct 

    dWf += dot(g, df.T) 
    dWi += dot(g, di.T) 
    dWo += dot(g, do.T) 
    dWc += dot(g, dc.T) 
    dWy += dot(ht, dy.T) 

    dbf += df 
    dbi += di 
    dbo += do 
    dbc += dc 
    dby += dy 

    dxi = dot(Wi, di) 
    dxf = dot(Wf, df) 
    dxo = dot(Wo, do) 
    dxc = dot(Wc, dct) 

    dx = dxf + dxi + dxo + dxc 

    dht1 = dx[n:] 
    dct1 = ft * dc 

    return loss, dht1, dct1 

file = open("trumptweets.txt", 'r', encoding='utf8').read() 

text = list(file) 

alphabet = list(set(text)) 

n = (len(alphabet)) 
d = 100 

encode = {ch:i for i,ch in enumerate(alphabet)} 
decode = {i:ch for i,ch in enumerate(alphabet)} 

inputs = [encode[ch] for ch in text] 
outputs = [inputs[i + 1] for i in range(len(inputs)-1)] 


RNN = LSTM.RNN(n, d, 100, 0.1) 

RNN.train(inputs, outputs) 

의 선 3,516,

잘못된 값 오류 곱셈에

do = tanh(c) * dh 
dc = ot * dh * dtanh(c) 
dct = it * dc 

잘못된 값입니다.

4 가지 훈련시기 또는 약 200 샘플 후에 C가 폭발합니다.

마찬가지로 손실이 발생합니다.

나는 내 업데이트 기능이 나는 LSTM에 대한 일관성있는 업데이트 규칙을 찾을 수 없습니다로 함께 할 수있는 뭔가가있을 수 있습니다 생각한다. 숨겨진 상태 및 셀 상태의 기울기는 패스

답변

0

I 그래디언트 통제 불능 폭발 하였다 의심이 에러 처리.

는 일본어 :

def backpropagate(self, y, ht1, ct1): 

n, d = self.n, self.d 
Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy 
dWi, dWf, dWo, dWc, dWy = self.rnn.dWi, self.rnn.dWf, self.rnn.dWo, self.rnn.dWc, self.rnn.dWy 
dbi, dbf, dbo, dbc, dby = self.rnn.dbi, self.rnn.dbf, self.rnn.dbo, self.rnn.dbc, self.rnn.dby 
c_, h_ = self.c_, self.h_ 
it, ft, ot, ct = self.it, self.ft, self.ot, self.ct 
c, ht, yt, p = self.c, self.ht, self.yt, self.p 
g = self.g 

dy = copy(p) 
dy[y] -= 1 

loss = cross_ent(p, y) 

dh = dot(Wy, dy) + ht1 

do = tanh(c) * dh 
do = dsigmoid(ot) * do 

dc = ot * dh * dtanh(c) 
dc = dc + ct1 

고정 :

def backpropagate(self, y, ht1, ct1): 
    n, d = self.n, self.d 
    Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy 
    dWi, dWf, dWo, dWc, dWy = self.rnn.dWi, self.rnn.dWf, self.rnn.dWo, self.rnn.dWc, self.rnn.dWy 
    dbi, dbf, dbo, dbc, dby = self.rnn.dbi, self.rnn.dbf, self.rnn.dbo, self.rnn.dbc, self.rnn.dby 
    c_, h_ = self.c_, self.h_ 
    it, ft, ot, ct = self.it, self.ft, self.ot, self.ct 
    c, ht, yt, p = self.c, self.ht, self.yt, self.p 
    g = self.g 

    dy = copy(p) 
    dy[y] -= 1 

    loss = cross_ent(p, y) 

    dh = dot(Wy, dy) + ht1 
    dh = clip(dh, -6, 6) 

    do = tanh(c) * dh 
    do = dsigmoid(ot) * do 

    dc = ot * dh * dtanh(c) 
    dc = dc + ct1 
    dc = clip(dc, -6, 6)