짧은 시퀀스를 예측하기 위해 아주 사소한 LSTM을 만들었지 만, 예상치 못한 방식으로 과부하가되지 않고 0의 손실에 접근합니다.왜 사소한 LSTM이 과장되지 않습니까?
대신이 시퀀스를 그대로 배울 수있는 충분한 자유도가 있더라도 약 1.5의 손실로 수렴합니다.
import tensorflow as tf
import time
tf.logging.set_verbosity(tf.logging.DEBUG)
#
# Training data, just a single sequence
#
train_input = [[0, 1, 2, 3, 4, 5, 0, 6, 7, 0]]
train_output = [[1, 2, 3, 4, 5, 0, 6, 7, 8, 0]]
#
# Training metadata
#
batch_size = 1
sequence_length = 10
n_classes = 9
# Network size
rnn_cell_size = 10
rnn_layers = 2
embedding_rank = 3
#
# Training hyperparameters
#
epochs = 100
n_batches = 100
learning_rate = 0.01
#
# Model
#
features = tf.placeholder(tf.int32, [None, sequence_length], name="features")
embeddings = tf.Variable(tf.random_uniform([n_classes, embedding_rank], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, features)
cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_cell_size) for i in range(rnn_layers)])
initial_state = cell.zero_state(batch_size, tf.float32)
cell, _ = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
# Convert sequences x batches x outputs to (sequences * batches) x outputs
flat_lstm_output = tf.reshape(cell, [-1, rnn_cell_size])
output = tf.contrib.layers.fully_connected(inputs=flat_lstm_output, num_outputs=n_classes)
softmax = tf.nn.softmax(output)
#
# Training
#
targets = tf.placeholder(tf.int32, [None, sequence_length])
# Convert sequences x batches x targets to (sequences * batches) x targets
flat_targets = tf.reshape(targets, [-1])
loss = tf.losses.sparse_softmax_cross_entropy(flat_targets, softmax)
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(epochs):
loss_sum = 0
epoch_start = time.time()
for j in range(n_batches):
_, step_loss = sess.run([train_op, loss], {
features: train_input,
targets: train_output,
})
loss_sum = loss_sum + step_loss
print('avg_loss', loss_sum/n_batches, 'avg_time', (time.time() - epoch_start)/n_batches)
여기에 기본적인 느낌이 들지 않습니다. 내가 뭘 잘못하고있는 걸까요?
편집
나는 더를 단순화하기 위해 노력하고, 지금은 (도 수렴하지 않는) 다음과 같은 더 사소한 예를 아래로 해요 :
import tensorflow as tf
import time
tf.logging.set_verbosity(tf.logging.DEBUG)
#
# Training data, just a single sequence
#
train_input = [0, 1, 2, 3, 4]
train_output = [1, 2, 3, 4, 5]
#
# Training metadata
#
batch_size = 1
sequence_length = 5
n_classes = 6
#
# Training hyperparameters
#
epochs = 100
n_batches = 100
learning_rate = 0.01
#
# Model
#
features = tf.placeholder(tf.int32, [None])
one_hot = tf.contrib.layers.one_hot_encoding(features, n_classes)
output = tf.contrib.layers.fully_connected(inputs=one_hot, num_outputs=10)
output = tf.contrib.layers.fully_connected(inputs=output, num_outputs=n_classes)
#
# Training
#
targets = tf.placeholder(tf.int32, [None])
one_hot_targets = tf.one_hot(targets, depth=n_classes)
loss = tf.losses.softmax_cross_entropy(one_hot_targets, output)
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(epochs):
loss_sum = 0
epoch_start = time.time()
for j in range(n_batches):
_, step_loss = sess.run([train_op, loss], {
features: train_input,
targets: train_output,
})
loss_sum = loss_sum + step_loss
print('avg_loss', loss_sum/n_batches, 'avg_time', (time.time() - epoch_start)/n_batches)