TensorFlow Exercise 25: Reading Comprehension and Cloze with Deep Learning
I remember that back in my school days, English exams had a type of question called reading comprehension: first you read a passage of English, then you answer questions based on that passage.
Let me give you a reading comprehension question first:
Big Panda learned to code when he was 21. He lives in China, has no life, and feels like a big loser. But here is one thing Panda wants you to remember… it's never too late! You can do anything if you put your heart into it!
____ is the loser. (What goes in the blank?)
This fill-in-the-blank question is trivial for a person, but very hard for a machine to answer. Getting machines to read and understand human language is extremely challenging.
In this post we will use TensorFlow to train a reading comprehension model and see how high the accuracy can get.
The dataset
This post uses only the Children's Book Test (CBT) dataset released by Facebook.
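For reference, each CBT example is a block of 21 numbered lines: 20 context sentences followed by a 21st line that holds the question (with the blank), the answer, an empty field, and the candidate words, separated by tabs; the preprocessing code below relies on this layout. A minimal sketch to peek at one raw example, assuming the dataset has been unpacked under ./CBTest/data/ as in the paths below:

# Peek at the raw CBT format (for illustration only, not part of the pipeline).
with open('./CBTest/data/cbtest_NE_valid_2000ex.txt') as f:
    for _ in range(21):
        print(f.readline().rstrip())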
Data preprocessing
import re
import random
import ast
import itertools
import pickle
import numpy as np
train_data_file = './CBTest/data/cbtest_NE_train.txt'
valid_data_file = './CBTest/data/cbtest_NE_valid_2000ex.txt'
def preprocess_data(data_file, out_file):
    # stories[x][0]: document tokens, stories[x][1]: question tokens, stories[x][2]: answer word
    stories = []
    with open(data_file) as f:
        story = []
        for line in f:
            line = line.strip()
            if not line:
                story = []   # a blank line separates examples; start a new story
            else:
                _, line = line.split(' ', 1)   # drop the leading line number
                if line:
                    if '\t' in line:
                        # the 21st line: question \t answer \t (empty) \t candidates
                        q, a, _, answers = line.split('\t')
                        # tokenize
                        q = [s.strip() for s in re.split(r'(\W+)+', q) if s.strip()]
                        stories.append((story, q, a))
                    else:
                        line = [s.strip() for s in re.split(r'(\W+)+', line) if s.strip()]
                        story.append(line)

    samples = []
    for story in stories:
        story_tmp = []
        content = []
        for c in story[0]:
            content += c   # flatten the 20 context sentences into one token list
        story_tmp.append(content)
        story_tmp.append(story[1])
        story_tmp.append(story[2])
        samples.append(story_tmp)

    random.shuffle(samples)
    print(len(samples))

    with open(out_file, "w") as f:
        for sample in samples:
            f.write(str(sample))
            f.write('\n')

preprocess_data(train_data_file, 'train.data')
preprocess_data(valid_data_file, 'valid.data')
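After this step, each line of train.data / valid.data holds one Python-literal list of [document tokens, question tokens, answer]. A quick optional sanity check (a sketch; it just re-reads the first sample):

# Peek at the first preprocessed sample.
with open('train.data') as f:
    story, question, answer = ast.literal_eval(f.readline().strip())
print(len(story), question, answer)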
# Build the vocabulary
def read_data(data_file):
    stories = []
    with open(data_file) as f:
        for line in f:
            line = ast.literal_eval(line.strip())
            stories.append(line)
    return stories

stories = read_data('train.data') + read_data('valid.data')

content_length = max([len(s) for s, _, _ in stories])
question_length = max([len(q) for _, q, _ in stories])
print(content_length, question_length)

vocab = sorted(set(itertools.chain(*(story + q + [answer] for story, q, answer in stories))))
vocab_size = len(vocab) + 1   # index 0 is reserved for padding
print(vocab_size)

word2idx = dict((w, i + 1) for i, w in enumerate(vocab))   # real words start at index 1

pickle.dump((word2idx, content_length, question_length, vocab_size), open('vocab.data', "wb"))
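Because index 0 is reserved for padding, the model can later recover the true sequence lengths from the padded matrices with tf.sign(tf.abs(X)). A tiny check (assuming the word 'the' occurs in the corpus, which it does for CBT):

# Word indices start at 1; 0 never maps to a real word.
print(min(word2idx.values()))   # -> 1
print(word2idx.get('the'))      # -> some index >= 1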
# Sequence padding helper, taken from Keras
def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='post', truncating='post', value=0.):
    lengths = [len(s) for s in sequences]

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non-empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x
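A tiny usage example of the pad_sequences() defined above (post-padding with zeros):

print(pad_sequences([[1, 2, 3], [4, 5]], maxlen=4))
# [[1 2 3 0]
#  [4 5 0 0]]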
# Convert the samples to index vectors
def to_vector(data_file, output_file):
    word2idx, content_length, question_length, _ = pickle.load(open('vocab.data', "rb"))

    X = []
    Q = []
    A = []
    with open(data_file) as f_i:
        for line in f_i:
            line = ast.literal_eval(line.strip())
            x = [word2idx[w] for w in line[0]]
            q = [word2idx[w] for w in line[1]]
            a = [word2idx[line[2]]]
            X.append(x)
            Q.append(q)
            A.append(a)

    X = pad_sequences(X, content_length)
    Q = pad_sequences(Q, question_length)

    with open(output_file, "w") as f_o:
        for i in range(len(X)):
            f_o.write(str([X[i].tolist(), Q[i].tolist(), A[i]]))
            f_o.write('\n')

to_vector('train.data', 'train.vec')
to_vector('valid.data', 'valid.vec')
"""
# to_word
word2idx, content_length, question_length, _ = pickle.load(open('vocab.data', "rb"))
def get_value(dic,value):
for name in dic:
if dic[name] == value:
return name
with open('train.vec') as f:
for line in f:
line = ast.literal_eval(line.strip())
for word in line[0]:
print(get_value(word2idx, word))
"""
The preprocessing produces three files: vocab.data (the vocabulary), plus train.vec and valid.vec (the vectorized data).
Training
The model follows the paper Iterative Alternating Neural Attention for Machine Reading: https://arxiv.org/pdf/1606.02245v4.pdf
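Roughly speaking, the model below encodes the document X and the question Q with bidirectional GRUs, then an inference GRU alternates a question glimpse and a document glimpse for a fixed number of steps (8 in this code). Each glimpse, implemented by the glimpse() function, computes for encodings $h_1,\dots,h_T$ and the current inference state $s$ approximately

$$\alpha_t = \mathrm{softmax}_t\big(h_t^{\top}(W s + b)\big), \qquad g = \sum_t \alpha_t h_t,$$

i.e. a softmax attention over time steps followed by an attention-weighted sum of the encodings; the answer is read off the final document attention (see the accuracy computation at the end).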
import tensorflow as tf
import pickle
import numpy as np
import ast
from collections import defaultdict
train_data = 'train.vec'
valid_data = 'valid.vec'
word2idx, content_length, question_length, vocab_size = pickle.load(open('vocab.data', "rb"))
print(content_length, question_length, vocab_size)
batch_size = 64
train_file = open(train_data)
def get_next_batch():
    X = []
    Q = []
    A = []
    for i in range(batch_size):
        for line in train_file:
            line = ast.literal_eval(line.strip())
            X.append(line[0])
            Q.append(line[1])
            A.append(line[2][0])
            break   # read exactly one line per outer iteration

    if len(X) == batch_size:
        return X, Q, A
    else:
        # reached end of file: rewind and refill the batch from the start
        train_file.seek(0)
        return get_next_batch()
def get_test_batch():
    with open(valid_data) as f:
        X = []
        Q = []
        A = []
        for line in f:
            line = ast.literal_eval(line.strip())
            X.append(line[0])
            Q.append(line[1])
            A.append(line[2][0])
        return X, Q, A
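A quick way to confirm that the feed shapes match the placeholders defined next (a sketch; run it after the two functions above):

bx, bq, ba = get_next_batch()
print(len(bx), len(bx[0]), len(bq[0]))   # -> batch_size, content_length, question_length
train_file.seek(0)                       # rewind so training starts from the beginning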
X = tf.placeholder(tf.int32, [batch_size, content_length])    # document (the passage)
Q = tf.placeholder(tf.int32, [batch_size, question_length])   # question
A = tf.placeholder(tf.int32, [batch_size])                     # answer

# dropout
keep_prob = tf.placeholder(tf.float32)
def glimpse(weights, bias, encodings, inputs):
    # attention over the time dimension of `encodings`, conditioned on `inputs`
    weights = tf.nn.dropout(weights, keep_prob)
    inputs = tf.nn.dropout(inputs, keep_prob)
    attention = tf.transpose(tf.matmul(weights, tf.transpose(inputs)) + bias)
    attention = tf.batch_matmul(encodings, tf.expand_dims(attention, -1))
    attention = tf.nn.softmax(tf.squeeze(attention, -1))
    # return the attention weights and the attention-weighted sum (the "glimpse")
    return attention, tf.reduce_sum(tf.expand_dims(attention, -1) * encodings, 1)
def neural_attention(embedding_dim=384, encoding_dim=128):
    # Note: this code targets the old TensorFlow 0.12-era API
    # (tf.batch_matmul, tf.concat(axis, values), tf.concat_v2, tf.nn.rnn_cell).
    embeddings = tf.Variable(tf.random_normal([vocab_size, embedding_dim], stddev=0.22), dtype=tf.float32)
    tf.contrib.layers.apply_regularization(tf.contrib.layers.l2_regularizer(1e-4), [embeddings])

    with tf.variable_scope('encode'):
        # bidirectional GRU encoders for the document (X) and the question (Q)
        with tf.variable_scope('X'):
            X_lens = tf.reduce_sum(tf.sign(tf.abs(X)), 1)
            embedded_X = tf.nn.embedding_lookup(embeddings, X)
            encoded_X = tf.nn.dropout(embedded_X, keep_prob)
            gru_cell = tf.nn.rnn_cell.GRUCell(encoding_dim)
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(gru_cell, gru_cell, encoded_X, sequence_length=X_lens, dtype=tf.float32, swap_memory=True)
            encoded_X = tf.concat(2, outputs)
        with tf.variable_scope('Q'):
            Q_lens = tf.reduce_sum(tf.sign(tf.abs(Q)), 1)
            embedded_Q = tf.nn.embedding_lookup(embeddings, Q)
            encoded_Q = tf.nn.dropout(embedded_Q, keep_prob)
            gru_cell = tf.nn.rnn_cell.GRUCell(encoding_dim)
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(gru_cell, gru_cell, encoded_Q, sequence_length=Q_lens, dtype=tf.float32, swap_memory=True)
            encoded_Q = tf.concat(2, outputs)

    W_q = tf.Variable(tf.random_normal([2*encoding_dim, 4*encoding_dim], stddev=0.22), dtype=tf.float32)
    b_q = tf.Variable(tf.random_normal([2*encoding_dim, 1], stddev=0.22), dtype=tf.float32)
    W_d = tf.Variable(tf.random_normal([2*encoding_dim, 6*encoding_dim], stddev=0.22), dtype=tf.float32)
    b_d = tf.Variable(tf.random_normal([2*encoding_dim, 1], stddev=0.22), dtype=tf.float32)
    g_q = tf.Variable(tf.random_normal([10*encoding_dim, 2*encoding_dim], stddev=0.22), dtype=tf.float32)
    g_d = tf.Variable(tf.random_normal([10*encoding_dim, 2*encoding_dim], stddev=0.22), dtype=tf.float32)

    with tf.variable_scope('attend') as scope:
        # iterative alternating attention: 8 inference steps
        infer_gru = tf.nn.rnn_cell.GRUCell(4*encoding_dim)
        infer_state = infer_gru.zero_state(batch_size, tf.float32)
        for iter_step in range(8):
            if iter_step > 0:
                scope.reuse_variables()

            # attend over the question, then over the document
            _, q_glimpse = glimpse(W_q, b_q, encoded_Q, infer_state)
            d_attention, d_glimpse = glimpse(W_d, b_d, encoded_X, tf.concat_v2([infer_state, q_glimpse], 1))

            # gate the two glimpses before updating the inference state
            gate_concat = tf.concat_v2([infer_state, q_glimpse, d_glimpse, q_glimpse * d_glimpse], 1)

            r_d = tf.sigmoid(tf.matmul(gate_concat, g_d))
            r_d = tf.nn.dropout(r_d, keep_prob)
            r_q = tf.sigmoid(tf.matmul(gate_concat, g_q))
            r_q = tf.nn.dropout(r_q, keep_prob)

            combined_gated_glimpse = tf.concat_v2([r_q * q_glimpse, r_d * d_glimpse], 1)
            _, infer_state = infer_gru(combined_gated_glimpse, infer_state)

    # zero out attention on padding positions and return the final document attention
    return tf.to_float(tf.sign(tf.abs(X))) * d_attention
def train_neural_attention():
    X_attentions = neural_attention()
    # maximize the attention mass assigned to document positions that equal the answer
    loss = -tf.reduce_mean(tf.log(tf.reduce_sum(tf.to_float(tf.equal(tf.expand_dims(A, -1), X)) * X_attentions, 1) + tf.constant(0.00001)))

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    grads_and_vars = optimizer.compute_gradients(loss)
    capped_grads_and_vars = [(tf.clip_by_norm(g, 5), v) for g, v in grads_and_vars]
    train_op = optimizer.apply_gradients(capped_grads_and_vars)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # writer = tf.summary.FileWriter()

        # restore the previous run, if a checkpoint exists
        ckpt = tf.train.get_checkpoint_state('.')
        if ckpt is not None:
            print(ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("no saved model found")

        for step in range(20000):
            train_x, train_q, train_a = get_next_batch()
            loss_, _ = sess.run([loss, train_op], feed_dict={X: train_x, Q: train_q, A: train_a, keep_prob: 0.7})
            print(loss_)

            # save the model and evaluate accuracy every 1000 steps
            if step % 1000 == 0:
                path = saver.save(sess, 'machine_reading.model', global_step=step)
                print(path)

                test_x, test_q, test_a = get_test_batch()
                test_x, test_q, test_a = np.array(test_x[:batch_size]), np.array(test_q[:batch_size]), np.array(test_a[:batch_size])
                attentions = sess.run(X_attentions, feed_dict={X: test_x, Q: test_q, keep_prob: 1.})
                correct_count = 0
                for x in range(test_x.shape[0]):
                    # pointer-sum: accumulate attention per distinct word, then take the argmax
                    probs = defaultdict(int)
                    for idx, word in enumerate(test_x[x, :]):
                        probs[word] += attentions[x, idx]
                    guess = max(probs, key=probs.get)
                    if guess == test_a[x]:
                        correct_count += 1
                print(correct_count / test_x.shape[0])

train_neural_attention()
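For reference, the loss above is the negative log of the total attention mass that the final document attention $\alpha$ places on positions whose token equals the answer, averaged over the batch; roughly:

$$\mathcal{L} = -\frac{1}{N}\sum_{n=1}^{N} \log\Big(\sum_{t:\, x^{(n)}_t = a^{(n)}} \alpha^{(n)}_t + \epsilon\Big)$$

where $\epsilon = 10^{-5}$ only guards against log(0). At test time the same attention is summed per distinct word and the highest-scoring word is taken as the model's answer.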
All I can say is: this thing is better at it than I am. At the very least, it beats me by a mile on speed.
- Attention-over-Attention Neural Networks for Reading Comprehension
- Iterative Alternating Neural Attention for Machine Reading
- TensorFlow Exercise 13: Building a Simple Chatbot
If you repost this article, please keep it intact and credit the author @斗大的熊猫 along with the original URL: http://blog.topspeedsnail.com/archives/11062