[2020 Huawei Cloud AI Bootcamp] Training a "Poetry-Writing" Bot with a ModelArts Notebook
There are many open-source poetry generators online; the code used here is taken from https://github.com/yuyongsheng/tensorflow_poems
The model was trained on ModelArts for 100 epochs, which took about 4 hours.
First, let's look at the output (four poems, on spring, summer, autumn and winter, are attached).
The complete code and the notebook file Peom_Gen.ipynb are in the attachment and can be loaded directly into a ModelArts notebook for training.
1. In this example the poem file has already been cleaned into a uniform format, so no further preprocessing of the poems is needed (see the sample line below).
2. The code uses Python and TensorFlow for training.
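For reference, each line of data/poems.txt holds a single poem in the form "title:content", which is the format the preprocessing code below expects. A line like the following (shown purely to illustrate the format, not quoted from the file) would pass the length and symbol filters:
静夜思:床前明月光，疑是地上霜。举头望明月，低头思故乡。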
First, define the training parameters.
# train
# Define the training parameters.
# This cell must only be run once per kernel session; running it again raises
# a "duplicate flag" error because the flags are already defined.
import os
import tensorflow as tf
# from poems.model import rnn_model
# from poems.poems import process_poems, generate_batch

tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size.')
tf.app.flags.DEFINE_float('learning_rate', 0.01, 'learning rate.')
tf.app.flags.DEFINE_string('model_dir', os.path.abspath('./model'), 'model save path.')
tf.app.flags.DEFINE_string('file_path', os.path.abspath('./data/poems.txt'), 'file name of poems.')
tf.app.flags.DEFINE_string('model_prefix', 'poems', 'model save prefix.')
tf.app.flags.DEFINE_integer('epochs', 100, 'train how many epochs.')

FLAGS = tf.app.flags.FLAGS
FLAGS.epochs = 101


def main(_):
    print(FLAGS.file_path, FLAGS.epochs)


if __name__ == '__main__':
    tf.app.run()
Preprocess the poems and generate the training data.
# poems.poems.process_poems
# Preprocess the poems and generate the training data.
import collections
import numpy as np

start_token = 'B'
end_token = 'E'


def process_poems(file_name):
    # poems -> list of numbers
    poems = []
    with open(file_name, "r", encoding='utf-8', ) as f:
        for line in f.readlines():
            try:
                # Each line of the file is one poem in the format "title:content".
                # Only the body of the poem is kept and processed; the title is discarded.
                # Poems containing special symbols, or whose body is shorter than 5
                # or longer than 79 characters, are skipped.
                # Valid poems are stored as "B" + content + "E".
                title, content = line.strip().split(':')
                content = content.replace(' ', '')
                if '_' in content or '(' in content or '（' in content or '《' in content or '[' in content or \
                        start_token in content or end_token in content:
                    continue
                if len(content) < 5 or len(content) > 79:
                    continue
                content = start_token + content + end_token
                poems.append(content)
            except ValueError as e:
                pass
    # poems = sorted(poems, key=len)
    # Collect every character of every poem:
    #   [word for poem in poems for word in poem]
    all_words = [word for poem in poems for word in poem]
    # collections.Counter(['a', 'b', 'c', 'a', 'b', 'b'])
    # Counter({'b': 3, 'a': 2, 'c': 1})
    counter = collections.Counter(all_words)
    words = sorted(counter.keys(), key=lambda x: counter[x], reverse=True)
    words.append(' ')
    L = len(words)
    # Build a dictionary mapping each character to an integer id;
    # more frequent characters get smaller ids.
    word_int_map = dict(zip(words, range(L)))
    # Re-encode every poem as a list of integer ids.
    poems_vector = [list(map(lambda word: word_int_map.get(word, L), poem)) for poem in poems]
    return poems_vector, word_int_map, words
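A minimal usage sketch of the function above, assuming ./data/poems.txt is in place; the printed values are only indicative:
poems_vector, word_int_map, words = process_poems('./data/poems.txt')
print(len(poems_vector))       # number of poems kept after filtering
print(poems_vector[0][:10])    # the first poem as integer ids, starting with the id of 'B'
print(words[:10])              # the ten most frequent characters in the corpus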
Preprocess the poems into the batches needed for RNN training.
# poems.poems.generate_batch
# Turn the encoded poems into batches for RNN training.
import collections
import numpy as np

start_token = 'B'
end_token = 'E'


def generate_batch(batch_size, poems_vec, word_to_int):
    n_chunk = len(poems_vec) // batch_size
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        end_index = start_index + batch_size
        batches = poems_vec[start_index:end_index]
        # Length of the longest poem in this batch.
        length = max(map(len, batches))
        # Build the input array, padded with the id of ' ', and shift it one
        # position to the left to form the targets, so the model learns the
        # relation between each character and the one that follows it.
        x_data = np.full((batch_size, length), word_to_int[' '], np.int32)
        for row, batch in enumerate(batches):
            x_data[row, :len(batch)] = batch
        y_data = np.copy(x_data)
        y_data[:, :-1] = x_data[:, 1:]
        """
        x_data             y_data
        [6, 2, 4, 6, 9]    [2, 4, 6, 9, 9]
        [1, 4, 2, 8, 5]    [4, 2, 8, 5, 5]
        """
        x_batches.append(x_data)
        y_batches.append(y_data)
    return x_batches, y_batches
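The left shift above is easy to verify on its own with plain NumPy; this standalone check mirrors the example in the docstring:
import numpy as np

x_data = np.array([[6, 2, 4, 6, 9],
                   [1, 4, 2, 8, 5]], dtype=np.int32)
y_data = np.copy(x_data)
y_data[:, :-1] = x_data[:, 1:]
print(y_data)
# [[2 4 6 9 9]
#  [4 2 8 5 5]]
# Every position in y_data holds the character that follows the same position in
# x_data, which is exactly the next-character target the RNN is trained to predict.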
Define the RNN training model.
# poems.model.rnn_model
# Define the RNN training model.
import tensorflow as tf
import numpy as np


def rnn_model(model, input_data, output_data, vocab_size, rnn_size=128, num_layers=2, batch_size=64,
              learning_rate=0.01):
    """
    construct rnn seq2seq model.
    :param model: model class
    :param input_data: input data placeholder
    :param output_data: output data placeholder
    :param vocab_size:
    :param rnn_size:
    :param num_layers:
    :param batch_size:
    :param learning_rate:
    :return:
    """
    end_points = {}
    # Three cell types are available: plain RNN, GRU and LSTM.
    # This example uses LSTM.
    if model == 'rnn':
        cell_fun = tf.contrib.rnn.BasicRNNCell
    elif model == 'gru':
        cell_fun = tf.contrib.rnn.GRUCell
    elif model == 'lstm':
        cell_fun = tf.contrib.rnn.BasicLSTMCell

    cell = cell_fun(rnn_size, state_is_tuple=True)
    cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

    if output_data is not None:
        initial_state = cell.zero_state(batch_size, tf.float32)
    else:
        initial_state = cell.zero_state(1, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable('embedding', initializer=tf.random_uniform(
            [vocab_size + 1, rnn_size], -1.0, 1.0))
        inputs = tf.nn.embedding_lookup(embedding, input_data)

    # [batch_size, ?, rnn_size] = [64, ?, 128]
    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state)
    output = tf.reshape(outputs, [-1, rnn_size])

    weights = tf.Variable(tf.truncated_normal([rnn_size, vocab_size + 1]))
    bias = tf.Variable(tf.zeros(shape=[vocab_size + 1]))
    logits = tf.nn.bias_add(tf.matmul(output, weights), bias=bias)
    # [?, vocab_size+1]

    if output_data is not None:
        # output_data must be one-hot encoded
        labels = tf.one_hot(tf.reshape(output_data, [-1]), depth=vocab_size + 1)
        # should be [?, vocab_size+1]
        loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
        # loss shape should be [?, vocab_size+1]
        total_loss = tf.reduce_mean(loss)
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

        end_points['initial_state'] = initial_state
        end_points['output'] = output
        end_points['train_op'] = train_op
        end_points['total_loss'] = total_loss
        end_points['loss'] = loss
        end_points['last_state'] = last_state
    else:
        prediction = tf.nn.softmax(logits)

        end_points['initial_state'] = initial_state
        end_points['last_state'] = last_state
        end_points['prediction'] = prediction

    return end_points
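A quick sketch of what rnn_model returns in generation mode (output_data=None); the vocab_size used here is only a stand-in value, the real one is len(vocabularies):
import tensorflow as tf

tf.reset_default_graph()
input_data = tf.placeholder(tf.int32, [1, None])
points = rnn_model(model='lstm', input_data=input_data, output_data=None, vocab_size=6000)  # 6000 is a placeholder value
print(sorted(points.keys()))
# expected: ['initial_state', 'last_state', 'prediction']
# With output_data provided, the dict instead carries 'loss', 'total_loss', 'train_op' and 'output'.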
Train the model.
# train
# Main training routine.
FLAGS.epochs = 101


def run_training():
    if not os.path.exists(FLAGS.model_dir):
        os.makedirs(FLAGS.model_dir)
    # Preprocess the poems and generate the training data.
    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    # Turn the encoded poems into batches for RNN training.
    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)
    # Placeholders for the network inputs and targets.
    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    # Build the training network.
    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets, vocab_size=len(
        vocabularies), rnn_size=128, num_layers=2, batch_size=64, learning_rate=FLAGS.learning_rate)

    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess=sess)
        # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        sess.run(init_op)

        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)
        if checkpoint:
            saver.restore(sess, checkpoint)
            print("## restore from the checkpoint {0}".format(checkpoint))
            start_epoch += int(checkpoint.split('-')[-1])
        print('## start training...')
        try:
            n_chunk = len(poems_vector) // FLAGS.batch_size
            for epoch in range(start_epoch, FLAGS.epochs):
                n = 0
                for batch in range(n_chunk):
                    loss, _, _ = sess.run([
                        end_points['total_loss'],
                        end_points['last_state'],
                        end_points['train_op']
                    ], feed_dict={input_data: batches_inputs[n], output_targets: batches_outputs[n]})
                    n += 1
                    print('Epoch: %d, batch: %d, training loss: %.6f' % (epoch, batch, loss))
                if epoch % 6 == 0:
                    saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_prefix), global_step=epoch)
        except KeyboardInterrupt:
            print('## Interrupt manually, try saving checkpoint for now...')
            saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_prefix), global_step=epoch)
            print('## Last epoch was saved, next time will start from epoch {}.'.format(epoch))


def main():
    run_training()


if __name__ == '__main__':
    # The default graph must be cleared before running training again,
    # otherwise variables may be defined twice and raise an error.
    tf.reset_default_graph()
    main()
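Checkpoints are written every 6 epochs under ./model with the prefix poems and the epoch number as the global step, and tf.train.latest_checkpoint is what lets a later run resume from them. A small check, assuming the training above has already saved at least one checkpoint:
import tensorflow as tf
print(tf.train.latest_checkpoint('./model'))  # e.g. a path ending in 'poems-96'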
Use the trained model to compose poems:
# compose_poem
# Generate poems with the trained model.
import tensorflow as tf
from poems.model import rnn_model
from poems.poems import process_poems
import numpy as np

start_token = 'B'
end_token = 'E'
model_dir = './model/'
corpus_file = './data/poems.txt'
lr = 0.0001


def to_word(predict, vocabs):
    # Sample the next character from the predicted distribution.
    predict = predict[0]
    predict /= np.sum(predict)
    sample = np.random.choice(np.arange(len(predict)), p=predict)
    if sample >= len(vocabs):
        return vocabs[-1]
    else:
        return vocabs[sample]


def gen_poem(begin_word):
    batch_size = 1
    print('## loading corpus from %s' % model_dir)
    poems_vector, word_int_map, vocabularies = process_poems(corpus_file)

    input_data = tf.placeholder(tf.int32, [batch_size, None])
    # with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):  # modified
    end_points = rnn_model(model='lstm', input_data=input_data, output_data=None, vocab_size=len(
        vocabularies), rnn_size=128, num_layers=2, batch_size=64, learning_rate=lr)
    # with tf.variable_scope('saver', reuse=True):  # modified

    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)

        checkpoint = tf.train.latest_checkpoint(model_dir)
        saver.restore(sess, checkpoint)

        x = np.array([list(map(word_int_map.get, start_token))])

        [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],
                                         feed_dict={input_data: x})
        word = begin_word or to_word(predict, vocabularies)
        poem_ = ''
        i = 0
        while word != end_token:
            poem_ += word
            i += 1
            if i > 108:
                break
            x = np.array([[word_int_map[word]]])
            [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],
                                             feed_dict={input_data: x, end_points['initial_state']: last_state})
            word = to_word(predict, vocabularies)
        return poem_


def pretty_print_poem(poem_):
    poem_sentences = poem_.split('。')
    for s in poem_sentences:
        if s != '' and len(s) > 10:
            print(s + '。')


if __name__ == '__main__':
    tf.reset_default_graph()  # clear the default graph first
    begin_char = '月'  # change the starting character here, or restore input('## please input the first character:')
    poem = gen_poem(begin_char)
    pretty_print_poem(poem_=poem)
While writing and testing code in the Notebook, keep in mind that the Notebook does not clear the model information held in TensorFlow's default graph between runs, so the graph has to be reset manually before each training or poem-generation run:
tf.reset_default_graph()  # clear the default graph
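A minimal sketch of the pattern used throughout this notebook: reset the graph first, then rebuild and run, whether for another round of training or for generation:
import tensorflow as tf

tf.reset_default_graph()       # drop all previously defined variables and ops
poem = gen_poem('春')          # gen_poem can now rebuild the graph without name clashes
pretty_print_poem(poem_=poem)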