[2020 Huawei Cloud AI Bootcamp] Training a Poem-Writing Bot with a ModelArts Notebook

Posted by TaiShan on 2020/09/02 12:02:21
[Abstract] This article describes how to use a ModelArts Notebook, together with NLP methods, to train a poem-writing model.


There are many open-source implementations of poem-writing bots online; the code used here is adapted from https://github.com/yuyongsheng/tensorflow_poems


The author trained the model on ModelArts for 100 epochs, which took about 4 hours.


First, let's look at the output (four poems are attached, one each for Spring, Summer, Autumn and Winter):


[Images: four generated poems, one each for Spring, Summer, Autumn and Winter]

The complete code and the Notebook file Peom_Gen.ipynb are in the attachment; you can download them and load them directly into a Notebook for training.


1.  In this example the poem corpus has already been preprocessed into a unified format, so no further preprocessing of the poems is needed (an example line of the corpus format is shown after this list).


2.  The code is written in Python and uses TensorFlow for training.
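For reference, a line of the preprocessed corpus file (./data/poems.txt) looks like the following. This is only an illustrative example of the "title:content" format that process_poems below expects, not an actual line copied from the attached data:

静夜思:床前明月光，疑是地上霜。举头望明月，低头思故乡。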


First, define the training parameters:

#train

#Define the training parameters

#This cell must only be run once per session; running it again raises a duplicate flag definition error

import os

import tensorflow as tf

#from poems.model import rnn_model

#from poems.poems import process_poems, generate_batch


tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size.')

tf.app.flags.DEFINE_float('learning_rate', 0.01, 'learning rate.')

tf.app.flags.DEFINE_string('model_dir', os.path.abspath('./model'), 'model save path.')

tf.app.flags.DEFINE_string('file_path', os.path.abspath('./data/poems.txt'), 'file name of poems.')

tf.app.flags.DEFINE_string('model_prefix', 'poems', 'model save prefix.')

tf.app.flags.DEFINE_integer('epochs', 100, 'train how many epochs.')


FLAGS = tf.app.flags.FLAGS

FLAGS.epochs = 101


def main(_):

    print(FLAGS.file_path, FLAGS.epochs)


if __name__ == '__main__':

    tf.app.run()
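Because tf.app.flags raises an error when the same flag is defined twice in one process, re-running the cell above in a Notebook will fail. A minimal sketch of a tolerant wrapper for that situation follows; the helper name define_flag_string is made up for illustration and is not part of the original code:

#Notebook workaround (sketch): ignore duplicate flag definitions when a cell is re-run
def define_flag_string(name, default, doc):
    try:
        tf.app.flags.DEFINE_string(name, default, doc)
    except Exception:
        pass  # flag already defined in this kernel; keep the existing value

define_flag_string('model_prefix', 'poems', 'model save prefix.')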


Preprocess the poems and generate the training data:

#poems.poems.process_poems

#Poem preprocessing: generate the training data

import collections

import numpy as np


start_token = 'B'

end_token = 'E'


def process_poems(file_name):

    # poems -> list of numbers

    poems = []

    with open(file_name, "r", encoding='utf-8', ) as f:

        for line in f.readlines():

            try:

                #Each line in the file is one poem, in the format "title:content"

                #Only the poem body is kept and processed; the title is not used

                #Filter the content: skip poems containing special symbols, or shorter than 5 or longer than 79 characters

                #Poems that pass the filter are saved as start_token + content + end_token

                title, content = line.strip().split(':')

                content = content.replace(' ', '')

                if '_' in content or '(' in content or '（' in content or '《' in content or '[' in content or \

                        start_token in content or end_token in content:

                    continue

                if len(content) < 5 or len(content) > 79:

                    continue

                content = start_token + content + end_token

                poems.append(content)

            except ValueError as e:

                pass

    # poems = sorted(poems, key=len)


    #Collect every character:

    #for poem in poems:

    #    for word in poem:

    #        word

    all_words = [word for poem in poems for word in poem]

    # collections.Counter(['a','b','c','a','b','b'])

    # counter({'b': 3, 'a': 2, 'c': 1})

    counter = collections.Counter(all_words)

    words = sorted(counter.keys(), key=lambda x: counter[x], reverse=True)


    words.append(' ')

    L = len(words)

    #Build a dictionary mapping each character to an integer id, ordered by character frequency

    word_int_map = dict(zip(words, range(L)))

    #Use the mapping to re-encode every poem as a sequence of integer ids

    poems_vector = [list(map(lambda word: word_int_map.get(word, L), poem)) for poem in poems]


    return poems_vector, word_int_map, words
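To make the encoding step concrete, here is a small self-contained demonstration of the same frequency-sorted mapping. The two toy "poems" are made up and only meant to show the shape of the result:

#Toy demonstration (hypothetical data) of the frequency-sorted encoding above
import collections

toy_poems = ['B春眠不觉晓E', 'B夜来风雨声E']
all_words = [w for p in toy_poems for w in p]
counter = collections.Counter(all_words)
words = sorted(counter.keys(), key=lambda x: counter[x], reverse=True)
words.append(' ')  # the padding character gets the last id
word_int_map = dict(zip(words, range(len(words))))
vectors = [[word_int_map.get(w, len(words)) for w in p] for p in toy_poems]
print(word_int_map)  # e.g. {'B': 0, 'E': 1, '春': 2, ...}
print(vectors)       # each poem as a list of integer ids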



Preprocess the poems into the batched data needed to train the RNN:

#poems.poems.generate_batch

#Poem preprocessing: generate the training batches for the RNN model

import collections

import numpy as np


start_token = 'B'

end_token = 'E'


def generate_batch(batch_size, poems_vec, word_to_int):

    n_chunk = len(poems_vec) // batch_size

    x_batches = []

    y_batches = []

    for i in range(n_chunk):

        start_index = i * batch_size

        end_index = start_index + batch_size


        batches = poems_vec[start_index:end_index]

        #Find the length of the longest poem in this batch

        length = max(map(len, batches))

        #Build the input array; the targets are the inputs shifted left by one position, so each character is trained to predict the next one

        x_data = np.full((batch_size, length), word_to_int[' '], np.int32)

        for row, batch in enumerate(batches):

            x_data[row, :len(batch)] = batch

        y_data = np.copy(x_data)

        y_data[:, :-1] = x_data[:, 1:]

        """

        x_data             y_data

        [6,2,4,6,9]       [2,4,6,9,9]

        [1,4,2,8,5]       [4,2,8,5,5]

        """

        x_batches.append(x_data)

        y_batches.append(y_data)

    return x_batches, y_batches
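A quick sanity check of generate_batch on made-up id sequences confirms the shift relationship shown in the docstring; the shorter poem is padded with the id of ' ', here 0:

#Toy check (hypothetical ids), batch_size=2
toy_vec = [[6, 2, 4, 6, 9], [1, 4, 2, 8]]
toy_map = {' ': 0}
xb, yb = generate_batch(2, toy_vec, toy_map)
print(xb[0])  # [[6 2 4 6 9]
              #  [1 4 2 8 0]]  -> shorter poem padded with 0
print(yb[0])  # [[2 4 6 9 9]
              #  [4 2 8 0 0]]  -> x shifted left by one position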


Define the RNN model:

#poems.model.rnn_model

#Define the RNN model

import tensorflow as tf

import numpy as np



def rnn_model(model, input_data, output_data, vocab_size, rnn_size=128, num_layers=2, batch_size=64,

              learning_rate=0.01):

    """

    construct rnn seq2seq model.

    :param model: model class

    :param input_data: input data placeholder

    :param output_data: output data placeholder

    :param vocab_size:

    :param rnn_size:

    :param num_layers:

    :param batch_size:

    :param learning_rate:

    :return:

    """

    end_points = {}


    #Three cell types are supported: RNN, GRU and LSTM; this example uses LSTM

    if model == 'rnn':

        cell_fun = tf.contrib.rnn.BasicRNNCell

    elif model == 'gru':

        cell_fun = tf.contrib.rnn.GRUCell

    elif model == 'lstm':

        cell_fun = tf.contrib.rnn.BasicLSTMCell


    cell = cell_fun(rnn_size, state_is_tuple=True)

    cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers, state_is_tuple=True)


    if output_data is not None:

        initial_state = cell.zero_state(batch_size, tf.float32)

    else:

        initial_state = cell.zero_state(1, tf.float32)


    with tf.device("/cpu:0"):

        embedding = tf.get_variable('embedding', initializer=tf.random_uniform(

            [vocab_size + 1, rnn_size], -1.0, 1.0))

        inputs = tf.nn.embedding_lookup(embedding, input_data)


    # [batch_size, ?, rnn_size] = [64, ?, 128]

    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state)

    output = tf.reshape(outputs, [-1, rnn_size])


    weights = tf.Variable(tf.truncated_normal([rnn_size, vocab_size + 1]))

    bias = tf.Variable(tf.zeros(shape=[vocab_size + 1]))

    logits = tf.nn.bias_add(tf.matmul(output, weights), bias=bias)

    # [?, vocab_size+1]


    if output_data is not None:

        # output_data must be one-hot encode

        labels = tf.one_hot(tf.reshape(output_data, [-1]), depth=vocab_size + 1)

        # should be [?, vocab_size+1]


        loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)

        # loss shape should be [?, vocab_size+1]

        total_loss = tf.reduce_mean(loss)

        train_op = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)


        end_points['initial_state'] = initial_state

        end_points['output'] = output

        end_points['train_op'] = train_op

        end_points['total_loss'] = total_loss

        end_points['loss'] = loss

        end_points['last_state'] = last_state

    else:

        prediction = tf.nn.softmax(logits)


        end_points['initial_state'] = initial_state

        end_points['last_state'] = last_state

        end_points['prediction'] = prediction


    return end_points
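To see the tensor shapes mentioned in the comments above, the training graph can be built once with a toy vocabulary size and inspected. This is only a sketch for TensorFlow 1.x; the vocab_size of 6000 is an arbitrary placeholder:

#Sketch: build the training graph once and inspect the key shapes
tf.reset_default_graph()
toy_inputs = tf.placeholder(tf.int32, [64, None])
toy_targets = tf.placeholder(tf.int32, [64, None])
ep = rnn_model(model='lstm', input_data=toy_inputs, output_data=toy_targets,
               vocab_size=6000, rnn_size=128, num_layers=2, batch_size=64)
print(ep['output'].shape)      # (?, 128) -> [batch_size * time_steps, rnn_size]
print(ep['total_loss'].shape)  # ()       -> scalar mean cross-entropy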


Train the model:

#train

#Training main program

FLAGS.epochs = 101


def run_training():

    if not os.path.exists(FLAGS.model_dir):

        os.makedirs(FLAGS.model_dir)

    #Preprocess the poems and generate the training data

    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)

    #Generate the training batches for the RNN model

    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)

    #Define placeholders for the network inputs and targets

    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    #Build the training network

    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets, vocab_size=len(

        vocabularies), rnn_size=128, num_layers=2, batch_size=64, learning_rate=FLAGS.learning_rate)

    

    saver = tf.train.Saver(tf.global_variables())

    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    with tf.Session() as sess:

        # sess = tf_debug.LocalCLIDebugWrapperSession(sess=sess)

        # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

        sess.run(init_op)


        start_epoch = 0

        checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)

        if checkpoint:

            saver.restore(sess, checkpoint)

            print("## restore from the checkpoint {0}".format(checkpoint))

            start_epoch += int(checkpoint.split('-')[-1])

        print('## start training...')

        try:

            n_chunk = len(poems_vector) // FLAGS.batch_size

            for epoch in range(start_epoch, FLAGS.epochs):

                n = 0

                for batch in range(n_chunk):

                    loss, _, _ = sess.run([

                        end_points['total_loss'],

                        end_points['last_state'],

                        end_points['train_op']

                    ], feed_dict={input_data: batches_inputs[n], output_targets: batches_outputs[n]})

                    n += 1

                    print('Epoch: %d, batch: %d, training loss: %.6f' % (epoch, batch, loss))

                if epoch % 6 == 0:

                    saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_prefix), global_step=epoch)

        except KeyboardInterrupt:

            print('## Interrupt manually, try saving checkpoint for now...')

            saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_prefix), global_step=epoch)

            print('## Last epoch was saved; next time training will start from epoch {}.'.format(epoch))

            


def main():

    run_training()


if __name__ == '__main__':

    # The default graph must be cleared before re-running training; otherwise
    # variables may end up being defined twice

    tf.reset_default_graph()
    main()
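Checkpoints written to ./model only live on the Notebook instance's local disk. If you want to keep them outside the instance, ModelArts provides the MoXing library to copy files to OBS; the bucket path below is only a placeholder, replace it with your own:

#Sketch: copy the local checkpoint directory to OBS with MoXing
import moxing as mox
mox.file.copy_parallel('./model', 'obs://your-bucket/poems/model')  # 'obs://your-bucket/...' is a placeholder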



Use the trained model to compose poems:


#compose_poem

#Compose a poem

import tensorflow as tf

from poems.model import rnn_model

from poems.poems import process_poems

import numpy as np


start_token = 'B'

end_token = 'E'

model_dir = './model/'

corpus_file = './data/poems.txt'


lr = 0.0001



def to_word(predict, vocabs):

    predict = predict[0]       

    predict /= np.sum(predict)

    sample = np.random.choice(np.arange(len(predict)), p=predict)

    if sample >= len(vocabs):  # guard against sampling the id just past the vocabulary

        return vocabs[-1]

    else:

        return vocabs[sample]
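to_word normalizes the prediction vector into a probability distribution and samples one character from it. A tiny check with made-up values:

#Toy check (hypothetical probabilities): sample one character
toy_predict = np.array([[0.1, 0.6, 0.3]])
toy_vocabs = ['月', '春', '风']
print(to_word(toy_predict, toy_vocabs))  # prints '春' most of the time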



def gen_poem(begin_word):

    batch_size = 1

    print('## loading corpus from %s' % corpus_file)

    poems_vector, word_int_map, vocabularies = process_poems(corpus_file)


    input_data = tf.placeholder(tf.int32, [batch_size, None])

    #with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE): #modified

    end_points = rnn_model(model='lstm', input_data=input_data, output_data=None, vocab_size=len(

        vocabularies), rnn_size=128, num_layers=2, batch_size=64, learning_rate=lr)


    #with tf.variable_scope('saver', reuse = True ): #modified

    saver = tf.train.Saver(tf.global_variables())

    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    with tf.Session() as sess:

        sess.run(init_op)


        checkpoint = tf.train.latest_checkpoint(model_dir)

        saver.restore(sess, checkpoint)


        x = np.array([list(map(word_int_map.get, start_token))])


        [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],

                                         feed_dict={input_data: x})

        word = begin_word or to_word(predict, vocabularies)

        poem_ = ''


        i = 0

        while word != end_token:

            poem_ += word

            i += 1

            if i > 108:

                break

            x = np.array([[word_int_map[word]]])

            [predict, last_state] = sess.run([end_points['prediction'], end_points['last_state']],

                                             feed_dict={input_data: x, end_points['initial_state']: last_state})

            word = to_word(predict, vocabularies)


        return poem_



def pretty_print_poem(poem_):

    poem_sentences = poem_.split('。')

    for s in poem_sentences:

        if s != '' and len(s) > 10:

            print(s + '。')


if __name__ == '__main__':

    tf.reset_default_graph()  # clear the default graph

    begin_char = '月'  # change the first character here, or use input('## please input the first character:')

    poem = gen_poem(begin_char)

    pretty_print_poem(poem_=poem)
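To reproduce the four seasonal poems shown at the top of this article, you can loop over the four first characters; the graph has to be reset before every generation pass, for the reason explained below:

#Sketch: generate one poem each for Spring, Summer, Autumn and Winter
for ch in ['春', '夏', '秋', '冬']:
    tf.reset_default_graph()  # clear the graph before gen_poem rebuilds it
    pretty_print_poem(gen_poem(ch))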



While writing and testing code in a Notebook, the Notebook does not clear the information stored in the TensorFlow graph between runs, so the graph must be reset manually before each training or composing run:


  tf.reset_default_graph()  # clear the default graph
