华为云深度学习kaggle猫狗识别-进阶版
使用华为云深度学习服务完成kaggle猫狗识别竞赛-进阶版
进阶版主要使用了moxing和tensorflow框架。好处在于可以直接使用华为云GPU，非常迅速即可调试；坏处在于需要对自己的代码进行修改以适配框架。
1. 数据处理
将图片数据存储为tfrecord。我对图片进行了resize处理，可以产生image size为128和64两种tfrecord，同时划分了train、eval数据集。
代码参考华为云dls-example的使用moxing玩冰山
# 前面的部分代码和keras模式基本相同将图片数据处理为numpy数组不同的地方在于这个numpy数组我reshape为二维的了 # coding=utf-8 from __future__ import division, absolute_import, print_function import os, cv2, random import pandas as pd import tensorflow as tf import numpy as np import moxing as mox import matplotlib.pyplot as plt import moxing.tensorflow as mtf from sklearn.model_selection import train_test_split from tensorflow import keras from tensorflow.python.keras import layers slim = tf.contrib.slim print('load lib success', tf.VERSION) TRAIN_DIR = 's3://dls-dogcat/input/train/' TEST_DIR = 's3://dls-dogcat/input/test/' OUTPUT_DIR = 's3://dls-dogcat/input/128' TRAIN64 = 'train64.tfrecord' EVAL64 = 'eval64.tfrecord' TEST64 = 'test64.tfrecord' TRAIN128 = 'train128.tfrecord' EVAL128 = 'eval128.tfrecord' TEST128 = 'test128.tfrecord' train_image_url = [TRAIN_DIR + i for i in mox.file.list_directory(TRAIN_DIR)] IMAGE_SIZE = 64 CHANNELS = 3 random.shuffle(train_image_url) TRAIN_SIZE = len(train_image_url) TEST_SIZE = len(tf.gfile.ListDirectory(TEST_DIR)) test_image_url = [] for i in range(1, TEST_SIZE + 1): path = TEST_DIR + str(i) + '.jpg' test_image_url.append(path) print('Train images : {}, Test images : {}'.format(TRAIN_SIZE, TEST_SIZE)) def read_image(file_path): img = cv2.imdecode(np.fromstring(mox.file.read(file_path, binary=True), np.uint8), cv2.IMREAD_COLOR) return cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_CUBIC) def prep_data(images): count = len(images) data = np.ndarray((count, IMAGE_SIZE * IMAGE_SIZE * CHANNELS), dtype=np.float32) # data也要改变对应下面 for i, image_file in enumerate(images): image = read_image(image_file) image = image.reshape(-1) # 为了使用tfrecord的floatlist我reshape了一下其实可以使用byteslist就不用reshape data[i] = image / 255.0 if i%1000 == 0: print('Processed {} of {}'.format(i, count)) return data train_set = prep_data(train_image_url) test_set = prep_data(test_image_url) print('Train_set shape: {}, Test_set shape: {}'.format(train_set.shape, test_set.shape)) train_labels = 
np.ndarray((len(train_image_url), 1), np.int64) for i, url in enumerate(train_image_url): tag = url.split('.')[-3].split('/')[-1] if tag == 'dog': train_labels[i] = 1 elif tag == 'cat': train_labels[i] = 0 # 在这里划分数据集80%train 20%eval print('Train_labels shape: {}'.format(train_labels.shape)) train_imgs, eval_imgs, train_labels, eval_labels = train_test_split(train_set, train_labels, test_size=0.2) print(train_imgs.shape, eval_imgs.shape, train_labels.shape, eval_labels.shape) # 将train和eval的numpy数组写入tfrecord的函数这里需要定义Features一个是image保存图片数组一个是label保存标签1、0 # FloatList要求数组为一维的所以上面我reshape了如果使用byteslist就不用 def convert_and_encode_to_tfrecord(num_samples, images, labels, output_file): output_file = os.path.join(OUTPUT_DIR, output_file) with tf.python_io.TFRecordWriter(output_file) as tfrecord_writer: for i in range(num_samples): example = tf.train.Example(features=tf.train.Features(feature={ 'image': tf.train.Feature(float_list=tf.train.FloatList(value=images[i])), 'label': tf.train.Feature(int64_list=tf.train.Int64List(value=labels[i])) })) tfrecord_writer.write(example.SerializeToString()) # 此处执行完毕会产生train64.tfrecord和eval64.tfrecord两个文件train0.916GBeval234MBtfrecord比源文件大是很正常的 convert_and_encode_to_tfrecord(train_imgs.shape[0], train_imgs, train_labels, TRAIN64) convert_and_encode_to_tfrecord(eval_imgs.shape[0], eval_imgs, eval_labels, EVAL64) # 同理这里将test写入tfrecord但是test没有label所以只需要一个feature def convert_test_and_encode_to_tfrecord(num_samples, images, output_file): output_file = os.path.join(OUTPUT_DIR, output_file) with tf.python_io.TFRecordWriter(output_file) as tfrecord_writer: for i in range(num_samples): example = tf.train.Example(features=tf.train.Features(feature={ 'image': tf.train.Feature(float_list=tf.train.FloatList(value=images[i])) })) tfrecord_writer.write(example.SerializeToString()) # 大概586MB convert_test_and_encode_to_tfrecord(test_set.shape[0], test_set, TEST64) # 读取tfrecord的数据并显示出来就可以验证是否正确的存入了数据使用mox的get_tfrecord参数照着改就完事了 # 
这个方法不能与下面读取test一起使用因为使用了session def read_and_decode_tfrecord(dataset_dir, file_pattern, num_samples): keys_to_features = { 'image': tf.FixedLenFeature((IMAGE_SIZE * IMAGE_SIZE * CHANNELS,), tf.float32, default_value=None), 'label': tf.FixedLenFeature([1], tf.int64, default_value=None), } # 这里可以实现对一维数组的reshape我们将数据恢复为图片的三维形式 items_to_handlers = { 'image': slim.tfexample_decoder.Tensor('image', shape=[IMAGE_SIZE, IMAGE_SIZE, CHANNELS]), 'label': slim.tfexample_decoder.Tensor('label', shape=[]), } dataset = mtf.get_tfrecord(dataset_dir=dataset_dir, file_pattern=file_pattern, num_samples=num_samples, keys_to_features=keys_to_features, items_to_handlers=items_to_handlers, shuffle=False, num_epochs=1) image, label = dataset.get(['image', 'label']) sv = tf.train.Supervisor() # 这里可以显示图片20张图片 with sv.managed_session() as sess: plt.figure() for i in range(20): subp = plt.subplot(4, 5, i + 1) plt.subplots_adjust(hspace=1) subp.imshow(sess.run(image)) label_eval = sess.run(label) subp.set_title('label=%s' % (label_eval)) plt.show() read_and_decode_tfrecord(dataset_dir=OUTPUT_DIR, file_pattern='eval*.tfrecord', num_samples=5000) def read_test_and_decode_tfrecord(dataset_dir, file_pattern, num_samples): keys_to_features = { 'image': tf.FixedLenFeature((IMAGE_SIZE * IMAGE_SIZE * CHANNELS,), tf.float32, default_value=None) } items_to_handlers = { 'image': slim.tfexample_decoder.Tensor('image', shape=[IMAGE_SIZE, IMAGE_SIZE, CHANNELS]) } dataset = mtf.get_tfrecord(dataset_dir=dataset_dir, file_pattern=file_pattern, num_samples=num_samples, keys_to_features=keys_to_features, items_to_handlers=items_to_handlers, shuffle=False, num_epochs=1) image = dataset.get(['image']) sv = tf.train.Supervisor() with sv.managed_session() as sess: plt.figure() for i in range(20): subp = plt.subplot(4, 5, i + 1) plt.subplots_adjust(hspace=1) subp.imshow(sess.run(image[0])) # 注意这个位置test读出来的形式不一样需要取image[0]才是正确的图片信息此处有伏笔 plt.show() read_test_and_decode_tfrecord(dataset_dir=OUTPUT_DIR, 
file_pattern='test*.tfrecord', num_samples=12500)
2. 代码
经过上面的数据处理，会得到三个tfrecord，分别是train、eval、test。可以修改参数得到128大小的tfrecord，文件大小为：train 3.66GB，eval 0.915GB，test 2.28GB。
# 前面部分不必多说是一些固定的库以及参数 from __future__ import absolute_import, division, print_function import os os.environ.pop('http_proxy', None) import math import numpy as np import pandas as pd import tensorflow as tf import moxing.tensorflow as mox from tensorflow.python.keras import layers slim = tf.contrib.slim NUM_SAMPLES_TRAIN = 20000 NUM_SAMPLES_EVAL = 5000 NUM_SAMPLES_TEST = 12500 # image_size对应之前存入tfrecord的图片大小 IMAGE_SIZE = 128 CHANNELS = 3 # 定义flags参数这些参数将在创建作业时指定有batch_sizedata_url数据集路径log_url日志以及checkpoint的路径、csv文件存储路径is_training决定训练还是预测 tf.flags.DEFINE_integer('batch_size', 16, 'Mini-batch size') tf.flags.DEFINE_string('data_url', 's3://zxy/model/zzy', 'Dir of dataset') tf.flags.DEFINE_string('log_dir', 's3://zxy/model/zzy/log', 'Dir of log') tf.flags.DEFINE_boolean('is_training', True, 'True for train. False for eval and predict.') flags = tf.flags.FLAGS # 这个地方加上去可以避免因为obs传数据不稳定造成的失败本质上是将桶内的数据拷贝到cache中 import atexit import logging _data_url = flags.data_url _log_dir = flags.log_dir if not mox.file.is_directory(_log_dir): mox.file.make_dirs(_log_dir) mox.file.make_dirs('/cache/data_url') mox.file.make_dirs('/cache/log_dir') mox.file.copy_parallel(_data_url, '/cache/data_url') mox.file.copy_parallel(_log_dir, '/cache/log_dir') flags.data_url = '/cache/data_url' flags.log_dir = '/cache/log_dir' atexit.register(lambda: mox.file.copy_parallel('/cache/log_dir', _log_dir)) logger = logging.getLogger() while logger.handlers: logger.handlers.pop() # 这个部分实现了查询当前环境下的GPU数量并据此定义steps后面要用 num_gpus = mox.get_flag('num_gpus') # if using distributed, the number of workers is related to the number of machines. 
num_workers = len(mox.get_flag('worker_hosts').split(',')) steps_per_epoch = int(math.ceil(float(NUM_SAMPLES_TRAIN) / (flags.batch_size * num_gpus * num_workers))) submission = pd.DataFrame(columns=['label']) # 定义输入函数返回image、label使用方式类似上面显示tfrecord的图片 def input_fn(run_mode, **kwargs): # Train if run_mode == mox.ModeKeys.TRAIN: num_samples = NUM_SAMPLES_TRAIN num_epochs = None shuffle = True file_pattern = 'train*.tfrecord' # Eval or Test else: num_epochs = 1 shuffle = False if run_mode == mox.ModeKeys.EVAL: num_samples = NUM_SAMPLES_EVAL file_pattern = 'eval*.tfrecord' else: num_samples = NUM_SAMPLES_TEST file_pattern = 'test*.tfrecord' keys_to_features = { 'image': tf.FixedLenFeature((IMAGE_SIZE * IMAGE_SIZE * CHANNELS,), tf.float32, default_value=None) } items_to_handlers = { 'image': slim.tfexample_decoder.Tensor('image', shape=[IMAGE_SIZE, IMAGE_SIZE, CHANNELS]) } if run_mode != mox.ModeKeys.PREDICT: keys_to_features['label'] = tf.FixedLenFeature([1], tf.int64, default_value=None) items_to_handlers['label'] = slim.tfexample_decoder.Tensor('label', shape=[]) # returns an instance of 'DatasetDataProvider' # defined in 'tensorflow/contrib/slim/python/data/dataset_data_provider.py' dataset = mox.get_tfrecord(dataset_dir=flags.data_url, file_pattern=file_pattern, num_samples=num_samples, keys_to_features=keys_to_features, items_to_handlers=items_to_handlers, num_epochs=num_epochs, shuffle=shuffle) # 如果运行的是predict返回image如果是train或eval返回image和label if run_mode == mox.ModeKeys.PREDICT: image = dataset.get(['image']) # Non-DMA safe string cannot tensor may not be copied to a GPU. # So we encode string to a list of integer. 
return image else: image, label = dataset.get(['image', 'label']) return image, label # Data augementation(Only using in training data) # if run_mode == mox.ModeKeys.TRAIN: # image = tf.image.random_flip_left_right(image) # image = tf.image.random_flip_up_down(image) # image = tf.image.rot90(image, k=tf.random_uniform(shape=(), maxval=3, minval=0, dtype=tf.int32)) # 定义CNN模型不多说和keras模式差不多 f1, f2, f3, f4 = 32, 64, 128, 256 k1, k2, k3, k4 = 5, 5, 5, 5 s1, s2, s3, s4 = 2, 2, 2, 2 c1, c2, c3, c4 = 2, 2, 2, 2 fc1, fc2, fc3, fc4 = 256, 128, 64, 128 # Classification Model def model_v1(images, run_mode): is_training = (run_mode == mox.ModeKeys.TRAIN) # Conv Layer 1 x = layers.Conv2D(f1, k1, c1, padding='SAME', activation=tf.nn.relu, input_shape=(IMAGE_SIZE, IMAGE_SIZE, CHANNELS))(images) x = layers.MaxPool2D(strides=s1, padding='SAME')(x) x = layers.Dropout(0.5)(x, training=is_training) # Conv Layer 2 x = layers.Conv2D(f2, k2, c2, padding='SAME', activation=tf.nn.relu)(x) x = layers.MaxPool2D(strides=s2, padding='SAME')(x) x = layers.Dropout(0.5)(x, training=is_training) # Conv Layer 3 x = layers.Conv2D(f3, k3, c3, padding='SAME', activation=tf.nn.relu)(x) x = layers.MaxPool2D(strides=s3, padding='SAME')(x) x = layers.Dropout(0.5)(x, training=is_training) # Conv Layer 4 x = layers.Conv2D(f4, k4, c4, padding='SAME', activation=tf.nn.relu)(x) x = layers.MaxPool2D(strides=s4, padding='SAME')(x) x = layers.Dropout(0.5)(x, training=is_training) # Flatten the data for upcoming dense layers x = layers.Flatten()(x) # Dense Layers x = layers.Dense(fc1, activation=tf.nn.relu)(x) x = layers.Dropout(0.2)(x, training=is_training) # Dense Layer 2 x = layers.Dense(fc2, activation=tf.nn.relu)(x) x = layers.Dropout(0.2)(x, training=is_training) # Sigmoid Layer logits =layers.Dense(2)(x) return logits # 定义model函数这里是核心部分从input_fn得到的数据在这里与model_fn发生反应 # 通过mox.ModelSpec打印信息或者输出信息与keras不同的是这里使用softmax以及one-hot编码sigmoid应该也是可以的 def model_fn(inputs, run_mode, **kwargs): # In train or eval, 
id_or_labels represents labels. In predict, id_or_labels represents id. if run_mode == mox.ModeKeys.PREDICT: images = inputs[0] else: images, labels = inputs # Reshape angles from [batch_size] to [batch_size, 1] # Apply your version of model logits = model_v1(images, run_mode) if run_mode == mox.ModeKeys.PREDICT: logits = tf.nn.softmax(logits) model_spec = mox.ModelSpec(output_info={'logits': logits}) else: labels_one_hot = slim.one_hot_encoding(labels, 2) loss = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels_one_hot) model_spec = mox.ModelSpec(loss=loss, log_info={'loss': loss}) return model_spec # 这里对应model_fn的output_info将logits[1]即dog的概率保存到DataFrame中之后写入 def output_fn(outputs): global submission for output in outputs: for logits in output['logits']: # Get the probability of label==1 label = logits[1] df = pd.DataFrame([label], columns=['label']) submission = submission.append(df) # 主函数根据创建作业时指定的is_training决定运行哪个方法True对应训练False对应eval和test if __name__ == '__main__': # In training mode, using max_number_of_steps to control the epochs # Else, only run one epoch in eval and test mode. 
if flags.is_training: mox.run(input_fn=input_fn, model_fn=model_fn, optimizer_fn=mox.get_optimizer_fn(name='adam', learning_rate=0.0001), run_mode=mox.ModeKeys.TRAIN, batch_size=flags.batch_size, log_dir=flags.log_dir, max_number_of_steps=steps_per_epoch * 150, # 这里与训练次数相关自行修改 log_every_n_steps=20, save_summary_steps=50, save_model_secs=120) else: mox.run(input_fn=input_fn, model_fn=model_fn, run_mode=mox.ModeKeys.EVAL, batch_size=5, log_every_n_steps=1, max_number_of_steps=int(NUM_SAMPLES_EVAL / 5), checkpoint_path=flags.log_dir) mox.run(input_fn=input_fn, output_fn=output_fn, model_fn=model_fn, run_mode=mox.ModeKeys.PREDICT, batch_size=20, # 这个地方很关键必须为test size能整除的数12500/20ok否则会丢失一部分测试数据 max_number_of_steps=int(NUM_SAMPLES_TEST / 20), # 对应上面 log_every_n_steps=50, output_every_n_steps=1, # 这个地方必须为1否则数据无法存入DataFrame中 checkpoint_path=flags.log_dir) # Write results to file. tf.gfile allow writing file to EBS/s3 submission_file = os.path.join(flags.log_dir, 'submission.csv') print('start to write!!!!!') print('info: ', submission.label.values[:10]) result = submission.to_csv(path_or_buf=None, index=False) with tf.gfile.Open(submission_file, 'w') as f: f.write(result) print('Success!!!')
3. 结果及分析
*使用image size为64的情况*
Loss:
Score:
*使用image size为128的情况*
Loss:
Score:
1. 在CNN不变的情况下，128大小的训练可以达到更低的损失，但是似乎结果并不比64好；而且与keras对比，似乎这个训练过程中有过拟合的问题，减少max_number_of_steps应该能得到更好的效果。
2. 使用GPU的速度可快多了：最初的keras使用CPU跑了10小时以上，使用4个GPU可以在几十分钟内跑完更多轮的数据，效果相当的好。
总结
1. 进行图片识别时，基本的方式是使用卷积神经网络，所以使用基于tensorflow的keras可以迅速搭建一个卷积神经网络。
2. 使用CNN的过程中可能会遇到内存不够的情况，一方面可以考虑优化CNN结构，另一方面可以考虑使用vgg16等已经训练好的模型，加上自己的模型进行调试。
3. 使用华为云进行开发的过程中会遇到一些问题，比如：读取图片的方式被限定、tensorflow 1.8版本model使用dataset会报错、文件写入的方式被限定、notebook意外断开连接等等。
4. 使用华为云很方便，基于一个云平台基本可以完成所有的机器学习开发，而且提供了不少的学习工具，还是很实用的。
5. 希望华为云可以提供更加直接的云计算平台使用方式。就我所想的，应该是提供一个可以ssh的服务器，这样子代码迁移更加方便。
- 点赞
- 收藏
- 关注作者
评论(0)