Reinforcement Learning Notes 5 - Python/OpenAI/TensorFlow/ROS - Stage Review

Posted by zhangrelay on 2021/07/15 03:40:56

So far, four lessons have been completed, focusing on OpenAI:

  1. Basics: https://blog.csdn.net/zhangrelay/article/details/91361113
  2. Programs and commands: https://blog.csdn.net/zhangrelay/article/details/91414600
  3. Planning and games: https://blog.csdn.net/zhangrelay/article/details/91867331
  4. Temporal difference: https://blog.csdn.net/zhangrelay/article/details/92012795

At this point, it is worth revisiting the earlier posts, which focus on ROS:

  1. Installation and configuration: https://blog.csdn.net/zhangrelay/article/details/89702997
  2. Environment setup: https://blog.csdn.net/zhangrelay/article/details/89817010
  3. Deep learning: https://blog.csdn.net/zhangrelay/article/details/90177162

Working through the series above gives a solid working command of the AI toolkit (OpenAI Gym) and the robotics toolkit (ROS).


Study how Q-learning and SARSA differ in the following environment. The two scripts are nearly identical; the essential difference is the update step, and a minimal comparison of the two update rules follows the listings.

Q-learning - circuit2_turtlebot_lidar_qlearn.py:


  
   
    
     
    
    
     
#!/usr/bin/env python
import gym
from gym import wrappers
import gym_gazebo
import time
import numpy
import random
import time

import qlearn
import liveplot

def render():
    render_skip = 0 #Skip first X episodes.
    render_interval = 50 #Show render Every Y episodes.
    render_episodes = 10 #Show Z episodes every rendering.

    if (x%render_interval == 0) and (x != 0) and (x > render_skip):
        env.render()
    elif ((x-render_episodes)%render_interval == 0) and (x != 0) and (x > render_skip) and (render_episodes < x):
        env.render(close=True)

if __name__ == '__main__':

    env = gym.make('GazeboCircuit2TurtlebotLidar-v0')

    outdir = '/tmp/gazebo_gym_experiments'
    env = gym.wrappers.Monitor(env, outdir, force=True)
    plotter = liveplot.LivePlot(outdir)

    last_time_steps = numpy.ndarray(0)

    qlearn = qlearn.QLearn(actions=range(env.action_space.n),
                    alpha=0.2, gamma=0.8, epsilon=0.9)

    initial_epsilon = qlearn.epsilon

    epsilon_discount = 0.9986

    start_time = time.time()
    total_episodes = 10000
    highest_reward = 0

    for x in range(total_episodes):
        done = False

        cumulated_reward = 0 #Should going forward give more reward then L/R ?

        observation = env.reset()

        if qlearn.epsilon > 0.05:
            qlearn.epsilon *= epsilon_discount

        #render() #defined above, not env.render()

        state = ''.join(map(str, observation))

        for i in range(1500):

            # Pick an action based on the current state
            action = qlearn.chooseAction(state)

            # Execute the action and get feedback
            observation, reward, done, info = env.step(action)
            cumulated_reward += reward

            if highest_reward < cumulated_reward:
                highest_reward = cumulated_reward

            nextState = ''.join(map(str, observation))

            # Q-learning: off-policy TD update toward reward + gamma * max over actions in nextState
            qlearn.learn(state, action, reward, nextState)

            env._flush(force=True)

            if not(done):
                state = nextState
            else:
                last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
                break

        if x%100==0:
            plotter.plot(env)

        m, s = divmod(int(time.time() - start_time), 60)
        h, m = divmod(m, 60)
        print ("EP: "+str(x+1)+" - [alpha: "+str(round(qlearn.alpha,2))+" - gamma: "+str(round(qlearn.gamma,2))+" - epsilon: "+str(round(qlearn.epsilon,2))+"] - Reward: "+str(cumulated_reward)+" Time: %d:%02d:%02d" % (h, m, s))

    #Github table content
    print ("\n|"+str(total_episodes)+"|"+str(qlearn.alpha)+"|"+str(qlearn.gamma)+"|"+str(initial_epsilon)+"*"+str(epsilon_discount)+"|"+str(highest_reward)+"| PICTURE |")

    l = last_time_steps.tolist()
    l.sort()

    #print("Parameters: a="+str)
    print("Overall score: {:0.2f}".format(last_time_steps.mean()))
    print("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:])))

    env.close()
     
    
  
 

SARSA - circuit2_turtlebot_lidar_sarsa.py:


  
   
    
     
    
    
     
#!/usr/bin/env python
import gym
from gym import wrappers
import gym_gazebo
import time
import numpy
import random
import time

import liveplot
import sarsa


if __name__ == '__main__':

    env = gym.make('GazeboCircuit2TurtlebotLidar-v0')

    outdir = '/tmp/gazebo_gym_experiments'
    env = gym.wrappers.Monitor(env, outdir, force=True)
    plotter = liveplot.LivePlot(outdir)

    last_time_steps = numpy.ndarray(0)

    sarsa = sarsa.Sarsa(actions=range(env.action_space.n),
                    epsilon=0.9, alpha=0.2, gamma=0.9)

    initial_epsilon = sarsa.epsilon

    epsilon_discount = 0.9986

    start_time = time.time()
    total_episodes = 10000
    highest_reward = 0

    for x in range(total_episodes):
        done = False

        cumulated_reward = 0 #Should going forward give more reward then L/R ?

        observation = env.reset()

        if sarsa.epsilon > 0.05:
            sarsa.epsilon *= epsilon_discount

        #render() #defined above, not env.render()

        state = ''.join(map(str, observation))

        for i in range(1500):

            # Pick an action based on the current state
            action = sarsa.chooseAction(state)

            # Execute the action and get feedback
            observation, reward, done, info = env.step(action)
            cumulated_reward += reward

            if highest_reward < cumulated_reward:
                highest_reward = cumulated_reward

            nextState = ''.join(map(str, observation))
            nextAction = sarsa.chooseAction(nextState)

            #sarsa.learn(state, action, reward, nextState)
            # SARSA: on-policy TD update using the action actually chosen in nextState
            sarsa.learn(state, action, reward, nextState, nextAction)

            env._flush(force=True)

            if not(done):
                state = nextState
            else:
                last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
                break

        if x%100==0:
            plotter.plot(env)

        m, s = divmod(int(time.time() - start_time), 60)
        h, m = divmod(m, 60)
        print ("EP: "+str(x+1)+" - [alpha: "+str(round(sarsa.alpha,2))+" - gamma: "+str(round(sarsa.gamma,2))+" - epsilon: "+str(round(sarsa.epsilon,2))+"] - Reward: "+str(cumulated_reward)+" Time: %d:%02d:%02d" % (h, m, s))

    #Github table content
    print ("\n|"+str(total_episodes)+"|"+str(sarsa.alpha)+"|"+str(sarsa.gamma)+"|"+str(initial_epsilon)+"*"+str(epsilon_discount)+"|"+str(highest_reward)+"| PICTURE |")

    l = last_time_steps.tolist()
    l.sort()

    #print("Parameters: a="+str)
    print("Overall score: {:0.2f}".format(last_time_steps.mean()))
    print("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:])))

    env.close()
     
    
  
 

Review: temporal difference, https://blog.csdn.net/zhangrelay/article/details/92012795
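To connect this back to the temporal-difference post: the only substantive difference between the two scripts above is the TD target used inside learn(). The snippet below is a minimal, self-contained sketch of the two tabular updates; it is illustrative only and does not reproduce the actual qlearn.py / sarsa.py modules shipped with gym_gazebo (the table q and the function names are made up for this sketch).

# Minimal sketch of the two tabular TD updates (illustrative only; not the
# actual qlearn.py / sarsa.py modules from gym_gazebo).
from collections import defaultdict

q = defaultdict(float)   # Q-table: (state, action) -> value
alpha, gamma = 0.2, 0.8  # learning rate and discount, as in the scripts above

def q_learning_update(state, action, reward, next_state, actions):
    # Off-policy: bootstrap from the best action in next_state,
    # regardless of which action the epsilon-greedy policy will actually take.
    best_next = max(q[(next_state, a)] for a in actions)
    target = reward + gamma * best_next
    q[(state, action)] += alpha * (target - q[(state, action)])

def sarsa_update(state, action, reward, next_state, next_action):
    # On-policy: bootstrap from the action the behaviour policy actually chose.
    target = reward + gamma * q[(next_state, next_action)]
    q[(state, action)] += alpha * (target - q[(state, action)])

This is why the SARSA script has to call sarsa.chooseAction(nextState) before sarsa.learn(state, action, reward, nextState, nextAction), whereas the Q-learning script passes only nextState and lets the update take the maximum over actions itself.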

Comparing the taxi demo in that post with the TurtleBot demo above is enough to understand and master the most basic use of the two tools, ROS and OpenAI Gym.
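For that comparison, the loop below applies the same tabular Q-learning idea to Gym's Taxi environment. It is a rough sketch under stated assumptions: the classic gym API (reset() returns an integer state, step() returns four values) and the 'Taxi-v3' environment id; the earlier post may have used a different id and different hyperparameters.

# Minimal tabular Q-learning on Gym's Taxi environment, for comparison with
# the TurtleBot scripts above. Assumes the classic gym API and 'Taxi-v3'.
import random
import numpy as np
import gym

env = gym.make('Taxi-v3')
q = np.zeros((env.observation_space.n, env.action_space.n))
alpha, gamma, epsilon = 0.2, 0.9, 0.1

for episode in range(2000):
    state = env.reset()
    done = False
    while not done:
        # epsilon-greedy action selection, like chooseAction() above
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(q[state]))
        next_state, reward, done, info = env.step(action)
        # same off-policy update as in circuit2_turtlebot_lidar_qlearn.py
        q[state, action] += alpha * (reward + gamma * np.max(q[next_state]) - q[state, action])
        state = next_state

Unlike the TurtleBot scripts, nothing here touches ROS or Gazebo: Taxi is a pure-Python environment, which makes it a convenient place to check the agent logic before moving it into a gym_gazebo simulation.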


 

Source: zhangrelay.blog.csdn.net. Author: zhangrelay. Copyright belongs to the original author; please contact the author for reprint permission.

Original link: zhangrelay.blog.csdn.net/article/details/92050001
