# 从零实现PPO算法：PyTorch实战CartPole平衡杆训练与调参全解析

## 1. 强化学习与PPO算法基础

在开始实战之前，我们需要先理解几个核心概念。强化学习（Reinforcement Learning）是机器学习的一个重要分支，它关注的是智能体（Agent）如何通过与环境（Environment）的交互来学习最优策略（Policy），从而最大化累积奖励（Reward）。

PPO（Proximal Policy Optimization）是一种基于策略梯度的强化学习算法，它通过限制策略更新的幅度来保证训练的稳定性。与传统的策略梯度方法相比，PPO有两个主要优势：

- 重要性采样：允许重复使用旧策略收集的数据
- 裁剪机制：防止策略更新过大导致性能崩溃

```python
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

# 设置随机种子保证可重复性
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
```

## 2. 环境与网络架构搭建

### 2.1 CartPole环境介绍

CartPole是一个经典的强化学习测试环境，目标是通过左右移动小车来保持杆子竖直。环境提供以下状态信息：

- 小车位置
- 小车速度
- 杆子角度
- 杆子角速度

动作空间是离散的：0（向左移动）和1（向右移动）。每保持平衡一步获得+1奖励，当杆子倾斜超过12度或小车移动超出边界时回合结束。

```python
env = gym.make("CartPole-v1")
env.seed(SEED)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
print(f"状态空间维度: {state_dim}")
print(f"动作空间维度: {action_dim}")
```

### 2.2 Actor-Critic网络实现

PPO采用Actor-Critic架构，其中：

- Actor网络：输出动作概率分布
- Critic网络：评估状态价值

```python
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        # 共享的特征提取层
        self.shared_layers = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh()
        )
        # Actor分支
        self.actor = nn.Sequential(
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )
        # Critic分支
        self.critic = nn.Linear(64, 1)

    def forward(self, state):
        features = self.shared_layers(state)
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value
```

3. 
PPO核心算法实现

### 3.1 经验收集与存储

PPO需要收集智能体与环境交互的经验（状态、动作、奖励等）用于训练。我们使用一个简单的缓冲区来存储这些数据。

```python
class PPOBuffer:
    def __init__(self, buffer_size, state_dim):
        self.states = np.zeros((buffer_size, state_dim), dtype=np.float32)
        self.actions = np.zeros(buffer_size, dtype=np.int64)
        self.rewards = np.zeros(buffer_size, dtype=np.float32)
        self.values = np.zeros(buffer_size, dtype=np.float32)
        self.log_probs = np.zeros(buffer_size, dtype=np.float32)
        self.dones = np.zeros(buffer_size, dtype=np.bool_)
        self.ptr = 0
        self.max_size = buffer_size

    def store(self, state, action, reward, value, log_prob, done):
        idx = self.ptr % self.max_size
        self.states[idx] = state
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.values[idx] = value
        self.log_probs[idx] = log_prob
        self.dones[idx] = done
        self.ptr += 1

    def get(self):
        return (
            self.states, self.actions, self.rewards,
            self.values, self.log_probs, self.dones
        )
```

### 3.2 优势函数计算

优势函数 $A(s,a) = Q(s,a) - V(s)$ 衡量了在状态 $s$ 下采取动作 $a$ 比平均情况好多少。我们使用广义优势估计（GAE）来计算优势函数。

```python
def compute_gae(rewards, values, dones, gamma=0.99, lam=0.95):
    batch_size = len(rewards)
    advantages = np.zeros(batch_size, dtype=np.float32)
    last_advantage = 0
    for t in reversed(range(batch_size)):
        if t == batch_size - 1:
            next_non_terminal = 1.0 - dones[t]
            next_value = values[t]
        else:
            next_non_terminal = 1.0 - dones[t]
            next_value = values[t+1]
        delta = rewards[t] + gamma * next_value * next_non_terminal - values[t]
        advantages[t] = delta + gamma * lam * next_non_terminal * last_advantage
        last_advantage = advantages[t]
    returns = advantages + values
    return advantages, returns
```

### 3.3 PPO损失函数

PPO的核心在于其特殊的损失函数设计，包括策略损失、价值函数损失和熵奖励。

```python
def ppo_loss(old_probs, states, actions, advantages, returns,
             clip_ratio=0.2, entropy_coef=0.01):
    # 计算新策略的概率和状态价值
    new_probs, new_values = model(torch.FloatTensor(states))
    new_probs = new_probs.gather(1, torch.LongTensor(actions).unsqueeze(1))
    old_probs = torch.FloatTensor(old_probs).unsqueeze(1)
    # 重要性采样比率
    ratio = (new_probs / old_probs).squeeze()
    # 裁剪策略损失
    surr1 = ratio * torch.FloatTensor(advantages)
    surr2 = torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * torch.FloatTensor(advantages)
    policy_loss = -torch.min(surr1, surr2).mean()
    # 价值函数损失
    value_loss = nn.MSELoss()(new_values.squeeze(), torch.FloatTensor(returns))
    # 熵奖励（鼓励探索）
    entropy = -(new_probs * torch.log(new_probs + 1e-10)).mean()
    entropy_bonus = entropy_coef * entropy
    total_loss = policy_loss + 0.5 * value_loss - entropy_bonus
    return total_loss, policy_loss.item(), value_loss.item(), entropy.item()
```

## 4. 训练流程与调参技巧

### 4.1 完整训练循环

```python
def train_ppo(env, model, optimizer, epochs=100, steps_per_epoch=4000,
              max_ep_len=1000, clip_ratio=0.2, train_iters=80,
              gamma=0.99, lam=0.95, lr=3e-4):
    buffer = PPOBuffer(steps_per_epoch, env.observation_space.shape[0])
    episode_rewards = []
    episode_lengths = []
    for epoch in range(epochs):
        state = env.reset()
        ep_reward = 0
        ep_len = 0
        for t in range(steps_per_epoch):
            with torch.no_grad():
                action_probs, value = model(torch.FloatTensor(state))
                action = torch.multinomial(action_probs, 1).item()
                log_prob = torch.log(action_probs[action])
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            ep_len += 1
            buffer.store(state, action, reward, value.item(), log_prob.item(), done)
            state = next_state
            if done or (ep_len == max_ep_len):
                episode_rewards.append(ep_reward)
                episode_lengths.append(ep_len)
                state = env.reset()
                ep_reward = 0
                ep_len = 0
        # 计算优势函数和回报
        states, actions, rewards, values, log_probs, dones = buffer.get()
        advantages, returns = compute_gae(rewards, values, dones, gamma, lam)
        # 标准化优势函数
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        # 更新模型
        for _ in range(train_iters):
            loss, policy_loss, value_loss, entropy = ppo_loss(
                log_probs, states, actions, advantages, returns, clip_ratio
            )
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # 打印训练信息
        if (epoch + 1) % 10 == 0:
            avg_reward = np.mean(episode_rewards[-10:])
            avg_len = np.mean(episode_lengths[-10:])
            print(f"Epoch: {epoch+1}, Avg Reward: {avg_reward:.1f}, Avg Length: {avg_len:.1f}")
            print(f"Loss: {loss.item():.3f}, Policy Loss: {policy_loss:.3f}, Value Loss: {value_loss:.3f}, Entropy: {entropy:.3f}")
    return episode_rewards
```

4.2 
关键调参经验

在PPO算法中，以下几个参数对训练效果影响最大：

- 折扣因子（gamma）：控制未来奖励的重要性
  - 较高值（0.99）：更关注长期回报
  - 较低值（0.9）：更关注即时奖励
- GAE参数（lambda）：平衡偏差和方差
  - 接近1：低偏差但高方差
  - 接近0：高偏差但低方差
- 裁剪比例（clip_ratio）：控制策略更新幅度
  - 典型值：0.1-0.3
  - 过大：失去PPO的约束效果
  - 过小：学习速度变慢
- 学习率（lr）：影响参数更新速度
  - 建议从3e-4开始尝试
  - 可以配合学习率衰减使用
- 批量大小：每次更新使用的样本数
  - CartPole：128-2048
  - 更复杂环境：更大批量

```python
# 超参数设置
hyperparams = {
    "epochs": 200,
    "steps_per_epoch": 4000,
    "max_ep_len": 1000,
    "clip_ratio": 0.2,
    "train_iters": 80,
    "gamma": 0.99,
    "lam": 0.95,
    "lr": 3e-4
}

# 初始化模型和优化器
model = ActorCritic(state_dim, action_dim)
optimizer = optim.Adam(model.parameters(), lr=hyperparams["lr"])

# 开始训练
rewards = train_ppo(env, model, optimizer, **hyperparams)
```

## 5. 训练结果分析与可视化

训练完成后，我们可以绘制奖励曲线来观察学习过程：

```python
def plot_rewards(rewards, window_size=100):
    plt.figure(figsize=(12, 6))
    # 原始奖励曲线
    plt.subplot(1, 2, 1)
    plt.plot(rewards)
    plt.title("Raw Training Rewards")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    # 滑动平均奖励曲线
    plt.subplot(1, 2, 2)
    moving_avg = np.convolve(rewards, np.ones(window_size)/window_size, mode="valid")
    plt.plot(moving_avg)
    plt.title(f"Moving Average (window={window_size})")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.tight_layout()
    plt.show()

plot_rewards(rewards)
```

典型的训练曲线会经历以下几个阶段：

- 探索期：奖励波动大，智能体随机尝试不同动作
- 学习期：奖励开始稳步上升
- 稳定期：奖励达到环境最大值（CartPole为500）

如果训练出现问题，可能表现为：

- 奖励不增长：学习率可能过大/过小，或网络结构不合适
- 奖励突然下降：策略更新过大，需要减小clip_ratio
- 奖励波动大：增大批量大小或调整GAE参数

6. 
模型测试与部署

训练完成后，我们可以测试模型在实际环境中的表现：

```python
def test_model(env, model, episodes=10, render=False):
    total_rewards = []
    for ep in range(episodes):
        state = env.reset()
        done = False
        ep_reward = 0
        while not done:
            if render:
                env.render()
            with torch.no_grad():
                action_probs, _ = model(torch.FloatTensor(state))
                action = torch.argmax(action_probs).item()
            state, reward, done, _ = env.step(action)
            ep_reward += reward
        total_rewards.append(ep_reward)
        print(f"Episode {ep+1}: Reward = {ep_reward}")
    avg_reward = np.mean(total_rewards)
    print(f"\nAverage reward over {episodes} episodes: {avg_reward:.1f}")
    return total_rewards

# 测试训练好的模型
test_rewards = test_model(env, model, episodes=10, render=True)
env.close()
```

对于实际部署，我们可以保存模型参数供后续使用：

```python
# 保存模型
torch.save(model.state_dict(), "ppo_cartpole.pth")

# 加载模型
loaded_model = ActorCritic(state_dim, action_dim)
loaded_model.load_state_dict(torch.load("ppo_cartpole.pth"))
loaded_model.eval()
```

## 7. 常见问题与解决方案

在实际实现PPO算法时，可能会遇到以下典型问题：

- 训练不稳定
  - 现象：奖励曲线剧烈波动
  - 解决方案：减小学习率，减小clip_ratio，增加批量大小
- 收敛速度慢
  - 现象：奖励增长缓慢
  - 解决方案：增大学习率，适当增大clip_ratio，调整网络结构
- 过早收敛到次优策略
  - 现象：奖励停滞在较低水平
  - 解决方案：增加熵奖励系数，减小gamma值
- 梯度爆炸
  - 现象：损失变为NaN
  - 解决方案：梯度裁剪，减小学习率，检查网络初始化
- 样本效率低
  - 现象：需要大量交互数据
  - 解决方案：增大GAE的lambda值，增加并行环境数量

```python
# 梯度裁剪示例
for _ in range(train_iters):
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # 梯度裁剪
    optimizer.step()
```