GAE

Generalized Advantage Estimation (GAE) trades off bias and variance in the advantage estimates via the λ parameter; the returns it produces double as regression targets for the value function.

```python
import torch


def compute_gae(rewards, values, dones, next_values, gamma=0.99, lam=0.95):
    """Minimal GAE implementation."""
    # TD errors: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    deltas = rewards + gamma * next_values * (1 - dones) - values
    # Backward recursion: A_t = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    advantages = []
    gae = 0
    for t in range(len(deltas) - 1, -1, -1):
        gae = deltas[t] + gamma * lam * gae * (1 - dones[t])
        advantages.insert(0, gae)
    advantages = torch.tensor(advantages)
    # Lambda-returns: the regression targets for the value function
    returns = advantages + values
    return advantages, returns
```

Policy loss

The standard PPO clipped surrogate objective. `action_mask` lets the loss average only over valid action positions (e.g., response tokens in RLHF-style sequence data) instead of padding.

```python
def compute_policy_loss(log_probs, old_log_probs, advantages, action_mask=None, clip_eps=0.2):
    """PPO clipped surrogate loss."""
    # Importance ratio pi_theta / pi_theta_old, computed in log space for stability
    ratio = (log_probs - old_log_probs).exp()
    surr1 = ratio * advantages
    surr2 = ratio.clamp(1.0 - clip_eps, 1.0 + clip_eps) * advantages
    # Pessimistic bound: take the minimum of the two surrogates, negate for gradient descent
    loss = -torch.min(surr1, surr2)
    if action_mask is None:
        return loss.mean(-1).mean()
    # Average over valid action positions only, then over the batch
    return ((loss * action_mask).sum(-1) / action_mask.sum(-1)).mean()
```

Value loss

Plain MSE against the GAE returns; passing `clip_eps` enables the PPO-style clipped value loss, which keeps the new value estimates from moving too far from the old ones in a single update.

```python
def compute_value_loss(values, old_values, returns, action_mask=None, clip_eps: float | None = None):
    """MSE value loss, optionally clipped around the old value estimates."""
    if clip_eps is not None:
        # Restrict the new values to within clip_eps of the old estimates
        values_clipped = old_values + (values - old_values).clamp(-clip_eps, clip_eps)
        surr1 = (values_clipped - returns) ** 2
        surr2 = (values - returns) ** 2
        # Element-wise maximum of clipped and unclipped losses (pessimistic bound)
        loss = torch.max(surr1, surr2)
    else:
        loss = (values - returns) ** 2
    if action_mask is None:
        return loss.mean(-1).mean()
    return ((loss * action_mask).sum(-1) / action_mask.sum(-1)).mean()
```
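A minimal smoke test for the three functions above, as a sketch: the shapes and values are illustrative assumptions (a single 5-step trajectory for `compute_gae`, and `(batch, seq_len)` tensors with an all-ones `action_mask` for the two losses), not part of the original.

```python
import torch

# Hypothetical single trajectory of T steps; reward only at the final step
T = 5
rewards = torch.tensor([0.0, 0.0, 0.0, 0.0, 1.0])
values = torch.randn(T)
next_values = torch.cat([values[1:], torch.zeros(1)])  # bootstrap; last step has no successor
dones = torch.tensor([0.0, 0.0, 0.0, 0.0, 1.0])

advantages, returns = compute_gae(rewards, values, dones, next_values)
print(advantages.shape, returns.shape)  # torch.Size([5]) torch.Size([5])

# Batched (batch, seq_len) inputs for the losses; the mask marks valid action tokens
B = 2
log_probs = torch.randn(B, T)
old_log_probs = log_probs + 0.01 * torch.randn(B, T)  # slightly stale policy
action_mask = torch.ones(B, T)

pg_loss = compute_policy_loss(log_probs, old_log_probs, advantages.expand(B, T), action_mask)
v_loss = compute_value_loss(torch.randn(B, T), torch.randn(B, T),
                            returns.expand(B, T), action_mask, clip_eps=0.2)
print(pg_loss.item(), v_loss.item())
```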