# 《从曾经到智能》CPT 强化学习完整实现(PyTorch 版 - Actor-Critic + CPT)
"""CPT reinforcement learning, complete implementation (PyTorch, Actor-Critic + CPT).

Project description
-------------------
This implementation folds Cumulative Prospect Theory (CPT) into an
Actor-Critic reinforcement-learning framework, so that the agent does not only
pursue expected return but also reflects human psychological traits such as
loss aversion, probability distortion and reference-point dependence.  It is a
practical path "from the past (traditional RL) to intelligence (behavioural
intelligence)".

Core features
-------------
* PyTorch implementation of Actor-Critic (A2C style).
* CPT value function used to reshape the advantage.
* Adaptive reference point.
* Continuous action spaces (LunarLanderContinuous-v2).
* Clear, extensible code structure.
"""

import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# The simulator and the plotting stack are only needed to run training and to
# draw the learning curve.  Guard them so that the CPT utilities and the
# network defined below stay importable on machines without them.
try:
    import gymnasium as gym
except ImportError:
    gym = None
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None


# 1. CPT module
class CumulativeProspectTheory:
    """Prospect-theory value function and trajectory-level advantage shaping.

    Attributes:
        alpha: Exponent applied to gains (concavity of the gain branch).
        beta: Exponent applied to losses (convexity of the loss branch).
        lambda_loss: Loss-aversion coefficient (about 2.25 by default).
        gamma_gain: Stored probability-weighting parameter for gains.
        gamma_loss: Stored probability-weighting parameter for losses.
        reference: Reference point; outcomes are valued relative to it and it
            may be adjusted dynamically by the caller.

    NOTE(review): ``gamma_gain`` and ``gamma_loss`` are stored but not read by
    any method of this class, so no probability weighting is applied here.
    """

    def __init__(self, alpha=0.88, beta=0.88, lambda_loss=2.25,
                 gamma_gain=0.61, gamma_loss=0.69, reference=0.0):
        self.alpha = alpha              # concavity for gains
        self.beta = beta                # convexity for losses
        self.lambda_loss = lambda_loss  # loss-aversion coefficient, ~2.25
        self.gamma_gain = gamma_gain
        self.gamma_loss = gamma_loss
        self.reference = reference      # reference point, can be adapted

    def value_function(self, x):
        """Return the prospect-theory value v(x) element-wise.

        ``v(x) = x ** alpha`` for ``x >= 0`` and
        ``v(x) = -lambda_loss * (-x) ** beta`` otherwise.

        Args:
            x: Tensor or array-like of outcomes already expressed relative to
                the reference point.

        Returns:
            A float32 tensor with the same shape as ``x``.
        """
        x = torch.as_tensor(x, dtype=torch.float32)
        is_gain = x >= 0
        result = torch.zeros_like(x)
        # Masked assignment keeps each power away from the other branch's
        # inputs, so a fractional power of a negative number is never taken.
        result[is_gain] = x[is_gain] ** self.alpha
        result[~is_gain] = -self.lambda_loss * (-x[~is_gain]) ** self.beta
        return result

    def compute_cpt_advantages(self, rewards, gamma=0.99):
        """Compute standardised CPT advantages for one whole trajectory.

        The discounted return of every step is taken relative to
        ``self.reference``, passed through :meth:`value_function` and then
        standardised to zero mean and unit standard deviation.

        Args:
            rewards: Sequence of per-step rewards in time order.
            gamma: Discount factor.

        Returns:
            A 1-D float32 tensor with one advantage per reward.  An empty
            input yields an empty tensor, and a single reward yields ``[0.]``
            (the sample standard deviation is undefined there and would
            otherwise turn the result into NaN).
        """
        rewards = list(rewards)
        if not rewards:
            return torch.zeros(0, dtype=torch.float32)

        # Accumulate back-to-front and reverse once; inserting at the head of
        # a list on every step would be quadratic.
        returns = []
        running = 0.0
        for reward in reversed(rewards):
            running = float(reward) + gamma * running
            returns.append(running)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)

        # Gains/losses relative to the reference point.  ``float`` guards
        # against a NumPy scalar having been stored as the reference.
        relative_returns = returns - float(self.reference)
        cpt_values = self.value_function(relative_returns)

        if cpt_values.numel() < 2:
            return torch.zeros_like(cpt_values)
        # Standardise.
        return (cpt_values - cpt_values.mean()) / (cpt_values.std() + 1e-8)


# 2. Actor-Critic network
class ActorCritic(nn.Module):
    """Shared-trunk network with a Gaussian policy head and a value head.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Size of the action vector.
        hidden_dim: Width of the two hidden layers of the shared trunk.
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_dim: int = 128):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        # Actor: parameters of the action distribution.  The log standard
        # deviation is a free parameter that does not depend on the state.
        self.actor_mean = nn.Linear(hidden_dim, action_dim)
        self.actor_logstd = nn.Parameter(torch.zeros(action_dim))
        # Critic: state value.
        self.critic = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return ``(mean, std, value)`` for a batch of states ``x``.

        ``mean`` and ``std`` have the shape of the actor output; ``value`` has
        a trailing dimension of size 1.
        """
        features = self.shared(x)
        mean = self.actor_mean(features)
        std = torch.exp(self.actor_logstd.expand_as(mean))
        value = self.critic(features)
        return mean, std, value
# 3. CPT agent
class CPTActorCriticAgent:
    """Actor-Critic agent whose learning signal is shaped by prospect theory.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Size of the action vector.
        lr: Adam learning rate.
        gamma: Discount factor used when building returns in :meth:`update`.
        device: Torch device; defaults to CUDA when available, else CPU.
    """

    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99, device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.cpt = CumulativeProspectTheory()
        self.model = ActorCritic(state_dim, action_dim).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        # Bounded, time-ordered buffer of (state, action, reward, next_state, done).
        self.memory = deque(maxlen=4096)

    def select_action(self, state):
        """Sample an action for a single state.

        Args:
            state: One observation (array-like of length ``state_dim``).

        Returns:
            A NumPy array of shape ``(action_dim,)`` clipped to ``[-1, 1]``.
        """
        state_t = torch.as_tensor(
            np.asarray(state), dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        # Acting needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            mean, std, _ = self.model(state_t)
            action = torch.distributions.Normal(mean, std).sample()
        # Clip to the range allowed by the environment.
        action = torch.clamp(action, -1.0, 1.0)
        # Drop only the batch axis so a 1-D action space still yields a 1-D array.
        return action.squeeze(0).cpu().numpy()

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    @staticmethod
    def _discounted_returns(rewards, dones, gamma):
        """Return per-step discounted returns, restarting at every ``done``."""
        returns = []
        running = 0.0
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                # Nothing after a terminal step belongs to this episode.
                running = 0.0
            running = float(reward) + gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def update(self, batch_size=256):
        """Run one gradient step on the most recent ``batch_size`` transitions.

        The transitions are taken as a contiguous, time-ordered window because
        discounted returns are only meaningful over rewards in the order they
        were received.  Returns restart at episode boundaries; the tail of the
        window is not bootstrapped.

        Args:
            batch_size: Number of most recent transitions to learn from.

        Returns:
            The scalar loss as a float, or ``0.0`` when fewer than
            ``batch_size`` transitions are stored.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return 0.0

        window = list(self.memory)[-batch_size:]
        states, actions, rewards, _next_states, dones = zip(*window)
        states = torch.as_tensor(
            np.array(states), dtype=torch.float32, device=self.device
        )
        actions = torch.as_tensor(
            np.array(actions), dtype=torch.float32, device=self.device
        )

        # CPT-shaped learning signal: value the return of every step relative
        # to the current reference point, then standardise.
        returns = torch.tensor(
            self._discounted_returns(rewards, dones, self.gamma),
            dtype=torch.float32,
            device=self.device,
        )
        cpt_values = torch.as_tensor(
            self.cpt.value_function(returns - float(self.cpt.reference)),
            dtype=torch.float32,
            device=self.device,
        )
        if cpt_values.numel() > 1:
            advantages = (cpt_values - cpt_values.mean()) / (cpt_values.std() + 1e-8)
        else:
            # The sample std of a single value is undefined (NaN).
            advantages = torch.zeros_like(cpt_values)
        advantages = advantages.detach()

        # Forward pass.
        means, stds, values = self.model(states)
        dist = torch.distributions.Normal(means, stds)
        log_probs = dist.log_prob(actions).sum(dim=-1)

        # Losses.  squeeze(-1) keeps the batch axis even for a batch of one.
        actor_loss = -(log_probs * advantages).mean()
        critic_loss = nn.functional.mse_loss(values.squeeze(-1), advantages)
        loss = actor_loss + 0.5 * critic_loss

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
        self.optimizer.step()
        return loss.item()

    def adapt_reference(self, recent_rewards, alpha=0.3):
        """Move the reference point to ``alpha`` times the mean recent reward.

        Args:
            recent_rewards: Sequence of recent episode rewards; ``None`` or an
                empty sequence leaves the reference point untouched.
            alpha: Scale applied to the mean.
        """
        if recent_rewards is None or len(recent_rewards) == 0:
            return
        # Store a plain float so later tensor arithmetic and formatting work.
        self.cpt.reference = float(np.mean(recent_rewards)) * alpha
# 4. Training function
def moving_average(values, window=50):
    """Return the trailing moving average of ``values``.

    Args:
        values: Sequence of numbers.
        window: Number of consecutive values averaged together.

    Returns:
        A float64 NumPy array of length ``len(values) - window + 1``, or an
        empty array when there are fewer than ``window`` values.
    """
    values = np.asarray(values, dtype=np.float64)
    if window <= 0 or values.size < window:
        return np.empty(0, dtype=np.float64)
    return np.convolve(values, np.ones(window) / window, mode="valid")


def _make_lunar_lander(gym):
    """Create the continuous Lunar Lander environment.

    NOTE(review): newer Gymnasium releases appear to register this task as
    ``-v3`` and reject ``-v2``; the newer id is tried first and the id used
    originally is the fallback.  Confirm against the installed version.
    """
    env_ids = ("LunarLanderContinuous-v3", "LunarLanderContinuous-v2")
    last_error = None
    for env_id in env_ids:
        try:
            return gym.make(env_id)
        except (gym.error.UnregisteredEnv, gym.error.DeprecatedEnv) as err:
            # Only "this id is not available" moves on to the next candidate;
            # e.g. a missing Box2D install is reported straight away.
            last_error = err
    raise last_error


def train(episodes=1200, save_interval=200):
    """Train a CPT Actor-Critic agent on continuous Lunar Lander.

    Args:
        episodes: Number of episodes to run.
        save_interval: Write a checkpoint ``cpt_actor_critic_ep<N>.pth`` every
            this many episodes; a falsy value disables periodic checkpoints.

    Returns:
        A list with the total reward of every episode.  The final weights are
        written to ``cpt_actor_critic_final.pth``.
    """
    # Imported here so the module can be imported without the simulator.
    import gymnasium as gym

    env = _make_lunar_lander(gym)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        agent = CPTActorCriticAgent(state_dim, action_dim)

        reward_history = []
        recent_rewards = deque(maxlen=50)
        print("开始训练 CPT-Actor-Critic 智能体...")

        for episode in range(episodes):
            state, _ = env.reset()
            done = False
            total_reward = 0.0

            while not done:
                action = agent.select_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                agent.store_transition(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

                # Periodic update once enough transitions are stored.
                if len(agent.memory) >= 256:
                    agent.update()

            recent_rewards.append(total_reward)
            reward_history.append(total_reward)

            # Adaptive reference point.
            if episode % 20 == 0:
                agent.adapt_reference(list(recent_rewards))

            if episode % 50 == 0 or episode == episodes - 1:
                avg_reward = np.mean(reward_history[-50:])
                print(
                    f"Episode {episode:4d} | "
                    f"Reward: {total_reward:7.1f} | "
                    f"Avg(50): {avg_reward:7.1f} | "
                    f"Ref Point: {agent.cpt.reference:.2f}"
                )

            if save_interval and (episode + 1) % save_interval == 0:
                torch.save(
                    agent.model.state_dict(),
                    f"cpt_actor_critic_ep{episode + 1}.pth",
                )
    finally:
        # Release the simulator even when training is interrupted.
        env.close()

    # Save the model.
    torch.save(agent.model.state_dict(), "cpt_actor_critic_final.pth")
    print("训练完成模型已保存。")
    return reward_history


if __name__ == "__main__":
    # Imported here so plotting is only required when run as a script.
    import matplotlib.pyplot as plt

    rewards = train(episodes=1500)

    # Plot the learning curve.
    smoothing_window = 50
    smoothed = moving_average(rewards, smoothing_window)
    plt.figure(figsize=(10, 6))
    plt.plot(rewards)
    # Each averaged point summarises the window that ends at that episode.
    plt.plot(np.arange(smoothing_window - 1, smoothing_window - 1 + len(smoothed)), smoothed)
    plt.title("CPT Actor-Critic Training Curve (LunarLanderContinuous)")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.legend(["Raw", "Moving Avg"])
    plt.grid(True)
    plt.show()

# How to run:
#     pip install "gymnasium[box2d]" torch matplotlib numpy
#     python cpt_rl.py
#
# Ideas for further work:
#   * switch to a PPO version
#   * use SAC (Soft Actor-Critic) + CPT
#   * support multi-agent or custom environments
#   * add TensorBoard visualisation
#   * run comparison experiments (traditional RL vs CPT-RL)
#
# From traditional RL to agents that incorporate human behavioural
# preferences: this is one practical step of "From the Past to Intelligence".