Switched to jupyter and got simple_v3 to run

2025-09-01 15:19:02 -06:00
parent 35e90ad016
commit 4a4734bdb8
5 changed files with 949 additions and 48 deletions
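In short, this commit adds the simple_v3 import alongside simple_reference_v3, switches the training and evaluation environment to simple_v3, trains only agent_0 (the agent1 code paths, the LR schedulers, and the GAE loop are commented out), and computes advantages in get_losses as discounted returns minus the critic's value predictions. The following is a minimal sketch of that returns-based loss computation, assuming per-step 1-D tensors; the function name a2c_losses, the default hyperparameters (taken from the new train(10000, 0.9, 0.03, ...) call), and the reshape calls are illustrative assumptions, not the exact A2C.get_losses signature.

import torch

def a2c_losses(rewards, masks, value_preds, action_log_probs, entropies,
               gamma=0.9, ent_coef=0.03):
    """Sketch of the returns-based A2C losses this commit enables in get_losses."""
    # Accumulate discounted returns backwards; mask == 0 stops bootstrapping at an episode end.
    returns, R = [], 0.0
    for r, m in zip(reversed(rewards), reversed(masks)):
        R = r + gamma * R * m
        returns.insert(0, R)
    returns = torch.tensor(returns, dtype=torch.float32)
    values = torch.stack(value_preds).reshape(-1)       # per-step critic values -> shape (T,)
    advantage = returns - values
    critic_loss = advantage.pow(2).mean()               # MSE between returns and value estimates
    log_probs = action_log_probs.reshape(-1)            # per-step log pi(a_t | s_t) -> shape (T,)
    actor_loss = (-log_probs * advantage.detach()).mean() \
        - ent_coef * torch.tensor(entropies).mean()     # entropy bonus to encourage exploration
    return critic_loss, actor_loss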

main.py (125 lines changed)

@@ -9,7 +9,7 @@ import matplotlib.pyplot as plt
from tqdm import tqdm
import gymnasium as gym
from pettingzoo.mpe import simple_reference_v3
from pettingzoo.mpe import simple_reference_v3,simple_v3
import pettingzoo
class A2C(nn.Module):
@@ -53,8 +53,8 @@ class A2C(nn.Module):
self.critic_optim = optim.Adam(self.critic.parameters(), lr=critic_lr)
self.actor_optim = optim.Adam(self.actor.parameters(), lr=actor_lr)
self.critic_scheduler = optim.lr_scheduler.StepLR(self.critic_optim, step_size=100, gamma=1)
self.actor_scheduler = optim.lr_scheduler.StepLR(self.actor_optim, step_size=100, gamma=1)
#self.critic_scheduler = optim.lr_scheduler.StepLR(self.critic_optim, step_size=100, gamma=0.9)
#self.actor_scheduler = optim.lr_scheduler.StepLR(self.actor_optim, step_size=100, gamma=0.9)
def forward(self, x: np.array) -> tuple[torch.tensor, torch.tensor]:
x = torch.Tensor(x).to(self.device)
@@ -89,32 +89,34 @@ class A2C(nn.Module):
ent_coef: float,
) -> tuple[torch.tensor, torch.tensor]:
T = len(rewards)
advantages = torch.zeros(T, device=self.device)
#T = len(rewards)
#rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-5)
#advantages = torch.zeros(T, device=self.device)
# compute the advantages using GAE
gae = 0.0
for t in reversed(range(T - 1)):
td_error = (
rewards[t] + gamma * masks[t] * value_preds[t+1] - value_preds[t]
)
gae = td_error + gamma * 0.95 * masks[t] * gae
advantages[t] = gae
#gae = 0.0
#for t in reversed(range(T - 1)):
# td_error = (
# rewards[t] + gamma * masks[t] * value_preds[t+1] - value_preds[t]
# )
# gae = td_error + gamma * 0.95 * masks[t] * gae
# advantages[t] = gae
#advantages = (advantages - advantages.mean()) / advantages.std()
# calculate the loss of the minibatch for actor and critic
critic_loss = advantages.pow(2).mean()
#critic_loss = advantages.pow(2).mean()
#give a bonus for higher entropy to encourage exploration
actor_loss = (
-(advantages.detach() * action_log_probs).mean() - ent_coef * torch.Tensor(entropy).mean()
)
#actor_loss = (
# -(advantages.detach() * action_log_probs).mean() - ent_coef * torch.Tensor(entropy).mean()
#)
#advantages = torch.zeros(len(rewards), device=self.device)
#compute advantages
#mask - 0 if end of episode
#gamma - coefficient for value prediction
#rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-5)
#for t in range(len(rewards) - 1):
#advantages[t] = (rewards[t] + masks[t] * gamma * (value_preds[t+1] - value_preds[t]))
#print(advantages[t])
@@ -122,25 +124,25 @@ class A2C(nn.Module):
#(rewards[t] + masks[t] * gamma * (value_preds[t+1] - value_preds[t]))
#rewards = np.array(rewards)
#rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-5)
#returns = []
#R = 0
#for r, mask in zip(reversed(rewards), reversed(masks)):
# R = r + gamma * R * mask
# returns.insert(0, R)
returns = []
R = 0
for r, mask in zip(reversed(rewards), reversed(masks)):
R = r + gamma * R * mask
returns.insert(0, R)
#returns = torch.FloatTensor(returns)
#values = torch.stack(value_preds).squeeze(1)
returns = torch.FloatTensor(returns)
values = torch.stack(value_preds).squeeze(1)
#advantage = returns - values
advantage = returns - values
#calculate critic loss - MSE
#critic_loss = advantages.pow(2).mean()
critic_loss = advantage.pow(2).mean()
#critic_loss = advantages.pow(2).mean()
#calculate actor loss - give bonus for entropy to encourage exploration
#actor_loss = -(advantages.detach() * action_log_probs).mean() - ent_coef * entropy.mean()
#entropy = -torch.stack(entropy).sum(dim=-1).mean()
#actor_loss = (-action_log_probs * advantages.detach()).mean() - ent_coef * torch.Tensor(entropy).mean()
actor_loss = (-action_log_probs * advantage.detach()).mean() - ent_coef * torch.Tensor(entropy).mean()
#print(action_log_probs)
#print(actor_loss)
return (critic_loss, actor_loss)
@@ -150,19 +152,18 @@ class A2C(nn.Module):
critic_loss.backward()
torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
self.critic_optim.step()
self.critic_scheduler.step()
#self.critic_scheduler.step()
self.actor_optim.zero_grad()
actor_loss.backward()
torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
self.actor_optim.step()
self.actor_scheduler.step()
#self.actor_scheduler.step()
def set_eval(self):
self.critic.eval()
self.actor.eval()
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15,5))
fig.suptitle(
f"training plots for the Simple Reference environment"
@@ -247,7 +248,8 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
agent1_rewards = []
agent0_entropy = []
agent1_entropy = []
env = simple_reference_v3.parallel_env(max_cycles = 50, render_mode="rgb_array")
#env = simple_reference_v3.parallel_env(max_cycles = 50, render_mode="rgb_array")
env = simple_v3.parallel_env(max_cycles = 50, render_mode="rgb_array")
#obs_space
#action_space
@@ -265,7 +267,7 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
#alice = A2C(n_features = env.observation_space("alice_0").shape[0], n_actions = env.action_space("alice_0").n, device = device, critic_lr = critic_lr, actor_lr = actor_lr)
agent0 = A2C(n_features = env.observation_space("agent_0").shape[0], n_actions = env.action_space("agent_0").n, device = device, critic_lr = critic_lr, actor_lr = actor_lr)
agent1 = A2C(n_features = env.observation_space("agent_1").shape[0], n_actions = env.action_space("agent_1").n, device = device, critic_lr = critic_lr, actor_lr = actor_lr)
#agent1 = A2C(n_features = env.observation_space("agent_1").shape[0], n_actions = env.action_space("agent_1").n, device = device, critic_lr = critic_lr, actor_lr = actor_lr)
#print(env.action_space("agent_0").n)
#print(env.observation_space("agent_0"))
@@ -293,9 +295,9 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
#actions["bob_0"] = bob_action.item()
#actions["alice_0"] = alice_action.item()
agent_0_action, agent_0_log_probs, agent_0_state_val, agent_0_ent = agent0.select_action(torch.FloatTensor(observations["agent_0"]).unsqueeze(0))
agent_1_action, agent_1_log_probs, agent_1_state_val, agent_1_ent = agent1.select_action(torch.FloatTensor(observations["agent_1"]).unsqueeze(0))
#agent_1_action, agent_1_log_probs, agent_1_state_val, agent_1_ent = agent1.select_action(torch.FloatTensor(observations["agent_1"]).unsqueeze(0))
actions["agent_0"] = agent_0_action
actions["agent_1"] = agent_1_action
#actions["agent_1"] = agent_1_action
observations, rewards, terminations, truncations, infos = env.step(actions)
#print(rewards)
agent_0_rewards.append(rewards["agent_0"])
@@ -304,11 +306,11 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
agent_0_ents.append(agent_0_ent.item())
agent_0_mask.append( 1 if env.agents else 0)
agent_1_rewards.append(rewards["agent_1"])
agent_1_probs.append(agent_1_log_probs)
agent_1_pred.append(agent_1_state_val)
agent_1_ents.append(agent_1_ent.item())
agent_1_mask.append( 1 if env.agents else 0)
#agent_1_rewards.append(rewards["agent_1"])
#agent_1_probs.append(agent_1_log_probs)
#agent_1_pred.append(agent_1_state_val)
#agent_1_ents.append(agent_1_ent.item())
#agent_1_mask.append( 1 if env.agents else 0)
#eve_closs, eve_aloss = eve.get_losses([rewards["eve_0"]], eve_log_probs, eve_state_val, eve_ent, [1], gamma, ent_coef)
#print("Eve: Critic Loss: " + str(eve_closs.item()) + " Actor Loss: " + str(eve_aloss.item()))
#eve.update_params(eve_closs, eve_aloss)
@@ -318,17 +320,17 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
agent0_actor_loss.append(agent_0_aloss.item())
#print("Agent 0 loss: Critic: " + str(agent_0_closs.item()) + ", Actor: " + str(agent_0_aloss.item()))
agent0.update_params(agent_0_closs, agent_0_aloss)
agent_1_closs, agent_1_aloss = agent1.get_losses(agent_1_rewards, torch.stack(agent_1_probs), agent_1_pred, agent_1_ents, agent_1_mask, gamma, ent_coef)
agent1_critic_loss.append(agent_1_closs.item())
agent1_actor_loss.append(agent_1_aloss.item())
#agent_1_closs, agent_1_aloss = agent1.get_losses(agent_1_rewards, torch.stack(agent_1_probs), agent_1_pred, agent_1_ents, agent_1_mask, gamma, ent_coef)
#agent1_critic_loss.append(agent_1_closs.item())
#agent1_actor_loss.append(agent_1_aloss.item())
#print("Agent 1 loss: Critic: " + str(agent_1_closs.item()) + ", Actor: " + str(agent_1_aloss.item()))
agent1.update_params(agent_1_closs, agent_1_aloss)
#agent1.update_params(agent_1_closs, agent_1_aloss)
agent0_rewards.append(np.array(agent_0_rewards).sum())
agent1_rewards.append(np.array(agent_1_rewards).sum())
#agent1_rewards.append(np.array(agent_1_rewards).sum())
#print(np.array(agent_0_ents).sum())
agent0_entropy.append(np.array(agent_0_ents).sum())
agent1_entropy.append(np.array(agent_1_ents).sum())
agent0_entropy.append(np.array(agent_0_ents).mean())
#agent1_entropy.append(np.array(agent_1_ents).sum())
@@ -337,13 +339,42 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
plt.savefig('plots(gamma=' + str(gamma) + ',ent=' + str(ent_coef) + ',alr=' + str(actor_lr) + ',clr=' + str(critic_lr) + ').png')
drawPlots()
plt.savefig('plots(gamma=' + str(gamma) + ',ent=' + str(ent_coef) + ',alr=' + str(actor_lr) + ',clr=' + str(critic_lr) + ').png')
actor0_weights_path = "weights/actor0_weights.h5"
critic0_weights_path = "weights/critic0_weights.h5"
actor1_weights_path = "weights/actor1_weights.h5"
critic1_weights_path = "weights/critic1_weights.h5"
if not os.path.exists("weights"):
os.mkdir("weights")
torch.save(agent0.actor.state_dict(), actor0_weights_path)
torch.save(agent0.critic.state_dict(), critic0_weights_path)
#torch.save(agent1.actor.state_dict(), actor1_weights_path)
#torch.save(agent1.critic.state_dict(), critic1_weights_path)
agent0.set_eval()
#agent1.set_eval()
#env = simple_reference_v3.parallel_env(render_mode="human")
env = simple_v3.parallel_env(render_mode="human")
while True:
observations, infos = env.reset()
while env.agents:
plt.pause(0.001)
actions = {}
agent_0_action, agent_0_log_probs, agent_0_state_val, agent_0_ent = agent0.select_action(torch.FloatTensor(observations["agent_0"]).unsqueeze(0))
#agent_1_action, agent_1_log_probs, agent_1_state_val, agent_1_ent = agent1.select_action(torch.FloatTensor(observations["agent_1"]).unsqueeze(0))
actions["agent_0"] = agent_0_action
#actions["agent_1"] = agent_1_action
observations, rewards, terminations, truncations, infos = env.step(actions)
env.close()
#environment hyperparams
n_episodes = 1000
train(10000, 0.999, 0.01, 0.0001, 0.0005)
train(10000, 0.9, 0.03, 0.001, 0.005)
best = 1
for gamma in np.arange(0.999, 0.99, -0.1):
for ent_coef in np.arange(0, 0.1, 0.01):