Switched to jupyter and got simple_v3 to run
.gitignore (vendored, 1 line changed)
@@ -1,2 +1,3 @@
 *.png
 *.h5
+*.pt
.ipynb_checkpoints/Test-checkpoint.ipynb (new file, 353 lines)
File diff suppressed because one or more lines are too long
.ipynb_checkpoints/test-checkpoint.py (new file, 156 lines)
@@ -0,0 +1,156 @@
import torch
from pettingzoo.mpe import simple_reference_v3, simple_v3
import numpy as np
from IPython.display import clear_output
from IPython.core.debugger import set_trace
import matplotlib.pyplot as plt


class Model(torch.nn.Module):
    def __init__(self, observation_space, action_space):
        super(Model, self).__init__()
        # shared feature extractor
        self.features = torch.nn.Sequential(
            torch.nn.Linear(observation_space, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 128),
            torch.nn.ReLU()
        )

        # state-value head
        self.critic = torch.nn.Sequential(
            torch.nn.Linear(128, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 1)
        )

        # policy head; outputs unnormalized action logits
        self.actor = torch.nn.Sequential(
            torch.nn.Linear(128, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, action_space)
        )

    def forward(self, x):
        x = self.features(x)
        value = self.critic(x)
        actions = self.actor(x)
        return value, actions

    def get_critic(self, x):
        x = self.features(x)
        return self.critic(x)

    def evaluate_action(self, state, action):
        value, actor_features = self.forward(state)
        # the actor head has no softmax, so treat its outputs as logits
        dist = torch.distributions.Categorical(logits=actor_features)
        log_probs = dist.log_prob(action).view(-1, 1)
        entropy = dist.entropy().mean()

        return value, log_probs, entropy

    def act(self, state):
        value, actor_features = self.forward(state)
        dist = torch.distributions.Categorical(logits=actor_features)

        chosen_action = dist.sample()
        return chosen_action.item()

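For reference, a minimal usage sketch of the Model class above. It is not part of the committed file; it assumes only the imports and class already defined in this checkpoint and pulls the layer sizes from the environment's own spaces.

sketch_env = simple_v3.parallel_env()
sketch_model = Model(sketch_env.observation_space("agent_0").shape[0],
                     sketch_env.action_space("agent_0").n)
dummy_obs = torch.zeros(1, sketch_env.observation_space("agent_0").shape[0])  # batch of one observation
value, logits = sketch_model(dummy_obs)  # value: (1, 1) state value, logits: (1, n_actions) unnormalized
action = sketch_model.act(dummy_obs)     # int action index sampled from Categorical(logits=logits)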
class Memory(object):
    def __init__(self):
        self.states, self.actions, self.true_values = [], [], []

    def push(self, state, action, true_value):
        self.states.append(state)
        self.actions.append(action)
        self.true_values.append(true_value)

    def pop_all(self):
        states = torch.stack(self.states)
        actions = torch.LongTensor(self.actions)
        true_values = torch.FloatTensor(self.true_values).unsqueeze(1)

        # clear the buffer after handing the batch back
        self.states, self.actions, self.true_values = [], [], []
        return states, actions, true_values

class Worker(object):
    def __init__(self):
        self.env = simple_v3.parallel_env()
        self.episode_reward = 0
        # parallel_env.reset() returns (observations, infos); keep the observation dict
        self.state = self.env.reset()[0]

    def get_batch(self):
        states, actions, rewards, dones = [], [], [], []
        for _ in range(batch_size):
            action = model.act(torch.Tensor(self.state["agent_0"]).unsqueeze(0))
            env_actions = {"agent_0": action}
            next_state, step_rewards, terminations, truncations, _ = self.env.step(env_actions)
            self.episode_reward += step_rewards["agent_0"]
            states.append(torch.Tensor(self.state["agent_0"]))
            actions.append(action)
            rewards.append(step_rewards["agent_0"])
            done = terminations["agent_0"] or truncations["agent_0"]
            dones.append(done)

            if done:
                self.state = self.env.reset()[0]
                data['episode_rewards'].append(self.episode_reward)
                self.episode_reward = 0
            else:
                self.state = next_state

        values = compute_true_values(states, rewards, dones).unsqueeze(1)
        return states, actions, values

def compute_true_values(states, rewards, dones):
    # bootstrapped discounted returns, computed backwards over the batch
    true_values = []
    rewards = torch.FloatTensor(rewards)
    dones = torch.FloatTensor(dones)
    states = torch.stack(states)

    # bootstrap from the critic unless the last step ended the episode
    if dones[-1] == True:
        next_value = rewards[-1]
    else:
        next_value = model.get_critic(states[-1].unsqueeze(0))

    true_values.append(next_value)
    for i in reversed(range(0, len(rewards) - 1)):
        if not dones[i]:
            next_value = rewards[i] + next_value * gamma
        else:
            next_value = rewards[i]
        true_values.append(next_value)

    true_values.reverse()
    return torch.FloatTensor(true_values)

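To make the backward recursion in compute_true_values concrete, a small worked example (not part of the committed file) with gamma = 0.9, rewards [1.0, 0.0, 2.0], no terminal step, and a critic estimate of 0.5 for the final state; all values are made up:

# target for step 2: 0.5                        (the bootstrap value itself; the final step's own reward
#                                                only enters when that step terminates the episode)
# target for step 1: 0.0 + 0.9 * 0.5  = 0.45
# target for step 0: 1.0 + 0.9 * 0.45 = 1.405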
def reflect(memory):
    # one A2C update from the transitions currently stored in memory
    states, actions, true_values = memory.pop_all()
    values, log_probs, entropy = model.evaluate_action(states, actions)

    advantages = true_values - values
    critic_loss = advantages.pow(2).mean()
    actor_loss = -(log_probs * advantages.detach()).mean()
    total_loss = (critic_coef * critic_loss) + actor_loss - (entropy_coef * entropy)

    optimizer.zero_grad()
    total_loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    return values.mean().item()


def plot(data, frame_idx):
    clear_output(True)
    plt.figure(figsize=(20, 5))
    if data['episode_rewards']:
        ax = plt.subplot(121)
        average_score = np.mean(data['episode_rewards'][-100:])
        plt.title(f"Frame: {frame_idx} - Average Score: {average_score}")
        plt.grid()
        plt.plot(data['episode_rewards'])
    if data['values']:
        ax = plt.subplot(122)
        average_value = np.mean(data['values'][-1000:])
        plt.title(f"Frame: {frame_idx} - Average Values: {average_value}")
        plt.plot(data['values'])
    plt.show()


env = simple_v3.parallel_env()
model = Model(env.observation_space("agent_0").shape[0], env.action_space("agent_0").n)
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate, eps=1e-5)
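The checkpoint stops at the optimizer line, so the module-level names it references (batch_size, gamma, learning_rate, critic_coef, entropy_coef, data) and the training loop itself are not defined in this file; in the notebook they would have to be defined before the model/optimizer cell runs. A minimal sketch of a driver that would tie the pieces above together; every value and name below is an assumption, not something taken from the commit:

batch_size = 64
gamma = 0.99
learning_rate = 7e-4
critic_coef = 0.5
entropy_coef = 0.01
data = {'episode_rewards': [], 'values': []}

memory = Memory()
worker = Worker()
for frame_idx in range(1, 1001):
    states, actions, true_values = worker.get_batch()
    for state, action, true_value in zip(states, actions, true_values):
        memory.push(state, action, true_value.item())  # store plain floats so pop_all() can rebuild a FloatTensor
    mean_value = reflect(memory)                       # one A2C update over the collected batch
    data['values'].append(mean_value)
    if frame_idx % 100 == 0:
        plot(data, frame_idx)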
Test.ipynb (new file, 360 lines)
File diff suppressed because one or more lines are too long
main.py (125 lines changed)
@@ -9,7 +9,7 @@ import matplotlib.pyplot as plt
 from tqdm import tqdm

 import gymnasium as gym
-from pettingzoo.mpe import simple_reference_v3
+from pettingzoo.mpe import simple_reference_v3, simple_v3
 import pettingzoo

 class A2C(nn.Module):
@@ -53,8 +53,8 @@ class A2C(nn.Module):
         self.critic_optim = optim.Adam(self.critic.parameters(), lr=critic_lr)
         self.actor_optim = optim.Adam(self.actor.parameters(), lr=actor_lr)

-        self.critic_scheduler = optim.lr_scheduler.StepLR(self.critic_optim, step_size=100, gamma=1)
-        self.actor_scheduler = optim.lr_scheduler.StepLR(self.actor_optim, step_size=100, gamma=1)
+        #self.critic_scheduler = optim.lr_scheduler.StepLR(self.critic_optim, step_size=100, gamma=0.9)
+        #self.actor_scheduler = optim.lr_scheduler.StepLR(self.actor_optim, step_size=100, gamma=0.9)

     def forward(self, x: np.array) -> tuple[torch.tensor, torch.tensor]:
         x = torch.Tensor(x).to(self.device)
@@ -89,32 +89,34 @@ class A2C(nn.Module):
         ent_coef: float,
     ) -> tuple[torch.tensor, torch.tensor]:

-        T = len(rewards)
-        advantages = torch.zeros(T, device=self.device)
+        #T = len(rewards)
+        #rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-5)
+        #advantages = torch.zeros(T, device=self.device)

         # compute the advantages using GAE
-        gae = 0.0
-        for t in reversed(range(T - 1)):
-            td_error = (
-                rewards[t] + gamma * masks[t] * value_preds[t+1] - value_preds[t]
-            )
-            gae = td_error + gamma * 0.95 * masks[t] * gae
-            advantages[t] = gae
+        #gae = 0.0
+        #for t in reversed(range(T - 1)):
+        # td_error = (
+        # rewards[t] + gamma * masks[t] * value_preds[t+1] - value_preds[t]
+        # )
+        # gae = td_error + gamma * 0.95 * masks[t] * gae
+        # advantages[t] = gae

+        #advantages = (advantages - advantages.mean()) / advantages.std()

         # calculate the loss of the minibatch for actor and critic
-        critic_loss = advantages.pow(2).mean()
+        #critic_loss = advantages.pow(2).mean()

         #give a bonus for higher entropy to encourage exploration
-        actor_loss = (
-            -(advantages.detach() * action_log_probs).mean() - ent_coef * torch.Tensor(entropy).mean()
-        )
+        #actor_loss = (
+        # -(advantages.detach() * action_log_probs).mean() - ent_coef * torch.Tensor(entropy).mean()
+        #)


         #advantages = torch.zeros(len(rewards), device=self.device)
         #compute advantages
         #mask - 0 if end of episode
         #gamma - coeffecient for value prediction
-        #rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-5)
+
         #for t in range(len(rewards) - 1):
         #advantages[t] = (rewards[t] + masks[t] * gamma * (value_preds[t+1] - value_preds[t]))
         #print(advantages[t])
@@ -122,25 +124,25 @@ class A2C(nn.Module):
         #(rewards[t] + masks[t] * gamma * (value_preds[t+1] - value_preds[t]))
         #rewards = np.array(rewards)
         #rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-5)
-        #returns = []
-        #R = 0
-        #for r, mask in zip(reversed(rewards), reversed(masks)):
-        # R = r + gamma * R * mask
-        # returns.insert(0, R)
+        returns = []
+        R = 0
+        for r, mask in zip(reversed(rewards), reversed(masks)):
+            R = r + gamma * R * mask
+            returns.insert(0, R)

-        #returns = torch.FloatTensor(returns)
-        #values = torch.stack(value_preds).squeeze(1)
+        returns = torch.FloatTensor(returns)
+        values = torch.stack(value_preds).squeeze(1)

-        #advantage = returns - values
+        advantage = returns - values


         #calculate critic loss - MSE
-        #critic_loss = advantages.pow(2).mean()
+        critic_loss = advantage.pow(2).mean()
         #critic_loss = advantages.pow(2).mean()
         #calculate actor loss - give bonus for entropy to encourage exploration
         #actor_loss = -(advantages.detach() * action_log_probs).mean() - ent_coef * entropy.mean()
         #entropy = -torch.stack(entropy).sum(dim=-1).mean()
-        #actor_loss = (-action_log_probs * advantages.detach()).mean() - ent_coef * torch.Tensor(entropy).mean()
+        actor_loss = (-action_log_probs * advantage.detach()).mean() - ent_coef * torch.Tensor(entropy).mean()
         #print(action_log_probs)
         #print(actor_loss)
         return (critic_loss, actor_loss)
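For context on the change above: this hunk disables the GAE path and enables a plain discounted-return estimate, with the advantage taken as return minus the critic's prediction. A standalone sketch of that return computation (not taken from the repository; the reward and mask values are made up):

import torch

gamma = 0.9
rewards = [1.0, 0.0, 2.0]  # hypothetical per-step rewards
masks = [1, 1, 0]          # 0 marks the step that ended the episode

returns = []
R = 0.0
for r, mask in zip(reversed(rewards), reversed(masks)):
    R = r + gamma * R * mask   # mask = 0 stops the return from leaking across the episode boundary
    returns.insert(0, R)

returns = torch.FloatTensor(returns)  # tensor([2.62, 1.80, 2.00]) for the values above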
@@ -150,19 +152,18 @@ class A2C(nn.Module):
         critic_loss.backward()
         torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
         self.critic_optim.step()
-        self.critic_scheduler.step()
+        #self.critic_scheduler.step()

         self.actor_optim.zero_grad()
         actor_loss.backward()
         torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
         self.actor_optim.step()
-        self.actor_scheduler.step()
+        #self.actor_scheduler.step()

     def set_eval(self):
         self.critic.eval()
         self.actor.eval()


 fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15,5))
 fig.suptitle(
     f"training plots for the Simple Reference environment"
@@ -247,7 +248,8 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
     agent1_rewards = []
     agent0_entropy = []
     agent1_entropy = []
-    env = simple_reference_v3.parallel_env(max_cycles = 50, render_mode="rgb_array")
+    #env = simple_reference_v3.parallel_env(max_cycles = 50, render_mode="rgb_array")
+    env = simple_v3.parallel_env(max_cycles = 50, render_mode="rgb_array")
     #obs_space
     #action_space

@@ -265,7 +267,7 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
     #alice = A2C(n_features = env.observation_space("alice_0").shape[0], n_actions = env.action_space("alice_0").n, device = device, critic_lr = critic_lr, actor_lr = actor_lr)

     agent0 = A2C(n_features = env.observation_space("agent_0").shape[0], n_actions = env.action_space("agent_0").n, device = device, critic_lr = critic_lr, actor_lr = actor_lr)
-    agent1 = A2C(n_features = env.observation_space("agent_1").shape[0], n_actions = env.action_space("agent_1").n, device = device, critic_lr = critic_lr, actor_lr = actor_lr)
+    #agent1 = A2C(n_features = env.observation_space("agent_1").shape[0], n_actions = env.action_space("agent_1").n, device = device, critic_lr = critic_lr, actor_lr = actor_lr)
     #print(env.action_space("agent_0").n)
     #print(env.observation_space("agent_0"))

@@ -293,9 +295,9 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
             #actions["bob_0"] = bob_action.item()
             #actions["alice_0"] = alice_action.item()
             agent_0_action, agent_0_log_probs, agent_0_state_val, agent_0_ent = agent0.select_action(torch.FloatTensor(observations["agent_0"]).unsqueeze(0))
-            agent_1_action, agent_1_log_probs, agent_1_state_val, agent_1_ent = agent1.select_action(torch.FloatTensor(observations["agent_1"]).unsqueeze(0))
+            #agent_1_action, agent_1_log_probs, agent_1_state_val, agent_1_ent = agent1.select_action(torch.FloatTensor(observations["agent_1"]).unsqueeze(0))
             actions["agent_0"] = agent_0_action
-            actions["agent_1"] = agent_1_action
+            #actions["agent_1"] = agent_1_action
             observations, rewards, terminations, truncations, infos = env.step(actions)
             #print(rewards)
             agent_0_rewards.append(rewards["agent_0"])
@@ -304,11 +306,11 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
             agent_0_ents.append(agent_0_ent.item())
             agent_0_mask.append( 1 if env.agents else 0)

-            agent_1_rewards.append(rewards["agent_1"])
-            agent_1_probs.append(agent_1_log_probs)
-            agent_1_pred.append(agent_1_state_val)
-            agent_1_ents.append(agent_1_ent.item())
-            agent_1_mask.append( 1 if env.agents else 0)
+            #agent_1_rewards.append(rewards["agent_1"])
+            #agent_1_probs.append(agent_1_log_probs)
+            #agent_1_pred.append(agent_1_state_val)
+            #agent_1_ents.append(agent_1_ent.item())
+            #agent_1_mask.append( 1 if env.agents else 0)
         #eve_closs, eve_aloss = eve.get_losses([rewards["eve_0"]], eve_log_probs, eve_state_val, eve_ent, [1], gamma, ent_coef)
         #print("Eve: Critic Loss: " + str(eve_closs.item()) + " Actor Loss: " + str(eve_aloss.item()))
         #eve.update_params(eve_closs, eve_aloss)
@@ -318,17 +320,17 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
         agent0_actor_loss.append(agent_0_aloss.item())
         #print("Agent 0 loss: Critic: " + str(agent_0_closs.item()) + ", Actor: " + str(agent_0_aloss.item()))
         agent0.update_params(agent_0_closs, agent_0_aloss)
-        agent_1_closs, agent_1_aloss = agent1.get_losses(agent_1_rewards, torch.stack(agent_1_probs), agent_1_pred, agent_1_ents, agent_1_mask, gamma, ent_coef)
-        agent1_critic_loss.append(agent_1_closs.item())
-        agent1_actor_loss.append(agent_1_aloss.item())
+        #agent_1_closs, agent_1_aloss = agent1.get_losses(agent_1_rewards, torch.stack(agent_1_probs), agent_1_pred, agent_1_ents, agent_1_mask, gamma, ent_coef)
+        #agent1_critic_loss.append(agent_1_closs.item())
+        #agent1_actor_loss.append(agent_1_aloss.item())
         #print("Agent 1 loss: Critic: " + str(agent_1_closs.item()) + ", Actor: " + str(agent_1_aloss.item()))
-        agent1.update_params(agent_1_closs, agent_1_aloss)
+        #agent1.update_params(agent_1_closs, agent_1_aloss)

         agent0_rewards.append(np.array(agent_0_rewards).sum())
-        agent1_rewards.append(np.array(agent_1_rewards).sum())
+        #agent1_rewards.append(np.array(agent_1_rewards).sum())
         #print(np.array(agent_0_ents).sum())
-        agent0_entropy.append(np.array(agent_0_ents).sum())
-        agent1_entropy.append(np.array(agent_1_ents).sum())
+        agent0_entropy.append(np.array(agent_0_ents).mean())
+        #agent1_entropy.append(np.array(agent_1_ents).sum())



@@ -337,13 +339,42 @@ def train(n_episodes, gamma, ent_coef, actor_lr, critic_lr):
     plt.savefig('plots(gamma=' + str(gamma) + ',ent=' + str(ent_coef) + ',alr=' + str(actor_lr) + ',clr=' + str(critic_lr) + ').png')
     drawPlots()
     plt.savefig('plots(gamma=' + str(gamma) + ',ent=' + str(ent_coef) + ',alr=' + str(actor_lr) + ',clr=' + str(critic_lr) + ').png')

+    actor0_weights_path = "weights/actor0_weights.h5"
+    critic0_weights_path = "weights/critic0_weights.h5"
+    actor1_weights_path = "weights/actor1_weights.h5"
+    critic1_weights_path = "weights/critic1_weights.h5"
+
+    if not os.path.exists("weights"):
+        os.mkdir("weights")
+
+    torch.save(agent0.actor.state_dict(), actor0_weights_path)
+    torch.save(agent0.critic.state_dict(), critic0_weights_path)
+    #torch.save(agent1.actor.state_dict(), actor1_weights_path)
+    #torch.save(agent1.critic.state_dict(), critic1_weights_path)
+
+    agent0.set_eval()
+    #agent1.set_eval()
+    #env = simple_reference_v3.parallel_env(render_mode="human")
+    env = simple_v3.parallel_env(render_mode="human")
+    while True:
+        observations, infos = env.reset()
+        while env.agents:
+            plt.pause(0.001)
+            actions = {}
+            agent_0_action, agent_0_log_probs, agent_0_state_val, agent_0_ent = agent0.select_action(torch.FloatTensor(observations["agent_0"]).unsqueeze(0))
+            #agent_1_action, agent_1_log_probs, agent_1_state_val, agent_1_ent = agent1.select_action(torch.FloatTensor(observations["agent_1"]).unsqueeze(0))
+            actions["agent_0"] = agent_0_action
+            #actions["agent_1"] = agent_1_action
+            observations, rewards, terminations, truncations, infos = env.step(actions)
+
     env.close()



 #environment hyperparams
 n_episodes = 1000
-train(10000, 0.999, 0.01, 0.0001, 0.0005)
+train(10000, 0.9, 0.03, 0.001, 0.005)
 best = 1
 for gamma in np.arange(0.999, 0.99, -0.1):
     for ent_coef in np.arange(0, 0.1, 0.01):
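The added block above saves agent0's actor and critic weights under weights/ before switching to human rendering. A small sketch (not part of the repository) of how those files could be loaded back later for evaluation only, assuming the A2C constructor accepts a plain device string; the learning rates simply mirror the values in the train() call above:

eval_env = simple_v3.parallel_env(render_mode="human")
agent0 = A2C(n_features=eval_env.observation_space("agent_0").shape[0],
             n_actions=eval_env.action_space("agent_0").n,
             device="cpu", critic_lr=0.005, actor_lr=0.001)
agent0.actor.load_state_dict(torch.load("weights/actor0_weights.h5"))
agent0.critic.load_state_dict(torch.load("weights/critic0_weights.h5"))
agent0.set_eval()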