Why does ep_rew_mean decrease over time?
In order to learn about reinforcement learning for optimization, I have written some code that tries to find the maximum cardinality matching in a graph. Not only does it not work well, but when I set it so that done = True
after 1000 steps, ep_rew_mean actually decreases over time. That is, it gets worse and worse. This is code that you should be able to just copy and paste:
import igraph as ig
from tqdm import tqdm
import numpy as np
import gym
from gym import spaces
from math import exp
import random
from sb3_contrib import TRPO
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.env_util import make_vec_env

class MaxMatchEnv(gym.Env):
    metadata = {'render.modes': ['console']}

    def __init__(self, array_length=10):
        super(MaxMatchEnv, self).__init__()
        # Size of the 1D-grid
        self.array_length = array_length
        self.agent_pos = [0]*array_length
        self.action_space = spaces.Discrete(array_length)
        self.observation_space = spaces.Box(low=0, high=1, shape=(array_length,), dtype=np.uint8)
        self.matching = set()  # set of edges
        self.matching_nodes = set()  # set of node ids (ints)
        self.matching_size = len([v for v in g_matching.matching if v < num_variables and v != -1])
        self.best_found = 0

    def reset(self):
        # Initialize the array to have random values
        self.time = 0
        #print(self.agent_pos)
        self.agent_pos = [0]*self.array_length
        self.matching = set()
        self.matching_nodes = set()
        return np.array(self.agent_pos)

    def step(self, action):
        self.time += 1
        #print("Current state:", self.agent_pos)
        #print("Edges included", self.matching_nodes)
        assert(2 * sum(self.agent_pos) == len(self.matching_nodes))  # should be twice as many nodes in the matching as edges
        reward = 0
        edge = g.es[action]
        if not (edge.source in self.matching_nodes or edge.target in self.matching_nodes):
            #print("Adding edge", action)
            self.matching.add(edge)
            self.matching_nodes.add(edge.source)
            self.matching_nodes.add(edge.target)
            self.agent_pos[action] = 1
            if sum(self.agent_pos) > self.best_found:
                self.best_found = sum(self.agent_pos)
                print(max(env.get_attr("best_found")), end=" ")
            reward = 1
        elif self.agent_pos[action] == 1:
            #print("Removing edge", action)
            self.matching_nodes.remove(edge.source)
            self.matching_nodes.remove(edge.target)
            self.matching.remove(edge)
            self.agent_pos[action] = 0
            reward = -1
        if self.time == 1000:
            done = True
        else:
            done = False
        # done = sum(self.agent_pos) == self.matching_size
        info = {}
        return np.array(self.agent_pos), reward, done, info

    def render(self, mode='console'):
        print(sum(self.agent_pos))

    def close(self):
        pass

if __name__ == '__main__':
    random.seed(7)
    num_variables = 1000
    g = ig.Graph.Random_Bipartite(num_variables, num_variables, p=3/num_variables)
    g_matching = g.maximum_bipartite_matching()
    print("Matching size", len([v for v in g_matching.matching if v < num_variables and v != -1]))
    env = make_vec_env(lambda: MaxMatchEnv(array_length=len(g.es)), n_envs=12)
    model = PPO('MlpPolicy', env, verbose=1).learn(10000000)
This is the output:
---------------------------------
| rollout/ | |
| ep_len_mean | 1e+03 |
| ep_rew_mean | 399 |
| time/ | |
| fps | 3071 |
| iterations | 1 |
| time_elapsed | 8 |
| total_timesteps | 24576 |
---------------------------------
----------------------------------------
| rollout/ | |
| ep_len_mean | 1e+03 |
| ep_rew_mean | 347 |
| time/ | |
| fps | 927 |
| iterations | 2 |
| time_elapsed | 52 |
| total_timesteps | 49152 |
| train/ | |
| approx_kl | 0.13847528 |
| clip_fraction | 0.646 |
| clip_range | 0.2 |
| entropy_loss | -7.19 |
| explained_variance | 0.00317 |
| learning_rate | 0.0003 |
| loss | 0.307 |
| n_updates | 10 |
| policy_gradient_loss | -0.125 |
| value_loss | 2.81 |
----------------------------------------
---------------------------------------
| rollout/ | |
| ep_len_mean | 1e+03 |
| ep_rew_mean | 306 |
| time/ | |
| fps | 740 |
| iterations | 3 |
| time_elapsed | 99 |
| total_timesteps | 73728 |
| train/ | |
| approx_kl | 0.1270247 |
| clip_fraction | 0.765 |
| clip_range | 0.2 |
| entropy_loss | -4.77 |
| explained_variance | 0.604 |
| learning_rate | 0.0003 |
| loss | 0.0528 |
| n_updates | 20 |
| policy_gradient_loss | -0.124 |
| value_loss | 1.16 |
---------------------------------------
----------------------------------------
| rollout/ | |
| ep_len_mean | 1e+03 |
| ep_rew_mean | 279 |
| time/ | |
| fps | 671 |
| iterations | 4 |
| time_elapsed | 146 |
| total_timesteps | 98304 |
| train/ | |
| approx_kl | 0.09246389 |
| clip_fraction | 0.69 |
| clip_range | 0.2 |
| entropy_loss | -4.13 |
| explained_variance | 0.875 |
| learning_rate | 0.0003 |
| loss | 0.0912 |
| n_updates | 30 |
| policy_gradient_loss | -0.104 |
| value_loss | 0.757 |
----------------------------------------
----------------------------------------
| rollout/ | |
| ep_len_mean | 1e+03 |
| ep_rew_mean | 227 |
| time/ | |
| fps | 631 |
| iterations | 5 |
| time_elapsed | 194 |
| total_timesteps | 122880 |
| train/ | |
| approx_kl | 0.07252013 |
| clip_fraction | 0.596 |
| clip_range | 0.2 |
| entropy_loss | -3.67 |
| explained_variance | 0.944 |
| learning_rate | 0.0003 |
| loss | 0.0491 |
| n_updates | 40 |
| policy_gradient_loss | -0.0927 |
| value_loss | 0.61 |
----------------------------------------
Why is this happening?
Solution 1:[1]
You can try adding:
if self.time == 1000:
    done = True
    self.reset()
Currently the MaxMatchEnv
is not reset, which might mean that getting rewards becomes harder in new simulations, as they are not comparable: the initial state differs. If this does solve the issue, you can also try increasing the time limit to find out whether giving the model more time in subsequent episodes helps.
EDIT: you also need to add self.best_found = 0
to your environment's reset
method.
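For illustration, here is a minimal sketch of how both suggestions could be wired in, written as a subclass so the original class stays untouched. The name MaxMatchEnvFixed is made up here, and the sketch assumes the MaxMatchEnv class from the question is already defined:

# Hypothetical subclass applying the two suggested changes to the
# MaxMatchEnv defined in the question.
class MaxMatchEnvFixed(MaxMatchEnv):
    def reset(self):
        obs = super().reset()
        self.best_found = 0   # EDIT from the answer: clear best_found each episode
        return obs

    def step(self, action):
        obs, reward, done, info = super().step(action)
        if done:
            obs = self.reset()   # suggested change: reset the state when the episode ends
        return obs, reward, done, info

It could then be used in place of MaxMatchEnv when building the vectorised environment, e.g. env = make_vec_env(lambda: MaxMatchEnvFixed(array_length=len(g.es)), n_envs=12).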
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source
---|---
Solution 1 | Stack Overflow