Why does ep_rew_mean decrease over time?

In order to learn about reinforcement learning for optimization, I have written some code that tries to find a maximum cardinality matching in a graph. Not only does it not work well, but when I set it so that done = True after 1000 steps, ep_rew_mean actually decreases over time; that is, it gets worse and worse. Here is the code, which you should be able to just copy, paste, and run:

import igraph as ig
from tqdm import tqdm
import numpy as np
import gym
from gym import spaces
from math import exp
import random

from sb3_contrib import TRPO
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.env_util import make_vec_env

class MaxMatchEnv(gym.Env):
    metadata = {'render.modes': ['console']}
    def __init__(self, array_length=10):
        super(MaxMatchEnv, self).__init__()
        # One action per edge of the graph
        self.array_length = array_length
        self.agent_pos = [0]*array_length  # 1 if the corresponding edge is in the matching, 0 otherwise
        self.action_space = spaces.Discrete(array_length)
        self.observation_space = spaces.Box(low=0, high=1, shape=(array_length,), dtype=np.uint8)
        self.matching = set()  # set of edges
        self.matching_nodes = set() # set of node ids (ints)
        self.matching_size = len([v for v in g_matching.matching if v < num_variables and v != -1])
        self.best_found = 0
        
    def reset(self):
        # Start each episode with an empty matching
        self.time = 0
        #print(self.agent_pos)
        self.agent_pos = [0]*self.array_length
        self.matching = set()
        self.matching_nodes = set()
        return np.array(self.agent_pos)
    
        
    def step(self, action):
        self.time += 1

        #print("Current state:", self.agent_pos)
        #print("Edges included", self.matching_nodes)
        assert(2 * sum(self.agent_pos) == len(self.matching_nodes)) # should be twice as many nodes in the matching as edges

        reward = 0
        edge = g.es[action]
        if not(edge.source in self.matching_nodes or edge.target in self.matching_nodes):
            #print("Adding edge", action)
            self.matching.add(edge)
            self.matching_nodes.add(edge.source)
            self.matching_nodes.add(edge.target)
            self.agent_pos[action] = 1
            if sum(self.agent_pos) > self.best_found:
                self.best_found = sum(self.agent_pos)
                print(max(env.get_attr("best_found")), end=" ")
            reward = 1
        elif self.agent_pos[action] == 1:
            #print("Removing edge", action)
            self.matching_nodes.remove(edge.source)
            self.matching_nodes.remove(edge.target)
            self.matching.remove(edge)
            self.agent_pos[action] = 0
            reward = -1
        if self.time == 1000:
            done = True
        else:
            done = False
        # done = sum(self.agent_pos) == self.matching_size
        info = {}
        return np.array(self.agent_pos), reward, done, info

    def render(self, mode='console'):
        print(sum(self.agent_pos))

    def close(self):
        pass


if __name__ == '__main__':
    random.seed(7)
    num_variables = 1000
    g = ig.Graph.Random_Bipartite(num_variables, num_variables, p=3/num_variables)
    g_matching = g.maximum_bipartite_matching()
    print("Matching size", len([v for v in g_matching.matching if v < num_variables and v != -1]))

    env = make_vec_env(lambda: MaxMatchEnv(array_length=len(g.es)), n_envs=12)

    model = PPO('MlpPolicy', env, verbose=1).learn(10000000)

This is the output:

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 399      |
| time/              |          |
|    fps             | 3071     |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 24576    |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 347        |
| time/                   |            |
|    fps                  | 927        |
|    iterations           | 2          |
|    time_elapsed         | 52         |
|    total_timesteps      | 49152      |
| train/                  |            |
|    approx_kl            | 0.13847528 |
|    clip_fraction        | 0.646      |
|    clip_range           | 0.2        |
|    entropy_loss         | -7.19      |
|    explained_variance   | 0.00317    |
|    learning_rate        | 0.0003     |
|    loss                 | 0.307      |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.125     |
|    value_loss           | 2.81       |
----------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1e+03     |
|    ep_rew_mean          | 306       |
| time/                   |           |
|    fps                  | 740       |
|    iterations           | 3         |
|    time_elapsed         | 99        |
|    total_timesteps      | 73728     |
| train/                  |           |
|    approx_kl            | 0.1270247 |
|    clip_fraction        | 0.765     |
|    clip_range           | 0.2       |
|    entropy_loss         | -4.77     |
|    explained_variance   | 0.604     |
|    learning_rate        | 0.0003    |
|    loss                 | 0.0528    |
|    n_updates            | 20        |
|    policy_gradient_loss | -0.124    |
|    value_loss           | 1.16      |
---------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 279        |
| time/                   |            |
|    fps                  | 671        |
|    iterations           | 4          |
|    time_elapsed         | 146        |
|    total_timesteps      | 98304      |
| train/                  |            |
|    approx_kl            | 0.09246389 |
|    clip_fraction        | 0.69       |
|    clip_range           | 0.2        |
|    entropy_loss         | -4.13      |
|    explained_variance   | 0.875      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0912     |
|    n_updates            | 30         |
|    policy_gradient_loss | -0.104     |
|    value_loss           | 0.757      |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 227        |
| time/                   |            |
|    fps                  | 631        |
|    iterations           | 5          |
|    time_elapsed         | 194        |
|    total_timesteps      | 122880     |
| train/                  |            |
|    approx_kl            | 0.07252013 |
|    clip_fraction        | 0.596      |
|    clip_range           | 0.2        |
|    entropy_loss         | -3.67      |
|    explained_variance   | 0.944      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0491     |
|    n_updates            | 40         |
|    policy_gradient_loss | -0.0927    |
|    value_loss           | 0.61       |
----------------------------------------

Why is this happening?



Solution 1:

You can try adding a call to reset() when the episode ends:

if self.time == 1000:
    done = True
    self.reset()
        

Currently the MaxMatchEnv is never reset between episodes, which might mean that getting rewards becomes harder in later episodes, and it makes episodes hard to compare because their initial states differ.

If this does solve the issue, you can also try increasing the episode length to find out whether giving the model more time in subsequent episodes helps.
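For instance, here is a minimal sketch of how the episode length could be made configurable, assuming a new max_steps constructor argument (that name is mine, not part of the original code); the elided parts stay exactly as in the question:

class MaxMatchEnv(gym.Env):
    def __init__(self, array_length=10, max_steps=1000):
        super(MaxMatchEnv, self).__init__()
        self.max_steps = max_steps  # episode length you can experiment with
        # ... rest of __init__ as in the question ...

    def step(self, action):
        # ... edge adding/removing logic as in the question ...
        if self.time == self.max_steps:  # instead of the hard-coded 1000
            done = True
            self.reset()
        else:
            done = False
        info = {}
        return np.array(self.agent_pos), reward, done, info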

EDIT: you also need to add self.best_found = 0 to your environment's reset() method.
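So reset() would look something like this (the rest of the method unchanged from the question):

def reset(self):
    self.time = 0
    self.agent_pos = [0]*self.array_length
    self.matching = set()
    self.matching_nodes = set()
    self.best_found = 0  # otherwise the best value found carries over between episodes
    return np.array(self.agent_pos)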
