How can I deal with a reinforcement learning problem when the episode length is infinite?

I am trying to create a custom PyEnvironment to make an agent learn the optimal hour at which to send a notification to users, based on the rewards received from clicks on the notifications sent over the previous 7 days.

Below is the code of NotifEnv class:

class NotifEnv(py_environment.PyEnvironment):
  """Custom TF-Agents environment for learning the best hour (0-23) to send
  a daily notification.

  The observation is a flat vector of the last `state_size` (action, reward)
  pairs, oldest first. Each step appends the newest pair and drops the oldest.
  """

  def __init__(self, duration, discount, state_size):
    """Build specs and initial state.

    Args:
      duration: number of steps (days) per episode before termination.
      discount: per-step discount factor passed to `ts.transition`.
      state_size: number of past (action, reward) pairs kept in the state.
    """
    # Action: hour of day at which the notification is sent.
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=23, name='action')

    # Observation: `state_size` (action, reward) pairs flattened.
    # NOTE(review): the spec bounds are [0, 1], but `_step` writes
    # `action + 1` (up to 24) and raw rewards into the state, so the stored
    # values can violate these bounds — consider normalizing (e.g.
    # (action + 1) / 24) or widening the spec.
    self._state_spec = array_spec.BoundedArraySpec(
            shape=(2*state_size, ), dtype=np.float32, minimum=0.0, maximum=1.0, name='state')

    self._discount_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.float32, minimum=0.0, maximum=1.0, name='discount')

    self.step_count = 0        # steps taken in the current episode
    self.duration = duration   # episode length in steps
    self.state_size = state_size
    self.discount = discount
    self._episode_ended = False
    # Zero-initialized history of (action, reward) pairs.
    self.state = np.zeros(2 * self.state_size, dtype=np.float32)

  def observation_spec(self):
    """Return state_spec."""
    return self._state_spec

  def action_spec(self):
    """Return action_spec."""
    return self._action_spec

  def _reset(self):
    """Clear history and step counter; return the initial time_step."""
    self.state = np.zeros(2 * self.state_size, dtype=np.float32)
    self._episode_ended = False
    self.step_count = 0
    return ts.restart(np.array(self.state, dtype=np.float32))

  def _step(self, action):
    """Apply `action`, shift the (action, reward) history, and return the
    next time_step (termination once `duration` steps have elapsed)."""
    if action < 0 or action > 23:
        raise ValueError('`action` should be between 0 and 23.')

    self.step_count += 1
    curr_reward = get_reward(action)

    # Shift history left by one (action, reward) pair and append the newest.
    # `action + 1` distinguishes hour 0 from the zero-padding of an empty slot.
    self.state[:-2] = self.state[2:]
    self.state[-2:] = [action + 1, curr_reward]

    # BUG FIX: was `duration` (undefined local) — must use `self.duration`.
    if self.step_count >= self.duration:
        # BUG FIX: was `self.episode_ended`, which silently created a new
        # attribute instead of setting the `self._episode_ended` flag.
        self._episode_ended = True
        self.step_count = 0
        return ts.termination(np.array(self.state, dtype=np.float32), reward=curr_reward)
    else:
        return ts.transition(np.array(self.state, dtype=np.float32), discount=self.discount, reward=curr_reward)

Used the below hyperparameters :

duration = 365 
discount = 0.99 
state_size = 7 

I am currently using 365 as the episode length (`duration`), and the state spec holds the last 7 days' (action, reward) pairs, where the action is the time of day at which the notification was sent.

What should be the correct duration of episode considering that the notifications will be sent regularly each day infinitely?



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source