DQN doesn't learn
I'm trying to implement a DQN for the CartPole environment using PyTorch. No matter how long I train the agent, the scores generally increase but then just fluctuate without staying high. The code is adapted from a DQN tutorial written for TensorFlow, which runs fine, but after converting it to PyTorch the agent doesn't learn. Here's the model:
import random
from collections import deque
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    def __init__(self, state_size, action_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(state_size, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, action_size)

    def forward(self, inputs):
        x = torch.from_numpy(inputs)  # inputs arrive as NumPy arrays
        x = F.relu(self.fc1(x.float()))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
class DQNAgent(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQNAgent, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer
        self.model, self.criterion, self.optimizer = self.build_model()
        self.epsilon = 1.0           # exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.gamma = 0.95            # discount factor

    def build_model(self):
        model = Net(state_size, action_size)
        model = model.float()
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)  # might need to return criterion and optimizer
        return model, criterion, optimizer

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # epsilon-greedy action selection
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model(state)
        return np.argmax(act_values.detach().numpy())

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            if done:
                target = reward
            elif not done:
                target = reward + self.gamma*torch.max(self.model(next_state))  # --> a tensor
            target_f = self.model(state)
            target_f[0][action] = target
            # self.model.fit(state, target_f, epochs=1, verbose=0)
            loss = self.criterion(self.model(state), target_f)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon*self.epsilon_decay

    def load(self, name):
        pass

    def save(self, name):
        pass
... and the training loop:
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(5000):
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("Episode {}/{}, score: {}, e: {:.2}".format(e, n_episodes, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

memory = agent.memory
If anyone could give me any suggestions or advice, it would be greatly appreciated! I'm very confused at this point. Thank you!
Solution 1:[1]
To avoid instability in the model, you should convert the variables "state" and "next_state" into tensors before feeding them to the network. This is your code:
elif not done:
    target = reward + self.gamma*torch.max(self.model(next_state))
Before feeding the state data into the network, you should add this line:
next_state = torch.tensor(next_state)
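In context, that line would go inside replay right before next_state is passed to the network, roughly like this (this also assumes Net.forward is adjusted to accept tensors directly, i.e. the torch.from_numpy call is dropped, otherwise passing a tensor in would fail):

elif not done:
    next_state = torch.tensor(next_state)  # NumPy array -> tensor before the forward pass
    target = reward + self.gamma*torch.max(self.model(next_state))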
For the Q value evaluated from the current state, you can change your code to:
target_f = self.model(torch.tensor(state, requires_grad=True))
loss = self.criterion(self.model(torch.tensor(state, requires_grad=True)), target_f)
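For completeness, here is a minimal sketch of what the whole replay update might look like once both conversions are in place. It assumes Net.forward takes tensors directly (no torch.from_numpy), and it detaches the bootstrapped target, a common PyTorch idiom the answer doesn't spell out, so gradients only flow through the prediction. Treat it as an illustration under those assumptions, not the answer author's exact code:

def replay(self, batch_size):
    minibatch = random.sample(self.memory, batch_size)
    for state, action, reward, next_state, done in minibatch:
        state = torch.tensor(state, dtype=torch.float32)        # shape (1, state_size)
        next_state = torch.tensor(next_state, dtype=torch.float32)
        if done:
            target = reward
        else:
            # Bootstrap from the current network; detach so no gradient flows through the target
            target = reward + self.gamma * torch.max(self.model(next_state)).detach()
        prediction = self.model(state)                          # Q-values for all actions
        target_f = prediction.detach().clone()
        target_f[0][action] = target                            # only the taken action's target changes
        loss = self.criterion(prediction, target_f)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay

With the target detached, the MSE loss only pushes the predicted Q value of the taken action toward the TD target, which is what the original Keras fit(state, target_f) call was effectively doing.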
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow

| Solution | Source |
|---|---|
| Solution 1 | speedhawk1 |