我尝试用 Q-learning 算法解决车杆(CartPole)问题。然而,算法运行结束之后,q-table 与运行之前完全相同——在 Q-learning 的训练过程中,q-table 不是应该不断被更新吗?运行 Qlearning 函数后得到相同的 q-table,这是正常的吗?谢谢!
# %%
import numpy as np
import pandas as pd
import random
from pylab import plt, mpl
import time
import gym
# %%
#make cart pole env
env = gym.make('CartPole-v1')#, render_mode = 'human')
# %%
# make a Q table for Q learning
def Qtable(statenum, actionnum, binsize=35):
bins =[
np.linspace(-4.8, 4.8, binsize), # cart position
np.linspace(-5, 5, binsize), # cart velocity
np.linspace(-.418, .418, binsize), # pole angle
np.linspace(-4, 4, binsize) # pole velocity
]
qtable = np.random.uniform(low=-1,high=1,size=([binsize] * statenum + [actionnum]))
return qtable, bins
# %%
def Discrete(state, bins):
index=[]
for i in range(len(state)):
index.append(np.digitize(state[i], bins[i])-1)
return tuple(index)
#return tuple(int(np.digitize(s, b)-1) for s, b in zip(state, bins))
# %%
# %%
q_table, bins = Qtable(len(env.observation_space.high), env.action_space.n)
# %%
q_table
# %%
def Qlearning(qtable, bins, episodes=2000, gamma=0.9, lr=0.2, timestep=2000, epsilon=0.2):
# Q learning
for episode in range(1,episodes+1):
step=0
state = env.reset()
current_state = Discrete(env.reset(),bins) # initial observation
score=0
done=False
while not done:
if random.uniform(0,1)<epsilon:
action = env.action_space.sample()
else:
action = np.argmax(qtable[current_state])
obs, reward, done, info = env.step(action)
next_state = Discrete(obs, bins)
step+=1
score+=reward
max_future_q = np.max(q_table[next_state])
current_q = q_table[current_state+(action,)]
new_q = (1-lr)*current_q + lr*(reward + gamma*max_future_q)
q_table[current_state+(action,)] = new_q
current_state = next_state
if episode%timestep==0:
#env.render()
print('episode: {}, score: {}, step: {}'.format(episode, score,step))
if score == 500:
print('episode: {}, score: {}, step: {}'.format(episode, score, step))
print(f'success! epsilon: {epsilon}, gamma: {gamma}, lr: {lr} in {step} steps')
break
# %%
Qlearning(q_table, bins, lr = 0.2, gamma = 0.995, episodes = 3000, timestep = 1)
# %%
q_table
# %%
Q表没有更新