I am using the code below (adapted from https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/rl/ipynb/actor_critic_cartpole.ipynb) to try to calibrate two continuous variables. The variables are run through a dummy function, which stops once they get close enough to the target values. The problem I am running into is that the gradient is only applied to the second variable, while the first one never changes. What do I need to change so that both variables are updated according to their respective gradients?
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Hyperparameters (gamma, eps and max_steps_per_episode are used further down;
# values taken from the original notebook)
gamma = 0.99  # discount factor for past rewards
eps = np.finfo(np.float32).eps.item()  # smallest number such that 1.0 + eps != 1.0
max_steps_per_episode = 10000

num_inputs = 4
num_actions = 2
num_hidden = 128
inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="sigmoid")(common)
critic = layers.Dense(1)(common)
model = keras.Model(inputs=inputs, outputs=[action, critic])
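# Two-headed model: `action` yields two sigmoid outputs in [0, 1] (the two
# variables being calibrated) and `critic` yields a scalar value estimate.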
def gaussian_noise_layer(input_layer, std=1):
noise = tf.random.normal(shape=tf.shape(input_layer), mean=0.0, stddev=std, dtype=tf.float32)
return input_layer + noise
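# gaussian_noise_layer adds i.i.d. N(0, std**2) noise elementwise; with the
# default std=1 the noise is large relative to the sigmoid outputs in [0, 1]
# that are passed in below.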
def evaluate_estimation(X):
    # Dummy target function: add noise, scale the inputs into the variable
    # ranges, and compare against the target values.
    minB = np.array([0, 0])
    maxB = np.array([195, 60])
    correctVal = np.array([130, 20])
    X = minB + gaussian_noise_layer(X) * (maxB - minB)
    # Done once both scaled variables are within 1% of their targets
    done = max((abs(correctVal - X) / correctVal) * 100) < 1
    # New "state": absolute and relative errors for each variable
    rld_e = np.array([X[0] - correctVal[0], (X[0] - correctVal[0]) / correctVal[0],
                      X[1] - correctVal[1], (X[1] - correctVal[1]) / correctVal[1]])
    # Reward = sum of |1 / (error + eps)|, which grows as X approaches correctVal
    reward = sum(((1 / ((X - correctVal) + eps)) ** 2) ** 0.5)
    return rld_e, reward, done
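# Note: the noise is added to X *before* scaling, so the N(0, 1) noise is
# multiplied by (maxB - minB), a very large perturbation relative to the
# 1% tolerance used for `done`.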
optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_historys = [[], []]  # one probability history per output variable
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
while True: # Run until solved
state = np.zeros(num_inputs)
episode_reward = 0
with tf.GradientTape() as tape:
for timestep in range(1, max_steps_per_episode):
state = tf.convert_to_tensor(state)
state = tf.expand_dims(state, 0)
# Predict action probabilities and estimated future rewards
# from environment state
action_probs, critic_value = model(state)
critic_value_history.append(critic_value[0, 0])
            # Here `action` == `action_probs`: the sigmoid outputs are used directly
            # as the actions (no sampling step, so no tf.math.log as in the original
            # notebook -- is that the right thing to do?)
            action_probs_historys[0].append(action_probs[0, 0])
            action_probs_historys[1].append(action_probs[0, 1])
            # Run the (noisy) sigmoid outputs through the dummy target function
            state, reward, done = evaluate_estimation(action_probs[0])
            print(timestep, action_probs[0])
rewards_history.append(reward)
episode_reward += reward
if done:
print("correct value was found",action_probs[0])
                raise Exception  # hard stop for inspection (a break here would only end the inner loop)
# Update running reward to check condition for solving
running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward
print("Update running reward to check condition for solving",running_reward)
# Calculate expected value from rewards
# - At each timestep what was the total reward received after that timestep
# - Rewards in the past are discounted by multiplying them with gamma
# - These are the labels for our critic
returns = []
discounted_sum = 0
for r in rewards_history[::-1]:
discounted_sum = r + gamma * discounted_sum
returns.insert(0, discounted_sum)
# Normalize
returns = np.array(returns)
returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
returns = returns.tolist()
print("normalize",returns)
# Calculating loss values to update our network
        actor_losses = [[], []]  # one actor-loss list per output variable
critic_losses = []
for i, action_probs_history in enumerate( action_probs_historys):
history = zip(action_probs_history, critic_value_history, returns)
print("get hubert loss")
for log_prob, value, ret in history:
                # At this point in history, the critic estimated that we would get a
                # total reward = `value` in the future. We took an action with probability
                # `log_prob` (despite the name, a raw sigmoid output here, not a
                # log-probability) and ended up receiving a total reward = `ret`.
                # The actor must be updated so that it predicts an action that leads to
                # high rewards (compared to critic's estimate) with high probability.
diff = ret - value
actor_losses[i].append(-log_prob * diff) # actor loss
#print(actor_losses,-log_prob , diff)
                # The critic must be updated so that it predicts a better estimate of
                # the future rewards. Append the critic loss only once (i == 0), since
                # this outer loop runs once per action head and the critic is shared.
                if i == 0:
critic_losses.append(
huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
)
# Backpropagation
#print("backpopagation",np.sum(critic_losses),critic_losses)
#loss_value=[[],[]]
#for i, actor_losse in enumerate( actor_losses):
loss_value1 = sum(actor_losses[0]) + sum(critic_losses)
loss_value2 = sum(actor_losses[1]) + sum(critic_losses)
print("loss_value",[loss_value1,loss_value2])
grads = tape.gradient([loss_value1,loss_value2], model.trainable_variables)
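        # Note: when `target` is a list, tape.gradient sums the gradients of the
        # listed targets, so this is equivalent to differentiating
        # loss_value1 + loss_value2 (see the sketch at the end).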
print("grads",grads)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
# Clear the loss and reward history
print("clear")
        for h in action_probs_historys:  # clear both sublists, not just the last loop variable
            h.clear()
critic_value_history.clear()
rewards_history.clear()
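For reference, here is a minimal, self-contained sketch (toy values, unrelated to the model above) of how I understand tf.GradientTape to behave when it is given a list of targets, as in the grads = tape.gradient([loss_value1, loss_value2], ...) line above: as far as I can tell, the gradients of the listed targets are summed.

import tensorflow as tf

# Minimal sketch: passing a list of targets to tape.gradient sums their
# gradients, i.e. [loss1, loss2] behaves like the single target loss1 + loss2.
w = tf.Variable([1.0, 2.0])
with tf.GradientTape() as tape:
    loss1 = w[0] ** 2   # depends only on w[0]; d(loss1)/dw = [2.0, 0.0]
    loss2 = 3.0 * w[1]  # depends only on w[1]; d(loss2)/dw = [0.0, 3.0]
grads = tape.gradient([loss1, loss2], w)
print(grads)  # tf.Tensor([2. 3.], shape=(2,), dtype=float32)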