我试图不在 for 循环中使用 model.predict() 或 model.fit() 来加速训练,所以我试图对我的案例实施 this solution 但我得到一个错误。该模型有三个输入。
这是我的代码:
n_possible_movements = 9
MINIBATCH_SIZE = 32
class DQNAgent(object):
def __init__(self):
self.epsilon = 1.0
self.epsilon_decay = 0.8
self.epsilon_min = 0.1
self.learning_rate = 10e-4
self.tau = 1e-3
# Main models
self.model_uav_pos = self._build_pos_model()
# Target networks
self.target_model_uav_pos = self._build_pos_model()
# Copy weights
self.target_model_uav_pos.set_weights(self.model_uav_pos.get_weights())
# An array with last n steps for training
self.replay_memory_pos_nn = deque(maxlen=REPLAY_MEMORY_SIZE)
def _build_pos_model(self): # compile the DNN
# create the DNN model
dnn = self.create_pos_dnn()
opt = Adam(learning_rate=self.learning_rate) #, decay=self.epsilon_decay)
dnn.compile(loss="mse", optimizer=opt)
return dnn
def create_pos_dnn(self):
# initialize the input shape
pos_input_shape = (2,)
requests_input_shape = (len(env.ues),)
number_of_satisfied_ues_input_shape = (1,)
# How many possible outputs we can have
output_nodes = n_possible_movements
# Initialize the inputs
uav_current_position = Input(shape=pos_input_shape, name='pos')
ues_requests = Input(shape=requests_input_shape, name='requests')
number_of_satisfied_ues = Input(shape=number_of_satisfied_ues_input_shape, name='number_of_satisfied_ues')
# Put them in a list
list_inputs = [uav_current_position, ues_requests, number_of_satisfied_ues]
# Merge all input features into a single large vector
x = layers.concatenate(list_inputs)
# Add a 1st Hidden (Dense) Layer
dense_layer_1 = Dense(512, activation="relu")(x)
# Add a 2nd Hidden (Dense) Layer
dense_layer_2 = Dense(512, activation="relu")(dense_layer_1)
# Add a 3rd Hidden (Dense) Layer
dense_layer_3 = Dense(256, activation="relu")(dense_layer_2)
# Output layer
output_layer = Dense(output_nodes, activation="linear")(dense_layer_3)
model = Model(inputs=list_inputs, outputs=output_layer)
# return the DNN
return model
def remember_pos_nn(self, state, action, reward, next_state, done):
self.replay_memory_pos_nn.append((state, action, reward, next_state, done)) # list of previous experiences, enabling re-training later
def act_upon_choosing_a_new_position(self, state): # state is a tuple (uav_position, requests_array, number_satisfaction)
if np.random.rand() <= self.epsilon: # if acting randomly, take random action
return random.randrange(n_possible_movements)
pos = np.array([state[0]])
reqs = np.array([state[1]])
number_satisfaction = np.array([state[2]])
act_values = self.model_uav_pos.predict([pos, reqs, number_satisfaction]) # if not acting randomly, predict reward value based on current state
return np.argmax(act_values[0])
def target_train(self):
weights = self.model_uav_pos.get_weights()
target_weights = self.target_model_uav_pos.get_weights()
for i in range(len(target_weights)):
target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
self.target_model_uav_pos.set_weights(target_weights)
这是我介绍链接中建议的更改之前的训练功能
def train_pos_nn(self):
print("In Training..")
# Start training only if certain number of samples is already saved
if len(self.replay_memory_pos_nn) < MIN_REPLAY_MEMORY_SIZE:
print("Exiting Training: Replay Memory Not Full Enough...")
return
# Get a minibatch of random samples from memory replay table
list_memory = list(self.replay_memory_pos_nn)
random.shuffle(list_memory)
minibatch = random.sample(list_memory, MINIBATCH_SIZE)
start_time = time.time()
# Enumerate our batches
for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
print('...Starting Training...')
target = 0
pos = np.array([current_state[0]])
reqs = np.array([current_state[1]])
number_satisfaction = np.array([current_state[2]])
pos_next = np.array([new_current_state[0]])
reqs_next = np.array([new_current_state[1]])
number_satisfaction_next = np.array([new_current_state[2]])
# If not a terminal state, get new q from future states, otherwise set it to 0
# almost like with Q Learning, but we use just part of equation here
if not done:
print("Predict Next State")
target = reward + DISCOUNT * np.amax(self.target_model_uav_pos.predict([pos_next, reqs_next, number_satisfaction_next]))
else:
target = reward
# Update Q value for given state
print("Predict State")
target_f = self.model_uav_pos.predict([pos, reqs, number_satisfaction])
target_f = np.array(target_f)
target_f[0][action] = target
self.model_uav_pos.fit([pos, reqs, number_satisfaction], \
target_f, \
verbose=2, \
shuffle=False, \
callbacks=None, \
epochs=1 \
)
end_time = time.time()
print("Time", end_time - start_time)
# Update target network counter every episode
self.target_train()
这是我引入变化后的训练功能:
def train_pos_nn(self):
print("In Training..")
# Start training only if certain number of samples is already saved
if len(self.replay_memory_pos_nn) < MIN_REPLAY_MEMORY_SIZE:
print("Exiting Training: Replay Memory Not Full Enough...")
return
# Get a minibatch of random samples from memory replay table
list_memory = list(self.replay_memory_pos_nn)
random.shuffle(list_memory)
minibatch = random.sample(list_memory, MINIBATCH_SIZE)
# Draw a sample
samples = random.sample(list_memory, MINIBATCH_SIZE)
start_time = time.time()
# Prepare the batch
state, action, reward, new_state, done = zip(*samples)
nstate = []
cstate = []
start_time_2 = time.time()
for n_state in new_state:
pos_next = np.array([n_state[0]])
reqs_next = np.array([n_state[1]])
number_satisfaction_next = np.array([n_state[2]])
nstate.append([pos_next,reqs_next,number_satisfaction_next])
for curr_state in state:
pos = np.array([curr_state[0]])
reqs = np.array([curr_state[1]])
number_satisfaction = np.array([curr_state[2]])
cstate.append([pos,reqs,number_satisfaction])
end_time_2 = time.time()
print("Time 2", end_time_2 - start_time_2)
#next_state = np.concatenate(new_state)
#next_state = np.concatenate(nstate)
#print("next_state", nstate[0], "len", len(nstate))#np.asarray(nstate).shape)# np.shape(nstate))
done = np.array(done)[:,None]
state = np.concatenate(state)
reward = np.array(reward)[:,None]
q_future = self.target_model_uav_pos.predict(nstate)#np.vstack(nstate))
targets = reward + self.gamma*np.max(q_future, axis=1, keepdims=True)
# Fit the model
self.model.fit(cstate, targets, epochs=1, verbose=2)
end_time = time.time()
print("Time", end_time - start_time)
self.target_train()
这一行
q_future = self.target_model_uav_pos.predict(nstate)
抛出错误ValueError: Layer "model_69" expects 3 input(s), but it received 96 input tensors
(nstate中的32个示例中的每一个都有3个输入,当我使用predict_on_batch()时出现相同的错误)
我不知道如何正确地做。任何帮助将不胜感激。