我试图修改tf.GPUOptions()中的per_process_gpu_memory_fraction值,
然后用set_session()更换Keras会话,然而显存占用比例从未真正改变。
在第一次运行while循环后,如nvidia-smi所示,有319MB显存被保留,这部分显存:
a)在调用clear_session()时永远不会被释放,并且
b)不会在while循环的下一次迭代中增加。
import GPUtil
import time
import tensorflow as tf
import numpy as np
from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
def model_trainer():
    """Train a small dense Keras model, growing the GPU memory fraction on failure.

    Starts with a deliberately tiny ``per_process_gpu_memory_fraction`` and,
    each time training fails with ``ResourceExhaustedError`` or
    ``UnknownError``, raises the fraction by 0.05 and tries again with a
    fresh session and model. The loop ends after the first successful
    ``fit`` + ``predict``.

    NOTE(review): the surrounding write-up reports that TensorFlow does not
    hand VRAM back within a live process, so an in-process retry may not
    actually obtain a larger allocation — confirm before relying on it.

    Raises:
        ValueError: when the requested fraction reaches the cap of 90% of
            total GPU memory without a successful run.
    """
    total_ram = GPUtil.getGPUs()[0].memoryTotal
    total_ram_allowed = total_ram * 0.90
    mem_amount = 0.005  # intentionally allocated a small amount so it needs to
                        # increment the mem_amount

    # np.empty does not initialise its contents, so the feature values are arbitrary.
    x_train = np.empty((10000, 100))
    # randint's upper bound is exclusive, so the labels drawn here are 0..8.
    y_train = np.random.randint(0, 9, size=10000)
    y_train = to_categorical(y_train, 10)

    y_pred = None
    while y_pred is None:
        print("mem", mem_amount)
        requested_ram = total_ram * mem_amount

        # Fail loudly once the cap is reached. Previously the size check lived
        # inside a branch that could only run below the cap, so it never fired
        # and the loop spun forever.
        if requested_ram >= total_ram_allowed:
            raise ValueError('model too large for vram')

        # Not enough free VRAM right now: reset Keras state and wait instead of
        # busy-looping on the GPU query.
        if GPUtil.getGPUs()[0].memoryFree <= requested_ram:
            clear_session()
            time.sleep(10)
            continue

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
        config = tf.ConfigProto(
            intra_op_parallelism_threads=2,
            inter_op_parallelism_threads=2,
            gpu_options=gpu_options)
        sess = tf.Session(config=config)
        set_session(sess)

        model = Sequential()
        model.add(Dense(units=64, activation='relu', input_dim=100))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=10, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='sgd',
                      metrics=['accuracy'])

        try:
            print(sess)
            model.fit(x_train, y_train, epochs=5, batch_size=32)
            y_pred = model.predict(x_train)
        except (ResourceExhaustedError, UnknownError):
            # Ask for a larger fraction on the next iteration.
            mem_amount += 0.05
        finally:
            # Close the session explicitly so it is not leaked across retries,
            # then reset Keras' global state.
            sess.close()
            clear_session()
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    model_trainer()
令人费解的是,Keras愿意接受新的会话(如get_session()调用所示),但不会应用新的GPUOptions。
除了上面的例子我还尝试过:
# Attempt 1 (per the surrounding text): clear the Keras session, then drop
# the reference to the model.
clear_session()
del model
# Attempt 2 (presumably a separate try): the same two steps followed by an
# explicit garbage-collection pass. NOTE: `gc` must be imported for this.
clear_session()
del model
gc.collect()
这些方法都没有释放VRAM。
我的总体目标是使用“试验和错误”,直到该过程有足够的VRAM进行训练,因为似乎没有好的方法来确定Keras模型需要多少VRAM而不仅仅运行它,这样我就可以在单个GPU上并行运行多个模型。当ResourceExhaustedError
发生时,我想释放由Keras持有的VRAM,然后再尝试使用更多的VRAM。有没有办法实现这个目标?
在搜索了一段时间之后,我发现Tensorflow只会不断占用VRAM,并且在进程结束之前永远不会释放它,即使使用del model、clear_session()也是如此。我也试过这里显示的方法(https://github.com/keras-team/keras/issues/9379),它使用:
# Workaround taken from the keras-team/keras issue linked in the text above:
# clear the Keras session first ...
from keras import backend as K
K.clear_session()
# ... then select GPU 0 through numba and close its CUDA context.
# NOTE: per the surrounding text, TensorFlow fails on its next GPU access in
# the same process after this call.
from numba import cuda
cuda.select_device(0)
cuda.close()
这导致了一个错误,因为当Tensorflow再次尝试访问GPU时,它指向内存空间的指针无效(因为它被cuda.close()杀死)。因此,解决它的唯一方法是使用进程,而不是线程(也尝试过,与以前相同的问题)。
我发现的另一件事是,虽然有一些方法可以尝试估计Keras模型将使用的VRAM数量,但这些方法并不十分准确。(参见:How to determine needed memory of Keras model?)我也尝试直接从Keras层进行计算,结果变化很大,因此同样不准确。所以实际上只能通过捕获ResourceExhaustedError并重试的方式来进行试错。
下面是我在单个GPU上运行多个不同Keras模型的代码。
import GPUtil
import time
import multiprocessing
import tensorflow as tf
import numpy as np
from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
def _worker(mem_amount, return_dict, x_train, y_train):
    """Train the model in a child process with a fixed GPU memory fraction.

    Defined at module level (not nested) so it can be used as a
    ``multiprocessing.Process`` target under any start method.

    Args:
        mem_amount: value passed as ``per_process_gpu_memory_fraction``.
        return_dict: shared manager dict; ``return_dict["valid"]`` is set to
            ``True`` only if training finishes.
        x_train: training features passed to ``model.fit``.
        y_train: one-hot training labels passed to ``model.fit``.
    """
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
    config = tf.ConfigProto(
        intra_op_parallelism_threads=2,
        inter_op_parallelism_threads=2,
        gpu_options=gpu_options)
    sess = tf.Session(config=config)
    set_session(sess)

    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=100))
    model.add(Dense(units=1024, activation='relu'))
    model.add(Dense(units=1024, activation='relu'))
    model.add(Dense(units=2048, activation='relu'))
    model.add(Dense(units=10, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

    try:
        get_session()
        model.fit(x_train, y_train, epochs=5, batch_size=1000)
        return_dict["valid"] = True
    except (ResourceExhaustedError, UnknownError):
        # Leave "valid" unset so the parent retries with a larger fraction.
        return


def model_trainer():
    """Find a workable GPU memory fraction by training in throw-away processes.

    Each attempt runs ``_worker`` in its own ``multiprocessing.Process``. If
    the child does not report success through the shared dict, the fraction
    is raised by 0.05 and a new child is started. If the GPU currently has
    too little free memory for the requested fraction, the loop sleeps for
    10 seconds and checks again.

    Raises:
        ValueError: when the requested fraction reaches the cap of 90% of
            total GPU memory without a successful run.
    """
    mem_amount = 0.05
    # np.empty does not initialise its contents, so the feature values are arbitrary.
    x_train = np.empty((100000, 100))
    # randint's upper bound is exclusive, so the labels drawn here are 0..8.
    y_train = np.random.randint(0, 9, size=100000)
    y_train = to_categorical(y_train, 10)

    total_ram = GPUtil.getGPUs()[0].memoryTotal
    total_ram_allowed = total_ram * 0.90

    # The context manager shuts the manager's server process down on exit,
    # including when ValueError is raised.
    with multiprocessing.Manager() as manager:
        return_dict = manager.dict()

        while "valid" not in return_dict:
            print("mem", mem_amount)
            requested_ram = total_ram * mem_amount

            # Fail loudly once the cap is reached. Previously the size check
            # sat inside a branch that could only run below the cap, so the
            # loop slept forever instead of raising.
            if requested_ram >= total_ram_allowed:
                raise ValueError('model too large for vram')

            # Not enough free VRAM at the moment: wait and re-check.
            if GPUtil.getGPUs()[0].memoryFree <= requested_ram:
                time.sleep(10)
                continue

            # can add in a for loop to have multiple models
            p = multiprocessing.Process(
                target=_worker,
                args=(mem_amount, return_dict, x_train, y_train))
            p.start()
            p.join()
            print(return_dict.values())

            if "valid" not in return_dict:
                mem_amount += 0.05
# Run the experiment only when this file is executed as a script. Under the
# spawn start method child processes re-import this module, and the guard
# keeps them from re-running the experiment.
if __name__ == "__main__":
    model_trainer()