当我的深度学习模型运行验证时,出现“CUDA 错误:内存不足”问题。虽然之前在训练阶段、前向和反向传播阶段——本来应该占用大量内存并保存很多梯度,但并没有出现“CUDA错误:内存不足”的状态。
我的模型有 21257650 个参数。在 GPU GTX 1070 8GB 内存上进行训练。
##############错误如下:###############
Traceback (most recent call last):
File "main.py", line 303, in <module>
main(args)
File "main.py", line 284, in main
val_loss, val_psnr = validation(args, epoch, writer)
File "main.py", line 144, in validation
for i, (images, gt_image) in loop:
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\tqdm\std.py", line 1182, in __iter__
for obj in iterable:
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\utils\data\dataloader.py", line 681, in __next__
data = self._next_data()
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\utils\data\dataloader.py", line 1376, in _next_data
return self._process_data(data)
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\utils\data\dataloader.py", line 1402, in _process_data
data.reraise()
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\_utils.py", line 461, in reraise
raise exception
RuntimeError: Caught RuntimeError in pin memory thread for device 0.
Original Traceback (most recent call last):
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 34, in _pin_memory_lo
op
data = pin_memory(data, device)
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 65, in pin_memory
return type(data)([pin_memory(sample, device) for sample in data]) # type: ignore[call-arg]
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 65, in <listcomp>
return type(data)([pin_memory(sample, device) for sample in data]) # type: ignore[call-arg]
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 65, in pin_memory
return type(data)([pin_memory(sample, device) for sample in data]) # type: ignore[call-arg]
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 65, in <listcomp>
return type(data)([pin_memory(sample, device) for sample in data]) # type: ignore[call-arg]
File "C:\Users\Anh\anaconda3\envs\Kienv1\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 50, in pin_memory
return data.pin_memory(device)
RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
这是我的验证代码:
def validation(args, epoch, writer):
    """Run one validation epoch and return the average loss and PSNR.

    Relies on module-level globals: model, criterion, device, val_loader,
    train_loader, optimizer, utils, tqdm, torch, time, os.

    Args:
        args: parsed CLI namespace (uses .loss, .out_counter, .log_iter,
            .save_path, .graph_folder).
        epoch: current epoch index (controls logging/CSV cadence).
        writer: TensorBoard SummaryWriter.

    Returns:
        Tuple (average total validation loss, average PSNR).
    """
    # Release cached blocks left over from training before allocating
    # validation batches. NOTE(review): this helps, but the traceback shows
    # the OOM is raised in the DataLoader pin-memory thread — consider
    # pin_memory=False on the validation DataLoader as well.
    torch.cuda.empty_cache()
    losses, psnrs, ssims = utils.init_meters(args.loss, reset_loss=True)
    model.eval()
    criterion.eval()
    args.out_counter = 0  # Reset the output images index
    start = time.time()
    # no_grad(): no autograd graph is recorded, so activations are freed
    # immediately and GPU memory stays low during the forward passes.
    with torch.no_grad():
        loop = tqdm(enumerate(val_loader), total=len(val_loader))
        for i, (images, gt_image) in loop:
            # Build input batch on the target device.
            images = [img_.to(device) for img_ in images]
            gt = [gt_img.to(device) for gt_img in gt_image]
            print("gt len: " + str(len(gt)))
            # Forward pass: out is a list of predicted frames.
            out = model(images)  # out = [framet1, framet2]
            if device.type == "cuda":
                print(f"GPU Memory Usage: {torch.cuda.memory_allocated() / 1024 ** 3:.2f} GB")
            # Accumulate as a plain Python float; .item() detaches the value
            # from the graph so the loss tensors can be garbage-collected.
            loss = 0.0
            loss_specific = []
            # Compute loss for each output frame.
            for output, target in zip(out, gt):
                single_loss, single_loss_specific = criterion(output, target)
                loss += single_loss.item()
                loss_specific.append(single_loss_specific)
            # Save per-term loss values.
            for k, v in losses.items():
                if k != 'total':
                    # NOTE(review): loss_specific is a list (one entry per
                    # output) indexed here by the string key k — confirm what
                    # criterion returns as single_loss_specific; as written
                    # this only works if that indexing is supported.
                    v.update(loss_specific[k].item())
            # BUG FIX: loss is already a float (accumulated via .item()
            # above), so the original loss.item() raised AttributeError.
            losses['total'].update(loss)
            # Calc metrics for each output frame.
            for output, target in zip(out, gt):
                utils.eval_metrics(output, target, psnrs, ssims)
            # TensorBoard logging every log_iter batches.
            if i % args.log_iter == 0:
                utils.log_tensorboard(writer, losses, psnrs.avg, ssims.avg.item(),
                                      optimizer.param_groups[0]['lr'],
                                      epoch * len(train_loader) + i, 'val')
            # Save result images
            # if epoch % 15 == 0:
            #     epoch_path = os.path.join(args.save_path, args.result_images_folder, 'Epoch_' + str(epoch))
            #     utils.save_batch_images(args, out, gt, epoch_path)
            # Update progress bar.
            loop.set_description("(Val)")
            loop.set_postfix(loss=loss)  # BUG FIX: loss is a float, no .item()
    # Wall-clock time for the whole validation pass.
    val_time_elapsed = time.time() - start
    # Persist validation metrics to CSV every other epoch.
    if epoch % 2 == 0:
        utils.save_metrics(args, os.path.join(args.save_path, args.graph_folder), epoch,
                           losses['total'].avg, psnrs.avg, ssims.avg.item(),
                           optimizer.param_groups[0]['lr'], val_time_elapsed, 'val')
    print('Validating Results: \tValidation Loss: {:.6f}\tVal Time: {:.2f}'
          '\tPSNR: {:.4f}\tSSIM: {:.3f}'.format(losses['total'].avg, val_time_elapsed,
                                                psnrs.avg, ssims.avg.item()))
    return losses['total'].avg, psnrs.avg
从错误来看,可能有另一个进程正在占用您的 GPU。运行验证时,请用 nvidia-smi 留意 GPU 内存的实际使用情况。
另外注意:回溯显示异常是在 DataLoader 的 pin-memory 线程中抛出的(pin_memory.py),因此也可以尝试在验证的 DataLoader 中设置 pin_memory=False。由于模型参数较多,训练结束后 GPU 缓存可能仍被占用,从而触发 OOM;可在训练结束、验证开始前调用 torch.cuda.empty_cache() 释放缓存。