我正在尝试将图像输入神经网络的数据集,但收到了下面的错误。我不知道原因是什么——数据集里的图像尺寸各不相同。我也尝试过更改批量大小和卷积核大小,但没有成功。
File "c:\Users\david\Desktop\cs_agent\main.py", line 49, in <module>
for i, data in enumerate(train_loader, 0):
File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 530, in __next__
data = self._next_data()
File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 570, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 52, in fetch
return self.collate_fn(data)
File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\collate.py", line 172, in default_collate
return [default_collate(samples) for samples in transposed] # Backwards compatibility.
File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\collate.py", line 172, in <listcomp>
return [default_collate(samples) for samples in transposed] # Backwards compatibility.
File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\collate.py", line 138, in default_collate
return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [3, 300, 535] at entry 0 and [3, 1080, 1920] at entry 23
这是我的主文件
import numpy as np
import matplotlib.pyplot as plt
import torch
import dataset
import os
from torch.utils.data import DataLoader
import torch.nn as nn
import torchvision
import check_device
import neural_network
import torch.optim as optim
EPS = 1.e-7
LR = 0.5
WEIGHT_DECAY = 0.5
batch_size = 50

# DATA LOADING ###################################################################################################################
# The images on disk have different resolutions (e.g. [3, 300, 535] vs
# [3, 1080, 1920]).  The default DataLoader collate_fn stacks samples with
# torch.stack, which requires equal shapes — that is exactly the RuntimeError
# in the traceback.  Resize every image to one fixed size (and scale the uint8
# values to floats, normalized with ImageNet statistics) so batching works.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
transform = torchvision.transforms.Compose([
    torchvision.transforms.Lambda(lambda img: img / 255.),  # uint8 [0, 255] -> float [0, 1]
    torchvision.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    torchvision.transforms.Resize((224, 224)),
])

test_dataset = dataset.csHeadBody(csv_file="images\\test_labels.csv", root_dir="images\\test", transform=transform)
train_dataset = dataset.csHeadBody(csv_file="images\\train_labels.csv", root_dir="images\\train", transform=transform)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# DATA LOADING ###################################################################################################################END

# NEURAL NET #####################################################################################################################################################
net = neural_network.Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# NEURAL NET END ######################################################################################

for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # data is an [inputs, labels] pair produced by csHeadBody.__getitem__
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')
这是我的数据集文件
class csHeadBody(Dataset):
    """Image-classification dataset backed by a CSV of (filename, label) rows.

    Each row of *csv_file* names an image file (relative to *root_dir*) in
    column 0 and its label in column 1.
    """

    def __init__(self, csv_file, root_dir, transform=None, target_transform=None):
        self.img_labels = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of rows in the CSV, one sample per row."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load and return the (image, label) pair for row *idx*."""
        img_path = os.path.join(self.root_dir, self.img_labels.iloc[idx, 0])
        image = read_image(img_path)
        label = self.img_labels.iloc[idx, 1]
        # Apply the optional per-sample transforms, if configured.
        if self.transform is not None:
            image = self.transform(image)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return image, label
这是我的神经网络架构
import torch.nn.functional as F
import torch.nn as nn
import torch
class Net(nn.Module):
    """Small CNN classifier: 3-channel 224x224 input, 10 output classes.

    BUG FIX: the original defined ``conv1 = nn.Conv2d(3, 535, 535)`` followed
    by ``conv2 = nn.Conv2d(6, 16, 5)`` — conv2 expected 6 input channels while
    conv1 produced 535, so ``forward`` could never run.  Channel counts must
    chain: conv1's out_channels == conv2's in_channels.  A 535x535 kernel is
    also larger than most inputs.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)      # 3 RGB channels in, 6 out, 5x5 kernel
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)     # in_channels matches conv1's out_channels
        # Spatial size for a 224x224 input:
        # 224 -conv1-> 220 -pool-> 110 -conv2-> 106 -pool-> 53
        self.fc1 = nn.Linear(16 * 53 * 53, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return class logits of shape (batch, 10) for input (batch, 3, 224, 224)."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
您需要调整卷积层和线性层的参数。Conv2d 的第一个参数是输入通道数(对于 conv1,标准 RGB 图像为 3),然后是输出通道数,然后是卷积核大小。为了澄清这一点,我在下面的代码中使用了命名参数。该代码适用于方形输入、大小为 224x224 像素的图像(标准 imagenet 尺寸,可根据需要调整)。如果您想要与图像大小无关的代码,可以使用类似全局平均池化的方法(对最后一个卷积层的每个通道取平均值)。下面的网络两种方式都支持:
class Net(nn.Module):
    """CNN classifier with two conv blocks and a configurable head.

    With ``use_global_average_pooling=True`` the head averages each of the 64
    final feature channels over the spatial dimensions, making the network
    input-size agnostic.  Otherwise a flattened fully-connected head is used,
    sized for 224x224 inputs.
    """

    def __init__(self, use_global_average_pooling: bool = False):
        super().__init__()
        self.use_global_average_pooling = use_global_average_pooling
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        if use_global_average_pooling:
            self.fc_gap = nn.Linear(64, 10)
        else:
            # 54 = final spatial side for 224x224 input; 64 = conv2 channels.
            self.fc_1 = nn.Linear(54 * 54 * 64, 84)
            self.fc_2 = nn.Linear(84, 10)

    def forward(self, x):
        """Return 10-class output for a (batch, 3, H, W) input."""
        feats = self.pool(F.relu(self.conv1(x)))      # side: (224 - 2) // 2 = 111
        feats = self.pool(F.relu(self.conv2(feats)))  # side: (111 - 2) // 2 = 54
        if self.use_global_average_pooling:
            # Global average pooling: mean over the spatial dimensions.
            pooled = feats.mean(dim=(-1, -2))
            return F.relu(self.fc_gap(pooled))
        # Fully-connected head: flatten everything except the batch dimension.
        flat = torch.flatten(feats, 1)
        return self.fc_2(F.relu(self.fc_1(flat)))
此外,您的 Dataset 中使用的 torchvision.io.read_image 函数返回一个 uint8 张量,其整数值范围为 0 到 255。网络需要浮点值,因此必须将结果除以 255,得到 [0, 1] 范围内的值。另外,神经网络在输入经过标准化时效果最佳(减去训练数据集的均值,再除以其标准差)。我在下面的图像变换中添加了标准化。为方便起见,它使用 imagenet 的均值和标准差——如果您的图像与 imagenet 图像相似,应该可以正常工作(否则您可以在自己的数据上计算这些统计量)。
请注意,调整大小可能会扭曲您的图像(不保持原始宽高比)。通常这不是问题,但如果是的话,您可能需要用恒定的颜色(例如黑色)填充图像,以将其调整为所需的尺寸(torchvision 库中也有相应的转换)。
# Per-channel statistics of the ImageNet training set, the usual defaults.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Pipeline: uint8 [0, 255] -> float [0, 1], normalize, then resize to 224x224.
_steps = [
    torchvision.transforms.Lambda(lambda x: x / 255.),
    torchvision.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    torchvision.transforms.Resize((224, 224)),
]
transforms = torchvision.transforms.Compose(_steps)
您可能还需要调整
Dataset
中的代码以将图像加载为 RGB 图像(如果它们也有 Alpha 通道)。这可以这样做:
image = read_image(img_path, mode=torchvision.io.image.ImageReadMode.RGB)  # force 3-channel RGB, dropping any alpha channel
然后您可以使用以下方法初始化您的
Dataset
:
# Build both datasets with the scaling/normalizing/resizing transform attached.
test_dataset = dataset.csHeadBody(
    csv_file="images\\test_labels.csv",
    root_dir="images\\test",
    transform=transforms,
)
train_dataset = dataset.csHeadBody(
    csv_file="images\\train_labels.csv",
    root_dir="images\\train",
    transform=transforms,
)
我还没有测试过代码,如果不起作用请告诉我!
TLDR:错误是由于传递给
collate_fn
的输入大小不同造成的。如果您需要使用不同的输入大小,那么您必须在转换中处理它(例如 SmallestMaxSize + Pad 或 LongestMaxSize + Crop)或编写自定义批量采样器。
该错误实际上与神经网络架构无关。这与您尝试创建由不同尺寸的图像组成的批次有关(错误表示:
stack expects each tensor to be equal size, but got [3, 300, 535] at entry 0 and [3, 1080, 1920] at entry 23
)。当您将 torch.utils.data.DataLoader
与指定的 batch_size
和 shuffle
(其他参数默认)一起使用时,将使用默认的 sampler
和 collate_fn
(更多信息请参见 PyTorch 的 DataLoader 文档)。默认的 collate_fn
尝试堆叠来自 dataset __getitem__
方法的数据。在您的情况下,__getitem__
返回不同大小的图像([3, 300, 535]
和[3, 1080, 1920]
)并将这些输入传递给torch.stack
,并在那里抛出错误。处理多种输入尺寸有两种常见做法:(1)在变换中把所有图像调整/填充到同一尺寸;(2)编写自定义批量采样器(传给 DataLoader)以确保单个批次中只加载相同尺寸的图像。第二个选项实施起来稍难一些,但在我看来更好,因为我们不会向图像添加或删除任何信息(特别适合生成式人工智能的用途)。要实现该选项,我们需要在数据集类中为每个加载的文件路径存储图像尺寸信息;然后在自定义采样器中利用该信息,创建只包含相同尺寸图像索引的批次。
自定义 Dataset 类:
from typing import Callable
import torchvision.datasets
import imagesize
from PIL import Image
class FilepathsDataset(torchvision.datasets.VisionDataset):
    """Dataset over image file paths that also indexes images by resolution.

    ``self.imgsz_idxs`` maps each (width, height) found on disk to the list of
    sample indices with that size, so a batch sampler can build batches of
    uniformly-sized images.

    FIX: the original snippet contained ``target = <some_target>`` (a syntax
    error) and an un-quoted ``tuple[Tensor, Tensor]`` return annotation that
    raises NameError at class-creation time because ``Tensor`` is never
    imported; both are repaired here.
    """

    def __init__(
        self,
        image_filepaths: list[str],
        transform: Callable,
    ):
        self.image_filepaths = image_filepaths
        # Group sample indices by image resolution.
        imgsz_idxs: dict[tuple[int, int], list[int]] = {}
        for idx, path in enumerate(image_filepaths):
            # imagesize reads only the file headers, so this is very fast.
            width, height = imagesize.get(path)
            imgsz_idxs.setdefault((width, height), []).append(idx)
        self.imgsz_idxs = imgsz_idxs
        self.transform = transform

    def get_raw_data(self, idx: int) -> Image.Image:
        """Return the raw image at *idx*, converted to RGB."""
        image_filepath = self.image_filepaths[idx]
        image = Image.open(image_filepath).convert("RGB")
        return image

    def __len__(self) -> int:
        return len(self.image_filepaths)

    def __getitem__(self, idx: int) -> "tuple[Tensor, Tensor]":
        """Return the transformed image and its target."""
        target = ...  # TODO: load your target here
        image = self.get_raw_data(idx)
        image = self.transform(image)
        return image, target
自定义 Sampler 类:此采样器会随机打乱批次顺序,并默认丢弃每个尺寸组中不足一个完整批次的剩余样本(以确保所有批次具有相同数量的样本)。
import copy
import random
from torch.utils.data import Sampler
class SameResolutionSampler(Sampler):
    """Batch sampler that yields batches of indices sharing one image resolution.

    ``imgsz_idxs`` maps (width, height) to the sample indices of that size.
    Leftover indices that cannot fill a whole batch are dropped (drop-last
    semantics), and the batch order is shuffled on every iteration.
    """

    def __init__(self, batch_size: int, imgsz_idxs: dict[tuple[int, int], list[int]]):
        self.imgsz_idxs = imgsz_idxs
        self.batch_size = batch_size

    def __iter__(self) -> list[list[int]]:
        # Work on a copy so repeated iteration starts from the full index pools.
        pools = copy.deepcopy(self.imgsz_idxs)
        batches = []
        for size in pools:
            # Take full batches from the end of each pool; leftovers are dropped.
            while len(pools[size]) >= self.batch_size:
                batches.append([pools[size].pop() for _ in range(self.batch_size)])
        random.shuffle(batches)
        return iter(batches)

    def __len__(self):
        # Number of complete batches available across all resolutions.
        full_batches = (
            len(idxs) // self.batch_size for idxs in self.imgsz_idxs.values()
        )
        return int(sum(full_batches))
然后创建 DataLoader 实例:
# FIX: garbled text was fused into the first identifier ("实例filepaths"),
# which silently assigned the wrong variable name; it must be `filepaths`.
filepaths = _  # TODO: paths to your images
transform = _  # TODO: your transforms
batch_size = _  # TODO: your batch_size

ds = FilepathsDataset(filepaths, transform)
# batch_sampler yields whole batches of same-resolution indices at once.
sampler = SameResolutionSampler(batch_size, ds.imgsz_idxs)
dataloader = DataLoader(ds, batch_sampler=sampler)
注意:此外,如果您使用第二个选项(自定义采样器)并训练一些具有线性头的卷积神经网络,则必须检查网络是否适用于不同的输入大小,或在线性层之前添加 Adaptive Pooling 以确保尺寸正确。