我的火车数据集包含 1800 张图像。由于我的内存限制,我只使用 8 的批量大小。我正在 Google Colab Pro 上训练它。从 8 开始增加会引发内存不足错误。我最初对其进行了 10 个 epoch 的训练。即使在验证数据集上,损失看起来也不错。我还分享了最后一个纪元的结果
我正在遵循以下流程:
from PIL import Image
import torch
from torch.utils.data import Dataset
class ImageCaptioningDataset(Dataset):
def __init__(self, dataframe, processor, max_image_size=100):
self.dataframe = dataframe
self.processor = processor
self.max_image_size = max_image_size
def __len__(self):
return len(self.dataframe)
def __getitem__(self, idx):
item = self.dataframe.iloc[idx]
image_path = item["image"]
labels = item["label"]
# Load and preprocess the image
image = Image.open(image_path).convert("RGB")
image = image.resize((self.max_image_size, self.max_image_size), Image.ANTIALIAS)
# Convert labels to string (adjust this step based on your label format)
text = str(labels)
encoding = self.processor(images=image, text=text, padding="max_length", return_tensors="pt")
# remove batch dimension
encoding = {k: v.squeeze() for k, v in encoding.items()}
return encoding
from transformers import AutoProcessor, BlipForConditionalGeneration
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
model.train()
train_dataset = ImageCaptioningDataset(dataframe=df, processor=processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataset = ImageCaptioningDataset(dataframe=df_val, processor=processor)
val_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
import torch
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
for epoch in range(10):
print("Epoch:", epoch)
for idx, batch in enumerate(train_dataloader):
input_ids = batch.pop("input_ids").to(device)
pixel_values = batch.pop("pixel_values").to(device)
outputs = model(input_ids=input_ids,
pixel_values=pixel_values,
labels=input_ids)
loss = outputs.loss
print("Loss:", loss.item())
loss.backward()
optimizer.step()
optimizer.zero_grad()
# Validation
model.eval()
with torch.no_grad():
total_loss = 0
total_samples = 0
for idx, batch in enumerate(val_dataloader):
input_ids = batch.pop("input_ids").to(device)
pixel_values = batch.pop("pixel_values").to(device)
outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
loss = outputs.loss
total_loss += loss.item()
total_samples += input_ids.size(0)
avg_val_loss = total_loss / total_samples
print("Validation Loss:", avg_val_loss)
最后一个epoch结果是:
Epoch: 9
Loss: 1.4171884059906006
Loss: 1.4414808750152588
Loss: 1.437009334564209
Loss: 1.4208693504333496
Loss: 1.4376342296600342
.....
.....
.....
Loss: 1.448195219039917
Loss: 1.4534223079681396
Loss: 1.4529722929000854
Loss: 1.4163848161697388
Loss: 1.517378807067871
Loss: 1.4786649942398071
Loss: 1.4230802059173584
Validation Loss: 0.1795918705713683
但仍在测试中,每个图像都返回相同的标题。列车数据都有独特的标题。但我不明白训练是否顺利,并且也经过了验证测试。那为什么会出现这种情况。
Image Path: PMC2829594_1752-1947-4-10-4.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion.
Image Path: PMC529466_1477-7819-2-36-1.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion.
Image Path: PMC125313_cc1496-1.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion.
Image Path: PMC280706_1477-7819-1-24-1.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion.
Image Path: PMC2440391_1752-1947-2-212-2.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion.
Image Path: PMC2848035_1749-8090-5-12-3.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion.
Image Path: PMC2700440_ATM-03-108-g001.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion.
Image Path: PMC2542347_1757-1626-1-122-1.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion.
Image Path: PMC2727506_1749-8090-4-40-1.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion with a small
Image Path: PMC2783120_1757-1626-2-164-1.jpg
Generated Caption: chest x - ray showing a large right - sided pleural effusion.
请指导我在这里做错了什么。我该如何改善这个问题?
谢谢!
请问您这个问题解决了吗? 我还想在自己的数据集上微调 BLIP 模型以实现图像字幕任务。 期待收到您的回复! 祝你好运! 谢谢!