Loss not decreasing and metrics not improving for video classification with a CNN + LSTM combination

Problem description

I am trying to build a binary classification network for videos. The dataset class loads 16/32 frames per video along with their labels. The model is a combination of a pretrained ResNet101, an LSTM, and linear layers for classification. I use an SGD optimizer with lr=0.01 and binary cross-entropy loss as the criterion. The loss hovers around 0.6 across all epochs, and the model predicts either all 0s or all 1s.

Could someone look over my network architecture and training code and point out what is wrong? I have included the dataset class, the model class, and the training script below.

Note: ignore the fact that the validation and training splits are the same; I am trying to overfit the model on a smaller subset to make sure it can learn the data.
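(For reference, one way to carve out such a smaller subset is torch.utils.data.Subset; this is just an illustrative sketch, and full_dataset and the index range are placeholders, not code from the original post.)

from torch.utils.data import Subset

# hypothetical: keep only the first 8 clips for the overfitting sanity check
small_ds = Subset(full_dataset, indices=list(range(8)))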

Dataset class

from torch.utils.data import Dataset, DataLoader, Subset
import glob
from PIL import Image
import torch
import numpy as np
import random
import os
import cv2
import csv
import shutil
np.random.seed(2020)
random.seed(2020)
torch.manual_seed(2020)


class VideoDataset(Dataset):
    """
    Load and save pairs of 16 frame in .npy file
    save .npy file name and corresponding label in class var
    Can skip_frames
    Assumes that video names are not duplicated in any splits
    """
    def __init__(
            self, 
            vids_dir, 
            labels_path, 
            transform,
            cache_dir='cache',
            sequence_length=32
    ):      
        self.transform = transform
        self.vids_dir = vids_dir
        self.labels_path = labels_path
        self.cache_dir = cache_dir
        self.sequence_length = sequence_length
        self.create_cache_dir()

        # extracting sequence_length frames per clip, logic could vary as needed
        # current logic picks evenly spaced frames; each clip has either 30, 60, 90 or 120 frames
        self.vid_name_label_map = {}
        self.get_video_label_map()

        self.frames_label_map = [] # [{label: ['img_name1', 'img_name2', ... 16 image paths]}]
        self.get_total_frame_count()

        # log
        print(f'initiated video dataset of {vids_dir}')
    
    def create_cache_dir(self):
        if os.path.exists(self.cache_dir):
            # pass
            shutil.rmtree(self.cache_dir)
        os.makedirs(self.cache_dir, exist_ok=True)

    def get_video_label_map(self):
        with open(self.labels_path, 'r') as fp:
            csv_data = csv.reader(fp)
            for row in csv_data:
                vid_name, label = row[0], int(row[1])
                self.vid_name_label_map[vid_name] = label
        
    def get_total_frame_count(self):
        # self.ds_len = 0
        for vid_name in os.listdir(self.vids_dir):
            curr_sample_frame_label_map = {}
            curr_label = self.vid_name_label_map[vid_name.split('.')[0]]
            vid_path = os.path.join(self.vids_dir, vid_name)
            cap = cv2.VideoCapture(vid_path)
            vid_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))  # total number of frames in the clip
            # samples_in_vid = vid_frames // 30 # current dataset has either 30, 60, 90, 120
            # sample_ind = 0
            frames_to_capture = np.linspace(0, vid_frames-1, self.sequence_length, dtype=np.int16)
            curr_sample_frame_label_map[curr_label] = []
            # sample_ind += 1            
            for frame_ind in range(vid_frames):
                success, frame = cap.read()
                if not success:
                    continue

                if frame_ind not in frames_to_capture:
                    continue           

                save_as = os.path.join(self.cache_dir, f'{vid_name}_{frame_ind}.jpg')
                if not os.path.exists(save_as):
                    cv2.imwrite(save_as, frame)
                curr_sample_frame_label_map[curr_label].append(save_as)

                # self.ds_len += 1           
            
            # if sequence length not met, pad with last frames 
            if len(curr_sample_frame_label_map[curr_label]) != self.sequence_length:
                for _ in range(self.sequence_length - len(curr_sample_frame_label_map[curr_label])):
                    curr_sample_frame_label_map[curr_label].append(curr_sample_frame_label_map[curr_label][-1])
            # print('current sample frames = ', len(curr_sample_frame_label_map[curr_label]))
            self.frames_label_map.append(curr_sample_frame_label_map.copy())

    def __len__(self):
        return len(self.frames_label_map)
    
    def __getitem__(self, idx):
        label_img_paths_map = self.frames_label_map[idx]
        label = list(label_img_paths_map.keys())[0]
        path2imgs = label_img_paths_map[label]
        frames = []
        for p2i in path2imgs:
            frame = Image.open(p2i)
            frames.append(frame)
        
        frames_tr = []
        for frame in frames:
            frame = self.transform(frame)
            frames_tr.append(frame)
        if len(frames_tr)>0:
            frames_tr = torch.stack(frames_tr)
        label = label            
        return frames_tr, label

def collate_fn(batch):
    imgs_batch, label_batch = list(zip(*batch))
    imgs_batch = [imgs for imgs in imgs_batch if len(imgs)>0]
    label_batch = [torch.tensor(l) for l, imgs in zip(label_batch, imgs_batch) if len(imgs)>0]
    imgs_tensor = torch.stack(imgs_batch)
    labels_tensor = torch.stack(label_batch).to(dtype=torch.float32)
    return imgs_tensor,labels_tensor        

    
if __name__ == "__main__":
    from transforms import train_transformer

    ds = VideoDataset(
        vids_dir='dataset_merged_clips/videos',
        labels_path='dataset_merged_clips/labels.csv',
        cache_dir='cache',
        transform=train_transformer
    )
        
    ds.__getitem__(0)
    print(ds)

Model class

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import resnet101


class CNNLSTM(nn.Module):
    def __init__(self, num_classes=2):
        super(CNNLSTM, self).__init__()
        self.resnet = resnet101(pretrained=True)
        self.resnet.fc = nn.Sequential(nn.Linear(self.resnet.fc.in_features, 300))
        self.lstm = nn.LSTM(input_size=300, hidden_size=256, num_layers=3)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.sig = nn.Sigmoid()
       
    def forward(self, x_3d):
        hidden = None
        for t in range(x_3d.size(1)):
            with torch.no_grad():
                x = self.resnet(x_3d[:, t, :, :, :])  
            out, hidden = self.lstm(x.unsqueeze(0), hidden)         

        x = self.fc1(out[-1, :, :])
        x = F.relu(x)
        x = self.fc2(x)
        x = self.sig(x)
        return x

Training script

import os
import numpy as np
import torch
from torch.nn import CrossEntropyLoss, BCELoss
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from dataset2 import VideoDataset
from dataset2 import collate_fn as rnn_collate_fn
from transforms import train_transformer
from transforms import test_transformer
from resnet_rcnn_model import DecoderRNN
from resnet_rcnn_model import ResCNNEncoder
from cnnlstm import CNNLSTM

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# training parameters
k = 1               # number of target category
epochs = 100        # training epochs
batch_size = 4
learning_rate = 0.001
log_interval = 10   # interval for displaying training info

# pretrained weights
use_hac_pretrained_weights = False

# save model path
save_model_path = 'saved_models3'
if not os.path.isdir(save_model_path):
    os.makedirs(save_model_path)

def train(model, device, train_loader, optimizer, criterion, epoch):
    # set model as training mode
    model.train()

    train_loss = 0

    all_y = np.zeros(0, dtype=np.int64)
    all_pred_y = np.zeros(0, dtype=np.int64)

    N_count = 0   # counting total trained sample in one epoch
    for batch_idx, (X, y) in enumerate(train_loader):
        # y = y.type(torch.LongTensor)
        X, y = X.to(device), y.to(device)
        N_count += X.size(0)
        
        output = model(X)
        loss = criterion(output, y.view(-1,1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()        
        train_loss += loss.item()

        y_pred = output.round().type(torch.int64).view(-1).cpu().numpy()
        all_pred_y = np.concatenate((all_pred_y, y_pred))
        all_y = np.concatenate((all_y, y.type(torch.int64).cpu().numpy()))

    train_loss /= (batch_idx+1)  
    print('\nEPOCH:', epoch+1, '\nTRAIN')
    print('loss:', train_loss)
    print(confusion_matrix(all_y, all_pred_y))  
    print(classification_report(all_y, all_pred_y)  )
    
    return train_loss


def validation(model, device, criterion, test_loader):
    # set model as testing mode
    model.eval()
    test_loss = 0
    all_y = np.zeros(0, dtype=np.int64)
    all_pred_y = np.zeros(0, dtype=np.int64)
    with torch.no_grad():
        for batch_idx, (X, y) in enumerate(test_loader):
            X, y = X.to(device), y.to(device)
            output = model(X)

            loss = criterion(output, y.view(-1,1))
            test_loss += loss.item()                

            y_pred = output.round().type(torch.int64).view(-1).cpu().numpy()
            all_pred_y = np.concatenate((all_pred_y, y_pred))
            all_y = np.concatenate((all_y, y.type(torch.int64).cpu().numpy()))

    test_loss /= (batch_idx + 1)

    print('VAL')
    print('loss:', test_loss)
    print(confusion_matrix(all_y, all_pred_y))  
    print(classification_report(all_y, all_pred_y))    

    return test_loss

# CREATE MODEL
cnnlstm_model = CNNLSTM(num_classes=k).to(device)

val_ds = VideoDataset(
    vids_dir='dataset_merged_clips/videos',
    labels_path='dataset_merged_clips/labels.csv',
    transform=test_transformer,
    sequence_length=16
)

train_dl = DataLoader(val_ds, batch_size= batch_size,
                        shuffle=True, collate_fn= rnn_collate_fn, pin_memory=True)
val_dl = DataLoader(val_ds, batch_size= batch_size,
                        shuffle=False, collate_fn= rnn_collate_fn, pin_memory=True) 

# Combine all EncoderCNN + DecoderRNN parameters
crnn_params = cnnlstm_model.parameters()

# criterion = CrossEntropyLoss()
criterion = BCELoss()
optimizer = torch.optim.SGD(crnn_params, lr=learning_rate, momentum=0.9)
lr_scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=3, verbose=True)

least_test_loss = 10

for epoch in range(epochs):
    # train, test model
    epoch_train_loss = train(cnnlstm_model, device, train_dl, optimizer, criterion, epoch)
    epoch_test_loss = validation(cnnlstm_model, device, criterion, val_dl)

    # save Pytorch models of best record
    if epoch_test_loss < least_test_loss:
        torch.save(cnnlstm_model.state_dict(), os.path.join(save_model_path, 'cnnlstm_epoch{}.pt'.format(epoch + 1)))  # save motion_encoder
        torch.save(optimizer.state_dict(), os.path.join(save_model_path, 'optimizer_epoch{}.pt'.format(epoch + 1)))      # save optimizer
        print("Epoch {} model saved!".format(epoch + 1))
        least_test_loss = epoch_test_loss

Transforms

from torchvision import transforms

h, w = 224, 224
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

test_transformer = transforms.Compose([
            transforms.Resize((h,w)),
            transforms.ToTensor(),
            transforms.Normalize(mean, std),
            ]) 
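The train_transformer imported by the dataset and training scripts is not shown in the post; a minimal sketch of what it could look like, assuming it mirrors test_transformer with one added augmentation (the augmentation choice is an assumption, not from the original code):

train_transformer = transforms.Compose([
            transforms.Resize((h, w)),
            transforms.RandomHorizontalFlip(p=0.5),  # assumed augmentation
            transforms.ToTensor(),
            transforms.Normalize(mean, std),
            ])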

What I tried: training the network to classify video sequences (binary classification).
What I expected: loss values that decrease over time, along with improving metrics.
What happens: the loss hovers around 0.7 for 50+ epochs and the metrics do not improve, hovering around 50%, i.e. no better than random.

machine-learning pytorch computer-vision conv-neural-network lstm
1 Answer

A potential problem with the CNNLSTM model is that the fc layer of the pretrained resnet model is replaced with a randomly initialized fc layer. This means the fc layer has no useful weights yet, so it must be trained (its weights updated) during the training process.

Currently the entire resnet forward pass runs inside a torch.no_grad() context, i.e. training never updates the weights of any resnet layer, including the newly added fc layer. This can be fixed by removing the with torch.no_grad(): statement, in which case all layers of the resnet model will be updated. Alternatively, we can set requires_grad to False for all parameters of the resnet model except the last fc layer:

class CNNLSTM(nn.Module):
    def __init__(self, num_classes=2):
        super(CNNLSTM, self).__init__()
        self.resnet = resnet101(pretrained=True)
    
        # Modify the last fully connected layer to match the desired output size
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 300)

        # Freeze all parameters in the ResNet backbone except for the last fully connected layer
        for param in self.resnet.parameters():
            param.requires_grad = False
        for param in self.resnet.fc.parameters():
            param.requires_grad = True

        # Define the LSTM layer and the additional fully connected layers
        self.lstm = nn.LSTM(input_size=300, hidden_size=256, num_layers=3)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.sig = nn.Sigmoid()

    def forward(self, x_3d):
        hidden = None
        for t in range(x_3d.size(1)):
            x = self.resnet(x_3d[:, t, :, :, :])
            out, hidden = self.lstm(x.unsqueeze(0), hidden)

        x = self.fc1(out[-1, :, :])
        x = F.relu(x)
        x = self.fc2(x)
        x = self.sig(x)
        return x

    def trainable_parameters(self):
        # Return parameters of the fully connected layers and LSTM
        return list(self.resnet.fc.parameters()) + list(self.lstm.parameters()) + list(self.fc1.parameters()) + list(self.fc2.parameters())

Additionally, update the parameters used in the optimizer:

crnn_params = cnnlstm_model.trainable_parameters()
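An equivalent option, assuming requires_grad has been set as in the modified model above, is to filter the parameters directly when constructing the optimizer:

# only parameters with requires_grad=True (the new fc, the LSTM, fc1 and fc2) are optimized
crnn_params = [p for p in cnnlstm_model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(crnn_params, lr=learning_rate, momentum=0.9)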