Why is a Google Colab A100 much slower than my local Nvidia RTX 3070?

Question: 0 votes, 1 answer

I'm learning PyTorch and took the following code from a book. It runs fast on my local Nvidia RTX 3070 (about 3.6 seconds per epoch), but on Google Colab, even with an A100 runtime selected, each epoch takes roughly 7 seconds.

Any idea why? Thanks in advance!

import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms

torch.manual_seed(1)
batch_size = 128 
learning_rate = 1e-2 
num_epoches = 10 

train_dataset = datasets.MNIST(
    root='./data', 
    train=True, 
    transform=transforms.ToTensor(), 
    download=True)  

test_dataset = datasets.MNIST(
    root='./data',
    train=False, 
    transform=transforms.ToTensor())

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)


class Cnn(nn.Module):
    def __init__(self, in_dim, n_class):  # 28x28x1
        super(Cnn, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_dim, 6, 3, stride=1, padding=1),  # 28 x 28 x 6
            nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # 14 x 14 x 6
            nn.Conv2d(6, 16, 5, stride=1, padding=0),  # 10 x 10 x 16
            nn.ReLU(True),
            nn.MaxPool2d(2, 2))  # 5 x 5 x 16

        self.fc = nn.Sequential(
            nn.Linear(400, n_class))  # 400 = 5 * 5 * 16
        # Commented-out fuller head from the book:
        #     nn.Linear(400, 120),
        #     nn.Linear(120, 84),
        #     nn.Linear(84, n_class)

    def forward(self, x):
        out = self.conv(x)
        out = out.view(out.size(0), 400)  # 400 = 5 * 5 * 16,
        out = self.fc(out)
        return out

model = Cnn(1, 10).to('cuda') 

print(model)

# loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
import time
start = time.time()

for epoch in range(num_epoches):
    print('epoch {}'.format(epoch + 1))
    print('*' * 10)
    running_loss = 0.0
    running_acc = 0.0
    for i, data in enumerate(train_loader, 1):
        img, label = data
        img = img.to("cuda")      # Variable is deprecated; move tensors to the GPU directly
        label = label.to("cuda")
        out = model(img)
        loss = criterion(out, label)  # loss
        running_loss += loss.item() * label.size(0) 
        _, pred = torch.max(out, 1) 
        num_correct = (pred == label).sum() 
        # accuracy = (pred == label).float().mean()
        running_acc += num_correct.item() 
        optimizer.zero_grad() 
        loss.backward()
        optimizer.step()

    print('Train Finish {} epoch, Loss: {:.6f}, Acc: {:.6f}'.format(
        epoch + 1, running_loss / (len(train_dataset)), running_acc / (len(
            train_dataset))))
    print(f"took {time.time() - start}")
    start = time.time()


torch.save(model.state_dict(), './cnn.pth')
Tags: pytorch, google-colaboratory
1 Answer

0 votes

You could try setting a larger batch_size in the DataLoader, for example 16 or 32. The default batch_size is 1, which is quite slow on an A100.
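A minimal sketch of that suggestion (note that the question's code already passes batch_size=128 explicitly, so the default of 1 does not apply there; the num_workers setting is my own added assumption, not part of this answer, since data-loading overhead often dominates for a model this small):

from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,
    batch_size=128,    # explicit; DataLoader falls back to batch_size=1 if omitted
    shuffle=False,
    pin_memory=True,   # faster host-to-GPU copies
    num_workers=2)     # hypothetical addition: load batches in worker processes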
