I implemented a simple linear regression model in PyTorch and tried to use a GPU to speed up training. However, I have not observed any improvement in training time. Profiling the model reveals a severe CPU bottleneck rather than the expected GPU utilization. Here is the detailed profiler output:
Top 10 Functions by CPU Time:
model_fit: CPU time = 6268886.00 us
Optimizer.step#FISTA.step: CPU time = 402481.00 us
cudaLaunchKernel: CPU time = 302777.00 us
aten::to: CPU time = 109666.00 us
aten::_to_copy: CPU time = 107876.00 us
aten::mul: CPU time = 85097.00 us
aten::copy_: CPU time = 82672.00 us
aten::abs: CPU time = 78300.00 us
aten::quantile: CPU time = 75502.00 us
aten::empty_strided: CPU time = 70781.00 us
Top 10 Functions by CUDA Time:
model_fit: CUDA time = 41545.00 us
Optimizer.step#FISTA.step: CUDA time = 23568.00 us
aten::mul: CUDA time = 8151.00 us
aten::sum: CUDA time = 5403.00 us
aten::sub: CUDA time = 4679.00 us
void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array<char*, 3> >(int, at::native::CUDAFunctor_add, at::detail::Array<char*, 3>): CUDA time = 4208.00 us
aten::copy_: CUDA time = 4170.00 us
aten::mse_loss: CUDA time = 3822.00 us
aten::abs: CUDA time = 3601.00 us
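For reference, tables like these can be generated with torch.profiler -- a minimal sketch, where model_fit is assumed to be a record_function label wrapped around the fit call:

import torch
from torch.profiler import profile, record_function, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("model_fit"):
        model.fit(X, y)

# Sort by CPU time and by CUDA time to get tables like the ones above
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))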
A substantial amount of computation still seems to be happening on the CPU, in particular the multiplications and the data transfers (aten::to, aten::_to_copy). Here is the relevant part of my model code:
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler

class LinearRegressionLoss(nn.Module):
    def __init__(self, lambda_, nu):
        super().__init__()
        self.lamb = lambda_
        self.nu = nu
        self.mse_loss = nn.MSELoss(reduction='sum')

    def forward(self, input, target, theta):
        # Square root of the sum of squared errors, plus an L1 penalty
        square_root_lasso_loss = torch.sqrt(self.mse_loss(input, target))
        regularization = self.lamb * (torch.sum(torch.abs(theta.weight)) + torch.abs(theta.bias))
        total_loss = square_root_lasso_loss + regularization
        return total_loss, square_root_lasso_loss
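# In math terms, the objective computed above is the square-root lasso
# (with the bias also penalized):
#
#     L(w, b) = sqrt( sum_i (y_hat_i - y_i)^2 ) + lambda * ( ||w||_1 + |b| )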
class LinearModel():
    def __init__(self, lambda_=1.0, path_type=0, nu=None):
        self.path_type = path_type
        self.lambda_ = lambda_
        self.nu = nu
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def fit(self, X, y, verbose=False):
        # Data Processing
        X = torch.tensor(X.values, dtype=torch.float, device=self.device)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        y = torch.tensor(y.values.squeeze(), dtype=torch.float, device=self.device)
        # Model Parameters
        self.theta = nn.Linear(X.shape[1], 1, dtype=torch.float, device=self.device)
        # Training
        for i in range(-1, 6):
            init_lr = 0.1
            lambi = self.lambda_ * (np.exp(i) / (1 + np.exp(i)) if i < 5 else 1)
            rel_err = 1e-11 if i == 5 else 1e-5
            if verbose:
                print(f"Lambda = {lambi:.4f}")
            self.train_model(X, y, lambi, init_lr, rel_err, verbose)
        if verbose: print("MODEL FITTED!")

    def forward(self, X):
        if X.device != self.device:
            X = X.to(self.device)
        return self.theta(X).squeeze()

    def train_model(self, X, y, lambda_, init_lr, rel_err, verbose):
        loss_fn = LinearRegressionLoss(lambda_, self.nu).to(self.device)
        train_score_fn = nn.MSELoss(reduction='mean').to(self.device)
        epoch, last_loss = 0, np.inf
        optimizer = FISTA(params=self.theta.parameters(), lr=init_lr, lambda_=lambda_)
        lr_factor = 0.9
        max_epochs = 10000
        while epoch < max_epochs:
            optimizer.zero_grad()
            y_pred = self.forward(X)
            loss, bare_loss = loss_fn(y_pred, y, self.theta)
            loss = loss.detach()
            train_loss = train_score_fn(y_pred, y).detach()
            if loss > last_loss:
                learning_rate = optimizer.param_groups[0]['lr']
                optimizer = FISTA(params=self.theta.parameters(), lr=learning_rate*lr_factor, lambda_=lambda_)
            if epoch % 20 == 0 and verbose:
                print(f"\tEpoch: {epoch} | Loss: {loss.item():.5f}")
            if epoch > 0 and abs(loss - last_loss) / loss < rel_err:
                if verbose: print("\n\tDescent stopped: loss is no longer decreasing significantly.\n")
                break
            last_loss = loss
            epoch += 1
            bare_loss.backward()
            optimizer.step()
        if epoch == max_epochs and verbose: print("FISTA descent stopped: maximum iterations reached")
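For context, the class is driven from pandas inputs, roughly like this (a hypothetical usage sketch; the data is made up):

import numpy as np
import pandas as pd

# Hypothetical toy data: any DataFrame/Series pair with matching lengths works
X = pd.DataFrame(np.random.randn(1000, 50))
y = pd.Series(np.random.randn(1000))

model = LinearModel(lambda_=0.1, path_type=0)
model.fit(X, y, verbose=True)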
The optimizer:
class FISTA(torch.optim.Optimizer):
    def __init__(self, params, lr, lambda_):
        self.lr = lr
        self.lambda_ = lambda_
        defaults = dict(lr=lr)
        super(FISTA, self).__init__(params, defaults)

    def shrinkage_operator(self, u, lambda_):
        '''Applies the shrinkage (soft-thresholding) operator to a PyTorch tensor.'''
        return u.sign() * torch.clamp(u.abs() - lambda_, min=0.0)

    @torch.no_grad()
    def step(self, closure=None):
        '''Performs a single optimization step.'''
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                state = self.state[p]
                if 'x_prev' not in state:
                    # We use .detach() to ensure we do not track history
                    state['x_prev'] = p.detach().clone()
                    state['y_prev'] = p.detach().clone()
                    state['t_prev'] = torch.tensor(1., device=p.device)
                x_prev, y_prev, t_prev = state['x_prev'], state['y_prev'], state['t_prev']
                # Proximal gradient step followed by the Nesterov momentum update
                x_next = self.shrinkage_operator(y_prev - self.lr * grad, self.lr * self.lambda_)
                t_next = (1. + torch.sqrt(1. + 4. * t_prev ** 2)) / 2.
                y_next = x_next + ((t_prev - 1) / t_next) * (x_next - x_prev)
                state['x_prev'].copy_(x_next)
                state['y_prev'].copy_(y_next)
                state['t_prev'].copy_(t_next)
                p.copy_(x_next)
        return loss
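For reference, the update implemented in step is the standard FISTA iteration, with the shrinkage operator as the proximal map of the scaled L1 norm (here $\eta$ is the learning rate lr):

$$x_k = \operatorname{prox}_{\eta\lambda\|\cdot\|_1}\big(y_{k-1} - \eta\,\nabla f(y_{k-1})\big), \qquad \operatorname{prox}_{\tau\|\cdot\|_1}(u) = \operatorname{sign}(u)\,\max(|u| - \tau,\ 0)$$

$$t_k = \frac{1 + \sqrt{1 + 4\,t_{k-1}^2}}{2}, \qquad y_k = x_k + \frac{t_{k-1} - 1}{t_k}\,(x_k - x_{k-1})$$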
Any insights or suggestions on how to make better use of the GPU in this PyTorch model would be greatly appreciated!
Edit: I added a device argument to the model class so that, when given, it forces the model's device. Results:
t = time.time()
model = LinearModel(path_type=0)
model.fit(X, y, False)
print(time.time()-t)
--> 0.7786831855773926
t = time.time()
model = LinearModel(path_type=0, device='cpu')
model.fit(X, y, False)
print(time.time()-t)
--> 0.800177001953125
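One caveat about timing CUDA code this way: kernel launches are asynchronous, so it is safer to call torch.cuda.synchronize() before reading the clock (a minimal sketch):

import time
import torch

t = time.time()
model.fit(X, y, False)
if torch.cuda.is_available():
    torch.cuda.synchronize()  # wait for queued GPU work before stopping the clock
print(time.time() - t)

In this particular loop the per-epoch loss comparisons already force a host sync, so the numbers above are likely representative anyway.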
You are creating some tensors on the CPU and then transferring them to the GPU. You can cut unnecessary overhead by initializing them directly on the GPU.
In the LinearModel class you choose the device based on GPU availability, but the tensors do not consistently end up on that device. Modify the fit method so that the tensors are always placed on the chosen device:
def fit(self, X, y, verbose=False):
    # Data Processing
    scaler = StandardScaler()
    X = scaler.fit_transform(X.values)  # scikit-learn works on CPU/NumPy data, so scale first
    X = torch.tensor(X, dtype=torch.float, device=self.device)  # create directly on the device
    y = torch.tensor(y.values.squeeze(), dtype=torch.float, device=self.device)  # create directly on the device
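Creating the tensors directly with device=self.device avoids the intermediate CPU allocation and the extra copy that show up as aten::to / aten::_to_copy in your profile.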
Your custom FISTA optimizer also needs changes to make sure its operations run on GPU tensors. Specifically, shrinkage_operator should operate on GPU tensors, and the gradients should be computed and updated on the device:
def shrinkage_operator(self, u, lambda_):
    '''Applies the shrinkage operator to a PyTorch tensor.'''
    return u.sign() * torch.clamp(u.abs() - lambda_, min=0.0)

@torch.no_grad()
def step(self, closure=None):
    '''Performs a single optimization step.'''
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()
    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.to(p.device)  # Move gradient to device
            state = self.state[p]
            if 'x_prev' not in state:
                # We use .detach() to ensure we do not track history
                state['x_prev'] = p.detach().clone().to(p.device)  # Move to device
                state['y_prev'] = p.detach().clone().to(p.device)  # Move to device
                state['t_prev'] = torch.tensor(1., device=p.device)
            x_prev, y_prev, t_prev = state['x_prev'], state['y_prev'], state['t_prev']
            x_next = self.shrinkage_operator(y_prev - self.lr * grad, self.lr * self.lambda_)
            t_next = (1. + torch.sqrt(1. + 4. * t_prev ** 2)) / 2.
            y_next = x_next + ((t_prev - 1) / t_next) * (x_next - x_prev)
            state['x_prev'].copy_(x_next)
            state['y_prev'].copy_(y_next)
            state['t_prev'].copy_(t_next)
            p.data.copy_(x_next)  # Use .data to avoid creating a computational graph
    return loss
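To check that the changes took effect, a quick sanity check (hypothetical snippet, assuming a CUDA machine) is to inspect where the parameters live after fitting:

model = LinearModel(lambda_=0.1, path_type=0)
model.fit(X, y, False)
for p in model.theta.parameters():
    print(p.device)  # expected: cuda:0 when a GPU is available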
I've commented the lines that need to change. Let me know if it works.