Code with numba is slower than code without numba


The code with Numba runs slower than the code without Numba, and I don't know what is going on. Please advise.

#with numba
import numpy as np
import tensorflow as tf
from numba import njit, prange
import time

(X_train, Y_train), (X_val, Y_val) = tf.keras.datasets.cifar10.load_data()

@njit
def cal_padding(input, padding):
    output = np.zeros((input.shape[0]+ padding*2, input.shape[1]+ padding*2))
    output[ padding :  padding + input.shape[0],  padding :  padding + input.shape[1]] = input
    return output

@njit
def conv_per_channel(data, kernel, padding=1, stride=1):  # data here is a single channel of the image; the kernel is likewise a single channel of the filter bank
    h, w = data.shape
    data = cal_padding(data, padding)
    
    kernel_size = kernel.shape
    h_output = int((h - kernel_size[0] + 2*padding + stride) / stride)
    w_output = int((w - kernel_size[1] + 2*padding + stride) / stride)
    
    tranformed = np.zeros((h_output, w_output), dtype=data.dtype)
    h_kernel, w_kernel = kernel_size
    xRows = []
    yCols = []
    for i in range(w_kernel):
      for j in range(h_kernel):
        yCols.append(j)
        xRows.append(i)

    for i in range(h_output):
      for j in prange(w_output):
        root_pixel = [i,j]
        new_val = 0.0
        for k in range(w_kernel * h_kernel):
          pixel_in_filter_x = xRows[k]
          pixel_in_filter_y = yCols[k]

          pixel_in_input_x = pixel_in_filter_x + root_pixel[0]
          pixel_in_input_y = pixel_in_filter_y + root_pixel[1]
          new_val = new_val + (data[pixel_in_input_x, pixel_in_input_y] * kernel[pixel_in_filter_x][pixel_in_filter_y])
        if new_val > 255:
          new_val = 255
        elif new_val < 0:
          new_val = 0
        tranformed[i,j] = new_val
      
    return tranformed

X_train = np.transpose(X_train, (0,3,1,2))
X_train500 = X_train[0:500]
print(X_train[0][0].shape)

sobelX = np.array([[-1,0,1], [-2,0,2],[-1,0,1]])
start = time.time()
conv_per_channel(X_train[0][0], sobelX)
end = time.time()
print(end-start)
#without numba
import numpy as np
import tensorflow as tf

from numba import jit, prange
import time

(X_train, Y_train), (X_val, Y_val) = tf.keras.datasets.cifar10.load_data()

# @jit(nopython= True)
def cal_padding(input, padding):
    output = np.zeros((input.shape[0]+ padding*2, input.shape[1]+ padding*2))
    output[ padding :  padding + input.shape[0],  padding :  padding + input.shape[1]] = input
    return output

# @jit(nopython = True)
def conv_per_channel(data, kernel, padding=1, stride=1):  # data here is a single channel of the image; the kernel is likewise a single channel of the filter bank
    h, w = data.shape
    data = cal_padding(data, padding)
    
    kernel_size = kernel.shape
    h_output = int((h - kernel_size[0] + 2*padding + stride) / stride)
    w_output = int((w - kernel_size[1] + 2*padding + stride) / stride)
    
    tranformed = np.zeros((h_output, w_output), dtype=data.dtype)
    h_kernel, w_kernel = kernel_size
    xRows = []
    yCols = []
    for i in range(w_kernel):
      for j in range(h_kernel):
        yCols.append(j)
        xRows.append(i)

    for i in range(h_output):
      for j in prange(w_output):
        root_pixel = [i,j]
        new_val = 0.0
        for k in range(w_kernel * h_kernel):
          pixel_in_filter_x = xRows[k]
          pixel_in_filter_y = yCols[k]

          pixel_in_input_x = pixel_in_filter_x + root_pixel[0]
          pixel_in_input_y = pixel_in_filter_y + root_pixel[1]
          new_val = new_val + (data[pixel_in_input_x, pixel_in_input_y] * kernel[pixel_in_filter_x][pixel_in_filter_y])
        if new_val > 255:
          new_val = 255
        elif new_val < 0:
          new_val = 0
        tranformed[i,j] = new_val
      
    return tranformed

X_train = np.transpose(X_train, (0,3,1,2))
X_train500 = X_train[0:500]
print(X_train[0][0].shape)

sobelX = np.array([[-1,0,1], [-2,0,2],[-1,0,1]])
start = time.time()
conv_per_channel(X_train[0][0], sobelX)
end = time.time()
print(end-start)

With the Numba @jit() decorator this code runs slower! I tried parallel=True and cache=True, but it was even slower.

Just to help better understand what this code does, here is the output:

#with numba
(32, 32)
0.6485073566436768

#without numba
(32, 32)
0.007578611373901367
python parallel-processing numba multitasking
1 Answer

This confuses the Numba optimizer:

    xRows = []
    yCols = []
    for i in range(w_kernel):
      for j in range(h_kernel):
        yCols.append(j)
        xRows.append(i)
...
          pixel_in_filter_x = xRows[k]
          pixel_in_filter_y = yCols[k]

Lists in Python are very flexible and can hold objects of many different types. From a performance standpoint, the problem is that Numba does not know the type of pixel_in_filter_x, and it needs that information to generate efficient code. Numba is built around optimizing access to NumPy arrays, not lists.
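
As a hedged illustration of that point (the example below simply drops the lookup tables altogether), the offsets could instead be precomputed as typed NumPy arrays, so the element type is explicit:

import numpy as np

# Illustration only: build the kernel offset tables as int64 NumPy arrays
# instead of Python lists, so a jitted loop reading xRows[k] / yCols[k]
# sees a known element type.
h_kernel, w_kernel = 3, 3  # e.g. a 3x3 Sobel kernel
xRows = np.empty(h_kernel * w_kernel, dtype=np.int64)
yCols = np.empty(h_kernel * w_kernel, dtype=np.int64)
k = 0
for i in range(w_kernel):
    for j in range(h_kernel):
        xRows[k] = i
        yCols[k] = j
        k += 1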

Finally, I found that the fastmath flag makes this code about 5% faster.

Example:

@jit(nopython=True)
def cal_padding_nb(input, padding):
    output = np.zeros((input.shape[0]+ padding*2, input.shape[1]+ padding*2))
    output[ padding :  padding + input.shape[0],  padding :  padding + input.shape[1]] = input
    return output

@jit(nopython=True, fastmath=True)
def conv_per_channel_nb(data, kernel, padding=1, stride=1):
    h, w = data.shape
    data = cal_padding_nb(data, padding)
    
    kernel_size = kernel.shape
    h_output = int((h - kernel_size[0] + 2*padding + stride) / stride)
    w_output = int((w - kernel_size[1] + 2*padding + stride) / stride)
    
    tranformed = np.zeros((h_output, w_output), dtype=data.dtype)
    h_kernel, w_kernel = kernel_size

    for i in range(h_output):
      for j in prange(w_output):
        new_val = 0.0
        for a in range(w_kernel):
          for b in range(h_kernel):
            pixel_in_filter_x = a
            pixel_in_filter_y = b

            pixel_in_input_x = pixel_in_filter_x + i
            pixel_in_input_y = pixel_in_filter_y + j
            new_val = new_val + (data[pixel_in_input_x, pixel_in_input_y] * kernel[pixel_in_filter_x][pixel_in_filter_y])
        if new_val > 255:
          new_val = 255
        elif new_val < 0:
          new_val = 0
        tranformed[i,j] = new_val
      
    return tranformed

I find this to be much faster than the original non-numba version.

Timings, ignoring the first iteration of the numba version:

Original, without numba: 11.3 ms ± 92.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Numba, without lists: 31 µs ± 824 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
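
For reference, a minimal sketch of how one might time the compiled function while excluding the first, compiling call (this assumes conv_per_channel_nb from the example above is already defined):

import time
import numpy as np

# Warm-up call: the first invocation includes JIT compilation,
# so it is kept outside the timed loop.
channel = np.random.randint(0, 256, (32, 32)).astype(np.float64)
sobelX = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
conv_per_channel_nb(channel, sobelX)

runs = 1000
start = time.perf_counter()
for _ in range(runs):
    conv_per_channel_nb(channel, sobelX)
end = time.perf_counter()
print((end - start) / runs * 1e6, "µs per call")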