使用 Numba 的代码比不使用 Numba 的代码慢,我不知道发生了什么。请指导我。
#with numba
import numpy as np
from numba import njit, prange
import time
(X_train, Y_train), (X_val, Y_val) = tf.keras.datasets.cifar10.load_data()
@njit(nopython= True)
def cal_padding(input, padding):
output = np.zeros((input.shape[0]+ padding*2, input.shape[1]+ padding*2))
output[ padding : padding + input.shape[0], padding : padding + input.shape[1]] = input
return output
@njit(nopython = True)
def conv_per_channel(data, kernel, padding = 1, stride = 1): #input ở đây là từng channel trong ảnh, kernel cũng là từng channel trong bộ
h, w = data.shape
data = cal_padding(data, padding)
kernel_size = kernel.shape
h_output = int((h - kernel_size[0] + 2*padding + stride) / stride)
w_output = int((w - kernel_size[1] + 2*padding + stride) / stride)
tranformed = np.zeros((h_output, w_output), dtype=data.dtype)
h_kernel, w_kernel = kernel_size
xRows = []
yCols = []
for i in range(w_kernel):
for j in range(h_kernel):
yCols.append(j)
xRows.append(i)
for i in range(h_output):
for j in prange(w_output):
root_pixel = [i,j]
new_val = 0.0
for k in range(w_kernel * h_kernel):
pixel_in_filter_x = xRows[k]
pixel_in_filter_y = yCols[k]
pixel_in_input_x = pixel_in_filter_x + root_pixel[0]
pixel_in_input_y = pixel_in_filter_y + root_pixel[1]
new_val = new_val + (data[pixel_in_input_x, pixel_in_input_y] * kernel[pixel_in_filter_x][pixel_in_filter_y])
if new_val > 255:
new_val = 255
elif new_val < 0:
new_val = 0
tranformed[i,j] = new_val
return tranformed
X_train = np.transpose(X_train, (0,3,1,2))
X_train500 = X_train[0:500]
print(X_train[0][0].shape)
sobelX = np.array([[-1,0,1], [-2,0,2],[-1,0,1]])
start = time.time()
conv_per_channel(X_train[0][0], sobelX)
end = time.time()
print(end-start)
#without numba
import numpy as np
import tensorflow as tf
from numba import jit, prange
import time
(X_train, Y_train), (X_val, Y_val) = tf.keras.datasets.cifar10.load_data()
# @jit(nopython= True)
def cal_padding(input, padding):
output = np.zeros((input.shape[0]+ padding*2, input.shape[1]+ padding*2))
output[ padding : padding + input.shape[0], padding : padding + input.shape[1]] = input
return output
# @jit(nopython = True)
def conv_per_channel(data, kernel, padding = 1, stride = 1): #input ở đây là từng channel trong ảnh, kernel cũng là từng channel trong bộ
h, w = data.shape
data = cal_padding(data, padding)
kernel_size = kernel.shape
h_output = int((h - kernel_size[0] + 2*padding + stride) / stride)
w_output = int((w - kernel_size[1] + 2*padding + stride) / stride)
tranformed = np.zeros((h_output, w_output), dtype=data.dtype)
h_kernel, w_kernel = kernel_size
xRows = []
yCols = []
for i in range(w_kernel):
for j in range(h_kernel):
yCols.append(j)
xRows.append(i)
for i in range(h_output):
for j in prange(w_output):
root_pixel = [i,j]
new_val = 0.0
for k in range(w_kernel * h_kernel):
pixel_in_filter_x = xRows[k]
pixel_in_filter_y = yCols[k]
pixel_in_input_x = pixel_in_filter_x + root_pixel[0]
pixel_in_input_y = pixel_in_filter_y + root_pixel[1]
new_val = new_val + (data[pixel_in_input_x, pixel_in_input_y] * kernel[pixel_in_filter_x][pixel_in_filter_y])
if new_val > 255:
new_val = 255
elif new_val < 0:
new_val = 0
tranformed[i,j] = new_val
return tranformed
X_train = np.transpose(X_train, (0,3,1,2))
X_train500 = X_train[0:500]
print(X_train[0][0].shape)
sobelX = np.array([[-1,0,1], [-2,0,2],[-1,0,1]])
start = time.time()
conv_per_channel(X_train[0][0], sobelX)
end = time.time()
print(end-start)
使用 Numba
@jit()
装饰器此代码运行速度较慢!
我尝试使用 (parallel=True) 或 (cache = True) 但它甚至更慢。
只是为了帮助更好地理解这段代码的目的:
#with numba
(32, 32)
0.6485073566436768
#without numba
(32, 32)
0.007578611373901367
这会让 Numba 优化器感到困惑:
xRows = []
yCols = []
for i in range(w_kernel):
for j in range(h_kernel):
yCols.append(j)
xRows.append(i)
...
pixel_in_filter_x = xRows[k]
pixel_in_filter_y = yCols[k]
Python中的列表非常灵活,可以容纳多种不同类型的对象。从性能角度来看,问题在于 Numba 不知道
pixel_in_filter_x
的类型。它需要这些信息来生成有效的代码。 Numba 是围绕优化对 NumPy 数组(而不是列表)的访问而构建的。
最后,我发现
fastmath
标志使此代码速度提高了约 5%。
示例:
@jit(nopython= True)
def cal_padding_nb(input, padding):
output = np.zeros((input.shape[0]+ padding*2, input.shape[1]+ padding*2))
output[ padding : padding + input.shape[0], padding : padding + input.shape[1]] = input
return output
@jit(nopython = True, fastmath=True)
def conv_per_channel_nb(data, kernel, padding = 1, stride = 1):
h, w = data.shape
data = cal_padding_nb(data, padding)
kernel_size = kernel.shape
h_output = int((h - kernel_size[0] + 2*padding + stride) / stride)
w_output = int((w - kernel_size[1] + 2*padding + stride) / stride)
tranformed = np.zeros((h_output, w_output), dtype=data.dtype)
h_kernel, w_kernel = kernel_size
for i in range(h_output):
for j in prange(w_output):
new_val = 0.0
for a in range(w_kernel):
for b in range(h_kernel):
pixel_in_filter_x = a
pixel_in_filter_y = b
pixel_in_input_x = pixel_in_filter_x + i
pixel_in_input_y = pixel_in_filter_y + j
new_val = new_val + (data[pixel_in_input_x, pixel_in_input_y] * kernel[pixel_in_filter_x][pixel_in_filter_y])
if new_val > 255:
new_val = 255
elif new_val < 0:
new_val = 0
tranformed[i,j] = new_val
return tranformed
我发现这比原始的非 numba 版本快得多。
计时,忽略 numba 版本的第一次迭代:
Original, without numba: 11.3 ms ± 92.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Numba, without lists: 31 µs ± 824 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)