使用 pyopencl、arrayfire 或其他 python opencl 库根据欧氏距离制作掩模

问题描述 投票:0回答:2

我正在给定坐标周围做 2D 或 3D 二进制掩码,然后使用 scipy.ndimage.label 将它们识别为标签。 现在,我有一个 cupy 解决方案,一个 numpy 解决方案。 Cupy 很快,numpy 很慢,两者都各司其职。我正在尝试让它在其他非 nvidia GPU 或 CPU 上运行,使用 openCl 或任何其他更“跨平台”的形式,例如 pyopencl 或 numba,甚至是tensorflow(或以上所有)

这是我当前的功能:

def make_labels_links(shape,j,radius=5,_algo="GPU"):
    import scipy.ndimage as ndi

    if _algo == "GPU":
        import cupy as cp

        if 'z' in j:
            pos = cp.dstack((j.z,j.y,j.x))[0]#.astype(int)
            print("3D",j)
        else:
            pos = cp.dstack((j.y,j.x))[0]#.astype(int)
            print("2D",j)

        ndim = len(shape)
        # radius = validate_tuple(radius, ndim)
        pos = cp.atleast_2d(pos)

        # if include_edge:
        in_mask = cp.array([cp.sum(((cp.indices(shape).T - p) / radius)**2, -1) <= 1
                    for p in pos])
        # else:
        #     in_mask = [np.sum(((np.indices(shape).T - p) / radius)**2, -1) < 1
        #                for p in pos]
        mask_total = cp.any(in_mask, axis=0).T

        ##if they overlap the labels won't match the points
        #we can make np.ones * ID of the point and then np.max(axis=-1)
        labels, nb = ndi.label(cp.asnumpy(mask_total))
        
    #this is super slow
    elif _algo=='CPU':
        if 'z' in j:
            # "Need to loop each t and do one at a time"
            pos = np.dstack((j.z,j.y,j.x))[0]#.astype(int)
            print("3D",j)
        else:
            pos = np.dstack((j.y,j.x))[0]#.astype(int)
            print("2D",j)

        ndim = len(shape)
        pos = np.atleast_2d(pos)
        in_mask = np.array([np.sum(((np.indices(shape).T - p) / radius)**2, -1) <= 1
                    for p in pos])
        mask_total = np.any(in_mask, axis=0).T

        labels, nb = ndi.label(mask_total)

形状例如 (1024,1024) 或 (20,1024,1024) j 是 pandas 数据框的位置 例如:

    ID  Frame   z   y   x   mass    size    ecc
0   14  0   9.121369    24.139295   297.156056  5311.661985 1.383264    NaN
1   7   0   8.270742    24.880990   308.757851  6341.082129 1.403242    NaN
2   1   0   4.881552    28.963021   291.900410  6587.918081 1.330116    NaN
3   8   0   8.015537    31.137655   304.999176  5423.140504 1.341228    NaN
4   15  0   10.910010   32.275774   298.940183  6033.069086 1.333235    NaN
5   4   0   7.057360    33.958352   313.252877  8123.989610 1.395905    NaN
6   9   0   7.922786    39.966382   308.092925  6683.742777 1.398941    NaN
7   5   0   7.163435    40.774834   329.106137  5225.643216 1.402079    NaN
8   10  0   7.665061    49.844910   306.998559  6054.428176 1.408336    NaN
9   11  0   7.909787    50.951857   320.125061  5057.551819 1.314976    NaN
10  6   0   7.148229    52.051917   312.929513  9427.345719 1.351461    NaN
11  12  0   7.599882    55.917379   303.916436  6220.891194 1.401507    NaN
12  0   0   3.992557    59.060168   298.188021  7489.421410 1.391892    NaN
13  13  0   7.684166    62.945226   303.940392  7970.470413 1.384929    NaN
14  3   0   6.102017    70.796726   316.141957  5091.926046 1.393204    NaN
15  2   0   4.937488    87.916144   192.836998  5068.320508 1.330015    NaN

我尝试使用 pyopencl 来实现,但无法实现,有什么建议吗? 谢谢

python numba cupy pyopencl arrayfire
2个回答
0
投票

尝试用数组操作替换

for
循环。例如,在掩码计算中,我们可以按如下方式重新排列平方和:

inarr = np.indices(shape).T
a = np.sum(inarr, axis=-1, keepdims=True)
a2 = np.sum(inarr ** 2, axis=-1, keepdims=True)

b = pos
for _ in range(len(shape)):
    b = np.expand_dims(b, axis=0)
b2 = b ** 2

sq_sum = a2 - a * 2 * b + b2 * len(shape)

mask_total = np.any(sq_sum <= radius ** 2, axis=-1).T

0
投票

这适用于 pyopencl

def make_labels_links_opencl(shape, j, radius=5):

    import numpy as np
    import pyopencl as cl
    from scipy.ndimage import label
    if 'z' in j:
        # "Need to loop each t and do one at a time"
        positions = np.dstack((j.z,j.y,j.x))[0]#.astype(int)
        print("3D",j)
    else:
        positions = np.dstack((j.y,j.x))[0]#.astype(int)
        print("2D",j)
    # Prepare data
    mask = np.zeros(shape, dtype=np.uint8)
    positions_flat = positions.flatten().astype(np.float32)
    radius = np.float32(radius)
    
#    platform = cl.get_platforms()
#    my_gpu_devices = platform[1].get_devices(device_type=cl.device_type.GPU)
#    ctx = cl.Context(devices=my_gpu_devices)

    # PyOpenCL setup
     ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Create buffers
    mask_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=mask)
    positions_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=positions_flat)
    
    # Kernel code
    kernel_code = """
    __kernel void fill_mask(__global uchar *mask, __global const float *positions, const float radius, const int num_positions) {
        int x = get_global_id(0);
        int y = get_global_id(1);
        int z = get_global_id(2);
        int width = get_global_size(0);
        int height = get_global_size(1);
        int depth = get_global_size(2);
        int idx = x + y * width + z * width * height;
        
        for (int i = 0; i < num_positions; ++i) {
            float pz = positions[i * 3];
            float py = positions[i * 3 + 1];
            float px = positions[i * 3 + 2];
            
            float distance = sqrt(pow(px - x, 2) + pow(py - y, 2) + pow(pz - z, 2));
            if (distance <= radius) {
                mask[idx] = 1;
                break;
            }
        }
    }
    """
    
    # Build kernel
    prg = cl.Program(ctx, kernel_code).build()
    
    # Execute kernel
    global_size = shape[::-1]  # Note: PyOpenCL uses column-major order, so we reverse the dimensions
    prg.fill_mask(queue, global_size, None, mask_buf, positions_buf, radius, np.int32(len(positions)))
    
    # Read back the results
    cl.enqueue_copy(queue, mask, mask_buf)
    labels,_=label(mask)
    return labels,positions
© www.soinside.com 2019 - 2024. All rights reserved.