Metal - Optimizing memory access


This question has two parts, but they are closely related:

Question 1

Does Metal provide a way to use shared threadgroup memory?

For example, in CUDA you can explicitly load data from device memory into shared memory declared like this:

__shared__ float example1;

Does Metal offer anything like this? It appears that all buffer accesses load from global memory, unless there is some hidden magic going on behind the scenes.

Question 2

This may not be unique to Metal, so perhaps any GPU expert can help. Apple provides a matrix multiplication example here - I'll paste the kernel below for reference:

typedef struct
{
    ushort m, k, n, pbytes, qbytes;
} MetalMatrixDim;


kernel void MatrixMultiply(const device float*       A    [[ buffer(0) ]],
                           const device float*       B    [[ buffer(1) ]],
                           device float*             C    [[ buffer(2) ]],
                           constant MetalMatrixDim&  dims [[ buffer(3) ]],
                           ushort2                   gid  [[ thread_position_in_grid ]])
{
    ushort m = dims.m;
    ushort k = dims.k;
    ushort n = dims.n;

    ushort pbytes = dims.pbytes;
    ushort qbytes = dims.qbytes;

    ushort2 gidIn = ushort2(gid.x << 3, gid.y << 3);

    if (gidIn.x >= m || gidIn.y >= k) return;

    const device float4* a = (const device float4*)(A + gidIn.x);
    const device float4* b = (const device float4*)(B + gidIn.y);

    C = (device float*)((device char*)C + gidIn.x*qbytes);

    device float4* c = (device float4*)(C + gidIn.y);

    const device float4* Bend = (const device float4*)((const device char*)B + qbytes*n);

    float4 s0  = 0.0f, s1  = 0.0f, s2  = 0.0f, s3  = 0.0f;
    float4 s4  = 0.0f, s5  = 0.0f, s6  = 0.0f, s7  = 0.0f;
    float4 s8  = 0.0f, s9  = 0.0f, s10 = 0.0f, s11 = 0.0f;
    float4 s12 = 0.0f, s13 = 0.0f, s14 = 0.0f, s15 = 0.0f;

    do
    {
        float4 aCurr0 = a[0];
        float4 aCurr1 = a[1];
        float4 bCurr0 = b[0];
        float4 bCurr1 = b[1];

        s0   += (aCurr0.x * bCurr0);
        s2   += (aCurr0.y * bCurr0);
        s4   += (aCurr0.z * bCurr0);
        s6   += (aCurr0.w * bCurr0);

        s1   += (aCurr0.x * bCurr1);
        s3   += (aCurr0.y * bCurr1);
        s5   += (aCurr0.z * bCurr1);
        s7   += (aCurr0.w * bCurr1);

        s8   += (aCurr1.x * bCurr0);
        s10  += (aCurr1.y * bCurr0);
        s12  += (aCurr1.z * bCurr0);
        s14  += (aCurr1.w * bCurr0);

        s9   += (aCurr1.x * bCurr1);
        s11  += (aCurr1.y * bCurr1);
        s13  += (aCurr1.z * bCurr1);
        s15  += (aCurr1.w * bCurr1);

        a = (device float4*)((device char*)a + pbytes);
        b = (device float4*)((device char*)b + qbytes);

    } while(b < Bend);

    c[0] = s0;  c[1] = s1;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s2;  c[1] = s3;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s4;  c[1] = s5;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s6;  c[1] = s7;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s8;  c[1] = s9;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s10; c[1] = s11; c = (device float4*)((device char*)c + qbytes);
    c[0] = s12; c[1] = s13; c = (device float4*)((device char*)c + qbytes);
    c[0] = s14; c[1] = s15;
}

Question: for each thread, this kernel computes an 8 x 8 sector of the output C. What is the reason for that? Why not let each thread compute a single element of C, which would remove the multiple-of-8 size restriction and give better parallelization for smaller matrices?

I assume this implementation must be an optimization of some kind, and my guess is that it has to do with thread synchronization and memory access - which is why I've bundled it with Question 1. Any thoughts?

ios gpu gpgpu metal
2 Answers
3 votes

I don't think there is any relationship between your two questions. Regarding Question 1: yes, Metal provides shared threadgroup memory in compute functions. Simply specify the threadgroup address space qualifier on the variable declaration. For example:

threadgroup float example1;

You can also specify a threadgroup buffer as an input argument to a compute function:

kernel void my_func(...,
                    threadgroup float *example2 [[threadgroup(0)]],
                    ...)
{
    ...
}

The buffer is allocated by the device. Its size is set with the compute command encoder's -setThreadgroupMemoryLength:atIndex: method.
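
For illustration, here is a minimal sketch of how such a threadgroup buffer might be used once the host has bound it; the kernel name, buffer indices, and the reduction it performs are hypothetical. Each thread stages one element from device memory into the threadgroup tile, a barrier makes the tile visible to the whole threadgroup, and the data can then be reused from fast threadgroup memory instead of being re-read from device memory:

#include <metal_stdlib>
using namespace metal;

// Hypothetical kernel: each threadgroup stages a tile in threadgroup memory and
// reduces it. The host sets the size of the buffer bound at [[ threadgroup(0) ]]
// with -setThreadgroupMemoryLength:atIndex: (threads per threadgroup * sizeof(float)).
kernel void tile_sum(const device float *input  [[ buffer(0) ]],
                     device float       *output [[ buffer(1) ]],
                     threadgroup float  *tile   [[ threadgroup(0) ]],
                     uint                gid    [[ thread_position_in_grid ]],
                     uint                lid    [[ thread_position_in_threadgroup ]],
                     uint                lsize  [[ threads_per_threadgroup ]],
                     uint                group  [[ threadgroup_position_in_grid ]])
{
    // Stage one element per thread from device (global) memory into the shared tile.
    tile[lid] = input[gid];

    // Wait until every thread in the threadgroup has written its element.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // One thread per threadgroup reduces the tile and writes the partial sum.
    if (lid == 0)
    {
        float sum = 0.0f;
        for (uint i = 0; i < lsize; ++i)
            sum += tile[i];
        output[group] = sum;
    }
}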


0 votes

It's worth adding that if you know how long the array inside the kernel needs to be, you can allocate it directly in the kernel like this:

threadgroup float example2[50];

When you don't know, and only the host can decide the threadgroup memory length, you use the parameter form

kernel void my_func(..., threadgroup float *example2 [[threadgroup(0)]], ...)

together with -setThreadgroupMemoryLength:atIndex: on the host side.

Also, threadgroup float example1; carries some overhead, since inside the kernel you only need the address space attribute to qualify a pointer variable.
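
To make the two cases concrete, here is a minimal sketch of the statically sized variant; the kernel name, the fixed size of 50, and the assumption that the threadgroup contains exactly 50 threads are all placeholders. Because the array size is known at compile time, no -setThreadgroupMemoryLength:atIndex: call is needed on the host:

// Hypothetical kernel: the threadgroup array is declared in the kernel body with a
// compile-time size, so the host does not have to provide its length.
kernel void static_tile_example(const device float *input  [[ buffer(0) ]],
                                device float       *output [[ buffer(1) ]],
                                uint                gid    [[ thread_position_in_grid ]],
                                uint                lid    [[ thread_position_in_threadgroup ]])
{
    threadgroup float tile[50];   // statically sized threadgroup memory

    // Assumes 50 threads per threadgroup: each thread stages one element.
    tile[lid] = input[gid];
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Every thread can now read any element its threadgroup staged, e.g. reversed.
    output[gid] = tile[49 - lid];
}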
