CUDA:对未签名字符的原子操作

问题描述 投票:2回答:2

我是CUDA初学者。我在全局内存中有一个无符号字符的像素缓冲区,该缓冲区可以被任何线程和所有线程更新。因此,为了避免像素值怪异,我想在线程尝试更新一个时执行atomicExch。但是编程指南说该功能仅适用于32位或64位字,而我只想原子交换一个8位字节。有办法吗?

谢谢。

cuda atomic
2个回答
1
投票

您可能使用互斥量变量实现关键部分。所以像

get_the_lock
exch_data
release

http://forums.nvidia.com/index.php?showtopic=185809

Implementing a critical section in CUDA


0
投票

我最近刚遇到这个问题。从理论上讲,原子操作/乐观重试应该比锁/互斥体快,因此对其他数据类型使用原子操作的“ hack”解决方案对我而言似乎比使用关键节好。

这里有一些基于how to implement atomicMin for charatomicAdd for short的线程的实现。

我已经测试了所有这些,并且测试似乎表明它们到目前为止可以正常工作。

atomicAdd的第1版,用于char

__device__ static inline char atomicAdd(char* address, char val) {
    // offset, in bytes, of the char* address within the 32-bit address of the space that overlaps it
    size_t long_address_modulo = (size_t) address & 3;
    // the 32-bit address that overlaps the same memory
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    // A 0x3210 selector in __byte_perm will simply select all four bytes in the first argument in the same order.
    // The "4" signifies the position where the first byte of the second argument will end up in the output.
    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
    // for selecting bytes within a 32-bit chunk that correspond to the char* address (relative to base_address)
    unsigned int selector = selectors[long_address_modulo];
    unsigned int long_old, long_assumed, long_val, replacement;

    long_old = *base_address;

    do {
        long_assumed = long_old;
        // replace bits in long_old that pertain to the char address with those from val
        long_val = __byte_perm(long_old, 0, long_address_modulo) + val;
        replacement = __byte_perm(long_old, long_val, selector);
        long_old = atomicCAS(base_address, long_assumed, replacement);
    } while (long_old != long_assumed);
    return __byte_perm(long_old, 0, long_address_modulo);
}

char的atomicCAS

__device__ static inline char atomicCAS(char* address, char expected, char desired) {
    size_t long_address_modulo = (size_t) address & 3;
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};

    unsigned int sel = selectors[long_address_modulo];
    unsigned int long_old, long_assumed, long_val, replacement;
    char old;

    long_val = (unsigned int) desired;
    long_old = *base_address;
    do {
        long_assumed = long_old;
        replacement = __byte_perm(long_old, long_val, sel);
        long_old = atomicCAS(base_address, long_assumed, replacement);
        old = (char) ((long_old >> (long_address_modulo * 8)) & 0x000000ff);
    } while (expected == old && long_assumed != long_old);

    return old;
}

atomicAdd的版本2用于char(使用位移而不是__byte_perm,并且必须处理溢出)

__device__ static inline char atomicAdd2(char* address, char val) {
    size_t long_address_modulo = (size_t) address & 3;
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    unsigned int long_val = (unsigned int) val << (8 * long_address_modulo);
    unsigned int long_old = atomicAdd(base_address, long_val);

    if (long_address_modulo == 3) {
        // the first 8 bits of long_val represent the char value,
        // hence the first 8 bits of long_old represent its previous value.
        return (char) (long_old >> 24);
    } else {
        // bits that represent the char value within long_val
        unsigned int mask = 0x000000ff << (8 * long_address_modulo);
        unsigned int masked_old = long_old & mask;
        // isolate the bits that represent the char value within long_old, add the long_val to that,
        // then re-isolate by excluding bits that represent the char value
        unsigned int overflow = (masked_old + long_val) & ~mask;
        if (overflow) {
            atomicSub(base_address, overflow);
        }
        return (char) (masked_old >> 8 * long_address_modulo);
    }
}

对于atomicMin,请检查this thread

© www.soinside.com 2019 - 2024. All rights reserved.