我有一块 Google Coral Dev Micro 开发板,搭载 RT1176 SoC(800MHz Cortex-M7 和 400MHz Cortex-M4)。M7 运行 FreeRTOS,M4 运行裸机程序,使用 arm-none-eabi GCC 9.3.1 进行编译,编译标志如下:
-Wall -Wno-psabi -mthumb -fno-common -ffunction-sections -fdata-sections -ffreestanding -fno-builtin -mapcs-frame --specs=nano.specs --specs=nosys.specs -u _printf_float -std=gnu99 -g -Os -save-temps -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -DNDEBUG
我想使用共享内存在两个内核之间传输定时器中断中生成的数据,并且我在共享内存中创建了自己的无锁 FIFO 缓冲区:
fifo.h:
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
// Fixed-capacity ring buffer for exactly one producer and one consumer.
//
// Only put() writes write_idx and only get()/try_get() write read_idx, and each
// index update is a separate load followed by a store. Calling put() from more
// than one context, or get()/try_get() from more than one context, is therefore
// not safe.
//
// One slot is always left unused so that "empty" (read_idx == write_idx) and
// "full" can be told apart; at most num_elements - 1 entries are stored.
//
// NOTE(review): all synchronisation relies on std::atomic with its default
// ordering. Whether the instructions the compiler emits for that are sufficient
// between the two cores that share this object must be verified on the target.
template <typename T, std::size_t num_elements>
class LockLessFifo {
  static_assert(num_elements >= 2,
                "LockLessFifo needs at least two slots: one is always kept free");

 public:
  // Starts empty. The storage is value-initialised, so this works for any T
  // that std::atomic accepts, not only for types constructible from 0.
  LockLessFifo() : read_idx(0), write_idx(0), buffer{} {}

  // Number of entries currently stored.
  //
  // Each index is loaded exactly once. Re-reading an index between the
  // comparison and the subtraction would let the other side move it in between
  // and make the unsigned arithmetic wrap around.
  std::size_t size() const {
    const std::size_t r = read_idx.load();
    const std::size_t w = write_idx.load();
    return (r <= w) ? (w - r) : (num_elements - (r - w));
  }

  // Number of slots in the underlying storage (not the number of usable
  // entries, which is one less).
  constexpr std::size_t capacity() const { return num_elements; }

  // Appends `entry`, busy-waiting while the FIFO is full.
  void put(const T& entry) {
    // Wait until a slot other than the reserved one is free.
    while ((num_elements - size()) < 2) {
    }
    const std::size_t next = (write_idx.load() + 1) % num_elements;
    buffer[next].store(entry);
    // Publish the new write index only after the data is in place.
    write_idx.store(next);
  }

  // Non-blocking read. Stores the oldest entry in `out` and returns true, or
  // returns false and leaves `out` untouched when the FIFO is empty.
  bool try_get(T& out) {
    const std::size_t r = read_idx.load();
    if (r == write_idx.load()) {
      return false;
    }
    const std::size_t next = (r + 1) % num_elements;
    out = buffer[next].load();
    // Release the slot to the producer only after the value has been copied.
    read_idx.store(next);
    return true;
  }

  // Removes and returns the oldest entry.
  //
  // With block == true (the default) this busy-waits until data is available.
  // With block == false an empty FIFO yields T(-1); that value cannot be
  // distinguished from a stored entry with the same value, so prefer try_get()
  // when the payload may legitimately be T(-1).
  T get(const bool block = true) {
    T value{};
    while (!try_get(value)) {
      if (!block) {
        return static_cast<T>(-1);
      }
    }
    return value;
  }

 private:
  std::atomic<std::size_t> read_idx;   // slot most recently consumed
  std::atomic<std::size_t> write_idx;  // slot most recently produced
  std::array<std::atomic<T>, num_elements> buffer;
};
shared_fifo.h
#include "fifo.h"
// Element type carried through the FIFO.
using shared_element_t = uint32_t;
// Byte budget used to size the FIFO.
constexpr std::size_t SHARED_MEMORY_SIZE = 0x1700; // Manually calculated free space using map file
// Number of elements that fit into SHARED_MEMORY_SIZE bytes (0x1700 / 4 = 1472 for a 4-byte element).
constexpr std::size_t SHARED_MEMORY_ELEMENTS = SHARED_MEMORY_SIZE / sizeof(shared_element_t);
// The FIFO instance, placed in the ".noinit.$rpmsg_sh_mem" section.
// NOTE(review): LockLessFifo also stores its two index members next to the
// element array, so sizeof(shared_fifo) is larger than SHARED_MEMORY_SIZE —
// confirm the section has room for the extra bytes.
// NOTE(review): this is a definition in a header, so every translation unit
// that includes it defines its own `shared_fifo` (and runs its constructor);
// presumably each core's image includes it exactly once and the linker scripts
// place the section at the same address on both cores — confirm.
LockLessFifo<shared_element_t, SHARED_MEMORY_ELEMENTS> shared_fifo __attribute__((section(".noinit.$rpmsg_sh_mem")));
我并不期望 FIFO 代码是完美的,但它是实用的。我只是分享了它的完整内容并提供了所有背景信息。
从m7读取FIFO如下: main_m7.cpp
#include <cstdio>
#include "libs/base/ipc_m7.h"
#include "shared_fifo.h"
// M7 entry point: calls IpcM7::StartM4() (presumably boots the M4 image —
// confirm) and then prints values taken from the shared FIFO forever.
//
// `param` is unused. The function never returns.
extern "C" [[noreturn]] void app_main(void* param) {
  (void)param;
  coralmicro::IpcM7::GetSingleton()->StartM4();
  while (true) {
    // The M4 side pushes counter1 and counter2 alternately, so two blocking
    // reads are taken per printed line.
    const uint32_t counter1 = shared_fifo.get();  // Works
    const uint32_t counter2 = shared_fifo.get();  // Always 0
    const std::size_t fifo_size = shared_fifo.size();
    constexpr std::size_t fifo_capacity = shared_fifo.capacity();
    // uint32_t and std::size_t are not guaranteed to be `unsigned long` and
    // `unsigned int`; cast so the arguments always match %lu / %u.
    printf("[M7] counter: %lu/%lu size: %u/%u\r\n",
           static_cast<unsigned long>(counter1),
           static_cast<unsigned long>(counter2),
           static_cast<unsigned int>(fifo_size),
           static_cast<unsigned int>(fifo_capacity));
  }
}
m4 上的 FIFO 填充如下:
main_m4.cpp
#include <atomic>
#include <cmath>
#include <cstdio>
#include "fsl_pit.h"
#include "shared_fifo.h"
// Counters incremented from the PIT interrupt and sampled by the main loop.
// Both start at zero.
static std::atomic<std::size_t> counter1{0};
static std::atomic<std::size_t> counter2{0};
// Interrupt routine for PIT1 channel 0: acknowledges the timer flag and
// increments both counters, each in a different way.
void my_pit_irq() {
// Clear IRQ flag
PIT_ClearStatusFlags(PIT1, kPIT_Chnl_0, kPIT_TimerFlag);
// Atomic load followed by a separate atomic store; the increment as a whole
// is not a single atomic read-modify-write.
counter1.store(counter1.load() + 1); // Works
// Single atomic read-modify-write.
counter2.fetch_add(1); // Always 0
// NOTE(review): SDK-provided macro; presumably a barrier that lets the flag
// clear take effect before the handler returns — confirm against the SDK.
SDK_ISR_EXIT_BARRIER;
}
// M4 entry point: starts the timer, then keeps feeding the current values of
// both counters into the shared FIFO. `param` is unused; never returns.
extern "C" [[noreturn]] void app_main(void* param) {
  static_cast<void>(param);
  configure_and_start_timer();  // removed for clarity
  for (;;) {
    const std::size_t first_sample = counter1.load();
    shared_fifo.put(first_sample);  // Works
    const std::size_t second_sample = counter2.load();
    shared_fifo.put(second_sample);  // Always 0
  }
}
这段代码的输出是:
(...)
[M7] counter: 71666076/0 size: 1470/1472
[M7] counter: 71666076/0 size: 1470/1472
[M7] counter: 71666077/0 size: 1470/1472
[M7] counter: 71666077/0 size: 1470/1472
[M7] counter: 71666077/0 size: 1470/1472
(...)
为什么不正确的增量方式(counter1)能正常工作,而正确的增量方式(counter2)却不起作用?我查看了反汇编代码:
PIT 中断:
// Compiler output for my_pit_irq().
_Z10my_pit_irqv:
// Clear IRQ flag
ldr r3, .L3 // r3 = first literal at .L3 (1074626560 = 0x400D8000)
movs r2, #1
str r2, [r3, #268] // Write 1 to [r3 + 0x10C]
// counter1.store(counter1.load() + 1)
ldr r2, .L3+4 // Load .LANCHOR0
dmb ish
ldr r3, [r2] // Load value stored at address of .LANCHOR0 (.load())
dmb ish
adds r3, r3, #1 // Increment by 1
dmb ish
str r3, [r2] // Store incremented value (.store())
// counter2.fetch_add(1)
ldr r3, .L3+8 // Load .LANCHOR1
dmb ish
dmb ish // Double barrier? (presumably the barrier ending the store above plus the one opening the read-modify-write below)
.L2:
ldrex r2, [r3] // load (exclusive)
adds r2, r2, #1 // increment
strex r1, r2, [r3] // store (exclusive); r1 receives the status
cmp r1, #0 // check
bne .L2 // retry
dmb ish
dsb 0xF // presumably emitted for SDK_ISR_EXIT_BARRIER
bx lr
.L4:
.align 2
// Literal pool
.L3:
.word 1074626560
.word .LANCHOR0 // counter1
.word .LANCHOR1 // counter2
主函数(app_main):
// Compiler output for the M4 app_main() loop.
app_main:
push {r0, r1, r2, lr}
ldr r6, .L14 // Counter1
ldr r4, .L14+4 // Address of shared_fifo (second literal at .L14)
ldr r5, .L14+8 // Counter2
.L13:
// This is the same as for counter2
add r1, sp, #4 // r1 = address of the stack slot passed as the put() argument
ldr r3, [r6] // Load the value of counter1
dmb ish
mov r0, r4 // r0 = shared_fifo
str r3, [sp, #4] // Copy the loaded value into the stack slot
bl _ZN5LockLessFifoImLj1472EE3putERKm // Put it in the FIFO
dmb ish
// This is the same as for counter1
add r1, sp, #4 // r1 = address of the stack slot passed as the put() argument
ldr r3, [r5] // Load the value of counter2
dmb ish
mov r0, r4 // r0 = shared_fifo
str r3, [sp, #4] // Copy the loaded value into the stack slot
bl _ZN5LockLessFifoImLj1472EE3putERKm // Put it in the FIFO
b .L13 // Loop forever
.L15:
.align 2
// Literal pool
.L14:
.word .LANCHOR0
.word _ZN511shared_fifoE
.word .LANCHOR1
我不明白哪里出了问题。我知道 FIFO 本身可以正常工作,因为 counter1 的递增是有效的,终端中打印的数字也在不断增大。两段生成的汇编代码非常相似,在我看来也是正确的。不幸的是,我(暂时)还无法连接调试器,因为需要先把 JTAG 排针焊接到板上,而我们要等一段时间才能对板子做任何改动。
感谢您抽出时间来查看。
在评论中 @NateEldredge 的帮助下,我找到了答案:C++ 原子操作在这个硬件平台上无法被编译成正确的代码。编译器生成的是
dmb ish
指令,它只同步内部可共享域(inner shareable domain)。
我通过在每个核心上增加共享计数器 1000000 次来验证这一点,结果不等于 2000000。
这也许可以通过手动将
dmb ish
替换为 dmb osh
指令来修复,但这种简单粗暴的修补并不够:
// the attempt that didn't work
// Increments the shared counter one million times with a `dmb osh` barrier
// before and after each increment (presumably run on both cores at once —
// confirm). The barriers only order memory accesses; the increment between
// them is left as it was and, per the note on that line, is still not atomic
// with respect to the other core.
for (size_t i = 0; i < 1000000; i++) {
// "memory" clobber: the compiler may not move memory accesses across the barrier.
asm("dmb osh" : : : "memory");
unica::shared_fifo.shared_counter++; // not actually atomic wrt. other core
asm("dmb osh" : : : "memory");
}
不幸的是,我只能放弃使用
std::atomic
,改回使用 volatile,
并在必要时屏蔽 IRQ。