初始化共享内存时,mmap 成功后调用 memset 触发总线错误(SIGBUS)

问题描述 投票:0回答:1

场景是:一个进程将程序指标统计数据写入共享内存文件,另一个进程读取该文件并将数据发送到指标服务器。写入进程启动后的几分钟内,在对共享内存执行 memset 时会发生总线错误(SIGBUS)并产生核心转储。简化后的代码如下:

// Initializes the per-thread counter state and then calls Attach().
//
// Derives the shared-memory directory and object name from the executable
// name and the thread pid, computes the size of the table (one Table header
// followed by max_counter_num_ Entry slots), resets the error state, and
// finally calls Attach(). The return value of Attach() is not inspected here;
// Attach() records any failure in err_ / errno_.
Init() {
    tpid_ = get_thread_pid();
    table_name_ = executable_name();
    max_counter_num_ = 1024;
    MI << "tpid=" << tpid_ << ", table_name=" << table_name_;

    shm_root_ = "/dev/shm/counters";
    shm_dir_ = shm_root_ + "/" + table_name_;

    // Bounded formatting: table_name_ comes from the executable name, so an
    // unbounded sprintf could overflow this fixed-size stack buffer.
    // snprintf truncates instead and always NUL-terminates.
    char buf[1024];
    snprintf(buf, sizeof(buf), "/counters/%s/%d", table_name_.c_str(), tpid_);
    shm_file_ = buf;

    // Layout of the mapping: [Table header][Entry * max_counter_num_].
    table_size_ = max_counter_num_ * sizeof(Entry) + sizeof(Table);
    shm_size_ = table_size_;
    MI << "shm_file_=" << shm_file_ << ", shm_size_=" << shm_size_;

    // Nothing is mapped yet and no error has been recorded.
    table_ = NULL;
    entries_ = NULL;
    err_ = kOk;
    errno_ = 0;

    Attach();
}

int Attach() {
    int err;
    // only one shm file for a given thread
    int oflag = O_CREAT | O_RDWR | O_EXCL;
    int dir_mode = S_IRWXO | S_IRWXU | S_IRWXG;
    int file_mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;

    umask(0);
    err = mkdir(shm_root_.c_str(), dir_mode);
    if (err && errno != EEXIST) {
        MW << "dir=" << shm_root_ << ", mkdir,err=" << errno << ":" << strerror(errno);
        err_ = kSysMkdir;
        errno_ = errno;
        return kSysMkdir;
    }

    err = mkdir(shm_dir_.c_str(), dir_mode);
    if (err && errno != EEXIST) {
        MW << "dir=" << shm_dir_ << ", mkdir,err=" << errno << ":" << strerror(errno);
        err_ = kSysMkdir;
        errno_ = errno;
        return kSysMkdir;
    }

    shm_fd_ = shm_open(shm_file_.c_str(), oflag, file_mode);
    if (shm_fd_ == -1) {
        MW << "file=" << shm_file_ << ", shm_open,err=" << errno << ":" << strerror(errno);
        err_ = kSysShmOpen;
        errno_ = errno;
        return kSysShmOpen;
    }

    err = ftruncate(shm_fd_, shm_size_);
    if (err == -1) {
        err_ = kSysFtruncate;
        errno_ = errno;
        MW << "file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", ftruncate,err=" << errno << ":" << strerror(errno);
        return kSysFtruncate;
    }
    MI << "ftruncate success, return:" << err;

    int proto = PROT_READ | PROT_WRITE;
    int flags = MAP_SHARED;

    void *addr = mmap(NULL, shm_size_, proto, flags, shm_fd_, 0);

    if (addr == MAP_FAILED) {
        MW << "file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", mmap,err=" << errno << ":" << strerror(errno);
        err_ = kSysMmap;
        errno_ = errno;
        return kSysMmap;
    }
    MI << "mmap success shm_file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", mmap_addr=" << addr << ", shm_size_=" << shm_size_;
    memset(addr, 0, shm_size_);
    MI << "mmemset success";
    table_ = (Table *)addr;
    entries_ = (Entry *)((char *)addr + sizeof(Table));

    table_->n_entry = 0;
    table_->max_entry_num = max_counter_num_;
    table_->last_timestamp = time(NULL);
    gethostname(table_->endpoint, MAX_HOSTNAME_LEN);
    MI << "Attach success shm_file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", mmap_addr=" << addr;
    return 0;
}

Attach 方法被成功调用多次之后,其中的 memset 触发了总线错误。日志如下:

I0509 10:34:27.081708 920 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime I0509 10:34:27.081895 920 shm_counter.cpp:52] executable_name=knn_realtime I0509 10:34:27.081923 920 shm_counter.cpp:70] tpid=920, table_name=knn_realtime I0509 10:34:27.081974 920 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/920, shm_size_=8340560 I0509 10:34:27.082068 920 shm_counter.cpp:138] ftruncate success, return:0 I0509 10:34:27.082094 920 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/920, shm_fd=12, mmap_addr=0x7fb4d700b000, shm_size_=8340560 I0509 10:34:27.087743 920 shm_counter.cpp:153] mmemset success I0509 10:34:27.087836 920 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/920, shm_fd=12, mmap_addr=0x7fb4d700b000 I0509 10:34:29.168845 867 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime I0509 10:34:29.189066 867 shm_counter.cpp:52] executable_name=knn_realtime I0509 10:34:29.192279 867 shm_counter.cpp:70] tpid=867, table_name=knn_realtime I0509 10:34:29.192375 867 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/867, shm_size_=8340560 I0509 10:34:29.192492 867 shm_counter.cpp:138] ftruncate success, return:0 I0509 10:34:29.192528 867 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/867, shm_fd=14, mmap_addr=0x7fb4d660b000, shm_size_=8340560 I0509 10:34:29.198365 867 shm_counter.cpp:153] mmemset success I0509 10:34:29.198501 867 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/867, shm_fd=14, mmap_addr=0x7fb4d660b000 I0509 10:34:33.079138 923 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime I0509 10:34:33.079949 923 shm_counter.cpp:52] executable_name=knn_realtime I0509 10:34:33.080663 923 shm_counter.cpp:70] tpid=923, table_name=knn_realtime I0509 10:34:33.080742 923 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/923, shm_size_=8340560 I0509 10:34:33.080799 923 
shm_counter.cpp:138] ftruncate success, return:0 I0509 10:34:33.080823 923 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/923, shm_fd=17, mmap_addr=0x7fb4d5c0b000, shm_size_=8340560 I0509 10:34:33.086556 923 shm_counter.cpp:153] mmemset success I0509 10:34:33.086634 923 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/923, shm_fd=17, mmap_addr=0x7fb4d5c0b000 I0509 10:34:35.251169 853 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime I0509 10:34:35.251964 853 shm_counter.cpp:52] executable_name=knn_realtime I0509 10:34:35.252004 853 shm_counter.cpp:70] tpid=853, table_name=knn_realtime I0509 10:34:35.252027 853 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/853, shm_size_=8340560 I0509 10:34:35.252079 853 shm_counter.cpp:138] ftruncate success, return:0 I0509 10:34:35.252110 853 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/853, shm_fd=19, mmap_addr=0x7fb4d520b000, shm_size_=8340560 I0509 10:34:35.257800 853 shm_counter.cpp:153] mmemset success I0509 10:34:35.257881 853 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/853, shm_fd=19, mmap_addr=0x7fb4d520b000 I0509 10:34:35.505298 868 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime I0509 10:34:35.513530 868 shm_counter.cpp:52] executable_name=knn_realtime I0509 10:34:35.513626 868 shm_counter.cpp:70] tpid=868, table_name=knn_realtime I0509 10:34:35.513657 868 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/868, shm_size_=8340560 I0509 10:34:35.513706 868 shm_counter.cpp:138] ftruncate success, return:0 I0509 10:34:35.513723 868 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/868, shm_fd=21, mmap_addr=0x7fb4d480b000, shm_size_=8340560 I0509 10:34:35.519409 868 shm_counter.cpp:153] mmemset success I0509 10:34:35.519487 868 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/868, shm_fd=21, mmap_addr=0x7fb4d480b000 
I0509 10:34:37.302048 899 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime I0509 10:34:37.308457 899 shm_counter.cpp:52] executable_name=knn_realtime I0509 10:34:37.308708 899 shm_counter.cpp:70] tpid=899, table_name=knn_realtime I0509 10:34:37.308753 899 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/899, shm_size_=8340560 I0509 10:34:37.308815 899 shm_counter.cpp:138] ftruncate success, return:0 I0509 10:34:37.308836 899 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/899, shm_fd=22, mmap_addr=0x7fb4d3e0b000, shm_size_=8340560 I0509 10:34:37.314566 899 shm_counter.cpp:153] mmemset success I0509 10:34:37.314673 899 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/899, shm_fd=22, mmap_addr=0x7fb4d3e0b000 I0509 10:34:38.097653 872 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime I0509 10:34:38.097880 872 shm_counter.cpp:52] executable_name=knn_realtime I0509 10:34:38.097911 872 shm_counter.cpp:70] tpid=872, table_name=knn_realtime I0509 10:34:38.097932 872 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/872, shm_size_=8340560 I0509 10:34:38.097973 872 shm_counter.cpp:138] ftruncate success, return:0 I0509 10:34:38.097998 872 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/872, shm_fd=24, mmap_addr=0x7fb4d340b000, shm_size_=8340560 I0509 10:34:38.103730 872 shm_counter.cpp:153] mmemset success I0509 10:34:38.103808 872 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/872, shm_fd=24, mmap_addr=0x7fb4d340b000 I0509 10:34:41.335608 905 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime I0509 10:34:41.337450 905 shm_counter.cpp:52] executable_name=knn_realtime I0509 10:34:41.337491 905 shm_counter.cpp:70] tpid=905, table_name=knn_realtime I0509 10:34:41.337515 905 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/905, shm_size_=8340560 I0509 10:34:41.337570 905 
shm_counter.cpp:138] ftruncate success, return:0 I0509 10:34:41.337599 905 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/905, shm_fd=26, mmap_addr=0x7fb4d2a0b000, shm_size_=8340560 I0509 10:34:41.343400 905 shm_counter.cpp:153] mmemset success I0509 10:34:41.343483 905 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/905, shm_fd=26, mmap_addr=0x7fb4d2a0b000 I0509 10:34:41.964365 906 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime I0509 10:34:41.965782 906 shm_counter.cpp:52] executable_name=knn_realtime I0509 10:34:41.965902 906 shm_counter.cpp:70] tpid=906, table_name=knn_realtime I0509 10:34:41.965938 906 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/906, shm_size_=8340560 I0509 10:34:41.965989 906 shm_counter.cpp:138] ftruncate success, return:0 I0509 10:34:41.966127 906 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/906, shm_fd=28, mmap_addr=0x7fb4d200b000, shm_size_=8340560
stderr 显示:

*** Aborted at 1715222081 (unix time) try "date -d @1715222081" if you are using GNU date *** PC: @ 0x7fb55cc12720 __memset_sse2 *** SIGBUS (@0x7fb4d2062000) received by PID 35 (TID 0x7fb5049ff700) from PID 18446744072938201088; stack trace: *** @ 0x7fb561e9d6d0 (unknown) @ 0x7fb55cc12720 __memset_sse2 @ 0x7fb56ffd60b1 falcon::ShmCounterImpl::Attach() @ 0x7fb56ffd6a17 falcon::ShmCounterImpl::ShmCounterImpl() @ 0x7fb56ffd7382 __tls_init @ 0x7fb56ffd74b2 falcon::ShmCounter::Set() @ 0x58b2f7 KnnRecallServiceHandler::ProcessThriftFramedRequest() @ 0x7fb56d946b31 brpc::policy::ProcessThriftFramedRequestNoExcept() @ 0x7fb56d942849 brpc::policy::ProcessThriftRequest() @ 0x7fb56d99f36a brpc::ProcessInputMessage() @ 0x7fb56d9a03a8 brpc::InputMessenger::OnNewMessages() @ 0x7fb56d9a7e6d brpc::Socket::ProcessEvent() @ 0x7fb56d87fadf bthread::TaskGroup::task_runner() @ 0x7fb56d88d761 bthread_make_fcontext
内核版本为 4.19.91-2048.3.miks7.x86_64。这些进程运行在同一个 Pod 内的容器中。

c++ linux shared-memory mmap memset
1个回答
0
投票
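/dev/shm 似乎空间不足,它的容量只有 64M:shm 64M 0 64M 0% /dev/shm。共享内存成功初始化了 8 次,第 9 次崩溃:8340560 × 8 / 1024 / 1024 ≈ 63.6M,再初始化一次就会超过 64M 的限制。tmpfs 在页面首次被写入时才真正分配空间,因此 ftruncate 和 mmap 都能成功,空间不足的问题要到 memset 写入时才以 SIGBUS 的形式暴露出来。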

我正在核实,稍后回复。

© www.soinside.com 2019 - 2024. All rights reserved.