gcc为什么选择最基本的memset()实现?

问题描述 投票:0回答:1

我的裸机程序手动调用

memset()
将整个/对齐的 4k 页面归零(我没有使用 uint64_t,而是另一个 8 字节的东西):

  uint64_t something[512] __attribute__((aligned(4096)));
  memset(something, 0x0, 4096);

我正在编译类似于这个...

%> /path/to/gcc-11.1.0/bin/aarch64-unknown-elf-gcc \
     -O3 \
     -std=gnu99 \
     -nostartfiles \
     --specs=nano.specs \
     -march=armv8.1-a \
     -Wl,--gc-sections \
     -Tlinker_script.lds \
     my_code.c \
     -o my_code.elf

当我反汇编并查看链接的

memset()
时,发现它是最基本的、通用的、逐字节写入的实现:

000000004010399c <memset>:
      s = (char*)aligned_addr;
    }

#endif /* not PREFER_SIZE_OVER_SPEED */

  while (n--)
    4010399c:   d2800003    mov x3, #0x0                    // #0
    401039a0:   eb03005f    cmp x2, x3
    401039a4:   54000041    b.ne    401039ac <memset+0x10>  // b.any
    *s++ = (char) c;

  return m;
}
    401039a8:   d65f03c0    ret
    *s++ = (char) c;
    401039ac:   38236801    strb    w1, [x0, x3]
    401039b0:   91000463    add x3, x3, #0x1
    401039b4:   17fffffb    b   401039a0 <memset+0x4>

我期待使用

stp
或矢量指令的 aarch64 优化版本。我的编译器有一个
/path/to/gcc-11.1.0/newlib-nano
子目录。

我已经删除了

--specs=nano.specs
并摆弄了各种选项,但我不确定我能在这里做什么......

我怎样才能得到优化的

memset()
实现?

请注意,我使用

/path/to/gcc-11.1.0/bin/aarch64-unknown-elf-ar x
从许多不同的
*.a
文件中提取
libc_a.memset.o
文件,但提取出来的文件都是空的:
/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc.a
/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc_nano.a
/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libg.a
/path/to/gcc-11.1.0/aarch64-unknown-elf/lib/libc.a
等...不是逐字节实现,只是空的。我这样做是为了寻找一个好的实现,但我显然不明白这里发生了什么......

c gcc libc memset
1个回答
0
投票

你可以在这里看到代码。你的库是在定义了

PREFER_SIZE_OVER_SPEED
的情况下编译的(用 -Os 编译时定义的 __OPTIMIZE_SIZE__ 也有同样的效果),因此只保留了末尾的逐字节循环。你需要在不定义这些宏的情况下重新编译你的库。

https://github.com/eblot/newlib/blob/master/newlib/libc/string/memset.c

/*
FUNCTION
    <<memset>>---set an area of memory
INDEX
    memset
ANSI_SYNOPSIS
    #include <string.h>
    void *memset(void *<[dst]>, int <[c]>, size_t <[length]>);
TRAD_SYNOPSIS
    #include <string.h>
    void *memset(<[dst]>, <[c]>, <[length]>)
    void *<[dst]>;
    int <[c]>;
    size_t <[length]>;
DESCRIPTION
    This function converts the argument <[c]> into an unsigned
    char and fills the first <[length]> characters of the array
    pointed to by <[dst]> to the value.
RETURNS
    <<memset>> returns the value of <[dst]>.
PORTABILITY
<<memset>> is ANSI C.
    <<memset>> requires no supporting OS subroutines.
QUICKREF
    memset ansi pure
*/

#include <string.h>

/* Size in bytes of the word (a long) used by the block-fill fast path.  */
#define LBLOCKSIZE (sizeof(long))
/* Nonzero if pointer X is not aligned on a long boundary.  X is
   parenthesized so that an expression argument such as `p + 1' is cast as
   a whole instead of only its first operand.  */
#define UNALIGNED(X)   ((long)(X) & (LBLOCKSIZE - 1))
/* Nonzero if LEN is smaller than one long, i.e. too short for a word store.  */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)

/* Store the byte value of C into each of the first N bytes starting at M
   and return M.

   Unless PREFER_SIZE_OVER_SPEED or __OPTIMIZE_SIZE__ is defined, the
   function first writes single bytes until the destination is aligned on
   a long boundary, then fills whole longs (four per iteration, then one
   per iteration), and leaves any remaining tail to the bytewise loop at
   the end.  When either macro is defined, only that bytewise loop is
   compiled.  */
_PTR
_DEFUN (memset, (m, c, n),
    _PTR m _AND
    int c _AND
    size_t n)
{
  char *s = (char *) m;

#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  /* Unsigned because it is used as a shift count and compared against the
     size_t expression LBLOCKSIZE * 8 below.  */
  unsigned int i;
  unsigned long buffer;
  unsigned long *aligned_addr;
  unsigned int d = c & 0xff;    /* To avoid sign extension, copy C to an
                   unsigned variable.  */

  /* Write leading bytes one at a time until S is long-aligned; if N runs
     out first there is nothing left to do.  */
  while (UNALIGNED (s))
    {
      if (n--)
        *s++ = (char) c;
      else
        return m;
    }

  if (!TOO_SMALL (n))
    {
      /* If we get this far, we know that n is large and s is word-aligned. */
      aligned_addr = (unsigned long *) s;

      /* Store D into each char sized location in BUFFER so that
         we can set large blocks quickly.  */
      buffer = (d << 8) | d;
      buffer |= (buffer << 16);
      /* Keep doubling the filled width until it covers a whole long; the
         loop body never runs when a long is 32 bits wide.  */
      for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
        buffer = (buffer << i) | buffer;

      /* Unroll the loop.  */
      while (n >= LBLOCKSIZE*4)
        {
          *aligned_addr++ = buffer;
          *aligned_addr++ = buffer;
          *aligned_addr++ = buffer;
          *aligned_addr++ = buffer;
          n -= 4*LBLOCKSIZE;
        }

      /* Fill the remaining whole longs one at a time.  */
      while (n >= LBLOCKSIZE)
        {
          *aligned_addr++ = buffer;
          n -= LBLOCKSIZE;
        }
      /* Pick up the remainder with a bytewise loop.  */
      s = (char*)aligned_addr;
    }

#endif /* not PREFER_SIZE_OVER_SPEED */

  /* Bytewise fill: the tail of the fast path, or the entire fill when the
     fast path is compiled out.  */
  while (n--)
    *s++ = (char) c;

  return m;
}
© www.soinside.com 2019 - 2024. All rights reserved.