我的裸机程序手动调用
memset()
将整个/对齐的 4k 页面归零(我没有使用 uint64_t,而是另一个 8 字节的东西):
uint64_t something[512] __attribute__((aligned(4096)));
memset(something, 0x0, 4096);
我使用类似下面的命令进行编译:
%> /path/to/gcc-11.1.0/bin/aarch64-unknown-elf-gcc \
-O3 \
-std=gnu99 \
-nostartfiles \
--specs=nano.specs \
-march=armv8.1-a \
-Wl,--gc-sections \
-Tlinker_script.lds \
my_code.c \
-o my_code.elf
当我反汇编并查看链接进来的
memset()
时,发现它是最基本的通用实现,一次只写一个字节:
000000004010399c <memset>:
s = (char*)aligned_addr;
}
#endif /* not PREFER_SIZE_OVER_SPEED */
while (n--)
4010399c: d2800003 mov x3, #0x0 // #0
401039a0: eb03005f cmp x2, x3
401039a4: 54000041 b.ne 401039ac <memset+0x10> // b.any
*s++ = (char) c;
return m;
}
401039a8: d65f03c0 ret
*s++ = (char) c;
401039ac: 38236801 strb w1, [x0, x3]
401039b0: 91000463 add x3, x3, #0x1
401039b4: 17fffffb b 401039a0 <memset+0x4>
我期待使用
stp
或矢量指令的 aarch64 优化版本。我的编译器有一个 /path/to/gcc-11.1.0/newlib-nano
子目录。
我已经删除了
--specs=nano.specs
并摆弄了各种选项,但我不确定我能在这里做什么......
我怎样才能得到优化的
memset()
实现?
请注意,我使用
/path/to/gcc-11.1.0/bin/aarch64-unknown-elf-ar x
从许多不同的 *.a 文件中提取了
libc_a.memset.o
文件,但它们都是空的:
/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc.a
、/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc_nano.a
、/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libg.a
、/path/to/gcc-11.1.0/aarch64-unknown-elf/lib/libc.a
等...不是逐字节实现,只是空的。我这样做是为了寻找一个好的实现,但我显然不明白这里发生了什么......
你可以在下面的链接中看到源代码。你的库在编译时定义了
PREFER_SIZE_OVER_SPEED
(或者定义了 __OPTIMIZE_SIZE__,两者都会只保留逐字节循环)。你需要重新编译你的库。
https://github.com/eblot/newlib/blob/master/newlib/libc/string/memset.c
/*
FUNCTION
<<memset>>---set an area of memory
INDEX
memset
ANSI_SYNOPSIS
#include <string.h>
void *memset(void *<[dst]>, int <[c]>, size_t <[length]>);
TRAD_SYNOPSIS
#include <string.h>
void *memset(<[dst]>, <[c]>, <[length]>)
void *<[dst]>;
int <[c]>;
size_t <[length]>;
DESCRIPTION
This function converts the argument <[c]> into an unsigned
char and fills the first <[length]> characters of the array
pointed to by <[dst]> to the value.
RETURNS
<<memset>> returns the value of <[dst]>.
PORTABILITY
<<memset>> is ANSI C.
<<memset>> requires no supporting OS subroutines.
QUICKREF
memset ansi pure
*/
#include <string.h>
/* Number of bytes stored by one word-sized write in the fast path. */
#define LBLOCKSIZE (sizeof(long))
/* Nonzero if pointer X is not aligned on a LBLOCKSIZE boundary.  The
   argument is parenthesized so the cast applies to the whole expression
   when X is something other than a plain identifier.  */
#define UNALIGNED(X) ((long)(X) & (LBLOCKSIZE - 1))
/* Nonzero if LEN is too short to hold even one word-sized write. */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
/* Fill the first N bytes of the object at M with the byte value of C and
   return M.  Unless PREFER_SIZE_OVER_SPEED or __OPTIMIZE_SIZE__ is
   defined, the bulk of the region is written one `unsigned long' at a
   time; otherwise only the trailing byte loop is compiled in.  */
_PTR
_DEFUN (memset, (m, c, n),
	_PTR m _AND
	int c _AND
	size_t n)
{
  char *out = (char *) m;

#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  int shift;
  unsigned long pattern;
  unsigned long *word_out;
  /* Keep only the low byte in an unsigned variable so that building the
     pattern below cannot sign-extend.  */
  unsigned int byte = c & 0xff;

  /* Head: store single bytes until the cursor is word-aligned, or the
     request is exhausted first.  */
  for (; UNALIGNED (out); n--)
    {
      if (n == 0)
	return m;
      *out++ = (char) c;
    }

  if (!TOO_SMALL (n))
    {
      /* At least one whole word remains and OUT is word-aligned.  */
      word_out = (unsigned long *) out;

      /* Replicate BYTE into every byte lane of PATTERN.  The loop only
	 runs when `long' is wider than 32 bits, so no shift ever reaches
	 the width of the type.  */
      pattern = byte | (byte << 8);
      pattern = pattern | (pattern << 16);
      for (shift = 32; shift < LBLOCKSIZE * 8; shift <<= 1)
	pattern = (pattern << shift) | pattern;

      /* Body, unrolled: four words per iteration.  */
      for (; n >= 4 * LBLOCKSIZE; n -= 4 * LBLOCKSIZE)
	{
	  word_out[0] = pattern;
	  word_out[1] = pattern;
	  word_out[2] = pattern;
	  word_out[3] = pattern;
	  word_out += 4;
	}

      /* Body, remainder: one word per iteration.  */
      for (; n >= LBLOCKSIZE; n -= LBLOCKSIZE)
	*word_out++ = pattern;

      /* Hand whatever is left (fewer than LBLOCKSIZE bytes) to the byte
	 loop below.  */
      out = (char *) word_out;
    }
#endif /* not PREFER_SIZE_OVER_SPEED */

  /* Tail, and the whole job in the size-optimized build: one byte at a
     time.  */
  for (; n != 0; n--)
    *out++ = (char) c;

  return m;
}