在我的 sha256 哈希类中,有以下函数。
uint32_t shift_right ...
// Rotates the 32-bit value x right by n bit positions.
//
// The count is reduced modulo 32, so every n is well defined. The previous
// expression evaluated x << 32 when n == 0 (and x >> 32 when n == 32);
// shifting a uint32_t by its full width is undefined behavior in C++.
//
// x: value to rotate.
// n: rotation count; only n mod 32 is significant.
// Returns x with its low (n mod 32) bits moved to the top of the word.
uint32_t rotate_right(uint32_t x, uint32_t n)
{
    const uint32_t mask = 31u;        // bit width of x (32) minus one
    const uint32_t count = n & mask;  // always in 0..31
    // (32 - count) & mask maps count == 0 to a left shift of 0 instead of 32.
    return (x >> count) | (x << ((32u - count) & mask));
}
uint32_t rotate_left ...
uint32_t choose ...
uint32_t majority ...
// Writes the low 32 bits of x into str[0..3], most significant byte first.
// Bits of x above bit 31 (if size_t is wider than 32 bits) are discarded.
// str must point to at least four writable bytes.
void unpack32(size_t x, uint8_t* str)
{
    str[0] = static_cast<uint8_t>(x >> 24);
    str[1] = static_cast<uint8_t>(x >> 16);
    str[2] = static_cast<uint8_t>(x >> 8);
    str[3] = static_cast<uint8_t>(x);
}
// Assembles str[0..3] into one 32-bit word, treating str[0] as the most
// significant byte, and stores the result through x.
// str must point to at least four readable bytes; x must be non-null.
void pack32(const uint8_t* str, uint32_t* x)
{
    // All four bytes are read before the single store through x.
    uint32_t word = static_cast<uint32_t>(str[0]) << 24;
    word |= static_cast<uint32_t>(str[1]) << 16;
    word |= static_cast<uint32_t>(str[2]) << 8;
    word |= static_cast<uint32_t>(str[3]);
    *x = word;
}
// SHA-256 "big sigma 0": ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22).
//
// The third rotation count used to be 32. FIPS 180-4 specifies 22 for this
// function, and a count of 32 additionally makes rotate_right shift a
// 32-bit value by its full width, which is undefined behavior.
uint32_t f1(uint32_t x)
{
    return rotate_right(x, 2) ^ rotate_right(x, 13) ^ rotate_right(x, 22);
}
// Returns the XOR of x rotated right by 6, 11 and 25 bits
// (the rotation counts of SHA-256 "big sigma 1").
uint32_t f2(uint32_t x)
{
    const uint32_t rot6 = rotate_right(x, 6);
    const uint32_t rot11 = rotate_right(x, 11);
    const uint32_t rot25 = rotate_right(x, 25);
    return rot6 ^ rot11 ^ rot25;
}
// Returns rotate_right(x, 7) ^ rotate_right(x, 18) ^ shift_right(x, 3)
// (the counts of SHA-256 "small sigma 0").
// shift_right is defined elsewhere in this file; presumably a plain logical
// right shift -- confirm against its definition.
uint32_t f3(uint32_t x)
{
    const uint32_t rot7 = rotate_right(x, 7);
    const uint32_t rot18 = rotate_right(x, 18);
    const uint32_t shr3 = shift_right(x, 3);
    return rot7 ^ rot18 ^ shr3;
}
// Returns rotate_right(x, 17) ^ rotate_right(x, 19) ^ shift_right(x, 10)
// (the counts of SHA-256 "small sigma 1").
// shift_right is defined elsewhere in this file; presumably a plain logical
// right shift -- confirm against its definition.
uint32_t f4(uint32_t x)
{
    const uint32_t rot17 = rotate_right(x, 17);
    const uint32_t rot19 = rotate_right(x, 19);
    const uint32_t shr10 = shift_right(x, 10);
    return rot17 ^ rot19 ^ shr10;
}
我的问题是:如果我用下面的宏来代替这些函数,速度会不会更快一些?如果会更快,原因是什么?
// Function-like macro form of the 32-bit right rotation.
// Every use of a parameter is parenthesized so that arguments containing
// lower-precedence operators (e.g. `a | b` or `i + 1`) expand correctly.
// Both arguments are evaluated more than once: do not pass expressions with
// side effects. n must be in 1..(bit width of x) - 1.
#define rotate_right(x, n) (((x) >> (n)) | ((x) << ((sizeof(x) << 3) - (n))))
使用以下代码:
#include <iostream>
#include <cstdint>
// Macro counterpart of rotate_right(), used to compare generated code.
// Every use of a parameter is parenthesized so that arguments containing
// lower-precedence operators (e.g. `a | b` or `i + 1`) expand correctly.
// Both arguments are evaluated more than once: do not pass expressions with
// side effects. n must be in 1..(bit width of x) - 1.
#define macro_rotate_right(x, n) (((x) >> (n)) | ((x) << ((sizeof(x) << 3) - (n))))
// Function counterpart of macro_rotate_right: rotates x right by n bits.
// NOTE: n must be in 1..31. For n == 0 or n >= 32 one of the shifts below
// uses a count of at least the width of x, which is undefined behavior.
uint32_t rotate_right(uint32_t x, uint32_t n)
{
    const auto bit_width = sizeof(x) << 3;  // 32 for uint32_t
    const uint32_t low_part = x >> n;
    const uint32_t high_part = x << (bit_width - n);
    return low_part | high_part;
}
// Reads a value and a rotation count from standard input and rotates the
// value with the macro version. Reading the operands at run time keeps the
// compiler from folding the rotation into a constant.
uint32_t test1() {
    uint32_t value;
    uint32_t count;
    std::cin >> value;
    std::cin >> count;
    const uint32_t rotated = macro_rotate_right(value, count);
    return rotated;
}
// Reads a value and a rotation count from standard input and rotates the
// value with the function version. Reading the operands at run time keeps
// the compiler from folding the rotation into a constant.
uint32_t test2() {
    uint32_t value;
    uint32_t count;
    std::cin >> value;
    std::cin >> count;
    const uint32_t rotated = rotate_right(value, count);
    return rotated;
}
// Calls both test functions so that each one is emitted and can be compared
// in the generated assembly; their results are intentionally discarded.
int main() {
    static_cast<void>(test1());
    static_cast<void>(test2());
    return 0;
}
Godbolt
-std=c++20 -O2
两个测试函数生成了完全相同的汇编代码:
rotate_right(unsigned int, unsigned int):
mov eax, edi
mov ecx, esi
ror eax, cl
ret
test1():
sub rsp, 24
mov edi, OFFSET FLAT:_ZSt3cin
lea rsi, [rsp+8]
call std::basic_istream<char, std::char_traits<char> >& std::basic_istream<char, std::char_traits<char> >::_M_extract<unsigned int>(unsigned int&)
lea rsi, [rsp+12]
mov edi, OFFSET FLAT:_ZSt3cin
call std::basic_istream<char, std::char_traits<char> >& std::basic_istream<char, std::char_traits<char> >::_M_extract<unsigned int>(unsigned int&)
mov ecx, DWORD PTR [rsp+12]
mov eax, DWORD PTR [rsp+8]
add rsp, 24
ror eax, cl
ret
test2():
sub rsp, 24
mov edi, OFFSET FLAT:_ZSt3cin
lea rsi, [rsp+8]
call std::basic_istream<char, std::char_traits<char> >& std::basic_istream<char, std::char_traits<char> >::_M_extract<unsigned int>(unsigned int&)
lea rsi, [rsp+12]
mov edi, OFFSET FLAT:_ZSt3cin
call std::basic_istream<char, std::char_traits<char> >& std::basic_istream<char, std::char_traits<char> >::_M_extract<unsigned int>(unsigned int&)
mov ecx, DWORD PTR [rsp+12]
mov eax, DWORD PTR [rsp+8]
add rsp, 24
ror eax, cl
ret
main:
sub rsp, 8
call test1()
call test2()
xor eax, eax
add rsp, 8
ret
因此它们的性能是相同的。事实上,test2() 中对 rotate_right 的函数调用已经被编译器内联了,生成的指令与宏版本完全一致。