且构网

分享程序员开发的那些事...
且构网 - 分享程序员编程开发的那些事

将字节转换为unsigned int的最快方法

更新时间:2022-01-26 23:18:38

正确的答案几乎总是:

编写正确的代码,启用优化功能,信任您的编译器.

给定:

void count_values(std::array<uint32_t, 256^3>& results,
                  const unsigned char* from,
                  const unsigned char* to)
{
    for(; from != to; from  = std::next(from, 3)) {
        ++results[(*from << 16) | (*std::next(from, 1) << 8) | *(std::next(from,2))];
    }
}

-O3

收益率(内含解释性注释):

Yields (explanatory comments inlined):

__Z12count_valuesRNSt3__15arrayIjLm259EEEPKhS4_: ## @_Z12count_valuesRNSt3__15arrayIjLm259EEEPKhS4_
    .cfi_startproc
## BB#0:
    pushq   %rbp
Ltmp0:
    .cfi_def_cfa_offset 16
Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp2:
    .cfi_def_cfa_register %rbp
    jmp LBB0_2
    .align  4, 0x90
LBB0_1:                                 ## %.lr.ph
                                        ##   in Loop: Header=BB0_2 Depth=1
# dereference from and extend the 8-bit value to 32 bits
    movzbl  (%rsi), %eax
    shlq    $16, %rax            # shift left 16
    movzbl  1(%rsi), %ecx        # dereference *(from+1) and extend to 32bits by padding with zeros
    shlq    $8, %rcx             # shift left 8
    orq %rax, %rcx               # or into above result 
    movzbl  2(%rsi), %eax        # dreference *(from+2) and extend to 32bits
    orq %rcx, %rax               # or into above result
    incl    (%rdi,%rax,4)        # increment the correct counter
    addq    $3, %rsi             # from += 3
LBB0_2:                                 ## %.lr.ph
                                        ## =>This Inner Loop Header: Depth=1
    cmpq    %rdx, %rsi           # while from != to
    jne LBB0_1
## BB#3:                                ## %._crit_edge
    popq    %rbp
    retq
    .cfi_endproc

请注意,没有必要偏离标准构造或标准调用.编译器会生成完美的代码.

Notice that there is no need to stray away from standard constructs or standard calls. The compiler produces perfect code.

为了进一步证明这一点,让我们疯狂起来,编写一个自定义迭代器,使我们可以将函数简化为:

To further prove the point, let's go crazy and write a custom iterator that allows us to reduce the function to this:

void count_values(std::array<uint32_t, 256^3>& results,
                  byte_triple_iterator from,
                  byte_triple_iterator to)
{
    assert(iterators_correct(from, to));
    while(from != to) {
        ++results[*from++];
    }
}

这是这种迭代器的(基本)实现:

And here is a (basic) implementation of such an iterator:

struct byte_triple_iterator
{
    constexpr byte_triple_iterator(const std::uint8_t* p)
    : _ptr(p)
    {}

    std::uint32_t operator*() const noexcept {
        return (*_ptr << 16) | (*std::next(_ptr, 1) << 8) | *(std::next(_ptr,2));
    }

    byte_triple_iterator& operator++() noexcept {
        _ptr = std::next(_ptr, 3);
        return *this;
    }

    byte_triple_iterator operator++(int) noexcept {
        auto copy = *this;
        _ptr = std::next(_ptr, 3);
        return copy;
    }

    constexpr const std::uint8_t* byte_ptr() const {
        return _ptr;
    }

private:

    friend bool operator<(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return from._ptr < to._ptr;
    }

    friend bool operator==(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return from._ptr == to._ptr;
    }

    friend bool operator!=(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return not(from == to);
    }

    friend std::ptrdiff_t byte_difference(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return to._ptr - from._ptr;
    }

    const std::uint8_t* _ptr;
};

bool iterators_correct(const byte_triple_iterator& from,
                       const byte_triple_iterator& to)
{
    if (not(from < to))
        return false;
    auto dist = to.byte_ptr() - from.byte_ptr();
    return dist % 3 == 0;
}

现在我们拥有什么?

  • 一个断言,以检查我们的源代码确实是正确的长度(在调试版本中)
  • 保证大小合适的输出结构

但是对我们的目标代码做了什么?(使用 -O3 -DNDEBUG 编译)

But what's it done to our object code? (compile with -O3 -DNDEBUG)

    .globl  __Z12count_valuesRNSt3__15arrayIjLm259EEE20byte_triple_iteratorS3_
    .align  4, 0x90
__Z12count_valuesRNSt3__15arrayIjLm259EEE20byte_triple_iteratorS3_: ## @_Z12count_valuesRNSt3__15arrayIjLm259EEE20byte_triple_iteratorS3_
    .cfi_startproc
## BB#0:
    pushq   %rbp
Ltmp3:
    .cfi_def_cfa_offset 16
Ltmp4:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp5:
    .cfi_def_cfa_register %rbp
    jmp LBB1_2
    .align  4, 0x90
LBB1_1:                                 ## %.lr.ph
                                        ##   in Loop: Header=BB1_2 Depth=1
    movzbl  (%rsi), %eax
    shlq    $16, %rax
    movzbl  1(%rsi), %ecx
    shlq    $8, %rcx
    orq %rax, %rcx
    movzbl  2(%rsi), %eax
    orq %rcx, %rax
    incl    (%rdi,%rax,4)
    addq    $3, %rsi
LBB1_2:                                 ## %.lr.ph
                                        ## =>This Inner Loop Header: Depth=1
    cmpq    %rdx, %rsi
    jne LBB1_1
## BB#3:                                ## %._crit_edge
    popq    %rbp
    retq
    .cfi_endproc

答案:没什么-同样有效.

课程?没有真的!相信您的编译器!

The lesson? No really! Trust your compiler!!!