一个简单的附加分支导致了令人困惑的性能下降

Question

在我的项目中，我有一个函数，出于性能考虑，它应当在满足条件时跳过某段代码路径。如果条件为真，性能如预期提升了50％；但如果条件为假，正常路径的性能在最坏情况下会下降30％。由于该算法本身就要执行数百次循环，我无法理解为什么仅仅增加一个简单的 if 子句会产生如此大的影响。

该函数位于 FFmpeg 项目（FFmpeg.org）的 libavfilter/vf_fillborders.c 中：

/*
 * Fill the border regions of every plane of `frame` by mirroring the
 * samples next to each border. Samples are accessed as uint16_t and the
 * frame is modified in place.
 *
 * s     - filter context; this function reads nb_planes and, per plane,
 *         planewidth[p], planeheight[p] and borders[p].{left,right,top,bottom}.
 * frame - frame whose data[p] is rewritten; linesize[p] is divided by
 *         sizeof(uint16_t) to obtain a stride in samples (presumably
 *         linesize is expressed in bytes).
 *
 * All row positions below are kept as sample offsets from `data`
 * (row index * lz) so the loops can step by `lz` instead of multiplying.
 */
static void mirror_borders16(FillBordersContext *s, AVFrame *frame)
{
    for (int p = 0; p < s->nb_planes; p++) {
        uint16_t *data = (uint16_t *)frame->data[p];
        int lz = frame->linesize[p] / sizeof(uint16_t); /* row stride in uint16_t samples */
        int width = s->planewidth[p];
        int left = s->borders[p].left;
        int right = s->borders[p].right;
        int height = s->planeheight[p];
        int height2 = height * lz;      /* offset just past the last row */
        int top = s->borders[p].top;
        int top2 = top * lz;            /* offset of the first row below the top border */
        int bottom = height - s->borders[p].bottom; /* index of the first bottom-border row */
        int bottom2 = bottom * lz;      /* offset of the first bottom-border row */

        /* fill left and right borders from top to bottom border */
        /*
         * The `if` below has no braces: it guards only the single `for`
         * statement over y that follows the marker comment.
         * Left border column x receives column (2 * left - 1 - x); right
         * border column (width - right + x) receives column
         * (width - right - 1 - x), i.e. a reflection about the border edge.
         */
/********* Here is the additional code line: **********/
        if (left > 0 || right > 0) // in case skip for performance
/******************************************************/
            for (int y = top2; y < bottom2; y += lz) {
                for (int x = 0; x < left; x++)
                    data[y + x] = data[y + left * 2 - 1 - x];
                for (int x = 0; x < right; x++)
                    data[y + width - right + x] = data[y + width - right - 1 - x];
            }

        /* fill top and bottom borders */
        /*
         * Whole rows of `width` samples are copied, so the left/right
         * border columns of the source row are carried along.
         * Top border row r (offset y = r * lz) is copied from row
         * (2 * top - 1 - r).
         */
        for (int y = 0; y < top2; y += lz)
            memcpy(data + y, data + (top2 * 2 - lz - y), width * sizeof(uint16_t));
        /* Bottom border row (bottom + k) is copied from row (bottom - 1 - k). */
        for (int y = 0; y < height2 - bottom2; y += lz)
            memcpy(data + (bottom2 + y),
                    data + (bottom2 - lz - y), width * sizeof(uint16_t));
    }
}

在一个类似的函数中，我使用了同样的技巧：通过 if (left > 0 || right < width) 跳过对 y 的无用循环。在这种情况下，额外的 if 子句只带来约 0.5％ 的开销，符合预期。代码如下：

/*
 * Fill the border regions of every plane of `frame` by repeating
 * ("smearing") the nearest sample or row outside the border. Samples are
 * accessed as uint16_t and the frame is modified in place.
 *
 * s     - filter context; this function reads nb_planes and, per plane,
 *         planewidth[p], planeheight[p] and borders[p].{left,right,top,bottom}.
 * frame - frame whose data[p] is rewritten; linesize[p] is divided by
 *         sizeof(uint16_t) to obtain a stride in samples (presumably
 *         linesize is expressed in bytes).
 *
 * Note that, unlike the border width itself, `right` here is a column
 * index: the first column belonging to the right border.
 */
static void smear_borders16(FillBordersContext *s, AVFrame *frame)
{
    for (int p = 0; p < s->nb_planes; p++) {
        uint16_t *data = (uint16_t *)frame->data[p];
        int lz = frame->linesize[p] / sizeof(uint16_t); /* row stride in uint16_t samples */
        int width = s->planewidth[p];
        int left = s->borders[p].left;
        int right = width - s->borders[p].right; /* first column of the right border */
        int height = s->planeheight[p];
        int height2 = height * lz;      /* offset just past the last row */
        int top = s->borders[p].top;
        int top2 = top * lz;            /* offset of the first row below the top border */
        int bottom = height - s->borders[p].bottom; /* index of the first bottom-border row */
        int bottom2 = bottom * lz;      /* offset of the first bottom-border row */

        /* fill left and right borders from top to bottom border */
        /*
         * The unbraced `if` guards only the `for` over y that follows.
         * Every left border column gets the value of column `left`; every
         * column from `right` to width - 1 gets the value of column
         * `right - 1`.
         */
        if (left > 0 || right < width) // in case skip for performance
            for (int y = top2; y < bottom2; y += lz) {
                for (int x = 0; x < left; x++)
                    data[y + x] = data[y + left];
                for (int x = right; x < width; x++)
                    data[y + x] = data[y + right - 1];
            }

        /* fill top and bottom borders */
        /* Each top border row is a copy of row `top` (offset top2), full width. */
        for (int y = 0; y < top2; y += lz)
            memcpy(data + y, data + top2, width * sizeof(uint16_t));
        /* Each bottom border row is a copy of row `bottom - 1`, full width. */
        for (int y = bottom2; y < height2; y += lz)
            memcpy(data + y, data + (bottom2 - lz), width * sizeof(uint16_t));
    }
}

我的处理器是 Intel P8600。可以在这里找到一个（希望合格的）MCVE：https://translate.google.com/translate?sl=de&tl=en&u=forum.ubuntuusers.de%2Fpost%2F9064193 。如果您看不懂经过翻译的德语说明，请留言。

Answer 1

我查看了生成的机器代码的反汇编。插入 if (left > 0 || right > 0) 会导致编译器对其后的代码做出重大改动。看起来编译器在第二种情况（带有 if 的版本）下的优化效果较差，这可以解释20％的性能下降。下面先给出不带 if 的版本：

184         /* fill left and right borders from top to bottom border */
185         
186             for (int y = top2; y < bottom2; y += lz) {
   0x00000000002004ca <+138>:   cmp    %r14d,%esi
   0x00000000002004d1 <+145>:   jge    0x2005ad <mirror_borders16+365>
   0x00000000002004d7 <+151>:   movslq %ebx,%rax
   0x00000000002004da <+154>:   lea    -0x2(%rbp),%r15
   0x00000000002004de <+158>:   lea    -0x1(%r11),%r12d
   0x00000000002004e2 <+162>:   lea    (%rax,%rax,1),%r8
   0x00000000002004e6 <+166>:   movslq %esi,%rax
   0x000000000020050e <+206>:   mov    $0x1,%r12d
   0x0000000000200514 <+212>:   mov    %ebx,%r9d
   0x0000000000200517 <+215>:   mov    %rbx,0x30(%rsp)
   0x000000000020051c <+220>:   sub    %rax,%r15
   0x000000000020051f <+223>:   sub    %edx,%r12d
   0x0000000000200522 <+226>:   mov    %r14d,%ebx
   0x0000000000200525 <+229>:   nopl   (%rax)
   0x00000000002005a0 <+352>:   lea    (%r12,%rsi,1),%eax
   0x00000000002005a4 <+356>:   cmp    %eax,%ebx
   0x00000000002005a6 <+358>:   jg     0x200528 <mirror_borders16+232>
   0x00000000002005a8 <+360>:   mov    0x30(%rsp),%rbx

187                 for (int x = 0; x < left; x++)
   0x0000000000200528 <+232>:   test   %r11d,%r11d
   0x000000000020052b <+235>:   jle    0x20055f <mirror_borders16+287>
   0x000000000020052d <+237>:   movslq %esi,%r14
   0x0000000000200530 <+240>:   mov    %rdi,%rdx
   0x0000000000200533 <+243>:   mov    %ecx,(%rsp)
   0x0000000000200536 <+246>:   add    %r14,%r14
   0x0000000000200539 <+249>:   lea    0x0(%rbp,%r14,1),%rax
   0x000000000020053e <+254>:   add    %r13,%r14
   0x0000000000200541 <+257>:   nopl   0x0(%rax)
   0x0000000000200557 <+279>:   cmp    %rax,%r14
   0x000000000020055a <+282>:   jne    0x200548 <mirror_borders16+264>
   0x000000000020055c <+284>:   mov    (%rsp),%ecx

188                     data[y + x] = data[y + left * 2 - 1 - x];
   0x00000000002004e9 <+169>:   lea    (%r11,%r11,1),%edx
   0x00000000002004ed <+173>:   sub    $0x1,%ecx
   0x00000000002004f0 <+176>:   lea    0x0(%rbp,%rax,2),%rdi
   0x00000000002004f5 <+181>:   lea    -0x1(%r10),%eax
   0x00000000002004f9 <+185>:   add    %r12,%r12
   0x00000000002004fc <+188>:   mov    %r15,%r13
   0x00000000002004ff <+191>:   sub    %r10d,%ecx
   0x0000000000200502 <+194>:   add    %esi,%ecx
   0x0000000000200504 <+196>:   add    %rax,%rax
   0x0000000000200507 <+199>:   sub    %r12,%r13
   0x000000000020050a <+202>:   lea    -0x1(%rsi,%rdx,1),%esi
   0x0000000000200548 <+264>:   movzwl (%rax),%ecx
   0x000000000020054b <+267>:   sub    $0x2,%rax
   0x000000000020054f <+271>:   add    $0x2,%rdx
   0x0000000000200553 <+275>:   mov    %cx,-0x2(%rdx)

189                 for (int x = 0; x < right; x++)
   0x000000000020055f <+287>:   test   %r10d,%r10d
   0x0000000000200562 <+290>:   jle    0x200597 <mirror_borders16+343>
   0x0000000000200564 <+292>:   lea    0x1(%rcx),%edx
   0x0000000000200567 <+295>:   movslq %ecx,%r14
   0x000000000020056a <+298>:   mov    %ecx,(%rsp)
   0x000000000020056d <+301>:   add    %r14,%r14
   0x0000000000200570 <+304>:   movslq %edx,%rdx
   0x0000000000200573 <+307>:   lea    0x0(%rbp,%r14,1),%rax
   0x0000000000200578 <+312>:   add    %r15,%r14
   0x000000000020057b <+315>:   lea    0x0(%rbp,%rdx,2),%rdx
   0x000000000020058f <+335>:   cmp    %rax,%r14
   0x0000000000200592 <+338>:   jne    0x200580 <mirror_borders16+320>
   0x0000000000200594 <+340>:   mov    (%rsp),%ecx
   0x0000000000200597 <+343>:   add    %r9d,%esi
   0x000000000020059a <+346>:   add    %r9d,%ecx
   0x000000000020059d <+349>:   add    %r8,%rdi

190                     data[y + width - right + x] = data[y + width - right - 1 - x];
   0x0000000000200580 <+320>:   movzwl (%rax),%ecx
   0x0000000000200583 <+323>:   sub    $0x2,%rax
   0x0000000000200587 <+327>:   add    $0x2,%rdx
   0x000000000020058b <+331>:   mov    %cx,-0x2(%rdx)

191             }

加入 if 以跳过无用循环的版本：

184         /* fill left and right borders from top to bottom border */
185         if (left > 0 || right > 0) // in case skip for performance
   0x00000000002004f7 <+135>:   test   %r8d,%r8d
   0x00000000002004fe <+142>:   jg     0x200640 <mirror_borders16+464>
   0x0000000000200504 <+148>:   test   %ecx,%ecx
   0x0000000000200506 <+150>:   jg     0x200640 <mirror_borders16+464>

186             for (int y = top2; y < bottom2; y += lz) {
   0x0000000000200640 <+464>:   cmp    0x24(%rsp),%r15d
   0x0000000000200645 <+469>:   jge    0x20050c <mirror_borders16+156>
   0x000000000020064b <+475>:   mov    0x20(%rsp),%ebp
   0x0000000000200661 <+497>:   mov    %r15d,0x4c(%rsp)
   0x0000000000200666 <+502>:   sub    %ecx,%r9d
   0x0000000000200669 <+505>:   lea    -0x1(%rax,%r15,1),%esi
   0x000000000020066e <+510>:   mov    0x24(%rsp),%r15d
   0x0000000000200673 <+515>:   sub    %eax,%ebp
   0x0000000000200675 <+517>:   lea    (%r11,%rdx,2),%rdi
   0x0000000000200679 <+521>:   lea    -0x1(%r8),%edx
   0x000000000020067d <+525>:   mov    %ebp,%r10d
   0x0000000000200680 <+528>:   add    %r9d,%ebp
   0x0000000000200683 <+531>:   lea    -0x2(%r11),%r9
   0x0000000000200687 <+535>:   movslq %ebx,%r13
   0x000000000020068a <+538>:   add    %rdx,%rdx
   0x000000000020068d <+541>:   mov    %ebx,%r12d
   0x0000000000200690 <+544>:   mov    %r9,%r14
   0x0000000000200693 <+547>:   mov    $0x1,%r9d
   0x0000000000200699 <+553>:   add    %r13,%r13
   0x000000000020069c <+556>:   sub    %ecx,%r10d
   0x000000000020069f <+559>:   sub    %rdx,%r14
   0x00000000002006a2 <+562>:   sub    %eax,%r9d
   0x00000000002006a5 <+565>:   mov    %rbx,0x38(%rsp)
   0x00000000002006aa <+570>:   nopw   0x0(%rax,%rax,1)
   0x000000000020072d <+701>:   lea    (%r9,%rsi,1),%eax
   0x0000000000200731 <+705>:   cmp    %eax,%r15d
   0x0000000000200734 <+708>:   jg     0x2006b0 <mirror_borders16+576>
   0x000000000020073a <+714>:   mov    0x38(%rsp),%rbx
   0x000000000020073f <+719>:   mov    0x4c(%rsp),%r15d
   0x0000000000200744 <+724>:   jmpq   0x20050c <mirror_borders16+156>
   0x0000000000200749 <+729>:   repz retq 
   0x000000000020074b:  nopl   0x0(%rax,%rax,1)

187                 for (int x = 0; x < left; x++)
   0x00000000002006b0 <+576>:   test   %r8d,%r8d
   0x00000000002006b3 <+579>:   jle    0x2006ec <mirror_borders16+636>
   0x00000000002006b5 <+581>:   movslq %esi,%rbx
   0x00000000002006b8 <+584>:   mov    %rdi,%rdx
   0x00000000002006bb <+587>:   mov    %ecx,0x8(%rsp)
   0x00000000002006bf <+591>:   add    %rbx,%rbx
   0x00000000002006c2 <+594>:   lea    (%r11,%rbx,1),%rax
   0x00000000002006c6 <+598>:   add    %r14,%rbx
   0x00000000002006c9 <+601>:   nopl   0x0(%rax)
   0x00000000002006df <+623>:   cmp    %rax,%rbx
   0x00000000002006e2 <+626>:   jne    0x2006d0 <mirror_borders16+608>
   0x00000000002006e4 <+628>:   mov    0x8(%rsp),%ecx
   0x00000000002006f0 <+640>:   mov    %esi,0x8(%rsp)
   0x00000000002006f4 <+644>:   cltq   
   0x00000000002006f6 <+646>:   lea    (%r11,%rax,2),%rdx
   0x00000000002006fa <+650>:   lea    0x0(%rbp,%rsi,1),%eax
   0x00000000002006fe <+654>:   cltq   
   0x0000000000200700 <+656>:   lea    (%r11,%rax,2),%rbx
   0x0000000000200704 <+660>:   xor    %eax,%eax
   0x0000000000200706 <+662>:   nopw   %cs:0x0(%rax,%rax,1)

188                     data[y + x] = data[y + left * 2 - 1 - x];
   0x000000000020064f <+479>:   lea    (%r8,%r8,1),%eax
   0x0000000000200653 <+483>:   mov    0x18(%rsp),%r11
   0x0000000000200658 <+488>:   mov    $0x1,%r9d
   0x000000000020065e <+494>:   movslq %r15d,%rdx
   0x00000000002006d0 <+608>:   movzwl (%rax),%ecx
   0x00000000002006d3 <+611>:   sub    $0x2,%rax
   0x00000000002006d7 <+615>:   add    $0x2,%rdx
   0x00000000002006db <+619>:   mov    %cx,-0x2(%rdx)

189                 for (int x = 0; x < right; x++)
   0x00000000002006e8 <+632>:   test   %ecx,%ecx
   0x00000000002006ea <+634>:   jle    0x200727 <mirror_borders16+695>
   0x00000000002006ec <+636>:   lea    (%r10,%rsi,1),%eax
   0x000000000020071f <+687>:   cmp    %eax,%ecx
   0x0000000000200721 <+689>:   jg     0x200710 <mirror_borders16+672>
   0x0000000000200723 <+691>:   mov    0x8(%rsp),%esi
   0x0000000000200727 <+695>:   add    %r12d,%esi
   0x000000000020072a <+698>:   add    %r13,%rdi

190                     data[y + width - right + x] = data[y + width - right - 1 - x];
   0x0000000000200710 <+672>:   movzwl (%rdx),%esi
   0x0000000000200713 <+675>:   sub    $0x2,%rdx
   0x0000000000200717 <+679>:   mov    %si,(%rbx,%rax,2)
   0x000000000020071b <+683>:   add    $0x1,%rax

191             }

通过简单的附加分支刺激性能降低

问题描述投票：2回答：1

1个回答

最新问题

通过简单的附加分支刺激性能降低

问题描述 投票：2回答：1

1个回答

最新问题

问题描述投票：2回答：1