IT

왜 GCC는 거의 동일한 C 코드에 대해 이렇게 완전히 다른 어셈블리를 생성합니까?

lottoking 2020. 5. 17. 11:08
반응형

왜 GCC는 거의 동일한 C 코드에 대해 이렇게 완전히 다른 어셈블리를 생성합니까?


최적화 된 ftol함수를 작성하는 동안 에서 매우 이상한 동작을 발견했습니다 GCC 4.6.1. 먼저 코드를 보여 드리겠습니다 (명확하게하기 위해 차이점을 표시했습니다).

fast_trunc_one, C :

int fast_trunc_one(int i) {
    int mantissa, exponent, sign, r;

    mantissa = (i & 0x07fffff) | 0x800000;
    exponent = 150 - ((i >> 23) & 0xff);
    sign = i & 0x80000000;

    if (exponent < 0) {
        r = mantissa << -exponent;                       /* diff */
    } else {
        r = mantissa >> exponent;                        /* diff */
    }

    return (r ^ -sign) + sign;                           /* diff */
}

fast_trunc_two, C :

int fast_trunc_two(int i) {
    int mantissa, exponent, sign, r;

    mantissa = (i & 0x07fffff) | 0x800000;
    exponent = 150 - ((i >> 23) & 0xff);
    sign = i & 0x80000000;

    if (exponent < 0) {
        r = (mantissa << -exponent) ^ -sign;             /* diff */
    } else {
        r = (mantissa >> exponent) ^ -sign;              /* diff */
    }

    return r + sign;                                     /* diff */
}

똑같은 것 같아? 글쎄, GCC는 동의하지 않는다. gcc -O3 -S -Wall -o test.s test.c이것으로 컴파일 한 후 어셈블리 출력입니다.

fast_trunc_one, 생성 :

_fast_trunc_one:
LFB0:
    .cfi_startproc
    movl    4(%esp), %eax
    movl    $150, %ecx
    movl    %eax, %edx
    andl    $8388607, %edx
    sarl    $23, %eax
    orl $8388608, %edx
    andl    $255, %eax
    subl    %eax, %ecx
    movl    %edx, %eax
    sarl    %cl, %eax
    testl   %ecx, %ecx
    js  L5
    rep
    ret
    .p2align 4,,7
L5:
    negl    %ecx
    movl    %edx, %eax
    sall    %cl, %eax
    ret
    .cfi_endproc

fast_trunc_two, 생성 :

_fast_trunc_two:
LFB1:
    .cfi_startproc
    pushl   %ebx
    .cfi_def_cfa_offset 8
    .cfi_offset 3, -8
    movl    8(%esp), %eax
    movl    $150, %ecx
    movl    %eax, %ebx
    movl    %eax, %edx
    sarl    $23, %ebx
    andl    $8388607, %edx
    andl    $255, %ebx
    orl $8388608, %edx
    andl    $-2147483648, %eax
    subl    %ebx, %ecx
    js  L9
    sarl    %cl, %edx
    movl    %eax, %ecx
    negl    %ecx
    xorl    %ecx, %edx
    addl    %edx, %eax
    popl    %ebx
    .cfi_remember_state
    .cfi_def_cfa_offset 4
    .cfi_restore 3
    ret
    .p2align 4,,7
L9:
    .cfi_restore_state
    negl    %ecx
    sall    %cl, %edx
    movl    %eax, %ecx
    negl    %ecx
    xorl    %ecx, %edx
    addl    %edx, %eax
    popl    %ebx
    .cfi_restore 3
    .cfi_def_cfa_offset 4
    ret
    .cfi_endproc

그것은 극단적 인 차이입니다. 이것은 실제로 프로필에도 fast_trunc_one나타나고보다 약 30 % 빠릅니다 fast_trunc_two. 이제 내 질문 :이 원인은 무엇입니까?


OP의 수정 사항과 동기화되도록 업데이트되었습니다.

코드를 수정하여 GCC가 첫 번째 사례를 최적화하는 방법을 확인했습니다.

왜 그렇게 다른지 이해하기 전에 먼저 GCC가 어떻게 최적화하는지 이해해야합니다 fast_trunc_one().

믿거 나 말거나 다음과 같이 fast_trunc_one()최적화하고 있습니다.

int fast_trunc_one(int i) {
    int mantissa, exponent;

    mantissa = (i & 0x07fffff) | 0x800000;
    exponent = 150 - ((i >> 23) & 0xff);

    if (exponent < 0) {
        return (mantissa << -exponent);             /* diff */
    } else {
        return (mantissa >> exponent);              /* diff */
    }
}

이것은 원래 fast_trunc_one()레지스터 이름과 모든 것과 정확히 동일한 어셈블리를 생성합니다 .

xor에 대한 어셈블리 가 없습니다 fast_trunc_one(). 그게 나를 위해 줬어.


어떻게 요?


1 단계: sign = -sign

먼저 sign변수를 살펴 보겠습니다 . 이후로 sign = i & 0x80000000;가능한 값은 두 가지뿐입니다 sign.

  • sign = 0
  • sign = 0x80000000

이제 두 경우 모두 sign == -sign. 따라서 원래 코드를 다음과 같이 변경하면

int fast_trunc_one(int i) {
    int mantissa, exponent, sign, r;

    mantissa = (i & 0x07fffff) | 0x800000;
    exponent = 150 - ((i >> 23) & 0xff);
    sign = i & 0x80000000;

    if (exponent < 0) {
        r = mantissa << -exponent;
    } else {
        r = mantissa >> exponent;
    }

    return (r ^ sign) + sign;
}

원본과 정확히 동일한 어셈블리를 생성합니다 fast_trunc_one(). 어셈블리를 아끼지 않겠지 만 등록 이름과 모두 동일합니다.


2 단계 : 수학 축소 :x + (y ^ x) = y

sign0또는 두 값 중 하나만 사용할 수 있습니다 0x80000000.

  • x = 0, 다음 x + (y ^ x) = y다음 사소한 보유.
  • 추가 및 조정 기준 0x80000000은 동일합니다. 부호 비트를 뒤집습니다. 따라서 x + (y ^ x) = y때도 보유 x = 0x80000000합니다.

따라서로 x + (y ^ x)줄어 듭니다 y. 코드는 다음과 같이 단순화됩니다.

int fast_trunc_one(int i) {
    int mantissa, exponent, sign, r;

    mantissa = (i & 0x07fffff) | 0x800000;
    exponent = 150 - ((i >> 23) & 0xff);
    sign = i & 0x80000000;

    if (exponent < 0) {
        r = (mantissa << -exponent);
    } else {
        r = (mantissa >> exponent);
    }

    return r;
}

다시, 이것은 정확히 동일한 어셈블리 (레지스터 이름과 모두)로 컴파일됩니다.


위의 버전은 마침내 다음과 같이 줄어 듭니다.

int fast_trunc_one(int i) {
    int mantissa, exponent;

    mantissa = (i & 0x07fffff) | 0x800000;
    exponent = 150 - ((i >> 23) & 0xff);

    if (exponent < 0) {
        return (mantissa << -exponent);             /* diff */
    } else {
        return (mantissa >> exponent);              /* diff */
    }
}

GCC가 어셈블리에서 생성하는 것과 거의 똑같습니다.


그렇다면 컴파일러가 왜 fast_trunc_two()같은 것을 최적화하지 않습니까?

핵심 부분 fast_trunc_one()x + (y ^ x) = y최적화입니다. 에서 표현 분기에 분할되고있다.fast_trunc_two()x + (y ^ x)

나는이 최적화를하지 않기 위해 GCC를 혼동하기에 충분하다고 생각합니다. ( ^ -sign지점 밖으로 들어 올려 r + sign마지막에 병합해야 합니다.)

예를 들어, 다음과 같은 어셈블리를 생성합니다 fast_trunc_one().

int fast_trunc_two(int i) {
    int mantissa, exponent, sign, r;

    mantissa = (i & 0x07fffff) | 0x800000;
    exponent = 150 - ((i >> 23) & 0xff);
    sign = i & 0x80000000;

    if (exponent < 0) {
        r = ((mantissa << -exponent) ^ -sign) + sign;             /* diff */
    } else {
        r = ((mantissa >> exponent) ^ -sign) + sign;              /* diff */
    }

    return r;                                     /* diff */
}

이것이 컴파일러의 본질입니다. 그들이 가장 빠르거나 최선의 길을 택한다고 가정하면, 그것은 거짓입니다. "현대 컴파일러"가 빈칸을 채우고, 최고의 작업을 수행하고, 가장 빠른 코드를 만드는 등의 이유로 최적화하기 위해 코드에 아무것도 할 필요가 없음을 의미하는 사람. 실제로 gcc가 3.x에서 팔에 적어도 4.x. 이 시점에서 4.x는 3.x를 따라 잡았을 지 모르지만 초기에는 더 느린 코드를 생성했습니다. 실제로 코드를 작성하는 방법을 배울 수 있으므로 컴파일러가 열심히 일할 필요가 없으며 결과적으로보다 일관되고 기대되는 결과를 얻을 수 있습니다.

The bug here is your expectations of what will be produced, not what was actually produced. If you want the compiler to generate the same output, feed it the same input. Not mathematically the same, not kinda the same, but actually the same, no different paths, no sharing or distributing operations from one version to the other. This is a good exercise in understanding how to write your code and seeing what compilers do with it. Don't make the mistake of assuming that because one version of gcc for one processor target one day produced a certain result that that is a rule for all compilers and all code. You have to use many compilers and many targets to get a feel for what is going on.

gcc is pretty nasty, I invite you to look behind the curtain, look at the guts of gcc, try to add a target or modify something yourself. It is barely held together by duct tape and bailing wire. An extra line of code added or removed in critical places and it comes crumbling down. The fact that it has produced usable code at all is something to be pleased about, instead of worrying about why it didnt meet other expectations.

did you look at what different versions of gcc produce? 3.x and 4.x in particular 4.5 vs 4.6 vs 4.7, etc? and for different target processors, x86, arm, mips, etc or different flavors of x86 if that is the native compiler you use, 32 bit vs 64 bit, etc? And then llvm (clang) for different targets?

Mystical has done an excellent job in the thought process required to work through the problem of analyzing/optimizing the code, expecting a compiler to come up with any of that is, well, not expected of any "modern compiler".

Without getting into the math properties, code of this form

if (exponent < 0) {
  r = mantissa << -exponent;                       /* diff */
} else {
  r = mantissa >> exponent;                        /* diff */
}
return (r ^ -sign) + sign;                           /* diff */

is going to lead the compiler to A: implement it in that form, perform the if-then-else then converge on common code to finish up and return. or B: save a branch since this is the tail end of the function. Also not bother with using or saving r.

if (exponent < 0) {
  return((mantissa << -exponent)^-sign)+sign;
} else {
  return((mantissa << -exponent)^-sign)+sign;
}

Then you can get into as Mystical pointed out the sign variable disappears all together for the code as written. I wouldn't expect the compiler to see the sign variable go away so you should have done that yourself and not forced the compiler to try to figure it out.

This is a perfect opportunity to dig into the gcc source code. It appears you have found a case where the optimizer saw one thing in one case then another thing in another case. Then take the next step and see if you can't get gcc to see that case. Every optimization is there because some individual or group recognized the optimization and intentionally put it there. For this optimization to be there and work every time someone has to put it there (and then test it, and then maintain it into the future).

Definitely do not assume that less code is faster and more code is slower, it is very easy to create and find examples of that not being true. It might more often than not be the case of less code being faster than more code. As I demonstrated from the start though you can create more code to save branching in that case or looping, etc and have the net result be faster code.

The bottom line is you fed a compiler different source and expected the same results. The problem is not the compiler output but the expectations of the user. It is fairly easy to demonstrate for a particular compiler and processor, the addition of one line of code that makes a whole function dramatically slower. For example why does changing a = b + 2; to a = b + c + 2; cause _fill_in_the_blank_compiler_name_ generate radically different and slower code? The answer of course being the compiler was fed different code on the input so it is perfectly valid for the compiler to generate different output. (even better is when you swap two unrelated lines of code and cause the output to change dramatically) There is no expected relationship between the complexity and size of the input to the complexity and size of the output. Feed something like this into clang:

for(ra=0;ra<20;ra++) dummy(ra);

It produced somewhere between 60-100 lines of assembler. It unrolled the loop. I didn't count the lines, if you think about it, it has to add, copy the result to the input to the function call, make the function call, three operations minimum. so depending on the target that is probably 60 instructions at least, 80 if four per loop, 100 if five per loop, etc.


Mysticial has already given a great explanation, but I thought I'd add, FWIW, that there's really nothing fundamental about why a compiler would make the optimization for one and not the other.

LLVM's clang compiler, for example, gives the same code for both functions (except for the function name), giving:

_fast_trunc_two:                        ## @fast_trunc_one
        movl    %edi, %edx
        andl    $-2147483648, %edx      ## imm = 0xFFFFFFFF80000000
        movl    %edi, %esi
        andl    $8388607, %esi          ## imm = 0x7FFFFF
        orl     $8388608, %esi          ## imm = 0x800000
        shrl    $23, %edi
        movzbl  %dil, %eax
        movl    $150, %ecx
        subl    %eax, %ecx
        js      LBB0_1
        shrl    %cl, %esi
        jmp     LBB0_3
LBB0_1:                                 ## %if.then
        negl    %ecx
        shll    %cl, %esi
LBB0_3:                                 ## %if.end
        movl    %edx, %eax
        negl    %eax
        xorl    %esi, %eax
        addl    %edx, %eax
        ret

This code isn't as short as the first gcc version from the OP, but not as long as the second.

Code from another compiler (which I won't name), compiling for x86_64, produces this for both functions:

fast_trunc_one:
        movl      %edi, %ecx        
        shrl      $23, %ecx         
        movl      %edi, %eax        
        movzbl    %cl, %edx         
        andl      $8388607, %eax    
        negl      %edx              
        orl       $8388608, %eax    
        addl      $150, %edx        
        movl      %eax, %esi        
        movl      %edx, %ecx        
        andl      $-2147483648, %edi
        negl      %ecx              
        movl      %edi, %r8d        
        shll      %cl, %esi         
        negl      %r8d              
        movl      %edx, %ecx        
        shrl      %cl, %eax         
        testl     %edx, %edx        
        cmovl     %esi, %eax        
        xorl      %r8d, %eax        
        addl      %edi, %eax        
        ret                         

which is fascinating in that it computes both sides of the if and then uses a conditional move at the end to pick the right one.

The Open64 compiler produces the following:

fast_trunc_one: 
    movl %edi,%r9d                  
    sarl $23,%r9d                   
    movzbl %r9b,%r9d                
    addl $-150,%r9d                 
    movl %edi,%eax                  
    movl %r9d,%r8d                  
    andl $8388607,%eax              
    negl %r8d                       
    orl $8388608,%eax               
    testl %r8d,%r8d                 
    jl .LBB2_fast_trunc_one         
    movl %r8d,%ecx                  
    movl %eax,%edx                  
    sarl %cl,%edx                   
.Lt_0_1538:
    andl $-2147483648,%edi          
    movl %edi,%eax                  
    negl %eax                       
    xorl %edx,%eax                  
    addl %edi,%eax                  
    ret                             
    .p2align 5,,31
.LBB2_fast_trunc_one:
    movl %r9d,%ecx                  
    movl %eax,%edx                  
    shll %cl,%edx                   
    jmp .Lt_0_1538                  

and similar, but not identical, code for fast_trunc_two.

Anyway, when it comes to optimization, it's a lottery — it is what it is... It isn't always easy to know why you code gets compiled any particular way.

참고URL : https://stackoverflow.com/questions/10250419/why-does-gcc-generate-such-radically-different-assembly-for-nearly-the-same-c-co

반응형