美文网首页
汇编优化问题

汇编优化问题

作者: Zparkle | 来源:发表于2019-04-01 19:15 被阅读0次

    1.循环计算字符串长度中的代码耗时问题

    在用gdb阅读bomblab 问题1时,阅读了一下String_length函数,有个疑问

    String_length:
        cmpb  $0x0,(%rdi)  ;check whether the first byte of the string is NUL (empty string)
        je d ;if it is, jump to the zero-length exit (target is truncated in this listing; presumably the `mov $0x0,%eax` near the end)
        mov  %rdi,%rdx ;copy the string pointer (first argument) into rdx to use as the scan cursor
        add  $0x1,%rdx ;advance the cursor to the next byte
        mov   %edx,%eax;copy the low 32 bits of the cursor into the return register
        sub  %edi,%eax;subtract the low 32 bits of the start address (edi is the lower half of rdi); eax now holds the number of bytes scanned so far
        cmpb  $0x0,(%rdx);check whether the cursor has reached the terminating NUL
        jne ; if not, loop again (target is missing in this listing; presumably the `add` above)
        repz retq; otherwise eax already holds the final length, so return
        mov  $0x0,%eax;empty-string path: set the return value to 0
        retq
        
    

    My question concerns the middle of this function: look at the 5th and 6th instructions of String_length. To compute the length of the input string, the function uses one mov and one sub on every iteration to take the distance between the current position and the start of the string. Why not simply initialize the return value %eax to $0x0 and add one at each step? I would expect sub to cost the machine more than a plain add (negate, add one, then add? 取反加一再加?).
    So why can't we modify the code like the following?

    String_length:
        cmpb  $0x0,(%rdi) ;check whether the first byte of the string is NUL (empty string)
        je d ;if it is, jump to the zero-length exit (target abbreviated as in the listing above; presumably the final `mov $0x0,%eax`)
        mov  %rdi,%rdx ;copy the string pointer into rdx to use as the scan cursor
        mov $0x0,%eax;initialize the return value (running length) to 0
    here:
        add  $0x1,%rdx ;advance the cursor to the next byte
        add  $0x1,%eax ;count that byte: this add replaces the mov+sub pair of the original
        cmpb  $0x0,(%rdx) ;check whether the cursor has reached the terminating NUL
        jne (goto here)
        repz retq ;otherwise eax holds the final length, so return
        mov  $0x0,%eax ;empty-string path: set the return value to 0
        retq
        
    

    Both versions have 11 instructions, and I will test whether the second one runs faster. (I hope so.)

    2019/4/2 I do the test

    the source code of my test program as below:

    #include<stdio.h>
    #include<time.h>
    /*
     * String_length_2 - length of a NUL-terminated string, mirroring the
     * compiler-generated routine from the listing: on every iteration the
     * cursor is advanced, copied (low 32 bits) into the result register and
     * the low 32 bits of the start address are subtracted from it.
     *
     * str: pointer to a NUL-terminated string; must not be NULL.
     * Returns the number of bytes before the terminator, computed with
     * 32-bit arithmetic exactly like the original instruction sequence.
     *
     * x86-64, GCC/Clang extended asm only.
     */
    int String_length_2(char* str){
            int length;
            char *cursor;   /* scratch register for the scan position */

            __asm__ __volatile__(
                    "cmpb $0x0,(%[start])\n\t"      /* empty string?            */
                    "je 2f\n\t"
                    "mov %[start],%[cur]\n"
                    "1:\n\t"
                    "add $0x1,%[cur]\n\t"           /* next byte                */
                    "mov %k[cur],%[len]\n\t"        /* len = (int)cursor        */
                    "sub %k[start],%[len]\n\t"      /* len -= (int)start        */
                    "cmpb $0x0,(%[cur])\n\t"        /* reached the terminator?  */
                    "jne 1b\n\t"
                    "jmp 3f\n"
                    "2:\n\t"
                    "mov $0x0,%[len]\n"
                    "3:"
                    /* Early-clobber outputs: both are written while the input
                     * is still live, so they must not share its register. */
                    : [len] "=&r"(length), [cur] "=&r"(cursor)
                    : [start] "r"(str)
                    /* Flags are modified and the string is read through the
                     * pointer, which the operand list alone does not tell
                     * the compiler. */
                    : "cc", "memory");
            return length;
    }
    /*
     * String_length - length of a NUL-terminated string, using the proposed
     * variant: keep a counter and add 1 to it on every iteration instead of
     * recomputing cursor - start with mov + sub.
     *
     * str: pointer to a NUL-terminated string; must not be NULL.
     * Returns the number of bytes before the terminator (32-bit counter).
     *
     * x86-64, GCC/Clang extended asm only.
     */
    int String_length(char* str){
            int length;

            __asm__ __volatile__(
                    "cmpb $0x0,(%[cur])\n\t"        /* empty string?            */
                    "je 2f\n\t"
                    "mov $0x0,%[len]\n"
                    "1:\n\t"
                    "add $0x1,%[cur]\n\t"           /* next byte                */
                    "add $0x1,%[len]\n\t"           /* count it                 */
                    "cmpb $0x0,(%[cur])\n\t"        /* reached the terminator?  */
                    "jne 1b\n\t"
                    "jmp 3f\n"
                    "2:\n\t"
                    "mov $0x0,%[len]\n"
                    "3:"
                    /* The pointer is advanced inside the asm, so it has to be
                     * a read-write operand; the counter is early-clobber so
                     * it never shares the pointer's register. */
                    : [len] "=&r"(length), [cur] "+r"(str)
                    :
                    /* Flags are modified and the string is read through the
                     * pointer. */
                    : "cc", "memory");
            return length;
    }
    /*
     * Benchmark driver: calls each string-length routine ITERATIONS times on
     * the same short string and prints the elapsed processor time, in raw
     * clock() ticks, together with the last length returned.
     */
    int main(void){
            enum { ITERATIONS = 200000000 };
            char str[] = "nice!";
            int times,length = 0;
            clock_t start,end;

            start = clock();
            for(times = ITERATIONS; times>0; times--){
                    length = String_length(str);
            }
            end = clock();
            /* clock_t has an implementation-defined type; cast to match %ld. */
            printf("my_func time consume=%ld\nand length=%d\n",(long)(end-start),length);

            start = clock();
            for(times = ITERATIONS; times>0; times--){
                    length = String_length_2(str);
            }
            end = clock();
            printf("origin_func time consume=%ld\nand length=%d\n",(long)(end-start),length);
            return 0;
    }
    
    

    Unfortunately, the result is:

    zhuangh7@LAPTOP-BK6LH6G7:/mnt/c/bomb$ ./hello
    my_func time consume=875000
    and length=5
    origin_func time consume=812500
    and length=5
    zhuangh7@LAPTOP-BK6LH6G7:/mnt/c/bomb$ ./hello
    my_func time consume=859375
    and length=5
    origin_func time consume=828125
    and length=5

    I have no idea why the original function still performs better. I hope someone can give me a hand.

    30 minutes later


    马腿还是牛逼啊……
    立即数参与的运算更慢一些,所以我把代码改成了:

    /*
     * String_length - length of a NUL-terminated string, counter variant in
     * which the increment comes from a register preloaded with 1 rather than
     * from an immediate operand inside the loop.
     *
     * str: pointer to a NUL-terminated string; must not be NULL.
     * Returns the number of bytes before the terminator (32-bit counter).
     *
     * x86-64, GCC/Clang extended asm only.
     */
    int String_length(char* str){
            int length;
            int one;        /* scratch register holding the constant 1 */

            __asm__ __volatile__(
                    "cmpb $0x0,(%[cur])\n\t"        /* empty string?            */
                    "je 2f\n\t"
                    "mov $0x0,%[len]\n\t"
                    "mov $0x1,%[one]\n"             /* load the increment once  */
                    "1:\n\t"
                    "add $0x1,%[cur]\n\t"           /* next byte                */
                    "add %[one],%[len]\n\t"         /* count it, reg-to-reg add */
                    "cmpb $0x0,(%[cur])\n\t"        /* reached the terminator?  */
                    "jne 1b\n\t"
                    "jmp 3f\n"
                    "2:\n\t"
                    "mov $0x0,%[len]\n"
                    "3:"
                    /* The pointer is advanced inside the asm, so it is a
                     * read-write operand; the scratch register is declared as
                     * an output so the compiler knows it is overwritten. */
                    : [len] "=&r"(length), [one] "=&r"(one), [cur] "+r"(str)
                    :
                    /* Flags are modified and the string is read through the
                     * pointer. */
                    : "cc", "memory");
            return length;
    }
    
    

    把立即数1赋值到某个寄存器rdx上,然后在循环中调用寄存器相加。
    结果:

    my_func time consume=875000
    and length=5
    origin_func time consume=968750
    and length=5

    就很棒。下一个问题:他到底是怎么搞出这种奇葩代码来计算字符串长度的,C语言源代码是什么,编译器又是如何得出这种代码的,没有任何思路,不再讨论。

    相关文章

      网友评论

          本文标题:汇编优化问题

          本文链接:https://www.haomeiwen.com/subject/yddbbqtx.html