一 用 C++ 实现一个向量求和运算
// Demo program whose generated machine code is examined in the sections
// below: adds two byte vectors element by element.
//
// NOTE(review): the original listing had six #include directives whose header
// names were lost (angle-bracket contents stripped). The five headers below are
// the ones this code actually requires; the sixth could not be recovered.
#include <stdint.h>  // uint8_t
#include <stdio.h>   // printf
#include <stdlib.h>  // malloc
#include <string.h>  // memset
#include <time.h>    // time

// Holds the result vector produced by Func().
class Test {
public:
    Test() = default;
    ~Test() = default;

    // Builds two temporary vectors of `len` bytes (each byte set to 3), stores
    // their element-wise sum in `c`, and records the length in `datalen`.
    // Always returns 0.
    int Func() {
        // Length is derived from the current time so the compiler cannot
        // constant-fold it; the value is in the range [0, 23].
        int len = time(NULL) % 24;

        // NOTE(review): malloc results are not checked, and `a`/`b` are never
        // freed. This is kept as-is on purpose so that the source keeps
        // matching the disassembly shown in the article.
        uint8_t *a = (uint8_t *)malloc(len);
        memset(a, 0x3, len);
        uint8_t *b = (uint8_t *)malloc(len);
        memset(b, 0x3, len);

        c = (uint8_t *)malloc(len);
        // uint8_t arithmetic: the sum is truncated to 8 bits on store.
        for (int i = 0; i < len; i++)
            c[i] = a[i] + b[i];

        datalen = len;
        return 0;
    }

public:
    uint8_t *c{NULL};  // result buffer allocated by Func(); never freed by this class
    int datalen{0};    // number of valid bytes in `c`
};

// Runs the computation once and prints every result byte in decimal.
int main() {
    Test t;
    t.Func();
    for (int i = 0; i < t.datalen; i++)
        printf("%d ", t.c[i]);
    return 0;
}
二 不加编译优化反汇编
_main: ; { 100000eb0: 55 pushq %rbp //入栈保存调用者的栈底(帧指针)rbp 100000eb1: 48 89 e5 movq %rsp, %rbp //rbp = rsp,建立新栈帧;后缀 q 表示 quad word(64bit) 100000eb4: 48 83 ec 30 subq $48, %rsp //栈顶下移 48 bytes 100000eb8: 31 c0 xorl %eax, %eax //清空eax 100000eba: 89 c7 movl %eax, %edi //eax -> edi , 第一个参数(NULL)存入edi 100000ebc: c7 45 fc 00 00 00 00 movl $0, -4(%rbp) //0 存入 rbp 下面4个字节,就是上面 rsp - 48 腾出来的空间 ; int len = time(NULL)%24; 100000ec3: e8 b8 00 00 00 callq 184//应该是调用函数time 100000ec8: 48 99 cqto //将 rax 符号扩展到 rdx:rax,为下面的除法做准备 100000eca: bf 18 00 00 00 movl $24, %edi //24存入 edi 100000ecf: 48 f7 ff idivq %rdi //有符号除法 rdx:rax / rdi,商存入 rax,余数存入 rdx 100000ed2: 89 d1 movl %edx, %ecx //edx 中是对24取余的结果,将其存入ecx 100000ed4: 89 4d f8 movl %ecx, -8(%rbp) //len的栈地址 rbp - 8, 计算的结果 rbp - 8 为len这个变量 ; uint8_t *a = (uint8_t*)malloc(len); 100000ed7: 48 63 7d f8 movslq -8(%rbp), %rdi //rbp - 8 处的 len 符号扩展后存入 rdi , rdi作为函数的第一个参数, rsi其次之 100000edb: e8 94 00 00 00 callq 148 //调用malloc函数 100000ee0: 48 89 45 f0 movq %rax, -16(%rbp) //返回的申请的内存首地址,存入 rbp - 16这个栈地址中 ; memset(a, 0x3, len); 100000ee4: 48 8b 7d f0 movq -16(%rbp), %rdi //将要调用memset函数, 这个指针是第一个参数 100000ee8: 48 63 55 f8 movslq -8(%rbp), %rdx //rbp - 8存储的len 变量,作为第三个参数 100000eec: be 03 00 00 00 movl $3, %esi //esi 为存储的memset的值参数。 100000ef1: e8 84 00 00 00 callq 132 //调用memset函数 ; uint8_t *b = (uint8_t*)malloc(len); 100000ef6: 48 63 7d f8 movslq -8(%rbp), %rdi 100000efa: e8 75 00 00 00 callq 117 //函数地址和上面的一样 100000eff: 48 89 45 e8 movq %rax, -24(%rbp) //rax为返回值寄存器,存入, rbp - 24中 ; memset(b, 0x3, len); 100000f03: 48 8b 7d e8 movq -24(%rbp), %rdi //再将 rbp - 24 栈变量 b存入 rdi 100000f07: 48 63 55 f8 movslq -8(%rbp), %rdx //rbp - 8 处的长度len 存入rdx,作为第三个参数 100000f0b: be 03 00 00 00 movl $3, %esi //3存入 esi,memset的值 100000f10: e8 65 00 00 00 callq 101 //调用memset ; uint8_t *c = (uint8_t*)malloc(len); 100000f15: 48 63 7d f8 movslq -8(%rbp), %rdi 100000f19: e8 56 00 00 00 callq 86 100000f1e: 48 89 45 e0 movq %rax, -32(%rbp) ; for (int i = 0; i < len; i++) 100000f22: c7 45 dc 00 00 00 00 movl $0, -36(%rbp) //定义一个栈变量i,并赋值为0 100000f29: 8b 45 dc movl -36(%rbp), %eax //i存储到 
eax寄存器 100000f2c: 3b 45 f8 cmpl -8(%rbp), %eax //将len和eax寄存器做比较 100000f2f: 0f 8d 37 00 00 00 jge 55 <_main+0xbc> //判断循环是否结束 ; c[i] = a[i] + b[i]; 100000f35: 48 8b 45 f0 movq -16(%rbp), %rax //指针a存入rax 100000f39: 48 63 4d dc movslq -36(%rbp), %rcx // i变量存放到rcx 寄存器,rcx一般作为循环计数 100000f3d: 0f b6 14 08 movzbl (%rax,%rcx), %edx // rax + rcx 处的一个字节零扩展后存放到 edx寄存器,就是 a[i] 100000f41: 48 8b 45 e8 movq -24(%rbp), %rax //指针b 100000f45: 48 63 4d dc movslq -36(%rbp), %rcx 100000f49: 0f b6 34 08 movzbl (%rax,%rcx), %esi //rax + rcx 处的一个字节零扩展后存放到 esi寄存器,就是 b[i] 100000f4d: 01 f2 addl %esi, %edx //两个数相加,结果在edx 100000f4f: 40 88 d7 movb %dl, %dil 100000f52: 48 8b 45 e0 movq -32(%rbp), %rax //指针c存入rax 100000f56: 48 63 4d dc movslq -36(%rbp), %rcx 100000f5a: 40 88 3c 08 movb %dil, (%rax,%rcx) // *(rax + rcx) = dil, c[i] = a[i] + b[i]计算完成 ; for (int i = 0; i < len; i++) 100000f5e: 8b 45 dc movl -36(%rbp), %eax 100000f61: 83 c0 01 addl $1, %eax //计数器 + 1 100000f64: 89 45 dc movl %eax, -36(%rbp) 100000f67: e9 bd ff ff ff jmp -67 <_main+0x79> //跳回循环条件处,重新判断循环是否结束 100000f6c: 31 c0 xorl %eax, %eax ; return 0;
三 -O3 编译优化
_main: ; { 100000e10: 55 pushq %rbp 100000e11: 48 89 e5 movq %rsp, %rbp //和上面一样 100000e14: 41 57 pushq %r15 100000e16: 41 56 pushq %r14 100000e18: 41 55 pushq %r13 100000e1a: 41 54 pushq %r12 100000e1c: 53 pushq %rbx 100000e1d: 50 pushq %rax ; int len = time(NULL)%24; 100000e1e: 31 ff xorl %edi, %edi //edi = 0 100000e20: e8 47 01 00 00 callq 327//获取当前时间 100000e25: 49 89 c6 movq %rax, %r14 // 返回值存入r14 100000e28: 48 b9 ab aa aa aa aa aa aa 2a movabsq $3074457345618258603, %rcx //魔数 0x2AAAAAAAAAAAAAAB = ceil(2^64 / 6),用乘法代替除法 100000e32: 48 f7 e9 imulq %rcx //rdx:rax = rax * rcx,高64位 rdx 约等于 time / 6 100000e35: 48 89 d0 movq %rdx, %rax 100000e38: 48 c1 e8 3f shrq $63, %rax //rax右移位 63,取出符号位(被除数为负时商需要 +1 修正) 100000e3c: 48 c1 ea 02 shrq $2, %rdx //rdx右移位2,再除以4,得到 time / 24 100000e40: 48 01 c2 addq %rax, %rdx //加上符号修正,rdx = 商 q = time / 24 100000e43: 48 c1 e2 03 shlq $3, %rdx //左移位 3,rdx = q * 8 100000e47: 48 8d 04 52 leaq (%rdx,%rdx,2), %rax //rax = rdx * 3 = q * 24 100000e4b: 49 29 c6 subq %rax, %r14 //r14 = time - q * 24,即 time % 24 ; uint8_t *a = (uint8_t*)malloc(len); 100000e4e: 4c 89 f7 movq %r14, %rdi //最终len 存放于r14中 100000e51: e8 04 01 00 00 callq 260 100000e56: 49 89 c7 movq %rax, %r15 //和之前代码一样,申请内存 ; memset(a, 0x3, len); 100000e59: 48 89 c7 movq %rax, %rdi 100000e5c: be 03 00 00 00 movl $3, %esi 100000e61: 4c 89 f2 movq %r14, %rdx 100000e64: e8 f7 00 00 00 callq 247 //和优化前代码一样,memset ; uint8_t *b = (uint8_t*)malloc(len); 100000e69: 4c 89 f7 movq %r14, %rdi 100000e6c: e8 e9 00 00 00 callq 233 100000e71: 49 89 c4 movq %rax, %r12 ; memset(b, 0x3, len); 100000e74: 48 89 c7 movq %rax, %rdi 100000e77: be 03 00 00 00 movl $3, %esi 100000e7c: 4c 89 f2 movq %r14, %rdx 100000e7f: e8 dc 00 00 00 callq 220 ; c = (uint8_t*)malloc(len); 100000e84: 4c 89 f7 movq %r14, %rdi 100000e87: e8 ce 00 00 00 callq 206 ; for (int i = 0; i < len; i++) 100000e8c: 45 85 f6 testl %r14d, %r14d 100000e8f: 0f 8e b3 00 00 00 jle 179 <_main+0x138> 100000e95: 49 89 c5 movq %rax, %r13 100000e98: 44 89 f0 movl %r14d, %eax ; c[i] = a[i] + b[i]; 100000e9b: 48 83 f8 20 cmpq $32, %rax //元素个数 >= 32 时才跳到下面的 SSE 向量化循环 100000e9f: 73 04 jae 4 <_main+0x95> 100000ea1: 31 c9 xorl %ecx, %ecx //清空ecx 100000ea3: eb 5b jmp 91 <_main+0xf0> 100000ea5: 44 89 f2 
movl %r14d, %edx 100000ea8: 83 e2 1f andl $31, %edx //edx = len % 32,向量化循环处理不完的剩余元素个数 100000eab: 48 89 c1 movq %rax, %rcx 100000eae: 48 29 d1 subq %rdx, %rcx //rcx = len - len % 32,向量化循环要处理的元素个数 100000eb1: 31 f6 xorl %esi, %esi 100000eb3: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) 100000ebd: 0f 1f 00 nopl (%rax) 100000ec0: f3 41 0f 6f 04 37 movdqu (%r15,%rsi), %xmm0 //rsi存储的 指针偏移, r15 = a -> xmm0 100000ec6: f3 41 0f 6f 4c 37 10 movdqu 16(%r15,%rsi), %xmm1 //r15 + 16 -> xmm1 100000ecd: f3 41 0f 6f 14 34 movdqu (%r12,%rsi), %xmm2 //r12 为b 100000ed3: 66 0f fc d0 paddb %xmm0, %xmm2 //按字节相加,一次处理16个字节 100000ed7: f3 41 0f 6f 44 34 10 movdqu 16(%r12,%rsi), %xmm0 //b + 16 -> xmm0 100000ede: 66 0f fc c1 paddb %xmm1, %xmm0 //按字节相加:(a + 16) 处的16字节 + (b + 16) 处的16字节 100000ee2: f3 41 0f 7f 54 35 00 movdqu %xmm2, (%r13,%rsi) //数据存入c 前面16字节, 100000ee9: f3 41 0f 7f 44 35 10 movdqu %xmm0, 16(%r13,%rsi) //数据存入 c后面的16字节 ; for (int i = 0; i < len; i++) 100000ef0: 48 83 c6 20 addq $32, %rsi 100000ef4: 48 39 f1 cmpq %rsi, %rcx 100000ef7: 75 c7 jne -57 <_main+0xb0> 100000ef9: 48 85 d2 testq %rdx, %rdx 100000efc: 74 18 je 24 <_main+0x106> 100000efe: 66 90 nop ; c[i] = a[i] + b[i];