-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdeltas_amd64.s
253 lines (225 loc) · 9.82 KB
/
deltas_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
// _compute_deltas(input, length, output, starting_point)
//
// For every index i below length, stores to output[i] the 32-bit difference
// (wrapping) between input[i] and the element preceding it. The element
// "preceding" input[0] is the low 32 bits of starting_point.
//
// Frame: $0-32 -- no local stack, 32 bytes of arguments read at
//   input+0(FP), length+8(FP), output+16(FP), starting_point+24(FP).
// Register roles after the argument loads:
//   DI   = input pointer
//   SI   = length, counted in 32-bit elements
//   DX   = output pointer
//   xmm0 = most recently processed input vector (initially starting_point
//          replicated in all four lanes); its lane 3 is the "previous element"
// Registers written: AX, BX, CX, R8-R11, xmm0-xmm7 (as ymm in the wide loop).
// Every vector instruction is VEX-encoded, and the routine uses vpbroadcastd
// and 256-bit integer operations; no CPU feature check is performed here.
TEXT ·_compute_deltas(SB), $0-32
MOVQ input+0(FP), DI
MOVQ length+8(FP), SI
MOVQ output+16(FP), DX
MOVQ starting_point+24(FP), CX
// Only the low 32 bits of starting_point (ecx) are used: replicate them
// across xmm0 so lane 3 can serve as the predecessor of input[0].
LONG $0xc16ef9c5 // vmovd xmm0, ecx
LONG $0x5879e2c4; BYTE $0xc0 // vpbroadcastd xmm0, xmm0
// Fewer than 4 elements: skip the 128-bit loop entirely.
LONG $0x04fe8348 // cmp rsi, 4
JAE LBB0_2
// r10 = 0 is the index where the tail starts; return right away if length == 0.
WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
JB LBB0_9
JMP LBB0_22
LBB0_2:
// r8 = number of full 4-element vectors (length / 4).
WORD $0x8949; BYTE $0xf0 // mov r8, rsi
LONG $0x02e8c149 // shr r8, 2
// Exactly one vector: bypass the 2x-unrolled loop with r9 (vectors done) = 0.
LONG $0x01f88349 // cmp r8, 1
JNE LBB0_4
WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
JMP LBB0_6
LBB0_4:
// Round the vector count down to an even number for the 2x-unrolled loop.
// rax walks the input; rcx walks the output, biased by +16 so the two stores
// use displacements -16 and 0. r9 counts vectors processed.
LONG $0xfee08349 // and r8, -2
LONG $0x104a8d48 // lea rcx, [rdx + 16]
WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
WORD $0x8948; BYTE $0xf8 // mov rax, rdi
LBB0_5:
// Two vectors per iteration. For each one: vpalignr(cur, prev, 12) yields
// {prev[3], cur[0], cur[1], cur[2]}, i.e. every lane's predecessor, which is
// then subtracted from cur. xmm0 and xmm1 swap the prev/cur roles between the
// two halves so that xmm0 holds the latest input vector at the loop bottom.
LONG $0x08f0fbc5 // vlddqu xmm1, oword [rax]
LONG $0x0f71e3c4; WORD $0x0cc0 // vpalignr xmm0, xmm1, xmm0, 12
LONG $0xc0faf1c5 // vpsubd xmm0, xmm1, xmm0
LONG $0x417ffac5; BYTE $0xf0 // vmovdqu oword [rcx - 16], xmm0
LONG $0x40f0fbc5; BYTE $0x10 // vlddqu xmm0, oword [rax + 16]
LONG $0x0f79e3c4; WORD $0x0cc9 // vpalignr xmm1, xmm0, xmm1, 12
LONG $0xc9faf9c5 // vpsubd xmm1, xmm0, xmm1
LONG $0x097ffac5 // vmovdqu oword [rcx], xmm1
LONG $0x02c18349 // add r9, 2
LONG $0x20c08348 // add rax, 32
LONG $0x20c18348 // add rcx, 32
WORD $0x394d; BYTE $0xc8 // cmp r8, r9
JNE LBB0_5
LBB0_6:
// Bit 2 of length set <=> the count of full vectors is odd, so one vector is
// still pending at vector index r9 (byte offset r9 * 16).
LONG $0x04c6f640 // test sil, 4
JE LBB0_8
LONG $0x04e1c149 // shl r9, 4
LONG $0xf07ba1c4; WORD $0x0f0c // vlddqu xmm1, oword [rdi + r9]
LONG $0x0f71e3c4; WORD $0x0cc0 // vpalignr xmm0, xmm1, xmm0, 12
LONG $0xc0faf1c5 // vpsubd xmm0, xmm1, xmm0
LONG $0x7f7aa1c4; WORD $0x0a04 // vmovdqu oword [rdx + r9], xmm0
// Keep the raw input vector (not the deltas) as "previous" for the tail.
LONG $0xc16ff9c5 // vmovdqa xmm0, xmm1
LBB0_8:
// r10 = length rounded down to a multiple of 4 = first index not yet written.
// Done if nothing remains.
WORD $0x8949; BYTE $0xf2 // mov r10, rsi
LONG $0xfce28349 // and r10, -4
WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
JAE LBB0_22
LBB0_9:
// Tail: r9d = previous element (lane 3 of xmm0), r8 = elements remaining.
LONG $0x1679c3c4; WORD $0x03c1 // vpextrd r9d, xmm0, 3
WORD $0x8949; BYTE $0xf0 // mov r8, rsi
WORD $0x294d; BYTE $0xd0 // sub r8, r10
// With fewer than 32 elements left, go straight to the scalar code.
// NOTE(review): on both paths into LBB0_9 visible here (r10 = 0 with
// length < 4, or r10 = length & -4) the remainder is at most 3, so this
// branch appears to be always taken and the 256-bit path at LBB0_14 looks
// unreachable -- confirm before relying on it.
LONG $0x20f88349 // cmp r8, 32
JB LBB0_10
// Runtime overlap check between the remaining input and output ranges:
// take the wide path only if output+4*r10 >= input+4*length or
// input+4*r10 >= output+4*length.
LONG $0x92048d4a // lea rax, [rdx + 4*r10]
LONG $0xb70c8d48 // lea rcx, [rdi + 4*rsi]
WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
JAE LBB0_14
LONG $0xb2048d48 // lea rax, [rdx + 4*rsi]
LONG $0x970c8d4a // lea rcx, [rdi + 4*r10]
WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
JAE LBB0_14
LBB0_10:
// rbx = current element index for the scalar loops.
WORD $0x894c; BYTE $0xd3 // mov rbx, r10
LBB0_18:
// r10 = (length - rbx) & 3 : elements to peel one at a time.
// r8  = length - rbx - 1   : remaining count minus one (not rbx + rsi).
WORD $0x8941; BYTE $0xf2 // mov r10d, esi
WORD $0x2941; BYTE $0xda // sub r10d, ebx
WORD $0x8949; BYTE $0xd8 // mov r8, rbx
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0x0149; BYTE $0xf0 // add r8, rsi
LONG $0x03e28349 // and r10, 3
JE LBB0_23
LBB0_19:
// One element per iteration: output[rbx] = input[rbx] - previous (r9d),
// then the loaded value becomes the new previous.
WORD $0x048b; BYTE $0x9f // mov eax, dword [rdi + 4*rbx]
WORD $0xc189 // mov ecx, eax
WORD $0x2944; BYTE $0xc9 // sub ecx, r9d
WORD $0x0c89; BYTE $0x9a // mov dword [rdx + 4*rbx], ecx
LONG $0x01c38348 // add rbx, 1
WORD $0x8941; BYTE $0xc1 // mov r9d, eax
LONG $0xffc28349 // add r10, -1
JNE LBB0_19
// r8 < 3 means at most 3 elements were remaining at LBB0_18, all handled above.
LONG $0x03f88349 // cmp r8, 3
JB LBB0_22
JMP LBB0_21
LBB0_23:
// Nothing was peeled: LBB0_21 expects the previous element in eax.
WORD $0x8944; BYTE $0xc8 // mov eax, r9d
LONG $0x03f88349 // cmp r8, 3
JB LBB0_22
LBB0_21:
// Scalar loop unrolled 4x; eax and r8d alternate as "previous element" so no
// extra register moves are needed. Runs until rbx reaches length.
LONG $0x9f048b44 // mov r8d, dword [rdi + 4*rbx]
WORD $0x8944; BYTE $0xc1 // mov ecx, r8d
WORD $0xc129 // sub ecx, eax
WORD $0x0c89; BYTE $0x9a // mov dword [rdx + 4*rbx], ecx
LONG $0x049f448b // mov eax, dword [rdi + 4*rbx + 4]
WORD $0xc189 // mov ecx, eax
WORD $0x2944; BYTE $0xc1 // sub ecx, r8d
LONG $0x049a4c89 // mov dword [rdx + 4*rbx + 4], ecx
LONG $0x9f448b44; BYTE $0x08 // mov r8d, dword [rdi + 4*rbx + 8]
WORD $0x8944; BYTE $0xc1 // mov ecx, r8d
WORD $0xc129 // sub ecx, eax
LONG $0x089a4c89 // mov dword [rdx + 4*rbx + 8], ecx
LONG $0x0c9f448b // mov eax, dword [rdi + 4*rbx + 12]
WORD $0xc189 // mov ecx, eax
WORD $0x2944; BYTE $0xc1 // sub ecx, r8d
LONG $0x0c9a4c89 // mov dword [rdx + 4*rbx + 12], ecx
LONG $0x04c38348 // add rbx, 4
WORD $0x3948; BYTE $0xde // cmp rsi, rbx
JNE LBB0_21
JMP LBB0_22
LBB0_14:
// 256-bit path, 32 elements (four ymm vectors) per iteration.
// r9  = remaining count rounded down to a multiple of 32
// rbx = index where the scalar code resumes afterwards (r10 + r9)
// r11 / r10 = input / output cursors biased by +96 bytes so the four
//             accesses use displacements -96, -64, -32 and 0
// rcx = element offset within this section
// ymm0 = previous vector, with xmm0 copied into its upper 128-bit half.
WORD $0x894d; BYTE $0xc1 // mov r9, r8
LONG $0xe0e18349 // and r9, -32
LONG $0x0a1c8d4b // lea rbx, [r10 + r9]
LONG $0x387de3c4; WORD $0x01c0 // vinserti128 ymm0, ymm0, xmm0, 1
LONG $0x971c8d4e // lea r11, [rdi + 4*r10]
LONG $0x60c38349 // add r11, 96
LONG $0x92148d4e // lea r10, [rdx + 4*r10]
LONG $0x60c28349 // add r10, 96
WORD $0xc931 // xor ecx, ecx
LBB0_15:
// For each loaded vector cur with predecessor prev:
//   vperm2i128(prev, cur, 33) -> {prev.high128, cur.low128}
//   vpalignr(cur, that, 12)   -> every lane's preceding element
//   vpsubd                    -> cur - predecessor
// ymm0 is consumed as "prev" for ymm1 before being overwritten with the
// fourth load, which in turn is the "prev" of the next iteration.
LONG $0x6f7ec1c4; WORD $0x8b4c; BYTE $0xa0 // vmovdqu ymm1, yword [r11 + 4*rcx - 96]
LONG $0x6f7ec1c4; WORD $0x8b54; BYTE $0xc0 // vmovdqu ymm2, yword [r11 + 4*rcx - 64]
LONG $0x6f7ec1c4; WORD $0x8b5c; BYTE $0xe0 // vmovdqu ymm3, yword [r11 + 4*rcx - 32]
LONG $0x467de3c4; WORD $0x21e1 // vperm2i128 ymm4, ymm0, ymm1, 33
LONG $0x6f7ec1c4; WORD $0x8b04 // vmovdqu ymm0, yword [r11 + 4*rcx]
LONG $0x0f75e3c4; WORD $0x0ce4 // vpalignr ymm4, ymm1, ymm4, 12
LONG $0x4675e3c4; WORD $0x21ea // vperm2i128 ymm5, ymm1, ymm2, 33
LONG $0x0f6de3c4; WORD $0x0ced // vpalignr ymm5, ymm2, ymm5, 12
LONG $0x466de3c4; WORD $0x21f3 // vperm2i128 ymm6, ymm2, ymm3, 33
LONG $0x0f65e3c4; WORD $0x0cf6 // vpalignr ymm6, ymm3, ymm6, 12
LONG $0x4665e3c4; WORD $0x21f8 // vperm2i128 ymm7, ymm3, ymm0, 33
LONG $0x0f7de3c4; WORD $0x0cff // vpalignr ymm7, ymm0, ymm7, 12
LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4
LONG $0xd5faedc5 // vpsubd ymm2, ymm2, ymm5
LONG $0xdefae5c5 // vpsubd ymm3, ymm3, ymm6
LONG $0xe7fafdc5 // vpsubd ymm4, ymm0, ymm7
LONG $0x7f7ec1c4; WORD $0x8a4c; BYTE $0xa0 // vmovdqu yword [r10 + 4*rcx - 96], ymm1
LONG $0x7f7ec1c4; WORD $0x8a54; BYTE $0xc0 // vmovdqu yword [r10 + 4*rcx - 64], ymm2
LONG $0x7f7ec1c4; WORD $0x8a5c; BYTE $0xe0 // vmovdqu yword [r10 + 4*rcx - 32], ymm3
LONG $0x7f7ec1c4; WORD $0x8a24 // vmovdqu yword [r10 + 4*rcx], ymm4
LONG $0x20c18348 // add rcx, 32
WORD $0x3949; BYTE $0xc9 // cmp r9, rcx
JNE LBB0_15
// If the remainder was not an exact multiple of 32, finish it in scalar code.
WORD $0x394d; BYTE $0xc8 // cmp r8, r9
JNE LBB0_17
LBB0_22:
// Common exit: clear the upper ymm halves (the wide loop may have written
// them) before returning.
VZEROUPPER
RET
LBB0_17:
// Recover the last input element handled by the wide loop (top lane of ymm0)
// as the scalar "previous element" and resume at index rbx.
LONG $0x397de3c4; WORD $0x01c0 // vextracti128 xmm0, ymm0, 1
LONG $0x1679c3c4; WORD $0x03c1 // vpextrd r9d, xmm0, 3
JMP LBB0_18
// _compute_prefix_sum(input, length, output, starting_point)
//
// For every index i below length, stores to output[i] the running 32-bit sum
// (wrapping) of the low 32 bits of starting_point plus input[0..i].
//
// Frame: $0-32 -- no local stack, 32 bytes of arguments read at
//   input+0(FP), length+8(FP), output+16(FP), starting_point+24(FP).
// Register roles after the argument loads:
//   DI   = input pointer
//   SI   = length, counted in 32-bit elements
//   DX   = output pointer
//   xmm0 = last vector of running sums written (initially starting_point in
//          all four lanes); its lane 3 is the carry into the next group
// Registers written: AX, CX, R8, R9, xmm0-xmm2.
// Only 128-bit VEX forms are encoded here (including vpbroadcastd); no CPU
// feature check is performed in this routine.
TEXT ·_compute_prefix_sum(SB), $0-32
MOVQ input+0(FP), DI
MOVQ length+8(FP), SI
MOVQ output+16(FP), DX
MOVQ starting_point+24(FP), CX
// Only the low 32 bits of starting_point (ecx) are used as the initial carry.
LONG $0xc16ef9c5 // vmovd xmm0, ecx
LONG $0x5879e2c4; BYTE $0xc0 // vpbroadcastd xmm0, xmm0
// Fewer than 4 elements: skip the vector loop.
LONG $0x04fe8348 // cmp rsi, 4
JAE LBB1_2
// rax = 0 is the index where the tail starts; return right away if length == 0.
WORD $0xc031 // xor eax, eax
WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
JB LBB1_6
JMP LBB1_10
LBB1_2:
// r8 = number of full 4-element vectors; rcx / rax = input / output cursors.
WORD $0x8949; BYTE $0xf0 // mov r8, rsi
LONG $0x02e8c149 // shr r8, 2
WORD $0x8948; BYTE $0xf9 // mov rcx, rdi
WORD $0x8948; BYTE $0xd0 // mov rax, rdx
LBB1_3:
// In-register prefix sum of one vector {a, b, c, d}:
//   xmm1 = v + (v << 8 bytes)       = {a, b, a+c, b+d}
//   xmm2 = xmm1 << 4 bytes          = {0, a, b, a+c}
//   xmm1 + xmm2                     = {a, a+b, a+b+c, a+b+c+d}
// vpshufd with 255 replicates lane 3 of the previous result (the carry),
// which is added to every lane. The sum is stored and kept in xmm0.
LONG $0x09f0fbc5 // vlddqu xmm1, oword [rcx]
LONG $0xf973e9c5; BYTE $0x08 // vpslldq xmm2, xmm1, 8
LONG $0xc9fee9c5 // vpaddd xmm1, xmm2, xmm1
LONG $0xf973e9c5; BYTE $0x04 // vpslldq xmm2, xmm1, 4
LONG $0xc070f9c5; BYTE $0xff // vpshufd xmm0, xmm0, 255
LONG $0xc0fef1c5 // vpaddd xmm0, xmm1, xmm0
LONG $0xc2fef9c5 // vpaddd xmm0, xmm0, xmm2
LONG $0x007ffac5 // vmovdqu oword [rax], xmm0
LONG $0x10c08348 // add rax, 16
LONG $0x10c18348 // add rcx, 16
LONG $0xffc08349 // add r8, -1
JNE LBB1_3
// rax is reused as an element index: length rounded down to a multiple of 4.
// Done if nothing remains.
WORD $0x8948; BYTE $0xf0 // mov rax, rsi
LONG $0xfce08348 // and rax, -4
WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
JAE LBB1_10
LBB1_6:
// Tail: ecx = running sum so far (lane 3 of xmm0).
// r8 = length - rax - 1 : remaining count minus one.
// r9 = length & 3       : elements to peel one at a time (rax is a multiple
//                         of 4 on both paths into this label).
LONG $0x1679e3c4; WORD $0x03c1 // vpextrd ecx, xmm0, 3
WORD $0x8949; BYTE $0xc0 // mov r8, rax
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0x0149; BYTE $0xf0 // add r8, rsi
WORD $0x8949; BYTE $0xf1 // mov r9, rsi
LONG $0x03e18349 // and r9, 3
JE LBB1_8
LBB1_7:
// One element per iteration: accumulate into ecx and store the running sum.
WORD $0x0c03; BYTE $0x87 // add ecx, dword [rdi + 4*rax]
WORD $0x0c89; BYTE $0x82 // mov dword [rdx + 4*rax], ecx
LONG $0x01c08348 // add rax, 1
LONG $0xffc18349 // add r9, -1
JNE LBB1_7
LBB1_8:
// r8 < 3 means at most 3 elements were remaining at LBB1_6, all handled above.
// NOTE(review): with rax as computed on both paths into LBB1_6 the remainder
// is at most 3, so the unrolled loop below appears unreachable -- confirm.
LONG $0x03f88349 // cmp r8, 3
JB LBB1_10
LBB1_9:
// Scalar loop unrolled 4x; runs until rax reaches length.
WORD $0x0c03; BYTE $0x87 // add ecx, dword [rdi + 4*rax]
WORD $0x0c89; BYTE $0x82 // mov dword [rdx + 4*rax], ecx
LONG $0x04874c03 // add ecx, dword [rdi + 4*rax + 4]
LONG $0x04824c89 // mov dword [rdx + 4*rax + 4], ecx
LONG $0x08874c03 // add ecx, dword [rdi + 4*rax + 8]
LONG $0x08824c89 // mov dword [rdx + 4*rax + 8], ecx
LONG $0x0c874c03 // add ecx, dword [rdi + 4*rax + 12]
LONG $0x0c824c89 // mov dword [rdx + 4*rax + 12], ecx
LONG $0x04c08348 // add rax, 4
WORD $0x3948; BYTE $0xc6 // cmp rsi, rax
JNE LBB1_9
LBB1_10:
RET