/
sums_amd64.s
197 lines (179 loc) · 3.95 KB
/
sums_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
// Code generated by command: go run sums_asm.go -pkg slices -out ../slices/sums_amd64.s -stubs ../slices/sums_amd64.go. DO NOT EDIT.
//go:build !purego
#include "textflag.h"
// func sumUint64(x []uint64, y []uint64)
// Requires: AVX, AVX2, CMOV
//
// Adds y[i] into x[i] in place (wrapping 64-bit addition) for every
// i < min(len(x), len(y)).
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (read and written)
//   BX = base address of y (read only)
//   SI = min(len(x), len(y))
//   AX = scratch: index past the next vector block, then the scalar operand
//
// NOTE(review): the file header says this file is generated; mirror this
// change in the generator so it survives regeneration.
TEXT ·sumUint64(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX
	CMPQ    AX, SI
	CMOVQLT AX, SI                // SI = min(len(x), len(y))

	// Bit 8 of cpu.X86 gates the vector path (presumably the AVX2 feature
	// flag - confirm against the cpu package). When it is clear, no VEX
	// instruction in this function is executed.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration handles 16 elements: 4 YMM registers x 4 uint64 = 128 bytes.
	MOVQ    CX, AX
	ADDQ    $0x10, AX
	CMPQ    AX, SI
	JA      avx2_done             // fewer than 16 elements left; a block that fits exactly is still vectorized
	VMOVDQU (DX)(CX*8), Y0
	VMOVDQU (BX)(CX*8), Y1
	VMOVDQU 32(DX)(CX*8), Y2
	VMOVDQU 32(BX)(CX*8), Y3
	VMOVDQU 64(DX)(CX*8), Y4
	VMOVDQU 64(BX)(CX*8), Y5
	VMOVDQU 96(DX)(CX*8), Y6
	VMOVDQU 96(BX)(CX*8), Y7
	VPADDQ  Y0, Y1, Y0
	VPADDQ  Y2, Y3, Y2
	VPADDQ  Y4, Y5, Y4
	VPADDQ  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*8)
	VMOVDQU Y2, 32(DX)(CX*8)
	VMOVDQU Y4, 64(DX)(CX*8)
	VMOVDQU Y6, 96(DX)(CX*8)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reachable on the vector path. Clear the upper halves of the YMM
	// registers so SSE code that runs after we return does not pay an
	// AVX->SSE transition penalty.
	VZEROUPPER

x86_loop:
	// Scalar tail: one element per iteration until CX reaches SI.
	CMPQ CX, SI
	JAE  return
	MOVQ (BX)(CX*8), AX
	ADDQ AX, (DX)(CX*8)
	ADDQ $0x01, CX
	JMP  x86_loop

return:
	RET
// func sumUint32(x []uint32, y []uint32)
// Requires: AVX, AVX2, CMOV
//
// Adds y[i] into x[i] in place (wrapping 32-bit addition) for every
// i < min(len(x), len(y)).
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (read and written)
//   BX = base address of y (read only)
//   SI = min(len(x), len(y))
//   AX = scratch: index past the next vector block, then the scalar operand
//
// NOTE(review): the file header says this file is generated; mirror this
// change in the generator so it survives regeneration.
TEXT ·sumUint32(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX
	CMPQ    AX, SI
	CMOVQLT AX, SI                // SI = min(len(x), len(y))

	// Bit 8 of cpu.X86 gates the vector path (presumably the AVX2 feature
	// flag - confirm against the cpu package). When it is clear, no VEX
	// instruction in this function is executed.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration handles 32 elements: 4 YMM registers x 8 uint32 = 128 bytes.
	MOVQ    CX, AX
	ADDQ    $0x20, AX
	CMPQ    AX, SI
	JA      avx2_done             // fewer than 32 elements left; a block that fits exactly is still vectorized
	VMOVDQU (DX)(CX*4), Y0
	VMOVDQU (BX)(CX*4), Y1
	VMOVDQU 32(DX)(CX*4), Y2
	VMOVDQU 32(BX)(CX*4), Y3
	VMOVDQU 64(DX)(CX*4), Y4
	VMOVDQU 64(BX)(CX*4), Y5
	VMOVDQU 96(DX)(CX*4), Y6
	VMOVDQU 96(BX)(CX*4), Y7
	VPADDD  Y0, Y1, Y0
	VPADDD  Y2, Y3, Y2
	VPADDD  Y4, Y5, Y4
	VPADDD  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*4)
	VMOVDQU Y2, 32(DX)(CX*4)
	VMOVDQU Y4, 64(DX)(CX*4)
	VMOVDQU Y6, 96(DX)(CX*4)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reachable on the vector path. Clear the upper halves of the YMM
	// registers so SSE code that runs after we return does not pay an
	// AVX->SSE transition penalty.
	VZEROUPPER

x86_loop:
	// Scalar tail: one element per iteration until CX reaches SI.
	CMPQ CX, SI
	JAE  return
	MOVL (BX)(CX*4), AX
	ADDL AX, (DX)(CX*4)
	ADDQ $0x01, CX
	JMP  x86_loop

return:
	RET
// func sumUint16(x []uint16, y []uint16)
// Requires: AVX, AVX2, CMOV
//
// Adds y[i] into x[i] in place (wrapping 16-bit addition) for every
// i < min(len(x), len(y)).
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (read and written)
//   BX = base address of y (read only)
//   SI = min(len(x), len(y))
//   AX = scratch: index past the next vector block, then the scalar operand
//
// NOTE(review): the file header says this file is generated; mirror this
// change in the generator so it survives regeneration.
TEXT ·sumUint16(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX
	CMPQ    AX, SI
	CMOVQLT AX, SI                // SI = min(len(x), len(y))

	// Bit 8 of cpu.X86 gates the vector path (presumably the AVX2 feature
	// flag - confirm against the cpu package). When it is clear, no VEX
	// instruction in this function is executed.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration handles 64 elements: 4 YMM registers x 16 uint16 = 128 bytes.
	MOVQ    CX, AX
	ADDQ    $0x40, AX
	CMPQ    AX, SI
	JA      avx2_done             // fewer than 64 elements left; a block that fits exactly is still vectorized
	VMOVDQU (DX)(CX*2), Y0
	VMOVDQU (BX)(CX*2), Y1
	VMOVDQU 32(DX)(CX*2), Y2
	VMOVDQU 32(BX)(CX*2), Y3
	VMOVDQU 64(DX)(CX*2), Y4
	VMOVDQU 64(BX)(CX*2), Y5
	VMOVDQU 96(DX)(CX*2), Y6
	VMOVDQU 96(BX)(CX*2), Y7
	VPADDW  Y0, Y1, Y0
	VPADDW  Y2, Y3, Y2
	VPADDW  Y4, Y5, Y4
	VPADDW  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*2)
	VMOVDQU Y2, 32(DX)(CX*2)
	VMOVDQU Y4, 64(DX)(CX*2)
	VMOVDQU Y6, 96(DX)(CX*2)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reachable on the vector path. Clear the upper halves of the YMM
	// registers so SSE code that runs after we return does not pay an
	// AVX->SSE transition penalty.
	VZEROUPPER

x86_loop:
	// Scalar tail: one element per iteration until CX reaches SI.
	CMPQ CX, SI
	JAE  return
	MOVW (BX)(CX*2), AX
	ADDW AX, (DX)(CX*2)
	ADDQ $0x01, CX
	JMP  x86_loop

return:
	RET
// func sumUint8(x []uint8, y []uint8)
// Requires: AVX, AVX2, CMOV
//
// Adds y[i] into x[i] in place (wrapping 8-bit addition) for every
// i < min(len(x), len(y)).
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (read and written)
//   BX = base address of y (read only)
//   SI = min(len(x), len(y))
//   AX = scratch: index past the next vector block, then (as AL) the scalar operand
//
// NOTE(review): the file header says this file is generated; mirror this
// change in the generator so it survives regeneration.
TEXT ·sumUint8(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX
	CMPQ    AX, SI
	CMOVQLT AX, SI                // SI = min(len(x), len(y))

	// Bit 8 of cpu.X86 gates the vector path (presumably the AVX2 feature
	// flag - confirm against the cpu package). When it is clear, no VEX
	// instruction in this function is executed.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration handles 128 elements: 4 YMM registers x 32 uint8 = 128 bytes.
	MOVQ    CX, AX
	ADDQ    $0x80, AX
	CMPQ    AX, SI
	JA      avx2_done             // fewer than 128 elements left; a block that fits exactly is still vectorized
	VMOVDQU (DX)(CX*1), Y0
	VMOVDQU (BX)(CX*1), Y1
	VMOVDQU 32(DX)(CX*1), Y2
	VMOVDQU 32(BX)(CX*1), Y3
	VMOVDQU 64(DX)(CX*1), Y4
	VMOVDQU 64(BX)(CX*1), Y5
	VMOVDQU 96(DX)(CX*1), Y6
	VMOVDQU 96(BX)(CX*1), Y7
	VPADDB  Y0, Y1, Y0
	VPADDB  Y2, Y3, Y2
	VPADDB  Y4, Y5, Y4
	VPADDB  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*1)
	VMOVDQU Y2, 32(DX)(CX*1)
	VMOVDQU Y4, 64(DX)(CX*1)
	VMOVDQU Y6, 96(DX)(CX*1)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reachable on the vector path. Clear the upper halves of the YMM
	// registers so SSE code that runs after we return does not pay an
	// AVX->SSE transition penalty.
	VZEROUPPER

x86_loop:
	// Scalar tail: one element per iteration until CX reaches SI.
	CMPQ CX, SI
	JAE  return
	MOVB (BX)(CX*1), AL
	ADDB AL, (DX)(CX*1)
	ADDQ $0x01, CX
	JMP  x86_loop

return:
	RET