examples/assembly/Add_Array_(AArch64).asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214

# This file demonstrates adding 2 arrays using serial, SVE and NEON instructions.
# To assemble in Compiler Explorer select AArch64 binutils and add
# "-march=armv8.2-a+sve" to the compiler options.
#
# To compile locally use "aarch64-linux-gnu-gcc -mcpu=generic+sve" (or -march=armv8.2-a+sve),
# the compiler driver will set up the call to main.

# All functions have the following prototype:
# void addarrays_{serial,neon,sve} (int *restrict res, int *A, int *B, long N)
# x0 -> res
# x1 -> A
# x2 -> B
# x3 -> N

.globl addarrays_serial
.type addarrays_serial, %function

# void addarrays_serial (int *restrict res, int *A, int *B, long N)
# Scalar reference version: res[i] = A[i] + B[i] for every i in [0, N).
#   In:      x0 = res, x1 = A, x2 = B, x3 = N (compared as a signed value)
#   Scratch: x4 = element index i, w5 / w6 = loaded operands
# Nothing is stored when N <= 0.
addarrays_serial:
    mov     x4, 0
    # Entry guard: leave immediately unless i < N.
    cmp     x4, x3
    b.ge    .Lserial_done
.Lserial_next:
    # The index is scaled by 4 (lsl 2) because each int occupies 4 bytes.
    ldr     w6, [x2, x4, lsl 2]
    ldr     w5, [x1, x4, lsl 2]
    add     w5, w5, w6
    str     w5, [x0, x4, lsl 2]
    add     x4, x4, 1
    # Bottom-of-loop test: keep going while i < N.
    cmp     x4, x3
    b.lt    .Lserial_next
.Lserial_done:
    ret

.global addarrays_neon
.type addarrays_neon, %function

# void addarrays_neon (int *restrict res, int *A, int *B, long N)
# NEON version: res[i] = A[i] + B[i] for every i in [0, N).
#   In:      x0 = res, x1 = A, x2 = B, x3 = N
#   Scratch: x4 = loop counter, x5 = number of full 4-element blocks,
#            w6 / w7 = scalar operands, v30 / v31 = vector operands
#
# N is a 64-bit signed "long", so every counter below is a full x register.
# Using only the low 32 bits of N would truncate large counts and would
# treat a negative N as a huge unsigned count; here N <= 0 stores nothing,
# matching addarrays_serial and addarrays_sve.
addarrays_neon:
    cmp     x3, 0
    b.le    .Lneon_done
    mov     x4, xzr
    # x5 = number of vector iterations.
    # For example, if N is 10, then x5 = 10 / 4 = 2.
    lsr     x5, x3, 2
.Lneon_vec_loop:
    cmp     x4, x5
    b.ge    .Lneon_vec_done
    # x4 counts 16-byte blocks here, hence the lsl 4 scaling.
    ldr     q30, [x1, x4, lsl 4]
    ldr     q31, [x2, x4, lsl 4]
    add     v30.4s, v30.4s, v31.4s
    str     q30, [x0, x4, lsl 4]
    add     x4, x4, 1
    b       .Lneon_vec_loop
.Lneon_vec_done:
    # Handle the remaining 0..3 elements one at a time, starting at x5 * 4.
    # For example, if N = 10, two vector iterations covered [0, 8) and the
    # scalar loop covers [8, 10).
    lsl     x4, x5, 2
.Lneon_tail_loop:
    cmp     x4, x3
    b.ge    .Lneon_done
    # x4 counts 4-byte elements from here on, hence the lsl 2 scaling.
    ldr     w6, [x1, x4, lsl 2]
    ldr     w7, [x2, x4, lsl 2]
    add     w6, w6, w7
    str     w6, [x0, x4, lsl 2]
    add     x4, x4, 1
    b       .Lneon_tail_loop
.Lneon_done:
    ret
.size addarrays_neon, . - addarrays_neon

.global addarrays_sve
.type addarrays_sve, %function

# We start by building predicate in p0 whose elements are set to 1,
# for values of induction variable < N.
# Then we load elements from input arrays x1, x2 into SVE registers z0, z1
# for elements corresponding to active lanes of p0 (set to 1), 
# and zero out the rest (/z).
# The result is then added and computed in z0, and only elements from z0
# corresponding to active lanes of p0 are stored into res.
# Finally, the induction variable x4 is incremented by number of 32-bit words
# in the vector.
# If none of the predicate elements are active, the loop is terminated.
#
# Note that the code doesn't rely on knowing vector length, and can
# thus "scale" to different vector lengths set by hardware without
# needing recompilation, and also doesn't need a tail scalar loop.
#
# For example: 
# Let's consider adding following arrays:
# A = [ 10, 13, 5, 8, 1, 42, 65, 17, 21, 24 ]
# B = [ 19, 12, 31, 42, 3, 9, 25, 69, 87, 93 ]
# Let result be the output array for storing sum of individual elements
# from A and B.
# N = 10
#
# Case 1: Vector length = 256 bits, that is, 8 number of 32-bit elements.
# 1st iteration:
# p0.s = [x4 < x3, x4+1 < x3, ... x4+<len-1> < x3] = [ 1, 1, 1, 1, 1, 1, 1, 1]
# Since first element of p0 is active, we branch to loop body,
# Since all lanes of p0 are active, load 8 elements from A starting from &A[x4]
# z0.s = [ A[0], A[1], A[2], A[3], A[4], A[5], A[6], A[7] ]
# Similarly,
# z1.s = [ B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7] ]
# Add and store result in z0:
# z0.s = [ z0.s[0] + z1.s[0], ..., z0.s[7] + z1.s[7] ]
# result = [ z0.s[0], z0.s[1], z0.s[2], z0.s[3], z0.s[4],
#	     z0.s[5], z0.s[6], z0.s[7] ]
# Finally incw x4 will increment x4 by 8, thus x4 = 8.
#
# 2nd iteration:
# x4 = 8
# p0.s = [ 1, 1, 0, 0, 0, 0, 0, 0 ]
# Since only first two lanes are active in p0, we load A[8], A[9]
# in z0 and zero out rest of elements (because of /z): 
# z0.s = [ A[8], A[9], 0, 0, 0, 0, 0, 0 ]
# Similarly, z1.s = [ B[8], B[9], 0, 0, 0, 0, 0, 0 ]
# Compute the result:
# z0.s = [ z0.s[0] + z1.s[0], z0.s[1] + z1.s[1], 0, 0, 0, 0, 0, 0 ] 
# and since first two elements of p0 are active:
# we store z0.s[0] into result[8] and z0.s[1] in result[9] respectively.
# Increment x4 by 8, thus x4 = 16.
#
# 3rd iteration:
# x4 = 16
# Since x4 > N, p0 will be all false predicate.
# Since all lanes of p0 are inactive, the loop is terminated.
#
# Case2: Vector length = 512 bits, that is, 16 number of 32-bit elements.
# 1st iteration:
# p0.s = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0 ]
# z0.s = [ A[0], A[1], A[2], A[3], A[4], A[5], A[6], A[7],
#	   A[8], A[9], 0, 0, 0, 0, 0, 0 ]
# z1.s = [ B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7],
#	   B[8], B[9], 0, 0, 0, 0, 0, 0 ]
# Compute sum in z0:
# z0.s = [ z0.s[0] + z1.s[0], ..., z0.s[9] + z1.s[9], 0, 0, 0, 0, 0, 0 ] 
# Store first 10 elements from z0 since first 10 lanes of p0 are active.
# result = [ z0.s[0], z0.s[1], z0.s[2], z0.s[3], z0.s[4], z0.s[5], z0.s[6],
#	     z0.s[7], z0.s[8], z0.s[9] ]
# Increment x4 by 16, thus x4 = 16.
#
# 2nd iteration:
# p0.s = all false predicate since x4 > N.
# Since all lanes of p0 are inactive, terminate the loop.

# Predicated SVE loop.
#   x0 = res, x1 = A, x2 = B, x3 = N
#   x4 = index of the first element handled by the current iteration
#   p0 = predicate built by whilelt from x4 and x3 (lane active while its
#        element index is below N)
addarrays_sve:
    mov     x4, 0
    # Build the predicate for the first batch of elements.
    whilelt p0.s, x4, x3
    # b.nfrst is the inverse of b.first: it is taken when the first lane of
    # p0 is inactive, in which case there is nothing to process.
    b.nfrst .Lsve_done
.Lsve_next:
    # Load the active lanes of B into z1 and of A into z0; inactive lanes
    # are zeroed (/z).
    ld1w    z1.s, p0/z, [x2, x4, lsl 2]
    ld1w    z0.s, p0/z, [x1, x4, lsl 2]
    add     z0.s, z0.s, z1.s
    # Write back only the lanes that are active in p0.
    st1w    z0.s, p0, [x0, x4, lsl 2]
    # Advance by the number of 32-bit elements held in one vector.
    incw    x4
    # Rebuild the predicate for the next batch and loop while its first
    # lane is still active.
    whilelt p0.s, x4, x3
    b.first .Lsve_next
.Lsve_done:
    ret

.globl main
.type main, %function

# Number of ints in A, B, result and result_ref, and the matching byte size.
.equ ELEM_COUNT, 10
.equ ARRAY_BYTES, ELEM_COUNT * 4

# Test driver: adds A and B into result, compares result against
# result_ref, and hands the comparison outcome to _exit.
main:
    # Arguments for addarrays_sve (res, A, B, N): x0 = &result, x1 = &A,
    # x2 = &B, x3 = element count. Each address is formed with an
    # adrp / add :lo12: pair.
    mov     x3, #ELEM_COUNT
    adrp    x2, B
    add     x2, x2, :lo12:B
    adrp    x1, A
    add     x1, x1, :lo12:A
    adrp    x0, result
    add     x0, x0, :lo12:result
    bl      addarrays_sve

    # memcmp (result_ref, result, ARRAY_BYTES) verifies the computed sums.
    mov     w2, #ARRAY_BYTES
    adrp    x1, result
    add     x1, x1, :lo12:result
    adrp    x0, result_ref
    add     x0, x0, :lo12:result_ref
    bl      memcmp

    # memcmp leaves its return value in w0; passing it straight to _exit
    # makes it the exit status of the process.
    bl      _exit

# Test data used by main.
# A, B and result_ref are only ever read, so they live in .rodata rather
# than in the executable .text section. Each object is word-aligned and
# carries a .size so tools see the correct symbol extent.
.section .rodata
.p2align 2

.globl A
.type A, %object
A: .word 10, 13, 5, 8, 1, 42, 65, 17, 21, 24
.size A, . - A

.globl B
.type B, %object
B: .word 19, 12, 31, 42, 3, 9, 25, 69, 87, 93
.size B, . - B

# Expected element-wise sums A[i] + B[i].
.global result_ref
.type result_ref, %object
result_ref: .word 29, 25, 36, 50, 4, 51, 90, 86, 108, 117
.size result_ref, . - result_ref

# Zero-initialised output buffer: 10 ints = 40 bytes.
.bss
.p2align 2
.globl result
.type  result, %object
.size result, 40
result:
    .zero 40