Skip to content

Commit bc0f97b

Browse files
committed
docs: Add document retracing x86-64 AVX sgemm microkernel in typst
This is mostly for fun (and verification). Not as generic as it could be and so on. Compiled using typst 0.13.1. PDF included in repo so that it is readily available to read. Experimenting with the document in typst.app or locally with instant preview is a good way to work with it.
1 parent 1c91e1c commit bc0f97b

File tree

3 files changed

+307
-0
lines changed

3 files changed

+307
-0
lines changed

docs/typst/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
x86_sgemm.pdf: x86_sgemm.typ
3+
typst compile $<

docs/typst/x86_sgemm.pdf

39.4 KB
Binary file not shown.

docs/typst/x86_sgemm.typ

Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
// Ulrik Sverdrup "bluss"
2+
//
3+
// This document retraces the vector permutations in the x86-64 AVX sgemm microkernel,
4+
// to verify and visualize where the elements from the input buffers end up.
5+
6+
#set document(
7+
date: none,
8+
author: ("Ulrik Sverdrup", ),
9+
title: "matrixmultiply: x86-64 AVX sgemm microkernel",
10+
)
11+
12+
#set text(font: "Fira Sans", size: 11pt, features: ("tnum", "ss04"))
13+
#let rawfont = "Fira Code"
14+
#show raw: set text(font: rawfont, size: 10pt)
15+
16+
#show link: underline.with(evade: false)
17+
#set page(numbering: "1", header: {
18+
set align(right)
19+
set text(size: 0.8em)
20+
[matrixmultiply #link("https://github.com/bluss/matrixmultiply")]
21+
})
22+
23+
24+
/// Add string prefix to each array element
25+
#let tag(name, arr) = {
26+
arr.map(x => name + str(x))
27+
}
28+
29+
#let load_ps(name) = {
30+
tag(name, range(0, 8))
31+
}
32+
33+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps&ig_expand=4923,6050,4597
34+
#let moveldup_ps(x) = {
35+
range(0, x.len()).map(i => x.at(2 * calc.div-euclid(i, 2)))
36+
}
37+
38+
#let movehdup_ps(x) = {
39+
range(0, x.len()).map(i => x.at(1 + 2 * calc.div-euclid(i, 2)))
40+
}
41+
42+
#let select4_128(src, control) = {
43+
let i = control
44+
if i <= 3 {
45+
src.slice(i, i + 1)
46+
} else {
47+
panic("invalid control")
48+
}
49+
}
50+
51+
52+
/// _mm256_permute_ps
53+
/// control word a, b, c, d (each 2 bits)
54+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps&ig_expand=4923
55+
#let permute_ps(x, a, b, c, d) = {
56+
for (i, c) in (a, b, c, d).enumerate() {
57+
select4_128(x.slice(0, 4), c)
58+
}
59+
for (i, c) in (a, b, c, d).enumerate() {
60+
select4_128(x.slice(4, 8), c)
61+
}
62+
}
63+
64+
/// _mm256_permute2f128_ps
65+
/// control word a, b (each 2 bits)
66+
#let permute2f128_ps(src1, src2, a, b) = {
67+
let select4_perm(control) = {
68+
if control == 0 {
69+
src1.slice(0, 4)
70+
} else if control == 1 {
71+
src1.slice(4, 8)
72+
} else if control == 2 {
73+
src2.slice(0, 4)
74+
} else if control == 3 {
75+
src2.slice(4, 8)
76+
} else {
77+
panic("invalid control")
78+
}
79+
}
80+
select4_perm(a)
81+
select4_perm(b)
82+
}
83+
84+
/// _mm256_shuffle_ps
85+
/// control word a, b, c, d (each 2 bits)
86+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps&ig_expand=4923,6050
87+
#let shuffle_ps(src1, src2, a, b, c, d) = {
88+
let control-and-source = (a, b, c, d).zip((src1, src1, src2, src2)).enumerate()
89+
for (i, (c, src)) in control-and-source {
90+
select4_128(src.slice(0, 4), c)
91+
}
92+
for (i, (c, src)) in control-and-source {
93+
select4_128(src.slice(4, 8), c)
94+
}
95+
}
96+
97+
98+
99+
#let digits = "0123456789".codepoints()
100+
/// Translate a1b2 to ab12
101+
#let norm-name(x) = {
102+
x.split("").sorted(key: x => digits.contains(x)).join()
103+
}
104+
105+
/// Multiply two arrays (a0, a1) * (b0, b1) == (a0b0, a1b1)
106+
#let mul(x, y) = {
107+
x.zip(y, exact: true).map(((a, b)) => a + b).map(norm-name)
108+
}
109+
110+
/// Map array (of string) to (elt, bool) where the boolean marks it as duplicated or not
111+
#let markduplicates(arr) = {
112+
let counter = (:)
113+
for elt in arr {
114+
let c = 1 + counter.at(elt, default: 0)
115+
counter.insert(elt, c)
116+
}
117+
arr.map(elt => (elt, counter.at(elt) > 1))
118+
}
119+
120+
121+
#let show-vectors(ab, name: none, row-label: none, check-duplicates: true) = {
122+
let ncol = 8
123+
let vector-width = 3.5em
124+
let color-indices = true
125+
126+
let elements = ab.flatten()
127+
let extra-col = 0
128+
let nrows = calc.div-euclid(ab.flatten().len(), 8)
129+
130+
let row-enumerator = box
131+
if name != none and row-label == none {
132+
row-label = name
133+
row-enumerator = x => none
134+
} else if name != none {
135+
block(strong(name), below: 0.6em)
136+
}
137+
138+
show sub: text.with(size: 1.3em)
139+
show <row-label>: it => {
140+
set text(font: rawfont, size: 9pt)
141+
strong(it.body)
142+
}
143+
144+
show table.cell: it => {
145+
if it.x >= ncol {
146+
return it
147+
}
148+
show regex("([a-z]+[0-9]*)+"): it => {
149+
show regex("\d"): it => {
150+
let color = if not color-indices {
151+
black
152+
} else if it.text.match(regex("[37]")) != none {
153+
green.darken(10%)
154+
} else if it.text.match(regex("[15]")) != none {
155+
red.darken(20%)
156+
} else if it.text.match(regex("[26]")) != none {
157+
blue.darken(10%)
158+
} else {
159+
black
160+
}
161+
set text(fill: color)
162+
strong(sub(it))
163+
}
164+
it
165+
}
166+
it
167+
}
168+
169+
170+
// check and mark duplicates
171+
if nrows > 1 and check-duplicates {
172+
elements = markduplicates(elements).map(((elt, duplicated)) => {
173+
set text(stroke: red + 0.7pt) if duplicated
174+
elt
175+
})
176+
}
177+
178+
if row-label != none {
179+
elements = elements.chunks(8).enumerate().map(
180+
((i, c)) => c + ([_#row-label;#row-enumerator[[#i]]_<row-label>], )
181+
).flatten()
182+
extra-col += 1
183+
}
184+
let t = 0.5pt
185+
table(
186+
columns: (vector-width,) * ncol + (auto, ) * extra-col,
187+
align: bottom + center,
188+
inset: (bottom: 0.5em),
189+
stroke: (x, y) => {
190+
let st = (:)
191+
if x == 0 { st.insert("left", t) }
192+
if x == ncol - 1 { st.insert("right", t) }
193+
if y == 0 and x < ncol { st.insert("top", t)}
194+
if y == nrows - 1 and x < ncol { st.insert("bottom", t) }
195+
st
196+
},
197+
fill: (x, y) => if x >= 8 { none } else if calc.odd(y) { rgb("EAF2F5") },
198+
..elements,
199+
table.vline(x: 2, position: start, stroke: t / 4),
200+
table.vline(x: 4, position: start, stroke: t / 2),
201+
table.vline(x: 6, position: start, stroke: t / 4),
202+
)
203+
}
204+
205+
206+
= x86-64 AVX/FMA sgemm microkernel: 32-bit float
207+
208+
== Loop Iteration
209+
210+
Load data from buffers `a` and `b` into vectors `aNNNN` and `bv`, `bv_lh`.
211+
#{
212+
let av = load_ps("a")
213+
let bv = load_ps("b")
214+
let a0246 = moveldup_ps(av)
215+
let a2064 = permute_ps(a0246, 2, 3, 0, 1)
216+
let a1357 = movehdup_ps(av)
217+
let a3175 = permute_ps(a1357, 2, 3, 0, 1)
218+
let bv_lh = permute2f128_ps(bv, bv, 3, 0)
219+
220+
show-vectors(av, name: `av`)
221+
show-vectors(a0246, name: `a0246`)
222+
show-vectors(a2064, name: `a2064`)
223+
show-vectors(a1357, name: `a1357`)
224+
show-vectors(a3175, name: `a3175`)
225+
show-vectors(bv, name: `bv`)
226+
show-vectors(bv_lh, name: `bv_lh`)
227+
228+
[
229+
#show "+=": $+#h(0em)=$
230+
#show "*": $times$
231+
```rust
232+
ab[0] += a0246 * bv
233+
ab[1] += a2064 * bv
234+
ab[2] += a0246 * bv_lh
235+
ab[3] += a2064 * bv_lh
236+
ab[4] += a1357 * bv
237+
ab[5] += a3175 * bv
238+
ab[6] += a1357 * bv_lh
239+
ab[7] += a3175 * bv_lh
240+
```
241+
]
242+
243+
let ab = (
244+
mul(a0246, bv),
245+
mul(a2064, bv),
246+
mul(a0246, bv_lh),
247+
mul(a2064, bv_lh),
248+
249+
mul(a1357, bv),
250+
mul(a3175, bv),
251+
mul(a1357, bv_lh),
252+
mul(a3175, bv_lh),
253+
)
254+
255+
show-vectors(ab, name: [`ab` accumulator in loop], row-label: [ab])
256+
if ab.flatten().len() != ab.flatten().dedup().len() {
257+
highlight(fill: red, [Duplicate entries])
258+
}
259+
260+
pagebreak()
261+
262+
[
263+
== Finish
264+
De-stripe data from accumulator into final storage order.
265+
]
266+
267+
let shuf_mask = (0, 1, 2, 3)
268+
let shuffle_ab = (i, j) => shuffle_ps(ab.at(i), ab.at(j), ..shuf_mask)
269+
let ab0044 = shuffle_ab(0, 1)
270+
let ab2266 = shuffle_ab(1, 0)
271+
let ab4400 = shuffle_ab(2, 3)
272+
let ab6622 = shuffle_ab(3, 2)
273+
274+
let ab1155 = shuffle_ab(4, 5)
275+
let ab3377 = shuffle_ab(5, 4)
276+
let ab5511 = shuffle_ab(6, 7)
277+
let ab7733 = shuffle_ab(7, 6)
278+
279+
show-vectors(ab0044, name: `ab0044`)
280+
show-vectors(ab2266, name: `ab2266`)
281+
show-vectors(ab4400, name: `ab4400`)
282+
show-vectors(ab6622, name: `ab6622`)
283+
284+
show-vectors(ab1155, name: `ab1155`)
285+
show-vectors(ab3377, name: `ab3377`)
286+
show-vectors(ab5511, name: `ab5511`)
287+
show-vectors(ab7733, name: `ab7733`)
288+
289+
let abfinal = (
290+
permute2f128_ps(ab0044, ab4400, 0, 2),
291+
permute2f128_ps(ab1155, ab5511, 0, 2),
292+
permute2f128_ps(ab2266, ab6622, 0, 2),
293+
permute2f128_ps(ab3377, ab7733, 0, 2),
294+
permute2f128_ps(ab0044, ab4400, 3, 1),
295+
permute2f128_ps(ab1155, ab5511, 3, 1),
296+
permute2f128_ps(ab2266, ab6622, 3, 1),
297+
permute2f128_ps(ab3377, ab7733, 3, 1),
298+
)
299+
300+
show-vectors(abfinal, name: [`ab` in order], row-label: [ab])
301+
if abfinal.flatten().len() != abfinal.flatten().dedup().len() {
302+
highlight(fill: red, [Duplicate entries])
303+
}
304+
}

0 commit comments

Comments
 (0)