From 935f40aab9884af6b8107581d7e366ed60439314 Mon Sep 17 00:00:00 2001 From: Ulrik Sverdrup Date: Wed, 14 May 2025 09:24:26 +0200 Subject: [PATCH] docs: Add document retracing x86-64 AVX sgemm microkernel in typst This is mostly for fun (and verification). Not as generic as it could be and so on. Compiled using typst 0.13.1. PDF included in repo so that it is readily available to read. Experimenting with the document in typst.app or locally with instant preview is a good way to work with it. --- docs/typst/Makefile | 3 + docs/typst/x86_sgemm.pdf | Bin 0 -> 40403 bytes docs/typst/x86_sgemm.typ | 310 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 313 insertions(+) create mode 100644 docs/typst/Makefile create mode 100644 docs/typst/x86_sgemm.pdf create mode 100644 docs/typst/x86_sgemm.typ diff --git a/docs/typst/Makefile b/docs/typst/Makefile new file mode 100644 index 0000000..0c10961 --- /dev/null +++ b/docs/typst/Makefile @@ -0,0 +1,3 @@ + +x86_sgemm.pdf: x86_sgemm.typ + typst compile $< diff --git a/docs/typst/x86_sgemm.pdf b/docs/typst/x86_sgemm.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2d8c218858629c7ddc278f7858ef43cfcefe4ba5 GIT binary patch literal 40403 zcmce;1#}!umZojVVz$U)m6(~CC5xGvnVFd_W@fY)Ew+UgGc%LL%v?RA$7Dn>QkFWgX@7R+7jGi_$RAvcZt}0smlNSON3^TLTLiZf+O=fKJKX&KN)^ zYHQ>4zNhGH;Pj{Ay_x>KMNr?-_`MfEC!!!ABc(wlYVM$~sBhy)qhM_6Y^Cq;ZX{x3 zXlrC{V+x=XH!`+yGIw&P`SRW_Z0u-gY-6Nvr&(^`*&dJsR z!2V}Wz;~X+x=sa-Uq4zH0goHtgNg624G=lVFNG%3kx$dfa$%k zG60x?g^8UNzyd6c?2G_bU}0uw0k8oJ3vizuSQt5&fD6e1VBuf}FwnnOW)4OG1JES{ zfRU92zyRC5b0Z`O&4jTrKpiJnr7jq(A#EYG7K| z_US<*?c-?udS-{BDFl_d4!7!w5$7rf2Xo=mXF?H+5f{XYMp2iU`uvr|)>CcsvR!vM z=DBC3OW_g?U-Wyt3XIk@X4(h&JJZjZ1U~{uRVe4%PwBp{e~;%gYFj#hB*n36q?Fq3 zb50<>)f>#PY;~26X;fi#F@RimB~&Q3SQLXyDhk z9|?Ydt^~Gao?FB(uQXo^A_u*%>DB| z($GKu)FP>Aqp`x)2+?<4GUsaOk>dI|G6g!_O-nDyWE{4--lFP|InQWZBgVP@Fy`8M z)zRfx4Mh^9Hs~k}u#S|?Qj9i}VWQV13u?mE=5aYHE84FHeWf5U&Nd{Y!{d1w+2$ka 
zH6bkAk{n2Jlte$CZi6nHuPHfZM~XMH=zbimu4Jzu!M?)RtI8b9(zx~Z9%zQT(3j}??y)!M=)L~3Vl0T&3DtXaw75I# zJLX=aTG{=qJ9ze(((|TMm*R{}=%xR{dc1y*hTpse52s5}L9^wX__dCkU?k)ww)6F| zzip)LqKcs{bPv~rYRkFY6;I&qwqqq|3i9xmD_d*RRNR@hFrMhZB{N8CU`>|n{+({a zXD7%=CoE_xti>EGDXO6b%R(;?sRR?m;s%-Oxx_I%<++c5uEKy zsg6gBr_HNHg$^6aY^2S%n(Z1or@s4LFd@pnqY@edv%HeJhWR5+SEGZ1~em_yu! z|MM(PCxNE81O4RIa)Y<{y^gLL94#~0r}zo3Z*Wy*9H*aGb$W-b%PDlA)Dw}{wyTds8KIJnv%3sZ z8s4DNZ_1RKq>nynrKgyeA?y#1ZeOL6Jwj=zEW<{ICMgw2?qOn-#L6E=L zV>n$_tN!uu2IOq!BHAu{A^u6xJ%grfQcJKt*lmO`9_2ydRXKz%%3}kXfIex+aa&-y z7wdUA#`>acK_#f8pL0DxX(N@H88hU$=f~P{xfnZ=`iYW$WufS!t(=6jE=Lcs?81hY zM12{wv0OdjJvAjLZJv?+v+9Y9#E>{)ZSp8rVeGBJujLO+Ex8#?sFN$3+`9#kE_;T? z>~!a~l$mT7_L}T;m$jUkDfb8P`^LO;L02Zcb0JqGURDU-MA|GB(9urkEkAmGRnnD`Ce2E0WHWPt_B#F7n zkKUBF6LyjXr&z8x9cmm+HZ|{v9N|gj#oB`}H*4(u+>Ne5p5BcLh(_RbgQ~mA3w7ka z@De#Q`S@ZX+Zou};*drEf!OE!k6#g5vci$Unp97O8uPY9B4Bg@?by5o*?v>ya91Ot zGy;w8y5)Cn#rW;0nT!osADr)0v2_>ka;nFNYpn zK~7d@+y~Fx4gUSg!x0vIS%vU3jZy|9zz5@6@y8iKXsPHRbDqQxXhK3sFFz`FhbYii zr2A|}iQU!ujVD|YF7h7{QQ%yq-t?1Wqa>ph)re(1&vK_7ozhqK#xUrlj%6xEh%_mZ zy8M({Rr#=mA51)(U0i$I5n+!5N|f-H+nT<_Q@2tMSAV$srsZVI2O~n&ycMewCnfDZ z$(YpXWZ9Y28k^g8rk-^nvs#bkd$4x5R7+NI6SQ{xLs;z?zEXe{Pgm|JuF%fx+gt>$ zxVAA~hDgqNjZz$SEM-g0 zLJ3AZgm@eE!UIS%BoKwno8vaV>i|-+DmBP1-FH6Scj$>}%g{i$F;_SnVr;u$2l*!v zTu2AVc40?+#f%6m&0a6_@?B3c?Ud(KpY z?*|jackB&HwpQLq!&|T=GylB`5QlU!XDp3ej?hsqwwCO9)5qu%xY#qd91w>~*bPO% zcQe1t4fBsXU@82|&4Xu{-o*)gmVTMfb-;eA1$BVi*awcEPw=rQTHq&D(mvzIOj*w# zA$|?=0mnUq|E5Or`UNy#0Xm=yu}w}@yQoWtGT$?nK9|)dzukoLG9gTv;~PGbo^^vs z7WZpNsQX@2RNGCRM$XU}T)Jxd+1myG>brLQ*O;XVj7VqaK2Gde2=yFi7ZKd-ZWuk4 zuI#*$`SsjYs>v^*qXxBoSl?!sB&I=;hNxxcA62!RV4B;y_XN3VHq7k<-Nkvt2P7>hanVr%Or4}0rHi)kKm>?r zQ;o%u`uZ4944)}nbJ%pG?=!0<)Qsbu3>+D7tNl)I3afseTK(`1pT#D`Yu0XbEr*Y9 zH~y;PBErR;q&-2!7l#S&GHWWD(R##~u$Ogp64>-uOMZoiq3!FR&532MsoOKr>g^r1 z*guUMH|G*2O^AIq)lA?CU%`bc>p_mPl32$X9!@G6Ziaod%C3^V3dz>#BmQl(k(OxBO!>UQAm{ z0kezO)33_mlibaKDaM8~y(mb;X!9wt=G=K0YkX-A0|uAY`w 
zED?x{v1uPl5f3n?jnpTTSz;drD!*QIH2B_o&wcATUbjH_4v#+#Tayq)jU7!xlY}WH zBxb)A#-uB_d>|m)K@to0klh!>kZ9MXgi?b_ z2#mG;c&CbdRAuoS_lfuIh*7v_VU&AO;MA|qn=|_5@GNg^Y$JS^w?H`wtaHRQH~Fje zXJzk->wF9 zpH>E^O{1reZt%U|o@TDP*6$(q!fz?U?JT-Lf3-w z5L})aiE;exKP6hQsYx@rx}A0Ol=wc{VN$@usJQ<6c)AL4Cn5fFKlXKWvqAYGUBcD5 zUUH6W+)WF)z3*4IXjqFAood3iy=PI05R8XPEQnu zhLL1-ZIF5(?V39z_`Sm8i6NiVn<<5}#>h`f`5Z^47V%3<$MGM2Ed|7K=;KnDhl!+G zcO<-e6TJG4`M^|*ZPB2q^s6rqrO|`tVhoPhagNy|1=T9X7#7%|8MZ7(%e{^`kl)dr&%Igm-JP?#sRfdO7Ba0N^ao&@3u|!?i#C%YX4ddW zoFX7~Eg{oK@3KIJ=Fcd3z(Aaayt!MN!LeQrfr&rIfCs+vynZqDv6&L#~e*Tc_7pDr@6H18Caye>%lgHLTg5bnO#(q{ig00X_+9LTO zf%xU%cNPQv87VQ(Qf|iLS)x4;s5nU@1&6d$VtK~!l#3evv^2c>iFjmp|6g@BaOT-x z6r$&OJ7nl!!v?Fwtov%?!^MX+)mK(iqq#ghJrg>@i{(m=;Ci1CzBRH6$9)5Ll%Tgk z73rAJ#x|5sDbdrPEqz%9=kjcAZIxa@dn_LsaVE;TBHMqtDcTKmtWEp18~OBE)8yNu zr)j7sr%6JfGfQROf|u=^mh1Fm*WGED^9`?nt!81eP=X|SeKYq=su^aEa%gLkUwsS5 z!~%A?jiU;@n4#lx+3ca4foz54`03h}2X9xTmk)N!9*JtyPNS-FGC|p>heC=5Cf!;F zyGC!(naN{E+6b&B!SosJ1@h~J)SZfwrxy1#uK%diesMySlpQK9rA_vcu1EX!`RU?X z{GpU>u?c;rBl+Ii?2mRj?JRc}J2S|qJm;4ulwhB=@!+@vvaI$^K-2JVsw|k(J!qu8OE*14+;YqJ#4cH zF3^#5slGC;iSWa%hz5y2oA08(u#r8oq#~Ypj%sH z4hhwLBoNi@oVmJRV;aBa=6&31w8S@9OR4_x^>O|n!?yhaf>}2Jx+|3Jg|IrQoyf#J zxG*%z2TqPSA2(|f#bL?2Zv54C#!}lieD}=D1-%X9Fm%_3sEP6m&0zltD=J_SdbkK6`t?OR>VT9@_W4*2fs z))DIC>1L$YRKb17Xb|`cSG(Cy!-8cilpc=^MqW-J2nqTM!PRyB zX>q@Wj?*3u{;2FT_`b9B()>JzYQ|o&jBW%Csj3R%%^jb9i?sNxKc=j*>(=9L8WoYH ztS6ZXuPm!)UJCo*Jyo!Z8T)b>n~+u}xPI*de1`|r!5!Z;=-q`$PEdkA37P%5&R~|8 zZZ-3%w^=gHD2-Wi!e07fG|dTPH{B%)jkSpEWhdPy-F6w~$4+}5g1%(xPlI75lm|RH zoz7PyRFal-jLTWNxx-|K#~ah%?d7V;A3G%G9_aCEuKjd$_8!!?`#XlTD`@K~u$=L3 z`r@RRlNPUG6xXlGa++s2`c&h+u>`L>DK8__Z5Z)PiZbn+0LAmEmyU2B)nNjz z>SIA&I2NQ6(qvg)h@P&qM4BA+#~s^U0I7@Y4pU!GH7jthNCW9C#QZhIp$SN$kv+@6 zgX(Xrzgz9#PPSg6S?4v8WR2 zeHoB8bQ#AW5)4PCE5r-kH`YoL$GQre z;6r0rh&Y+kc-~3q@#OnDxYVw9{Dc;t{Wd4_TBMvdIZ$9*<|;AIM*WF(+P9FrAt{Soi=YVqN(<&Y~{nmF&|Z633U1JAe$3(-JUO z4x=9Ojk6z1d}O#IQYz<*{qql*J~7vIpQa>z#gi8d3v*ZaB#is-ZXMsGJi%|6eUiTlv23_JWcV-R$47H2fQaPo#ExD85sxE#lO 
z?x@pRVN@@N)cp@M@k~ZZTEwQ#<=a0f3j-TAR}~?{QFCdLAzTo?4Py+{Zkx;H^z!Y~ z2R*nuwEl1eub=+0=LEOj2-j-tpfpgu!QE+^+z`)m4+rAd-d9pJFI>Ch4KX7Z^XtNDS@$% zr3p{J2;v2K$Kv6*8AB8;H%H|ezx-x54Y3;dR@~thkwa>}lCDi0wmeI-Rs7XUmDks! zh-PU{x?hKAJ^N>M-*v%#@bLYl5sU1OaQlv-stnGTCL$}1Yxh3bz52bLJ$c8N6~ehR zBS#Q#Z4XgS+EH=OUYDQaPEPIyvf*bcqm@IN(VDyV!!!@*-xKwIMu8)D?d$3y{o-Gf zt(M+yd}~zFQN#MhcDbmVmLrteS531;<`7$W@wxiD39so``dymqFMWB2!6KUK$tXmN z;+T#3Hvfz*WSFTBQrEsOqQ6N0#ZBLLPV*0h@E0`rXO#1f58hj3^sRxM?7v`OK%iji z2t+Fk^lYquA0#Mf>ju!IVPjzh&@i#QgBMmHVqu_X{)528o%F5D4FzmWt^UR)1m0<=w$w@XQH71LQaF`$GD5UyRL7&71(N>>Te6ia?O10^newW&Zo|fOn~lfuQI8 zPWC?$H6R4~*Yo`2=D%QmvcP|WWMuxq`~+>SjNZrmw}BrJI{lx-4@jN<_W<9`{?&>2 zrvU$R=J%h)`Tq>~0g>AKl-^I<|0VHb0WQ&dVPOa2E?{9{edm3^LJ#C{?*&Nj-iaS0 z(>w2bXLIy_h#w0Z{a?fnIPNdv_lMv8A%4v4K#cc?__6(i_^|`??+@`~VF4Nei6786 z3(yFN1X%&>Kvwq``+Fyhf3ZJiwm;zSo&C`Rao_*O{@%y_js39!Y2F|9$H?){_5P|r z^S{{N`@lc!j|F&uzt|tkyWJo5_s%u{#{S-M-yinJ!p#16_6JLkfX! zz~uqbM|vP91Y$wp+OYxOfKU^-dcatK5D{2e>FNK87XC@Pf27#oaN*zZ&i@=2CJsjf zaiPeaSFowSFkR~XCmgfJSb$$|KE8zplVun@N;rF50`>hASHQ8Vtf84+Fc=??SC1}v9l)2W44GiR~O7Zcl|vEWl_ z`;#O*gHBlWV@Y#75S_ps@8#K~wGE3_Y2!rqP9h4t?LN%8UEo{eFK7}cGqY#CA z#>?I{p8SrI3aBN^zsBS`Q!&>}hfRYsd0XVZmUf(7BhJ%T#zx~s(S2U7ogxkgUsp9u z3gSubw26v_j$f~C#Pa)Tu&J`Nr)K8K{F%oi?^HcLCCf8e=c4O{xvzG*<=(M@K&#bG zfU0b-Cp(;wY`0U?oy5z?Cfik~U70Coarzsa8%6$6zR(mvJ?&tV8vJ(tWh zH5I;Y{&9PJJuf?e(~0iZel23;iNoa6j< zWBa*oDChvEoXvT3DmMXm9Pfey!u6nG4BM=(WS_@VN4e{{Ru1T%&$Jy~n?;j%6gEfw zIWX1quRVO>C3ma3S`YkyfAM_-{LKBw>)hXxkd1}y548TbkkJFy347jsuY)O&3?J|T z6ab}wqjwH~_A7)6$i*xEkwPBt5sPXLT1Ol}NRbvt!{DJ_hpX<>pEup>ydAaOzeMbu z&SP$fSXAobc$-=3d0SDxXfx%+wwNu=YQ4IUVZ%mtyA6+G`8{T zc;Muo1yZW;hh%y zV@C7qrRG--;(k`NQ=LrBnhx)``mZFP=}Vq$oKjx}$_Ji4U*L5+k1g@uu$c<9{U_UL^aK! 
z95(V&mZZM;-0nb(1XoU7T?b-(4zq2QJCUR0E|>%>mQWrW``PH`@jImmqL%sIt$*GF z!7O(oY`Fn3pSvn;DcDCU4s~37F;!Ol*jc;)k4txV<$-KXxr|BU_CX8CC|9Jtn~O|1 z3za>iS>9}_T(M#XE`e1r=g*+QUcC5n?%X-5O03zMBl`*Esj;=usA(UKi~TP$EfgM= z>!%QTFDdDWl_`L?NZs{koeFh}KjpLMD)M)g7tu3UM^p;HA^f7cgke=Xp%?kFBV0v~ zmzpuMpXfzT_igco<12U=i{V?Lg+ytx|2-P%K|JHO@}VlgzQ|VwEgDDpfxA>rx>9^L zJ~n(N!v8^)#6`FmZN`c~mONM>O3897q9_0!?xq=H-q==hr4+WU6DB4w2;^KlHdG~% zV=9uA4jqtI?(ji4xirWnu|>c;GGbXuwI{U?6uTlxwp_{#PP?ooQqF}RdQUS@ypgh& z5=!W(dqq#7Qs#qv^6Zemd^!UE;rKV@Yu4~)%61ZE$mFs~VL~C$s|D3cJn~;u;Odnj z6Y&<(23C~Klr(UAJr=Y14!mWYD26C4wJ#o~fH0rtxig6h#(^JBHnW=;at~TvM_~!0 z-SJPVREH%ne3W;BCkqG4!tY5lmJ&y*P}SNDCB(4RGq z{+1H~DTt&<k=H~}Yf;q1 zP-{`pM3R@dQEBJsL-|GFY-m$P zMticLIVvYJK=JTf-BH~3Y1m49_FjsUM4>4loab+=(1AxTUfY)uSsa;HY|_&^QXAgjIq9ZN_*Sk<1YV6IClg(t}Dd zCnuDa`MvCmq~mr^sMKtSqsSdHI+H2$AXDo!a&zOdercl_$s=f?aJl*QaUv(5`Vgu`N374993pE-q zvWDobC6<;uX?YF$W9c)78V8r-9Kt;c6iHhpDCxz{SrxKZop)DH_A)=gRp*L1)lSmU>38chIfc>F_&&HKwL%WgC{P6O}-3H}$@qV>(0Px?%c$QH8C zJ*>ZFOEtHsa-)93O0A-!!B{IU(;M;|rdib&qA*H=k4-`FO{?MxuL!_U5cq%*;k~BR zX>J7w&bWZfmyQSqxCizV^v-3wi6d}bG?Cyb!HHvJRxt6v74x(P7PR}(wJp!k-d|NM zU&rWi=p3s|`-!1gK#XDxT0?;*yAq}(6E|`RkXA&4tR#F7&;!?_`ylh9vlp%E=Ih4^ z4^k*os2{|eNHsZjP^pACpv5SvpKcyMtU`xbPDpK0&ObT$U3YSC+6N#YWkalrxM4ZP zo--dHop5~Epg$4+L46|5?|)0I7j5SP$}Y$bHiCix0vjasBVSaNFSh_o8w_{)fbhU3 ziZ&pbxfSX$RrIsK081-Wt5`u~t|!5ExzHQDg2eNv_{^#Pj#6e`vu}Hj0 z-z^e?6l$07RVyRQG4S|whkC>hqzn*Iy9k*#cPJTQBWp%0C@pQH=F~=!O%=p4xdLSr zCooalvo-3$V~`_?r&m`Gt4fjU%xtw+DN^P!K&Y9~5HC^s-hMGjmKl}w*tixJ+(;CR zTa=eBp)=0UapJfYKCW|{bFSZHGzaM#DIQiP8;K|@_a$OjL5EQFQns}DGvYe; z+00ptO|;8B(W{+huhwQ16Ef}~Licd&hc?Bm^ojx#*~khjB`U{_gqsQ4BFyD}*9bC^Vak5rry!9P7h@8`TQn&kbisoWab2NvI?l?=12!R_tA~kIhI_0{*r8JD9S##mW z)b};Zmirb!6TQy|ZBAc%%R7hv^RfBjsax#aQ(kE$fA0>_cmK%SRQJ(UwVIl`qjwxl z^?T#J?*k+Lj-&kFIUSNZQSnuy_v^t}r4GWCK<_xtD&(uytvW77f z6sPpB^HDothG$yfTv$%YO5L7I&w6_o=G6Y3G0Uh_+%@5hP8j^Q&sP*<>RVb>9t8#RLS3ADyw4X4OdFlVWb|A~$coQ#3A6 zkGRB%ng5&Vu_;;@5C(eRNo&=P<{;{2+ zqKky(2;1K6IJIGR-}&v)Vg(dylRx5gE3OTeo31Y 
z?I?B^FmoGj*nyB5ItCMMB5{}T;x&P>oHBJrecih0ZAgqqF7MruSEWSzAI!a<F;?g6@32o!y$ljM0@^#N_o9LZ+U%{7fIbJ3%(DHx^%$&*w1Y|RSigsT^C z2(K?B_|?TqTYir=hugaz)83g0G3$l&m-c+$U_*VE2j*E!-EOp77W7PEicBeE&9Ii5 z>FsGVc$!tAhJW@#?~_UDTLbd#l1K8qxo8; z85|aiw7BN8Kj#~%>eJrQFqLvu94mp*LqJh=_5B;zUF`@$k6}8&4JDL$H#u)v;SRvx zk*3>fM(5_B2f!={5@h}}Q52lAMQirU#q8@*tHx)QZ^9Rci{!j4pEBl!1eG>ZR9!N0 zyUtO4I60Ttrip@bybT`nvX;iv!VlBzJe1l@wJSsKVM3!()xIuofB!K&!K_Wm4yA zpjdhc;cLyqv7Za7pdPCx6BhFO`bbYZs*ujx8Ao@VmP|Ssxl3}duiiSt)ELQ;McXHx zwsbB~E;--$P}!04@)`|>L)(A|nz1BZHMhbn%-dAZl2*C~$`>J|kz@tXG%u-hoFUc6 z7g8~z^+GR`lZ}z76wRV=W|W9@X;-d`AWte=Gh0ftpDX4ZRLx#+bi-M{j zLwr@9X!uQ-)9)VgLPFPFQdUuC)&-eroph!80K$ScZ$UG0Ln`xu!G-fU%5D&KFW)F6 za^{CD&cdS6_Xko@=TveWp+wfvbosf@UsAD6Fof|bby2pQpiw~QRqbPkhA{YlAqNJveV!KLf;e08aoKWIoX+c1dzzCf9>A1zac)3ZuWde0 zwJWl=9l%YX9e!QaH1Ui+MPzYqbS62owv1}9%IJ)=K$AO%xW8tS+XVN;ThIkBfvVZu zYYnvo|1HW<9n7HQv0=Ks@#A2n2Cv^hCdYgmUh5fG&igkM&h$ng{j^X610(eAtdgDI8%{sL8*lek`YX8FikC`}CoX5sd+b8s(u$=8k^^}?_nj#D@`?swtw!nBk0PpRq0p8yJn7Nf zzR;IBS=}mZNidYk!!V-Lk}pi|=BjGI?yXi|#WaD@RVjrB-2n8d~<`hf18= z;(6#Lsi~W?ET`d}%&aaCKKlyV-g*gLbHiOP$0#M*F*$B*gj@nCP}16d#e%>4Dhdby zcCeYDq2HBJqhe75%o@AT!(7Y(IE=?i8kKN+DpfJ`2nJd;=Z0(>o5FcSDQXtR$W;v_ zF10}j!Xq_P;$W+Dm>-wR%SssYCkT3Q5tiiGMH_0)X5wjT!;wU(T_z#FHpQ-8ES4V~Nt`Y*b^9t43~Z4_xL#Ceo8jv^aq!;P&07cb%?Tl( zCb89;xKL^y2Zl}x_}P3KVZWeHXuuQG6NG9UVD_CZ=edGch0d!x&m%~`$pCYFkQQ`S zR8(|Sq{fo(W`H5qF9+n;PFX9Jk37h|xR0bY$m^WvaIF3u?HH@_yv6Nw%4=(>_hCjq z)so15T*$GBa4rdqLS$8o7ME~(-Z&T$&~4i)ZC%#(yg$EdDn<~?Fz#cA+(H85tm71u zd6AQ_-?XDJbEBAtjj8yPxFmpa=Emm9*OGuyt(eO8DSDLg70sq7!00?KM7Hp}8&WMDie^rjr<`4{ms$VIsJUw8#$-4x|p_ zdegT0WYwGSurb?ySD(S@;-wCqZ0C^s@XpN)^L6SBzF^JHI?dI}4=Srd=YN{l*5D{f zSv}s>z}2R#9XN0hDhT)TGh>KGE1l(!%Z?eAWTAnTFH_MCRXB05Yp^a*z^3+*ajP%B zum2MCOc$W05|euB_vS4+UPDXvEIac&o~QbKR#dz!%Qf1PjnBhp70Fc3Pz{A*(@M)f zXhT=YUWd6VF53L2gjumWo}EKiSz1~;x0n|l+Wby z#(CV-)m4=6G?i~M$qH@lIk(ew-7!X5YiV{+8h@q!1wO~s;cD%)+&q%snx6fIZ*fmS$uZ&k){YN-s zxL*Q|+#5?JSd&&s*w&7U)ezra@Ha^p_YFrrUk^rsSRKfBe_GLBv8$nI6Jt$TB_a#S 
zaCO}^Ipy&Z<6c`RZIGxDsuXZw2Fj4*J<;k7l{pO>y=dLKF|_x;mnJSSMl zQyD>(o8QJSKh$XWg`h?iNCHaPvDD7!4{`fj>IA-Vk2;wPs0Jk7!4({QmT<`ja7F0d zY(Z4T5f01Mk2Vhol4dd3iTGFapE`vk*_HMi!1lImG4Rrwf!}(lR~80(-Klo}%+?ko zh1Gl#I+4!n_wZfi#oL%RjGpYwQWRR&4EU)RxP4ZOLVNnaVRo0}^SSecXf1g8`2uP+ zxB$ZOiQ4J&*mxt2?^E}4_m>m@>U9p5#uU}yHy0HyiGw9K%yn~G6Kxo_G$HnuA6vbY zGdLuYusuJ8!^1z*#G(%&p+wWhCJ|eZIY(H(2pbHC<|b85(fauTLy&*hbnMbM@+;o)#hilE=H;~wkI!^w!-b1N^~s! zOid+YEKNS7CE5;l;tVhpzZU5TH(M7o==Qd(-ma<&=u;Vm4O|S#P^H$inH_-$t|PfC znWmx9)}}GwMvE*uV6ahMkdhq&#!iKwJ)k+ z(tIV|J6%nK2^yZ)&bu$oktx#KQ}ce71UK{@4wJ7im;$dEXI&k0)>!i@lyCDy+~asi z3rp2O1ast0wd8Nyk{k8%AD_UP+gwj@^{KM+8t=fVus7clZo+BNcYFs%1mR?v68d;5 zQs(PjUgxFux`sW1wWUOTt0zKV(VK!hf}JZOF#Lcq{3;AMd9t z@MPg+F=UZq*3Gcb3xiIqU@Iwh)PmNcULD$N7`n1=g>gTB)GiDaZwheO<9&sDV|c5r z5nrJ4Qn(K|7h)jx(Kt%xHLXjxkRFg;XM-FK9jQ)#ixH{IZ1!4DukSRoZ0gW^%JyFH znR<uG*M#@VBrPZub zFXvKhT~}k);7O4NPx@^iz4gGihHjbm9AmPtEGg$MpdF4Hh@%={YlNNY3UA1_1sqTE z&Uc%O;n|9!h@5ZE_ut_Ig5b5pZ?j*DY$Wa~%m$b=p>$C;gApHH{rspcnaZ zL*%SMTmyU`*-jRi1YDngO_SQ}p?iVV*WzFs_NLwmbUZ7NIc z>~igdabi+%!n~9?>4Is!WY!kNYRK2&W^*CD+#Q~9ry)z1HUD9GvdA$K!s+ri1$acC1w4!sdd-g z#6_C_DE*1bNyi@aF~za|acSegl*JOUdWGoS^d97>Yevb4ck!r!+C=NOrzfkQ$MEys z^V;*@>Ti#ekJb7@KoB9+ZbROd{C+{G_B;=UXQq2IHdE5VRf!&+XwFHV2memEQsfXE z+&`=zPnctx3vh0B{zjr7Pa9YisL`4+jRsQITgn+-m{vSO{0H`MU7t*5t<7ona%>LDx_-{&!{)HlK zFP|p>)bFq|u+TEGGqVDvX8+;Q3P3H-e@x52|7M=?kHGAIE`o`dj#&l(e>4ApHZ4^6 zo1iiLYIc@#s$ZNK|5hkN+0#WYp0LmF1ODY1EZmria@_O9Nc%Yg{kHAdz+wFaS*FUm z8G!=D|w-HC$`i*(mEEQO|x}V@QoF zKxbrapn8j0K?pkJP;|I|cEKdWw;eN_FRc%WP|9ahht@~M-<)^%3>2sQ4FZ-;ab*Io zQASIT8Sz5p{yVD0fEsxSRfJ@!IV{81nk$IV_8xDv4eS=}u^n>+7PU*K0mDx|oLe~4 zoMTBv+wNK^q}IPg2qWX*UN*bZeCcq+x0!YD8-Q;3!6zY%M1wIObMmX1NCT+`@~rTc z@Hdjl7-3~`C%};9oNxmvRXF9)N!VUufkrv_?yd07kJ35|krwNG^;=D14VHyz|8^Nb)umk@Om^5t~`msonvHB?( z4Uxn;5JW+6-R!Io8KG8z@D~d|XP8~I3`>O0BG}{XkEbrEPJ0`qvs@&mrHX447^M|r zxWe&si}s)2B%t8NjEbks&RoZN?$)ld98X)`UO^!~sMjD0r_R<}%31R0G$4@pM){hu zUfZtRdIcVneW>^hm~XT{&W88&eNcnZ8~XU+c2e3qT<7k$rz|MyY@SaKDJoRkz?h*g 
z)vY9(H8xHi;xh4XzVMf!%4JXFU`_6m1^lAdre3+O6DCupe zTy}6%Ynif}BThnGAm7Ut_3>9q)wl}d^!E718SO$J;07=(1ON4|XgpZEj@W8b&kW}@ zBE&F#juobbYUz8OgLLA&yPGj}rvP>*)lhI%K|euZ^Vbi7a@+^| zBO4lCfjO`tr?VYCv{)Kv9FvCudSP6G8r*nNFk^D`>ZBpxk&dvY+?t{Ri3ht1MrOiD zU|J!bPsJYMobks7Kg+NLdDyq~5OAhBg!8>?Ke3^Vz>@?<@}9zonDdRjO`( zCe)(TWl|V)5ZuhBSM8Rk`#fxrwH}sVXUblF@dZo-WM(fa2+NRx#@xI>B#B(&j+r;I z-!yJ7_=}Q|EgTthHfDIgeEy^wX&!_iyuBRhDPwcg0Mv5*jKVI7!5T`-}%&2%8Q5TXH?oy5vG z+VtuHbS0a)CjN|B?12aM)nUT$g&ktbw{~`nsgOqf6)Fx%S|P;BuzsR!VN>ezfp&%{ zNEK205ulnsV{4x^a9ODH6H*1esh%q0Q7Wwm{mrU&Ic}JzplBooJaFG`4FLh|#aj-O8pCEY98c1x7ISj#T5Bdz@cc zdRtLY(i{W~gCJY1L+1Bx+J%CB9``f_)?+TD>z)O975o58RsQlb_qy3}1ZmM)vKqzs z;dJQpihe_7SW{&~5FXtSZt;paF#(^GDsx&6{ucG8o${DyoMIJvj$q!)^FSuHenIL; z;e;LD9c-L_sO=_qWgXm-D8m>Q8O3wr1mX}W^n#a2c@M)H8`Ui8-GUI5>;Me7c(P10sA!TbOy1Yjai)(i zd!lu;IXRy(#6_(MIJ1`si6|5_t%yl1ctjP7UCQH>N@*D4S^pPzZyiWeeO3Vy9vOv-O59Ya`*_+O?u~ z$lZrhwyGD*PF4tWHH>8j=SIh|*xEc%LuCOfc)^nK$nhrJboH^O_NxE(Z0IOLbkJRBM^7I z%F>FrY{^2x%8Ey>o4lOFQ*Oj~z)6leCBjIMx;{;#*~KYz0cTTcYr_e*pu$X7UWsxc zLNctD22naE7}_C2YGJ?$)f`6;!jq;)qZeOmd*=&6TI15=QJ32A;=or}LBvKKefpXh z>SUx}KEt5!kCLC?=%vEQuC@#-P^T35rl!L*~b*)*hWJM~%Bt4H&K_txXxPhm-i#iSL)tSd2!i6M9Iz#ne{mA@XD{XA>W zx#4_N|9bWrA5yx>S~p{ZwpBN-`uy^Ad*peB&;p5ge4{(ODX4$tCtm>~$G|^8#A%LehvuZwYIFr-b>p-(u^Iu1YU$aX%->h zpOb}v121ufS`Wi!@qw&D-{HXbzMp{?mRUYVG%#u?Fo_s!Qq2ugdUYPsc%snbS+wMK zjp1JAn>q~~Y@F7T@6O#)$n_T`=qZccc8hEug1IzsHI)0v(yD`G<1o^jkg9_bkl74X zRjskVn8pfL^pBX93+I~;&1H0p^mL6+y)MBmuhfaRG`sa-W|5DZO=BnLR<4k&@n&Vv zraIwlDyo$^FZx=z#r$P=7Z+z|_scnU_`u!-E6doRL!+)-_lpS>=QK+cs>3f7D{dd} zdAZ-MbLZRXYK^ir=etWKH(o`nVicodd}_)0Vlhg+E$@37k!>A6>2n=OR38yBZi-#DGO=vEkx6`r-#B(;92a)XU~)Lk~pfeNHh4MP5y#ul?+%XAkp}U3e5v z_oj_n;l2({n;_NFTi|jYuS{?8<>&o6apwG zpm80SHOz)nX*}}lb!K$0xER?kNbDxr z9H))q(Y^${8my=3E44G~1owM+qxyJ0#_4y_yZvUqcSCz)8*NjBZ&Hx?p%}pPbiuBZ zpqkQT+P=^Bb@iCUQ-XXALR3)}bSbvQ9yqG72$UWl*HWV!E3PiulJn4T-Eg~4|DKGq z9c97rFw1L%@33|t?v7B=0nM?w<4JlmRw?GrdmI$!$BB=`@ zLL;tX6PY*Zo>!$~q@>ntr_$u=zEnv%J_6nllRL7!bk+LBU6ra?DxTWI33YfS`92K} 
zk}vyF!ikDJ; zHj*Rscr)4XkYxFznWQ3<5KQl`Xb{bw#JrQCcwijgF6(BYEq7EopMd8LGhr?E@m&;- z>mAC!7xUeea_S{1Xqa60Ie4vdOF4bYr{$%2PsNc#qEN&H>NlHNH?G7&o=L0$KgBb# zJci#8Oc7A1)?-BW70qArZLxK4$K*s7kLQbqGK<}!LbJ8(=qOiLpUbOMO7CyY4a7>U zJ!`d}ohSTCCH9mX3Yj3kNCYbOl2V3u>^PhRO7K+$$f~NYkQeJ=PGDi~nzhO|e^L8B zGjTJ9nGzkay;X1=>Ap69Rr3v$z#L}mdEKe14B-Slv!Oo){&V`E-R;bTfwy&xvvI5C zeX-pY=#8_z8!ze5&aeaDr0Dx+4}z&3d?a#`O1K%p&gjI+9zyG*}XfS3+CG$3b~Nq~0( zV|C?%*w;)8Y(kf9pUVNEkxA~|7lf;i-4)3#8aNQlucp@>7q#Q1d8zlR&c}ESlI?=+ zc4D<~bq`SM?q)gEt`bkzs?XE7v{O4x{B>Xy>y#wH4{{D0W|e>_2RwfkQWJGnuzcBs zV#y}-B-pTCIC%hPk=8X)#^kM6fm)(U9UiA z+d8K>c(fmB0IA0f&3!yJ1v~ zwc-wKhM=(c(SS|(c~S%HIFY8)b=bp~0GVt=sHF9U?a}<_4dYZxX1=o`9ZQ_OCsv$! zh6i=*&OY>keN-w{4+Bxshx!#O8rEC-G0%Lfyn6#N8?(HZyKkX&X@nYQv*#N&UiQyy zHJ%)}6im3_hi_2#$JkAOue}sob$g|qHSK4(^sRv|6I^&8Ac|&IhA>lvUVT`6lo%c6 zwTz*=f+)5VzHn6SHJ?g51ug>H#E~toa-ta5o)8!(YkEB!e*R`kQu4NCs83PMbkp_@ zt6phS3x}(GGqX2)H*4g~t9&|(;@I!5Go_Vz7Rl_nua;x=Fu|u@ zSy|%m{4g12{aD;V^MbRdpD5&5}73xyz(U*-{y@h3-l)_U5<%YHZ(iWmPGT@n7w2*A;@P9 z;8zHo?n&y9W@RaH{WGUNvP|N4`M7p?e`sHfs)MtbgWk=-F|&DPLy6yYp*L0!y^yO`)HCdL><~6iX1X2pWH|UFOOpoc@QNTiF8U9c zj`ULul%O)NXcG(D*&X_i_2Zl6s3zlSOE(0Zj%ulJ%(w6xn1h zx0VR#7XNv;Zb1CVHTL zBv5^kiJ6fe_?G$KBi?_c@uZgdWQrE3P0P&4PS3{4#QbE|@uyBLJ0lxCfR*{*D%L)s zWxoO7KpUw4+#0`u%a4upH)Fc0;u2yal0O0P|FL2%Q1$X((Qlyc<=+tcr(pjN==XnB z&i^&&8>m?NSf&41u@^cJI`1pB z7bnD11TmCVaI&MHgFUfgxE3)AiAPd!cTa9ovY{MAtXKn`Ze*W#A8n4EBcqstToeR= z(x*zJw`%S-BBZlZ?3S0V>j_FyFKoYuXstkcnB;N_RHWU*;Wz*S-yAr^NQ^rX-;?H? 
zwFm*|upW`QYsc*H1FdXyLdiIM;>sxsIzqDvJ4QTn~SQT$dU}0iI}`+qIiRr$7~li zlIA8t{w;AUc`M6YiiK1PQC`F;mwLom7Ko6GQ9I_U;tA}qM3T&1d0dn*_bkdb9dT?i zsuCaHIaZArsNZ!_Otvlv*zFS5WS^4%1Lp#&BmerX{y6|Rm;q0`=fANFTnfaxl_%5g zhkW|%v?n@yDh851uBVhe159Btib*8nTQhcW5DB7^Y4@Za!F_aj!xBKIU`Ui{YYNvW zd1_ukH7{!#Eki)It@OsUB2A`ZU?hLvlwx`?e}Ci!*?@I=j(_+4Lvwt&$NSN=o1D|^ zhlfGVhuyo)IhvOsvG87k!PCNCu7$i0s*0~O2$vp?pUsNb@GR_bfVZuHKt{(6xY>N* zxVdxi@9+XSS(xI$i{K6^%?3%lMP6(`!4|q)E1n|k^gZvPYwW2wK=pg|z4t?_$E-q$ zV_P%mZ9!oxIa|&A2?cljdTS4oiQaJriJ}B&@%St2#Ro?_0u{UXNLm&%;@XQ{i)~ z#sl$4`zF;&MS;x&D$gjf(0>xoJ>s+7R4$$zEQ{xy)JeDeQbg5DUa0+xURjA2?uL_Y z6Futu+vr8qjX;VEsprP+!?A2zj*Xlm91vB5urHBc-{5kN*5k|%e}>d`vN{l>-?6+4 zHh1&qZ?geq#W|;d@M_CGdJRes0o9|DRX~jLy1O17Cmk6Av|#yQ*^%9W0uJx1ei=#O zHHG!pP^KthN*=s4d!}{`g^~KL^=N_AiPb>?B1ro}hoo#oheFeR5&V86h*yC+u%Yuh zXCgO~At-w|NX0zpLdi(qnxa0`NQlU$nvZ=z)GW;UVwT{|MlAoTgWA)GC;ucvfg>C* z$(U258g(4UMU*t|HGFgTDVQ+cO(&iw>|Rexaf&*s>NGC5CWLS>2UX`wGSnDRj#Dub zDtHo0y!~%?abonl)RsgZhUN+6$Z$>(8=IPM2ub~gbE9+sb5U6etFZlV^~s6TcA3Tt zRXV4aa{cYcL!|PnyM`@4$vK7H;Ua8~bHM4S z%cOIC5MW2(ipz#JL{@jOIFJB;;gFMs9846`%Z2L^7phqDZ0Poz1gvxGk~9?YrXz(*(#(0u)t zP^wpj=HhbbF7EYJi2)eaOP9BOXs#*{Ca_+~DTF5tMZ9qWdzuak8cAlOq93Qs}y6g|Zp*Vzc>iyUM|X zkwmCz^EtyAIxOr3*wQ zS|zgY;2x4-Ff|u5qFr(YV3l@`({WIloFFzcJ1vA&!9S`1zFpO`n?66oFsIUEh&u~8cmm)!S$Hc4>Sx#BKGDBg~=eO zmfX*u@h{Wm;#(+nF#^D@!oFU|jLB!~!E-%30;4sQ(+gyT-7V_%CqHQ$asppiK7Jvq zI@{|o7&2;LnZQY?ZqNRHm^wzu;kXnbJ61*0B{SA?t>SgIlr5r~-2hgOi)cM+o#&!h z;a*yA0{ZJ70M)Ux=qL2o^|8@PeT|+n2&$by0=pepR&_#{__~~+Mi?1~ToLA`4yyKO zMvTTX9M^9&9mJG6gr#0zrBN5EPjrZw%iju#mgyv%giJ)+Y9*o^;|VY1xs2wbcSlFU z_hF5W;z4!DT5@OYz4r+(jMpnvuA$YTl8=)PcP^yA+v(i$N2V1m!VbPIkHn^Ba;dnGGiF8e)+;D4` z)NPD5{>k;3^mwATgSe8Xs}ZQ)lC0Od=^ou#Lv^CQ(uA>2IlAr=V=b?qwEe5uH|{hC zCt0{hUJhKQwZ|S345QcH*2xK&L0h)f@bXJlwKvXE<}G-A7&j0Cn5e{LSSU2-u%`Am zaf}?rFr!gjAO^4NLKsC#ULc@YouO&KgG+?O8A4%vlV?n*eNL578*Lx?K^xXgp%!KE ztt~}{U}*tskc5bZEQw*&3zQ(>B^+Fj9mZks8Vi^LE{t&3>l}3oCERDtuW?wUh!G_QhguW1B|FQi$q5Lk!sqVDv3tV1s-!?MX`)3JcSAuwHU_8e05LoPzV0a@Gj!`u9 
z>N;rgL{P`uu>CK)4%an1bxIJU@{o4LxD3f*bopM0uCpI+&w{xyV%naTmKw!>O%4!^ z!-#0YR1|s+L#m;QA)U+w$0RXb!2&lOOKQ3qxxo|%aCpW>qK!1ANi6APYk8(tScWa< zrD&iRZ5&72J(0^}5<0n2)4;1%n>3A4T9lucS6yvl8>N<~5qU5-F~K|T+QhfkyN&E2>qNWO zBV9qq%h(n)ah$L#4{hRLLS3|yBxnI7;fuV(b zWAVi4QFp-yh!Mw&NtQFA9%B?S^Ui$rtX@!7)wS!RKV@pH>{3!zE>5*A%3>}sq4y}A zuulzQtsS&#Z|OWpbezi~uRT1a zaWN5iS&?#;;@W!g`IP^IS3`$Oe$9b6gRF1OhyD5_pNPOZeFISHO6Wxv%Hhl0$n>PV zxmX4`FeCV54A`$j8a9jp`|<#vdi5Kx;Vd0%422Rks>ZSY)bx5AHVgTaQ_O=R=A2K??tR;lwE5Xh!}zmFy;RL;{~Leu`6 zk{?hY=WV|hq5pl03R3+wFi?)!$%Sp__a`6zW&A;`qFW92dJ%YH{Ei27cY>97U^ z7>Jjoa0%-)QYl855FP{7j@aiH7)x`I3NLF{V#sILO(r-I2X>y%)5(}RS+8inLL`3> zPW8^*H@fai@Vj_jOF=P6&Tl+}eLOXsw^+evG}H@dg;J=6D{+^<33nil9iHWsK~;HS z=6WZCOsfF}5q1X$=yqER52P*Z-SHSyT@y&BjR3T(J$-U@ix3HvCZ_|~) znW;>(!@E7&y28Dry?Wl?YIa#__>lj(DCs0_`n-Tf$L_(tFPD44XKmOHF}mgMI$*K# znVmfS)Y(E~FB^YTx?wH6(d4T`9H*COCOnWL$7RkCxFrR8QGX`aQYl8EK2!o~aT z8u3ft)3p|U@565llQLEOBm9mJ^RtJgdS1H+`*Es^8dvk@*fD{pt@eVdcW)+cW}+7xvb;xMwP^Y~C@-_>2Ye<|`zkUW?khQof-i{5Xu%-i z9^QXxNU*ZltdcdUotj&v$ch2=t)V+@XT<(H0H&8511pWIgMF7F@c1b5educ;epM3B zV?;}b>H@J!QveRt_qWFHd=rQJdbv*7(;g~W zKX!(Mui_kLV3XJ^`tm)Ujc;c&4GDur1SC&~F;b_o3yf%AHtfk7+KTdy&B)4-ukBcJ zCsxo0Z^ZB;vE##Lp6nfFE{HF>ft2e8x|k^#7;wX@Z+f1O?7ilE51~{WO5ShMpwZ2^ ztE4II*--Yn&_17W6x9oA{^00_$&(JmjyUjj3UUp5R9qp2*a+pSK$oGmb*7?WDQy|; zg$h-jD7I+clqDu3Wi=LA%m7R{3P$m0--0Dy#mW4nVL?wT=74mT$$rc;go#VGS_JbX z-W8hrmNl=wKC}1~!g_w{*437y2yuCytJ3THYstkDQ%74K4-GBXYeAmxR;W@ksT7t!rQ{XacQ6dFY#G zWT(3BlC#)>>gtp^Vq|5zQ?!lYrBS~-W$rZX2{46epE^}(DtTG$!9uNvzyOt+b1L-} z{>}J2NmJFBdh6HblJlvfZOr(G7p+dmzKcBC_wxO?eBP=}hbgA?1Y(!!{n-M5u4927 z(m=jo6Ah`HUW6))tz07r9C<1>{&&p#I3-0L=5O|$Mm+iuL86FsL$KsX-RVw`rfVtl`?u_dS)7B}HzWi$ibu{^`xRkHy* zO~-Z5$SJm->Dqd+>ul9^YEm?`TLcH2N2CjxzIsiT-=6aIQ9&r-w(GUPcc1D{=OO!k zU=iyMS{t3&I#*xK{nYqxy6dI$rgQcv4KdE7cHlWP_^nV|r4z3(H|fPHg?nU1TPhNc z@FAR+b=_gc!vyf;dn~ttg0Sj2XahP+qA-YU8TyTQ_gn-qTV8CuHaM;wNp;paNL)Rk z%G1z%i-5zd4GD?V8s)wL8z_$#L|S)p36GaJ7FuZa$wY~5WC*)Zj-_+Xon93F3c{6I 
zbk!~I$TYl%1WZ5Q9Q%ugL%3S(1To0$OYS*H?w9EOhhkL%A*OgfXneb@I4aB|)Cg~_ zv1aMJ_S+Jbqv9NbFog!$rh?C60lvUZwkzq0Yn$BOB=+e+hl^|ON7Lm)m_eb}^NcXE z9G1A&f(P{0wImyeV`gfmd1qYblbW+J`=;-;9FnI)%itB+Kr$&c-3Jh&uL5i zV$KLys}zAJ%~!9LIGwHmBJHdmLU|ziUN1wZj>7UFO#SgQ6yQvdyRg*+7@Yl)jv^*& zZ;q|-E*M+1*cO#+<^8)KP!A5$=a^U&dQ&DTTqH|1SvB*Xb5eBgV0s24=1Hbe;g;|` zP|aHGZr_bg7hn*E@?d(XJy7Y8e3x!gr!3_{8{b=!@{V$4l=<#%mK%LTQQ2k;;!B$J zfqbG^(+gnb4gCsg1nlN`JtR0ige^~wDq%e^G-2<=mma_UawC&sjpO;Uoc{x~vYTGk zKcQv+(*x}P6;k|fL4KAWSp5GQ$Paju8azS&Kdfi|!!h?KobY#)|Nr)wyQrd=w6YA1 zoQa#TtgL~vIh~N5ofX}m{^WqiWBwfsEI2BjIjZA_U}Po9@I-JVl(DR!=A7i$Hq(Q|ev8!!0;=Om1eM&4%vN4Ej?|ck zI=T*VEj{*P#af;Aar-p9X>v~8Gf;WlM2>4l6J-Lp-Lzbv!<%9|P^x41798<79~5ic z(M}|faAK<`3Pa;Dmk-c0&Ou?a-Y${R>p&l8P-dH#C^$mG1|an@kGng$qCUex&XlMk z`=%vf3q@Y%IWJiOL7$4N;9eP;f7WechR+sKg7Ka=qSo^zSm~Bb>qHu&Y5No&$Ar&@ zIS|&jW5gY}mr5?3a65ZJjHsB%<4wwcmt%>XYp^8loQfK6wl!jbha;lz)h!--_8eQZ zqzvng)Pj9Qw1v&f=6BV{;eIC;`Yxr@ZYE&d#IAS@wdwxJpm`oKBmO6~!xkTodB0Ea zhQ2HXuozF(FHOy&p%RL}O}bEBhBaf?1CNPrBn&|t8uC5#ZJv z*~w#{-7L19S)9Sv@A#bCRooI2w_fK{D*PGu_h@OAda+OVRUQ;!Mf2uW;B00MBI8^f zU!a>WN|n7X&@WcLF5@DQ)6XnolCULmY_EPII?v{O0Ix=TqqBH=clNEb!XowHKGf&B zSGLk}M0kdp<+Ms9s=~lt#yc82Uy<$`4;{KYrK-nn4ZGM@U3;NUtle`hoxzmUgKuKd z$KWO{@oec;>lgRnAJd@K|6!a0E!X}!PJeRPpAJBuBuYQ6M*q_^@D${~wiyjH63y}` ztOgp4W?|%Dr|0AZde~%UWdjPtp4zU`lD)ML{CK){L4M-kM}(hetzDd)7;Frj9WC5#T&$ff?5#Z> z9k>FOp?=Ya`lre%7?_zn7EQsx@zEML&@AyE0%%ziV+(^Pi{6i|#0qRZrbo-(94r8O z4h~M>lmv_<4D57ZET>1a#eYOoGI6qVaWpb<0vdDvgY?vXKzF!LjSKXL`)i4Qw}kzN zl#+>^?LYEfy00r8gUgQEva5bJ7-?1lDU;BB=@N8K+(uu1G`E81njzy)#1?xDB6#l; z{@piro+K}sQWUVK#y)NQNl!nn#8gA@c!)dn0jvErU;;h6)q3IK{Tc(q!$alOosDpW zU3RPOf?pd>7kbQ4x*xBt#wm`5bQL-goIzM_IA)AK5Q6wFLe z#U9EV`hM+&1yKBFuaVwNy3kFKz5zoxxT@LGf(97FSZQN1=6l|ZSZ#KT&wuI~u4o<4 zI^@f}@V+}}#gz)Cyl+cCJU;xGPUtH#eOHCj?Z=ksPDcJ#fk&SW6#``}{NkHkdQAGF zRa}LTvSFfO?ri|hEJmYWCY;|($OMr{*5}19V{nM1ls)`aWtannf|WSawlZLZB9|CDiN!0&sYH0nNKC2{8>D;B*^|#^{^E z9d%gEQSU!Tw+JfBVtk_>d4Mv66eSZTVC`6k$nU5QhAq9RSmz*!MaxQ@%$1+#pSkBw 
zyQmAvNK)1kcQijxeTfi=CT&F+L6G!m8-^ur<^p=D3mP<^6>b*Jfox1Jr#6J3bxh5c zMd_-lz(t7kTa9gL0Xspp!`hcPuN|KATcjmD96$g-_Cu;;V8sGZl?SjCtZBP;HXnLt zdXy2xSu2vr2(frlEo%@)PX*IbX!bc?xFVx%O2jIC2CrQ>QuCSj!8n;92(!qlAD<#i zW=I)4EoiWj^X+prWDIk`I+T_g?I2|7Gi=dVE|F|W+^7WY{7x`XXYopBZ#d$99v?ha ze8tX!Xb{lDcMyi9RL$~1{Rq7Mgt&=E+)^Jp(isk*kjq)2ToBy@D)n+2cT?E*69?YD zX)G!!*|sguU+X}${KACnuHLe;)y5??Q-oNMPv_8jYclSnm88Po*c(k2jxpP@5m1a; zE-A<1Y4{>T95KC;*}H>+L0g3}wZ*%Vh&2pw;aIS!=j%ZY)IX|oa8J~J3Fikefehjz zxI}xw3Uy4-VO#KRny4l_-g2A%&O(w-Dipp4uTM)-yc;h^?V8gaQJfNZ)wCh}Bg2+v-D7BC;2H zMlF|M;9z>L(rzudr2&^9XuaG1F!mSJEdrptSGK)^SjJ|%qJ{Mka3HivlbHH9-f zSy)#X`PB>xhM~{BVQr(wmGe2Qfk60BLHnoudimSeeSU7fNsZL37f~36FB9-OWc02fzc*V zdzDouGp$Vz&8Z7e>EQIqg8iw6Mp#29Goed$5I_T5pv( z!}E^H+4Eys0sa@k5A_G#Y+gzu$6_qIVOFyj3U8WgecX$Vr50q;_6vA>d=^^It ze-;6-I@pqb&yY>m-p}!T&8#=LU%^mWb^;2{A2VVqX;qXhjK+Y8=H@h^4qW#P&2a z@k%_nO#mrc!bwjGRn4Zos>qn9fL8(ZMd?Yrz|~@1R-x%{w&TB^D#D&5LcxZiOH!kb zXyx1qR_jQ-!#aEUniBWo;)ttrJSQvuM0eX}z1Yoq&f){)!!VS^Dz7&B+%x&4o!pT3 zc{l!w2llhgi;sF& zd<)t*_GM3d)DV;bcl?RFa3-aDaB!# zBVGB>5|N1Eb$4yZDNB)+{?edyuGJqBv{Z~FOHJB`E9e(<5lEZH;^8%G6x1Uc3{{fb z-*#8u_G^xAWs5sjSJAj;&Wwl^<9^-0RZ`5R=>f;pv3(daattE%Pjx$OU~X~19UwL= zVokWm_HRA~eVE2Ha(d@QQFuNmx0|UuktN1Uky|GEP!0+lnK1G zfNSC!RyK1-!+@%CogoB{#+}M?#?;{?v$}-^`PtfAi)ebug8=W8vNR#u59&GUxom3gCMavtGTVp)usz7-hz6zn(;=Z5e3PVW{p?Z1b`r`#~>y$N|;(m+k36Y?K+YLQbrLMguifK1%sH z`>C?I#~2$!w%Ky$o!1sKQ}qZr2Gj_GN6|wRsPS9Pl1ZNnJ(S!eC3b2*K#1jC01ZHk zb$vF>*5UYAb#S-L^(IqYhpKN9nY5iZx;xI7*o};VO!S!++E<39L}Q>mSF6!EhBmKO zkm_@Idn&InpTWA;_IfzG%Qyk{`UP4rf0IAvOP4;?Gs~0=>|>@4zG=jGQY++t39`!Qydb8r|555kyNSf7`N;~K%|e2XXn z_J*?yL_Pw=-#}z}_f<4VyqwL&&ZbS4)tThx=e9eL${7cFKH?dg?m-PE{X+h>I5x&1 z|F3F0qliTpaM#3>c{f$cZZCZ!)AlC{2qX4mc4{1+r+1qA-Vh6vw*^XTeM*S{2&RX) zZ@nctzuJYA=%Znj;AWYnZSSc!-XYKIXhy!yl>HiOUsS6T4mV`A%KyFOh#cz+UA_d5 zA>Q-$4R0ohQA9fZ*Z4yea}peo^Yo%ge(7TYlk1ES&E>IbItMddQnzRYT0lQ{ErMBA zXyP*B9Q!L>C~j1aTA3PF=_}5~XubWapIuy5+zSyX^fEYNA|b2s&{+rt^v-6wv;rO@ z^+w=UXs2_SevRE-=>l|y&8bnSMNn1fW^x#Qja%I5Lg|ds!K4@@M!w#8COntzA8TJu 
z>7-}QqP{Apx&^d|-8p{`w$1!j^B?wH%)cMr`(sb=|CepZfowv9lpe zcBNya1>Jzcy4ZIcMcG5?}T-FIxoqSy?SWM)+uGIQU{*k-?DP-2LO zI?nhFiF8t{=qAo^MGsBLb;GFVzy zCS3s+9D8rH(rXN0@g(~zxbo()1xW+8-g-_ab~KBT3PG7Uryp8TSWgkx6&nl#&c}5> zW@Q$-I2GpX-rJrv`9igqvZc6?^YHpSnPq(+%B_nD zA}5cpay#{7#7OsJ##~uq#QWGbSH-rH4Sd@Yqf%>pf^=IgrIk6U=h@%g3QlJ>Vhj^E zKSzp0X-_jBLcGbwZd5^#wz8aNzi^#9gQli~FA;}Bn4FMfTn3u=+F(!&fXLw4LIZ;$ zTYgdeEI?t7bf=Q9y0oAO$|*l$rRY6hKzs%hD`g)|r?=*Jky`LCFl9ST=t=mZeutww zyC3q$^eVAVNjJkP%b9MM4^xXKlRwl3qlHpNgTDj}Rk1@o3rm{G$Ne5$%ppg{>_Qeet@jyo1gWkXr>dS`HkU{I7=G1tbyLHRShWLCUi(t#=midKQ8Wxf6eo@&2sgIqx2Lk9CgM+~|JzZqtKtbx#-A8p^+RrTfYDY_jcaHBV=O~z+gd9J4eORC@ffQ0wK|p?;ZPW zXd&Nw_L;<0z4t7ZR7}4&%|U9cCAtsg>k06eSmvxOdb=$O3;*&H;d6UZNn(Mw5VqdE z$@7Cg%LA$z@SOKxGa!RW$M@w|o>8s-Y0yQ4RxnG zlPbdu?NedjYu0oSY#WJ?`$%+h8=w7b9t3EJ$7}R0+0JOGa0?k3d z61*c@Twn$fv5kOPWkYHbA=bCRMSww>SS(*EA z?=-+}+jbVG%aEUkxCD$pRIWZ%gtP-+r2iEW_fJMIXUPpt%B0vx}n_^K2ga$zB8_TSh>0OX&JX%qo+;^ z#B4q~vrs@TBRcb%Q6Vuplha{1H!G8~UCz2JvkzImsw@*5Me#lG*WgJ4^S9W=w)mj=KCb?)eOrva_u%F!pzTV zZ6aO)BYHOQu_R<>SIdAZRb=J`hB8d?PP#yaan+9Fm;ou{2VRAl?`b}uZ(81=l1H(8 zj}($|V2+E+9m}+rz|;8@yYF30gE+xDKiC{lX}Ta#;hTe( zi|F=MZ+jNm?gd=zp^GAFUvP8KtRy$*a@VOlV?H$aH$(2zc7N4Z-k)O~Z*L^w=73*M zY|j7r_8<-adDUA4>HaZW0oS2@XT?w~2e|`hV)ql|)jeAQXSqXXNs=66@%^$bA=|N} z6ZxpLf;(l0l0x7wpBjeMeW87)mi&F8!mO6!ec>jy`c*^CN!E8~jDrI3#5UfBH$fl- zZ6I{-vz{ZEaR^bED|Et2arc|oI-?ptgDW8xV+~@mps<0J&;}bx_OBu*@}9p^f!8}t z_AfqsDeYf?uRoRSPb93z?HLPo#@K~~aD&BeN5Dbq`&rravvRpG&};&ea8D>+6L(uE zZkbW{vldh>#h}`2Jte)*RuK{Bfg?hCele*5_}vKm)P40nh!!b&GkLW#jOsIam>!gq zFVtuqHYCah;842L!0UNzZoq1{Z0=U-)B>5275v+zwYk2!K&#;f=hkXG6 z@Z`O{6m%A#n*iQsc}G z(B8~qI(ldRhyZS}ZLa{2+ix&YzYCz+qLh4CZ0`UtlIJ?Q~|e_z}D6zBd5ygrIo ze=Nd&rT)bFf1&=q7}bzBw0wlUnIF?t`T-p~JDWJ#5&>QI{Xr3RcNSN6HgGl}V*X3I zDQstL=csINU<4%o5aR-uX^&Fbzo2h1Mxf)rA1OS(dh+e}^YO`d-_J*&2fja{>c_x9 zH++A_1(y7m!awEpbG7N@Xl6*H%^)c(EM(whVhj|p0+W2gvW?kT%^VFpetr3e(%CPm z{oW;itP`1;p3rJJq9VFSKdaDP7ylo4IgjMf20=>wOq6z~FfIT@uY8c^@QRhDXhTwmqWDKuwKh 
zh4%QpEL50Vu-?g)METBf$V=|bNR-L2btT+l8|$s~z1bjpLh95s^nHRiDE4G=0%tMl z2NSZ9o;b)J-v5dSUh@|j0vcR+V*P6 zCjd)nY&&ZHD7s}^4mVat*R7_dI-FX-xu{-=NVeEwOo!1g3D zk|TQ3g8JE2|5+Es2ugz~qK6EzBt2J)(7{1KDTT}R6$p;CnfV~7Ts(wfy9Li1sAgu2Nh`c9J3P9WlYfRr{9i`WPqB0oOX3lxJ<{U~y8}X_FgK zC>BEEcve$gh^r&5Uo*@ENi`Ku1UORP=snZg8W&FYP`low*P{2a`QD1&awxqI+^S>H zP;g!|Sj60t5ZT4KMg+*0h=#4snLc+2yjmhomqXP@#w|PTqId{?X2^L+ElhAgMidv= zcAD{Zhce;48$>Ys&;rRRMel0q6xFyj<@h`_$;Y-O*~0rqr644MkILub*|2P1cHZZC zgKl|);l3!>1)Z>aHANt~_YaK?R8swuo%|w9V)-Fod;C5v9G#pWeOLbYfEi>A9v`1r z)=y?e_u~YEg)`7YETuaK8y(O$sGyn#gP5!!k&~H;jSZ2Fg^{D36_D4MSaShoPIQJA z&P1lxb_ULmvaUxLvX9cZ$2^D_G&Hq{*cs_L*w}!sFxfc(^o&f89P2mM_*9;syx|WK z*i&7e7@M4l`>&-``B5?%J3D(KNg&5Fc$CrocR91N(6a)7Mfh9Jzl-AjC1(Xk6W3pw z>yMnpENm^D%zw%8iQE5&b^ZOg!4F75-o@G4!uH36@<$gvcH~p#e{|7bkHU73^9|rv z)(RBa{@Uq3l6;(?WDE>VtRH&>D7qAR`v3QszfUQDRPT53B^S}pA@Hx=|I`SAF3#q5 zjzpBI){YidM9Quvj>e8I_D`>b9Zd|JAKw8>1soNO^Z;gh0M&1$f1EFVcK#o{9O%#Z zNjdvRWn@j94U7$(4W8=or{iD^S((RloScDK*ue4dy94JmD-&lTLlZL#TV9gG##Ry{ z3u9gqbyitMS$km z*?%9_#^$%+Dhg)*KD;x7z1hzg_P?~oZ$$yV02YNA7?O|2*oe#d@jL#}l!DgIKW?lo z9_zzp4V=7zEly`_Vrl>sqm%IcPsStJTEOTx}r zj>giJM(}6kUxGfiwyB*Xuo3wT?CpW>zn@qp1F%1M9_#bRD<0r8mx73xnu((ma7gk2 z=-GK5!~9BnOzE+6T!8H5N2>`LnElm!CboZTvZt4iS^SIxYy{wV;*xt>LGv*^RrMFb zAN5o+u{E}MxMn(jWZ}{5*koe|_P6KRE<2ZC(;b z;P?MeGw9C+`6IXa!GDB-yvN$k?2+(P>|c&&s9Bh}nK%OH2~!i`!HviB7Cbz_aF5fV zg(Gl%{`hS($S^&!av-w;lK;tF)8p^q*qI-X!u&=9urUGMjsH$#WaoTb{rnb=k%N^3xLf*- z24G`h|9e~j8{6Zi=C^Q6Opk|+ey0K0I2r#XJ$9DIRnXt!0(<0NXsnM1gMSal!SuL} z`JKkZ&iVH?U}9!w{FiVXEI>!=e~bHn>bhbTgn<~k_bYBGw3FASy?DC{PJ)h34iySw z;ppVo`*JvV2^}w7$fLX_DSae+@Iu+hdr=9MF+^^pZIMQO4>Bxi=n7vb%H#`F*?KIK zS2hffHdtsde4)@`NxSk5o&2vUV_+(W<{AYG1`7vhs+2GH7Bderj-`yjSm$nRY`8(p zSSPQ2qv1G3AOEnd=`p{iROPg?yUm|T-fAS0>$0FLmUNZ-#Pj>PIC*%#803#8X_kN^Mx literal 0 HcmV?d00001 diff --git a/docs/typst/x86_sgemm.typ b/docs/typst/x86_sgemm.typ new file mode 100644 index 0000000..c6fe915 --- /dev/null +++ b/docs/typst/x86_sgemm.typ @@ -0,0 +1,310 @@ +// Copyright 2025 Ulrik Sverdrup 
"bluss" +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// This document retraces the vector permutations in the x86-64 AVX sgemm microkernel, +// to verify and visualize where the elements from the input buffers end up. + +#set document( + date: none, + author: ("Ulrik Sverdrup", ), + title: "matrixmultiply: x86-64 AVX sgemm microkernel", +) + +#set text(font: "Fira Sans", size: 11pt, features: ()) +#let rawfont = "Fira Code" +#show raw: set text(font: rawfont, size: 10pt) + +#show link: underline.with(evade: false) +#set page(numbering: "1", header: { + set align(right) + set text(size: 0.8em) + [matrixmultiply #link("https://github.com/bluss/matrixmultiply")] +}) + + +/// Add string prefix to each array element +#let tag(name, arr) = { + arr.map(x => name + str(x)) +} + +#let load_ps(name) = { + tag(name, range(0, 8)) +} + +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps&ig_expand=4923,6050,4597 +#let moveldup_ps(x) = { + range(0, x.len()).map(i => x.at(2 * calc.div-euclid(i, 2))) +} + +#let movehdup_ps(x) = { + range(0, x.len()).map(i => x.at(1 + 2 * calc.div-euclid(i, 2))) +} + +#let select4_128(src, control) = { + let i = control + if i <= 3 { + src.slice(i, i + 1) + } else { + panic("invalid control") + } +} + + +/// _mm256_permute_ps +/// control word a, b, c, d (each 2 bits) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps&ig_expand=4923 +#let permute_ps(x, a, b, c, d) = { + for (i, c) in (a, b, c, d).enumerate() { + select4_128(x.slice(0, 4), c) + } + for (i, c) in (a, b, c, d).enumerate() { + select4_128(x.slice(4, 8), c) + } +} + +/// _mm256_permute2f128_ps +/// control word a, b (each 2 bits) +#let permute2f128_ps(src1, src2, a, b) = { + let select4_perm(control) = { + if control == 0 { + src1.slice(0, 4) 
+    } else if control == 1 {
+      src1.slice(4, 8)
+    } else if control == 2 {
+      src2.slice(0, 4)
+    } else if control == 3 {
+      src2.slice(4, 8)
+    } else {
+      panic("invalid control")
+    }
+  }
+  select4_perm(a)
+  select4_perm(b)
+}
+
+/// _mm256_shuffle_ps
+/// control word a, b, c, d (each 2 bits)
+///
+/// Control fields a and b are paired with src1, c and d with src2. The four
+/// selections are first made from the low half (elements 0..4) of each paired
+/// source and then repeated, with the same control fields, on the high half
+/// (elements 4..8). The per-half selection itself is done by `select4_128`.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps&ig_expand=4923,6050
+#let shuffle_ps(src1, src2, a, b, c, d) = {
+  let control-and-source = (a, b, c, d).zip((src1, src1, src2, src2))
+  // low 128-bit half
+  for (c, src) in control-and-source {
+    select4_128(src.slice(0, 4), c)
+  }
+  // high 128-bit half
+  for (c, src) in control-and-source {
+    select4_128(src.slice(4, 8), c)
+  }
+}
+
+
+
+#let digits = "0123456789".codepoints()
+/// Translate a1b2 to ab12
+///
+/// Splits the name into single characters and sorts them by "is a digit", so
+/// every non-digit ends up before every digit. This relies on the sort keeping
+/// the relative order of characters with equal keys.
+#let norm-name(x) = {
+  x.split("").sorted(key: x => digits.contains(x)).join()
+}
+
+/// Multiply two arrays (a0, a1) * (b0, b1) == (a0b0, a1b1)
+///
+/// The "product" is symbolic: the two element names are concatenated and then
+/// passed through `norm-name`. `exact: true` makes a length mismatch between
+/// x and y an error instead of silently truncating.
+#let mul(x, y) = {
+  x.zip(y, exact: true).map(((a, b)) => a + b).map(norm-name)
+}
+
+/// Map array (of string) to (elt, bool) where the boolean marks it as duplicated or not
+///
+/// An element counts as duplicated when it occurs more than once anywhere in `arr`.
+#let markduplicates(arr) = {
+  // first pass: count occurrences of each element
+  let counter = (:)
+  for elt in arr {
+    let c = 1 + counter.at(elt, default: 0)
+    counter.insert(elt, c)
+  }
+  // second pass: pair each element with its "seen more than once" flag
+  arr.map(elt => (elt, counter.at(elt) > 1))
+}
+
+
+/// Render one or more 8-element vectors as a table, one vector per row.
+///
+/// - ab: a vector, or an array of vectors; it is flattened and laid out in
+///   rows of `ncol` elements.
+/// - name: optional title. When no `row-label` is given the name is used as
+///   the row label (without a row index); otherwise it is shown as a bold
+///   heading above the table.
+/// - row-label: optional label placed in an extra column after each row,
+///   followed by the row index in brackets.
+/// - check-duplicates: when true and there is more than one row, elements that
+///   occur more than once are drawn with a red text stroke.
+#let show-vectors(ab, name: none, row-label: none, check-duplicates: true) = {
+  let ncol = 8
+  let vector-width = 3.5em
+  let color-indices = true
+
+  let elements = ab.flatten()
+  let extra-col = 0
+  // row count is taken before any row-label cells are appended
+  let nrows = calc.div-euclid(elements.len(), ncol)
+
+  let row-enumerator = box
+  if name != none and row-label == none {
+    row-label = name
+    // a lone name labels the single row; suppress the "[i]" index
+    row-enumerator = x => none
+  } else if name != none {
+    block(strong(name), below: 0.6em)
+  }
+
+  show sub: text.with(size: 1.3em)
+  // NOTE(review): the selector of this show rule was missing in the text this
+  // was recovered from (it read `show : it => ...`, which cannot work because
+  // the body reads `it.body`). `emph` is used here because the only emphasized
+  // content created below is the row label; confirm against the original.
+  show emph: it => {
+    set text(font: rawfont, size: 9pt)
+    strong(it.body)
+  }
+
+  show table.cell: it => {
+    // cells beyond the vector columns (the row label) are left untouched
+    if it.x >= ncol {
+      return it
+    }
+    // inside element names, turn every digit into a bold, colored subscript
+    show regex("([a-z]+[0-9]*)+"): it => {
+      show regex("\d"): it => {
+        let color = if not color-indices {
+          black
+        } else if it.text.match(regex("[37]")) != none {
+          green.darken(10%)
+        } else if it.text.match(regex("[15]")) != none {
+          red.darken(20%)
+        } else if it.text.match(regex("[26]")) != none {
+          blue.darken(10%)
+        } else {
+          black
+        }
+        set text(fill: color)
+        strong(sub(it))
+      }
+      it
+    }
+    it
+  }
+
+
+  // check and mark duplicates
+  if nrows > 1 and check-duplicates {
+    elements = markduplicates(elements).map(((elt, duplicated)) => {
+      set text(stroke: red + 0.7pt) if duplicated
+      elt
+    })
+  }
+
+  if row-label != none {
+    // append one label cell to the end of every row
+    elements = elements.chunks(ncol).enumerate().map(
+      ((i, c)) => c + ([_#row-label;#row-enumerator[[#i]]_], )
+    ).flatten()
+    extra-col += 1
+  }
+  let t = 0.5pt
+  table(
+    columns: (vector-width,) * ncol + (auto, ) * extra-col,
+    align: bottom + center,
+    inset: (bottom: 0.5em),
+    // outer frame around the vector columns only
+    stroke: (x, y) => {
+      let st = (:)
+      if x == 0 { st.insert("left", t) }
+      if x == ncol - 1 { st.insert("right", t) }
+      if y == 0 and x < ncol { st.insert("top", t)}
+      if y == nrows - 1 and x < ncol { st.insert("bottom", t) }
+      st
+    },
+    // zebra striping on odd rows, never on the label column
+    fill: (x, y) => if x >= ncol { none } else if calc.odd(y) { rgb("EAF2F5") },
+    ..elements,
+    // thin separators between element pairs, a stronger one between the halves
+    table.vline(x: 2, position: start, stroke: t / 4),
+    table.vline(x: 4, position: start, stroke: t / 2),
+    table.vline(x: 6, position: start, stroke: t / 4),
+  )
+}
+
+
+= x86-64 AVX/FMA sgemm microkernel: 32-bit float
+
+== Loop Iteration
+
+Load data from buffers `a` and `b` into vectors `aNNNN` and `bv`, `bv_lh`.
+#{
+  // Inputs of one loop iteration: the loaded vectors and their rearrangements.
+  let av = load_ps("a")
+  let bv = load_ps("b")
+  let a0246 = moveldup_ps(av)
+  let a1357 = movehdup_ps(av)
+  let a2064 = permute_ps(a0246, 2, 3, 0, 1)
+  let a3175 = permute_ps(a1357, 2, 3, 0, 1)
+  let bv_lh = permute2f128_ps(bv, bv, 3, 0)
+
+  // Flags a set of rows in which some element occurs more than once.
+  let warn-if-duplicated(rows) = {
+    let flat = rows.flatten()
+    if flat.len() != flat.dedup().len() {
+      highlight(fill: red, [Duplicate entries])
+    }
+  }
+
+  for (label, v) in (
+    (`av`, av),
+    (`a0246`, a0246),
+    (`a2064`, a2064),
+    (`a1357`, a1357),
+    (`a3175`, a3175),
+    (`bv`, bv),
+    (`bv_lh`, bv_lh),
+  ) {
+    show-vectors(v, name: label)
+  }
+
+  [
+    #show "+=": $+#h(0em)=$
+    #show "*": $times$
+    ```rust
+    ab[0] += a0246 * bv
+    ab[1] += a2064 * bv
+    ab[2] += a0246 * bv_lh
+    ab[3] += a2064 * bv_lh
+    ab[4] += a1357 * bv
+    ab[5] += a3175 * bv
+    ab[6] += a1357 * bv_lh
+    ab[7] += a3175 * bv_lh
+    ```
+  ]
+
+  // Four accumulator rows for one `a` vector and its permuted twin, in the
+  // same order as the listing above: both against bv, then both against bv_lh.
+  let acc-rows(a, a-perm) = (
+    mul(a, bv),
+    mul(a-perm, bv),
+    mul(a, bv_lh),
+    mul(a-perm, bv_lh),
+  )
+  let ab = acc-rows(a0246, a2064) + acc-rows(a1357, a3175)
+
+  show-vectors(ab, name: [`ab` accumulator in loop], row-label: [ab])
+  warn-if-duplicated(ab)
+
+  pagebreak()
+
+  [
+    == Finish
+    De-stripe data from accumulator into final storage order.
+  ]
+
+  let shuf_mask = (0, 1, 2, 3)
+  let shuffle_ab = (i, j) => shuffle_ps(ab.at(i), ab.at(j), ..shuf_mask)
+  // rows 0..3 of the accumulator
+  let ab0044 = shuffle_ab(0, 1)
+  let ab2266 = shuffle_ab(1, 0)
+  let ab4400 = shuffle_ab(2, 3)
+  let ab6622 = shuffle_ab(3, 2)
+  // rows 4..7 of the accumulator
+  let ab1155 = shuffle_ab(4, 5)
+  let ab3377 = shuffle_ab(5, 4)
+  let ab5511 = shuffle_ab(6, 7)
+  let ab7733 = shuffle_ab(7, 6)
+
+  for (label, v) in (
+    (`ab0044`, ab0044),
+    (`ab2266`, ab2266),
+    (`ab4400`, ab4400),
+    (`ab6622`, ab6622),
+    (`ab1155`, ab1155),
+    (`ab3377`, ab3377),
+    (`ab5511`, ab5511),
+    (`ab7733`, ab7733),
+  ) {
+    show-vectors(v, name: label)
+  }
+
+  // Each pair is combined twice: once with controls (0, 2), once with (3, 1).
+  let pairs = (
+    (ab0044, ab4400),
+    (ab1155, ab5511),
+    (ab2266, ab6622),
+    (ab3377, ab7733),
+  )
+  let first-four = pairs.map(((p, q)) => permute2f128_ps(p, q, 0, 2))
+  let last-four = pairs.map(((p, q)) => permute2f128_ps(p, q, 3, 1))
+  let abfinal = first-four + last-four
+
+  show-vectors(abfinal, name: [`ab` in order], row-label: [ab])
+  warn-if-duplicated(abfinal)
+}