From 9c5504adceb544d9954ddb8ff3035a414f4b1423 Mon Sep 17 00:00:00 2001
From: Bardia Mahjour
Date: Tue, 1 Dec 2020 11:57:16 -0500
Subject: [PATCH] [LV] Epilogue Vectorization with Optimal Control Flow

This is yet another attempt at providing support for epilogue
vectorization following discussions raised in RFC
http://llvm.1065342.n5.nabble.com/llvm-dev-Proposal-RFC-Epilog-loop-vectorization-tt106322.html#none
and reviews D30247 and D88819.

Similar to D88819, this patch achieves epilogue vectorization by
executing a single vplan twice: once on the main loop and a second time
on the epilogue loop (using a different VF). However, it is able to
handle more loops, and generates more optimal control flow for cases
where the trip count is too small to execute any code in vector form.

Reviewed By: SjoerdMeijer

Differential Revision: https://reviews.llvm.org/D89566
---
 llvm/docs/Vectorizers.rst                          |  19 +
 llvm/docs/epilogue-vectorization-cfg.png           | Bin 0 -> 73101 bytes
 .../Vectorize/LoopVectorizationPlanner.h           |  16 +
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    | 646 ++++++++++++++++++++-
 .../optimal-epilog-vectorization-profitability.ll  | 133 +++++
 .../PowerPC/optimal-epilog-vectorization.ll        | 593 +++++++++++++++++++
 .../LoopVectorize/X86/invariant-load-gather.ll     |  22 +-
 .../X86/invariant-store-vectorization.ll           |  12 +-
 .../LoopVectorize/X86/masked_load_store.ll         |  80 +--
 .../optimal-epilog-vectorization-limitations.ll    | 100 ++++
 .../optimal-epilog-vectorization-liveout.ll        | 125 ++++
 .../LoopVectorize/optimal-epilog-vectorization.ll  | 402 +++++++++++++
 12 files changed, 2076 insertions(+), 72 deletions(-)
 create mode 100644 llvm/docs/epilogue-vectorization-cfg.png
 create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll

diff --git a/llvm/docs/Vectorizers.rst b/llvm/docs/Vectorizers.rst
index 702090447c6..4ea3246c65f 100644
--- a/llvm/docs/Vectorizers.rst
+++ b/llvm/docs/Vectorizers.rst
@@ -370,6 +370,25 @@ to be used simultaneously.
 The Loop Vectorizer uses a cost model to decide when it is profitable to unroll
 loops. The decision to unroll the loop depends on the register pressure and the
 generated code size.
 
+Epilogue Vectorization
+^^^^^^^^^^^^^^^^^^^^^^
+
+When vectorizing a loop, often a scalar remainder (epilogue) loop is necessary
+to execute tail iterations of the loop if the loop trip count is unknown or it
+does not evenly divide the vectorization and unroll factors. When the
+vectorization and unroll factors are large, it's possible for loops with smaller
+trip counts to end up spending most of their time in the scalar (rather than
+the vector) code. In order to address this issue, the inner loop vectorizer is
+enhanced with a feature that allows it to vectorize epilogue loops with a
+vectorization and unroll factor combination that makes it more likely for small
+trip count loops to still execute in vectorized code. The diagram below shows
+the CFG for a typical epilogue vectorized loop with runtime checks. As
+illustrated, the control flow is structured in a way that avoids duplicating
+the runtime pointer checks and optimizes the path length for loops that have
+very small trip counts.
+
+.. image:: epilogue-vectorization-cfg.png
+
 Performance
 -----------
 
diff --git a/llvm/docs/epilogue-vectorization-cfg.png b/llvm/docs/epilogue-vectorization-cfg.png
new file mode 100644
index 0000000000000000000000000000000000000000..1dbb269b7b9cd7660b78bf698107ee5236679cca
GIT binary patch
literal 73101
[73101 bytes of binary PNG data elided -- this is the epilogue-vectorization-cfg.png
CFG diagram referenced by the documentation above]
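As an editorial illustration (not part of the patch): the control flow the documentation describes can be modeled as scalar C++ for a loop like `for (int i = 0; i < n; ++i) a[i] += b[i];`, assuming a main VF of 16 and an epilogue VF of 8. This is a simplification; the actual CFG (the PNG above) splits the iteration-count checks and the SCEV/memory checks into distinct blocks.

```cpp
// Scalar model of the epilogue-vectorized loop structure. The inner `l`
// loops stand in for single vector instructions.
void epilogueVectorizedShape(float *a, const float *b, int n) {
  int i = 0;
  if (n >= 8 /* and the runtime pointer checks pass; emitted only once,
                guarding both vector loops */) {
    if (n >= 16)
      for (; i + 16 <= n; i += 16) // main vector loop, VF = 16
        for (int l = 0; l < 16; ++l)
          a[i + l] += b[i + l];
    for (; i + 8 <= n; i += 8)     // epilogue vector loop, VF = 8
      for (int l = 0; l < 8; ++l)
        a[i + l] += b[i + l];
  }
  for (; i < n; ++i)               // scalar remainder
    a[i] += b[i];
}
```

For n == 5, say, a single comparison routes execution straight to the scalar remainder; that is the short path for very small trip counts mentioned above.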
zSDN^-4{$f}v63dh7+e6VbA{?xY1rFHgE|m9H;kznfuLV^DfcH1%LGwjPUJ-UtoLao zU%-83mg>LegbN>^K&`lx(O!=(suU+F_zrOCX>ahCA$*44W`yojx^!uL4WZDhizBcm ziiKw}jDB_k(o5`ZLPh6k*g%2DZD@C35Prf7Kk0qB+2e&Y;0^q#IspK;XXG>mUqJTt8sI0Qcvge`@)deaHoqCP_lR zk_cyMwu!n+DgP|WT{V0{p8E9ZqZ{ig;9Ve}tdBvDKKiK2CCG4+nO2W6$zn1m8EW0+ z$&>Y1Bmb$V+~q>)ODbje!w)|sCKD*VETLA3x3Xgfj*IGxFTUt_sd$O_ECxq_bL5|Y z>ZzyPd+)uch+Z%U_eIH5qc7Phr<|hKP*yRaw7C~h_cP8o!}aLVgLsd!b#C6gSpjRn zBIe}|I06jDD0&uAAqEC;c+({M5%ew)&C=Y;O(KHwN6}?X?yBuK%rr#Iv*GT1@WBUb zx{N#SxI^)zXUv$PlLIiAgXxVgf8GEhHh* z?!evU3xYU^1J%ikN|_8aBS(&O;Uz_>(vDzGS-5cF0d2a41|;}{!Ha>1eVQBKFr|pT z!wx=S!h}+%8}b;F6Fgbi0&FMtiiascjTcG$IR*Wy#D{Lg1EFVbO$9j5g&BSOKFgo|@PF9vGgzI}`2 zVk(WG80v#!f z9#|;*i4!NPi7fCZYjPLb)z&@F!Q>-Kha7T) zqOg5QmEW3jQO?Z3*)UO{wBLO5O}&+wwB(u5ER%r1GxKJRgKOu{pRY6iP{-W4bJeEG zz1+iAj`-aE^2;yvngi8|35d`3J$v>zgc}n@AQtC(_Ux(a4}1ZnZQdbB3t#=LP?*`% zXRL`tDGPEJZBLxPB?Y?My?eKN_St9M=+UF~%d)t91_nv7(gJU~_?kxfzpip8%A)B> z7KSwUj@b1Rc~SX}&^3~wwGhEA%1z*2w*Dk%T zT%CXa{kP{j8Hl`O9QC5e{FEtE3T1KCku5c`^AM_*rnKp?z&Wjo6fXzr-?he?v zR-O#D3HvF5~o_5GOM{4y~Q*jpMuA1j4zUQyM{wf?oRzk8wCJ<4oMOLHb zr1=}UqaW)i%{nv=p=yKPHJddNzX~|t~kh56OVNnQIxZoGgvA8LuxWyaAGHu4q z-T2F6ahX6g7H^v&wX(@w{4$b6iD}ARF1&X_rxCq}|N8p%>%*bg_b3hmYkKwS)ifdE zu`?+n1robJKha-}tngu{$z4W-suj&hUGC!eOlhPtSj@fj(n}L>vcc9R&f=OX$O3F# zO%G7!W|O-zjLFF1EXZAMZ2FDS*Q8Fk?A6Qc zLLM4Y2nP=$(h`$XUQ!+u^4dq+Xun&}0kSf}NB9X}A@~fx;d`FRGFyTcL|%D` z99I8vNgd(XNM~&H9|zz&1qZHT!@1Az;+C|RB{}W{oJR!(Hq>lwSA`mYkN^Ql86}kstA_RRAP;}a9 zr-e(Ke*f*a-?}wx*64$hps zCIxC{1_{te90Z67`RB=Lev{Q+y3D zn}SDd*yGo(UF*o3&OpZoHUel#M)l!<$zAiwohkh??+&J(nKNf9ZtQi}UFReRy(;E_ zMpfoE1#}R1Ed_S~O^ewAxU{@H5;WsZ5$suJ?95OB$f#6axq5&nL?L1Z3QCe0FoOpO zUVxhLH4yed8)k(7$AB+^KkUW3#BJHKMIrMTz}PU`0O^=TB&)3Ovfk}5xvMwFy#V$~}7 z3eD8hBynN@Ah$3=~v=V zL3XK5(B^mboMyZ2Gn>9ii7b{IjsugS;Fb)_8 zGRgr{`WfvQYnE}qIA9z|f&-@Xli;eAX&f*P7zZ-Sf&T~g?XB>X%3mP>0000 VFs) const { + return any_of(VPlans, [&](const VPlanPtr &Plan) { + return all_of(VFs, [&](const ElementCount &VF) { + if (Plan->hasVF(VF)) + return true; + return false; + }); + }); + } + /// Test a \p Predicate on a \p Range of VF's. Return the value of applying /// \p Predicate on Range.Start, possibly decreasing Range.End such that the /// returned value holds for the entire \p Range. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d87938bb146..347d030ef28 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -156,6 +156,7 @@ using namespace llvm;
 
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
+static const char *VerboseDebug = DEBUG_TYPE "-verbose";
 
 /// @{
 /// Metadata attribute names
@@ -169,6 +170,22 @@ static const char *const LLVMLoopVectorizeFollowupEpilogue =
 
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+
+static cl::opt<bool> EnableEpilogueVectorization(
+    "enable-epilogue-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of epilogue loops."));
+
+static cl::opt<unsigned> EpilogueVectorizationForceVF(
+    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
+    cl::desc("When epilogue vectorization is enabled, and a value greater than "
+             "1 is specified, forces the given VF for all applicable epilogue "
+             "loops."));
+
+static cl::opt<unsigned> EpilogueVectorizationMinVF(
+    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
+    cl::desc("Only loops with vectorization factor equal to or larger than "
+             "the specified value are considered for epilogue vectorization."));
 
 /// Loops with a known constant trip count below this number are vectorized only
 /// if no scalar iteration overheads are incurred.
@@ -458,7 +475,9 @@ public:
   /// is generated around the vectorized (and scalar epilogue) loops consisting
   /// of various checks and bypasses. Return the pre-header block of the new
   /// loop.
-  BasicBlock *createVectorizedLoopSkeleton();
+  /// In the case of epilogue vectorization, this function is overridden to
+  /// handle the more complex control flow around the loops.
+  virtual BasicBlock *createVectorizedLoopSkeleton();
 
   /// Widen a single instruction within the innermost loop.
   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
@@ -721,7 +740,13 @@ protected:
   /// Create new phi nodes for the induction variables to resume iteration count
   /// in the scalar epilogue, from where the vectorized loop left off (given by
   /// \p VectorTripCount).
-  void createInductionResumeValues(Loop *L, Value *VectorTripCount);
+  /// In cases where the loop skeleton is more complicated (e.g. epilogue
+  /// vectorization) and the resume values can come from an additional bypass
+  /// block, the \p AdditionalBypass pair provides information about the bypass
+  /// block and the end value on the edge from bypass to this loop.
+  void createInductionResumeValues(
+      Loop *L, Value *VectorTripCount,
+      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
 
   /// Complete the loop skeleton by adding debug MDs, creating appropriate
   /// conditional branches in the middle block, preparing the builder and
@@ -747,6 +772,11 @@ protected:
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
 
+  /// Allow subclasses to override and print debug traces before/after vplan
+  /// execution, when trace information is requested.
+  virtual void printDebugTracesAtStart(){};
+  virtual void printDebugTracesAtEnd(){};
+
   /// The original loop.
   Loop *OrigLoop;
 
@@ -886,6 +916,128 @@ private:
   Value *reverseVector(Value *Vec) override;
 };
 
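+// For reference, the three command-line options introduced further up are
+// exercised by the new tests at the end of this patch; their RUN lines
+// invoke, for example:
+//
+//   opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S
+//   opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization \
+//       -epilogue-vectorization-force-VF=2 -S
+//   opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization \
+//       -epilogue-vectorization-minimum-VF=4 -S
+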
+/// Encapsulate information regarding vectorization of a loop and its epilogue.
+/// This information is meant to be updated and used across two stages of
+/// epilogue vectorization.
+struct EpilogueLoopVectorizationInfo {
+  ElementCount MainLoopVF = ElementCount::getFixed(0);
+  unsigned MainLoopUF = 0;
+  ElementCount EpilogueVF = ElementCount::getFixed(0);
+  unsigned EpilogueUF = 0;
+  BasicBlock *MainLoopIterationCountCheck = nullptr;
+  BasicBlock *EpilogueIterationCountCheck = nullptr;
+  BasicBlock *SCEVSafetyCheck = nullptr;
+  BasicBlock *MemSafetyCheck = nullptr;
+  Value *TripCount = nullptr;
+  Value *VectorTripCount = nullptr;
+
+  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
+                                unsigned EUF)
+      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
+        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
+    assert(EUF == 1 &&
+           "A high UF for the epilogue loop is likely not beneficial.");
+  }
+};
+
+/// An extension of the inner loop vectorizer that creates a skeleton for a
+/// vectorized loop that has its epilogue (residual) also vectorized.
+/// The idea is to run the vplan on a given loop twice: first to set up the
+/// skeleton and vectorize the main loop, and second to complete the skeleton
+/// from the first step and vectorize the epilogue. This is achieved by
+/// deriving two concrete strategy classes from this base class and invoking
+/// them in succession from the loop vectorizer planner.
+class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
+public:
+  InnerLoopAndEpilogueVectorizer(
+      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+      DominatorTree *DT, const TargetLibraryInfo *TLI,
+      const TargetTransformInfo *TTI, AssumptionCache *AC,
+      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
+      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
+      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
+        EPI(EPI) {}
+
+  // Override this function to handle the more complex control flow around the
+  // three loops.
+  BasicBlock *createVectorizedLoopSkeleton() final override {
+    return createEpilogueVectorizedLoopSkeleton();
+  }
+
+  /// The interface for creating a vectorized skeleton using one of two
+  /// different strategies, each corresponding to one execution of the vplan
+  /// as described above.
+  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+
+  /// Holds and updates state information required to vectorize the main loop
+  /// and its epilogue in two separate passes. This setup helps us avoid
+  /// regenerating and recomputing runtime safety checks. It also helps us to
+  /// shorten the iteration-count-check path length for the cases where the
+  /// iteration count of the loop is so small that the main vector loop is
+  /// completely skipped.
+  EpilogueLoopVectorizationInfo &EPI;
+};
+
+/// A specialized derived class of inner loop vectorizer that performs
+/// vectorization of *main* loops in the process of vectorizing loops and their
+/// epilogues.
+class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerMainLoop( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *main loop* strategy (ie the first pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check once for the main loop (when \p + /// ForEpilogue is false) and once for the epilogue loop (when \p + /// ForEpilogue is true). + BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, + bool ForEpilogue); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; + +// A specialized derived class of inner loop vectorizer that performs +// vectorization of *epilogue* loops in the process of vectorizing loops and +// their epilogues. +class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, + EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, + llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *epilogue loop* strategy (ie the second pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check after the main vector loop has + /// finished to see if there are any iterations left to execute by either + /// the vector epilogue or the scalar epilogue. + BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, + BasicBlock *Bypass, + BasicBlock *Insert); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; } // end namespace llvm /// Look for a meaningful debug location on the instruction or it's @@ -1077,6 +1229,9 @@ public: /// then this vectorization factor will be selected if vectorization is /// possible. VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); + VectorizationFactor + selectEpilogueVectorizationFactor(const ElementCount MaxVF, + const LoopVectorizationPlanner &LVP); /// Setup cost-based decisions for user vectorization factor. void selectUserVectorizationFactor(ElementCount UserVF) { @@ -1610,6 +1765,16 @@ private: Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. 
+  /// \p VF is the vectorization factor chosen for the original loop.
+  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -1652,6 +1817,9 @@ public:
 
   /// Values to ignore in the cost model when VF > 1.
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+  /// Profitable vector factors.
+  SmallVector<VectorizationFactor, 8> ProfitableVFs;
 };
 
 } // end namespace llvm
 
@@ -3139,9 +3307,13 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   return Lp;
 }
 
-void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
-                                                      Value *VectorTripCount) {
+void InnerLoopVectorizer::createInductionResumeValues(
+    Loop *L, Value *VectorTripCount,
+    std::pair<BasicBlock *, Value *> AdditionalBypass) {
   assert(VectorTripCount && L && "Expected valid arguments");
+  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
+          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
+         "Inconsistent information about additional bypass.");
   // We are going to resume the execution of the scalar loop.
   // Go over all of the induction variables that we found and fix the
   // PHIs that are left in the scalar version of the loop.
@@ -3160,6 +3332,7 @@ void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
     // Copy original phi DL over to the new one.
     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
     Value *&EndValue = IVEndValues[OrigPhi];
+    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
     if (OrigPhi == OldInduction) {
       // We know what the end value is.
       EndValue = VectorTripCount;
@@ -3172,8 +3345,19 @@ void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
       EndValue->setName("ind.end");
-    }
 
+      // Compute the end value for the additional bypass (if applicable).
+      if (AdditionalBypass.first) {
+        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
+        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
+                                         StepType, true);
+        CRD =
+            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
+        EndValueFromAdditionalBypass =
+            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+        EndValueFromAdditionalBypass->setName("ind.end");
+      }
+    }
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
@@ -3183,6 +3367,11 @@ void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
     // value.
     for (BasicBlock *BB : LoopBypassBlocks)
       BCResumeVal->addIncoming(II.getStartValue(), BB);
+
+    if (AdditionalBypass.first)
+      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
+                                            EndValueFromAdditionalBypass);
+
     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
   }
 }
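A minimal sketch of the two shapes this call now takes (the block and value
names are taken from the epilogue skeleton code later in this patch):

    // Ordinary vectorization, or the first (main loop) pass: resume values
    // merge only the bypass start values and the vector trip count.
    createInductionResumeValues(Lp, CountRoundDown);

    // Second (epilogue) pass: when the epilogue iteration count check sends
    // control straight to the scalar preheader, the resume phi needs an extra
    // incoming value on that edge, namely the main loop's vector trip count.
    createInductionResumeValues(Lp, CountRoundDown,
                                {VecEpilogueIterationCountCheck,
                                 EPI.VectorTripCount} /* AdditionalBypass */);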
@@ -5495,6 +5684,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
                  << " because it will not generate any vector instructions.\n");
       continue;
     }
+
+    // If profitable, add it to the ProfitableVFs list.
+    if (VectorCost < ScalarCost) {
+      ProfitableVFs.push_back(VectorizationFactor(
+          {ElementCount::getFixed(i), (unsigned)VectorCost}));
+    }
+
     if (VectorCost < Cost) {
       Cost = VectorCost;
       Width = i;
@@ -5518,6 +5714,117 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
   return Factor;
 }
 
+bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
+    const Loop &L, ElementCount VF) const {
+  // Cross iteration phis such as reductions need special handling and are
+  // currently unsupported.
+  if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
+        return Legal->isFirstOrderRecurrence(&Phi) ||
+               Legal->isReductionVariable(&Phi);
+      }))
+    return false;
+
+  // Phis with uses outside of the loop require special handling and are
+  // currently unsupported.
+  for (auto &Entry : Legal->getInductionVars()) {
+    // Look for uses of the value of the induction at the last iteration.
+    Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
+    for (User *U : PostInc->users())
+      if (!L.contains(cast<Instruction>(U)))
+        return false;
+    // Look for uses of the penultimate value of the induction.
+    for (User *U : Entry.first->users())
+      if (!L.contains(cast<Instruction>(U)))
+        return false;
+  }
+
+  // Induction variables that are widened require special handling that is
+  // currently not supported.
+  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
+        return !(isScalarAfterVectorization(Entry.first, VF) ||
+                 isProfitableToScalarize(Entry.first, VF));
+      }))
+    return false;
+
+  return true;
+}
+
+bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
+    const ElementCount VF) const {
+  // FIXME: We need a much better cost-model to take different parameters such
+  // as register pressure, code size increase and cost of extra branches into
+  // account. For now we apply a very crude heuristic and only consider loops
+  // with vectorization factors larger than a certain value.
+  // We also consider epilogue vectorization unprofitable for targets that
+  // don't consider interleaving beneficial (e.g. MVE).
+  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
+    return false;
+  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
+    return true;
+  return false;
+}
+
+VectorizationFactor
+LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
+    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
+  VectorizationFactor Result = VectorizationFactor::Disabled();
+  if (!EnableEpilogueVectorization) {
+    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
+    return Result;
+  }
+
+  if (!isScalarEpilogueAllowed()) {
+    LLVM_DEBUG(
+        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
+                  "allowed.\n";);
+    return Result;
+  }
+
+  // Not really a cost consideration, but check for unsupported cases here to
+  // simplify the logic.
+ if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { + LLVM_DEBUG( + dbgs() << "LEV: Unable to vectorize epilogue because the loop is " + "not a supported candidate.\n";); + return Result; + } + + if (EpilogueVectorizationForceVF > 1) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); + if (LVP.hasPlanWithVFs( + {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) + return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + else { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization forced factor is not viable.\n";); + return Result; + } + } + + if (TheLoop->getHeader()->getParent()->hasOptSize() || + TheLoop->getHeader()->getParent()->hasMinSize()) { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization skipped due to opt for size.\n";); + return Result; + } + + if (!isEpilogueVectorizationProfitable(MainLoopVF)) + return Result; + + for (auto &NextVF : ProfitableVFs) + if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && + LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) + Result = NextVF; + + if (Result != VectorizationFactor::Disabled()) + LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " + << Result.Width.getFixedValue() << "\n";); + return Result; +} + std::pair LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; @@ -7122,6 +7429,8 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; + ILV.printDebugTracesAtStart(); + //===------------------------------------------------===// // // Notice: any optimization or new instruction that go @@ -7137,6 +7446,8 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. ILV.fixVectorizedLoop(); + + ILV.printDebugTracesAtEnd(); } void LoopVectorizationPlanner::collectTriviallyDeadInstructions( @@ -7243,6 +7554,276 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } +//===--------------------------------------------------------------------===// +// EpilogueVectorizerMainLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton(""); + + // Generate the code to check the minimum iteration count of the vector + // epilogue (see below). + EPI.EpilogueIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); + EPI.EpilogueIterationCountCheck->setName("iter.check"); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + BasicBlock *SavedPreHeader = LoopVectorPreHeader; + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.SCEVSafetyCheck = SavedPreHeader; + + // Generate the code that checks at runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. 
+  SavedPreHeader = LoopVectorPreHeader;
+  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
+
+  // If a safety check was generated, save/overwrite it.
+  if (SavedPreHeader != LoopVectorPreHeader)
+    EPI.MemSafetyCheck = SavedPreHeader;
+
+  // Generate the iteration count check for the main loop, *after* the check
+  // for the epilogue loop, so that the path-length is shorter for the case
+  // that goes directly through the vector epilogue. The longer-path length for
+  // the main loop is compensated for, by the gain from vectorizing the larger
+  // trip count. Note: the branch will get updated later on when we vectorize
+  // the epilogue.
+  EPI.MainLoopIterationCountCheck =
+      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
+
+  // Generate the induction variable.
+  OldInduction = Legal->getPrimaryInduction();
+  Type *IdxTy = Legal->getWidestInductionType();
+  Value *StartIdx = ConstantInt::get(IdxTy, 0);
+  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+  EPI.VectorTripCount = CountRoundDown;
+  Induction =
+      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+                              getDebugLocFromInstOrOperands(OldInduction));
+
+  // Skip induction resume value creation here because they will be created in
+  // the second pass. If we created them here, they wouldn't be used anyway,
+  // because the vplan in the second pass still contains the inductions from
+  // the original loop.
+
+  return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
+void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
+  LLVM_DEBUG({
+    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
+           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+           << ", Main Loop UF:" << EPI.MainLoopUF
+           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
+  });
+}
+
+void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
+  DEBUG_WITH_TYPE(VerboseDebug, {
+    dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
+  });
+}
+
+BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
+    Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
+  assert(L && "Expected valid Loop.");
+  assert(Bypass && "Expected valid bypass basic block.");
+  unsigned VFactor =
+      ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
+  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
+  Value *Count = getOrCreateTripCount(L);
+  // Reuse existing vector loop preheader for TC checks.
+  // Note that a new preheader block is generated for the vector loop.
+  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  IRBuilder<> Builder(TCCheckBlock->getTerminator());
+
+  // Generate code to check if the loop's trip count is less than VF * UF of
+  // the main vector loop.
+  auto P =
+      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+
+  Value *CheckMinIters = Builder.CreateICmp(
+      P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
+      "min.iters.check");
+
+  if (!ForEpilogue)
+    TCCheckBlock->setName("vector.main.loop.iter.check");
+
+  // Create new preheader for vector loop.
+  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
+                                   DT, LI, nullptr, "vector.ph");
+
+  if (ForEpilogue) {
+    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
+                                 DT->getNode(Bypass)->getIDom()) &&
+           "TC check is expected to dominate Bypass");
+
+    // Update dominator for Bypass & LoopExit.
+ DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + LoopBypassBlocks.push_back(TCCheckBlock); + + // Save the trip count so we don't have to regenerate it in the + // vec.epilog.iter.check. This is safe to do because the trip count + // generated here dominates the vector epilog iter check. + EPI.TripCount = Count; + } + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + return TCCheckBlock; +} + +//===--------------------------------------------------------------------===// +// EpilogueVectorizerEpilogueLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +BasicBlock * +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton("vec.epilog."); + + // Now, compare the remaining count and if there aren't enough iterations to + // execute the vectorized epilogue skip to the scalar part. + BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; + VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); + LoopVectorPreHeader = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "vec.epilog.ph"); + emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, + VecEpilogueIterationCountCheck); + + // Adjust the control flow taking the state info from the main loop + // vectorization into account. + assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && + "expected this to be saved from the previous pass."); + EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopVectorPreHeader); + + DT->changeImmediateDominator(LoopVectorPreHeader, + EPI.MainLoopIterationCountCheck); + + EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + if (EPI.SCEVSafetyCheck) + EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + if (EPI.MemSafetyCheck) + EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + DT->changeImmediateDominator( + VecEpilogueIterationCountCheck, + VecEpilogueIterationCountCheck->getSinglePredecessor()); + + DT->changeImmediateDominator(LoopScalarPreHeader, + EPI.EpilogueIterationCountCheck); + DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); + + // Keep track of bypass blocks, as they feed start values to the induction + // phis in the scalar loop preheader. 
+  if (EPI.SCEVSafetyCheck)
+    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
+  if (EPI.MemSafetyCheck)
+    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
+  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
+
+  // Generate a resume induction for the vector epilogue and put it in the
+  // vector epilogue preheader.
+  Type *IdxTy = Legal->getWidestInductionType();
+  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
+                                         LoopVectorPreHeader->getFirstNonPHI());
+  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
+  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
+                           EPI.MainLoopIterationCountCheck);
+
+  // Generate the induction variable.
+  OldInduction = Legal->getPrimaryInduction();
+  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Value *StartIdx = EPResumeVal;
+  Induction =
+      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+                              getDebugLocFromInstOrOperands(OldInduction));
+
+  // Generate induction resume values. These variables save the new starting
+  // indexes for the scalar loop. They are used to test if there are any tail
+  // iterations left once the vector loop has completed.
+  // Note that when the vectorized epilogue is skipped due to the iteration
+  // count check, then the resume value for the induction variable comes from
+  // the trip count of the main vector loop, hence passing the AdditionalBypass
+  // argument.
+  createInductionResumeValues(Lp, CountRoundDown,
+                              {VecEpilogueIterationCountCheck,
+                               EPI.VectorTripCount} /* AdditionalBypass */);
+
+  AddRuntimeUnrollDisableMetaData(Lp);
+  return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
+BasicBlock *
+EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
+    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
+
+  assert(EPI.TripCount &&
+         "Expected trip count to have been saved in the first pass.");
+  assert((!isa<Instruction>(EPI.TripCount) ||
+          DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(),
+                        Insert)) &&
+         "saved trip count does not dominate insertion point.");
+  Value *TC = EPI.TripCount;
+  IRBuilder<> Builder(Insert->getTerminator());
+  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
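+
+  // Illustrative arithmetic (hypothetical numbers, not taken from this
+  // patch): with an original trip count of 100 and a main loop VF x UF of
+  // 4 x 12 = 48, the main vector loop covers 96 iterations and
+  // n.vec.remaining is 4. A vector epilogue with VF = 2, UF = 1 is then
+  // entered (4 >= 2), covers all 4 remaining iterations, and the scalar
+  // remainder loop is skipped entirely.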
+
+  // Generate code to check if the loop's trip count is less than VF * UF of
+  // the vector epilogue loop.
+  auto P =
+      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+
+  Value *CheckMinIters = Builder.CreateICmp(
+      P, Count,
+      ConstantInt::get(Count->getType(),
+                       EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
+      "min.epilog.iters.check");
+
+  ReplaceInstWithInst(
+      Insert->getTerminator(),
+      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+
+  LoopBypassBlocks.push_back(Insert);
+  return Insert;
+}
+
+void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
+  LLVM_DEBUG({
+    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
+           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+           << ", Main Loop UF:" << EPI.MainLoopUF
+           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
+  });
+}
+
+void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
+  DEBUG_WITH_TYPE(VerboseDebug, {
+    dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
+  });
+}
+
 bool LoopVectorizationPlanner::getDecisionAndClampRange(
     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
@@ -8608,16 +9189,51 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       });
     } else {
       // If we decided that it is *legal* to vectorize the loop, then do it.
-      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                             &LVL, &CM, BFI, PSI);
-      LVP.executePlan(LB, DT);
-      ++LoopsVectorized;
-
-      // Add metadata to disable runtime unrolling a scalar loop when there are
-      // no runtime checks about strides and memory. A scalar loop that is
-      // rarely used is not worth unrolling.
-      if (!LB.areSafetyChecksAdded())
-        DisableRuntimeUnroll = true;
+
+      // Consider vectorizing the epilogue too if it's profitable.
+      VectorizationFactor EpilogueVF =
+          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+      if (EpilogueVF.Width.isVector()) {
+
+        // The first pass vectorizes the main loop and creates a scalar
+        // epilogue to be vectorized by executing the plan (potentially with a
+        // different factor) again shortly afterwards.
+        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
+                                          EpilogueVF.Width.getKnownMinValue(),
+                                          1);
+        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
+                                           EPI, &LVL, &CM, BFI, PSI);
+
+        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
+        LVP.executePlan(MainILV, DT);
+        ++LoopsVectorized;
+
+        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
+        formLCSSARecursively(*L, *DT, LI, SE);
+
+        // Second pass vectorizes the epilogue and adjusts the control flow
+        // edges from the first pass.
+        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
+        EPI.MainLoopVF = EPI.EpilogueVF;
+        EPI.MainLoopUF = EPI.EpilogueUF;
+        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
+                                                 ORE, EPI, &LVL, &CM, BFI, PSI);
+        LVP.executePlan(EpilogILV, DT);
+        ++LoopsEpilogueVectorized;
+
+        if (!MainILV.areSafetyChecksAdded())
+          DisableRuntimeUnroll = true;
+      } else {
+        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
+                               &LVL, &CM, BFI, PSI);
+        LVP.executePlan(LB, DT);
+        ++LoopsVectorized;
+
+        // Add metadata to disable runtime unrolling a scalar loop when there
+        // are no runtime checks about strides and memory. A scalar loop that
+        // is rarely used is not worth unrolling.
+        if (!LB.areSafetyChecksAdded())
+          DisableRuntimeUnroll = true;
+      }
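+
+      // Simplified sketch of the resulting control flow, using the block
+      // names the new tests below check for (the runtime SCEV/memory checks
+      // are generated only once and are shared by both vector loops):
+      //
+      //   iter.check                  --too few iters--> scalar.ph
+      //   iter.check                  --> vector.memcheck
+      //   vector.memcheck             --conflict-------> scalar.ph
+      //   vector.memcheck             --> vector.main.loop.iter.check
+      //   vector.main.loop.iter.check --too few iters--> vec.epilog.ph
+      //   vector.main.loop.iter.check --> vector.ph --> vector.body
+      //   vector.body                 --> middle.block
+      //   middle.block                --all iters done--> exit
+      //   middle.block                --> vec.epilog.iter.check
+      //   vec.epilog.iter.check       --remainder too small--> scalar.ph
+      //   vec.epilog.iter.check       --> vec.epilog.ph
+      //   vec.epilog.ph               --> vec.epilog.vector.body
+      //   vec.epilog.vector.body      --> vec.epilog.middle.block
+      //   vec.epilog.middle.block     --> scalar.ph or exit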
      // Report the vectorization decision.
      ORE->emit([&]() {
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
new file mode 100644
index 00000000000..7cb5f34fa57
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
@@ -0,0 +1,133 @@
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s
+
+; TODO: For now test for the `-epilogue-vectorization-minimum-VF` option. In
+; the future we need to replace this with a more meaningful test of the
+; epilogue vectorization cost-model.
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-minimum-VF=4 -S | FileCheck %s --check-prefix=CHECK-MIN-4
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s --check-prefix=CHECK-MIN-D
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Do not vectorize epilogues for loops with the minsize attribute.
+; CHECK-LABEL: @f1
+; CHECK-NOT: vector.main.loop.iter.check
+; CHECK-NOT: vec.epilog.iter.check
+; CHECK-NOT: vec.epilog.ph
+; CHECK-NOT: vec.epilog.vector.body
+; CHECK-NOT: vec.epilog.middle.block
+
+define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #0 {
+entry:
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd fast float %0, %1
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Do not vectorize epilogues for loops with the optsize attribute.
+; CHECK-LABEL: @f2
+; CHECK-NOT: vector.main.loop.iter.check
+; CHECK-NOT: vec.epilog.iter.check
+; CHECK-NOT: vec.epilog.ph
+; CHECK-NOT: vec.epilog.vector.body
+; CHECK-NOT: vec.epilog.middle.block
+
+define dso_local void @f2(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #1 {
+entry:
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd fast float %0, %1
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Do not vectorize the epilogue for loops with VF less than the default
+; -epilogue-vectorization-minimum-VF of 16.
+; CHECK-MIN-D-LABEL: @f3
+; CHECK-MIN-D-NOT: vector.main.loop.iter.check
+; CHECK-MIN-D-NOT: vec.epilog.iter.check
+; CHECK-MIN-D-NOT: vec.epilog.ph
+; CHECK-MIN-D-NOT: vec.epilog.vector.body
+; CHECK-MIN-D-NOT: vec.epilog.middle.block
+
+; Specify a smaller minimum VF (via `-epilogue-vectorization-minimum-VF=4`) and
+; make sure the epilogue gets vectorized in that case.
+; CHECK-MIN-4-LABEL: @f3
+; CHECK-MIN-4: vector.main.loop.iter.check
+; CHECK-MIN-4: vec.epilog.iter.check
+; CHECK-MIN-4: vec.epilog.ph
+; CHECK-MIN-4: vec.epilog.vector.body
+; CHECK-MIN-4: vec.epilog.middle.block
+
+define dso_local void @f3(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) {
+entry:
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd fast float %0, %1
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { minsize }
+attributes #1 = { optsize }
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
new file mode 100644
index 00000000000..82f73e2b7c4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -0,0 +1,593 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -S | FileCheck %s --check-prefix VF-TWO-CHECK
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S | FileCheck %s --check-prefix VF-FOUR-CHECK
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #0 {
+; VF-TWO-CHECK-LABEL: @f1(
+; VF-TWO-CHECK-NEXT:  entry:
+; VF-TWO-CHECK-NEXT:    [[AA1:%.*]] = bitcast float* [[AA:%.*]] to i8*
+; VF-TWO-CHECK-NEXT:    [[BB3:%.*]] = bitcast float* [[BB:%.*]] to i8*
+; VF-TWO-CHECK-NEXT:    [[CC6:%.*]] = bitcast float* [[CC:%.*]] to i8*
+; VF-TWO-CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label
[[FOR_END:%.*]] +; VF-TWO-CHECK: iter.check: +; VF-TWO-CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 2 +; VF-TWO-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; VF-TWO-CHECK: vector.memcheck: +; VF-TWO-CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[AA]], i64 [[WIDE_TRIP_COUNT]] +; VF-TWO-CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; VF-TWO-CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[BB]], i64 [[WIDE_TRIP_COUNT]] +; VF-TWO-CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8* +; VF-TWO-CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[CC]], i64 [[WIDE_TRIP_COUNT]] +; VF-TWO-CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8* +; VF-TWO-CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP45]] +; VF-TWO-CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[BB3]], [[SCEVGEP2]] +; VF-TWO-CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; VF-TWO-CHECK-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP78]] +; VF-TWO-CHECK-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[CC6]], [[SCEVGEP2]] +; VF-TWO-CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; VF-TWO-CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; VF-TWO-CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true +; VF-TWO-CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; VF-TWO-CHECK: vector.main.loop.iter.check: +; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK12:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 48 +; VF-TWO-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK12]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; VF-TWO-CHECK: vector.ph: +; VF-TWO-CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 48 +; VF-TWO-CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; VF-TWO-CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; VF-TWO-CHECK: vector.body: +; VF-TWO-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VF-TWO-CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF-TWO-CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; VF-TWO-CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; VF-TWO-CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 +; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 20 +; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 24 +; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 28 +; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 32 +; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 36 +; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 40 +; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 44 +; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP1]] +; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP2]] +; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP3]] +; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP4]] +; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP5]] +; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = 
getelementptr inbounds float, float* [[BB]], i64 [[TMP6]] +; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP7]] +; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP8]] +; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP9]] +; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP10]] +; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP11]] +; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP25]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 4 +; VF-TWO-CHECK-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, <4 x float>* [[TMP27]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 8 +; VF-TWO-CHECK-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP29]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 12 +; VF-TWO-CHECK-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP31]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 16 +; VF-TWO-CHECK-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, <4 x float>* [[TMP33]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 20 +; VF-TWO-CHECK-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, <4 x float>* [[TMP35]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 24 +; VF-TWO-CHECK-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, <4 x float>* [[TMP37]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 28 +; VF-TWO-CHECK-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, <4 x float>* [[TMP39]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 32 +; VF-TWO-CHECK-NEXT: [[TMP41:%.*]] = bitcast float* [[TMP40]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, <4 x float>* [[TMP41]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 36 +; VF-TWO-CHECK-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, <4 x float>* [[TMP43]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 40 +; VF-TWO-CHECK-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, <4 x float>* [[TMP45]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 44 
+; VF-TWO-CHECK-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, <4 x float>* [[TMP47]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP1]] +; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP2]] +; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP3]] +; VF-TWO-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP4]] +; VF-TWO-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP5]] +; VF-TWO-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP6]] +; VF-TWO-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP7]] +; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP8]] +; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP9]] +; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP10]] +; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP11]] +; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 4 +; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = bitcast float* [[TMP62]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <4 x float>, <4 x float>* [[TMP63]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 8 +; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 12 +; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 16 +; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = bitcast float* [[TMP68]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <4 x float>, <4 x float>* [[TMP69]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 20 +; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 24 +; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 28 +; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = bitcast float* [[TMP74]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <4 x float>, <4 x float>* [[TMP75]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 32 +; 
VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 36 +; VF-TWO-CHECK-NEXT: [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 40 +; VF-TWO-CHECK-NEXT: [[TMP81:%.*]] = bitcast float* [[TMP80]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, <4 x float>* [[TMP81]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 44 +; VF-TWO-CHECK-NEXT: [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD35:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD24]] +; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[WIDE_LOAD13]], [[WIDE_LOAD25]] +; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[WIDE_LOAD14]], [[WIDE_LOAD26]] +; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[WIDE_LOAD15]], [[WIDE_LOAD27]] +; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[WIDE_LOAD16]], [[WIDE_LOAD28]] +; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[WIDE_LOAD17]], [[WIDE_LOAD29]] +; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[WIDE_LOAD18]], [[WIDE_LOAD30]] +; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[WIDE_LOAD19]], [[WIDE_LOAD31]] +; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = fadd fast <4 x float> [[WIDE_LOAD20]], [[WIDE_LOAD32]] +; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = fadd fast <4 x float> [[WIDE_LOAD21]], [[WIDE_LOAD33]] +; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = fadd fast <4 x float> [[WIDE_LOAD22]], [[WIDE_LOAD34]] +; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = fadd fast <4 x float> [[WIDE_LOAD23]], [[WIDE_LOAD35]] +; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP1]] +; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP2]] +; VF-TWO-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP3]] +; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP4]] +; VF-TWO-CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP5]] +; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP6]] +; VF-TWO-CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP7]] +; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP8]] +; VF-TWO-CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP9]] +; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP10]] +; VF-TWO-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP11]] +; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP109]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr 
inbounds float, float* [[TMP96]], i32 4 +; VF-TWO-CHECK-NEXT: [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP111]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 8 +; VF-TWO-CHECK-NEXT: [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP113]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 12 +; VF-TWO-CHECK-NEXT: [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP115]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 16 +; VF-TWO-CHECK-NEXT: [[TMP117:%.*]] = bitcast float* [[TMP116]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP88]], <4 x float>* [[TMP117]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 20 +; VF-TWO-CHECK-NEXT: [[TMP119:%.*]] = bitcast float* [[TMP118]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP89]], <4 x float>* [[TMP119]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 24 +; VF-TWO-CHECK-NEXT: [[TMP121:%.*]] = bitcast float* [[TMP120]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP90]], <4 x float>* [[TMP121]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 28 +; VF-TWO-CHECK-NEXT: [[TMP123:%.*]] = bitcast float* [[TMP122]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP91]], <4 x float>* [[TMP123]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 32 +; VF-TWO-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP92]], <4 x float>* [[TMP125]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 36 +; VF-TWO-CHECK-NEXT: [[TMP127:%.*]] = bitcast float* [[TMP126]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP93]], <4 x float>* [[TMP127]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 40 +; VF-TWO-CHECK-NEXT: [[TMP129:%.*]] = bitcast float* [[TMP128]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP94]], <4 x float>* [[TMP129]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 44 +; VF-TWO-CHECK-NEXT: [[TMP131:%.*]] = bitcast float* [[TMP130]] to <4 x float>* +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP95]], <4 x float>* [[TMP131]], align 4 +; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 48 +; VF-TWO-CHECK-NEXT: [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV:!.*]] +; VF-TWO-CHECK: middle.block: +; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; VF-TWO-CHECK: vec.epilog.iter.check: +; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label 
[[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; VF-TWO-CHECK: vec.epilog.ph: +; VF-TWO-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; VF-TWO-CHECK-NEXT: [[N_MOD_VF36:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2 +; VF-TWO-CHECK-NEXT: [[N_VEC37:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF36]] +; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_BODY:%.*]] +; VF-TWO-CHECK: vec.epilog.vector.body: +; VF-TWO-CHECK-NEXT: [[INDEX38:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT39:%.*]], [[VEC_EPILOG_BODY]] ] +; VF-TWO-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[INDEX38]], 0 +; VF-TWO-CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]] +; VF-TWO-CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP136:%.*]] = bitcast float* [[TMP135]] to <2 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD41:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]] +; VF-TWO-CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP139:%.*]] = bitcast float* [[TMP138]] to <2 x float>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD42:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD41]], [[WIDE_LOAD42]] +; VF-TWO-CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]] +; VF-TWO-CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP143:%.*]] = bitcast float* [[TMP142]] to <2 x float>* +; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP140]], <2 x float>* [[TMP143]], align 4 +; VF-TWO-CHECK-NEXT: [[INDEX_NEXT39]] = add i64 [[INDEX38]], 2 +; VF-TWO-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC37]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_BODY]], !llvm.loop [[LOOPID_EV:!.*]] +; VF-TWO-CHECK: vec.epilog.middle.block: +; VF-TWO-CHECK-NEXT: [[CMP_N40:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC37]] +; VF-TWO-CHECK-NEXT: br i1 [[CMP_N40]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; VF-TWO-CHECK: scalar.ph: +; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC37]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]] +; VF-TWO-CHECK: for.body: +; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; VF-TWO-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[INDVARS_IV]] +; VF-TWO-CHECK-NEXT: [[TMP145:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; VF-TWO-CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[INDVARS_IV]] +; VF-TWO-CHECK-NEXT: [[TMP146:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP145]], [[TMP146]] +; VF-TWO-CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[INDVARS_IV]] +; VF-TWO-CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX4]], align 4 +; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; VF-TWO-CHECK-NEXT: [[EXITCOND:%.*]] = 
icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS:!.*]] +; VF-TWO-CHECK: for.end.loopexit.loopexit: +; VF-TWO-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; VF-TWO-CHECK: for.end.loopexit: +; VF-TWO-CHECK-NEXT: br label [[FOR_END]] +; VF-TWO-CHECK: for.end: +; VF-TWO-CHECK-NEXT: ret void +; +; VF-TWO-CHECK-DAG: [[LOOPID_MV]] = distinct !{[[LOOPID_MV]], [[LOOPID_DISABLE_VECT:!.*]]} +; VF-TWO-CHECK-DAG: [[LOOPID_EV]] = distinct !{[[LOOPID_EV]], [[LOOPID_DISABLE_UNROLL:!.*]], [[LOOPID_DISABLE_VECT:!.*]]} +; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_VECT]] = [[DISABLE_VECT_STR:!{!"llvm.loop.isvectorized".*}.*]] +; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_UNROLL]] = [[DISABLE_UNROLL_STR:!{!"llvm.loop.unroll.runtime.disable"}.*]] + + +entry: + %cmp1 = icmp sgt i32 %N, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4 + %add = fadd fast float %0, %1 + %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv + store float %add, float* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signext %n) #0 { +; VF-FOUR-CHECK-LABEL: @f2( +; VF-FOUR-CHECK-NEXT: entry: +; VF-FOUR-CHECK-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* +; VF-FOUR-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 1 +; VF-FOUR-CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; VF-FOUR-CHECK: iter.check: +; VF-FOUR-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; VF-FOUR-CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP0]] to i64 +; VF-FOUR-CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; VF-FOUR-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; VF-FOUR-CHECK: vector.scevcheck: +; VF-FOUR-CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; VF-FOUR-CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]]) +; VF-FOUR-CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; VF-FOUR-CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; VF-FOUR-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[MUL_RESULT]] +; VF-FOUR-CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]] +; VF-FOUR-CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]] +; VF-FOUR-CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP3]], [[TMP0]] +; VF-FOUR-CHECK-NEXT: [[TMP7:%.*]] = select i1 true, i1 [[TMP5]], i1 [[TMP6]] +; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295 +; VF-FOUR-CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; 
VF-FOUR-CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]] +; VF-FOUR-CHECK-NEXT: [[TMP11:%.*]] = or i1 false, [[TMP10]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MEM_CHECK:%.*]] +; VF-FOUR-CHECK: vector.memcheck: +; VF-FOUR-CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 [[WIDE_TRIP_COUNT]] +; VF-FOUR-CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; VF-FOUR-CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP0]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 1 +; VF-FOUR-CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP13]], [[WIDE_TRIP_COUNT]] +; VF-FOUR-CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr float, float* [[B:%.*]], i64 [[TMP14]] +; VF-FOUR-CHECK-NEXT: [[SCEVGEP34:%.*]] = bitcast float* [[SCEVGEP3]] to i8* +; VF-FOUR-CHECK-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP12]], 1 +; VF-FOUR-CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP15]] +; VF-FOUR-CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast float* [[SCEVGEP5]] to i8* +; VF-FOUR-CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP56]] +; VF-FOUR-CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] +; VF-FOUR-CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; VF-FOUR-CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; VF-FOUR-CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; VF-FOUR-CHECK: vector.main.loop.iter.check: +; VF-FOUR-CHECK-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 +; VF-FOUR-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; VF-FOUR-CHECK: vector.ph: +; VF-FOUR-CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32 +; VF-FOUR-CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; VF-FOUR-CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; VF-FOUR-CHECK: vector.body: +; VF-FOUR-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 4 +; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 8 +; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 12 +; VF-FOUR-CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 16 +; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 20 +; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 24 +; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 28 +; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 +; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 4 +; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 8 +; VF-FOUR-CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 12 +; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = add i32 [[OFFSET_IDX]], 16 +; VF-FOUR-CHECK-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 20 +; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 24 +; VF-FOUR-CHECK-NEXT: [[TMP31:%.*]] = add i32 [[OFFSET_IDX]], 28 +; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[TMP24]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP33:%.*]] = xor i32 [[TMP25]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP26]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP35:%.*]] = xor i32 [[TMP27]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP28]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP37:%.*]] = xor i32 [[TMP29]], 
-1 +; VF-FOUR-CHECK-NEXT: [[TMP38:%.*]] = xor i32 [[TMP30]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP31]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = add i32 [[TMP32]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP33]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP34]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP35]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP36]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP37]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP38]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP39]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = sext i32 [[TMP40]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = sext i32 [[TMP41]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = sext i32 [[TMP42]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = sext i32 [[TMP43]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP52:%.*]] = sext i32 [[TMP44]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP53:%.*]] = sext i32 [[TMP45]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP54:%.*]] = sext i32 [[TMP46]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP55:%.*]] = sext i32 [[TMP47]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]] +; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]] +; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]] +; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]] +; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP52]] +; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP53]] +; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP54]] +; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP55]] +; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP64]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = bitcast float* [[TMP65]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP66]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> undef, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -4 +; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP67]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = bitcast float* [[TMP68]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP69]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> undef, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -8 +; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP70]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = bitcast float* [[TMP71]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP72]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> undef, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -12 +; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP73]], i32 -3 +; 
VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = bitcast float* [[TMP74]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP75]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> undef, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -16 +; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP76]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = bitcast float* [[TMP77]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP78]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> undef, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -20 +; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float* [[TMP79]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP81:%.*]] = bitcast float* [[TMP80]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, <4 x float>* [[TMP81]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x float> [[WIDE_LOAD16]], <4 x float> undef, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -24 +; VF-FOUR-CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds float, float* [[TMP82]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = bitcast float* [[TMP83]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, <4 x float>* [[TMP84]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x float> [[WIDE_LOAD18]], <4 x float> undef, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -28 +; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds float, float* [[TMP85]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = bitcast float* [[TMP86]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, <4 x float>* [[TMP87]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x float> [[WIDE_LOAD20]], <4 x float> undef, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE]], +; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE9]], +; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE11]], +; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE13]], +; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = fadd fast <4 x float> [[REVERSE15]], +; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = fadd fast <4 x float> [[REVERSE17]], +; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = fadd fast <4 x float> [[REVERSE19]], +; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = fadd fast <4 x float> [[REVERSE21]], +; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]] +; VF-FOUR-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]] +; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]] +; VF-FOUR-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]] +; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]] +; VF-FOUR-CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]] +; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]] +; VF-FOUR-CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds 
float, float* [[A]], i64 [[TMP23]] +; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP88]], <4 x float>* [[TMP105]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 4 +; VF-FOUR-CHECK-NEXT: [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP89]], <4 x float>* [[TMP107]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 8 +; VF-FOUR-CHECK-NEXT: [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP90]], <4 x float>* [[TMP109]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 12 +; VF-FOUR-CHECK-NEXT: [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP91]], <4 x float>* [[TMP111]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 16 +; VF-FOUR-CHECK-NEXT: [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP92]], <4 x float>* [[TMP113]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 20 +; VF-FOUR-CHECK-NEXT: [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP93]], <4 x float>* [[TMP115]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 24 +; VF-FOUR-CHECK-NEXT: [[TMP117:%.*]] = bitcast float* [[TMP116]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP94]], <4 x float>* [[TMP117]], align 4 +; VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 28 +; VF-FOUR-CHECK-NEXT: [[TMP119:%.*]] = bitcast float* [[TMP118]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP95]], <4 x float>* [[TMP119]], align 4 +; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP120]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV_CM:!.*]] +; VF-FOUR-CHECK: middle.block: +; VF-FOUR-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; VF-FOUR-CHECK: vec.epilog.iter.check: +; VF-FOUR-CHECK-NEXT: [[IND_END27:%.*]] = trunc i64 [[N_VEC]] to i32 +; VF-FOUR-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; VF-FOUR-CHECK: vec.epilog.ph: +; VF-FOUR-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: [[N_MOD_VF22:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; VF-FOUR-CHECK-NEXT: [[N_VEC23:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF22]] +; VF-FOUR-CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC23]] to i32 +; VF-FOUR-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; VF-FOUR-CHECK: vec.epilog.vector.body: +; VF-FOUR-CHECK-NEXT: 
[[INDEX24:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[TMP121:%.*]] = add i64 [[INDEX24]], 0 +; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX29:%.*]] = trunc i64 [[INDEX24]] to i32 +; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = add i32 [[OFFSET_IDX29]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP123:%.*]] = xor i32 [[TMP122]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = add i32 [[TMP123]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP125:%.*]] = sext i32 [[TMP124]] to i64 +; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP125]] +; VF-FOUR-CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds float, float* [[TMP126]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP129:%.*]] = bitcast float* [[TMP128]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP129]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x float> [[WIDE_LOAD30]], <4 x float> undef, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[TMP130:%.*]] = fadd fast <4 x float> [[REVERSE31]], +; VF-FOUR-CHECK-NEXT: [[TMP131:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP121]] +; VF-FOUR-CHECK-NEXT: [[TMP132:%.*]] = getelementptr inbounds float, float* [[TMP131]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP133:%.*]] = bitcast float* [[TMP132]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP130]], <4 x float>* [[TMP133]], align 4 +; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT25]] = add i64 [[INDEX24]], 4 +; VF-FOUR-CHECK-NEXT: [[TMP134:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC23]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP134]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV_CM:!.*]] +; VF-FOUR-CHECK: vec.epilog.middle.block: +; VF-FOUR-CHECK-NEXT: [[CMP_N28:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC23]] +; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N28]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; VF-FOUR-CHECK: vec.epilog.scalar.ph: +; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEM_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL26:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEM_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]] +; VF-FOUR-CHECK: for.body: +; VF-FOUR-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL26]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[TMP135:%.*]] = xor i32 [[I_014]], -1 +; VF-FOUR-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP135]], [[N]] +; VF-FOUR-CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 +; VF-FOUR-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]] +; VF-FOUR-CHECK-NEXT: [[TMP136:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; VF-FOUR-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP136]], 1.000000e+00 +; VF-FOUR-CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; VF-FOUR-CHECK-NEXT: store float [[CONV3]], float* 
[[ARRAYIDX5]], align 4 +; VF-FOUR-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; VF-FOUR-CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1 +; VF-FOUR-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; VF-FOUR-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS_CM:!.*]] +; VF-FOUR-CHECK: for.end.loopexit.loopexit: +; VF-FOUR-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; VF-FOUR-CHECK: for.end.loopexit: +; VF-FOUR-CHECK-NEXT: br label [[FOR_END]] +; VF-FOUR-CHECK: for.end: +; VF-FOUR-CHECK-NEXT: ret i32 0 + +; VF-FOUR-CHECK-DAG: [[LOOPID_MV_CM]] = distinct !{[[LOOPID_MV_CM]], [[LOOPID_DISABLE_VECT_CM:!.*]]} +; VF-FOUR-CHECK-DAG: [[LOOPID_EV_CM]] = distinct !{[[LOOPID_EV_CM]], [[LOOPID_DISABLE_UNROLL_CM:!.*]], [[LOOPID_DISABLE_VECT_CM:!.*]]} +; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_VECT_CM]] = [[DISABLE_VECT_STR_CM:!{!"llvm.loop.isvectorized".*}.*]] +; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_UNROLL_CM]] = [[DISABLE_UNROLL_STR_CM:!{!"llvm.loop.unroll.runtime.disable"}.*]] + +entry: + %cmp1 = icmp sgt i32 %n, 1 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %n, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %i.014 = phi i32 [ 0, %for.body.preheader ], [ %inc, %for.body ] + %1 = xor i32 %i.014, -1 + %sub2 = add i32 %1, %n + %idxprom = sext i32 %sub2 to i64 + %arrayidx = getelementptr inbounds float, float* %B, i64 %idxprom + %2 = load float, float* %arrayidx, align 4 + %conv3 = fadd fast float %2, 1.000000e+00 + %arrayidx5 = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %conv3, float* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %inc = add nuw nsw i32 %i.014, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret i32 0 +} + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-spe" "unsafe-fp-math"="true" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index cf6cc1356e0..ed352559b5b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -25,22 +25,22 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT5]], <16 x i32*> 
undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT]], <16 x i32*> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT6]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT6]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT6]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15 @@ -51,9 +51,9 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[A]], null +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32* [[A]], null ; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4 -; CHECK-NEXT: br i1 [[CMP]], label [[LATCH]], label [[COND_LOAD:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[LATCH]], label [[COND_LOAD:%.*]] ; CHECK: cond_load: ; CHECK-NEXT: [[ALOAD:%.*]] = load i32, i32* [[A]], align 4 ; CHECK-NEXT: br label [[LATCH]] @@ -61,7 +61,7 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK-NEXT: [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ] ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7 +; CHECK-NEXT: 
br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP7:!llvm.loop !.*]] ; CHECK: for.end: ; CHECK-NEXT: [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[A_LCSSA_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index be1b7c2d7ae..1a6f59316a4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -54,7 +54,7 @@ define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b ; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP12]], [[BIN_RDX]] @@ -75,7 +75,7 @@ define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b ; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7 +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP7:!llvm.loop !.*]] ; CHECK: for.end: ; CHECK-NEXT: [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T4]] @@ -141,7 +141,7 @@ define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT6]], <16 x i32*> [[BROADCAST_SPLAT8]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -161,7 +161,7 @@ define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, ; CHECK: latch: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14 +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP14:!llvm.loop !.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -242,7 +242,7 @@ define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT19]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop !23 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP23:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -264,7 +264,7 @@ define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* ; CHECK: latch: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !24 +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP24:!llvm.loop !.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 8a4d46c0fa6..0eab7618bbe 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -61,7 +61,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -84,7 +84,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -176,7 +176,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -199,7 +199,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -291,7 +291,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; 
AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -314,7 +314,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -391,7 +391,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -414,7 +414,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -506,7 +506,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -529,7 +529,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX2: 
for.end: ; AVX2-NEXT: ret void ; @@ -621,7 +621,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -644,7 +644,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -731,7 +731,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -755,7 +755,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -851,7 +851,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -875,7 +875,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label 
[[FOR_BODY]], !llvm.loop !30 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -971,7 +971,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -995,7 +995,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1131,7 +1131,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]] ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1155,7 +1155,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX: for.inc: ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]] ; AVX: for.end: ; AVX-NEXT: ret void ; @@ -1251,7 +1251,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1275,7 +1275,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 
[[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1384,7 +1384,7 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 -; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1409,7 +1409,7 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !50 +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP50:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1664,7 +1664,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1687,7 +1687,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50 +; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP50:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -1808,7 +1808,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !56, !noalias !58 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !59 +; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP59:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1831,7 +1831,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; AVX512: for.inc: ; AVX512-NEXT: 
[[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !60 +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP60:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1963,7 +1963,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !41 +; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -1989,7 +1989,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !42 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP42:!llvm.loop !.*]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2087,7 +2087,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP51:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2113,7 +2113,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !52 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP52:!llvm.loop !.*]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2211,7 +2211,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 +; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP61:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], 
[[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2237,7 +2237,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !62 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP62:!llvm.loop !.*]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -2380,7 +2380,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !44 +; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP44:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2406,7 +2406,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !45 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP45:!llvm.loop !.*]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2504,7 +2504,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 +; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP54:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2530,7 +2530,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !55 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP55:!llvm.loop !.*]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2628,7 +2628,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !64 +; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP64:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2654,7 +2654,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !65 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP65:!llvm.loop !.*]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll new file mode 100644 index 00000000000..e4d62a649ed --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll @@ -0,0 +1,100 @@ +; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" + +; Currently we cannot handle reduction loops. +; CHECK: LV: Checking a loop in "f1" +; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate. + +define signext i32 @f1(i8* noalias %A, i32 signext %n) { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %sum.02 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nuw nsw i32 %sum.02, %conv + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ] + ret i32 %sum.0.lcssa +} + +; Currently we cannot handle live-out variables that are recurrences. +; CHECK: LV: Checking a loop in "f2" +; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate. 
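+; (Illustrative sketch, not part of the checked output: the C-like source below is
+; one plausible reading of the IR for f2; variable names and types are assumptions
+; added for illustration only.)
+;
+;   char *A; int i = 0;
+;   for (; i < n; i++)
+;     A[i] += 1;
+;   return i;  /* the final counter value is the live-out recurrence */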
+ +define signext i32 @f2(i8* noalias %A, i32 signext %n) { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %add = add i8 %0, 1 + %arrayidx3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + store i8 %add, i8* %arrayidx3, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + %inc.lcssa.wide = phi i64 [ %indvars.iv.next, %for.body ] + %1 = trunc i64 %inc.lcssa.wide to i32 + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %for.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Currently we cannot handle widened/truncated inductions. +; CHECK: LV: Checking a loop in "f3" +; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate. + +define void @f3(i8* noalias %A, i32 signext %n) { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %0 = trunc i64 %indvars.iv to i32 + %conv = trunc i32 %0 to i8 + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + store i8 %conv, i8* %arrayidx, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll new file mode 100644 index 00000000000..243d3e9ebed --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py + +; To test epilogue-vectorization we need to make sure that the vectorizer actually vectorizes the loop. +; Without a target triple this becomes difficult, unless we force vectorization through user hints. +; Currently user provided vectorization hints prevent epilogue vectorization unless the forced +; VF is the same as the epilogue vectorization VF. To make these tests target independent we'll use a +; trick where both VFs are forced to be the same value. +; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -force-vector-width=2 -epilogue-vectorization-force-VF=2 -S | FileCheck %s --check-prefix VF-TWO-CHECK + +target datalayout = "e-m:e-i64:64-n32:64" + +; Some limited forms of live-outs (non-reduction, non-recurrences) are supported.
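+
+; For illustration only (hypothetical source, not matched by FileCheck): the
+; kind of live-out that @f1 below exercises is the last value of a plain
+; per-iteration computation, roughly:
+;
+;   int f1(int *A, int *B, int n) {
+;     int res = 0;
+;     for (int i = 0; i < n; ++i)
+;       res = A[i] + B[i];
+;     return res;   /* last value of res is live out */
+;   }
+;
+; Since res is neither a reduction nor a recurrence, epilogue vectorization can
+; handle it: each middle block extracts the last vector lane of the final add,
+; as the extractelement instructions in the checks below show.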
+define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) { +; VF-TWO-CHECK-LABEL: @f1( +; VF-TWO-CHECK-NEXT: entry: +; VF-TWO-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; VF-TWO-CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; VF-TWO-CHECK: iter.check: +; VF-TWO-CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 2 +; VF-TWO-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; VF-TWO-CHECK: vector.main.loop.iter.check: +; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 2 +; VF-TWO-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; VF-TWO-CHECK: vector.ph: +; VF-TWO-CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2 +; VF-TWO-CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; VF-TWO-CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; VF-TWO-CHECK: vector.body: +; VF-TWO-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VF-TWO-CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF-TWO-CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; VF-TWO-CHECK: middle.block: +; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1 +; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; VF-TWO-CHECK: vec.epilog.iter.check: +; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; VF-TWO-CHECK: vec.epilog.ph: +; VF-TWO-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; VF-TWO-CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2 +; VF-TWO-CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF4]] +; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; VF-TWO-CHECK: vec.epilog.vector.body: +; VF-TWO-CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX6]], 0 +; 
VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]] +; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP10]] +; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0 +; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>* +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD10]] +; VF-TWO-CHECK-NEXT: [[INDEX_NEXT7]] = add i64 [[INDEX6]], 2 +; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC5]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; VF-TWO-CHECK: vec.epilog.middle.block: +; VF-TWO-CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]] +; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 +; VF-TWO-CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; VF-TWO-CHECK: vec.epilog.scalar.ph: +; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]] +; VF-TWO-CHECK: for.body: +; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; VF-TWO-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; VF-TWO-CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; VF-TWO-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]] +; VF-TWO-CHECK: for.end.loopexit.loopexit: +; VF-TWO-CHECK-NEXT: [[ADD_LCSSA3:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; VF-TWO-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; VF-TWO-CHECK: for.end.loopexit: +; VF-TWO-CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA3]], [[FOR_END_LOOPEXIT_LOOPEXIT]] ] +; VF-TWO-CHECK-NEXT: br label [[FOR_END]] +; VF-TWO-CHECK: for.end: +; VF-TWO-CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; VF-TWO-CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, 
align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %0, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ] + ret i32 %res.0.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll new file mode 100644 index 00000000000..8f5ffb11283 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -0,0 +1,402 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py + +; To test epilogue-vectorization we need to make sure that the vectorizer actually vectorizes the loop. +; Without a target triple this becomes difficult, unless we force vectorization through user hints. +; Currently user provided vectorization hints prevent epilogue vectorization unless the forced +; VF is the same as the epilogue vectorization VF. To make these tests target independent we'll use a +; trick where both VFs are forced to be the same value. Mismatching VFs are tested in target specific tests. +; RUN: opt -passes='loop-vectorize' -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S %s | FileCheck %s + +; Some simpler cases are found profitable even without triple or user hints. +; RUN: opt -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -S %s | FileCheck --check-prefix=CHECK-PROFITABLE-BY-DEFAULT %s + +target datalayout = "e-m:e-i64:64-n32:64-v128:128:128" + +define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) { +; CHECK-LABEL: @f1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AA1:%.*]] = bitcast float* [[AA:%.*]] to i8* +; CHECK-NEXT: [[BB3:%.*]] = bitcast float* [[BB:%.*]] to i8* +; CHECK-NEXT: [[CC6:%.*]] = bitcast float* [[CC:%.*]] to i8* +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[AA]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[BB]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8* +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[CC]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP45]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[BB3]], [[SCEVGEP2]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP78]] +; CHECK-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[CC6]], [[SCEVGEP2]] +; CHECK-NEXT: 
[[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK12:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK12]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD13]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP10]], align 4, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF14:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC15:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF14]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX16]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0 +; 
CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, <4 x float>* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, <4 x float>* [[TMP18]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = fadd fast <4 x float> [[WIDE_LOAD19]], [[WIDE_LOAD20]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP19]], <4 x float>* [[TMP22]], align 4 +; CHECK-NEXT: [[INDEX_NEXT17]] = add i64 [[INDEX16]], 4 +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC15]] +; CHECK-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC15]] +; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP12:!llvm.loop !.*]] +; CHECK: for.end.loopexit.loopexit: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp1 = icmp sgt i32 %N, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4 + %add = fadd fast float %0, %1 + %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv + store float %add, float* %arrayidx4, align 4 
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signext %n) { +; CHECK-LABEL: @f2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 1 +; CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP3]], [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 true, i1 [[TMP5]], i1 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295 +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP11:%.*]] = or i1 false, [[TMP10]] +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP13]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr float, float* [[B:%.*]], i64 [[TMP14]] +; CHECK-NEXT: [[SCEVGEP34:%.*]] = bitcast float* [[SCEVGEP3]] to i8* +; CHECK-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP12]], 1 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP15]] +; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast float* [[SCEVGEP5]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP56]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: 
vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP17]], -1 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[N]] +; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 -3 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP24]], align 4, !alias.scope !13 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <4 x float> [[REVERSE]], +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP26]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP25]], <4 x float>* [[TMP28]], align 4, !alias.scope !16, !noalias !13 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[IND_END13:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] +; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC9]] to i32 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX10]], 0 +; CHECK-NEXT: [[OFFSET_IDX15:%.*]] = trunc i64 [[INDEX10]] to i32 +; CHECK-NEXT: [[TMP31:%.*]] = add i32 [[OFFSET_IDX15]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[TMP31]], -1 +; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP32]], [[N]] +; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP35]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP36]], i32 -3 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast float* [[TMP37]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, <4 x float>* 
[[TMP38]], align 4 +; CHECK-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x float> [[WIDE_LOAD16]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = fadd fast <4 x float> [[REVERSE17]], +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast float* [[TMP41]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP39]], <4 x float>* [[TMP42]], align 4 +; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX10]], 4 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[TMP43]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP44:%.*]] = xor i32 [[I_014]], -1 +; CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP44]], [[N]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP45:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP45]], 1.000000e+00 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP20:!llvm.loop !.*]] +; CHECK: for.end.loopexit.loopexit: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + %cmp1 = icmp sgt i32 %n, 1 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %n, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %i.014 = phi i32 [ 0, %for.body.preheader ], [ %inc, %for.body ] + %1 = xor i32 %i.014, -1 + %sub2 = add i32 %1, %n + %idxprom = sext i32 %sub2 to i64 + %arrayidx = getelementptr inbounds float, float* %B, i64 %idxprom + %2 = load float, float* %arrayidx, align 4 + %conv3 = fadd fast float %2, 1.000000e+00 + %arrayidx5 = 
getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %conv3, float* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %inc = add nuw nsw i32 %i.014, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret i32 0 +} + +define void @f3(i8* noalias %A, i64 %n) { +; CHECK-PROFITABLE-BY-DEFAULT-LABEL: @f3( +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: iter.check: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: vector.main.loop.iter.check: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: vector.ph: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: vector.body: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> , <4 x i8>* [[TMP3]], align 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-PROFITABLE-BY-DEFAULT: middle.block: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.iter.check: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.ph: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.vector.body: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; 
CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = add i64 [[INDEX4]], 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>* +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> , <2 x i8>* [[TMP8]], align 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT5]] = add i64 [[INDEX4]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.middle.block: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.scalar.ph: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: for.body: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store i8 1, i8* [[ARRAYIDX]], align 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[N]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]] +; CHECK-PROFITABLE-BY-DEFAULT: for.end.loopexit.loopexit: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK-PROFITABLE-BY-DEFAULT: for.end.loopexit: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[FOR_END:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: for.end: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv + store i8 1, i8* %arrayidx, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp ne i64 %iv.next, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} -- 2.11.0