From abe5caba6f71e0ce4fb145ebb3a7d36bed73d6e7 Mon Sep 17 00:00:00 2001 From: Varuna Jayasiri Date: Tue, 26 Jan 2021 16:54:23 +0530 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9A=20glu=20variants?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/optimizers/noam_lr.png | Bin 0 -> 35533 bytes docs/optimizers/radam_r_t.png | Bin 0 -> 30948 bytes docs/transformers/configs.html | 568 +++++++----- docs/transformers/feed_forward.html | 178 +++- .../transformers/glu_variants/experiment.html | 140 +-- docs/transformers/glu_variants/simple.html | 868 ++++++++++++++---- labml_nn/transformers/configs.py | 49 +- labml_nn/transformers/feed_forward.py | 24 +- .../transformers/glu_variants/experiment.py | 9 +- labml_nn/transformers/glu_variants/simple.py | 106 ++- 10 files changed, 1390 insertions(+), 552 deletions(-) create mode 100644 docs/optimizers/noam_lr.png create mode 100644 docs/optimizers/radam_r_t.png diff --git a/docs/optimizers/noam_lr.png b/docs/optimizers/noam_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..b8945b93f0d030a5f6b86a339c5cecb0370e44fc GIT binary patch literal 35533 zcmdqJWmJ}3yDm(3iP9yVf;7_I4H8PHk|N!m(%o>=AOfODhlF%@cS(1{p6IjQ^}g#{ zdyM`2;}4AczMR*LGmhgp&uOT#q7*s`Aqo@}6uOMGxGEGBj1LqP^b!&b_@wTe+&3sF zA1E1d5p@^6-FajWw9N;^4`1G9SkM2)c)+&9rO%i6r26KC?57-c6|XOQoa)J)~H z#Kpmpz{L_J_Js~tRv1p%+}@VK9fOF#YCXr|U>wbn_~dnWeRtGJrmd~L=9Iu^zk${u z`tgORG&0x~&W;kOL}{wPz(BT?aHcUtUZ25C0r$J>GyBbf7dkaq`--U?p>t&)+k!E( zcpVvPamn-GVDTZT7U@ty@-OJ$NzgB?1IA z{10cpUz-^(ZEkX#_QZ2p&z$bhyFc9D9lFJnDHh?kNJ>gpY)pK4+n37VMd`wNx@<`p z*A8|hjoUUT0~XVl#MHpxd4g<;6#3ZLD!@Y#-*3|bLFVn4mxI6IP9L!p*u`~XJ z`(k4-YtRa+_Oh3yS-3J&(39{-z^1RXaY?k6KO*M-NPh46-t1ro-{#hqG(C*VT33vK z`*}bEzuiiEi!bc*)#&KxcCb3y5=1Q6>0rTgb0Fie&-hok>t=>C3@q%ng@?!O%|Y|S zPeiia`TEJBp~Hv!gNLP*gg?vhez@Z#?y!X;BYbYUT&Sbza=P`Tv~O^5a%9AB z5~>`_t8=|Ksf{s!-z*agQh@MyTqP7f=jP@@zRmuye!I0R8p1ZKen<409}b<2a>4zev7+Lo8M?>J zyM+cb=9_SxTXUi6e=v3&Q=<9fTW=XJ=M1+3O| z$P>zjhK8=aXX^KoII#G7gQeQv;+=b6f(EX(`#n*RfT-G_< zADo=PdAij*?Ztm)Wo321S+Sq_{@ylI5t>V@+9u3QQSsza7d(YQ9AM}*<2zcGPu;(w3Pu1F+j~~g5TS86lVR1ZdQJ;=t=j6;k z(6}LbZ()HA&2KgJ(%Ra3ofC=E?w@nBGBSe6KiM2)EN7yl>+0?G4^%5u6yfCMP1R$; z4s|=<)pqnyJtJ%(;Vr0cb>0peaApX zem`C^4h}m!JRA)T4FR1{TnApARj}9%eQ9cHY9NEp`*JzB&r}*KzXw?s1c9Zl7|Q;h zGVzjg?l;X|0=wu_uZG@x9j_th*Py-cOJ;rX=KJN@q&^6Ns4TEvNf!(5QJ3^coFtEH zewN{i)6vnP5+=9;JU0lFYynJzS+_2ngb#OwnU62cYxnNvqU!s1U@@c3-riH;_=vei zfA#>Xt~tx|oS=bgtBaK1C3?S=hL*P3)dxEP8m(EbP%$HUv-FzV&pc>y6pU;p3v0Kw zv9Uk()%bAfC)UE@`=wDOp+NkgJ4wwa78YQ4EW)^K7tp5ZW2C!aM2o(HXf;Mf7nK&1 znR$PIb1<09DpnkcWnl#-&52AY6&av+P;0+=zCS-(ZM%s3;tjv|efk^{`B=!~P}okp zQxFJb_=okZ)833iCja!$pT2W2RKZ9* zf&0oLpB1@I=Z3$-;4iSdv{m*7ZAu`Kx2y2K8)SWasU0TfjQyV%&9(WT6aRVh#bLHU zCzuLkCZ^lO=xYrYC(6`#MO*iU58mFZ$Lu~XYG-HX;Bd+~7Ge-glP8^%JqqRp_n zZ9r46`=9q)gkcy^AS;Wh)y}#OWlowy5fKoa?M#+C?o5E7f`<}lqWyDj&SPr`QV~&5 zP*7FH`Av@|ZhLF%bZaG`Vs-B)I+_ z%gHj56S@SK!yvHR(x%wiK?NVHPqGEdtD(5q*)4&cgP`v3=QmNRJqnI9KK_{z5guOB zMt>Tapa;Iw`u=?V+aFd0SXjbIz!jeBzj*QD?)DaZ&27*EWu`v}a019g`}_Nfii#jj z=~bE_H^8#mSLSpMkjN+6e#h)Q#I}N|*&4wqBzf z3BOClEzKA}6d-N1&1JkaY_)B^lccX}9=Z_txQ5iIM};kdH$67?EYYyAjrsiaG*9lw zj~@pOCyJt?qSN;A$7g4O{dx^9BBiv4OXy-q#W6ip%6KbuOJGB`&p?I+7{ekMgT%m8 zLj(7lr@H!B$?MP87YB<43TYrY{Qmu0gTE>ns-K6y8>}pa^RT{Lzxn>|V$u7c@dB4g zD>^;BFW$(=sOkQC=dVSdflSIwPj4t285x-?6&0j7lP%(3rc>KmT2B6Vk#}$}8$JzH zRaI0NOfcy;5%X5crgeFq4oh`Lk(uwe&X(ur!=uK;#8i-ZdUzy{#M7$*Z~=#!#H{N# z!10ZALFH29Vt?LvVu;ITj>n`6%g9uM-=z&;&pbBdF?bl4%fsb9m6q?{Iot#|z9Hg- zKm}p+fv{|3Ym2E+mXPqe+GhTT)pV;mNLMSS_na}{`So{vw}u_ 
z-wAlY+0&LG>Ohj^-@kiGr|TRUPeT*9Vl1aC*Fkbht~Vb@_kDnlUm4=y=VxV&p?^et zb-fkYg@sa7j2=JOxVSX*U=L)$S}{m?^BQ7R)Yap4gE9sDM{V&xKTD9tn1y$jFpq{0 zaFI2|E<%Mn1o5@I$~1da2JYqZ;c^=+0^0M*#P0t7@u8s)A3pdDfZRe6gOqy2=e$4U z?CkK*KeHZ5Nt5 zol{d_8;a{YCb=cj(UTZBthm!GVX?WT?%szF*bcn?{r$i#cXxNcS5=Xt+JF3*Y%vZ^ zOiGGD@3PzqZ`KbSA*En^Y)oBEE$*H(0rnEu`Ik3UMx-!;S8GqVmRehZBR5_DW;FBk z@)Epta0#qH6L|0G=~-Z6ZH=Dq=Lch^Q1#O1GxXhj1QGX0uWIpk=Ys_XrPfqFXJ&i6 z`%b#70MmExq|1XzWYYuI*ViSIcK7x|<0_WyaZ6nfm!Q8DpaAoo5~E0nn6p%>ekSHT zOBH5uv>AXU^_9UVL1aiY8zh-#0Ha}e1O>f@vPC{?meZi30Q^2*=jg9fcwhv@o$iKe zz|PJtt?=~e(^}dD@{QTDy#;P`u%VYJe>814;8}CpEHa31#4+*U)|gc zW(j41ObmjiC}e1QnqWbCgN=Br2O&RUW z+4y1|RQ$%`Fr2HglQ))oP1-W#>l^g!jhT^&R8U|bJb#5NfmV?<_ zwmSI2YP~0+PnbA9=kq*3BpH0pY?dXz{JFyg26%$R_RF}APDA_s0`F(Z5NJZWBCuBQ z5@cEshF&^tkF2kC$Mq3{yzi+mz4%Cb!W88|K=#4zI?M9DPuXW0rZs#FXQzqexN8qMH*^(WCUQaR=tk^R&f@t?#QJt@1Sy`|`vW#qbhqeN&TaFbmW#u$u-=gV?D@n( zk-#M$qNz>;ZSju5nePu0QOEnH*x*xIbsDOv=Q>N7D9FM~O_WC4uJqrz5@3*$s_%0Y z*Sr#1%6cpbJsSkwEHc+8=V`33u~wcY$oLSZ_RJ6bffM6|4j$yDkf-vvbUoZ$pC6oD zxmKZuLz4dUotEBdH=^yowVNqWQD*lqMb38OSg2GRsF05$dZ>ia|Fu{PDVjKiF>kad zH~i{e9flIywGxzYP8B~LtKU@yVRaVE!;B66%?0=^MNSl|_T)O8PBYJ*Y4XMn;A~D- zL`;z0jHQ3n$EIv|{Ll4}JWZlQnxw{C^B;U&AWo5CFYA?hN>;r_`IDlZ=szFL7^gX= zM;hb_iVHlxT@{CYW^VjyWlAeV>Oa?nI#fkc#`xeh$r71vl%Lah;cw6@gP?S5$^My8 z)T)5(UkJ+>D)-G{*np2bN$q44jXBRIPJ)NqA+lAue)rV~l3nB@Y%uEE1aNs|2Lufv z$KPWhLvb()5vl$!paB;TkNveNcBlzR)P0FJ=X=Ho8TsaAJHxq%@dZj*q`0`h_!%fE zjkibg5AFFJf5Qn$yKe9b2{m`e(Sr0`2UrIHB}NmI(f>NaR=Z$jq_H#ZhHg=6(q`G9 zJ;eQrUnwY0gdSba7m8HSgNMCb<;@$y#{0{o)yK5|qpIp|JxL!#TS`jGFQuh`J^TQ4 zx}Zc}N$J2JQxE_Ja+7lqOlGQW2jv~!85lraS-gLboWctjveEu|45d_EeLdf+SLmet z$tm?DsdFGMKAes!`TF_-fBxeXNDjq8|2k{jCT=HR;YR9ka?}L>??J&H_A|wBwC{!I z3jY8Te3&Smx034~Ej<`yf@nuFWv`Rj%`qGdQ|eK1nS3tup3UoT-H^We81TsFvei&i zP)H`gqp8j~a+9WIZrYH6Q#{BLKR?FT@)z0)c+O@5bd^uyI9#1x) zDLC)Va60X3ZP>8qZzDkv=K#_h8dBCe2N1I9800nsg1UiX|!q_ zwpD?gl5z&9BmlmJwD@`eTHRiqq>O;*wJgKN&Yr?KPT^b+^VhA_Fhq`5yVWp+s8PpR zFXEL(vcHbfFue*cC2yxU!4{TB;pG}b&uoz8;^Nxa*zof5a&mH-nVIptIq&awJ)zZb zgbEH01}N;PH4wFra4TWNRi?YsF-cSir>+7}Ngr>ZV{D%>GczBihlYms<>E5y)Vv#} zkd2uZ3qmuT@CUd}bP*4?>Grs{%5suz2f+I!83XR7>F>O2HkwO+Vwr`k0S?lux|2*H z?V?C|nzybUu$bbMq|f7ozd)Cw`QgHxEqXtgPN+X=IXXJ>M?inm=*Fo{XKZ0HQ>a9C zIk~v_0FVv{vKoL^}V8Re41{xX%AZmcd=>>_m zK}m8}cArw4Y(A;Ss#GSTpALNEzPs!iInz9T@;s0EajLj1dUGH#0~RQv)%sCfd^-@K z9g3vCN6|LOJM_%Q^@o|Pxya_)n$t?gpdsW36nrHq&dKwvaZgkMkomVf_WGdpL^`Ao zCn?YbqWJNXtRUa%v&;{1^oklGJd*#-U;%9(f9oZK9c#tgbru@a7Wwnb? 
zl<7ZXz6IbELHj>^D@qr8gB1la{5OfqEb-#>KKZ_QqRR50FK?r!K#%>n>xF10Sm{O{ z)8oa%m^EYA6k~6Ptd|f_Bl4dU;IYC(W38=)ZYq!H?JErKoVj=&0^D5oKhxN+hEc`n z80QAUwcZ)2Fl5ptGonK?VW)7sRjy7pg991}j7apBug`WsthfWJmhDeJ`0nnttPgH(ZU+mEX$e`(+ra1FymnXHNp7AyK|md{0-`6c`I&VxVfB3`>zc))6RTLW+O2w|m^3O&+2h#y*Npv9Z_% zYg=19Zl_zop3w<8LI@mE%}h+jK-31JdlZ?F9-TVtQq+It&({nz0%X{BLl`@AIo#e7 z=$l2JNVhfQ^;Wm53%3H4?F^3&Vm6SDEQ5=WzqPZ2`gIa4r=elwDLohtar<(u`zZ@c zdFWGGFChD_lgWx00Pgtql}|=S#-iuNJmAhvUUxIy-R}YY1|R5NA$G&V!3pOqHhT+E zP{3sBi;1lQ8dE5T9V)J-X8$XkKu}N+#CbBW%VnT4Z(;MB18)N|0?U*;!1y-62QxBA zbU_C3S8wvT`lzft1c(@~^L~14tY{Mczp`F&+CFP^h)}vCIVw8fP+uBWpoLW4KG?mn zd7>>D0{<8c^9l+?SI=*+>}6T+^m`Lu9L&{iCf)iWn|<4kOWe&Pu-SPSoC{>u`YMN4FdJDFFv~& zbs7y7YjB9@2T)#Q0jJL5wk5Ss3WC;Yw1`?&3yYEy3&sev=HTR9&M=!8wU=-8{sfd4 z`7A*Jrd@^HteBXu4%;ID^*Q_=12XXI9`Ho&l3H3&5P}Ds|DXaO1qQMVM@#6I97AOJ zoBL;B15}pi^!(K3HDw+S7$kD8YJOOsv1FiFi{Ix=3v;_19}6|84%wx%n-Ap4#BJ4c z+b#sqsJQwo30}cnXIChztLNqCJKkM8mN01~*b~ldf-+C_yJ46K?~{IRBfD)6)1geU zdJ;0SK`R+KIXVA#i?#N&o_H`fpUaz!O0KT14*7>(1ew0e2dE$go8w-YfsRgETG|I* zQ&Y3xJE)3DV{I^_!`2XTzKT3pS%D#QdIV0o`CT&%1To-jR^mq9p+Z(A*<4EYQS;MpJMGt_AT7lAHo% zx%QAJL(7Jl5Q=xw^+<#amX4`%V*dcu;_IYq3U1)QoX0P zTcb3UA9H1KF+Ee8VzBNmC{HGV65gWgCW*`77Pg0~n&46S8z4FmMhuUB2x-wTPu#zf zlvElAXzUjygMTS`o*)ybB%#}833_HfZ7nTc!QyautY*z`pj1Gm{q!p0gyk*{!WS<&7x#kNg$wkC&!TZTo=JS)iwEJ88Zim0bd-|KXj428V%GL-VI@yz;^rObY!eSkGjF~`*A)*hkio_?}p*qp1uIXvl zN7;s`?3U!^_pG7ck@&1ImFZWt*r6V1eAjGwyVngC+Uqu*P|W#>PKMmBb#0Q3;tJC<_VZ9(SK&YWpM{6I>m%|HbJZQSX zV5^*?#9_z`04%fEuq~(xBuVqZOrg8er}h&G$J#@uY`(8Vc>jtMhiH!}B=N=D5MQ3` zv`76K2-n!KcIlu_EXXX3tlBm|uUjh)3JI(}a>eUd9Q?$B<*d|~TZl@kNBm5?(33xX zgGI31-yU4X21??^s=KM40vI-S^B#)4B=#@y?}YNorRh9^r~w&OXQK>A?8h=%K|w*M zZc~tdL(Z>|Q4=B0fVPt4;pv%1?-=rWp)zJZeU5fSn=g*%ozCJZ*$fQQ5rt1d>Lf-ct zD{Ohgxl*WsqUPr2=;-J`vkSRmHU1^6e+6n5(L+yZC@NxN;kChJSx}nZ7pq17W=tO2;wk%~V0Q z)-ez-T>_t;kV`G}qP1m+O`ma-Vkt3q$_%fA|Lcj)fIdFpRL zkD7;t9?K#6Nfp^q(a?+#pv_l`j_9zXD+22!oJ$OYov zwrMBbu5eQ>kdGlzVIxm#i=q4T9QXaDvI{)tbkB~h?i6e*TQr8$CZBcH@5-^%eLX2< z4N!(%{zG>cvlA3G@EzcxBjUdrfki5WGB3&6CoHtSwpD=b&bZ^^ zCx{mtILofXNJj*J9->O3WAK?Hthj>|`OHp^xk;E0C9!$i?)pt4Qnnlad0EVy^`I8g zmn+Eu6JLK1qi@XfTYN=p=njbxOI}R4(Hj+A2y&xC=H>yot zA~ZI0Uv&sp$sbt;Jk-L4&$g|^PFC5%cZj?SNG7kpW}3^&vsNTK!oTC}r;RFNNk6!D z#K32|Z6y^!l7B#Z9mS`MR)%A({H^>dyfM?Wd)PZBHT>xGG>igRt4f$5FAV`dvSRB4 z8j>CMpOI+L>-?j|^hw<%aWD5B-z8vv;wHhIsycm+nrPyU1F)2eBk>YTi`%ZAxcW8wNoQNYTBbA$NbDQNb>j zw~V?~B?gSUazcF_FUg86Vek`L^`foII@ccM3DVa;M!rPTL(7F~B&lYgNwGZsEWj3- zw@Fhs7vUh@oBB2oVywkx+u11@vh&ZVl1gpl#uGYcRHz|24aRJ=zlHV-lPRZ1Xdlf~ z%jeNdKRlHf9CGgG3}bp8csiGpkGHOg+-98==ncUyR~8M?{X3GxRR{H{%EaV*R}=Jv z7-`esy*2jNE!G=Hn=U+rh`3(xFD4a zxShTmE$Bbp9&Kw~&`D4IpKPPHw)XV8sHn(n9sI=_r5cK8tJRpz;?j7KCXB00ou@iI zZT{rcM;n-gYv`m6d7cgM%$*lL{e69Jsw^C@j@PrZvpFp%96*l`HFe+=NU!;>CmX?- zWL^MV0vz{$E#~{QwvHeloAsx1eq57wc6O$vE&S<^$f8{tpr?d*2)YsCdO|wJ9TY&W zZEOS{QOBMXqbWb3obcS4mY_>&khG@ldMj6%RqtnD^fl~!FYkWB){#8LnI!o&EHRaPX3l=9 z5!za4bXV7gYO@C|ERG|2G8!5heP#*EKZ(xI@9XwZmm#(%VNBVC_26uxD7j3M`b{?{ zRp~QR0+o$dMce^WM$=WDdQw(q8+5k#ztzaMC%zEn^KhVJD#^>6$fDvg2kf&kZEbG@ zP7t7t`P>GG;D+QcUhIKh55UAoNl71WPcTozUz>eUQBiS90c|p6&4#uzG;tA=_C9ML zP~<2ri@0LN1ACc~x62Me#zEHRiNF?(K!L4^e*=?TVKrR|Di3j%^sB-7uV!Htd*ouJ zElF~cGW5mX+&T)$_P3D+cZ)h|=pOl`^V~4`F=baP zQ^aK2p=CAfxaHV9Sk04`ZAj(2_;HA$hyQ6WsYpp7$SVNKwD<;4sUfqe>FM(Hk!V2e z`~x^2DZ>8^aZ*WZK$N$3cS|9VpE^NsLZD^t?N5|PJ*(+vAK1`ev^lIX{lyO|=&g<2 zi-A@gxx6+9D{>#$6$umcM(WU-rCko$p_nH4-wI*qCzRD9NaJK6ot@#fECIg1Io<{;t6$=CPApyWP=uJ&bgtvZx*&BC8 zGHRBYAguU6-GjEZ*(yuKon_jETKiyE=|>PeoH)&t3w!C*WnHQ9Dyj%gdU5!MbXmLp zY|w^wtxUdou=n%qv15Jf$C!OY(Kr(74vDoyB*=9O@D)MW-@&$F3&fRg~+vYBuF*zHe%Rh_ndFJ 
z+ovS1j#Rfq$)wUYDxn7hU4uAmCn4mhTh=BG1*bx>oi7~xdb{#etKH4n;2RAa#jE3b zx5slz(egW=2Q1(|rZa>6uMw4jb9YBSqh>R0{R11z+`C_>cQZf3R7W!55Svja?Op@=|gRRD-|c-_Hs*p!osz9R2TDua-SwJ~~J3kVa8qYTvlY2^|^qR`H{l4ZaO;qC-T?>|Z9Jt<-T1Cfa1G@=ng`C3ij zJf~jO^W4M{Wwj7qw3i@MhYd3LoRa(jkGGlF;}UkHrW_)q`I;NbRRmFJg0!yFi}F*s zBAp*ve7l#um>(xc_;+%wr}$cnS{`ks(+5AuCagSkc+pa1s|Ae6eVPSU<#Clsx=h9z z7uS&8#Ql3>qPW!F242;+%Dvs998sNDY<(&>8fV-My7ykMJVNh2R=hhF(K#E>rUdPi zH>aulPdm*tXF_b%ndW&y^$(KOwByI?32?%W1d4Fr|3i0+9#q#{C^HX zCzf_#R6X1`K}D;Kg)y$X>FXuJL>gQwU3p=KJ1#VI!*Lj*wr~|ioJw_4FZ=P-2ZrYR zDC)KKkh%ML)$AK`lD385rbrA|b`e_9ke0La^j4^i#3Xtaa?`>TB}&w1j@-Lt4G!*a zzUC#<-QAY+!5vZ;PPV(L^X+f04cA=2G7Q8qjc0woL>i2fqpHiG>%dPWrS+DTYvINq z$EjJ!RKstX`*&1VQNtG9S|gxfRST*H{?bkG}6 z#>)@9UZ~WujlbeYA8~HT6sw|i%$8j@SMKCkw|jT~?N->q7}?<)>WolKq+v{U;kWMn z2A#6|Nr|EbyGDV+mp-+o%XfiTUonPD5C!}2HkH^i;E1N0ewb>Ien$?1x%t4Vbsq*d4Q)RhHQ*p;RV%bWj+1hb4);DYlQ`^I`u=u-dVv58tDSaaJ zE5bCj?X#v|g1;($UT7?`-_6$+ra$*NaMR>{4OJ2}%IbQ#wHiPd0fqoME1ZJ7{XrrA0gXUA1_-R*sR1r7?(x= z=O4iZisAsaD@oFilB?@#EEV!1UYWfLjp`!J+^imI<NC_ zD7>VgWW~D2r>hx6lIK2I2!+vavn>S1%j2G44$gcOF7BA3xb$f`fr zf0Q^Al136`YTrIj4}$JPXG@X4Dd1|ikd{1ijn+;7;wjg3^t|!T-}%iZUJ@9tBv7aT z?ilitPM^K9T~H#>m2@NcX^-M?&y!QE`@DsAHjr7iw^56!~#AxEs8 z77%&@;I0+RM+nlRY2`MtfwB$Z-Mbf2`5PUB%PO>uO0lMSs)ubO?I@KLa&ym)MuulU zkP1}6&gmVD9yfVQ>hv{c3W(zTGb31mVL#_eZtbG>;!eZ6d)Ldh@sq=%ualiL>(nuC zpNpD3b3CMURi3t?l_O3%kDTA90;5#QO6AaooY&@FRb0_JGMPeQYt6 z9%?3jpg>J#f~)0Zm}A&~LMB1K}E%oPC+pjezSQEqULck0U z$=C+9;bY6x1%@g;HG^Vn!jTEqB)RudbLjK{VdI(Q6s-C24=y>LngZ`yKT+=^M#<3 zj=C@ce0Y*?@B;ILrI8rFo>pNI*?kqDHGuk-VOzoix2gWE*aU)oq4!G!YEVETQg}OM z4S5h&dq3;9QhaJr(&Repw?g(a!a_?XT1xs~^Cs>qnIoaW%ebY5d!7$gD$q_LPd;Q25$IaB8M9pT6{&wgCuFx_?!#v)hk>EoC&a%+Eng#`djbtVJ z?y2qpaqllz%$PU-EPg&VSnfHSb*>c{71I4Gx=GAw`dJvYiNdnT8mvM`sq7PN1;#?JjNK07{UmVVfwkXQI8}z)i58Lu%3#M&QSh`M(4{p>;qnRLt zUD9E=dXu5kZ+Q>MshfjbDh`hqWri7=&V-AWKa>fSBf%z*x3ag9Gj%TF>$kCTXy_gJ zKnY%ymMmDS>W#FgJzahe&}J9mL5I7FyHIpz(B~)GqE+N^vSR#X^;tTJxw9EED^QJt za|am%-M{4Km5rBl-1SsE5h`M0RGPi+-c;K>UhRP>J6^Q@k825}Zf5^O-e^r$m6ZGi zIkq$2K2HD3q$F$y?p5d2X5M&o}7T|dkdfv zih?RftB?lt6i{uRpP$c2Pe;HcZ7}JMDX^ zg?_7ziZ!ni7#{MI{Hz~OO7Q;db`}(9M+++Z0-B<~nE@4QDCYHy_+ zA9D~S3x0>F&@n_zDnn`#VVO6`fla$&jf2 zpqpAG%x`cbhO~593Ihl^i8;yT6ZZYSF6ND2g2kWa)O(cebWWS~k3KK{N{Y+aHZ$D) zEEG}eC6hBtztm#F{Y|V`hIMTMX&fKn1AP_TA-IuvF(~vf#P}J>VE^1F>lc5ZgN{L! z4^xp5UNYSF4d}S2RL=LXyWg$<#))H3GGdQ+9HB0%n?Y-Pq2w)!6H|Bq`4yi?Ov^Ej zD)2IM8%qou@wX=phfr(+wvtVRvg6CYl|w3<6E>iBF!RGQwVe0;!m-;yXFq2RdN;TI zJ&viUa+U%LMys=OLjT)U{Q?55oF#<|{eL2_3zeYBYwfm>M44O&mooy3n4f`|rl4?JH`SQgBI}rjQ zZzOf1v%^kI(%e4$c7A@|C!?i|LTA6!;tN_cT13sv%(lOMZvDb-H5#W7{b|y#EC&9y zpwJ2uv=y0FlD6k_xAo4t@(90|G?wYONAcKnWf4aS5Bj23aRB3-l_+jCeeIFiE$8ky zJCBKn*%LM*i4vL@_bum0fZFii;-b0aQUmwr~}1}$lm^hTr8-C3tX=* z=fRQ!tsdkZpxua+kgzA7jg9TO!{eR3ug0C=qPGTdi2sT)6yP$^As;?>Z>yM*vSPI? zX4rv7$i%nle47izmv%eE6XPGoUTDO{D<_%tq#b{YqXWv9tk~~_!|Q~@nf=2%+w1lZ zpXLhJz23_?WE1Hrg~!_!36#rwBD}n@+d_XzgF=pBe$V}AKg$K&aEHTEwt`h-BJa$$ zYIataf-_GAWhQtwV&CUz9^H8~k*jlPUsB0-A10M^@>doRnn^}YUR=DB7Xww-R| z>GRBGWp_OSG=oGoWDZ}ZxmyQz?IK~I11ta?V2bj$bzyKlnO)Yt(`>^Ja!X=tBefY5 zMD1AWc24d0GW!>b?qsRDfOp5W6Zi?*bSApY=ylo^rmA1{fZ;^dl_n8Wia>9zEh8kC|M(SSV)VOJ(YJ#b3uMKUN0x;61s9?bfcv6 zE19@oraU&@w%3yTht#xUFHv=*E2ws7Vv9jWf9O8d#W!B4B3t-7(zpyj&4ht*8Bh2< zI9`GrsxCUH$+LIKIZhwTqW`kVo8X4_O2`CQo&vPL*N6&br>Msr5Y)p2BZj%2E! 
z*_}qpxj|broEbT}Lnz0%7RY%68^D6Ku!bw3(n8-M8#!Lk@ zm>_dfgX^=^@4vXic-u9$#jmq zB+ktEJgfUEobo&ZSMQRtef*7%4(|-+3dMg7o>l7YH|36h1z6n3+*NR@n(WNKG{j_sFCd&hD*4QpyStb6am; zeMm-3=u9|oKmkaRGDcRg_6XTT&qRYtn~{aG9DfAQhW-`$rO16z(rs&AjHZ-esH^D2 z)Y3Y-gz-#hn1^H{R>^Zt)?fWA;X{WvfQ+xdRK!Aa5^8E+y?0Hgiecm#&eTMQO?K4# zq$jo%Q7?!Qn_QaUyvPkb;=EyZ72bs|#SpjeFoVqU{D4522%N-2c!O||Wr5=eDB6-LY!gZl>jWuv)1o*u zta>@=wl}Ht$mrzEzB+Ajx%l*e>foiU6)h!*R*XYCb5ddm^=*K)-1xM%(*(kwB7Y7F z)>th{VuY(kQeVa8F^5=zl&#VChAvLUS)=EzOxr{5@kkegsAED(>XxZz_uX3Mlz>=5 z+oQfE)L1-_RE=0SjfZ3R``RUQMciGF6U|^k`{vz?wK+gpOdt)Qq8Qyo$Y*{0iDeI2A9RTX`+YN7###t`IiHHbR&KHrD+Th>AbX*pgR(s5s zL0_xsu-Lkc$aBeMqKcs1483j;3)bzQD9t-Ts13)qK`YSdaV+F z_NKYm=O~3?YTIxeEsP0v4Bavpc43R63)3Y0f^&CG-M%7DiKAB5vD0&69JzZgZCiOb zl87}KECJ6zBPW3#aJT0QuJ27#QwzbbDD0GIR2d7GHEtk7Tb**GGjUY#+by8X;v7;4 z)gFaPsotRp>+->;jM;VIqgE$&fqMaCH72SueT+)Z?63!vi+QMmsu-(A^B4LgLr#qq z)00OwTmxfcEmMn`ny9eJx|y8D7(*nJdMT=r7*}ays91&1ZjIY zAu9Dt$L)c0fxq@IkxGQooB2e0vdxKcZb$ac4I#_U-G28yhD;NC*p;@WpNGpcCz8~j z=5$NtRR6j^Z+L^>viou2fFEx5$L4I8sOti2N^}$GgQr$KbAbLtCDF`xi^1a9SH;Si zQdzuElO9`bb0e6CtrtrjFox2~!^g3-OJJ%cgW6Bc`^a zCVoz;fw#LJ_>V@MAm!(^*dGF!;+>D}@Zk>YcJxdjPeQ3z`jxmX8Gdcm5IH3=aR=aY zTSTq6DMWoTK_*X>`Go3zb=Yut$`W%UDeK5bA8hv{yD;l#(moBJUhNyawed|31_hLx z*%J$3zP$jek&nz_&x2)v(!3~?zO&ki-HWg44YRJ*1xBSk_D!3FPA{7qIIeJ{hP>vr z1NHAHY`UMAUJBLkxE&2bHPXA*>2_0&-*ndEe|aL<@QjRPp~74Iw)O8mF~zE-iv??I z7`|S#*}zFaTX>HSO`K779|N7;$g}Mk>ot8df3C^rrb+U1RPRP(!61W>k<1&Ge1L(O z*=zl7?&_DFkzJ!#2|hP(Q>_2=_86=rT7?5j3yCp*509dEq%c4nyk8GI*NG~AUplDCC*fVjyY zt|7ilrkgXX{VIIf1~@!vVWO*cW*FwlPuoajCfh-9AvRfJt2mfTa6uZkXVzT^3C87S z-h%m=jfsBvx-p6Bex5QWn@Z=N~y&UW82L2JX`pM}0|5W#u zVO?$C+VBFwpadmUN>I9#MoLgnx5sKb0H<6AwBOCnKx@wON3#7Iz%Mn+F zVO9A%I@MpTQ6y$=h|RV6N;D_JNkW}WKTplKbFOTf2^EyhO3mIeIP`N-*<<2AaD@Zi zl_}{2n!3*tw`(h$a9qT`C`;}8#qDElv!C?shJHP_Z2qa(7a=A8#Mq?W2!InN9E|J2 z%A{M|PYDA&?$16x!Wn(1pjhRLc_Uiw`XXip(fx-`Rd<^M>F5%fohVEh+gzms@xLi~ z$oe+McLfwaAAi>GNyK$b+;VqLDc-d414{Wvy<*Lwsd+Pg4r+WdVF65r>>~b53ivdt zCb=XZmRxF_bE0Q7llpNh;lRET)#tPK(`E9^r0dU?xtC{rj2_shwsP8+Ap8@O9M^X} zanIvv>JNCw%-IGfooB)d`JXV%Wk2s=lU--)XR9bu3%|BaL`ayu^W;I;@b;Z5L=PQ~ zawq4|J|%aw$mS%@1T_m^o!D-_*MM5`+tjvipOIg_e#Fr-jfaHPKyC=vV*tFPtn*D> zU9`fX;t{?H7i05-v2OqjPWAUSj9ARyw98-?WqY)ZC5a4|M{-n2%b@iraw~36qrX|D@kfx=a5lj316}64! 
zy60!rctbdOl7NuVYjVT_d0Gd&p$Ur<&Sz_De{;w;MUN`$H=+4&+ztyaD6}fK!5)Q%`f!d69bWF8AIOaG$ls>ndN~G%QwL@*j~-H)G?)UwG>u z_~WU7-0HvgIhfib!W9vq6L=Hzg;w()jN&PtjOZK2ANWryG?!BePo|>6q+Q-77e1=$ zbc1u38agNzc=-F3Kp&9@SUFiFX@R~$ZCsz3#LXS+lEfzhg`BFxk^Ll>vyKu7;)Sk4 zKrL5G_2?Ih*k9${Sfq`p_2}{FEhSt*YZ&VfLV`t{7@(l8yy_(Uxu8BvGn>gg=fUk3 zt-3a=66U*H>DV^E=JP+G{B&vCY}?u8c_Y-wjOlt+3+gI!Dy>;xbVdX|liBGlW+kzy z(oc*U^()LgLNoNh)tu4Q_j2Hu`s$eWaBOfevS0kaBbU2cco(G>e_B8QZ3x-zm|OZ^ z5UQv>%CdX@yaCX&v2-;qyI;T{wK9}1!I-gghwKjTc$Vwwx#!B^@!&W-EmZpfu8-ck z%vWMOVJ~*}!cK;WykZ$LKM7k;mEp{Oug`05XQ+|Lp%Uo}QZO zDz)E+EWW~F_bp`cOlrTduF|e{UGxr?LtE9+;UP4nUPlF%k-wOsbnCx!u?VoT-f#)e z%rv#K$_or6xW9SPg(R;#aOfqpBfJFg6Ktt9dNWUsa7MHv6jWPju)d0I#O%@O@mLFp z(apJkh_c22>(To2)E&#CSus{kR{wuH(Equ-s|R=n{*-sg$P~XZ zGCFSKT?Z1K0RDPnSLn0=8iU6C+3aoXu(X34Jv?I}i~$b`>K-F34{Rtf!W@`J61zbzH`gh5Is=ZEbr_B&d7eI_+$dgAZH(M-NMYet!$k$CI4!i*j$>f*jA zB*etT`1oit!+?!H`KZ`3Tfaq?Wtc^W*0PRM;qS?cvPt=Te6rZ)^=EHCqHz3a=~X55 zRF=}mKj=TTc%Q6xnp&wGY!64Nd8j>)_kGwzD*S2o0$M!9WiNC&bpR?iDy6SNO;ucy z6O?WSJc~?%m||2q^;fS>^iq?(c@H^SN*sA}cmD~|Y^F=vUH?WB9a8=A2&wuF)HHOK zsaF#T;=mWiXxwg;=7Jh5AMTNFJ}v6-f<92#RdH5K)pEYPXh`XQs4>$V6jN2neDw|x z^j;Op>-j7rc4z6ED9sWV^M)&*4SiM3JUM{{vqWqcGlC zA?)j9pbf*NE<0N#cH!hY)q%)<gO`!62<8_e)Q5XC68MM4AE z#&q;0Y|Ju;U1&p$x>&D!UT}{0u)pqcxT4K>xfo!oj)ubo=oTr4Ix=uJAI7qv>%YtbT2C01 ze`@`D$i+p@`sS)Jo72ID_a$?y^)XU;C!ke~i7{!?jHy>uCPFa9bwlW3)*G#c5m%j9$3b&xYO^*YGVWKnpPk*%q4 zr@DWpY_h3b`PtRYOBXLti6|3?NG3W=DnKm>S9WLvfFus zyz{p7-m7;NpBAbjyx9P}*AZ zU^kTpEvr+(#OtX8TEh6V$nw3C#_vb?V%}?>qsL=Qj>blkbk~`Lw@JhHk`SUuH6N7nO_F;1`PRURRc@aW(-G6b zQk%;BX?1ll8@O^(hT{VnTe18u}H(_GPR~s{9#fC$5gRmsF-t z&#yaiu@CpxInC>}w9L(wkXy1t(G!G0XZA2qad}^JjAezE%U%-~edC*!25*pp14(WXDRE^vN2I!fQLTP$ z7+Tmy5O4goO___}byA^QZU~8HTup)LLzL^D27OkXZBJ0N{E;?N_5HeXkpePa%CcqX zpP&@nyt3M!xITI9OWRssfM)o5_=K_9kXsb)^=$GSDm#gm(_7(9mma4X$na;fp=F_B zh+`D;yLD}MdpcuH-*gP>+TVUxUWM~cbkgeN3a4{x@xvl)ZPP@f7yUL~t0z^Pm;#?^ zillNr|N5=cjl2|=Sh9D%>qdYZh$n*XoLt@UmPV=0q!Yg~Ej-SHbcnGfRB9aCl$A@d zidjQBmYFBY!X@N9$M2os-0Q;q$+7Ly`Jz{rtxTzf%p3^qzk^NSuRl!~*)V+qQ;o-a zO6N8jC+|u6l7Zv<;6Nz$1Y)VJ^5mOF@Ty&JXXR<5N;iI^;PKf?;EIy0{9O`kq^tY8 z!z=QuTIj}DL(d7ER1TQ6uT;nD63HHw>suWOxqQo#;U##(R^phu==cUB&g9_A#(jMn zHIbLTwzu19Hjxq((Rtstgm4CB9a|>k&_Iko^Eymap|ew)y5PMPtzex=I{eC;dzY{B z<|(|mJ^jE}YFvXqf&p44$X4cp6arobw1DUi_3Z%-1?G6(0}mI)%fU9n^wgqulgR{rRLi~wqENe zOEoGm9M6dU*LjlRTf<>I#WofM$h(9;BM#sG#OJ+b@q@T5vOf4MA#LK2obFvTCVJRgG1G>DWN zFsyvgw^!ZiU>G#Au6(=pu5#3YbWL}{L<>(F?;Dw1EHj?K6(lvzjz3cnE$e63R+qz% zD5;dJJlPI{0K2HJf^q8gw=#YSOk`uaEyX#D%Y5X5r+v>-irHE3S+v*kj~6?q9inqc z`MB<#4sNFEG*#nIS|X3#r*=!va4U1RDrRLw_Kf;7JxQ^2=IjGe?LMDUX@A_4g@(4T zy50R-3W-;*41Dt9Y!1H}9(2e;Y%HkC_ND%j->8#z?l@CL=@cIfcZa`r+5QyXH=o6V zoIp8lTeJ$tD?|(LIm!yYjl@6?cQ6RfZKOx!XfMB-#-fp*b?Rz>aSA^4G5){b;6~&WnUf zt!p}Wi$`Kda=F?~inGMF&8Ula^H+JhbrN-f@$Mz9bP>95$e*{oW9@|B+wqs;D47+`M;Hkue( z>%k>py%R;?*_cQ}THZ>(!WNHtAnowi8*=@M9@CULAt|flhPgP?(Q^0<+ab&PU=|+j zXa=|e>GQFVk>+~Nx*y)tu=Z}`jW@LN$5dwN`8JO&#{?V3WA7eyop7XY6`6-_9wK=wo4fSN zLyf*spiD1Rvc>8@@R=onMe2i)?VA~WJ1WNzuV85&j)eZnSE1S?;~D4sWaH}?)(tM+PpWRR|hL@*yq!EWi>fEF@P?4`w+Otx@uhi$lEQx*}yIRFtWXt!RzeTuJ&%W(wry;39 zr^5oH&ocYd?Mi5sf{ql2*+3PbKF7U5$=8oo-`JP}OhoL7(3=izNKU-YXFwzh{Cp+Q zEVj73OmqMKhDJz8$mPqIGb_#eIz zj!PAF^wV+X&sfIFg4Gl^Q_%kM{jjUf7>k|-WS^0dk@@-7l9DYWMV3&g5faHfQZJt% zUk3D7kbn94`T`TdyY57(O!bt)=%2tBF$?8L%h1b1r+m@qE-RZY(Eu%0beb^MB~;X6 zt3_6zN5~khsqr|wdfHl3Qv;ph6+2x(smwnudmCLdYf(Q@VenCioqX=5oSb2QuRg8Y zO}4x^m%zY>G+2W>9Lr5J7@7hGd66OY{LR;GUQU(N&{s9hC(3|VOfOF+V!!(LbQ4yx zFKZNeo3-lTVetZC>QMY+*7|+e1)@6IHRi`Pbg8dr@TUs!wp1yuJ{vdEy zttlvW)8k;K9eDLsS5E`oeJRlDPgiW#1V7Y)k3wafE}25~WvL$m0LAzWU5##IL82(3 
z#p6Xn@^>l9lbps!CI*f7uUsub$l~_5eZz?(8zUW=*IrEIJ~sOxxh6`%Kiz`mujN0z zS)Vyl@?G;wv;O;rfYf=)8#Hb}7-{f!d1|Vv0En&KsdMf;Q3aAHb}p`Y@(j(2a!9VN zXz1uZ_vt)+iZ;#xrGK?PMBNbpxOKh>HD#fri*W^Jh7i+9pqslO?yOSwqGM!iv1g>B z68T2-C6!=5Jz|3WD)m}6er0n6B?mEqzeC91!O}Lbrv@Uk8$BUp_I6`(p*ro7^7-*h z#qoeZQNT^x3qRFa|BPkpMc(n$nu1u;6zg>As~g18%D9yT2IArjl(e*;OkZTbMsNMh z;guE2`3^XR|$;KhO;+QV!5)p8QJ{ujgd5lbiZ&WZo?5GxJT?&Es17DsRd|@Veo| zXpyKW>O#LWqfbvy&o}*CSH>%%OzkleUS-*jZ6}v=*YFr{cGU!R1yTPF^=&H5^7|*A zGwAM=?J$E7M|Va)c`eyk$GAVOkGPOnkzYckeTyaI87a94Ie9m(LVem-{Kj58lvV`~F}0S~y99+?iX}Ju0Pr~Y2n=&wBiD=oUr^^wm;EiR3fP-^ z5d52yRt18*fO2q5Bh~uyOX$;W7PujDiv?q|(%Di{6PtR1%#8&5Rbsmb2b59jz9Zilvb(E?Mb(g!+KE~(BfBXnX+$8*On`Uw&B_W~Z zcxP-(%L+6ThC>ef%#P+|Kza;p+~zup^h*BJQnUsD#(wejMgK_AsW37!0=%a{p-jt- zl>&GQkg5V(BDk5^>rbZ2?O)};RT|O>328q#QUsg+HR$-`d}=p&25|8gK0Dk5C z6dQY86SJWI=}<^l|0M68j{%Y{6!q=cZ|!4(Cr>T!JIqneaYunxFGFc{j_=1LYiw=y zBYtCoT~7}UMmj5p-xrXLRar;f;S*UZDtwBml>>aIP#>rwBjcx64O|-P)4;zCL|2V| z<`x!9&}Ztq2vi!qH_h|j4gZi=*VGggIBN{NMypI`1*NWMb;PDEp&UMfcvnsbx13WW zqg{PmmGklKe;dR_zOAtf+6?vme7h*1Ttg+wUNL^8k-6usO%Q|lIKT5iL>7|wN=Qt| zNx4)%XTH3_xgzq|daJSVboEHZ_YG&@yFtZQ6S-~@y1nm|nyLfj!_^W|**elNIari0oQ^JVE8c~Jjg1_rR6Gdws z52rSMd{|mYsm*czRO#~rj1`IM~C6O85Y(S zY^~G1?*UDco}wQlW}|t9aLjRx>5mzYhIW@_&W_^W-nZmhifv?<3M%k2&iBpUEn5ux zE6`2B!SMRU=fth-YGP=c7HA1Um$AXctoSCjIDE{NDW891GKpE2CGYW~(L=Yg9n{D7 zM7z~PSGofBt-2N=%+(|=wD~drlf}>Z#p>9rk#~^8rX(euP3<>-%_4pF+V5a=t0dH( zBk6g0Dk{n0a?t|Zs5rpovp{xrgqIO#sAIhX!je^zJV+ar8wO70gwt_Q2uYSUH{65} zGj*6Ark}UkmU&n0(QHq^+jjD*aRL6V#rp+f>^fr*rtcCMffxIB``hiq+e@tBFQsp6 z^;>pQxFv`XH|#XVx5w*ELp+Bu5l6wrL9A=h8urlS_vH;;8f_wk53|)uXRZEVWFy-* zn*;ly##G50u%Ra_9~E15&ieX)twZEEXV?0IsIU!o(%Dc6v$tjzHaTqGcA&k@<6&{Ke54rkarP-JN!jY2hfq-zcj<6cJ9(m*dVRJwwfcS z%SD<=Q6U-Y@#D*8nW^7e#tp=!{O8EaTt{&8v^8f0=F#+fNtNVnZnzk_ko|irwfdch z+vSt31$Ol#2|J|4qbfQ9jN(2a?wUD^C8k#uz9_7qHpHlEIPvb=7`#kYizOCWap*m| z@crwo<%!inhr`kL+{3 z(yva1yBmb#1}lV8D5XO@;yTfcd@8w%B?zX8po>d=66Xf#!$%x@a8ssPsbNA9Ku?RABMO($TYi-Kkl-aLEuJcDx z*KkCTt8I(w(AFA8mOTR8v9L(vXM{?Q3Se?lH9q1#9_Qbg{fi<`AK z9m}aWj`0#q_2aq+4_PCfXtp2mZYJ#Ps#3uVMO2*)gh$|izhgna7yxF~joYnz)~pvE zvf2ANGtxqm&6W7I;G6Hm_p2+g`_X<5TAE+4%>_6?^`)wX4Qp$I_43F))_zO6yKYoZ zqp%(7yIl_nld{wR>R?3W5Ec7b=yrH&2?@X#E!WGJ-7b4iu_e*cYzj?z_F&T7lp*`_ zsF35$GK))vq#&Df&!!wlM5MxY-+HDCd)GwXZ%B8BpnIIMs!E`3?AU8RVMn6C!o+Ym zL;K%bT53ATbn^w9zIg5W8FipVq3)8I3=<@|JN(qVA~27>muoF;wD;g!r}EO%i-QLY zMhlbW>t3<{xjmqNw$UxJMQLGt3{vX=euqR z!o%+T&3?bmh$#gH0f0Q60vt1Mx@?)l1qWSTbB73^iu5qrj zUw*mcj)v=A(b(X$V+%>`jc2x&Uk*%)dn`AJbtOGt7~6t57RCZ|Ez=LNcAN9G-%%x zd$735wHC5C{}F0z%UjgkOxLh+&jN@3%8isjLt!Ks;)}N(D6X`2Y6jc}zw%0(gzGI6 zfW7#lq{de@fz+ zd6e8#GG#JA=gR}x4_3=rF~CNm zNd}jStl(*2M@C2AfOK=@$$qk;_&A`K41PCySIy=MTys!J-q67^H8A`0D6dPK?Jy>` z=?Hi!GA+KwB4cK(#apzSGy1G04geKB@I9rxgG9&pr$Lfl6 zF!4O0xTR<+0;`zom!AARNDUC|0KQ#y`Kwx+Jg=zk0E~KAyh&gg@vGs_GjEpYSWgSd0^o{k+9XGW1x4f&(bVO!E?I zTVmwEZirj(CG1~IAP5unpTDpyos&fXz7g6#oo1tT794g^Zjr!sNZ0MQZRTw`eR?r` z9K}BuOXBCbuU=6=>PP%yK8DHE%OaWF)MJMar$z>M64bt=Wnm)G55E;hHX|sEi9I7= zRstW7@dF7vtL5hAY28LC*RR(1N-4AE=HCk|*H^~4SRS`imW!}(D_?JwElQxA_zrVf z2~KNMB(RLG%@bnO_X>$oU9U%>z3et2KOu7yFEX)5XQOY;H`9j6e#v)5MU8b?d?1`|(FcRltaDy*?IE zw*yL2(D`M`!oUD0^AO{&LYYNkW@gl18#6V&{~h;G%YJ4qB=ox_gl%!X6d;J3G}1Hm zf}TAr15>#^bZ+l-j2F*Xlb&NtFs)@*30a%X+SSTbY`*&Yn~E(tFkk9K9r*oQNJgf8 zXy_3&wcgrD5mbEunLE)JxfDhGAO8Vq6jfP=goetO7Z(@5DtEH4^0)?6?}<*c#iD*> z09&k2OiZ}Bxq(87WW=LDqxR_oO@*b{KOo@g@59@axSuIS} z`Yg5qrN04dYe7&zK!@PW%1TSYgLVv1J*f(>mzS51cl<3Z|Af~0B6rE@z4nQ4f%5<4;D%>H0?B^wGfBWA%1Q(ha`a2x0 zLA}bfwoLG@L`9i}ilaJZH0T?F!-G{q?dvA&xhYJD*3D4n3*#<&X9bTP4ynmT%KylCz#42TN%Vi+y>yvV04sSKy!N)k<`t`3dwm<`-WX 
z#0gSIuNgs;<4LuU`(?v5k9?haoQw$NWG$MXK;oOx+}td|Yg1=7S)|H1M%_pRly|R&!#^W@hkTzxIEn>6&lDPgy{1wy3Ghn-_4?{_wR>!14Px( z?UoT4N!IG+?#@u9Gmxdt{$#eE8yo;oo7P=OZ7qG~9{Tl8f-WO7lN^Y6ZFTkZM1(f)`PU+eMPB2QOtYG8yMWr@{WsnH zV{R8)DDw8M27eGXswzqMNRk{9CH<&uY!?H4<4zFm*84=4&X7A2cRQXx+FjzdKo6pu(6S zeWl`gFH%^ei6pWiaeYiCxQd3??h;a|@LaKd)&n7YKRND&Ux{P>mE80s43(QZg;T$M zgAKMF1+_VoFR&hAHstt&ZlV^J1Bt%ZBdi3(#;5G@;_>7ozu52d0`_lWS#(A|U{8Fy zcQMz=A^5xMdRHfo#M@<(c2=zmgFH2{&^Pw`D5~%{8pU*fm4U-Es=AI@p$IiY3EcXgyF5v?{G9KfMk7@!+Fq?xNA|ZH1HP7gjApCBh$c8y;g%b4&)M zbmY8$&8S@#(UfweC6P<7=!a{&-WO?B7+t=Eu#3z0ScXuzEgQl4JBF5yE2jAPSy)+f zvJZ9)Qn2AtLI zd;+kb?;xE0IZeIxHuHE4TjMz(j8+8N1Ik@9i7t$V>GP}h*^fv@ zkZaTl7y5?V=p39zOs`>-*w^|EOgqy#U-#AXLoOq3M|l4XXj_|5>uh{1i&PZ!{c3)@ zUs@IF-)l_~Rz57K)$W3hAiLBi{3>Xug=+xw(mEdx;;iIt^~{XBdyJNTx;HC$`b+Ek zi3j*XS(>RnOOd;7%WXWr0bI&zt>8`PE32mw8fYydu70)n33Z@d5mGY5n(EJhzr2if zucMCTx}hFSwK8gR(Y5HF_4|RV%)Gs8sY!VaW>I9sA>%5V*=Q>_n>*^kC4XOjUw#Sk zv>Brpw=Io1!^(NjYG+1o!U}xM|4FT#;60LY&pO0m;@f-t%7}h)_B`yHHvS#%0K(h( z7ugG4+PVqF0>0oC>IZ2rot>)SAWe}UVGVWf(*7|q$ljtt~%Y- ziAP)Fd$Gt_#rQMpm-a_TWu@ZIeq;~k$g>w6t>}+M*b(HDREcO`u`>fg?_yNg?1IQyDS$kbP*RK=AV4b`NNX919 z++MBtDuUc4!&@$r$E~zxA2H`HxzcyJ+5ekuWZoE-o{JGYTm!kx9T>`(vd=OB=6dQZ z?l_>F|LkO;Oo@hdn@qEhYn{xo1}|Yf`^CW`L=^I`do9tyxB&6E3!|N})Saz6iTJ;4 z9gu$Az3+M&T=g+f^b+DWP?I{3A&m6b{!VtsuN>XKWz2I^t#hUVe2dh(kdn?oFEB#1 ze=o8)pT)1#C}6q%oK~<_>APTV3>9%@(2Bz?v?&u zUH#VY%}5aB`IDax8F$gl(jkG}&pZ5tYS)Cql9=a_edaBt*(@X}S8hgoaaDjr1|wVr ziD#Ch7U`|LcEN9(V=$v)j&rZDgYKar$hA-M?SY<121iM79De0_&H*RS0$NzPRf&Z< z_f-Umzc0?OLWo+}M`)K3kDM9%X%F_<0z6eF$qi!H_o?C6AHgWb+S;A4qku4!ZlxOv z0L)cu7NF<_Azji#(EYWtwVfYsZfb(liT`>M$9E{o#m3fMSvdhbdZwMRwV+vIX=!xe=imHXZe5l6D`l7K44-ByEsNb>{p6Y*lVz$r0#x^~ao11GWFA*ZXPnT*) z@X=FAeRJ&J1GA6(r#XRvy2Eo~<<<0&5 zL1)jEe{5j5fmAk+UqXC*&o(;SHfs_Rx#>t$XOy0G9YJPbc|KNp)q-12Iq-IN~Wg;cJb&GXgzwUxk_40^3lhM$*a;td| zmS$&XpBaW97W_KC95!}X)Ya9oC@M!0t@Muz#vzrkugz^8eyB|)rQ9SPbHbkX+ZZOyDF_ZUFSPZ zj&q0oxh2g{I3P4z%@aoF)%N`O(NY5>cs@jAdHXPwW57{KAQrDS+>p}%*;UU&Y|PA} zd-AfgV;bO5;|-U=;6t|TQ$ZbwOEf{A!?4Ta19aiH4GgH7-MYbF+uF)RPwzdK|DpP? zqrFw)8#iv;x;2n*q;mEk9>fqqzoKj0@9Wn@Py#*U8kA*}q5xX-<) z4k*}L!+HG4r)Oq*VBYRr%x)^pRP^HneBg=q^Yfc80iBNjdSH?^bkn@UBaAZKG*j%7 z-EYvSH^kjlzc|>yoi0qP}z&=}? zgz|rT5wQvajOg8%KR|p4?o#}vq?A#~T05qn)R)1nhzJI~b2)XXN;s4EjtY|2`_>`f zz(=f*PKj9E+vuGAPBYyjjTefHNefoGa>TtB@>EF`kCil37p$L6V`$0EEHqrdt}5NR z*cAJkMFUJG?!W?^9|~5*Im!S2XpIR@%xX5>eHK3=gOUE5841s(_GDp(SN{_hVNq?njKK#W_EpxEzjVg%NNr+ZzfoNIA2yFg*BGF& zfqIU(eZR6Y{@!ws$F|g<5A^==V~LyT5C~u4K;L1ui6CQ{N2w$nV2Hd}P?brc)hsl7 z@2v+kMh8im@n)Dvw|(L{sLNO$x7-zf3n7FE5+ZOAM#wS@0JNYaDrk@dcz(6v?V$ z_xB}O%4$p<*MT;%6V`pV#{IU$% zzR{?yt-PT7H|Df0Wg^!I3QLEXREd%ytRQl?CUBeEEPC!sg)*993>{F(Iv6fh^T+2s z;Nr^U^GIm0kVm>f^QPj%6c|!Ya;QNF5Jk5MD%u3H2KszoAh%gfDT9+#K^6y}4#JSR zkJ_MT#D3m!i!uuI$U2Eb$JG-mqp=%-7cFXMkAs!9sl9!E2#SM}ZhPdt7i)(~nj53Q zS=`RS=Ecdhs8cEU6JC1F=UHetG%%Uz2L19})N}=LSivv9fAmf7o+zY?4%lOI-o?c=x3(D zADU=DxCRQ^`%Aq(uBr?&Vwk3;rs6DTC1~Q9p+E9L7! zf;&rBAp}t``DZ5hFtVtnLPu<=3d_jISgL|7o?%#!oC%LT$W8BD zn|F=x`L&z!!c&Q!fL{`Jd`BAC@fB_tKOx4&r*Ptpk&U^AJ9-G(7*Nh|xECCJ>*yYP zb=)z#0*}KE?YOR?;b5W^@tJmdPXHl0I{F0~_|AKi35ubB`6|x#s+1e;%nxRbrhJB+ za0t&#OiXqx6sA9tyF52-?&^{k#e_AYD`yXlJaP6+=iQg$cIXN4smujeh7=q{J|Zr! 
zhPJ7reXWLuq*5m7gGYYpzkW@rkhK43oRV+|Be8V=l%u?lLuS;T3Y{N8^@m6LY@YVT zw~oM^0Jf*i>H2+8KuWxM-u2X<+H&N@H3D^MoPBh6Kfmj7=zn`Wz%|Q3@KAeMXDd@) zNbbgdbq0UFiq+ML`2;FWcSiNXjR@PZBA3(Rtm*91Qa*=V3tpC<$zd`&r>#aBQBlz; z7Y3?WVlIXLsHV5Nj_w($C$6i**#jxbnULDAGcYNM0{$HwQ zw2~;ipJaf#yHcp*ltJ6s73@2T^Ze#$hQFb{kc=NFfY?Q^foJS|IZG+@ZMNsBCHG51 z_|9Jlf663!i9|pf+8pz&QsXaiBYN>z*MY$FGB|ST2Q{%v1ia84cJ$`XojYB9*>k_& z$sdDs8?2+r_4Rgg!%F9BSiJC2Md@!gHa049kGY;Zhu$!kncO=d=drl|j`$gc5fS2NNo7s|LX0mW}QH-u>%2=~s~y8=jP@ZW2#w(Y%ZUTgHN1BUs#0 zrO6R`+xP@|cMOHAPw5VW{O`cplpcJ{QsC#;4oX;(%f1!jeiu8iwp&lA(*Z1wURLpJaOs)HaV?W4_?c*5H+%pUewhgdV55L$V3SBiEB#rggRr z9m56mpg<|skKr6&^7Ng%$ni_3*mt#?l1QPrFfLVGA{peBi#Fic z0{_N^ll6W7VU6c)}Wo|I&cI}pGkVj|K{GlaE1{C|=IFT4N% literal 0 HcmV?d00001 diff --git a/docs/optimizers/radam_r_t.png b/docs/optimizers/radam_r_t.png new file mode 100644 index 0000000000000000000000000000000000000000..7b77edb985f7a4d1632f6717c87c32f8937e3ce7 GIT binary patch literal 30948 zcmagGcT|&I@GklW1QY~Rno9dDNCy>x&_SgL3etN~dMDCLAc}~9f{OGG(tGbAQbg&} zI}xP!5^8|EiNAB!J?pG>FaNk!eDm(vGqazWd1iKko-4~!l3yo>Ac*pb!eccEB0@nB z411mkd{R@YR0=^JxlbO;Xm}W{OptkKn4?<}?h_;v^;{vgFk0H7hw_(bNmO|_xH!la zwJ$$@f0IigyvO^(?XS;?FWf$No;TuLuY$s@k2{K7i}maR>&FcOH=a&jyutqMsj_d9 zWg_y+w$ugZ`BdYn76P%Ozp(uk`jK6byp&U_=+ehvb*Q|Dm){5HSRm5uLjV zLCJLgVZdYg5R4RptVqt^1z+Ko{{R0<$*aFVKa;U|{3j`C#Pr|;8ZCKTRYeEI!cLwL zTRppRALXCpp*_z?RsCMKpVpyWCuV{Y-f;Q(b3l^O@_dr@h_^LM3} z_cvQvSv{~YVPWAHwYj<6_L-cu$M2jxJoevSpkeX+`?)JY>~yQ0U+|uHvXE;~KxMM` z&cfy|_9VEN7&VLdeo7$A(pV)wvQs0(@vz3UHPoM!a%*o-d%eS^f#g}k^Zi@rSCbPX zBO>y3il+k@jFQ=o1iHmbo&G!=RonKq>h|281LEP5lcBaIrU8CbjQqRaT$}H{rz9bMPD|bSEo}^QOV%N@HgOJ zMe!MG6&O?lNgO6>j;BtKSzU|0_tqFSS-e+@f%5#y6g1ZhubCVMEKTJJnYYniU!AOv z%cd`9_1(XbkM%poX`7mwN=QiLtW+Up`lWtqz}5zGhiFv9i){wD4nLz%sMyO;)(s4l z{~tep*p+l$o@ZdObZZ%z*P?+t_MKL1D1|oySw{dEc5Q_n7q`&pFb%lk?0KO zhmRiFx{!9HjSZ*Kk{3;+xJt?kPV1BP zwi!zsep|oT1@(!DiLKjH9tHMK*-$#Es;W+*k=BNZ5fSN=4^N-|k8{Akt-p7?{W_sF2j*gD5tVF}_VaIiPY?$Psf-_lc zSByaBoc?Hdh22Dr=5hWYq!Rleh`uw{@z&nbz%tC<_h6O*N4107oANU*m4f=&2(L|3 zisI{*I<2eH3>b`^sBtx|bk4elZEI7^h7B65my89<5BTI#ZA9|w_fHK{7QKEAv%e=G z&^Nvhd~t7isJl4y{rmGcs>UQD$o0$?rh|F98l0VBssrQW<1AT7S{DgN8KBQ&<5JS- z#15&G%`juX<8q6T(dc`o3sRStM8>YdNUPX^v$;N&m6gD6G}U|W6&#pEm|0r(Mfx2t z8-ruC;zqU1@|N!E)vS}{aFw!po6DH9()eiRgr z{Q8xajxM`F4h37g08XM{WMt$ieqO59WwFHPVCzvJ)iG|!c*#~RQN&rU-d{9usqd59 z`eeQ&rS;a|UxE#Z8o-9@OK8mg8lqCI-L{c>rVoS z^3XDj0DQ+}d1v`q^E_V4^x{*r+2G@tbD5wQP)LjF}kTsZgt z`5M7Hn1X_WB;P}8fHn&%q5#0*=+2Rmg8^;h)t;0l3jR8$zdtV{3omMV?f?NVZ);zLlxlV(< z4lMCIRTg=;lYOweT-d-O;Z^p-fFfIk8ED2`Az#Ot#H0EBYnnSWvTcxHsB=%MwJ@7~ z;W3JgN^e)Du=Jl>wjlCSq#w4XcX4nHK`Z*FLh9zwD&LG8p?H zjt_ByQ=`MfJzW6FuGc!5o0}h0W;>dXZ)8qNbN;R-^K}+H+`P}?fqf0)>G?2!0jl5; zt2`lk6D0DO%*J+K-SK?-Lr=SE_WI^PhH8DjhL^jz$r&1V)}*TW<}&}eshH?)af?m= z>=tSZoP5;HGsFVB*HxxW#gTi*61ywfQWh)et+3TF zg{MbTrU}&Eqssuls^?Chm zJzlXFZ_Mrds`=POP7d)SJuXV<$S%=Nz_Vx(i@t+gmLz3jzPt1{i@)1Xr0>NXE1zFp zeZ?RL=I8C^=KH<9qJxKujdXU%$UfGK6=$tL!N7NsntH0<+ub(d-CrA)C$QbJ;3vI(X^`d0ecC-MO@t?+ ze_~Ii-dKJ7ME=On^-r$g(4fKlYA1oyJ9-pQj~4J%Q=*lX6`v!TF&hDkj^x#yhFkKX z{p0pn%xh**mxYm1OIySDGmQlhxzl@nwdP!Z4h);{!)6IRf3@r#h27}cySW*)O`+8K z3pE=rS6aq<4G@k%CwIt6ayjUPg(3;N=zP4S?)ZM@aEJqgh;n?6%>J(Em9HNsAKdzu zg2raB);VErv~?^>b8(z;2B6gw^K0B~W+TNsf}dJyOV8`myL>rdGFcmo>Ep84CM!Ms z9HU}!C+YsoXS<>QDrVU#3b_0HJ}@(LpXB`bqIWNbY`^EBJGoO`j35czx7Q3YUh;DL zrKNFfzViB|(i@ER!guIy2Z_NTRBAAt<#y~HW6OKmIzM&i+}|1snuKzXw2-XqP_OI{ zt!!MaaEA*lbPGvzYn?_dN{#+(eZAK#RbTu~>uDEEiz~u=&hQUxvM8L$$9BL55f-VI zoh5$B$E2rN=Ii9^R#G)DXGPaN1v?0pp{@CyyieB)Uc2}!D)aT+%Bay?-RBWr3roM3 z>~}Vk*F3}R*P8}iX&6T~T1WYOtZQxW3it-b`22{f({hgPd^lGYTfJ@~Nv8XWEu_eF 
zQ~Thd2=_aw^sSeh*=IWbC4!+d*Ie4po0&gVhIQFs=6gDCl;3qN!I|M2-9u;MoAHkk z)DTKYG23ibmPhz?`njU$cAkCO?aA&^o7(q<>S4(5Q|lAI!}9&)c+4NMX!P0F53dOi zr-bc)8CsG*u`SFqO_WiHz^c2r@Uegt9t0FiW!DG4dVpko^% zk}Aw^)$z0g&)eP?g;~3(OGWRk_;h&2zoUTcNso_m9)$7J&=eH+Z2KH2n3taVb8cN(d-a!@mrbaSJ-Kg zY>ZFK_@F|5bUblT=L)_X-=r62*fI5? zVJsfoaz9N+RV%mfc^o*IK;N9IBD5xbvn(3MUK(_?@4X)|>MdR;yOulUX0(K+B@-c* z=H}p^YrQwI^LKsFXsmrGwtpyX#)DH>l3i}cANJk^wm^eI=| z?5h6NM!v6i?VC36Fy-^VQlYrQDn%US`cMPg)X4-*k-Xtl!jOB#-y{lXRmuEXd+1Cc zt?Fg#7VCzu35SUdQgyaY{*rEn12GLB>-Kn0UaZE9;9Do$H_UCn5li9W!k;zYGi(HU z5tPQn_(=vPS=|_6{PxvfvFWOj=(o{X%Sy)Y1t2Kf!`$?sI$wtUpf!E4VY_7kec5iC z;^p&Y$wA4^p~679ZE6R5hD}y#co##V^$onl3m)p&uLRAgN?Oxm`?w39qfDhvA2JDt zg%mRk-~_4e(_2E2d)o7&H~Kuk16dV1xQ`gcHYp2NG7?{^2eihrk927ts@tll zZeFxBO!?%P8Zq8~KUhF6fk_rO;M}Xup??}11@*KkzS?Kc7FHG*>RhQ~Y54o6>oty9 zudak2{}&U}oV@J3=L8V(G@3A}~hzuX}K?x3b&fa|3Dk-)62W5f{( zlm3wvN>aK1{l|&L%GPz?t6w+8@Nb7|g7Pn{?xOJ@m~%z&C09-|@XRyu`xPE@+*Sb2 zM97AR?j#cVY(|;VVms=+TM*%**I$(tF74JAR~JuTgv=gI=q;tM-|!vGJ23N=2rgF% z3Cj1h^Zd+=|J2~PQumtmn0CFZdrZd4T+ycGFOx~(b}_98dVOuaQprAL|pui6P8X|Kiz2bN6m)PwK2Zw`4+Y3 zI98fRxeoc?A&XtQ(&KQ?)tRWzZ6D^eBi^{F5ab)8xlC(vv>0~~kj!sP-}(#$<1b5M z{dQJ*snS} zK{d@^>tc*%zxa+@x=SXuQit_UJU&ddAMX!!s3sY|;1WC^W%%~!cKOR!R7DeL6BcZk zUj`J_@*`;Gh(3uy3cCQ27j^TpB0znY4G>bN{=-PMC;oZ1pvvH{)iituYiVJ0(<|Qx zuY#o-LU^Ua|2c<+$|dTJTB+wK?%v*(O{6}y4z(mUWL*G{z2gj4k zb8d#(ZX3kDGV)VR_`^_hUUz{9R+WTUhPJw@0CT&3z}vv^;swp=(AviiMvW=oJI6Pj zZ?;^69#l8F&|7zhc>COJz}ZbqJ1lojsi5oL1tiY6yJ%x=TzYy5uaRsJKoR-Ou3FQY zn5&=m?QhQ}9p$XyxT&DzC$#1z9P++IYe7%&a~lsGwB4q)F-5UW;_lo2F(IWbtYlUQ zHSRm}P}0Pj3e1h=9^3SQ%=F617HXS>9-!kC$ z;Tq*^)Yq-i)+_Y~`ExLr4+NyJtk_pY8NO2qzApJNej}W^xCknJ&He+3IyE%!Z$DXhW{ery|sFw2Rz5_mL zdPUN1Z%i4Nuy|LU3D8QYoSh?(^2F|B-&PLxAv zNaWv{Z06!8-xwHbW%Jpqhuh{+++)Z>v&J7LE2^?}L8AhCbrvmmrD%NPH701BDUD6% z{VZP3;U5`8F$AaDuR0MVFKX>kp_#|u?z*mooJl*DIckAPAAOB#=I4?9P1i-MHxjh! z9XG-|Cw!NqmLIQtW|qM*d)S#+Jjs9)TS*1qANeSN z|4}JsoKH6~aB3bX`BFJox9k?TeTm;E*yukL4ddHPhP>;AM9qKSL=1AoA%$jEZdql5 z!1!9DEOkQSslnnJYYUThNi;WHm|;|e0KH*&&j_LwiT&gLS-&8GMJ~FCTfA~=s05mX zmTM>;PFfx|)D+p?#?{S*SqaCjtLk46C6_S&ti`IgpbXo+_F!pn)OGr)=$@H-&ieW7 z%XGoy2-5%91FrG(yh(^@>24eNQ1Bf(sc?Qz=SOiJS_1VcKtGIQ0jA zAPxRRR5xi`4~IUz&LOAwi)aY1b9|h?P_W#l?9)`YSGuCJDGwzJ5oF%$c29whD49MN zJ4eC8v$9eb1p9{e2MX^aaNM6hPuuzOAF59kQ+5w7txf*H3*itBc_jw;%TN`?zpo#! 
zJcXepv7H^rAJb27)_RELtVau&eyviYE_2;+hu``F652p13+_{A_L{iFc?Kh@2lK)w z#LyzaRFeXGiRvaCYvGU;cMg|M9!05sqZ?IT`S8Rsso%WF z@3=ps_n|#`0!F8aL~&wXBLn}&!*6tbhiW@krZ~LqN&4 zz_)N|9V^_vsyVT14(shxp6JCO;TTx#S%NA=NRXkag@Hnln=%ROq^EB2ZdY<((3L*G zK|(n?Kz2R+S!Lff+or~L|9S(LnIg2dN!D{8=&Az7$DOS~2LBr&8Q|bBUG<%)mpBXU znCL-MDMgf^{-quN>>#r`mP}xsxywMf^uu|CC1hm*kYEgoqK(WYW;%RTc$41|F>X?e zcUMK7IvqS>g32OG9{T1^&`fQZo-K)|s;vp1>SKZQf(YWa91OsyybmJ2pdj!)8h+(( zP*R8_I~Am-Bg-k3IPyv^er} zTc#&GLJ}Nk}94%a!kS< z0nrp1Qx_QSyeCG{e`0Qp7ryI%;$Bmm&)}@{Lk*8GphHm+3|aMG9GOQqk2krK=fgK} zhiKV?5DpCK5M?t=dYki?xv%_i%5U(!0&cM*nh@vNJ+gCbTkO4%h@x{z0YSvL!OQ)_Cu#Wrun`zx1h-2@u#ksxSv-h2aEPq1zx zj-QMUX_udWtd0bQbbcziF!XNjYuwK%o5{)ft}jCiuD+#j@jvi^x1ch5g6}x@g=oSm zG9c}_zwc*A;g@~i5aIossHZ`6(umy7eFzGH!!S*EcT;MM+V|}ZgF5v7Eh)xCL^qVDMd)&Hxt$zJU>s7U$WdJ`uT=nd0ox` zT9cXeE>%=SM8x#I(FW}9gLFKt?zmhGoWx?f6E-f_3bEi(6&y zx{ei|pE_H@$Xc>tqW;e(v2`IRof1wBicN3g~2h9>a{h?epPKk4Ajw zctvqOzqY%3XZT8^PvqqrYoySsKS9aa_rx)Q88Pw=Vv0aK+Bi;HEC`a#FqEFd%?7i9 z(M(6yG5k&EiH_is<_RcS0)*8OQaGDv6p@d|XwU++6s;wWtB}hNH=GOV5xm^wqYHm| zB;s|Ce`>iFD6c10>R>Kt>E1b8FpOX%RS|KFdRXg+gU9g$+#I*wFNux&c--n@Ar(>D z(quN(bl&{gt?F5`X z#2hEZaYN(*si614C}fvaPm#tf_p!!#f77wycz00W(YmAj(_#N|)TJ*i#Nr1(J2}-` zMkeSqX>kIJ@~|$RU$mAzEF82T1WA=CCQP~-0^W#e@*+feqx|I;VgnD9G1C5f8jg3!Vk6u6NUuIy4 z1{M8o=j&z>si4wYeP34*lHs1I=S(Pr7SYn<^_%sO(`l>(hSG<^Yk%X9AAVE5voa$i z12n;k9;WmR2iT(Noa`X#qDA4O@Eu=zOZUCm1Ab17@&)B-=r8+7SW>$_GFp?N?$1SN zFMEWzLxb$a;^Ja)hXqv9c=zvb6UYV6@XC=FGd@`i1scF4(^9wK*!%R{aF}@tJ zRUe8jP&Lw&+*pt%cT#*23?~-(_1=VD-66?f2i}lnmf~?$3PNe9aSujA`Z_u;E5pB= zn{V^**zK;2B#62yDk-I{_GLWNuk$GOVg=l@Jd0=Z__#P9-&$|F5`W!BV{;IV?Qx{& zH%Z)~?9nC^hpgxK{PPgu#&ARThQ?H7$5f+Wkr}t}>!kMD5*ii$)>+1R$fW-o)skgIEL+U2h``Nq@0m}$r?`F4~Dk7e$ue*K1EQb%u7GZKQ<01OG< z$8^dpsTUf5>*gij*bMSk(_RQEt<~|i=OyvTUDrZD_IjEm+xIOPlX8{UKdb1ZhE-8R z7$hhW02VbW(P1yhRcaMpwshajpQxn8Zr_`4ft9H$vQbpNs3q0T9CA7zQr5ak10~0R zV%Ue+hc|-A=jEFhqh6cPJ%GWpPTPNIu|0Xu5v2C2ikJj34!fmaK2#++iU?M~S>7T9 z>MC~Xq66*P-b%a6mZN5oW@^Zzm>uqHp39a!6@N3g^mmt}e)QEIUYN`3R*akDr-P&= z2=GOWLY4c9Wm2V z7qQ*uZ0LuC+U6LcrI6<*uMQZ13S__D8WYSCttXpo;`N3l2Y><%JUkuQK`id8|L*f= zc%Mc3ydwEV*Rog(YyrIjcbkq|+Hp8xrGR&>=Yes8l7C_1PaCn%EBs(b{SkP*x_yQk z%6`TUe4}LRr7v!7KxJd6V5wszs-#!^Z1?m4EI^u4&vxm~@ozo*PQx$#3~qJ+GJn4f(M@N;n{QH)p2vLO~w@d&w$6E4JJ5dKNn5_dCm&smo0B7oi$} zV<37jQweLrjG{~(Zgee0FWv*D^pZXS|#+W>*|z`J!FF#o=+^kVp&e7 zO{UBUIqux!PwY~o}@!u|y`e3%?^Ib&LdFlZqJ;QhL_W)o^#8QcGi{Lz^D zGuH$>Ju>yF#eqGp5%{^o6rh&>rB9NBKprW>7*HU@4_Gib%EJkX+ts;7KK9x5 zZRZ56`$f+vYN>Lq5ktx01n<~IU&21$Z6}lLRgHtvB4- z%`v3uY9cH9cgNqQ4BS2}N+8Z%wyliT{Q6!DG-WUUlS7NlLpLQi7vA?t^NTEQ)v~v+ z>%27MtOGpIN$;JKd zT3DSkBKL~{f($}coVfuL)=CoPpr|W^ihk6CrN!N)Oc$U2<_0Ua@v1U6w6oSlT z2u2MPs3cxlsk#*%{|xpnKyUlh{$HgWeVs&1|abilmsMxvWZ)seoCAk zMfCId;gS!OE`|02PGNlH30V8S4|_|eW~+d+;_vEclZ5q4Rd@F9=N$u1sVyK2__ zguEy*JQ0pM`b^ifTc?XGD?M_{cz$N`pPva9q+f#gc^W*`0qfN1&iFso??-yK*ZF?$ zhTOMLb>idTB+w^*0<;+B6657PdkqVBi!h!rl3|{Xr%ckfdKCAzV#gRpu6Ayzw}7U# z*|W1M$HbY%WcjzB#KVr-jA6g2 z0XwO`$!i!tu{#$ejJVZ!=M!Di`b>Mbpw}B?K*%Q>5=zA3*(y;?gQkUzRBL5c>iZ?7 zu*R*22vSOh)t^G*`pBSMGbsc))Z;*i0V{U&%CL%qjmoxoCPc=SaK=YCW;dT`IEqw_ zYh)l{KzK)DAI7afb9Pu%{vEDYJQ{RZ5>5Am0PXNXHBoZUv^_nF`JeJ* z{A0s4eH^@S{q>&ii+5))pZfRman@w=4Sx69zTfOxNzYM8hlBUTSxPA;%DFkz)g|~y z*4HEuR1RXJ53$`!H!=Q?4(EB0xj_J71&T z2}E}8QIKVng*qXy7xtc9xTnG0X?Z6}lXYVyEX0Bs8b2dYgfZTD4$N*^IT`#m_gf-- z*Dv>Gm=)JkOYgw#!-Cbu)P!bEU=>2$JN%-OpL#>SA()sh^Bg<*#_i=N(~yl>3t$oI z&~aWRf0w6eA;^Z1#|%&Y3?$p1A(@tLIm>mR3~=%o{K+uDu8HOB8#0i>XM04pDmdPE6RZ(Lo487-82BC@;k@+Q@P z%{hl=AV%xzz6R<<)&3K(3ll=NA3oLmy6s}^HCmG~xIUt%>mW{ZYU1aXzVGg3FrpF- 
zHVwysmUWcm0E>x1z#|I&nWb=K_wv1f-S$8jgqa|?tX*va*|4fUDnfAlaW(QRpYS7w)DQF1# z2vRNp9a_}nMSVKFu+a99v_rHn-WAb9#Yx*S7q)b0G{LD1pzb@tDJmvkMUZ)nvreP# zt!p@vZ)9pE1^e&5Tk<}!)I_qHq65z?q5u*Z@JA~3_B8Ra=JAqEUV{=1J#(K3-|Ci_M(+5WVgF!@Hr zrXEqK(>13 zj>~3-E0}^RtlvEwQ7bU2zs}0KK0STq>Q#=FTW3Y6yTeBTl40}dmoL1<>MssP;0CnE zHiYTqL8d@UC?_&B*XumD`;ED2g9{969fpficKPhUD4+zGPs+-=R5A1jQ~*=e138di z{o`SlI0hvga%P{g(M_bbrnClabkuz2jF$nLGnwd(o=qfKb|>bK+y~P~pt!eLX++{T zTfrD>3T`u8MREj84!SN6YHCi*kcz1#c^!8Z)I5Z_C6s9l7oaZ=T3SlcRJ2ESzg{}5 zDIe-90COg9`-7iQIVmcVAHxz869x6@`3w|nUw$;KiZb>+8`zxkJFQPjN(u;&o$pG} zE->I+dDPa{w&kd#sQ6_E?B9)zx=23(8fzYXj%p}B(i-Mdc|A9PbM4=nCWfSGdLy8Q z{pk=;E!lPExQ|uXpP6RW5dOcWh6L>o=Z}qny6=b1> zC=g-b1kDYuz#OppJ!kd5_j za@FuLL2G$VsZgzoA)npQUQZ^pPVp7@N5MM9uBo&R-P&NG8F6+PVqgDnT13q4!M%GI zC@A72yxlUD6MB1lt&Ym9`{?-#Bv;w*-P`;u&yu1nFDIv`uP=4*HyjWf`8fS0g8$6% zYoanw+3bdD^W0fkDyOwL`oU$<@W36qSD#(;?#tPLPNurvp45Cl;^OI&1oH zVvLRPdd{X}k<;B^Vqw|)4}-xPIT$1-l#f1JD&FKpLt(ZLgcE~AAHhz3ZL)WMwAV4Z zGPZT4aJ{%&H{va$3Jj7C|0m^VE{wB{;qi?028?`TD|dRfc=THdQN{eT0nUcPQcy91 z-Sa9DsNw)|%%s#n&D=JKQnc7r-!VbqiYMf(Ed-4#KF%m|vmC6q-i55+*o zcN`bX$WX)g4T^gkZRKDW4hW^g0^c`m2SK+$Jf$R{Vccm7d67|m?c)xY=qE7rC1)A1 zGv}EPOW)Gub&~_&X94{t0q57nF=#2r{_*{m1AYz+%P*E###hEfL08yT(LjEcmEXeS10m({SN)Nz;d44lE z!=aDnkKdo|=d9y42C*80Y{pUFwxZwQyq;-c#Vzb(+(68;UXMt@36wDw1`mkV8vSKzBI<0!@sD{c@BUAo_OpW^?oOS3iuhLYWGZUT_ zsDD|oEu`)hZ(~$Z3~>IQVWoM`m1&$I4qo6v0wvE7oKfgFfDG>Rt1Fhbme@@lrR`bJ zJFHYWv17u0+C;okZ_EUS%R&KTTutV&SP;t#AW0SFZqyKNmY@}_ip>4qCxA#Z29mL# zxxq%UaULf|rk&EZqAXqP9tUlQOWl%||07+`&jj)Rs=xa)20MfO3I!i&H>q<5M-yTwXJws2meg8@?{~lI7(R&-4#}o3LP=gg0qyIOAuc& z2gKGk2|<0iB$j8OKB5M@-1|y**B@4grdQwm4PrGT@TU9JPCu9G1@(=jPDDxIlKdxK z%AyM$JVsNiNT*tmN>vatGpuJ0RCrIzPhV9t1x@qF!P;S!*){0!O*hcRm4Y)LCiKL| zptvmJx&qO&`buL8jAdyruGqVjz8oIq6UXqmuVKL!>;njSrn4vNkucM9KgI-_XD}8A z$+}%5(X^Z|%pOTYpnscDdKv^qaozvv?y@GTNl1%TVa2;WiU^-QkXgRhd(A=L6FDT+ z^or&zbe53-kK2PBMZcX4E@psiCAjnJsQ2PKa3;1xi9$opobuK`wcu%}o%SP}mK^g_ z^Efb4A}1Q&{LIf)+ddf0;{=mFYy*%P2M1d*k9~2DJyQJIK17Jy2!}j+`_J4vD(apvXOk^FZ`DZr+<^BNY zbcNV1=fo|sPn>)sCQG1v&qhL6!UHr|on?V1{tw!0d>0WC$JBQ8PbpU>fF#dlTWz&o z?}_Y%F~ReWlEk1#magW>UD$t1v`>srKVI;xwIq)D{cZquf*+r!#oA`qAU@72af66} z?of2uZ|sMlSE@$Y?-jX=D3t~4m~gqW7|VBS^`|Avo0)EE2>c=jHkt1Kd>()qWCV<{GoI59y7L13#0O**^oxH@zung)3If1e+wJZ zV5w~8@|c$F{%*@RV9zUYm{-JV>*D(T7nH{R+fCR(Hj|;JLh z*2%kA==o-s9PkTrfP+I-#7hEqD-=OKo6sbSR^(Y5`cm?8c!}Ku<Q7L4U==;-qAfRW)=z>wP zZcKnV?}qZLG~ksT#7urDvSERia3Va2M$HL&Ppu_Ee5EehCIt@upKaO0kDVMY=c~uEtO^L3n_+I^7_x)Re5y?< zTI^y)A5g)E)tg3B3B(7TWH9Kj67om>GpC%_ru}s@L-OlRNJMtzc(2^`#e|KiMOt&l z`0EKq&r<}}Zwto)2baWTE-d$b&g8^+DSJ&T^WI`Z<*$6JS^8@Z1nh~-rdE<N5=D7yMgl8} z=NGNYb%%k(G2g>oU~(OwI}6F)@wR4iHby~wtn34M1b|b6EJ;W8+!VZ#yRT*y`#WO6xZ`+zcf|fQf{JF>i?5tqXcNm& zrND0$L}5@y2XJ)?>OT2Qw*SD7;!vd$494BLac5e&v(uNeaJ}F=m{kGrQ^ta;M;Y7{ z*scAsMf3XNgt4v>(wV4JzOZ5C$8|$P53UWAexAODoL7i}O3o^QGJ?O@wbMi$P@?8V zJ6FfHgo(R8yv!BdhU7(pK`{^+0S?c#aM|`jHm&bqvkRGpvvp$(Q6s)~SfQzLze?va zbse$8m5OYTBKQ!C+tO1P4d}WZSzv(A0O{Ixl#Gl^FHE~ZC<`qetwjJmZ{6RV9DI6m zR7hzwqa$1?5j9*_*=b%df@2Q-T=6~o-nD=d@%z@HR2*ZRc>0BC<@3so zwSJ$Q%pOQ>iO^*un6z`Q8V6^R(D&>yrklR-C9#)%5jEB6WcEqc)3qa#cmwHIIySz# z=8Ux+0c(WZa{r9`sdJuRSpJ9Oc)W(4#);7Ot~F3wH8IR;{F&IcsxWc`)L<1f_x^E7@RUc zzc?io`=LlL@9l_@R-a1aY1XM9=Jc9>tT>qot&5FL$A(!L+RD&Kb5h$?0E^$sIOyEN zW_2&y!ryBCp=5=sga=^iBNj%&QAzci14d!!(NLAOhTazd%jxDF;y08xRDN>Wq#sQ@ zJO+M^h9ye+lMRoGWs3j zSBr@@v~!;s+*SH4#e_pTH_g^ijH9_%*jKjurC49}vTw@0GO{L7#ze4#M!Om6t@9hg;T|Y{Ls2@#Du&v-RsQNV64DM*Ht*wDe zn(ZAOW;oKSygZ)Mif`X;KLMR+1Py2?8l&sPOoT~1%ICq=l#DA5P0O_v6x`FNtwg9$ zbz_@0$-jd_f$!Ht_b>i-GjF{dSAaL0vvgq_WxIciMxOIg)52pE1vz=5_nL_Q*iwjU 
zKh6KX=~ZYve$EwDXtz7^6gD|m`)oYz2?woRx4t+glpEM=fi&RZwJ2Se8+Xm??T5b- z8@YBjBOQuy-eU%HaYw99A-MOynggqP;GHbkcXcGpmo7=GOn)Cc$#`ho-27htyzmV9 ze(nWvjCR3-_kucVk7SKz&+5tztvd8_(uBF0KE!hDf3%uQ;M)F^PHxwihcS zCo?j;Q3gWZ6zgWTx5_v;X9q4(M^|!9oY&ue`dUX$%b*wk|C(A@nO}lx!=$7V|Aj$B zQa_ng;qnha2LQ1HR#IO@{k+)Ju38#;h%TOd!r`DY!0S(+NG|TSYGLw{uV8)n?N>!} zSy@@magutto{H>0s%3m$9J8h^|38)KrM)WFo!)gnZP$%bG@J%HW^PDTl97mt8W8YNC zK4rh@CC|zJ9bv^~JJGI;J856DHSGqz>L#p~-8oJDs9lXrXMwyw9T)H~$C?XQ;lPY& z8}`Ha3an~Qlf_O1sW+;tL0WVc=EfDe#N1Zr)+1l9+dE%&u;@{Dz5D^>TkOayw;bp+ zILnwii1KA^wUhbs`Oj|s2xCk=Q0fzW*~EFzg?@`%So~9mJ`5}FutJ>c37u%;F2A5` z2x2ClB@CPy^7iqwvcVgF2&>dd+P7m8Pj4@CBU$P_d`L*K@S9h`^}P_vfjuw|-Hb>} z7@C1qMBj8Ev$AaCraQSH0y3ejyr!l2Tm33izDqKN_EeXZUszYg$S|gsbLY4JUvqF4 zd8w%{!S!gsxqm|*?yWmoT51b$ad1RS`T6dxO|W_`q`YW)cd)=w{5-_QXW;^&0ldcjIkdX!S?w+DZUn<*Zt+(L?#gV zXJcb3IJJS)tW=bg34*p-XC!i8i?a$nuU>z-tu`ZL-0<))5dXRD@$vCMS`ZEU?%KrI zz<{8rs2-U9`Yq4;Oj+3?#~(K3G4t{AExG4kZr!|T(Vh70GanaM>g8kpa(oCs`n=XP zaOM2djYn_O5=IP-{z)&z1UMi!(I>EzjsC#tDg+o_z)eU6Cy`z3q3UnwV6JNWb975m zPCHfN<$XlxCvmG)Mefx9X=K{4p?)vm?s|UeTYIlWvi%>|5a<58Wqpud;ch(VYlf}p z59W8CxgS#05y6_my>mJ}z13#nHNL&B^2h-#FY3PLvw{5{>zIv}@Y*rS*-OgiR903N zOskyqvIQLa4hgTl^*2v{v}K~F9FlVu%S<{mS6@KNO!Y2RTc`-0tCIZ9x&axN zOFf32%uNeVS6#-3ZXAL+^PCOi!~eK7$<3`4PgJ}2lT`K_s%7d<`V-6@Tr6%xR=;9* z3AoRRM_M(K9dC{LRb}0%Okxke^K1!BnQ3Uy@|y7**6eIFQM)hn$Vgllvb#zb<*TBp z8IhgM$!l-ywaAunI9vsOEraIH)1SY7rF1xsSA9N7rk47<9${f-CVse(25y@3^YH;o z_xAOr{RS-n!4qjJSb@rp_qsZYe{3ZM^0PcmfJZ8(n1BudbK|FY_{eS z9P9S8)^T-g_GQdiVtEUse!ul2qdX!#oeSp*dTG=WJDDUm_a zg6g^|%AW7~b5Ph#?1|KVrXqy&^k>^Uke-nj%+Jvi@Ce24)*5A5kMgi&yo_);!Rqx8wYm`(8{4MQYWDG(fSmK!s3`ekiyZ2>?D5#Nj*gD; zaf8EY@Z%7pb)Kbj?*}|n`U#bSuRATt@Y47l@{M)Ah7i;2r>)y(saPfP!d;NFL})ny zM&|o3RHd--{UwveoIva&q>1TaMM=-#Bd z+Wza8=Sp!a_}z&Xm?=PxGE_!DA>3qTYqq_N7W_EIr3zbQ3ZWJc_1ZHeFRC86e%4QD zZz;Y#Qar#gmCO&h=v&K1sQB!%%t!w}jeT`oRBPAv9uV;m79~j7lyv6+3P`7*NGK>0 z(%qv72neVk-6bVRC_^KNC`gM)cS^(1@h&_^kI(aa-|zSNmpbgt?0w(&ifdi#S_>f? 
zBpDttTa%QxLI#FQcV>g^P@g}SV5iQTB>iJFpg?!&(xufLIO_eG8jQG`CBM0m92|K( z_nSX_{8(C25=6r{vpJcdou}(xQBe^Wr>Cpi2PW z*^HZNGuqi$nwc8|?Sigcy+D6|E`+57oX3*T8hSye6xE z(OMIM_h2gxOD7>f9)7)4#`wXzWBX=#^??fJbj3T%w!L{lV#UH&1!Yr52M$}S=#;gW zl8}-D(Q+W^3ZV|EhI!`!(K1uqBnpeguFiA@3h!==hG;};sH>ZKu8n;TJ+R&h9$V9| zCYxd+A|kYTdP?jjLAPcF*tMaaoA8eL7T7i>6THN6Up07Lm+GiMvm+h%+V+==vI z7Q{-RLFb^$#S$BE@?Db1G3&>1f_NrkE_D>aNv1y@)XaD$iZ}<2hScvI#nv6ZFrlUC zPjwv0@cWi4q4(S3LkKZoGeSkQSXu2cf*syb%5D)?AHsXr+Q3cR_`q8O*W%}Xcj!{z z@X+g^2&!0;H;Z!BHjUQkKi!T36qROFDb?Gie!m83vKezRVOG~ZeZ6dn+!cMiqOneo{e!$~&a6hUMKkY4qNCJb& z2!}YgKb||+TC$*JVj+47NjQ=B5U+LYg>3+v2N^udarTm(9m5|gP9=FIr)G}Zvc8&X zwmFA1N&XoP)DEN~`>xia1mBs5o8L-YWNI=pk5}*C&e+0=VA5a48@RHcacRpub0=*Z z5YN$d=Lbc^PQi&j>__!{!&Cm_EBUHnCQpSVA#SP8Pey$g#l8SeaA;lp(WS8KH;n1R zN@pJ|mc2!H%$0mFj&18yavZi)qpn*oy~-fL)7C?gq40wkI!+)e`(XtI>i!ja0>Lw4 z60`yz_MD(h>FD2!y3oLy`{r0lnwR+9)uY-?QwJessGgnnMWjnasP|Kq`~7`#Hf zEO=w_XrHs`OV19pOSyPxI~e=ya3>+^6yev^wU~3mZVZQ=ZQ!hne4%)U(u*YVy6M~> zftTSYRied75kFOo)x;oq{rcVF$AsVA3$S!U^DNoqo&t6xv?-TnaqdZLbzB4eGv%`n z-Z-e5Y2Cqj$e`?>{kaPbAi&%+k@X^4g!YONtmn&dU^j|ZKVO5mgFj83I#kVMZ@dk%*!y#KrrVU-NZ)60#4$lB5r=TmZ;WtBE>Aw1#+MX(QQ zj+>|1t zDEN_#jErctD<(h9WF?B8Hxh4W;3)mj3wi&pzqq#p2*2BaiRmdGS(cxCvdTww#m6X9 zU)M~;bpLICbDN}G2{wZAA9?4aL_<3TF}N2awk1Ky_A#`91h3evDyYYPU)_rGeH6ar??CcT@*TZ z&JbAW#ygKxKg8#A8wmY%j2{9KZjf(k z1t7b+o4e&}Q1q`lZ|S#oA^Xj2j@L5aOWs-r?N8@pPH)tgPU35D+gzsj^!z(T+KZP| zd|jjZ5jU7PI9@&auO5jmFt^@>$G*8M;O5#=2sJA96_spkOYP34{|cj|t*tqO3=Iv< zWWXvbHtmw0mU2(h{ONh=IaTR~?$rDF7jDfl)Iljf1WPxTTOdD7mQYbAKGLB#AI*65 zAZzhUsgxBPHR2(%NjdvWAfGi%ZfpGxk66RPQn(ecRHWZ`bQBsl*7y*cD&^!83z)X2 z4pg|wN=vs%tpR`IK+Ow?KVMTIo>#M(Ue~;SJt{hS4`NKO>9mN*8Kcqp{xX$F&a;e+ ze%w_yqcU6G;ya61nhb8CmweQJqNmt>glO?ooQI6gQ|R&e6fo??rsZfzU^;T-E(?@a z`b6#oXBrsn>eO?3LEQPCzmMlqcDZ_k>?y<|>if%g$A>`P&1UqYnyzVyqDP#F(>(Ov zq|&$1tRLt(8Azp^z2e(Cvhh+|UoB*3X9rwxB7G3O*w!cnP*W)(^E>TsCk&VczDGG8ruUp)haUV)xg0_$9Pn- z?1Ez&TRFjNnHC`-p_`eS>^enHC_hA}z^48BRWn`qlKGcNuIQBVa?u$94Ug}N0Ueu}^UUt-xHTH^$Voh`^qT+pJY zs4kEydwpqnS=equ@xgb>0B^ zQ1R0(V2Cg`7pXISf2S{6J?x}RAtN(g`(Q2#t`NC?Khk_jqcc_moW-fe~OUwq8)Kc zSS64S;Np0-noKE%YOKD1Hva7iYJ(Xh|yVVp!&Mix-g?zKusWTT_=6SaWtlO&qq{3q&IP+@WmH zNp;0zq|nCV8YVpTcO-U~W~hMM0kdCKA@VRVYi6HNKMM2mkugigv&gdP(2~1RVXJqe zhB*=nzJ6GC;UwF?NBkr_X`(Xquvz+9vl(pir9dpgek7oP>Cw3}ctlvtnciy7{e|(G zdsw?e?9GC;<13y^GY$(veoM<1Lv}53Wvd2*KNcFj0?VZN%WDZ1g*a?PraCGJ>qJ@x zHfmUR(r#X7b$*f^YxcB=xz_| zTXUAp1dF>_Wf2ee`kGm{L}s@(Nbf{DWfd5lC?ByFwLJFMZR_9HyBxNTQjfxw`E?a9 ztnFrZy*DSU>yJKHY%^VE`2F@6kbOethA=1#DSaF{IXPY_GSRy2uJu)ipqYXr~*UpoS0*9-(wPA|KF6 z++&A|r937B@Qv70CkCwMJL>qSr>g`&Yz%ME+^a-4Mqw|f1D z5Choputafjaj8Iv;=H}RKOR@k)_TovnN!;1)86&*ql}*CP89d}R9lKv*J-QrJNNH1 z=7>AF1sBQTh4wZff? 
z?bWwIE`*alSvM&%wJO9FC;MS)hi9A$G*NqYyigaGJZQ%LoMR?pe0@`PrpK5 zW30Nm+PEn?Dl&3W55lPQ4{2S@Y-~`#`2k&>olvYY8`L#8GNe;pJSn#2MH@xUJb!#= z&ukO_)X^usNtP!Im%|x?846No#Gmg@GT5N4nMbz7`eLaQ&ze0!#lC#WXZDqeNIRNc zqf$9exbw@GrwgTyb3Ls|SEi*i%)zno3e7{Lgj>jEgvXtA>h9PdRd8bW7jYKK)ZJ>% zSEafeSe7W`v>d=6CvUqbdU4FMrF(C1;i<3-j(bzirF2Ozmll7>{eE@WZ7oji%Ds!z|>%`G1HX=wLWn>Mf% zX4S1xAU1IJSqM>NW%}rG4mE~}Vm*9yrQft8eW{Vo(~@nZ)bNm^u-GhZQWj*6ESH$BX2TX$3XDwl+1k7X2x`^+$hGX zXMR->wn4e?4o_Yaa4oha5({Dc&L)=YV%AxJ*?9a#bV(I?MgU(W+|SSN?c28yuQd5G zn{Ud9h?D`SyWIHx{d<5Y6JtH0gvcD((UHA3vCvo2@$Fl9Wieze&CSga)k$@oH~eWP zgfQC==C8o&CtRFABanh!jkEf!M?}&|@iL4)pi-!fRTpc19hQ;3EX9QWsYW(378J#r z7<7{}oF;8GFXb^BrB3b3c$hE2>K$bV6HaF7#z(`-^X$w|BZaC#9wX=@OfmnwZYhFCYRoDgB)+fxuBAf`+D~&kQ@BQlI)IiG$fmpD5JmWg|4y ze_R!LsnQ)`y{^Mh5<#BnWl>9DZ5)fu=3-`L6-lsN@Y7T4_sqrK8B-{9f3@c!&LmEz zO@`uC{%w-G4=>I=>UBqW*!dKMYSR`@-g@h(J^JicMxX7(mRojKdU>t3uXCJy={Az^ z1fSRJ*M<46hpk_?_-(DUpaHlvRBIj>h!>K%I+NZt`_21SSmo}!RWhe%WUN?3%bnfQ z>X4zjUEi9_dPcdnU01?RWiK_g%`Ux>00Bb7@ati2c;I2#Tl{DMN{{9`e|U3BO$54T18;5oOxlN>o(fI|n(;ScSMyr3IqW^j!qhc6mVXdxOE78k{rw3U zx%2SQcE!zVNJyW6dSAW~Embw67g@1@d)Ib#5RrK&JwklV)C7U;E;w@^56b$04v0z}i}Y!CL$ZJDxMy5c z2rn{e8v#Y|kzl`P@Ir~@NRyDg)dZF7w*yKe?N0jU^ZK-;2ANr3_D7mRzG;d@bS>pA zCT=LwS{%k~-I)kLb&e;W1{2E4%4%RR2nGkNe{@t7BsY@0GLt`FHSVvWQc(u8rI9+# zw*Y+hWram(H|DDzY>1Su5Ou_B{?@MHU}V28vcKv0_T%A_++Wb zmYGbhGw4l`f6?ViEY-C)GBuSeynDw1N?N1!Azf2l#0SS2_VNuqzsW1z-@luU7RDSCAPK^Z-mD#XuKHC z2JcTVoVond3)ek!*!fU+JzMLvfF-#Wvj;OKs}xuJ43%?CU7*64FqbY7*<)GEHhX<~ zwgd@Q*ED}{roE83@0jk)mPVaNjNfIU2@heRs#7h@cu3rdaiY4ham#+(sx*iGE(2Be zs8AJv-gDiBlbzYxrUA64PJPe0$rDW*@NtS34|~Zi7pficjyWTRX3BE;t^>rn$Y6<9 z=lA0m20e%1t)$(obYB&Aj$aNc@t?eiV4C?pAk7U8G~hkK4NG^7LzB+m{!zB+l?)dn_fN zeyE-a`T{Gk%j)P#KVA;|N*-!~SUu^MK<%t#CS*sidDIasE)iU3lVRrH-SetoO( zZ`&;>D!jU_tfE3ysqOW1-xiP#bxvw)ARHvOZsg}))@(8TOkl0drXtPzWUTjCI_o)~ z7T37qIF(O1;{~ShxJJM1-)DY|SKvkvp7S&)eg2rHYG|o;ZTsF_E#Js)-A`vBQaD0# z|5ei@``}k4AoZsj{&8JIoSqD-{G6e5-PUH4F5>2O!{V9bI5`v5mfpO&i{1j2{xDbO zMh+E1;{S*D)2FSB#B;`Nc2fMPMBAchIkS7t;4`x@_d9cET8;PTeB3!fW3wCYd>pAZ zs{caK`fCkZ&^!rv-+2m_4<~t%3i!I+&eXg|nA{HA_M_~A2IoV34u4Y*t9 zwajkv>eGh&)~q!eM-)g`p7h-k@)?}RR?Av}Yp-Ic2yg>gx)XoCk^6D6(x~$3kxQ22 z%aZJvF47Z0E+hf+@?D26u13yTX?SXr7FV?0EU|Sa=R!W6Dta&w8?8Bn5P^bkwg#~-#^}ayr;V`!D919Twepjl@g@VT~+kjF?spxr5qvfSoA9L#pTM5!!dX7HT9L^BL9gh+=JeOse zEE)eoNQrs0Ng6vkIzE2<`1PwjsHF^p8tl9+2zSi(yv>e^x(WCoBw>4}-X;7K2gTdo zB_epE@+d@@@uk9G$?H{DqE&EQC4@$sF+%H6b~kMJrPFG-K-25?L4HHzQv)U&C?Nd& z-#&l-TvKz7hew0g7<${tTUH0h#>Q&>j@{DH@m%kBO6C%-|L5q6&^aa};?8r4!oy5d z)%5ru&f7DJPS3|EuqeOVGOj0x1@Q$^a`^5~j4L3@+bNJwD<~*v_oU|}J;nbZ^3B%n z`e4=4$I~>suJ8Z3r3z8xRn%<4Se|%cIPvW|=cOZhOq$`%+l*^<%l->PKC<2;b!Emd z>f;HMHuUU;NvFAPW#PVl9T^$9q!bM#yd!ME!NEt59;LY=$h*!bATSxC%y{V%P>4L< z<5i&7=}@|4vS?YNq3RzGXS@e(QSO?f@2iDv&yElDHhORP=QYk}Z2P#1ejr5_Pd%~q zF>GJSHu^u4A@w^TR-+Wde^9|Q-3epX_|Wf)L2^Gk4=#~c*Q*EHs7NFu!F}xXOGjWg zu;S${CTtvy^cuUN>+5_7k--0Y9AA)$DZ`@o9f$%w@%Qg8wlV{o3a$M^L=l!+5Ij3=oAfegy4NV;@<6&Mfzw#nXsPRZ9z z(R>=2t8TA64p}RS3vK*Gu$X!J{fyAT2CKM8ug0-QWrd5X6_a_>#ev#B-vM$<2r2c! 
zGEJ00sdY4hS);euD$nDjR+Z;d6vX=xAuBu z^m=o(Uc+hhS}Xaa;!Y>3tY5VvA&Q42J~RU3&rD9Fp`l@AW!01QIg}A3aPHo;?dJ{v$L0!V?S|3goT02@w4sv@+AlS zETlk9&CQgRdMdx}e!@*w6_NG*GKsgbpz@(#Jyx!~!%F{*mtW8*w)F@y7Wl_er?0#( zNMQXX$y68FE`3t=BRwbQ!RY4;y-}56l?$(hpW8ATz++3u{9#aTZ|kj*SLcg}Pv8mK z2Vu*OhNZD|^|#e@Hxtxt?OtO*s)zpg>7ioBN@)~r)t&=8X7s3knR%#2B5m<7iL@1C z@f9#A-&{*Y5ey0E&k*qa@`+R>ez-GLa-QQ za4Tz4yhZFJ*0T9kpIf&Wrrl|y;aQ9j!H6Y--2Bt&ZCik7mAkpV0f-FyaVgsDHjzK6hi8qyqnKWShi7aKC}tZN*Ru*2k0Z*`=RgZ+dPKEjLl zEA`<580+Ru>NF0I{y6ib0Y0_9*W+WIz3y9kRx2e^7uf~yFfswZSF7b`I*5wp>C&A; zYxDs#zrBARZ6_Na$Dg1P{tee*h#;H4vlRfSh!u!75l_x~U5_m%n_}Ooue179*7x;< z_9*l}bUef-^!4)#ENi@X|Gr9`@LNb`VfO*(zJ$Y3gtYSg zlU#0i`R%Wdr)vjJ6sgwFQaWju+6e4-8kEF{dcE5t2^PgiIPK~%XQlZynXKUeyZDFYiPU5`{2Un4Hj1BPHh1=^AJd-NSF%{-M5J*2Ko% z-rm}}^;C_^{_Z*`rm3uq@>}!(&A}Yh($dmg(aHWu?sNIHc%%0O*7IyO5jtY)w=bt1 zc=-Hj;jKvop%GWhRaInXSa}#<8TPiR5L_HHISirkp`@asiY7BB83uMMOy99@PnA1y z;sofYK?n{*NKOQ`2_3QyT3)c7Rxr3&mmKev+_8C|mO$6=E)OD+(s>*o3u9lvppY9^ zuxjO)0OJF9w$u;FZJRs9?m^{voR9JD1nzQGf9JU>K{YBw;(XWfW0J?9(mODqe*?F& zlERd<(0MA^?Z^;K+G873c~*#VOo1lf@_J}kWkT*Tw)YpI1G^J)D~w5{h(!;t=gl*}zHd>1LdbQ|Eo&>qjbV0|~8gG(>}`QZ}&Jh{m_ zrI_(-w==igKOg@uR;F_XA`P*mhcW@OQ>dSzZ8|FFvZk+v?&^wW4E6p6<8gmC=+gB8 z=rL85Nl{U;yu94d$S5v07I*{PJUr4yOqg3mCN|k8_pui|ej+rVc`=#V-jA>~& z_m9w5Ac(5$!B&jM35$rV0CWjKmi)F$HF-5)5XmzCT*Z&s>{U_j^_1*`5jn@sHc&)Y zX9Or5z}2%Ez;uFvA^(9_A0^-gX#NwpR^73k$*x1-43FUu0ew6fkxr^Ohh#IUHKRrJW5AhrT8k!V4 zUy>mmGzDXs}z*k~_*S2XWwm zG)2xH459*IjJB!!fXlpX0FNjDLM^oiwuhX^Co?m%xLClV6S!n-moIN*KfRaX-Dds7M^KA3KkRc$5~mQn*| zKECVi>G%l7@&G*&6&1~S{aQ&;k;$Z{re@3f(!OvC!3~MnHTe>4d3m!O)tHKrM zx+5?G9o3n?k0_4hwg&J5lULHZgpg!Rg^~{mcQL>XK zZEbCd;EyEnOP_!%7n7ybN_evw`Ra~?9F}St7Q3;9`w@{Oqj`=cEW=)1gXG}iPAJ6VvkP{ypc~!o=f*2TvGP2cMmt>)+`z9`j;q_sjUpbh4bqc`)TU?DCyd2~(1~xmwiMSko z^Xk>BprD}4%xNI=)YjJ4^#`e3yY>lAFci=doJam*U|>k9fB+g8NOL`Jd0>wg7gxFB zvRWO0!VZKJKC1zkvgQun-^Im6O55K4AkE1r2g%k0@En@36#2im) z_*h^n;uOo}%dx@^W>6)Ihgp7YDYEzP-aVoREQIqiPffn%P`8^Z8hT{X-TSM&(D$;uy{`Zne#)Z3 zsbYP=4QfNA#Ka;l%VQvG%j-CMqjGm|cN=m-I<)5wm?g*A?~)5WxF21_985frhrb#W z-jW0A%ar!lo}R7u-h}R5ka-F_&N|P2&xc`k@)2DIUU60&U|9Og)nawvPUzB2@K2U8BqM(~MZ?>`D#hnd` zFh)97BPJ>V2_MzAv$Zu*SBR2tT=h+P4p8=J8C0JV+?PMEuG%|n7Ny^742MY9z3VA8 z&tnfom&S&M2 zu4#s|(AzC|*=ZOmt$QH@`~cKmot|L_#xO*JK*G&0EF8r@)tOB}z}ZB5HAv=`KnVdS zMev#iFQW?}2E4@`r`j`24_g}=o_pQynb0a?Kh%}xAlVqRFsk`Qd1vi$&;Dr z#B<)M0k+NKT)><=0{hCGR7{>{KYn~@OO8Xo{90oXgHmI6cefOWQn1(doPbUwsPaxM zpy#f3d^+IZFUMF_GJi385)nbQn?;<>EfY%8lMj>qfVd|;Jrlg=sFWZcNBJ0$!I?>; z6g#vafCTI|=(GU)#f#Dw9~?q&Asv^48mMS!Sc<^aw#MEtx|WhoT@4~h>76kGRzwvC zuN8!yOzuDw4)&0u=QE6nldv<;*;udI7lXhuDQ+*dJ+Sp@iCG3M&{B?p029ywhqQrO zEMt=_bQ=n*MOBK#P-|bP+|F%{Pe@R~4Wsu`yT1TM&4=BseUXg`!h)C)kXPt*lhaESdE5S~-|^XO&pmmO)2Kju>>2 zq;VIf2dmI?JJfS!%9f}ccJyE7(7<*8O_>c=iYeEeG&#)ab@Cf9uQxll*B1hnYm)tm z2OTyR@3hI?kWYKD5G`bv{#2zV`J~#G&dr;q`iW_|qOXjgBEDnh?!H6T*)n}xx!j=2 zQ*5_=x!CoU*g=I_yVBPc4yQ2sRc&O&?piMtV_|ci6A)NWJ}mzn%2=_Ks$z1>%ga+C zoim-;LGLWNNr3DlDLDek0JD@^A8%+!`B=71vVh@TA=lnEPX@Nd$xD~@%Ohv(01b?c zJQdv)z<$Mb{H1nHZ7rmtooNbG!`v5KSz%4XC*H&Kz+`nasAY3kl1#&NKYjs$V@Hot z=9(QpDxnm`?c~ zv=R_XWc&tb$eaYkB_$D28vri&IG2?z3xY+ZzJm zuFvHsWT8^cT;t=od4eM72-WCH7p`NEZ#TemwM{mxsm4hU;3_~At-!+XE_Wp0w>j({ zjHjK7Gc`BQ4*9Va)Bthbg9i_${L;{7tMelr+ClAduc)gt^qYW_rUC5&TXNHRWNZdy zjzLxe&7}YcfbV@u`@i_MgU|tkI&<(j33A~2ZzDfFJ-)>M=??$@F4;qW(3zw6Z|4Mm PiXd{*%2#uxZa?}Twh?c8 literal 0 HcmV?d00001 diff --git a/docs/transformers/configs.html b/docs/transformers/configs.html index c9097bfc..b8e20f21 100644 --- a/docs/transformers/configs.html +++ b/docs/transformers/configs.html @@ -86,11 +86,15 @@
@@ -126,7 +130,7 @@

Number of features in the hidden layer

-
27    d_ff: int = 2048
+
35    d_ff: int = 2048
@@ -137,7 +141,7 @@

Dropout probability

-
29    dropout: float = 0.1
+
37    dropout: float = 0.1
@@ -148,7 +152,7 @@

Activation in position-wise feedforward layer

-
31    activation: nn.Module = 'ReLU'
+
39    activation: nn.Module = 'ReLU'
@@ -159,7 +163,7 @@

Whether the FFN layer should be gated

-
33    is_gated: bool = False
+
41    is_gated: bool = False
@@ -170,7 +174,7 @@

Whether the first fully connected layer should have a learnable bias

-
35    bias1: bool = True
+
43    bias1: bool = True
@@ -181,7 +185,7 @@

Whether the second fully connected layer should have a learnable bias

-
37    bias2: bool = True
+
45    bias2: bool = True
@@ -192,7 +196,7 @@

Whether the fully connected layer for the gate should have a learnable bias

-
39    bias_gate: bool = False
+
47    bias_gate: bool = False
@@ -203,7 +207,7 @@

Predefined GLU variants

-
41    glu_variant: str = 'none'
+
49    glu_variant: str = 'none'
@@ -211,11 +215,14 @@ -

ReLU activation

+

ReLU activation

+

+ +

-
44@option(FeedForwardConfigs.activation, 'ReLU')
-45def _ffn_activation_relu():
+
52@option(FeedForwardConfigs.activation, 'ReLU')
+53def _ffn_activation_relu():
@@ -226,7 +233,7 @@
-
49    return nn.ReLU()
+
59    return nn.ReLU()
@@ -234,11 +241,14 @@ -

GELU activation

+

GELU activation

+

+ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$

+

It was introduced in paper Gaussian Error Linear Units.

-
52@option(FeedForwardConfigs.activation, 'GELU')
-53def _ffn_activation_gelu():
+
62@option(FeedForwardConfigs.activation, 'GELU')
+63def _ffn_activation_gelu():
@@ -249,7 +259,7 @@
-
57    return nn.GELU()
+
71    return nn.GELU()
@@ -257,11 +267,11 @@ -

Create feedforward layer

+

Initialize a feed forward network

-
60@option(FeedForwardConfigs.ffn, 'default')
-61def _feed_forward(c: FeedForwardConfigs):
+
74@option(FeedForwardConfigs.ffn, 'default')
+75def _feed_forward(c: FeedForwardConfigs):
@@ -272,53 +282,129 @@
-
65    return FeedForward(c.d_model, c.d_ff,
-66                       dropout=c.dropout,
-67                       activation=c.activation,
-68                       is_gated=c.is_gated,
-69                       bias1=c.bias1,
-70                       bias2=c.bias2,
-71                       bias_gate=c.bias_gate)
-72
-73
-74aggregate(FeedForwardConfigs.glu_variant, 'GLU',
-75          (FeedForwardConfigs.is_gated, True),
-76          (FeedForwardConfigs.bias1, False),
-77          (FeedForwardConfigs.bias2, False),
-78          (FeedForwardConfigs.bias_gate, False),
-79          (FeedForwardConfigs.activation, nn.Sigmoid()))
-80
-81aggregate(FeedForwardConfigs.glu_variant, 'Bilinear',
-82          (FeedForwardConfigs.is_gated, True),
-83          (FeedForwardConfigs.bias1, False),
-84          (FeedForwardConfigs.bias2, False),
-85          (FeedForwardConfigs.bias_gate, False),
-86          (FeedForwardConfigs.activation, nn.Identity()))
-87aggregate(FeedForwardConfigs.glu_variant, 'ReGLU',
-88          (FeedForwardConfigs.is_gated, True),
-89          (FeedForwardConfigs.bias1, False),
-90          (FeedForwardConfigs.bias2, False),
-91          (FeedForwardConfigs.bias_gate, False),
-92          (FeedForwardConfigs.activation, nn.ReLU()))
-93aggregate(FeedForwardConfigs.glu_variant, 'GEGLU',
-94          (FeedForwardConfigs.is_gated, True),
-95          (FeedForwardConfigs.bias1, False),
-96          (FeedForwardConfigs.bias2, False),
-97          (FeedForwardConfigs.bias_gate, False),
-98          (FeedForwardConfigs.activation, nn.GELU()))
-99aggregate(FeedForwardConfigs.glu_variant, 'SwiGLU',
-100          (FeedForwardConfigs.is_gated, True),
-101          (FeedForwardConfigs.bias1, False),
-102          (FeedForwardConfigs.bias2, False),
-103          (FeedForwardConfigs.bias_gate, False),
-104          (FeedForwardConfigs.activation, nn.SiLU()))
+
79    return FeedForward(c.d_model, c.d_ff,
+80                       dropout=c.dropout,
+81                       activation=c.activation,
+82                       is_gated=c.is_gated,
+83                       bias1=c.bias1,
+84                       bias2=c.bias2,
+85                       bias_gate=c.bias_gate)
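For reference, a minimal usage sketch (not part of this patch, and assuming the module path labml_nn.transformers.feed_forward): calling the same constructor directly gives, for example, a GEGLU-style layer. The hyper-parameter values are only illustrative.

import torch.nn as nn
from labml_nn.transformers.feed_forward import FeedForward

# GELU-gated hidden layer with no biases, matching the 'GEGLU' option defined below
ffn = FeedForward(512, 2048,
                  dropout=0.1,
                  activation=nn.GELU(),
                  is_gated=True,
                  bias1=False, bias2=False, bias_gate=False)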
-
+
+

GLU Variants

+

These are variants with gated hidden layers for the FFN as introduced in paper GLU Variants Improve Transformer. We have omitted the bias terms as specified in the paper.

+
+
+
+
+
+
+
+ +

FFN with Gated Linear Units

+

+ +

+
+
+
95aggregate(FeedForwardConfigs.glu_variant, 'GLU',
+96          (FeedForwardConfigs.is_gated, True),
+97          (FeedForwardConfigs.bias1, False),
+98          (FeedForwardConfigs.bias2, False),
+99          (FeedForwardConfigs.bias_gate, False),
+100          (FeedForwardConfigs.activation, nn.Sigmoid()))
+
+
+
+
+ +

FFN with Bilinear hidden layer

+

+ +

+
+
+
105aggregate(FeedForwardConfigs.glu_variant, 'Bilinear',
+106          (FeedForwardConfigs.is_gated, True),
+107          (FeedForwardConfigs.bias1, False),
+108          (FeedForwardConfigs.bias2, False),
+109          (FeedForwardConfigs.bias_gate, False),
+110          (FeedForwardConfigs.activation, nn.Identity()))
+
+
+
+
+ +

FFN with ReLU gate

+

+ +

+
+
+
115aggregate(FeedForwardConfigs.glu_variant, 'ReGLU',
+116          (FeedForwardConfigs.is_gated, True),
+117          (FeedForwardConfigs.bias1, False),
+118          (FeedForwardConfigs.bias2, False),
+119          (FeedForwardConfigs.bias_gate, False),
+120          (FeedForwardConfigs.activation, nn.ReLU()))
+
+
+
+
+ +

FFN with GELU gate

+

+ +

+
+
+
125aggregate(FeedForwardConfigs.glu_variant, 'GEGLU',
+126          (FeedForwardConfigs.is_gated, True),
+127          (FeedForwardConfigs.bias1, False),
+128          (FeedForwardConfigs.bias2, False),
+129          (FeedForwardConfigs.bias_gate, False),
+130          (FeedForwardConfigs.activation, nn.GELU()))
+
+
+
+
+ +

FFN with Swish gate

+

+ +where $\text{Swish}_\beta(x) = x \sigma(\beta x)$

+
+
+
136aggregate(FeedForwardConfigs.glu_variant, 'SwiGLU',
+137          (FeedForwardConfigs.is_gated, True),
+138          (FeedForwardConfigs.bias1, False),
+139          (FeedForwardConfigs.bias2, False),
+140          (FeedForwardConfigs.bias_gate, False),
+141          (FeedForwardConfigs.activation, nn.SiLU()))
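The aggregates above only switch gating on, drop the biases, and pick the activation; every variant reduces to the gated form from the paper, $(\text{act}(x W_1) \otimes x V) W_2$. A rough standalone sketch of that computation in plain PyTorch (an illustration of the formula, not the FeedForward class itself):

import torch
import torch.nn as nn

def gated_ffn(x: torch.Tensor, w1: nn.Linear, v: nn.Linear, w2: nn.Linear,
              act: nn.Module) -> torch.Tensor:
    # Sigmoid -> GLU, Identity -> Bilinear, ReLU -> ReGLU, GELU -> GEGLU, SiLU -> SwiGLU
    return w2(act(w1(x)) * v(x))

d_model, d_ff = 512, 2048
w1 = nn.Linear(d_model, d_ff, bias=False)  # gate projection W_1
v = nn.Linear(d_model, d_ff, bias=False)   # linear projection V
w2 = nn.Linear(d_ff, d_model, bias=False)  # output projection W_2
y = gated_ffn(torch.randn(10, d_model), w1, v, w2, nn.GELU())  # GEGLU example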
+
+
+
+
+

Transformer Configurations

@@ -328,73 +414,7 @@ These are lazy loaded and therefore only the necessary modules are calculated.

-
107class TransformerConfigs(BaseConfigs):
-
-
-
-
- -

Number of attention heads

-
-
-
119    n_heads: int = 8
-
-
-
-
- -

Transformer embedding size

-
-
-
121    d_model: int = 512
-
-
-
-
- -

Number of layers

-
-
-
123    n_layers: int = 6
-
-
-
-
- -

Dropout probability

-
-
-
125    dropout: float = 0.1
-
-
-
-
- -

Number of tokens in the source vocabulary (for token embeddings)

-
-
-
127    n_src_vocab: int
-
-
-
-
- -

Number of tokens in the target vocabulary (to generate logits for prediction)

-
-
-
129    n_tgt_vocab: int
+
144class TransformerConfigs(BaseConfigs):
@@ -402,10 +422,10 @@ are calculated.

-

The encoder self attention

+

Number of attention heads

-
132    encoder_attn: MultiHeadAttention = 'mha'
+
156    n_heads: int = 8
@@ -413,10 +433,10 @@ are calculated.

-

The decoder self attention

+

Transformer embedding size

-
134    decoder_attn: MultiHeadAttention = 'mha'
+
158    d_model: int = 512
@@ -424,10 +444,10 @@ are calculated.

-

The decoder memory attention

+

Number of layers

-
136    decoder_mem_attn: MultiHeadAttention = 'mha'
+
160    n_layers: int = 6
@@ -435,10 +455,10 @@ are calculated.

-

Configurable Feedforward Layer

+

Dropout probability

-
139    ffn: FeedForwardConfigs
+
162    dropout: float = 0.1
@@ -446,10 +466,10 @@ are calculated.

-

Encoder layer

+

Number of tokens in the source vocabulary (for token embeddings)

-
142    encoder_layer: TransformerLayer = 'default'
+
164    n_src_vocab: int
@@ -457,10 +477,10 @@ are calculated.

-

Decoder layer

+

Number of tokens in the target vocabulary (to generate logits for prediction)

-
144    decoder_layer: TransformerLayer = 'default'
+
166    n_tgt_vocab: int
@@ -468,10 +488,10 @@ are calculated.

-

Encoder consisting of multiple encoder layers

+

The encoder self attention

-
147    encoder: Encoder = 'default'
+
169    encoder_attn: MultiHeadAttention = 'mha'
@@ -479,10 +499,10 @@ are calculated.

-

Encoder consisting of multiple decoder layers

+

The decoder self attention

-
149    decoder: Decoder = 'default'
+
171    decoder_attn: MultiHeadAttention = 'mha'
@@ -490,10 +510,10 @@ are calculated.

-

Embedding layer for source

+

The decoder memory attention

-
152    src_embed: Module = 'fixed_pos'
+
173    decoder_mem_attn: MultiHeadAttention = 'mha'
@@ -501,10 +521,10 @@ are calculated.

-

Embedding layer for target (for decoder)

+

Configurable Feedforward Layer

-
154    tgt_embed: Module = 'fixed_pos'
+
176    ffn: FeedForwardConfigs
@@ -512,10 +532,10 @@ are calculated.

-

Logit generator for prediction

+

Encoder layer

-
157    generator: Generator = 'default'
+
179    encoder_layer: TransformerLayer = 'default'
@@ -523,10 +543,10 @@ are calculated.

-

Encoder-decoder

+

Decoder layer

-
160    encoder_decoder: EncoderDecoder
+
181    decoder_layer: TransformerLayer = 'default'
@@ -534,16 +554,10 @@ are calculated.

-

Multi-head Attention

+

Encoder consisting of multiple encoder layers

-
164def _mha(c: TransformerConfigs):
-165    return MultiHeadAttention(c.n_heads, c.d_model)
-166
-167
-168calculate(TransformerConfigs.encoder_attn, 'mha', _mha)
-169calculate(TransformerConfigs.decoder_attn, 'mha', _mha)
-170calculate(TransformerConfigs.decoder_mem_attn, 'mha', _mha)
+
184    encoder: Encoder = 'default'
@@ -551,29 +565,21 @@ are calculated.

-

Relative Multi-head Attention

+

Encoder consisting of multiple decoder layers

-
174def _relative_mha(c: TransformerConfigs):
-175    from .relative_mha import RelativeMultiHeadAttention
-176    return RelativeMultiHeadAttention(c.n_heads, c.d_model)
-177
-178
-179calculate(TransformerConfigs.encoder_attn, 'relative', _relative_mha)
-180calculate(TransformerConfigs.decoder_attn, 'relative', _relative_mha)
-181calculate(TransformerConfigs.decoder_mem_attn, 'relative', _relative_mha)
+
186    decoder: Decoder = 'default'
-
+
-

Create feedforward layer configurations

+

Embedding layer for source

-
184@option(TransformerConfigs.ffn, 'default')
-185def _feed_forward(c: TransformerConfigs):
+
189    src_embed: Module = 'fixed_pos'
@@ -581,25 +587,21 @@ are calculated.

- +

Embedding layer for target (for decoder)

-
189    conf = FeedForwardConfigs()
-190    conf.set_default(FeedForwardConfigs.d_model, func=lambda: c.d_model)
-191    conf.set_default(FeedForwardConfigs.dropout, func=lambda: c.dropout)
-192    return conf
+
191    tgt_embed: Module = 'fixed_pos'
-
+
-

Encoder layer

+

Logit generator for prediction

-
195@option(TransformerConfigs.encoder_layer, 'default')
-196def _encoder_layer(c: TransformerConfigs):
+
194    generator: Generator = 'default'
@@ -607,24 +609,27 @@ are calculated.

- +

Encoder-decoder

-
200    return TransformerLayer(d_model=c.d_model, self_attn=c.encoder_attn,
-201                            src_attn=None, feed_forward=copy.deepcopy(c.ffn.ffn),
-202                            dropout_prob=c.dropout)
+
197    encoder_decoder: EncoderDecoder
-
+
-

Decoder layer

+

Multi-head Attention

-
205@option(TransformerConfigs.decoder_layer, 'default')
-206def _decoder_layer(c: TransformerConfigs):
+
201def _mha(c: TransformerConfigs):
+202    return MultiHeadAttention(c.n_heads, c.d_model)
+203
+204
+205calculate(TransformerConfigs.encoder_attn, 'mha', _mha)
+206calculate(TransformerConfigs.decoder_attn, 'mha', _mha)
+207calculate(TransformerConfigs.decoder_mem_attn, 'mha', _mha)
@@ -632,12 +637,17 @@ are calculated.

- +

Relative Multi-head Attention

-
210    return TransformerLayer(d_model=c.d_model, self_attn=c.decoder_attn,
-211                            src_attn=c.decoder_mem_attn, feed_forward=copy.deepcopy(c.ffn.ffn),
-212                            dropout_prob=c.dropout)
+
211def _relative_mha(c: TransformerConfigs):
+212    from .relative_mha import RelativeMultiHeadAttention
+213    return RelativeMultiHeadAttention(c.n_heads, c.d_model)
+214
+215
+216calculate(TransformerConfigs.encoder_attn, 'relative', _relative_mha)
+217calculate(TransformerConfigs.decoder_attn, 'relative', _relative_mha)
+218calculate(TransformerConfigs.decoder_mem_attn, 'relative', _relative_mha)
@@ -645,11 +655,11 @@ are calculated.

-

Encoder

+

Create feedforward layer configurations

-
215@option(TransformerConfigs.encoder, 'default')
-216def _encoder(c: TransformerConfigs):
+
221@option(TransformerConfigs.ffn, 'default')
+222def _feed_forward(c: TransformerConfigs):
@@ -660,7 +670,10 @@ are calculated.

-
220    return Encoder(c.encoder_layer, c.n_layers)
+
226    conf = FeedForwardConfigs()
+227    conf.set_default(FeedForwardConfigs.d_model, func=lambda: c.d_model)
+228    conf.set_default(FeedForwardConfigs.dropout, func=lambda: c.dropout)
+229    return conf
@@ -668,11 +681,11 @@ are calculated.

-

Decoder

+

Encoder layer

-
223@option(TransformerConfigs.decoder, 'default')
-224def _decoder(c: TransformerConfigs):
+
232@option(TransformerConfigs.encoder_layer, 'default')
+233def _encoder_layer(c: TransformerConfigs):
@@ -683,7 +696,9 @@ are calculated.

-
228    return Decoder(c.decoder_layer, c.n_layers)
+
237    return TransformerLayer(d_model=c.d_model, self_attn=c.encoder_attn,
+238                            src_attn=None, feed_forward=copy.deepcopy(c.ffn.ffn),
+239                            dropout_prob=c.dropout)
@@ -691,11 +706,11 @@ are calculated.

-

Logit generator

+

Decoder layer

-
231@option(TransformerConfigs.generator, 'default')
-232def _generator(c: TransformerConfigs):
+
242@option(TransformerConfigs.decoder_layer, 'default')
+243def _decoder_layer(c: TransformerConfigs):
@@ -706,7 +721,9 @@ are calculated.

-
236    return Generator(c.n_tgt_vocab, c.d_model)
+
247    return TransformerLayer(d_model=c.d_model, self_attn=c.decoder_attn,
+248                            src_attn=c.decoder_mem_attn, feed_forward=copy.deepcopy(c.ffn.ffn),
+249                            dropout_prob=c.dropout)
@@ -714,12 +731,11 @@ are calculated.

-

Positional Embeddings

-

Source embedding with fixed positional encodings

+

Encoder

-
240@option(TransformerConfigs.src_embed, 'fixed_pos')
-241def _src_embed_with_positional(c: TransformerConfigs):
+
252@option(TransformerConfigs.encoder, 'default')
+253def _encoder(c: TransformerConfigs):
@@ -730,7 +746,7 @@ are calculated.

-
245    return EmbeddingsWithPositionalEncoding(c.d_model, c.n_src_vocab)
+
257    return Encoder(c.encoder_layer, c.n_layers)
@@ -738,11 +754,11 @@ are calculated.

-

Target embedding with fixed positional encodings

+

Decoder

-
248@option(TransformerConfigs.tgt_embed, 'fixed_pos')
-249def _tgt_embed_with_positional(c: TransformerConfigs):
+
260@option(TransformerConfigs.decoder, 'default')
+261def _decoder(c: TransformerConfigs):
@@ -753,7 +769,7 @@ are calculated.

-
253    return EmbeddingsWithPositionalEncoding(c.d_model, c.n_tgt_vocab)
+
265    return Decoder(c.decoder_layer, c.n_layers)
@@ -761,12 +777,11 @@ are calculated.

-

Learned Positional Embeddings

-

Source embedding with learned positional encodings

+

Logit generator

-
257@option(TransformerConfigs.src_embed, 'learned_pos')
-258def _src_embed_with_learned_positional(c: TransformerConfigs):
+
268@option(TransformerConfigs.generator, 'default')
+269def _generator(c: TransformerConfigs):
@@ -777,7 +792,7 @@ are calculated.

-
262    return EmbeddingsWithLearnedPositionalEncoding(c.d_model, c.n_src_vocab)
+
273    return Generator(c.n_tgt_vocab, c.d_model)
@@ -785,11 +800,12 @@ are calculated.

-

Target embedding with learned positional encodings

+

Fixed Positional Embeddings

+

Source embedding with fixed positional encodings

-
265@option(TransformerConfigs.tgt_embed, 'learned_pos')
-266def _tgt_embed_with_learned_positional(c: TransformerConfigs):
+
277@option(TransformerConfigs.src_embed, 'fixed_pos')
+278def _src_embed_with_positional(c: TransformerConfigs):
@@ -800,7 +816,7 @@ are calculated.

-
270    return EmbeddingsWithLearnedPositionalEncoding(c.d_model, c.n_tgt_vocab)
+
282    return EmbeddingsWithPositionalEncoding(c.d_model, c.n_src_vocab)
@@ -808,12 +824,11 @@ are calculated.

-

No Positional Embeddings

-

Source embedding without positional encodings

+

Target embedding with fixed positional encodings

-
274@option(TransformerConfigs.src_embed, 'no_pos')
-275def _src_embed_without_positional(c: TransformerConfigs):
+
285@option(TransformerConfigs.tgt_embed, 'fixed_pos')
+286def _tgt_embed_with_positional(c: TransformerConfigs):
@@ -824,25 +839,96 @@ are calculated.

-
279    return nn.Embedding(c.n_src_vocab, c.d_model)
+
290    return EmbeddingsWithPositionalEncoding(c.d_model, c.n_tgt_vocab)
-
+
+

Learned Positional Embeddings

+

Source embedding with learned positional encodings

+
+
+
294@option(TransformerConfigs.src_embed, 'learned_pos')
+295def _src_embed_with_learned_positional(c: TransformerConfigs):
+
+
+
+
+
-
282@option(TransformerConfigs.tgt_embed, 'no_pos')
-283def _tgt_embed_without_positional(c: TransformerConfigs):
-284    return nn.Embedding(c.n_tgt_vocab, c.d_model)
-285
-286
-287@option(TransformerConfigs.encoder_decoder, 'default')
-288def _encoder_decoder(c: TransformerConfigs):
-289    return EncoderDecoder(c.encoder, c.decoder, c.src_embed, c.tgt_embed, c.generator)
+
299    return EmbeddingsWithLearnedPositionalEncoding(c.d_model, c.n_src_vocab)
+
+
+
+
+ +

Target embedding with learned positional encodings

+
+
+
302@option(TransformerConfigs.tgt_embed, 'learned_pos')
+303def _tgt_embed_with_learned_positional(c: TransformerConfigs):
+
+
+
+
+ + +
+
+
307    return EmbeddingsWithLearnedPositionalEncoding(c.d_model, c.n_tgt_vocab)
+
+
+
+
+ +

No Positional Embeddings

+

Source embedding without positional encodings

+
+
+
311@option(TransformerConfigs.src_embed, 'no_pos')
+312def _src_embed_without_positional(c: TransformerConfigs):
+
+
+
+
+ + +
+
+
316    return nn.Embedding(c.n_src_vocab, c.d_model)
+
+
+
+
+ + +
+
+
319@option(TransformerConfigs.tgt_embed, 'no_pos')
+320def _tgt_embed_without_positional(c: TransformerConfigs):
+321    return nn.Embedding(c.n_tgt_vocab, c.d_model)
+322
+323
+324@option(TransformerConfigs.encoder_decoder, 'default')
+325def _encoder_decoder(c: TransformerConfigs):
+326    return EncoderDecoder(c.encoder, c.decoder, c.src_embed, c.tgt_embed, c.generator)
diff --git a/docs/transformers/feed_forward.html b/docs/transformers/feed_forward.html index 053e69b2..e263236f 100644 --- a/docs/transformers/feed_forward.html +++ b/docs/transformers/feed_forward.html @@ -84,12 +84,20 @@ where $W_1$, $W_2$, $b_1$ and $b_2$ are learnable parameters.

Sometimes the GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU: $x \Phi(x)$, where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$
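A quick, illustrative comparison of the two activations (not from the repository):

import torch
from torch import nn

x = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])
print(nn.ReLU()(x))  # negatives are cut to zero
print(nn.GELU()(x))  # smooth: small negative inputs keep a small negative value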

+

Gated Linear Units

+

This is a generic implementation that supports different variants, including Gated Linear Units (GLU). We have also implemented experiments on these:

+
-
26import torch
-27from torch import nn as nn
-28
-29from labml_helpers.module import Module
+
35import torch
+36from torch import nn as nn
+37
+38from labml_helpers.module import Module
@@ -97,10 +105,10 @@ GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU. -

Position-wise feed-forward network (FFN) module

+

FFN module

-
32class FeedForward(Module):
+
41class FeedForward(Module):
@@ -119,13 +127,13 @@ GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU.
-
37    def __init__(self, d_model: int, d_ff: int,
-38                 dropout: float = 0.1,
-39                 activation=nn.ReLU(),
-40                 is_gated: bool = False,
-41                 bias1: bool = True,
-42                 bias2: bool = True,
-43                 bias_gate: bool = True):
+
46    def __init__(self, d_model: int, d_ff: int,
+47                 dropout: float = 0.1,
+48                 activation=nn.ReLU(),
+49                 is_gated: bool = False,
+50                 bias1: bool = True,
+51                 bias2: bool = True,
+52                 bias_gate: bool = True):
@@ -136,14 +144,7 @@ GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU.
-
53        super().__init__()
-54        self.layer1 = nn.Linear(d_model, d_ff, bias=bias1)
-55        self.layer2 = nn.Linear(d_ff, d_model, bias=bias2)
-56        self.dropout = nn.Dropout(dropout)
-57        self.activation = activation
-58        self.is_gated = is_gated
-59        if is_gated:
-60            self.linear_v = nn.Linear(d_model, d_ff, bias=bias_gate)
+
62        super().__init__()
@@ -151,17 +152,136 @@ GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU. +

Layer one parameterized by weight $W_1$ and bias $b_1$

+
+
+
64        self.layer1 = nn.Linear(d_model, d_ff, bias=bias1)
+
+ +
+
+ +

Layer two parameterized by weight $W_2$ and bias $b_2$

+
+
+
66        self.layer2 = nn.Linear(d_ff, d_model, bias=bias2)
+
+
+
+
+ +

Hidden layer dropout

+
+
+
68        self.dropout = nn.Dropout(dropout)
+
+
+
+
+ +

Activation function $f$

+
+
+
70        self.activation = activation
+
+
+
+
+ +

Whether there is a gate

+
+
+
72        self.is_gated = is_gated
+73        if is_gated:
+
+
+
+
+ +

If there is a gate, the linear layer that transforms inputs to be multiplied by the gate, parameterized by weight $V$ and bias $c$

+
+
+
76            self.linear_v = nn.Linear(d_model, d_ff, bias=bias_gate)
+
+
+
+
+
-
62    def __call__(self, x: torch.Tensor):
-63        g = self.activation(self.layer1(x))
-64        if self.is_gated:
-65            x = g * self.linear_v(x)
-66        else:
-67            x = g
-68        x = self.dropout(x)
-69        return self.layer2(x)
+
78    def __call__(self, x: torch.Tensor):
+
+
+
+
+ +

$f(x W_1 + b_1)$

+
+
+
80        g = self.activation(self.layer1(x))
+
+
+
+
+ +

If gated, $f(x W_1 + b_1) \otimes (x V + b) $

+
+
+
82        if self.is_gated:
+83            x = g * self.linear_v(x)
+
+
+
+
+ +

Otherwise

+
+
+
85        else:
+86            x = g
+
+
+
+
+ +

Apply dropout

+
+
+
88        x = self.dropout(x)
+
+
+
+
+ +

$(f(x W_1 + b_1) \otimes (x V + b)) W_2 + b_2$ or $f(x W_1 + b_1) W_2 + b_2$ +depending on whether it is gated

+
+
+
91        return self.layer2(x)
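For example, a GEGLU block can be built directly from this module with the constructor described above (the shapes below are only illustrative):

import torch
from torch import nn
from labml_nn.transformers.feed_forward import FeedForward

ffn = FeedForward(d_model=512, d_ff=2048, dropout=0.1,
                  activation=nn.GELU(), is_gated=True,
                  bias1=False, bias2=False, bias_gate=False)
x = torch.randn(10, 32, 512)  # (seq_len, batch, d_model)
y = ffn(x)                    # output has the same shape as the input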
diff --git a/docs/transformers/glu_variants/experiment.html b/docs/transformers/glu_variants/experiment.html index 8ab55ca6..e73e4151 100644 --- a/docs/transformers/glu_variants/experiment.html +++ b/docs/transformers/glu_variants/experiment.html @@ -71,19 +71,21 @@ -

Train Autoregressive Transformer

-

This trains a simple transformer model for auto-regression.

+

Gated Linear Units and Variants

+

This trains a simple transformer model for auto-regression. We try different variants for the position-wise feedforward network. The reusable and configurable components are defined in configs.py.

-
14import torch
-15from labml import experiment
-16from labml.configs import option
-17from labml.utils.pytorch import get_modules
-18from labml_helpers.module import Module
-19
-20from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
-21from labml_nn.transformers import Encoder, Generator, TransformerConfigs
-22from labml_nn.transformers.utils import subsequent_mask
+
16import torch
+17from labml import experiment
+18from labml.configs import option
+19from labml.utils.pytorch import get_modules
+20from labml_helpers.module import Module
+21
+22from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
+23from labml_nn.transformers import Encoder, Generator, TransformerConfigs
+24from labml_nn.transformers.utils import subsequent_mask
@@ -94,7 +96,7 @@

Auto regressive model

-
25class AutoregressiveModel(Module):
+
27class AutoregressiveModel(Module):
@@ -105,8 +107,8 @@
-
30    def __init__(self, src_embed: Module, encoder: Encoder, generator: Generator):
-31        super().__init__()
+
32    def __init__(self, src_embed: Module, encoder: Encoder, generator: Generator):
+33        super().__init__()
@@ -117,7 +119,7 @@

Token embedding module

-
33        self.src_embed = src_embed
+
35        self.src_embed = src_embed
@@ -128,7 +130,7 @@

Transformer based encoder

-
35        self.encoder = encoder
+
37        self.encoder = encoder
@@ -140,7 +142,7 @@ this give logits of the the next token

-
38        self.generator = generator
+
40        self.generator = generator
@@ -151,7 +153,7 @@ this give logits of the the next token

This will be initialized on the first call

-
40        self.src_mask = None
+
42        self.src_mask = None
@@ -162,7 +164,7 @@ this give logits of the the next token

-
42    def __call__(self, src: torch.Tensor):
+
44    def __call__(self, src: torch.Tensor):
@@ -173,8 +175,8 @@ this give logits of the the next token

Create subsequent mask, so that the transformer can only pay attention to past tokens.

-
44        if self.src_mask is None or self.src_mask.size(0) != len(src):
-45            self.src_mask = subsequent_mask(len(src)).to(src.device)
+
46        if self.src_mask is None or self.src_mask.size(0) != len(src):
+47            self.src_mask = subsequent_mask(len(src)).to(src.device)
@@ -185,7 +187,7 @@ this give logits of the the next token

Embed the tokens (src) and run them through the transformer

-
47        res = self.encoder(self.src_embed(src), self.src_mask)
+
49        res = self.encoder(self.src_embed(src), self.src_mask)
@@ -196,7 +198,7 @@ this give logits of the the next token

Generate logits of the next token

-
49        return self.generator(res), None
+
51        return self.generator(res), None
@@ -208,7 +210,7 @@ this give logits of the the next token

The default configs can and will be over-ridden when we start the experiment

-
52class Configs(NLPAutoRegressionConfigs):
+
54class Configs(NLPAutoRegressionConfigs):
@@ -219,8 +221,8 @@ this give logits of the the next token

-
59    transformer: TransformerConfigs
-60    model: AutoregressiveModel
+
61    transformer: TransformerConfigs
+62    model: AutoregressiveModel
@@ -231,8 +233,8 @@ this give logits of the the next token

Initialize the auto-regressive model

-
63@option(Configs.model)
-64def autoregressive_model(c: Configs):
+
65@option(Configs.model)
+66def autoregressive_model(c: Configs):
@@ -243,8 +245,8 @@ this give logits of the the next token

-
68    m = AutoregressiveModel(c.transformer.src_embed, c.transformer.encoder, c.transformer.generator)
-69    return m.to(c.device)
+
70    m = AutoregressiveModel(c.transformer.src_embed, c.transformer.encoder, c.transformer.generator)
+71    return m.to(c.device)
@@ -252,11 +254,11 @@ this give logits of the the next token

-

Initialize the configurable transformer encoder for our autoregressive model

+

Initialize the configurable transformer encoder for our autoregressive model.

-
72@option(Configs.transformer)
-73def transformer_c(c: Configs):
+
74@option(Configs.transformer)
+75def transformer_c(c: Configs):
@@ -267,11 +269,11 @@ this give logits of the the next token

-
77    tc = TransformerConfigs()
-78    tc.n_src_vocab = c.n_tokens
-79    tc.n_tgt_vocab = c.n_tokens
-80
-81    return tc
+
79    tc = TransformerConfigs()
+80    tc.n_src_vocab = c.n_tokens
+81    tc.n_tgt_vocab = c.n_tokens
+82
+83    return tc
@@ -282,7 +284,7 @@ this give logits of the the next token

-
84def main():
+
86def main():
@@ -293,7 +295,7 @@ this give logits of the the next token

Create experiment

-
86    experiment.create(name="glu_variants")
+
88    experiment.create(name="glu_variants")
@@ -304,7 +306,7 @@ this give logits of the the next token

Create configs

-
88    conf = Configs()
+
90    conf = Configs()
@@ -315,7 +317,7 @@ this give logits of the the next token

Load configurations

-
90    experiment.configs(conf,
+
92    experiment.configs(conf,
@@ -326,19 +328,19 @@ this give logits of the the next token

A dictionary of configurations to override

-
92                       {'tokenizer': 'character',
-93                        'prompt_separator': '',
-94                        'prompt': 'It is ',
-95                        'text': 'tiny_shakespeare',
-96
-97                        'optimizer.optimizer': 'Noam',
-98                        'optimizer.learning_rate': 1.,
-99                        'optimizer.d_model': 256,
-100
-101                        'seq_len': 1024,
-102                        'epochs': 128,
-103                        'batch_size': 6,
-104                        'inner_iterations': 10,
+
94                       {'tokenizer': 'character',
+95                        'prompt_separator': '',
+96                        'prompt': 'It is ',
+97                        'text': 'tiny_shakespeare',
+98
+99                        'optimizer.optimizer': 'Noam',
+100                        'optimizer.learning_rate': 1.,
+101                        'optimizer.d_model': 256,
+102
+103                        'seq_len': 1024,
+104                        'epochs': 128,
+105                        'batch_size': 6,
+106                        'inner_iterations': 10,
@@ -347,9 +349,11 @@ this give logits of the the next token

#

GLU Variant, one of GLU, Bilinear, ReGLU, GEGLU, SwiGLU

+

These are defined in the configurable FFN implementation

-
107                        'transformer.ffn.glu_variant': 'Bilinear',
+
112                        'transformer.ffn.glu_variant': 'Bilinear',
@@ -360,10 +364,10 @@ this give logits of the the next token

Transformer configurations

-
110                        'transformer.d_model': 256,
-111                        'transformer.ffn.d_ff': 1024,
-112                        'transformer.n_heads': 8,
-113                        'transformer.n_layers': 6})
+
115                        'transformer.d_model': 256,
+116                        'transformer.ffn.d_ff': 1024,
+117                        'transformer.n_heads': 8,
+118                        'transformer.n_layers': 6})
@@ -374,7 +378,7 @@ this give logits of the the next token

This is needed to initialize models

-
116    conf.n_tokens = conf.text.n_tokens
+
121    conf.n_tokens = conf.text.n_tokens
@@ -385,7 +389,7 @@ this give logits of the the next token

Set models for saving and loading

-
119    experiment.add_pytorch_models(get_modules(conf))
+
124    experiment.add_pytorch_models(get_modules(conf))
@@ -396,7 +400,7 @@ this give logits of the the next token

Start the experiment

-
122    with experiment.start():
+
127    with experiment.start():
@@ -407,11 +411,11 @@ this give logits of the the next token

TrainValidConfigs.run

-
124        conf.run()
-125
-126
-127if __name__ == '__main__':
-128    main()
+
129        conf.run()
+130
+131
+132if __name__ == '__main__':
+133    main()
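To try another variant, only the override dictionary passed to experiment.configs needs to change. A sketch of the relevant key, reusing the keys shown above:

overrides = {
    # one of GLU, Bilinear, ReGLU, GEGLU, SwiGLU
    'transformer.ffn.glu_variant': 'SwiGLU',
}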
diff --git a/docs/transformers/glu_variants/simple.html b/docs/transformers/glu_variants/simple.html index 46e54fae..207f5c42 100644 --- a/docs/transformers/glu_variants/simple.html +++ b/docs/transformers/glu_variants/simple.html @@ -71,25 +71,28 @@ -

Train Autoregressive Transformer

-

This trains a simple transformer model for auto-regression.

+

Gated Linear Units and Variants

+

This trains a simple transformer model for auto-regression. We try different variants for the position-wise feedforward network.

+

This is a simpler implementation that doesn't use the labml.configs module. We wrote this version to make it easier for readers who are not familiar with that API.

-
13import dataclasses
-14
-15import torch
-16from torch import nn
-17from torch.utils.data import Dataset, DataLoader
+                
17import dataclasses
 18
-19from labml import experiment, lab, tracker, monit, logger
-20from labml.logger import Text
-21from labml.utils.download import download_file
-22from labml_nn.experiments.nlp_autoregression import transpose_batch
-23from labml_nn.optimizers.noam import Noam
-24from labml_nn.transformers import Encoder, MultiHeadAttention
-25from labml_nn.transformers.feed_forward import FeedForward
-26from labml_nn.transformers.models import EmbeddingsWithPositionalEncoding, TransformerLayer
-27from labml_nn.transformers.utils import subsequent_mask
+19import torch +20from torch import nn +21from torch.utils.data import Dataset, DataLoader +22 +23from labml import experiment, lab, tracker, monit, logger +24from labml.logger import Text +25from labml.utils.download import download_file +26from labml_nn.experiments.nlp_autoregression import transpose_batch +27from labml_nn.optimizers.noam import Noam +28from labml_nn.transformers import Encoder, MultiHeadAttention +29from labml_nn.transformers.feed_forward import FeedForward +30from labml_nn.transformers.models import EmbeddingsWithPositionalEncoding, TransformerLayer +31from labml_nn.transformers.utils import subsequent_mask
@@ -100,7 +103,7 @@

Auto regressive model

-
30class AutoregressiveModel(nn.Module):
+
34class AutoregressiveModel(nn.Module):
@@ -111,8 +114,8 @@
-
35    def __init__(self, src_embed: nn.Module, encoder: Encoder, generator: nn.Module):
-36        super().__init__()
+
39    def __init__(self, src_embed: nn.Module, encoder: Encoder, generator: nn.Module):
+40        super().__init__()
@@ -123,7 +126,7 @@

Token embedding module

-
38        self.src_embed = src_embed
+
42        self.src_embed = src_embed
@@ -134,7 +137,7 @@

Transformer based encoder

-
40        self.encoder = encoder
+
44        self.encoder = encoder
@@ -146,7 +149,7 @@ this give logits of the the next token

-
43        self.generator = generator
+
47        self.generator = generator
@@ -157,7 +160,7 @@ this give logits of the the next token

This will be initialized on the first call

-
45        self.src_mask = None
+
49        self.src_mask = None
@@ -168,7 +171,7 @@ this give logits of the the next token

-
47    def __call__(self, src: torch.Tensor):
+
51    def __call__(self, src: torch.Tensor):
@@ -179,8 +182,8 @@ this give logits of the the next token

Create subsequent mask, so that the transformer can only pay attention to past tokens.

-
49        if self.src_mask is None or self.src_mask.size(0) != len(src):
-50            self.src_mask = subsequent_mask(len(src)).to(src.device)
+
53        if self.src_mask is None or self.src_mask.size(0) != len(src):
+54            self.src_mask = subsequent_mask(len(src)).to(src.device)
@@ -191,7 +194,7 @@ this give logits of the the next token

Embed the tokens (src) and run them through the transformer

-
52        res = self.encoder(self.src_embed(src), self.src_mask)
+
56        res = self.encoder(self.src_embed(src), self.src_mask)
@@ -202,98 +205,19 @@ this give logits of the the next token

Generate logits of the next token

-
54        return self.generator(res)
+
58        return self.generator(res)
-
+
- +

Configurations

-
57@dataclasses.dataclass
-58class Configs:
-59    d_model: int = 512
-60    seq_len: int = 128
-61    batch_size: int = 32
-62    n_layers: int = 6
-63    n_heads: int = 8
-64    dropout: float = 0.1
-65    d_ff: int = 2048
-66    glu_variant: str = 'GLU'
-67    epochs: int = 5
-68    grad_norm_clip: float = 0.5
-69
-70
-71class TinyShakespeareDataset(Dataset):
-72    def __init__(self, seq_len: int):
-73        path = lab.get_data_path() / 'tiny_shakespeare.txt'
-74        download_file('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt', path)
-75        with open(str(path), 'r') as f:
-76            text = f.read()
-77
-78        chars = list(set(text))
-79        self.stoi = {c: i for i, c in enumerate(chars)}
-80        self.itos = {i: c for i, c in enumerate(chars)}
-81        self.seq_len = seq_len
-82        self.data = self.text_to_i(text)
-83
-84    def text_to_i(self, text: str):
-85        return torch.tensor([self.stoi[c] for c in text], dtype=torch.long)
-86
-87    def __len__(self):
-88        return len(self.data) - self.seq_len - 1
-89
-90    def __getitem__(self, idx):
-91        return self.data[idx:idx + self.seq_len], self.data[idx + 1:idx + self.seq_len + 1]
-92
-93
-94class Trainer:
-95    def __init__(self, configs: Configs):
-96        self.device = torch.device('cpu')
-97        if torch.cuda.is_available():
-98            self.device = torch.device('cuda:0')
-99        self.dataset = TinyShakespeareDataset(configs.seq_len)
-100        self.dataloader = DataLoader(self.dataset, batch_size=configs.batch_size, collate_fn=transpose_batch,
-101                                     shuffle=True)
-102
-103        if configs.glu_variant == 'GLU':
-104            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Sigmoid(), True, False, False, False)
-105        elif configs.glu_variant == 'Bilinear':
-106            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Identity(), True, False, False, False)
-107        elif configs.glu_variant == 'ReGLU':
-108            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU(), True, False, False, False)
-109        elif configs.glu_variant == 'GEGLU':
-110            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU(), True, False, False, False)
-111        elif configs.glu_variant == 'SwiGLU':
-112            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.SiLU(), True, False, False, False)
-113        elif configs.glu_variant == 'ReLU':
-114            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU())
-115        elif configs.glu_variant == 'GELU':
-116            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU())
-117        else:
-118            raise ValueError(f'Unknown variant {configs.glu_variant}')
-119
-120        n_chars = len(self.dataset.stoi)
-121        self.model = AutoregressiveModel(EmbeddingsWithPositionalEncoding(configs.d_model, n_chars),
-122                                         Encoder(TransformerLayer(
-123                                             d_model=configs.d_model,
-124                                             self_attn=MultiHeadAttention(configs.n_heads, configs.d_model,
-125                                                                          configs.dropout),
-126                                             src_attn=None,
-127                                             feed_forward=ffn,
-128                                             dropout_prob=configs.dropout
-129                                         ), configs.n_layers),
-130                                         nn.Linear(configs.d_model, n_chars))
-131        self.model.to(self.device)
-132
-133        self.optimizer = Noam(self.model.parameters(), lr=1.0, warmup=2_000, d_model=configs.d_model)
-134
-135        self.loss_func = nn.CrossEntropyLoss()
-136        self.epochs = configs.epochs
-137        self.grad_norm_clip = configs.grad_norm_clip
+
61@dataclasses.dataclass
+62class Configs:
@@ -301,10 +225,19 @@ this give logits of the the next token

-

Set tracker configurations

+
-
140        tracker.set_scalar("loss.*", True)
+
66    d_model: int = 512
+67    seq_len: int = 128
+68    batch_size: int = 32
+69    n_layers: int = 6
+70    n_heads: int = 8
+71    dropout: float = 0.1
+72    d_ff: int = 2048
+73    glu_variant: str = 'GLU'
+74    epochs: int = 5
+75    grad_norm_clip: float = 0.5
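Because Configs is a plain dataclass, a different variant or model size can be chosen directly at construction time, for example (values are only illustrative, using the Configs class defined here):

configs = Configs(glu_variant='SwiGLU', d_model=256, d_ff=1024, epochs=32)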
@@ -312,10 +245,10 @@ this give logits of the the next token

-

Sampling function to generate samples periodically while training

+

Tiny Shakespeare Dataset

-
142    def sample(self):
+
78class TinyShakespeareDataset(Dataset):
@@ -323,10 +256,10 @@ this give logits of the the next token

-

Starting prompt

+
-
148        prompt = 'It is'
+
83    def __init__(self, seq_len: int):
@@ -334,10 +267,10 @@ this give logits of the the next token

-

Collect output for printing

+

Location of the text file

-
150        log = [(prompt, Text.subtle)]
+
85        path = lab.get_data_path() / 'tiny_shakespeare.txt'
@@ -345,10 +278,10 @@ this give logits of the the next token

-

Sample 25 tokens

+

Download the file

-
152        for i in monit.iterate('Sample', 25):
+
87        download_file('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt', path)
@@ -356,11 +289,11 @@ this give logits of the the next token

-

Tokenize the prompt

+

Read the downloaded file

-
154            data = self.dataset.text_to_i(prompt).unsqueeze(-1)
-155            data = data.to(self.device)
+
89        with open(str(path), 'r') as f:
+90            text = f.read()
@@ -368,10 +301,10 @@ this give logits of the the next token

-

Get the model output

+

Extract the characters

-
157            output = self.model(data)
+
93        chars = list(set(text))
@@ -379,10 +312,10 @@ this give logits of the the next token

-

Get the model prediction (greedy)

+

Character to id (integer) map

-
159            output = output.argmax(dim=-1).squeeze()
+
95        self.stoi = {c: i for i, c in enumerate(chars)}
@@ -390,10 +323,10 @@ this give logits of the the next token

-

Add the prediction to prompt

+

Id to character map

-
161            prompt += self.dataset.itos[output[-1].item()]
+
97        self.itos = {i: c for i, c in enumerate(chars)}
@@ -401,10 +334,10 @@ this give logits of the the next token

-

Add the prediction for logging

+

Length of a training sample

-
163            log += [(self.dataset.itos[output[-1].item()], Text.value)]
+
99        self.seq_len = seq_len
@@ -412,23 +345,21 @@ this give logits of the the next token

-

Print the sampled output

+

Data in the form of a tensor of ids

-
166        logger.log(log)
+
101        self.data = self.text_to_i(text)
-
+
- +

Transform the text into a tensor of ids

-
168    def train(self):
-169        for _ in monit.loop(self.epochs):
-170            for i, batch in monit.enum('Train', self.dataloader):
+
103    def text_to_i(self, text: str):
@@ -436,27 +367,22 @@ this give logits of the the next token

-

Move data to the device

+
-
172                data, target = batch[0].to(self.device), batch[1].to(self.device)
-173
-174                tracker.add_global_step(data.shape[0] * data.shape[1])
-175
-176                self.model.train()
-177                output = self.model(data)
+
107        return torch.tensor([self.stoi[c] for c in text], dtype=torch.long)
-
+
-

Calculate and log loss

+

Number of samples in the dataset.

+

This will read the dataset seq_len times in a single epoch.

-
180                loss = self.loss_func(output.view(-1, output.shape[-1]), target.view(-1))
-181                tracker.add("loss.train", loss)
+
109    def __len__(self):
@@ -464,21 +390,21 @@ this give logits of the the next token

-

Calculate gradients

+
-
184                loss.backward()
+
115        return len(self.data) - self.seq_len - 1
-
+
-

Clip gradients

+

Return a sample

-
186                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
+
117    def __getitem__(self, idx):
@@ -486,22 +412,21 @@ this give logits of the the next token

-

Take optimizer step

+
-
188                self.optimizer.step()
+
121        return self.data[idx:idx + self.seq_len], self.data[idx + 1:idx + self.seq_len + 1]
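To make the indexing concrete, here is a tiny illustration (not from the repository) of the (input, target) pair returned for one index:

import torch

data = torch.tensor([10, 11, 12, 13, 14, 15])  # pretend character ids
seq_len, idx = 4, 0
x = data[idx:idx + seq_len]          # tensor([10, 11, 12, 13])
y = data[idx + 1:idx + seq_len + 1]  # tensor([11, 12, 13, 14]): the next character at each position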
-
+
-

Log the model parameters and gradients on last batch of every epoch

+

Trainer

-
190                if (i + 1) % 100 == 0:
-191                    tracker.add('model', self.model)
+
124class Trainer:
@@ -509,15 +434,10 @@ this give logits of the the next token

-

Clear the gradients

+
-
193                self.optimizer.zero_grad()
-194
-195                if (i + 1) % 100 == 0:
-196                    self.model.eval()
-197                    with torch.no_grad():
-198                        self.sample()
+
129    def __init__(self, configs: Configs):
@@ -525,13 +445,12 @@ this give logits of the the next token

-

Save the tracked metrics

+

Get the device

-
201                if (i + 1) % 10 == 0:
-202                    tracker.save()
-203
-204            experiment.save_checkpoint()
+
131        self.device = torch.device('cpu')
+132        if torch.cuda.is_available():
+133            self.device = torch.device('cuda:0')
@@ -539,10 +458,10 @@ this give logits of the the next token

- +

Initialize the dataset

-
207def main():
+
135        self.dataset = TinyShakespeareDataset(configs.seq_len)
@@ -550,10 +469,13 @@ this give logits of the the next token

-

Create experiment

+

Initialize the dataloader

-
209    experiment.create(name="glu_variants")
+
137        self.dataloader = DataLoader(self.dataset,
+138                                     batch_size=configs.batch_size,
+139                                     collate_fn=transpose_batch,
+140                                     shuffle=True)
@@ -561,10 +483,13 @@ this give logits of the the next token

-

Create configs

+

FFN with Gated Linear Unit
$FFN_{GLU}(x, W_1, V, W_2) = (\sigma(x W_1) \otimes x V) W_2$

-
211    configs = Configs()
+
144        if configs.glu_variant == 'GLU':
+145            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Sigmoid(), True, False, False, False)
@@ -572,13 +497,13 @@ this give logits of the the next token

-

Load configurations

+

FFN with Bilinear hidden layer
$FFN_{Bilinear}(x, W_1, V, W_2) = (x W_1 \otimes x V) W_2$

-
213    experiment.configs(dataclasses.asdict(configs))
-214
-215    trainer = Trainer(configs)
-216    experiment.add_pytorch_models({'model': trainer.model})
+
148        elif configs.glu_variant == 'Bilinear':
+149            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Identity(), True, False, False, False)
@@ -586,10 +511,13 @@ this give logits of the the next token

-

Start the experiment

+

FFN with ReLU gate
$FFN_{ReGLU}(x, W_1, V, W_2) = (\max(0, x W_1) \otimes x V) W_2$

-
219    with experiment.start():
+
152        elif configs.glu_variant == 'ReGLU':
+153            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU(), True, False, False, False)
@@ -597,14 +525,570 @@ this give logits of the the next token

-

TrainValidConfigs.run

+

FFN with GELU gate
$FFN_{GEGLU}(x, W_1, V, W_2) = (\text{GELU}(x W_1) \otimes x V) W_2$

-
221        trainer.train()
-222
-223
-224if __name__ == '__main__':
-225    main()
+
156        elif configs.glu_variant == 'GEGLU':
+157            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU(), True, False, False, False)
+
+ +
+
+ +

FFN with Swish gate
$FFN_{SwiGLU}(x, W_1, V, W_2) = (\text{Swish}_1(x W_1) \otimes x V) W_2$, where $\text{Swish}_\beta(x) = x \sigma(\beta x)$

+
+
+
161        elif configs.glu_variant == 'SwiGLU':
+162            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.SiLU(), True, False, False, False)
+
+
+
+
+ +

FFN with ReLU activation
$FFN_{ReLU}(x, W_1, W_2, b_1, b_2) = \max(0, x W_1 + b_1) W_2 + b_2$

+
+
+
165        elif configs.glu_variant == 'ReLU':
+166            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU())
+
+
+
+
+ +

FFN with GELU activation
$FFN_{GELU}(x, W_1, W_2, b_1, b_2) = \text{GELU}(x W_1 + b_1) W_2 + b_2$

+
+
+
169        elif configs.glu_variant == 'GELU':
+170            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU())
+171        else:
+172            raise ValueError(f'Unknown variant {configs.glu_variant}')
+
+
+
+
+ +

Number of different characters

+
+
+
175        n_chars = len(self.dataset.stoi)
+
+
+
+
+ +

Initialize Multi-Head Attention module

+
+
+
178        mha = MultiHeadAttention(configs.n_heads, configs.d_model, configs.dropout)
+
+
+
+
+ +

Initialize the Transformer Block

+
+
+
180        transformer_layer = TransformerLayer(d_model=configs.d_model, self_attn=mha, src_attn=None,
+181                                             feed_forward=ffn, dropout_prob=configs.dropout)
+
+
+
+
+ +

Initialize the model with an embedding layer (with fixed positional encoding), a transformer encoder, and a linear layer to generate logits.

+
+
+
187        self.model = AutoregressiveModel(EmbeddingsWithPositionalEncoding(configs.d_model, n_chars),
+188                                         Encoder(transformer_layer, configs.n_layers),
+189                                         nn.Linear(configs.d_model, n_chars))
+
+
+
+
+ +

Move the model to the current device

+
+
+
192        self.model.to(self.device)
+
+
+
+
+ +

Initialize Noam optimizer

+
+
+
195        self.optimizer = Noam(self.model.parameters(), lr=1.0, warmup=2_000, d_model=configs.d_model)
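For reference, the Noam learning-rate schedule this optimizer follows (from "Attention Is All You Need") can be sketched as below; this is only an illustration, the actual implementation lives in labml_nn/optimizers/noam.py.

def noam_lr(step: int, d_model: int = 256, warmup: int = 2000, factor: float = 1.0) -> float:
    # warm up linearly, then decay with the inverse square root of the step (step >= 1)
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)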
+
+
+
+
+ +

Cross-entropy loss

+
+
+
198        self.loss_func = nn.CrossEntropyLoss()
+
+
+
+
+ +

Number of training epochs; note that our dataset definition repeats the data seq_len times in a single epoch

+
+
+
201        self.epochs = configs.epochs
+
+
+
+
+ +

Gradient clipping norm

+
+
+
203        self.grad_norm_clip = configs.grad_norm_clip
+
+
+
+
+ +

Set tracker configurations

+
+
+
206        tracker.set_scalar("loss.*", True)
+
+
+
+
+ +

Sampling function to generate samples periodically while training

+
+
+
208    def sample(self):
+
+
+
+
+ +

Starting prompt

+
+
+
214        prompt = 'It is'
+
+
+
+
+ +

Collect output for printing

+
+
+
216        log = [(prompt, Text.subtle)]
+
+
+
+
+ +

Sample 25 tokens

+
+
+
218        for i in monit.iterate('Sample', 25):
+
+
+
+
+ +

Tokenize the prompt

+
+
+
220            data = self.dataset.text_to_i(prompt).unsqueeze(-1)
+221            data = data.to(self.device)
+
+
+
+
+ +

Get the model output

+
+
+
223            output = self.model(data)
+
+
+
+
+ +

Get the model prediction (greedy)

+
+
+
225            output = output.argmax(dim=-1).squeeze()
+
+
+
+
+ +

Add the prediction to prompt

+
+
+
227            prompt += self.dataset.itos[output[-1].item()]
+
+
+
+
+ +

Add the prediction for logging

+
+
+
229            log += [(self.dataset.itos[output[-1].item()], Text.value)]
+
+
+
+
+ +

Print the sampled output

+
+
+
232        logger.log(log)
+
+
+
+
+ +

Train the model

+
+
+
234    def train(self):
+
+
+
+
+ +

Loop for the given number of epochs

+
+
+
240        for _ in monit.loop(self.epochs):
+
+
+
+
+ +

Iterate over the minibatches

+
+
+
242            for i, batch in monit.enum('Train', self.dataloader):
+
+
+
+
+ +

Move data to the device

+
+
+
244                data, target = batch[0].to(self.device), batch[1].to(self.device)
+
+
+
+
+ +

Set tracker step, as the number of characters trained on

+
+
+
247                tracker.add_global_step(data.shape[0] * data.shape[1])
+
+
+
+
+ +

Set model state to training

+
+
+
250                self.model.train()
+
+
+
+
+ +

Evaluate the model

+
+
+
252                output = self.model(data)
+
+
+
+
+ +

Calculate loss

+
+
+
255                loss = self.loss_func(output.view(-1, output.shape[-1]), target.view(-1))
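The reshape matters because nn.CrossEntropyLoss expects (N, C) logits against (N,) class ids; a standalone shape check (sizes here are only illustrative):

import torch
from torch import nn

loss_func = nn.CrossEntropyLoss()
output = torch.randn(1024, 6, 65)          # (seq_len, batch, n_chars)
target = torch.randint(0, 65, (1024, 6))   # (seq_len, batch)
loss = loss_func(output.view(-1, output.shape[-1]), target.view(-1))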
+
+
+
+
+ +

Log the loss

+
+
+
257                tracker.add("loss.train", loss)
+
+
+
+
+ +

Calculate gradients

+
+
+
260                loss.backward()
+
+
+
+
+ +

Clip gradients

+
+
+
262                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
+
+
+
+
+ +

Take optimizer step

+
+
+
264                self.optimizer.step()
+
+
+
+
+ +

Log the model parameters and gradients

+
+
+
266                if (i + 1) % 100 == 0:
+267                    tracker.add('model', self.model)
+
+
+
+
+ +

Clear the gradients

+
+
+
269                self.optimizer.zero_grad()
+
+
+
+
+ +

Generate a sample

+
+
+
272                if (i + 1) % 100 == 0:
+273                    self.model.eval()
+274                    with torch.no_grad():
+275                        self.sample()
+
+
+
+
+ +

Save the tracked metrics

+
+
+
278                if (i + 1) % 10 == 0:
+279                    tracker.save()
+
+
+
+
+ +

Save the model

+
+
+
282            experiment.save_checkpoint()
+
+
+
+
+ + +
+
+
285def main():
+
+
+
+
+ +

Create experiment

+
+
+
287    experiment.create(name="glu_variants")
+
+
+
+
+ +

Create configs

+
+
+
289    configs = Configs()
+
+
+
+
+ +

Load configurations

+
+
+
291    experiment.configs(dataclasses.asdict(configs))
+
+
+
+
+ +

Create trainer

+
+
+
294    trainer = Trainer(configs)
+
+
+
+
+ +

Set models for training and loading

+
+
+
296    experiment.add_pytorch_models({'model': trainer.model})
+
+
+
+
+ +

Start the experiment

+
+
+
299    with experiment.start():
+
+
+
+
+ +

Train the model

+
+
+
301        trainer.train()
+302
+303
+304if __name__ == '__main__':
+305    main()
diff --git a/labml_nn/transformers/configs.py b/labml_nn/transformers/configs.py index 2afdb37d..e11c984d 100644 --- a/labml_nn/transformers/configs.py +++ b/labml_nn/transformers/configs.py @@ -19,6 +19,14 @@ from .models import EmbeddingsWithPositionalEncoding, EmbeddingsWithLearnedPosit class FeedForwardConfigs(BaseConfigs): + """ + + ## FFN Configurations + + + Creates a Position-wise FeedForward Network defined in + [`feed_forward.py`](feed_forward.html). + """ # Position-wise feedforward layer ffn: FeedForward # Number of features in the embedding @@ -44,7 +52,9 @@ class FeedForwardConfigs(BaseConfigs): @option(FeedForwardConfigs.activation, 'ReLU') def _ffn_activation_relu(): """ - ReLU activation + ### ReLU activation + + $$\max(0, x)$$ """ return nn.ReLU() @@ -52,7 +62,11 @@ def _ffn_activation_relu(): @option(FeedForwardConfigs.activation, 'GELU') def _ffn_activation_gelu(): """ - GELU activation + ### GELU activation + + $$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$ + + It was introduced in paper [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415). """ return nn.GELU() @@ -60,7 +74,7 @@ def _ffn_activation_gelu(): @option(FeedForwardConfigs.ffn, 'default') def _feed_forward(c: FeedForwardConfigs): """ - Create feedforward layer + Initialize a [feed forward network](feed_forward.html) """ return FeedForward(c.d_model, c.d_ff, dropout=c.dropout, @@ -70,7 +84,14 @@ def _feed_forward(c: FeedForwardConfigs): bias2=c.bias2, bias_gate=c.bias_gate) +# ## GLU Variants +# These are variants with gated hidden layers for the FFN +# as introduced in paper [GLU Variants Improve Transformer](https://arxiv.org/abs/2002.05202). +# We have omitted the bias terms as specified in the paper. +# ### FFN with Gated Linear Units +# +# $$FFN_{GLU}(x)(x, W_1, V, W_2) = (\sigma(x W_1) \otimes x V) W_2$$ aggregate(FeedForwardConfigs.glu_variant, 'GLU', (FeedForwardConfigs.is_gated, True), (FeedForwardConfigs.bias1, False), @@ -78,24 +99,40 @@ aggregate(FeedForwardConfigs.glu_variant, 'GLU', (FeedForwardConfigs.bias_gate, False), (FeedForwardConfigs.activation, nn.Sigmoid())) +# ### FFN with Bilinear hidden layer +# +# $$FFN_{Bilinear}(x)(x, W_1, V, W_2) = (x W_1 \otimes x V) W_2$$ aggregate(FeedForwardConfigs.glu_variant, 'Bilinear', (FeedForwardConfigs.is_gated, True), (FeedForwardConfigs.bias1, False), (FeedForwardConfigs.bias2, False), (FeedForwardConfigs.bias_gate, False), (FeedForwardConfigs.activation, nn.Identity())) + +# ### FFN with ReLU gate +# +# $$FFN_{ReGLU}(x)(x, W_1, V, W_2) = (\max(0, x W_1) \otimes x V) W_2$$ aggregate(FeedForwardConfigs.glu_variant, 'ReGLU', (FeedForwardConfigs.is_gated, True), (FeedForwardConfigs.bias1, False), (FeedForwardConfigs.bias2, False), (FeedForwardConfigs.bias_gate, False), (FeedForwardConfigs.activation, nn.ReLU())) + +# ### FFN with GELU gate +# +# $$FFN_{GEGLU}(x)(x, W_1, V, W_2) = (\text{GELU}(x W_1) \otimes x V) W_2$$ aggregate(FeedForwardConfigs.glu_variant, 'GEGLU', (FeedForwardConfigs.is_gated, True), (FeedForwardConfigs.bias1, False), (FeedForwardConfigs.bias2, False), (FeedForwardConfigs.bias_gate, False), (FeedForwardConfigs.activation, nn.GELU())) + +# ### FFN with Swish gate +# +# $$FFN_{SwiGLU}(x)(x, W_1, V, W_2) = (\text{Swish}_1(x W_1) \otimes x V) W_2$$ +# where $\text{Swish}_\beta(x) = x \sigma(\beta x)$ aggregate(FeedForwardConfigs.glu_variant, 'SwiGLU', (FeedForwardConfigs.is_gated, True), (FeedForwardConfigs.bias1, False), @@ -236,7 +273,7 @@ def _generator(c: TransformerConfigs): return 
Generator(c.n_tgt_vocab, c.d_model) -# ## Positional Embeddings +# ### Fixed Positional Embeddings @option(TransformerConfigs.src_embed, 'fixed_pos') def _src_embed_with_positional(c: TransformerConfigs): """ @@ -253,7 +290,7 @@ def _tgt_embed_with_positional(c: TransformerConfigs): return EmbeddingsWithPositionalEncoding(c.d_model, c.n_tgt_vocab) -# ## Learned Positional Embeddings +# ### Learned Positional Embeddings @option(TransformerConfigs.src_embed, 'learned_pos') def _src_embed_with_learned_positional(c: TransformerConfigs): """ @@ -270,7 +307,7 @@ def _tgt_embed_with_learned_positional(c: TransformerConfigs): return EmbeddingsWithLearnedPositionalEncoding(c.d_model, c.n_tgt_vocab) -# ## No Positional Embeddings +# ### No Positional Embeddings @option(TransformerConfigs.src_embed, 'no_pos') def _src_embed_without_positional(c: TransformerConfigs): """ diff --git a/labml_nn/transformers/feed_forward.py b/labml_nn/transformers/feed_forward.py index a7c92afb..57cedc9b 100644 --- a/labml_nn/transformers/feed_forward.py +++ b/labml_nn/transformers/feed_forward.py @@ -21,6 +21,15 @@ where $W_1$, $W_2$, $b_1$ and $b_2$ are learnable parameters. Sometimes the GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU. $$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$ + +### Gated Linear Units + +This is a generic implementation that supports different variants including +[Gated Linear Units](https://arxiv.org/abs/2002.05202) (GLU). +We have also implemented experiments on these: + +* [experiment that uses `labml.configs`](glu_variants/experiment.html) +* [simpler version from scratch](glu_variants/simple.html) """ import torch @@ -31,7 +40,7 @@ from labml_helpers.module import Module class FeedForward(Module): """ - ## Position-wise feed-forward network (FFN) module + ## FFN module """ def __init__(self, d_model: int, d_ff: int, @@ -51,19 +60,32 @@ class FeedForward(Module): * `bias_gate` specified whether the fully connected layer for the gate should have a learnable bias """ super().__init__() + # Layer one parameterized by weight $W_1$ and bias $b_1$ self.layer1 = nn.Linear(d_model, d_ff, bias=bias1) + # Layer one parameterized by weight $W_1$ and bias $b_1$ self.layer2 = nn.Linear(d_ff, d_model, bias=bias2) + # Hidden layer dropout self.dropout = nn.Dropout(dropout) + # Activation function $f$ self.activation = activation + # Whether there is a gate self.is_gated = is_gated if is_gated: + # If there is a gate the linear layer to transform inputs to + # be multiplied by the gate, parameterized by weight $V$ and bias $c$ self.linear_v = nn.Linear(d_model, d_ff, bias=bias_gate) def __call__(self, x: torch.Tensor): + # $f(x W_1 + b_1)$ g = self.activation(self.layer1(x)) + # If gated, $f(x W_1 + b_1) \otimes (x V + b) $ if self.is_gated: x = g * self.linear_v(x) + # Otherwise else: x = g + # Apply dropout x = self.dropout(x) + # $(f(x W_1 + b_1) \otimes (x V + b)) W_2 + b_2$ or $f(x W_1 + b_1) W_2 + b_2$ + # depending on whether it is gated return self.layer2(x) diff --git a/labml_nn/transformers/glu_variants/experiment.py b/labml_nn/transformers/glu_variants/experiment.py index b4fb4e47..89c21b70 100644 --- a/labml_nn/transformers/glu_variants/experiment.py +++ b/labml_nn/transformers/glu_variants/experiment.py @@ -6,9 +6,11 @@ summary: > for the position-wise feedforward network (FFN). --- -# Train Autoregressive Transformer +# Gated Linear Units and Variants This trains a simple [transformer](../../) model for auto-regression. 
+We try different variants for the [position-wise feedforward network](../feed_forward). +The reusable & configurable are defined in [`configs.py`](configs.html). """ import torch @@ -72,7 +74,7 @@ def autoregressive_model(c: Configs): @option(Configs.transformer) def transformer_c(c: Configs): """ - Initialize the configurable transformer encoder for our autoregressive model + Initialize the [configurable transformer](../configs.html) encoder for our autoregressive model. """ tc = TransformerConfigs() tc.n_src_vocab = c.n_tokens @@ -104,6 +106,9 @@ def main(): 'inner_iterations': 10, # GLU Variant, one of GLU, Bilinear, ReGLU, GEGLU, SwiGLU + # + # These are defined in the [configurable FFN](../configs.html#FFN) + # implementation 'transformer.ffn.glu_variant': 'Bilinear', # Transformer configurations diff --git a/labml_nn/transformers/glu_variants/simple.py b/labml_nn/transformers/glu_variants/simple.py index 0173cb29..0cf7f494 100644 --- a/labml_nn/transformers/glu_variants/simple.py +++ b/labml_nn/transformers/glu_variants/simple.py @@ -6,9 +6,13 @@ summary: > for the position-wise feedforward network (FFN). --- -# Train Autoregressive Transformer +# Gated Linear Units and Variants This trains a simple [transformer](../../) model for auto-regression. +We try different variants for the [position-wise feedforward network](../feed_forward). + +*This is a simpler implementation that doesn't use [`labml.configs`](experiment.html) module. +We decided to write a simpler implementation to make it easier readers who are not familiar.* """ import dataclasses @@ -56,6 +60,9 @@ class AutoregressiveModel(nn.Module): @dataclasses.dataclass class Configs: + """ + ### Configurations + """ d_model: int = 512 seq_len: int = 128 batch_size: int = 32 @@ -69,71 +76,130 @@ class Configs: class TinyShakespeareDataset(Dataset): + """ + ### Tiny Shakespeare Dataset + """ + def __init__(self, seq_len: int): + # Location of the text file path = lab.get_data_path() / 'tiny_shakespeare.txt' + # Download the file download_file('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt', path) + # Read the downloaded file with open(str(path), 'r') as f: text = f.read() + # Extract the characters chars = list(set(text)) + # Character to id (integer) map self.stoi = {c: i for i, c in enumerate(chars)} + # Id to character map self.itos = {i: c for i, c in enumerate(chars)} + # Length of a training sample self.seq_len = seq_len + # Data in the form of a tensor of ids self.data = self.text_to_i(text) def text_to_i(self, text: str): + """ + Transform the text into a tensor of ids + """ return torch.tensor([self.stoi[c] for c in text], dtype=torch.long) def __len__(self): + """ + Number of samples in the dataset. 
+ + *This will read the dataset `seq_len` times in a single epoch.* + """ return len(self.data) - self.seq_len - 1 def __getitem__(self, idx): + """ + Return a sample + """ return self.data[idx:idx + self.seq_len], self.data[idx + 1:idx + self.seq_len + 1] class Trainer: + """ + ## Trainer + """ + def __init__(self, configs: Configs): + # Get the device self.device = torch.device('cpu') if torch.cuda.is_available(): self.device = torch.device('cuda:0') + # Initialize the dataset self.dataset = TinyShakespeareDataset(configs.seq_len) - self.dataloader = DataLoader(self.dataset, batch_size=configs.batch_size, collate_fn=transpose_batch, + # Initialize the dataloader + self.dataloader = DataLoader(self.dataset, + batch_size=configs.batch_size, + collate_fn=transpose_batch, shuffle=True) + # FFN with Gated Linear Unit + # $$FFN_{GLU}(x)(x, W_1, V, W_2) = (\sigma(x W_1) \otimes x V) W_2$$ if configs.glu_variant == 'GLU': ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Sigmoid(), True, False, False, False) + # FFN with Bilinear hidden layer + # $$FFN_{Bilinear}(x)(x, W_1, V, W_2) = (x W_1 \otimes x V) W_2$$ elif configs.glu_variant == 'Bilinear': ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Identity(), True, False, False, False) + # FFN with ReLU gate + # $$FFN_{ReGLU}(x)(x, W_1, V, W_2) = (\max(0, x W_1) \otimes x V) W_2$$ elif configs.glu_variant == 'ReGLU': ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU(), True, False, False, False) + # FFN with GELU gate + # $$FFN_{GEGLU}(x)(x, W_1, V, W_2) = (\text{GELU}(x W_1) \otimes x V) W_2$$ elif configs.glu_variant == 'GEGLU': ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU(), True, False, False, False) + # FFN with Swish gate + # $$FFN_{SwiGLU}(x)(x, W_1, V, W_2) = (\text{Swish}_1(x W_1) \otimes x V) W_2$$ + # where $\text{Swish}_\beta(x) = x \sigma(\beta x)$ elif configs.glu_variant == 'SwiGLU': ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.SiLU(), True, False, False, False) + # FFN with ReLU activation + # $$FFN_{ReLU}(x)(x, W_1, W_2, b_1, b_2) = \text{ReLU}_1(x W_1 + b_1) W_2 + b_2$$ elif configs.glu_variant == 'ReLU': ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU()) + # FFN with ReLU activation + # $$FFN_{GELU}(x)(x, W_1, W_2, b_1, b_2) = \text{GELU}_1(x W_1 + b_1) W_2 + b_2$$ elif configs.glu_variant == 'GELU': ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU()) else: raise ValueError(f'Unknown variant {configs.glu_variant}') + # Number of different characters n_chars = len(self.dataset.stoi) + + # Initialize [Multi-Head Attention module](../mha.html) + mha = MultiHeadAttention(configs.n_heads, configs.d_model, configs.dropout) + # Initialize the [Transformer Block](../models.html#TransformerLayer) + transformer_layer = TransformerLayer(d_model=configs.d_model, self_attn=mha, src_attn=None, + feed_forward=ffn, dropout_prob=configs.dropout) + # Initialize the model with an + # [embedding layer](../models.html#EmbeddingsWithPositionalEncoding) + # (with fixed positional encoding) + # [transformer encoder](../models.html#Encoder) and + # a linear layer to generate logits. 
self.model = AutoregressiveModel(EmbeddingsWithPositionalEncoding(configs.d_model, n_chars), - Encoder(TransformerLayer( - d_model=configs.d_model, - self_attn=MultiHeadAttention(configs.n_heads, configs.d_model, - configs.dropout), - src_attn=None, - feed_forward=ffn, - dropout_prob=configs.dropout - ), configs.n_layers), + Encoder(transformer_layer, configs.n_layers), nn.Linear(configs.d_model, n_chars)) + + # Move the model to the current device self.model.to(self.device) + # Initialize [Noam optimizer](../../optimizers/noam.html) self.optimizer = Noam(self.model.parameters(), lr=1.0, warmup=2_000, d_model=configs.d_model) + # Cross-entropy loss self.loss_func = nn.CrossEntropyLoss() + # Number of training epochs; + # *note that our dataset definition repeats the data `seq_len` times in a single epoch self.epochs = configs.epochs + # Gradient clipping norm self.grad_norm_clip = configs.grad_norm_clip # Set tracker configurations @@ -166,18 +232,28 @@ class Trainer: logger.log(log) def train(self): + """ + ### Train the model + """ + + # Loop for the given number of epochs for _ in monit.loop(self.epochs): + # Iterate over the minibatches for i, batch in monit.enum('Train', self.dataloader): # Move data to the device data, target = batch[0].to(self.device), batch[1].to(self.device) + # Set tracker step, as the number of characters trained on tracker.add_global_step(data.shape[0] * data.shape[1]) + # Set model state to training self.model.train() + # Evaluate the model output = self.model(data) - # Calculate and log loss + # Calculate loss loss = self.loss_func(output.view(-1, output.shape[-1]), target.view(-1)) + # Log the loss tracker.add("loss.train", loss) # Calculate gradients @@ -186,12 +262,13 @@ class Trainer: torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip) # Take optimizer step self.optimizer.step() - # Log the model parameters and gradients on last batch of every epoch + # Log the model parameters and gradients if (i + 1) % 100 == 0: tracker.add('model', self.model) # Clear the gradients self.optimizer.zero_grad() + # Generate a sample if (i + 1) % 100 == 0: self.model.eval() with torch.no_grad(): @@ -201,6 +278,7 @@ class Trainer: if (i + 1) % 10 == 0: tracker.save() + # Save the model experiment.save_checkpoint() @@ -212,12 +290,14 @@ def main(): # Load configurations experiment.configs(dataclasses.asdict(configs)) + # Create trainer trainer = Trainer(configs) + # Set models for training and loading experiment.add_pytorch_models({'model': trainer.model}) # Start the experiment with experiment.start(): - # `TrainValidConfigs.run` + # Train the model trainer.train()
-
+
- +

+

FFN Configurations

+

+

Creates a position-wise feedforward network as defined in feed_forward.py.

21class FeedForwardConfigs(BaseConfigs):
@@ -104,7 +108,7 @@

Position-wise feedforward layer

-
23    ffn: FeedForward
+
31    ffn: FeedForward
@@ -115,7 +119,7 @@

Number of features in the embedding

-
25    d_model: int
+
33    d_model: int