Files
Varuna Jayasiri 8d1b6ba010 translations
2022-08-30 16:28:56 +05:30

2040 lines
199 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="si">
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<meta name="description" content="ජීපීටී-නියෝක්ස් හි ආදර්ශ අර්ථ දැක්වීම මෙයයි."/>
<meta name="twitter:card" content="summary"/>
<meta name="twitter:image:src" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta name="twitter:title" content="ජීපීටී-නියෝක්ස් ආදර්ශ අර්ථ දැක්වීම"/>
<meta name="twitter:description" content="ජීපීටී-නියෝක්ස් හි ආදර්ශ අර්ථ දැක්වීම මෙයයි."/>
<meta name="twitter:site" content="@labmlai"/>
<meta name="twitter:creator" content="@labmlai"/>
<meta property="og:url" content="https://nn.labml.ai/neox/model.html"/>
<meta property="og:title" content="ජීපීටී-නියෝක්ස් ආදර්ශ අර්ථ දැක්වීම"/>
<meta property="og:image" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta property="og:site_name" content="ජීපීටී-නියෝක්ස් ආදර්ශ අර්ථ දැක්වීම"/>
<meta property="og:type" content="object"/>
<meta property="og:title" content="ජීපීටී-නියෝක්ස් ආදර්ශ අර්ථ දැක්වීම"/>
<meta property="og:description" content="ජීපීටී-නියෝක්ස් හි ආදර්ශ අර්ථ දැක්වීම මෙයයි."/>
<title>ජීපීටී-නියෝක්ස් ආදර්ශ අර්ථ දැක්වීම</title>
<link rel="shortcut icon" href="/icon.png"/>
<link rel="stylesheet" href="../pylit.css?v=1">
<link rel="canonical" href="https://nn.labml.ai/neox/model.html"/>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.18/dist/katex.min.css" integrity="sha384-zTROYFVGOfTw7JV7KUu8udsvW2fx4lWOsCEDqhBreBwlHI4ioVRtmIvEThzJHGET" crossorigin="anonymous">
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-4V3HC8HBLH"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-4V3HC8HBLH');
</script>
</head>
<body>
<div id='container'>
<div id="background"></div>
<div class='section'>
<div class='docs'>
<p>
<a class="parent" href="/">home</a>
<a class="parent" href="index.html">neox</a>
</p>
<p>
<a href="https://github.com/sponsors/labmlai" target="_blank">
<img alt="Sponsor"
src="https://img.shields.io/static/v1?label=Sponsor&message=%E2%9D%A4&logo=GitHub&color=%23fe8e86"
style="max-width:100%;"/></a>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations" target="_blank">
<img alt="Github"
src="https://img.shields.io/github/stars/labmlai/annotated_deep_learning_paper_implementations?style=social"
style="max-width:100%;"/></a>
<a href="https://twitter.com/labmlai" rel="nofollow" target="_blank">
<img alt="Twitter"
src="https://img.shields.io/twitter/follow/labmlai?style=social"
style="max-width:100%;"/></a>
</p>
<p>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/neox/model.py" target="_blank">
View code on Github</a>
</p>
</div>
</div>
<div class='section' id='section-0'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-0'>#</a>
</div>
<h1>ජීපීටී-නියෝක්ස්ආකෘතිය</h1>
<p>ජීපීටී-නියෝක්ස්ආකෘතියේ ස්ථර සඳහා කේතය සහ 20B මුරපොල පූරණය කිරීමේ කේතය මෙන්න. </p>
<p>ස්ථර <code class="highlight"><span></span><span class="n">load_state</span></code>
වල ඇති ක්රමය එම ස්ථරයේ මුරපොලවල් පූරණය කරයි. මුරපොලවල් පැටවීමේ සහායකයන් ක්රියාත්මක වේ <a href="checkpoint.html"><code class="highlight"><span></span><span class="n">checkpoint</span><span class="o">.</span><span class="n">py</span></code>
</a></p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">16</span><span></span><span class="kn">import</span> <span class="nn">copy</span>
<span class="lineno">17</span><span class="kn">import</span> <span class="nn">math</span>
<span class="lineno">18</span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Set</span><span class="p">,</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Generator</span><span class="p">,</span> <span class="n">Tuple</span>
<span class="lineno">19</span>
<span class="lineno">20</span><span class="kn">import</span> <span class="nn">torch</span>
<span class="lineno">21</span><span class="kn">from</span> <span class="nn">torch</span> <span class="kn">import</span> <span class="n">nn</span>
<span class="lineno">22</span><span class="kn">from</span> <span class="nn">torch.cuda.amp</span> <span class="kn">import</span> <span class="n">autocast</span>
<span class="lineno">23</span>
<span class="lineno">24</span><span class="kn">from</span> <span class="nn">labml</span> <span class="kn">import</span> <span class="n">monit</span>
<span class="lineno">25</span><span class="kn">from</span> <span class="nn">labml_nn.neox</span> <span class="kn">import</span> <span class="n">checkpoint</span>
<span class="lineno">26</span><span class="kn">from</span> <span class="nn">labml_nn.neox.utils.cache</span> <span class="kn">import</span> <span class="n">get_cache</span></pre></div>
</div>
</div>
<div class='section' id='section-1'>
<div class='docs'>
<div class='section-link'>
<a href='#section-1'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">29</span><span class="k">class</span> <span class="nc">NeoXModule</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-2'>
<div class='docs'>
<div class='section-link'>
<a href='#section-2'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">30</span> <span class="k">def</span> <span class="nf">load_state</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">p1</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">],</span> <span class="n">p2</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]):</span>
<span class="lineno">31</span> <span class="k">pass</span></pre></div>
</div>
</div>
<div class='section' id='section-3'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-3'>#</a>
</div>
<h2>කාවැද්දීමස්ථරය</h2>
<p>මෙයමුරපොලට පැටවීම සඳහා කේතය සහිත සම්මත කාවැද්දීම් ස්ථරයකි. </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">34</span><span class="k">class</span> <span class="nc">Embedding</span><span class="p">(</span><span class="n">NeoXModule</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-4'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-4'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">n_vocab</span></code>
වචන මාලාවේ ප්රමාණය වේ </li>
<li><code class="highlight"><span></span><span class="n">n_hidden</span></code>
මෙම කාවැද්දීම් ප්රමාණය</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">41</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n_vocab</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">50_432</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">6_144</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-5'>
<div class='docs'>
<div class='section-link'>
<a href='#section-5'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">46</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="lineno">47</span>
<span class="lineno">48</span> <span class="bp">self</span><span class="o">.</span><span class="n">emb</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Embedding</span><span class="p">(</span><span class="n">n_vocab</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-6'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-6'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">x</span></code>
හැඩයේ ටෝකන් හැඳුනුම් වේ <code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">]</span></code>
</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">50</span> <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-7'>
<div class='docs'>
<div class='section-link'>
<a href='#section-7'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">54</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">emb</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-8'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-8'>#</a>
</div>
<p> මුරපොලපූරණය කිරීමට කේතය</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">56</span> <span class="k">def</span> <span class="nf">load_state</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">p1</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">],</span> <span class="n">p2</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]):</span></pre></div>
</div>
</div>
<div class='section' id='section-9'>
<div class='docs'>
<div class='section-link'>
<a href='#section-9'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">60</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Load embedding layer&#39;</span><span class="p">):</span>
<span class="lineno">61</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_dim_0</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">emb</span><span class="o">.</span><span class="n">weight</span><span class="p">,</span> <span class="s1">&#39;word_embeddings.weight&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-10'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-10'>#</a>
</div>
<h2>රොටරිස්ථානීය කාවැද්දීම්</h2>
<p>ජීපීටී-නියෝක්ස් <a href="https://papers.labml.ai/paper/2104.09864">භ්රමණ ස්ථානීය කාවැද්දීම් (කඹය)</a>භාවිතා කරයි. </p>
<p>අපින්යාය වැඩි සටහන් සමඟ <a href="https://nn.labml.ai/transformers/rope/index.html">මෙහි</a> කඹය ක්රියාත්මක කිරීම විස්තර කර ඇත. </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">64</span><span class="k">class</span> <span class="nc">RoPE</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-11'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-11'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">d_rope</span></code>
කඹය කාවැද්දීම් සඳහා විශේෂාංග ගණන </li>
<li><code class="highlight"><span></span><span class="n">base</span></code>
සඳහා පදනම වේ <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1.12379em;vertical-align:0em;"></span><span class="mord"><span class="mord coloredeq eql" style=""><span class="mord" style="">10000</span></span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:1.12379em;"><span style="top:-3.3973400000000002em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.0377857142857143em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.5020714285714285em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span><span class="mopen mtight">(</span><span class="mord mathnormal mtight">i</span><span class="mbin mtight"></span><span class="mord mtight">1</span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span></span></span></span></span></span></span></span></span></span></span></span></span>, එය පැහැර හරින <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.64444em;vertical-align:0em;"></span><span class="mord coloredeq eql" style=""><span class="mord" style="">10000</span></span></span></span></span></span></li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">74</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">d_rope</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">base</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">10_000.</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-12'>
<div class='docs'>
<div class='section-link'>
<a href='#section-12'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">79</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-13'>
<div class='docs'>
<div class='section-link'>
<a href='#section-13'>#</a>
</div>
<p>විශේෂාංග <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span> සඳහා ගබඩා කිරීමට </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">82</span> <span class="bp">self</span><span class="o">.</span><span class="n">theta</span> <span class="o">=</span> <span class="kc">None</span></pre></div>
</div>
</div>
<div class='section' id='section-14'>
<div class='docs'>
<div class='section-link'>
<a href='#section-14'>#</a>
</div>
<p>හැඹිලිය <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqf" style=""><span class="mop" style=""><span style="">c</span><span style="">o</span><span style="">s</span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span></span> සහ <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mop">sin</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">84</span> <span class="bp">self</span><span class="o">.</span><span class="n">cos_cached</span> <span class="o">=</span> <span class="kc">None</span>
<span class="lineno">85</span> <span class="bp">self</span><span class="o">.</span><span class="n">sin_cached</span> <span class="o">=</span> <span class="kc">None</span></pre></div>
</div>
</div>
<div class='section' id='section-15'>
<div class='docs'>
<div class='section-link'>
<a href='#section-15'>#</a>
</div>
<p>සඳහාමූලික <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1.12379em;vertical-align:0em;"></span><span class="mord"><span class="mord coloredeq eql" style=""><span class="mord" style="">10000</span></span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:1.12379em;"><span style="top:-3.3973400000000002em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.0377857142857143em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.5020714285714285em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span><span class="mopen mtight">(</span><span class="mord mathnormal mtight">i</span><span class="mbin mtight"></span><span class="mord mtight">1</span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span></span></span></span></span></span></span></span></span></span></span></span></span> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">88</span> <span class="bp">self</span><span class="o">.</span><span class="n">base</span> <span class="o">=</span> <span class="n">base</span></pre></div>
</div>
</div>
<div class='section' id='section-16'>
<div class='docs'>
<div class='section-link'>
<a href='#section-16'>#</a>
</div>
<p>කඹයසඳහා විශේෂාංග ගණන </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">90</span> <span class="bp">self</span><span class="o">.</span><span class="n">d_rope</span> <span class="o">=</span> <span class="n">d_rope</span></pre></div>
</div>
</div>
<div class='section' id='section-17'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-17'>#</a>
</div>
<h3>විශේෂාංගකරකවන්න</h3>
<p><span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.22902em;vertical-align:-0.25em;"></span><span class="mopen">[</span><span class="mord"></span><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.97902em;"><span style="top:-3.363em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8800285714285714em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.384em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span><span class="mbin mtight">+</span><span class="mord mtight">1</span><span class="mclose mtight">)</span></span></span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"></span><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.97902em;"><span style="top:-3.363em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8800285714285714em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.384em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span><span class="mbin mtight">+</span><span class="mord mtight">2</span><span class="mclose mtight">)</span></span></span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord">...</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"></span><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8879999999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mathnormal mtight">d</span><span class="mclose mtight">)</span></span></span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8879999999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mtight">1</span><span class="mclose mtight">)</span></span></span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8879999999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mtight">2</span><span class="mclose mtight">)</span></span></span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord">...</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"></span><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.97902em;"><span style="top:-3.363em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8800285714285714em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.384em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span><span class="mclose mtight">)</span></span></span></span></span></span></span></span></span><span class="mclose">]</span></span></span></span></span></p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">92</span> <span class="nd">@staticmethod</span>
<span class="lineno">93</span> <span class="k">def</span> <span class="nf">rotate_half</span><span class="p">(</span><span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-18'>
<div class='docs'>
<div class='section-link'>
<a href='#section-18'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">99</span> <span class="n">x1</span><span class="p">,</span> <span class="n">x2</span> <span class="o">=</span> <span class="n">x</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">//</span> <span class="mi">2</span><span class="p">],</span> <span class="n">x</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">//</span> <span class="mi">2</span><span class="p">:]</span>
<span class="lineno">100</span> <span class="k">return</span> <span class="n">torch</span><span class="o">.</span><span class="n">cat</span><span class="p">((</span><span class="o">-</span><span class="n">x2</span><span class="p">,</span> <span class="n">x1</span><span class="p">),</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-19'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-19'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">x</span></code>
හැඩය ඇත <code class="highlight"><span></span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">seq</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">,</span> <span class="n">d_k</span><span class="p">]</span></code>
</li>
<li><code class="highlight"><span></span><span class="n">offset</span></code>
යනු ආරම්භක ස්ථානයයි <code class="highlight"><span></span><span class="n">x</span></code>
. පෙර තනතුරු වල යතුරු සහ විමසුම් අප හැඹිලි කළ <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.5782em;vertical-align:-0.0391em;"></span><span class="mrel">&gt;</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:0.64444em;vertical-align:0em;"></span><span class="mord">0</span></span></span></span></span> විට මෙය සිදු වේ</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">102</span> <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-20'>
<div class='docs'>
<div class='section-link'>
<a href='#section-20'>#</a>
</div>
<p>සත්යඅනුක්රමයේ දිග ලබා ගන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">110</span> <span class="n">seq_len</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">3</span><span class="p">]</span> <span class="o">+</span> <span class="n">offset</span></pre></div>
</div>
</div>
<div class='section' id='section-21'>
<div class='docs'>
<div class='section-link'>
<a href='#section-21'>#</a>
</div>
<p>ආරම්භකරන්න <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.69444em;vertical-align:0em;"></span><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span></span></span></span></span> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">113</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">theta</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span></pre></div>
</div>
</div>
<div class='section' id='section-22'>
<div class='docs'>
<div class='section-link'>
<a href='#section-22'>#</a>
</div>
<p> <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1.12379em;vertical-align:0em;"></span><span class="mord"><span class="mord coloredeq eql" style=""><span class="mord" style="">10000</span></span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:1.12379em;"><span style="top:-3.3973400000000002em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.0377857142857143em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.5020714285714285em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span><span class="mopen mtight">(</span><span class="mord mathnormal mtight">i</span><span class="mbin mtight"></span><span class="mord mtight">1</span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span></span></span></span></span></span></span></span></span></span></span></span></span> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">115</span> <span class="n">theta</span> <span class="o">=</span> <span class="mf">1.0</span> <span class="o">/</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">base</span> <span class="o">**</span> <span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">d_rope</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">float</span><span class="p">()</span> <span class="o">/</span> <span class="bp">self</span><span class="o">.</span><span class="n">d_rope</span><span class="p">))</span>
<span class="lineno">116</span> <span class="bp">self</span><span class="o">.</span><span class="n">theta</span> <span class="o">=</span> <span class="n">theta</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">device</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-23'>
<div class='docs'>
<div class='section-link'>
<a href='#section-23'>#</a>
</div>
<p>ආරම්භකරන්න <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqf" style=""><span class="mop" style=""><span style="">c</span><span style="">o</span><span style="">s</span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span></span> සහ <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mop">sin</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span> හැඹිලිය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">119</span> <span class="k">if</span> <span class="p">(</span>
<span class="lineno">120</span> <span class="bp">self</span><span class="o">.</span><span class="n">cos_cached</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span>
<span class="lineno">121</span> <span class="n">seq_len</span> <span class="o">&gt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">cos_cached</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="ow">or</span>
<span class="lineno">122</span> <span class="bp">self</span><span class="o">.</span><span class="n">cos_cached</span><span class="o">.</span><span class="n">device</span> <span class="o">!=</span> <span class="n">x</span><span class="o">.</span><span class="n">device</span> <span class="ow">or</span>
<span class="lineno">123</span> <span class="bp">self</span><span class="o">.</span><span class="n">cos_cached</span><span class="o">.</span><span class="n">dtype</span> <span class="o">!=</span> <span class="n">x</span><span class="o">.</span><span class="n">dtype</span>
<span class="lineno">124</span> <span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-24'>
<div class='docs'>
<div class='section-link'>
<a href='#section-24'>#</a>
</div>
<p>ස්ථානදර්ශක ලබා ගන්න <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.43056em;vertical-align:0em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span></span></span></span></span> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">126</span> <span class="n">seq_idx</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">seq_len</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">x</span><span class="o">.</span><span class="n">device</span><span class="p">)</span><span class="o">.</span><span class="n">type_as</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">theta</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-25'>
<div class='docs'>
<div class='section-link'>
<a href='#section-25'>#</a>
</div>
<p><span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqh" style=""><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span></span> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">128</span> <span class="n">idx_theta</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">einsum</span><span class="p">(</span><span class="s2">&quot;s,d-&gt;sd&quot;</span><span class="p">,</span> <span class="n">seq_idx</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">theta</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-26'>
<div class='docs'>
<div class='section-link'>
<a href='#section-26'>#</a>
</div>
<p>පේළියසඳහා <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.43056em;vertical-align:0em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span></span></span></span></span> අපට ඇති පරිදි සංයුක්ත කරන්න</p>
<p><span ><span class="katex-display"><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.26202em;vertical-align:-0.5120199999999999em;"></span><span class="mopen">[</span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord"><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">0</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord"><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord">...</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord"><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.7287800000000004em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8800285714285714em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.384em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.5120199999999999em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord"><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">0</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord"><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord">...</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord"><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3448em;"><span style="top:-2.7287800000000004em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8800285714285714em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.384em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.5120199999999999em;"><span></span></span></span></span></span></span><span class="mclose">]</span></span></span></span></span></span> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">132</span> <span class="n">idx_theta2</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">cat</span><span class="p">((</span><span class="n">idx_theta</span><span class="p">,</span> <span class="n">idx_theta</span><span class="p">),</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">device</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-27'>
<div class='docs'>
<div class='section-link'>
<a href='#section-27'>#</a>
</div>
<p>ගණනයකරන්න <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqf" style=""><span class="mop" style=""><span style="">c</span><span style="">o</span><span style="">s</span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span></span> සහ <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mop">sin</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span> fp32 </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">135</span> <span class="k">with</span> <span class="n">autocast</span><span class="p">(</span><span class="n">enabled</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="lineno">136</span> <span class="n">idx_theta2</span> <span class="o">=</span> <span class="n">idx_theta2</span><span class="o">.</span><span class="n">float</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-28'>
<div class='docs'>
<div class='section-link'>
<a href='#section-28'>#</a>
</div>
<p>හිසමානයක් එක් කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">138</span> <span class="bp">self</span><span class="o">.</span><span class="n">cos_cached</span> <span class="o">=</span> <span class="n">idx_theta2</span><span class="o">.</span><span class="n">cos</span><span class="p">()[:,</span> <span class="kc">None</span><span class="p">,</span> <span class="p">:]</span>
<span class="lineno">139</span> <span class="bp">self</span><span class="o">.</span><span class="n">sin_cached</span> <span class="o">=</span> <span class="n">idx_theta2</span><span class="o">.</span><span class="n">sin</span><span class="p">()[:,</span> <span class="kc">None</span><span class="p">,</span> <span class="p">:]</span></pre></div>
</div>
</div>
<div class='section' id='section-29'>
<div class='docs'>
<div class='section-link'>
<a href='#section-29'>#</a>
</div>
<p>ඒවාහැඹිලිය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">142</span> <span class="bp">self</span><span class="o">.</span><span class="n">cos_cached</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cos_cached</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
<span class="lineno">143</span> <span class="bp">self</span><span class="o">.</span><span class="n">sin_cached</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sin_cached</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-30'>
<div class='docs'>
<div class='section-link'>
<a href='#section-30'>#</a>
</div>
<p>විශේෂාංගබෙදන්න. <code class="highlight"><span></span><span class="n">d_rope</span></code>
විශේෂාංග සඳහා පමණක් අපි කඹය යොදන්නෙමු </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">146</span> <span class="n">x_rope</span><span class="p">,</span> <span class="n">x_pass</span> <span class="o">=</span> <span class="n">x</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="p">:</span><span class="bp">self</span><span class="o">.</span><span class="n">d_rope</span><span class="p">],</span> <span class="n">x</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">d_rope</span><span class="p">:]</span></pre></div>
</div>
</div>
<div class='section' id='section-31'>
<div class='docs'>
<div class='section-link'>
<a href='#section-31'>#</a>
</div>
<p>හැඹිලියසිට පාපය සහ කෝස් අගයන් ලබා ගන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">149</span> <span class="n">cos</span><span class="p">,</span> <span class="n">sin</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cos_cached</span><span class="p">[</span><span class="n">offset</span><span class="p">:</span> <span class="n">seq_len</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">sin_cached</span><span class="p">[</span><span class="n">offset</span><span class="p">:</span> <span class="n">seq_len</span><span class="p">]</span></pre></div>
</div>
</div>
<div class='section' id='section-32'>
<div class='docs'>
<div class='section-link'>
<a href='#section-32'>#</a>
</div>
<p>කඹයකාවැද්දීම්</p>
<span ><span class="katex-display"><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:3.42324em;vertical-align:-1.4616200000000001em;"></span><span class="mord"><span class="mtable"><span class="col-align-r"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.9616199999999997em;"><span style="top:-3.96162em;"><span class="pstrut" style="height:3.8116199999999996em;"></span><span class="mord"><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size4">(</span></span><span class="mord"><span class="mtable"><span class="col-align-c"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8116199999999998em;"><span style="top:-3.81162em;"><span class="pstrut" style="height:3.20162em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.0448em;"><span style="top:-2.5834080000000004em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight coloredeq eqn" style=""><span class="mord mathnormal mtight" style="">m</span></span></span></span><span style="top:-3.2198em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mathnormal mtight">i</span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.11659199999999997em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mop">cos</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqh" style=""><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin"></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.20162em;"><span style="top:-2.883408em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight coloredeq eqn" style=""><span class="mord mathnormal mtight" style="">m</span></span></span></span><span style="top:-3.5856000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mathnormal mtight">i</span><span class="mbin mtight">+</span><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8800285714285714em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.384em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.11659199999999997em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mop">sin</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqh" style=""><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span><span style="top:-2.25em;"><span class="pstrut" style="height:3.20162em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.20162em;"><span style="top:-2.883408em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight coloredeq eqn" style=""><span class="mord mathnormal mtight" style="">m</span></span></span></span><span style="top:-3.5856000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mathnormal mtight">i</span><span class="mbin mtight">+</span><span class="mord mtight"><span class="mopen nulldelimiter sizing reset-size3 size6"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8800285714285714em;"><span style="top:-2.656em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mtight">2</span></span></span></span><span style="top:-3.2255000000000003em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line mtight" style="border-bottom-width:0.049em;"></span></span><span style="top:-3.384em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.344em;"><span></span></span></span></span></span><span class="mclose nulldelimiter sizing reset-size3 size6"></span></span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.11659199999999997em;"><span></span></span></span></span></span></span><span class="mord coloredeq eqf" style=""><span class="mop" style=""><span style="">c</span><span style="">o</span><span style="">s</span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mord"><span class="mord mathnormal">x</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.0448em;"><span style="top:-2.5834080000000004em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight coloredeq eqn" style=""><span class="mord mathnormal mtight" style="">m</span></span></span></span><span style="top:-3.2198em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mopen mtight">(</span><span class="mord mathnormal mtight">i</span><span class="mclose mtight">)</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.11659199999999997em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mop">sin</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord coloredeq eqh" style=""><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="">m</span></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord coloredeq eqk" style=""><span class="mord mathnormal" style="margin-right:0.02778em">θ</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mathnormal mtight" style="">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:1.3116200000000002em;"><span></span></span></span></span></span></span></span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size4">)</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:1.4616200000000001em;"><span></span></span></span></span></span></span></span></span></span></span></span></span><p>සඳහා <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.69862em;vertical-align:-0.0391em;"></span><span class="mord mathnormal">i</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel"></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1.2251079999999999em;vertical-align:-0.345em;"></span><span class="mord"><span class="mord">1</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord">2</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord">...</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8801079999999999em;"><span style="top:-2.6550000000000002em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">2</span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.394em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.345em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span></span></span> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">161</span> <span class="n">x_rope</span> <span class="o">=</span> <span class="p">(</span><span class="n">x_rope</span> <span class="o">*</span> <span class="n">cos</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">rotate_half</span><span class="p">(</span><span class="n">x_rope</span><span class="p">)</span> <span class="o">*</span> <span class="n">sin</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-33'>
<div class='docs'>
<div class='section-link'>
<a href='#section-33'>#</a>
</div>
<p>කඹයකාවැද්දීම් ලබා නොගත් විශේෂාංග සමඟ සංයුක්ත වන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">164</span> <span class="k">return</span> <span class="n">torch</span><span class="o">.</span><span class="n">cat</span><span class="p">((</span><span class="n">x_rope</span><span class="p">,</span> <span class="n">x_pass</span><span class="p">),</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-34'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-34'>#</a>
</div>
<h2>අවධානයස්ථරය</h2>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">167</span><span class="k">class</span> <span class="nc">AttentionLayer</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-35'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-35'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">n_hidden</span></code>
කාවැද්දීම් වල විශේෂාංග ගණන </li>
<li><code class="highlight"><span></span><span class="n">n_heads</span></code>
අවධානය යොමු ප්රධානීන් සංඛ්යාව </li>
<li><code class="highlight"><span></span><span class="n">rope_percentage</span></code>
කඹය කාවැද්දීම් එකතු කිරීම සඳහා විශේෂාංග ප්රතිශතය </li>
<li><code class="highlight"><span></span><span class="n">mask_fill</span></code>
අවධානය යොමු න්යාසය සඳහා ආවරණ පිරවුම් අගය</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">172</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">6_144</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">64</span><span class="p">,</span> <span class="n">rope_percentage</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.25</span><span class="p">,</span>
<span class="lineno">173</span> <span class="n">mask_fill</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">-</span><span class="mf">10_000.0</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-36'>
<div class='docs'>
<div class='section-link'>
<a href='#section-36'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">180</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="lineno">181</span>
<span class="lineno">182</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_heads</span> <span class="o">=</span> <span class="n">n_heads</span>
<span class="lineno">183</span> <span class="bp">self</span><span class="o">.</span><span class="n">mask_fill</span> <span class="o">=</span> <span class="n">mask_fill</span></pre></div>
</div>
</div>
<div class='section' id='section-37'>
<div class='docs'>
<div class='section-link'>
<a href='#section-37'>#</a>
</div>
<p>විමසුම, යතුර සහ වටිනාකම සඳහා රේඛීය ස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">186</span> <span class="bp">self</span><span class="o">.</span><span class="n">qkv_lin</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="n">n_hidden</span><span class="p">,</span> <span class="n">n_hidden</span> <span class="o">*</span> <span class="mi">3</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-38'>
<div class='docs'>
<div class='section-link'>
<a href='#section-38'>#</a>
</div>
<p>අවසානරේඛීය ස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">188</span> <span class="bp">self</span><span class="o">.</span><span class="n">output</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="n">n_hidden</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-39'>
<div class='docs'>
<div class='section-link'>
<a href='#section-39'>#</a>
</div>
<p>හිසකටවිශේෂාංග ගණන </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">191</span> <span class="n">d_k</span> <span class="o">=</span> <span class="n">n_hidden</span> <span class="o">//</span> <span class="n">n_heads</span></pre></div>
</div>
</div>
<div class='section' id='section-40'>
<div class='docs'>
<div class='section-link'>
<a href='#section-40'>#</a>
</div>
<p>කඹයකාවැද්දීම මොඩියුලය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">193</span> <span class="bp">self</span><span class="o">.</span><span class="n">rope</span> <span class="o">=</span> <span class="n">RoPE</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">d_k</span> <span class="o">*</span> <span class="n">rope_percentage</span><span class="p">))</span></pre></div>
</div>
</div>
<div class='section' id='section-41'>
<div class='docs'>
<div class='section-link'>
<a href='#section-41'>#</a>
</div>
<p>අවධානයපරිමාණ සාධකය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">196</span> <span class="bp">self</span><span class="o">.</span><span class="n">scale</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">/</span> <span class="n">math</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">d_k</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-42'>
<div class='docs'>
<div class='section-link'>
<a href='#section-42'>#</a>
</div>
<p>හේතුආවරණ හැඹිලි කිරීමට </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">199</span> <span class="bp">self</span><span class="o">.</span><span class="n">causal_mask</span> <span class="o">=</span> <span class="kc">None</span></pre></div>
</div>
</div>
<div class='section' id='section-43'>
<div class='docs'>
<div class='section-link'>
<a href='#section-43'>#</a>
</div>
<p>අවධානයයොමු කරන්න සොෆ්ට්මැක්ස් මොඩියුලය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">202</span> <span class="bp">self</span><span class="o">.</span><span class="n">softmax</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Softmax</span><span class="p">(</span><span class="n">dim</span><span class="o">=-</span><span class="mi">2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-44'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-44'>#</a>
</div>
<h4>හේතුආවරණ ගණනය කරන්න</h4>
<ul><li><code class="highlight"><span></span><span class="n">attn</span></code>
හැඩයේ <a href="batch_size, query_seq_len, key_seq_len, n_heads">කාණ්ඩ_ප්රමාණය, විමර්_සෙක්_ලන්, යතුරු_සේක්_ලන්, එන්_හෙඩ්ස්</a>ඇත</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">204</span> <span class="k">def</span> <span class="nf">_get_mask</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">attn</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-45'>
<div class='docs'>
<div class='section-link'>
<a href='#section-45'>#</a>
</div>
<p>විමසුමසහ යතුරු දිග </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">212</span> <span class="n">nq</span><span class="p">,</span> <span class="n">nk</span> <span class="o">=</span> <span class="n">attn</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">:</span><span class="mi">3</span><span class="p">]</span></pre></div>
</div>
</div>
<div class='section' id='section-46'>
<div class='docs'>
<div class='section-link'>
<a href='#section-46'>#</a>
</div>
<p>වෙස්මුහුණ සාදන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">215</span> <span class="k">if</span> <span class="p">(</span>
<span class="lineno">216</span> <span class="bp">self</span><span class="o">.</span><span class="n">causal_mask</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span>
<span class="lineno">217</span> <span class="bp">self</span><span class="o">.</span><span class="n">causal_mask</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">!=</span> <span class="n">nq</span> <span class="ow">or</span>
<span class="lineno">218</span> <span class="bp">self</span><span class="o">.</span><span class="n">causal_mask</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">!=</span> <span class="n">nk</span> <span class="ow">or</span>
<span class="lineno">219</span> <span class="bp">self</span><span class="o">.</span><span class="n">causal_mask</span><span class="o">.</span><span class="n">device</span> <span class="o">!=</span> <span class="n">attn</span><span class="o">.</span><span class="n">device</span>
<span class="lineno">220</span> <span class="p">):</span>
<span class="lineno">221</span> <span class="bp">self</span><span class="o">.</span><span class="n">causal_mask</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">triu</span><span class="p">(</span><span class="n">attn</span><span class="o">.</span><span class="n">new_ones</span><span class="p">([</span><span class="n">nq</span><span class="p">,</span> <span class="n">nk</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">bool</span><span class="p">),</span> <span class="mi">1</span> <span class="o">+</span> <span class="n">nk</span> <span class="o">-</span> <span class="n">nq</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-47'>
<div class='docs'>
<div class='section-link'>
<a href='#section-47'>#</a>
</div>
<p>හැඹිලියසිට ආපසු </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">224</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">causal_mask</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="p">:,</span> <span class="p">:,</span> <span class="kc">None</span><span class="p">]</span></pre></div>
</div>
</div>
<div class='section' id='section-48'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-48'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">x</span></code>
හැඩය ඇත <code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">]</span></code>
</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">226</span> <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-49'>
<div class='docs'>
<div class='section-link'>
<a href='#section-49'>#</a>
</div>
<p>විමසුම, යතුර සහ වටිනාකම් කාවැද්දීම් ලබා ගන්න (සියල්ල සංයුක්ත කර ඇත). පසුගිය මානයක් ප්රමාණය n_hidden -> සිට වෙනස් වනු ඇත <code class="highlight"><span></span><span class="mi">3</span> <span class="n">x</span> <span class="n">n_hidden</span></code>
</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">232</span> <span class="n">qkv</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">qkv_lin</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-50'>
<div class='docs'>
<div class='section-link'>
<a href='#section-50'>#</a>
</div>
<p>හැඩයවෙනස් කිරීමෙන් හිස් වලට බෙදන්න <code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">,</span> <span class="mi">3</span> <span class="o">*</span> <span class="n">d_k</span><span class="p">]</span></code>
</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">235</span> <span class="n">qkv</span> <span class="o">=</span> <span class="n">qkv</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="o">*</span><span class="n">qkv</span><span class="o">.</span><span class="n">shape</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_heads</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-51'>
<div class='docs'>
<div class='section-link'>
<a href='#section-51'>#</a>
</div>
<p>විමසුමටබෙදන්න, යතුර සහ හැඩය එක් එක් අගය කරන්න <code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">,</span> <span class="mi">3</span> <span class="o">*</span> <span class="n">d_k</span><span class="p">]</span></code>
</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">237</span> <span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">qkv</span><span class="p">,</span> <span class="n">qkv</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">//</span> <span class="mi">3</span><span class="p">,</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-52'>
<div class='docs'>
<div class='section-link'>
<a href='#section-52'>#</a>
</div>
<p>අපිපෙර ටෝකන වල තත්වයන් හැඹිලි කරන්නේ නම් </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">240</span> <span class="k">if</span> <span class="n">get_cache</span><span class="p">()</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;use_cache&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-53'>
<div class='docs'>
<div class='section-link'>
<a href='#section-53'>#</a>
</div>
<p>රාජ්යid ගේ ලබා ගන්න. අපි පෙර රාජ්යයන් ලබා ගැනීමට හා ඉදිරි රාජ්යයන් ගබඩා කිරීම සඳහා භාවිතා </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">242</span> <span class="n">prev_state_id</span><span class="p">,</span> <span class="n">next_state_id</span> <span class="o">=</span> <span class="n">get_cache</span><span class="p">()</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;state_ids&#39;</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-54'>
<div class='docs'>
<div class='section-link'>
<a href='#section-54'>#</a>
</div>
<p>හැඹිලියතිබේ නම් </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">244</span> <span class="k">if</span> <span class="n">prev_state_id</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span></pre></div>
</div>
</div>
<div class='section' id='section-55'>
<div class='docs'>
<div class='section-link'>
<a href='#section-55'>#</a>
</div>
<p>අතීතයතුරු සහ අගයන් ලබා ගන්න. මේවාට හැඩය ඇත <code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">prev_seq_len</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">,</span> <span class="n">d_k</span><span class="p">]</span></code>
</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">246</span> <span class="n">k_past</span><span class="p">,</span> <span class="n">v_past</span> <span class="o">=</span> <span class="n">get_cache</span><span class="p">()</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;attn_kv_</span><span class="si">{</span><span class="n">prev_state_id</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-56'>
<div class='docs'>
<div class='section-link'>
<a href='#section-56'>#</a>
</div>
<p>වත්මන්කාවැද්දීම් වල ඕෆ්සෙට් </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">248</span> <span class="n">offset</span> <span class="o">=</span> <span class="n">k_past</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span></pre></div>
</div>
</div>
<div class='section' id='section-57'>
<div class='docs'>
<div class='section-link'>
<a href='#section-57'>#</a>
</div>
<p>කඹයකාවැද්දීම් එකතු කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">251</span> <span class="n">q</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">rope</span><span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">offset</span><span class="o">=</span><span class="n">offset</span><span class="p">)</span>
<span class="lineno">252</span> <span class="n">k</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">rope</span><span class="p">(</span><span class="n">k</span><span class="p">,</span> <span class="n">offset</span><span class="o">=</span><span class="n">offset</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-58'>
<div class='docs'>
<div class='section-link'>
<a href='#section-58'>#</a>
</div>
<p>අතීතයසංයුක්ත කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">255</span> <span class="n">k</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">cat</span><span class="p">([</span><span class="n">k_past</span><span class="p">,</span> <span class="n">k</span><span class="p">],</span> <span class="n">dim</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="lineno">256</span> <span class="n">v</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">cat</span><span class="p">([</span><span class="n">v_past</span><span class="p">,</span> <span class="n">v</span><span class="p">],</span> <span class="n">dim</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="lineno">257</span> <span class="k">else</span><span class="p">:</span></pre></div>
</div>
</div>
<div class='section' id='section-59'>
<div class='docs'>
<div class='section-link'>
<a href='#section-59'>#</a>
</div>
<p>කඹයකාවැද්දීම් එකතු කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">259</span> <span class="n">q</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">rope</span><span class="p">(</span><span class="n">q</span><span class="p">)</span>
<span class="lineno">260</span> <span class="n">k</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">rope</span><span class="p">(</span><span class="n">k</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-60'>
<div class='docs'>
<div class='section-link'>
<a href='#section-60'>#</a>
</div>
<p>වත්මන්තත්වය සුරකින්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">263</span> <span class="n">get_cache</span><span class="p">()</span><span class="o">.</span><span class="n">push</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;attn_kv_</span><span class="si">{</span><span class="n">next_state_id</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">,</span> <span class="p">(</span><span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">))</span>
<span class="lineno">264</span> <span class="k">else</span><span class="p">:</span></pre></div>
</div>
</div>
<div class='section' id='section-61'>
<div class='docs'>
<div class='section-link'>
<a href='#section-61'>#</a>
</div>
<p>හැඹිලියක්නැත - කඹය කාවැද්දීම් එකතු කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">266</span> <span class="n">q</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">rope</span><span class="p">(</span><span class="n">q</span><span class="p">)</span>
<span class="lineno">267</span> <span class="n">k</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">rope</span><span class="p">(</span><span class="n">k</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-62'>
<div class='docs'>
<div class='section-link'>
<a href='#section-62'>#</a>
</div>
<p>අවධානයගණනය කිරීම සඳහා fp16 කිරීමට ස්වයංක්රීය-වාත්තු අක්රීය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">270</span> <span class="k">with</span> <span class="n">autocast</span><span class="p">(</span><span class="n">enabled</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="lineno">271</span> <span class="k">if</span> <span class="n">q</span><span class="o">.</span><span class="n">dtype</span> <span class="o">==</span> <span class="n">torch</span><span class="o">.</span><span class="n">float16</span><span class="p">:</span></pre></div>
</div>
</div>
<div class='section' id='section-63'>
<div class='docs'>
<div class='section-link'>
<a href='#section-63'>#</a>
</div>
<p>වත්මන්dtype fp16 නම් fp32 බවට පරිවර්තනය කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">273</span> <span class="n">attn</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">einsum</span><span class="p">(</span><span class="s1">&#39;bihk,bjhk-&gt;bijh&#39;</span><span class="p">,</span> <span class="n">q</span><span class="o">.</span><span class="n">float</span><span class="p">(),</span> <span class="n">k</span><span class="o">.</span><span class="n">float</span><span class="p">())</span>
<span class="lineno">274</span> <span class="k">else</span><span class="p">:</span></pre></div>
</div>
</div>
<div class='section' id='section-64'>
<div class='docs'>
<div class='section-link'>
<a href='#section-64'>#</a>
</div>
<p>bfloatසඳහා වාත්තු නොකරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">276</span> <span class="n">attn</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">einsum</span><span class="p">(</span><span class="s1">&#39;bihk,bjhk-&gt;bijh&#39;</span><span class="p">,</span> <span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-65'>
<div class='docs'>
<div class='section-link'>
<a href='#section-65'>#</a>
</div>
<p>පරිමාණඅවධානය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">279</span> <span class="n">attn</span> <span class="o">=</span> <span class="n">attn</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">scale</span></pre></div>
</div>
</div>
<div class='section' id='section-66'>
<div class='docs'>
<div class='section-link'>
<a href='#section-66'>#</a>
</div>
<p>හේතුවෙස්මුහුණ ලබා ගන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">282</span> <span class="n">mask</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_mask</span><span class="p">(</span><span class="n">attn</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-67'>
<div class='docs'>
<div class='section-link'>
<a href='#section-67'>#</a>
</div>
<p>වෙස්යොදන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">284</span> <span class="n">attn</span><span class="o">.</span><span class="n">masked_fill_</span><span class="p">(</span><span class="n">mask</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">mask_fill</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-68'>
<div class='docs'>
<div class='section-link'>
<a href='#section-68'>#</a>
</div>
<p>අවධානයසොෆ්ට්මැක්ස් </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">287</span> <span class="n">attn</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">softmax</span><span class="p">(</span><span class="n">attn</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-69'>
<div class='docs'>
<div class='section-link'>
<a href='#section-69'>#</a>
</div>
<p>අවධානයබර තැබූ අගයන් ලබා ගන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">290</span> <span class="n">output</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">einsum</span><span class="p">(</span><span class="s1">&#39;bijh,bjhk-&gt;bihk&#39;</span><span class="p">,</span> <span class="n">attn</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">v</span><span class="o">.</span><span class="n">dtype</span><span class="p">),</span> <span class="n">v</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-70'>
<div class='docs'>
<div class='section-link'>
<a href='#section-70'>#</a>
</div>
<p><code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">,</span> <span class="n">d_k</span><span class="p">]</span> <span class="n">to</span></code>
<a href="batch_size, seq_len, n_hidden">Batch_size, seq_len, n_hidden</a>`වෙතින් නැවත සකස් කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">293</span> <span class="n">output</span> <span class="o">=</span> <span class="n">output</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">*</span><span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-71'>
<div class='docs'>
<div class='section-link'>
<a href='#section-71'>#</a>
</div>
<p>අවසානරේඛීය ස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">296</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">output</span><span class="p">(</span><span class="n">output</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-72'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-72'>#</a>
</div>
<h2>ප්රතිපෝෂණජාලය</h2>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">299</span><span class="k">class</span> <span class="nc">FFNLayer</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-73'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-73'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">n_hidden</span></code>
කාවැද්දීම ප්රමාණය වේ</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">304</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">6_144</span><span class="p">,</span> <span class="n">d_ff</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-74'>
<div class='docs'>
<div class='section-link'>
<a href='#section-74'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">308</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="lineno">309</span>
<span class="lineno">310</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">d_ff</span><span class="p">:</span>
<span class="lineno">311</span> <span class="n">d_ff</span> <span class="o">=</span> <span class="n">n_hidden</span> <span class="o">*</span> <span class="mi">4</span></pre></div>
</div>
</div>
<div class='section' id='section-75'>
<div class='docs'>
<div class='section-link'>
<a href='#section-75'>#</a>
</div>
<p>පුළුල්රේඛීය ස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">314</span> <span class="bp">self</span><span class="o">.</span><span class="n">dense_h_h4</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="n">n_hidden</span><span class="p">,</span> <span class="n">d_ff</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-76'>
<div class='docs'>
<div class='section-link'>
<a href='#section-76'>#</a>
</div>
<p>GELUසක්රිය කිරීම </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">316</span> <span class="bp">self</span><span class="o">.</span><span class="n">activation</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">GELU</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-77'>
<div class='docs'>
<div class='section-link'>
<a href='#section-77'>#</a>
</div>
<p>සංකෝචනයරේඛීය ස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">318</span> <span class="bp">self</span><span class="o">.</span><span class="n">dense_h4_h</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="n">d_ff</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-78'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-78'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">x</span></code>
හැඩය ඇත <code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">]</span></code>
</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">320</span> <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-79'>
<div class='docs'>
<div class='section-link'>
<a href='#section-79'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">324</span> <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">dense_h_h4</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="lineno">325</span> <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">activation</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="lineno">326</span> <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">dense_h4_h</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="lineno">327</span>
<span class="lineno">328</span> <span class="k">return</span> <span class="n">x</span></pre></div>
</div>
</div>
<div class='section' id='section-80'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-80'>#</a>
</div>
<h2>ට්රාන්ස්ෆෝමර්ස්ථරය</h2>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">331</span><span class="k">class</span> <span class="nc">TransformerLayer</span><span class="p">(</span><span class="n">NeoXModule</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-81'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-81'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">n_hidden</span></code>
කාවැද්දීම ප්රමාණය වේ </li>
<li><code class="highlight"><span></span><span class="n">n_heads</span></code>
හිස් සංඛ්යාව වේ</li></ul>
<p><em>පිටතක්රියාත්මක කිරීම අතහැර දැමීම ඇතුළත් නොවේ</em>. </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">336</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">6_144</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">64</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-82'>
<div class='docs'>
<div class='section-link'>
<a href='#section-82'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">343</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-83'>
<div class='docs'>
<div class='section-link'>
<a href='#section-83'>#</a>
</div>
<p>අවධානයටපෙර ස්ථර සාමාන්යකරණය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">346</span> <span class="bp">self</span><span class="o">.</span><span class="n">pre_ln_attn</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">LayerNorm</span><span class="p">(</span><span class="n">n_hidden</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-84'>
<div class='docs'>
<div class='section-link'>
<a href='#section-84'>#</a>
</div>
<p>FFNට පෙර ස්ථර සාමාන්යකරණය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">348</span> <span class="bp">self</span><span class="o">.</span><span class="n">pre_ln_ffn</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">LayerNorm</span><span class="p">(</span><span class="n">n_hidden</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-85'>
<div class='docs'>
<div class='section-link'>
<a href='#section-85'>#</a>
</div>
<p>අවධානයස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">351</span> <span class="bp">self</span><span class="o">.</span><span class="n">attention</span> <span class="o">=</span> <span class="n">AttentionLayer</span><span class="p">(</span><span class="n">n_hidden</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-86'>
<div class='docs'>
<div class='section-link'>
<a href='#section-86'>#</a>
</div>
<p>FFNස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">353</span> <span class="bp">self</span><span class="o">.</span><span class="n">ffn</span> <span class="o">=</span> <span class="n">FFNLayer</span><span class="p">(</span><span class="n">n_hidden</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-87'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-87'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">x</span></code>
හැඩයේ කාවැද්දීම් වේ <code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">]</span></code>
</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">355</span> <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-88'>
<div class='docs'>
<div class='section-link'>
<a href='#section-88'>#</a>
</div>
<p>අවශේෂසම්බන්ධතාවය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">361</span> <span class="n">residual</span> <span class="o">=</span> <span class="n">x</span></pre></div>
</div>
</div>
<div class='section' id='section-89'>
<div class='docs'>
<div class='section-link'>
<a href='#section-89'>#</a>
</div>
<p>නියෝක්ස්සමාන්තරව අවධානය සහ ප්රතිපෝෂණ ජාලය ක්රියාත්මක කරයි </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">363</span> <span class="n">attn</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">attention</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pre_ln_attn</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>
<span class="lineno">364</span> <span class="n">ffn</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ffn</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pre_ln_ffn</span><span class="p">(</span><span class="n">x</span><span class="p">))</span></pre></div>
</div>
</div>
<div class='section' id='section-90'>
<div class='docs'>
<div class='section-link'>
<a href='#section-90'>#</a>
</div>
<p>ඒවාසහ අවශේෂ සම්බන්ධතාවය එකතු කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">366</span> <span class="k">return</span> <span class="n">attn</span> <span class="o">+</span> <span class="n">ffn</span> <span class="o">+</span> <span class="n">residual</span></pre></div>
</div>
</div>
<div class='section' id='section-91'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-91'>#</a>
</div>
<p> මුරපොලපූරණය කිරීමට කේතය</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">368</span> <span class="k">def</span> <span class="nf">load_state</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">p1</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">],</span> <span class="n">p2</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]):</span></pre></div>
</div>
</div>
<div class='section' id='section-92'>
<div class='docs'>
<div class='section-link'>
<a href='#section-92'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">372</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Load transformer layer&#39;</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-93'>
<div class='docs'>
<div class='section-link'>
<a href='#section-93'>#</a>
</div>
<p>අවධානයයොමු ප්රතිදානය පරිණාමනය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">374</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_sum</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">output</span><span class="o">.</span><span class="n">bias</span><span class="p">,</span> <span class="s1">&#39;attention.dense.bias&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span>
<span class="lineno">375</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_dim_1</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">output</span><span class="o">.</span><span class="n">weight</span><span class="p">,</span> <span class="s1">&#39;attention.dense.weight&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-94'>
<div class='docs'>
<div class='section-link'>
<a href='#section-94'>#</a>
</div>
<p>අවධානයවිමසුම, යතුර සහ අගය පරිවර්තනය කිරීම </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">378</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_dim_0</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">qkv_lin</span><span class="o">.</span><span class="n">bias</span><span class="p">,</span> <span class="s1">&#39;attention.query_key_value.bias&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span>
<span class="lineno">379</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_dim_0</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">qkv_lin</span><span class="o">.</span><span class="n">weight</span><span class="p">,</span> <span class="s1">&#39;attention.query_key_value.weight&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-95'>
<div class='docs'>
<div class='section-link'>
<a href='#section-95'>#</a>
</div>
<p>අවධානයටපෙර ස්ථර සම්මතය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">382</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_duplicate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pre_ln_attn</span><span class="o">.</span><span class="n">bias</span><span class="p">,</span> <span class="s1">&#39;input_layernorm.bias&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span>
<span class="lineno">383</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_duplicate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pre_ln_attn</span><span class="o">.</span><span class="n">weight</span><span class="p">,</span> <span class="s1">&#39;input_layernorm.weight&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-96'>
<div class='docs'>
<div class='section-link'>
<a href='#section-96'>#</a>
</div>
<p>FFNදෙවන පරිණාමනය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">386</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_dim_0</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ffn</span><span class="o">.</span><span class="n">dense_h_h4</span><span class="o">.</span><span class="n">bias</span><span class="p">,</span> <span class="s1">&#39;mlp.dense_h_to_4h.bias&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span>
<span class="lineno">387</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_dim_0</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ffn</span><span class="o">.</span><span class="n">dense_h_h4</span><span class="o">.</span><span class="n">weight</span><span class="p">,</span> <span class="s1">&#39;mlp.dense_h_to_4h.weight&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-97'>
<div class='docs'>
<div class='section-link'>
<a href='#section-97'>#</a>
</div>
<p>FFNපළමු පරිවර්තනය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">390</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_sum</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ffn</span><span class="o">.</span><span class="n">dense_h4_h</span><span class="o">.</span><span class="n">bias</span><span class="p">,</span> <span class="s1">&#39;mlp.dense_4h_to_h.bias&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span>
<span class="lineno">391</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_dim_1</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ffn</span><span class="o">.</span><span class="n">dense_h4_h</span><span class="o">.</span><span class="n">weight</span><span class="p">,</span> <span class="s1">&#39;mlp.dense_4h_to_h.weight&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-98'>
<div class='docs'>
<div class='section-link'>
<a href='#section-98'>#</a>
</div>
<p>FFNට පෙර ස්ථර සම්මතය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">394</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_duplicate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pre_ln_ffn</span><span class="o">.</span><span class="n">bias</span><span class="p">,</span> <span class="s1">&#39;post_attention_layernorm.bias&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span>
<span class="lineno">395</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_duplicate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pre_ln_ffn</span><span class="o">.</span><span class="n">weight</span><span class="p">,</span> <span class="s1">&#39;post_attention_layernorm.weight&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-99'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-99'>#</a>
</div>
<h2>අවසානසාමාන්යකරණ ස්තරය</h2>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">398</span><span class="k">class</span> <span class="nc">FinalNorm</span><span class="p">(</span><span class="n">NeoXModule</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-100'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-100'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">n_hidden</span></code>
කාවැද්දීම ප්රමාණය වේ</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">403</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">6_144</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-101'>
<div class='docs'>
<div class='section-link'>
<a href='#section-101'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">407</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="lineno">408</span>
<span class="lineno">409</span> <span class="bp">self</span><span class="o">.</span><span class="n">ln</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">LayerNorm</span><span class="p">(</span><span class="n">n_hidden</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-102'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-102'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">x</span></code>
හැඩයේ කාවැද්දීම් වේ <code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">]</span></code>
</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">411</span> <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-103'>
<div class='docs'>
<div class='section-link'>
<a href='#section-103'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">415</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ln</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-104'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-104'>#</a>
</div>
<p> මුරපොලපූරණය කිරීමට කේතය</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">417</span> <span class="k">def</span> <span class="nf">load_state</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">p1</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">],</span> <span class="n">p2</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]):</span></pre></div>
</div>
</div>
<div class='section' id='section-105'>
<div class='docs'>
<div class='section-link'>
<a href='#section-105'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">421</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Load final normalization layer&#39;</span><span class="p">):</span>
<span class="lineno">422</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_duplicate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ln</span><span class="o">.</span><span class="n">bias</span><span class="p">,</span> <span class="s1">&#39;norm.bias&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span>
<span class="lineno">423</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_duplicate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ln</span><span class="o">.</span><span class="n">weight</span><span class="p">,</span> <span class="s1">&#39;norm.weight&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-106'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-106'>#</a>
</div>
<p> කියවීමේස්ථරය</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">426</span><span class="k">class</span> <span class="nc">ReadoutLayer</span><span class="p">(</span><span class="n">NeoXModule</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-107'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-107'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">n_hidden</span></code>
කාවැද්දීම ප්රමාණය වේ </li>
<li><code class="highlight"><span></span><span class="n">n_vocab</span></code>
වචන මාලාවේ ප්රමාණය වේ</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">431</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">6_144</span><span class="p">,</span> <span class="n">n_vocab</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">50_432</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-108'>
<div class='docs'>
<div class='section-link'>
<a href='#section-108'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">436</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="lineno">437</span>
<span class="lineno">438</span> <span class="bp">self</span><span class="o">.</span><span class="n">linear</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="n">n_hidden</span><span class="p">,</span> <span class="n">n_vocab</span><span class="p">,</span> <span class="n">bias</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-109'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-109'>#</a>
</div>
<ul><li><code class="highlight"><span></span><span class="n">x</span></code>
හැඩයේ කාවැද්දීම් වේ <code class="highlight"><span></span><span class="p">[</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">]</span></code>
</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">440</span> <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-110'>
<div class='docs'>
<div class='section-link'>
<a href='#section-110'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">444</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">linear</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-111'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-111'>#</a>
</div>
<p> මුරපොලපූරණය කිරීමට කේතය</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">446</span> <span class="k">def</span> <span class="nf">load_state</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">p1</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">],</span> <span class="n">p2</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]):</span></pre></div>
</div>
</div>
<div class='section' id='section-112'>
<div class='docs'>
<div class='section-link'>
<a href='#section-112'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">450</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Load final linear layer&#39;</span><span class="p">):</span>
<span class="lineno">451</span> <span class="n">checkpoint</span><span class="o">.</span><span class="n">merge_params_dim_0</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">linear</span><span class="o">.</span><span class="n">weight</span><span class="p">,</span> <span class="s1">&#39;final_linear.weight&#39;</span><span class="p">,</span> <span class="n">p1</span><span class="p">,</span> <span class="n">p2</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-113'>
<div class='docs'>
<div class='section-link'>
<a href='#section-113'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">454</span><span class="k">class</span> <span class="nc">LayerGenerator</span><span class="p">:</span>
<span class="lineno">455</span> <span class="n">pre_created_layers</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="n">NeoXModule</span><span class="p">]]</span></pre></div>
</div>
</div>
<div class='section' id='section-114'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-114'>#</a>
</div>
<h3>ස්ථරනිර්මාණය කිරීමට උත්පාදක යන්ත්රය</h3>
<p>ස්ථරජනනය කරනු ලබන්නේ මුරපොලවල් මෙන් එකම අනුපිළිවෙලකි. </p>
<p>ස්ථරයක්නොමැති <code class="highlight"><span></span><span class="kc">None</span></code>
විට එය ලබා දෙයි; අපි ස්ථර දර්ශක නියෝක්ස් ලෙස භාවිතා කරන අතර අපගේ ක්රියාත්මක කිරීමේදී අපට අවශ්ය නොවන පරිවර්තන ස්ථර දෙකක් තිබේ. </p>
<ul><li><code class="highlight"><span></span><span class="n">n_vocab</span></code>
යනු වචන මාලාවේ ටෝකන ගණන </li>
<li><code class="highlight"><span></span><span class="n">n_hidden</span></code>
කාවැද්දීම් වල ඇති ලක්ෂණ ගණන </li>
<li><code class="highlight"><span></span><span class="n">n_layers</span></code>
ට්රාන්ස්ෆෝමර් ස්ථර ගණන </li>
<li><code class="highlight"><span></span><span class="n">n_heads</span></code>
අවධානය යොමු ප්රධානීන් සංඛ්යාව වේ </li>
<li><code class="highlight"><span></span><span class="n">filter_layers</span></code>
භාවිතා කළ යුතු ස්ථර සමූහයයි. කිසිවක් නොමැති නම් සියලුම ස්ථර භාවිතා කරනු ඇත. අඩු ස්ථර සහිත ආකෘතියේ කුඩා අනුවාදයන් පරීක්ෂා කිරීමට මෙය භාවිතා </li>කරයි
<li><code class="highlight"><span></span><span class="n">is_clone_layers</span></code>
ට්රාන්ස්ෆෝමර් ස්ථර ක්ලෝන කළ යුතුද යන්න නියම කරයි (ටිකක් වේගවත්) </li>
<li><code class="highlight"><span></span><span class="n">dtype</span></code>
ආකෘතියේ දත්ත වර්ගයයි </li>
<li><code class="highlight"><span></span><span class="n">device</span></code>
ආකෘතියේ උපාංගය වේ </li>
<li><code class="highlight"><span></span><span class="n">is_llm_int8</span></code>
INT8 ප්රමාණකරණය භාවිතා කළ යුතුද යන්න නියම කරයි </li>
<li><code class="highlight"><span></span><span class="n">llm_int8_threshold</span></code>
යනු පිටත විශේෂාංග වෙන් කිරීම <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.43056em;vertical-align:0em;"></span><span class="mord mathnormal" style="margin-right:0.0037em;">α</span></span></span></span></span> සඳහා භාවිතා කරන එළිපත්ත</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">457</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">n_vocab</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">50_432</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">6_144</span><span class="p">,</span>
<span class="lineno">458</span> <span class="n">n_layers</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">44</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">64</span><span class="p">,</span>
<span class="lineno">459</span> <span class="n">filter_layers</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Set</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="lineno">460</span> <span class="n">is_clone_layers</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="lineno">461</span> <span class="n">dtype</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">dtype</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">float</span><span class="p">,</span>
<span class="lineno">462</span> <span class="n">device</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">&#39;cpu&#39;</span><span class="p">),</span>
<span class="lineno">463</span> <span class="n">is_llm_int8</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="lineno">464</span> <span class="n">llm_int8_threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">6.0</span><span class="p">,</span>
<span class="lineno">465</span> <span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-115'>
<div class='docs'>
<div class='section-link'>
<a href='#section-115'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">486</span> <span class="k">if</span> <span class="n">filter_layers</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">487</span> <span class="n">filter_layers</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">n_layers</span> <span class="o">+</span> <span class="mi">3</span><span class="p">))</span>
<span class="lineno">488</span>
<span class="lineno">489</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_vocab</span> <span class="o">=</span> <span class="n">n_vocab</span>
<span class="lineno">490</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_hidden</span> <span class="o">=</span> <span class="n">n_hidden</span>
<span class="lineno">491</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_layers</span> <span class="o">=</span> <span class="n">n_layers</span>
<span class="lineno">492</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_heads</span> <span class="o">=</span> <span class="n">n_heads</span>
<span class="lineno">493</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter_layers</span> <span class="o">=</span> <span class="n">filter_layers</span>
<span class="lineno">494</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_clone_layers</span> <span class="o">=</span> <span class="n">is_clone_layers</span>
<span class="lineno">495</span> <span class="bp">self</span><span class="o">.</span><span class="n">dtype</span> <span class="o">=</span> <span class="n">dtype</span>
<span class="lineno">496</span> <span class="bp">self</span><span class="o">.</span><span class="n">device</span> <span class="o">=</span> <span class="n">device</span>
<span class="lineno">497</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_llm_int8</span> <span class="o">=</span> <span class="n">is_llm_int8</span>
<span class="lineno">498</span> <span class="bp">self</span><span class="o">.</span><span class="n">llm_int8_threshold</span> <span class="o">=</span> <span class="n">llm_int8_threshold</span>
<span class="lineno">499</span>
<span class="lineno">500</span> <span class="bp">self</span><span class="o">.</span><span class="n">pre_created_layers</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span>
<span class="lineno">501</span> <span class="n">transformer_layer</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="lineno">502</span> <span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-116'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-116'>#</a>
</div>
<h4>භාවිතයසඳහා ස්තරය සකස් කරයි</h4>
<p>අපිස්තරය උපාංගය වෙත ගෙන ගොස් නිවැරදි දත්ත වර්ගයට පරිවර්තනය කරමු</p>
<ul><li><code class="highlight"><span></span><span class="n">layer</span></code>
සකස් කළ යුතු ස්ථරයයි </li>
<p>සකස්කළ ස්තරය<em>ආපසු ලබා දෙයි</em> </p></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">504</span> <span class="k">def</span> <span class="nf">_prepare_layer</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">layer</span><span class="p">:</span> <span class="n">NeoXModule</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-117'>
<div class='docs'>
<div class='section-link'>
<a href='#section-117'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">513</span> <span class="k">return</span> <span class="n">layer</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">device</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-118'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-118'>#</a>
</div>
<p> <a id="post_load_prepare"></a></p>
<h3>පිරික්සුම්ස්ථානය පැටවීමෙන් පසු ස්ථර පරිවර්තනයන්</h3>
<p>මෙමශ්රිතය පිරික්සුම් ස්ථානය පැටවීමෙන් පසු ස්ථර පරිවර්තනයන් ක්රියාත්මක කරයි. </p>
<p>දැනටඑය අදාළ වන්නේ INT8 ප්රමාණකරණය පමණි. </p>
<ul><li><code class="highlight"><span></span><span class="n">layer</span></code>
සකස් කළ යුතු ස්ථරයයි </li>
<li><code class="highlight"><span></span><span class="n">is_llm_int8</span></code>
INT8 ප්රමාණකරණය භාවිතා කළ යුතුද යන්න නියම කරයි </li>
<li><code class="highlight"><span></span><span class="n">device</span></code>
ආකෘතියේ උපාංගය වේ </li>
<li><code class="highlight"><span></span><span class="n">llm_int8_threshold</span></code>
යනු පිටත විශේෂාංග වෙන් කිරීම <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.43056em;vertical-align:0em;"></span><span class="mord mathnormal" style="margin-right:0.0037em;">α</span></span></span></span></span> සඳහා භාවිතා කරන එළිපත්ත </li>
<p>සකස්කළ ස්තරය<em>ආපසු ලබා දෙයි</em> </p></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">515</span> <span class="nd">@torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">()</span>
<span class="lineno">516</span> <span class="k">def</span> <span class="nf">post_load_prepare</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">layer</span><span class="p">:</span> <span class="n">NeoXModule</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span>
<span class="lineno">517</span> <span class="n">is_llm_int8</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="lineno">518</span> <span class="n">device</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="lineno">519</span> <span class="n">llm_int8_threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="lineno">520</span> <span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-119'>
<div class='docs'>
<div class='section-link'>
<a href='#section-119'>#</a>
</div>
<p>නියමකර නොමැති නම් පෙරනිමි අගයන් ලබා ගන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">538</span> <span class="k">if</span> <span class="n">is_llm_int8</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">539</span> <span class="n">is_llm_int8</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_llm_int8</span>
<span class="lineno">540</span> <span class="k">if</span> <span class="n">device</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">541</span> <span class="n">device</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">device</span>
<span class="lineno">542</span> <span class="k">if</span> <span class="n">llm_int8_threshold</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">543</span> <span class="n">llm_int8_threshold</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">llm_int8_threshold</span></pre></div>
</div>
</div>
<div class='section' id='section-120'>
<div class='docs'>
<div class='section-link'>
<a href='#section-120'>#</a>
</div>
<p>INT8ප්රමාණකරණය භාවිතා නොකරන්නේ නම් මඟ හරින්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">546</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">is_llm_int8</span><span class="p">:</span>
<span class="lineno">547</span> <span class="k">return</span> <span class="n">layer</span></pre></div>
</div>
</div>
<div class='section' id='section-121'>
<div class='docs'>
<div class='section-link'>
<a href='#section-121'>#</a>
</div>
<p>ට්රාන්ස්ෆෝමර්ස්ථර වල රේඛීය ස්ථර පමණක් පරිවර්තනය කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">550</span> <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">layer</span><span class="p">,</span> <span class="n">TransformerLayer</span><span class="p">):</span>
<span class="lineno">551</span> <span class="k">return</span> <span class="n">layer</span></pre></div>
</div>
</div>
<div class='section' id='section-122'>
<div class='docs'>
<div class='section-link'>
<a href='#section-122'>#</a>
</div>
<p><a href="./utils/llm_int8.html">උපයෝගිතා</a>වල <code class="highlight"><span></span><span class="n">make_llm_int8_linear</span></code>
අර්ථ දක්වා ඇති භාවිතා කරන්න. </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">554</span> <span class="kn">from</span> <span class="nn">labml_nn.neox.utils.llm_int8</span> <span class="kn">import</span> <span class="n">make_llm_int8_linear</span></pre></div>
</div>
</div>
<div class='section' id='section-123'>
<div class='docs'>
<div class='section-link'>
<a href='#section-123'>#</a>
</div>
<p>රේඛීයස්ථර පරිවර්තනය කරන්න </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">557</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Convert to int8&#39;</span><span class="p">):</span>
<span class="lineno">558</span> <span class="n">layer</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">output</span> <span class="o">=</span> <span class="n">make_llm_int8_linear</span><span class="p">(</span><span class="n">layer</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">output</span><span class="p">,</span>
<span class="lineno">559</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">,</span>
<span class="lineno">560</span> <span class="n">threshold</span><span class="o">=</span><span class="n">llm_int8_threshold</span><span class="p">)</span>
<span class="lineno">561</span> <span class="n">layer</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">qkv_lin</span> <span class="o">=</span> <span class="n">make_llm_int8_linear</span><span class="p">(</span><span class="n">layer</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">qkv_lin</span><span class="p">,</span>
<span class="lineno">562</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">,</span>
<span class="lineno">563</span> <span class="n">threshold</span><span class="o">=</span><span class="n">llm_int8_threshold</span><span class="p">)</span>
<span class="lineno">564</span> <span class="n">layer</span><span class="o">.</span><span class="n">ffn</span><span class="o">.</span><span class="n">dense_h_h4</span> <span class="o">=</span> <span class="n">make_llm_int8_linear</span><span class="p">(</span><span class="n">layer</span><span class="o">.</span><span class="n">ffn</span><span class="o">.</span><span class="n">dense_h_h4</span><span class="p">,</span>
<span class="lineno">565</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">,</span>
<span class="lineno">566</span> <span class="n">threshold</span><span class="o">=</span><span class="n">llm_int8_threshold</span><span class="p">)</span>
<span class="lineno">567</span> <span class="n">layer</span><span class="o">.</span><span class="n">ffn</span><span class="o">.</span><span class="n">dense_h4_h</span> <span class="o">=</span> <span class="n">make_llm_int8_linear</span><span class="p">(</span><span class="n">layer</span><span class="o">.</span><span class="n">ffn</span><span class="o">.</span><span class="n">dense_h4_h</span><span class="p">,</span>
<span class="lineno">568</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">,</span>
<span class="lineno">569</span> <span class="n">threshold</span><span class="o">=</span><span class="n">llm_int8_threshold</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-124'>
<div class='docs'>
<div class='section-link'>
<a href='#section-124'>#</a>
</div>
<p> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">571</span> <span class="k">return</span> <span class="n">layer</span></pre></div>
</div>
</div>
<div class='section' id='section-125'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-125'>#</a>
</div>
<h4>ස්තරයක්නිර්මාණය කර හැච් කරයි</h4>
<p>පරාමිතීන්ආරම්භ කිරීමට කාලය ගතවන නිසා හැඹිලි ස්ථර පිටපත් කිරීම නව ස්ථර ආරම්භ කිරීමට වඩා වේගවත් වේ. </p>
<ul><li><code class="highlight"><span></span><span class="n">name</span></code>
යනු ස්තරයේ නමයි </li>
<li><code class="highlight"><span></span><span class="n">creator</span></code>
ස්තරය නිර්මාණය කිරීමේ කාර්යයයි </li>
<p>සාදනලද ස්තරය හෝ කැච් ස්ථරයේ පිටපතක්<em>ආපසු ලබා දෙයි</em> </p></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">573</span> <span class="k">def</span> <span class="nf">_create_and_cache_layer</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">creator</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[],</span> <span class="n">NeoXModule</span><span class="p">]):</span></pre></div>
</div>
</div>
<div class='section' id='section-126'>
<div class='docs'>
<div class='section-link'>
<a href='#section-126'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">585</span> <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_clone_layers</span><span class="p">:</span>
<span class="lineno">586</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_layer</span><span class="p">(</span><span class="n">creator</span><span class="p">())</span>
<span class="lineno">587</span>
<span class="lineno">588</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">pre_created_layers</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">589</span> <span class="bp">self</span><span class="o">.</span><span class="n">pre_created_layers</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_layer</span><span class="p">(</span><span class="n">creator</span><span class="p">())</span>
<span class="lineno">590</span>
<span class="lineno">591</span> <span class="n">layer</span> <span class="o">=</span> <span class="n">copy</span><span class="o">.</span><span class="n">deepcopy</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pre_created_layers</span><span class="p">[</span><span class="n">name</span><span class="p">])</span>
<span class="lineno">592</span> <span class="k">return</span> <span class="n">layer</span></pre></div>
</div>
</div>
<div class='section' id='section-127'>
<div class='docs'>
<div class='section-link'>
<a href='#section-127'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">594</span> <span class="k">def</span> <span class="nf">_create_transformer_layer</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="lineno">595</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_create_and_cache_layer</span><span class="p">(</span>
<span class="lineno">596</span> <span class="s1">&#39;transformer_layer&#39;</span><span class="p">,</span>
<span class="lineno">597</span> <span class="k">lambda</span><span class="p">:</span> <span class="n">TransformerLayer</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n_hidden</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_heads</span><span class="p">)</span>
<span class="lineno">598</span> <span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-128'>
<div class='docs'>
<div class='section-link'>
<a href='#section-128'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">600</span> <span class="k">def</span> <span class="nf">_create_embedding_layer</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="lineno">601</span> <span class="k">return</span> <span class="n">Embedding</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n_vocab</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_hidden</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-129'>
<div class='docs'>
<div class='section-link'>
<a href='#section-129'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">603</span> <span class="k">def</span> <span class="nf">_create_final_norm_layer</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="lineno">604</span> <span class="k">return</span> <span class="n">FinalNorm</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n_hidden</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-130'>
<div class='docs'>
<div class='section-link'>
<a href='#section-130'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">606</span> <span class="k">def</span> <span class="nf">_create_readout_layer</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="lineno">607</span> <span class="k">return</span> <span class="n">ReadoutLayer</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n_hidden</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_vocab</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-131'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-131'>#</a>
</div>
<h3>ස්ථරලබා ගැනීම සඳහා උත්පාදක යන්ත්රය</h3>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">609</span> <span class="nd">@torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">()</span>
<span class="lineno">610</span> <span class="k">def</span> <span class="nf">get_layers</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Generator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">NeoXModule</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]],</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">]:</span></pre></div>
</div>
</div>
<div class='section' id='section-132'>
<div class='docs'>
<div class='section-link'>
<a href='#section-132'>#</a>
</div>
<p>කාවැද්දීමස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">615</span> <span class="k">if</span> <span class="mi">0</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter_layers</span><span class="p">:</span>
<span class="lineno">616</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Embedding layer&#39;</span><span class="p">):</span>
<span class="lineno">617</span> <span class="n">layer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_layer</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_create_embedding_layer</span><span class="p">())</span>
<span class="lineno">618</span> <span class="k">yield</span> <span class="n">layer</span><span class="p">,</span> <span class="p">(</span><span class="s1">&#39;layer_00-model_00-model_states.pt&#39;</span><span class="p">,</span> <span class="s1">&#39;layer_00-model_01-model_states.pt&#39;</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-133'>
<div class='docs'>
<div class='section-link'>
<a href='#section-133'>#</a>
</div>
<p>ට්රාන්ස්ෆෝමර්ස්ථර </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">621</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n_layers</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-134'>
<div class='docs'>
<div class='section-link'>
<a href='#section-134'>#</a>
</div>
<p>ට්රාන්ස්ෆෝමර්ස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">623</span> <span class="k">if</span> <span class="n">i</span> <span class="o">+</span> <span class="mi">1</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter_layers</span><span class="p">:</span>
<span class="lineno">624</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;Transformer Layer </span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">):</span>
<span class="lineno">625</span> <span class="k">yield</span> <span class="bp">self</span><span class="o">.</span><span class="n">_create_transformer_layer</span><span class="p">(),</span> \
<span class="lineno">626</span> <span class="p">(</span><span class="sa">f</span><span class="s1">&#39;layer_</span><span class="si">{</span><span class="n">i</span> <span class="o">+</span> <span class="mi">2</span> <span class="si">:</span><span class="s1">02d</span><span class="si">}</span><span class="s1">-model_00-model_states.pt&#39;</span><span class="p">,</span>
<span class="lineno">627</span> <span class="sa">f</span><span class="s1">&#39;layer_</span><span class="si">{</span><span class="n">i</span> <span class="o">+</span> <span class="mi">2</span> <span class="si">:</span><span class="s1">02d</span><span class="si">}</span><span class="s1">-model_01-model_states.pt&#39;</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-135'>
<div class='docs'>
<div class='section-link'>
<a href='#section-135'>#</a>
</div>
<p>අවසානසාමාන්යකරණ ස්තරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">630</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_layers</span> <span class="o">+</span> <span class="mi">1</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter_layers</span><span class="p">:</span>
<span class="lineno">631</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Final norm layer&#39;</span><span class="p">):</span>
<span class="lineno">632</span> <span class="n">layer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_layer</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_create_final_norm_layer</span><span class="p">())</span>
<span class="lineno">633</span> <span class="k">yield</span> <span class="n">layer</span><span class="p">,</span> <span class="p">(</span><span class="s1">&#39;layer_47-model_00-model_states.pt&#39;</span><span class="p">,</span> <span class="s1">&#39;layer_47-model_01-model_states.pt&#39;</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-136'>
<div class='docs'>
<div class='section-link'>
<a href='#section-136'>#</a>
</div>
<p>කියවීමේස්ථරය </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">636</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_layers</span> <span class="o">+</span> <span class="mi">2</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter_layers</span><span class="p">:</span>
<span class="lineno">637</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Readout layer&#39;</span><span class="p">):</span>
<span class="lineno">638</span> <span class="n">layer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_layer</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_create_readout_layer</span><span class="p">())</span>
<span class="lineno">639</span> <span class="k">yield</span> <span class="n">layer</span><span class="p">,</span> <span class="p">(</span><span class="s1">&#39;layer_48-model_00-model_states.pt&#39;</span><span class="p">,</span> <span class="s1">&#39;layer_48-model_01-model_states.pt&#39;</span><span class="p">)</span>
<span class="lineno">640</span>
<span class="lineno">641</span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">pre_created_layers</span><span class="o">.</span><span class="n">keys</span><span class="p">():</span>
<span class="lineno">642</span> <span class="bp">self</span><span class="o">.</span><span class="n">pre_created_layers</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span></pre></div>
</div>
</div>
<div class='section' id='section-137'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-137'>#</a>
</div>
<h3>මුළුස්ථර ගණන නැවත ලබා දෙයි</h3>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">644</span> <span class="nd">@property</span>
<span class="lineno">645</span> <span class="k">def</span> <span class="nf">total_layers</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-138'>
<div class='docs'>
<div class='section-link'>
<a href='#section-138'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">649</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_layers</span> <span class="o">+</span> <span class="mi">3</span></pre></div>
</div>
</div>
<div class='section' id='section-139'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-139'>#</a>
</div>
<h3>ස්ථරපැටවීමට උත්පාදක යන්ත්රය</h3>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">651</span> <span class="nd">@torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">()</span>
<span class="lineno">652</span> <span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Generator</span><span class="p">[</span><span class="n">NeoXModule</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">]:</span></pre></div>
</div>
</div>
<div class='section' id='section-140'>
<div class='docs'>
<div class='section-link'>
<a href='#section-140'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">656</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s2">&quot;Layers&quot;</span><span class="p">):</span>
<span class="lineno">657</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">layer</span><span class="p">,</span> <span class="n">files</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_layers</span><span class="p">()):</span>
<span class="lineno">658</span> <span class="k">if</span> <span class="n">files</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">659</span> <span class="n">layer</span><span class="o">.</span><span class="n">load_state</span><span class="p">(</span><span class="o">*</span><span class="n">checkpoint</span><span class="o">.</span><span class="n">load_checkpoint_files</span><span class="p">(</span><span class="n">files</span><span class="p">))</span>
<span class="lineno">660</span>
<span class="lineno">661</span> <span class="n">layer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">post_load_prepare</span><span class="p">(</span><span class="n">layer</span><span class="p">)</span>
<span class="lineno">662</span>
<span class="lineno">663</span> <span class="n">monit</span><span class="o">.</span><span class="n">progress</span><span class="p">(</span><span class="nb">min</span><span class="p">(</span><span class="mf">0.99</span><span class="p">,</span> <span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span> <span class="o">/</span> <span class="bp">self</span><span class="o">.</span><span class="n">total_layers</span><span class="p">))</span>
<span class="lineno">664</span> <span class="k">yield</span> <span class="n">layer</span></pre></div>
</div>
</div>
<div class='footer'>
<a href="https://papers.labml.ai">Trending Research Papers</a>
<a href="https://labml.ai">labml.ai</a>
</div>
</div>
<script src=../interactive.js?v=1"></script>
<script>
function handleImages() {
var images = document.querySelectorAll('p>img')
for (var i = 0; i < images.length; ++i) {
handleImage(images[i])
}
}
function handleImage(img) {
img.parentElement.style.textAlign = 'center'
var modal = document.createElement('div')
modal.id = 'modal'
var modalContent = document.createElement('div')
modal.appendChild(modalContent)
var modalImage = document.createElement('img')
modalContent.appendChild(modalImage)
var span = document.createElement('span')
span.classList.add('close')
span.textContent = 'x'
modal.appendChild(span)
img.onclick = function () {
console.log('clicked')
document.body.appendChild(modal)
modalImage.src = img.src
}
span.onclick = function () {
document.body.removeChild(modal)
}
}
handleImages()
</script>
</body>
</html>