diff --git a/labml_nn/transformers/mha.py b/labml_nn/transformers/mha.py
index 846d4471..68ba05f3 100644
--- a/labml_nn/transformers/mha.py
+++ b/labml_nn/transformers/mha.py
@@ -65,7 +65,7 @@ class MultiHeadAttention(Module):
 
     This computes scaled multi-headed attention for given `query`, `key` and `value` vectors.
 
-    $$Attention(Q, K, V) = \underset{seq}{softmax}\Bigg(\frac{Q K^T}{\sqrt{d_k}}\Bigg)V$$
+    $$Attention(Q, K, V) = \\underset{seq}{softmax}\Bigg(\frac{Q K^T}{\sqrt{d_k}}\Bigg)V$$
 
     In simple terms, it finds keys that matches the query, and get the values of those keys.
 
diff --git a/setup.py b/setup.py
index c32a99c2..4ad00768 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ with open("readme.md", "r") as f:
 
 setuptools.setup(
     name='labml_nn',
-    version='0.4.72',
+    version='0.4.73',
     author="Varuna Jayasiri, Nipun Wijerathne",
     author_email="vpjayasiri@gmail.com, hnipun@gmail.com",
     description="A collection of PyTorch implementations of neural network architectures and layers.",