fix: fix cls_token bug in vit (add positional embeddings after concatenating the `[CLS]` token).

2025-08-14 17:41:37 +08:00 · 2023-11-06 11:28:45 +08:00
parent 9a42ac2697
commit ffafaf1df7
1 changed file with 2 additions and 2 deletions
--- a/labml_nn/transformers/vit/__init__.py
+++ b/labml_nn/transformers/vit/__init__.py
@@ -191,11 +191,11 @@ class VisionTransformer(Module):
        """
        # Get patch embeddings. This gives a tensor of shape `[patches, batch_size, d_model]`
        x = self.patch_emb(x)
-        # Add positional embeddings
-        x = self.pos_emb(x)
        # Concatenate the `[CLS]` token embeddings before feeding the transformer
        cls_token_emb = self.cls_token_emb.expand(-1, x.shape[1], -1)
        x = torch.cat([cls_token_emb, x])
+        # Add positional embeddings
+        x = self.pos_emb(x)

        # Pass through transformer layers with no attention masking
        for layer in self.transformer_layers: