From bc32b507ea06a51390ddc3d15dc5bdbf19f10986 Mon Sep 17 00:00:00 2001 From: lakshith Date: Wed, 31 Jul 2024 20:39:46 +0530 Subject: [PATCH] clear notebook outputs --- labml_nn/transformers/LoRA/experiment.ipynb | 75 ++--------- labml_nn/transformers/LoRA/train.ipynb | 137 ++++---------------- 2 files changed, 34 insertions(+), 178 deletions(-) diff --git a/labml_nn/transformers/LoRA/experiment.ipynb b/labml_nn/transformers/LoRA/experiment.ipynb index 7070991d..f0ae1c84 100644 --- a/labml_nn/transformers/LoRA/experiment.ipynb +++ b/labml_nn/transformers/LoRA/experiment.ipynb @@ -1,12 +1,7 @@ { "cells": [ { - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:22:57.496965Z", - "start_time": "2024-07-31T12:22:55.151730Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "from labml_nn.transformers.LoRA.GPT2 import GPTModel\n", @@ -14,15 +9,10 @@ ], "id": "cffa3ec341b4905a", "outputs": [], - "execution_count": 1 + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:22:57.986397Z", - "start_time": "2024-07-31T12:22:57.498305Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "from transformers import AutoTokenizer\n", @@ -31,17 +21,13 @@ ], "id": "c2b0b7e18394ea9e", "outputs": [], - "execution_count": 2 + "execution_count": null }, { "cell_type": "code", "id": "initial_id", "metadata": { - "collapsed": true, - "ExecuteTime": { - "end_time": "2024-07-31T12:22:58.562136Z", - "start_time": "2024-07-31T12:22:57.987296Z" - } + "collapsed": true }, "source": [ "model = GPTModel()\n", @@ -54,32 +40,11 @@ "if unexpected_keys:\n", " print(f\"Unexpected keys: {unexpected_keys}\")" ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_7130/2581223434.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " state_dict = torch.load('transformed.pth')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing keys: ['token_embedding.lora_a', 'token_embedding.lora_b', 'position_embedding.lora_a', 'position_embedding.lora_b', 'blocks.0.attn.c_att.lora_a', 'blocks.0.attn.c_att.lora_b', 'blocks.0.attn.c_proj.lora_a', 'blocks.0.attn.c_proj.lora_b', 'blocks.0.ffn.c_fc.lora_a', 'blocks.0.ffn.c_fc.lora_b', 'blocks.0.ffn.c_proj.lora_a', 'blocks.0.ffn.c_proj.lora_b', 'blocks.1.attn.c_att.lora_a', 'blocks.1.attn.c_att.lora_b', 'blocks.1.attn.c_proj.lora_a', 'blocks.1.attn.c_proj.lora_b', 'blocks.1.ffn.c_fc.lora_a', 'blocks.1.ffn.c_fc.lora_b', 'blocks.1.ffn.c_proj.lora_a', 'blocks.1.ffn.c_proj.lora_b', 'blocks.2.attn.c_att.lora_a', 'blocks.2.attn.c_att.lora_b', 'blocks.2.attn.c_proj.lora_a', 'blocks.2.attn.c_proj.lora_b', 'blocks.2.ffn.c_fc.lora_a', 'blocks.2.ffn.c_fc.lora_b', 'blocks.2.ffn.c_proj.lora_a', 'blocks.2.ffn.c_proj.lora_b', 'blocks.3.attn.c_att.lora_a', 'blocks.3.attn.c_att.lora_b', 'blocks.3.attn.c_proj.lora_a', 'blocks.3.attn.c_proj.lora_b', 'blocks.3.ffn.c_fc.lora_a', 'blocks.3.ffn.c_fc.lora_b', 'blocks.3.ffn.c_proj.lora_a', 'blocks.3.ffn.c_proj.lora_b', 'blocks.4.attn.c_att.lora_a', 'blocks.4.attn.c_att.lora_b', 'blocks.4.attn.c_proj.lora_a', 'blocks.4.attn.c_proj.lora_b', 'blocks.4.ffn.c_fc.lora_a', 'blocks.4.ffn.c_fc.lora_b', 'blocks.4.ffn.c_proj.lora_a', 'blocks.4.ffn.c_proj.lora_b', 'blocks.5.attn.c_att.lora_a', 'blocks.5.attn.c_att.lora_b', 'blocks.5.attn.c_proj.lora_a', 'blocks.5.attn.c_proj.lora_b', 'blocks.5.ffn.c_fc.lora_a', 'blocks.5.ffn.c_fc.lora_b', 'blocks.5.ffn.c_proj.lora_a', 'blocks.5.ffn.c_proj.lora_b', 'blocks.6.attn.c_att.lora_a', 'blocks.6.attn.c_att.lora_b', 'blocks.6.attn.c_proj.lora_a', 'blocks.6.attn.c_proj.lora_b', 'blocks.6.ffn.c_fc.lora_a', 'blocks.6.ffn.c_fc.lora_b', 'blocks.6.ffn.c_proj.lora_a', 'blocks.6.ffn.c_proj.lora_b', 'blocks.7.attn.c_att.lora_a', 'blocks.7.attn.c_att.lora_b', 'blocks.7.attn.c_proj.lora_a', 'blocks.7.attn.c_proj.lora_b', 'blocks.7.ffn.c_fc.lora_a', 'blocks.7.ffn.c_fc.lora_b', 'blocks.7.ffn.c_proj.lora_a', 'blocks.7.ffn.c_proj.lora_b', 'blocks.8.attn.c_att.lora_a', 'blocks.8.attn.c_att.lora_b', 'blocks.8.attn.c_proj.lora_a', 'blocks.8.attn.c_proj.lora_b', 'blocks.8.ffn.c_fc.lora_a', 'blocks.8.ffn.c_fc.lora_b', 'blocks.8.ffn.c_proj.lora_a', 'blocks.8.ffn.c_proj.lora_b', 'blocks.9.attn.c_att.lora_a', 'blocks.9.attn.c_att.lora_b', 'blocks.9.attn.c_proj.lora_a', 'blocks.9.attn.c_proj.lora_b', 'blocks.9.ffn.c_fc.lora_a', 'blocks.9.ffn.c_fc.lora_b', 'blocks.9.ffn.c_proj.lora_a', 'blocks.9.ffn.c_proj.lora_b', 'blocks.10.attn.c_att.lora_a', 'blocks.10.attn.c_att.lora_b', 'blocks.10.attn.c_proj.lora_a', 'blocks.10.attn.c_proj.lora_b', 'blocks.10.ffn.c_fc.lora_a', 'blocks.10.ffn.c_fc.lora_b', 'blocks.10.ffn.c_proj.lora_a', 'blocks.10.ffn.c_proj.lora_b', 'blocks.11.attn.c_att.lora_a', 'blocks.11.attn.c_att.lora_b', 'blocks.11.attn.c_proj.lora_a', 'blocks.11.attn.c_proj.lora_b', 'blocks.11.ffn.c_fc.lora_a', 'blocks.11.ffn.c_fc.lora_b', 'blocks.11.ffn.c_proj.lora_a', 'blocks.11.ffn.c_proj.lora_b', 'lm_head.lora_a', 'lm_head.lora_b']\n" - ] - } - ], - "execution_count": 3 + "outputs": [], + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:23:00.447976Z", - "start_time": "2024-07-31T12:22:58.566527Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "prompt = \"hello how are you\"\n", 
@@ -96,32 +61,16 @@
 "    print(tokenizer.decode(id))"
 ],
 "id": "f4f7826ec3729b66",
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- ",\n",
- " to\n",
- " you\n",
- " doing\n"
- ]
- }
- ],
- "execution_count": 4
+ "outputs": [],
+ "execution_count": null
 },
 {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2024-07-31T12:23:00.452060Z",
- "start_time": "2024-07-31T12:23:00.448904Z"
- }
- },
+ "metadata": {},
 "cell_type": "code",
 "source": "",
 "id": "c12776360008a974",
 "outputs": [],
- "execution_count": 4
+ "execution_count": null
 }
 ],
 "metadata": {
diff --git a/labml_nn/transformers/LoRA/train.ipynb b/labml_nn/transformers/LoRA/train.ipynb
index cd70bfb3..b2e3038e 100644
--- a/labml_nn/transformers/LoRA/train.ipynb
+++ b/labml_nn/transformers/LoRA/train.ipynb
@@ -7,41 +7,27 @@
 "collapsed": true,
 "jupyter": {
 "outputs_hidden": true
- },
- "ExecuteTime": {
- "end_time": "2024-07-31T12:57:37.296030Z",
- "start_time": "2024-07-31T12:57:37.292368Z"
 }
 },
 "source": "# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt",
 "outputs": [],
- "execution_count": 1
+ "execution_count": null
 },
 {
 "cell_type": "code",
 "id": "3b1e507015ba6b81",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2024-07-31T12:57:37.317651Z",
- "start_time": "2024-07-31T12:57:37.313808Z"
- }
- },
+ "metadata": {},
 "source": [
 "with open('input.txt', 'r', encoding='utf-8') as f:\n",
 "    text = f.read()"
 ],
 "outputs": [],
- "execution_count": 2
+ "execution_count": null
 },
 {
 "cell_type": "code",
 "id": "ac8e51ae5bbfcae7",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2024-07-31T12:57:40.488939Z",
- "start_time": "2024-07-31T12:57:37.319486Z"
- }
- },
+ "metadata": {},
 "source": [
 "from transformers import AutoTokenizer\n",
 "\n",
@@ -49,75 +35,47 @@
 "\n",
 "tokens = tokenizer.encode(text, add_special_tokens=False)"
 ],
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). 
Running this sequence through the model will result in indexing errors\n" - ] - } - ], - "execution_count": 3 + "outputs": [], + "execution_count": null }, { "cell_type": "code", "id": "aeefcdf813e427e", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:40.495510Z", - "start_time": "2024-07-31T12:57:40.490341Z" - } - }, + "metadata": {}, "source": [ "context_length = 512\n", "batch_size = 2" ], "outputs": [], - "execution_count": 4 + "execution_count": null }, { "cell_type": "code", "id": "a384b42274f008a2", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:40.522050Z", - "start_time": "2024-07-31T12:57:40.496842Z" - } - }, + "metadata": {}, "source": [ "num_batches = len(tokens) // (batch_size * context_length)\n", "tokens = tokens[:num_batches * batch_size * context_length]" ], "outputs": [], - "execution_count": 5 + "execution_count": null }, { "cell_type": "code", "id": "5c4cc78ac1a02c1d", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:40.592272Z", - "start_time": "2024-07-31T12:57:40.524063Z" - } - }, + "metadata": {}, "source": [ "import torch\n", "\n", "input_ids = torch.tensor(tokens).view(-1, context_length)" ], "outputs": [], - "execution_count": 6 + "execution_count": null }, { "cell_type": "code", "id": "7037fd75e2161382", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:40.601199Z", - "start_time": "2024-07-31T12:57:40.593250Z" - } - }, + "metadata": {}, "source": [ "from torch.utils.data import DataLoader, TensorDataset\n", "from torch.optim import Adam\n", @@ -137,17 +95,12 @@ "test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)" ], "outputs": [], - "execution_count": 7 + "execution_count": null }, { "cell_type": "code", "id": "a98b7baa064b8494", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:41.577878Z", - "start_time": "2024-07-31T12:57:40.602187Z" - } - }, + "metadata": {}, "source": [ "from labml_nn.transformers.LoRA.GPT2 import GPTModel\n", "\n", @@ -157,15 +110,10 @@ "_ = model.load_state_dict(state_dict, strict=False)" ], "outputs": [], - "execution_count": 8 + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:43.098187Z", - "start_time": "2024-07-31T12:57:41.578713Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "device = \"cuda\"\n", @@ -173,17 +121,12 @@ ], "id": "2e0fa8b3082df716", "outputs": [], - "execution_count": 9 + "execution_count": null }, { "cell_type": "code", "id": "e2f5076894770740", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:57.044755Z", - "start_time": "2024-07-31T12:57:43.099050Z" - } - }, + "metadata": {}, "source": [ "from labml import tracker, experiment\n", "\n", @@ -236,49 +179,13 @@ "\n", "print(\"Training complete.\")" ], - "outputs": [ - { - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "
\n",
-       "LoRA.GPT2: 7a14822c4f3c11efad8354ef33f17c7c\n",
-       "\t[dirty]: \"training loop\"\n",
-       "Monitor experiment at http://localhost:5005/run/7a14822c4f3c11efad8354ef33f17c7c\n",
-       "Still updating labml server, please wait for it to complete...
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[10], line 25\u001B[0m\n\u001B[1;32m 22\u001B[0m loss \u001B[38;5;241m=\u001B[39m criterion(shift_logits\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m, shift_logits\u001B[38;5;241m.\u001B[39msize(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m)), shift_labels\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m))\n\u001B[1;32m 24\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mzero_grad()\n\u001B[0;32m---> 25\u001B[0m loss\u001B[38;5;241m.\u001B[39mbackward()\n\u001B[1;32m 26\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mstep()\n\u001B[1;32m 28\u001B[0m tracker\u001B[38;5;241m.\u001B[39msave(step, {\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mloss\u001B[39m\u001B[38;5;124m'\u001B[39m: loss})\n", - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/_tensor.py:521\u001B[0m, in \u001B[0;36mTensor.backward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 511\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m has_torch_function_unary(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m 512\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m handle_torch_function(\n\u001B[1;32m 513\u001B[0m Tensor\u001B[38;5;241m.\u001B[39mbackward,\n\u001B[1;32m 514\u001B[0m (\u001B[38;5;28mself\u001B[39m,),\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 519\u001B[0m inputs\u001B[38;5;241m=\u001B[39minputs,\n\u001B[1;32m 520\u001B[0m )\n\u001B[0;32m--> 521\u001B[0m torch\u001B[38;5;241m.\u001B[39mautograd\u001B[38;5;241m.\u001B[39mbackward(\n\u001B[1;32m 522\u001B[0m \u001B[38;5;28mself\u001B[39m, gradient, retain_graph, create_graph, inputs\u001B[38;5;241m=\u001B[39minputs\n\u001B[1;32m 523\u001B[0m )\n", - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/__init__.py:289\u001B[0m, in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 284\u001B[0m retain_graph \u001B[38;5;241m=\u001B[39m create_graph\n\u001B[1;32m 286\u001B[0m \u001B[38;5;66;03m# The reason we repeat the same comment below is that\u001B[39;00m\n\u001B[1;32m 287\u001B[0m \u001B[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001B[39;00m\n\u001B[1;32m 288\u001B[0m \u001B[38;5;66;03m# calls in the traceback and some print out the last line\u001B[39;00m\n\u001B[0;32m--> 289\u001B[0m _engine_run_backward(\n\u001B[1;32m 290\u001B[0m tensors,\n\u001B[1;32m 291\u001B[0m grad_tensors_,\n\u001B[1;32m 292\u001B[0m retain_graph,\n\u001B[1;32m 293\u001B[0m create_graph,\n\u001B[1;32m 294\u001B[0m inputs,\n\u001B[1;32m 295\u001B[0m allow_unreachable\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 296\u001B[0m accumulate_grad\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 297\u001B[0m )\n", - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/graph.py:768\u001B[0m, in \u001B[0;36m_engine_run_backward\u001B[0;34m(t_outputs, *args, **kwargs)\u001B[0m\n\u001B[1;32m 766\u001B[0m unregister_hooks \u001B[38;5;241m=\u001B[39m 
_register_logging_hooks_on_whole_graph(t_outputs)\n\u001B[1;32m 767\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 768\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m Variable\u001B[38;5;241m.\u001B[39m_execution_engine\u001B[38;5;241m.\u001B[39mrun_backward( \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 769\u001B[0m t_outputs, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs\n\u001B[1;32m 770\u001B[0m ) \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 771\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[1;32m 772\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m attach_logging_hooks:\n", - "\u001B[0;31mKeyboardInterrupt\u001B[0m: " - ] - } - ], - "execution_count": 10 + "outputs": [], + "execution_count": null }, { "cell_type": "code", "id": "da2d4023002648dc", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:57.046254Z", - "start_time": "2024-07-31T12:57:57.045954Z" - } - }, + "metadata": {}, "source": [], "outputs": [], "execution_count": null