Commit 8d9e9dc

committed Mar 7, 2024

one-shot =p

0 parents · commit 8d9e9dc

16 files changed: +122,432 −0 lines
‎.github/workflows/build_env_run_tests.yml

+32

```yaml
name: build_env_run_tests
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:  # Allows running manually from GitHub's 'Actions' tab

jobs:
  build_env_run_tests:
    name: Build env and run tests
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Set up environment
        run: |
          python -m venv env
          source env/bin/activate
          python -m pip install -e .
      - name: Pytest with coverage report
        run: |
          python -m pytest -n auto --cov=nanogpt --cov-report=xml
      - name: Upload test coverage report to codecov
        uses: codecov/codecov-action@v3
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./coverage.xml
          verbose: true
```

‎data/gifs/austen_combo.gif

1.04 MB

‎data/gifs/shakespeare_combo.gif

1020 KB

‎data/tiny_austen.txt

+80,050
Large diffs are not rendered by default.

‎data/tiny_austen_tokens.txt

+2
(a blank first line, the newline token, followed by the character set)
!"&'()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz

‎data/tiny_shakespeare.txt

+40,000
Large diffs are not rendered by default.

‎data/tiny_shakespeare_tokens.txt

+2
(a blank first line, the newline token, followed by the character set)
!$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

‎license.md

+9
# Released under MIT License

Copyright (c) 2024 Jai Bhagat

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

‎nanogpt.py

+455
Large diffs are not rendered by default.

‎pyproject.toml

+128
```toml
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[project]
# See https://setuptools.pypa.io/en/latest/userguide/quickstart.html for more project configuration options.
name = "nanogpt"
version = "0.1.0"
readme = "readme.md"
classifiers = [
    "Intended Audience :: Science/Research",
    "Development Status :: 3 - Alpha",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
authors = [
    {name = "Jai Bhagat", email = "jkbhagatio@gmail.com"}
]
requires-python = ">=3.10"
dependencies = [
    "black[jupyter]",
    "ipdb",
    "ipykernel",
    "isort",
    "jupyter",
    "jupyterlab",
    "matplotlib",
    "numpy",
    "optuna",
    "pyright",
    "pytest",
    "pytest-cov",
    "pytest-sphinx",
    "pytest-xdist",  # required for `pytest -n auto` in the CI workflow
    "ruff",
    "scikit-learn",
    "scipy",
    "setuptools",
    "tiktoken",
    "torch",
    "tqdm",
    "twine",
    "wandb",
    "wheel",
]
license = {file = "license.md"}

[project.urls]
Homepage = "https://github.com/jkbhagatio/nanogpt"
Repository = "https://github.com/jkbhagatio/nanogpt"

[project.optional-dependencies]
dev = [
]

[tool.setuptools.dynamic]
version = {attr = "nanogpt.version.VERSION"}

[tool.black]
line-length = 100
color = false
exclude = '''
/(
    \.git
  | \.mypy_cache
  | \.tox
  | \.venv
  | _build
  | build
  | dist
  | env
  | venv
)/
'''

[tool.isort]
profile = "black"
multi_line_output = 2

[tool.ruff]
select = ["E", "W", "F", "I", "D", "UP", "S", "B", "A", "C4", "ICN", "PIE", "PT", "SIM", "PL"]
line-length = 100
ignore = [
    "E201", "E202", "E203", "E231", "E731", "E702",
    "S101",
    "PT013",
    "PLR0912", "PLR0913", "PLR0915"
]
extend-exclude = [".git", ".github", ".idea", ".vscode"]

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]

[tool.ruff.pydocstyle]
convention = "google"

[tool.pyright]
reportMissingImports = "none"
reportImportCycles = "error"
reportUnusedImport = "error"
reportUnusedClass = "error"
reportUnusedFunction = "error"
reportUnusedVariable = "error"
reportDuplicateImport = "error"
reportWildcardImportFromLibrary = "error"
reportPrivateUsage = "error"
reportCallInDefaultInitializer = "error"
reportUnnecessaryIsInstance = "error"
reportUnnecessaryCast = "error"
reportUnnecessaryComparison = "error"
reportUnnecessaryContains = "error"
reportAssertAlwaysTrue = "error"
reportSelfClsParameterName = "error"
reportUnusedExpression = "error"
reportMatchNotExhaustive = "error"
reportShadowedImports = "error"
# *Note*: we may want to set all 'ReportOptional*' rules to "none", but leaving 'em default for now
venvPath = "."
venv = ".venv"

[tool.pytest.ini_options]
testpaths = "tests"
python_classes = [
    "Test*",
    "*Test"
]
log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
log_level = "DEBUG"
```

‎readme.md

+101
# nanoGPT

[![build_and_tests](https://github.com/jkbhagatio/nanoGPT/actions/workflows/build_env_run_tests.yml/badge.svg)](https://github.com/jkbhagatio/nanoGPT/actions/workflows/build_env_run_tests.yml)

A minimal (nanomal?) repository containing code for building, training, and running nanoGPT: a nano-version of OpenAI's GPT-3 decoder-only Transformer, following Andrej Karpathy's tutorial: https://www.youtube.com/watch?v=kCc8FmEb1nY

A nanoGPT trained with this codebase acts only as a character-level text completer (i.e., it stops at the end of the "pretraining stage" of typical Large Language Model development, with each token being a single character).
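The token vocabulary is simply the set of unique characters in the training text (see `data/tiny_shakespeare_tokens.txt` and `data/tiny_austen_tokens.txt`). A minimal sketch of the idea, with illustrative names (`stoi`/`itos`) rather than this repo's actual API:

```python
# A minimal sketch of character-level tokenization (illustrative names,
# not necessarily the functions used in nanogpt.py).
with open("data/tiny_shakespeare.txt", encoding="utf-8") as f:
    text = f.read()

chars = sorted(set(text))                      # vocabulary: every unique character
stoi = {ch: i for i, ch in enumerate(chars)}   # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}   # integer id -> character

def encode(s: str) -> list[int]:
    return [stoi[c] for c in s]

def decode(ids: list[int]) -> str:
    return "".join(itos[i] for i in ids)

assert decode(encode("Wherefore art thou")) == "Wherefore art thou"
```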
Multi-head self-attention is implemented "from scratch", at the level of PyTorch tensors. These "self-attention units" are combined into "transformer blocks", which are then stacked in the nanoGPT model class.

While the overall architecture is similar, this nanoGPT departs from Karpathy's nanoGPT in: naming conventions, data loading and training configuration, the projection of embedding dimensions to attention heads, the format of operations in self-attention units and transformer blocks, model output generation (adding parameters such as `temp` and `top_k`), and more.
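For a sense of what "from scratch, at the level of PyTorch tensors" looks like, here is a minimal sketch of a single causal self-attention head (illustrative only; the actual `Head` class in `nanogpt.py` may differ in its details):

```python
import torch
import torch.nn.functional as F
from torch import nn

class CausalHead(nn.Module):
    """A single causal self-attention head, sketched at the tensor level."""

    def __init__(self, head_sz: int, emb_dim: int, ctx_len: int):
        super().__init__()
        self.key = nn.Linear(emb_dim, head_sz, bias=False)
        self.query = nn.Linear(emb_dim, head_sz, bias=False)
        self.value = nn.Linear(emb_dim, head_sz, bias=False)
        # Lower-triangular mask: position t may attend only to positions <= t.
        self.register_buffer("mask", torch.tril(torch.ones(ctx_len, ctx_len)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (batch, time, emb_dim)
        T = x.shape[1]
        k, q, v = self.key(x), self.query(x), self.value(x)
        att = (q @ k.transpose(-2, -1)) * k.shape[-1] ** -0.5  # scaled scores: (batch, T, T)
        att = att.masked_fill(self.mask[:T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        return att @ v  # (batch, T, head_sz)
```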
## Examples

### nanoGPT-Shakespeare

Trained on the complete works of William Shakespeare.

Output generated from models trained after approximately 320,000 (top), 640,000 (middle), and 960,000 (bottom) examples.

![nanoGPT-Shakespeare-GIF](./data/gifs/shakespeare_combo.gif)

### nanoGPT-Austen

Trained on the complete works of Jane Austen.

Output generated from models trained after approximately 320,000 (top), 640,000 (middle), and 960,000 (bottom) examples.

![nanoGPT-Austen-GIF](./data/gifs/austen_combo.gif)
## Repository Contents

- `nanogpt.py` contains code for building, training, and running nanoGPT.

- `tutorial.ipynb` is a notebook that serves as a step-by-step tutorial for building nanoGPT.

- `data/` contains the works of Shakespeare and Austen in .txt format, which can be used to train nanoGPT.

- `tests/` contains tests, run via pytest, for verifying that components of nanoGPT work as expected.

- `.github/workflows/` contains a GitHub Actions workflow for building the Python environment, running tests, and uploading the results to Codecov.
## Usage

### Python environment creation

0. Create and activate a Python >= 3.10 environment, clone this repository, and then, within the repository's directory, run `pip install -e .` to install the necessary Python package dependencies from pyproject.toml.

### Run a pretrained nanoGPT model

1. Download a pretrained nanoGPT pytorch model (.pth), its config file (.json), and its tokens file (.txt) from [here](https://drive.google.com/drive/folders/1M99XHrX31O8opWYHzTnvVBwEkYadH5ct?usp=sharing) (e.g. `nanogpt_shakespeare.pth`, `nanogpt_shakespeare_config.json`, `tiny_shakespeare_tokens.txt`) and _**save them in a new directory with no other files**_.
2. Then, in the terminal, within this directory and with the Python environment activated, run:

```bash
python -m nanogpt --model-dir "<path/to/downloaded_files/>" --in-txt "Wherefore art thou, Romeo? We are such stuff as dreams are made on. The course of true love never did run smooth." --n-tokens 200
```

(Run `python -m nanogpt --help` to see additional command-line arguments.)
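The sampling controls `temp` and `top_k` noted above are part of `generate`. A sketch of what they typically do at each sampling step (illustrative; not necessarily the exact implementation in `nanogpt.py`):

```python
import torch
import torch.nn.functional as F

def sample_next_token(logits: torch.Tensor, temp: float = 1.0, top_k: int | None = None) -> int:
    """Sample one token id from a (vocab_size,) vector of logits."""
    logits = logits / temp  # temp < 1 sharpens the distribution, temp > 1 flattens it
    if top_k is not None:
        kth_best = torch.topk(logits, top_k).values[-1]
        logits = logits.masked_fill(logits < kth_best, float("-inf"))  # keep only top-k
    probs = F.softmax(logits, dim=-1)
    return int(torch.multinomial(probs, num_samples=1))
```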
### Train a nanoGPT model

E.g. on the selected works of Jane Austen: activate the Python environment, launch a Python interpreter within this repository's directory, and run:

```python
# Imports
import sys
from pathlib import Path

import torch
from torch import nn

# Set torch device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Import nanogpt
nanogpt_dir = Path.cwd()
sys.path.append(str(nanogpt_dir))  # `sys.path` entries must be strings
import nanogpt

# Load in the text file to train on and build dataloaders
works_file = nanogpt_dir / "data/tiny_austen.txt"
ctx_len = 256
X, Y = nanogpt.build_dataset(works_file, ctx_len)
train_loader, val_loader, test_loader = nanogpt.build_dataloaders(X, Y)

# Instantiate a nanoGPT model object
tokens_file = nanogpt_dir / "data/tiny_austen_tokens.txt"
with open(tokens_file) as f:
    tokens = f.read()
model = nanogpt.NanoGPT(n_tokens=len(tokens), ctx_len=ctx_len)

# Train nanoGPT (the optimizer takes the model's parameters, not the module's)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
nanogpt.train(model, train_loader, val_loader, optimizer, loss_fn, max_epochs=1)  # see nanogpt.py for additional params for `train()`

# Wait =p ... and then generate output
n_gen_tokens = 500  # number of tokens to generate
nanogpt.generate(model, tokens, n_tokens=n_gen_tokens)
```
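To reuse the trained model later (e.g. with `python -m nanogpt --model-dir`, as in the pretrained-model section above), the weights can be saved with standard PyTorch utilities. A sketch; note that `--model-dir` also expects a matching config `.json` and tokens file, which this snippet does not produce, and the output filename is illustrative:

```python
# Save the trained weights (filename is illustrative).
torch.save(model.state_dict(), nanogpt_dir / "nanogpt_austen.pth")
```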
3 binary files not shown (one is 4.48 KB).

‎tests/test_nanogpt.py

+72
```python
"""Tests transformer components of NanoGPT."""

import sys
from pathlib import Path

filepath = Path(__file__)
sys.path.append(str(filepath.parent.parent.resolve()))

import torch

from nanogpt import Block, Feedforward, Head, MultiHead, NanoGPT, generate

# Setup.
tokens_file = filepath.parent.parent / "data/tiny_shakespeare_tokens.txt"
with open(tokens_file) as f:
    tokens = list(f.read())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nanogpt = NanoGPT(
    n_tokens=len(tokens),
    ctx_len=256,
    n_blocks=6,
    n_heads=8,
    head_sz=48,
    emb_dim=384,
    ff_dim=4,
    dropout=0.1,
).to(device)
batch_sz = 32


def test_Head(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests Head forward pass."""
    head = Head(nanogpt.head_sz, nanogpt.emb_dim).to(device)
    x = torch.rand(batch_sz, nanogpt.ctx_len, nanogpt.emb_dim).to(device)
    assert head(x).shape == (batch_sz, nanogpt.ctx_len, nanogpt.head_sz)


def test_MultiHead(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests MultiHead forward pass."""
    multi_head = MultiHead(nanogpt.n_heads, nanogpt.head_sz, nanogpt.emb_dim).to(device)
    x = torch.rand(batch_sz, nanogpt.ctx_len, nanogpt.emb_dim).to(device)
    assert multi_head(x).shape == (batch_sz, nanogpt.ctx_len, nanogpt.n_heads * nanogpt.head_sz)


def test_Feedforward(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests Feedforward forward pass."""
    feedforward = Feedforward(nanogpt.emb_dim, nanogpt.ff_dim).to(device)
    x = torch.rand(batch_sz, nanogpt.n_heads * nanogpt.head_sz, nanogpt.emb_dim).to(device)
    assert feedforward(x).shape == (batch_sz, nanogpt.emb_dim, nanogpt.emb_dim)


def test_Block(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests Block forward pass."""
    block = Block(
        nanogpt.n_heads, nanogpt.head_sz, nanogpt.emb_dim, nanogpt.ff_dim, dropout=0.1
    ).to(device)
    x = torch.rand(batch_sz, nanogpt.ctx_len, nanogpt.emb_dim).to(device)
    assert block(x).shape == (batch_sz, nanogpt.ctx_len, nanogpt.emb_dim)


def test_NanoGPT(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests NanoGPT forward pass."""
    x = torch.randint(0, nanogpt.n_tokens, (batch_sz, nanogpt.ctx_len)).to(device)
    assert nanogpt(x).shape == (batch_sz, nanogpt.ctx_len, nanogpt.n_tokens)


def test_generate(nanogpt=nanogpt, tokens=tokens):
    """Tests `generate`."""
    in_txt = "test"
    generated_text = generate(nanogpt, tokens, in_txt=in_txt, n_tokens=100, temp=0.5, top_k=50, seed=42)
    assert isinstance(generated_text, str)
    assert len(generated_text) > len(in_txt)
```
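As an example of extending this suite, a determinism check could be built on `generate`'s `seed` parameter; a sketch, assuming a fixed seed makes sampling reproducible:

```python
def test_generate_deterministic(nanogpt=nanogpt, tokens=tokens):
    """Sketch: a fixed seed should reproduce identical output (assumed behavior)."""
    out_a = generate(nanogpt, tokens, in_txt="test", n_tokens=50, seed=42)
    out_b = generate(nanogpt, tokens, in_txt="test", n_tokens=50, seed=42)
    assert out_a == out_b
```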

‎tutorial.ipynb

+1,581
Large diffs are not rendered by default.
