Commit 8d9e9dc

committed Mar 7, 2024

one-shot =p

0 parents · commit 8d9e9dc

16 files changed: +122,432 −0 lines
‎.github/workflows/build_env_run_tests.yml

+32

```yaml
name: build_env_run_tests
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:  # Allows running manually from GitHub's 'Actions' tab

jobs:
  build_env_run_tests:
    name: Build env and run tests
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Set up environment
        run: |
          python -m venv env
          source env/bin/activate
          python -m pip install -e .
      - name: Pytest with coverage report
        run: |
          python -m pytest -n auto --cov=nanogpt --cov-report=xml
      - name: Upload test coverage report to codecov
        uses: codecov/codecov-action@v3
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./coverage.xml
          verbose: true
```

‎data/gifs/austen_combo.gif

1.04 MB

‎data/gifs/shakespeare_combo.gif

1020 KB

‎data/tiny_austen.txt

+80,050
Large diffs are not rendered by default.

‎data/tiny_austen_tokens.txt

+2
(a blank first line, the newline token, followed by the character set)
!"&'()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz

‎data/tiny_shakespeare.txt

+40,000
Large diffs are not rendered by default.

‎data/tiny_shakespeare_tokens.txt

+2
(a blank first line, the newline token, followed by the character set)
!$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

‎license.md

+9
# Released under MIT License

Copyright (c) 2024 Jai Bhagat

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

‎nanogpt.py

+455
Large diffs are not rendered by default.

‎pyproject.toml

+128
```toml
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[project]
# See https://setuptools.pypa.io/en/latest/userguide/quickstart.html for more project configuration options.
name = "nanogpt"
version = "0.1.0"
readme = "readme.md"
classifiers = [
    "Intended Audience :: Science/Research",
    "Development Status :: 3 - Alpha",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
authors = [
    {name = "Jai Bhagat", email = "jkbhagatio@gmail.com"}
]
requires-python = ">=3.10"
dependencies = [
    "black[jupyter]",
    "ipdb",
    "ipykernel",
    "isort",
    "jupyter",
    "jupyterlab",
    "matplotlib",
    "numpy",
    "optuna",
    "pyright",
    "pytest",
    "pytest-cov",
    "pytest-sphinx",
    "pytest-xdist",  # required for `pytest -n auto` in the CI workflow
    "ruff",
    "scikit-learn",
    "scipy",
    "setuptools",
    "tiktoken",
    "torch",
    "tqdm",
    "twine",
    "wandb",
    "wheel",
]
license = {file = "license.md"}

[project.urls]
Homepage = "https://github.com/jkbhagatio/nanogpt"
Repository = "https://github.com/jkbhagatio/nanogpt"

[project.optional-dependencies]
dev = [
]

[tool.setuptools.dynamic]
version = {attr = "nanogpt.version.VERSION"}

[tool.black]
line-length = 100
color = false
exclude = '''
/(
    \.git
  | \.mypy_cache
  | \.tox
  | \.venv
  | _build
  | build
  | dist
  | env
  | venv
)/
'''

[tool.isort]
profile = "black"
multi_line_output = 2

[tool.ruff]
select = ["E", "W", "F", "I", "D", "UP", "S", "B", "A", "C4", "ICN", "PIE", "PT", "SIM", "PL"]
line-length = 100
ignore = [
    "E201", "E202", "E203", "E231", "E731", "E702",
    "S101",
    "PT013",
    "PLR0912", "PLR0913", "PLR0915"
]
extend-exclude = [".git", ".github", ".idea", ".vscode"]

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]

[tool.ruff.pydocstyle]
convention = "google"

[tool.pyright]
reportMissingImports = "none"
reportImportCycles = "error"
reportUnusedImport = "error"
reportUnusedClass = "error"
reportUnusedFunction = "error"
reportUnusedVariable = "error"
reportDuplicateImport = "error"
reportWildcardImportFromLibrary = "error"
reportPrivateUsage = "error"
reportCallInDefaultInitializer = "error"
reportUnnecessaryIsInstance = "error"
reportUnnecessaryCast = "error"
reportUnnecessaryComparison = "error"
reportUnnecessaryContains = "error"
reportAssertAlwaysTrue = "error"
reportSelfClsParameterName = "error"
reportUnusedExpression = "error"
reportMatchNotExhaustive = "error"
reportShadowedImports = "error"
# *Note*: we may want to set all 'ReportOptional*' rules to "none", but leaving 'em default for now
venvPath = "."
venv = ".venv"

[tool.pytest.ini_options]
testpaths = "tests"
python_classes = [
    "Test*",
    "*Test"
]
log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
log_level = "DEBUG"
```

‎readme.md

+101
# nanoGPT

[![build_and_tests](https://github.com/jkbhagatio/nanoGPT/actions/workflows/build_env_run_tests.yml/badge.svg)](https://github.com/jkbhagatio/nanoGPT/actions/workflows/build_env_run_tests.yml)

A minimal (nanomal?) repository containing code for building, training, and running nanoGPT: a nano-version of OpenAI's GPT-3 decoder-only Transformer, following Andrej Karpathy's tutorial: https://www.youtube.com/watch?v=kCc8FmEb1nY

A nanoGPT trained with this codebase acts only as a character-level text completer (i.e., it stops at the end of the "pretraining stage" of typical Large Language Model development, with each token being a single character).
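The token vocabulary is simply the set of unique characters in the training text (see `data/tiny_shakespeare_tokens.txt` and `data/tiny_austen_tokens.txt`). A minimal sketch of the idea, with illustrative names (`stoi`/`itos`) rather than this repo's actual API:

```python
# A minimal sketch of character-level tokenization (illustrative names,
# not necessarily the functions used in nanogpt.py).
with open("data/tiny_shakespeare.txt", encoding="utf-8") as f:
    text = f.read()

chars = sorted(set(text))                      # vocabulary: every unique character
stoi = {ch: i for i, ch in enumerate(chars)}   # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}   # integer id -> character

def encode(s: str) -> list[int]:
    return [stoi[c] for c in s]

def decode(ids: list[int]) -> str:
    return "".join(itos[i] for i in ids)

assert decode(encode("Wherefore art thou")) == "Wherefore art thou"
```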
Multi-head self-attention is implemented "from scratch", at the level of PyTorch tensors. These "self-attention units" are combined into "transformer blocks", which are then stacked in the nanoGPT model class.

While the overall architecture is similar, this nanoGPT departs from Karpathy's nanoGPT in: naming conventions, data loading and training configuration, the projection of embedding dimensions to attention heads, the format of operations in self-attention units and transformer blocks, model output generation (adding parameters such as `temp` and `top_k`), and more.
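For a sense of what "from scratch, at the level of PyTorch tensors" looks like, here is a minimal sketch of a single causal self-attention head (illustrative only; the actual `Head` class in `nanogpt.py` may differ in its details):

```python
import torch
import torch.nn.functional as F
from torch import nn

class CausalHead(nn.Module):
    """A single causal self-attention head, sketched at the tensor level."""

    def __init__(self, head_sz: int, emb_dim: int, ctx_len: int):
        super().__init__()
        self.key = nn.Linear(emb_dim, head_sz, bias=False)
        self.query = nn.Linear(emb_dim, head_sz, bias=False)
        self.value = nn.Linear(emb_dim, head_sz, bias=False)
        # Lower-triangular mask: position t may attend only to positions <= t.
        self.register_buffer("mask", torch.tril(torch.ones(ctx_len, ctx_len)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (batch, time, emb_dim)
        T = x.shape[1]
        k, q, v = self.key(x), self.query(x), self.value(x)
        att = (q @ k.transpose(-2, -1)) * k.shape[-1] ** -0.5  # scaled scores: (batch, T, T)
        att = att.masked_fill(self.mask[:T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        return att @ v  # (batch, T, head_sz)
```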
## Examples

### nanoGPT-Shakespeare

Trained on the complete works of William Shakespeare.

Output generated from models trained after approximately 320,000 (top), 640,000 (middle), and 960,000 (bottom) examples.

![nanoGPT-Shakespeare-GIF](./data/gifs/shakespeare_combo.gif)

### nanoGPT-Austen

Trained on the complete works of Jane Austen.

Output generated from models trained after approximately 320,000 (top), 640,000 (middle), and 960,000 (bottom) examples.

![nanoGPT-Austen-GIF](./data/gifs/austen_combo.gif)
## Repository Contents

- `nanogpt.py` contains code for building, training, and running nanoGPT.

- `tutorial.ipynb` is a notebook that serves as a step-by-step tutorial for building nanoGPT.

- `data/` contains the works of Shakespeare and Austen in .txt format, which can be used to train nanoGPT.

- `tests/` contains tests, run via pytest, for verifying that components of nanoGPT work as expected.

- `.github/workflows/` contains a GitHub Actions workflow for building the Python environment, running tests, and uploading the results to Codecov.
## Usage

### Python environment creation

0. Create and activate a Python >= 3.10 environment, clone this repository, and then, within the repository's directory, run `pip install -e .` to install the necessary Python package dependencies from pyproject.toml.

### Run a pretrained nanoGPT model

1. Download a pretrained nanoGPT pytorch model (.pth), its config file (.json), and its tokens file (.txt) from [here](https://drive.google.com/drive/folders/1M99XHrX31O8opWYHzTnvVBwEkYadH5ct?usp=sharing) (e.g. `nanogpt_shakespeare.pth`, `nanogpt_shakespeare_config.json`, `tiny_shakespeare_tokens.txt`) and _**save them in a new directory with no other files**_.
2. Then, in the terminal, within this directory and with the Python environment activated, run:

```bash
python -m nanogpt --model-dir "<path/to/downloaded_files/>" --in-txt "Wherefore art thou, Romeo? We are such stuff as dreams are made on. The course of true love never did run smooth." --n-tokens 200
```

(Run `python -m nanogpt --help` to see additional command-line arguments.)
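The sampling controls `temp` and `top_k` noted above are part of `generate`. A sketch of what they typically do at each sampling step (illustrative; not necessarily the exact implementation in `nanogpt.py`):

```python
import torch
import torch.nn.functional as F

def sample_next_token(logits: torch.Tensor, temp: float = 1.0, top_k: int | None = None) -> int:
    """Sample one token id from a (vocab_size,) vector of logits."""
    logits = logits / temp  # temp < 1 sharpens the distribution, temp > 1 flattens it
    if top_k is not None:
        kth_best = torch.topk(logits, top_k).values[-1]
        logits = logits.masked_fill(logits < kth_best, float("-inf"))  # keep only top-k
    probs = F.softmax(logits, dim=-1)
    return int(torch.multinomial(probs, num_samples=1))
```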
### Train a nanoGPT model

E.g. on the selected works of Jane Austen: activate the Python environment, launch a Python interpreter within this repository's directory, and run:

```python
# Imports
import sys
from pathlib import Path

import torch
from torch import nn

# Set torch device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Import nanogpt
nanogpt_dir = Path.cwd()
sys.path.append(str(nanogpt_dir))  # `sys.path` entries must be strings
import nanogpt

# Load in the text file to train on and build dataloaders
works_file = nanogpt_dir / "data/tiny_austen.txt"
ctx_len = 256
X, Y = nanogpt.build_dataset(works_file, ctx_len)
train_loader, val_loader, test_loader = nanogpt.build_dataloaders(X, Y)

# Instantiate a nanoGPT model object
tokens_file = nanogpt_dir / "data/tiny_austen_tokens.txt"
with open(tokens_file) as f:
    tokens = f.read()
model = nanogpt.NanoGPT(n_tokens=len(tokens), ctx_len=ctx_len)

# Train nanoGPT (the optimizer takes the model's parameters, not the module's)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
nanogpt.train(model, train_loader, val_loader, optimizer, loss_fn, max_epochs=1)  # see nanogpt.py for additional params for `train()`

# Wait =p ... and then generate output
n_gen_tokens = 500  # number of tokens to generate
nanogpt.generate(model, tokens, n_tokens=n_gen_tokens)
```
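To reuse the trained model later (e.g. with `python -m nanogpt --model-dir`, as in the pretrained-model section above), the weights can be saved with standard PyTorch utilities. A sketch; note that `--model-dir` also expects a matching config `.json` and tokens file, which this snippet does not produce, and the output filename is illustrative:

```python
# Save the trained weights (filename is illustrative).
torch.save(model.state_dict(), nanogpt_dir / "nanogpt_austen.pth")
```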
3 binary files not shown (one is 4.48 KB).

‎tests/test_nanogpt.py

+72
```python
"""Tests transformer components of NanoGPT."""

import sys
from pathlib import Path

filepath = Path(__file__)
sys.path.append(str(filepath.parent.parent.resolve()))

import torch

from nanogpt import Block, Feedforward, Head, MultiHead, NanoGPT, generate

# Setup.
tokens_file = filepath.parent.parent / "data/tiny_shakespeare_tokens.txt"
with open(tokens_file) as f:
    tokens = list(f.read())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nanogpt = NanoGPT(
    n_tokens=len(tokens),
    ctx_len=256,
    n_blocks=6,
    n_heads=8,
    head_sz=48,
    emb_dim=384,
    ff_dim=4,
    dropout=0.1,
).to(device)
batch_sz = 32


def test_Head(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests Head forward pass."""
    head = Head(nanogpt.head_sz, nanogpt.emb_dim).to(device)
    x = torch.rand(batch_sz, nanogpt.ctx_len, nanogpt.emb_dim).to(device)
    assert head(x).shape == (batch_sz, nanogpt.ctx_len, nanogpt.head_sz)


def test_MultiHead(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests MultiHead forward pass."""
    multi_head = MultiHead(nanogpt.n_heads, nanogpt.head_sz, nanogpt.emb_dim).to(device)
    x = torch.rand(batch_sz, nanogpt.ctx_len, nanogpt.emb_dim).to(device)
    assert multi_head(x).shape == (batch_sz, nanogpt.ctx_len, nanogpt.n_heads * nanogpt.head_sz)


def test_Feedforward(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests Feedforward forward pass."""
    feedforward = Feedforward(nanogpt.emb_dim, nanogpt.ff_dim).to(device)
    x = torch.rand(batch_sz, nanogpt.n_heads * nanogpt.head_sz, nanogpt.emb_dim).to(device)
    assert feedforward(x).shape == (batch_sz, nanogpt.emb_dim, nanogpt.emb_dim)


def test_Block(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests Block forward pass."""
    block = Block(
        nanogpt.n_heads, nanogpt.head_sz, nanogpt.emb_dim, nanogpt.ff_dim, dropout=0.1
    ).to(device)
    x = torch.rand(batch_sz, nanogpt.ctx_len, nanogpt.emb_dim).to(device)
    assert block(x).shape == (batch_sz, nanogpt.ctx_len, nanogpt.emb_dim)


def test_NanoGPT(nanogpt=nanogpt, batch_sz=batch_sz):
    """Tests NanoGPT forward pass."""
    x = torch.randint(0, nanogpt.n_tokens, (batch_sz, nanogpt.ctx_len)).to(device)
    assert nanogpt(x).shape == (batch_sz, nanogpt.ctx_len, nanogpt.n_tokens)


def test_generate(nanogpt=nanogpt, tokens=tokens):
    """Tests `generate`."""
    in_txt = "test"
    generated_text = generate(nanogpt, tokens, in_txt=in_txt, n_tokens=100, temp=0.5, top_k=50, seed=42)
    assert isinstance(generated_text, str)
    assert len(generated_text) > len(in_txt)
```
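As an example of extending this suite, a determinism check could be built on `generate`'s `seed` parameter; a sketch, assuming a fixed seed makes sampling reproducible:

```python
def test_generate_deterministic(nanogpt=nanogpt, tokens=tokens):
    """Sketch: a fixed seed should reproduce identical output (assumed behavior)."""
    out_a = generate(nanogpt, tokens, in_txt="test", n_tokens=50, seed=42)
    out_b = generate(nanogpt, tokens, in_txt="test", n_tokens=50, seed=42)
    assert out_a == out_b
```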

‎tutorial.ipynb

+1,581
Large diffs are not rendered by default.
