Migrate aitextgen to pytorch-lightning 2.0.

Summary of the change set:
  * .gitignore: ignore local test/training artifacts.
  * README.md: wrap the quick-start example in main() behind a __main__ guard
    so that worker processes can re-import the script safely.
  * aitextgen/aitextgen.py: replace Trainer arguments removed in PL 2.0
    (gpus, tpu_cores) with accelerator/devices; build the DeepSpeed precision
    plugin with 16-bit mixed precision, which this branch always enforces.
  * aitextgen/train.py: ProgressBarBase was renamed to ProgressBar; read the
    loss from the batch outputs instead of the removed metrics helper.
  * requirements.txt / setup.py: pin compatible dependency ranges and raise
    the minimum Python version to the one required by pytorch-lightning 2.0.

diff --git a/.gitignore b/.gitignore
index 781e463..16fb53a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,12 @@
-__pycache__
-test_notebooks/
-/build
-/dist
-*.egg-info
-.vscode/settings.json
-/site
-.DS_Store
\ No newline at end of file
+__pycache__
+test_notebooks/
+/build
+/dist
+*.egg-info
+.vscode/settings.json
+/site
+.DS_Store
+
+/test
+/trained_model
+aitextgen.tokenizer.json
\ No newline at end of file
diff --git a/README.md b/README.md
index 7e644df..6f767b7 100644
--- a/README.md
+++ b/README.md
@@ -62,39 +62,46 @@ from aitextgen.tokenizers import train_tokenizer
 from aitextgen.utils import GPT2ConfigCPU
 from aitextgen import aitextgen
 
-# The name of the downloaded Shakespeare text for training
-file_name = "input.txt"
-
-# Train a custom BPE Tokenizer on the downloaded text
-# This will save one file: `aitextgen.tokenizer.json`, which contains the
-# information needed to rebuild the tokenizer.
-train_tokenizer(file_name)
-tokenizer_file = "aitextgen.tokenizer.json"
-
-# GPT2ConfigCPU is a mini variant of GPT-2 optimized for CPU-training
-# e.g. the # of input tokens here is 64 vs. 1024 for base GPT-2.
-config = GPT2ConfigCPU()
-
-# Instantiate aitextgen using the created tokenizer and config
-ai = aitextgen(tokenizer_file=tokenizer_file, config=config)
-
-# You can build datasets for training by creating TokenDatasets,
-# which automatically processes the dataset with the appropriate size.
-data = TokenDataset(file_name, tokenizer_file=tokenizer_file, block_size=64)
-
-# Train the model! It will save pytorch_model.bin periodically and after completion to the `trained_model` folder.
-# On a 2020 8-core iMac, this took ~25 minutes to run.
-ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=5000)
-
-# Generate text from it!
-ai.generate(10, prompt="ROMEO:")
-
-# With your trained model, you can reload the model at any time by
-# providing the folder containing the pytorch_model.bin model weights + the config, and providing the tokenizer.
-ai2 = aitextgen(model_folder="trained_model",
-                tokenizer_file="aitextgen.tokenizer.json")
-
-ai2.generate(10, prompt="ROMEO:")
+# Your code needs to be wrapped inside a main function,
+# as otherwise multiple child processes from pytorch_lightning cannot be spawned
+def main():
+    # The name of the downloaded Shakespeare text for training
+    file_name = "input.txt"
+
+    # Train a custom BPE Tokenizer on the downloaded text
+    # This will save one file: `aitextgen.tokenizer.json`, which contains the
+    # information needed to rebuild the tokenizer.
+    train_tokenizer(file_name)
+    tokenizer_file = "aitextgen.tokenizer.json"
+
+    # GPT2ConfigCPU is a mini variant of GPT-2 optimized for CPU-training
+    # e.g. the # of input tokens here is 64 vs. 1024 for base GPT-2.
+    config = GPT2ConfigCPU()
+
+    # Instantiate aitextgen using the created tokenizer and config
+    ai = aitextgen(tokenizer_file=tokenizer_file, config=config)
+
+    # You can build datasets for training by creating TokenDatasets,
+    # which automatically processes the dataset with the appropriate size.
+    data = TokenDataset(file_name, tokenizer_file=tokenizer_file, block_size=64)
+
+    # Train the model! It will save pytorch_model.bin periodically and after completion to the `trained_model` folder.
+    # On a 2020 8-core iMac, this took ~25 minutes to run.
+    ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=5000)
+
+    # Generate text from it!
+    ai.generate(10, prompt="ROMEO:")
+
+    # With your trained model, you can reload the model at any time by
+    # providing the folder containing the pytorch_model.bin model weights + the config, and providing the tokenizer.
+    ai2 = aitextgen(model_folder="trained_model",
+                    tokenizer_file="aitextgen.tokenizer.json")
+
+    ai2.generate(10, prompt="ROMEO:")
+
+
+if __name__ == "__main__":
+    main()
 ```
 
 Want to run aitextgen and finetune GPT-2? Use the Colab notebooks in the Demos section, or [follow the documentation](https://docs.aitextgen.io/) to get more information and learn some helpful tips!
diff --git a/aitextgen/aitextgen.py b/aitextgen/aitextgen.py
index 93c0291..5f4c845 100644
--- a/aitextgen/aitextgen.py
+++ b/aitextgen/aitextgen.py
@@ -11,7 +11,7 @@ import pytorch_lightning as pl
 import torch
 from pkg_resources import resource_filename
-# from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin
 from tqdm.auto import trange
 from transformers import (
     AutoConfig,
@@ -698,7 +698,7 @@ def train(
         # use the DeepSpeed plugin if installed and specified
         deepspeed_plugin = None
         if is_gpu_used and use_deepspeed:
-            deepspeed_plugin = DeepSpeedPlugin()
+            deepspeed_plugin = DeepSpeedPrecisionPlugin("16-mixed")
             logger.info("Using DeepSpeed training.")
             if not fp16:
                 logger.info("Setting FP16 to True for DeepSpeed ZeRO Training.")
@@ -706,7 +706,9 @@ def train(
 
         train_params = dict(
             accumulate_grad_batches=gradient_accumulation_steps,
-            gpus=n_gpu,
+            # PL 2.0 removed `gpus`; select accelerator and device count explicitly
+            accelerator="gpu" if n_gpu else "cpu",
+            devices=n_gpu if n_gpu else "auto",
             max_steps=num_steps,
             gradient_clip_val=max_grad_norm,
             enable_checkpointing=False,  #checkpoint_callback deprecated in pytorch_lighning v1.7
@@ -737,7 +739,8 @@ def train(
 
         if tpu_cores > 0:
-            train_params["tpu_cores"] = tpu_cores
-            train_params["gpus"] = 0
+            # PL 2.0 removed `tpu_cores`; TPUs are chosen via accelerator/devices
+            train_params["accelerator"] = "tpu"
+            train_params["devices"] = tpu_cores
             n_gpu = 0
 
         # benchmark gives a boost for GPUs if input size is constant,
diff --git a/aitextgen/train.py b/aitextgen/train.py
index a73fc6c..933e80b 100644
--- a/aitextgen/train.py
+++ b/aitextgen/train.py
@@ -10,7 +10,7 @@ from transformers import get_linear_schedule_with_warmup
 
 
 import pytorch_lightning as pl
-from pytorch_lightning.callbacks.progress import ProgressBarBase
+from pytorch_lightning.callbacks.progress.progress_bar import ProgressBar
 from pytorch_lightning.accelerators import TPUAccelerator
 
 
@@ -83,7 +83,7 @@ def configure_optimizers(self):
         return [optimizer], [scheduler]
 
 
-class ATGProgressBar(ProgressBarBase):
+class ATGProgressBar(ProgressBar):
     """A variant progress bar that works off of steps and prints periodically."""
 
     def __init__(
@@ -156,8 +156,7 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
         if self.steps == 0 and self.gpu:
             torch.cuda.empty_cache()
 
-        metrics = self.get_metrics(trainer, pl_module)
-        current_loss = float(metrics["loss"])
+        current_loss = float(outputs["loss"])
         self.steps += 1
         avg_loss = 0
         if current_loss == current_loss:  # don't add if current_loss is NaN
diff --git a/requirements.txt b/requirements.txt
index a9e23ef..bfa9426 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-transformers>=4.5.1
-fire>=0.3.0
-pytorch-lightning>=1.8.0
-torch>=1.6.0
+fire~=0.5.0
+pytorch-lightning~=2.0.0
+transformers~=4.26.0
+torch~=1.13.0
diff --git a/setup.py b/setup.py
index 03c16fd..90a6362 100644
--- a/setup.py
+++ b/setup.py
@@ -1,25 +1,25 @@
-from setuptools import setup
-
-setup(
-    name="aitextgen",
-    packages=["aitextgen"],  # this must be the same as the name above
-    version="0.6.0",
-    description="A robust Python tool for text-based AI training and generation using GPT-2.",
-    long_description=open("README.md", "r", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    author="Max Woolf",
-    author_email="max@minimaxir.com",
-    url="https://github.com/minimaxir/aitextgen",
-    keywords=["gpt-2", "gpt2", "text generation", "ai"],
-    classifiers=[],
-    license="MIT",
-    entry_points={"console_scripts": ["aitextgen=aitextgen.cli:aitextgen_cli"]},
-    python_requires=">=3.6",
-    include_package_data=True,
-    install_requires=[
-        "transformers>=4.5.1",
-        "fire>=0.3.0",
-        "pytorch-lightning>=1.7.0",
-        "torch>=1.6.0",
-    ],
-)
+from setuptools import setup
+
+setup(
+    name="aitextgen",
+    packages=["aitextgen"],  # this must be the same as the name above
+    version="0.6.1",
+    description="A robust Python tool for text-based AI training and generation using GPT-2.",
+    long_description=open("README.md", "r", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    author="Max Woolf",
+    author_email="max@minimaxir.com",
+    url="https://github.com/minimaxir/aitextgen",
+    keywords=["gpt-2", "gpt2", "text generation", "ai"],
+    classifiers=[],
+    license="MIT",
+    entry_points={"console_scripts": ["aitextgen=aitextgen.cli:aitextgen_cli"]},
+    python_requires=">=3.8",
+    include_package_data=True,
+    install_requires=[
+        "fire~=0.5.0",
+        "pytorch-lightning~=2.0.0",
+        "transformers~=4.26.0",
+        "torch~=1.13.0",
+    ],
+)