Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding replay into GPT-NeoX #1200

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
added CPT code
bentherien committed Apr 14, 2024
commit 6ff3ae6ce5ffb4d5cac95de77f5e686a18b1c8bd
25 changes: 25 additions & 0 deletions configs/datasets/train/pile+slim_pajama_300B_each.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
  # Pile and SlimPajama are sampled with equal likelihood: the Pile weight
  # (50.0) matches the sum of the seven SlimPajama subset weights (~50.0).
  "train-data-paths": [
    "data/pile/train/pile_train",
    "data/slim_pajama/train_300B/ArXiv/ArXiv",
    "data/slim_pajama/train_300B/Book/Book",
    "data/slim_pajama/train_300B/C4/C4",
    "data/slim_pajama/train_300B/Wikipedia/Wikipedia",
    "data/slim_pajama/train_300B/Github/Github",
    "data/slim_pajama/train_300B/StackExchange/StackExchange",
    "data/slim_pajama/train_300B/CommonCrawl/CommonCrawl"
  ],
  # One weight per entry of "train-data-paths", listed in the same order.
  "train-data-weights": [
    50.0,
    2.2140923205,
    2.101565663,
    13.344249736,
    1.9986465625,
    2.612070528,
    1.6855393625,
    26.0438358255
  ],
  "train-dataset-name": "pile+slim_pajama_300B_each",
  "train-iters": 264732,
  "lr-decay-iters": 264732
}
33 changes: 33 additions & 0 deletions configs/datasets/train/pile_shard0.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
  "train-data-paths": [
    "data/pile/shard_0/shard_0_text_document"
  ],
  "train-data-weights": [
    1.0
  ],
  "train-dataset-name": "pile_shard0",
  "train-iters": 1000,
  "lr-decay-iters": 1000,
  # NOTE(review): replay is switched on both here and through "enabled"
  # inside "replay_config" -- confirm that both flags are required.
  "is_replay_enabled": true,
  "replay_config": {
    "enabled": true,
    # The index-map filename prefixes from the original pretraining run must
    # be listed here: those files contain the number of iterations and the
    # seen indices. This assumes the same (non-replay) seed is used as during
    # pretraining.
    "replay_idx_paths_prefixes": [
      "data/pile/shard_0/shard_0_text_document_train_0_indexmap_32160ns_2048sl_1234s"
    ],
    "replay_data_weights": [
      1.0
    ],
    "replay_idx_offsets": [
      1
    ],
    # Fraction of samples drawn from the replay buffer; must lie in [0, 1].
    "replay_fraction": 0.5,
    # The seed and the reshuffle flag go together. With reshuffling disabled
    # the replay data is seen in the same order as it was originally seen;
    # with reshuffling enabled, the seed determines the new order of the
    # already-seen data.
    "replay_seed": 1234,
    "replay_reshuffle_idx": false
  }
}
11 changes: 11 additions & 0 deletions configs/datasets/train/pile_train.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
  # Single-dataset configuration: the Pile training split only.
  "train-data-paths": [
    "data/pile/train/pile_train"
  ],
  "train-data-weights": [
    1.0
  ],
  "train-dataset-name": "pile_train",
  "train-iters": 132366,
  "lr-decay-iters": 132366
}
32 changes: 32 additions & 0 deletions configs/datasets/train/rp.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
  # Weighted mixture of SlimPajama subsets; the five CommonCrawl snapshots
  # are listed as separate datasets.
  # NOTE(review): these are absolute, cluster-specific paths, and unlike the
  # sibling dataset configs this file sets neither "train-iters" nor
  # "lr-decay-iters" -- confirm that both are intended.
  "train-data-paths": [
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/arxiv/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/book/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/c4/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/wikipedia/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/github/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/stackexchange/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2019-30/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2020-05/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2021-04/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2022-05/folder_train/tokenized_text_document",
    "/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2023-06/folder_train/tokenized_text_document"
  ],
  # One weight per path above, in the same order; the weights sum to 100.
  "train-data-weights": [
    2.5,
    4.5,
    15.0,
    4.5,
    4.5,
    2.0,
    13.4,
    13.4,
    13.4,
    13.4,
    13.4
  ],
  "train-dataset-name": "rp"
}
24 changes: 24 additions & 0 deletions configs/datasets/train/slim_pajama_100B_1.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
  # Weighted mixture of the seven SlimPajama subsets from the 0-100B split.
  "train-data-paths": [
    "data/slim_pajama/tokenized_train_0-100B/ArXiv/ArXiv",
    "data/slim_pajama/tokenized_train_0-100B/Book/Book",
    "data/slim_pajama/tokenized_train_0-100B/C4/C4",
    "data/slim_pajama/tokenized_train_0-100B/Wikipedia/Wikipedia",
    "data/slim_pajama/tokenized_train_0-100B/Github/Github",
    "data/slim_pajama/tokenized_train_0-100B/StackExchange/StackExchange",
    "data/slim_pajama/tokenized_train_0-100B/CommonCrawl/CommonCrawl"
  ],
  # One weight per path above, in the same order; the weights sum to 95.0.
  "train-data-weights": [
    3.4703977435152775,
    3.904381603212791,
    25.641950653802013,
    3.804228253591696,
    4.9994643949282045,
    3.1815838172641993,
    49.99799353368582
  ],
  "train-iters": 44229,
  "lr-decay-iters": 44229,
  "train-dataset-name": "slim_pajama_100B_1"
}
28 changes: 28 additions & 0 deletions configs/datasets/train/slim_pajama_100B_1_replay5.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
  # Weighted mixture: the seven SlimPajama subsets from the 0-100B split,
  # followed by one Pile replay shard.
  "train-data-paths": [
    "data/slim_pajama/tokenized_train_0-100B/ArXiv/ArXiv",
    "data/slim_pajama/tokenized_train_0-100B/Book/Book",
    "data/slim_pajama/tokenized_train_0-100B/C4/C4",
    "data/slim_pajama/tokenized_train_0-100B/Wikipedia/Wikipedia",
    "data/slim_pajama/tokenized_train_0-100B/Github/Github",
    "data/slim_pajama/tokenized_train_0-100B/StackExchange/StackExchange",
    "data/slim_pajama/tokenized_train_0-100B/CommonCrawl/CommonCrawl",

    # Pile replay shard
    "data/pile_replay_shards/replay_10B_1/splits"
  ],
  # One weight per path above, in the same order.
  "train-data-weights": [
    # SlimPajama subsets, total: 95.0
    3.4703977435152775,
    3.904381603212791,
    25.641950653802013,
    3.804228253591696,
    4.9994643949282045,
    3.1815838172641993,
    49.99799353368582,

    # Pile replay shard (5.0 out of an overall total of 100)
    5.0
  ],
  "train-iters": 44229,
  "lr-decay-iters": 44229,
  "train-dataset-name": "slim_pajama_100B_1_replay5"
}
24 changes: 24 additions & 0 deletions configs/datasets/train/slim_pajama_100B_2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
  # Weighted mixture of the seven SlimPajama subsets from the 100B-200B split.
  "train-data-paths": [
    "data/slim_pajama/tokenized_train_100B-200B/ArXiv/ArXiv",
    "data/slim_pajama/tokenized_train_100B-200B/Book/Book",
    "data/slim_pajama/tokenized_train_100B-200B/C4/C4",
    "data/slim_pajama/tokenized_train_100B-200B/Wikipedia/Wikipedia",
    "data/slim_pajama/tokenized_train_100B-200B/Github/Github",
    "data/slim_pajama/tokenized_train_100B-200B/StackExchange/StackExchange",
    "data/slim_pajama/tokenized_train_100B-200B/CommonCrawl/CommonCrawl"
  ],
  # One weight per path above, in the same order; the weights sum to 95.0.
  "train-data-weights": [
    4.03666599074094,
    3.927523855378127,
    25.467175464208918,
    3.7984379710376293,
    4.990226864678155,
    3.1957646326079723,
    49.58420522134826
  ],
  "train-iters": 44229,
  "lr-decay-iters": 44229,
  "train-dataset-name": "slim_pajama_100B_2"
}
45 changes: 45 additions & 0 deletions configs/datasets/train/slim_pajama_100B_2_replay5.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
  # Weighted mixture: the seven SlimPajama subsets from the 100B-200B split,
  # one Pile replay shard, and the seven subsets of the
  # sp_replay_shards/100B_1_shard1 replay shard.
  "train-data-paths": [
    "data/slim_pajama/tokenized_train_100B-200B/ArXiv/ArXiv",
    "data/slim_pajama/tokenized_train_100B-200B/Book/Book",
    "data/slim_pajama/tokenized_train_100B-200B/C4/C4",
    "data/slim_pajama/tokenized_train_100B-200B/Wikipedia/Wikipedia",
    "data/slim_pajama/tokenized_train_100B-200B/Github/Github",
    "data/slim_pajama/tokenized_train_100B-200B/StackExchange/StackExchange",
    "data/slim_pajama/tokenized_train_100B-200B/CommonCrawl/CommonCrawl",

    # Pile replay shard
    "data/pile_replay_shards/replay_10B_2/splits",

    # Replay shard 100B_1_shard1
    "data/sp_replay_shards/100B_1_shard1/ArXiv/ArXiv",
    "data/sp_replay_shards/100B_1_shard1/Book/Book",
    "data/sp_replay_shards/100B_1_shard1/C4/C4",
    "data/sp_replay_shards/100B_1_shard1/Wikipedia/Wikipedia",
    "data/sp_replay_shards/100B_1_shard1/Github/Github",
    "data/sp_replay_shards/100B_1_shard1/StackExchange/StackExchange",
    "data/sp_replay_shards/100B_1_shard1/CommonCrawl/CommonCrawl"
  ],
  # One weight per path above, in the same order. The two replay groups
  # together account for 5.0 (3.8125 + 1.1875) out of an overall total of 100.
  "train-data-weights": [
    # SlimPajama 100B-200B subsets, total: 95.0
    4.03666599074094,
    3.927523855378127,
    25.467175464208918,
    3.7984379710376293,
    4.990226864678155,
    3.1957646326079723,
    49.58420522134826,

    # Pile replay shard
    3.8125,

    # Replay shard 100B_1_shard1, total: 1.1875
    0.04337997179394097,
    0.04880477004015989,
    0.3205243831725252,
    0.0475528531698962,
    0.06249330493660256,
    0.03976979771580249,
    0.6249749191710727
  ],
  "train-iters": 44229,
  "lr-decay-iters": 44229,
  "train-dataset-name": "slim_pajama_100B_2_replay5"
}
24 changes: 24 additions & 0 deletions configs/datasets/train/slim_pajama_100B_3.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
  # Weighted mixture of the seven SlimPajama subsets from the 200B-300B split.
  "train-data-paths": [
    "data/slim_pajama/tokenized_train_200B-300B/ArXiv/ArXiv",
    "data/slim_pajama/tokenized_train_200B-300B/Book/Book",
    "data/slim_pajama/tokenized_train_200B-300B/C4/C4",
    "data/slim_pajama/tokenized_train_200B-300B/Wikipedia/Wikipedia",
    "data/slim_pajama/tokenized_train_200B-300B/Github/Github",
    "data/slim_pajama/tokenized_train_200B-300B/StackExchange/StackExchange",
    "data/slim_pajama/tokenized_train_200B-300B/CommonCrawl/CommonCrawl"
  ],
  # One weight per path above, in the same order; the weights sum to 95.0.
  "train-data-weights": [
    3.491756366873565,
    4.084283062119696,
    25.524317038754475,
    3.8109321899190314,
    4.89534056131328,
    3.254459546224121,
    49.93891123479581
  ],
  "train-iters": 44229,
  "lr-decay-iters": 44229,
  "train-dataset-name": "slim_pajama_100B_3"
}
61 changes: 61 additions & 0 deletions configs/datasets/train/slim_pajama_100B_3_replay5.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
  # Weighted mixture: the seven SlimPajama subsets from the 200B-300B split,
  # one Pile replay shard, and the seven subsets of each of the
  # sp_replay_shards/100B_1_shard2 and sp_replay_shards/100B_2_shard1
  # replay shards.
  "train-data-paths": [
    "data/slim_pajama/tokenized_train_200B-300B/ArXiv/ArXiv",
    "data/slim_pajama/tokenized_train_200B-300B/Book/Book",
    "data/slim_pajama/tokenized_train_200B-300B/C4/C4",
    "data/slim_pajama/tokenized_train_200B-300B/Wikipedia/Wikipedia",
    "data/slim_pajama/tokenized_train_200B-300B/Github/Github",
    "data/slim_pajama/tokenized_train_200B-300B/StackExchange/StackExchange",
    "data/slim_pajama/tokenized_train_200B-300B/CommonCrawl/CommonCrawl",

    # Pile replay shard
    "data/pile_replay_shards/replay_10B_3/splits",

    # Replay shard 100B_1_shard2
    "data/sp_replay_shards/100B_1_shard2/ArXiv/ArXiv",
    "data/sp_replay_shards/100B_1_shard2/Book/Book",
    "data/sp_replay_shards/100B_1_shard2/C4/C4",
    "data/sp_replay_shards/100B_1_shard2/Wikipedia/Wikipedia",
    "data/sp_replay_shards/100B_1_shard2/Github/Github",
    "data/sp_replay_shards/100B_1_shard2/StackExchange/StackExchange",
    "data/sp_replay_shards/100B_1_shard2/CommonCrawl/CommonCrawl",

    # Replay shard 100B_2_shard1
    "data/sp_replay_shards/100B_2_shard1/ArXiv/ArXiv",
    "data/sp_replay_shards/100B_2_shard1/Book/Book",
    "data/sp_replay_shards/100B_2_shard1/C4/C4",
    "data/sp_replay_shards/100B_2_shard1/Wikipedia/Wikipedia",
    "data/sp_replay_shards/100B_2_shard1/Github/Github",
    "data/sp_replay_shards/100B_2_shard1/StackExchange/StackExchange",
    "data/sp_replay_shards/100B_2_shard1/CommonCrawl/CommonCrawl"
  ],
  # One weight per path above, in the same order. The three replay groups
  # together account for 5.0 (3.088125 + 0.961875 + 0.95) out of an overall
  # total of 100.
  "train-data-weights": [
    # SlimPajama 200B-300B subsets, total: 95.0
    3.491756366873565,
    4.084283062119696,
    25.524317038754475,
    3.8109321899190314,
    4.89534056131328,
    3.254459546224121,
    49.93891123479581,

    # Pile replay shard
    3.088125,

    # Replay shard 100B_1_shard2, total: 0.961875
    0.03513777715309219,
    0.03953186373252951,
    0.2596247503697454,
    0.03851781106761592,
    0.05061957699864807,
    0.03221353614980002,
    0.506229684528569,

    # Replay shard 100B_2_shard1, total: 0.95
    0.0403666599074094,
    0.03927523855378127,
    0.25467175464208913,
    0.03798437971037629,
    0.049902268646781545,
    0.03195764632607972,
    0.4958420522134826
  ],
  "train-iters": 44229,
  "lr-decay-iters": 44229,
  "train-dataset-name": "slim_pajama_100B_3_replay5"
}
23 changes: 23 additions & 0 deletions configs/datasets/train/slim_pajama_150B.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
  # Weighted mixture of the seven SlimPajama subsets from the train_150B split.
  "train-data-paths": [
    "data/slim_pajama/train_150B/ArXiv/ArXiv",
    "data/slim_pajama/train_150B/Book/Book",
    "data/slim_pajama/train_150B/C4/C4",
    "data/slim_pajama/train_150B/Wikipedia/Wikipedia",
    "data/slim_pajama/train_150B/Github/Github",
    "data/slim_pajama/train_150B/StackExchange/StackExchange",
    "data/slim_pajama/train_150B/CommonCrawl/CommonCrawl"
  ],
  # One weight per path above, in the same order; the weights sum to ~100.
  "train-data-weights": [
    4.576447650075095,
    4.198505982426652,
    26.62982374026485,
    3.9945183507095225,
    5.218824282422116,
    3.372167199706489,
    52.00971279439528
  ],
  "train-dataset-name": "slim_pajama_150B",
  "train-iters": 66342,
  "lr-decay-iters": 66342
}
24 changes: 24 additions & 0 deletions configs/datasets/train/slim_pajama_200B_1.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
  # Weighted mixture of the seven SlimPajama subsets from the 0-200B split.
  "train-data-paths": [
    "data/slim_pajama/tokenized_train_0-200B/ArXiv/ArXiv",
    "data/slim_pajama/tokenized_train_0-200B/Book/Book",
    "data/slim_pajama/tokenized_train_0-200B/C4/C4",
    "data/slim_pajama/tokenized_train_0-200B/Wikipedia/Wikipedia",
    "data/slim_pajama/tokenized_train_0-200B/Github/Github",
    "data/slim_pajama/tokenized_train_0-200B/StackExchange/StackExchange",
    "data/slim_pajama/tokenized_train_0-200B/CommonCrawl/CommonCrawl"
  ],
  # One weight per path above, in the same order; the weights sum to 95.0.
  # NOTE(review): these seven weights are identical to the ones in
  # slim_pajama_100B_1.yml even though the paths point at the 0-200B split --
  # confirm that they were recomputed for this split.
  "train-data-weights": [
    3.4703977435152775,
    3.904381603212791,
    25.641950653802013,
    3.804228253591696,
    4.9994643949282045,
    3.1815838172641993,
    49.99799353368582
  ],
  "train-iters": 88457,
  "lr-decay-iters": 88457,
  "train-dataset-name": "slim_pajama_200B_1"
}
Loading