-
Notifications
You must be signed in to change notification settings - Fork 12
/
price-openai.py
168 lines (143 loc) · 5.57 KB
/
price-openai.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import json
import tiktoken
import numpy as np
from collections import defaultdict
from rich.console import Console
from rich.panel import Panel
from rich.console import Group
from rich.align import Align
from rich import box
from rich.markdown import Markdown
console = Console()

# Path to the JSONL training file: one chat-format example per line.
data_path = "openai-finetune-train.jsonl"

# Skip blank lines so a trailing newline in the file doesn't crash json.loads.
with open(data_path, encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Bug fix: the original loop (`for message in ...: example_message_msg = message`)
# retained only the LAST message of the first example, yet the report labels it
# "First Example". Capture every message of the first example instead.
example_message_msg = "\n".join(str(message) for message in dataset[0]["messages"])
# Validate every example against the chat fine-tuning schema and tally each
# kind of problem (mirrors the OpenAI cookbook format checks).
format_errors = defaultdict(int)
for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        if any(k not in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1
        if message.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1
        content = message.get("content", None)
        if not content or not isinstance(content, str):
            format_errors["missing_content"] += 1
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

# Bug fix: the original overwrote `format_error_found` on every loop pass, so
# only the LAST error kind survived into the report (and the "Found errors:"
# print carried no detail). Join every tally into one summary string.
if format_errors:
    format_error_found = ", ".join(f"{k}: {v}" for k, v in format_errors.items())
else:
    format_error_found = "No errors found"
encoding = tiktoken.get_encoding("cl100k_base")


def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the total prompt token count of one chat example.

    Follows the OpenAI cookbook heuristic: a fixed per-message overhead,
    plus the encoded length of every field value, plus one extra token
    whenever a "name" field is present, plus a trailing 3-token primer
    for the assistant's reply.
    """
    total = 0
    for msg in messages:
        total += tokens_per_message
        for field, value in msg.items():
            total += len(encoding.encode(str(value)))
            if field == "name":
                total += tokens_per_name
    # Every reply is primed with <|start|>assistant<|message|> → 3 tokens.
    return total + 3
def num_assistant_tokens_from_messages(messages):
    """Return the total encoded token count of all assistant messages."""
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )
def print_distribution(values, name):
    """Return a multi-line summary (min/max, mean/median, p5/p95) of *values*.

    Parameters
    ----------
    values : non-empty sequence of numbers
    name : label used in the summary heading

    Returns
    -------
    str : the formatted distribution report.
    """
    # Bug fix: the label reads "p5 / p95" but the original computed the
    # 10th/90th percentiles (np.quantile 0.1/0.9). Use 0.05/0.95 so the
    # printed values match the label.
    distribution_return_value = f"""
Distribution of {name}:
min / max: {min(values)}, {max(values)}
mean / median: {np.mean(values)}, {np.median(values)}
p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}
"""
    return distribution_return_value
# Per-example statistics: role coverage, message counts, and token lengths.
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    msgs = ex["messages"]
    roles = {m["role"] for m in msgs}
    if "system" not in roles:
        n_missing_system += 1
    if "user" not in roles:
        n_missing_user += 1
    n_messages.append(len(msgs))
    convo_lens.append(num_tokens_from_messages(msgs))
    assistant_message_lens.append(num_assistant_tokens_from_messages(msgs))

n_msg_distribution_return_value = print_distribution(n_messages, "num_messages_per_example")
convo_lens_distribution_return_value = print_distribution(convo_lens, "num_total_tokens_per_example")
assistant_message_lens_distribution_return_value = print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
# Fine-tuning defaults taken from the OpenAI cookbook cost estimator.
MAX_TOKENS_PER_EXAMPLE = 4096
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
TARGET_EPOCHS = 3
MIN_EPOCHS = 3
MAX_EPOCHS = 25

# Consistency fix: the original hard-coded 4096 here, one line before
# declaring MAX_TOKENS_PER_EXAMPLE = 4096 — use the named constant.
n_too_long = sum(length > MAX_TOKENS_PER_EXAMPLE for length in convo_lens)

# Scale the epoch count so total trained examples stay within the
# [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES] band.
n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

# Each example is billed up to the per-example token cap.
n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
total_count = n_epochs * n_billing_tokens_in_dataset
# Assemble the Markdown report body; the f-string pulls in every statistic
# computed above (example preview, validation summary, distributions, and
# the GPT-3.5-turbo cost estimate at $0.0080 per 1000 tokens).
example_message = f"""
- `Num of Examples`: {len(dataset)}
- `First Example`:
```
{example_message_msg}
```
-------------
- `Error`: {format_error_found}
-------------
- `Num examples missing user message`: {n_missing_user}
- `Num examples missing system message`: {n_missing_system}
-------------
`Distribution values`:
```
{n_msg_distribution_return_value}
{convo_lens_distribution_return_value}
{assistant_message_lens_distribution_return_value}
```
-------------
- {n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning
- Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training
- By default, you'll train for {n_epochs} epochs on this dataset
- By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens
- The estimate cost for 1000 tokens is ~$0.0080 in GPT3.5 turbo considering:
```
Total Tokens: ~{total_count}
Number of Epochs: {n_epochs}
The model used: GPT3.5 turbo
The amount: ~${(total_count/1000)*0.0080}
```
"""
# Render the Markdown centered inside a rounded, blue-bordered rich panel.
mark_msg = Markdown(example_message)
message_panel = Panel(
    Align.center(
        # Leading "\n" adds breathing room above the report text.
        Group("\n", Align.center(mark_msg)),
        vertical="middle",
    ),
    box=box.ROUNDED,
    padding=(1, 2),
    title="[b red]OPENAI fine tune calculations",
    border_style="blue",
)
console.print(message_panel)