
Commit f08076a

Proper conversion for HF spaces, diffusers, transformers
1 parent 88679d5 commit f08076a

12 files changed, +98,900 −0 lines changed
@@ -0,0 +1,171 @@
{
  "_name_or_path": "clip-vit-large-patch14/",
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 4.6052,
  "model_type": "clip",
  "projection_dim": 768,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 77,
    "min_length": 0,
    "model_type": "clip_text_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 768,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.44.2",
    "use_bfloat16": false,
    "vocab_size": 49408
  },
  "text_config_dict": {
    "hidden_size": 768,
    "intermediate_size": 3072,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "projection_dim": 768
  },
  "torch_dtype": "float32",
  "transformers_version": null,
  "vision_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 1024,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_size": 224,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "clip_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 16,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 24,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 14,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 768,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.44.2",
    "use_bfloat16": false
  },
  "vision_config_dict": {
    "hidden_size": 1024,
    "intermediate_size": 4096,
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "patch_size": 14,
    "projection_dim": 768
  }
}
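
The config.json above describes a full CLIPModel for CLIP ViT-L/14 with a 768-dim projection; it is the config the conversion script below can consume via --config_path. A minimal sanity-check sketch, assuming the file has been saved into a local directory (the path below is a placeholder, not part of this commit):

from transformers import CLIPConfig, CLIPModel

# Load the config shipped in this commit (placeholder path for wherever config.json lives).
config = CLIPConfig.from_pretrained("path/to/config_dir")

# The sub-configs should report the CLIP-L geometry and the 768-dim projection.
print(config.projection_dim)           # 768
print(config.text_config.hidden_size)  # 768
print(config.vision_config.hidden_size)  # 1024

# Instantiating from this config gives randomly initialized weights;
# the conversion script below fills them in from an OpenAI CLIP checkpoint.
model = CLIPModel(config)
print(sum(p.numel() for p in model.parameters()))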
@@ -0,0 +1,156 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import torch
from clip import load

from transformers import CLIPConfig, CLIPModel


def copy_attn_layer(hf_attn_layer, pt_attn_layer):
    q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0)
    q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0)

    out_proj_weights = pt_attn_layer.out_proj.weight
    out_proj_bias = pt_attn_layer.out_proj.bias

    hf_attn_layer.q_proj.weight.data = q_proj
    hf_attn_layer.q_proj.bias.data = q_proj_bias

    hf_attn_layer.k_proj.weight.data = k_proj
    hf_attn_layer.k_proj.bias.data = k_proj_bias

    hf_attn_layer.v_proj.weight.data = v_proj
    hf_attn_layer.v_proj.bias.data = v_proj_bias

    hf_attn_layer.out_proj.weight = out_proj_weights
    hf_attn_layer.out_proj.bias = out_proj_bias


def copy_mlp(hf_mlp, pt_mlp):
    copy_linear(hf_mlp.fc1, pt_mlp.c_fc)
    copy_linear(hf_mlp.fc2, pt_mlp.c_proj)


def copy_linear(hf_linear, pt_linear):
    hf_linear.weight = pt_linear.weight
    hf_linear.bias = pt_linear.bias


def copy_layer(hf_layer, pt_layer):
    # copy layer norms
    copy_linear(hf_layer.layer_norm1, pt_layer.ln_1)
    copy_linear(hf_layer.layer_norm2, pt_layer.ln_2)

    # copy MLP
    copy_mlp(hf_layer.mlp, pt_layer.mlp)

    # copy attn
    copy_attn_layer(hf_layer.self_attn, pt_layer.attn)


def copy_layers(hf_layers, pt_layers):
    for hf_layer, pt_layer in zip(hf_layers, pt_layers):
        copy_layer(hf_layer, pt_layer)


def copy_encoder(hf_encoder, pt_model):
    # copy embeds
    hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight
    hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding

    # copy layer norm
    copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final)

    # copy hidden layers
    copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks)


def copy_text_model_and_projection(hf_model, pt_model):
    # copy projection
    hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous()

    # copy text encoder
    copy_encoder(hf_model.text_model, pt_model)


def copy_vision_model_and_projection(hf_model, pt_model):
    # copy projection
    hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous()

    # copy layer norms
    copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre)
    copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post)

    # copy embeds
    hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data
    hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding
    hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data

    # copy encoder
    copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks)


@torch.no_grad()
def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak the model's weights into the transformers design.
    """
    if config_path is not None:
        config = CLIPConfig.from_pretrained(config_path)
    else:
        config = CLIPConfig(projection_dim=512, text_config={}, vision_config={})

    hf_model = CLIPModel(config).eval()

    pt_model, _ = load(checkpoint_path, device="cpu", jit=False)
    pt_model = pt_model.eval()

    copy_text_model_and_projection(hf_model, pt_model)
    copy_vision_model_and_projection(hf_model, pt_model)
    hf_model.logit_scale = pt_model.logit_scale

    # Use `eos_token` so the example is more meaningful
    input_ids = torch.tensor(
        [
            [config.text_config.bos_token_id]
            + list(range(3, 77))
            + [config.text_config.eos_token_id]
            + [config.text_config.pad_token_id]
        ]
    )
    pixel_values = torch.randn(1, 3, 224, 224)

    hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True)
    hf_logits_per_image = hf_outputs.logits_per_image
    hf_logits_per_text = hf_outputs.logits_per_text
    pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids)

    assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3)
    assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3)

    hf_model.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the original OpenAI CLIP checkpoint")
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    args = parser.parse_args()

    convert_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)
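
After the script writes the converted model with save_pretrained (for example when invoked with --config_path pointing at the config.json above; the exact script and checkpoint file names you pass are up to you), the output folder can be consumed directly by transformers. A hedged usage sketch, with placeholder paths:

from transformers import CLIPModel, CLIPTextModel, CLIPTokenizer

# Full dual-tower model, as saved by convert_clip_checkpoint(...).
clip = CLIPModel.from_pretrained("path/to/pytorch_dump_folder")

# Text tower only, which is what diffusers pipelines such as Flux.1
# expect as their CLIP text_encoder component.
text_encoder = CLIPTextModel.from_pretrained("path/to/pytorch_dump_folder")

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
inputs = tokenizer(["a photo of a cat"], padding="max_length", max_length=77, return_tensors="pt")
out = text_encoder(**inputs)
print(out.last_hidden_state.shape)  # expected: (1, 77, 768)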
@@ -0,0 +1,48 @@
import torch
from safetensors.torch import load_file, save_file

"""
"model.safetensors" is the ORIGINAL CLIP-L text-encoder-only model. Get it from HuggingFace,
for example the one shipped with Flux.1:
https://huggingface.co/black-forest-labs/FLUX.1-dev/tree/main/text_encoder
This matters because we compare key names against it, discarding any key that should not be
present in a text-encoder-only model as expected by the target (here: Flux.1).
"""
original_state_dict = load_file("model.safetensors")

# Your fine-tune, after conversion with the HuggingFace PyTorch-to-HF script:
finetuned_state_dict = load_file("my-finetune.safetensors")

# Build a new text-encoder state dict with matching keys AND matching precision (dtype).
# If you have any issues with this, please use the standard "extract-TE.py" script.
filtered_state_dict = {}
for key, tensor in finetuned_state_dict.items():
    if key in original_state_dict:
        # Match precision (dtype) of the original tensor
        target_dtype = original_state_dict[key].dtype
        if tensor.dtype != target_dtype:
            tensor = tensor.to(target_dtype)
        filtered_state_dict[key] = tensor

# Save the filtered state dictionary with matched precision
save_file(filtered_state_dict, "my-finetune_TE-only_dtype.safetensors")

# Load the saved text encoder model back for verification
filtered_loaded_state_dict = load_file("my-finetune_TE-only_dtype.safetensors")

# Function to compare the model structures
def compare_models(model1, model2):
    """Compare two model state dictionaries by key, shape, and dtype."""
    print(f"{'Key':<50} {'Model 1 Shape':<30} {'Model 2 Shape':<30} {'Match'}")
    print("-" * 130)
    for key in model1.keys() | model2.keys():
        tensor1 = model1.get(key, None)
        tensor2 = model2.get(key, None)
        if tensor1 is not None and tensor2 is not None:
            match = tensor1.shape == tensor2.shape and tensor1.dtype == tensor2.dtype
            print(f"{key:<50} {str(tensor1.shape):<30} {str(tensor2.shape):<30} {match}")
        else:
            print(f"{key:<50} {'N/A' if tensor1 is None else str(tensor1.shape):<30} "
                  f"{'N/A' if tensor2 is None else str(tensor2.shape):<30} {'No'}")

# Perform comparison
compare_models(original_state_dict, filtered_loaded_state_dict)
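
Beyond comparing shapes and dtypes, a quick end-to-end check is to load the filtered state dict into a fresh transformers CLIPTextModel built from the CLIP-L text config; with Flux.1-style key names every key should resolve. A minimal sketch under that assumption (model id and file name are illustrative):

from safetensors.torch import load_file
from transformers import CLIPTextConfig, CLIPTextModel

# Text-encoder config matching CLIP-L (77 positions, hidden_size 768, 12 layers).
text_config = CLIPTextConfig.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel(text_config)

state_dict = load_file("my-finetune_TE-only_dtype.safetensors")

# strict=False returns the lists of missing/unexpected keys instead of raising,
# so we can inspect exactly what (if anything) did not line up after filtering.
missing, unexpected = text_encoder.load_state_dict(state_dict, strict=False)
print("missing keys:", missing)
print("unexpected keys:", unexpected)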
@@ -0,0 +1,19 @@
import torch
from safetensors.torch import load_file, save_file

"""
"model.safetensors" is the ORIGINAL CLIP-L text-encoder-only model. Get it from HuggingFace,
for example the one shipped with Flux.1:
https://huggingface.co/black-forest-labs/FLUX.1-dev/tree/main/text_encoder
This matters because we compare key names against it, discarding any key that should not be
present in a text-encoder-only model as expected by the target (here: Flux.1).
"""
original_state_dict = load_file("model.safetensors")

# Your fine-tune, after conversion with the HuggingFace PyTorch-to-HF script:
finetuned_state_dict = load_file("my-finetune.safetensors")

# Build a new text-encoder state dict: keep only keys that also exist in the original model.safetensors.
filtered_state_dict = {k: v for k, v in finetuned_state_dict.items() if k in original_state_dict}

# Save the filtered state dictionary
save_file(filtered_state_dict, "my-finetune_TE-only.safetensors")
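
Once the text-encoder-only file is saved, it can be swapped into a diffusers pipeline by loading it into a CLIPTextModel and passing that in as the text_encoder component. A hedged sketch for Flux.1; the file name and dtype choice are illustrative, and the repo id is the public FLUX.1-dev checkpoint:

import torch
from safetensors.torch import load_file
from transformers import CLIPTextModel
from diffusers import FluxPipeline

# Start from the stock Flux.1 CLIP-L text encoder, then overwrite its
# weights with the extracted fine-tuned ones.
text_encoder = CLIPTextModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder="text_encoder", torch_dtype=torch.bfloat16
)
state_dict = load_file("my-finetune_TE-only.safetensors")
text_encoder.load_state_dict(state_dict)

# Hand the customized text encoder to the pipeline as a component override.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", text_encoder=text_encoder, torch_dtype=torch.bfloat16
)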
