Gengzigang
committed on
Commit
·
9dd61e3
1
Parent(s):
0a00000
update HF model
Browse files- .gitattributes +1 -0
- README.md +60 -2
- config.json +6 -158
- configuration_clip.py +79 -78
- pytorch_model.bin → model.safetensors +2 -2
- modeling_clip.py +109 -60
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
model.safetensors filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
---
|
2 |
license: apache-2.0
|
|
|
|
|
|
|
|
|
3 |
---
|
4 |
<div align="center">
|
5 |
|
@@ -29,6 +33,7 @@ In this paper, we propose LLM2CLIP, a novel approach that embraces the power of
|
|
29 |
## Usage
|
30 |
|
31 |
### Huggingface Version
|
|
|
32 |
```python
|
33 |
from PIL import Image
|
34 |
from transformers import AutoModel
|
@@ -37,9 +42,8 @@ import torch
|
|
37 |
|
38 |
image_path = "CLIP.png"
|
39 |
model_name_or_path = "LLM2CLIP-Openai-B-16" # or /path/to/local/LLM2CLIP-Openai-B-16
|
40 |
-
image_size = 224
|
41 |
|
42 |
-
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-
|
43 |
model = AutoModel.from_pretrained(
|
44 |
model_name_or_path,
|
45 |
torch_dtype=torch.float16,
|
@@ -51,5 +55,59 @@ input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cud
|
|
51 |
with torch.no_grad(), torch.cuda.amp.autocast():
|
52 |
outputs = model.get_image_features(input_pixels)
|
53 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
## BibTeX & Citation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
license: apache-2.0
|
3 |
+
tags:
|
4 |
+
- CLIP
|
5 |
+
- LLM2CLIP
|
6 |
+
pipeline_tag: zero-shot-classification
|
7 |
---
|
8 |
<div align="center">
|
9 |
|
|
|
33 |
## Usage
|
34 |
|
35 |
### Huggingface Version
|
36 |
+
Image Embeddings
|
37 |
```python
|
38 |
from PIL import Image
|
39 |
from transformers import AutoModel
|
|
|
42 |
|
43 |
image_path = "CLIP.png"
|
44 |
model_name_or_path = "LLM2CLIP-Openai-B-16" # or /path/to/local/LLM2CLIP-Openai-B-16
|
|
|
45 |
|
46 |
+
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch16")
|
47 |
model = AutoModel.from_pretrained(
|
48 |
model_name_or_path,
|
49 |
torch_dtype=torch.float16,
|
|
|
55 |
with torch.no_grad(), torch.cuda.amp.autocast():
|
56 |
outputs = model.get_image_features(input_pixels)
|
57 |
```
|
58 |
+
Retrieval
|
59 |
+
```python
|
60 |
+
from PIL import Image
|
61 |
+
from transformers import AutoModel, AutoConfig, AutoTokenizer
|
62 |
+
from transformers import CLIPImageProcessor
|
63 |
+
import torch
|
64 |
+
from llm2vec import LLM2Vec
|
65 |
+
|
66 |
+
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch16")
|
67 |
+
model_name_or_path = "microsoft/LLM2CLIP-Openai-B-16" # or /path/to/local/LLM2CLIP-Openai-B-16
|
68 |
+
model = AutoModel.from_pretrained(
|
69 |
+
model_name_or_path,
|
70 |
+
torch_dtype=torch.float16,
|
71 |
+
trust_remote_code=True).to('cuda').eval()
|
72 |
+
|
73 |
+
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
|
74 |
+
config = AutoConfig.from_pretrained(
|
75 |
+
llm_model_name, trust_remote_code=True
|
76 |
+
)
|
77 |
+
llm_model = AutoModel.from_pretrained(llm_model_name, config=config,trust_remote_code=True)
|
78 |
+
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
|
79 |
+
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
|
80 |
+
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
|
81 |
+
|
82 |
+
captions = ["a diagram", "a dog", "a cat"]
|
83 |
+
image_path = "CLIP.png"
|
84 |
+
|
85 |
+
image = Image.open(image_path)
|
86 |
+
input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
|
87 |
+
|
88 |
+
with torch.no_grad(), torch.cuda.amp.autocast():
|
89 |
+
image_features = model.get_image_features(input_pixels)
|
90 |
+
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
|
91 |
+
text_features = model.get_text_features(text_features)
|
92 |
+
|
93 |
+
image_features /= image_features.norm(dim=-1, keepdim=True)
|
94 |
+
text_features /= text_features.norm(dim=-1, keepdim=True)
|
95 |
+
|
96 |
+
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
|
97 |
+
|
98 |
+
print("Label probs:", text_probs)
|
99 |
+
|
100 |
+
```
|
101 |
|
102 |
## BibTeX & Citation
|
103 |
+
|
104 |
+
```
|
105 |
+
@misc{huang2024llm2clippowerfullanguagemodel,
|
106 |
+
title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
|
107 |
+
author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
|
108 |
+
year={2024},
|
109 |
+
eprint={2411.04997},
|
110 |
+
archivePrefix={arXiv},
|
111 |
+
primaryClass={cs.CV},
|
112 |
+
url={https://arxiv.org/abs/2411.04997},
|
113 |
+
}
|
config.json
CHANGED
@@ -1,179 +1,27 @@
|
|
1 |
{
|
2 |
-
"_commit_hash": null,
|
3 |
-
"_name_or_path": "LLM2CLIP-Openai-L-14",
|
4 |
"architectures": [
|
5 |
-
"
|
6 |
],
|
7 |
"auto_map": {
|
8 |
"AutoConfig": "configuration_clip.CLIPConfig",
|
9 |
-
"AutoModel": "modeling_clip.
|
10 |
},
|
11 |
"initializer_factor": 1.0,
|
12 |
"logit_scale_init_value": 2.6592,
|
13 |
"model_type": "clip",
|
14 |
"projection_dim": 1280,
|
15 |
"text_config": {
|
16 |
-
"_name_or_path": "",
|
17 |
-
"add_cross_attention": false,
|
18 |
-
"architectures": null,
|
19 |
-
"attention_dropout": 0.0,
|
20 |
-
"bad_words_ids": null,
|
21 |
-
"begin_suppress_tokens": null,
|
22 |
"bos_token_id": 0,
|
23 |
-
"
|
24 |
-
"cross_attention_hidden_size": null,
|
25 |
-
"decoder_start_token_id": null,
|
26 |
-
"diversity_penalty": 0.0,
|
27 |
-
"do_sample": false,
|
28 |
-
"early_stopping": false,
|
29 |
-
"encoder_no_repeat_ngram_size": 0,
|
30 |
"eos_token_id": 2,
|
31 |
-
"exponential_decay_length_penalty": null,
|
32 |
-
"finetuning_task": null,
|
33 |
-
"forced_bos_token_id": null,
|
34 |
-
"forced_eos_token_id": null,
|
35 |
-
"hidden_act": "gelu",
|
36 |
-
"hidden_size": 512,
|
37 |
-
"id2label": {
|
38 |
-
"0": "LABEL_0",
|
39 |
-
"1": "LABEL_1"
|
40 |
-
},
|
41 |
-
"initializer_factor": 1.0,
|
42 |
-
"initializer_range": 0.02,
|
43 |
-
"intermediate_size": 2048,
|
44 |
-
"is_decoder": false,
|
45 |
-
"is_encoder_decoder": false,
|
46 |
-
"k_bias": true,
|
47 |
-
"label2id": {
|
48 |
-
"LABEL_0": 0,
|
49 |
-
"LABEL_1": 1
|
50 |
-
},
|
51 |
-
"layer_norm_eps": 1e-05,
|
52 |
-
"length_penalty": 1.0,
|
53 |
-
"max_length": 20,
|
54 |
-
"max_position_embeddings": 77,
|
55 |
-
"min_length": 0,
|
56 |
"model_type": "clip_text_model",
|
57 |
-
"
|
58 |
-
"num_attention_heads": 8,
|
59 |
-
"num_beam_groups": 1,
|
60 |
-
"num_beams": 1,
|
61 |
-
"num_hidden_layers": 12,
|
62 |
-
"num_return_sequences": 1,
|
63 |
-
"output_attentions": false,
|
64 |
-
"output_hidden_states": false,
|
65 |
-
"output_scores": false,
|
66 |
-
"pad_token_id": 1,
|
67 |
-
"post_layernorm": false,
|
68 |
-
"prefix": null,
|
69 |
-
"problem_type": null,
|
70 |
-
"projection_dim": 512,
|
71 |
-
"pruned_heads": {},
|
72 |
-
"q_bias": true,
|
73 |
-
"remove_invalid_values": false,
|
74 |
-
"repetition_penalty": 1.0,
|
75 |
-
"return_dict": true,
|
76 |
-
"return_dict_in_generate": false,
|
77 |
-
"sep_token_id": null,
|
78 |
-
"suppress_tokens": null,
|
79 |
-
"task_specific_params": null,
|
80 |
-
"temperature": 1.0,
|
81 |
-
"tf_legacy_loss": false,
|
82 |
-
"tie_encoder_decoder": false,
|
83 |
-
"tie_word_embeddings": true,
|
84 |
-
"tokenizer_class": null,
|
85 |
-
"top_k": 50,
|
86 |
-
"top_p": 1.0,
|
87 |
-
"torch_dtype": null,
|
88 |
-
"torchscript": false,
|
89 |
-
"transformers_version": "4.44.2",
|
90 |
-
"typical_p": 1.0,
|
91 |
-
"use_bfloat16": false,
|
92 |
-
"v_bias": true,
|
93 |
-
"vocab_size": 49408
|
94 |
},
|
95 |
"torch_dtype": "float32",
|
96 |
-
"transformers_version":
|
97 |
"vision_config": {
|
98 |
-
"_name_or_path": "",
|
99 |
-
"add_cross_attention": false,
|
100 |
-
"architectures": null,
|
101 |
-
"attention_dropout": 0.0,
|
102 |
-
"bad_words_ids": null,
|
103 |
-
"begin_suppress_tokens": null,
|
104 |
-
"bos_token_id": null,
|
105 |
-
"chunk_size_feed_forward": 0,
|
106 |
-
"cross_attention_hidden_size": null,
|
107 |
-
"decoder_start_token_id": null,
|
108 |
-
"diversity_penalty": 0.0,
|
109 |
-
"do_sample": false,
|
110 |
"dropout": 0.0,
|
111 |
-
"early_stopping": false,
|
112 |
-
"encoder_no_repeat_ngram_size": 0,
|
113 |
-
"eos_token_id": null,
|
114 |
-
"exponential_decay_length_penalty": null,
|
115 |
-
"finetuning_task": null,
|
116 |
-
"forced_bos_token_id": null,
|
117 |
-
"forced_eos_token_id": null,
|
118 |
-
"hidden_act": "gelu",
|
119 |
-
"hidden_size": 768,
|
120 |
-
"id2label": {
|
121 |
-
"0": "LABEL_0",
|
122 |
-
"1": "LABEL_1"
|
123 |
-
},
|
124 |
-
"image_size": 224,
|
125 |
-
"initializer_factor": 1.0,
|
126 |
-
"initializer_range": 0.02,
|
127 |
-
"intermediate_size": 3072,
|
128 |
-
"is_decoder": false,
|
129 |
-
"is_encoder_decoder": false,
|
130 |
-
"k_bias": true,
|
131 |
-
"label2id": {
|
132 |
-
"LABEL_0": 0,
|
133 |
-
"LABEL_1": 1
|
134 |
-
},
|
135 |
-
"layer_norm_eps": 1e-05,
|
136 |
-
"length_penalty": 1.0,
|
137 |
-
"max_length": 20,
|
138 |
-
"min_length": 0,
|
139 |
"model_type": "clip_vision_model",
|
140 |
-
"
|
141 |
-
"num_attention_heads": 12,
|
142 |
-
"num_beam_groups": 1,
|
143 |
-
"num_beams": 1,
|
144 |
-
"num_channels": 3,
|
145 |
-
"num_hidden_layers": 12,
|
146 |
-
"num_return_sequences": 1,
|
147 |
-
"output_attentions": false,
|
148 |
-
"output_hidden_states": false,
|
149 |
-
"output_scores": false,
|
150 |
-
"pad_token_id": null,
|
151 |
-
"patch_size": 16,
|
152 |
-
"post_layernorm": false,
|
153 |
-
"prefix": null,
|
154 |
-
"problem_type": null,
|
155 |
-
"projection_dim": 512,
|
156 |
-
"pruned_heads": {},
|
157 |
-
"q_bias": true,
|
158 |
-
"remove_invalid_values": false,
|
159 |
-
"repetition_penalty": 1.0,
|
160 |
-
"return_dict": true,
|
161 |
-
"return_dict_in_generate": false,
|
162 |
-
"sep_token_id": null,
|
163 |
-
"suppress_tokens": null,
|
164 |
-
"task_specific_params": null,
|
165 |
-
"temperature": 1.0,
|
166 |
-
"tf_legacy_loss": false,
|
167 |
-
"tie_encoder_decoder": false,
|
168 |
-
"tie_word_embeddings": true,
|
169 |
-
"tokenizer_class": null,
|
170 |
-
"top_k": 50,
|
171 |
-
"top_p": 1.0,
|
172 |
-
"torch_dtype": null,
|
173 |
-
"torchscript": false,
|
174 |
-
"transformers_version": "4.44.2",
|
175 |
-
"typical_p": 1.0,
|
176 |
-
"use_bfloat16": false,
|
177 |
-
"v_bias": true
|
178 |
}
|
179 |
}
|
|
|
1 |
{
|
|
|
|
|
2 |
"architectures": [
|
3 |
+
"LLM2CLIPModel"
|
4 |
],
|
5 |
"auto_map": {
|
6 |
"AutoConfig": "configuration_clip.CLIPConfig",
|
7 |
+
"AutoModel": "modeling_clip.LLM2CLIPModel"
|
8 |
},
|
9 |
"initializer_factor": 1.0,
|
10 |
"logit_scale_init_value": 2.6592,
|
11 |
"model_type": "clip",
|
12 |
"projection_dim": 1280,
|
13 |
"text_config": {
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
"bos_token_id": 0,
|
15 |
+
"dropout": 0.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
"eos_token_id": 2,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
"model_type": "clip_text_model",
|
18 |
+
"projection_dim": 1280
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
},
|
20 |
"torch_dtype": "float32",
|
21 |
+
"transformers_version": "4.40.2",
|
22 |
"vision_config": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
"dropout": 0.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
"model_type": "clip_vision_model",
|
25 |
+
"patch_size": 16
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
}
|
27 |
}
|
configuration_clip.py
CHANGED
@@ -26,9 +26,9 @@ if TYPE_CHECKING:
|
|
26 |
from transformers.utils import TensorType
|
27 |
|
28 |
from transformers.configuration_utils import PretrainedConfig
|
|
|
29 |
from transformers.utils import logging
|
30 |
|
31 |
-
|
32 |
logger = logging.get_logger(__name__)
|
33 |
|
34 |
|
@@ -50,25 +50,33 @@ class CLIPTextConfig(PretrainedConfig):
|
|
50 |
Dimensionality of the encoder layers and the pooler layer.
|
51 |
intermediate_size (`int`, *optional*, defaults to 2048):
|
52 |
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
|
|
|
|
53 |
num_hidden_layers (`int`, *optional*, defaults to 12):
|
54 |
Number of hidden layers in the Transformer encoder.
|
55 |
num_attention_heads (`int`, *optional*, defaults to 8):
|
56 |
Number of attention heads for each attention layer in the Transformer encoder.
|
57 |
-
max_position_embeddings (`int`, *optional*, defaults to 77)
|
58 |
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
59 |
just in case (e.g., 512 or 1024 or 2048).
|
60 |
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
61 |
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
62 |
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
63 |
-
layer_norm_eps (`float`, *optional*, defaults to 1e-
|
64 |
The epsilon used by the layer normalization layers.
|
65 |
attention_dropout (`float`, *optional*, defaults to 0.0):
|
66 |
The dropout ratio for the attention probabilities.
|
67 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
68 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
69 |
-
initializer_factor (`float`, *optional*, defaults to 1):
|
70 |
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
71 |
testing).
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
Example:
|
74 |
|
@@ -84,7 +92,9 @@ class CLIPTextConfig(PretrainedConfig):
|
|
84 |
>>> # Accessing the model configuration
|
85 |
>>> configuration = model.config
|
86 |
```"""
|
|
|
87 |
model_type = "clip_text_model"
|
|
|
88 |
|
89 |
def __init__(
|
90 |
self,
|
@@ -95,18 +105,16 @@ class CLIPTextConfig(PretrainedConfig):
|
|
95 |
num_hidden_layers=12,
|
96 |
num_attention_heads=8,
|
97 |
max_position_embeddings=77,
|
98 |
-
hidden_act="
|
99 |
layer_norm_eps=1e-5,
|
100 |
attention_dropout=0.0,
|
101 |
initializer_range=0.02,
|
102 |
initializer_factor=1.0,
|
103 |
-
|
104 |
-
|
105 |
-
v_bias=True,
|
106 |
-
post_layernorm=False,
|
107 |
pad_token_id=1,
|
108 |
-
bos_token_id=
|
109 |
-
eos_token_id=
|
110 |
**kwargs,
|
111 |
):
|
112 |
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
@@ -122,28 +130,8 @@ class CLIPTextConfig(PretrainedConfig):
|
|
122 |
self.hidden_act = hidden_act
|
123 |
self.initializer_range = initializer_range
|
124 |
self.initializer_factor = initializer_factor
|
125 |
-
self.q_bias=q_bias
|
126 |
-
self.k_bias=k_bias
|
127 |
-
self.v_bias=v_bias
|
128 |
-
self.post_layernorm = post_layernorm
|
129 |
self.attention_dropout = attention_dropout
|
130 |
|
131 |
-
@classmethod
|
132 |
-
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
133 |
-
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
134 |
-
|
135 |
-
# get the text config dict if we are loading from CLIPConfig
|
136 |
-
if config_dict.get("model_type") == "clip":
|
137 |
-
config_dict = config_dict["text_config"]
|
138 |
-
|
139 |
-
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
140 |
-
logger.warning(
|
141 |
-
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
142 |
-
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
143 |
-
)
|
144 |
-
|
145 |
-
return cls.from_dict(config_dict, **kwargs)
|
146 |
-
|
147 |
|
148 |
class CLIPVisionConfig(PretrainedConfig):
|
149 |
r"""
|
@@ -160,24 +148,28 @@ class CLIPVisionConfig(PretrainedConfig):
|
|
160 |
Dimensionality of the encoder layers and the pooler layer.
|
161 |
intermediate_size (`int`, *optional*, defaults to 3072):
|
162 |
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
|
|
|
|
163 |
num_hidden_layers (`int`, *optional*, defaults to 12):
|
164 |
Number of hidden layers in the Transformer encoder.
|
165 |
num_attention_heads (`int`, *optional*, defaults to 12):
|
166 |
Number of attention heads for each attention layer in the Transformer encoder.
|
|
|
|
|
167 |
image_size (`int`, *optional*, defaults to 224):
|
168 |
The size (resolution) of each image.
|
169 |
patch_size (`int`, *optional*, defaults to 32):
|
170 |
The size (resolution) of each patch.
|
171 |
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
172 |
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
173 |
-
`"relu"`, `"selu"` and `"gelu_new"`
|
174 |
-
layer_norm_eps (`float`, *optional*, defaults to 1e-
|
175 |
The epsilon used by the layer normalization layers.
|
176 |
attention_dropout (`float`, *optional*, defaults to 0.0):
|
177 |
The dropout ratio for the attention probabilities.
|
178 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
179 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
180 |
-
initializer_factor (`float`, *optional*, defaults to 1):
|
181 |
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
182 |
testing).
|
183 |
|
@@ -197,6 +189,7 @@ class CLIPVisionConfig(PretrainedConfig):
|
|
197 |
```"""
|
198 |
|
199 |
model_type = "clip_vision_model"
|
|
|
200 |
|
201 |
def __init__(
|
202 |
self,
|
@@ -208,15 +201,11 @@ class CLIPVisionConfig(PretrainedConfig):
|
|
208 |
num_channels=3,
|
209 |
image_size=224,
|
210 |
patch_size=32,
|
211 |
-
hidden_act="
|
212 |
layer_norm_eps=1e-5,
|
213 |
attention_dropout=0.0,
|
214 |
initializer_range=0.02,
|
215 |
initializer_factor=1.0,
|
216 |
-
q_bias=True,
|
217 |
-
k_bias=True,
|
218 |
-
v_bias=True,
|
219 |
-
post_layernorm=False,
|
220 |
**kwargs,
|
221 |
):
|
222 |
super().__init__(**kwargs)
|
@@ -231,30 +220,10 @@ class CLIPVisionConfig(PretrainedConfig):
|
|
231 |
self.image_size = image_size
|
232 |
self.initializer_range = initializer_range
|
233 |
self.initializer_factor = initializer_factor
|
234 |
-
self.q_bias=q_bias
|
235 |
-
self.k_bias=k_bias
|
236 |
-
self.v_bias=v_bias
|
237 |
-
self.post_layernorm = post_layernorm
|
238 |
self.attention_dropout = attention_dropout
|
239 |
self.layer_norm_eps = layer_norm_eps
|
240 |
self.hidden_act = hidden_act
|
241 |
|
242 |
-
@classmethod
|
243 |
-
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
244 |
-
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
245 |
-
|
246 |
-
# get the vision config dict if we are loading from CLIPConfig
|
247 |
-
if config_dict.get("model_type") == "clip":
|
248 |
-
config_dict = config_dict["vision_config"]
|
249 |
-
|
250 |
-
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
251 |
-
logger.warning(
|
252 |
-
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
253 |
-
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
254 |
-
)
|
255 |
-
|
256 |
-
return cls.from_dict(config_dict, **kwargs)
|
257 |
-
|
258 |
|
259 |
class CLIPConfig(PretrainedConfig):
|
260 |
r"""
|
@@ -272,9 +241,9 @@ class CLIPConfig(PretrainedConfig):
|
|
272 |
vision_config (`dict`, *optional*):
|
273 |
Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
|
274 |
projection_dim (`int`, *optional*, defaults to 512):
|
275 |
-
|
276 |
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
|
277 |
-
The
|
278 |
kwargs (*optional*):
|
279 |
Dictionary of keyword arguments.
|
280 |
|
@@ -303,7 +272,7 @@ class CLIPConfig(PretrainedConfig):
|
|
303 |
```"""
|
304 |
|
305 |
model_type = "clip"
|
306 |
-
|
307 |
|
308 |
def __init__(
|
309 |
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
|
@@ -339,9 +308,9 @@ class CLIPConfig(PretrainedConfig):
|
|
339 |
else:
|
340 |
message = (
|
341 |
f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
|
342 |
-
f'value `text_config["{key}"]` will be
|
343 |
)
|
344 |
-
logger.
|
345 |
|
346 |
# Update all values in `text_config` with the ones in `_text_config_dict`.
|
347 |
text_config.update(_text_config_dict)
|
@@ -371,9 +340,9 @@ class CLIPConfig(PretrainedConfig):
|
|
371 |
else:
|
372 |
message = (
|
373 |
f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
|
374 |
-
f'The value `vision_config["{key}"]` will be
|
375 |
)
|
376 |
-
logger.
|
377 |
|
378 |
# Update all values in `vision_config` with the ones in `_vision_config_dict`.
|
379 |
vision_config.update(_vision_config_dict)
|
@@ -405,16 +374,48 @@ class CLIPConfig(PretrainedConfig):
|
|
405 |
|
406 |
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
407 |
|
408 |
-
def to_dict(self):
|
409 |
-
"""
|
410 |
-
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
|
411 |
-
|
412 |
-
Returns:
|
413 |
-
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
|
414 |
-
"""
|
415 |
-
output = copy.deepcopy(self.__dict__)
|
416 |
-
output["text_config"] = self.text_config.to_dict()
|
417 |
-
output["vision_config"] = self.vision_config.to_dict()
|
418 |
-
output["model_type"] = self.__class__.model_type
|
419 |
-
return output
|
420 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
from transformers.utils import TensorType
|
27 |
|
28 |
from transformers.configuration_utils import PretrainedConfig
|
29 |
+
from transformers.onnx import OnnxConfig
|
30 |
from transformers.utils import logging
|
31 |
|
|
|
32 |
logger = logging.get_logger(__name__)
|
33 |
|
34 |
|
|
|
50 |
Dimensionality of the encoder layers and the pooler layer.
|
51 |
intermediate_size (`int`, *optional*, defaults to 2048):
|
52 |
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
53 |
+
projection_dim (`int`, *optional*, defaults to 512):
|
54 |
+
Dimensionality of text and vision projection layers.
|
55 |
num_hidden_layers (`int`, *optional*, defaults to 12):
|
56 |
Number of hidden layers in the Transformer encoder.
|
57 |
num_attention_heads (`int`, *optional*, defaults to 8):
|
58 |
Number of attention heads for each attention layer in the Transformer encoder.
|
59 |
+
max_position_embeddings (`int`, *optional*, defaults to 77):
|
60 |
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
61 |
just in case (e.g., 512 or 1024 or 2048).
|
62 |
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
63 |
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
64 |
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
65 |
+
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
66 |
The epsilon used by the layer normalization layers.
|
67 |
attention_dropout (`float`, *optional*, defaults to 0.0):
|
68 |
The dropout ratio for the attention probabilities.
|
69 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
70 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
71 |
+
initializer_factor (`float`, *optional*, defaults to 1.0):
|
72 |
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
73 |
testing).
|
74 |
+
pad_token_id (`int`, *optional*, defaults to 1):
|
75 |
+
Padding token id.
|
76 |
+
bos_token_id (`int`, *optional*, defaults to 49406):
|
77 |
+
Beginning of stream token id.
|
78 |
+
eos_token_id (`int`, *optional*, defaults to 49407):
|
79 |
+
End of stream token id.
|
80 |
|
81 |
Example:
|
82 |
|
|
|
92 |
>>> # Accessing the model configuration
|
93 |
>>> configuration = model.config
|
94 |
```"""
|
95 |
+
|
96 |
model_type = "clip_text_model"
|
97 |
+
base_config_key = "text_config"
|
98 |
|
99 |
def __init__(
|
100 |
self,
|
|
|
105 |
num_hidden_layers=12,
|
106 |
num_attention_heads=8,
|
107 |
max_position_embeddings=77,
|
108 |
+
hidden_act="quick_gelu",
|
109 |
layer_norm_eps=1e-5,
|
110 |
attention_dropout=0.0,
|
111 |
initializer_range=0.02,
|
112 |
initializer_factor=1.0,
|
113 |
+
# This differs from `CLIPTokenizer`'s default and from openai/clip
|
114 |
+
# See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
|
|
|
|
|
115 |
pad_token_id=1,
|
116 |
+
bos_token_id=49406,
|
117 |
+
eos_token_id=49407,
|
118 |
**kwargs,
|
119 |
):
|
120 |
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
|
|
130 |
self.hidden_act = hidden_act
|
131 |
self.initializer_range = initializer_range
|
132 |
self.initializer_factor = initializer_factor
|
|
|
|
|
|
|
|
|
133 |
self.attention_dropout = attention_dropout
|
134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
class CLIPVisionConfig(PretrainedConfig):
|
137 |
r"""
|
|
|
148 |
Dimensionality of the encoder layers and the pooler layer.
|
149 |
intermediate_size (`int`, *optional*, defaults to 3072):
|
150 |
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
151 |
+
projection_dim (`int`, *optional*, defaults to 512):
|
152 |
+
Dimensionality of text and vision projection layers.
|
153 |
num_hidden_layers (`int`, *optional*, defaults to 12):
|
154 |
Number of hidden layers in the Transformer encoder.
|
155 |
num_attention_heads (`int`, *optional*, defaults to 12):
|
156 |
Number of attention heads for each attention layer in the Transformer encoder.
|
157 |
+
num_channels (`int`, *optional*, defaults to 3):
|
158 |
+
The number of input channels.
|
159 |
image_size (`int`, *optional*, defaults to 224):
|
160 |
The size (resolution) of each image.
|
161 |
patch_size (`int`, *optional*, defaults to 32):
|
162 |
The size (resolution) of each patch.
|
163 |
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
164 |
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
165 |
+
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
166 |
+
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
167 |
The epsilon used by the layer normalization layers.
|
168 |
attention_dropout (`float`, *optional*, defaults to 0.0):
|
169 |
The dropout ratio for the attention probabilities.
|
170 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
171 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
172 |
+
initializer_factor (`float`, *optional*, defaults to 1.0):
|
173 |
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
174 |
testing).
|
175 |
|
|
|
189 |
```"""
|
190 |
|
191 |
model_type = "clip_vision_model"
|
192 |
+
base_config_key = "vision_config"
|
193 |
|
194 |
def __init__(
|
195 |
self,
|
|
|
201 |
num_channels=3,
|
202 |
image_size=224,
|
203 |
patch_size=32,
|
204 |
+
hidden_act="quick_gelu",
|
205 |
layer_norm_eps=1e-5,
|
206 |
attention_dropout=0.0,
|
207 |
initializer_range=0.02,
|
208 |
initializer_factor=1.0,
|
|
|
|
|
|
|
|
|
209 |
**kwargs,
|
210 |
):
|
211 |
super().__init__(**kwargs)
|
|
|
220 |
self.image_size = image_size
|
221 |
self.initializer_range = initializer_range
|
222 |
self.initializer_factor = initializer_factor
|
|
|
|
|
|
|
|
|
223 |
self.attention_dropout = attention_dropout
|
224 |
self.layer_norm_eps = layer_norm_eps
|
225 |
self.hidden_act = hidden_act
|
226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
|
228 |
class CLIPConfig(PretrainedConfig):
|
229 |
r"""
|
|
|
241 |
vision_config (`dict`, *optional*):
|
242 |
Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
|
243 |
projection_dim (`int`, *optional*, defaults to 512):
|
244 |
+
Dimensionality of text and vision projection layers.
|
245 |
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
|
246 |
+
The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
|
247 |
kwargs (*optional*):
|
248 |
Dictionary of keyword arguments.
|
249 |
|
|
|
272 |
```"""
|
273 |
|
274 |
model_type = "clip"
|
275 |
+
sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig}
|
276 |
|
277 |
def __init__(
|
278 |
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
|
|
|
308 |
else:
|
309 |
message = (
|
310 |
f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
|
311 |
+
f'value `text_config["{key}"]` will be overridden.'
|
312 |
)
|
313 |
+
logger.info(message)
|
314 |
|
315 |
# Update all values in `text_config` with the ones in `_text_config_dict`.
|
316 |
text_config.update(_text_config_dict)
|
|
|
340 |
else:
|
341 |
message = (
|
342 |
f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
|
343 |
+
f'The value `vision_config["{key}"]` will be overridden.'
|
344 |
)
|
345 |
+
logger.info(message)
|
346 |
|
347 |
# Update all values in `vision_config` with the ones in `_vision_config_dict`.
|
348 |
vision_config.update(_vision_config_dict)
|
|
|
374 |
|
375 |
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
|
378 |
+
class CLIPOnnxConfig(OnnxConfig):
    """ONNX export configuration for the CLIP model.

    Describes the named inputs and outputs together with their axis labels,
    the validation tolerance, the default opset, and how dummy inputs are
    assembled from a processor's tokenizer and image processor.
    """

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return the ordered input names, each mapped to ``{axis index: axis name}``."""
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
            ]
        )

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return the ordered output names, each mapped to ``{axis index: axis name}``."""
        return OrderedDict(
            [
                ("logits_per_image", {0: "batch"}),
                ("logits_per_text", {0: "batch"}),
                ("text_embeds", {0: "batch"}),
                ("image_embeds", {0: "batch"}),
            ]
        )

    @property
    def atol_for_validation(self) -> float:
        """Return the absolute tolerance (``1e-4``) used for validation."""
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        """Build dummy text and image inputs and merge them into one mapping.

        Args:
            processor: Object exposing ``tokenizer`` and ``image_processor``
                attributes; each is forwarded to the parent implementation.
            batch_size: Forwarded to the parent for both text and image inputs.
            seq_length: Forwarded to the parent for the text inputs only.
            framework: Forwarded to the parent for both text and image inputs.

        Returns:
            A dict containing the text entries followed by the image entries.
            On a key collision the image entry wins (it is unpacked last).
        """
        text_input_dict = super().generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        image_input_dict = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        return {**text_input_dict, **image_input_dict}

    @property
    def default_onnx_opset(self) -> int:
        """Return the default ONNX opset version (``14``)."""
        return 14
|
pytorch_model.bin → model.safetensors
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0fd872fd6bf16bfba5624e8f13c14168f5b25496fc25246c04556bc858dd9a6d
|
3 |
+
size 1442236212
|
modeling_clip.py
CHANGED
@@ -37,9 +37,9 @@ from transformers.utils import (
|
|
37 |
logging,
|
38 |
replace_return_docstrings,
|
39 |
)
|
|
|
40 |
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
|
41 |
|
42 |
-
|
43 |
if is_flash_attn_2_available():
|
44 |
from transformers.modeling_flash_attention_utils import _flash_attention_forward
|
45 |
|
@@ -603,16 +603,15 @@ class CLIPPreTrainedModel(PreTrainedModel):
|
|
603 |
fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
|
604 |
nn.init.normal_(module.fc1.weight, std=fc_std)
|
605 |
nn.init.normal_(module.fc2.weight, std=in_proj_std)
|
606 |
-
elif isinstance(module,
|
607 |
-
pass
|
608 |
# nn.init.normal_(
|
609 |
# module.text_projection.weight,
|
610 |
# std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
|
611 |
# )
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
elif isinstance(module, CLIPVisionModelWithProjection):
|
617 |
nn.init.normal_(
|
618 |
module.visual_projection.weight,
|
@@ -1112,80 +1111,97 @@ class CLIPVisionModel(CLIPPreTrainedModel):
|
|
1112 |
|
1113 |
|
1114 |
@add_start_docstrings(CLIP_START_DOCSTRING)
|
1115 |
-
class
|
1116 |
config_class = CLIPConfig
|
1117 |
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
|
1118 |
|
1119 |
def __init__(self, config: CLIPConfig):
|
1120 |
super().__init__(config)
|
|
|
|
|
|
|
|
|
|
|
|
|
1121 |
if not isinstance(config.vision_config, CLIPVisionConfig):
|
1122 |
raise TypeError(
|
1123 |
"config.vision_config is expected to be of type CLIPVisionConfig but is of type"
|
1124 |
f" {type(config.vision_config)}."
|
1125 |
)
|
1126 |
|
|
|
1127 |
vision_config = config.vision_config
|
1128 |
|
1129 |
self.projection_dim = config.projection_dim
|
|
|
1130 |
self.vision_embed_dim = vision_config.hidden_size
|
|
|
|
|
|
|
|
|
|
|
|
|
1131 |
|
1132 |
vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation)
|
1133 |
self.vision_model = vision_model.vision_model
|
1134 |
|
1135 |
-
|
1136 |
-
|
1137 |
-
self.visual_projection = nn.Parameter(scale * torch.randn(self.vision_embed_dim, self.projection_dim))
|
1138 |
self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
|
1139 |
|
1140 |
# Initialize weights and apply final processing
|
1141 |
self.post_init()
|
1142 |
-
|
1143 |
-
|
1144 |
-
|
1145 |
-
self
|
1146 |
-
|
1147 |
-
|
1148 |
-
|
1149 |
-
|
1150 |
-
|
1151 |
-
|
1152 |
-
|
1153 |
-
|
1154 |
-
|
1155 |
-
|
1156 |
-
|
1157 |
-
|
1158 |
-
|
1159 |
-
|
1160 |
-
|
1161 |
-
|
1162 |
-
|
1163 |
-
|
1164 |
-
|
1165 |
-
|
1166 |
-
|
1167 |
-
|
1168 |
-
|
1169 |
-
|
1170 |
-
|
1171 |
-
|
1172 |
-
|
1173 |
-
|
1174 |
-
|
1175 |
-
|
1176 |
-
|
1177 |
-
|
1178 |
-
|
1179 |
-
|
1180 |
-
|
1181 |
-
|
1182 |
-
|
1183 |
-
|
1184 |
-
|
1185 |
-
|
1186 |
-
|
1187 |
-
|
1188 |
-
|
|
|
|
|
|
|
|
|
1189 |
|
1190 |
@add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
|
1191 |
def get_image_features(
|
@@ -1232,7 +1248,7 @@ class CLIPModel(CLIPPreTrainedModel):
|
|
1232 |
)
|
1233 |
|
1234 |
pooled_output = vision_outputs[1] # pooled_output
|
1235 |
-
image_features =
|
1236 |
|
1237 |
return image_features
|
1238 |
|
@@ -1413,7 +1429,40 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
|
|
1413 |
attentions=text_outputs.attentions,
|
1414 |
)
|
1415 |
|
1416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1417 |
@add_start_docstrings(
|
1418 |
"""
|
1419 |
CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
|
|
|
37 |
logging,
|
38 |
replace_return_docstrings,
|
39 |
)
|
40 |
+
# from configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
|
41 |
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
|
42 |
|
|
|
43 |
if is_flash_attn_2_available():
|
44 |
from transformers.modeling_flash_attention_utils import _flash_attention_forward
|
45 |
|
|
|
603 |
fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
|
604 |
nn.init.normal_(module.fc1.weight, std=fc_std)
|
605 |
nn.init.normal_(module.fc2.weight, std=in_proj_std)
|
606 |
+
elif isinstance(module, LLM2CLIPModel):
|
|
|
607 |
# nn.init.normal_(
|
608 |
# module.text_projection.weight,
|
609 |
# std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
|
610 |
# )
|
611 |
+
nn.init.normal_(
|
612 |
+
module.visual_projection.weight,
|
613 |
+
std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
|
614 |
+
)
|
615 |
elif isinstance(module, CLIPVisionModelWithProjection):
|
616 |
nn.init.normal_(
|
617 |
module.visual_projection.weight,
|
|
|
1111 |
|
1112 |
|
1113 |
@add_start_docstrings(CLIP_START_DOCSTRING)
|
1114 |
+
class LLM2CLIPModel(CLIPPreTrainedModel):
|
1115 |
config_class = CLIPConfig
|
1116 |
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
|
1117 |
|
1118 |
def __init__(self, config: CLIPConfig):
    """Build the vision tower, the visual projection and the text adapter.

    Args:
        config: Model configuration. ``config.vision_config`` must be a
            ``CLIPVisionConfig``; ``config.projection_dim`` and
            ``config.logit_scale_init_value`` are also read.

    Raises:
        TypeError: If ``config.vision_config`` is not a ``CLIPVisionConfig``.
    """
    super().__init__(config)

    # NOTE: no CLIP text tower or text projection is created here, and
    # `config.text_config` is neither validated nor used. Text features are
    # produced by `self.text_adapter` instead (see `get_text_features`).
    if not isinstance(config.vision_config, CLIPVisionConfig):
        raise TypeError(
            "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
            f" {type(config.vision_config)}."
        )

    vision_config = config.vision_config

    self.projection_dim = config.projection_dim
    self.vision_embed_dim = vision_config.hidden_size

    # The adapter is constructed with its built-in default dimensions.
    self.text_adapter = LLM2CLIP_Adapter()

    # Only the inner `.vision_model` of the wrapper is kept on this module.
    vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation)
    self.vision_model = vision_model.vision_model

    self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
    self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

    # Initialize weights and apply final processing
    self.post_init()
|
1154 |
+
|
1155 |
+
def get_text_features(self, inputs):
    """Return text features by passing ``inputs`` through ``self.text_adapter``.

    Args:
        inputs: Forwarded unchanged to ``self.text_adapter``.
            NOTE(review): presumably pre-computed text embeddings whose
            feature size matches the adapter's input size; confirm with
            callers.

    Returns:
        Whatever ``self.text_adapter(inputs)`` returns.
    """
    # TODO: make this more flexible and configurable
    return self.text_adapter(inputs)
|
1158 |
+
|
1159 |
+
# @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
|
1160 |
+
# def get_text_features(
|
1161 |
+
# self,
|
1162 |
+
# input_ids: Optional[torch.Tensor] = None,
|
1163 |
+
# attention_mask: Optional[torch.Tensor] = None,
|
1164 |
+
# position_ids: Optional[torch.Tensor] = None,
|
1165 |
+
# output_attentions: Optional[bool] = None,
|
1166 |
+
# output_hidden_states: Optional[bool] = None,
|
1167 |
+
# return_dict: Optional[bool] = None,
|
1168 |
+
# ) -> torch.FloatTensor:
|
1169 |
+
# r"""
|
1170 |
+
# Returns:
|
1171 |
+
# text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
|
1172 |
+
# applying the projection layer to the pooled output of [`CLIPTextModel`].
|
1173 |
+
|
1174 |
+
# Examples:
|
1175 |
+
|
1176 |
+
# ```python
|
1177 |
+
# >>> from transformers import AutoTokenizer, CLIPModel
|
1178 |
+
|
1179 |
+
# >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
1180 |
+
# >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
|
1181 |
+
|
1182 |
+
# >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
|
1183 |
+
# >>> text_features = model.get_text_features(**inputs)
|
1184 |
+
# ```"""
|
1185 |
+
# # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
|
1186 |
+
# output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
1187 |
+
# output_hidden_states = (
|
1188 |
+
# output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
1189 |
+
# )
|
1190 |
+
# return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
1191 |
+
|
1192 |
+
# text_outputs = self.text_model(
|
1193 |
+
# input_ids=input_ids,
|
1194 |
+
# attention_mask=attention_mask,
|
1195 |
+
# position_ids=position_ids,
|
1196 |
+
# output_attentions=output_attentions,
|
1197 |
+
# output_hidden_states=output_hidden_states,
|
1198 |
+
# return_dict=return_dict,
|
1199 |
+
# )
|
1200 |
+
|
1201 |
+
# pooled_output = text_outputs[1]
|
1202 |
+
# text_features = self.text_projection(pooled_output)
|
1203 |
+
|
1204 |
+
# return text_features
|
1205 |
|
1206 |
@add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
|
1207 |
def get_image_features(
|
|
|
1248 |
)
|
1249 |
|
1250 |
pooled_output = vision_outputs[1] # pooled_output
|
1251 |
+
image_features = self.visual_projection(pooled_output)
|
1252 |
|
1253 |
return image_features
|
1254 |
|
|
|
1429 |
attentions=text_outputs.attentions,
|
1430 |
)
|
1431 |
|
1432 |
+
class LinearBlock(nn.Module):
    """Residual MLP block that normalizes its input first: ``x + fn(ln(x))``.

    ``fn`` is ``Linear -> GELU -> Dropout -> Linear``, widening the feature
    size to ``int(expansion_factor * dim)`` and projecting back to ``dim``.

    Args:
        dim: Feature size of the input and of the output.
        expansion_factor: Multiplier for the hidden width of the MLP.
        dropout: Probability passed to ``nn.Dropout``.
        norm_layer: Callable invoked as ``norm_layer(dim)`` to build the
            normalization applied before the MLP.
    """

    def __init__(self, dim, expansion_factor=4, dropout=0., norm_layer=nn.LayerNorm):
        super().__init__()
        # Computed once so both Linear layers agree on the hidden width.
        hidden_dim = int(expansion_factor * dim)
        # Attribute names `fn` and `ln` are part of the checkpoint's
        # state_dict keys and must not be renamed.
        self.fn = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
        )
        self.ln = norm_layer(dim)

    def forward(self, x):
        """Return ``x`` plus the MLP applied to the normalized ``x``."""
        return x + self.fn(self.ln(x))
|
1445 |
+
|
1446 |
+
class LLM2CLIP_Adapter(nn.Module):
    """Stack of ``LinearBlock`` layers followed by LayerNorm and a projection.

    The defaults reproduce the values that were previously hard-coded for the
    LLM2CLIP model, so ``LLM2CLIP_Adapter()`` builds the same module as before.

    Args:
        text_embedding_dim: Feature size of the incoming embeddings.
        expansion_factor: Hidden-width multiplier passed to each ``LinearBlock``.
        adaptor_num_layers: Number of ``LinearBlock`` layers.
        proj_bias: Whether the final ``nn.Linear`` has a bias term.
        output_dim: Feature size produced by the final ``nn.Linear``.
    """

    def __init__(
        self,
        text_embedding_dim: int = 4096,
        expansion_factor: float = 2,
        adaptor_num_layers: int = 4,
        proj_bias: bool = True,
        output_dim: int = 1280,
    ):
        super().__init__()
        # The attribute name `adaptor` is part of the checkpoint's state_dict
        # keys and must not be renamed.
        self.adaptor = nn.Sequential(
            *[LinearBlock(text_embedding_dim, expansion_factor) for _ in range(adaptor_num_layers)],
            nn.LayerNorm(text_embedding_dim),
            nn.Linear(text_embedding_dim, output_dim, bias=proj_bias),
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """L2-normalize ``hidden_states`` along dim 1, then apply the adapter stack.

        NOTE(review): normalizing over ``dim=1`` assumes a 2-D input of shape
        ``(batch, text_embedding_dim)``; confirm before feeding other ranks.
        """
        hidden_states = torch.nn.functional.normalize(hidden_states, p=2, dim=1)
        hidden_states = self.adaptor(hidden_states)
        return hidden_states
|
1465 |
+
|
1466 |
@add_start_docstrings(
|
1467 |
"""
|
1468 |
CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
|