jiyatai committed on
Commit add7dbd · verified · 1 Parent(s): 606533d

Upload 35 files

Files changed (35)
  1. weights/model-base/config.json +49 -0
  2. weights/model-base/configuration_qwen.py +65 -0
  3. weights/model-base/generation_config.json +11 -0
  4. weights/model-base/modeling_qwen.py +1182 -0
  5. weights/model-base/prediction.json +0 -0
  6. weights/model-base/pytorch_model-00001-of-00002.bin +3 -0
  7. weights/model-base/pytorch_model-00002-of-00002.bin +3 -0
  8. weights/model-base/pytorch_model.bin.index.json +860 -0
  9. weights/model-base/qwen.tiktoken +0 -0
  10. weights/model-base/qwen_generation_utils.py +420 -0
  11. weights/model-base/special_tokens_map.json +3 -0
  12. weights/model-base/tokenization_qwen.py +598 -0
  13. weights/model-base/tokenizer_config.json +12 -0
  14. weights/model-base/trainer_state.json +0 -0
  15. weights/model-base/training_args.bin +3 -0
  16. weights/model-base/visual.py +545 -0
  17. weights/model-base/zero_to_fp32.py +587 -0
  18. weights/model-idf/config.json +49 -0
  19. weights/model-idf/configuration_qwen.py +65 -0
  20. weights/model-idf/generation_config.json +11 -0
  21. weights/model-idf/modeling_qwen.py +1182 -0
  22. weights/model-idf/prediction.json +0 -0
  23. weights/model-idf/prediction_mi.json +797 -0
  24. weights/model-idf/pytorch_model-00001-of-00002.bin +3 -0
  25. weights/model-idf/pytorch_model-00002-of-00002.bin +3 -0
  26. weights/model-idf/pytorch_model.bin.index.json +864 -0
  27. weights/model-idf/qwen.tiktoken +0 -0
  28. weights/model-idf/qwen_generation_utils.py +420 -0
  29. weights/model-idf/special_tokens_map.json +3 -0
  30. weights/model-idf/tokenization_qwen.py +598 -0
  31. weights/model-idf/tokenizer_config.json +12 -0
  32. weights/model-idf/trainer_state.json +0 -0
  33. weights/model-idf/training_args.bin +3 -0
  34. weights/model-idf/visual.py +545 -0
  35. weights/model-idf/zero_to_fp32.py +578 -0
weights/model-base/config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "_name_or_path": "/mnt/bn/automl-aigc/yatai/Qwen-VL/result/qwen_alpha_full_llava_mini_1/checkpoint-3000",
+   "architectures": [
+     "QWenLMHeadModel"
+   ],
+   "attn_dropout_prob": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_qwen.QWenConfig",
+     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
+   },
+   "bf16": true,
+   "emb_dropout_prob": 0.0,
+   "fp16": false,
+   "fp32": false,
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 22016,
+   "kv_channels": 128,
+   "layer_norm_epsilon": 1e-06,
+   "max_position_embeddings": 8192,
+   "model_type": "qwen",
+   "no_bias": true,
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "onnx_safe": null,
+   "rotary_emb_base": 10000,
+   "rotary_pct": 1.0,
+   "scale_attn_weights": true,
+   "seq_length": 2048,
+   "tie_word_embeddings": false,
+   "tokenizer_type": "QWenTokenizer",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.32.0",
+   "use_cache": false,
+   "use_dynamic_ntk": true,
+   "use_flash_attn": false,
+   "use_logn_attn": true,
+   "visual": {
+     "heads": 16,
+     "image_size": 448,
+     "image_start_id": 151857,
+     "layers": 48,
+     "mlp_ratio": 4.9231,
+     "output_dim": 4096,
+     "patch_size": 14,
+     "width": 1664
+   },
+   "vocab_size": 151936
+ }
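
Note: config.json registers the bundled modules through auto_map ("configuration_qwen.QWenConfig", "modeling_qwen.QWenLMHeadModel"), so the checkpoint is meant to be loaded with remote code enabled. A minimal loading sketch in Python, assuming the files from this commit sit locally under weights/model-base (the bf16=True flag mirrors the "bf16": true entry above):

from transformers import AutoModelForCausalLM, AutoTokenizer

path = "weights/model-base"  # local copy of the files uploaded in this commit
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path,
    trust_remote_code=True,  # lets the auto_map entries above resolve to configuration_qwen.py / modeling_qwen.py
    bf16=True,               # the modeling code also accepts fp16=True or fp32=True
).eval()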
weights/model-base/configuration_qwen.py ADDED
@@ -0,0 +1,65 @@
+ # Copyright (c) Alibaba Cloud.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from transformers import PretrainedConfig
+
+
+ class QWenConfig(PretrainedConfig):
+     model_type = "qwen"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=151936,
+         hidden_size=4096,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         emb_dropout_prob=0.0,
+         attn_dropout_prob=0.0,
+         layer_norm_epsilon=1e-6,
+         initializer_range=0.02,
+         max_position_embeddings=8192,
+         scale_attn_weights=True,
+         use_cache=True,
+         bf16=False,
+         fp16=False,
+         fp32=False,
+         kv_channels=128,
+         rotary_pct=1.0,
+         rotary_emb_base=10000,
+         use_dynamic_ntk=True,
+         use_logn_attn=True,
+         use_flash_attn="auto",
+         intermediate_size=22016,
+         no_bias=True,
+         tie_word_embeddings=False,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.emb_dropout_prob = emb_dropout_prob
+         self.attn_dropout_prob = attn_dropout_prob
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_range = initializer_range
+         self.scale_attn_weights = scale_attn_weights
+         self.use_cache = use_cache
+         self.max_position_embeddings = max_position_embeddings
+         self.bf16 = bf16
+         self.fp16 = fp16
+         self.fp32 = fp32
+         self.kv_channels = kv_channels
+         self.rotary_pct = rotary_pct
+         self.rotary_emb_base = rotary_emb_base
+         self.use_dynamic_ntk = use_dynamic_ntk
+         self.use_logn_attn = use_logn_attn
+         self.use_flash_attn = use_flash_attn
+         self.no_bias = no_bias
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs
+         )
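
Note: the defaults above match the values recorded in config.json for this checkpoint. As a small illustrative check of the derived attention geometry (a sketch; run from inside weights/model-base so the bundled module imports):

from configuration_qwen import QWenConfig

cfg = QWenConfig()  # defaults: hidden_size=4096, num_attention_heads=32, kv_channels=128
head_dim = cfg.hidden_size // cfg.num_attention_heads
assert head_dim == cfg.kv_channels == 128  # per-head width used by the attention projections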
weights/model-base/generation_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "chat_format": "chatml",
+   "do_sample": true,
+   "eos_token_id": 151643,
+   "max_new_tokens": 512,
+   "max_window_size": 6144,
+   "pad_token_id": 151643,
+   "top_k": 0,
+   "top_p": 0.3,
+   "transformers_version": "4.32.0"
+ }
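
Note: these are the sampling defaults that generate() and the chat helpers in modeling_qwen.py pick up (nucleus sampling with top_p=0.3, up to 512 new tokens, ChatML prompting). A short sketch of inspecting and overriding them, assuming the local weights/model-base path used above:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("weights/model-base")
print(gen_cfg.chat_format, gen_cfg.top_p, gen_cfg.max_new_tokens)  # chatml 0.3 512
gen_cfg.top_p = 0.8  # per-run override; pass generation_config=gen_cfg (or plain kwargs) to generate()/chat()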
weights/model-base/modeling_qwen.py ADDED
@@ -0,0 +1,1182 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import importlib
7
+ import math
8
+ from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ import torch.utils.checkpoint
13
+ from torch.cuda.amp import autocast
14
+
15
+ from torch.nn import CrossEntropyLoss
16
+ from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList
17
+ from transformers.generation.logits_process import LogitsProcessorList
18
+
19
+ if TYPE_CHECKING:
20
+ from transformers.generation.streamers import BaseStreamer
21
+ from transformers.generation.utils import GenerateOutput
22
+ from transformers.modeling_outputs import (
23
+ BaseModelOutputWithPast,
24
+ CausalLMOutputWithPast,
25
+ )
26
+ from transformers.modeling_utils import PreTrainedModel
27
+ from transformers.utils import logging
28
+
29
+ try:
30
+ from einops import rearrange
31
+ except ImportError:
32
+ rearrange = None
33
+ from torch import nn
34
+
35
+ SUPPORT_CUDA = torch.cuda.is_available()
36
+ SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
37
+ SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
38
+
39
+ from .configuration_qwen import QWenConfig
40
+ from .qwen_generation_utils import (
41
+ HistoryType,
42
+ make_context,
43
+ decode_tokens,
44
+ get_stop_words_ids,
45
+ StopWordsLogitsProcessor,
46
+ )
47
+ from .visual import VisionTransformer
48
+
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+ _CHECKPOINT_FOR_DOC = "qwen"
53
+ _CONFIG_FOR_DOC = "QWenConfig"
54
+
55
+ QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"]
56
+
57
+ _ERROR_BAD_CHAT_FORMAT = """\
58
+ We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml".
59
+ If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat().
60
+ 我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。
61
+ 如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。
62
+ """
63
+
64
+ _SENTINEL = object()
65
+ _ERROR_STREAM_IN_CHAT = """\
66
+ Pass argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True).
67
+ 向model.chat()传入参数stream的用法可能存在Bug,该用法已被废弃,将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。
68
+ """
69
+
70
+ apply_rotary_emb_func = None
71
+ rms_norm = None
72
+
73
+
74
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
75
+ def _make_causal_mask(
76
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
77
+ ):
78
+ """
79
+ Make causal mask used for bi-directional self-attention.
80
+ """
81
+ bsz, tgt_len = input_ids_shape
82
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
83
+ mask_cond = torch.arange(mask.size(-1), device=device)
84
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
85
+ mask = mask.to(dtype)
86
+
87
+ if past_key_values_length > 0:
88
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
89
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
90
+
91
+
92
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
93
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
94
+ """
95
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
96
+ """
97
+ bsz, src_len = mask.size()
98
+ tgt_len = tgt_len if tgt_len is not None else src_len
99
+
100
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
101
+
102
+ inverted_mask = 1.0 - expanded_mask
103
+
104
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
105
+
106
+
107
+ class QWenAttention(nn.Module):
108
+ def __init__(self, config):
109
+ super().__init__()
110
+
111
+ self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
112
+ self.seq_length = config.seq_length
113
+
114
+ self.hidden_size = config.hidden_size
115
+ self.split_size = config.hidden_size
116
+ self.num_heads = config.num_attention_heads
117
+ self.head_dim = self.hidden_size // self.num_heads
118
+
119
+ self.scale_attn_weights = True
120
+
121
+ self.projection_size = config.kv_channels * config.num_attention_heads
122
+
123
+ assert self.projection_size % config.num_attention_heads == 0
124
+ self.hidden_size_per_attention_head = (
125
+ self.projection_size // config.num_attention_heads
126
+ )
127
+
128
+ self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size)
129
+
130
+ self.c_proj = nn.Linear(
131
+ config.hidden_size, self.projection_size, bias=not config.no_bias
132
+ )
133
+
134
+ self.is_fp32 = not (config.bf16 or config.fp16)
135
+ self.bf16 = config.bf16
136
+
137
+ self.use_dynamic_ntk = config.use_dynamic_ntk
138
+ self.use_logn_attn = config.use_logn_attn
139
+
140
+ logn_list = [
141
+ math.log(i, self.seq_length) if i > self.seq_length else 1
142
+ for i in range(1, 32768)
143
+ ]
144
+ self.logn_tensor = torch.tensor(logn_list)[None, :, None, None]
145
+
146
+ self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
147
+
148
+ def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
149
+ attn_weights = torch.matmul(query, key.transpose(-1, -2))
150
+
151
+ if self.scale_attn_weights:
152
+ attn_weights = attn_weights / torch.full(
153
+ [],
154
+ value.size(-1) ** 0.5,
155
+ dtype=attn_weights.dtype,
156
+ device=attn_weights.device,
157
+ )
158
+
159
+ query_length, key_length = query.size(-2), key.size(-2)
160
+ # causal_mask = self.bias[
161
+ # :, :, key_length - query_length : key_length, :key_length
162
+ # ]
163
+ # mask_value = torch.finfo(attn_weights.dtype).min
164
+ # mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(
165
+ # attn_weights.device
166
+ # )
167
+ # attn_weights = torch.where(
168
+ # causal_mask, attn_weights.to(attn_weights.dtype), mask_value
169
+ # )
170
+ attn_weights = attn_weights + attention_mask
171
+
172
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
173
+
174
+ attn_weights = attn_weights.type(value.dtype)
175
+ attn_weights = self.attn_dropout(attn_weights)
176
+
177
+ if head_mask is not None:
178
+ attn_weights = attn_weights * head_mask
179
+
180
+ attn_output = torch.matmul(attn_weights, value)
181
+ attn_output = attn_output.transpose(1, 2)
182
+
183
+ return attn_output, attn_weights
184
+
185
+ def _upcast_and_reordered_attn(
186
+ self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
187
+ ):
188
+ bsz, num_heads, q_seq_len, dk = query.size()
189
+ _, _, k_seq_len, _ = key.size()
190
+
191
+ attn_weights = torch.empty(
192
+ bsz * num_heads,
193
+ q_seq_len,
194
+ k_seq_len,
195
+ dtype=torch.float32,
196
+ device=query.device,
197
+ )
198
+
199
+ scale_factor = 1.0
200
+ if self.scale_attn_weights:
201
+ scale_factor /= float(value.size(-1)) ** 0.5
202
+
203
+ with autocast(enabled=False):
204
+ q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(
205
+ -1, dk, k_seq_len
206
+ )
207
+ attn_weights = torch.baddbmm(
208
+ attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor
209
+ )
210
+ attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
211
+
212
+ query_length, key_length = query.size(-2), key.size(-2)
213
+ causal_mask = registered_causal_mask[
214
+ :, :, key_length - query_length : key_length, :key_length
215
+ ]
216
+ mask_value = torch.finfo(attn_weights.dtype).min
217
+ mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
218
+ attn_weights.device
219
+ )
220
+ attn_weights = torch.where(causal_mask, attn_weights, mask_value)
221
+
222
+ if attention_mask is not None:
223
+ attn_weights = attn_weights + attention_mask
224
+
225
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
226
+
227
+ if attn_weights.dtype != torch.float32:
228
+ raise RuntimeError(
229
+ "Error with upcasting, attn_weights does not have dtype torch.float32"
230
+ )
231
+ attn_weights = attn_weights.type(value.dtype)
232
+ attn_weights = self.attn_dropout(attn_weights)
233
+
234
+ if head_mask is not None:
235
+ attn_weights = attn_weights * head_mask
236
+
237
+ attn_output = torch.matmul(attn_weights, value)
238
+
239
+ return attn_output, attn_weights
240
+
241
+ def _split_heads(self, tensor, num_heads, attn_head_size):
242
+ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
243
+ tensor = tensor.view(new_shape)
244
+ return tensor
245
+
246
+ def _merge_heads(self, tensor, num_heads, attn_head_size):
247
+ tensor = tensor.contiguous()
248
+ new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
249
+ return tensor.view(new_shape)
250
+
251
+ def forward(
252
+ self,
253
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
254
+ rotary_pos_emb: Optional[List[torch.Tensor]] = None,
255
+ registered_causal_mask: Optional[torch.Tensor] = None,
256
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
257
+ attention_mask: Optional[torch.FloatTensor] = None,
258
+ head_mask: Optional[torch.FloatTensor] = None,
259
+ encoder_hidden_states: Optional[torch.Tensor] = None,
260
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
261
+ output_attentions: Optional[bool] = False,
262
+ use_cache: Optional[bool] = False,
263
+ ):
264
+
265
+ mixed_x_layer = self.c_attn(hidden_states)
266
+
267
+ query, key, value = mixed_x_layer.split(self.split_size, dim=2)
268
+
269
+ query = self._split_heads(query, self.num_heads, self.head_dim)
270
+ key = self._split_heads(key, self.num_heads, self.head_dim)
271
+ value = self._split_heads(value, self.num_heads, self.head_dim)
272
+
273
+ if rotary_pos_emb is not None:
274
+ cur_len = query.shape[1]
275
+ rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb]
276
+ rotary_pos_emb = (rotary_pos_emb,) * 2
277
+ q_pos_emb, k_pos_emb = rotary_pos_emb
278
+ # Slice the pos emb for current inference
279
+ query = apply_rotary_pos_emb(query, q_pos_emb)
280
+ key = apply_rotary_pos_emb(key, k_pos_emb)
281
+
282
+ if layer_past is not None:
283
+ past_key, past_value = layer_past[0], layer_past[1]
284
+ key = torch.cat((past_key, key), dim=1)
285
+ value = torch.cat((past_value, value), dim=1)
286
+
287
+ if use_cache:
288
+ present = (key, value)
289
+ else:
290
+ present = None
291
+
292
+ if self.use_logn_attn and not self.training:
293
+ if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype:
294
+ self.logn_tensor = self.logn_tensor.to(query.device).type_as(query)
295
+ seq_start = key.size(1) - query.size(1)
296
+ seq_end = key.size(1)
297
+ logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
298
+ query = query * logn_tensor.expand_as(query)
299
+
300
+ query = query.permute(0, 2, 1, 3)
301
+ key = key.permute(0, 2, 1, 3)
302
+ value = value.permute(0, 2, 1, 3)
303
+ attn_output, attn_weight = self._attn(
304
+ query, key, value, registered_causal_mask, attention_mask, head_mask
305
+ )
306
+ context_layer = self._merge_heads(
307
+ attn_output, self.num_heads, self.head_dim
308
+ )
309
+
310
+ attn_output = self.c_proj(context_layer)
311
+
312
+ outputs = (attn_output, present)
313
+ if output_attentions:
314
+ outputs += (attn_weight,)
315
+
316
+ return outputs
317
+
318
+
319
+ class QWenMLP(nn.Module):
320
+ def __init__(self, config):
321
+ super().__init__()
322
+ self.w1 = nn.Linear(
323
+ config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias
324
+ )
325
+ self.w2 = nn.Linear(
326
+ config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias
327
+ )
328
+ ff_dim_in = config.intermediate_size // 2
329
+ self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias)
330
+
331
+ def forward(self, hidden_states):
332
+ a1 = self.w1(hidden_states)
333
+ a2 = self.w2(hidden_states)
334
+ intermediate_parallel = a1 * F.silu(a2)
335
+ output = self.c_proj(intermediate_parallel)
336
+ return output
337
+
338
+ class QWenBlock(nn.Module):
339
+ def __init__(self, config):
340
+ super().__init__()
341
+ hidden_size = config.hidden_size
342
+ self.bf16 = config.bf16
343
+
344
+ self.ln_1 = RMSNorm(
345
+ hidden_size,
346
+ eps=config.layer_norm_epsilon,
347
+ )
348
+ self.attn = QWenAttention(config)
349
+ self.ln_2 = RMSNorm(
350
+ hidden_size,
351
+ eps=config.layer_norm_epsilon,
352
+ )
353
+
354
+ self.mlp = QWenMLP(config)
355
+
356
+ def forward(
357
+ self,
358
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
359
+ rotary_pos_emb: Optional[List[torch.Tensor]] = None,
360
+ registered_causal_mask: Optional[torch.Tensor] = None,
361
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
362
+ attention_mask: Optional[torch.FloatTensor] = None,
363
+ head_mask: Optional[torch.FloatTensor] = None,
364
+ encoder_hidden_states: Optional[torch.Tensor] = None,
365
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
366
+ use_cache: Optional[bool] = False,
367
+ output_attentions: Optional[bool] = False,
368
+ ):
369
+ layernorm_output = self.ln_1(hidden_states)
370
+
371
+ attn_outputs = self.attn(
372
+ layernorm_output,
373
+ rotary_pos_emb,
374
+ registered_causal_mask=registered_causal_mask,
375
+ layer_past=layer_past,
376
+ attention_mask=attention_mask,
377
+ head_mask=head_mask,
378
+ use_cache=use_cache,
379
+ output_attentions=output_attentions,
380
+ )
381
+ attn_output = attn_outputs[0]
382
+
383
+ outputs = attn_outputs[1:]
384
+
385
+ residual = hidden_states
386
+ layernorm_input = attn_output + residual
387
+
388
+ layernorm_output = self.ln_2(layernorm_input)
389
+
390
+ residual = layernorm_input
391
+ mlp_output = self.mlp(layernorm_output)
392
+ hidden_states = residual + mlp_output
393
+
394
+ if use_cache:
395
+ outputs = (hidden_states,) + outputs
396
+ else:
397
+ outputs = (hidden_states,) + outputs[1:]
398
+
399
+ return outputs
400
+
401
+
402
+ class QWenPreTrainedModel(PreTrainedModel):
403
+ config_class = QWenConfig
404
+ base_model_prefix = "transformer"
405
+ is_parallelizable = False
406
+ supports_gradient_checkpointing = True
407
+ _no_split_modules = ["QWenBlock"]
408
+
409
+ def __init__(self, *inputs, **kwargs):
410
+ super().__init__(*inputs, **kwargs)
411
+
412
+ def _init_weights(self, module):
413
+ """Initialize the weights."""
414
+ if isinstance(module, nn.Linear):
415
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
416
+ if module.bias is not None:
417
+ module.bias.data.zero_()
418
+ elif isinstance(module, nn.Embedding):
419
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
420
+ if module.padding_idx is not None:
421
+ module.weight.data[module.padding_idx].zero_()
422
+ elif isinstance(module, RMSNorm):
423
+ module.weight.data.fill_(1.0)
424
+
425
+ for name, p in module.named_parameters():
426
+ if name == "c_proj.weight":
427
+ p.data.normal_(
428
+ mean=0.0,
429
+ std=(
430
+ self.config.initializer_range
431
+ / math.sqrt(2 * self.config.num_hidden_layers)
432
+ ),
433
+ )
434
+
435
+ def _set_gradient_checkpointing(self, module, value=False):
436
+ if isinstance(module, QWenModel):
437
+ module.gradient_checkpointing = value
438
+
439
+
440
+ class QWenModel(QWenPreTrainedModel):
441
+ _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
442
+
443
+ def __init__(self, config):
444
+ super().__init__(config)
445
+ self.vocab_size = config.vocab_size
446
+ self.num_hidden_layers = config.num_hidden_layers
447
+ self.embed_dim = config.hidden_size
448
+
449
+ self.gradient_checkpointing = False
450
+ self.use_dynamic_ntk = config.use_dynamic_ntk
451
+ self.seq_length = config.seq_length
452
+
453
+ self.wte = nn.Embedding(self.vocab_size, self.embed_dim)
454
+
455
+ self.drop = nn.Dropout(config.emb_dropout_prob)
456
+
457
+ if config.rotary_pct == 1.0:
458
+ self.rotary_ndims = None
459
+ else:
460
+ assert config.rotary_pct < 1
461
+ self.rotary_ndims = int(
462
+ config.kv_channels * config.rotary_pct
463
+ )
464
+ dim = (
465
+ self.rotary_ndims
466
+ if self.rotary_ndims is not None
467
+ else config.kv_channels
468
+ )
469
+ self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base)
470
+
471
+ self.use_flash_attn = config.use_flash_attn
472
+ self.is_fp32 = not (config.bf16 or config.fp16)
473
+ self.registered_causal_mask = None
474
+ # if (
475
+ # self.use_flash_attn
476
+ # and flash_attn_unpadded_func is not None
477
+ # and not self.is_fp32
478
+ # ):
479
+ # self.registered_causal_mask = None
480
+ # else:
481
+ # max_positions = config.max_position_embeddings
482
+ # self.register_buffer(
483
+ # "registered_causal_mask",
484
+ # torch.tril(
485
+ # torch.ones((max_positions, max_positions), dtype=torch.bool)
486
+ # ).view(1, 1, max_positions, max_positions),
487
+ # persistent=False,
488
+ # )
489
+
490
+ self.h = nn.ModuleList(
491
+ [
492
+ QWenBlock(
493
+ config
494
+ )
495
+ for i in range(config.num_hidden_layers)
496
+ ]
497
+ )
498
+ self.ln_f = RMSNorm(
499
+ self.embed_dim,
500
+ eps=config.layer_norm_epsilon,
501
+ )
502
+
503
+ self.visual = VisionTransformer(**config.visual) # vit + resampler
504
+
505
+ self.post_init()
506
+
507
+ def get_input_embeddings(self):
508
+ return self.wte
509
+
510
+ def set_input_embeddings(self, new_embeddings):
511
+ self.wte = new_embeddings
512
+
513
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
514
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
515
+ # create causal mask
516
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
517
+ combined_attention_mask = None
518
+ if input_shape[-1] > 1:
519
+ combined_attention_mask = _make_causal_mask(
520
+ input_shape,
521
+ inputs_embeds.dtype,
522
+ device=inputs_embeds.device,
523
+ past_key_values_length=past_key_values_length,
524
+ )
525
+
526
+ if attention_mask is not None:
527
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
528
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
529
+ inputs_embeds.device
530
+ )
531
+ combined_attention_mask = (
532
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
533
+ )
534
+
535
+ return combined_attention_mask
536
+
537
+
538
+ def forward(
539
+ self,
540
+ input_ids: Optional[torch.LongTensor] = None,
541
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
542
+ attention_mask: Optional[torch.FloatTensor] = None,
543
+ token_type_ids: Optional[torch.LongTensor] = None,
544
+ position_ids: Optional[torch.LongTensor] = None,
545
+ head_mask: Optional[torch.FloatTensor] = None,
546
+ inputs_embeds: Optional[torch.FloatTensor] = None,
547
+ encoder_hidden_states: Optional[torch.Tensor] = None,
548
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
549
+ use_cache: Optional[bool] = None,
550
+ output_attentions: Optional[bool] = None,
551
+ output_hidden_states: Optional[bool] = None,
552
+ return_dict: Optional[bool] = None,
553
+ ):
554
+ if past_key_values is None and torch.any(input_ids == self.config.visual['image_start_id']):
555
+ bos_pos = torch.where(input_ids == self.config.visual['image_start_id'])
556
+ eos_pos = torch.where(input_ids == self.config.visual['image_start_id'] + 1)
557
+ assert (bos_pos[0] == eos_pos[0]).all()
558
+ img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
559
+ images = []
560
+ # for i, a, b in img_pos:
561
+ # image = input_ids[i][a + 1 : b - 1].tolist()
562
+ # image = image[ : image.index(self.config.visual['image_start_id'] + 2)]
563
+ # images.append(bytes(image).decode('utf-8'))
564
+
565
+ old_i = -1
566
+ images_flag = []
567
+ id_test_flag = 0
568
+ for i, a, b in img_pos:
569
+ image = input_ids[i][a + 1 : b - 1].tolist()
570
+ image = image[ : image.index(self.config.visual['image_start_id'] + 2)]
571
+ images.append(bytes(image).decode('utf-8'))
572
+ if i != old_i:
573
+ if input_ids[i][a-2] == 374:
574
+ id_test_flag = 1
575
+ else:
576
+ id_test_flag = 0
577
+ old_i = i
578
+ if input_ids[i][a-2] == 374:
579
+ images_flag.append(0)
580
+ elif id_test_flag == 1:
581
+ images_flag.append(1)
582
+ else:
583
+ images_flag.append(2)
584
+
585
+ images = self.visual.encode(images, images_flag)
586
+ assert images.shape[0] == len(images)
587
+ fake_images = None
588
+ elif self.training:
589
+ fake_images=torch.zeros(1,3,224,224).to(
590
+ dtype=self.visual.conv1.weight.dtype, device=self.visual.conv1.weight.device)
591
+ images = self.visual(fake_images)
592
+ else:
593
+ fake_images = None
594
+ images = None
595
+
596
+ output_attentions = (
597
+ output_attentions
598
+ if output_attentions is not None
599
+ else self.config.output_attentions
600
+ )
601
+ output_hidden_states = (
602
+ output_hidden_states
603
+ if output_hidden_states is not None
604
+ else self.config.output_hidden_states
605
+ )
606
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
607
+ return_dict = (
608
+ return_dict if return_dict is not None else self.config.use_return_dict
609
+ )
610
+
611
+ if input_ids is not None and inputs_embeds is not None:
612
+ raise ValueError(
613
+ "You cannot specify both input_ids and inputs_embeds at the same time"
614
+ )
615
+ elif input_ids is not None:
616
+ input_shape = input_ids.size()
617
+ input_ids = input_ids.view(-1, input_shape[-1])
618
+ batch_size = input_ids.shape[0]
619
+ elif inputs_embeds is not None:
620
+ input_shape = inputs_embeds.size()[:-1]
621
+ batch_size = inputs_embeds.shape[0]
622
+ else:
623
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
624
+
625
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
626
+
627
+ if token_type_ids is not None:
628
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
629
+ if position_ids is not None:
630
+ position_ids = position_ids.view(-1, input_shape[-1])
631
+
632
+ if past_key_values is None:
633
+ past_length = 0
634
+ past_key_values = tuple([None] * len(self.h))
635
+ else:
636
+ past_length = past_key_values[0][0].size(-2)
637
+
638
+ if position_ids is None:
639
+ position_ids = torch.arange(
640
+ past_length,
641
+ input_shape[-1] + past_length,
642
+ dtype=torch.long,
643
+ device=device,
644
+ )
645
+ position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
646
+
647
+ encoder_attention_mask = None
648
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
649
+
650
+ if inputs_embeds is None:
651
+ inputs_embeds = self.wte(input_ids)
652
+
653
+ if batch_size <= 0:
654
+ raise ValueError("batch_size has to be defined and > 0")
655
+ attention_mask = self._prepare_decoder_attention_mask(
656
+ attention_mask, input_shape, inputs_embeds, past_length
657
+ )
658
+
659
+ hidden_states = inputs_embeds
660
+
661
+ kv_seq_len = hidden_states.size()[1]
662
+ if past_key_values[0] is not None:
663
+ # past key values[0][0] shape: bs * seq_len * head_num * dim
664
+ kv_seq_len += past_key_values[0][0].shape[1]
665
+ if (
666
+ self.use_dynamic_ntk
667
+ and kv_seq_len == hidden_states.size()[1]
668
+ and not self.training
669
+ ):
670
+ context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
671
+ ntk_alpha = 2 ** math.ceil(context_value) - 1
672
+ ntk_alpha = max(ntk_alpha, 1)
673
+ else:
674
+ ntk_alpha = self.rotary_emb._ntk_alpha_cached
675
+
676
+ rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)
677
+ for idx in range(len(rotary_pos_emb)):
678
+ rotary_pos_emb[idx] = rotary_pos_emb[idx].to(hidden_states.device)
679
+
680
+ hidden_states = self.drop(hidden_states).clone()
681
+ if fake_images is not None:
682
+ hidden_states = hidden_states + images.mean()*0
683
+ elif images is not None:
684
+ for idx, (i, a, b) in enumerate(img_pos):
685
+ hidden_states[i][a + 1 : b] = images[idx]
686
+ output_shape = input_shape + (hidden_states.size(-1),)
687
+
688
+ if self.gradient_checkpointing and self.training:
689
+ if use_cache:
690
+ logger.warning_once(
691
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
692
+ )
693
+ use_cache = False
694
+
695
+ presents = () if use_cache else None
696
+ all_self_attentions = () if output_attentions else None
697
+ all_hidden_states = () if output_hidden_states else None
698
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
699
+
700
+ if output_hidden_states:
701
+ all_hidden_states = all_hidden_states + (hidden_states,)
702
+
703
+ if self.gradient_checkpointing and self.training:
704
+
705
+ def create_custom_forward(module):
706
+ def custom_forward(*inputs):
707
+ # None for past_key_value
708
+ return module(*inputs, use_cache, output_attentions)
709
+
710
+ return custom_forward
711
+
712
+ outputs = torch.utils.checkpoint.checkpoint(
713
+ create_custom_forward(block),
714
+ hidden_states,
715
+ rotary_pos_emb,
716
+ self.registered_causal_mask,
717
+ None,
718
+ attention_mask,
719
+ head_mask[i],
720
+ encoder_hidden_states,
721
+ encoder_attention_mask,
722
+ )
723
+ else:
724
+ outputs = block(
725
+ hidden_states,
726
+ layer_past=layer_past,
727
+ rotary_pos_emb=rotary_pos_emb,
728
+ registered_causal_mask=self.registered_causal_mask,
729
+ attention_mask=attention_mask,
730
+ head_mask=head_mask[i],
731
+ encoder_hidden_states=encoder_hidden_states,
732
+ encoder_attention_mask=encoder_attention_mask,
733
+ use_cache=use_cache,
734
+ output_attentions=output_attentions,
735
+ )
736
+
737
+ hidden_states = outputs[0]
738
+ if use_cache is True:
739
+ presents = presents + (outputs[1],)
740
+
741
+ if output_attentions:
742
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
743
+
744
+ hidden_states = self.ln_f(hidden_states)
745
+ hidden_states = hidden_states.view(output_shape)
746
+ # Add last hidden state
747
+ if output_hidden_states:
748
+ all_hidden_states = all_hidden_states + (hidden_states,)
749
+
750
+ if not return_dict:
751
+ return tuple(
752
+ v for v in [hidden_states, presents, all_hidden_states] if v is not None
753
+ )
754
+
755
+ return BaseModelOutputWithPast(
756
+ last_hidden_state=hidden_states,
757
+ past_key_values=presents,
758
+ hidden_states=all_hidden_states,
759
+ attentions=all_self_attentions,
760
+ )
761
+
762
+
763
+ class QWenLMHeadModel(QWenPreTrainedModel):
764
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"]
765
+ _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"]
766
+
767
+ def __init__(self, config):
768
+ super().__init__(config)
769
+ assert (
770
+ config.bf16 + config.fp16 + config.fp32 <= 1
771
+ ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true"
772
+
773
+ autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0
774
+
775
+ if autoset_precision:
776
+ if SUPPORT_BF16:
777
+ logger.warn(
778
+ "The model is automatically converting to bf16 for faster inference. "
779
+ "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
780
+ )
781
+ config.bf16 = True
782
+ elif SUPPORT_FP16:
783
+ logger.warn(
784
+ "The model is automatically converting to fp16 for faster inference. "
785
+ "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
786
+ )
787
+ config.fp16 = True
788
+ else:
789
+ config.fp32 = True
790
+
791
+ if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16:
792
+ logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".")
793
+ if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16:
794
+ logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster")
795
+ if config.fp32:
796
+ if SUPPORT_BF16:
797
+ logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
798
+ elif SUPPORT_FP16:
799
+ logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
800
+
801
+ self.transformer = QWenModel(config)
802
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
803
+
804
+ if config.bf16:
805
+ self.transformer.bfloat16()
806
+ self.lm_head.bfloat16()
807
+ if config.fp16:
808
+ self.transformer.half()
809
+ self.lm_head.half()
810
+ self.post_init()
811
+
812
+ def get_output_embeddings(self):
813
+ return self.lm_head
814
+
815
+ def set_output_embeddings(self, new_embeddings):
816
+ self.lm_head = new_embeddings
817
+
818
+ def prepare_inputs_for_generation(
819
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
820
+ ):
821
+ token_type_ids = kwargs.get("token_type_ids", None)
822
+ if past_key_values:
823
+ input_ids = input_ids[:, -1].unsqueeze(-1)
824
+ if token_type_ids is not None:
825
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
826
+
827
+ attention_mask = kwargs.get("attention_mask", None)
828
+ position_ids = kwargs.get("position_ids", None)
829
+
830
+ if attention_mask is not None and position_ids is None:
831
+ position_ids = attention_mask.long().cumsum(-1) - 1
832
+ position_ids.masked_fill_(attention_mask == 0, 1)
833
+ if past_key_values:
834
+ position_ids = position_ids[:, -1].unsqueeze(-1)
835
+ else:
836
+ position_ids = None
837
+
838
+ if inputs_embeds is not None and past_key_values is None:
839
+ model_inputs = {"inputs_embeds": inputs_embeds}
840
+ else:
841
+ model_inputs = {"input_ids": input_ids}
842
+
843
+ model_inputs.update(
844
+ {
845
+ "past_key_values": past_key_values,
846
+ "use_cache": kwargs.get("use_cache"),
847
+ "position_ids": position_ids,
848
+ "attention_mask": attention_mask,
849
+ "token_type_ids": token_type_ids,
850
+ }
851
+ )
852
+ return model_inputs
853
+
854
+ def forward(
855
+ self,
856
+ input_ids: Optional[torch.LongTensor] = None,
857
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
858
+ attention_mask: Optional[torch.FloatTensor] = None,
859
+ token_type_ids: Optional[torch.LongTensor] = None,
860
+ position_ids: Optional[torch.LongTensor] = None,
861
+ head_mask: Optional[torch.FloatTensor] = None,
862
+ inputs_embeds: Optional[torch.FloatTensor] = None,
863
+ encoder_hidden_states: Optional[torch.Tensor] = None,
864
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
865
+ labels: Optional[torch.LongTensor] = None,
866
+ use_cache: Optional[bool] = None,
867
+ output_attentions: Optional[bool] = None,
868
+ output_hidden_states: Optional[bool] = None,
869
+ return_dict: Optional[bool] = None,
870
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
871
+
872
+ return_dict = (
873
+ return_dict if return_dict is not None else self.config.use_return_dict
874
+ )
875
+
876
+ transformer_outputs = self.transformer(
877
+ input_ids,
878
+ past_key_values=past_key_values,
879
+ attention_mask=attention_mask,
880
+ token_type_ids=token_type_ids,
881
+ position_ids=position_ids,
882
+ head_mask=head_mask,
883
+ inputs_embeds=inputs_embeds,
884
+ encoder_hidden_states=encoder_hidden_states,
885
+ encoder_attention_mask=encoder_attention_mask,
886
+ use_cache=use_cache,
887
+ output_attentions=output_attentions,
888
+ output_hidden_states=output_hidden_states,
889
+ return_dict=return_dict,
890
+ )
891
+ hidden_states = transformer_outputs[0]
892
+
893
+ lm_logits = self.lm_head(hidden_states)
894
+
895
+ loss = None
896
+ if labels is not None:
897
+ labels = labels.to(lm_logits.device)
898
+ shift_logits = lm_logits[..., :-1, :].contiguous()
899
+ shift_labels = labels[..., 1:].contiguous()
900
+ loss_fct = CrossEntropyLoss()
901
+ loss = loss_fct(
902
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
903
+ )
904
+
905
+ if not return_dict:
906
+ output = (lm_logits,) + transformer_outputs[1:]
907
+ return ((loss,) + output) if loss is not None else output
908
+
909
+ return CausalLMOutputWithPast(
910
+ loss=loss,
911
+ logits=lm_logits,
912
+ past_key_values=transformer_outputs.past_key_values,
913
+ hidden_states=transformer_outputs.hidden_states,
914
+ attentions=transformer_outputs.attentions,
915
+ )
916
+
917
+ @staticmethod
918
+ def _reorder_cache(
919
+ past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
920
+ ) -> Tuple[Tuple[torch.Tensor]]:
921
+
922
+ return tuple(
923
+ tuple(
924
+ past_state.index_select(0, beam_idx.to(past_state.device))
925
+ for past_state in layer_past
926
+ )
927
+ for layer_past in past_key_values
928
+ )
929
+
930
+ def chat(
931
+ self,
932
+ tokenizer: PreTrainedTokenizer,
933
+ query: str,
934
+ history: Optional[HistoryType],
935
+ system: str = "You are a helpful assistant.",
936
+ append_history: bool = True,
937
+ stream: Optional[bool] = _SENTINEL,
938
+ stop_words_ids: Optional[List[List[int]]] = None,
939
+ generation_config: Optional[GenerationConfig] = None,
940
+ **kwargs,
941
+ ) -> Tuple[str, HistoryType]:
942
+ generation_config = generation_config if generation_config is not None else self.generation_config
943
+
944
+ assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT
945
+ assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
946
+ if history is None:
947
+ history = []
948
+ if stop_words_ids is None:
949
+ stop_words_ids = []
950
+
951
+ max_window_size = kwargs.get('max_window_size', None)
952
+ if max_window_size is None:
953
+ max_window_size = generation_config.max_window_size
954
+ raw_text, context_tokens = make_context(
955
+ tokenizer,
956
+ query,
957
+ history=history,
958
+ system=system,
959
+ max_window_size=max_window_size,
960
+ chat_format=generation_config.chat_format,
961
+ )
962
+
963
+ stop_words_ids.extend(get_stop_words_ids(
964
+ generation_config.chat_format, tokenizer
965
+ ))
966
+ input_ids = torch.tensor([context_tokens]).to(self.device)
967
+ outputs = self.generate(
968
+ input_ids,
969
+ stop_words_ids=stop_words_ids,
970
+ return_dict_in_generate=False,
971
+ generation_config=generation_config,
972
+ **kwargs,
973
+ )
974
+
975
+ response = decode_tokens(
976
+ outputs[0],
977
+ tokenizer,
978
+ raw_text_len=len(raw_text),
979
+ context_length=len(context_tokens),
980
+ chat_format=generation_config.chat_format,
981
+ verbose=False,
982
+ errors='replace'
983
+ )
984
+
985
+ if append_history:
986
+ history.append((query, response))
987
+
988
+ return response, history
989
+
990
+ def chat_stream(
991
+ self,
992
+ tokenizer: PreTrainedTokenizer,
993
+ query: str,
994
+ history: Optional[HistoryType],
995
+ system: str = "You are a helpful assistant.",
996
+ stop_words_ids: Optional[List[List[int]]] = None,
997
+ logits_processor: Optional[LogitsProcessorList] = None,
998
+ generation_config: Optional[GenerationConfig] = None,
999
+ **kwargs,
1000
+ ) -> Generator[str, Any, None]:
1001
+ generation_config = generation_config if generation_config is not None else self.generation_config
1002
+ assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
1003
+ if history is None:
1004
+ history = []
1005
+ if stop_words_ids is None:
1006
+ stop_words_ids = []
1007
+
1008
+ max_window_size = kwargs.get('max_window_size', None)
1009
+ if max_window_size is None:
1010
+ max_window_size = generation_config.max_window_size
1011
+ raw_text, context_tokens = make_context(
1012
+ tokenizer,
1013
+ query,
1014
+ history=history,
1015
+ system=system,
1016
+ max_window_size=max_window_size,
1017
+ chat_format=generation_config.chat_format,
1018
+ )
1019
+
1020
+ stop_words_ids.extend(get_stop_words_ids(
1021
+ generation_config.chat_format, tokenizer
1022
+ ))
1023
+ if stop_words_ids is not None:
1024
+ stop_words_logits_processor = StopWordsLogitsProcessor(
1025
+ stop_words_ids=stop_words_ids,
1026
+ eos_token_id=generation_config.eos_token_id,
1027
+ )
1028
+ if logits_processor is None:
1029
+ logits_processor = LogitsProcessorList([stop_words_logits_processor])
1030
+ else:
1031
+ logits_processor.append(stop_words_logits_processor)
1032
+ input_ids = torch.tensor([context_tokens]).to(self.device)
1033
+
1034
+ from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig
1035
+ self.__class__.generate_stream = NewGenerationMixin.generate
1036
+ self.__class__.sample_stream = NewGenerationMixin.sample_stream
1037
+ stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True)
1038
+
1039
+ def stream_generator():
1040
+ outputs = []
1041
+ for token in self.generate_stream(
1042
+ input_ids,
1043
+ return_dict_in_generate=False,
1044
+ generation_config=stream_config,
1045
+ logits_processor=logits_processor,
1046
+ seed=-1,
1047
+ **kwargs):
1048
+ outputs.append(token.item())
1049
+ yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore', keep_image_special=True)
1050
+
1051
+ return stream_generator()
1052
+
1053
+ def generate(
1054
+ self,
1055
+ inputs: Optional[torch.Tensor] = None,
1056
+ generation_config: Optional[GenerationConfig] = None,
1057
+ logits_processor: Optional[LogitsProcessorList] = None,
1058
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
1059
+ prefix_allowed_tokens_fn: Optional[
1060
+ Callable[[int, torch.Tensor], List[int]]
1061
+ ] = None,
1062
+ synced_gpus: Optional[bool] = None,
1063
+ assistant_model: Optional["PreTrainedModel"] = None,
1064
+ streamer: Optional["BaseStreamer"] = None,
1065
+ **kwargs,
1066
+ ) -> Union[GenerateOutput, torch.LongTensor]:
1067
+ generation_config = generation_config if generation_config is not None else self.generation_config
1068
+
1069
+ # Process stop_words_ids.
1070
+ stop_words_ids = kwargs.pop("stop_words_ids", None)
1071
+ if stop_words_ids is None and generation_config is not None:
1072
+ stop_words_ids = getattr(generation_config, "stop_words_ids", None)
1073
+ if stop_words_ids is None:
1074
+ stop_words_ids = getattr(generation_config, "stop_words_ids", None)
1075
+
1076
+ if stop_words_ids is not None:
1077
+ stop_words_logits_processor = StopWordsLogitsProcessor(
1078
+ stop_words_ids=stop_words_ids,
1079
+ eos_token_id=generation_config.eos_token_id,
1080
+ )
1081
+ if logits_processor is None:
1082
+ logits_processor = LogitsProcessorList([stop_words_logits_processor])
1083
+ else:
1084
+ logits_processor.append(stop_words_logits_processor)
1085
+
1086
+ return super().generate(
1087
+ inputs,
1088
+ generation_config=generation_config,
1089
+ logits_processor=logits_processor,
1090
+ stopping_criteria=stopping_criteria,
1091
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
1092
+ synced_gpus=synced_gpus,
1093
+ assistant_model=assistant_model,
1094
+ streamer=streamer,
1095
+ **kwargs,
1096
+ )
1097
+
1098
+
1099
+ class RotaryEmbedding(torch.nn.Module):
1100
+ def __init__(self, dim, base=10000):
1101
+ super().__init__()
1102
+ self.dim = dim
1103
+ self.base = base
1104
+ self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
1105
+ if importlib.util.find_spec("einops") is None:
1106
+ raise RuntimeError("einops is required for Rotary Embedding")
1107
+
1108
+ self._rotary_pos_emb_cache = None
1109
+ self._seq_len_cached = 0
1110
+ self._ntk_alpha_cached = 1.0
1111
+
1112
+ def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0):
1113
+ seqlen = max_seq_len + offset
1114
+ if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached:
1115
+ base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
1116
+ self.inv_freq = 1.0 / (
1117
+ base
1118
+ ** (
1119
+ torch.arange(0, self.dim, 2, device=self.inv_freq.device).float()
1120
+ / self.dim
1121
+ )
1122
+ )
1123
+ self._seq_len_cached = max(2 * seqlen, 16)
1124
+ self._ntk_alpha_cached = ntk_alpha
1125
+ seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device)
1126
+ freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq)
1127
+
1128
+ emb = torch.cat((freqs, freqs), dim=-1)
1129
+ from einops import rearrange
1130
+
1131
+ emb = rearrange(emb, "n d -> 1 n 1 d")
1132
+
1133
+ cos, sin = emb.cos(), emb.sin()
1134
+ self._rotary_pos_emb_cache = [cos, sin]
1135
+
1136
+ def forward(self, max_seq_len, offset=0, ntk_alpha=1.0):
1137
+ self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha)
1138
+ cos, sin = self._rotary_pos_emb_cache
1139
+ return [cos[:, offset : offset + max_seq_len], sin[:, offset : offset + max_seq_len]]
1140
+
1141
+
1142
+ def _rotate_half(x):
1143
+ from einops import rearrange
1144
+
1145
+ x = rearrange(x, "... (j d) -> ... j d", j=2)
1146
+ x1, x2 = x.unbind(dim=-2)
1147
+ return torch.cat((-x2, x1), dim=-1)
1148
+
1149
+
1150
+ def apply_rotary_pos_emb(t, freqs):
1151
+ cos, sin = freqs
1152
+ if apply_rotary_emb_func is not None and t.is_cuda:
1153
+ t_ = t.float()
1154
+ cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
1155
+ sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
1156
+ output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
1157
+ return output
1158
+ else:
1159
+ rot_dim = freqs[0].shape[-1]
1160
+ cos, sin = freqs
1161
+ t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
1162
+ t_ = t_.float()
1163
+ t_pass_ = t_pass_.float()
1164
+ t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1165
+ return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1166
+
1167
+
1168
+ class RMSNorm(torch.nn.Module):
1169
+ def __init__(self, dim: int, eps: float = 1e-6):
1170
+ super().__init__()
1171
+ self.eps = eps
1172
+ self.weight = nn.Parameter(torch.ones(dim))
1173
+
1174
+ def _norm(self, x):
1175
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
1176
+
1177
+ def forward(self, x):
1178
+ if rms_norm is not None and x.is_cuda:
1179
+ return rms_norm(x, self.weight, self.eps)
1180
+ else:
1181
+ output = self._norm(x.float()).type_as(x)
1182
+ return output * self.weight
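
Note: chat() and chat_stream() above are the convenience entry points this file defines for multi-turn use. A usage sketch continuing the loading example from the config.json note (chat_stream() additionally needs the transformers_stream_generator package, which it imports lazily):

# Single turn: returns the full reply plus the updated history.
response, history = model.chat(tokenizer, "Hello!", history=None)
print(response)

# Streaming variant: yields the cumulative decoded reply as tokens arrive.
for partial in model.chat_stream(tokenizer, "Say that again, but shorter.", history=history):
    print(partial)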
weights/model-base/prediction.json ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-base/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e329a383fc79ff638accd05840c2d9b4b0b15b2ecbd10fcce4b1da7f1130b281
+ size 9969772643
weights/model-base/pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da0376c6c4f288f7a689a8c75b1b86af1978941682111d88b4c1f6a42641add7
+ size 9344399640
weights/model-base/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,860 @@
+ {
+   "metadata": {
+     "total_size": 19313870336
+   },
+   "weight_map": {
+     "lm_head.weight": "pytorch_model-00002-of-00002.bin",
+     "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.ln_2.weight": "pytorch_model-00001-of-00002.bin",
100
+ "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
101
+ "transformer.h.19.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
102
+ "transformer.h.19.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
103
+ "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
104
+ "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
105
+ "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
106
+ "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00002.bin",
107
+ "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00002.bin",
108
+ "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
109
+ "transformer.h.2.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
110
+ "transformer.h.2.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
111
+ "transformer.h.20.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
112
+ "transformer.h.20.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
113
+ "transformer.h.20.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
114
+ "transformer.h.20.ln_1.weight": "pytorch_model-00001-of-00002.bin",
115
+ "transformer.h.20.ln_2.weight": "pytorch_model-00001-of-00002.bin",
116
+ "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
117
+ "transformer.h.20.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
118
+ "transformer.h.20.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
119
+ "transformer.h.21.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
120
+ "transformer.h.21.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
121
+ "transformer.h.21.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
122
+ "transformer.h.21.ln_1.weight": "pytorch_model-00001-of-00002.bin",
123
+ "transformer.h.21.ln_2.weight": "pytorch_model-00001-of-00002.bin",
124
+ "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
125
+ "transformer.h.21.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
126
+ "transformer.h.21.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
127
+ "transformer.h.22.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
128
+ "transformer.h.22.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
129
+ "transformer.h.22.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
130
+ "transformer.h.22.ln_1.weight": "pytorch_model-00002-of-00002.bin",
131
+ "transformer.h.22.ln_2.weight": "pytorch_model-00002-of-00002.bin",
132
+ "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
133
+ "transformer.h.22.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
134
+ "transformer.h.22.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
135
+ "transformer.h.23.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
136
+ "transformer.h.23.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
137
+ "transformer.h.23.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
138
+ "transformer.h.23.ln_1.weight": "pytorch_model-00002-of-00002.bin",
139
+ "transformer.h.23.ln_2.weight": "pytorch_model-00002-of-00002.bin",
140
+ "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
141
+ "transformer.h.23.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
142
+ "transformer.h.23.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
143
+ "transformer.h.24.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
144
+ "transformer.h.24.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
145
+ "transformer.h.24.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
146
+ "transformer.h.24.ln_1.weight": "pytorch_model-00002-of-00002.bin",
147
+ "transformer.h.24.ln_2.weight": "pytorch_model-00002-of-00002.bin",
148
+ "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
149
+ "transformer.h.24.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
150
+ "transformer.h.24.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
151
+ "transformer.h.25.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
152
+ "transformer.h.25.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
153
+ "transformer.h.25.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
154
+ "transformer.h.25.ln_1.weight": "pytorch_model-00002-of-00002.bin",
155
+ "transformer.h.25.ln_2.weight": "pytorch_model-00002-of-00002.bin",
156
+ "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
157
+ "transformer.h.25.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
158
+ "transformer.h.25.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
159
+ "transformer.h.26.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
160
+ "transformer.h.26.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
161
+ "transformer.h.26.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
162
+ "transformer.h.26.ln_1.weight": "pytorch_model-00002-of-00002.bin",
163
+ "transformer.h.26.ln_2.weight": "pytorch_model-00002-of-00002.bin",
164
+ "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
165
+ "transformer.h.26.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
166
+ "transformer.h.26.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
167
+ "transformer.h.27.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
168
+ "transformer.h.27.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
169
+ "transformer.h.27.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
170
+ "transformer.h.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
171
+ "transformer.h.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
172
+ "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
173
+ "transformer.h.27.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
174
+ "transformer.h.27.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
175
+ "transformer.h.28.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
176
+ "transformer.h.28.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
177
+ "transformer.h.28.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
178
+ "transformer.h.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
179
+ "transformer.h.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
180
+ "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
181
+ "transformer.h.28.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
182
+ "transformer.h.28.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
183
+ "transformer.h.29.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
184
+ "transformer.h.29.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
185
+ "transformer.h.29.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
186
+ "transformer.h.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
187
+ "transformer.h.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
188
+ "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
189
+ "transformer.h.29.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
190
+ "transformer.h.29.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
191
+ "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
192
+ "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
193
+ "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
194
+ "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00002.bin",
195
+ "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00002.bin",
196
+ "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
197
+ "transformer.h.3.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
198
+ "transformer.h.3.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
199
+ "transformer.h.30.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
200
+ "transformer.h.30.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
201
+ "transformer.h.30.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
202
+ "transformer.h.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
203
+ "transformer.h.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
204
+ "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
205
+ "transformer.h.30.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
206
+ "transformer.h.30.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
207
+ "transformer.h.31.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
208
+ "transformer.h.31.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
209
+ "transformer.h.31.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
210
+ "transformer.h.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
211
+ "transformer.h.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
212
+ "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
213
+ "transformer.h.31.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
214
+ "transformer.h.31.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
215
+ "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
216
+ "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
217
+ "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
218
+ "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00002.bin",
219
+ "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00002.bin",
220
+ "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
221
+ "transformer.h.4.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
222
+ "transformer.h.4.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
223
+ "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
224
+ "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
225
+ "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
226
+ "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00002.bin",
227
+ "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00002.bin",
228
+ "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
229
+ "transformer.h.5.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
230
+ "transformer.h.5.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
231
+ "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
232
+ "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
233
+ "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
234
+ "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00002.bin",
235
+ "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00002.bin",
236
+ "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
237
+ "transformer.h.6.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
238
+ "transformer.h.6.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
239
+ "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
240
+ "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
241
+ "transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
242
+ "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00002.bin",
243
+ "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00002.bin",
244
+ "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "transformer.h.7.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
246
+ "transformer.h.7.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
247
+ "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
248
+ "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
249
+ "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
250
+ "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00002.bin",
251
+ "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00002.bin",
252
+ "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
253
+ "transformer.h.8.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
254
+ "transformer.h.8.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
255
+ "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
256
+ "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
257
+ "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
258
+ "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00002.bin",
259
+ "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00002.bin",
260
+ "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
261
+ "transformer.h.9.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
262
+ "transformer.h.9.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
263
+ "transformer.ln_f.weight": "pytorch_model-00002-of-00002.bin",
264
+ "transformer.visual.attn_pool.attn.in_proj_bias": "pytorch_model-00002-of-00002.bin",
265
+ "transformer.visual.attn_pool.attn.in_proj_weight": "pytorch_model-00002-of-00002.bin",
266
+ "transformer.visual.attn_pool.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
267
+ "transformer.visual.attn_pool.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
268
+ "transformer.visual.attn_pool.kv_proj.weight": "pytorch_model-00002-of-00002.bin",
269
+ "transformer.visual.attn_pool.ln_kv.bias": "pytorch_model-00002-of-00002.bin",
270
+ "transformer.visual.attn_pool.ln_kv.weight": "pytorch_model-00002-of-00002.bin",
271
+ "transformer.visual.attn_pool.ln_q.bias": "pytorch_model-00002-of-00002.bin",
272
+ "transformer.visual.attn_pool.ln_q.weight": "pytorch_model-00002-of-00002.bin",
273
+ "transformer.visual.attn_pool.pos_embed": "pytorch_model-00002-of-00002.bin",
274
+ "transformer.visual.attn_pool.query": "pytorch_model-00002-of-00002.bin",
275
+ "transformer.visual.conv1.weight": "pytorch_model-00002-of-00002.bin",
276
+ "transformer.visual.ln_post.bias": "pytorch_model-00002-of-00002.bin",
277
+ "transformer.visual.ln_post.weight": "pytorch_model-00002-of-00002.bin",
278
+ "transformer.visual.ln_pre.bias": "pytorch_model-00002-of-00002.bin",
279
+ "transformer.visual.ln_pre.weight": "pytorch_model-00002-of-00002.bin",
280
+ "transformer.visual.positional_embedding": "pytorch_model-00002-of-00002.bin",
281
+ "transformer.visual.proj": "pytorch_model-00002-of-00002.bin",
282
+ "transformer.visual.transformer.resblocks.0.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
283
+ "transformer.visual.transformer.resblocks.0.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
284
+ "transformer.visual.transformer.resblocks.0.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
285
+ "transformer.visual.transformer.resblocks.0.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
286
+ "transformer.visual.transformer.resblocks.0.ln_1.bias": "pytorch_model-00002-of-00002.bin",
287
+ "transformer.visual.transformer.resblocks.0.ln_1.weight": "pytorch_model-00002-of-00002.bin",
288
+ "transformer.visual.transformer.resblocks.0.ln_2.bias": "pytorch_model-00002-of-00002.bin",
289
+ "transformer.visual.transformer.resblocks.0.ln_2.weight": "pytorch_model-00002-of-00002.bin",
290
+ "transformer.visual.transformer.resblocks.0.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
291
+ "transformer.visual.transformer.resblocks.0.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
292
+ "transformer.visual.transformer.resblocks.0.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
293
+ "transformer.visual.transformer.resblocks.0.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
294
+ "transformer.visual.transformer.resblocks.1.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
295
+ "transformer.visual.transformer.resblocks.1.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
296
+ "transformer.visual.transformer.resblocks.1.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
297
+ "transformer.visual.transformer.resblocks.1.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
298
+ "transformer.visual.transformer.resblocks.1.ln_1.bias": "pytorch_model-00002-of-00002.bin",
299
+ "transformer.visual.transformer.resblocks.1.ln_1.weight": "pytorch_model-00002-of-00002.bin",
300
+ "transformer.visual.transformer.resblocks.1.ln_2.bias": "pytorch_model-00002-of-00002.bin",
301
+ "transformer.visual.transformer.resblocks.1.ln_2.weight": "pytorch_model-00002-of-00002.bin",
302
+ "transformer.visual.transformer.resblocks.1.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
303
+ "transformer.visual.transformer.resblocks.1.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
304
+ "transformer.visual.transformer.resblocks.1.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
305
+ "transformer.visual.transformer.resblocks.1.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
306
+ "transformer.visual.transformer.resblocks.10.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
307
+ "transformer.visual.transformer.resblocks.10.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
308
+ "transformer.visual.transformer.resblocks.10.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
309
+ "transformer.visual.transformer.resblocks.10.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
310
+ "transformer.visual.transformer.resblocks.10.ln_1.bias": "pytorch_model-00002-of-00002.bin",
311
+ "transformer.visual.transformer.resblocks.10.ln_1.weight": "pytorch_model-00002-of-00002.bin",
312
+ "transformer.visual.transformer.resblocks.10.ln_2.bias": "pytorch_model-00002-of-00002.bin",
313
+ "transformer.visual.transformer.resblocks.10.ln_2.weight": "pytorch_model-00002-of-00002.bin",
314
+ "transformer.visual.transformer.resblocks.10.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
315
+ "transformer.visual.transformer.resblocks.10.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
316
+ "transformer.visual.transformer.resblocks.10.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
317
+ "transformer.visual.transformer.resblocks.10.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
318
+ "transformer.visual.transformer.resblocks.11.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
319
+ "transformer.visual.transformer.resblocks.11.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
320
+ "transformer.visual.transformer.resblocks.11.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
321
+ "transformer.visual.transformer.resblocks.11.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
322
+ "transformer.visual.transformer.resblocks.11.ln_1.bias": "pytorch_model-00002-of-00002.bin",
323
+ "transformer.visual.transformer.resblocks.11.ln_1.weight": "pytorch_model-00002-of-00002.bin",
324
+ "transformer.visual.transformer.resblocks.11.ln_2.bias": "pytorch_model-00002-of-00002.bin",
325
+ "transformer.visual.transformer.resblocks.11.ln_2.weight": "pytorch_model-00002-of-00002.bin",
326
+ "transformer.visual.transformer.resblocks.11.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
327
+ "transformer.visual.transformer.resblocks.11.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
328
+ "transformer.visual.transformer.resblocks.11.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
329
+ "transformer.visual.transformer.resblocks.11.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
330
+ "transformer.visual.transformer.resblocks.12.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
331
+ "transformer.visual.transformer.resblocks.12.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
332
+ "transformer.visual.transformer.resblocks.12.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
333
+ "transformer.visual.transformer.resblocks.12.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
334
+ "transformer.visual.transformer.resblocks.12.ln_1.bias": "pytorch_model-00002-of-00002.bin",
335
+ "transformer.visual.transformer.resblocks.12.ln_1.weight": "pytorch_model-00002-of-00002.bin",
336
+ "transformer.visual.transformer.resblocks.12.ln_2.bias": "pytorch_model-00002-of-00002.bin",
337
+ "transformer.visual.transformer.resblocks.12.ln_2.weight": "pytorch_model-00002-of-00002.bin",
338
+ "transformer.visual.transformer.resblocks.12.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
339
+ "transformer.visual.transformer.resblocks.12.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
340
+ "transformer.visual.transformer.resblocks.12.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
341
+ "transformer.visual.transformer.resblocks.12.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
342
+ "transformer.visual.transformer.resblocks.13.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
343
+ "transformer.visual.transformer.resblocks.13.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
344
+ "transformer.visual.transformer.resblocks.13.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
345
+ "transformer.visual.transformer.resblocks.13.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
346
+ "transformer.visual.transformer.resblocks.13.ln_1.bias": "pytorch_model-00002-of-00002.bin",
347
+ "transformer.visual.transformer.resblocks.13.ln_1.weight": "pytorch_model-00002-of-00002.bin",
348
+ "transformer.visual.transformer.resblocks.13.ln_2.bias": "pytorch_model-00002-of-00002.bin",
349
+ "transformer.visual.transformer.resblocks.13.ln_2.weight": "pytorch_model-00002-of-00002.bin",
350
+ "transformer.visual.transformer.resblocks.13.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
351
+ "transformer.visual.transformer.resblocks.13.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
352
+ "transformer.visual.transformer.resblocks.13.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
353
+ "transformer.visual.transformer.resblocks.13.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
354
+ "transformer.visual.transformer.resblocks.14.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
355
+ "transformer.visual.transformer.resblocks.14.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
356
+ "transformer.visual.transformer.resblocks.14.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
357
+ "transformer.visual.transformer.resblocks.14.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
358
+ "transformer.visual.transformer.resblocks.14.ln_1.bias": "pytorch_model-00002-of-00002.bin",
359
+ "transformer.visual.transformer.resblocks.14.ln_1.weight": "pytorch_model-00002-of-00002.bin",
360
+ "transformer.visual.transformer.resblocks.14.ln_2.bias": "pytorch_model-00002-of-00002.bin",
361
+ "transformer.visual.transformer.resblocks.14.ln_2.weight": "pytorch_model-00002-of-00002.bin",
362
+ "transformer.visual.transformer.resblocks.14.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
363
+ "transformer.visual.transformer.resblocks.14.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
364
+ "transformer.visual.transformer.resblocks.14.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
365
+ "transformer.visual.transformer.resblocks.14.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
366
+ "transformer.visual.transformer.resblocks.15.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
367
+ "transformer.visual.transformer.resblocks.15.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
368
+ "transformer.visual.transformer.resblocks.15.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
369
+ "transformer.visual.transformer.resblocks.15.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
370
+ "transformer.visual.transformer.resblocks.15.ln_1.bias": "pytorch_model-00002-of-00002.bin",
371
+ "transformer.visual.transformer.resblocks.15.ln_1.weight": "pytorch_model-00002-of-00002.bin",
372
+ "transformer.visual.transformer.resblocks.15.ln_2.bias": "pytorch_model-00002-of-00002.bin",
373
+ "transformer.visual.transformer.resblocks.15.ln_2.weight": "pytorch_model-00002-of-00002.bin",
374
+ "transformer.visual.transformer.resblocks.15.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
375
+ "transformer.visual.transformer.resblocks.15.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
376
+ "transformer.visual.transformer.resblocks.15.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
377
+ "transformer.visual.transformer.resblocks.15.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
378
+ "transformer.visual.transformer.resblocks.16.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
379
+ "transformer.visual.transformer.resblocks.16.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
380
+ "transformer.visual.transformer.resblocks.16.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
381
+ "transformer.visual.transformer.resblocks.16.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
382
+ "transformer.visual.transformer.resblocks.16.ln_1.bias": "pytorch_model-00002-of-00002.bin",
383
+ "transformer.visual.transformer.resblocks.16.ln_1.weight": "pytorch_model-00002-of-00002.bin",
384
+ "transformer.visual.transformer.resblocks.16.ln_2.bias": "pytorch_model-00002-of-00002.bin",
385
+ "transformer.visual.transformer.resblocks.16.ln_2.weight": "pytorch_model-00002-of-00002.bin",
386
+ "transformer.visual.transformer.resblocks.16.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
387
+ "transformer.visual.transformer.resblocks.16.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
388
+ "transformer.visual.transformer.resblocks.16.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
389
+ "transformer.visual.transformer.resblocks.16.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
390
+ "transformer.visual.transformer.resblocks.17.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
391
+ "transformer.visual.transformer.resblocks.17.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
392
+ "transformer.visual.transformer.resblocks.17.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
393
+ "transformer.visual.transformer.resblocks.17.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
394
+ "transformer.visual.transformer.resblocks.17.ln_1.bias": "pytorch_model-00002-of-00002.bin",
395
+ "transformer.visual.transformer.resblocks.17.ln_1.weight": "pytorch_model-00002-of-00002.bin",
396
+ "transformer.visual.transformer.resblocks.17.ln_2.bias": "pytorch_model-00002-of-00002.bin",
397
+ "transformer.visual.transformer.resblocks.17.ln_2.weight": "pytorch_model-00002-of-00002.bin",
398
+ "transformer.visual.transformer.resblocks.17.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
399
+ "transformer.visual.transformer.resblocks.17.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
400
+ "transformer.visual.transformer.resblocks.17.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
401
+ "transformer.visual.transformer.resblocks.17.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
402
+ "transformer.visual.transformer.resblocks.18.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
403
+ "transformer.visual.transformer.resblocks.18.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
404
+ "transformer.visual.transformer.resblocks.18.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
405
+ "transformer.visual.transformer.resblocks.18.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
406
+ "transformer.visual.transformer.resblocks.18.ln_1.bias": "pytorch_model-00002-of-00002.bin",
407
+ "transformer.visual.transformer.resblocks.18.ln_1.weight": "pytorch_model-00002-of-00002.bin",
408
+ "transformer.visual.transformer.resblocks.18.ln_2.bias": "pytorch_model-00002-of-00002.bin",
409
+ "transformer.visual.transformer.resblocks.18.ln_2.weight": "pytorch_model-00002-of-00002.bin",
410
+ "transformer.visual.transformer.resblocks.18.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
411
+ "transformer.visual.transformer.resblocks.18.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
412
+ "transformer.visual.transformer.resblocks.18.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
413
+ "transformer.visual.transformer.resblocks.18.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
414
+ "transformer.visual.transformer.resblocks.19.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
415
+ "transformer.visual.transformer.resblocks.19.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
416
+ "transformer.visual.transformer.resblocks.19.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
417
+ "transformer.visual.transformer.resblocks.19.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
418
+ "transformer.visual.transformer.resblocks.19.ln_1.bias": "pytorch_model-00002-of-00002.bin",
419
+ "transformer.visual.transformer.resblocks.19.ln_1.weight": "pytorch_model-00002-of-00002.bin",
420
+ "transformer.visual.transformer.resblocks.19.ln_2.bias": "pytorch_model-00002-of-00002.bin",
421
+ "transformer.visual.transformer.resblocks.19.ln_2.weight": "pytorch_model-00002-of-00002.bin",
422
+ "transformer.visual.transformer.resblocks.19.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
423
+ "transformer.visual.transformer.resblocks.19.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
424
+ "transformer.visual.transformer.resblocks.19.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
425
+ "transformer.visual.transformer.resblocks.19.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
426
+ "transformer.visual.transformer.resblocks.2.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
427
+ "transformer.visual.transformer.resblocks.2.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
428
+ "transformer.visual.transformer.resblocks.2.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
429
+ "transformer.visual.transformer.resblocks.2.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
430
+ "transformer.visual.transformer.resblocks.2.ln_1.bias": "pytorch_model-00002-of-00002.bin",
431
+ "transformer.visual.transformer.resblocks.2.ln_1.weight": "pytorch_model-00002-of-00002.bin",
432
+ "transformer.visual.transformer.resblocks.2.ln_2.bias": "pytorch_model-00002-of-00002.bin",
433
+ "transformer.visual.transformer.resblocks.2.ln_2.weight": "pytorch_model-00002-of-00002.bin",
434
+ "transformer.visual.transformer.resblocks.2.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
435
+ "transformer.visual.transformer.resblocks.2.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
436
+ "transformer.visual.transformer.resblocks.2.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
437
+ "transformer.visual.transformer.resblocks.2.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
438
+ "transformer.visual.transformer.resblocks.20.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
439
+ "transformer.visual.transformer.resblocks.20.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
440
+ "transformer.visual.transformer.resblocks.20.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
441
+ "transformer.visual.transformer.resblocks.20.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
442
+ "transformer.visual.transformer.resblocks.20.ln_1.bias": "pytorch_model-00002-of-00002.bin",
443
+ "transformer.visual.transformer.resblocks.20.ln_1.weight": "pytorch_model-00002-of-00002.bin",
444
+ "transformer.visual.transformer.resblocks.20.ln_2.bias": "pytorch_model-00002-of-00002.bin",
445
+ "transformer.visual.transformer.resblocks.20.ln_2.weight": "pytorch_model-00002-of-00002.bin",
446
+ "transformer.visual.transformer.resblocks.20.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
447
+ "transformer.visual.transformer.resblocks.20.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
448
+ "transformer.visual.transformer.resblocks.20.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
449
+ "transformer.visual.transformer.resblocks.20.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
450
+ "transformer.visual.transformer.resblocks.21.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
451
+ "transformer.visual.transformer.resblocks.21.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
452
+ "transformer.visual.transformer.resblocks.21.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
453
+ "transformer.visual.transformer.resblocks.21.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
454
+ "transformer.visual.transformer.resblocks.21.ln_1.bias": "pytorch_model-00002-of-00002.bin",
455
+ "transformer.visual.transformer.resblocks.21.ln_1.weight": "pytorch_model-00002-of-00002.bin",
456
+ "transformer.visual.transformer.resblocks.21.ln_2.bias": "pytorch_model-00002-of-00002.bin",
457
+ "transformer.visual.transformer.resblocks.21.ln_2.weight": "pytorch_model-00002-of-00002.bin",
458
+ "transformer.visual.transformer.resblocks.21.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
459
+ "transformer.visual.transformer.resblocks.21.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
460
+ "transformer.visual.transformer.resblocks.21.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
461
+ "transformer.visual.transformer.resblocks.21.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
462
+ "transformer.visual.transformer.resblocks.22.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
463
+ "transformer.visual.transformer.resblocks.22.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
464
+ "transformer.visual.transformer.resblocks.22.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
465
+ "transformer.visual.transformer.resblocks.22.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
466
+ "transformer.visual.transformer.resblocks.22.ln_1.bias": "pytorch_model-00002-of-00002.bin",
467
+ "transformer.visual.transformer.resblocks.22.ln_1.weight": "pytorch_model-00002-of-00002.bin",
468
+ "transformer.visual.transformer.resblocks.22.ln_2.bias": "pytorch_model-00002-of-00002.bin",
469
+ "transformer.visual.transformer.resblocks.22.ln_2.weight": "pytorch_model-00002-of-00002.bin",
470
+ "transformer.visual.transformer.resblocks.22.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
471
+ "transformer.visual.transformer.resblocks.22.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
472
+ "transformer.visual.transformer.resblocks.22.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
473
+ "transformer.visual.transformer.resblocks.22.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
474
+ "transformer.visual.transformer.resblocks.23.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
475
+ "transformer.visual.transformer.resblocks.23.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
476
+ "transformer.visual.transformer.resblocks.23.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
477
+ "transformer.visual.transformer.resblocks.23.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
478
+ "transformer.visual.transformer.resblocks.23.ln_1.bias": "pytorch_model-00002-of-00002.bin",
479
+ "transformer.visual.transformer.resblocks.23.ln_1.weight": "pytorch_model-00002-of-00002.bin",
480
+ "transformer.visual.transformer.resblocks.23.ln_2.bias": "pytorch_model-00002-of-00002.bin",
481
+ "transformer.visual.transformer.resblocks.23.ln_2.weight": "pytorch_model-00002-of-00002.bin",
482
+ "transformer.visual.transformer.resblocks.23.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
483
+ "transformer.visual.transformer.resblocks.23.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
484
+ "transformer.visual.transformer.resblocks.23.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
485
+ "transformer.visual.transformer.resblocks.23.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
486
+ "transformer.visual.transformer.resblocks.24.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
487
+ "transformer.visual.transformer.resblocks.24.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
488
+ "transformer.visual.transformer.resblocks.24.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
489
+ "transformer.visual.transformer.resblocks.24.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
490
+ "transformer.visual.transformer.resblocks.24.ln_1.bias": "pytorch_model-00002-of-00002.bin",
491
+ "transformer.visual.transformer.resblocks.24.ln_1.weight": "pytorch_model-00002-of-00002.bin",
492
+ "transformer.visual.transformer.resblocks.24.ln_2.bias": "pytorch_model-00002-of-00002.bin",
493
+ "transformer.visual.transformer.resblocks.24.ln_2.weight": "pytorch_model-00002-of-00002.bin",
494
+ "transformer.visual.transformer.resblocks.24.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
495
+ "transformer.visual.transformer.resblocks.24.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
496
+ "transformer.visual.transformer.resblocks.24.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
497
+ "transformer.visual.transformer.resblocks.24.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
498
+ "transformer.visual.transformer.resblocks.25.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
499
+ "transformer.visual.transformer.resblocks.25.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
500
+ "transformer.visual.transformer.resblocks.25.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
501
+ "transformer.visual.transformer.resblocks.25.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
502
+ "transformer.visual.transformer.resblocks.25.ln_1.bias": "pytorch_model-00002-of-00002.bin",
503
+ "transformer.visual.transformer.resblocks.25.ln_1.weight": "pytorch_model-00002-of-00002.bin",
504
+ "transformer.visual.transformer.resblocks.25.ln_2.bias": "pytorch_model-00002-of-00002.bin",
505
+ "transformer.visual.transformer.resblocks.25.ln_2.weight": "pytorch_model-00002-of-00002.bin",
506
+ "transformer.visual.transformer.resblocks.25.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
507
+ "transformer.visual.transformer.resblocks.25.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
508
+ "transformer.visual.transformer.resblocks.25.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
509
+ "transformer.visual.transformer.resblocks.25.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
510
+ "transformer.visual.transformer.resblocks.26.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
511
+ "transformer.visual.transformer.resblocks.26.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
512
+ "transformer.visual.transformer.resblocks.26.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
513
+ "transformer.visual.transformer.resblocks.26.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
514
+ "transformer.visual.transformer.resblocks.26.ln_1.bias": "pytorch_model-00002-of-00002.bin",
515
+ "transformer.visual.transformer.resblocks.26.ln_1.weight": "pytorch_model-00002-of-00002.bin",
516
+ "transformer.visual.transformer.resblocks.26.ln_2.bias": "pytorch_model-00002-of-00002.bin",
517
+ "transformer.visual.transformer.resblocks.26.ln_2.weight": "pytorch_model-00002-of-00002.bin",
518
+ "transformer.visual.transformer.resblocks.26.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
519
+ "transformer.visual.transformer.resblocks.26.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
520
+ "transformer.visual.transformer.resblocks.26.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
521
+ "transformer.visual.transformer.resblocks.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
522
+ "transformer.visual.transformer.resblocks.27.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
523
+ "transformer.visual.transformer.resblocks.27.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
524
+ "transformer.visual.transformer.resblocks.27.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
525
+ "transformer.visual.transformer.resblocks.27.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
526
+ "transformer.visual.transformer.resblocks.27.ln_1.bias": "pytorch_model-00002-of-00002.bin",
527
+ "transformer.visual.transformer.resblocks.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
528
+ "transformer.visual.transformer.resblocks.27.ln_2.bias": "pytorch_model-00002-of-00002.bin",
529
+ "transformer.visual.transformer.resblocks.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
530
+ "transformer.visual.transformer.resblocks.27.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
531
+ "transformer.visual.transformer.resblocks.27.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
532
+ "transformer.visual.transformer.resblocks.27.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
533
+ "transformer.visual.transformer.resblocks.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
534
+ "transformer.visual.transformer.resblocks.28.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
535
+ "transformer.visual.transformer.resblocks.28.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
536
+ "transformer.visual.transformer.resblocks.28.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
537
+ "transformer.visual.transformer.resblocks.28.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
538
+ "transformer.visual.transformer.resblocks.28.ln_1.bias": "pytorch_model-00002-of-00002.bin",
539
+ "transformer.visual.transformer.resblocks.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
540
+ "transformer.visual.transformer.resblocks.28.ln_2.bias": "pytorch_model-00002-of-00002.bin",
541
+ "transformer.visual.transformer.resblocks.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
542
+ "transformer.visual.transformer.resblocks.28.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
543
+ "transformer.visual.transformer.resblocks.28.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
544
+ "transformer.visual.transformer.resblocks.28.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
545
+ "transformer.visual.transformer.resblocks.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
546
+ "transformer.visual.transformer.resblocks.29.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
547
+ "transformer.visual.transformer.resblocks.29.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
548
+ "transformer.visual.transformer.resblocks.29.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
549
+ "transformer.visual.transformer.resblocks.29.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
550
+ "transformer.visual.transformer.resblocks.29.ln_1.bias": "pytorch_model-00002-of-00002.bin",
551
+ "transformer.visual.transformer.resblocks.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
552
+ "transformer.visual.transformer.resblocks.29.ln_2.bias": "pytorch_model-00002-of-00002.bin",
553
+ "transformer.visual.transformer.resblocks.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
554
+ "transformer.visual.transformer.resblocks.29.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
555
+ "transformer.visual.transformer.resblocks.29.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
556
+ "transformer.visual.transformer.resblocks.29.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
557
+ "transformer.visual.transformer.resblocks.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
558
+ "transformer.visual.transformer.resblocks.3.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
559
+ "transformer.visual.transformer.resblocks.3.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
560
+ "transformer.visual.transformer.resblocks.3.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
561
+ "transformer.visual.transformer.resblocks.3.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
562
+ "transformer.visual.transformer.resblocks.3.ln_1.bias": "pytorch_model-00002-of-00002.bin",
563
+ "transformer.visual.transformer.resblocks.3.ln_1.weight": "pytorch_model-00002-of-00002.bin",
564
+ "transformer.visual.transformer.resblocks.3.ln_2.bias": "pytorch_model-00002-of-00002.bin",
565
+ "transformer.visual.transformer.resblocks.3.ln_2.weight": "pytorch_model-00002-of-00002.bin",
566
+ "transformer.visual.transformer.resblocks.3.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
567
+ "transformer.visual.transformer.resblocks.3.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
568
+ "transformer.visual.transformer.resblocks.3.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
569
+ "transformer.visual.transformer.resblocks.3.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
570
+ "transformer.visual.transformer.resblocks.30.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
571
+ "transformer.visual.transformer.resblocks.30.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
572
+ "transformer.visual.transformer.resblocks.30.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
573
+ "transformer.visual.transformer.resblocks.30.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
574
+ "transformer.visual.transformer.resblocks.30.ln_1.bias": "pytorch_model-00002-of-00002.bin",
575
+ "transformer.visual.transformer.resblocks.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
576
+ "transformer.visual.transformer.resblocks.30.ln_2.bias": "pytorch_model-00002-of-00002.bin",
577
+ "transformer.visual.transformer.resblocks.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
578
+ "transformer.visual.transformer.resblocks.30.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
579
+ "transformer.visual.transformer.resblocks.30.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
580
+ "transformer.visual.transformer.resblocks.30.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
581
+ "transformer.visual.transformer.resblocks.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
582
+ "transformer.visual.transformer.resblocks.31.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
583
+ "transformer.visual.transformer.resblocks.31.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
584
+ "transformer.visual.transformer.resblocks.31.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
585
+ "transformer.visual.transformer.resblocks.31.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
586
+ "transformer.visual.transformer.resblocks.31.ln_1.bias": "pytorch_model-00002-of-00002.bin",
587
+ "transformer.visual.transformer.resblocks.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
588
+ "transformer.visual.transformer.resblocks.31.ln_2.bias": "pytorch_model-00002-of-00002.bin",
589
+ "transformer.visual.transformer.resblocks.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
590
+ "transformer.visual.transformer.resblocks.31.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
591
+ "transformer.visual.transformer.resblocks.31.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
592
+ "transformer.visual.transformer.resblocks.31.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
593
+ "transformer.visual.transformer.resblocks.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
594
+ "transformer.visual.transformer.resblocks.32.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
595
+ "transformer.visual.transformer.resblocks.32.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
596
+ "transformer.visual.transformer.resblocks.32.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
597
+ "transformer.visual.transformer.resblocks.32.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
598
+ "transformer.visual.transformer.resblocks.32.ln_1.bias": "pytorch_model-00002-of-00002.bin",
599
+ "transformer.visual.transformer.resblocks.32.ln_1.weight": "pytorch_model-00002-of-00002.bin",
600
+ "transformer.visual.transformer.resblocks.32.ln_2.bias": "pytorch_model-00002-of-00002.bin",
601
+ "transformer.visual.transformer.resblocks.32.ln_2.weight": "pytorch_model-00002-of-00002.bin",
602
+ "transformer.visual.transformer.resblocks.32.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
603
+ "transformer.visual.transformer.resblocks.32.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
604
+ "transformer.visual.transformer.resblocks.32.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
605
+ "transformer.visual.transformer.resblocks.32.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
606
+ "transformer.visual.transformer.resblocks.33.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
607
+ "transformer.visual.transformer.resblocks.33.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
608
+ "transformer.visual.transformer.resblocks.33.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
609
+ "transformer.visual.transformer.resblocks.33.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
610
+ "transformer.visual.transformer.resblocks.33.ln_1.bias": "pytorch_model-00002-of-00002.bin",
611
+ "transformer.visual.transformer.resblocks.33.ln_1.weight": "pytorch_model-00002-of-00002.bin",
612
+ "transformer.visual.transformer.resblocks.33.ln_2.bias": "pytorch_model-00002-of-00002.bin",
613
+ "transformer.visual.transformer.resblocks.33.ln_2.weight": "pytorch_model-00002-of-00002.bin",
614
+ "transformer.visual.transformer.resblocks.33.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
615
+ "transformer.visual.transformer.resblocks.33.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
616
+ "transformer.visual.transformer.resblocks.33.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
617
+ "transformer.visual.transformer.resblocks.33.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
618
+ "transformer.visual.transformer.resblocks.34.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
619
+ "transformer.visual.transformer.resblocks.34.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
620
+ "transformer.visual.transformer.resblocks.34.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
621
+ "transformer.visual.transformer.resblocks.34.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
622
+ "transformer.visual.transformer.resblocks.34.ln_1.bias": "pytorch_model-00002-of-00002.bin",
623
+ "transformer.visual.transformer.resblocks.34.ln_1.weight": "pytorch_model-00002-of-00002.bin",
624
+ "transformer.visual.transformer.resblocks.34.ln_2.bias": "pytorch_model-00002-of-00002.bin",
625
+ "transformer.visual.transformer.resblocks.34.ln_2.weight": "pytorch_model-00002-of-00002.bin",
626
+ "transformer.visual.transformer.resblocks.34.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
627
+ "transformer.visual.transformer.resblocks.34.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
628
+ "transformer.visual.transformer.resblocks.34.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
629
+ "transformer.visual.transformer.resblocks.34.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
630
+ "transformer.visual.transformer.resblocks.35.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
631
+ "transformer.visual.transformer.resblocks.35.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
632
+ "transformer.visual.transformer.resblocks.35.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
633
+ "transformer.visual.transformer.resblocks.35.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
634
+ "transformer.visual.transformer.resblocks.35.ln_1.bias": "pytorch_model-00002-of-00002.bin",
635
+ "transformer.visual.transformer.resblocks.35.ln_1.weight": "pytorch_model-00002-of-00002.bin",
636
+ "transformer.visual.transformer.resblocks.35.ln_2.bias": "pytorch_model-00002-of-00002.bin",
637
+ "transformer.visual.transformer.resblocks.35.ln_2.weight": "pytorch_model-00002-of-00002.bin",
638
+ "transformer.visual.transformer.resblocks.35.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
639
+ "transformer.visual.transformer.resblocks.35.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
640
+ "transformer.visual.transformer.resblocks.35.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
641
+ "transformer.visual.transformer.resblocks.35.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
642
+ "transformer.visual.transformer.resblocks.36.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
643
+ "transformer.visual.transformer.resblocks.36.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
644
+ "transformer.visual.transformer.resblocks.36.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
645
+ "transformer.visual.transformer.resblocks.36.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
646
+ "transformer.visual.transformer.resblocks.36.ln_1.bias": "pytorch_model-00002-of-00002.bin",
647
+ "transformer.visual.transformer.resblocks.36.ln_1.weight": "pytorch_model-00002-of-00002.bin",
648
+ "transformer.visual.transformer.resblocks.36.ln_2.bias": "pytorch_model-00002-of-00002.bin",
649
+ "transformer.visual.transformer.resblocks.36.ln_2.weight": "pytorch_model-00002-of-00002.bin",
650
+ "transformer.visual.transformer.resblocks.36.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
651
+ "transformer.visual.transformer.resblocks.36.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
652
+ "transformer.visual.transformer.resblocks.36.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
653
+ "transformer.visual.transformer.resblocks.36.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
654
+ "transformer.visual.transformer.resblocks.37.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
655
+ "transformer.visual.transformer.resblocks.37.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
656
+ "transformer.visual.transformer.resblocks.37.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
657
+ "transformer.visual.transformer.resblocks.37.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
658
+ "transformer.visual.transformer.resblocks.37.ln_1.bias": "pytorch_model-00002-of-00002.bin",
659
+ "transformer.visual.transformer.resblocks.37.ln_1.weight": "pytorch_model-00002-of-00002.bin",
660
+ "transformer.visual.transformer.resblocks.37.ln_2.bias": "pytorch_model-00002-of-00002.bin",
661
+ "transformer.visual.transformer.resblocks.37.ln_2.weight": "pytorch_model-00002-of-00002.bin",
662
+ "transformer.visual.transformer.resblocks.37.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
663
+ "transformer.visual.transformer.resblocks.37.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
664
+ "transformer.visual.transformer.resblocks.37.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
665
+ "transformer.visual.transformer.resblocks.37.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
666
+ "transformer.visual.transformer.resblocks.38.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
667
+ "transformer.visual.transformer.resblocks.38.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
668
+ "transformer.visual.transformer.resblocks.38.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
669
+ "transformer.visual.transformer.resblocks.38.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
670
+ "transformer.visual.transformer.resblocks.38.ln_1.bias": "pytorch_model-00002-of-00002.bin",
671
+ "transformer.visual.transformer.resblocks.38.ln_1.weight": "pytorch_model-00002-of-00002.bin",
672
+ "transformer.visual.transformer.resblocks.38.ln_2.bias": "pytorch_model-00002-of-00002.bin",
673
+ "transformer.visual.transformer.resblocks.38.ln_2.weight": "pytorch_model-00002-of-00002.bin",
674
+ "transformer.visual.transformer.resblocks.38.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
675
+ "transformer.visual.transformer.resblocks.38.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
676
+ "transformer.visual.transformer.resblocks.38.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
677
+ "transformer.visual.transformer.resblocks.38.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
678
+ "transformer.visual.transformer.resblocks.39.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
679
+ "transformer.visual.transformer.resblocks.39.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
680
+ "transformer.visual.transformer.resblocks.39.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
681
+ "transformer.visual.transformer.resblocks.39.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
682
+ "transformer.visual.transformer.resblocks.39.ln_1.bias": "pytorch_model-00002-of-00002.bin",
683
+ "transformer.visual.transformer.resblocks.39.ln_1.weight": "pytorch_model-00002-of-00002.bin",
684
+ "transformer.visual.transformer.resblocks.39.ln_2.bias": "pytorch_model-00002-of-00002.bin",
685
+ "transformer.visual.transformer.resblocks.39.ln_2.weight": "pytorch_model-00002-of-00002.bin",
686
+ "transformer.visual.transformer.resblocks.39.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
687
+ "transformer.visual.transformer.resblocks.39.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
688
+ "transformer.visual.transformer.resblocks.39.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
689
+ "transformer.visual.transformer.resblocks.39.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
690
+ "transformer.visual.transformer.resblocks.4.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
691
+ "transformer.visual.transformer.resblocks.4.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
692
+ "transformer.visual.transformer.resblocks.4.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
693
+ "transformer.visual.transformer.resblocks.4.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
694
+ "transformer.visual.transformer.resblocks.4.ln_1.bias": "pytorch_model-00002-of-00002.bin",
695
+ "transformer.visual.transformer.resblocks.4.ln_1.weight": "pytorch_model-00002-of-00002.bin",
696
+ "transformer.visual.transformer.resblocks.4.ln_2.bias": "pytorch_model-00002-of-00002.bin",
697
+ "transformer.visual.transformer.resblocks.4.ln_2.weight": "pytorch_model-00002-of-00002.bin",
698
+ "transformer.visual.transformer.resblocks.4.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
699
+ "transformer.visual.transformer.resblocks.4.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
700
+ "transformer.visual.transformer.resblocks.4.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
701
+ "transformer.visual.transformer.resblocks.4.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
702
+ "transformer.visual.transformer.resblocks.40.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
703
+ "transformer.visual.transformer.resblocks.40.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
704
+ "transformer.visual.transformer.resblocks.40.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
705
+ "transformer.visual.transformer.resblocks.40.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
706
+ "transformer.visual.transformer.resblocks.40.ln_1.bias": "pytorch_model-00002-of-00002.bin",
707
+ "transformer.visual.transformer.resblocks.40.ln_1.weight": "pytorch_model-00002-of-00002.bin",
708
+ "transformer.visual.transformer.resblocks.40.ln_2.bias": "pytorch_model-00002-of-00002.bin",
709
+ "transformer.visual.transformer.resblocks.40.ln_2.weight": "pytorch_model-00002-of-00002.bin",
710
+ "transformer.visual.transformer.resblocks.40.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
711
+ "transformer.visual.transformer.resblocks.40.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
712
+ "transformer.visual.transformer.resblocks.40.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
713
+ "transformer.visual.transformer.resblocks.40.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
714
+ "transformer.visual.transformer.resblocks.41.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
715
+ "transformer.visual.transformer.resblocks.41.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
716
+ "transformer.visual.transformer.resblocks.41.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
717
+ "transformer.visual.transformer.resblocks.41.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
718
+ "transformer.visual.transformer.resblocks.41.ln_1.bias": "pytorch_model-00002-of-00002.bin",
719
+ "transformer.visual.transformer.resblocks.41.ln_1.weight": "pytorch_model-00002-of-00002.bin",
720
+ "transformer.visual.transformer.resblocks.41.ln_2.bias": "pytorch_model-00002-of-00002.bin",
721
+ "transformer.visual.transformer.resblocks.41.ln_2.weight": "pytorch_model-00002-of-00002.bin",
722
+ "transformer.visual.transformer.resblocks.41.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
723
+ "transformer.visual.transformer.resblocks.41.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
724
+ "transformer.visual.transformer.resblocks.41.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
725
+ "transformer.visual.transformer.resblocks.41.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
726
+ "transformer.visual.transformer.resblocks.42.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
727
+ "transformer.visual.transformer.resblocks.42.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
728
+ "transformer.visual.transformer.resblocks.42.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
729
+ "transformer.visual.transformer.resblocks.42.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
730
+ "transformer.visual.transformer.resblocks.42.ln_1.bias": "pytorch_model-00002-of-00002.bin",
731
+ "transformer.visual.transformer.resblocks.42.ln_1.weight": "pytorch_model-00002-of-00002.bin",
732
+ "transformer.visual.transformer.resblocks.42.ln_2.bias": "pytorch_model-00002-of-00002.bin",
733
+ "transformer.visual.transformer.resblocks.42.ln_2.weight": "pytorch_model-00002-of-00002.bin",
734
+ "transformer.visual.transformer.resblocks.42.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
735
+ "transformer.visual.transformer.resblocks.42.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
736
+ "transformer.visual.transformer.resblocks.42.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
737
+ "transformer.visual.transformer.resblocks.42.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
738
+ "transformer.visual.transformer.resblocks.43.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
739
+ "transformer.visual.transformer.resblocks.43.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
740
+ "transformer.visual.transformer.resblocks.43.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
741
+ "transformer.visual.transformer.resblocks.43.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
742
+ "transformer.visual.transformer.resblocks.43.ln_1.bias": "pytorch_model-00002-of-00002.bin",
743
+ "transformer.visual.transformer.resblocks.43.ln_1.weight": "pytorch_model-00002-of-00002.bin",
744
+ "transformer.visual.transformer.resblocks.43.ln_2.bias": "pytorch_model-00002-of-00002.bin",
745
+ "transformer.visual.transformer.resblocks.43.ln_2.weight": "pytorch_model-00002-of-00002.bin",
746
+ "transformer.visual.transformer.resblocks.43.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
747
+ "transformer.visual.transformer.resblocks.43.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
748
+ "transformer.visual.transformer.resblocks.43.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
749
+ "transformer.visual.transformer.resblocks.43.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
750
+ "transformer.visual.transformer.resblocks.44.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
751
+ "transformer.visual.transformer.resblocks.44.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
752
+ "transformer.visual.transformer.resblocks.44.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
753
+ "transformer.visual.transformer.resblocks.44.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
754
+ "transformer.visual.transformer.resblocks.44.ln_1.bias": "pytorch_model-00002-of-00002.bin",
755
+ "transformer.visual.transformer.resblocks.44.ln_1.weight": "pytorch_model-00002-of-00002.bin",
756
+ "transformer.visual.transformer.resblocks.44.ln_2.bias": "pytorch_model-00002-of-00002.bin",
757
+ "transformer.visual.transformer.resblocks.44.ln_2.weight": "pytorch_model-00002-of-00002.bin",
758
+ "transformer.visual.transformer.resblocks.44.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
759
+ "transformer.visual.transformer.resblocks.44.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
760
+ "transformer.visual.transformer.resblocks.44.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
761
+ "transformer.visual.transformer.resblocks.44.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
762
+ "transformer.visual.transformer.resblocks.45.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
763
+ "transformer.visual.transformer.resblocks.45.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
764
+ "transformer.visual.transformer.resblocks.45.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
765
+ "transformer.visual.transformer.resblocks.45.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
766
+ "transformer.visual.transformer.resblocks.45.ln_1.bias": "pytorch_model-00002-of-00002.bin",
767
+ "transformer.visual.transformer.resblocks.45.ln_1.weight": "pytorch_model-00002-of-00002.bin",
768
+ "transformer.visual.transformer.resblocks.45.ln_2.bias": "pytorch_model-00002-of-00002.bin",
769
+ "transformer.visual.transformer.resblocks.45.ln_2.weight": "pytorch_model-00002-of-00002.bin",
770
+ "transformer.visual.transformer.resblocks.45.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
771
+ "transformer.visual.transformer.resblocks.45.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
772
+ "transformer.visual.transformer.resblocks.45.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
773
+ "transformer.visual.transformer.resblocks.45.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
774
+ "transformer.visual.transformer.resblocks.46.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
775
+ "transformer.visual.transformer.resblocks.46.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
776
+ "transformer.visual.transformer.resblocks.46.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
777
+ "transformer.visual.transformer.resblocks.46.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
778
+ "transformer.visual.transformer.resblocks.46.ln_1.bias": "pytorch_model-00002-of-00002.bin",
779
+ "transformer.visual.transformer.resblocks.46.ln_1.weight": "pytorch_model-00002-of-00002.bin",
780
+ "transformer.visual.transformer.resblocks.46.ln_2.bias": "pytorch_model-00002-of-00002.bin",
781
+ "transformer.visual.transformer.resblocks.46.ln_2.weight": "pytorch_model-00002-of-00002.bin",
782
+ "transformer.visual.transformer.resblocks.46.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
783
+ "transformer.visual.transformer.resblocks.46.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
784
+ "transformer.visual.transformer.resblocks.46.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
785
+ "transformer.visual.transformer.resblocks.46.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
786
+ "transformer.visual.transformer.resblocks.47.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
787
+ "transformer.visual.transformer.resblocks.47.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
788
+ "transformer.visual.transformer.resblocks.47.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
789
+ "transformer.visual.transformer.resblocks.47.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
790
+ "transformer.visual.transformer.resblocks.47.ln_1.bias": "pytorch_model-00002-of-00002.bin",
791
+ "transformer.visual.transformer.resblocks.47.ln_1.weight": "pytorch_model-00002-of-00002.bin",
792
+ "transformer.visual.transformer.resblocks.47.ln_2.bias": "pytorch_model-00002-of-00002.bin",
793
+ "transformer.visual.transformer.resblocks.47.ln_2.weight": "pytorch_model-00002-of-00002.bin",
794
+ "transformer.visual.transformer.resblocks.47.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
795
+ "transformer.visual.transformer.resblocks.47.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
796
+ "transformer.visual.transformer.resblocks.47.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
797
+ "transformer.visual.transformer.resblocks.47.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
798
+ "transformer.visual.transformer.resblocks.5.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
799
+ "transformer.visual.transformer.resblocks.5.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
800
+ "transformer.visual.transformer.resblocks.5.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
801
+ "transformer.visual.transformer.resblocks.5.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
802
+ "transformer.visual.transformer.resblocks.5.ln_1.bias": "pytorch_model-00002-of-00002.bin",
803
+ "transformer.visual.transformer.resblocks.5.ln_1.weight": "pytorch_model-00002-of-00002.bin",
804
+ "transformer.visual.transformer.resblocks.5.ln_2.bias": "pytorch_model-00002-of-00002.bin",
805
+ "transformer.visual.transformer.resblocks.5.ln_2.weight": "pytorch_model-00002-of-00002.bin",
806
+ "transformer.visual.transformer.resblocks.5.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
807
+ "transformer.visual.transformer.resblocks.5.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
808
+ "transformer.visual.transformer.resblocks.5.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
809
+ "transformer.visual.transformer.resblocks.5.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
810
+ "transformer.visual.transformer.resblocks.6.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
811
+ "transformer.visual.transformer.resblocks.6.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
812
+ "transformer.visual.transformer.resblocks.6.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
813
+ "transformer.visual.transformer.resblocks.6.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
814
+ "transformer.visual.transformer.resblocks.6.ln_1.bias": "pytorch_model-00002-of-00002.bin",
815
+ "transformer.visual.transformer.resblocks.6.ln_1.weight": "pytorch_model-00002-of-00002.bin",
816
+ "transformer.visual.transformer.resblocks.6.ln_2.bias": "pytorch_model-00002-of-00002.bin",
817
+ "transformer.visual.transformer.resblocks.6.ln_2.weight": "pytorch_model-00002-of-00002.bin",
818
+ "transformer.visual.transformer.resblocks.6.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
819
+ "transformer.visual.transformer.resblocks.6.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
820
+ "transformer.visual.transformer.resblocks.6.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
821
+ "transformer.visual.transformer.resblocks.6.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
822
+ "transformer.visual.transformer.resblocks.7.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
823
+ "transformer.visual.transformer.resblocks.7.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
824
+ "transformer.visual.transformer.resblocks.7.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
825
+ "transformer.visual.transformer.resblocks.7.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
826
+ "transformer.visual.transformer.resblocks.7.ln_1.bias": "pytorch_model-00002-of-00002.bin",
827
+ "transformer.visual.transformer.resblocks.7.ln_1.weight": "pytorch_model-00002-of-00002.bin",
828
+ "transformer.visual.transformer.resblocks.7.ln_2.bias": "pytorch_model-00002-of-00002.bin",
829
+ "transformer.visual.transformer.resblocks.7.ln_2.weight": "pytorch_model-00002-of-00002.bin",
830
+ "transformer.visual.transformer.resblocks.7.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
831
+ "transformer.visual.transformer.resblocks.7.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
832
+ "transformer.visual.transformer.resblocks.7.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
833
+ "transformer.visual.transformer.resblocks.7.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
834
+ "transformer.visual.transformer.resblocks.8.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
835
+ "transformer.visual.transformer.resblocks.8.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
836
+ "transformer.visual.transformer.resblocks.8.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
837
+ "transformer.visual.transformer.resblocks.8.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
838
+ "transformer.visual.transformer.resblocks.8.ln_1.bias": "pytorch_model-00002-of-00002.bin",
839
+ "transformer.visual.transformer.resblocks.8.ln_1.weight": "pytorch_model-00002-of-00002.bin",
840
+ "transformer.visual.transformer.resblocks.8.ln_2.bias": "pytorch_model-00002-of-00002.bin",
841
+ "transformer.visual.transformer.resblocks.8.ln_2.weight": "pytorch_model-00002-of-00002.bin",
842
+ "transformer.visual.transformer.resblocks.8.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
843
+ "transformer.visual.transformer.resblocks.8.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
844
+ "transformer.visual.transformer.resblocks.8.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
845
+ "transformer.visual.transformer.resblocks.8.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
846
+ "transformer.visual.transformer.resblocks.9.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
847
+ "transformer.visual.transformer.resblocks.9.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
848
+ "transformer.visual.transformer.resblocks.9.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
849
+ "transformer.visual.transformer.resblocks.9.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
850
+ "transformer.visual.transformer.resblocks.9.ln_1.bias": "pytorch_model-00002-of-00002.bin",
851
+ "transformer.visual.transformer.resblocks.9.ln_1.weight": "pytorch_model-00002-of-00002.bin",
852
+ "transformer.visual.transformer.resblocks.9.ln_2.bias": "pytorch_model-00002-of-00002.bin",
853
+ "transformer.visual.transformer.resblocks.9.ln_2.weight": "pytorch_model-00002-of-00002.bin",
854
+ "transformer.visual.transformer.resblocks.9.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
855
+ "transformer.visual.transformer.resblocks.9.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
856
+ "transformer.visual.transformer.resblocks.9.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
857
+ "transformer.visual.transformer.resblocks.9.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
858
+ "transformer.wte.weight": "pytorch_model-00001-of-00002.bin"
859
+ }
860
+ }
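The weight map above assigns every parameter name to one of the two shard files. As a minimal sketch (the local path below is an assumption based on this repo layout), the index can be used to locate a single tensor without loading both shards:

import json
import torch

index_path = "weights/model-base/pytorch_model.bin.index.json"  # assumed checkout path
with open(index_path) as f:
    weight_map = json.load(f)["weight_map"]

name = "transformer.visual.transformer.resblocks.47.mlp.c_proj.weight"
shard_file = weight_map[name]  # e.g. "pytorch_model-00002-of-00002.bin"
shard = torch.load(f"weights/model-base/{shard_file}", map_location="cpu")
print(name, tuple(shard[name].shape))

When the directory is passed to transformers' from_pretrained, this resolution happens automatically.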
weights/model-base/qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-base/qwen_generation_utils.py ADDED
@@ -0,0 +1,420 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Generation support."""
7
+
8
+ from typing import Tuple, List, Union, Iterable
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from transformers import PreTrainedTokenizer
14
+ from transformers import logging
15
+ from transformers.generation import LogitsProcessor
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+ # Types.
20
+ HistoryType = List[Tuple[str, str]]
21
+ TokensType = List[int]
22
+ BatchTokensType = List[List[int]]
23
+
24
+
25
+ def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> BatchTokensType:
26
+ for tokens in batch:
27
+ context_length = len(tokens)
28
+ if context_length < seq_length:
29
+ tokens.extend([pad_id] * (seq_length - context_length))
30
+ return batch
31
+
32
+
33
+ def get_ltor_masks_and_position_ids(
34
+ data,
35
+ eod_token,
36
+ reset_position_ids,
37
+ reset_attention_mask,
38
+ eod_mask_loss,
39
+ ):
40
+ """Build masks and position id for left to right model."""
41
+
42
+ # Extract batch size and sequence length.
43
+ micro_batch_size, seq_length = data.size()
44
+
45
+ # Attention mask (lower triangular).
46
+ if reset_attention_mask:
47
+ att_mask_batch = micro_batch_size
48
+ else:
49
+ att_mask_batch = 1
50
+ attention_mask = torch.tril(
51
+ torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
52
+ ).view(att_mask_batch, 1, seq_length, seq_length)
53
+
54
+ # Loss mask.
55
+ loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
56
+ if eod_mask_loss:
57
+ loss_mask[data == eod_token] = 0.0
58
+
59
+ # Position ids.
60
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device)
61
+ position_ids = position_ids.unsqueeze(0).expand_as(data)
62
+ # We need to clone as the ids will be modified based on batch index.
63
+ if reset_position_ids:
64
+ position_ids = position_ids.clone()
65
+
66
+ if reset_position_ids or reset_attention_mask:
67
+ # Loop through the batches:
68
+ for b in range(micro_batch_size):
69
+
70
+ # Find indices where the EOD token is.
71
+ eod_index = position_ids[b, data[b] == eod_token]
72
+ # Detach indices from positions if going to modify positions.
73
+ if reset_position_ids:
74
+ eod_index = eod_index.clone()
75
+
76
+ # Loop through EOD indices:
77
+ prev_index = 0
78
+ for j in range(eod_index.size()[0]):
79
+ i = eod_index[j]
80
+ # Mask attention across the EOD boundary.
81
+ if reset_attention_mask:
82
+ attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0
83
+ # Reset positions.
84
+ if reset_position_ids:
85
+ position_ids[b, (i + 1) :] -= i + 1 - prev_index
86
+ prev_index = i + 1
87
+
88
+ # Convert attention mask to binary:
89
+ attention_mask = attention_mask < 0.5
90
+
91
+ return attention_mask, loss_mask, position_ids
92
+
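A toy illustration of what get_ltor_masks_and_position_ids returns (the EOD id below is arbitrary and chosen only for this example):

import torch

EOD = 0  # hypothetical EOD token id for the toy batch
data = torch.tensor([[5, 6, EOD, 7, 8]])
attn_mask, loss_mask, pos_ids = get_ltor_masks_and_position_ids(
    data, EOD, reset_position_ids=False, reset_attention_mask=False, eod_mask_loss=True
)
# attn_mask: shape (1, 1, 5, 5), boolean; True marks masked (future) positions
# loss_mask: [[1., 1., 0., 1., 1.]]  (EOD positions excluded from the loss)
# pos_ids:   [[0, 1, 2, 3, 4]]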
93
+
94
+ def get_batch(context_tokens: torch.LongTensor, eod_id: int):
95
+ """Generate batch from context tokens."""
96
+ # Keep tokens contiguous on their current device.
97
+ tokens = context_tokens.contiguous().to(context_tokens.device)
98
+ # Get the attention mask and position ids.
99
+ attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
100
+ tokens,
101
+ eod_id,
102
+ reset_position_ids=False,
103
+ reset_attention_mask=False,
104
+ eod_mask_loss=False,
105
+ )
106
+ return tokens, attention_mask, position_ids
107
+
108
+
109
+ def get_stop_words_ids(chat_format, tokenizer):
110
+ if chat_format == "raw":
111
+ stop_words_ids = [tokenizer.encode("Human:"), [tokenizer.eod_id]]
112
+ elif chat_format == "chatml":
113
+ stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
114
+ else:
115
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
116
+ return stop_words_ids
117
+
118
+
119
+ def make_context(
120
+ tokenizer: PreTrainedTokenizer,
121
+ query: str,
122
+ history: List[Tuple[str, str]] = None,
123
+ system: str = "",
124
+ max_window_size: int = 6144,
125
+ chat_format: str = "chatml",
126
+ ):
127
+ if history is None:
128
+ history = []
129
+
130
+ if chat_format == "chatml":
131
+ im_start, im_end = "<|im_start|>", "<|im_end|>"
132
+ im_start_tokens = [tokenizer.im_start_id]
133
+ im_end_tokens = [tokenizer.im_end_id]
134
+ nl_tokens = tokenizer.encode("\n")
135
+
136
+ def _tokenize_str(role, content):
137
+ return f"{role}\n{content}", tokenizer.encode(
138
+ role, allowed_special=set(tokenizer.IMAGE_ST)
139
+ ) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST))
140
+
141
+ system_text, system_tokens_part = _tokenize_str("system", system)
142
+ system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
143
+
144
+ raw_text = ""
145
+ context_tokens = []
146
+
147
+ for turn_query, turn_response in reversed(history):
148
+ query_text, query_tokens_part = _tokenize_str("user", turn_query)
149
+ query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
150
+ if turn_response is not None:
151
+ response_text, response_tokens_part = _tokenize_str(
152
+ "assistant", turn_response
153
+ )
154
+ response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
155
+
156
+ next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
157
+ prev_chat = (
158
+ f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
159
+ )
160
+ else:
161
+ next_context_tokens = nl_tokens + query_tokens + nl_tokens
162
+ prev_chat = f"\n{im_start}{query_text}{im_end}\n"
163
+
164
+ current_context_size = (
165
+ len(system_tokens) + len(next_context_tokens) + len(context_tokens)
166
+ )
167
+ if current_context_size < max_window_size:
168
+ context_tokens = next_context_tokens + context_tokens
169
+ raw_text = prev_chat + raw_text
170
+ else:
171
+ break
172
+
173
+ context_tokens = system_tokens + context_tokens
174
+ raw_text = f"{im_start}{system_text}{im_end}" + raw_text
175
+ context_tokens += (
176
+ nl_tokens
177
+ + im_start_tokens
178
+ + _tokenize_str("user", query)[1]
179
+ + im_end_tokens
180
+ + nl_tokens
181
+ + im_start_tokens
182
+ + tokenizer.encode("assistant")
183
+ + nl_tokens
184
+ )
185
+ raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
186
+
187
+ elif chat_format == "raw":
188
+ raw_text = query
189
+ context_tokens = tokenizer.encode(raw_text)
190
+ else:
191
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
192
+
193
+ return raw_text, context_tokens
194
+
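A hedged usage sketch for make_context; the repo path and prompts are illustrative assumptions:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("weights/model-base", trust_remote_code=True)
raw_text, context_tokens = make_context(
    tokenizer,
    query="Describe the picture.",
    history=[("Hello", "Hi, how can I help?")],
    system="You are a helpful assistant.",
    chat_format="chatml",
)
# raw_text starts with "<|im_start|>system\n..." and ends with "<|im_start|>assistant\n";
# context_tokens holds the matching token ids, ready to feed to model.generate.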
195
+
196
+ def _decode_default(
197
+ tokens: List[int],
198
+ *,
199
+ stop_words: List[str],
200
+ eod_words: List[str],
201
+ tokenizer: PreTrainedTokenizer,
202
+ raw_text_len: int,
203
+ verbose: bool = False,
204
+ return_end_reason: bool = False,
205
+ errors: str='replace',
206
+ ):
207
+ trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
208
+ if verbose:
209
+ print("\nRaw Generate: ", trim_decode_tokens)
210
+
211
+ end_reason = f"Gen length {len(tokens)}"
212
+ for stop_word in stop_words:
213
+ trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
214
+ for eod_word in eod_words:
215
+ if eod_word in trim_decode_tokens:
216
+ end_reason = f"Gen {eod_word!r}"
217
+ trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
218
+ trim_decode_tokens = trim_decode_tokens.strip()
219
+ if verbose:
220
+ print("\nEnd Reason:", end_reason)
221
+ print("\nGenerate: ", trim_decode_tokens)
222
+
223
+ if return_end_reason:
224
+ return trim_decode_tokens, end_reason
225
+ else:
226
+ return trim_decode_tokens
227
+
228
+
229
+ def _decode_chatml(
230
+ tokens: List[int],
231
+ *,
232
+ stop_words: List[str],
233
+ eod_token_ids: List[int],
234
+ tokenizer: PreTrainedTokenizer,
235
+ raw_text_len: int,
236
+ context_length: int,
237
+ verbose: bool = False,
238
+ return_end_reason: bool = False,
239
+ errors: str='replace'
240
+ ):
241
+ end_reason = f"Gen length {len(tokens)}"
242
+ eod_token_idx = context_length
243
+ for eod_token_idx in range(context_length, len(tokens)):
244
+ if tokens[eod_token_idx] in eod_token_ids:
245
+ end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
246
+ break
247
+
248
+ trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:]
249
+ if verbose:
250
+ print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:])
251
+ print("\nRaw Generate:", trim_decode_tokens)
252
+ print("\nEnd Reason:", end_reason)
253
+ for stop_word in stop_words:
254
+ trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
255
+ trim_decode_tokens = trim_decode_tokens.strip()
256
+ if verbose:
257
+ print("\nGenerate:", trim_decode_tokens)
258
+
259
+ if return_end_reason:
260
+ return trim_decode_tokens, end_reason
261
+ else:
262
+ return trim_decode_tokens
263
+
264
+
265
+ def decode_tokens(
266
+ tokens: Union[torch.LongTensor, TokensType],
267
+ tokenizer: PreTrainedTokenizer,
268
+ raw_text_len: int,
269
+ context_length: int,
270
+ chat_format: str,
271
+ verbose: bool = False,
272
+ return_end_reason: bool = False,
273
+ errors: str="replace",
274
+ ) -> str:
275
+ if torch.is_tensor(tokens):
276
+ tokens = tokens.cpu().numpy().tolist()
277
+
278
+ if chat_format == "chatml":
279
+ return _decode_chatml(
280
+ tokens,
281
+ stop_words=[],
282
+ eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
283
+ tokenizer=tokenizer,
284
+ raw_text_len=raw_text_len,
285
+ context_length=context_length,
286
+ verbose=verbose,
287
+ return_end_reason=return_end_reason,
288
+ errors=errors,
289
+ )
290
+ elif chat_format == "raw":
291
+ return _decode_default(
292
+ tokens,
293
+ stop_words=["<|endoftext|>"],
294
+ eod_words=["<|endoftext|>"],
295
+ tokenizer=tokenizer,
296
+ raw_text_len=raw_text_len,
297
+ verbose=verbose,
298
+ return_end_reason=return_end_reason,
299
+ errors=errors,
300
+ )
301
+ else:
302
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
303
+
304
+
305
+ class StopWordsLogitsProcessor(LogitsProcessor):
306
+ """
307
+ :class:`transformers.LogitsProcessor` that forces generation to stop once any of the specified stop sequences appears.
308
+
309
+ Args:
310
+ stop_words_ids (:obj:`List[List[int]]`):
311
+ List of lists of stop-sequence token ids. In order to get the token ids of the words
312
+ that should not appear in the generated text, use :obj:`tokenizer(bad_word,
313
+ add_prefix_space=True).input_ids`.
314
+ eos_token_id (:obj:`int`):
315
+ The id of the `end-of-sequence` token.
316
+ """
317
+
318
+ def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
319
+
320
+ if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0:
321
+ raise ValueError(
322
+ f"`stop_words_ids` has to be a non-empty list, but is {stop_words_ids}."
323
+ )
324
+ if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids):
325
+ raise ValueError(
326
+ f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
327
+ )
328
+ if any(
329
+ any(
330
+ (not isinstance(token_id, (int, np.integer)) or token_id < 0)
331
+ for token_id in stop_word_ids
332
+ )
333
+ for stop_word_ids in stop_words_ids
334
+ ):
335
+ raise ValueError(
336
+ f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
337
+ )
338
+
339
+ self.stop_words_ids = list(
340
+ filter(
341
+ lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids
342
+ )
343
+ )
344
+ self.eos_token_id = eos_token_id
345
+ for stop_token_seq in self.stop_words_ids:
346
+ assert (
347
+ len(stop_token_seq) > 0
348
+ ), "Stop words token sequences {} cannot have an empty list".format(
349
+ stop_words_ids
350
+ )
351
+
352
+ def __call__(
353
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor
354
+ ) -> torch.FloatTensor:
355
+ stopped_samples = self._calc_stopped_samples(input_ids)
356
+ for i, should_stop in enumerate(stopped_samples):
357
+ if should_stop:
358
+ scores[i, self.eos_token_id] = float(2**15)
359
+ return scores
360
+
361
+ def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
362
+ if len(tokens) == 0:
363
+ # an empty stop sequence matches trivially
364
+ return True
365
+ elif len(tokens) > len(prev_tokens):
366
+ # if the stop sequence is longer than prev_tokens, it cannot match
367
+ return False
368
+ elif prev_tokens[-len(tokens) :].tolist() == tokens:
369
+ # if tokens match
370
+ return True
371
+ else:
372
+ return False
373
+
374
+ def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]:
375
+ stopped_samples = []
376
+ for prev_input_ids_slice in prev_input_ids:
377
+ match = False
378
+ for stop_token_seq in self.stop_words_ids:
379
+ if self._tokens_match(prev_input_ids_slice, stop_token_seq):
380
+ # a stop sequence matched; mark this sample as stopped
381
+ match = True
382
+ break
383
+ stopped_samples.append(match)
384
+
385
+ return stopped_samples
386
+
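A sketch of how this processor is typically attached to generation; loading of model, tokenizer and input_ids is assumed, and the stop words come from get_stop_words_ids above:

from transformers import LogitsProcessorList

stop_words_ids = get_stop_words_ids("chatml", tokenizer)  # [[im_end_id], [im_start_id]]
processor = StopWordsLogitsProcessor(stop_words_ids=stop_words_ids, eos_token_id=tokenizer.eod_id)
outputs = model.generate(input_ids, logits_processor=LogitsProcessorList([processor]))
# Once a stop sequence is produced, the processor boosts the EOS logit so decoding ends.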
387
+
388
+ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
389
+ """This function has been mostly taken from huggingface conversational
390
+ ai code at
391
+ https://medium.com/huggingface/how-to-build-a-state-of-the-art-
392
+ conversational-ai-with-transfer-learning-2d818ac26313"""
393
+
394
+ if top_k > 0:
395
+ # Remove all tokens with a probability less than the
396
+ # last token of the top-k
397
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
398
+ logits[indices_to_remove] = filter_value
399
+
400
+ if top_p > 0.0:
401
+ # Sort logits in descending order
402
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
403
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
404
+
405
+ # Remove tokens with cumulative probability above the threshold
406
+ sorted_indices_to_remove = cumulative_probs > top_p
407
+ # Shift the indices to the right to keep also the first token
408
+ # above the threshold
409
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
410
+ sorted_indices_to_remove[..., 0] = 0
411
+ for i in range(sorted_indices.size(0)):
412
+ indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
413
+ logits[i][indices_to_remove] = filter_value
414
+
415
+ return logits
416
+
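A small worked example of top_k_logits on dummy values (chosen arbitrarily):

import torch

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
filtered = top_k_logits(logits.clone(), top_k=2)
# the two largest logits survive, the rest become -inf:
# [[2.0, 1.0, -inf, -inf]]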
417
+
418
+ def switch(val1, val2, boolean):
419
+ boolean = boolean.type_as(val1)
420
+ return (1 - boolean) * val1 + boolean * val2
weights/model-base/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "pad_token": "<|endoftext|>"
3
+ }
weights/model-base/tokenization_qwen.py ADDED
@@ -0,0 +1,598 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Tokenization classes for QWen."""
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ import requests
12
+ import unicodedata
13
+ from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Optional
14
+
15
+ import tiktoken
16
+ import numpy as np
17
+ from PIL import Image
18
+ from PIL import ImageFont
19
+ from PIL import ImageDraw
20
+ from transformers import PreTrainedTokenizer, AddedToken
21
+ from transformers.utils import try_to_load_from_cache
22
+
23
+ import matplotlib.colors as mcolors
24
+ from matplotlib.font_manager import FontProperties
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
30
+ FONT_PATH = try_to_load_from_cache("Qwen/Qwen-VL-Chat", "SimSun.ttf")
31
+ if FONT_PATH is None:
32
+ if not os.path.exists("SimSun.ttf"):
33
+ ttf = requests.get("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/SimSun.ttf")
34
+ open("SimSun.ttf", "wb").write(ttf.content)
35
+ FONT_PATH = "SimSun.ttf"
36
+
37
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
38
+ ENDOFTEXT = "<|endoftext|>"
39
+ IMSTART = "<|im_start|>"
40
+ IMEND = "<|im_end|>"
41
+ # as the default behavior is changed to allow special tokens in
42
+ # regular texts, the surface forms of special tokens need to be
43
+ # as different as possible to minimize the impact
44
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
45
+ SPECIAL_TOKENS = (
46
+ ENDOFTEXT,
47
+ IMSTART,
48
+ IMEND,
49
+ ) + EXTRAS
50
+ IMG_TOKEN_SPAN = 256
51
+
52
+
53
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
54
+ with open(tiktoken_bpe_file, "rb") as f:
55
+ contents = f.read()
56
+ return {
57
+ base64.b64decode(token): int(rank)
58
+ for token, rank in (line.split() for line in contents.splitlines() if line)
59
+ }
60
+
61
+ def _list_find(
62
+ input_list: List[Any],
63
+ candidates: Tuple[Any],
64
+ start: int = 0,
65
+ ):
66
+ for i in range(start, len(input_list)):
67
+ if input_list[i] in candidates:
68
+ return i
69
+ return -1
70
+
71
+ def _replace_closed_tag(
72
+ input_tokens: List[Any],
73
+ start_tags: Union[Any, Tuple[Any]],
74
+ end_tags: Union[Any, Tuple[Any]],
75
+ inclusive_replace_func: Callable,
76
+ exclusive_replace_func: Callable = lambda x: x,
77
+ ):
78
+ if isinstance(start_tags, (str, int)):
79
+ start_tags = (start_tags,)
80
+ if isinstance(end_tags, (str, int)):
81
+ end_tags = (end_tags,)
82
+ assert len(start_tags) == len(end_tags)
83
+
84
+ output_tokens = []
85
+ end = 0
86
+ while True:
87
+ start = _list_find(input_tokens, start_tags, end)
88
+ if start == -1:
89
+ break
90
+ output_tokens.extend(exclusive_replace_func(input_tokens[end : start]))
91
+ tag_idx = start_tags.index(input_tokens[start])
92
+ end = _list_find(input_tokens, (end_tags[tag_idx],), start)
93
+ if end == -1:
94
+ raise ValueError("Unclosed image token")
95
+ output_tokens.extend(inclusive_replace_func(input_tokens[start : end + 1]))
96
+ end += 1
97
+ output_tokens.extend(exclusive_replace_func(input_tokens[end : ]))
98
+ return output_tokens
99
+
100
+ class QWenTokenizer(PreTrainedTokenizer):
101
+ """QWen tokenizer."""
102
+
103
+ vocab_files_names = VOCAB_FILES_NAMES
104
+
105
+ def __init__(
106
+ self,
107
+ vocab_file,
108
+ errors="replace",
109
+ image_start_tag='<img>',
110
+ image_end_tag='</img>',
111
+ image_pad_tag='<imgpad>',
112
+ ref_start_tag='<ref>',
113
+ ref_end_tag='</ref>',
114
+ box_start_tag='<box>',
115
+ box_end_tag='</box>',
116
+ quad_start_tag='<quad>',
117
+ quad_end_tag='</quad>',
118
+ **kwargs,
119
+ ):
120
+ super().__init__(**kwargs)
121
+ self.image_start_tag = image_start_tag
122
+ self.image_end_tag = image_end_tag
123
+ self.image_pad_tag = image_pad_tag
124
+ self.ref_start_tag = ref_start_tag
125
+ self.ref_end_tag = ref_end_tag
126
+ self.box_start_tag = box_start_tag
127
+ self.box_end_tag = box_end_tag
128
+ self.quad_start_tag = quad_start_tag
129
+ self.quad_end_tag = quad_end_tag
130
+ self.IMAGE_ST = (
131
+ ref_start_tag, ref_end_tag,
132
+ box_start_tag, box_end_tag,
133
+ quad_start_tag, quad_end_tag,
134
+ image_start_tag, image_end_tag,
135
+ image_pad_tag
136
+ )
137
+
138
+ self.errors = errors # how to handle errors in decoding
139
+
140
+ self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
141
+ self.special_tokens = {
142
+ token: index
143
+ for index, token in enumerate(
144
+ SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
145
+ )
146
+ }
147
+ self.img_start_id = self.special_tokens[self.image_start_tag]
148
+ self.img_end_id = self.special_tokens[self.image_end_tag]
149
+ self.img_pad_id = self.special_tokens[self.image_pad_tag]
150
+ self.ref_start_id = self.special_tokens[self.ref_start_tag]
151
+ self.ref_end_id = self.special_tokens[self.ref_end_tag]
152
+ self.box_start_id = self.special_tokens[self.box_start_tag]
153
+ self.box_end_id = self.special_tokens[self.box_end_tag]
154
+ self.quad_start_id = self.special_tokens[self.quad_start_tag]
155
+ self.quad_end_id = self.special_tokens[self.quad_end_tag]
156
+ self.image_special_tokens = set([
157
+ self.ref_start_id, self.ref_end_id, self.box_start_id, self.box_end_id,
158
+ self.quad_start_id, self.quad_end_id,
159
+ ])
160
+
161
+ enc = tiktoken.Encoding(
162
+ "Qwen",
163
+ pat_str=PAT_STR,
164
+ mergeable_ranks=self.mergeable_ranks,
165
+ special_tokens=self.special_tokens,
166
+ )
167
+ assert (
168
+ len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
169
+ ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
170
+
171
+ self.decoder = {
172
+ v: k for k, v in self.mergeable_ranks.items()
173
+ } # type: dict[int, bytes|str]
174
+ self.decoder.update({v: k for k, v in self.special_tokens.items()})
175
+
176
+ self.tokenizer = enc # type: tiktoken.Encoding
177
+
178
+ self.eod_id = self.tokenizer.eot_token
179
+ self.im_start_id = self.special_tokens[IMSTART]
180
+ self.im_end_id = self.special_tokens[IMEND]
181
+
182
+ def __getstate__(self):
183
+ # for pickle lovers
184
+ state = self.__dict__.copy()
185
+ del state['tokenizer']
186
+ return state
187
+
188
+ def __setstate__(self, state):
189
+ # tokenizer is not python native; don't pass it; rebuild it
190
+ self.__dict__.update(state)
191
+ enc = tiktoken.Encoding(
192
+ "Qwen",
193
+ pat_str=PAT_STR,
194
+ mergeable_ranks=self.mergeable_ranks,
195
+ special_tokens=self.special_tokens,
196
+ )
197
+ self.tokenizer = enc
198
+
199
+
200
+ def __len__(self) -> int:
201
+ return self.tokenizer.n_vocab
202
+
203
+ def get_vocab(self) -> Dict[bytes, int]:
204
+ return self.mergeable_ranks
205
+
206
+ def convert_tokens_to_ids(
207
+ self, tokens: Union[bytes, str, List[Union[bytes, str]]]
208
+ ) -> List[int]:
209
+ ids = []
210
+ if isinstance(tokens, (str, bytes)):
211
+ if tokens in self.special_tokens:
212
+ return self.special_tokens[tokens]
213
+ else:
214
+ return self.mergeable_ranks.get(tokens)
215
+ for token in tokens:
216
+ if token in self.special_tokens:
217
+ ids.append(self.special_tokens[token])
218
+ else:
219
+ ids.append(self.mergeable_ranks.get(token))
220
+ return ids
221
+
222
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
223
+ if not special_tokens and new_tokens:
224
+ raise ValueError('Adding regular tokens is not supported')
225
+ for token in new_tokens:
226
+ surface_form = token.content if isinstance(token, AddedToken) else token
227
+ if surface_form not in SPECIAL_TOKENS + self.IMAGE_ST:
228
+ raise ValueError('Adding unknown special tokens is not supported')
229
+ return 0
230
+
231
+ def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
232
+ """
233
+ Save only the vocabulary of the tokenizer (vocabulary).
234
+
235
+ Returns:
236
+ `Tuple(str)`: Paths to the files saved.
237
+ """
238
+ file_path = os.path.join(save_directory, "qwen.tiktoken")
239
+ with open(file_path, "w", encoding="utf8") as w:
240
+ for k, v in self.mergeable_ranks.items():
241
+ line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
242
+ w.write(line)
243
+ return (file_path,)
244
+
245
+ def tokenize(
246
+ self,
247
+ text: str,
248
+ allowed_special: Union[Set, str] = "all",
249
+ disallowed_special: Union[Collection, str] = (),
250
+ **kwargs,
251
+ ) -> List[Union[bytes, str]]:
252
+ """
253
+ Converts a string in a sequence of tokens.
254
+
255
+ Args:
256
+ text (`str`):
257
+ The sequence to be encoded.
258
+ allowed_special (`Literal["all"]` or `set`):
259
+ The surface forms of the tokens to be encoded as special tokens in regular texts.
260
+ Default to "all".
261
+ disallowed_special (`Literal["all"]` or `Collection`):
262
+ The surface forms of the tokens that should not be in regular texts and trigger errors.
263
+ Default to an empty tuple.
264
+
265
+ kwargs (additional keyword arguments, *optional*):
266
+ Will be passed to the underlying model specific encode method.
267
+
268
+ Returns:
269
+ `List[bytes|str]`: The list of tokens.
270
+ """
271
+ tokens = []
272
+ text = unicodedata.normalize("NFC", text)
273
+
274
+ # this implementation takes a detour: text -> token id -> token surface forms
275
+ for t in self.tokenizer.encode(
276
+ text, allowed_special=allowed_special, disallowed_special=disallowed_special
277
+ ):
278
+ tokens.append(self.decoder[t])
279
+
280
+ def _encode_imgurl(img_tokens):
281
+ assert img_tokens[0] == self.image_start_tag and img_tokens[-1] == self.image_end_tag
282
+ img_tokens = img_tokens[1:-1]
283
+ img_url = b''.join(img_tokens)
284
+ out_img_tokens = list(map(self.decoder.get, img_url))
285
+ if len(out_img_tokens) > IMG_TOKEN_SPAN:
286
+ raise ValueError("The content in {}..{} is too long".format(
287
+ self.image_start_tag, self.image_end_tag))
288
+ out_img_tokens.extend([self.image_pad_tag] * (IMG_TOKEN_SPAN - len(out_img_tokens)))
289
+ out_img_tokens = [self.image_start_tag] + out_img_tokens + [self.image_end_tag]
290
+ return out_img_tokens
291
+
292
+ return _replace_closed_tag(tokens, self.image_start_tag, self.image_end_tag, _encode_imgurl)
293
+
294
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
295
+ """
296
+ Converts a sequence of tokens in a single string.
297
+ """
298
+ text = ""
299
+ temp = b""
300
+ for t in tokens:
301
+ if isinstance(t, str):
302
+ if temp:
303
+ text += temp.decode("utf-8", errors=self.errors)
304
+ temp = b""
305
+ text += t
306
+ elif isinstance(t, bytes):
307
+ temp += t
308
+ else:
309
+ raise TypeError("token should only be of type types or str")
310
+ if temp:
311
+ text += temp.decode("utf-8", errors=self.errors)
312
+ return text
313
+
314
+ @property
315
+ def vocab_size(self):
316
+ return self.tokenizer.n_vocab
317
+
318
+ def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
319
+ """Converts an id to a token, special tokens included"""
320
+ if index in self.decoder:
321
+ return self.decoder[index]
322
+ raise ValueError("unknown ids")
323
+
324
+ def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
325
+ """Converts a token to an id using the vocab, special tokens included"""
326
+ if token in self.special_tokens:
327
+ return self.special_tokens[token]
328
+ if token in self.mergeable_ranks:
329
+ return self.mergeable_ranks[token]
330
+ raise ValueError("unknown token")
331
+
332
+ def _tokenize(self, text: str, **kwargs):
333
+ """
334
+ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
335
+ vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
336
+
337
+ Do NOT take care of added tokens.
338
+ """
339
+ raise NotImplementedError
340
+
341
+ def _decode(
342
+ self,
343
+ token_ids: Union[int, List[int]],
344
+ skip_special_tokens: bool = False,
345
+ errors: str = None,
346
+ **kwargs,
347
+ ) -> str:
348
+ if isinstance(token_ids, int):
349
+ token_ids = [token_ids]
350
+
351
+ def _decode_imgurl(img_token_ids):
352
+ assert img_token_ids[0] == self.img_start_id and img_token_ids[-1] == self.img_end_id
353
+ img_token_ids = img_token_ids[1:-1]
354
+ img_token_ids = img_token_ids[ : img_token_ids.index(self.img_pad_id)]
355
+ img_url = bytes(img_token_ids).decode('utf-8')
356
+ return [self.img_start_id] + self.tokenizer.encode(img_url) + [self.img_end_id]
357
+
358
+ token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
359
+
360
+ if skip_special_tokens:
361
+ if kwargs.get('keep_image_special', False):
362
+ token_ids = [i for i in token_ids if i < self.eod_id
363
+ or i in self.image_special_tokens]
364
+ else:
365
+ token_ids = [i for i in token_ids if i < self.eod_id]
366
+ return self.tokenizer.decode(token_ids, errors=errors or self.errors)
367
+
368
+ def to_list_format(self, text: str):
369
+ text = unicodedata.normalize("NFC", text)
370
+ token_ids = self.tokenizer.encode(
371
+ text, allowed_special=set(self.IMAGE_ST + (ENDOFTEXT,)))
372
+
373
+ def _encode_vl_info(tokens):
374
+ if len(tokens) == 0:
375
+ return []
376
+ if tokens[0] == self.img_start_id and tokens[-1] == self.img_end_id:
377
+ key = 'image'
378
+ elif tokens[0] == self.ref_start_id and tokens[-1] == self.ref_end_id:
379
+ key = 'ref'
380
+ elif tokens[0] == self.box_start_id and tokens[-1] == self.box_end_id:
381
+ key = 'box'
382
+ elif tokens[0] == self.quad_start_id and tokens[-1] == self.quad_end_id:
383
+ key = 'quad'
384
+ else:
385
+ _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
386
+ return [{'text': b''.join(map(_tobytes, map(self.decoder.get, tokens))).decode('utf-8')}]
387
+ _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
388
+ val = b''.join(map(_tobytes, map(self.decoder.get, tokens[1:-1]))).decode('utf-8')
389
+ return [{key: val}]
390
+
391
+ return _replace_closed_tag(
392
+ token_ids,
393
+ (self.img_start_id, self.ref_start_id, self.box_start_id, self.quad_start_id),
394
+ (self.img_end_id, self.ref_end_id, self.box_end_id, self.quad_end_id),
395
+ _encode_vl_info,
396
+ _encode_vl_info,
397
+ )
398
+
399
+ def from_list_format(self, list_format: List[Dict]):
400
+ text = ''
401
+ num_images = 0
402
+ for ele in list_format:
403
+ if 'image' in ele:
404
+ num_images += 1
405
+ text += f'Picture {num_images}: '
406
+ text += self.image_start_tag + ele['image'] + self.image_end_tag
407
+ text += '\n'
408
+ elif 'text' in ele:
409
+ text += ele['text']
410
+ elif 'box' in ele:
411
+ if 'ref' in ele:
412
+ text += self.ref_start_tag + ele['ref'] + self.ref_end_tag
413
+ for box in ele['box']:
414
+ text += self.box_start_tag + '(%d,%d),(%d,%d)' % (box[0], box[1], box[2], box[3]) + self.box_end_tag
415
+ else:
416
+ raise ValueError("Unsupport element: " + str(ele))
417
+ return text
418
+
419
+ def _fetch_latest_picture(self, response, history):
420
+ if history is None:
421
+ history = []
422
+ _history = history + [(response, None)]
423
+ for q, r in _history[::-1]:
424
+ for ele in self.to_list_format(q)[::-1]:
425
+ if 'image' in ele:
426
+ return ele['image']
427
+ return None
428
+
429
+ def _fetch_all_box_with_ref(self, text):
430
+ list_format = self.to_list_format(text)
431
+ output = []
432
+ for i, ele in enumerate(list_format):
433
+ if 'box' in ele:
434
+ bbox = tuple(map(int, ele['box'].replace('(', '').replace(')', '').split(',')))
435
+ assert len(bbox) == 4
436
+ output.append({'box': bbox})
437
+ if i > 0 and 'ref' in list_format[i-1]:
438
+ output[-1]['ref'] = list_format[i-1]['ref'].strip()
439
+ return output
440
+
441
+ def draw_bbox_on_latest_picture(
442
+ self,
443
+ response,
444
+ history=None,
445
+ ) -> Optional[Image.Image]:
446
+ image = self._fetch_latest_picture(response, history)
447
+ if image is None:
448
+ return None
449
+ if image.startswith("http://") or image.startswith("https://"):
450
+ image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
451
+ h, w = image.height, image.width
452
+ else:
453
+ image = np.asarray(Image.open(image).convert("RGB"))
454
+ h, w = image.shape[0], image.shape[1]
455
+ visualizer = Visualizer(image)
456
+
457
+ boxes = self._fetch_all_box_with_ref(response)
458
+ if not boxes:
459
+ return None
460
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()]) # init color
461
+ for box in boxes:
462
+ if 'ref' in box: # random new color for new refexps
463
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()])
464
+ x1, y1, x2, y2 = box['box']
465
+ x1, y1, x2, y2 = (int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h))
466
+ visualizer.draw_box((x1, y1, x2, y2), alpha=1, edge_color=color)
467
+ if 'ref' in box:
468
+ visualizer.draw_text(box['ref'], (x1, y1), color=color, horizontal_alignment="left")
469
+ return visualizer.output
470
+
471
+
472
+ import colorsys
473
+ import logging
474
+ import math
475
+ import numpy as np
476
+ import matplotlib as mpl
477
+ import matplotlib.colors as mplc
478
+ import matplotlib.figure as mplfigure
479
+ import torch
480
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
481
+ from PIL import Image
482
+ import random
483
+
484
+ logger = logging.getLogger(__name__)
485
+
486
+
487
+ class VisImage:
488
+ def __init__(self, img, scale=1.0):
489
+ self.img = img
490
+ self.scale = scale
491
+ self.width, self.height = img.shape[1], img.shape[0]
492
+ self._setup_figure(img)
493
+
494
+ def _setup_figure(self, img):
495
+ fig = mplfigure.Figure(frameon=False)
496
+ self.dpi = fig.get_dpi()
497
+ # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
498
+ # (https://github.com/matplotlib/matplotlib/issues/15363)
499
+ fig.set_size_inches(
500
+ (self.width * self.scale + 1e-2) / self.dpi,
501
+ (self.height * self.scale + 1e-2) / self.dpi,
502
+ )
503
+ self.canvas = FigureCanvasAgg(fig)
504
+ # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
505
+ ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
506
+ ax.axis("off")
507
+ self.fig = fig
508
+ self.ax = ax
509
+ self.reset_image(img)
510
+
511
+ def reset_image(self, img):
512
+ img = img.astype("uint8")
513
+ self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
514
+
515
+ def save(self, filepath):
516
+ self.fig.savefig(filepath)
517
+
518
+ def get_image(self):
519
+ canvas = self.canvas
520
+ s, (width, height) = canvas.print_to_buffer()
521
+
522
+ buffer = np.frombuffer(s, dtype="uint8")
523
+
524
+ img_rgba = buffer.reshape(height, width, 4)
525
+ rgb, alpha = np.split(img_rgba, [3], axis=2)
526
+ return rgb.astype("uint8")
527
+
528
+
529
+ class Visualizer:
530
+ def __init__(self, img_rgb, metadata=None, scale=1.0):
531
+ self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
532
+ self.font_path = FONT_PATH
533
+ self.output = VisImage(self.img, scale=scale)
534
+ self.cpu_device = torch.device("cpu")
535
+
536
+ # too small texts are useless, therefore clamp to 14
537
+ self._default_font_size = max(
538
+ np.sqrt(self.output.height * self.output.width) // 30, 15 // scale
539
+ )
540
+
541
+ def draw_text(
542
+ self,
543
+ text,
544
+ position,
545
+ *,
546
+ font_size=None,
547
+ color="g",
548
+ horizontal_alignment="center",
549
+ rotation=0,
550
+ ):
551
+ if not font_size:
552
+ font_size = self._default_font_size
553
+
554
+ # since the text background is dark, we don't want the text to be dark
555
+ color = np.maximum(list(mplc.to_rgb(color)), 0.2)
556
+ color[np.argmax(color)] = max(0.8, np.max(color))
557
+
558
+ x, y = position
559
+ self.output.ax.text(
560
+ x,
561
+ y,
562
+ text,
563
+ size=font_size * self.output.scale,
564
+ fontproperties=FontProperties(fname=self.font_path),
565
+ bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
566
+ verticalalignment="top",
567
+ horizontalalignment=horizontal_alignment,
568
+ color=color,
569
+ zorder=10,
570
+ rotation=rotation,
571
+ )
572
+ return self.output
573
+
574
+ def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
575
+
576
+ x0, y0, x1, y1 = box_coord
577
+ width = x1 - x0
578
+ height = y1 - y0
579
+
580
+ linewidth = max(self._default_font_size / 4, 1)
581
+
582
+ self.output.ax.add_patch(
583
+ mpl.patches.Rectangle(
584
+ (x0, y0),
585
+ width,
586
+ height,
587
+ fill=False,
588
+ edgecolor=edge_color,
589
+ linewidth=linewidth * self.output.scale,
590
+ alpha=alpha,
591
+ linestyle=line_style,
592
+ )
593
+ )
594
+ return self.output
595
+
596
+ def get_output(self):
597
+
598
+ return self.output
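The two methods at the top of this hunk are the tail of QWenLMHeadModel's grounding helpers: _fetch_all_box_with_ref pulls ref/box spans out of a response via to_list_format, and draw_bbox_on_latest_picture rescales the 0-1000 normalized coordinates to the pixel size of the most recent image and renders them with the appended Visualizer. A minimal usage sketch follows; it assumes the weights/model-base directory from this commit loads through transformers' trust_remote_code path, and that demo.jpg and the query text are placeholders, not files shipped in this repo.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "weights/model-base"
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
# torch_dtype="auto" follows the bf16 setting in config.json; move to GPU as needed.
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype="auto", trust_remote_code=True).eval()

query = tokenizer.from_list_format([
    {"image": "demo.jpg"},                     # placeholder image path
    {"text": "Locate the dog in the image."},  # grounding-style request
])
response, history = model.chat(tokenizer, query=query, history=None)

# If the response contains <ref>...</ref><box>(x1,y1),(x2,y2)</box> spans, this returns a
# VisImage whose boxes were rescaled from the 0-1000 grid to the image's pixel size.
vis = model.draw_bbox_on_latest_picture(response, history)
if vis is not None:
    vis.save("grounded.jpg")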
weights/model-base/tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_qwen.QWenTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 2048,
+   "padding_side": "right",
+   "tokenizer_class": "QWenTokenizer"
+ }
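Because auto_map points AutoTokenizer at the bundled tokenization_qwen.QWenTokenizer, loading has to go through transformers' remote-code path; the tokenizer class, the 2048-token model_max_length, and right padding all come from this file. A short sketch, assuming transformers 4.32 as pinned in config.json:

from transformers import AutoTokenizer

# auto_map above routes AutoTokenizer to tokenization_qwen.QWenTokenizer,
# so trust_remote_code=True is required when loading from this directory.
tokenizer = AutoTokenizer.from_pretrained("weights/model-base", trust_remote_code=True)
print(tokenizer.__class__.__name__)   # expected: QWenTokenizer
print(tokenizer.model_max_length)     # 2048, as set in tokenizer_config.json
print(tokenizer.padding_side)         # "right"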
weights/model-base/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-base/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cff5574e35478085cb9eb62d37ff481a661783b79e9bb13705e6b8c52ebf52bb
+ size 6840
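This is only a Git LFS pointer; the 6.8 KB binary it references is, by the usual Trainer convention, a pickled TrainingArguments object. A hedged inspection sketch, assuming the LFS object has actually been fetched and that the file follows that convention:

import torch

# training_args.bin is normally the pickled transformers.TrainingArguments saved by Trainer.
# weights_only=False because this is an arbitrary pickled object, not a tensor state_dict.
args = torch.load("weights/model-base/training_args.bin", weights_only=False)
print(type(args).__name__)                        # typically TrainingArguments (or a subclass)
print(args.learning_rate, args.num_train_epochs)  # hyperparameters recorded at training time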
weights/model-base/visual.py ADDED
@@ -0,0 +1,545 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import OrderedDict
7
+ import math
8
+ import requests
9
+ from io import BytesIO
10
+ from functools import partial
11
+ from PIL import Image
12
+ from typing import Callable, Optional, Sequence, Tuple, List
13
+ import numpy as np
14
+
15
+ import torch
16
+ from torch import nn
17
+ from torch.nn import functional as F
18
+ from torch.nn.init import normal_
19
+ from torchvision import transforms
20
+ from torchvision.transforms import InterpolationMode
21
+
22
+
23
+ def get_abs_pos(abs_pos, tgt_size):
24
+ # abs_pos: L, C
25
+ # tgt_size: M
26
+ # return: M, C
27
+ src_size = int(math.sqrt(abs_pos.size(0)))
28
+ tgt_size = int(math.sqrt(tgt_size))
29
+ dtype = abs_pos.dtype
30
+
31
+ if src_size != tgt_size:
32
+ return F.interpolate(
33
+ abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
34
+ size=(tgt_size, tgt_size),
35
+ mode="bicubic",
36
+ align_corners=False,
37
+ ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
38
+ else:
39
+ return abs_pos
40
+
41
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
42
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
43
+ """
44
+ grid_size: int of the grid height and width
45
+ return:
46
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
47
+ """
48
+ grid_h = np.arange(grid_size, dtype=np.float32)
49
+ grid_w = np.arange(grid_size, dtype=np.float32)
50
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
51
+ grid = np.stack(grid, axis=0)
52
+
53
+ grid = grid.reshape([2, 1, grid_size, grid_size])
54
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
55
+ if cls_token:
56
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
57
+ return pos_embed
58
+
59
+
60
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
61
+ assert embed_dim % 2 == 0
62
+
63
+ # use half of dimensions to encode grid_h
64
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
65
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
66
+
67
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
68
+ return emb
69
+
70
+
71
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
72
+ """
73
+ embed_dim: output dimension for each position
74
+ pos: a list of positions to be encoded: size (M,)
75
+ out: (M, D)
76
+ """
77
+ assert embed_dim % 2 == 0
78
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
79
+ omega /= embed_dim / 2.
80
+ omega = 1. / 10000**omega # (D/2,)
81
+
82
+ pos = pos.reshape(-1) # (M,)
83
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
84
+
85
+ emb_sin = np.sin(out) # (M, D/2)
86
+ emb_cos = np.cos(out) # (M, D/2)
87
+
88
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
89
+ return emb
90
+
91
+
92
+ class Resampler(nn.Module):
93
+ """
94
+ A 2D perceiver-resampler network with one cross attention layers by
95
+ (grid_size**2) learnable queries and 2d sincos pos_emb
96
+ Outputs:
97
+ A tensor with the shape of (grid_size**2, embed_dim)
98
+ """
99
+ def __init__(
100
+ self,
101
+ grid_size,
102
+ embed_dim,
103
+ num_heads,
104
+ kv_dim=None,
105
+ norm_layer=nn.LayerNorm
106
+ ):
107
+ super().__init__()
108
+ self.num_queries = grid_size ** 2
109
+ self.embed_dim = embed_dim
110
+ self.num_heads = num_heads
111
+
112
+ self.pos_embed = nn.Parameter(
113
+ torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
114
+ ).requires_grad_(False)
115
+
116
+ self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
117
+ normal_(self.query, std=.02)
118
+
119
+ if kv_dim is not None and kv_dim != embed_dim:
120
+ self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
121
+ else:
122
+ self.kv_proj = nn.Identity()
123
+
124
+ self.attn = nn.MultiheadAttention(embed_dim, num_heads) # only out_proj
125
+ self.ln_q = norm_layer(embed_dim)
126
+ self.ln_kv = norm_layer(embed_dim)
127
+
128
+ # new
129
+ # self.attn1 = nn.MultiheadAttention(embed_dim, num_heads)
130
+ # self.attn1.apply(self._init_weights_zero)
131
+ # self.id_query_fc = nn.Linear(embed_dim, embed_dim, bias=False)
132
+ # self.test_feat_fc = nn.Linear(embed_dim, embed_dim, bias=False)
133
+ # self.id_query_fc.apply(self._init_weights_zero)
134
+ # self.test_feat_fc.apply(self._init_weights_zero)
135
+
136
+ # self.apply(self._init_weights)
137
+
138
+ def _init_weights(self, m):
139
+ if isinstance(m, nn.Linear):
140
+ normal_(m.weight, std=.02)
141
+ if isinstance(m, nn.Linear) and m.bias is not None:
142
+ nn.init.constant_(m.bias, 0)
143
+ elif isinstance(m, nn.LayerNorm):
144
+ nn.init.constant_(m.bias, 0)
145
+ nn.init.constant_(m.weight, 1.0)
146
+
147
+ def _init_weights_zero(self, m):
148
+ if isinstance(m, nn.Linear):
149
+ nn.init.constant_(m.weight, 0)
150
+ if isinstance(m, nn.Linear) and m.bias is not None:
151
+ nn.init.constant_(m.bias, 0)
152
+ elif isinstance(m, nn.LayerNorm):
153
+ nn.init.constant_(m.bias, 0)
154
+ nn.init.constant_(m.weight, 1.0)
155
+
156
+ # def forward(self, x, images_flag, attn_mask=None): # idadapter3
157
+ # pos_embed = get_abs_pos(self.pos_embed, x.size(1))
158
+
159
+ # x = self.kv_proj(x)
160
+ # x = self.ln_kv(x).permute(1, 0, 2)
161
+
162
+ # N = x.shape[1]
163
+ # q = self.ln_q(self.query)
164
+ # q_ = self._repeat(q, N)
165
+
166
+ # out_list = []
167
+ # ind = 0
168
+ # while ind < len(images_flag):
169
+ # if images_flag[ind] == 2:
170
+ # end_ind = ind+1
171
+ # while True:
172
+ # if end_ind >= len(images_flag):
173
+ # break
174
+ # if images_flag[end_ind] == 2:
175
+ # end_ind += 1
176
+ # else:
177
+ # break
178
+ # fake_out = self.attn1(
179
+ # q_[:,ind:end_ind,:] + self.pos_embed.unsqueeze(1),
180
+ # x[:,ind:end_ind,:] + pos_embed.unsqueeze(1),
181
+ # x[:,ind:end_ind,:],
182
+ # attn_mask=attn_mask)[0]
183
+
184
+ # llava_out = self.attn(
185
+ # q_[:,ind:end_ind,:] + self.pos_embed.unsqueeze(1),
186
+ # x[:,ind:end_ind,:] + pos_embed.unsqueeze(1),
187
+ # x[:,ind:end_ind,:],
188
+ # attn_mask=attn_mask)[0]
189
+ # llava_out = llava_out + 0 * fake_out
190
+ # ind = end_ind
191
+ # out_list.append(llava_out)
192
+
193
+ # elif images_flag[ind] == 0:
194
+ # id_end_ind = ind+1
195
+ # while True:
196
+ # if id_end_ind >= len(images_flag):
197
+ # break
198
+ # if images_flag[id_end_ind] == 0:
199
+ # id_end_ind += 1
200
+ # else:
201
+ # break
202
+ # id_out = self.attn(
203
+ # q_[:,ind:id_end_ind,:] + self.pos_embed.unsqueeze(1),
204
+ # x[:,ind:id_end_ind,:] + pos_embed.unsqueeze(1),
205
+ # x[:,ind:id_end_ind,:],
206
+ # attn_mask=attn_mask)[0]
207
+ # ind = id_end_ind
208
+ # out_list.append(id_out)
209
+ # test_end_ind = ind+1
210
+ # while True:
211
+ # if test_end_ind >= len(images_flag):
212
+ # break
213
+ # if images_flag[test_end_ind] == 1:
214
+ # test_end_ind += 1
215
+ # else:
216
+ # break
217
+ # id_query = id_out.detach()
218
+ # id_query = id_query.permute(1,0,2).reshape([-1, id_query.shape[-1]])
219
+ # id_query = self._repeat(id_query, test_end_ind-ind)
220
+ # test_feats = x[:,ind:test_end_ind,:]
221
+ # test_out1 = self.attn1(
222
+ # test_feats + pos_embed.unsqueeze(1),
223
+ # id_query,
224
+ # id_query,
225
+ # attn_mask=attn_mask)[0]
226
+ # test_out1 = test_out1 + test_feats # residual
227
+ # test_out2 = self.attn(
228
+ # q_[:,ind:test_end_ind,:] + self.pos_embed.unsqueeze(1),
229
+ # test_out1 + pos_embed.unsqueeze(1),
230
+ # test_out1,
231
+ # attn_mask=attn_mask)[0]
232
+ # ind = test_end_ind
233
+ # out_list.append(test_out2)
234
+
235
+ # else:
236
+ # print('error')
237
+
238
+ # out = torch.cat(out_list, 1)
239
+ # # except:
240
+ # # fake_out = self.attn1(
241
+ # # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
242
+ # # x + pos_embed.unsqueeze(1),
243
+ # # x,
244
+ # # attn_mask=attn_mask)[0]
245
+ # # out = self.attn(
246
+ # # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
247
+ # # x + pos_embed.unsqueeze(1),
248
+ # # x,
249
+ # # attn_mask=attn_mask)[0]
250
+ # # out = out + 0 * fake_out
251
+ # # t = time.time()
252
+ # # with open('/mnt/bn/automl-aigc/yatai/Qwen-VL/bug2.txt', 'a') as f:
253
+ # # f.write(f"{t}: visual id-former error\n")
254
+ # return out.permute(1,0,2)
255
+
256
+ def forward(self, x, images_flag=None, attn_mask=None): # base
257
+
258
+ pos_embed = get_abs_pos(self.pos_embed, x.size(1))
259
+
260
+ x = self.kv_proj(x)
261
+ x = self.ln_kv(x).permute(1, 0, 2)
262
+
263
+ N = x.shape[1]
264
+ q = self.ln_q(self.query)
265
+
266
+ out = self.attn(
267
+ self._repeat(q, N) + self.pos_embed.unsqueeze(1),
268
+ x + pos_embed.unsqueeze(1),
269
+ x,
270
+ attn_mask=attn_mask)[0]
271
+
272
+ return out.permute(1, 0, 2)
273
+
274
+ def _repeat(self, query, N: int):
275
+ return query.unsqueeze(1).repeat(1, N, 1)
276
+
277
+
278
+ class VisualAttention(nn.Module):
279
+ """self-attention layer class.
280
+
281
+ Self-attention layer takes input with size [s, b, h]
282
+ and returns output of the same size.
283
+ """
284
+
285
+ def __init__(self, embed_dim, num_heads,
286
+ bias=True, kdim=None, vdim=None):
287
+ super(VisualAttention, self).__init__()
288
+ self.embed_dim = embed_dim
289
+ self.kdim = kdim if kdim is not None else embed_dim
290
+ self.vdim = vdim if vdim is not None else embed_dim
291
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
292
+
293
+ self.num_heads = num_heads
294
+
295
+ # Per attention head and per partition values.
296
+ assert embed_dim % num_heads == 0
297
+ self.hidden_size_per_attention_head = embed_dim // num_heads
298
+ self.num_attention_heads_per_partition = num_heads
299
+ self.hidden_size_per_partition = embed_dim
300
+
301
+ # Strided linear layer.
302
+ assert self._qkv_same_embed_dim, 'Only Support SelfAttention Currently'
303
+ self.in_proj = nn.Linear(embed_dim, 3 * embed_dim)
304
+ self.out_proj = nn.Linear(embed_dim, embed_dim)
305
+ self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
306
+
307
+ def forward(self, query, key, value, attn_mask = None):
308
+ # query/key/value: [sq, b, h]
309
+ sq, b, _ = query.size()
310
+
311
+ assert torch.allclose(query, key), 'Only Support Self-Attention Currently'
312
+ sk = sq
313
+ mixed_x_layer = self.in_proj(query)
314
+
315
+ # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
316
+ new_tensor_shape = mixed_x_layer.size()[:-1] + \
317
+ (self.num_attention_heads_per_partition,
318
+ 3 * self.hidden_size_per_attention_head)
319
+ mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
320
+
321
+ # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
322
+ query_layer, key_layer, value_layer = mixed_x_layer.split(
323
+ self.hidden_size_per_attention_head, dim=-1)
324
+
325
+ # [sq, b, np, hn] -> [sq, b * np, hn]
326
+ query_layer = query_layer.view(sq,
327
+ b * self.num_attention_heads_per_partition,
328
+ self.hidden_size_per_attention_head).transpose(0, 1)
329
+ # [sk, b, np, hn] -> [sk, b * np, hn]
330
+ key_layer = key_layer.view(sk,
331
+ b * self.num_attention_heads_per_partition,
332
+ self.hidden_size_per_attention_head).transpose(0, 1)
333
+
334
+ q_scaled = query_layer / self.norm_factor
335
+ if attn_mask is not None:
336
+ attention_probs = torch.baddbmm(attn_mask, q_scaled, key_layer.transpose(-2, -1))
337
+ else:
338
+ attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
339
+ attention_probs = attention_probs.softmax(dim=-1)
340
+
341
+ value_layer = value_layer.view(sk,
342
+ b * self.num_attention_heads_per_partition,
343
+ self.hidden_size_per_attention_head).transpose(0, 1)
344
+
345
+ # matmul: [b * np, sq, hn]
346
+ context_layer = torch.bmm(attention_probs, value_layer)
347
+
348
+ # change view [b, np, sq, hn]
349
+ context_layer = context_layer.view(b,
350
+ self.num_attention_heads_per_partition,
351
+ sq, self.hidden_size_per_attention_head)
352
+
353
+ # [b, np, sq, hn] --> [sq, b, np, hn]
354
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
355
+
356
+ # [sq, b, np, hn] --> [sq, b, hp]
357
+ new_context_layer_shape = context_layer.size()[:-2] + \
358
+ (self.hidden_size_per_partition,)
359
+ context_layer = context_layer.view(*new_context_layer_shape)
360
+
361
+ output = self.out_proj(context_layer)
362
+
363
+ return output
364
+
365
+
366
+ class VisualAttentionBlock(nn.Module):
367
+ def __init__(
368
+ self,
369
+ d_model: int,
370
+ n_head: int,
371
+ mlp_ratio: float = 4.0,
372
+ act_layer: Callable = nn.GELU,
373
+ norm_layer: Callable = nn.LayerNorm,
374
+ is_cross_attention: bool = False,
375
+ ):
376
+ super().__init__()
377
+
378
+ self.ln_1 = norm_layer(d_model)
379
+ if is_cross_attention:
380
+ self.ln_1_kv = norm_layer(d_model)
381
+
382
+ self.ln_2 = norm_layer(d_model)
383
+ mlp_width = int(d_model * mlp_ratio)
384
+ self.attn = VisualAttention(d_model, n_head)
385
+ self.mlp = nn.Sequential(OrderedDict([
386
+ ("c_fc", nn.Linear(d_model, mlp_width)),
387
+ ("gelu", act_layer()),
388
+ ("c_proj", nn.Linear(mlp_width, d_model))
389
+ ]))
390
+
391
+ def attention(
392
+ self,
393
+ q_x: torch.Tensor,
394
+ k_x: Optional[torch.Tensor] = None,
395
+ v_x: Optional[torch.Tensor] = None,
396
+ attn_mask: Optional[torch.Tensor] = None,
397
+ ):
398
+ k_x = k_x if k_x is not None else q_x
399
+ v_x = v_x if v_x is not None else q_x
400
+
401
+ attn_mask = attn_mask.to(q_x.dtype) if attn_mask is not None else None
402
+ return self.attn(q_x, k_x, v_x, attn_mask=attn_mask)
403
+
404
+ def forward(
405
+ self,
406
+ q_x: torch.Tensor,
407
+ k_x: Optional[torch.Tensor] = None,
408
+ v_x: Optional[torch.Tensor] = None,
409
+ attn_mask: Optional[torch.Tensor] = None,
410
+ ):
411
+ k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
412
+ v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
413
+
414
+ x = q_x + self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)
415
+ x = x + self.mlp(self.ln_2(x))
416
+ return x
417
+
418
+
419
+ class TransformerBlock(nn.Module):
420
+ def __init__(
421
+ self,
422
+ width: int,
423
+ layers: int,
424
+ heads: int,
425
+ mlp_ratio: float = 4.0,
426
+ act_layer: Callable = nn.GELU,
427
+ norm_layer: Callable = nn.LayerNorm,
428
+ ):
429
+ super().__init__()
430
+ self.width = width
431
+ self.layers = layers
432
+
433
+ self.resblocks = nn.ModuleList([
434
+ VisualAttentionBlock(
435
+ width, heads, mlp_ratio, act_layer=act_layer, norm_layer=norm_layer)
436
+ for _ in range(layers)
437
+ ])
438
+
439
+ def get_cast_dtype(self) -> torch.dtype:
440
+ return self.resblocks[0].mlp.c_fc.weight.dtype
441
+
442
+ def get_cast_device(self) -> torch.device:
443
+ return self.resblocks[0].mlp.c_fc.weight.device
444
+
445
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
446
+ for r in self.resblocks:
447
+ x = r(x, attn_mask=attn_mask)
448
+ return x
449
+
450
+
451
+ class VisionTransformer(nn.Module):
452
+
453
+ def __init__(
454
+ self,
455
+ image_size: int,
456
+ patch_size: int,
457
+ width: int,
458
+ layers: int,
459
+ heads: int,
460
+ mlp_ratio: float,
461
+ n_queries: int = 256,
462
+ output_dim: int = 512,
463
+ **kwargs
464
+ ):
465
+ super().__init__()
466
+ image_height, image_width = self.image_size = (image_size, image_size)
467
+ patch_height, patch_width = self.patch_size = (patch_size, patch_size)
468
+ self.grid_size = (image_height // patch_height, image_width // patch_width)
469
+ self.output_dim = output_dim
470
+
471
+ mean = (0.48145466, 0.4578275, 0.40821073)
472
+ std = (0.26862954, 0.26130258, 0.27577711)
473
+ self.image_transform = transforms.Compose([
474
+ transforms.Resize(
475
+ (image_size, image_size),
476
+ interpolation=InterpolationMode.BICUBIC
477
+ ),
478
+ transforms.ToTensor(),
479
+ transforms.Normalize(mean=mean, std=std),
480
+ ])
481
+
482
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
483
+
484
+ # class embeddings and positional embeddings
485
+ scale = width ** -0.5
486
+ self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
487
+
488
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
489
+ act_layer = nn.GELU
490
+
491
+ self.ln_pre = norm_layer(width)
492
+ self.transformer = TransformerBlock(
493
+ width,
494
+ layers,
495
+ heads,
496
+ mlp_ratio,
497
+ act_layer=act_layer,
498
+ norm_layer=norm_layer,
499
+ )
500
+
501
+ self.attn_pool = Resampler(
502
+ grid_size=int(math.sqrt(n_queries)),
503
+ embed_dim=output_dim,
504
+ num_heads=output_dim // 128,
505
+ kv_dim=width,
506
+ norm_layer=norm_layer,
507
+ )
508
+ self.ln_post = norm_layer(output_dim)
509
+ self.proj = nn.Parameter((output_dim** -0.5) * torch.randn(output_dim, output_dim))
510
+
511
+ def forward(self, x: torch.Tensor, images_flag):
512
+ x = x.to(
513
+ dtype=self.transformer.get_cast_dtype(),
514
+ device=self.transformer.get_cast_device(),
515
+ )
516
+ # to patches
517
+ x = self.conv1(x) # shape = [*, width, grid, grid]
518
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
519
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
520
+
521
+ x = x + get_abs_pos(self.positional_embedding, x.size(1))
522
+
523
+ x = self.ln_pre(x)
524
+
525
+ x = x.permute(1, 0, 2) # NLD -> LND
526
+ x = self.transformer(x)
527
+ x = x.permute(1, 0, 2) # LND -> NLD
528
+
529
+ x = self.attn_pool(x, images_flag)
530
+ x = self.ln_post(x)
531
+ x = x @ self.proj
532
+
533
+ return x
534
+
535
+ def encode(self, image_paths: List[str], images_flag):
536
+ images = []
537
+ for image_path in image_paths:
538
+ if image_path.startswith("http://") or image_path.startswith("https://"):
539
+ image = Image.open(requests.get(image_path, stream=True).raw)
540
+ else:
541
+ image = Image.open(image_path)
542
+ image = image.convert("RGB")
543
+ images.append(self.image_transform(image))
544
+ images = torch.stack(images, dim=0)
545
+ return self(images, images_flag)
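visual.py defines the full vision tower: a ViT (conv1 patchification plus TransformerBlock) whose 2D sine-cosine positional embeddings are resized by get_abs_pos, followed by the Resampler cross-attention pool that maps the patch tokens onto n_queries learned queries of width output_dim. Below is a shape-check sketch; it deliberately uses scaled-down toy hyperparameters (width 64, 2 layers) so it runs quickly with random weights, whereas the shipped checkpoint uses the config.json values (width 1664, 48 layers, 448x448 images, 256 queries, output_dim 4096). The sys.path line is just one convenient way to make the module importable from the checkpoint folder.

import sys
import torch

sys.path.insert(0, "weights/model-base")   # make visual.py importable from the checkpoint dir
from visual import VisionTransformer

# Toy geometry: 56x56 image / 14px patches -> 4x4 = 16 patch tokens, pooled to 16 queries.
vit = VisionTransformer(
    image_size=56, patch_size=14, width=64, layers=2, heads=4,
    mlp_ratio=4.0, n_queries=16, output_dim=128,
).eval()

with torch.no_grad():
    feats = vit(torch.randn(1, 3, 56, 56), images_flag=None)  # the base Resampler ignores images_flag
print(feats.shape)  # torch.Size([1, 16, 128]): n_queries tokens of width output_dim per image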
weights/model-base/zero_to_fp32.py ADDED
@@ -0,0 +1,587 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
25
+ # DeepSpeed data structures it has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
+ @dataclass
33
+ class zero_model_state:
34
+ buffers: dict()
35
+ param_shapes: dict()
36
+ shared_params: list
37
+ ds_version: int
38
+ frozen_param_shapes: dict()
39
+ frozen_param_fragments: dict()
40
+
41
+
42
+ debug = 0
43
+
44
+ # load to cpu
45
+ device = torch.device('cpu')
46
+
47
+
48
+ def atoi(text):
49
+ return int(text) if text.isdigit() else text
50
+
51
+
52
+ def natural_keys(text):
53
+ '''
54
+ alist.sort(key=natural_keys) sorts in human order
55
+ http://nedbatchelder.com/blog/200712/human_sorting.html
56
+ (See Toothy's implementation in the comments)
57
+ '''
58
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
59
+
60
+
61
+ def get_model_state_file(checkpoint_dir, zero_stage):
62
+ if not os.path.isdir(checkpoint_dir):
63
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
64
+
65
+ # there should be only one file
66
+ if zero_stage <= 2:
67
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
68
+ elif zero_stage == 3:
69
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
70
+
71
+ if not os.path.exists(file):
72
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
73
+
74
+ return file
75
+
76
+
77
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
78
+ # XXX: need to test that this simple glob rule works for multi-node setup too
79
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
80
+
81
+ if len(ckpt_files) == 0:
82
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
83
+
84
+ return ckpt_files
85
+
86
+
87
+ def get_optim_files(checkpoint_dir):
88
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
89
+
90
+
91
+ def get_model_state_files(checkpoint_dir):
92
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
93
+
94
+
95
+ def parse_model_states(files):
96
+ zero_model_states = []
97
+ for file in files:
98
+ state_dict = torch.load(file, map_location=device)
99
+
100
+ if BUFFER_NAMES not in state_dict:
101
+ raise ValueError(f"{file} is not a model state checkpoint")
102
+ buffer_names = state_dict[BUFFER_NAMES]
103
+ if debug:
104
+ print("Found buffers:", buffer_names)
105
+
106
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
107
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
108
+ param_shapes = state_dict[PARAM_SHAPES]
109
+
110
+ # collect parameters that are included in param_shapes
111
+ param_names = []
112
+ for s in param_shapes:
113
+ for name in s.keys():
114
+ param_names.append(name)
115
+
116
+ # update with frozen parameters
117
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
118
+ if frozen_param_shapes is not None:
119
+ if debug:
120
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
121
+ param_names += list(frozen_param_shapes.keys())
122
+
123
+ # handle shared params
124
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
125
+
126
+ ds_version = state_dict.get(DS_VERSION, None)
127
+
128
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
129
+
130
+ z_model_state = zero_model_state(buffers=buffers,
131
+ param_shapes=param_shapes,
132
+ shared_params=shared_params,
133
+ ds_version=ds_version,
134
+ frozen_param_shapes=frozen_param_shapes,
135
+ frozen_param_fragments=frozen_param_fragments)
136
+ zero_model_states.append(z_model_state)
137
+
138
+ return zero_model_states
139
+
140
+
141
+ def parse_optim_states(files, ds_checkpoint_dir):
142
+
143
+ total_files = len(files)
144
+ state_dicts = []
145
+ for f in files:
146
+ state_dict = torch.load(f, map_location=device)
147
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
148
+ # and also handle the case where it was already removed by another helper script
149
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
150
+ state_dicts.append(state_dict)
151
+
152
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
153
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
154
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
155
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
156
+
157
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
158
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
159
+ # use the max of the partition_count to get the dp world_size.
160
+
161
+ if type(world_size) is list:
162
+ world_size = max(world_size)
163
+
164
+ if world_size != total_files:
165
+ raise ValueError(
166
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
167
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
168
+ )
169
+
170
+ # the groups are named differently in each stage
171
+ if zero_stage <= 2:
172
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
173
+ elif zero_stage == 3:
174
+ fp32_groups_key = FP32_FLAT_GROUPS
175
+ else:
176
+ raise ValueError(f"unknown zero stage {zero_stage}")
177
+
178
+ if zero_stage <= 2:
179
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
180
+ elif zero_stage == 3:
181
+ # if there is more than one param group, there will be multiple flattened tensors - one
182
+ # flattened tensor per group - for simplicity merge them into a single tensor
183
+ #
184
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
185
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
186
+
187
+ fp32_flat_groups = [
188
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
189
+ ]
190
+
191
+ return zero_stage, world_size, fp32_flat_groups
192
+
193
+
194
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
195
+ """
196
+ Returns fp32 state_dict reconstructed from ds checkpoint
197
+
198
+ Args:
199
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
200
+
201
+ """
202
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
203
+
204
+ optim_files = get_optim_files(ds_checkpoint_dir)
205
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
206
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
207
+
208
+ model_files = get_model_state_files(ds_checkpoint_dir)
209
+
210
+ zero_model_states = parse_model_states(model_files)
211
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
212
+
213
+ if zero_stage <= 2:
214
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
215
+ elif zero_stage == 3:
216
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
217
+
218
+
219
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
220
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
221
+ return
222
+
223
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
224
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
225
+
226
+ if debug:
227
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
228
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
229
+
230
+ wanted_params = len(frozen_param_shapes)
231
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
232
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
233
+ print(f'Frozen params: Have {avail_numel} numels to process.')
234
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
235
+
236
+ total_params = 0
237
+ total_numel = 0
238
+ for name, shape in frozen_param_shapes.items():
239
+ total_params += 1
240
+ unpartitioned_numel = shape.numel()
241
+ total_numel += unpartitioned_numel
242
+
243
+ state_dict[name] = frozen_param_fragments[name]
244
+
245
+ if debug:
246
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
247
+
248
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
249
+
250
+
251
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
252
+ param_shapes = zero_model_states[0].param_shapes
253
+
254
+ # Reconstruction protocol:
255
+ #
256
+ # XXX: document this
257
+
258
+ if debug:
259
+ for i in range(world_size):
260
+ for j in range(len(fp32_flat_groups[0])):
261
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
262
+
263
+ # XXX: memory usage doubles here (zero2)
264
+ num_param_groups = len(fp32_flat_groups[0])
265
+ merged_single_partition_of_fp32_groups = []
266
+ for i in range(num_param_groups):
267
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
268
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
269
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
270
+ avail_numel = sum(
271
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
272
+
273
+ if debug:
274
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
275
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
276
+ # not asserting if there is a mismatch due to possible padding
277
+ print(f"Have {avail_numel} numels to process.")
278
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
279
+
280
+ # params
281
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
282
+ # out-of-core computing solution
283
+ total_numel = 0
284
+ total_params = 0
285
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
286
+ offset = 0
287
+ avail_numel = full_single_fp32_vector.numel()
288
+ for name, shape in shapes.items():
289
+
290
+ unpartitioned_numel = shape.numel()
291
+ total_numel += unpartitioned_numel
292
+ total_params += 1
293
+
294
+ if debug:
295
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
296
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
297
+ offset += unpartitioned_numel
298
+
299
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
300
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
301
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
302
+ # live optimizer object, so we are checking that the numbers are within the right range
303
+ align_to = 2 * world_size
304
+
305
+ def zero2_align(x):
306
+ return align_to * math.ceil(x / align_to)
307
+
308
+ if debug:
309
+ print(f"original offset={offset}, avail_numel={avail_numel}")
310
+
311
+ offset = zero2_align(offset)
312
+ avail_numel = zero2_align(avail_numel)
313
+
314
+ if debug:
315
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
316
+
317
+ # Sanity check
318
+ if offset != avail_numel:
319
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
320
+
321
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
322
+
323
+
324
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
325
+ state_dict = OrderedDict()
326
+
327
+ # buffers
328
+ buffers = zero_model_states[0].buffers
329
+ state_dict.update(buffers)
330
+ if debug:
331
+ print(f"added {len(buffers)} buffers")
332
+
333
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
334
+
335
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
336
+
337
+ # recover shared parameters
338
+ for pair in zero_model_states[0].shared_params:
339
+ if pair[1] in state_dict:
340
+ state_dict[pair[0]] = state_dict[pair[1]]
341
+
342
+ return state_dict
343
+
344
+
345
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
346
+ remainder = unpartitioned_numel % world_size
347
+ padding_numel = (world_size - remainder) if remainder else 0
348
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
349
+ return partitioned_numel, padding_numel
350
+
351
+
352
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
353
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
354
+ return
355
+
356
+ if debug:
357
+ for i in range(world_size):
358
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
359
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
360
+
361
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
362
+ wanted_params = len(frozen_param_shapes)
363
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
364
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
365
+ print(f'Frozen params: Have {avail_numel} numels to process.')
366
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
367
+
368
+ total_params = 0
369
+ total_numel = 0
370
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
371
+ total_params += 1
372
+ unpartitioned_numel = shape.numel()
373
+ total_numel += unpartitioned_numel
374
+
375
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
376
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
377
+
378
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
379
+
380
+ if debug:
381
+ print(
382
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
383
+ )
384
+
385
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
386
+
387
+
388
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
389
+ param_shapes = zero_model_states[0].param_shapes
390
+ avail_numel = fp32_flat_groups[0].numel() * world_size
391
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
392
+ # param, re-consolidating each param, while dealing with padding if any
393
+
394
+ # merge list of dicts, preserving order
395
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
396
+
397
+ if debug:
398
+ for i in range(world_size):
399
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
400
+
401
+ wanted_params = len(param_shapes)
402
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
403
+ # not asserting if there is a mismatch due to possible padding
404
+ avail_numel = fp32_flat_groups[0].numel() * world_size
405
+ print(f"Trainable params: Have {avail_numel} numels to process.")
406
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
407
+
408
+ # params
409
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
410
+ # out-of-core computing solution
411
+ offset = 0
412
+ total_numel = 0
413
+ total_params = 0
414
+ for name, shape in param_shapes.items():
415
+
416
+ unpartitioned_numel = shape.numel()
417
+ total_numel += unpartitioned_numel
418
+ total_params += 1
419
+
420
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
421
+
422
+ if debug:
423
+ print(
424
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
425
+ )
426
+
427
+ # XXX: memory usage doubles here
428
+ state_dict[name] = torch.cat(
429
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
430
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
431
+ offset += partitioned_numel
432
+
433
+ offset *= world_size
434
+
435
+ # Sanity check
436
+ if offset != avail_numel:
437
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
438
+
439
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
440
+
441
+
442
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
443
+ state_dict = OrderedDict()
444
+
445
+ # buffers
446
+ buffers = zero_model_states[0].buffers
447
+ state_dict.update(buffers)
448
+ if debug:
449
+ print(f"added {len(buffers)} buffers")
450
+
451
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
452
+
453
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
454
+
455
+ # recover shared parameters
456
+ for pair in zero_model_states[0].shared_params:
457
+ if pair[1] in state_dict:
458
+ state_dict[pair[0]] = state_dict[pair[1]]
459
+
460
+ return state_dict
461
+
462
+
463
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
464
+ """
465
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
466
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
467
+ via a model hub.
468
+
469
+ Args:
470
+ - ``checkpoint_dir``: path to the desired checkpoint folder
471
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
472
+
473
+ Returns:
474
+ - pytorch ``state_dict``
475
+
476
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
477
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
478
+ the checkpoint.
479
+
480
+ A typical usage might be ::
481
+
482
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
483
+ # do the training and checkpoint saving
484
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
485
+ model = model.cpu() # move to cpu
486
+ model.load_state_dict(state_dict)
487
+ # submit to model hub or save the model to share with others
488
+
489
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
490
+ application. i.e. you will need to re-initialize the deepspeed engine, since
491
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
492
+
493
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
494
+
495
+ """
496
+ if tag is None:
497
+ latest_path = os.path.join(checkpoint_dir, 'latest')
498
+ if os.path.isfile(latest_path):
499
+ with open(latest_path, 'r') as fd:
500
+ tag = fd.read().strip()
501
+ else:
502
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
503
+
504
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
505
+
506
+ if not os.path.isdir(ds_checkpoint_dir):
507
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
508
+
509
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
510
+
511
+
512
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
513
+ """
514
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
515
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
516
+
517
+ Args:
518
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
519
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
520
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
521
+ """
522
+
523
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
524
+ print(f"Saving fp32 state dict to {output_file}")
525
+ torch.save(state_dict, output_file)
526
+
527
+
528
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
529
+ """
530
+ 1. Put the provided model to cpu
531
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
532
+ 3. Load it into the provided model
533
+
534
+ Args:
535
+ - ``model``: the model object to update
536
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
537
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
538
+
539
+ Returns:
540
+ - ``model``: modified model
541
+
542
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
543
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
544
+ conveniently placed for you in the checkpoint folder.
545
+
546
+ A typical usage might be ::
547
+
548
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
549
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
550
+ # submit to model hub or save the model to share with others
551
+
552
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
553
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
554
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
555
+
556
+ """
557
+ logger.info(f"Extracting fp32 weights")
558
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
559
+
560
+ logger.info(f"Overwriting model with fp32 weights")
561
+ model = model.cpu()
562
+ model.load_state_dict(state_dict, strict=False)
563
+
564
+ return model
565
+
566
+
567
+ if __name__ == "__main__":
568
+
569
+ parser = argparse.ArgumentParser()
570
+ parser.add_argument("checkpoint_dir",
571
+ type=str,
572
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
573
+ parser.add_argument(
574
+ "output_file",
575
+ type=str,
576
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
577
+ parser.add_argument("-t",
578
+ "--tag",
579
+ type=str,
580
+ default=None,
581
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
582
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
583
+ args = parser.parse_args()
584
+
585
+ debug = args.debug
586
+
587
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
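The script above rebuilds a single fp32 state_dict from ZeRO-partitioned shards, either from the command line (as in its own header comment, e.g. python zero_to_fp32.py path/checkpoint-3000 path/checkpoint-3000/pytorch_model.bin) or programmatically. A sketch of the programmatic route, assuming a DeepSpeed checkpoint folder path/checkpoint-3000 (a placeholder) that contains the usual "latest" tag file:

import sys

sys.path.insert(0, "weights/model-base")   # one way to make zero_to_fp32.py importable
from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict(
    "path/checkpoint-3000",                     # placeholder: folder holding the tag sub-folder
    "path/checkpoint-3000/pytorch_model.bin",   # consolidated fp32 output file
    tag=None,                                   # None -> read the tag from the "latest" file
)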
weights/model-idf/config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "_name_or_path": "/mnt/bn/automl-aigc/yatai/Qwen-VL/result/qwen_alpha_full_llava_mini_1/checkpoint-3000",
+   "architectures": [
+     "QWenLMHeadModel"
+   ],
+   "attn_dropout_prob": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_qwen.QWenConfig",
+     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
+   },
+   "bf16": true,
+   "emb_dropout_prob": 0.0,
+   "fp16": false,
+   "fp32": false,
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 22016,
+   "kv_channels": 128,
+   "layer_norm_epsilon": 1e-06,
+   "max_position_embeddings": 8192,
+   "model_type": "qwen",
+   "no_bias": true,
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "onnx_safe": null,
+   "rotary_emb_base": 10000,
+   "rotary_pct": 1.0,
+   "scale_attn_weights": true,
+   "seq_length": 2048,
+   "tie_word_embeddings": false,
+   "tokenizer_type": "QWenTokenizer",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.32.0",
+   "use_cache": false,
+   "use_dynamic_ntk": true,
+   "use_flash_attn": false,
+   "use_logn_attn": true,
+   "visual": {
+     "heads": 16,
+     "image_size": 448,
+     "image_start_id": 151857,
+     "layers": 48,
+     "mlp_ratio": 4.9231,
+     "output_dim": 4096,
+     "patch_size": 14,
+     "width": 1664
+   },
+   "vocab_size": 151936
+ }
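The "visual" block fixes the vision-tower geometry that visual.py consumes: 448x448 inputs with 14x14 patches give a 32x32 grid of 1024 patch tokens, which the resampler compresses to 256 query tokens of width output_dim before they are spliced into the text stream at image_start_id. A small arithmetic check, reading the file at the path used in this commit:

import json

# Sanity-check the vision-tower geometry declared in the "visual" block above.
with open("weights/model-idf/config.json") as f:
    visual = json.load(f)["visual"]

grid = visual["image_size"] // visual["patch_size"]   # 448 // 14 = 32
patch_tokens = grid * grid                            # 32 * 32 = 1024 patch tokens per image
print(grid, patch_tokens, visual["output_dim"])       # 32 1024 4096
# visual.py's Resampler then pools the 1024 patch tokens down to its learned queries
# (n_queries defaults to 256) before they enter the language model.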
weights/model-idf/configuration_qwen.py ADDED
@@ -0,0 +1,65 @@
+ # Copyright (c) Alibaba Cloud.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from transformers import PretrainedConfig
+
+
+ class QWenConfig(PretrainedConfig):
+     model_type = "qwen"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=151936,
+         hidden_size=4096,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         emb_dropout_prob=0.0,
+         attn_dropout_prob=0.0,
+         layer_norm_epsilon=1e-6,
+         initializer_range=0.02,
+         max_position_embeddings=8192,
+         scale_attn_weights=True,
+         use_cache=True,
+         bf16=False,
+         fp16=False,
+         fp32=False,
+         kv_channels=128,
+         rotary_pct=1.0,
+         rotary_emb_base=10000,
+         use_dynamic_ntk=True,
+         use_logn_attn=True,
+         use_flash_attn="auto",
+         intermediate_size=22016,
+         no_bias=True,
+         tie_word_embeddings=False,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.emb_dropout_prob = emb_dropout_prob
+         self.attn_dropout_prob = attn_dropout_prob
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_range = initializer_range
+         self.scale_attn_weights = scale_attn_weights
+         self.use_cache = use_cache
+         self.max_position_embeddings = max_position_embeddings
+         self.bf16 = bf16
+         self.fp16 = fp16
+         self.fp32 = fp32
+         self.kv_channels = kv_channels
+         self.rotary_pct = rotary_pct
+         self.rotary_emb_base = rotary_emb_base
+         self.use_dynamic_ntk = use_dynamic_ntk
+         self.use_logn_attn = use_logn_attn
+         self.use_flash_attn = use_flash_attn
+         self.no_bias = no_bias
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs
+         )
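QWenConfig only records hyperparameters; every keyword default matches the checkpoint shipped here, so config.json needs to override just the deployment-specific flags (bf16, use_cache, use_flash_attn, and so on). A standalone construction sketch, again using a sys.path insert as one convenient way to import the module from the checkpoint folder:

import sys

sys.path.insert(0, "weights/model-idf")   # make configuration_qwen.py importable
from configuration_qwen import QWenConfig

cfg = QWenConfig()                        # defaults: 151936 vocab, 4096 hidden, 32 layers/heads
small = QWenConfig(hidden_size=1024, num_hidden_layers=4, num_attention_heads=8)

print(cfg.hidden_size, cfg.kv_channels, cfg.use_dynamic_ntk)   # 4096 128 True
print(small.hidden_size, small.num_hidden_layers)              # 1024 4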
weights/model-idf/generation_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "chat_format": "chatml",
+   "do_sample": true,
+   "eos_token_id": 151643,
+   "max_new_tokens": 512,
+   "max_window_size": 6144,
+   "pad_token_id": 151643,
+   "top_k": 0,
+   "top_p": 0.3,
+   "transformers_version": "4.32.0"
+ }
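These generation defaults mean model.chat() samples with nucleus top_p=0.3 (top_k=0 disables top-k filtering), stops on token 151643, and emits at most 512 new tokens inside a 6144-token chatml window. They can be loaded and overridden per call without touching the file; a sketch (the stricter greedy settings are purely illustrative):

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("weights/model-idf")
print(gen_cfg.chat_format, gen_cfg.top_p, gen_cfg.max_new_tokens)   # chatml 0.3 512

# Keyword overrides take precedence over the file's values, e.g. deterministic short answers:
greedy = GenerationConfig.from_pretrained(
    "weights/model-idf", do_sample=False, max_new_tokens=128
)
print(greedy.do_sample, greedy.max_new_tokens)   # False 128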
weights/model-idf/modeling_qwen.py ADDED
@@ -0,0 +1,1182 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import importlib
7
+ import math
8
+ from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ import torch.utils.checkpoint
13
+ from torch.cuda.amp import autocast
14
+
15
+ from torch.nn import CrossEntropyLoss
16
+ from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList
17
+ from transformers.generation.logits_process import LogitsProcessorList
18
+
19
+ if TYPE_CHECKING:
20
+ from transformers.generation.streamers import BaseStreamer
21
+ from transformers.generation.utils import GenerateOutput
22
+ from transformers.modeling_outputs import (
23
+ BaseModelOutputWithPast,
24
+ CausalLMOutputWithPast,
25
+ )
26
+ from transformers.modeling_utils import PreTrainedModel
27
+ from transformers.utils import logging
28
+
29
+ try:
30
+ from einops import rearrange
31
+ except ImportError:
32
+ rearrange = None
33
+ from torch import nn
34
+
35
+ SUPPORT_CUDA = torch.cuda.is_available()
36
+ SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
37
+ SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
38
+
39
+ from .configuration_qwen import QWenConfig
40
+ from .qwen_generation_utils import (
41
+ HistoryType,
42
+ make_context,
43
+ decode_tokens,
44
+ get_stop_words_ids,
45
+ StopWordsLogitsProcessor,
46
+ )
47
+ from .visual import VisionTransformer
48
+
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+ _CHECKPOINT_FOR_DOC = "qwen"
53
+ _CONFIG_FOR_DOC = "QWenConfig"
54
+
55
+ QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"]
56
+
57
+ _ERROR_BAD_CHAT_FORMAT = """\
58
+ We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml".
59
+ If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat().
60
+ 我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。
61
+ 如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。
62
+ """
63
+
64
+ _SENTINEL = object()
65
+ _ERROR_STREAM_IN_CHAT = """\
66
+ Pass argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True).
67
+ 向model.chat()传入参数stream的用法可能存在Bug,该用法已被废弃,将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。
68
+ """
69
+
70
+ apply_rotary_emb_func = None
71
+ rms_norm = None
72
+
73
+
74
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
75
+ def _make_causal_mask(
76
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
77
+ ):
78
+ """
79
+ Make causal mask used for bi-directional self-attention.
80
+ """
81
+ bsz, tgt_len = input_ids_shape
82
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
83
+ mask_cond = torch.arange(mask.size(-1), device=device)
84
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
85
+ mask = mask.to(dtype)
86
+
87
+ if past_key_values_length > 0:
88
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
89
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
90
+
91
+
92
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
93
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
94
+ """
95
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
96
+ """
97
+ bsz, src_len = mask.size()
98
+ tgt_len = tgt_len if tgt_len is not None else src_len
99
+
100
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
101
+
102
+ inverted_mask = 1.0 - expanded_mask
103
+
104
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
105
+
106
+
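A minimal sketch of how the two mask helpers above behave, assuming torch is imported and _make_causal_mask / _expand_mask are in scope; the point is the shapes and the additive-mask convention, not the exact values:

    import torch

    # causal part: (bsz, 1, tgt_len, tgt_len); 0 where attending is allowed,
    # a large negative number where a position would see the future
    causal = _make_causal_mask(torch.Size([1, 4]), torch.float32, torch.device("cpu"))
    assert causal.shape == (1, 1, 4, 4)

    # padding part: a [bsz, seq_len] 0/1 mask expanded to the same 4-D shape,
    # with large negative values at the masked (0) positions
    padding = torch.tensor([[1, 1, 1, 0]], dtype=torch.float32)
    expanded = _expand_mask(padding, torch.float32, tgt_len=4)
    assert expanded.shape == (1, 1, 4, 4)

    # _prepare_decoder_attention_mask below simply adds the two, so attention
    # receives a single additive bias
    combined = causal + expanded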
107
+ class QWenAttention(nn.Module):
108
+ def __init__(self, config):
109
+ super().__init__()
110
+
111
+ self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
112
+ self.seq_length = config.seq_length
113
+
114
+ self.hidden_size = config.hidden_size
115
+ self.split_size = config.hidden_size
116
+ self.num_heads = config.num_attention_heads
117
+ self.head_dim = self.hidden_size // self.num_heads
118
+
119
+ self.scale_attn_weights = True
120
+
121
+ self.projection_size = config.kv_channels * config.num_attention_heads
122
+
123
+ assert self.projection_size % config.num_attention_heads == 0
124
+ self.hidden_size_per_attention_head = (
125
+ self.projection_size // config.num_attention_heads
126
+ )
127
+
128
+ self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size)
129
+
130
+ self.c_proj = nn.Linear(
131
+ config.hidden_size, self.projection_size, bias=not config.no_bias
132
+ )
133
+
134
+ self.is_fp32 = not (config.bf16 or config.fp16)
135
+ self.bf16 = config.bf16
136
+
137
+ self.use_dynamic_ntk = config.use_dynamic_ntk
138
+ self.use_logn_attn = config.use_logn_attn
139
+
140
+ logn_list = [
141
+ math.log(i, self.seq_length) if i > self.seq_length else 1
142
+ for i in range(1, 32768)
143
+ ]
144
+ self.logn_tensor = torch.tensor(logn_list)[None, :, None, None]
145
+
146
+ self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
147
+
148
+ def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
149
+ attn_weights = torch.matmul(query, key.transpose(-1, -2))
150
+
151
+ if self.scale_attn_weights:
152
+ attn_weights = attn_weights / torch.full(
153
+ [],
154
+ value.size(-1) ** 0.5,
155
+ dtype=attn_weights.dtype,
156
+ device=attn_weights.device,
157
+ )
158
+
159
+ query_length, key_length = query.size(-2), key.size(-2)
160
+ # causal_mask = self.bias[
161
+ # :, :, key_length - query_length : key_length, :key_length
162
+ # ]
163
+ # mask_value = torch.finfo(attn_weights.dtype).min
164
+ # mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(
165
+ # attn_weights.device
166
+ # )
167
+ # attn_weights = torch.where(
168
+ # causal_mask, attn_weights.to(attn_weights.dtype), mask_value
169
+ # )
170
+ attn_weights = attn_weights + attention_mask
171
+
172
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
173
+
174
+ attn_weights = attn_weights.type(value.dtype)
175
+ attn_weights = self.attn_dropout(attn_weights)
176
+
177
+ if head_mask is not None:
178
+ attn_weights = attn_weights * head_mask
179
+
180
+ attn_output = torch.matmul(attn_weights, value)
181
+ attn_output = attn_output.transpose(1, 2)
182
+
183
+ return attn_output, attn_weights
184
+
185
+ def _upcast_and_reordered_attn(
186
+ self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
187
+ ):
188
+ bsz, num_heads, q_seq_len, dk = query.size()
189
+ _, _, k_seq_len, _ = key.size()
190
+
191
+ attn_weights = torch.empty(
192
+ bsz * num_heads,
193
+ q_seq_len,
194
+ k_seq_len,
195
+ dtype=torch.float32,
196
+ device=query.device,
197
+ )
198
+
199
+ scale_factor = 1.0
200
+ if self.scale_attn_weights:
201
+ scale_factor /= float(value.size(-1)) ** 0.5
202
+
203
+ with autocast(enabled=False):
204
+ q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(
205
+ -1, dk, k_seq_len
206
+ )
207
+ attn_weights = torch.baddbmm(
208
+ attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor
209
+ )
210
+ attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
211
+
212
+ query_length, key_length = query.size(-2), key.size(-2)
213
+ causal_mask = registered_causal_mask[
214
+ :, :, key_length - query_length : key_length, :key_length
215
+ ]
216
+ mask_value = torch.finfo(attn_weights.dtype).min
217
+ mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
218
+ attn_weights.device
219
+ )
220
+ attn_weights = torch.where(causal_mask, attn_weights, mask_value)
221
+
222
+ if attention_mask is not None:
223
+ attn_weights = attn_weights + attention_mask
224
+
225
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
226
+
227
+ if attn_weights.dtype != torch.float32:
228
+ raise RuntimeError(
229
+ "Error with upcasting, attn_weights does not have dtype torch.float32"
230
+ )
231
+ attn_weights = attn_weights.type(value.dtype)
232
+ attn_weights = self.attn_dropout(attn_weights)
233
+
234
+ if head_mask is not None:
235
+ attn_weights = attn_weights * head_mask
236
+
237
+ attn_output = torch.matmul(attn_weights, value)
238
+
239
+ return attn_output, attn_weights
240
+
241
+ def _split_heads(self, tensor, num_heads, attn_head_size):
242
+ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
243
+ tensor = tensor.view(new_shape)
244
+ return tensor
245
+
246
+ def _merge_heads(self, tensor, num_heads, attn_head_size):
247
+ tensor = tensor.contiguous()
248
+ new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
249
+ return tensor.view(new_shape)
250
+
251
+ def forward(
252
+ self,
253
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
254
+ rotary_pos_emb: Optional[List[torch.Tensor]] = None,
255
+ registered_causal_mask: Optional[torch.Tensor] = None,
256
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
257
+ attention_mask: Optional[torch.FloatTensor] = None,
258
+ head_mask: Optional[torch.FloatTensor] = None,
259
+ encoder_hidden_states: Optional[torch.Tensor] = None,
260
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
261
+ output_attentions: Optional[bool] = False,
262
+ use_cache: Optional[bool] = False,
263
+ ):
264
+
265
+ mixed_x_layer = self.c_attn(hidden_states)
266
+
267
+ query, key, value = mixed_x_layer.split(self.split_size, dim=2)
268
+
269
+ query = self._split_heads(query, self.num_heads, self.head_dim)
270
+ key = self._split_heads(key, self.num_heads, self.head_dim)
271
+ value = self._split_heads(value, self.num_heads, self.head_dim)
272
+
273
+ if rotary_pos_emb is not None:
274
+ cur_len = query.shape[1]
275
+ rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb]
276
+ rotary_pos_emb = (rotary_pos_emb,) * 2
277
+ q_pos_emb, k_pos_emb = rotary_pos_emb
278
+ # Slice the pos emb for current inference
279
+ query = apply_rotary_pos_emb(query, q_pos_emb)
280
+ key = apply_rotary_pos_emb(key, k_pos_emb)
281
+
282
+ if layer_past is not None:
283
+ past_key, past_value = layer_past[0], layer_past[1]
284
+ key = torch.cat((past_key, key), dim=1)
285
+ value = torch.cat((past_value, value), dim=1)
286
+
287
+ if use_cache:
288
+ present = (key, value)
289
+ else:
290
+ present = None
291
+
292
+ if self.use_logn_attn and not self.training:
293
+ if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype:
294
+ self.logn_tensor = self.logn_tensor.to(query.device).type_as(query)
295
+ seq_start = key.size(1) - query.size(1)
296
+ seq_end = key.size(1)
297
+ logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
298
+ query = query * logn_tensor.expand_as(query)
299
+
300
+ query = query.permute(0, 2, 1, 3)
301
+ key = key.permute(0, 2, 1, 3)
302
+ value = value.permute(0, 2, 1, 3)
303
+ attn_output, attn_weight = self._attn(
304
+ query, key, value, registered_causal_mask, attention_mask, head_mask
305
+ )
306
+ context_layer = self._merge_heads(
307
+ attn_output, self.num_heads, self.head_dim
308
+ )
309
+
310
+ attn_output = self.c_proj(context_layer)
311
+
312
+ outputs = (attn_output, present)
313
+ if output_attentions:
314
+ outputs += (attn_weight,)
315
+
316
+ return outputs
317
+
318
+
319
+ class QWenMLP(nn.Module):
320
+ def __init__(self, config):
321
+ super().__init__()
322
+ self.w1 = nn.Linear(
323
+ config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias
324
+ )
325
+ self.w2 = nn.Linear(
326
+ config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias
327
+ )
328
+ ff_dim_in = config.intermediate_size // 2
329
+ self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias)
330
+
331
+ def forward(self, hidden_states):
332
+ a1 = self.w1(hidden_states)
333
+ a2 = self.w2(hidden_states)
334
+ intermediate_parallel = a1 * F.silu(a2)
335
+ output = self.c_proj(intermediate_parallel)
336
+ return output
337
+
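QWenMLP above is a SwiGLU-style feed-forward block: w1(x) gated by silu(w2(x)), then projected back by c_proj. A minimal shape-check sketch, assuming torch and the class above are in scope; the tiny config stand-in is hypothetical (real values come from config.json):

    import torch

    class _TinyCfg:                      # hypothetical stand-in for QWenConfig
        hidden_size = 64
        intermediate_size = 256          # feed-forward width is intermediate_size // 2 = 128
        no_bias = True

    mlp = QWenMLP(_TinyCfg())
    x = torch.randn(2, 5, 64)            # (batch, seq, hidden)
    y = mlp(x)                           # c_proj(w1(x) * silu(w2(x)))
    assert y.shape == (2, 5, 64)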
338
+ class QWenBlock(nn.Module):
339
+ def __init__(self, config):
340
+ super().__init__()
341
+ hidden_size = config.hidden_size
342
+ self.bf16 = config.bf16
343
+
344
+ self.ln_1 = RMSNorm(
345
+ hidden_size,
346
+ eps=config.layer_norm_epsilon,
347
+ )
348
+ self.attn = QWenAttention(config)
349
+ self.ln_2 = RMSNorm(
350
+ hidden_size,
351
+ eps=config.layer_norm_epsilon,
352
+ )
353
+
354
+ self.mlp = QWenMLP(config)
355
+
356
+ def forward(
357
+ self,
358
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
359
+ rotary_pos_emb: Optional[List[torch.Tensor]] = None,
360
+ registered_causal_mask: Optional[torch.Tensor] = None,
361
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
362
+ attention_mask: Optional[torch.FloatTensor] = None,
363
+ head_mask: Optional[torch.FloatTensor] = None,
364
+ encoder_hidden_states: Optional[torch.Tensor] = None,
365
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
366
+ use_cache: Optional[bool] = False,
367
+ output_attentions: Optional[bool] = False,
368
+ ):
369
+ layernorm_output = self.ln_1(hidden_states)
370
+
371
+ attn_outputs = self.attn(
372
+ layernorm_output,
373
+ rotary_pos_emb,
374
+ registered_causal_mask=registered_causal_mask,
375
+ layer_past=layer_past,
376
+ attention_mask=attention_mask,
377
+ head_mask=head_mask,
378
+ use_cache=use_cache,
379
+ output_attentions=output_attentions,
380
+ )
381
+ attn_output = attn_outputs[0]
382
+
383
+ outputs = attn_outputs[1:]
384
+
385
+ residual = hidden_states
386
+ layernorm_input = attn_output + residual
387
+
388
+ layernorm_output = self.ln_2(layernorm_input)
389
+
390
+ residual = layernorm_input
391
+ mlp_output = self.mlp(layernorm_output)
392
+ hidden_states = residual + mlp_output
393
+
394
+ if use_cache:
395
+ outputs = (hidden_states,) + outputs
396
+ else:
397
+ outputs = (hidden_states,) + outputs[1:]
398
+
399
+ return outputs
400
+
401
+
402
+ class QWenPreTrainedModel(PreTrainedModel):
403
+ config_class = QWenConfig
404
+ base_model_prefix = "transformer"
405
+ is_parallelizable = False
406
+ supports_gradient_checkpointing = True
407
+ _no_split_modules = ["QWenBlock"]
408
+
409
+ def __init__(self, *inputs, **kwargs):
410
+ super().__init__(*inputs, **kwargs)
411
+
412
+ def _init_weights(self, module):
413
+ """Initialize the weights."""
414
+ if isinstance(module, nn.Linear):
415
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
416
+ if module.bias is not None:
417
+ module.bias.data.zero_()
418
+ elif isinstance(module, nn.Embedding):
419
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
420
+ if module.padding_idx is not None:
421
+ module.weight.data[module.padding_idx].zero_()
422
+ elif isinstance(module, RMSNorm):
423
+ module.weight.data.fill_(1.0)
424
+
425
+ for name, p in module.named_parameters():
426
+ if name == "c_proj.weight":
427
+ p.data.normal_(
428
+ mean=0.0,
429
+ std=(
430
+ self.config.initializer_range
431
+ / math.sqrt(2 * self.config.num_hidden_layers)
432
+ ),
433
+ )
434
+
435
+ def _set_gradient_checkpointing(self, module, value=False):
436
+ if isinstance(module, QWenModel):
437
+ module.gradient_checkpointing = value
438
+
439
+
440
+ class QWenModel(QWenPreTrainedModel):
441
+ _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
442
+
443
+ def __init__(self, config):
444
+ super().__init__(config)
445
+ self.vocab_size = config.vocab_size
446
+ self.num_hidden_layers = config.num_hidden_layers
447
+ self.embed_dim = config.hidden_size
448
+
449
+ self.gradient_checkpointing = False
450
+ self.use_dynamic_ntk = config.use_dynamic_ntk
451
+ self.seq_length = config.seq_length
452
+
453
+ self.wte = nn.Embedding(self.vocab_size, self.embed_dim)
454
+
455
+ self.drop = nn.Dropout(config.emb_dropout_prob)
456
+
457
+ if config.rotary_pct == 1.0:
458
+ self.rotary_ndims = None
459
+ else:
460
+ assert config.rotary_pct < 1
461
+ self.rotary_ndims = int(
462
+ config.kv_channels * config.rotary_pct
463
+ )
464
+ dim = (
465
+ self.rotary_ndims
466
+ if self.rotary_ndims is not None
467
+ else config.kv_channels
468
+ )
469
+ self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base)
470
+
471
+ self.use_flash_attn = config.use_flash_attn
472
+ self.is_fp32 = not (config.bf16 or config.fp16)
473
+ self.registered_causal_mask = None
474
+ # if (
475
+ # self.use_flash_attn
476
+ # and flash_attn_unpadded_func is not None
477
+ # and not self.is_fp32
478
+ # ):
479
+ # self.registered_causal_mask = None
480
+ # else:
481
+ # max_positions = config.max_position_embeddings
482
+ # self.register_buffer(
483
+ # "registered_causal_mask",
484
+ # torch.tril(
485
+ # torch.ones((max_positions, max_positions), dtype=torch.bool)
486
+ # ).view(1, 1, max_positions, max_positions),
487
+ # persistent=False,
488
+ # )
489
+
490
+ self.h = nn.ModuleList(
491
+ [
492
+ QWenBlock(
493
+ config
494
+ )
495
+ for i in range(config.num_hidden_layers)
496
+ ]
497
+ )
498
+ self.ln_f = RMSNorm(
499
+ self.embed_dim,
500
+ eps=config.layer_norm_epsilon,
501
+ )
502
+
503
+ self.visual = VisionTransformer(**config.visual) # vit + resampler
504
+
505
+ self.post_init()
506
+
507
+ def get_input_embeddings(self):
508
+ return self.wte
509
+
510
+ def set_input_embeddings(self, new_embeddings):
511
+ self.wte = new_embeddings
512
+
513
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
514
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
515
+ # create causal mask
516
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
517
+ combined_attention_mask = None
518
+ if input_shape[-1] > 1:
519
+ combined_attention_mask = _make_causal_mask(
520
+ input_shape,
521
+ inputs_embeds.dtype,
522
+ device=inputs_embeds.device,
523
+ past_key_values_length=past_key_values_length,
524
+ )
525
+
526
+ if attention_mask is not None:
527
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
528
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
529
+ inputs_embeds.device
530
+ )
531
+ combined_attention_mask = (
532
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
533
+ )
534
+
535
+ return combined_attention_mask
536
+
537
+
538
+ def forward(
539
+ self,
540
+ input_ids: Optional[torch.LongTensor] = None,
541
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
542
+ attention_mask: Optional[torch.FloatTensor] = None,
543
+ token_type_ids: Optional[torch.LongTensor] = None,
544
+ position_ids: Optional[torch.LongTensor] = None,
545
+ head_mask: Optional[torch.FloatTensor] = None,
546
+ inputs_embeds: Optional[torch.FloatTensor] = None,
547
+ encoder_hidden_states: Optional[torch.Tensor] = None,
548
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
549
+ use_cache: Optional[bool] = None,
550
+ output_attentions: Optional[bool] = None,
551
+ output_hidden_states: Optional[bool] = None,
552
+ return_dict: Optional[bool] = None,
553
+ ):
554
+ if past_key_values is None and torch.any(input_ids == self.config.visual['image_start_id']):
555
+ bos_pos = torch.where(input_ids == self.config.visual['image_start_id'])
556
+ eos_pos = torch.where(input_ids == self.config.visual['image_start_id'] + 1)
557
+ assert (bos_pos[0] == eos_pos[0]).all()
558
+ img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
559
+ images = []
560
+ # for i, a, b in img_pos:
561
+ # image = input_ids[i][a + 1 : b - 1].tolist()
562
+ # image = image[ : image.index(self.config.visual['image_start_id'] + 2)]
563
+ # images.append(bytes(image).decode('utf-8'))
564
+
565
+ old_i = -1
566
+ images_flag = []
567
+ id_test_flag = 0
568
+ for i, a, b in img_pos:
569
+ image = input_ids[i][a + 1 : b - 1].tolist()
570
+ image = image[ : image.index(self.config.visual['image_start_id'] + 2)]
571
+ images.append(bytes(image).decode('utf-8'))
572
+ if i != old_i:
573
+ if input_ids[i][a-2] == 374:
574
+ id_test_flag = 1
575
+ else:
576
+ id_test_flag = 0
577
+ old_i = i
578
+ if input_ids[i][a-2] == 374:
579
+ images_flag.append(0)
580
+ elif id_test_flag == 1:
581
+ images_flag.append(1)
582
+ else:
583
+ images_flag.append(2)
584
+
585
+ images = self.visual.encode(images, images_flag)
586
+ assert images.shape[0] == len(images)
587
+ fake_images = None
588
+ elif self.training:
589
+ fake_images=torch.zeros(1,3,224,224).to(
590
+ dtype=self.visual.conv1.weight.dtype, device=self.visual.conv1.weight.device)
591
+ images = self.visual(fake_images)
592
+ else:
593
+ fake_images = None
594
+ images = None
595
+
596
+ output_attentions = (
597
+ output_attentions
598
+ if output_attentions is not None
599
+ else self.config.output_attentions
600
+ )
601
+ output_hidden_states = (
602
+ output_hidden_states
603
+ if output_hidden_states is not None
604
+ else self.config.output_hidden_states
605
+ )
606
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
607
+ return_dict = (
608
+ return_dict if return_dict is not None else self.config.use_return_dict
609
+ )
610
+
611
+ if input_ids is not None and inputs_embeds is not None:
612
+ raise ValueError(
613
+ "You cannot specify both input_ids and inputs_embeds at the same time"
614
+ )
615
+ elif input_ids is not None:
616
+ input_shape = input_ids.size()
617
+ input_ids = input_ids.view(-1, input_shape[-1])
618
+ batch_size = input_ids.shape[0]
619
+ elif inputs_embeds is not None:
620
+ input_shape = inputs_embeds.size()[:-1]
621
+ batch_size = inputs_embeds.shape[0]
622
+ else:
623
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
624
+
625
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
626
+
627
+ if token_type_ids is not None:
628
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
629
+ if position_ids is not None:
630
+ position_ids = position_ids.view(-1, input_shape[-1])
631
+
632
+ if past_key_values is None:
633
+ past_length = 0
634
+ past_key_values = tuple([None] * len(self.h))
635
+ else:
636
+ past_length = past_key_values[0][0].size(-2)
637
+
638
+ if position_ids is None:
639
+ position_ids = torch.arange(
640
+ past_length,
641
+ input_shape[-1] + past_length,
642
+ dtype=torch.long,
643
+ device=device,
644
+ )
645
+ position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
646
+
647
+ encoder_attention_mask = None
648
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
649
+
650
+ if inputs_embeds is None:
651
+ inputs_embeds = self.wte(input_ids)
652
+
653
+ if batch_size <= 0:
654
+ raise ValueError("batch_size has to be defined and > 0")
655
+ attention_mask = self._prepare_decoder_attention_mask(
656
+ attention_mask, input_shape, inputs_embeds, past_length
657
+ )
658
+
659
+ hidden_states = inputs_embeds
660
+
661
+ kv_seq_len = hidden_states.size()[1]
662
+ if past_key_values[0] is not None:
663
+ # past key values[0][0] shape: bs * seq_len * head_num * dim
664
+ kv_seq_len += past_key_values[0][0].shape[1]
665
+ if (
666
+ self.use_dynamic_ntk
667
+ and kv_seq_len == hidden_states.size()[1]
668
+ and not self.training
669
+ ):
670
+ context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
671
+ ntk_alpha = 2 ** math.ceil(context_value) - 1
672
+ ntk_alpha = max(ntk_alpha, 1)
673
+ else:
674
+ ntk_alpha = self.rotary_emb._ntk_alpha_cached
675
+
676
+ rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)
677
+ for idx in range(len(rotary_pos_emb)):
678
+ rotary_pos_emb[idx] = rotary_pos_emb[idx].to(hidden_states.device)
679
+
680
+ hidden_states = self.drop(hidden_states).clone()
681
+ if fake_images is not None:
682
+ hidden_states = hidden_states + images.mean()*0
683
+ elif images is not None:
684
+ for idx, (i, a, b) in enumerate(img_pos):
685
+ hidden_states[i][a + 1 : b] = images[idx]
686
+ output_shape = input_shape + (hidden_states.size(-1),)
687
+
688
+ if self.gradient_checkpointing and self.training:
689
+ if use_cache:
690
+ logger.warning_once(
691
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
692
+ )
693
+ use_cache = False
694
+
695
+ presents = () if use_cache else None
696
+ all_self_attentions = () if output_attentions else None
697
+ all_hidden_states = () if output_hidden_states else None
698
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
699
+
700
+ if output_hidden_states:
701
+ all_hidden_states = all_hidden_states + (hidden_states,)
702
+
703
+ if self.gradient_checkpointing and self.training:
704
+
705
+ def create_custom_forward(module):
706
+ def custom_forward(*inputs):
707
+ # None for past_key_value
708
+ return module(*inputs, use_cache, output_attentions)
709
+
710
+ return custom_forward
711
+
712
+ outputs = torch.utils.checkpoint.checkpoint(
713
+ create_custom_forward(block),
714
+ hidden_states,
715
+ rotary_pos_emb,
716
+ self.registered_causal_mask,
717
+ None,
718
+ attention_mask,
719
+ head_mask[i],
720
+ encoder_hidden_states,
721
+ encoder_attention_mask,
722
+ )
723
+ else:
724
+ outputs = block(
725
+ hidden_states,
726
+ layer_past=layer_past,
727
+ rotary_pos_emb=rotary_pos_emb,
728
+ registered_causal_mask=self.registered_causal_mask,
729
+ attention_mask=attention_mask,
730
+ head_mask=head_mask[i],
731
+ encoder_hidden_states=encoder_hidden_states,
732
+ encoder_attention_mask=encoder_attention_mask,
733
+ use_cache=use_cache,
734
+ output_attentions=output_attentions,
735
+ )
736
+
737
+ hidden_states = outputs[0]
738
+ if use_cache is True:
739
+ presents = presents + (outputs[1],)
740
+
741
+ if output_attentions:
742
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
743
+
744
+ hidden_states = self.ln_f(hidden_states)
745
+ hidden_states = hidden_states.view(output_shape)
746
+ # Add last hidden state
747
+ if output_hidden_states:
748
+ all_hidden_states = all_hidden_states + (hidden_states,)
749
+
750
+ if not return_dict:
751
+ return tuple(
752
+ v for v in [hidden_states, presents, all_hidden_states] if v is not None
753
+ )
754
+
755
+ return BaseModelOutputWithPast(
756
+ last_hidden_state=hidden_states,
757
+ past_key_values=presents,
758
+ hidden_states=all_hidden_states,
759
+ attentions=all_self_attentions,
760
+ )
761
+
762
+
763
+ class QWenLMHeadModel(QWenPreTrainedModel):
764
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"]
765
+ _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"]
766
+
767
+ def __init__(self, config):
768
+ super().__init__(config)
769
+ assert (
770
+ config.bf16 + config.fp16 + config.fp32 <= 1
771
+ ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true"
772
+
773
+ autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0
774
+
775
+ if autoset_precision:
776
+ if SUPPORT_BF16:
777
+ logger.warn(
778
+ "The model is automatically converting to bf16 for faster inference. "
779
+ "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
780
+ )
781
+ config.bf16 = True
782
+ elif SUPPORT_FP16:
783
+ logger.warn(
784
+ "The model is automatically converting to fp16 for faster inference. "
785
+ "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
786
+ )
787
+ config.fp16 = True
788
+ else:
789
+ config.fp32 = True
790
+
791
+ if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16:
792
+ logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".")
793
+ if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16:
794
+ logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster")
795
+ if config.fp32:
796
+ if SUPPORT_BF16:
797
+ logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
798
+ elif SUPPORT_FP16:
799
+ logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
800
+
801
+ self.transformer = QWenModel(config)
802
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
803
+
804
+ if config.bf16:
805
+ self.transformer.bfloat16()
806
+ self.lm_head.bfloat16()
807
+ if config.fp16:
808
+ self.transformer.half()
809
+ self.lm_head.half()
810
+ self.post_init()
811
+
812
+ def get_output_embeddings(self):
813
+ return self.lm_head
814
+
815
+ def set_output_embeddings(self, new_embeddings):
816
+ self.lm_head = new_embeddings
817
+
818
+ def prepare_inputs_for_generation(
819
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
820
+ ):
821
+ token_type_ids = kwargs.get("token_type_ids", None)
822
+ if past_key_values:
823
+ input_ids = input_ids[:, -1].unsqueeze(-1)
824
+ if token_type_ids is not None:
825
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
826
+
827
+ attention_mask = kwargs.get("attention_mask", None)
828
+ position_ids = kwargs.get("position_ids", None)
829
+
830
+ if attention_mask is not None and position_ids is None:
831
+ position_ids = attention_mask.long().cumsum(-1) - 1
832
+ position_ids.masked_fill_(attention_mask == 0, 1)
833
+ if past_key_values:
834
+ position_ids = position_ids[:, -1].unsqueeze(-1)
835
+ else:
836
+ position_ids = None
837
+
838
+ if inputs_embeds is not None and past_key_values is None:
839
+ model_inputs = {"inputs_embeds": inputs_embeds}
840
+ else:
841
+ model_inputs = {"input_ids": input_ids}
842
+
843
+ model_inputs.update(
844
+ {
845
+ "past_key_values": past_key_values,
846
+ "use_cache": kwargs.get("use_cache"),
847
+ "position_ids": position_ids,
848
+ "attention_mask": attention_mask,
849
+ "token_type_ids": token_type_ids,
850
+ }
851
+ )
852
+ return model_inputs
853
+
854
+ def forward(
855
+ self,
856
+ input_ids: Optional[torch.LongTensor] = None,
857
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
858
+ attention_mask: Optional[torch.FloatTensor] = None,
859
+ token_type_ids: Optional[torch.LongTensor] = None,
860
+ position_ids: Optional[torch.LongTensor] = None,
861
+ head_mask: Optional[torch.FloatTensor] = None,
862
+ inputs_embeds: Optional[torch.FloatTensor] = None,
863
+ encoder_hidden_states: Optional[torch.Tensor] = None,
864
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
865
+ labels: Optional[torch.LongTensor] = None,
866
+ use_cache: Optional[bool] = None,
867
+ output_attentions: Optional[bool] = None,
868
+ output_hidden_states: Optional[bool] = None,
869
+ return_dict: Optional[bool] = None,
870
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
871
+
872
+ return_dict = (
873
+ return_dict if return_dict is not None else self.config.use_return_dict
874
+ )
875
+
876
+ transformer_outputs = self.transformer(
877
+ input_ids,
878
+ past_key_values=past_key_values,
879
+ attention_mask=attention_mask,
880
+ token_type_ids=token_type_ids,
881
+ position_ids=position_ids,
882
+ head_mask=head_mask,
883
+ inputs_embeds=inputs_embeds,
884
+ encoder_hidden_states=encoder_hidden_states,
885
+ encoder_attention_mask=encoder_attention_mask,
886
+ use_cache=use_cache,
887
+ output_attentions=output_attentions,
888
+ output_hidden_states=output_hidden_states,
889
+ return_dict=return_dict,
890
+ )
891
+ hidden_states = transformer_outputs[0]
892
+
893
+ lm_logits = self.lm_head(hidden_states)
894
+
895
+ loss = None
896
+ if labels is not None:
897
+ labels = labels.to(lm_logits.device)
898
+ shift_logits = lm_logits[..., :-1, :].contiguous()
899
+ shift_labels = labels[..., 1:].contiguous()
900
+ loss_fct = CrossEntropyLoss()
901
+ loss = loss_fct(
902
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
903
+ )
904
+
905
+ if not return_dict:
906
+ output = (lm_logits,) + transformer_outputs[1:]
907
+ return ((loss,) + output) if loss is not None else output
908
+
909
+ return CausalLMOutputWithPast(
910
+ loss=loss,
911
+ logits=lm_logits,
912
+ past_key_values=transformer_outputs.past_key_values,
913
+ hidden_states=transformer_outputs.hidden_states,
914
+ attentions=transformer_outputs.attentions,
915
+ )
916
+
917
+ @staticmethod
918
+ def _reorder_cache(
919
+ past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
920
+ ) -> Tuple[Tuple[torch.Tensor]]:
921
+
922
+ return tuple(
923
+ tuple(
924
+ past_state.index_select(0, beam_idx.to(past_state.device))
925
+ for past_state in layer_past
926
+ )
927
+ for layer_past in past_key_values
928
+ )
929
+
930
+ def chat(
931
+ self,
932
+ tokenizer: PreTrainedTokenizer,
933
+ query: str,
934
+ history: Optional[HistoryType],
935
+ system: str = "You are a helpful assistant.",
936
+ append_history: bool = True,
937
+ stream: Optional[bool] = _SENTINEL,
938
+ stop_words_ids: Optional[List[List[int]]] = None,
939
+ generation_config: Optional[GenerationConfig] = None,
940
+ **kwargs,
941
+ ) -> Tuple[str, HistoryType]:
942
+ generation_config = generation_config if generation_config is not None else self.generation_config
943
+
944
+ assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT
945
+ assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
946
+ if history is None:
947
+ history = []
948
+ if stop_words_ids is None:
949
+ stop_words_ids = []
950
+
951
+ max_window_size = kwargs.get('max_window_size', None)
952
+ if max_window_size is None:
953
+ max_window_size = generation_config.max_window_size
954
+ raw_text, context_tokens = make_context(
955
+ tokenizer,
956
+ query,
957
+ history=history,
958
+ system=system,
959
+ max_window_size=max_window_size,
960
+ chat_format=generation_config.chat_format,
961
+ )
962
+
963
+ stop_words_ids.extend(get_stop_words_ids(
964
+ generation_config.chat_format, tokenizer
965
+ ))
966
+ input_ids = torch.tensor([context_tokens]).to(self.device)
967
+ outputs = self.generate(
968
+ input_ids,
969
+ stop_words_ids=stop_words_ids,
970
+ return_dict_in_generate=False,
971
+ generation_config=generation_config,
972
+ **kwargs,
973
+ )
974
+
975
+ response = decode_tokens(
976
+ outputs[0],
977
+ tokenizer,
978
+ raw_text_len=len(raw_text),
979
+ context_length=len(context_tokens),
980
+ chat_format=generation_config.chat_format,
981
+ verbose=False,
982
+ errors='replace'
983
+ )
984
+
985
+ if append_history:
986
+ history.append((query, response))
987
+
988
+ return response, history
989
+
990
+ def chat_stream(
991
+ self,
992
+ tokenizer: PreTrainedTokenizer,
993
+ query: str,
994
+ history: Optional[HistoryType],
995
+ system: str = "You are a helpful assistant.",
996
+ stop_words_ids: Optional[List[List[int]]] = None,
997
+ logits_processor: Optional[LogitsProcessorList] = None,
998
+ generation_config: Optional[GenerationConfig] = None,
999
+ **kwargs,
1000
+ ) -> Generator[str, Any, None]:
1001
+ generation_config = generation_config if generation_config is not None else self.generation_config
1002
+ assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
1003
+ if history is None:
1004
+ history = []
1005
+ if stop_words_ids is None:
1006
+ stop_words_ids = []
1007
+
1008
+ max_window_size = kwargs.get('max_window_size', None)
1009
+ if max_window_size is None:
1010
+ max_window_size = generation_config.max_window_size
1011
+ raw_text, context_tokens = make_context(
1012
+ tokenizer,
1013
+ query,
1014
+ history=history,
1015
+ system=system,
1016
+ max_window_size=max_window_size,
1017
+ chat_format=generation_config.chat_format,
1018
+ )
1019
+
1020
+ stop_words_ids.extend(get_stop_words_ids(
1021
+ generation_config.chat_format, tokenizer
1022
+ ))
1023
+ if stop_words_ids is not None:
1024
+ stop_words_logits_processor = StopWordsLogitsProcessor(
1025
+ stop_words_ids=stop_words_ids,
1026
+ eos_token_id=generation_config.eos_token_id,
1027
+ )
1028
+ if logits_processor is None:
1029
+ logits_processor = LogitsProcessorList([stop_words_logits_processor])
1030
+ else:
1031
+ logits_processor.append(stop_words_logits_processor)
1032
+ input_ids = torch.tensor([context_tokens]).to(self.device)
1033
+
1034
+ from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig
1035
+ self.__class__.generate_stream = NewGenerationMixin.generate
1036
+ self.__class__.sample_stream = NewGenerationMixin.sample_stream
1037
+ stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True)
1038
+
1039
+ def stream_generator():
1040
+ outputs = []
1041
+ for token in self.generate_stream(
1042
+ input_ids,
1043
+ return_dict_in_generate=False,
1044
+ generation_config=stream_config,
1045
+ logits_processor=logits_processor,
1046
+ seed=-1,
1047
+ **kwargs):
1048
+ outputs.append(token.item())
1049
+ yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore', keep_image_special=True)
1050
+
1051
+ return stream_generator()
1052
+
1053
+ def generate(
1054
+ self,
1055
+ inputs: Optional[torch.Tensor] = None,
1056
+ generation_config: Optional[GenerationConfig] = None,
1057
+ logits_processor: Optional[LogitsProcessorList] = None,
1058
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
1059
+ prefix_allowed_tokens_fn: Optional[
1060
+ Callable[[int, torch.Tensor], List[int]]
1061
+ ] = None,
1062
+ synced_gpus: Optional[bool] = None,
1063
+ assistant_model: Optional["PreTrainedModel"] = None,
1064
+ streamer: Optional["BaseStreamer"] = None,
1065
+ **kwargs,
1066
+ ) -> Union[GenerateOutput, torch.LongTensor]:
1067
+ generation_config = generation_config if generation_config is not None else self.generation_config
1068
+
1069
+ # Process stop_words_ids.
1070
+ stop_words_ids = kwargs.pop("stop_words_ids", None)
1071
+ if stop_words_ids is None and generation_config is not None:
1072
+ stop_words_ids = getattr(generation_config, "stop_words_ids", None)
1073
+ if stop_words_ids is None:
1074
+ stop_words_ids = getattr(self.generation_config, "stop_words_ids", None)
1075
+
1076
+ if stop_words_ids is not None:
1077
+ stop_words_logits_processor = StopWordsLogitsProcessor(
1078
+ stop_words_ids=stop_words_ids,
1079
+ eos_token_id=generation_config.eos_token_id,
1080
+ )
1081
+ if logits_processor is None:
1082
+ logits_processor = LogitsProcessorList([stop_words_logits_processor])
1083
+ else:
1084
+ logits_processor.append(stop_words_logits_processor)
1085
+
1086
+ return super().generate(
1087
+ inputs,
1088
+ generation_config=generation_config,
1089
+ logits_processor=logits_processor,
1090
+ stopping_criteria=stopping_criteria,
1091
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
1092
+ synced_gpus=synced_gpus,
1093
+ assistant_model=assistant_model,
1094
+ streamer=streamer,
1095
+ **kwargs,
1096
+ )
1097
+
1098
+
1099
+ class RotaryEmbedding(torch.nn.Module):
1100
+ def __init__(self, dim, base=10000):
1101
+ super().__init__()
1102
+ self.dim = dim
1103
+ self.base = base
1104
+ self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
1105
+ if importlib.util.find_spec("einops") is None:
1106
+ raise RuntimeError("einops is required for Rotary Embedding")
1107
+
1108
+ self._rotary_pos_emb_cache = None
1109
+ self._seq_len_cached = 0
1110
+ self._ntk_alpha_cached = 1.0
1111
+
1112
+ def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0):
1113
+ seqlen = max_seq_len + offset
1114
+ if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached:
1115
+ base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
1116
+ self.inv_freq = 1.0 / (
1117
+ base
1118
+ ** (
1119
+ torch.arange(0, self.dim, 2, device=self.inv_freq.device).float()
1120
+ / self.dim
1121
+ )
1122
+ )
1123
+ self._seq_len_cached = max(2 * seqlen, 16)
1124
+ self._ntk_alpha_cached = ntk_alpha
1125
+ seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device)
1126
+ freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq)
1127
+
1128
+ emb = torch.cat((freqs, freqs), dim=-1)
1129
+ from einops import rearrange
1130
+
1131
+ emb = rearrange(emb, "n d -> 1 n 1 d")
1132
+
1133
+ cos, sin = emb.cos(), emb.sin()
1134
+ self._rotary_pos_emb_cache = [cos, sin]
1135
+
1136
+ def forward(self, max_seq_len, offset=0, ntk_alpha=1.0):
1137
+ self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha)
1138
+ cos, sin = self._rotary_pos_emb_cache
1139
+ return [cos[:, offset : offset + max_seq_len], sin[:, offset : offset + max_seq_len]]
1140
+
1141
+
1142
+ def _rotate_half(x):
1143
+ from einops import rearrange
1144
+
1145
+ x = rearrange(x, "... (j d) -> ... j d", j=2)
1146
+ x1, x2 = x.unbind(dim=-2)
1147
+ return torch.cat((-x2, x1), dim=-1)
1148
+
1149
+
1150
+ def apply_rotary_pos_emb(t, freqs):
1151
+ cos, sin = freqs
1152
+ if apply_rotary_emb_func is not None and t.is_cuda:
1153
+ t_ = t.float()
1154
+ cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
1155
+ sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
1156
+ output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
1157
+ return output
1158
+ else:
1159
+ rot_dim = freqs[0].shape[-1]
1160
+ cos, sin = freqs
1161
+ t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
1162
+ t_ = t_.float()
1163
+ t_pass_ = t_pass_.float()
1164
+ t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1165
+ return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1166
+
1167
+
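A minimal sketch of the rotary-embedding path above (the fallback branch, since apply_rotary_emb_func is None in this file), assuming torch and einops are installed and RotaryEmbedding / apply_rotary_pos_emb are in scope; dim=128 is just an example per-head dimension, not necessarily this checkpoint's value:

    import torch

    rope = RotaryEmbedding(dim=128)            # dim plays the role of config.kv_channels (per-head dim)
    cos_sin = rope(max_seq_len=8)              # [cos, sin], each of shape (1, 8, 1, 128)

    q = torch.randn(1, 8, 32, 128)             # (batch, seq, num_heads, head_dim), as produced by _split_heads
    q_rot = apply_rotary_pos_emb(q, cos_sin)   # rotates each pair of channels by a position-dependent angle
    assert q_rot.shape == q.shape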
1168
+ class RMSNorm(torch.nn.Module):
1169
+ def __init__(self, dim: int, eps: float = 1e-6):
1170
+ super().__init__()
1171
+ self.eps = eps
1172
+ self.weight = nn.Parameter(torch.ones(dim))
1173
+
1174
+ def _norm(self, x):
1175
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
1176
+
1177
+ def forward(self, x):
1178
+ if rms_norm is not None and x.is_cuda:
1179
+ return rms_norm(x, self.weight, self.eps)
1180
+ else:
1181
+ output = self._norm(x.float()).type_as(x)
1182
+ return output * self.weight
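A minimal sketch of how a checkpoint built from this modeling code might be loaded and queried through chat(); the local directory and image filename are hypothetical, trust_remote_code=True is assumed so that transformers picks up the custom model and tokenizer classes shipped with the weights, and images are referenced inline with <img>...</img> tags, following the convention visible in the prediction files below:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    path = "weights/model-idf"                  # hypothetical local checkpoint directory
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        path, trust_remote_code=True
    ).eval()                                    # move to GPU (e.g. .cuda()) as appropriate

    query = "Julia is <img>julia.jpg</img>\nDescribe the image."   # hypothetical image path
    response, history = model.chat(tokenizer, query, history=None)
    print(response)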
weights/model-idf/prediction.json ADDED
The diff for this file is too large to render. See raw diff
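The prediction files added in this commit share a simple schema: a JSON list of records with an "id", a "question" string containing <img>...</img> placeholders, and the model's "prediction" (prediction_mi.json below shows the format; prediction.json is too large to render here). A minimal sketch for loading and inspecting them, assuming the hypothetical local path:

    import json

    with open("weights/model-idf/prediction_mi.json") as f:   # hypothetical local path
        records = json.load(f)

    for rec in records[:3]:
        print(rec["id"], "|", rec["question"][:60], "->", rec["prediction"][:60])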
 
weights/model-idf/prediction_mi.json ADDED
@@ -0,0 +1,797 @@
1
+ [
2
+ {
3
+ "id": 0,
4
+ "question": "Julia is <img>image_1</img>\n Victoria is <img>image_2</img>\n Watching the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n and describe their content.",
5
+ "prediction": "Julia, lying down, is given water by Victoria, who stands beside her. As the interaction progresses, Victoria remains close to Julia, holding the glass of water, indicating a comforting and attentive care given to her."
6
+ },
7
+ {
8
+ "id": 1,
9
+ "question": "Julia is <img>image_1</img>\n Jack is <img>image_2</img>\n In these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What action does Julia take? ",
10
+ "prediction": "Julia is initially focused on something in front of her, then she turns to talk to Jack, and finally, she seems to be walking away from Jack."
11
+ },
12
+ {
13
+ "id": 2,
14
+ "question": "Jack is <img>image_1</img>\n Beech is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n. What is the interaction between Jack and Beech through the images?",
15
+ "prediction": "Jack and Beech appear to have a tense conversation, with Beech at first facing Jack, then turning away, and finally facing him again from a different angle."
16
+ },
17
+ {
18
+ "id": 3,
19
+ "question": "Jack is <img>image_1</img>\n Julia is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
20
+ "prediction": "Jack, dressed in a grey outfit, rides a motorcycle while Julia stands nearby. A scuffle ensues as Jack confronts Julia, and subsequently, both are seen walking side by side through a barren landscape, suggesting a partnership or shared journey."
21
+ },
22
+ {
23
+ "id": 4,
24
+ "question": "Julia is <img>image_1</img>\n Jack is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
25
+ "prediction": "Julia and Jack stand together, overlooking a damaged landscape. They face each other in a close, intimate conversation. Later, Jack listens as Julia uses a telescope, indicating a shared moment of exploration or contemplation."
26
+ },
27
+ {
28
+ "id": 5,
29
+ "question": "Julia is <img>image_1</img>\n Jack is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
30
+ "prediction": "Julia and Jack start by walking together on a bridge, then Jack is seen adjusting the view of a telescope while Julia watches. They later stand close, facing each other with the telescope between them, and finally, Jack seems to be explaining something to Julia, gesturing towards the view."
31
+ },
32
+ {
33
+ "id": 6,
34
+ "question": "Glen McCreavy is <img>image_1</img>\n Nicholas Hardiment is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
35
+ "prediction": "Glen McCreery and Nicholas Hardiment start relaxed, lounging side by side. They move on to an outdoor gathering, where Glen is seen engaged in conversation while Nicholas is accept a plate of food. Later, they stand face to face in a more serious exchange. Finally, they are seen in a group setting, still conversing, now with more people involved."
36
+ },
37
+ {
38
+ "id": 7,
39
+ "question": "Glen McCreavy is <img>image_1</img>\n Andy Cobb is <img>image_2</img>\n In these images <img>image_3</img>\n<img>image_4</img>\n. What are in the background?",
40
+ "prediction": "There are trees and a field in the background."
41
+ },
42
+ {
43
+ "id": 8,
44
+ "question": "Tamara Drewe is <img>image_1</img>\n Beth Hardiment is <img>image_2</img>\n In the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n. What is Tamara Drewe doing?",
45
+ "prediction": "Tamara Drewe is walking across a field and having a conversation with Beth Hardiment."
46
+ },
47
+ {
48
+ "id": 9,
49
+ "question": "Ben Sergeant is <img>image_1</img>\n Tamara Drewe is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
50
+ "prediction": "Ben Sergeant and Tamara Drewe stand outside, facing each other in a conversation. They move indoors, where Ben listens to Tamara, who seems to be explaining something. The discussion continues, and it becomes more intense, with Tamara gesturing emphatically towards Ben. Finally, they come close enough to embrace, indicating a moment of resolution or comfort."
51
+ },
52
+ {
53
+ "id": 10,
54
+ "question": "Beth Hardiment is <img>image_1</img>\n Glen McCreavy is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
55
+ "prediction": "Beth Hardiment stands beside a riding lawn mower as Glen McCreavy sits on it, ready to mow the grass. She then steps away from the mower, watching while he continues with the task. Throughout, they appear to be in a conversation, possibly discussing the work at hand or sharing responsibilities."
56
+ },
57
+ {
58
+ "id": 11,
59
+ "question": "Beth Hardiment is <img>image_1</img>\n Andy Cobb is <img>image_2</img>\n In the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Beth Hardiment doing?",
60
+ "prediction": "Beth Hardiment is holding a chicken and talking to Andy Cobb."
61
+ },
62
+ {
63
+ "id": 12,
64
+ "question": "Nicholas Hardiment is <img>image_1</img>\n Beth Hardiment is <img>image_2</img>\n Ben Sergeant is <img>image_3</img>\n Can you provide a description of the following images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
65
+ "prediction": "Nicholas Hardiment stands alone by a wooden fence, then is joined by Beth Hardiment who appears to be in conversation with him. In the next moment, Ben Sergeant joins them, and they all seem to be engaged in a group discussion."
66
+ },
67
+ {
68
+ "id": 13,
69
+ "question": "Ben Sergeant is <img>image_1</img>\n Tamara Drewe is <img>image_2</img>\n Nicholas Hardiment is <img>image_3</img>\n Please explain what's happening in the photos <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
70
+ "prediction": "Ben Sergeant, wearing a black leather jacket, stands close to Tamara Drewe and Nicholas Hardiment in a bookshop. The scene shifts, and now Nicholas is engrossed in a conversation with Ben while Tamara watches from a distance. Later, only the backs of their heads are visible as they move through the bustling shop. Eventually, Ben and Tamara sit opposite each other at a table, engrossed in a book signing, while Nicholas stands beside them."
71
+ },
72
+ {
73
+ "id": 14,
74
+ "question": "Andy Cobb is <img>image_1</img>\n Ben Sergeant is <img>image_2</img>\n In these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What interaction occurs between Andy Cobb and Ben Sergeant?",
75
+ "prediction": "Andy Cobb is seen initially squatting near a structure, then standing and facing Ben Sergeant, who approaches him. Finally, they appear to be in a close conversation or argument."
76
+ },
77
+ {
78
+ "id": 15,
79
+ "question": "Jody Long is <img>image_1</img>\n Tell me about the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
80
+ "prediction": "Jody Long sits at a desk in a dimly lit room, engaged with her computer. Later, she is seen from a different angle, appearing to be in a conversation while standing. Subsequently, Jody is depicted with another individual; they seem to be in the midst of a discussion, with Jody gesturing towards her chest."
81
+ },
82
+ {
83
+ "id": 16,
84
+ "question": "Tamara Drewe is <img>image_1</img>\n Andy Cobb is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
85
+ "prediction": "Andy Cobb stands by the door as Tamara Drewe looks on from a distance. They exit the building together, with Tamara trailing slightly behind Andy. Later, Tamara watches from the street as Andy interacts with a fence. Throughout, the tension between them is palpable, suggesting a story of unspoken words and silently felt emotions."
86
+ },
87
+ {
88
+ "id": 17,
89
+ "question": "Jody Long is <img>image_1</img>\n Ben Sergeant is <img>image_2</img>\n Give a description for these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
90
+ "prediction": "Jody Long and Ben Sergeant are first seen in the kitchen where Jody is pouring a drink. They then move to the table, with Jody looking thoughtful while Ben stands behind her. The pair walk towards a door, and eventually find themselves outside, standing close and facing each other by a tree."
91
+ },
92
+ {
93
+ "id": 18,
94
+ "question": "Gavin Nichols is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Gavin Nichols doing in these scenes?",
95
+ "prediction": "Gavin Nichols is shown standing in a doorway, then in an office facing someone, later he is seen sitting down, and finally bending over a desk."
96
+ },
97
+ {
98
+ "id": 19,
99
+ "question": "Shana Harris is <img>image_1</img>\n Gavin Nichols is <img>image_2</img>\n Could you interpret the scene from these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
100
+ "prediction": "Shana Harris, wearing a white shirt and red vest, stands to the left as she converses with Gavin Nichols, who is sitting on the edge of a desk in a white shirt and blue tie. Their discussion appears to evolve as Shana turns slightly towards Gavin, who then leans back in his chair, looking up at her."
101
+ },
102
+ {
103
+ "id": 20,
104
+ "question": "Chris (as Chris Gorham) is <img>image_1</img>\n Gavin Nichols is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
105
+ "prediction": "Gavin Nichols and Chris Gorham are in a kitchen, preparing food side by side on a table. Initially, Chris is on the left and Gavin on the right, both focused on their tasks. Then, Chris moves to the right as Gavin shifts slightly towards the center, both continuing to work on their dishes. Finally, Chris stands in the center with his back to the camera, while Gavin is on the left, facing towards Chris, engaged in the task at hand."
106
+ },
107
+ {
108
+ "id": 21,
109
+ "question": "Shana Harris is <img>image_1</img>\n Gavin Nichols is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
110
+ "prediction": "Shana Harris is busy with kitchen work while Gavin Nichols stands by the counter holding a bottle. As she continues her task, Gavin moves closer, drink in hand, engaging her with what seems like a conversation. They appear to be in a kitchen, possibly preparing for a meal together."
111
+ },
112
+ {
113
+ "id": 22,
114
+ "question": "Gavin Nichols is <img>image_1</img>\n Joe Harris is <img>image_2</img>\n Shana Harris is <img>image_3</img>\n Tell me about the pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
115
+ "prediction": "Gavin Nichols, Joe Harris, and Shana Harris walk together down a city street. They engage in conversation, with Gavin and Joe facing each other while Shana walks alongside them. Eventually, they stop and have a meal together at a dining table, with Gavin and Joe seated and Shana standing nearby."
116
+ },
117
+ {
118
+ "id": 23,
119
+ "question": "Gavin Nichols is <img>image_1</img>\n Shana Harris is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
120
+ "prediction": "Gavin Nichols and Shana Harris are first seen in a tense conversation across a table; Shana looks on as Gavin drinks from a can. The situation escalates as Gavin stands up on the patio, holding Shana's hand, and then they are lying down together, seemingly in a confrontational or exhausted manner."
121
+ },
122
+ {
123
+ "id": 24,
124
+ "question": "Det. Hollis Lucetti is <img>image_1</img>\n Describe the scene in these pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n.",
125
+ "prediction": "Det. Hollis Lucetti stands in a dining room, initially alone, then seen leaning down to speak to a young girl at the table. He later sits down with the family, engaging in a meal together."
126
+ },
127
+ {
128
+ "id": 25,
129
+ "question": "Freddie Quell is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n. What is Freddie Quell doing in these images? ",
130
+ "prediction": "Freddie Quell is seen having a close interaction with a woman, then sitting alone at a table possibly in a restaurant, and finally operating a camera in a different setting."
131
+ },
132
+ {
133
+ "id": 26,
134
+ "question": "Lancaster Dodd is <img>image_1</img>\n Analyze the contents of the following pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
135
+ "prediction": "Lancaster Dodd is initially seen joyously raising his arms with others at a celebration. The scene shifts to a more intimate setting where he is seated at a wedding table, initially appearing to engage in conversation. His mood shifts as he is captured standing, gesturing emphatically with his hand raised, suggesting an emotional or animated moment."
136
+ },
137
+ {
138
+ "id": 27,
139
+ "question": "Peggy Dodd is <img>image_1</img>\n Freddie Quell is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
140
+ "prediction": "Peggy Dodd and Freddie Quell sit opposite each other at a table, engaging in conversation. The setting changes to a dimly lit room where they stand close together, facing forward. Later, Freddie appears to be intently observing something while Peggy is seated in the background, looking on."
141
+ },
142
+ {
143
+ "id": 28,
144
+ "question": "Lancaster Dodd is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n What happens to Lancaster Dodd?",
145
+ "prediction": "Lancaster Dodd stands on a balcony, is then shoved by someone, walks outside with his head down, and finally appears to be led away by his arm."
146
+ },
147
+ {
148
+ "id": 29,
149
+ "question": "Cal is <img>image_1</img>\n Jacob is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
150
+ "prediction": "Cal and Jacob engage in a conversation while walking together outdoors. As they continue, Jacob stops and gestures towards Cal, explaining something. They then stand facing each other in front of a shoe display, with Jacob still animatedly talking."
151
+ },
152
+ {
153
+ "id": 30,
154
+ "question": "Cal is <img>image_1</img>\n Jacob is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
155
+ "prediction": "Cal, in a striped yellow shirt, is seen shopping inside a store where Jacob, dressed in a suit, is positioned near a mannequin. Jacob approaches Cal, and they interact closely, with Jacob adjusting Cal's shirt. Cal, now holding his hat, continues to engage with Jacob as they converse."
156
+ },
157
+ {
158
+ "id": 31,
159
+ "question": "Jacob is <img>image_1</img>\n Cal is <img>image_2</img>\n Kate is <img>image_3</img>\n Analyze the contents of the following pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
160
+ "prediction": "Jacob and Cal are seen shaking hands in a bar, while Kate sits nearby, looking surprised. The scene shifts to a more relaxed setting where Jacob and Cal engage in a one-on-one conversation over drinks at a table. Their interaction becomes more intense as they stand up, with Cal gesturing emphatically towards Jacob. The evening ends with Jacob measuring Cal against a door frame, seemingly checking his height."
161
+ },
162
+ {
163
+ "id": 32,
164
+ "question": "Jessica is <img>image_1</img>\n Robbie is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
165
+ "prediction": "Jessica and Robbie are first seen amidst a crowd, with Robbie slightly behind Jessica. In the next moment, they are standing apart, with Robbie looking on as Jessica walks away. Finally, they are close to each other, standing side by side, facing the same direction with onlookers in the background."
166
+ },
167
+ {
168
+ "id": 33,
169
+ "question": "Cal is <img>image_1</img>\n Bernie is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
170
+ "prediction": "Cal, holding a paint can, exchanges a high-five with Bernie who's wearing a red vest in a store. Later, Cal, with his back towards the camera, seems to be in a discussion with Bernie and other workers in a warehouse. Finally, Cal is seen leaning on a cart while Bernie gestures as if explaining something to him in a aisle of the store."
171
+ },
172
+ {
173
+ "id": 34,
174
+ "question": "Claire is <img>image_1</img>\n Jessica is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n and describe their content.",
175
+ "prediction": "Jessica and Claire stand closely in a kitchen, then they are seen outside with Claire talking to Jessica as they stand by a car. Finally, Jessica is driving the car with Claire as a passenger, looking distressed."
176
+ },
177
+ {
178
+ "id": 35,
179
+ "question": "Jacob is <img>image_1</img>\n Emily is <img>image_2</img>\n Hannah is <img>image_3</img>\n Could you interpret the scene from these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
180
+ "prediction": "Jacob, Emily, and Hannah are initially seated together, engaged in different conversations. They eventually clasp their hands in applause, indicating a shared moment. The sequence concludes with Emily and Hannah hugging each other, while Jacob stands at a distance, observing them."
181
+ },
182
+ {
183
+ "id": 36,
184
+ "question": "Cal is <img>image_1</img>\n Robbie is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
185
+ "prediction": "Cal, dressed in a suit, stands at a podium while Robbie, in a striped suit, sits beside him. Robbie takes the microphone, and Cal leans in to listen as Robbie speaks. Cal then gestures while looking down, continuing the interaction with Robbie, who holds the microphone throughout their conversation."
186
+ },
187
+ {
188
+ "id": 37,
189
+ "question": "Lynn Childers is <img>image_1</img>\n Sam Childers is <img>image_2</img>\n Analyze the contents of the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
190
+ "prediction": "Lynn and Sam Childers stand close together in a kitchen, facing each other. They seem to be in a serious conversation, with Sam turning slightly towards Lynn, who is looking back at him. The tension between them builds as they stand in close proximity, culminating in a passionate embrace."
191
+ },
192
+ {
193
+ "id": 38,
194
+ "question": "Sam Childers is <img>image_1</img>\n Lynn Childers is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
195
+ "prediction": "Sam and Lynn Childers sit together in a church, initially looking forward. Their attention shifts as they stand and face each other, with Lynn reaching out to Sam. They then turn, standing side by side, facing a gathering, with Sam gesturing emphatically towards Lynn."
196
+ },
197
+ {
198
+ "id": 39,
199
+ "question": "Sam Childers is <img>image_1</img>\n Lynn Childers is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n?",
200
+ "prediction": "Sam Childers, in a checkered shirt, is seen carrying Lynn Childers, wearing a black dress, as they walk through a forest trail. They maintain close proximity throughout the sequence, indicating a moment of intimacy or support between them."
201
+ },
202
+ {
203
+ "id": 40,
204
+ "question": "Sam Childers is <img>image_1</img>\n Deng is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
205
+ "prediction": "Sam Childers stands under a tree shade, watching as Deng sits on a chair in a seemingly organized outdoor area. Later, both Sam and Deng are seen inside a vehicle, with Sam in the driver's seat and Deng as the passenger, looking ahead. Finally, Sam and Deng walk side by side among a large encampment, with Sam leading the way."
206
+ },
207
+ {
208
+ "id": 41,
209
+ "question": "Deng is <img>image_1</img>\n Sam Childers is <img>image_2</img>\n In the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What are Sam Childers doing throughout these images? ",
210
+ "prediction": "Sam Childers is having a seated conversation with Deng, thenSam plays with a gun while Deng watches."
211
+ },
212
+ {
213
+ "id": 42,
214
+ "question": "Paige is <img>image_1</img>\n Sam Childers is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
215
+ "prediction": "Sam Childers, visible in a reflection in the mirror, is close to Paige in a bedroom. He then sits on the bed facing Paige, talking to her. Lastly, they appear to share a moment together as he shows her something on his arm while seated on the bed."
216
+ },
217
+ {
218
+ "id": 43,
219
+ "question": "Sam Childers is <img>image_1</img>\n In the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n. What is Sam Childers doing?",
220
+ "prediction": "Sam Childers is standing in front of a group of people, then he is seen walking through a puddle of water, and finally, he is standing in a room with a microphone in front of him."
221
+ },
222
+ {
223
+ "id": 44,
224
+ "question": "Sam Childers is <img>image_1</img>\n Describe the scene in these pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
225
+ "prediction": "Sam Childers, engaged and active, interacts with a group of children in a dirt-covered area. He begins by playing soccer with them, then demonstrates a bat for the kids, and finally, he engages in a running game, leading a child in front of a house. The children follow him enthusiastically, participating in the playful activities."
226
+ },
227
+ {
228
+ "id": 45,
229
+ "question": "Sam Childers is <img>image_1</img>\n Lynn Childers is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
230
+ "prediction": "Sam Childers and Lynn Childers are initially seen standing at the entrance of a room, Sam holding a drink while Lynn looks on. They then move closer together, engaging in a conversation with Lynn turning towards Sam as they talk. Finally, Sam and Lynn stand side by side, continuing their interaction with Lynn facing Sam, partially turned away from the camera."
231
+ },
232
+ {
233
+ "id": 46,
234
+ "question": "Lynn Childers is <img>image_1</img>\n Paige is <img>image_2</img>\n Sam Childers is <img>image_3</img>\n Please provide an overview of the visual information in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
235
+ "prediction": "Lynn Childers, wearing a blue dress, and Paige walk together beside a house, with Sam Childers following slightly behind them. They continue walking outside, passing by a convertible. Lynn and Paige stop to sit at an outdoor table, where Lynn appears to be in conversation with Sam, who is standing and gesturing."
236
+ },
237
+ {
238
+ "id": 47,
239
+ "question": "Sam Childers is <img>image_1</img>\n Deng is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
240
+ "prediction": "Sam Childers and Deng are seen together inside a vehicle, with Sam in the driver's seat and Deng as the front passenger. They appear to be engaged in a conversation or a journey. In the following scenes, they switch positions, with Deng taking the driver's seat and Sam as the front passenger, indicating a possible role change during their travel. Throughout their journey, they maintain a dialogue, as seen by the consistent orientation towards each other."
241
+ },
242
+ {
243
+ "id": 48,
244
+ "question": "Sam Childers is <img>image_1</img>\n Deng is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
245
+ "prediction": "Sam Childers and Deng are seen interacting with a group of children in a dry, open field. Initially, Sam is helping a child while Deng observes. As the scene progresses, they face each other among the children, with Sam turning towards Deng, possibly in conversation, while Deng looks on from a distance."
246
+ },
247
+ {
248
+ "id": 49,
249
+ "question": "Josh Lambert is <img>image_1</img>\n Renai Lambert is <img>image_2</img>\n Dalton Lambert is <img>image_3</img>\n What is displayed in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
250
+ "prediction": "Josh Lambert is seen adjusting his tie with Renai Lambert standing beside him, while Dalton Lambert is seated nearby. In the next moment, Josh is busy moving boxes in the kitchen with Renai looking on, and Dalton is playing with a toy on the floor. Finally, Josh bends down to interact with Dalton, who is standing by a table, as Renai watches from a closer distance."
251
+ },
252
+ {
253
+ "id": 50,
254
+ "question": "Josh Lambert is <img>image_1</img>\n Renai Lambert is <img>image_2</img>\n In these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n. What is Josh Lambert doing?",
255
+ "prediction": "Josh Lambert is first standing by the piano, then he bends down to interact with a child, and finally he is seen sitting on the couch."
256
+ },
257
+ {
258
+ "id": 51,
259
+ "question": "Sparrow is <img>image_1</img>\n For the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
260
+ "prediction": "Sparrow, facing a group of people, appears to be the center of attention. He then turns to converse with someone off-camera, later standing with his back to the camera, and finally, he gets into a vehicle, sitting in the passenger seat."
261
+ },
262
+ {
263
+ "id": 52,
264
+ "question": "Sparrow is <img>image_1</img>\n Roast Pork is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
265
+ "prediction": "Sparrow and Roast Pork, both in suits, appear to be having a heated exchange among a group of onlookers. They move through the crowd, exchanging intense looks and gestures. Eventually, their confrontation escalates as they stand face-to-face, before conclusion with a physical altercation, shoving each other around a table."
266
+ },
267
+ {
268
+ "id": 53,
269
+ "question": "Roast Pork is <img>image_1</img>\n Explain the content of these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
270
+ "prediction": "Roast pork walks through a restaurant, casually observing his surroundings. He pauses to interact with a tabled guest, seemingly in a light, casual exchange. The moment progresses to a more intense interaction as he confronts another individual, with his arm extended. Finally, he is seen walking away from the area, carrying his own plate, amongst other diners."
271
+ },
272
+ {
273
+ "id": 54,
274
+ "question": "Roast Pork is <img>image_1</img>\n Kerosene is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
275
+ "prediction": "Roast pork, wearing a white jacket, exchanges a red object with Kerosene, who is dressed in a grey shirt. They hold hands over the item, then roast pork turns his back to Kerosene, who now wears a green jacket. Finally, they appear to have a close conversation while facing each other."
276
+ },
277
+ {
278
+ "id": 55,
279
+ "question": "Roast Pork is <img>image_1</img>\n Nancy is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
280
+ "prediction": "Roast Pork and Nancy appear to be engaged in a tense conversation, facing each other closely. Subsequently, they are seen together with two children, focusing on an object on the table, suggesting a shared family moment. Their interaction seems to be part of a larger, ongoing dialogue in a home setting."
281
+ },
282
+ {
283
+ "id": 56,
284
+ "question": "Peter Sullivan is <img>image_1</img>\n Seth Bregman is <img>image_2</img>\n Could you interpret the scene from these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
285
+ "prediction": "Seth Bregman, wearing a dark suit, is seated while Peter Sullivan, in a light blue shirt, reads a document. As the scene progresses, Seth leans back in his chair while Peter, still holding the document, appears to be speaking to Seth. They seem to be engaged in a work-related discussion in an office-like setting."
286
+ },
287
+ {
288
+ "id": 57,
289
+ "question": "Will Emerson is <img>image_1</img>\n Seth Bregman is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
290
+ "prediction": "Will Emerson and Seth Bregman are first seen in a car, with Will driving and Seth as the passenger. They then switch positions, with Seth taking the wheel and Will as the passenger, as they continue their journey. Eventually, Seth steps out of the car, while Will remains inside, looking towards him as he walks away."
291
+ },
292
+ {
293
+ "id": 58,
294
+ "question": "John Tuld is <img>image_1</img>\n Sam Rogers is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
295
+ "prediction": "Sam Rogers stands at a distance in a room, facing towards John Tuld, who is slightly turned away. In the following moments, John is seen with his back towards Sam, appearing to be in deep thought at the sink, while Sam seems to be observing him from a closer distance. Their interaction suggests a tense atmosphere between them."
296
+ },
297
+ {
298
+ "id": 59,
299
+ "question": "Sam Rogers is <img>image_1</img>\n Peter Sullivan is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
300
+ "prediction": "Sam Rogers, dressed in a suit, is seen in conversation with Peter Sullivan, who is holding a coffee cup. They appear to be engaged in a serious discussion while walking outside. Throughout their interaction, they maintain a steady pace, indicating the importance of their dialogue."
301
+ },
302
+ {
303
+ "id": 60,
304
+ "question": "Sarah Robertson is <img>image_1</img>\n Eric Dale is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
305
+ "prediction": "Sarah Robertson stands by a window with a city view as Eric Dale, initially out of view, seems to be engaged in a conversation with her. They are in a well-lit room with a modern decor. Sarah then sits on a couch, facing Eric who is now standing close by. They continue their interaction, with Eric slightly leaning towards Sarah, suggesting a deepening dialogue between them."
306
+ },
307
+ {
308
+ "id": 61,
309
+ "question": "John Bennett is <img>image_1</img>\n In the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is the sequence of actions John Bennett performs? ",
310
+ "prediction": "John Bennett starts by standing among a crowd, then drinks from a water bottle, walks through the crowd, and finally seems to be in a physical altercation with someone."
311
+ },
312
+ {
313
+ "id": 62,
314
+ "question": "John Bennett is <img>image_1</img>\n Lori Collins is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
315
+ "prediction": "Lori Collins stands by the entrance while John Bennett sits at the counter with his back to her. They face each other from opposite ends of the counter, appearing to be in a conversation. John then stands up, turning towards Lori, continuing their interaction. The sequence ends with John sitting back down, still engaged in dialogue with Lori."
316
+ },
317
+ {
318
+ "id": 63,
319
+ "question": "Elite Guard No. 1 is <img>image_1</img>\n Perseus is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
320
+ "prediction": "Perseus and Elite Guard No. 1 stand close, looking out towards the sea, with Perseus' back partially turned to the viewer. The scene shifts as they watch a Pegasus take flight, Perseus turning his back to No. 1, who seems to be looking at the horse. Subsequently, Perseus is seen close up, conversing with No. 1, while the horse's attention is drawn towards something off-camera. Finally, Perseus mounts the horse, engaging with No. 1 one last time before setting off."
321
+ },
322
+ {
323
+ "id": 64,
324
+ "question": "Driss is <img>image_1</img>\n Philippe is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
325
+ "prediction": "Driss leans in towards Philippe who is seated, appearing to have a close conversation. Later, they are seen in a bustling environment with Driss leading Philippe, who seems to be in a wheelchair, through a crowded space. Eventually, they join a group of people, sitting side by side, with Driss turned towards Philippe, continuing their interaction."
326
+ },
327
+ {
328
+ "id": 65,
329
+ "question": "Driss is <img>image_1</img>\n Philippe is <img>image_2</img>\n Yvonne is <img>image_3</img>\n Analyze the contents of the following pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
330
+ "prediction": "Driss stands to the right of Philippe, who is seated in a wheelchair, while Yvonne is to the left of Philippe, looking their way. Driss seems to be engaging with them, possibly in conversation. Yvonne moves closer to Philippe, gesturing as if explaining something. Driss leans forward, listening intently, and finally, Driss stands upright, facing the camera with both Yvonne and Philippe behind him."
331
+ },
332
+ {
333
+ "id": 66,
334
+ "question": "Magalie is <img>image_1</img>\n Driss is <img>image_2</img>\n Yvonne is <img>image_3</img>\n Examine the following images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n and describe their content.",
335
+ "prediction": "Magalie and Driss engage in a close conversation while Yvonne observes. Subsequently, Driss walks away with a pink backpack as Magalie watches him leave. Yvonne remains behind, looking in their direction. Later, Magalie and Yvonne stand at the bottom of a staircase, facing each other, suggesting a continuation of their discussion."
336
+ },
337
+ {
338
+ "id": 67,
339
+ "question": "Fantine is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n What is happening to Fantine in these images?",
340
+ "prediction": "Fantine is involved in a physical altercation with other individuals, where she appears to be in a defensive stance against attack."
341
+ },
342
+ {
343
+ "id": 68,
344
+ "question": "Fantine is <img>image_1</img>\n In the following images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
345
+ "prediction": "Fantine appears in close proximity to another person, then moves through a crowded space, later she is seen crouching on a street with a fire behind her, and finally, she is kneeling on the ground with a distressed expression."
346
+ },
347
+ {
348
+ "id": 69,
349
+ "question": "Enjolras is <img>image_1</img>\n Marius is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
350
+ "prediction": "Enjolras and Marius lead a successful rally in a crowded square, with Enjolras gesturing passionately and Marius supporting a man on a podium. The atmosphere is joyous and energetic as they stand among the gathered crowd. Later, the two are seen close together, with Enjolras extending his arm towards a statue on a gate, and Marius leaning over a barrier amidst the crowd, the mood shift from celebration to a more serious, perhaps contemplative tone. They then move indoors, Enjolras supports a wounded man while Marius looks on, indicating a transition from public engagement to a more private, possibly strategic gathering."
351
+ },
352
+ {
353
+ "id": 70,
354
+ "question": "Cosette is <img>image_1</img>\n Marius is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
355
+ "prediction": "Marius and Cosette stand close to each other in a warm embrace, sharing an intimate moment. They then move through a bustling crowd, with Marius leading and Cosette following, creating a sense of connection between them amidst the chaos. Finally, Marius turns to face Cosette, suggesting a moment of pause or a significant exchange in their interaction."
356
+ },
357
+ {
358
+ "id": 71,
359
+ "question": "Tom Sawyer is <img>image_1</img>\n Huck Finn is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
360
+ "prediction": "Tom Sawyer and Huck Finn are engaged in an activity together, with Tom holding a pair of glasses and a small cup and Huck focusing on something in Tom's hands. They are surrounded by various items, indicating a casual, outdoor setting. As the scene progresses, Huck stands to get a better view, emphasizing their involvement in the same activity."
361
+ },
362
+ {
363
+ "id": 72,
364
+ "question": "Tante Polly is <img>image_1</img>\n Tom Sawyer is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
365
+ "prediction": "Tante Polly and Tom Sawyer are first seen outdoors, with Polly sitting on a fence and Tom facing her from a distance. They move closer for a more intimate interaction, with Sawyer's hand gently touching Polly's shoulder in a comforting gesture. Finally, they sit together at a dining table, where Polly appears to be explaining something to Sawyer as he listens attentively."
366
+ },
367
+ {
368
+ "id": 73,
369
+ "question": "Indianer Joe is <img>image_1</img>\n For the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, What does Indianer Joe do during the sequence of images?",
370
+ "prediction": "Indianer Joe starts by sitting at the bar with a drink, then interacts with someone across the table, and finally leans in for a close conversation."
371
+ },
372
+ {
373
+ "id": 74,
374
+ "question": "Tom Sawyer is <img>image_1</img>\n Huck Finn is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
375
+ "prediction": "Tom Sawyer and Huck Finn are first seen walking side by side with Tom holding a book. They then run energetically through a wooden structure area. Next, they face each other in a close conversation. Finally, they are peeking through a barred window, engrossed in something out of view."
376
+ },
377
+ {
378
+ "id": 75,
379
+ "question": "Tom Sawyer is <img>image_1</img>\n In the following images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
380
+ "prediction": "Tom Sawyer sits among his classmates in a rustic classroom, initially looking forward. He then turns his head to the right, appearing to be in conversation with someone off-camera. Following that, Tom faces forward again before leaning over a table as if to read or write something, engaging fully with the activity in front of him."
381
+ },
382
+ {
383
+ "id": 76,
384
+ "question": "Tom Sawyer is <img>image_1</img>\n Richter Thatcher is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
385
+ "prediction": "Tom Sawyer and Richter Thatcher are engaged in a tea party outdoors. They sit across from each other, with a table filled with tea and cake between them. In one moment, Richter leans in closely towards Tom, while in another, Tom leans in towards Richter, suggesting an intimate or serious exchange. Later, Richter enjoys a drink with a smoke, still seated beside Tom."
386
+ },
387
+ {
388
+ "id": 77,
389
+ "question": "Tante Polly is <img>image_1</img>\n Richter Thatcher is <img>image_2</img>\n Tom Sawyer is <img>image_3</img>\n Can you provide a description of the following images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
390
+ "prediction": "Tante Polly stands solemnly amidst a crowd, Richter Thatcher is by her side, and Tom Sawyer is seen sitting down, all dressed in black. As the scene progresses, Tom turns his head towards Polly and Thatcher, while Polly leans forward, appearing to be in a conversation with someone next to her. The interaction seems heartfelt, with gestures of comfort and care amidst a sea of other mourners."
391
+ },
392
+ {
393
+ "id": 78,
394
+ "question": "Richard is <img>image_1</img>\n Olivia is <img>image_2</img>\n Kate is <img>image_3</img>\n Analyze the contents of the following pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
395
+ "prediction": "Kate, Olivia, and Richard are together at a social gathering. Kate and Olivia engage in a conversation across the table, while Richard seems to be listening intently to Olivia. As the night progresses, Kate turns towards Richard, possibly making a point or sharing a laugh, while Olivia looks on. The atmosphere appears convivial, with Richard's attention primarily on Olivia."
396
+ },
397
+ {
398
+ "id": 79,
399
+ "question": "Kate is <img>image_1</img>\n Richard is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
400
+ "prediction": "Kate stands in the doorway while Richard is lounging on the couch with a glass of wine and a stuffed animal. She moves closer to Richard, who is busy reading a book. Finally, Kate confronts Richard face-to-face, standing right in front of him."
401
+ },
402
+ {
403
+ "id": 80,
404
+ "question": "Richard is <img>image_1</img>\n Olivia is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
405
+ "prediction": "Richard, wearing a blue suit, stands beside a classic car as Olivia, in a black dress, sits in the driver's seat. They then drive away with Richard in the passenger seat and Olivia behind the wheel. After reaching their destination, they exit the car and walk side by side, engaging in conversation."
406
+ },
407
+ {
408
+ "id": 81,
409
+ "question": "Jake is <img>image_1</img>\n Richard is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
410
+ "prediction": "Richard, dressed in a white suit, is seen close to Jake in various settings, initially indoors. They appear to be sharing an affectionate moment, with Richard's focus solely on Jake. As the sequence progresses, Richard seems to be engaging more actively with Jake, perhaps in a playful or instructive manner, while Jake listens attentively."
411
+ },
412
+ {
413
+ "id": 82,
414
+ "question": "Richard is <img>image_1</img>\n Gordon is <img>image_2</img>\n Analyze the contents of the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
415
+ "prediction": "Gordon, dressed in a blue suit, stands facing Richard, who is in a blue blazer and white pants. They appear to have a conversation as Gordon's back is initially turned to Richard. Then, Gordon is seen embracing a woman, Richard looks on. Later, they walk together towards an indoor setting."
416
+ },
417
+ {
418
+ "id": 83,
419
+ "question": "Chris is <img>image_1</img>\n Leon is <img>image_2</img>\n Marie is <img>image_3</img>\n Describe the scene in these pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
420
+ "prediction": "Chris, Leon, and Marie walk down a hospital corridor with Leon leading the way. They encounter an older man lying on the floor, and Chris and Marie stop to interact with him while Leon watches from a distance. Eventually, Chris and Marie embrace the older man, showing signs of comfort and affection."
421
+ },
422
+ {
423
+ "id": 84,
424
+ "question": "Frank is <img>image_1</img>\n Chris is <img>image_2</img>\n Leon is <img>image_3</img>\n What is displayed in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
425
+ "prediction": "Frank, Chris, and Leon are in a living room with Frank sitting on a stool and Chris and Leon lounging on a couch. They seem to be in a casual conversation, with Frank gesturing while talking to Chris. Their discussion appears ongoing as they remain seated, with Leon occasionally joining in from across the room."
426
+ },
427
+ {
428
+ "id": 85,
429
+ "question": "Chris is <img>image_1</img>\n Natalie is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
430
+ "prediction": "Natalie, in a red outfit, sits closely beside Chris, engaging in conversation in a cozy room with a Christmas tree in the background. They shift positions, maintaining their interaction, before Chris stands up, turning away slightly as if in mid-discussion. The sequence ends with Chris leaning against a wall and Natalie sitting on the floor, both appearing to be in a serious exchange."
431
+ },
432
+ {
433
+ "id": 86,
434
+ "question": "Monica is <img>image_1</img>\n In the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What happens to Monica during the sequence of these images?",
435
+ "prediction": "Monica appears to be in a conversation, then becomes visibly distressed, and is finally escorted out of the room by a man."
436
+ },
437
+ {
438
+ "id": 87,
439
+ "question": "Abbey is <img>image_1</img>\n Anthony is <img>image_2</img>\n Tom is <img>image_3</img>\n What is displayed in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
440
+ "prediction": "Tom sits across from Anthony, with Abbey seated next to Tom. They appear to be engaged in a serious conversation. Anthony stands emphatically, seemingly making a point to Tom, while Abbey looks on. The interaction intensifies as Abbey leans in closer, indicating a moment of urgency or importance in their dialogue."
441
+ },
442
+ {
443
+ "id": 88,
444
+ "question": "Tom is <img>image_1</img>\n Anthony is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
445
+ "prediction": "Tom and Anthony engage in a conversation while sitting on a bench, with Tom appearing to be the more animated speaker. They then stand facing each other, continuing their discussion, before Tom seems to listen attentively to Anthony, who is now talking with a gesture indicating emphasis or explanation."
446
+ },
447
+ {
448
+ "id": 89,
449
+ "question": "Haruta Kamijou is <img>image_1</img>\n Chika Homura is <img>image_2</img>\n Analyze the contents of the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
450
+ "prediction": "Haruta Kamijou and Chika Homura start by standing indoors, facing each other with Chika holding a saxophone. They move outside where they walk side by side, both holding papers. Next, they are seen from behind, walking among other students with Chika leading the way. Finally, they stand in front of a table with a poster, engaging in conversation with a group of people around them."
451
+ },
452
+ {
453
+ "id": 90,
454
+ "question": "Chika Homura is <img>image_1</img>\n Can you provide a description of the following images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
455
+ "prediction": "Chika Homura, wearing a school uniform with a tie, is seen in a classroom setting. She appears focused on her task initially, then is shown playing a wind instrument. Later, she is amongst a group of students moving around the room, and finally, she is seen sitting, likely still engaged in a class activity."
456
+ },
457
+ {
458
+ "id": 91,
459
+ "question": "Ji is <img>image_1</img>\n Wang Yi-Chi is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
460
+ "prediction": "Ji and Wang Yi-Chi are initially seen discussing something in a room. The scene shifts, showing Wang looking at a document while Ji appears to be in the midst of a conversation with him. The setting changes to an outdoor auto shop where they continue their interaction; first, under the hood of a car, then standing beside it."
461
+ },
462
+ {
463
+ "id": 92,
464
+ "question": "Maggie is <img>image_1</img>\n Wang Yi-Chi is <img>image_2</img>\n In the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Maggie doing?",
465
+ "prediction": "Maggie is having a conversation with Wang Yi-Chi."
466
+ },
467
+ {
468
+ "id": 93,
469
+ "question": "Maggie is <img>image_1</img>\n Please describe the following pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n.",
470
+ "prediction": "Maggie, seated at a table with a drink in hand, appears to be engaged in a meeting or a discussion. She seems attentive and involved in the ongoing conversation or activity, with her position and posture maintaining throughout the sequence of images."
471
+ },
472
+ {
473
+ "id": 94,
474
+ "question": "Jeff is <img>image_1</img>\n Shawn is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
475
+ "prediction": "Jeff, wearing a purple shirt, is in a room talking on the phone. Meanwhile, Shawn, not far behind, seems to be looking at Jeff, holding a sandwich in his hand. The situation escalates as Jeff sits on a couch while Shawn leans closely towards him from behind, indicating a tense or serious exchange between them."
476
+ },
477
+ {
478
+ "id": 95,
479
+ "question": "Tonya is <img>image_1</img>\n Diane Rawlinson is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
480
+ "prediction": "Tonya, with her back to the camera, appears to be speaking at a press conference. Diane Rawlinson stands beside her, slightly out of focus. The crowd, capturing the attention of the media and onlookers, watches Tonya from behind as she addresses them."
481
+ },
482
+ {
483
+ "id": 96,
484
+ "question": "Momo Adachi is <img>image_1</img>\n Sae Kashiwagi is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
485
+ "prediction": "Momo Adachi and Sae Kashiwagi are in the school cafeteria; Sae is sitting while leaning on her hand, and Momo is sitting across from her. They appear to be engaged in a conversation. Momo then gestures emphatically towards Sae, who looks surprised or amused. Eventually, Sae leans forward, resting her head on her crossed arms, while Momo reaches out to her, maintaining the interaction."
486
+ },
487
+ {
488
+ "id": 97,
489
+ "question": "Momo Adachi is <img>image_1</img>\n Kairi Okayasu is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
490
+ "prediction": "Momo Adachi and Kairi Okayasu are first seen hurriedly walking down a hallway together. They then pause in a classroom, facing each other, with a tension between them palpable. Momo reaches out to touch Kairi's necklace, before the two share a close, intimate moment, with Kairi slightly leaning in towards Momo."
491
+ },
492
+ {
493
+ "id": 98,
494
+ "question": "Momo Adachi is <img>image_1</img>\n Sae Kashiwagi is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
495
+ "prediction": "Momo Adachi, holding an umbrella, walks past Sae Kashiwagi. They exchange looks as Momo continues walking. Eventually, Momo turns to face Sae, presenting something in her hands, while Sae seems to be examining it closely."
496
+ },
497
+ {
498
+ "id": 99,
499
+ "question": "Sae Kashiwagi is <img>image_1</img>\n Toji is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
500
+ "prediction": "Sae Kashiwagi and Toji are initially at a distance, with Sae sitting and Toji standing across from her. Toji then moves closer to Sae, until they are face to face in close proximity. They appear to be engaged in an intense, personal conversation."
501
+ },
502
+ {
503
+ "id": 100,
504
+ "question": "Toji is <img>image_1</img>\n Momo Adachi is <img>image_2</img>\n Kairi Okayasu is <img>image_3</img>\n Tell me about the pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
505
+ "prediction": "Toji, Kairi, and Momo Adachi walk together, with Toji placed between Kairi and Momo. As they continue, Toji looks back, possibly addressing Kairi, who turns her body towards Toji while engaging with him. Eventually, Momo steps forward, moving closer to Toji as they converse."
506
+ },
507
+ {
508
+ "id": 101,
509
+ "question": "Toji is <img>image_1</img>\n Sae Kashiwagi is <img>image_2</img>\n For the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
510
+ "prediction": "Sae Kashiwagi, wearing a school uniform, is first seen sitting while looking to her side. Then, she is observed from behind, standing and walking away from her table. In the final image, Sae is seen walking briskly across the room, moving away from the table where Toji, also in a uniform, remains standing."
511
+ },
512
+ {
513
+ "id": 102,
514
+ "question": "Kairi Okayasu is <img>image_1</img>\n Momo Adachi is <img>image_2</img>\n For the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
515
+ "prediction": "Kairi Okayasu and Momo Adachi are in a kitchen, engaging in a conversation while Momo eats a piece of cake. Kairi, wearing a cooking apron, seems to be in the middle of a conversation as Momo listens and enjoys her cake. The interaction appears to be a casual and friendly conversation over a meal."
516
+ },
517
+ {
518
+ "id": 103,
519
+ "question": "Kairi Okayasu is <img>image_1</img>\n Momo Adachi is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
520
+ "prediction": "Kairi Okayasu and Momo Adachi walk through a crowded area, with Kairi leading the way. They stop their conversation in a park, with Kairi gesturing while talking to Momo. Eventually, they sit together among the flowers, continuing their interaction with Kairi still animatedly speaking."
521
+ },
522
+ {
523
+ "id": 104,
524
+ "question": "Kairi Okayasu is <img>image_1</img>\n Momo Adachi is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
525
+ "prediction": "Kairi Okayasu and Momo Adachi are first seen sitting opposite each other at a cafeteria table, engaged in conversation. They then hold hands while walking through a school campus, showing a close bond. Later, they walk together in a crowded area, still in their school uniforms, continuing their interaction."
526
+ },
527
+ {
528
+ "id": 105,
529
+ "question": "Elise Rainier is <img>image_1</img>\n Imogen Rainier is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
530
+ "prediction": "Elise Rainier stands facing Imogen Rainier, who is initially seen from behind. They appear to be in a serious conversation. Then, Imogen is shown from the front, continuing the dialogue with Elise. The scene concludes with both women, now older, standing in the background as emergency responders attend to an incident in the foreground."
531
+ },
532
+ {
533
+ "id": 106,
534
+ "question": "Dan Zhang is <img>image_1</img>\n Red Ye is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
535
+ "prediction": "Dan Zhang, standing beside a motorcycle, presents a document to Red Ye, who is facing away from him. Red then turns to accept the paper, and they continue to engage with each other closely, before Red rides away on the motorcycle while Dan watches."
536
+ },
537
+ {
538
+ "id": 107,
539
+ "question": "Dan Zhang is <img>image_1</img>\n Po Chen is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, What actions does Dan Zhang take?",
540
+ "prediction": "Dan Zhang starts by walking alongside Po Chen, then he is seen waiting at a bus stop, and finally, he rides the subway."
541
+ },
542
+ {
543
+ "id": 108,
544
+ "question": "King Kong is <img>image_1</img>\n Dan Zhang is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
545
+ "prediction": "Dan Zhang faces King Kong in a tense confrontation. Kong stands confidently while Dan is more reserved. As the situation escalates, Dan seems to be arguing or presenting a case to a back turned Kong. The final moment shows Dan taking a defensive stance with a gun pointed at Kong, indicating a standoff."
546
+ },
547
+ {
548
+ "id": 109,
549
+ "question": "Po Chen is <img>image_1</img>\n Red Ye is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
550
+ "prediction": "Po Chen and Red Ye start off standing side by side on a bridge, then Po turns to face Red as they engage in a conversation. The interaction becomes more intimate as they lean in close, and eventually, they sit side by side again, continuing their dialogue."
551
+ },
552
+ {
553
+ "id": 110,
554
+ "question": "Arata Kaizaki is <img>image_1</img>\n Chizuru Hishiro is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
555
+ "prediction": "Arata Kaizaki and Chizuru Hishiro are in a classroom with Arata walking in from the back, approaching Chizuru who is sitting at her desk. They then have a conversation, with Arata leaning towards Chizuru, who is listening attentively while holding a red paper."
556
+ },
557
+ {
558
+ "id": 111,
559
+ "question": "Arata Kaizaki is <img>image_1</img>\n Rena Kariu is <img>image_2</img>\n Kazuomi Oga is <img>image_3</img>\n Please give a summary of the images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
560
+ "prediction": "Arata Kaizaki is seen sitting at a desk, turning around to face Rena Kariu and Kazuomi Oga who are sitting across from him. They seem to be engaged in a conversation as Arata gestures with his hands while talking to Rena. The interaction appears to be in a classroom setting with other students around them."
561
+ },
562
+ {
563
+ "id": 112,
564
+ "question": "Chizuru Hishiro is <img>image_1</img>\n Arata Kaizaki is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
565
+ "prediction": "Chizuru Hishiro and Arata Kaizaki are seen walking through a school cafeteria. They face each other, engaging in a conversation. Arata hands something to Chizuru, who looks surprised. They continue to interact, with Chizuru appearing to be explaining something to Arata."
566
+ },
567
+ {
568
+ "id": 113,
569
+ "question": "Arata Kaizaki is <img>image_1</img>\n Chizuru Hishiro is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
570
+ "prediction": "Arata Kaizaki and Chizuru Hishiro are first seen at lockers; Arata is busying himself while Chizuru stands by. They move to a park where they sit on a bench; Arata looks away, seemingly preoccupied. Chizuru stands upright, facing Arata. Finally, Arata stands, gesturing towards Chizuru, who stands at attention."
571
+ },
572
+ {
573
+ "id": 114,
574
+ "question": "Rena Kariu is <img>image_1</img>\n Kazuomi Oga is <img>image_2</img>\n Chizuru Hishiro is <img>image_3</img>\n What do you see in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
575
+ "prediction": "Rena Kariu stands at the bottom of a staircase, looking up, while Chizuru Hishiro and Kazuomi Oga are on the stairs ahead of her. Chizuru and Oga engage in a conversation as Rena observes them. Then, Rena joins them, walking up the stairs to join Chizuru and Oga who wait for her."
576
+ },
577
+ {
578
+ "id": 115,
579
+ "question": "Chizuru Hishiro is <img>image_1</img>\n Arata Kaizaki is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
580
+ "prediction": "Chizuru Hishiro, dressed in a school uniform, is seen close to Arata Kaizaki. Initially, she is behind him, but then she moves closer, even holding a bowl while facing him. Their interaction suggests a service or care environment, with Chizuru in a attentive stance towards Arata."
581
+ },
582
+ {
583
+ "id": 116,
584
+ "question": "Arata Kaizaki is <img>image_1</img>\n Kazuomi Oga is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
585
+ "prediction": "Arata Kaizaki and Kazuomi Oga are first seen in a study setting where Arata is focused on writing, and Kazuomi appears to be taking a photo or checking his phone. They later sit across from each other at a table in a caf\u00e9, engaging in conversation. Finally, they face each other in a more formal setting, continuing their interaction."
586
+ },
587
+ {
588
+ "id": 117,
589
+ "question": "Arata Kaizaki is <img>image_1</img>\n Chizuru Hishiro is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
590
+ "prediction": "Arata Kaizaki and Chizuru Hishiro start inside a building, standing in a line with others, seemingly waiting. They move outdoors, where Arata sits on the ground among a group, engaging in a picnic atmosphere. Finally, they stand alone, facing forward with a trees background, indicating a shift from group to individual moments."
591
+ },
592
+ {
593
+ "id": 118,
594
+ "question": "Anna Karenina is <img>image_1</img>\n Kitty is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
595
+ "prediction": "Anna Karenina and Kitty engage in a conversation, with Anna listening intently to Kitty. The setting shifts to a grand staircase where Kitty, in a white dress, seems to instruct Anna, leading her by the hand. They move closer, with Anna holding onto Kitty's wrist, suggesting a moment of guidance or preparation for a dance or social event."
596
+ },
597
+ {
598
+ "id": 119,
599
+ "question": "Oblonsky is <img>image_1</img>\n Anna Karenina is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
600
+ "prediction": "Oblonsky and Anna Karenina are seen together in different settings, initially close to each other under a fur-lined umbrella, then walking hand in hand into a grand event. At the event, they are initially apart, with Anna appearing to call attention to herself. However, they soon find each other again, Anna sitting while Oblonsky stands beside her, engaging her with a gesture."
601
+ },
602
+ {
603
+ "id": 120,
604
+ "question": "Vronsky is <img>image_1</img>\n Kitty is <img>image_2</img>\n For the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
605
+ "prediction": "Vronsky, dressed in a white military uniform, stands on a balcony with other officers, while Kitty, wearing a light dress, stands nearby. They move through a crowded ballroom, where Vronsky gently takes Kitty's hand, guiding her among the dancing guests. They are seen in close proximity, seemingly engaged in a private moment amidst the bustling event."
606
+ },
607
+ {
608
+ "id": 121,
609
+ "question": "Irving Rosenfeld is <img>image_1</img>\n Sydney Prosser is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
610
+ "prediction": "Sydney Prosser and Irving Rosenfeld are perusing through a selection of fur coats in a warehouse. They move closer to each other, with Sydney examining a coat and Irving observing her choice. Then, they stand side by side, with Sydney adjusting a coat for Irving to try on. They appear to be engaged in a collaboration related to the coats."
611
+ },
612
+ {
613
+ "id": 122,
614
+ "question": "Rosalyn Rosenfeld is <img>image_1</img>\n Irving Rosenfeld is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
615
+ "prediction": "Rosalyn and Irving Rosenfeld engage in a conversation in a kitchen; Rosalyn appears animated while Irving listens intently. They switch positions, with Irving now speaking into a phone held to his ear and Rosalyn watching him. Their conversation continues, with Rosalyn turning towards Irving, who is still on the phone, seemingly deep in discussion."
616
+ },
617
+ {
618
+ "id": 123,
619
+ "question": "Richie DiMaso is <img>image_1</img>\n Irving Rosenfeld is <img>image_2</img>\n Stoddard Thorsen is <img>image_3</img>\n Analyze the contents of the following pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
620
+ "prediction": "Richie DiMaso and Irving Rosenfeld are engaged in a discussion while Stoddard Thorsen listens. As the conversation progresses, Richie becomes more animated, gesturing with his hands. Eventually, Richie leans in towards Stoddard, emphasizing his point, and then seen physically restraining Stoddard against a wall."
621
+ },
622
+ {
623
+ "id": 124,
624
+ "question": "Razieh is <img>image_1</img>\n Nader is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
625
+ "prediction": "Nader is in the kitchen with Razieh, who seems to be caring for someone. They are engaged in a daily routine, possibly preparing food or cleaning up. As they work, Razieh's attention shifts towards Nader, who is standing by observing. Their interaction appears to be casual, with Nader checking in or providing guidance as Razieh performs her tasks."
626
+ },
627
+ {
628
+ "id": 125,
629
+ "question": "Simin is <img>image_1</img>\n Nader is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
630
+ "prediction": "Simin is busy chopping vegetables in the kitchen while Nader is observed from a distance. Shortly after, Nader moves closer and appears to be peeking around a corner. The two then stand close together, facing a common direction, and eventually, Simin is seen engaged in a phone conversation with Nader sitting beside her."
631
+ },
632
+ {
633
+ "id": 126,
634
+ "question": "Simin is <img>image_1</img>\n Nader is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
635
+ "prediction": "Simin and Nader are together in a kitchen; Simin is turned away from Nader, who seems to be working at the sink. In the next moment, Nader is seen from behind, facing a dish rack near the sink while Simin is not in the frame. Then, they are seen through a window, sitting opposite each other with Simin appearing engaged in a task and Nader gesturing with his hand. Finally, they are indoors, with Simin focused on writing or drawing at a table while Nader is partially visible in the foreground, seemingly in conversation with Simin."
636
+ },
637
+ {
638
+ "id": 127,
639
+ "question": "Tris is <img>image_1</img>\n Christina is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
640
+ "prediction": "Tris and Christina exit a train, moving quickly onto a rooftop. They then navigate cautiously over the rooftop, with Tris leading the way and Christina following close behind. Eventually, they face each other, with Tris turning to include Christina in a group discussion."
641
+ },
642
+ {
643
+ "id": 128,
644
+ "question": "Tris is <img>image_1</img>\n Peter is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
645
+ "prediction": "Peter and Tris are initially seen side by side, seemingly in a serious conversation. The scene shifts to a training session where both are engaged in a physical activity, displaying a focus on their interaction. Later, they are seen walking together with a group, suggesting a sense of belonging and coordination among them."
646
+ },
647
+ {
648
+ "id": 129,
649
+ "question": "Tris is <img>image_1</img>\n Christina is <img>image_2</img>\n Peter is <img>image_3</img>\n Please provide an overview of the visual information in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
650
+ "prediction": "Initially, Tris, Christina, and Peter stand side by side, with Tris and Peter mostly in profile views and Christina facing forward. The scene shifts to a more intimate setting where Tris is lying down while Peter is seated beside her, looking at Tris. Finally, the group is seen again, with Tris standing in the center and Peter to her left, both facing forward. Christina is not in the foreground but is present in the background."
651
+ },
652
+ {
653
+ "id": 130,
654
+ "question": "Four is <img>image_1</img>\n Tris is <img>image_2</img>\n Eric is <img>image_3</img>\n What do you see in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
655
+ "prediction": "Eric, Tris, and Four stand together amidst a crowd, with Eric on the left and Four to the right of Tris. They seem to be in a tense situation, as Eric is later seen holding a gun, with Four closely watching from the side. In the final scene, they stand side by side, surrounded by others, displaying a sense of unity and possibly preparation for an event."
656
+ },
657
+ {
658
+ "id": 131,
659
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
660
+ "prediction": "Imamura stands on a rooftop, looking down, while Wakaba, seated further back, looks towards him. In the following moments, Wakaba gets closer to the edge, seemingly in a precarious position, as Imamura watches from afar, maintaining his focus on her."
661
+ },
662
+ {
663
+ "id": 132,
664
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
665
+ "prediction": "Wakaba, seated with hands clasped, converses with Imamura who is sitting beside her. Later, they walk together through a store with other patrons in the background. Eventually, they sit together at a table with Imamura turning his head towards Wakaba, continuing their interaction."
666
+ },
667
+ {
668
+ "id": 133,
669
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n and describe their content.",
670
+ "prediction": "Imamura is first seen crouched down, seemingly focused on an activity on the floor. Soon after, Wakaba appears in the room, sitting at a table while Imamura is still on the floor. The sequence concludes with both Imamura and Wakaba sitting side by side, engaged in a conversation."
671
+ },
672
+ {
673
+ "id": 134,
674
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
675
+ "prediction": "Imamura and Wakaba walk through a hallway together; Wakaba holds a piece of paper. Imamura looks at her, then points at something above him as they continue their walk."
676
+ },
677
+ {
678
+ "id": 135,
679
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n For the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
680
+ "prediction": "Wakaba and Imamura are seated in a car with another person. Initially, Wakaba is focused on reading a book while Imamura appears to be looking at her. Then, Imamura is seen eating chips and conversing with Wakaba, who remains attentive to her book. They continue their journey, with Imamura occasionally looking away but generally engaged in the interaction."
681
+ },
682
+ {
683
+ "id": 136,
684
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
685
+ "prediction": "Imamura and Wakaba are seated next to each other among a crowd, initially looking forward. Wakaba then claps her hands enthusiastically while Imamura looks on. Subsequently, both turn their heads towards each other, engaging in a conversation or interaction, with Wakaba holding a red program."
686
+ },
687
+ {
688
+ "id": 137,
689
+ "question": "Utah is <img>image_1</img>\n In the following pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Utah doing?",
690
+ "prediction": "Utah starts by steering the boat, then he prepares to jump on a jet ski, and finally, he is seen holding onto a rope, likely waiting to ride the jet ski."
691
+ },
692
+ {
693
+ "id": 138,
694
+ "question": "Utah is <img>image_1</img>\n Bodhi is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
695
+ "prediction": "Utah and Bodhi stand side by side in a rugged landscape, with a vehicle behind them. They then appear inside a car, with Utah in the driver's seat and Bodhi as the passenger, driving through what seems to be a stormy environment. Subsequently, they are seen together with a sense of urgency, as Bodhi appears to support Utah, who seems to be in distress, while moving off the vehicle."
696
+ },
697
+ {
698
+ "id": 139,
699
+ "question": "Utah is <img>image_1</img>\n Pappas is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
700
+ "prediction": "Utah and Pappas are initially inside, with Utah in the foreground and Pappas behind. They move outside, where Utah is seen being pushed out of a car while Pappas stands by. Subsequently, both are shown sitting in a car with Pappas driving and Utah as a passenger. The sequence ends with Utah leaning out of the car window, apparently using a phone while Pappas watches."
701
+ },
702
+ {
703
+ "id": 140,
704
+ "question": "Zhang Xuening is <img>image_1</img>\n In the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Zhang Xuening doing?",
705
+ "prediction": "Zhang Xuening is socializing at a party, initially standing alone, then she is seen conversing with a man, and finally, she is kneeling down while talking to another guest."
706
+ },
707
+ {
708
+ "id": 141,
709
+ "question": "Zhang Xuening is <img>image_1</img>\n Provide a description for the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
710
+ "prediction": "Zhang Xuening, dressed in a red dress, is seen at a social gathering. Initially, she is engaged in a conversation with guests. Later, she is captured from behind, walking away from the group. Eventually, Zhang is shown in a close-up, facing someone, possibly continuing an exchange."
711
+ },
712
+ {
713
+ "id": 142,
714
+ "question": "He Bing is <img>image_1</img>\n Zhang Xuening is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
715
+ "prediction": "He Bing and Zhang Xuening are seated opposite each other in a train compartment, engaged in a conversation. He extends his hand towards her, possibly making a point or gesturing during their discussion. Later, the setting shifts and they appear in a dimly lit room, with He Bing facing towards Zhang Xuening, suggesting a continuation of their dialogue or a more intense exchange. Eventually, they stand in a crowded space, maintaining a distance from each other, indicating a shift in their interaction or environment."
716
+ },
717
+ {
718
+ "id": 143,
719
+ "question": "He Bing is <img>image_1</img>\n Zhang Xuening is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
720
+ "prediction": "He Bing sits beside Zhang Xuening's bed, looking at her as she lies asleep. The next day, he stands by her bedside, watching her while she lies propped up by pillows. Eventually, Zhang Xuening is seen holding a glass, with He Bing standing at her doorway, looking back at her. He then leans closer, talking to her as he remains standing by the bedside."
721
+ },
722
+ {
723
+ "id": 144,
724
+ "question": "Zhang Xuening is <img>image_1</img>\n Please provide an overview of the visual information in these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
725
+ "prediction": "Zhang Xuening, dressed in a brown suit, is initially seated and engaged in a conversation. She then stands up, facing a standing man, seemingly continuing the discussion. Eventually, she is seen with her hand over her mouth, indicating a change in the interaction's intensity or a moment of reflection."
726
+ },
727
+ {
728
+ "id": 145,
729
+ "question": "Shen Jing is <img>image_1</img>\n He Bing is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
730
+ "prediction": "Shen Jing stands facing He Bing, who is seated and appears to be in a conversation with her. They then sit opposite each other, continuing their interaction. The scene progresses with them sitting closely on a bed, seemingly in a serious discussion."
731
+ },
732
+ {
733
+ "id": 146,
734
+ "question": "Claude Verneuil is <img>image_1</img>\n Marie Verneuil is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
735
+ "prediction": "Claude and Marie Verneuil are initially seen conversing with drinks in hand, standing in a room. The setting then shifts to a dining scene where Claude is seen speaking while gesturing with his hands, while Marie listens attentively. The sequence ends with the couple in a car, suggesting they have left the house together."
736
+ },
737
+ {
738
+ "id": 147,
739
+ "question": "David Benichou is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n What is David doing?",
740
+ "prediction": "David is initially seen extending his arm out, then walking beside a woman, later leaning slightly towards her, and finally crouching or kneeling beside a baby carriage."
741
+ },
742
+ {
743
+ "id": 148,
744
+ "question": "Claude Verneuil is <img>image_1</img>\n Marie Verneuil is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
745
+ "prediction": "Claude Verneuil, dressed in a shirt and tie, engages in a conversation with Marie Verneuil, who is holding a plate of food. As they talk, Marie listens and gestures with the plate, while Claude responds. Eventually, they walk together towards a building in the distance."
746
+ },
747
+ {
748
+ "id": 149,
749
+ "question": "Claude Verneuil is <img>image_1</img>\n Marie Verneuil is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
750
+ "prediction": "Claude Verneuil is initially seated at a desk, facing away from Marie Verneuil, who stands in the doorway. Claude then turns to greet Marie with a handshake, welcoming her inside. Subsequently, Claude and Marie sit closely together on a bench outside, appearing to have a conversation."
751
+ },
752
+ {
753
+ "id": 150,
754
+ "question": "Claude Verneuil is <img>image_1</img>\n Marie Verneuil is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
755
+ "prediction": "Claude Verneuil and Marie Verneuil are initially engaged in a conversation over food in a warmly lit kitchen. They then move through a hallway where Claude is adjusting a picture frame, with Marie looking on. Subsequently, they appear together with Claude holding a bottle, suggesting a social gathering. Finally, they stand among a group of people, continuing to converse."
756
+ },
757
+ {
758
+ "id": 151,
759
+ "question": "Marie Verneuil is <img>image_1</img>\n Claude Verneuil is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
760
+ "prediction": "Marie and Claude Verneuil are first seen sitting on a couch with Marie reading a magazine and Claude looking relaxed. Their positions shift slightly as they continue their interaction, but they remain on the couch. Later, they stand together, engaging in conversation with a visiting couple, before returning to the couch for a more formal setting with teacups in sight, maintaining their earlier relaxed demeanor."
761
+ },
762
+ {
763
+ "id": 152,
764
+ "question": "Claude Verneuil is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Claude doing?",
765
+ "prediction": "Claude is initially carrying a chicken, then he walks down the aisle of a train, seems to be arguing with someone, and finally appears to be pushing that person."
766
+ },
767
+ {
768
+ "id": 153,
769
+ "question": "Anna Howland-Jones is <img>image_1</img>\n Alice Howland is <img>image_2</img>\n John Howland is <img>image_3</img>\n Please provide an overview of the visual information in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
770
+ "prediction": "Alice and Anna Howland stand close, conversing intimately, while John is seen in the background. The atmosphere changes as John moves into the background, appearing to prepare something as Alice and Anna look on. Later, all three are focused on a kitchen counter, with John cooking and Anna and Alice leaning over the table. The scene shifts to a hospital room, where John stands near a seated Anna and Alice, who is holding a baby."
771
+ },
772
+ {
773
+ "id": 154,
774
+ "question": "Gail Harris is <img>image_1</img>\n Tell me about the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
775
+ "prediction": "Gail Harris starts in the kitchen, preparing at the table with children nearby. She continues her task, moving around the kitchen as the children seem engaged in conversation or play. The scene shifts to a street where Gail, carrying something, walks with purpose amidst vintage cars. Finally, she is seen inside a vehicle, perhaps ready to drive away with the children."
776
+ },
777
+ {
778
+ "id": 155,
779
+ "question": "Gail Harris is <img>image_1</img>\n J. Paul Getty is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
780
+ "prediction": "Gail Harris and J. Paul Getty engage in a conversation, facing each other. The setting shifts, showing them from a different angle as they continue their discussion. They remain in close proximity throughout, indicating a serious and intimate exchange between the two."
781
+ },
782
+ {
783
+ "id": 156,
784
+ "question": "J. Paul Getty is <img>image_1</img>\n Provide a description for the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n.",
785
+ "prediction": "J. Paul Getty is first seen in a leisurely setting, standing by a table filled with food and drinks. He then appears in a stark, outdoor scene, standing dominant in the frame. Finally, Getty is shown in a touching moment, holding hands with an unnamed child, as they walk together amidst the cold."
786
+ },
787
+ {
788
+ "id": 157,
789
+ "question": "J. Paul Getty is <img>image_1</img>\n Fletcher Chace is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
790
+ "prediction": "J. Paul Getty and Fletcher Chace are first seen seated indoors with Chace's hands clasped on a table while Getty sits with his hands together. They are then outside, with Getty standing and Chace partially obscured behind him. In the following scene, Chace walks ahead while Getty sits on a bench with a cane. Finally, they are seen together outdoors, with Chace looking ahead and Getty in front of him. Throughout these moments, their positions relative to each other suggest a dynamic conversation or interaction, starting indoors and moving outside."
791
+ },
792
+ {
793
+ "id": 158,
794
+ "question": "Gail Harris is <img>image_1</img>\n Fletcher Chace is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
795
+ "prediction": "Gail Harris and Fletcher Chace walk through an airport arrival area together, followed by a media scrum. Chace is seen speaking to the media as Harris stands beside him. In a subsequent moment, Chace seems to be leading Harris away from the cameras, suggesting a departure or an end to the media interaction."
796
+ }
797
+ ]
weights/model-idf/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e43a8429bf7629454f0d530e5112fea1dab265c3d8bb046bb91e8f2aa81c07a4
3
+ size 9969772092
weights/model-idf/pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56d7dfbdffabe1fb21e008bca646991286f0438aaa2aed2cd5925ed269d72480
3
+ size 9478651117
weights/model-idf/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,864 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 19448120832
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00002-of-00002.bin",
7
+ "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
8
+ "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
9
+ "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
10
+ "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00002.bin",
11
+ "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00002.bin",
12
+ "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
13
+ "transformer.h.0.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
14
+ "transformer.h.0.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
15
+ "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
16
+ "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
17
+ "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
18
+ "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00002.bin",
19
+ "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00002.bin",
20
+ "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
21
+ "transformer.h.1.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
22
+ "transformer.h.1.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
23
+ "transformer.h.10.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
24
+ "transformer.h.10.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
25
+ "transformer.h.10.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
26
+ "transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00002.bin",
27
+ "transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00002.bin",
28
+ "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
29
+ "transformer.h.10.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
30
+ "transformer.h.10.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
31
+ "transformer.h.11.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
32
+ "transformer.h.11.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
33
+ "transformer.h.11.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
34
+ "transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00002.bin",
35
+ "transformer.h.11.ln_2.weight": "pytorch_model-00001-of-00002.bin",
36
+ "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
37
+ "transformer.h.11.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
38
+ "transformer.h.11.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
39
+ "transformer.h.12.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
40
+ "transformer.h.12.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
41
+ "transformer.h.12.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
42
+ "transformer.h.12.ln_1.weight": "pytorch_model-00001-of-00002.bin",
43
+ "transformer.h.12.ln_2.weight": "pytorch_model-00001-of-00002.bin",
44
+ "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
45
+ "transformer.h.12.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
46
+ "transformer.h.12.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
47
+ "transformer.h.13.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
48
+ "transformer.h.13.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
49
+ "transformer.h.13.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
50
+ "transformer.h.13.ln_1.weight": "pytorch_model-00001-of-00002.bin",
51
+ "transformer.h.13.ln_2.weight": "pytorch_model-00001-of-00002.bin",
52
+ "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
53
+ "transformer.h.13.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
54
+ "transformer.h.13.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
55
+ "transformer.h.14.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
56
+ "transformer.h.14.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
57
+ "transformer.h.14.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
58
+ "transformer.h.14.ln_1.weight": "pytorch_model-00001-of-00002.bin",
59
+ "transformer.h.14.ln_2.weight": "pytorch_model-00001-of-00002.bin",
60
+ "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
61
+ "transformer.h.14.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
62
+ "transformer.h.14.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
63
+ "transformer.h.15.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
64
+ "transformer.h.15.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
65
+ "transformer.h.15.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
66
+ "transformer.h.15.ln_1.weight": "pytorch_model-00001-of-00002.bin",
67
+ "transformer.h.15.ln_2.weight": "pytorch_model-00001-of-00002.bin",
68
+ "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
69
+ "transformer.h.15.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
70
+ "transformer.h.15.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
71
+ "transformer.h.16.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
72
+ "transformer.h.16.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
73
+ "transformer.h.16.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
74
+ "transformer.h.16.ln_1.weight": "pytorch_model-00001-of-00002.bin",
75
+ "transformer.h.16.ln_2.weight": "pytorch_model-00001-of-00002.bin",
76
+ "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
77
+ "transformer.h.16.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
78
+ "transformer.h.16.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
79
+ "transformer.h.17.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
80
+ "transformer.h.17.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
81
+ "transformer.h.17.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
82
+ "transformer.h.17.ln_1.weight": "pytorch_model-00001-of-00002.bin",
83
+ "transformer.h.17.ln_2.weight": "pytorch_model-00001-of-00002.bin",
84
+ "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
85
+ "transformer.h.17.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
86
+ "transformer.h.17.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
87
+ "transformer.h.18.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
88
+ "transformer.h.18.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
89
+ "transformer.h.18.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
90
+ "transformer.h.18.ln_1.weight": "pytorch_model-00001-of-00002.bin",
91
+ "transformer.h.18.ln_2.weight": "pytorch_model-00001-of-00002.bin",
92
+ "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
93
+ "transformer.h.18.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
94
+ "transformer.h.18.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
95
+ "transformer.h.19.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
96
+ "transformer.h.19.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
97
+ "transformer.h.19.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "transformer.h.19.ln_1.weight": "pytorch_model-00001-of-00002.bin",
99
+ "transformer.h.19.ln_2.weight": "pytorch_model-00001-of-00002.bin",
100
+ "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
101
+ "transformer.h.19.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
102
+ "transformer.h.19.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
103
+ "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
104
+ "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
105
+ "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
106
+ "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00002.bin",
107
+ "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00002.bin",
108
+ "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
109
+ "transformer.h.2.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
110
+ "transformer.h.2.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
111
+ "transformer.h.20.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
112
+ "transformer.h.20.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
113
+ "transformer.h.20.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
114
+ "transformer.h.20.ln_1.weight": "pytorch_model-00001-of-00002.bin",
115
+ "transformer.h.20.ln_2.weight": "pytorch_model-00001-of-00002.bin",
116
+ "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
117
+ "transformer.h.20.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
118
+ "transformer.h.20.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
119
+ "transformer.h.21.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
120
+ "transformer.h.21.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
121
+ "transformer.h.21.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
122
+ "transformer.h.21.ln_1.weight": "pytorch_model-00001-of-00002.bin",
123
+ "transformer.h.21.ln_2.weight": "pytorch_model-00001-of-00002.bin",
124
+ "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
125
+ "transformer.h.21.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
126
+ "transformer.h.21.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
127
+ "transformer.h.22.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
128
+ "transformer.h.22.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
129
+ "transformer.h.22.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
130
+ "transformer.h.22.ln_1.weight": "pytorch_model-00002-of-00002.bin",
131
+ "transformer.h.22.ln_2.weight": "pytorch_model-00002-of-00002.bin",
132
+ "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
133
+ "transformer.h.22.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
134
+ "transformer.h.22.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
135
+ "transformer.h.23.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
136
+ "transformer.h.23.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
137
+ "transformer.h.23.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
138
+ "transformer.h.23.ln_1.weight": "pytorch_model-00002-of-00002.bin",
139
+ "transformer.h.23.ln_2.weight": "pytorch_model-00002-of-00002.bin",
140
+ "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
141
+ "transformer.h.23.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
142
+ "transformer.h.23.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
143
+ "transformer.h.24.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
144
+ "transformer.h.24.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
145
+ "transformer.h.24.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
146
+ "transformer.h.24.ln_1.weight": "pytorch_model-00002-of-00002.bin",
147
+ "transformer.h.24.ln_2.weight": "pytorch_model-00002-of-00002.bin",
148
+ "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
149
+ "transformer.h.24.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
150
+ "transformer.h.24.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
151
+ "transformer.h.25.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
152
+ "transformer.h.25.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
153
+ "transformer.h.25.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
154
+ "transformer.h.25.ln_1.weight": "pytorch_model-00002-of-00002.bin",
155
+ "transformer.h.25.ln_2.weight": "pytorch_model-00002-of-00002.bin",
156
+ "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
157
+ "transformer.h.25.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
158
+ "transformer.h.25.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
159
+ "transformer.h.26.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
160
+ "transformer.h.26.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
161
+ "transformer.h.26.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
162
+ "transformer.h.26.ln_1.weight": "pytorch_model-00002-of-00002.bin",
163
+ "transformer.h.26.ln_2.weight": "pytorch_model-00002-of-00002.bin",
164
+ "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
165
+ "transformer.h.26.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
166
+ "transformer.h.26.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
167
+ "transformer.h.27.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
168
+ "transformer.h.27.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
169
+ "transformer.h.27.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
170
+ "transformer.h.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
171
+ "transformer.h.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
172
+ "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
173
+ "transformer.h.27.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
174
+ "transformer.h.27.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
175
+ "transformer.h.28.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
176
+ "transformer.h.28.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
177
+ "transformer.h.28.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
178
+ "transformer.h.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
179
+ "transformer.h.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
180
+ "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
181
+ "transformer.h.28.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
182
+ "transformer.h.28.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
183
+ "transformer.h.29.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
184
+ "transformer.h.29.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
185
+ "transformer.h.29.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
186
+ "transformer.h.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
187
+ "transformer.h.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
188
+ "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
189
+ "transformer.h.29.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
190
+ "transformer.h.29.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
191
+ "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
192
+ "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
193
+ "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
194
+ "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00002.bin",
195
+ "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00002.bin",
196
+ "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
197
+ "transformer.h.3.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
198
+ "transformer.h.3.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
199
+ "transformer.h.30.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
200
+ "transformer.h.30.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
201
+ "transformer.h.30.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
202
+ "transformer.h.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
203
+ "transformer.h.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
204
+ "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
205
+ "transformer.h.30.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
206
+ "transformer.h.30.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
207
+ "transformer.h.31.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
208
+ "transformer.h.31.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
209
+ "transformer.h.31.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
210
+ "transformer.h.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
211
+ "transformer.h.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
212
+ "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
213
+ "transformer.h.31.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
214
+ "transformer.h.31.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
215
+ "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
216
+ "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
217
+ "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
218
+ "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00002.bin",
219
+ "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00002.bin",
220
+ "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
221
+ "transformer.h.4.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
222
+ "transformer.h.4.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
223
+ "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
224
+ "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
225
+ "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
226
+ "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00002.bin",
227
+ "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00002.bin",
228
+ "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
229
+ "transformer.h.5.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
230
+ "transformer.h.5.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
231
+ "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
232
+ "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
233
+ "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
234
+ "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00002.bin",
235
+ "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00002.bin",
236
+ "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
237
+ "transformer.h.6.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
238
+ "transformer.h.6.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
239
+ "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
240
+ "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
241
+ "transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
242
+ "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00002.bin",
243
+ "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00002.bin",
244
+ "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "transformer.h.7.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
246
+ "transformer.h.7.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
247
+ "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
248
+ "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
249
+ "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
250
+ "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00002.bin",
251
+ "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00002.bin",
252
+ "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
253
+ "transformer.h.8.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
254
+ "transformer.h.8.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
255
+ "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
256
+ "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
257
+ "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
258
+ "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00002.bin",
259
+ "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00002.bin",
260
+ "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
261
+ "transformer.h.9.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
262
+ "transformer.h.9.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
263
+ "transformer.ln_f.weight": "pytorch_model-00002-of-00002.bin",
264
+ "transformer.visual.attn_pool.attn.in_proj_bias": "pytorch_model-00002-of-00002.bin",
265
+ "transformer.visual.attn_pool.attn.in_proj_weight": "pytorch_model-00002-of-00002.bin",
266
+ "transformer.visual.attn_pool.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
267
+ "transformer.visual.attn_pool.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
268
+ "transformer.visual.attn_pool.attn1.in_proj_bias": "pytorch_model-00002-of-00002.bin",
269
+ "transformer.visual.attn_pool.attn1.in_proj_weight": "pytorch_model-00002-of-00002.bin",
270
+ "transformer.visual.attn_pool.attn1.out_proj.bias": "pytorch_model-00002-of-00002.bin",
271
+ "transformer.visual.attn_pool.attn1.out_proj.weight": "pytorch_model-00002-of-00002.bin",
272
+ "transformer.visual.attn_pool.kv_proj.weight": "pytorch_model-00002-of-00002.bin",
273
+ "transformer.visual.attn_pool.ln_kv.bias": "pytorch_model-00002-of-00002.bin",
274
+ "transformer.visual.attn_pool.ln_kv.weight": "pytorch_model-00002-of-00002.bin",
275
+ "transformer.visual.attn_pool.ln_q.bias": "pytorch_model-00002-of-00002.bin",
276
+ "transformer.visual.attn_pool.ln_q.weight": "pytorch_model-00002-of-00002.bin",
277
+ "transformer.visual.attn_pool.pos_embed": "pytorch_model-00002-of-00002.bin",
278
+ "transformer.visual.attn_pool.query": "pytorch_model-00002-of-00002.bin",
279
+ "transformer.visual.conv1.weight": "pytorch_model-00002-of-00002.bin",
280
+ "transformer.visual.ln_post.bias": "pytorch_model-00002-of-00002.bin",
281
+ "transformer.visual.ln_post.weight": "pytorch_model-00002-of-00002.bin",
282
+ "transformer.visual.ln_pre.bias": "pytorch_model-00002-of-00002.bin",
283
+ "transformer.visual.ln_pre.weight": "pytorch_model-00002-of-00002.bin",
284
+ "transformer.visual.positional_embedding": "pytorch_model-00002-of-00002.bin",
285
+ "transformer.visual.proj": "pytorch_model-00002-of-00002.bin",
286
+ "transformer.visual.transformer.resblocks.0.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
287
+ "transformer.visual.transformer.resblocks.0.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
288
+ "transformer.visual.transformer.resblocks.0.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
289
+ "transformer.visual.transformer.resblocks.0.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
290
+ "transformer.visual.transformer.resblocks.0.ln_1.bias": "pytorch_model-00002-of-00002.bin",
291
+ "transformer.visual.transformer.resblocks.0.ln_1.weight": "pytorch_model-00002-of-00002.bin",
292
+ "transformer.visual.transformer.resblocks.0.ln_2.bias": "pytorch_model-00002-of-00002.bin",
293
+ "transformer.visual.transformer.resblocks.0.ln_2.weight": "pytorch_model-00002-of-00002.bin",
294
+ "transformer.visual.transformer.resblocks.0.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
295
+ "transformer.visual.transformer.resblocks.0.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
296
+ "transformer.visual.transformer.resblocks.0.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
297
+ "transformer.visual.transformer.resblocks.0.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
298
+ "transformer.visual.transformer.resblocks.1.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
299
+ "transformer.visual.transformer.resblocks.1.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
300
+ "transformer.visual.transformer.resblocks.1.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
301
+ "transformer.visual.transformer.resblocks.1.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
302
+ "transformer.visual.transformer.resblocks.1.ln_1.bias": "pytorch_model-00002-of-00002.bin",
303
+ "transformer.visual.transformer.resblocks.1.ln_1.weight": "pytorch_model-00002-of-00002.bin",
304
+ "transformer.visual.transformer.resblocks.1.ln_2.bias": "pytorch_model-00002-of-00002.bin",
305
+ "transformer.visual.transformer.resblocks.1.ln_2.weight": "pytorch_model-00002-of-00002.bin",
306
+ "transformer.visual.transformer.resblocks.1.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
307
+ "transformer.visual.transformer.resblocks.1.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
308
+ "transformer.visual.transformer.resblocks.1.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
309
+ "transformer.visual.transformer.resblocks.1.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
310
+ "transformer.visual.transformer.resblocks.10.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
311
+ "transformer.visual.transformer.resblocks.10.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
312
+ "transformer.visual.transformer.resblocks.10.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
313
+ "transformer.visual.transformer.resblocks.10.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
314
+ "transformer.visual.transformer.resblocks.10.ln_1.bias": "pytorch_model-00002-of-00002.bin",
315
+ "transformer.visual.transformer.resblocks.10.ln_1.weight": "pytorch_model-00002-of-00002.bin",
316
+ "transformer.visual.transformer.resblocks.10.ln_2.bias": "pytorch_model-00002-of-00002.bin",
317
+ "transformer.visual.transformer.resblocks.10.ln_2.weight": "pytorch_model-00002-of-00002.bin",
318
+ "transformer.visual.transformer.resblocks.10.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
319
+ "transformer.visual.transformer.resblocks.10.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
320
+ "transformer.visual.transformer.resblocks.10.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
321
+ "transformer.visual.transformer.resblocks.10.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
322
+ "transformer.visual.transformer.resblocks.11.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
323
+ "transformer.visual.transformer.resblocks.11.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
324
+ "transformer.visual.transformer.resblocks.11.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
325
+ "transformer.visual.transformer.resblocks.11.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
326
+ "transformer.visual.transformer.resblocks.11.ln_1.bias": "pytorch_model-00002-of-00002.bin",
327
+ "transformer.visual.transformer.resblocks.11.ln_1.weight": "pytorch_model-00002-of-00002.bin",
328
+ "transformer.visual.transformer.resblocks.11.ln_2.bias": "pytorch_model-00002-of-00002.bin",
329
+ "transformer.visual.transformer.resblocks.11.ln_2.weight": "pytorch_model-00002-of-00002.bin",
330
+ "transformer.visual.transformer.resblocks.11.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
331
+ "transformer.visual.transformer.resblocks.11.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
332
+ "transformer.visual.transformer.resblocks.11.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
333
+ "transformer.visual.transformer.resblocks.11.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
334
+ "transformer.visual.transformer.resblocks.12.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
335
+ "transformer.visual.transformer.resblocks.12.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
336
+ "transformer.visual.transformer.resblocks.12.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
337
+ "transformer.visual.transformer.resblocks.12.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
338
+ "transformer.visual.transformer.resblocks.12.ln_1.bias": "pytorch_model-00002-of-00002.bin",
339
+ "transformer.visual.transformer.resblocks.12.ln_1.weight": "pytorch_model-00002-of-00002.bin",
340
+ "transformer.visual.transformer.resblocks.12.ln_2.bias": "pytorch_model-00002-of-00002.bin",
341
+ "transformer.visual.transformer.resblocks.12.ln_2.weight": "pytorch_model-00002-of-00002.bin",
342
+ "transformer.visual.transformer.resblocks.12.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
343
+ "transformer.visual.transformer.resblocks.12.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
344
+ "transformer.visual.transformer.resblocks.12.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
345
+ "transformer.visual.transformer.resblocks.12.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
346
+ "transformer.visual.transformer.resblocks.13.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
347
+ "transformer.visual.transformer.resblocks.13.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
348
+ "transformer.visual.transformer.resblocks.13.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
349
+ "transformer.visual.transformer.resblocks.13.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
350
+ "transformer.visual.transformer.resblocks.13.ln_1.bias": "pytorch_model-00002-of-00002.bin",
351
+ "transformer.visual.transformer.resblocks.13.ln_1.weight": "pytorch_model-00002-of-00002.bin",
352
+ "transformer.visual.transformer.resblocks.13.ln_2.bias": "pytorch_model-00002-of-00002.bin",
353
+ "transformer.visual.transformer.resblocks.13.ln_2.weight": "pytorch_model-00002-of-00002.bin",
354
+ "transformer.visual.transformer.resblocks.13.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
355
+ "transformer.visual.transformer.resblocks.13.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
356
+ "transformer.visual.transformer.resblocks.13.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
357
+ "transformer.visual.transformer.resblocks.13.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
358
+ "transformer.visual.transformer.resblocks.14.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
359
+ "transformer.visual.transformer.resblocks.14.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
360
+ "transformer.visual.transformer.resblocks.14.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
361
+ "transformer.visual.transformer.resblocks.14.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
362
+ "transformer.visual.transformer.resblocks.14.ln_1.bias": "pytorch_model-00002-of-00002.bin",
363
+ "transformer.visual.transformer.resblocks.14.ln_1.weight": "pytorch_model-00002-of-00002.bin",
364
+ "transformer.visual.transformer.resblocks.14.ln_2.bias": "pytorch_model-00002-of-00002.bin",
365
+ "transformer.visual.transformer.resblocks.14.ln_2.weight": "pytorch_model-00002-of-00002.bin",
366
+ "transformer.visual.transformer.resblocks.14.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
367
+ "transformer.visual.transformer.resblocks.14.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
368
+ "transformer.visual.transformer.resblocks.14.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
369
+ "transformer.visual.transformer.resblocks.14.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
370
+ "transformer.visual.transformer.resblocks.15.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
371
+ "transformer.visual.transformer.resblocks.15.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
372
+ "transformer.visual.transformer.resblocks.15.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
373
+ "transformer.visual.transformer.resblocks.15.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
374
+ "transformer.visual.transformer.resblocks.15.ln_1.bias": "pytorch_model-00002-of-00002.bin",
375
+ "transformer.visual.transformer.resblocks.15.ln_1.weight": "pytorch_model-00002-of-00002.bin",
376
+ "transformer.visual.transformer.resblocks.15.ln_2.bias": "pytorch_model-00002-of-00002.bin",
377
+ "transformer.visual.transformer.resblocks.15.ln_2.weight": "pytorch_model-00002-of-00002.bin",
378
+ "transformer.visual.transformer.resblocks.15.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
379
+ "transformer.visual.transformer.resblocks.15.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
380
+ "transformer.visual.transformer.resblocks.15.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
381
+ "transformer.visual.transformer.resblocks.15.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
382
+ "transformer.visual.transformer.resblocks.16.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
383
+ "transformer.visual.transformer.resblocks.16.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
384
+ "transformer.visual.transformer.resblocks.16.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
385
+ "transformer.visual.transformer.resblocks.16.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
386
+ "transformer.visual.transformer.resblocks.16.ln_1.bias": "pytorch_model-00002-of-00002.bin",
387
+ "transformer.visual.transformer.resblocks.16.ln_1.weight": "pytorch_model-00002-of-00002.bin",
388
+ "transformer.visual.transformer.resblocks.16.ln_2.bias": "pytorch_model-00002-of-00002.bin",
389
+ "transformer.visual.transformer.resblocks.16.ln_2.weight": "pytorch_model-00002-of-00002.bin",
390
+ "transformer.visual.transformer.resblocks.16.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
391
+ "transformer.visual.transformer.resblocks.16.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
392
+ "transformer.visual.transformer.resblocks.16.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
393
+ "transformer.visual.transformer.resblocks.16.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
394
+ "transformer.visual.transformer.resblocks.17.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
395
+ "transformer.visual.transformer.resblocks.17.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
396
+ "transformer.visual.transformer.resblocks.17.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
397
+ "transformer.visual.transformer.resblocks.17.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
398
+ "transformer.visual.transformer.resblocks.17.ln_1.bias": "pytorch_model-00002-of-00002.bin",
399
+ "transformer.visual.transformer.resblocks.17.ln_1.weight": "pytorch_model-00002-of-00002.bin",
400
+ "transformer.visual.transformer.resblocks.17.ln_2.bias": "pytorch_model-00002-of-00002.bin",
401
+ "transformer.visual.transformer.resblocks.17.ln_2.weight": "pytorch_model-00002-of-00002.bin",
402
+ "transformer.visual.transformer.resblocks.17.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
403
+ "transformer.visual.transformer.resblocks.17.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
404
+ "transformer.visual.transformer.resblocks.17.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
405
+ "transformer.visual.transformer.resblocks.17.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
406
+ "transformer.visual.transformer.resblocks.18.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
407
+ "transformer.visual.transformer.resblocks.18.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
408
+ "transformer.visual.transformer.resblocks.18.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
409
+ "transformer.visual.transformer.resblocks.18.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
410
+ "transformer.visual.transformer.resblocks.18.ln_1.bias": "pytorch_model-00002-of-00002.bin",
411
+ "transformer.visual.transformer.resblocks.18.ln_1.weight": "pytorch_model-00002-of-00002.bin",
412
+ "transformer.visual.transformer.resblocks.18.ln_2.bias": "pytorch_model-00002-of-00002.bin",
413
+ "transformer.visual.transformer.resblocks.18.ln_2.weight": "pytorch_model-00002-of-00002.bin",
414
+ "transformer.visual.transformer.resblocks.18.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
415
+ "transformer.visual.transformer.resblocks.18.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
416
+ "transformer.visual.transformer.resblocks.18.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
417
+ "transformer.visual.transformer.resblocks.18.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
418
+ "transformer.visual.transformer.resblocks.19.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
419
+ "transformer.visual.transformer.resblocks.19.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
420
+ "transformer.visual.transformer.resblocks.19.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
421
+ "transformer.visual.transformer.resblocks.19.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
422
+ "transformer.visual.transformer.resblocks.19.ln_1.bias": "pytorch_model-00002-of-00002.bin",
423
+ "transformer.visual.transformer.resblocks.19.ln_1.weight": "pytorch_model-00002-of-00002.bin",
424
+ "transformer.visual.transformer.resblocks.19.ln_2.bias": "pytorch_model-00002-of-00002.bin",
425
+ "transformer.visual.transformer.resblocks.19.ln_2.weight": "pytorch_model-00002-of-00002.bin",
426
+ "transformer.visual.transformer.resblocks.19.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
427
+ "transformer.visual.transformer.resblocks.19.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
428
+ "transformer.visual.transformer.resblocks.19.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
429
+ "transformer.visual.transformer.resblocks.19.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
430
+ "transformer.visual.transformer.resblocks.2.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
431
+ "transformer.visual.transformer.resblocks.2.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
432
+ "transformer.visual.transformer.resblocks.2.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
433
+ "transformer.visual.transformer.resblocks.2.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
434
+ "transformer.visual.transformer.resblocks.2.ln_1.bias": "pytorch_model-00002-of-00002.bin",
435
+ "transformer.visual.transformer.resblocks.2.ln_1.weight": "pytorch_model-00002-of-00002.bin",
436
+ "transformer.visual.transformer.resblocks.2.ln_2.bias": "pytorch_model-00002-of-00002.bin",
437
+ "transformer.visual.transformer.resblocks.2.ln_2.weight": "pytorch_model-00002-of-00002.bin",
438
+ "transformer.visual.transformer.resblocks.2.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
439
+ "transformer.visual.transformer.resblocks.2.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
440
+ "transformer.visual.transformer.resblocks.2.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
441
+ "transformer.visual.transformer.resblocks.2.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
442
+ "transformer.visual.transformer.resblocks.20.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
443
+ "transformer.visual.transformer.resblocks.20.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
444
+ "transformer.visual.transformer.resblocks.20.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
445
+ "transformer.visual.transformer.resblocks.20.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
446
+ "transformer.visual.transformer.resblocks.20.ln_1.bias": "pytorch_model-00002-of-00002.bin",
447
+ "transformer.visual.transformer.resblocks.20.ln_1.weight": "pytorch_model-00002-of-00002.bin",
448
+ "transformer.visual.transformer.resblocks.20.ln_2.bias": "pytorch_model-00002-of-00002.bin",
449
+ "transformer.visual.transformer.resblocks.20.ln_2.weight": "pytorch_model-00002-of-00002.bin",
450
+ "transformer.visual.transformer.resblocks.20.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
451
+ "transformer.visual.transformer.resblocks.20.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
452
+ "transformer.visual.transformer.resblocks.20.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
453
+ "transformer.visual.transformer.resblocks.20.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
454
+ "transformer.visual.transformer.resblocks.21.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
455
+ "transformer.visual.transformer.resblocks.21.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
456
+ "transformer.visual.transformer.resblocks.21.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
457
+ "transformer.visual.transformer.resblocks.21.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
458
+ "transformer.visual.transformer.resblocks.21.ln_1.bias": "pytorch_model-00002-of-00002.bin",
459
+ "transformer.visual.transformer.resblocks.21.ln_1.weight": "pytorch_model-00002-of-00002.bin",
460
+ "transformer.visual.transformer.resblocks.21.ln_2.bias": "pytorch_model-00002-of-00002.bin",
461
+ "transformer.visual.transformer.resblocks.21.ln_2.weight": "pytorch_model-00002-of-00002.bin",
462
+ "transformer.visual.transformer.resblocks.21.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
463
+ "transformer.visual.transformer.resblocks.21.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
464
+ "transformer.visual.transformer.resblocks.21.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
465
+ "transformer.visual.transformer.resblocks.21.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
466
+ "transformer.visual.transformer.resblocks.22.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
467
+ "transformer.visual.transformer.resblocks.22.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
468
+ "transformer.visual.transformer.resblocks.22.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
469
+ "transformer.visual.transformer.resblocks.22.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
470
+ "transformer.visual.transformer.resblocks.22.ln_1.bias": "pytorch_model-00002-of-00002.bin",
471
+ "transformer.visual.transformer.resblocks.22.ln_1.weight": "pytorch_model-00002-of-00002.bin",
472
+ "transformer.visual.transformer.resblocks.22.ln_2.bias": "pytorch_model-00002-of-00002.bin",
473
+ "transformer.visual.transformer.resblocks.22.ln_2.weight": "pytorch_model-00002-of-00002.bin",
474
+ "transformer.visual.transformer.resblocks.22.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
475
+ "transformer.visual.transformer.resblocks.22.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
476
+ "transformer.visual.transformer.resblocks.22.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
477
+ "transformer.visual.transformer.resblocks.22.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
478
+ "transformer.visual.transformer.resblocks.23.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
479
+ "transformer.visual.transformer.resblocks.23.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
480
+ "transformer.visual.transformer.resblocks.23.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
481
+ "transformer.visual.transformer.resblocks.23.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
482
+ "transformer.visual.transformer.resblocks.23.ln_1.bias": "pytorch_model-00002-of-00002.bin",
483
+ "transformer.visual.transformer.resblocks.23.ln_1.weight": "pytorch_model-00002-of-00002.bin",
484
+ "transformer.visual.transformer.resblocks.23.ln_2.bias": "pytorch_model-00002-of-00002.bin",
485
+ "transformer.visual.transformer.resblocks.23.ln_2.weight": "pytorch_model-00002-of-00002.bin",
486
+ "transformer.visual.transformer.resblocks.23.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
487
+ "transformer.visual.transformer.resblocks.23.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
488
+ "transformer.visual.transformer.resblocks.23.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
489
+ "transformer.visual.transformer.resblocks.23.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
490
+ "transformer.visual.transformer.resblocks.24.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
491
+ "transformer.visual.transformer.resblocks.24.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
492
+ "transformer.visual.transformer.resblocks.24.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
493
+ "transformer.visual.transformer.resblocks.24.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
494
+ "transformer.visual.transformer.resblocks.24.ln_1.bias": "pytorch_model-00002-of-00002.bin",
495
+ "transformer.visual.transformer.resblocks.24.ln_1.weight": "pytorch_model-00002-of-00002.bin",
496
+ "transformer.visual.transformer.resblocks.24.ln_2.bias": "pytorch_model-00002-of-00002.bin",
497
+ "transformer.visual.transformer.resblocks.24.ln_2.weight": "pytorch_model-00002-of-00002.bin",
498
+ "transformer.visual.transformer.resblocks.24.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
499
+ "transformer.visual.transformer.resblocks.24.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
500
+ "transformer.visual.transformer.resblocks.24.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
501
+ "transformer.visual.transformer.resblocks.24.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
502
+ "transformer.visual.transformer.resblocks.25.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
503
+ "transformer.visual.transformer.resblocks.25.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
504
+ "transformer.visual.transformer.resblocks.25.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
505
+ "transformer.visual.transformer.resblocks.25.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
506
+ "transformer.visual.transformer.resblocks.25.ln_1.bias": "pytorch_model-00002-of-00002.bin",
507
+ "transformer.visual.transformer.resblocks.25.ln_1.weight": "pytorch_model-00002-of-00002.bin",
508
+ "transformer.visual.transformer.resblocks.25.ln_2.bias": "pytorch_model-00002-of-00002.bin",
509
+ "transformer.visual.transformer.resblocks.25.ln_2.weight": "pytorch_model-00002-of-00002.bin",
510
+ "transformer.visual.transformer.resblocks.25.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
511
+ "transformer.visual.transformer.resblocks.25.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
512
+ "transformer.visual.transformer.resblocks.25.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
513
+ "transformer.visual.transformer.resblocks.25.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
514
+ "transformer.visual.transformer.resblocks.26.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
515
+ "transformer.visual.transformer.resblocks.26.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
516
+ "transformer.visual.transformer.resblocks.26.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
517
+ "transformer.visual.transformer.resblocks.26.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
518
+ "transformer.visual.transformer.resblocks.26.ln_1.bias": "pytorch_model-00002-of-00002.bin",
519
+ "transformer.visual.transformer.resblocks.26.ln_1.weight": "pytorch_model-00002-of-00002.bin",
520
+ "transformer.visual.transformer.resblocks.26.ln_2.bias": "pytorch_model-00002-of-00002.bin",
521
+ "transformer.visual.transformer.resblocks.26.ln_2.weight": "pytorch_model-00002-of-00002.bin",
522
+ "transformer.visual.transformer.resblocks.26.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
523
+ "transformer.visual.transformer.resblocks.26.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
524
+ "transformer.visual.transformer.resblocks.26.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
525
+ "transformer.visual.transformer.resblocks.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
526
+ "transformer.visual.transformer.resblocks.27.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
527
+ "transformer.visual.transformer.resblocks.27.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
528
+ "transformer.visual.transformer.resblocks.27.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
529
+ "transformer.visual.transformer.resblocks.27.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
530
+ "transformer.visual.transformer.resblocks.27.ln_1.bias": "pytorch_model-00002-of-00002.bin",
531
+ "transformer.visual.transformer.resblocks.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
532
+ "transformer.visual.transformer.resblocks.27.ln_2.bias": "pytorch_model-00002-of-00002.bin",
533
+ "transformer.visual.transformer.resblocks.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
534
+ "transformer.visual.transformer.resblocks.27.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
535
+ "transformer.visual.transformer.resblocks.27.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
536
+ "transformer.visual.transformer.resblocks.27.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
537
+ "transformer.visual.transformer.resblocks.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
538
+ "transformer.visual.transformer.resblocks.28.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
539
+ "transformer.visual.transformer.resblocks.28.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
540
+ "transformer.visual.transformer.resblocks.28.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
541
+ "transformer.visual.transformer.resblocks.28.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
542
+ "transformer.visual.transformer.resblocks.28.ln_1.bias": "pytorch_model-00002-of-00002.bin",
543
+ "transformer.visual.transformer.resblocks.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
544
+ "transformer.visual.transformer.resblocks.28.ln_2.bias": "pytorch_model-00002-of-00002.bin",
545
+ "transformer.visual.transformer.resblocks.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
546
+ "transformer.visual.transformer.resblocks.28.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
547
+ "transformer.visual.transformer.resblocks.28.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
548
+ "transformer.visual.transformer.resblocks.28.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
549
+ "transformer.visual.transformer.resblocks.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
550
+ "transformer.visual.transformer.resblocks.29.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
551
+ "transformer.visual.transformer.resblocks.29.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
552
+ "transformer.visual.transformer.resblocks.29.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
553
+ "transformer.visual.transformer.resblocks.29.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
554
+ "transformer.visual.transformer.resblocks.29.ln_1.bias": "pytorch_model-00002-of-00002.bin",
555
+ "transformer.visual.transformer.resblocks.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
556
+ "transformer.visual.transformer.resblocks.29.ln_2.bias": "pytorch_model-00002-of-00002.bin",
557
+ "transformer.visual.transformer.resblocks.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
558
+ "transformer.visual.transformer.resblocks.29.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
559
+ "transformer.visual.transformer.resblocks.29.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
560
+ "transformer.visual.transformer.resblocks.29.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
561
+ "transformer.visual.transformer.resblocks.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
562
+ "transformer.visual.transformer.resblocks.3.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
563
+ "transformer.visual.transformer.resblocks.3.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
564
+ "transformer.visual.transformer.resblocks.3.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
565
+ "transformer.visual.transformer.resblocks.3.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
566
+ "transformer.visual.transformer.resblocks.3.ln_1.bias": "pytorch_model-00002-of-00002.bin",
567
+ "transformer.visual.transformer.resblocks.3.ln_1.weight": "pytorch_model-00002-of-00002.bin",
568
+ "transformer.visual.transformer.resblocks.3.ln_2.bias": "pytorch_model-00002-of-00002.bin",
569
+ "transformer.visual.transformer.resblocks.3.ln_2.weight": "pytorch_model-00002-of-00002.bin",
570
+ "transformer.visual.transformer.resblocks.3.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
571
+ "transformer.visual.transformer.resblocks.3.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
572
+ "transformer.visual.transformer.resblocks.3.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
573
+ "transformer.visual.transformer.resblocks.3.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
574
+ "transformer.visual.transformer.resblocks.30.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
575
+ "transformer.visual.transformer.resblocks.30.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
576
+ "transformer.visual.transformer.resblocks.30.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
577
+ "transformer.visual.transformer.resblocks.30.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
578
+ "transformer.visual.transformer.resblocks.30.ln_1.bias": "pytorch_model-00002-of-00002.bin",
579
+ "transformer.visual.transformer.resblocks.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
580
+ "transformer.visual.transformer.resblocks.30.ln_2.bias": "pytorch_model-00002-of-00002.bin",
581
+ "transformer.visual.transformer.resblocks.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
582
+ "transformer.visual.transformer.resblocks.30.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
583
+ "transformer.visual.transformer.resblocks.30.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
584
+ "transformer.visual.transformer.resblocks.30.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
585
+ "transformer.visual.transformer.resblocks.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
586
+ "transformer.visual.transformer.resblocks.31.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
587
+ "transformer.visual.transformer.resblocks.31.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
588
+ "transformer.visual.transformer.resblocks.31.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
589
+ "transformer.visual.transformer.resblocks.31.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
590
+ "transformer.visual.transformer.resblocks.31.ln_1.bias": "pytorch_model-00002-of-00002.bin",
591
+ "transformer.visual.transformer.resblocks.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
592
+ "transformer.visual.transformer.resblocks.31.ln_2.bias": "pytorch_model-00002-of-00002.bin",
593
+ "transformer.visual.transformer.resblocks.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
594
+ "transformer.visual.transformer.resblocks.31.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
595
+ "transformer.visual.transformer.resblocks.31.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
596
+ "transformer.visual.transformer.resblocks.31.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
597
+ "transformer.visual.transformer.resblocks.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
598
+ "transformer.visual.transformer.resblocks.32.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
599
+ "transformer.visual.transformer.resblocks.32.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
600
+ "transformer.visual.transformer.resblocks.32.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
601
+ "transformer.visual.transformer.resblocks.32.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
602
+ "transformer.visual.transformer.resblocks.32.ln_1.bias": "pytorch_model-00002-of-00002.bin",
603
+ "transformer.visual.transformer.resblocks.32.ln_1.weight": "pytorch_model-00002-of-00002.bin",
604
+ "transformer.visual.transformer.resblocks.32.ln_2.bias": "pytorch_model-00002-of-00002.bin",
605
+ "transformer.visual.transformer.resblocks.32.ln_2.weight": "pytorch_model-00002-of-00002.bin",
606
+ "transformer.visual.transformer.resblocks.32.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
607
+ "transformer.visual.transformer.resblocks.32.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
608
+ "transformer.visual.transformer.resblocks.32.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
609
+ "transformer.visual.transformer.resblocks.32.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
610
+ "transformer.visual.transformer.resblocks.33.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
611
+ "transformer.visual.transformer.resblocks.33.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
612
+ "transformer.visual.transformer.resblocks.33.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
613
+ "transformer.visual.transformer.resblocks.33.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
614
+ "transformer.visual.transformer.resblocks.33.ln_1.bias": "pytorch_model-00002-of-00002.bin",
615
+ "transformer.visual.transformer.resblocks.33.ln_1.weight": "pytorch_model-00002-of-00002.bin",
616
+ "transformer.visual.transformer.resblocks.33.ln_2.bias": "pytorch_model-00002-of-00002.bin",
617
+ "transformer.visual.transformer.resblocks.33.ln_2.weight": "pytorch_model-00002-of-00002.bin",
618
+ "transformer.visual.transformer.resblocks.33.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
619
+ "transformer.visual.transformer.resblocks.33.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
620
+ "transformer.visual.transformer.resblocks.33.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
621
+ "transformer.visual.transformer.resblocks.33.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
622
+ "transformer.visual.transformer.resblocks.34.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
623
+ "transformer.visual.transformer.resblocks.34.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
624
+ "transformer.visual.transformer.resblocks.34.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
625
+ "transformer.visual.transformer.resblocks.34.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
626
+ "transformer.visual.transformer.resblocks.34.ln_1.bias": "pytorch_model-00002-of-00002.bin",
627
+ "transformer.visual.transformer.resblocks.34.ln_1.weight": "pytorch_model-00002-of-00002.bin",
628
+ "transformer.visual.transformer.resblocks.34.ln_2.bias": "pytorch_model-00002-of-00002.bin",
629
+ "transformer.visual.transformer.resblocks.34.ln_2.weight": "pytorch_model-00002-of-00002.bin",
630
+ "transformer.visual.transformer.resblocks.34.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
631
+ "transformer.visual.transformer.resblocks.34.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
632
+ "transformer.visual.transformer.resblocks.34.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
633
+ "transformer.visual.transformer.resblocks.34.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
634
+ "transformer.visual.transformer.resblocks.35.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
635
+ "transformer.visual.transformer.resblocks.35.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
636
+ "transformer.visual.transformer.resblocks.35.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
637
+ "transformer.visual.transformer.resblocks.35.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
638
+ "transformer.visual.transformer.resblocks.35.ln_1.bias": "pytorch_model-00002-of-00002.bin",
639
+ "transformer.visual.transformer.resblocks.35.ln_1.weight": "pytorch_model-00002-of-00002.bin",
640
+ "transformer.visual.transformer.resblocks.35.ln_2.bias": "pytorch_model-00002-of-00002.bin",
641
+ "transformer.visual.transformer.resblocks.35.ln_2.weight": "pytorch_model-00002-of-00002.bin",
642
+ "transformer.visual.transformer.resblocks.35.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
643
+ "transformer.visual.transformer.resblocks.35.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
644
+ "transformer.visual.transformer.resblocks.35.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
645
+ "transformer.visual.transformer.resblocks.35.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
646
+ "transformer.visual.transformer.resblocks.36.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
647
+ "transformer.visual.transformer.resblocks.36.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
648
+ "transformer.visual.transformer.resblocks.36.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
649
+ "transformer.visual.transformer.resblocks.36.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
650
+ "transformer.visual.transformer.resblocks.36.ln_1.bias": "pytorch_model-00002-of-00002.bin",
651
+ "transformer.visual.transformer.resblocks.36.ln_1.weight": "pytorch_model-00002-of-00002.bin",
652
+ "transformer.visual.transformer.resblocks.36.ln_2.bias": "pytorch_model-00002-of-00002.bin",
653
+ "transformer.visual.transformer.resblocks.36.ln_2.weight": "pytorch_model-00002-of-00002.bin",
654
+ "transformer.visual.transformer.resblocks.36.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
655
+ "transformer.visual.transformer.resblocks.36.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
656
+ "transformer.visual.transformer.resblocks.36.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
657
+ "transformer.visual.transformer.resblocks.36.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
658
+ "transformer.visual.transformer.resblocks.37.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
659
+ "transformer.visual.transformer.resblocks.37.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
660
+ "transformer.visual.transformer.resblocks.37.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
661
+ "transformer.visual.transformer.resblocks.37.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
662
+ "transformer.visual.transformer.resblocks.37.ln_1.bias": "pytorch_model-00002-of-00002.bin",
663
+ "transformer.visual.transformer.resblocks.37.ln_1.weight": "pytorch_model-00002-of-00002.bin",
664
+ "transformer.visual.transformer.resblocks.37.ln_2.bias": "pytorch_model-00002-of-00002.bin",
665
+ "transformer.visual.transformer.resblocks.37.ln_2.weight": "pytorch_model-00002-of-00002.bin",
666
+ "transformer.visual.transformer.resblocks.37.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
667
+ "transformer.visual.transformer.resblocks.37.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
668
+ "transformer.visual.transformer.resblocks.37.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
669
+ "transformer.visual.transformer.resblocks.37.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
670
+ "transformer.visual.transformer.resblocks.38.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
671
+ "transformer.visual.transformer.resblocks.38.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
672
+ "transformer.visual.transformer.resblocks.38.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
673
+ "transformer.visual.transformer.resblocks.38.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
674
+ "transformer.visual.transformer.resblocks.38.ln_1.bias": "pytorch_model-00002-of-00002.bin",
675
+ "transformer.visual.transformer.resblocks.38.ln_1.weight": "pytorch_model-00002-of-00002.bin",
676
+ "transformer.visual.transformer.resblocks.38.ln_2.bias": "pytorch_model-00002-of-00002.bin",
677
+ "transformer.visual.transformer.resblocks.38.ln_2.weight": "pytorch_model-00002-of-00002.bin",
678
+ "transformer.visual.transformer.resblocks.38.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
679
+ "transformer.visual.transformer.resblocks.38.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
680
+ "transformer.visual.transformer.resblocks.38.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
681
+ "transformer.visual.transformer.resblocks.38.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
682
+ "transformer.visual.transformer.resblocks.39.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
683
+ "transformer.visual.transformer.resblocks.39.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
684
+ "transformer.visual.transformer.resblocks.39.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
685
+ "transformer.visual.transformer.resblocks.39.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
686
+ "transformer.visual.transformer.resblocks.39.ln_1.bias": "pytorch_model-00002-of-00002.bin",
687
+ "transformer.visual.transformer.resblocks.39.ln_1.weight": "pytorch_model-00002-of-00002.bin",
688
+ "transformer.visual.transformer.resblocks.39.ln_2.bias": "pytorch_model-00002-of-00002.bin",
689
+ "transformer.visual.transformer.resblocks.39.ln_2.weight": "pytorch_model-00002-of-00002.bin",
690
+ "transformer.visual.transformer.resblocks.39.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
691
+ "transformer.visual.transformer.resblocks.39.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
692
+ "transformer.visual.transformer.resblocks.39.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
693
+ "transformer.visual.transformer.resblocks.39.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
694
+ "transformer.visual.transformer.resblocks.4.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
695
+ "transformer.visual.transformer.resblocks.4.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
696
+ "transformer.visual.transformer.resblocks.4.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
697
+ "transformer.visual.transformer.resblocks.4.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
698
+ "transformer.visual.transformer.resblocks.4.ln_1.bias": "pytorch_model-00002-of-00002.bin",
699
+ "transformer.visual.transformer.resblocks.4.ln_1.weight": "pytorch_model-00002-of-00002.bin",
700
+ "transformer.visual.transformer.resblocks.4.ln_2.bias": "pytorch_model-00002-of-00002.bin",
701
+ "transformer.visual.transformer.resblocks.4.ln_2.weight": "pytorch_model-00002-of-00002.bin",
702
+ "transformer.visual.transformer.resblocks.4.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
703
+ "transformer.visual.transformer.resblocks.4.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
704
+ "transformer.visual.transformer.resblocks.4.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
705
+ "transformer.visual.transformer.resblocks.4.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
706
+ "transformer.visual.transformer.resblocks.40.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
707
+ "transformer.visual.transformer.resblocks.40.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
708
+ "transformer.visual.transformer.resblocks.40.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
709
+ "transformer.visual.transformer.resblocks.40.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
710
+ "transformer.visual.transformer.resblocks.40.ln_1.bias": "pytorch_model-00002-of-00002.bin",
711
+ "transformer.visual.transformer.resblocks.40.ln_1.weight": "pytorch_model-00002-of-00002.bin",
712
+ "transformer.visual.transformer.resblocks.40.ln_2.bias": "pytorch_model-00002-of-00002.bin",
713
+ "transformer.visual.transformer.resblocks.40.ln_2.weight": "pytorch_model-00002-of-00002.bin",
714
+ "transformer.visual.transformer.resblocks.40.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
715
+ "transformer.visual.transformer.resblocks.40.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
716
+ "transformer.visual.transformer.resblocks.40.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
717
+ "transformer.visual.transformer.resblocks.40.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
718
+ "transformer.visual.transformer.resblocks.41.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
719
+ "transformer.visual.transformer.resblocks.41.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
720
+ "transformer.visual.transformer.resblocks.41.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
721
+ "transformer.visual.transformer.resblocks.41.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
722
+ "transformer.visual.transformer.resblocks.41.ln_1.bias": "pytorch_model-00002-of-00002.bin",
723
+ "transformer.visual.transformer.resblocks.41.ln_1.weight": "pytorch_model-00002-of-00002.bin",
724
+ "transformer.visual.transformer.resblocks.41.ln_2.bias": "pytorch_model-00002-of-00002.bin",
725
+ "transformer.visual.transformer.resblocks.41.ln_2.weight": "pytorch_model-00002-of-00002.bin",
726
+ "transformer.visual.transformer.resblocks.41.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
727
+ "transformer.visual.transformer.resblocks.41.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
728
+ "transformer.visual.transformer.resblocks.41.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
729
+ "transformer.visual.transformer.resblocks.41.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
730
+ "transformer.visual.transformer.resblocks.42.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
731
+ "transformer.visual.transformer.resblocks.42.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
732
+ "transformer.visual.transformer.resblocks.42.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
733
+ "transformer.visual.transformer.resblocks.42.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
734
+ "transformer.visual.transformer.resblocks.42.ln_1.bias": "pytorch_model-00002-of-00002.bin",
735
+ "transformer.visual.transformer.resblocks.42.ln_1.weight": "pytorch_model-00002-of-00002.bin",
736
+ "transformer.visual.transformer.resblocks.42.ln_2.bias": "pytorch_model-00002-of-00002.bin",
737
+ "transformer.visual.transformer.resblocks.42.ln_2.weight": "pytorch_model-00002-of-00002.bin",
738
+ "transformer.visual.transformer.resblocks.42.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
739
+ "transformer.visual.transformer.resblocks.42.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
740
+ "transformer.visual.transformer.resblocks.42.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
741
+ "transformer.visual.transformer.resblocks.42.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
742
+ "transformer.visual.transformer.resblocks.43.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
743
+ "transformer.visual.transformer.resblocks.43.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
744
+ "transformer.visual.transformer.resblocks.43.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
745
+ "transformer.visual.transformer.resblocks.43.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
746
+ "transformer.visual.transformer.resblocks.43.ln_1.bias": "pytorch_model-00002-of-00002.bin",
747
+ "transformer.visual.transformer.resblocks.43.ln_1.weight": "pytorch_model-00002-of-00002.bin",
748
+ "transformer.visual.transformer.resblocks.43.ln_2.bias": "pytorch_model-00002-of-00002.bin",
749
+ "transformer.visual.transformer.resblocks.43.ln_2.weight": "pytorch_model-00002-of-00002.bin",
750
+ "transformer.visual.transformer.resblocks.43.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
751
+ "transformer.visual.transformer.resblocks.43.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
752
+ "transformer.visual.transformer.resblocks.43.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
753
+ "transformer.visual.transformer.resblocks.43.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
754
+ "transformer.visual.transformer.resblocks.44.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
755
+ "transformer.visual.transformer.resblocks.44.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
756
+ "transformer.visual.transformer.resblocks.44.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
757
+ "transformer.visual.transformer.resblocks.44.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
758
+ "transformer.visual.transformer.resblocks.44.ln_1.bias": "pytorch_model-00002-of-00002.bin",
759
+ "transformer.visual.transformer.resblocks.44.ln_1.weight": "pytorch_model-00002-of-00002.bin",
760
+ "transformer.visual.transformer.resblocks.44.ln_2.bias": "pytorch_model-00002-of-00002.bin",
761
+ "transformer.visual.transformer.resblocks.44.ln_2.weight": "pytorch_model-00002-of-00002.bin",
762
+ "transformer.visual.transformer.resblocks.44.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
763
+ "transformer.visual.transformer.resblocks.44.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
764
+ "transformer.visual.transformer.resblocks.44.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
765
+ "transformer.visual.transformer.resblocks.44.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
766
+ "transformer.visual.transformer.resblocks.45.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
767
+ "transformer.visual.transformer.resblocks.45.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
768
+ "transformer.visual.transformer.resblocks.45.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
769
+ "transformer.visual.transformer.resblocks.45.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
770
+ "transformer.visual.transformer.resblocks.45.ln_1.bias": "pytorch_model-00002-of-00002.bin",
771
+ "transformer.visual.transformer.resblocks.45.ln_1.weight": "pytorch_model-00002-of-00002.bin",
772
+ "transformer.visual.transformer.resblocks.45.ln_2.bias": "pytorch_model-00002-of-00002.bin",
773
+ "transformer.visual.transformer.resblocks.45.ln_2.weight": "pytorch_model-00002-of-00002.bin",
774
+ "transformer.visual.transformer.resblocks.45.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
775
+ "transformer.visual.transformer.resblocks.45.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
776
+ "transformer.visual.transformer.resblocks.45.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
777
+ "transformer.visual.transformer.resblocks.45.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
778
+ "transformer.visual.transformer.resblocks.46.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
779
+ "transformer.visual.transformer.resblocks.46.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
780
+ "transformer.visual.transformer.resblocks.46.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
781
+ "transformer.visual.transformer.resblocks.46.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
782
+ "transformer.visual.transformer.resblocks.46.ln_1.bias": "pytorch_model-00002-of-00002.bin",
783
+ "transformer.visual.transformer.resblocks.46.ln_1.weight": "pytorch_model-00002-of-00002.bin",
784
+ "transformer.visual.transformer.resblocks.46.ln_2.bias": "pytorch_model-00002-of-00002.bin",
785
+ "transformer.visual.transformer.resblocks.46.ln_2.weight": "pytorch_model-00002-of-00002.bin",
786
+ "transformer.visual.transformer.resblocks.46.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
787
+ "transformer.visual.transformer.resblocks.46.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
788
+ "transformer.visual.transformer.resblocks.46.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
789
+ "transformer.visual.transformer.resblocks.46.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
790
+ "transformer.visual.transformer.resblocks.47.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
791
+ "transformer.visual.transformer.resblocks.47.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
792
+ "transformer.visual.transformer.resblocks.47.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
793
+ "transformer.visual.transformer.resblocks.47.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
794
+ "transformer.visual.transformer.resblocks.47.ln_1.bias": "pytorch_model-00002-of-00002.bin",
795
+ "transformer.visual.transformer.resblocks.47.ln_1.weight": "pytorch_model-00002-of-00002.bin",
796
+ "transformer.visual.transformer.resblocks.47.ln_2.bias": "pytorch_model-00002-of-00002.bin",
797
+ "transformer.visual.transformer.resblocks.47.ln_2.weight": "pytorch_model-00002-of-00002.bin",
798
+ "transformer.visual.transformer.resblocks.47.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
799
+ "transformer.visual.transformer.resblocks.47.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
800
+ "transformer.visual.transformer.resblocks.47.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
801
+ "transformer.visual.transformer.resblocks.47.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
802
+ "transformer.visual.transformer.resblocks.5.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
803
+ "transformer.visual.transformer.resblocks.5.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
804
+ "transformer.visual.transformer.resblocks.5.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
805
+ "transformer.visual.transformer.resblocks.5.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
806
+ "transformer.visual.transformer.resblocks.5.ln_1.bias": "pytorch_model-00002-of-00002.bin",
807
+ "transformer.visual.transformer.resblocks.5.ln_1.weight": "pytorch_model-00002-of-00002.bin",
808
+ "transformer.visual.transformer.resblocks.5.ln_2.bias": "pytorch_model-00002-of-00002.bin",
809
+ "transformer.visual.transformer.resblocks.5.ln_2.weight": "pytorch_model-00002-of-00002.bin",
810
+ "transformer.visual.transformer.resblocks.5.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
811
+ "transformer.visual.transformer.resblocks.5.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
812
+ "transformer.visual.transformer.resblocks.5.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
813
+ "transformer.visual.transformer.resblocks.5.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
814
+ "transformer.visual.transformer.resblocks.6.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
815
+ "transformer.visual.transformer.resblocks.6.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
816
+ "transformer.visual.transformer.resblocks.6.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
817
+ "transformer.visual.transformer.resblocks.6.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
818
+ "transformer.visual.transformer.resblocks.6.ln_1.bias": "pytorch_model-00002-of-00002.bin",
819
+ "transformer.visual.transformer.resblocks.6.ln_1.weight": "pytorch_model-00002-of-00002.bin",
820
+ "transformer.visual.transformer.resblocks.6.ln_2.bias": "pytorch_model-00002-of-00002.bin",
821
+ "transformer.visual.transformer.resblocks.6.ln_2.weight": "pytorch_model-00002-of-00002.bin",
822
+ "transformer.visual.transformer.resblocks.6.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
823
+ "transformer.visual.transformer.resblocks.6.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
824
+ "transformer.visual.transformer.resblocks.6.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
825
+ "transformer.visual.transformer.resblocks.6.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
826
+ "transformer.visual.transformer.resblocks.7.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
827
+ "transformer.visual.transformer.resblocks.7.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
828
+ "transformer.visual.transformer.resblocks.7.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
829
+ "transformer.visual.transformer.resblocks.7.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
830
+ "transformer.visual.transformer.resblocks.7.ln_1.bias": "pytorch_model-00002-of-00002.bin",
831
+ "transformer.visual.transformer.resblocks.7.ln_1.weight": "pytorch_model-00002-of-00002.bin",
832
+ "transformer.visual.transformer.resblocks.7.ln_2.bias": "pytorch_model-00002-of-00002.bin",
833
+ "transformer.visual.transformer.resblocks.7.ln_2.weight": "pytorch_model-00002-of-00002.bin",
834
+ "transformer.visual.transformer.resblocks.7.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
835
+ "transformer.visual.transformer.resblocks.7.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
836
+ "transformer.visual.transformer.resblocks.7.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
837
+ "transformer.visual.transformer.resblocks.7.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
838
+ "transformer.visual.transformer.resblocks.8.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
839
+ "transformer.visual.transformer.resblocks.8.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
840
+ "transformer.visual.transformer.resblocks.8.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
841
+ "transformer.visual.transformer.resblocks.8.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
842
+ "transformer.visual.transformer.resblocks.8.ln_1.bias": "pytorch_model-00002-of-00002.bin",
843
+ "transformer.visual.transformer.resblocks.8.ln_1.weight": "pytorch_model-00002-of-00002.bin",
844
+ "transformer.visual.transformer.resblocks.8.ln_2.bias": "pytorch_model-00002-of-00002.bin",
845
+ "transformer.visual.transformer.resblocks.8.ln_2.weight": "pytorch_model-00002-of-00002.bin",
846
+ "transformer.visual.transformer.resblocks.8.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
847
+ "transformer.visual.transformer.resblocks.8.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
848
+ "transformer.visual.transformer.resblocks.8.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
849
+ "transformer.visual.transformer.resblocks.8.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
850
+ "transformer.visual.transformer.resblocks.9.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
851
+ "transformer.visual.transformer.resblocks.9.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
852
+ "transformer.visual.transformer.resblocks.9.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
853
+ "transformer.visual.transformer.resblocks.9.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
854
+ "transformer.visual.transformer.resblocks.9.ln_1.bias": "pytorch_model-00002-of-00002.bin",
855
+ "transformer.visual.transformer.resblocks.9.ln_1.weight": "pytorch_model-00002-of-00002.bin",
856
+ "transformer.visual.transformer.resblocks.9.ln_2.bias": "pytorch_model-00002-of-00002.bin",
857
+ "transformer.visual.transformer.resblocks.9.ln_2.weight": "pytorch_model-00002-of-00002.bin",
858
+ "transformer.visual.transformer.resblocks.9.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
859
+ "transformer.visual.transformer.resblocks.9.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
860
+ "transformer.visual.transformer.resblocks.9.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
861
+ "transformer.visual.transformer.resblocks.9.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
862
+ "transformer.wte.weight": "pytorch_model-00001-of-00002.bin"
863
+ }
864
+ }
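Note: the index above is the standard sharded-checkpoint map, pairing each parameter name with the .bin shard that stores it. A minimal sketch of inspecting it by hand follows; the local path is hypothetical, and in normal use the shards are resolved automatically by AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True).

import json
import torch

ckpt_dir = "weights/model-idf"  # hypothetical local copy of this folder
with open(f"{ckpt_dir}/pytorch_model.bin.index.json") as f:
    index = json.load(f)

# Look up which shard holds the token-embedding matrix.
shard_file = index["weight_map"]["transformer.wte.weight"]

# Load just that shard and pull out the single tensor.
shard = torch.load(f"{ckpt_dir}/{shard_file}", map_location="cpu")
print(shard_file, tuple(shard["transformer.wte.weight"].shape))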
weights/model-idf/qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-idf/qwen_generation_utils.py ADDED
@@ -0,0 +1,420 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Generation support."""
7
+
8
+ from typing import Tuple, List, Union, Iterable
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from transformers import PreTrainedTokenizer
14
+ from transformers import logging
15
+ from transformers.generation import LogitsProcessor
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+ # Types.
20
+ HistoryType = List[Tuple[str, str]]
21
+ TokensType = List[int]
22
+ BatchTokensType = List[List[int]]
23
+
24
+
25
+ def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> BatchTokensType:
26
+ for tokens in batch:
27
+ context_length = len(tokens)
28
+ if context_length < seq_length:
29
+ tokens.extend([pad_id] * (seq_length - context_length))
30
+ return batch
31
+
32
+
33
+ def get_ltor_masks_and_position_ids(
34
+ data,
35
+ eod_token,
36
+ reset_position_ids,
37
+ reset_attention_mask,
38
+ eod_mask_loss,
39
+ ):
40
+ """Build masks and position id for left to right model."""
41
+
42
+ # Extract batch size and sequence length.
43
+ micro_batch_size, seq_length = data.size()
44
+
45
+ # Attention mask (lower triangular).
46
+ if reset_attention_mask:
47
+ att_mask_batch = micro_batch_size
48
+ else:
49
+ att_mask_batch = 1
50
+ attention_mask = torch.tril(
51
+ torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
52
+ ).view(att_mask_batch, 1, seq_length, seq_length)
53
+
54
+ # Loss mask.
55
+ loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
56
+ if eod_mask_loss:
57
+ loss_mask[data == eod_token] = 0.0
58
+
59
+ # Position ids.
60
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device)
61
+ position_ids = position_ids.unsqueeze(0).expand_as(data)
62
+ # We need to clone as the ids will be modified based on batch index.
63
+ if reset_position_ids:
64
+ position_ids = position_ids.clone()
65
+
66
+ if reset_position_ids or reset_attention_mask:
67
+ # Loop through the batches:
68
+ for b in range(micro_batch_size):
69
+
70
+ # Find indices where the EOD token is.
71
+ eod_index = position_ids[b, data[b] == eod_token]
72
+ # Detach indices from positions if going to modify positions.
73
+ if reset_position_ids:
74
+ eod_index = eod_index.clone()
75
+
76
+ # Loop through EOD indices:
77
+ prev_index = 0
78
+ for j in range(eod_index.size()[0]):
79
+ i = eod_index[j]
80
+ # Mask attention loss.
81
+ if reset_attention_mask:
82
+ attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0
83
+ # Reset positions.
84
+ if reset_position_ids:
85
+ position_ids[b, (i + 1) :] -= i + 1 - prev_index
86
+ prev_index = i + 1
87
+
88
+ # Convert attention mask to binary:
89
+ attention_mask = attention_mask < 0.5
90
+
91
+ return attention_mask, loss_mask, position_ids
92
+
93
+
94
+ def get_batch(context_tokens: torch.LongTensor, eod_id: int):
95
+ """Generate batch from context tokens."""
96
+ # Keep tokens contiguous on their current device.
97
+ tokens = context_tokens.contiguous().to(context_tokens.device)
98
+ # Get the attention mask and position ids.
99
+ attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
100
+ tokens,
101
+ eod_id,
102
+ reset_position_ids=False,
103
+ reset_attention_mask=False,
104
+ eod_mask_loss=False,
105
+ )
106
+ return tokens, attention_mask, position_ids
107
+
108
+
109
+ def get_stop_words_ids(chat_format, tokenizer):
110
+ if chat_format == "raw":
111
+ stop_words_ids = [tokenizer.encode("Human:"), [tokenizer.eod_id]]
112
+ elif chat_format == "chatml":
113
+ stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
114
+ else:
115
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
116
+ return stop_words_ids
117
+
118
+
119
+ def make_context(
120
+ tokenizer: PreTrainedTokenizer,
121
+ query: str,
122
+ history: List[Tuple[str, str]] = None,
123
+ system: str = "",
124
+ max_window_size: int = 6144,
125
+ chat_format: str = "chatml",
126
+ ):
127
+ if history is None:
128
+ history = []
129
+
130
+ if chat_format == "chatml":
131
+ im_start, im_end = "<|im_start|>", "<|im_end|>"
132
+ im_start_tokens = [tokenizer.im_start_id]
133
+ im_end_tokens = [tokenizer.im_end_id]
134
+ nl_tokens = tokenizer.encode("\n")
135
+
136
+ def _tokenize_str(role, content):
137
+ return f"{role}\n{content}", tokenizer.encode(
138
+ role, allowed_special=set(tokenizer.IMAGE_ST)
139
+ ) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST))
140
+
141
+ system_text, system_tokens_part = _tokenize_str("system", system)
142
+ system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
143
+
144
+ raw_text = ""
145
+ context_tokens = []
146
+
147
+ for turn_query, turn_response in reversed(history):
148
+ query_text, query_tokens_part = _tokenize_str("user", turn_query)
149
+ query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
150
+ if turn_response is not None:
151
+ response_text, response_tokens_part = _tokenize_str(
152
+ "assistant", turn_response
153
+ )
154
+ response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
155
+
156
+ next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
157
+ prev_chat = (
158
+ f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
159
+ )
160
+ else:
161
+ next_context_tokens = nl_tokens + query_tokens + nl_tokens
162
+ prev_chat = f"\n{im_start}{query_text}{im_end}\n"
163
+
164
+ current_context_size = (
165
+ len(system_tokens) + len(next_context_tokens) + len(context_tokens)
166
+ )
167
+ if current_context_size < max_window_size:
168
+ context_tokens = next_context_tokens + context_tokens
169
+ raw_text = prev_chat + raw_text
170
+ else:
171
+ break
172
+
173
+ context_tokens = system_tokens + context_tokens
174
+ raw_text = f"{im_start}{system_text}{im_end}" + raw_text
175
+ context_tokens += (
176
+ nl_tokens
177
+ + im_start_tokens
178
+ + _tokenize_str("user", query)[1]
179
+ + im_end_tokens
180
+ + nl_tokens
181
+ + im_start_tokens
182
+ + tokenizer.encode("assistant")
183
+ + nl_tokens
184
+ )
185
+ raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
186
+
187
+ elif chat_format == "raw":
188
+ raw_text = query
189
+ context_tokens = tokenizer.encode(raw_text)
190
+ else:
191
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
192
+
193
+ return raw_text, context_tokens
194
+
195
+
196
+ def _decode_default(
197
+ tokens: List[int],
198
+ *,
199
+ stop_words: List[str],
200
+ eod_words: List[str],
201
+ tokenizer: PreTrainedTokenizer,
202
+ raw_text_len: int,
203
+ verbose: bool = False,
204
+ return_end_reason: bool = False,
205
+ errors: str='replace',
206
+ ):
207
+ trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
208
+ if verbose:
209
+ print("\nRaw Generate: ", trim_decode_tokens)
210
+
211
+ end_reason = f"Gen length {len(tokens)}"
212
+ for stop_word in stop_words:
213
+ trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
214
+ for eod_word in eod_words:
215
+ if eod_word in trim_decode_tokens:
216
+ end_reason = f"Gen {eod_word!r}"
217
+ trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
218
+ trim_decode_tokens = trim_decode_tokens.strip()
219
+ if verbose:
220
+ print("\nEnd Reason:", end_reason)
221
+ print("\nGenerate: ", trim_decode_tokens)
222
+
223
+ if return_end_reason:
224
+ return trim_decode_tokens, end_reason
225
+ else:
226
+ return trim_decode_tokens
227
+
228
+
229
+ def _decode_chatml(
230
+ tokens: List[int],
231
+ *,
232
+ stop_words: List[str],
233
+ eod_token_ids: List[int],
234
+ tokenizer: PreTrainedTokenizer,
235
+ raw_text_len: int,
236
+ context_length: int,
237
+ verbose: bool = False,
238
+ return_end_reason: bool = False,
239
+ errors: str='replace'
240
+ ):
241
+ end_reason = f"Gen length {len(tokens)}"
242
+ eod_token_idx = context_length
243
+ for eod_token_idx in range(context_length, len(tokens)):
244
+ if tokens[eod_token_idx] in eod_token_ids:
245
+ end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
246
+ break
247
+
248
+ trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:]
249
+ if verbose:
250
+ print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:])
251
+ print("\nRaw Generate:", trim_decode_tokens)
252
+ print("\nEnd Reason:", end_reason)
253
+ for stop_word in stop_words:
254
+ trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
255
+ trim_decode_tokens = trim_decode_tokens.strip()
256
+ if verbose:
257
+ print("\nGenerate:", trim_decode_tokens)
258
+
259
+ if return_end_reason:
260
+ return trim_decode_tokens, end_reason
261
+ else:
262
+ return trim_decode_tokens
263
+
264
+
265
+ def decode_tokens(
266
+ tokens: Union[torch.LongTensor, TokensType],
267
+ tokenizer: PreTrainedTokenizer,
268
+ raw_text_len: int,
269
+ context_length: int,
270
+ chat_format: str,
271
+ verbose: bool = False,
272
+ return_end_reason: bool = False,
273
+ errors: str="replace",
274
+ ) -> str:
275
+ if torch.is_tensor(tokens):
276
+ tokens = tokens.cpu().numpy().tolist()
277
+
278
+ if chat_format == "chatml":
279
+ return _decode_chatml(
280
+ tokens,
281
+ stop_words=[],
282
+ eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
283
+ tokenizer=tokenizer,
284
+ raw_text_len=raw_text_len,
285
+ context_length=context_length,
286
+ verbose=verbose,
287
+ return_end_reason=return_end_reason,
288
+ errors=errors,
289
+ )
290
+ elif chat_format == "raw":
291
+ return _decode_default(
292
+ tokens,
293
+ stop_words=["<|endoftext|>"],
294
+ eod_words=["<|endoftext|>"],
295
+ tokenizer=tokenizer,
296
+ raw_text_len=raw_text_len,
297
+ verbose=verbose,
298
+ return_end_reason=return_end_reason,
299
+ errors=errors,
300
+ )
301
+ else:
302
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
303
+
304
+
305
+ class StopWordsLogitsProcessor(LogitsProcessor):
306
+ """
307
+ :class:`transformers.LogitsProcessor` that stops generation once any of the specified token sequences appears.
308
+
309
+ Args:
310
+ stop_words_ids (:obj:`List[List[int]]`):
311
+ List of lists of token ids for the stop sequences. In order to get the tokens of the words
312
+ that should not appear in the generated text, use :obj:`tokenizer(bad_word,
313
+ add_prefix_space=True).input_ids`.
314
+ eos_token_id (:obj:`int`):
315
+ The id of the `end-of-sequence` token.
316
+ """
317
+
318
+ def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
319
+
320
+ if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0:
321
+ raise ValueError(
322
+ f"`stop_words_ids` has to be a non-empty list, but is {stop_words_ids}."
323
+ )
324
+ if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids):
325
+ raise ValueError(
326
+ f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
327
+ )
328
+ if any(
329
+ any(
330
+ (not isinstance(token_id, (int, np.integer)) or token_id < 0)
331
+ for token_id in stop_word_ids
332
+ )
333
+ for stop_word_ids in stop_words_ids
334
+ ):
335
+ raise ValueError(
336
+ f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
337
+ )
338
+
339
+ self.stop_words_ids = list(
340
+ filter(
341
+ lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids
342
+ )
343
+ )
344
+ self.eos_token_id = eos_token_id
345
+ for stop_token_seq in self.stop_words_ids:
346
+ assert (
347
+ len(stop_token_seq) > 0
348
+ ), "Stop words token sequences {} cannot have an empty list".format(
349
+ stop_words_ids
350
+ )
351
+
352
+ def __call__(
353
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor
354
+ ) -> torch.FloatTensor:
355
+ stopped_samples = self._calc_stopped_samples(input_ids)
356
+ for i, should_stop in enumerate(stopped_samples):
357
+ if should_stop:
358
+ scores[i, self.eos_token_id] = float(2**15)
359
+ return scores
360
+
361
+ def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
362
+ if len(tokens) == 0:
363
+ # an empty stop sequence matches anything
364
+ return True
365
+ elif len(tokens) > len(prev_tokens):
366
+ # if the stop sequence is longer than prev_tokens, it cannot match
367
+ return False
368
+ elif prev_tokens[-len(tokens) :].tolist() == tokens:
369
+ # if tokens match
370
+ return True
371
+ else:
372
+ return False
373
+
374
+ def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]:
375
+ stopped_samples = []
376
+ for prev_input_ids_slice in prev_input_ids:
377
+ match = False
378
+ for stop_token_seq in self.stop_words_ids:
379
+ if self._tokens_match(prev_input_ids_slice, stop_token_seq):
380
+ # this stop-word sequence matched, so mark the sample as stopped
381
+ match = True
382
+ break
383
+ stopped_samples.append(match)
384
+
385
+ return stopped_samples
386
+
387
+
388
+ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
389
+ """This function has been mostly taken from huggingface conversational
390
+ ai code at
391
+ https://medium.com/huggingface/how-to-build-a-state-of-the-art-
392
+ conversational-ai-with-transfer-learning-2d818ac26313"""
393
+
394
+ if top_k > 0:
395
+ # Remove all tokens with a probability less than the
396
+ # last token of the top-k
397
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
398
+ logits[indices_to_remove] = filter_value
399
+
400
+ if top_p > 0.0:
401
+ # Sort logits in descending order for nucleus (top-p) filtering
402
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
403
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
404
+
405
+ # Remove tokens with cumulative probability above the threshold
406
+ sorted_indices_to_remove = cumulative_probs > top_p
407
+ # Shift the indices to the right to keep also the first token
408
+ # above the threshold
409
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
410
+ sorted_indices_to_remove[..., 0] = 0
411
+ for i in range(sorted_indices.size(0)):
412
+ indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
413
+ logits[i][indices_to_remove] = filter_value
414
+
415
+ return logits
416
+
417
+
418
+ def switch(val1, val2, boolean):
419
+ boolean = boolean.type_as(val1)
420
+ return (1 - boolean) * val1 + boolean * val2
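A minimal usage sketch for the sampling helpers above, assuming torch and the top_k_logits function from this file are in scope (the vocabulary size and batch size are illustrative):

    import torch

    # fake next-token logits for a batch of two sequences
    logits = torch.randn(2, 151936)
    # keep the 50 most likely tokens, then apply nucleus (top-p) filtering at 0.9
    filtered = top_k_logits(logits.clone(), top_k=50, top_p=0.9)
    probs = torch.softmax(filtered, dim=-1)               # filtered positions get probability 0
    next_token = torch.multinomial(probs, num_samples=1)  # one sampled id per sequence

StopWordsLogitsProcessor composes with Hugging Face generation in the usual way: wrap it in a transformers.LogitsProcessorList and pass it to model.generate via the logits_processor argument, then trim the output with decode_tokens for the chosen chat format.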
weights/model-idf/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "pad_token": "<|endoftext|>"
3
+ }
weights/model-idf/tokenization_qwen.py ADDED
@@ -0,0 +1,598 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Tokenization classes for QWen."""
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ import requests
12
+ import unicodedata
13
+ from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Optional
14
+
15
+ import tiktoken
16
+ import numpy as np
17
+ from PIL import Image
18
+ from PIL import ImageFont
19
+ from PIL import ImageDraw
20
+ from transformers import PreTrainedTokenizer, AddedToken
21
+ from transformers.utils import try_to_load_from_cache
22
+
23
+ import matplotlib.colors as mcolors
24
+ from matplotlib.font_manager import FontProperties
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
30
+ FONT_PATH = try_to_load_from_cache("Qwen/Qwen-VL-Chat", "SimSun.ttf")
31
+ if FONT_PATH is None:
32
+ if not os.path.exists("SimSun.ttf"):
33
+ ttf = requests.get("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/SimSun.ttf")
34
+ open("SimSun.ttf", "wb").write(ttf.content)
35
+ FONT_PATH = "SimSun.ttf"
36
+
37
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
38
+ ENDOFTEXT = "<|endoftext|>"
39
+ IMSTART = "<|im_start|>"
40
+ IMEND = "<|im_end|>"
41
+ # as the default behavior is changed to allow special tokens in
42
+ # regular texts, the surface forms of special tokens need to be
43
+ # as different as possible to minimize the impact
44
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
45
+ SPECIAL_TOKENS = (
46
+ ENDOFTEXT,
47
+ IMSTART,
48
+ IMEND,
49
+ ) + EXTRAS
50
+ IMG_TOKEN_SPAN = 256
51
+
52
+
53
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
54
+ with open(tiktoken_bpe_file, "rb") as f:
55
+ contents = f.read()
56
+ return {
57
+ base64.b64decode(token): int(rank)
58
+ for token, rank in (line.split() for line in contents.splitlines() if line)
59
+ }
60
+
61
+ def _list_find(
62
+ input_list: List[Any],
63
+ candidates: Tuple[Any],
64
+ start: int = 0,
65
+ ):
66
+ for i in range(start, len(input_list)):
67
+ if input_list[i] in candidates:
68
+ return i
69
+ return -1
70
+
71
+ def _replace_closed_tag(
72
+ input_tokens: List[Any],
73
+ start_tags: Union[Any, Tuple[Any]],
74
+ end_tags: Union[Any, Tuple[Any]],
75
+ inclusive_replace_func: Callable,
76
+ exclusive_replace_func: Callable = lambda x: x,
77
+ ):
78
+ if isinstance(start_tags, (str, int)):
79
+ start_tags = (start_tags,)
80
+ if isinstance(end_tags, (str, int)):
81
+ end_tags = (end_tags,)
82
+ assert len(start_tags) == len(end_tags)
83
+
84
+ output_tokens = []
85
+ end = 0
86
+ while True:
87
+ start = _list_find(input_tokens, start_tags, end)
88
+ if start == -1:
89
+ break
90
+ output_tokens.extend(exclusive_replace_func(input_tokens[end : start]))
91
+ tag_idx = start_tags.index(input_tokens[start])
92
+ end = _list_find(input_tokens, (end_tags[tag_idx],), start)
93
+ if end == -1:
94
+ raise ValueError("Unclosed image token")
95
+ output_tokens.extend(inclusive_replace_func(input_tokens[start : end + 1]))
96
+ end += 1
97
+ output_tokens.extend(exclusive_replace_func(input_tokens[end : ]))
98
+ return output_tokens
99
+
100
+ class QWenTokenizer(PreTrainedTokenizer):
101
+ """QWen tokenizer."""
102
+
103
+ vocab_files_names = VOCAB_FILES_NAMES
104
+
105
+ def __init__(
106
+ self,
107
+ vocab_file,
108
+ errors="replace",
109
+ image_start_tag='<img>',
110
+ image_end_tag='</img>',
111
+ image_pad_tag='<imgpad>',
112
+ ref_start_tag='<ref>',
113
+ ref_end_tag='</ref>',
114
+ box_start_tag='<box>',
115
+ box_end_tag='</box>',
116
+ quad_start_tag='<quad>',
117
+ quad_end_tag='</quad>',
118
+ **kwargs,
119
+ ):
120
+ super().__init__(**kwargs)
121
+ self.image_start_tag = image_start_tag
122
+ self.image_end_tag = image_end_tag
123
+ self.image_pad_tag = image_pad_tag
124
+ self.ref_start_tag = ref_start_tag
125
+ self.ref_end_tag = ref_end_tag
126
+ self.box_start_tag = box_start_tag
127
+ self.box_end_tag = box_end_tag
128
+ self.quad_start_tag = quad_start_tag
129
+ self.quad_end_tag = quad_end_tag
130
+ self.IMAGE_ST = (
131
+ ref_start_tag, ref_end_tag,
132
+ box_start_tag, box_end_tag,
133
+ quad_start_tag, quad_end_tag,
134
+ image_start_tag, image_end_tag,
135
+ image_pad_tag
136
+ )
137
+
138
+ self.errors = errors # how to handle errors in decoding
139
+
140
+ self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
141
+ self.special_tokens = {
142
+ token: index
143
+ for index, token in enumerate(
144
+ SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
145
+ )
146
+ }
147
+ self.img_start_id = self.special_tokens[self.image_start_tag]
148
+ self.img_end_id = self.special_tokens[self.image_end_tag]
149
+ self.img_pad_id = self.special_tokens[self.image_pad_tag]
150
+ self.ref_start_id = self.special_tokens[self.ref_start_tag]
151
+ self.ref_end_id = self.special_tokens[self.ref_end_tag]
152
+ self.box_start_id = self.special_tokens[self.box_start_tag]
153
+ self.box_end_id = self.special_tokens[self.box_end_tag]
154
+ self.quad_start_id = self.special_tokens[self.quad_start_tag]
155
+ self.quad_end_id = self.special_tokens[self.quad_end_tag]
156
+ self.image_special_tokens = set([
157
+ self.ref_start_id, self.ref_end_id, self.box_start_id, self.box_end_id,
158
+ self.quad_start_id, self.quad_end_id,
159
+ ])
160
+
161
+ enc = tiktoken.Encoding(
162
+ "Qwen",
163
+ pat_str=PAT_STR,
164
+ mergeable_ranks=self.mergeable_ranks,
165
+ special_tokens=self.special_tokens,
166
+ )
167
+ assert (
168
+ len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
169
+ ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
170
+
171
+ self.decoder = {
172
+ v: k for k, v in self.mergeable_ranks.items()
173
+ } # type: dict[int, bytes|str]
174
+ self.decoder.update({v: k for k, v in self.special_tokens.items()})
175
+
176
+ self.tokenizer = enc # type: tiktoken.Encoding
177
+
178
+ self.eod_id = self.tokenizer.eot_token
179
+ self.im_start_id = self.special_tokens[IMSTART]
180
+ self.im_end_id = self.special_tokens[IMEND]
181
+
182
+ def __getstate__(self):
183
+ # for pickle lovers
184
+ state = self.__dict__.copy()
185
+ del state['tokenizer']
186
+ return state
187
+
188
+ def __setstate__(self, state):
189
+ # tokenizer is not python native; don't pass it; rebuild it
190
+ self.__dict__.update(state)
191
+ enc = tiktoken.Encoding(
192
+ "Qwen",
193
+ pat_str=PAT_STR,
194
+ mergeable_ranks=self.mergeable_ranks,
195
+ special_tokens=self.special_tokens,
196
+ )
197
+ self.tokenizer = enc
198
+
199
+
200
+ def __len__(self) -> int:
201
+ return self.tokenizer.n_vocab
202
+
203
+ def get_vocab(self) -> Dict[bytes, int]:
204
+ return self.mergeable_ranks
205
+
206
+ def convert_tokens_to_ids(
207
+ self, tokens: Union[bytes, str, List[Union[bytes, str]]]
208
+ ) -> List[int]:
209
+ ids = []
210
+ if isinstance(tokens, (str, bytes)):
211
+ if tokens in self.special_tokens:
212
+ return self.special_tokens[tokens]
213
+ else:
214
+ return self.mergeable_ranks.get(tokens)
215
+ for token in tokens:
216
+ if token in self.special_tokens:
217
+ ids.append(self.special_tokens[token])
218
+ else:
219
+ ids.append(self.mergeable_ranks.get(token))
220
+ return ids
221
+
222
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
223
+ if not special_tokens and new_tokens:
224
+ raise ValueError('Adding regular tokens is not supported')
225
+ for token in new_tokens:
226
+ surface_form = token.content if isinstance(token, AddedToken) else token
227
+ if surface_form not in SPECIAL_TOKENS + self.IMAGE_ST:
228
+ raise ValueError('Adding unknown special tokens is not supported')
229
+ return 0
230
+
231
+ def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
232
+ """
233
+ Save only the vocabulary of the tokenizer (vocabulary).
234
+
235
+ Returns:
236
+ `Tuple(str)`: Paths to the files saved.
237
+ """
238
+ file_path = os.path.join(save_directory, "qwen.tiktoken")
239
+ with open(file_path, "w", encoding="utf8") as w:
240
+ for k, v in self.mergeable_ranks.items():
241
+ line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
242
+ w.write(line)
243
+ return (file_path,)
244
+
245
+ def tokenize(
246
+ self,
247
+ text: str,
248
+ allowed_special: Union[Set, str] = "all",
249
+ disallowed_special: Union[Collection, str] = (),
250
+ **kwargs,
251
+ ) -> List[Union[bytes, str]]:
252
+ """
253
+ Converts a string into a sequence of tokens.
254
+
255
+ Args:
256
+ text (`str`):
257
+ The sequence to be encoded.
258
+ allowed_special (`Literal["all"]` or `set`):
259
+ The surface forms of the tokens to be encoded as special tokens in regular texts.
260
+ Default to "all".
261
+ disallowed_special (`Literal["all"]` or `Collection`):
262
+ The surface forms of the tokens that should not be in regular texts and trigger errors.
263
+ Defaults to an empty tuple.
264
+
265
+ kwargs (additional keyword arguments, *optional*):
266
+ Will be passed to the underlying model specific encode method.
267
+
268
+ Returns:
269
+ `List[bytes|str]`: The list of tokens.
270
+ """
271
+ tokens = []
272
+ text = unicodedata.normalize("NFC", text)
273
+
274
+ # this implementation takes a detour: text -> token id -> token surface forms
275
+ for t in self.tokenizer.encode(
276
+ text, allowed_special=allowed_special, disallowed_special=disallowed_special
277
+ ):
278
+ tokens.append(self.decoder[t])
279
+
280
+ def _encode_imgurl(img_tokens):
281
+ assert img_tokens[0] == self.image_start_tag and img_tokens[-1] == self.image_end_tag
282
+ img_tokens = img_tokens[1:-1]
283
+ img_url = b''.join(img_tokens)
284
+ out_img_tokens = list(map(self.decoder.get, img_url))
285
+ if len(out_img_tokens) > IMG_TOKEN_SPAN:
286
+ raise ValueError("The content in {}..{} is too long".format(
287
+ self.image_start_tag, self.image_end_tag))
288
+ out_img_tokens.extend([self.image_pad_tag] * (IMG_TOKEN_SPAN - len(out_img_tokens)))
289
+ out_img_tokens = [self.image_start_tag] + out_img_tokens + [self.image_end_tag]
290
+ return out_img_tokens
291
+
292
+ return _replace_closed_tag(tokens, self.image_start_tag, self.image_end_tag, _encode_imgurl)
293
+
294
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
295
+ """
296
+ Converts a sequence of tokens into a single string.
297
+ """
298
+ text = ""
299
+ temp = b""
300
+ for t in tokens:
301
+ if isinstance(t, str):
302
+ if temp:
303
+ text += temp.decode("utf-8", errors=self.errors)
304
+ temp = b""
305
+ text += t
306
+ elif isinstance(t, bytes):
307
+ temp += t
308
+ else:
309
+ raise TypeError("token should only be of type types or str")
310
+ if temp:
311
+ text += temp.decode("utf-8", errors=self.errors)
312
+ return text
313
+
314
+ @property
315
+ def vocab_size(self):
316
+ return self.tokenizer.n_vocab
317
+
318
+ def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
319
+ """Converts an id to a token, special tokens included"""
320
+ if index in self.decoder:
321
+ return self.decoder[index]
322
+ raise ValueError("unknown ids")
323
+
324
+ def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
325
+ """Converts a token to an id using the vocab, special tokens included"""
326
+ if token in self.special_tokens:
327
+ return self.special_tokens[token]
328
+ if token in self.mergeable_ranks:
329
+ return self.mergeable_ranks[token]
330
+ raise ValueError("unknown token")
331
+
332
+ def _tokenize(self, text: str, **kwargs):
333
+ """
334
+ Converts a string into a sequence of tokens (string), using the tokenizer. Split into words for word-based
335
+ vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
336
+
337
+ Do NOT take care of added tokens.
338
+ """
339
+ raise NotImplementedError
340
+
341
+ def _decode(
342
+ self,
343
+ token_ids: Union[int, List[int]],
344
+ skip_special_tokens: bool = False,
345
+ errors: str = None,
346
+ **kwargs,
347
+ ) -> str:
348
+ if isinstance(token_ids, int):
349
+ token_ids = [token_ids]
350
+
351
+ def _decode_imgurl(img_token_ids):
352
+ assert img_token_ids[0] == self.img_start_id and img_token_ids[-1] == self.img_end_id
353
+ img_token_ids = img_token_ids[1:-1]
354
+ img_token_ids = img_token_ids[ : img_token_ids.index(self.img_pad_id)]
355
+ img_url = bytes(img_token_ids).decode('utf-8')
356
+ return [self.img_start_id] + self.tokenizer.encode(img_url) + [self.img_end_id]
357
+
358
+ token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
359
+
360
+ if skip_special_tokens:
361
+ if kwargs.get('keep_image_special', False):
362
+ token_ids = [i for i in token_ids if i < self.eod_id
363
+ or i in self.image_special_tokens]
364
+ else:
365
+ token_ids = [i for i in token_ids if i < self.eod_id]
366
+ return self.tokenizer.decode(token_ids, errors=errors or self.errors)
367
+
368
+ def to_list_format(self, text: str):
369
+ text = unicodedata.normalize("NFC", text)
370
+ token_ids = self.tokenizer.encode(
371
+ text, allowed_special=set(self.IMAGE_ST + (ENDOFTEXT,)))
372
+
373
+ def _encode_vl_info(tokens):
374
+ if len(tokens) == 0:
375
+ return []
376
+ if tokens[0] == self.img_start_id and tokens[-1] == self.img_end_id:
377
+ key = 'image'
378
+ elif tokens[0] == self.ref_start_id and tokens[-1] == self.ref_end_id:
379
+ key = 'ref'
380
+ elif tokens[0] == self.box_start_id and tokens[-1] == self.box_end_id:
381
+ key = 'box'
382
+ elif tokens[0] == self.quad_start_id and tokens[-1] == self.quad_end_id:
383
+ key = 'quad'
384
+ else:
385
+ _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
386
+ return [{'text': b''.join(map(_tobytes, map(self.decoder.get, tokens))).decode('utf-8')}]
387
+ _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
388
+ val = b''.join(map(_tobytes, map(self.decoder.get, tokens[1:-1]))).decode('utf-8')
389
+ return [{key: val}]
390
+
391
+ return _replace_closed_tag(
392
+ token_ids,
393
+ (self.img_start_id, self.ref_start_id, self.box_start_id, self.quad_start_id),
394
+ (self.img_end_id, self.ref_end_id, self.box_end_id, self.quad_end_id),
395
+ _encode_vl_info,
396
+ _encode_vl_info,
397
+ )
398
+
399
+ def from_list_format(self, list_format: List[Dict]):
400
+ text = ''
401
+ num_images = 0
402
+ for ele in list_format:
403
+ if 'image' in ele:
404
+ num_images += 1
405
+ text += f'Picture {num_images}: '
406
+ text += self.image_start_tag + ele['image'] + self.image_end_tag
407
+ text += '\n'
408
+ elif 'text' in ele:
409
+ text += ele['text']
410
+ elif 'box' in ele:
411
+ if 'ref' in ele:
412
+ text += self.ref_start_tag + ele['ref'] + self.ref_end_tag
413
+ for box in ele['box']:
414
+ text += self.box_start_tag + '(%d,%d),(%d,%d)' % (box[0], box[1], box[2], box[3]) + self.box_end_tag
415
+ else:
416
+ raise ValueError("Unsupport element: " + str(ele))
417
+ return text
418
+
419
+ def _fetch_latest_picture(self, response, history):
420
+ if history is None:
421
+ history = []
422
+ _history = history + [(response, None)]
423
+ for q, r in _history[::-1]:
424
+ for ele in self.to_list_format(q)[::-1]:
425
+ if 'image' in ele:
426
+ return ele['image']
427
+ return None
428
+
429
+ def _fetch_all_box_with_ref(self, text):
430
+ list_format = self.to_list_format(text)
431
+ output = []
432
+ for i, ele in enumerate(list_format):
433
+ if 'box' in ele:
434
+ bbox = tuple(map(int, ele['box'].replace('(', '').replace(')', '').split(',')))
435
+ assert len(bbox) == 4
436
+ output.append({'box': bbox})
437
+ if i > 0 and 'ref' in list_format[i-1]:
438
+ output[-1]['ref'] = list_format[i-1]['ref'].strip()
439
+ return output
440
+
441
+ def draw_bbox_on_latest_picture(
442
+ self,
443
+ response,
444
+ history=None,
445
+ ) -> Optional[Image.Image]:
446
+ image = self._fetch_latest_picture(response, history)
447
+ if image is None:
448
+ return None
449
+ if image.startswith("http://") or image.startswith("https://"):
450
+ image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
451
+ h, w = image.height, image.width
452
+ else:
453
+ image = np.asarray(Image.open(image).convert("RGB"))
454
+ h, w = image.shape[0], image.shape[1]
455
+ visualizer = Visualizer(image)
456
+
457
+ boxes = self._fetch_all_box_with_ref(response)
458
+ if not boxes:
459
+ return None
460
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()]) # init color
461
+ for box in boxes:
462
+ if 'ref' in box: # random new color for new refexps
463
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()])
464
+ x1, y1, x2, y2 = box['box']
465
+ x1, y1, x2, y2 = (int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h))
466
+ visualizer.draw_box((x1, y1, x2, y2), alpha=1, edge_color=color)
467
+ if 'ref' in box:
468
+ visualizer.draw_text(box['ref'], (x1, y1), color=color, horizontal_alignment="left")
469
+ return visualizer.output
470
+
471
+
472
+ import colorsys
473
+ import logging
474
+ import math
475
+ import numpy as np
476
+ import matplotlib as mpl
477
+ import matplotlib.colors as mplc
478
+ import matplotlib.figure as mplfigure
479
+ import torch
480
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
481
+ from PIL import Image
482
+ import random
483
+
484
+ logger = logging.getLogger(__name__)
485
+
486
+
487
+ class VisImage:
488
+ def __init__(self, img, scale=1.0):
489
+ self.img = img
490
+ self.scale = scale
491
+ self.width, self.height = img.shape[1], img.shape[0]
492
+ self._setup_figure(img)
493
+
494
+ def _setup_figure(self, img):
495
+ fig = mplfigure.Figure(frameon=False)
496
+ self.dpi = fig.get_dpi()
497
+ # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
498
+ # (https://github.com/matplotlib/matplotlib/issues/15363)
499
+ fig.set_size_inches(
500
+ (self.width * self.scale + 1e-2) / self.dpi,
501
+ (self.height * self.scale + 1e-2) / self.dpi,
502
+ )
503
+ self.canvas = FigureCanvasAgg(fig)
504
+ # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
505
+ ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
506
+ ax.axis("off")
507
+ self.fig = fig
508
+ self.ax = ax
509
+ self.reset_image(img)
510
+
511
+ def reset_image(self, img):
512
+ img = img.astype("uint8")
513
+ self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
514
+
515
+ def save(self, filepath):
516
+ self.fig.savefig(filepath)
517
+
518
+ def get_image(self):
519
+ canvas = self.canvas
520
+ s, (width, height) = canvas.print_to_buffer()
521
+
522
+ buffer = np.frombuffer(s, dtype="uint8")
523
+
524
+ img_rgba = buffer.reshape(height, width, 4)
525
+ rgb, alpha = np.split(img_rgba, [3], axis=2)
526
+ return rgb.astype("uint8")
527
+
528
+
529
+ class Visualizer:
530
+ def __init__(self, img_rgb, metadata=None, scale=1.0):
531
+ self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
532
+ self.font_path = FONT_PATH
533
+ self.output = VisImage(self.img, scale=scale)
534
+ self.cpu_device = torch.device("cpu")
535
+
536
+ # very small text is unreadable, so clamp the font size to a sensible minimum
537
+ self._default_font_size = max(
538
+ np.sqrt(self.output.height * self.output.width) // 30, 15 // scale
539
+ )
540
+
541
+ def draw_text(
542
+ self,
543
+ text,
544
+ position,
545
+ *,
546
+ font_size=None,
547
+ color="g",
548
+ horizontal_alignment="center",
549
+ rotation=0,
550
+ ):
551
+ if not font_size:
552
+ font_size = self._default_font_size
553
+
554
+ # since the text background is dark, we don't want the text to be dark
555
+ color = np.maximum(list(mplc.to_rgb(color)), 0.2)
556
+ color[np.argmax(color)] = max(0.8, np.max(color))
557
+
558
+ x, y = position
559
+ self.output.ax.text(
560
+ x,
561
+ y,
562
+ text,
563
+ size=font_size * self.output.scale,
564
+ fontproperties=FontProperties(fname=self.font_path),
565
+ bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
566
+ verticalalignment="top",
567
+ horizontalalignment=horizontal_alignment,
568
+ color=color,
569
+ zorder=10,
570
+ rotation=rotation,
571
+ )
572
+ return self.output
573
+
574
+ def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
575
+
576
+ x0, y0, x1, y1 = box_coord
577
+ width = x1 - x0
578
+ height = y1 - y0
579
+
580
+ linewidth = max(self._default_font_size / 4, 1)
581
+
582
+ self.output.ax.add_patch(
583
+ mpl.patches.Rectangle(
584
+ (x0, y0),
585
+ width,
586
+ height,
587
+ fill=False,
588
+ edgecolor=edge_color,
589
+ linewidth=linewidth * self.output.scale,
590
+ alpha=alpha,
591
+ linestyle=line_style,
592
+ )
593
+ )
594
+ return self.output
595
+
596
+ def get_output(self):
597
+
598
+ return self.output
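A minimal sketch of the multimodal helpers defined in tokenization_qwen.py, assuming a QWenTokenizer instance named tokenizer has already been loaded for this checkpoint (e.g. via AutoTokenizer.from_pretrained with trust_remote_code=True); the image path and the grounded response string are illustrative:

    # build a query that interleaves an image reference with text
    query = tokenizer.from_list_format([
        {"image": "demo.jpeg"},                        # illustrative local image path
        {"text": "Frame the person in the picture."},
    ])
    # query now begins with 'Picture 1: <img>demo.jpeg</img>\n' followed by the text

    # parse a grounded response back into structured elements
    response = "<ref> person</ref><box>(120,160),(480,900)</box>"
    elements = tokenizer.to_list_format(response)
    # -> [{'ref': ' person'}, {'box': '(120,160),(480,900)'}]

    # draw the predicted box onto the latest picture in the history (needs the image file to exist)
    vis = tokenizer.draw_bbox_on_latest_picture(response, history=[(query, response)])
    if vis is not None:
        vis.save("boxed.jpg")                          # VisImage.save writes the rendered figure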
weights/model-idf/tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_qwen.QWenTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": true,
9
+ "model_max_length": 2048,
10
+ "padding_side": "right",
11
+ "tokenizer_class": "QWenTokenizer"
12
+ }
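The auto_map entry above is what lets transformers resolve the custom QWenTokenizer class shipped next to the weights. A hedged loading sketch (the checkpoint path is illustrative, and trust_remote_code=True is required because the bundled tokenization_qwen.py is executed from the checkpoint directory):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "weights/model-idf",        # illustrative path to this checkpoint directory
        trust_remote_code=True,     # run the bundled tokenization_qwen.py
    )
    ids = tokenizer("Hello, Qwen-VL!")["input_ids"]
    round_trip = tokenizer.decode(ids)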
weights/model-idf/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-idf/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af8991e433f62470b81e72354c2330cd5f430895d2ee3ab60779f7dc22cfc953
3
+ size 6395
weights/model-idf/visual.py ADDED
@@ -0,0 +1,545 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import OrderedDict
7
+ import math
8
+ import requests
9
+ from io import BytesIO
10
+ from functools import partial
11
+ from PIL import Image
12
+ from typing import Callable, Optional, Sequence, Tuple, List
13
+ import numpy as np
14
+
15
+ import torch
16
+ from torch import nn
17
+ from torch.nn import functional as F
18
+ from torch.nn.init import normal_
19
+ from torchvision import transforms
20
+ from torchvision.transforms import InterpolationMode
21
+
22
+
23
+ def get_abs_pos(abs_pos, tgt_size):
24
+ # abs_pos: L, C
25
+ # tgt_size: M
26
+ # return: M, C
27
+ src_size = int(math.sqrt(abs_pos.size(0)))
28
+ tgt_size = int(math.sqrt(tgt_size))
29
+ dtype = abs_pos.dtype
30
+
31
+ if src_size != tgt_size:
32
+ return F.interpolate(
33
+ abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
34
+ size=(tgt_size, tgt_size),
35
+ mode="bicubic",
36
+ align_corners=False,
37
+ ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
38
+ else:
39
+ return abs_pos
40
+
41
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
42
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
43
+ """
44
+ grid_size: int of the grid height and width
45
+ return:
46
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
47
+ """
48
+ grid_h = np.arange(grid_size, dtype=np.float32)
49
+ grid_w = np.arange(grid_size, dtype=np.float32)
50
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
51
+ grid = np.stack(grid, axis=0)
52
+
53
+ grid = grid.reshape([2, 1, grid_size, grid_size])
54
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
55
+ if cls_token:
56
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
57
+ return pos_embed
58
+
59
+
60
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
61
+ assert embed_dim % 2 == 0
62
+
63
+ # use half of dimensions to encode grid_h
64
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
65
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
66
+
67
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
68
+ return emb
69
+
70
+
71
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
72
+ """
73
+ embed_dim: output dimension for each position
74
+ pos: a list of positions to be encoded: size (M,)
75
+ out: (M, D)
76
+ """
77
+ assert embed_dim % 2 == 0
78
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
79
+ omega /= embed_dim / 2.
80
+ omega = 1. / 10000**omega # (D/2,)
81
+
82
+ pos = pos.reshape(-1) # (M,)
83
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
84
+
85
+ emb_sin = np.sin(out) # (M, D/2)
86
+ emb_cos = np.cos(out) # (M, D/2)
87
+
88
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
89
+ return emb
90
+
91
+
92
+ class Resampler(nn.Module):
93
+ """
94
+ A 2D perceiver-resampler network with one cross-attention layer, driven by
95
+ (grid_size**2) learnable queries and 2d sincos pos_emb
96
+ Outputs:
97
+ A tensor with the shape of (grid_size**2, embed_dim)
98
+ """
99
+ def __init__(
100
+ self,
101
+ grid_size,
102
+ embed_dim,
103
+ num_heads,
104
+ kv_dim=None,
105
+ norm_layer=nn.LayerNorm
106
+ ):
107
+ super().__init__()
108
+ self.num_queries = grid_size ** 2
109
+ self.embed_dim = embed_dim
110
+ self.num_heads = num_heads
111
+
112
+ self.pos_embed = nn.Parameter(
113
+ torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
114
+ ).requires_grad_(False)
115
+
116
+ self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
117
+ normal_(self.query, std=.02)
118
+
119
+ if kv_dim is not None and kv_dim != embed_dim:
120
+ self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
121
+ else:
122
+ self.kv_proj = nn.Identity()
123
+
124
+ self.attn = nn.MultiheadAttention(embed_dim, num_heads) # only out_proj
125
+ self.ln_q = norm_layer(embed_dim)
126
+ self.ln_kv = norm_layer(embed_dim)
127
+
128
+ # new
129
+ self.attn1 = nn.MultiheadAttention(embed_dim, num_heads)
130
+ self.attn1.apply(self._init_weights_zero)
131
+ # self.id_query_fc = nn.Linear(embed_dim, embed_dim, bias=False)
132
+ # self.test_feat_fc = nn.Linear(embed_dim, embed_dim, bias=False)
133
+ # self.id_query_fc.apply(self._init_weights_zero)
134
+ # self.test_feat_fc.apply(self._init_weights_zero)
135
+
136
+ # self.apply(self._init_weights)
137
+
138
+ def _init_weights(self, m):
139
+ if isinstance(m, nn.Linear):
140
+ normal_(m.weight, std=.02)
141
+ if isinstance(m, nn.Linear) and m.bias is not None:
142
+ nn.init.constant_(m.bias, 0)
143
+ elif isinstance(m, nn.LayerNorm):
144
+ nn.init.constant_(m.bias, 0)
145
+ nn.init.constant_(m.weight, 1.0)
146
+
147
+ def _init_weights_zero(self, m):
148
+ if isinstance(m, nn.Linear):
149
+ nn.init.constant_(m.weight, 0)
150
+ if isinstance(m, nn.Linear) and m.bias is not None:
151
+ nn.init.constant_(m.bias, 0)
152
+ elif isinstance(m, nn.LayerNorm):
153
+ nn.init.constant_(m.bias, 0)
154
+ nn.init.constant_(m.weight, 1.0)
155
+
156
+ def forward(self, x, images_flag, attn_mask=None): # idadapter3
157
+ pos_embed = get_abs_pos(self.pos_embed, x.size(1))
158
+
159
+ x = self.kv_proj(x)
160
+ x = self.ln_kv(x).permute(1, 0, 2)
161
+
162
+ N = x.shape[1]
163
+ q = self.ln_q(self.query)
164
+ q_ = self._repeat(q, N)
165
+
166
+ out_list = []
167
+ ind = 0
168
+ while ind < len(images_flag):
169
+ if images_flag[ind] == 2:
170
+ end_ind = ind+1
171
+ while True:
172
+ if end_ind >= len(images_flag):
173
+ break
174
+ if images_flag[end_ind] == 2:
175
+ end_ind += 1
176
+ else:
177
+ break
178
+ fake_out = self.attn1(
179
+ q_[:,ind:end_ind,:] + self.pos_embed.unsqueeze(1),
180
+ x[:,ind:end_ind,:] + pos_embed.unsqueeze(1),
181
+ x[:,ind:end_ind,:],
182
+ attn_mask=attn_mask)[0]
183
+
184
+ llava_out = self.attn(
185
+ q_[:,ind:end_ind,:] + self.pos_embed.unsqueeze(1),
186
+ x[:,ind:end_ind,:] + pos_embed.unsqueeze(1),
187
+ x[:,ind:end_ind,:],
188
+ attn_mask=attn_mask)[0]
189
+ llava_out = llava_out + 0 * fake_out
190
+ ind = end_ind
191
+ out_list.append(llava_out)
192
+
193
+ elif images_flag[ind] == 0:
194
+ id_end_ind = ind+1
195
+ while True:
196
+ if id_end_ind >= len(images_flag):
197
+ break
198
+ if images_flag[id_end_ind] == 0:
199
+ id_end_ind += 1
200
+ else:
201
+ break
202
+ id_out = self.attn(
203
+ q_[:,ind:id_end_ind,:] + self.pos_embed.unsqueeze(1),
204
+ x[:,ind:id_end_ind,:] + pos_embed.unsqueeze(1),
205
+ x[:,ind:id_end_ind,:],
206
+ attn_mask=attn_mask)[0]
207
+ ind = id_end_ind
208
+ out_list.append(id_out)
209
+ test_end_ind = ind+1
210
+ while True:
211
+ if test_end_ind >= len(images_flag):
212
+ break
213
+ if images_flag[test_end_ind] == 1:
214
+ test_end_ind += 1
215
+ else:
216
+ break
217
+ id_query = id_out.detach()
218
+ id_query = id_query.permute(1,0,2).reshape([-1, id_query.shape[-1]])
219
+ id_query = self._repeat(id_query, test_end_ind-ind)
220
+ test_feats = x[:,ind:test_end_ind,:]
221
+ test_out1 = self.attn1(
222
+ test_feats + pos_embed.unsqueeze(1),
223
+ id_query,
224
+ id_query,
225
+ attn_mask=attn_mask)[0]
226
+ test_out1 = test_out1 + test_feats # residual
227
+ test_out2 = self.attn(
228
+ q_[:,ind:test_end_ind,:] + self.pos_embed.unsqueeze(1),
229
+ test_out1 + pos_embed.unsqueeze(1),
230
+ test_out1,
231
+ attn_mask=attn_mask)[0]
232
+ ind = test_end_ind
233
+ out_list.append(test_out2)
234
+
235
+ else:
236
+ print(f'Resampler.forward: unexpected images_flag value {images_flag[ind]}')
+ ind += 1 # advance to avoid an infinite loop on unexpected flags
237
+
238
+ out = torch.cat(out_list, 1)
239
+ # except:
240
+ # fake_out = self.attn1(
241
+ # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
242
+ # x + pos_embed.unsqueeze(1),
243
+ # x,
244
+ # attn_mask=attn_mask)[0]
245
+ # out = self.attn(
246
+ # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
247
+ # x + pos_embed.unsqueeze(1),
248
+ # x,
249
+ # attn_mask=attn_mask)[0]
250
+ # out = out + 0 * fake_out
251
+ # t = time.time()
252
+ # with open('/mnt/bn/automl-aigc/yatai/Qwen-VL/bug2.txt', 'a') as f:
253
+ # f.write(f"{t}: visual id-former error\n")
254
+ return out.permute(1,0,2)
255
+
256
+ # def forward(self, x, images_flag=None, attn_mask=None):
257
+
258
+ # pos_embed = get_abs_pos(self.pos_embed, x.size(1))
259
+
260
+ # x = self.kv_proj(x)
261
+ # x = self.ln_kv(x).permute(1, 0, 2)
262
+
263
+ # N = x.shape[1]
264
+ # q = self.ln_q(self.query)
265
+
266
+ # out = self.attn(
267
+ # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
268
+ # x + pos_embed.unsqueeze(1),
269
+ # x,
270
+ # attn_mask=attn_mask)[0]
271
+
272
+ # return out.permute(1, 0, 2)
273
+
274
+ def _repeat(self, query, N: int):
275
+ return query.unsqueeze(1).repeat(1, N, 1)
276
+
277
+
278
+ class VisualAttention(nn.Module):
279
+ """self-attention layer class.
280
+
281
+ Self-attention layer takes input with size [s, b, h]
282
+ and returns output of the same size.
283
+ """
284
+
285
+ def __init__(self, embed_dim, num_heads,
286
+ bias=True, kdim=None, vdim=None):
287
+ super(VisualAttention, self).__init__()
288
+ self.embed_dim = embed_dim
289
+ self.kdim = kdim if kdim is not None else embed_dim
290
+ self.vdim = vdim if vdim is not None else embed_dim
291
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
292
+
293
+ self.num_heads = num_heads
294
+
295
+ # Per attention head and per partition values.
296
+ assert embed_dim % num_heads == 0
297
+ self.hidden_size_per_attention_head = embed_dim // num_heads
298
+ self.num_attention_heads_per_partition = num_heads
299
+ self.hidden_size_per_partition = embed_dim
300
+
301
+ # Strided linear layer.
302
+ assert self._qkv_same_embed_dim, 'Only Support SelfAttention Currently'
303
+ self.in_proj = nn.Linear(embed_dim, 3 * embed_dim)
304
+ self.out_proj = nn.Linear(embed_dim, embed_dim)
305
+ self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
306
+
307
+ def forward(self, query, key, value, attn_mask = None):
308
+ # query/key/value: [sq, b, h]
309
+ sq, b, _ = query.size()
310
+
311
+ assert torch.allclose(query, key), 'Only Support Self-Attention Currently'
312
+ sk = sq
313
+ mixed_x_layer = self.in_proj(query)
314
+
315
+ # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
316
+ new_tensor_shape = mixed_x_layer.size()[:-1] + \
317
+ (self.num_attention_heads_per_partition,
318
+ 3 * self.hidden_size_per_attention_head)
319
+ mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
320
+
321
+ # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
322
+ query_layer, key_layer, value_layer = mixed_x_layer.split(
323
+ self.hidden_size_per_attention_head, dim=-1)
324
+
325
+ # [sq, b, np, hn] -> [sq, b * np, hn]
326
+ query_layer = query_layer.view(sq,
327
+ b * self.num_attention_heads_per_partition,
328
+ self.hidden_size_per_attention_head).transpose(0, 1)
329
+ # [sk, b, np, hn] -> [sk, b * np, hn]
330
+ key_layer = key_layer.view(sk,
331
+ b * self.num_attention_heads_per_partition,
332
+ self.hidden_size_per_attention_head).transpose(0, 1)
333
+
334
+ q_scaled = query_layer / self.norm_factor
335
+ if attn_mask is not None:
336
+ attention_probs = torch.baddbmm(attn_mask, q_scaled, key_layer.transpose(-2, -1))
337
+ else:
338
+ attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
339
+ attention_probs = attention_probs.softmax(dim=-1)
340
+
341
+ value_layer = value_layer.view(sk,
342
+ b * self.num_attention_heads_per_partition,
343
+ self.hidden_size_per_attention_head).transpose(0, 1)
344
+
345
+ # matmul: [b * np, sq, hn]
346
+ context_layer = torch.bmm(attention_probs, value_layer)
347
+
348
+ # change view [b, np, sq, hn]
349
+ context_layer = context_layer.view(b,
350
+ self.num_attention_heads_per_partition,
351
+ sq, self.hidden_size_per_attention_head)
352
+
353
+ # [b, np, sq, hn] --> [sq, b, np, hn]
354
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
355
+
356
+ # [sq, b, np, hn] --> [sq, b, hp]
357
+ new_context_layer_shape = context_layer.size()[:-2] + \
358
+ (self.hidden_size_per_partition,)
359
+ context_layer = context_layer.view(*new_context_layer_shape)
360
+
361
+ output = self.out_proj(context_layer)
362
+
363
+ return output
364
+
365
+
366
+ class VisualAttentionBlock(nn.Module):
367
+ def __init__(
368
+ self,
369
+ d_model: int,
370
+ n_head: int,
371
+ mlp_ratio: float = 4.0,
372
+ act_layer: Callable = nn.GELU,
373
+ norm_layer: Callable = nn.LayerNorm,
374
+ is_cross_attention: bool = False,
375
+ ):
376
+ super().__init__()
377
+
378
+ self.ln_1 = norm_layer(d_model)
379
+ if is_cross_attention:
380
+ self.ln_1_kv = norm_layer(d_model)
381
+
382
+ self.ln_2 = norm_layer(d_model)
383
+ mlp_width = int(d_model * mlp_ratio)
384
+ self.attn = VisualAttention(d_model, n_head)
385
+ self.mlp = nn.Sequential(OrderedDict([
386
+ ("c_fc", nn.Linear(d_model, mlp_width)),
387
+ ("gelu", act_layer()),
388
+ ("c_proj", nn.Linear(mlp_width, d_model))
389
+ ]))
390
+
391
+ def attention(
392
+ self,
393
+ q_x: torch.Tensor,
394
+ k_x: Optional[torch.Tensor] = None,
395
+ v_x: Optional[torch.Tensor] = None,
396
+ attn_mask: Optional[torch.Tensor] = None,
397
+ ):
398
+ k_x = k_x if k_x is not None else q_x
399
+ v_x = v_x if v_x is not None else q_x
400
+
401
+ attn_mask = attn_mask.to(q_x.dtype) if attn_mask is not None else None
402
+ return self.attn(q_x, k_x, v_x, attn_mask=attn_mask)
403
+
404
+ def forward(
405
+ self,
406
+ q_x: torch.Tensor,
407
+ k_x: Optional[torch.Tensor] = None,
408
+ v_x: Optional[torch.Tensor] = None,
409
+ attn_mask: Optional[torch.Tensor] = None,
410
+ ):
411
+ k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
412
+ v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
413
+
414
+ x = q_x + self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)
415
+ x = x + self.mlp(self.ln_2(x))
416
+ return x
417
+
418
+
419
+ class TransformerBlock(nn.Module):
420
+ def __init__(
421
+ self,
422
+ width: int,
423
+ layers: int,
424
+ heads: int,
425
+ mlp_ratio: float = 4.0,
426
+ act_layer: Callable = nn.GELU,
427
+ norm_layer: Callable = nn.LayerNorm,
428
+ ):
429
+ super().__init__()
430
+ self.width = width
431
+ self.layers = layers
432
+
433
+ self.resblocks = nn.ModuleList([
434
+ VisualAttentionBlock(
435
+ width, heads, mlp_ratio, act_layer=act_layer, norm_layer=norm_layer)
436
+ for _ in range(layers)
437
+ ])
438
+
439
+ def get_cast_dtype(self) -> torch.dtype:
440
+ return self.resblocks[0].mlp.c_fc.weight.dtype
441
+
442
+ def get_cast_device(self) -> torch.device:
443
+ return self.resblocks[0].mlp.c_fc.weight.device
444
+
445
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
446
+ for r in self.resblocks:
447
+ x = r(x, attn_mask=attn_mask)
448
+ return x
449
+
450
+
451
+ class VisionTransformer(nn.Module):
452
+
453
+ def __init__(
454
+ self,
455
+ image_size: int,
456
+ patch_size: int,
457
+ width: int,
458
+ layers: int,
459
+ heads: int,
460
+ mlp_ratio: float,
461
+ n_queries: int = 256,
462
+ output_dim: int = 512,
463
+ **kwargs
464
+ ):
465
+ super().__init__()
466
+ image_height, image_width = self.image_size = (image_size, image_size)
467
+ patch_height, patch_width = self.patch_size = (patch_size, patch_size)
468
+ self.grid_size = (image_height // patch_height, image_width // patch_width)
469
+ self.output_dim = output_dim
470
+
471
+ mean = (0.48145466, 0.4578275, 0.40821073)
472
+ std = (0.26862954, 0.26130258, 0.27577711)
473
+ self.image_transform = transforms.Compose([
474
+ transforms.Resize(
475
+ (image_size, image_size),
476
+ interpolation=InterpolationMode.BICUBIC
477
+ ),
478
+ transforms.ToTensor(),
479
+ transforms.Normalize(mean=mean, std=std),
480
+ ])
481
+
482
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
483
+
484
+ # class embeddings and positional embeddings
485
+ scale = width ** -0.5
486
+ self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
487
+
488
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
489
+ act_layer = nn.GELU
490
+
491
+ self.ln_pre = norm_layer(width)
492
+ self.transformer = TransformerBlock(
493
+ width,
494
+ layers,
495
+ heads,
496
+ mlp_ratio,
497
+ act_layer=act_layer,
498
+ norm_layer=norm_layer,
499
+ )
500
+
501
+ self.attn_pool = Resampler(
502
+ grid_size=int(math.sqrt(n_queries)),
503
+ embed_dim=output_dim,
504
+ num_heads=output_dim // 128,
505
+ kv_dim=width,
506
+ norm_layer=norm_layer,
507
+ )
508
+ self.ln_post = norm_layer(output_dim)
509
+ self.proj = nn.Parameter((output_dim** -0.5) * torch.randn(output_dim, output_dim))
510
+
511
+ def forward(self, x: torch.Tensor, images_flag):
512
+ x = x.to(
513
+ dtype=self.transformer.get_cast_dtype(),
514
+ device=self.transformer.get_cast_device(),
515
+ )
516
+ # to patches
517
+ x = self.conv1(x) # shape = [*, width, grid, grid]
518
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
519
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
520
+
521
+ x = x + get_abs_pos(self.positional_embedding, x.size(1))
522
+
523
+ x = self.ln_pre(x)
524
+
525
+ x = x.permute(1, 0, 2) # NLD -> LND
526
+ x = self.transformer(x)
527
+ x = x.permute(1, 0, 2) # LND -> NLD
528
+
529
+ x = self.attn_pool(x, images_flag)
530
+ x = self.ln_post(x)
531
+ x = x @ self.proj
532
+
533
+ return x
534
+
535
+ def encode(self, image_paths: List[str], images_flag):
536
+ images = []
537
+ for image_path in image_paths:
538
+ if image_path.startswith("http://") or image_path.startswith("https://"):
539
+ image = Image.open(requests.get(image_path, stream=True).raw)
540
+ else:
541
+ image = Image.open(image_path)
542
+ image = image.convert("RGB")
543
+ images.append(self.image_transform(image))
544
+ images = torch.stack(images, dim=0)
545
+ return self(images, images_flag)
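A small, self-contained sketch of the Resampler defined above, assuming the classes and helpers in this visual.py are importable. The dimensions are deliberately tiny compared with the 1664-wide ViT / 4096-dim output configured for this checkpoint, and the images_flag convention is the one read off forward(): 2 marks an ordinary image, 0 an identity/reference image, and 1 a test image attended against the id queries.

    import torch

    # 16x16 grid of fixed sin-cos position codes -> (256, 512) numpy array
    pos = get_2d_sincos_pos_embed(embed_dim=512, grid_size=16)

    resampler = Resampler(grid_size=16, embed_dim=512, num_heads=4, kv_dim=256)
    vit_feats = torch.randn(3, 256, 256)          # (batch, patches, kv_dim)
    out = resampler(vit_feats, images_flag=[2, 2, 2])
    print(out.shape)                              # torch.Size([3, 256, 512])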
weights/model-idf/zero_to_fp32.py ADDED
@@ -0,0 +1,578 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
25
+ # DeepSpeed data structures it has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
+ @dataclass
33
+ class zero_model_state:
34
+ buffers: dict()
35
+ param_shapes: dict()
36
+ shared_params: list
37
+ ds_version: int
38
+ frozen_param_shapes: dict()
39
+ frozen_param_fragments: dict()
40
+
41
+
42
+ debug = 0
43
+
44
+ # load to cpu
45
+ device = torch.device('cpu')
46
+
47
+
48
+ def atoi(text):
49
+ return int(text) if text.isdigit() else text
50
+
51
+
52
+ def natural_keys(text):
53
+ '''
54
+ alist.sort(key=natural_keys) sorts in human order
55
+ http://nedbatchelder.com/blog/200712/human_sorting.html
56
+ (See Toothy's implementation in the comments)
57
+ '''
58
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
59
+
60
+
61
+ def get_model_state_file(checkpoint_dir, zero_stage):
62
+ if not os.path.isdir(checkpoint_dir):
63
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
64
+
65
+ # there should be only one file
66
+ if zero_stage == 2:
67
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
68
+ elif zero_stage == 3:
69
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
70
+
71
+ if not os.path.exists(file):
72
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
73
+
74
+ return file
75
+
76
+
77
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
78
+ # XXX: need to test that this simple glob rule works for multi-node setup too
79
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
80
+
81
+ if len(ckpt_files) == 0:
82
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
83
+
84
+ return ckpt_files
85
+
86
+
87
+ def get_optim_files(checkpoint_dir):
88
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
89
+
90
+
91
+ def get_model_state_files(checkpoint_dir):
92
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
93
+
94
+
95
+ def parse_model_states(files):
96
+ zero_model_states = []
97
+ for file in files:
98
+ state_dict = torch.load(file, map_location=device)
99
+
100
+ if BUFFER_NAMES not in state_dict:
101
+ raise ValueError(f"{file} is not a model state checkpoint")
102
+ buffer_names = state_dict[BUFFER_NAMES]
103
+ if debug:
104
+ print("Found buffers:", buffer_names)
105
+
106
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
107
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
108
+ param_shapes = state_dict[PARAM_SHAPES]
109
+
110
+ # collect parameters that are included in param_shapes
111
+ param_names = []
112
+ for s in param_shapes:
113
+ for name in s.keys():
114
+ param_names.append(name)
115
+
116
+ # update with frozen parameters
117
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
118
+ if frozen_param_shapes is not None:
119
+ if debug:
120
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
121
+ param_names += list(frozen_param_shapes.keys())
122
+
123
+ # handle shared params
124
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
125
+
126
+ ds_version = state_dict.get(DS_VERSION, None)
127
+
128
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
129
+
130
+ z_model_state = zero_model_state(buffers=buffers,
131
+ param_shapes=param_shapes,
132
+ shared_params=shared_params,
133
+ ds_version=ds_version,
134
+ frozen_param_shapes=frozen_param_shapes,
135
+ frozen_param_fragments=frozen_param_fragments)
136
+ zero_model_states.append(z_model_state)
137
+
138
+ return zero_model_states
139
+
140
+
141
+ def parse_optim_states(files, ds_checkpoint_dir):
142
+
143
+ total_files = len(files)
144
+ state_dicts = []
145
+ for f in files:
146
+ state_dicts.append(torch.load(f, map_location=device))
147
+
148
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
149
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
150
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
151
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
152
+
153
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
154
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
155
+ # use the max of the partition_count to get the dp world_size.
156
+
157
+ if type(world_size) is list:
158
+ world_size = max(world_size)
159
+
160
+ if world_size != total_files:
161
+ raise ValueError(
162
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
163
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
164
+ )
165
+
166
+ # the groups are named differently in each stage
167
+ if zero_stage == 2:
168
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
169
+ elif zero_stage == 3:
170
+ fp32_groups_key = FP32_FLAT_GROUPS
171
+ else:
172
+ raise ValueError(f"unknown zero stage {zero_stage}")
173
+
174
+ if zero_stage == 2:
175
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
176
+ elif zero_stage == 3:
177
+ # if there is more than one param group, there will be multiple flattened tensors - one
178
+ # flattened tensor per group - for simplicity merge them into a single tensor
179
+ #
180
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
181
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
182
+
183
+ fp32_flat_groups = [
184
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
185
+ ]
186
+
187
+ return zero_stage, world_size, fp32_flat_groups
188
+
189
+
190
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
191
+ """
192
+ Returns fp32 state_dict reconstructed from ds checkpoint
193
+
194
+ Args:
195
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
196
+
197
+ """
198
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
199
+
200
+ optim_files = get_optim_files(ds_checkpoint_dir)
201
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
202
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
203
+
204
+ model_files = get_model_state_files(ds_checkpoint_dir)
205
+
206
+ zero_model_states = parse_model_states(model_files)
207
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
208
+
209
+ if zero_stage == 2:
210
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
211
+ elif zero_stage == 3:
212
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
248
+ param_shapes = zero_model_states[0].param_shapes
249
+
250
+ # Reconstruction protocol:
251
+ #
252
+ # XXX: document this
253
+
+     if debug:
+         for i in range(world_size):
+             for j in range(len(fp32_flat_groups[0])):
+                 print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+     # XXX: memory usage doubles here (zero2)
+     num_param_groups = len(fp32_flat_groups[0])
+     merged_single_partition_of_fp32_groups = []
+     for i in range(num_param_groups):
+         merged_partitions = [sd[i] for sd in fp32_flat_groups]
+         full_single_fp32_vector = torch.cat(merged_partitions, 0)
+         merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+     avail_numel = sum(
+         [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+     if debug:
+         wanted_params = sum([len(shapes) for shapes in param_shapes])
+         wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+         # not asserting if there is a mismatch due to possible padding
+         print(f"Have {avail_numel} numels to process.")
+         print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # an out-of-core computing solution
+     total_numel = 0
+     total_params = 0
+     for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+         offset = 0
+         avail_numel = full_single_fp32_vector.numel()
+         for name, shape in shapes.items():
+
+             unpartitioned_numel = shape.numel()
+             total_numel += unpartitioned_numel
+             total_params += 1
+
+             if debug:
+                 print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+             state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+             offset += unpartitioned_numel
+
+         # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+         # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+         # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+         # live optimizer object, so we are checking that the numbers are within the right range
+         align_to = 2 * world_size
+
+         def zero2_align(x):
+             return align_to * math.ceil(x / align_to)
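+         # e.g. with world_size=4, align_to is 8, so zero2_align(37) == 40 and zero2_align(40) == 40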
303
+
304
+ if debug:
305
+ print(f"original offset={offset}, avail_numel={avail_numel}")
306
+
307
+ offset = zero2_align(offset)
308
+ avail_numel = zero2_align(avail_numel)
309
+
310
+ if debug:
311
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
312
+
313
+ # Sanity check
314
+ if offset != avail_numel:
315
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
316
+
317
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
318
+
319
+
320
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
321
+ state_dict = OrderedDict()
322
+
323
+ # buffers
324
+ buffers = zero_model_states[0].buffers
325
+ state_dict.update(buffers)
326
+ if debug:
327
+ print(f"added {len(buffers)} buffers")
328
+
329
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
330
+
331
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
332
+
333
+ # recover shared parameters
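+     # (tied weights are stored only once; shared_params lists (alias_name, source_name)
+     #  pairs, so the alias is re-pointed at the tensor saved under the source name)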
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+     remainder = unpartitioned_numel % world_size
+     padding_numel = (world_size - remainder) if remainder else 0
+     partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+     return partitioned_numel, padding_numel
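+     # e.g. a 10-element param on world_size=4 is stored as 4 slices of ceil(10/4)=3
+     # elements each, the last slice carrying 2 elements of padding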
+
+
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+         return
+
+     if debug:
+         for i in range(world_size):
+             num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+             print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+     wanted_params = len(frozen_param_shapes)
+     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+     avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+     print(f'Frozen params: Have {avail_numel} numels to process.')
+     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+     total_params = 0
+     total_numel = 0
+     for name, shape in zero_model_states[0].frozen_param_shapes.items():
+         total_params += 1
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+
+         param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+         state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+     param_shapes = zero_model_states[0].param_shapes
+     avail_numel = fp32_flat_groups[0].numel() * world_size
+     # Reconstruction protocol: For zero3 we need to zip the partitions together at the boundary of each
+     # param, re-consolidating each param, while dealing with padding if any
+
+     # merge list of dicts, preserving order
+     param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+     if debug:
+         for i in range(world_size):
+             print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+     wanted_params = len(param_shapes)
+     wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+     # not asserting if there is a mismatch due to possible padding
+     avail_numel = fp32_flat_groups[0].numel() * world_size
+     print(f"Trainable params: Have {avail_numel} numels to process.")
+     print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # an out-of-core computing solution
+     offset = 0
+     total_numel = 0
+     total_params = 0
+     for name, shape in param_shapes.items():
+
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+         total_params += 1
+
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+         # XXX: memory usage doubles here
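+         # (each rank's flat buffer contributes a partitioned_numel-sized slice at the same
+         #  offset; concatenating the world_size slices and trimming the padding restores
+         #  the full flattened param)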
+         state_dict[name] = torch.cat(
+             tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+             0).narrow(0, 0, unpartitioned_numel).view(shape)
+         offset += partitioned_numel
+
+     offset *= world_size
+
+     # Sanity check
+     if offset != avail_numel:
+         raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+     print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+     state_dict = OrderedDict()
+
+     # buffers
+     buffers = zero_model_states[0].buffers
+     state_dict.update(buffers)
+     if debug:
+         print(f"added {len(buffers)} buffers")
+
+     _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+     _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+     # recover shared parameters
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+     ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+     via a model hub.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+
+     Returns:
+         - pytorch ``state_dict``
+
+     Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+     you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+     the checkpoint.
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+         # do the training and checkpoint saving
+         state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+         model = model.cpu() # move to cpu
+         model.load_state_dict(state_dict)
+         # submit to model hub or save the model to share with others
+
+     In this example the ``model`` will no longer be usable in the deepspeed context of the same
+     application, i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+     """
+     if tag is None:
+         latest_path = os.path.join(checkpoint_dir, 'latest')
+         if os.path.isfile(latest_path):
+             with open(latest_path, 'r') as fd:
+                 tag = fd.read().strip()
+         else:
+             raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+     ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+     if not os.path.isdir(ds_checkpoint_dir):
+         raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+     return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+
+
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+     loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+     """
+
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+     print(f"Saving fp32 state dict to {output_file}")
+     torch.save(state_dict, output_file)
+
+
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+     """
+     1. Put the provided model to cpu
+     2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+     3. Load it into the provided model
+
+     Args:
+         - ``model``: the model object to update
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+     Returns:
+         - ``model``: modified model
+
+     Make sure you have plenty of CPU memory available before you call this function. If you don't
+     have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+     conveniently placed for you in the checkpoint folder.
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+         model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+         # submit to model hub or save the model to share with others
+
+     Note that once this was run, the ``model`` will no longer be usable in the deepspeed context
+     of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     """
+     logger.info(f"Extracting fp32 weights")
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+     logger.info(f"Overwriting model with fp32 weights")
+     model = model.cpu()
+     model.load_state_dict(state_dict, strict=False)
+
+     return model
+
+
+ if __name__ == "__main__":
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("checkpoint_dir",
+                         type=str,
+                         help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+     parser.add_argument(
+         "output_file",
+         type=str,
+         help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+     args = parser.parse_args()
+
+     debug = args.debug
+
+     convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
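+     # typical invocation (illustrative paths, matching the argparse help above):
+     #   python zero_to_fp32.py path/checkpoint-12 path/checkpoint-12/pytorch_model.bin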