jiyatai committed on
Commit add7dbd · verified · 1 Parent(s): 606533d

Upload 35 files

Files changed (35)
  1. weights/model-base/config.json +49 -0
  2. weights/model-base/configuration_qwen.py +65 -0
  3. weights/model-base/generation_config.json +11 -0
  4. weights/model-base/modeling_qwen.py +1182 -0
  5. weights/model-base/prediction.json +0 -0
  6. weights/model-base/pytorch_model-00001-of-00002.bin +3 -0
  7. weights/model-base/pytorch_model-00002-of-00002.bin +3 -0
  8. weights/model-base/pytorch_model.bin.index.json +860 -0
  9. weights/model-base/qwen.tiktoken +0 -0
  10. weights/model-base/qwen_generation_utils.py +420 -0
  11. weights/model-base/special_tokens_map.json +3 -0
  12. weights/model-base/tokenization_qwen.py +598 -0
  13. weights/model-base/tokenizer_config.json +12 -0
  14. weights/model-base/trainer_state.json +0 -0
  15. weights/model-base/training_args.bin +3 -0
  16. weights/model-base/visual.py +545 -0
  17. weights/model-base/zero_to_fp32.py +587 -0
  18. weights/model-idf/config.json +49 -0
  19. weights/model-idf/configuration_qwen.py +65 -0
  20. weights/model-idf/generation_config.json +11 -0
  21. weights/model-idf/modeling_qwen.py +1182 -0
  22. weights/model-idf/prediction.json +0 -0
  23. weights/model-idf/prediction_mi.json +797 -0
  24. weights/model-idf/pytorch_model-00001-of-00002.bin +3 -0
  25. weights/model-idf/pytorch_model-00002-of-00002.bin +3 -0
  26. weights/model-idf/pytorch_model.bin.index.json +864 -0
  27. weights/model-idf/qwen.tiktoken +0 -0
  28. weights/model-idf/qwen_generation_utils.py +420 -0
  29. weights/model-idf/special_tokens_map.json +3 -0
  30. weights/model-idf/tokenization_qwen.py +598 -0
  31. weights/model-idf/tokenizer_config.json +12 -0
  32. weights/model-idf/trainer_state.json +0 -0
  33. weights/model-idf/training_args.bin +3 -0
  34. weights/model-idf/visual.py +545 -0
  35. weights/model-idf/zero_to_fp32.py +578 -0
weights/model-base/config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "_name_or_path": "/mnt/bn/automl-aigc/yatai/Qwen-VL/result/qwen_alpha_full_llava_mini_1/checkpoint-3000",
+   "architectures": [
+     "QWenLMHeadModel"
+   ],
+   "attn_dropout_prob": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_qwen.QWenConfig",
+     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
+   },
+   "bf16": true,
+   "emb_dropout_prob": 0.0,
+   "fp16": false,
+   "fp32": false,
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 22016,
+   "kv_channels": 128,
+   "layer_norm_epsilon": 1e-06,
+   "max_position_embeddings": 8192,
+   "model_type": "qwen",
+   "no_bias": true,
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "onnx_safe": null,
+   "rotary_emb_base": 10000,
+   "rotary_pct": 1.0,
+   "scale_attn_weights": true,
+   "seq_length": 2048,
+   "tie_word_embeddings": false,
+   "tokenizer_type": "QWenTokenizer",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.32.0",
+   "use_cache": false,
+   "use_dynamic_ntk": true,
+   "use_flash_attn": false,
+   "use_logn_attn": true,
+   "visual": {
+     "heads": 16,
+     "image_size": 448,
+     "image_start_id": 151857,
+     "layers": 48,
+     "mlp_ratio": 4.9231,
+     "output_dim": 4096,
+     "patch_size": 14,
+     "width": 1664
+   },
+   "vocab_size": 151936
+ }
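
Note: config.json registers the bundled modules through auto_map ("configuration_qwen.QWenConfig", "modeling_qwen.QWenLMHeadModel"), so the checkpoint is meant to be loaded with remote code enabled. A minimal loading sketch in Python, assuming the files from this commit sit locally under weights/model-base (the bf16=True flag mirrors the "bf16": true entry above):

from transformers import AutoModelForCausalLM, AutoTokenizer

path = "weights/model-base"  # local copy of the files uploaded in this commit
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path,
    trust_remote_code=True,  # lets the auto_map entries above resolve to configuration_qwen.py / modeling_qwen.py
    bf16=True,               # the modeling code also accepts fp16=True or fp32=True
).eval()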
weights/model-base/configuration_qwen.py ADDED
@@ -0,0 +1,65 @@
+ # Copyright (c) Alibaba Cloud.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from transformers import PretrainedConfig
+
+
+ class QWenConfig(PretrainedConfig):
+     model_type = "qwen"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=151936,
+         hidden_size=4096,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         emb_dropout_prob=0.0,
+         attn_dropout_prob=0.0,
+         layer_norm_epsilon=1e-6,
+         initializer_range=0.02,
+         max_position_embeddings=8192,
+         scale_attn_weights=True,
+         use_cache=True,
+         bf16=False,
+         fp16=False,
+         fp32=False,
+         kv_channels=128,
+         rotary_pct=1.0,
+         rotary_emb_base=10000,
+         use_dynamic_ntk=True,
+         use_logn_attn=True,
+         use_flash_attn="auto",
+         intermediate_size=22016,
+         no_bias=True,
+         tie_word_embeddings=False,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.emb_dropout_prob = emb_dropout_prob
+         self.attn_dropout_prob = attn_dropout_prob
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_range = initializer_range
+         self.scale_attn_weights = scale_attn_weights
+         self.use_cache = use_cache
+         self.max_position_embeddings = max_position_embeddings
+         self.bf16 = bf16
+         self.fp16 = fp16
+         self.fp32 = fp32
+         self.kv_channels = kv_channels
+         self.rotary_pct = rotary_pct
+         self.rotary_emb_base = rotary_emb_base
+         self.use_dynamic_ntk = use_dynamic_ntk
+         self.use_logn_attn = use_logn_attn
+         self.use_flash_attn = use_flash_attn
+         self.no_bias = no_bias
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs
+         )
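
Note: the defaults above match the values recorded in config.json for this checkpoint. As a small illustrative check of the derived attention geometry (a sketch; run from inside weights/model-base so the bundled module imports):

from configuration_qwen import QWenConfig

cfg = QWenConfig()  # defaults: hidden_size=4096, num_attention_heads=32, kv_channels=128
head_dim = cfg.hidden_size // cfg.num_attention_heads
assert head_dim == cfg.kv_channels == 128  # per-head width used by the attention projections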
weights/model-base/generation_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "chat_format": "chatml",
+   "do_sample": true,
+   "eos_token_id": 151643,
+   "max_new_tokens": 512,
+   "max_window_size": 6144,
+   "pad_token_id": 151643,
+   "top_k": 0,
+   "top_p": 0.3,
+   "transformers_version": "4.32.0"
+ }
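
Note: these are the sampling defaults that generate() and the chat helpers in modeling_qwen.py pick up (nucleus sampling with top_p=0.3, up to 512 new tokens, ChatML prompting). A short sketch of inspecting and overriding them, assuming the local weights/model-base path used above:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("weights/model-base")
print(gen_cfg.chat_format, gen_cfg.top_p, gen_cfg.max_new_tokens)  # chatml 0.3 512
gen_cfg.top_p = 0.8  # per-run override; pass generation_config=gen_cfg (or plain kwargs) to generate()/chat()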
weights/model-base/modeling_qwen.py ADDED
@@ -0,0 +1,1182 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import importlib
7
+ import math
8
+ from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ import torch.utils.checkpoint
13
+ from torch.cuda.amp import autocast
14
+
15
+ from torch.nn import CrossEntropyLoss
16
+ from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList
17
+ from transformers.generation.logits_process import LogitsProcessorList
18
+
19
+ if TYPE_CHECKING:
20
+ from transformers.generation.streamers import BaseStreamer
21
+ from transformers.generation.utils import GenerateOutput
22
+ from transformers.modeling_outputs import (
23
+ BaseModelOutputWithPast,
24
+ CausalLMOutputWithPast,
25
+ )
26
+ from transformers.modeling_utils import PreTrainedModel
27
+ from transformers.utils import logging
28
+
29
+ try:
30
+ from einops import rearrange
31
+ except ImportError:
32
+ rearrange = None
33
+ from torch import nn
34
+
35
+ SUPPORT_CUDA = torch.cuda.is_available()
36
+ SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
37
+ SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
38
+
39
+ from .configuration_qwen import QWenConfig
40
+ from .qwen_generation_utils import (
41
+ HistoryType,
42
+ make_context,
43
+ decode_tokens,
44
+ get_stop_words_ids,
45
+ StopWordsLogitsProcessor,
46
+ )
47
+ from .visual import VisionTransformer
48
+
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+ _CHECKPOINT_FOR_DOC = "qwen"
53
+ _CONFIG_FOR_DOC = "QWenConfig"
54
+
55
+ QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"]
56
+
57
+ _ERROR_BAD_CHAT_FORMAT = """\
58
+ We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml".
59
+ If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat().
60
+ 我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。
61
+ 如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。
62
+ """
63
+
64
+ _SENTINEL = object()
65
+ _ERROR_STREAM_IN_CHAT = """\
66
+ Pass argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True).
67
+ 向model.chat()传入参数stream的用法可能存在Bug,该用法已被废弃,将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。
68
+ """
69
+
70
+ apply_rotary_emb_func = None
71
+ rms_norm = None
72
+
73
+
74
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
75
+ def _make_causal_mask(
76
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
77
+ ):
78
+ """
79
+ Make causal mask used for bi-directional self-attention.
80
+ """
81
+ bsz, tgt_len = input_ids_shape
82
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
83
+ mask_cond = torch.arange(mask.size(-1), device=device)
84
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
85
+ mask = mask.to(dtype)
86
+
87
+ if past_key_values_length > 0:
88
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
89
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
90
+
91
+
92
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
93
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
94
+ """
95
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
96
+ """
97
+ bsz, src_len = mask.size()
98
+ tgt_len = tgt_len if tgt_len is not None else src_len
99
+
100
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
101
+
102
+ inverted_mask = 1.0 - expanded_mask
103
+
104
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
105
+
106
+
107
+ class QWenAttention(nn.Module):
108
+ def __init__(self, config):
109
+ super().__init__()
110
+
111
+ self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
112
+ self.seq_length = config.seq_length
113
+
114
+ self.hidden_size = config.hidden_size
115
+ self.split_size = config.hidden_size
116
+ self.num_heads = config.num_attention_heads
117
+ self.head_dim = self.hidden_size // self.num_heads
118
+
119
+ self.scale_attn_weights = True
120
+
121
+ self.projection_size = config.kv_channels * config.num_attention_heads
122
+
123
+ assert self.projection_size % config.num_attention_heads == 0
124
+ self.hidden_size_per_attention_head = (
125
+ self.projection_size // config.num_attention_heads
126
+ )
127
+
128
+ self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size)
129
+
130
+ self.c_proj = nn.Linear(
131
+ config.hidden_size, self.projection_size, bias=not config.no_bias
132
+ )
133
+
134
+ self.is_fp32 = not (config.bf16 or config.fp16)
135
+ self.bf16 = config.bf16
136
+
137
+ self.use_dynamic_ntk = config.use_dynamic_ntk
138
+ self.use_logn_attn = config.use_logn_attn
139
+
140
+ logn_list = [
141
+ math.log(i, self.seq_length) if i > self.seq_length else 1
142
+ for i in range(1, 32768)
143
+ ]
144
+ self.logn_tensor = torch.tensor(logn_list)[None, :, None, None]
145
+
146
+ self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
147
+
148
+ def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
149
+ attn_weights = torch.matmul(query, key.transpose(-1, -2))
150
+
151
+ if self.scale_attn_weights:
152
+ attn_weights = attn_weights / torch.full(
153
+ [],
154
+ value.size(-1) ** 0.5,
155
+ dtype=attn_weights.dtype,
156
+ device=attn_weights.device,
157
+ )
158
+
159
+ query_length, key_length = query.size(-2), key.size(-2)
160
+ # causal_mask = self.bias[
161
+ # :, :, key_length - query_length : key_length, :key_length
162
+ # ]
163
+ # mask_value = torch.finfo(attn_weights.dtype).min
164
+ # mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(
165
+ # attn_weights.device
166
+ # )
167
+ # attn_weights = torch.where(
168
+ # causal_mask, attn_weights.to(attn_weights.dtype), mask_value
169
+ # )
170
+ attn_weights = attn_weights + attention_mask
171
+
172
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
173
+
174
+ attn_weights = attn_weights.type(value.dtype)
175
+ attn_weights = self.attn_dropout(attn_weights)
176
+
177
+ if head_mask is not None:
178
+ attn_weights = attn_weights * head_mask
179
+
180
+ attn_output = torch.matmul(attn_weights, value)
181
+ attn_output = attn_output.transpose(1, 2)
182
+
183
+ return attn_output, attn_weights
184
+
185
+ def _upcast_and_reordered_attn(
186
+ self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
187
+ ):
188
+ bsz, num_heads, q_seq_len, dk = query.size()
189
+ _, _, k_seq_len, _ = key.size()
190
+
191
+ attn_weights = torch.empty(
192
+ bsz * num_heads,
193
+ q_seq_len,
194
+ k_seq_len,
195
+ dtype=torch.float32,
196
+ device=query.device,
197
+ )
198
+
199
+ scale_factor = 1.0
200
+ if self.scale_attn_weights:
201
+ scale_factor /= float(value.size(-1)) ** 0.5
202
+
203
+ with autocast(enabled=False):
204
+ q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(
205
+ -1, dk, k_seq_len
206
+ )
207
+ attn_weights = torch.baddbmm(
208
+ attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor
209
+ )
210
+ attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
211
+
212
+ query_length, key_length = query.size(-2), key.size(-2)
213
+ causal_mask = registered_causal_mask[
214
+ :, :, key_length - query_length : key_length, :key_length
215
+ ]
216
+ mask_value = torch.finfo(attn_weights.dtype).min
217
+ mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
218
+ attn_weights.device
219
+ )
220
+ attn_weights = torch.where(causal_mask, attn_weights, mask_value)
221
+
222
+ if attention_mask is not None:
223
+ attn_weights = attn_weights + attention_mask
224
+
225
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
226
+
227
+ if attn_weights.dtype != torch.float32:
228
+ raise RuntimeError(
229
+ "Error with upcasting, attn_weights does not have dtype torch.float32"
230
+ )
231
+ attn_weights = attn_weights.type(value.dtype)
232
+ attn_weights = self.attn_dropout(attn_weights)
233
+
234
+ if head_mask is not None:
235
+ attn_weights = attn_weights * head_mask
236
+
237
+ attn_output = torch.matmul(attn_weights, value)
238
+
239
+ return attn_output, attn_weights
240
+
241
+ def _split_heads(self, tensor, num_heads, attn_head_size):
242
+ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
243
+ tensor = tensor.view(new_shape)
244
+ return tensor
245
+
246
+ def _merge_heads(self, tensor, num_heads, attn_head_size):
247
+ tensor = tensor.contiguous()
248
+ new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
249
+ return tensor.view(new_shape)
250
+
251
+ def forward(
252
+ self,
253
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
254
+ rotary_pos_emb: Optional[List[torch.Tensor]] = None,
255
+ registered_causal_mask: Optional[torch.Tensor] = None,
256
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
257
+ attention_mask: Optional[torch.FloatTensor] = None,
258
+ head_mask: Optional[torch.FloatTensor] = None,
259
+ encoder_hidden_states: Optional[torch.Tensor] = None,
260
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
261
+ output_attentions: Optional[bool] = False,
262
+ use_cache: Optional[bool] = False,
263
+ ):
264
+
265
+ mixed_x_layer = self.c_attn(hidden_states)
266
+
267
+ query, key, value = mixed_x_layer.split(self.split_size, dim=2)
268
+
269
+ query = self._split_heads(query, self.num_heads, self.head_dim)
270
+ key = self._split_heads(key, self.num_heads, self.head_dim)
271
+ value = self._split_heads(value, self.num_heads, self.head_dim)
272
+
273
+ if rotary_pos_emb is not None:
274
+ cur_len = query.shape[1]
275
+ rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb]
276
+ rotary_pos_emb = (rotary_pos_emb,) * 2
277
+ q_pos_emb, k_pos_emb = rotary_pos_emb
278
+ # Slice the pos emb for current inference
279
+ query = apply_rotary_pos_emb(query, q_pos_emb)
280
+ key = apply_rotary_pos_emb(key, k_pos_emb)
281
+
282
+ if layer_past is not None:
283
+ past_key, past_value = layer_past[0], layer_past[1]
284
+ key = torch.cat((past_key, key), dim=1)
285
+ value = torch.cat((past_value, value), dim=1)
286
+
287
+ if use_cache:
288
+ present = (key, value)
289
+ else:
290
+ present = None
291
+
292
+ if self.use_logn_attn and not self.training:
293
+ if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype:
294
+ self.logn_tensor = self.logn_tensor.to(query.device).type_as(query)
295
+ seq_start = key.size(1) - query.size(1)
296
+ seq_end = key.size(1)
297
+ logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
298
+ query = query * logn_tensor.expand_as(query)
299
+
300
+ query = query.permute(0, 2, 1, 3)
301
+ key = key.permute(0, 2, 1, 3)
302
+ value = value.permute(0, 2, 1, 3)
303
+ attn_output, attn_weight = self._attn(
304
+ query, key, value, registered_causal_mask, attention_mask, head_mask
305
+ )
306
+ context_layer = self._merge_heads(
307
+ attn_output, self.num_heads, self.head_dim
308
+ )
309
+
310
+ attn_output = self.c_proj(context_layer)
311
+
312
+ outputs = (attn_output, present)
313
+ if output_attentions:
314
+ outputs += (attn_weight,)
315
+
316
+ return outputs
317
+
318
+
319
+ class QWenMLP(nn.Module):
320
+ def __init__(self, config):
321
+ super().__init__()
322
+ self.w1 = nn.Linear(
323
+ config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias
324
+ )
325
+ self.w2 = nn.Linear(
326
+ config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias
327
+ )
328
+ ff_dim_in = config.intermediate_size // 2
329
+ self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias)
330
+
331
+ def forward(self, hidden_states):
332
+ a1 = self.w1(hidden_states)
333
+ a2 = self.w2(hidden_states)
334
+ intermediate_parallel = a1 * F.silu(a2)
335
+ output = self.c_proj(intermediate_parallel)
336
+ return output
337
+
338
+ class QWenBlock(nn.Module):
339
+ def __init__(self, config):
340
+ super().__init__()
341
+ hidden_size = config.hidden_size
342
+ self.bf16 = config.bf16
343
+
344
+ self.ln_1 = RMSNorm(
345
+ hidden_size,
346
+ eps=config.layer_norm_epsilon,
347
+ )
348
+ self.attn = QWenAttention(config)
349
+ self.ln_2 = RMSNorm(
350
+ hidden_size,
351
+ eps=config.layer_norm_epsilon,
352
+ )
353
+
354
+ self.mlp = QWenMLP(config)
355
+
356
+ def forward(
357
+ self,
358
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
359
+ rotary_pos_emb: Optional[List[torch.Tensor]] = None,
360
+ registered_causal_mask: Optional[torch.Tensor] = None,
361
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
362
+ attention_mask: Optional[torch.FloatTensor] = None,
363
+ head_mask: Optional[torch.FloatTensor] = None,
364
+ encoder_hidden_states: Optional[torch.Tensor] = None,
365
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
366
+ use_cache: Optional[bool] = False,
367
+ output_attentions: Optional[bool] = False,
368
+ ):
369
+ layernorm_output = self.ln_1(hidden_states)
370
+
371
+ attn_outputs = self.attn(
372
+ layernorm_output,
373
+ rotary_pos_emb,
374
+ registered_causal_mask=registered_causal_mask,
375
+ layer_past=layer_past,
376
+ attention_mask=attention_mask,
377
+ head_mask=head_mask,
378
+ use_cache=use_cache,
379
+ output_attentions=output_attentions,
380
+ )
381
+ attn_output = attn_outputs[0]
382
+
383
+ outputs = attn_outputs[1:]
384
+
385
+ residual = hidden_states
386
+ layernorm_input = attn_output + residual
387
+
388
+ layernorm_output = self.ln_2(layernorm_input)
389
+
390
+ residual = layernorm_input
391
+ mlp_output = self.mlp(layernorm_output)
392
+ hidden_states = residual + mlp_output
393
+
394
+ if use_cache:
395
+ outputs = (hidden_states,) + outputs
396
+ else:
397
+ outputs = (hidden_states,) + outputs[1:]
398
+
399
+ return outputs
400
+
401
+
402
+ class QWenPreTrainedModel(PreTrainedModel):
403
+ config_class = QWenConfig
404
+ base_model_prefix = "transformer"
405
+ is_parallelizable = False
406
+ supports_gradient_checkpointing = True
407
+ _no_split_modules = ["QWenBlock"]
408
+
409
+ def __init__(self, *inputs, **kwargs):
410
+ super().__init__(*inputs, **kwargs)
411
+
412
+ def _init_weights(self, module):
413
+ """Initialize the weights."""
414
+ if isinstance(module, nn.Linear):
415
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
416
+ if module.bias is not None:
417
+ module.bias.data.zero_()
418
+ elif isinstance(module, nn.Embedding):
419
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
420
+ if module.padding_idx is not None:
421
+ module.weight.data[module.padding_idx].zero_()
422
+ elif isinstance(module, RMSNorm):
423
+ module.weight.data.fill_(1.0)
424
+
425
+ for name, p in module.named_parameters():
426
+ if name == "c_proj.weight":
427
+ p.data.normal_(
428
+ mean=0.0,
429
+ std=(
430
+ self.config.initializer_range
431
+ / math.sqrt(2 * self.config.num_hidden_layers)
432
+ ),
433
+ )
434
+
435
+ def _set_gradient_checkpointing(self, module, value=False):
436
+ if isinstance(module, QWenModel):
437
+ module.gradient_checkpointing = value
438
+
439
+
440
+ class QWenModel(QWenPreTrainedModel):
441
+ _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
442
+
443
+ def __init__(self, config):
444
+ super().__init__(config)
445
+ self.vocab_size = config.vocab_size
446
+ self.num_hidden_layers = config.num_hidden_layers
447
+ self.embed_dim = config.hidden_size
448
+
449
+ self.gradient_checkpointing = False
450
+ self.use_dynamic_ntk = config.use_dynamic_ntk
451
+ self.seq_length = config.seq_length
452
+
453
+ self.wte = nn.Embedding(self.vocab_size, self.embed_dim)
454
+
455
+ self.drop = nn.Dropout(config.emb_dropout_prob)
456
+
457
+ if config.rotary_pct == 1.0:
458
+ self.rotary_ndims = None
459
+ else:
460
+ assert config.rotary_pct < 1
461
+ self.rotary_ndims = int(
462
+ config.kv_channels * config.rotary_pct
463
+ )
464
+ dim = (
465
+ self.rotary_ndims
466
+ if self.rotary_ndims is not None
467
+ else config.kv_channels
468
+ )
469
+ self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base)
470
+
471
+ self.use_flash_attn = config.use_flash_attn
472
+ self.is_fp32 = not (config.bf16 or config.fp16)
473
+ self.registered_causal_mask = None
474
+ # if (
475
+ # self.use_flash_attn
476
+ # and flash_attn_unpadded_func is not None
477
+ # and not self.is_fp32
478
+ # ):
479
+ # self.registered_causal_mask = None
480
+ # else:
481
+ # max_positions = config.max_position_embeddings
482
+ # self.register_buffer(
483
+ # "registered_causal_mask",
484
+ # torch.tril(
485
+ # torch.ones((max_positions, max_positions), dtype=torch.bool)
486
+ # ).view(1, 1, max_positions, max_positions),
487
+ # persistent=False,
488
+ # )
489
+
490
+ self.h = nn.ModuleList(
491
+ [
492
+ QWenBlock(
493
+ config
494
+ )
495
+ for i in range(config.num_hidden_layers)
496
+ ]
497
+ )
498
+ self.ln_f = RMSNorm(
499
+ self.embed_dim,
500
+ eps=config.layer_norm_epsilon,
501
+ )
502
+
503
+ self.visual = VisionTransformer(**config.visual) # vit + resampler
504
+
505
+ self.post_init()
506
+
507
+ def get_input_embeddings(self):
508
+ return self.wte
509
+
510
+ def set_input_embeddings(self, new_embeddings):
511
+ self.wte = new_embeddings
512
+
513
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
514
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
515
+ # create causal mask
516
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
517
+ combined_attention_mask = None
518
+ if input_shape[-1] > 1:
519
+ combined_attention_mask = _make_causal_mask(
520
+ input_shape,
521
+ inputs_embeds.dtype,
522
+ device=inputs_embeds.device,
523
+ past_key_values_length=past_key_values_length,
524
+ )
525
+
526
+ if attention_mask is not None:
527
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
528
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
529
+ inputs_embeds.device
530
+ )
531
+ combined_attention_mask = (
532
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
533
+ )
534
+
535
+ return combined_attention_mask
536
+
537
+
538
+ def forward(
539
+ self,
540
+ input_ids: Optional[torch.LongTensor] = None,
541
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
542
+ attention_mask: Optional[torch.FloatTensor] = None,
543
+ token_type_ids: Optional[torch.LongTensor] = None,
544
+ position_ids: Optional[torch.LongTensor] = None,
545
+ head_mask: Optional[torch.FloatTensor] = None,
546
+ inputs_embeds: Optional[torch.FloatTensor] = None,
547
+ encoder_hidden_states: Optional[torch.Tensor] = None,
548
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
549
+ use_cache: Optional[bool] = None,
550
+ output_attentions: Optional[bool] = None,
551
+ output_hidden_states: Optional[bool] = None,
552
+ return_dict: Optional[bool] = None,
553
+ ):
554
+ if past_key_values is None and torch.any(input_ids == self.config.visual['image_start_id']):
555
+ bos_pos = torch.where(input_ids == self.config.visual['image_start_id'])
556
+ eos_pos = torch.where(input_ids == self.config.visual['image_start_id'] + 1)
557
+ assert (bos_pos[0] == eos_pos[0]).all()
558
+ img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
559
+ images = []
560
+ # for i, a, b in img_pos:
561
+ # image = input_ids[i][a + 1 : b - 1].tolist()
562
+ # image = image[ : image.index(self.config.visual['image_start_id'] + 2)]
563
+ # images.append(bytes(image).decode('utf-8'))
564
+
565
+ old_i = -1
566
+ images_flag = []
567
+ id_test_flag = 0
568
+ for i, a, b in img_pos:
569
+ image = input_ids[i][a + 1 : b - 1].tolist()
570
+ image = image[ : image.index(self.config.visual['image_start_id'] + 2)]
571
+ images.append(bytes(image).decode('utf-8'))
572
+ if i != old_i:
573
+ if input_ids[i][a-2] == 374:
574
+ id_test_flag = 1
575
+ else:
576
+ id_test_flag = 0
577
+ old_i = i
578
+ if input_ids[i][a-2] == 374:
579
+ images_flag.append(0)
580
+ elif id_test_flag == 1:
581
+ images_flag.append(1)
582
+ else:
583
+ images_flag.append(2)
584
+
585
+ images = self.visual.encode(images, images_flag)
586
+ assert images.shape[0] == len(images)
587
+ fake_images = None
588
+ elif self.training:
589
+ fake_images=torch.zeros(1,3,224,224).to(
590
+ dtype=self.visual.conv1.weight.dtype, device=self.visual.conv1.weight.device)
591
+ images = self.visual(fake_images)
592
+ else:
593
+ fake_images = None
594
+ images = None
595
+
596
+ output_attentions = (
597
+ output_attentions
598
+ if output_attentions is not None
599
+ else self.config.output_attentions
600
+ )
601
+ output_hidden_states = (
602
+ output_hidden_states
603
+ if output_hidden_states is not None
604
+ else self.config.output_hidden_states
605
+ )
606
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
607
+ return_dict = (
608
+ return_dict if return_dict is not None else self.config.use_return_dict
609
+ )
610
+
611
+ if input_ids is not None and inputs_embeds is not None:
612
+ raise ValueError(
613
+ "You cannot specify both input_ids and inputs_embeds at the same time"
614
+ )
615
+ elif input_ids is not None:
616
+ input_shape = input_ids.size()
617
+ input_ids = input_ids.view(-1, input_shape[-1])
618
+ batch_size = input_ids.shape[0]
619
+ elif inputs_embeds is not None:
620
+ input_shape = inputs_embeds.size()[:-1]
621
+ batch_size = inputs_embeds.shape[0]
622
+ else:
623
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
624
+
625
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
626
+
627
+ if token_type_ids is not None:
628
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
629
+ if position_ids is not None:
630
+ position_ids = position_ids.view(-1, input_shape[-1])
631
+
632
+ if past_key_values is None:
633
+ past_length = 0
634
+ past_key_values = tuple([None] * len(self.h))
635
+ else:
636
+ past_length = past_key_values[0][0].size(-2)
637
+
638
+ if position_ids is None:
639
+ position_ids = torch.arange(
640
+ past_length,
641
+ input_shape[-1] + past_length,
642
+ dtype=torch.long,
643
+ device=device,
644
+ )
645
+ position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
646
+
647
+ encoder_attention_mask = None
648
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
649
+
650
+ if inputs_embeds is None:
651
+ inputs_embeds = self.wte(input_ids)
652
+
653
+ if batch_size <= 0:
654
+ raise ValueError("batch_size has to be defined and > 0")
655
+ attention_mask = self._prepare_decoder_attention_mask(
656
+ attention_mask, input_shape, inputs_embeds, past_length
657
+ )
658
+
659
+ hidden_states = inputs_embeds
660
+
661
+ kv_seq_len = hidden_states.size()[1]
662
+ if past_key_values[0] is not None:
663
+ # past key values[0][0] shape: bs * seq_len * head_num * dim
664
+ kv_seq_len += past_key_values[0][0].shape[1]
665
+ if (
666
+ self.use_dynamic_ntk
667
+ and kv_seq_len == hidden_states.size()[1]
668
+ and not self.training
669
+ ):
670
+ context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
671
+ ntk_alpha = 2 ** math.ceil(context_value) - 1
672
+ ntk_alpha = max(ntk_alpha, 1)
673
+ else:
674
+ ntk_alpha = self.rotary_emb._ntk_alpha_cached
675
+
676
+ rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)
677
+ for idx in range(len(rotary_pos_emb)):
678
+ rotary_pos_emb[idx] = rotary_pos_emb[idx].to(hidden_states.device)
679
+
680
+ hidden_states = self.drop(hidden_states).clone()
681
+ if fake_images is not None:
682
+ hidden_states = hidden_states + images.mean()*0
683
+ elif images is not None:
684
+ for idx, (i, a, b) in enumerate(img_pos):
685
+ hidden_states[i][a + 1 : b] = images[idx]
686
+ output_shape = input_shape + (hidden_states.size(-1),)
687
+
688
+ if self.gradient_checkpointing and self.training:
689
+ if use_cache:
690
+ logger.warning_once(
691
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
692
+ )
693
+ use_cache = False
694
+
695
+ presents = () if use_cache else None
696
+ all_self_attentions = () if output_attentions else None
697
+ all_hidden_states = () if output_hidden_states else None
698
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
699
+
700
+ if output_hidden_states:
701
+ all_hidden_states = all_hidden_states + (hidden_states,)
702
+
703
+ if self.gradient_checkpointing and self.training:
704
+
705
+ def create_custom_forward(module):
706
+ def custom_forward(*inputs):
707
+ # None for past_key_value
708
+ return module(*inputs, use_cache, output_attentions)
709
+
710
+ return custom_forward
711
+
712
+ outputs = torch.utils.checkpoint.checkpoint(
713
+ create_custom_forward(block),
714
+ hidden_states,
715
+ rotary_pos_emb,
716
+ self.registered_causal_mask,
717
+ None,
718
+ attention_mask,
719
+ head_mask[i],
720
+ encoder_hidden_states,
721
+ encoder_attention_mask,
722
+ )
723
+ else:
724
+ outputs = block(
725
+ hidden_states,
726
+ layer_past=layer_past,
727
+ rotary_pos_emb=rotary_pos_emb,
728
+ registered_causal_mask=self.registered_causal_mask,
729
+ attention_mask=attention_mask,
730
+ head_mask=head_mask[i],
731
+ encoder_hidden_states=encoder_hidden_states,
732
+ encoder_attention_mask=encoder_attention_mask,
733
+ use_cache=use_cache,
734
+ output_attentions=output_attentions,
735
+ )
736
+
737
+ hidden_states = outputs[0]
738
+ if use_cache is True:
739
+ presents = presents + (outputs[1],)
740
+
741
+ if output_attentions:
742
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
743
+
744
+ hidden_states = self.ln_f(hidden_states)
745
+ hidden_states = hidden_states.view(output_shape)
746
+ # Add last hidden state
747
+ if output_hidden_states:
748
+ all_hidden_states = all_hidden_states + (hidden_states,)
749
+
750
+ if not return_dict:
751
+ return tuple(
752
+ v for v in [hidden_states, presents, all_hidden_states] if v is not None
753
+ )
754
+
755
+ return BaseModelOutputWithPast(
756
+ last_hidden_state=hidden_states,
757
+ past_key_values=presents,
758
+ hidden_states=all_hidden_states,
759
+ attentions=all_self_attentions,
760
+ )
761
+
762
+
763
+ class QWenLMHeadModel(QWenPreTrainedModel):
764
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"]
765
+ _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"]
766
+
767
+ def __init__(self, config):
768
+ super().__init__(config)
769
+ assert (
770
+ config.bf16 + config.fp16 + config.fp32 <= 1
771
+ ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true"
772
+
773
+ autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0
774
+
775
+ if autoset_precision:
776
+ if SUPPORT_BF16:
777
+ logger.warn(
778
+ "The model is automatically converting to bf16 for faster inference. "
779
+ "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
780
+ )
781
+ config.bf16 = True
782
+ elif SUPPORT_FP16:
783
+ logger.warn(
784
+ "The model is automatically converting to fp16 for faster inference. "
785
+ "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
786
+ )
787
+ config.fp16 = True
788
+ else:
789
+ config.fp32 = True
790
+
791
+ if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16:
792
+ logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".")
793
+ if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16:
794
+ logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster")
795
+ if config.fp32:
796
+ if SUPPORT_BF16:
797
+ logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
798
+ elif SUPPORT_FP16:
799
+ logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
800
+
801
+ self.transformer = QWenModel(config)
802
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
803
+
804
+ if config.bf16:
805
+ self.transformer.bfloat16()
806
+ self.lm_head.bfloat16()
807
+ if config.fp16:
808
+ self.transformer.half()
809
+ self.lm_head.half()
810
+ self.post_init()
811
+
812
+ def get_output_embeddings(self):
813
+ return self.lm_head
814
+
815
+ def set_output_embeddings(self, new_embeddings):
816
+ self.lm_head = new_embeddings
817
+
818
+ def prepare_inputs_for_generation(
819
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
820
+ ):
821
+ token_type_ids = kwargs.get("token_type_ids", None)
822
+ if past_key_values:
823
+ input_ids = input_ids[:, -1].unsqueeze(-1)
824
+ if token_type_ids is not None:
825
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
826
+
827
+ attention_mask = kwargs.get("attention_mask", None)
828
+ position_ids = kwargs.get("position_ids", None)
829
+
830
+ if attention_mask is not None and position_ids is None:
831
+ position_ids = attention_mask.long().cumsum(-1) - 1
832
+ position_ids.masked_fill_(attention_mask == 0, 1)
833
+ if past_key_values:
834
+ position_ids = position_ids[:, -1].unsqueeze(-1)
835
+ else:
836
+ position_ids = None
837
+
838
+ if inputs_embeds is not None and past_key_values is None:
839
+ model_inputs = {"inputs_embeds": inputs_embeds}
840
+ else:
841
+ model_inputs = {"input_ids": input_ids}
842
+
843
+ model_inputs.update(
844
+ {
845
+ "past_key_values": past_key_values,
846
+ "use_cache": kwargs.get("use_cache"),
847
+ "position_ids": position_ids,
848
+ "attention_mask": attention_mask,
849
+ "token_type_ids": token_type_ids,
850
+ }
851
+ )
852
+ return model_inputs
853
+
854
+ def forward(
855
+ self,
856
+ input_ids: Optional[torch.LongTensor] = None,
857
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
858
+ attention_mask: Optional[torch.FloatTensor] = None,
859
+ token_type_ids: Optional[torch.LongTensor] = None,
860
+ position_ids: Optional[torch.LongTensor] = None,
861
+ head_mask: Optional[torch.FloatTensor] = None,
862
+ inputs_embeds: Optional[torch.FloatTensor] = None,
863
+ encoder_hidden_states: Optional[torch.Tensor] = None,
864
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
865
+ labels: Optional[torch.LongTensor] = None,
866
+ use_cache: Optional[bool] = None,
867
+ output_attentions: Optional[bool] = None,
868
+ output_hidden_states: Optional[bool] = None,
869
+ return_dict: Optional[bool] = None,
870
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
871
+
872
+ return_dict = (
873
+ return_dict if return_dict is not None else self.config.use_return_dict
874
+ )
875
+
876
+ transformer_outputs = self.transformer(
877
+ input_ids,
878
+ past_key_values=past_key_values,
879
+ attention_mask=attention_mask,
880
+ token_type_ids=token_type_ids,
881
+ position_ids=position_ids,
882
+ head_mask=head_mask,
883
+ inputs_embeds=inputs_embeds,
884
+ encoder_hidden_states=encoder_hidden_states,
885
+ encoder_attention_mask=encoder_attention_mask,
886
+ use_cache=use_cache,
887
+ output_attentions=output_attentions,
888
+ output_hidden_states=output_hidden_states,
889
+ return_dict=return_dict,
890
+ )
891
+ hidden_states = transformer_outputs[0]
892
+
893
+ lm_logits = self.lm_head(hidden_states)
894
+
895
+ loss = None
896
+ if labels is not None:
897
+ labels = labels.to(lm_logits.device)
898
+ shift_logits = lm_logits[..., :-1, :].contiguous()
899
+ shift_labels = labels[..., 1:].contiguous()
900
+ loss_fct = CrossEntropyLoss()
901
+ loss = loss_fct(
902
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
903
+ )
904
+
905
+ if not return_dict:
906
+ output = (lm_logits,) + transformer_outputs[1:]
907
+ return ((loss,) + output) if loss is not None else output
908
+
909
+ return CausalLMOutputWithPast(
910
+ loss=loss,
911
+ logits=lm_logits,
912
+ past_key_values=transformer_outputs.past_key_values,
913
+ hidden_states=transformer_outputs.hidden_states,
914
+ attentions=transformer_outputs.attentions,
915
+ )
916
+
917
+ @staticmethod
918
+ def _reorder_cache(
919
+ past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
920
+ ) -> Tuple[Tuple[torch.Tensor]]:
921
+
922
+ return tuple(
923
+ tuple(
924
+ past_state.index_select(0, beam_idx.to(past_state.device))
925
+ for past_state in layer_past
926
+ )
927
+ for layer_past in past_key_values
928
+ )
929
+
930
+ def chat(
931
+ self,
932
+ tokenizer: PreTrainedTokenizer,
933
+ query: str,
934
+ history: Optional[HistoryType],
935
+ system: str = "You are a helpful assistant.",
936
+ append_history: bool = True,
937
+ stream: Optional[bool] = _SENTINEL,
938
+ stop_words_ids: Optional[List[List[int]]] = None,
939
+ generation_config: Optional[GenerationConfig] = None,
940
+ **kwargs,
941
+ ) -> Tuple[str, HistoryType]:
942
+ generation_config = generation_config if generation_config is not None else self.generation_config
943
+
944
+ assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT
945
+ assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
946
+ if history is None:
947
+ history = []
948
+ if stop_words_ids is None:
949
+ stop_words_ids = []
950
+
951
+ max_window_size = kwargs.get('max_window_size', None)
952
+ if max_window_size is None:
953
+ max_window_size = generation_config.max_window_size
954
+ raw_text, context_tokens = make_context(
955
+ tokenizer,
956
+ query,
957
+ history=history,
958
+ system=system,
959
+ max_window_size=max_window_size,
960
+ chat_format=generation_config.chat_format,
961
+ )
962
+
963
+ stop_words_ids.extend(get_stop_words_ids(
964
+ generation_config.chat_format, tokenizer
965
+ ))
966
+ input_ids = torch.tensor([context_tokens]).to(self.device)
967
+ outputs = self.generate(
968
+ input_ids,
969
+ stop_words_ids=stop_words_ids,
970
+ return_dict_in_generate=False,
971
+ generation_config=generation_config,
972
+ **kwargs,
973
+ )
974
+
975
+ response = decode_tokens(
976
+ outputs[0],
977
+ tokenizer,
978
+ raw_text_len=len(raw_text),
979
+ context_length=len(context_tokens),
980
+ chat_format=generation_config.chat_format,
981
+ verbose=False,
982
+ errors='replace'
983
+ )
984
+
985
+ if append_history:
986
+ history.append((query, response))
987
+
988
+ return response, history
989
+
990
+ def chat_stream(
991
+ self,
992
+ tokenizer: PreTrainedTokenizer,
993
+ query: str,
994
+ history: Optional[HistoryType],
995
+ system: str = "You are a helpful assistant.",
996
+ stop_words_ids: Optional[List[List[int]]] = None,
997
+ logits_processor: Optional[LogitsProcessorList] = None,
998
+ generation_config: Optional[GenerationConfig] = None,
999
+ **kwargs,
1000
+ ) -> Generator[str, Any, None]:
1001
+ generation_config = generation_config if generation_config is not None else self.generation_config
1002
+ assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
1003
+ if history is None:
1004
+ history = []
1005
+ if stop_words_ids is None:
1006
+ stop_words_ids = []
1007
+
1008
+ max_window_size = kwargs.get('max_window_size', None)
1009
+ if max_window_size is None:
1010
+ max_window_size = generation_config.max_window_size
1011
+ raw_text, context_tokens = make_context(
1012
+ tokenizer,
1013
+ query,
1014
+ history=history,
1015
+ system=system,
1016
+ max_window_size=max_window_size,
1017
+ chat_format=generation_config.chat_format,
1018
+ )
1019
+
1020
+ stop_words_ids.extend(get_stop_words_ids(
1021
+ generation_config.chat_format, tokenizer
1022
+ ))
1023
+ if stop_words_ids is not None:
1024
+ stop_words_logits_processor = StopWordsLogitsProcessor(
1025
+ stop_words_ids=stop_words_ids,
1026
+ eos_token_id=generation_config.eos_token_id,
1027
+ )
1028
+ if logits_processor is None:
1029
+ logits_processor = LogitsProcessorList([stop_words_logits_processor])
1030
+ else:
1031
+ logits_processor.append(stop_words_logits_processor)
1032
+ input_ids = torch.tensor([context_tokens]).to(self.device)
1033
+
1034
+ from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig
1035
+ self.__class__.generate_stream = NewGenerationMixin.generate
1036
+ self.__class__.sample_stream = NewGenerationMixin.sample_stream
1037
+ stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True)
1038
+
1039
+ def stream_generator():
1040
+ outputs = []
1041
+ for token in self.generate_stream(
1042
+ input_ids,
1043
+ return_dict_in_generate=False,
1044
+ generation_config=stream_config,
1045
+ logits_processor=logits_processor,
1046
+ seed=-1,
1047
+ **kwargs):
1048
+ outputs.append(token.item())
1049
+ yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore', keep_image_special=True)
1050
+
1051
+ return stream_generator()
1052
+
1053
+ def generate(
1054
+ self,
1055
+ inputs: Optional[torch.Tensor] = None,
1056
+ generation_config: Optional[GenerationConfig] = None,
1057
+ logits_processor: Optional[LogitsProcessorList] = None,
1058
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
1059
+ prefix_allowed_tokens_fn: Optional[
1060
+ Callable[[int, torch.Tensor], List[int]]
1061
+ ] = None,
1062
+ synced_gpus: Optional[bool] = None,
1063
+ assistant_model: Optional["PreTrainedModel"] = None,
1064
+ streamer: Optional["BaseStreamer"] = None,
1065
+ **kwargs,
1066
+ ) -> Union[GenerateOutput, torch.LongTensor]:
1067
+ generation_config = generation_config if generation_config is not None else self.generation_config
1068
+
1069
+ # Process stop_words_ids.
1070
+ stop_words_ids = kwargs.pop("stop_words_ids", None)
1071
+ if stop_words_ids is None and generation_config is not None:
1072
+ stop_words_ids = getattr(generation_config, "stop_words_ids", None)
1073
+ if stop_words_ids is None:
1074
+ stop_words_ids = getattr(generation_config, "stop_words_ids", None)
1075
+
1076
+ if stop_words_ids is not None:
1077
+ stop_words_logits_processor = StopWordsLogitsProcessor(
1078
+ stop_words_ids=stop_words_ids,
1079
+ eos_token_id=generation_config.eos_token_id,
1080
+ )
1081
+ if logits_processor is None:
1082
+ logits_processor = LogitsProcessorList([stop_words_logits_processor])
1083
+ else:
1084
+ logits_processor.append(stop_words_logits_processor)
1085
+
1086
+ return super().generate(
1087
+ inputs,
1088
+ generation_config=generation_config,
1089
+ logits_processor=logits_processor,
1090
+ stopping_criteria=stopping_criteria,
1091
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
1092
+ synced_gpus=synced_gpus,
1093
+ assistant_model=assistant_model,
1094
+ streamer=streamer,
1095
+ **kwargs,
1096
+ )
1097
+
1098
+
1099
+ class RotaryEmbedding(torch.nn.Module):
1100
+ def __init__(self, dim, base=10000):
1101
+ super().__init__()
1102
+ self.dim = dim
1103
+ self.base = base
1104
+ self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
1105
+ if importlib.util.find_spec("einops") is None:
1106
+ raise RuntimeError("einops is required for Rotary Embedding")
1107
+
1108
+ self._rotary_pos_emb_cache = None
1109
+ self._seq_len_cached = 0
1110
+ self._ntk_alpha_cached = 1.0
1111
+
1112
+ def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0):
1113
+ seqlen = max_seq_len + offset
1114
+ if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached:
1115
+ base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
1116
+ self.inv_freq = 1.0 / (
1117
+ base
1118
+ ** (
1119
+ torch.arange(0, self.dim, 2, device=self.inv_freq.device).float()
1120
+ / self.dim
1121
+ )
1122
+ )
1123
+ self._seq_len_cached = max(2 * seqlen, 16)
1124
+ self._ntk_alpha_cached = ntk_alpha
1125
+ seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device)
1126
+ freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq)
1127
+
1128
+ emb = torch.cat((freqs, freqs), dim=-1)
1129
+ from einops import rearrange
1130
+
1131
+ emb = rearrange(emb, "n d -> 1 n 1 d")
1132
+
1133
+ cos, sin = emb.cos(), emb.sin()
1134
+ self._rotary_pos_emb_cache = [cos, sin]
1135
+
1136
+ def forward(self, max_seq_len, offset=0, ntk_alpha=1.0):
1137
+ self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha)
1138
+ cos, sin = self._rotary_pos_emb_cache
1139
+ return [cos[:, offset : offset + max_seq_len], sin[:, offset : offset + max_seq_len]]
1140
+
1141
+
1142
+ def _rotate_half(x):
1143
+ from einops import rearrange
1144
+
1145
+ x = rearrange(x, "... (j d) -> ... j d", j=2)
1146
+ x1, x2 = x.unbind(dim=-2)
1147
+ return torch.cat((-x2, x1), dim=-1)
1148
+
1149
+
1150
+ def apply_rotary_pos_emb(t, freqs):
1151
+ cos, sin = freqs
1152
+ if apply_rotary_emb_func is not None and t.is_cuda:
1153
+ t_ = t.float()
1154
+ cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
1155
+ sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
1156
+ output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
1157
+ return output
1158
+ else:
1159
+ rot_dim = freqs[0].shape[-1]
1160
+ cos, sin = freqs
1161
+ t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
1162
+ t_ = t_.float()
1163
+ t_pass_ = t_pass_.float()
1164
+ t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1165
+ return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1166
+
1167
+
1168
+ class RMSNorm(torch.nn.Module):
1169
+ def __init__(self, dim: int, eps: float = 1e-6):
1170
+ super().__init__()
1171
+ self.eps = eps
1172
+ self.weight = nn.Parameter(torch.ones(dim))
1173
+
1174
+ def _norm(self, x):
1175
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
1176
+
1177
+ def forward(self, x):
1178
+ if rms_norm is not None and x.is_cuda:
1179
+ return rms_norm(x, self.weight, self.eps)
1180
+ else:
1181
+ output = self._norm(x.float()).type_as(x)
1182
+ return output * self.weight
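
Note: chat() and chat_stream() above are the convenience entry points this file defines for multi-turn use. A usage sketch continuing the loading example from the config.json note (chat_stream() additionally needs the transformers_stream_generator package, which it imports lazily):

# Single turn: returns the full reply plus the updated history.
response, history = model.chat(tokenizer, "Hello!", history=None)
print(response)

# Streaming variant: yields the cumulative decoded reply as tokens arrive.
for partial in model.chat_stream(tokenizer, "Say that again, but shorter.", history=history):
    print(partial)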
weights/model-base/prediction.json ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-base/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e329a383fc79ff638accd05840c2d9b4b0b15b2ecbd10fcce4b1da7f1130b281
+ size 9969772643
weights/model-base/pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da0376c6c4f288f7a689a8c75b1b86af1978941682111d88b4c1f6a42641add7
+ size 9344399640
weights/model-base/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,860 @@
+ {
+   "metadata": {
+     "total_size": 19313870336
+   },
+   "weight_map": {
+     "lm_head.weight": "pytorch_model-00002-of-00002.bin",
+     "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.0.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.1.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.10.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.11.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.12.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.13.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.14.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.15.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.16.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.17.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.ln_2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.18.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.ln_1.weight": "pytorch_model-00001-of-00002.bin",
+     "transformer.h.19.ln_2.weight": "pytorch_model-00001-of-00002.bin",
100
+ "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
101
+ "transformer.h.19.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
102
+ "transformer.h.19.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
103
+ "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
104
+ "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
105
+ "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
106
+ "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00002.bin",
107
+ "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00002.bin",
108
+ "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
109
+ "transformer.h.2.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
110
+ "transformer.h.2.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
111
+ "transformer.h.20.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
112
+ "transformer.h.20.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
113
+ "transformer.h.20.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
114
+ "transformer.h.20.ln_1.weight": "pytorch_model-00001-of-00002.bin",
115
+ "transformer.h.20.ln_2.weight": "pytorch_model-00001-of-00002.bin",
116
+ "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
117
+ "transformer.h.20.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
118
+ "transformer.h.20.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
119
+ "transformer.h.21.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
120
+ "transformer.h.21.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
121
+ "transformer.h.21.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
122
+ "transformer.h.21.ln_1.weight": "pytorch_model-00001-of-00002.bin",
123
+ "transformer.h.21.ln_2.weight": "pytorch_model-00001-of-00002.bin",
124
+ "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
125
+ "transformer.h.21.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
126
+ "transformer.h.21.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
127
+ "transformer.h.22.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
128
+ "transformer.h.22.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
129
+ "transformer.h.22.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
130
+ "transformer.h.22.ln_1.weight": "pytorch_model-00002-of-00002.bin",
131
+ "transformer.h.22.ln_2.weight": "pytorch_model-00002-of-00002.bin",
132
+ "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
133
+ "transformer.h.22.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
134
+ "transformer.h.22.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
135
+ "transformer.h.23.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
136
+ "transformer.h.23.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
137
+ "transformer.h.23.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
138
+ "transformer.h.23.ln_1.weight": "pytorch_model-00002-of-00002.bin",
139
+ "transformer.h.23.ln_2.weight": "pytorch_model-00002-of-00002.bin",
140
+ "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
141
+ "transformer.h.23.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
142
+ "transformer.h.23.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
143
+ "transformer.h.24.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
144
+ "transformer.h.24.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
145
+ "transformer.h.24.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
146
+ "transformer.h.24.ln_1.weight": "pytorch_model-00002-of-00002.bin",
147
+ "transformer.h.24.ln_2.weight": "pytorch_model-00002-of-00002.bin",
148
+ "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
149
+ "transformer.h.24.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
150
+ "transformer.h.24.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
151
+ "transformer.h.25.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
152
+ "transformer.h.25.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
153
+ "transformer.h.25.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
154
+ "transformer.h.25.ln_1.weight": "pytorch_model-00002-of-00002.bin",
155
+ "transformer.h.25.ln_2.weight": "pytorch_model-00002-of-00002.bin",
156
+ "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
157
+ "transformer.h.25.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
158
+ "transformer.h.25.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
159
+ "transformer.h.26.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
160
+ "transformer.h.26.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
161
+ "transformer.h.26.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
162
+ "transformer.h.26.ln_1.weight": "pytorch_model-00002-of-00002.bin",
163
+ "transformer.h.26.ln_2.weight": "pytorch_model-00002-of-00002.bin",
164
+ "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
165
+ "transformer.h.26.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
166
+ "transformer.h.26.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
167
+ "transformer.h.27.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
168
+ "transformer.h.27.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
169
+ "transformer.h.27.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
170
+ "transformer.h.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
171
+ "transformer.h.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
172
+ "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
173
+ "transformer.h.27.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
174
+ "transformer.h.27.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
175
+ "transformer.h.28.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
176
+ "transformer.h.28.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
177
+ "transformer.h.28.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
178
+ "transformer.h.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
179
+ "transformer.h.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
180
+ "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
181
+ "transformer.h.28.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
182
+ "transformer.h.28.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
183
+ "transformer.h.29.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
184
+ "transformer.h.29.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
185
+ "transformer.h.29.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
186
+ "transformer.h.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
187
+ "transformer.h.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
188
+ "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
189
+ "transformer.h.29.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
190
+ "transformer.h.29.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
191
+ "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
192
+ "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
193
+ "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
194
+ "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00002.bin",
195
+ "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00002.bin",
196
+ "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
197
+ "transformer.h.3.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
198
+ "transformer.h.3.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
199
+ "transformer.h.30.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
200
+ "transformer.h.30.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
201
+ "transformer.h.30.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
202
+ "transformer.h.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
203
+ "transformer.h.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
204
+ "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
205
+ "transformer.h.30.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
206
+ "transformer.h.30.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
207
+ "transformer.h.31.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
208
+ "transformer.h.31.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
209
+ "transformer.h.31.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
210
+ "transformer.h.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
211
+ "transformer.h.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
212
+ "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
213
+ "transformer.h.31.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
214
+ "transformer.h.31.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
215
+ "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
216
+ "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
217
+ "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
218
+ "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00002.bin",
219
+ "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00002.bin",
220
+ "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
221
+ "transformer.h.4.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
222
+ "transformer.h.4.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
223
+ "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
224
+ "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
225
+ "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
226
+ "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00002.bin",
227
+ "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00002.bin",
228
+ "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
229
+ "transformer.h.5.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
230
+ "transformer.h.5.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
231
+ "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
232
+ "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
233
+ "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
234
+ "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00002.bin",
235
+ "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00002.bin",
236
+ "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
237
+ "transformer.h.6.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
238
+ "transformer.h.6.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
239
+ "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
240
+ "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
241
+ "transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
242
+ "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00002.bin",
243
+ "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00002.bin",
244
+ "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "transformer.h.7.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
246
+ "transformer.h.7.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
247
+ "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
248
+ "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
249
+ "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
250
+ "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00002.bin",
251
+ "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00002.bin",
252
+ "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
253
+ "transformer.h.8.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
254
+ "transformer.h.8.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
255
+ "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
256
+ "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
257
+ "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
258
+ "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00002.bin",
259
+ "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00002.bin",
260
+ "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
261
+ "transformer.h.9.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
262
+ "transformer.h.9.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
263
+ "transformer.ln_f.weight": "pytorch_model-00002-of-00002.bin",
264
+ "transformer.visual.attn_pool.attn.in_proj_bias": "pytorch_model-00002-of-00002.bin",
265
+ "transformer.visual.attn_pool.attn.in_proj_weight": "pytorch_model-00002-of-00002.bin",
266
+ "transformer.visual.attn_pool.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
267
+ "transformer.visual.attn_pool.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
268
+ "transformer.visual.attn_pool.kv_proj.weight": "pytorch_model-00002-of-00002.bin",
269
+ "transformer.visual.attn_pool.ln_kv.bias": "pytorch_model-00002-of-00002.bin",
270
+ "transformer.visual.attn_pool.ln_kv.weight": "pytorch_model-00002-of-00002.bin",
271
+ "transformer.visual.attn_pool.ln_q.bias": "pytorch_model-00002-of-00002.bin",
272
+ "transformer.visual.attn_pool.ln_q.weight": "pytorch_model-00002-of-00002.bin",
273
+ "transformer.visual.attn_pool.pos_embed": "pytorch_model-00002-of-00002.bin",
274
+ "transformer.visual.attn_pool.query": "pytorch_model-00002-of-00002.bin",
275
+ "transformer.visual.conv1.weight": "pytorch_model-00002-of-00002.bin",
276
+ "transformer.visual.ln_post.bias": "pytorch_model-00002-of-00002.bin",
277
+ "transformer.visual.ln_post.weight": "pytorch_model-00002-of-00002.bin",
278
+ "transformer.visual.ln_pre.bias": "pytorch_model-00002-of-00002.bin",
279
+ "transformer.visual.ln_pre.weight": "pytorch_model-00002-of-00002.bin",
280
+ "transformer.visual.positional_embedding": "pytorch_model-00002-of-00002.bin",
281
+ "transformer.visual.proj": "pytorch_model-00002-of-00002.bin",
282
+ "transformer.visual.transformer.resblocks.0.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
283
+ "transformer.visual.transformer.resblocks.0.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
284
+ "transformer.visual.transformer.resblocks.0.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
285
+ "transformer.visual.transformer.resblocks.0.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
286
+ "transformer.visual.transformer.resblocks.0.ln_1.bias": "pytorch_model-00002-of-00002.bin",
287
+ "transformer.visual.transformer.resblocks.0.ln_1.weight": "pytorch_model-00002-of-00002.bin",
288
+ "transformer.visual.transformer.resblocks.0.ln_2.bias": "pytorch_model-00002-of-00002.bin",
289
+ "transformer.visual.transformer.resblocks.0.ln_2.weight": "pytorch_model-00002-of-00002.bin",
290
+ "transformer.visual.transformer.resblocks.0.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
291
+ "transformer.visual.transformer.resblocks.0.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
292
+ "transformer.visual.transformer.resblocks.0.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
293
+ "transformer.visual.transformer.resblocks.0.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
294
+ "transformer.visual.transformer.resblocks.1.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
295
+ "transformer.visual.transformer.resblocks.1.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
296
+ "transformer.visual.transformer.resblocks.1.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
297
+ "transformer.visual.transformer.resblocks.1.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
298
+ "transformer.visual.transformer.resblocks.1.ln_1.bias": "pytorch_model-00002-of-00002.bin",
299
+ "transformer.visual.transformer.resblocks.1.ln_1.weight": "pytorch_model-00002-of-00002.bin",
300
+ "transformer.visual.transformer.resblocks.1.ln_2.bias": "pytorch_model-00002-of-00002.bin",
301
+ "transformer.visual.transformer.resblocks.1.ln_2.weight": "pytorch_model-00002-of-00002.bin",
302
+ "transformer.visual.transformer.resblocks.1.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
303
+ "transformer.visual.transformer.resblocks.1.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
304
+ "transformer.visual.transformer.resblocks.1.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
305
+ "transformer.visual.transformer.resblocks.1.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
306
+ "transformer.visual.transformer.resblocks.10.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
307
+ "transformer.visual.transformer.resblocks.10.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
308
+ "transformer.visual.transformer.resblocks.10.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
309
+ "transformer.visual.transformer.resblocks.10.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
310
+ "transformer.visual.transformer.resblocks.10.ln_1.bias": "pytorch_model-00002-of-00002.bin",
311
+ "transformer.visual.transformer.resblocks.10.ln_1.weight": "pytorch_model-00002-of-00002.bin",
312
+ "transformer.visual.transformer.resblocks.10.ln_2.bias": "pytorch_model-00002-of-00002.bin",
313
+ "transformer.visual.transformer.resblocks.10.ln_2.weight": "pytorch_model-00002-of-00002.bin",
314
+ "transformer.visual.transformer.resblocks.10.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
315
+ "transformer.visual.transformer.resblocks.10.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
316
+ "transformer.visual.transformer.resblocks.10.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
317
+ "transformer.visual.transformer.resblocks.10.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
318
+ "transformer.visual.transformer.resblocks.11.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
319
+ "transformer.visual.transformer.resblocks.11.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
320
+ "transformer.visual.transformer.resblocks.11.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
321
+ "transformer.visual.transformer.resblocks.11.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
322
+ "transformer.visual.transformer.resblocks.11.ln_1.bias": "pytorch_model-00002-of-00002.bin",
323
+ "transformer.visual.transformer.resblocks.11.ln_1.weight": "pytorch_model-00002-of-00002.bin",
324
+ "transformer.visual.transformer.resblocks.11.ln_2.bias": "pytorch_model-00002-of-00002.bin",
325
+ "transformer.visual.transformer.resblocks.11.ln_2.weight": "pytorch_model-00002-of-00002.bin",
326
+ "transformer.visual.transformer.resblocks.11.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
327
+ "transformer.visual.transformer.resblocks.11.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
328
+ "transformer.visual.transformer.resblocks.11.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
329
+ "transformer.visual.transformer.resblocks.11.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
330
+ "transformer.visual.transformer.resblocks.12.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
331
+ "transformer.visual.transformer.resblocks.12.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
332
+ "transformer.visual.transformer.resblocks.12.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
333
+ "transformer.visual.transformer.resblocks.12.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
334
+ "transformer.visual.transformer.resblocks.12.ln_1.bias": "pytorch_model-00002-of-00002.bin",
335
+ "transformer.visual.transformer.resblocks.12.ln_1.weight": "pytorch_model-00002-of-00002.bin",
336
+ "transformer.visual.transformer.resblocks.12.ln_2.bias": "pytorch_model-00002-of-00002.bin",
337
+ "transformer.visual.transformer.resblocks.12.ln_2.weight": "pytorch_model-00002-of-00002.bin",
338
+ "transformer.visual.transformer.resblocks.12.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
339
+ "transformer.visual.transformer.resblocks.12.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
340
+ "transformer.visual.transformer.resblocks.12.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
341
+ "transformer.visual.transformer.resblocks.12.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
342
+ "transformer.visual.transformer.resblocks.13.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
343
+ "transformer.visual.transformer.resblocks.13.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
344
+ "transformer.visual.transformer.resblocks.13.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
345
+ "transformer.visual.transformer.resblocks.13.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
346
+ "transformer.visual.transformer.resblocks.13.ln_1.bias": "pytorch_model-00002-of-00002.bin",
347
+ "transformer.visual.transformer.resblocks.13.ln_1.weight": "pytorch_model-00002-of-00002.bin",
348
+ "transformer.visual.transformer.resblocks.13.ln_2.bias": "pytorch_model-00002-of-00002.bin",
349
+ "transformer.visual.transformer.resblocks.13.ln_2.weight": "pytorch_model-00002-of-00002.bin",
350
+ "transformer.visual.transformer.resblocks.13.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
351
+ "transformer.visual.transformer.resblocks.13.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
352
+ "transformer.visual.transformer.resblocks.13.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
353
+ "transformer.visual.transformer.resblocks.13.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
354
+ "transformer.visual.transformer.resblocks.14.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
355
+ "transformer.visual.transformer.resblocks.14.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
356
+ "transformer.visual.transformer.resblocks.14.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
357
+ "transformer.visual.transformer.resblocks.14.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
358
+ "transformer.visual.transformer.resblocks.14.ln_1.bias": "pytorch_model-00002-of-00002.bin",
359
+ "transformer.visual.transformer.resblocks.14.ln_1.weight": "pytorch_model-00002-of-00002.bin",
360
+ "transformer.visual.transformer.resblocks.14.ln_2.bias": "pytorch_model-00002-of-00002.bin",
361
+ "transformer.visual.transformer.resblocks.14.ln_2.weight": "pytorch_model-00002-of-00002.bin",
362
+ "transformer.visual.transformer.resblocks.14.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
363
+ "transformer.visual.transformer.resblocks.14.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
364
+ "transformer.visual.transformer.resblocks.14.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
365
+ "transformer.visual.transformer.resblocks.14.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
366
+ "transformer.visual.transformer.resblocks.15.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
367
+ "transformer.visual.transformer.resblocks.15.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
368
+ "transformer.visual.transformer.resblocks.15.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
369
+ "transformer.visual.transformer.resblocks.15.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
370
+ "transformer.visual.transformer.resblocks.15.ln_1.bias": "pytorch_model-00002-of-00002.bin",
371
+ "transformer.visual.transformer.resblocks.15.ln_1.weight": "pytorch_model-00002-of-00002.bin",
372
+ "transformer.visual.transformer.resblocks.15.ln_2.bias": "pytorch_model-00002-of-00002.bin",
373
+ "transformer.visual.transformer.resblocks.15.ln_2.weight": "pytorch_model-00002-of-00002.bin",
374
+ "transformer.visual.transformer.resblocks.15.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
375
+ "transformer.visual.transformer.resblocks.15.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
376
+ "transformer.visual.transformer.resblocks.15.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
377
+ "transformer.visual.transformer.resblocks.15.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
378
+ "transformer.visual.transformer.resblocks.16.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
379
+ "transformer.visual.transformer.resblocks.16.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
380
+ "transformer.visual.transformer.resblocks.16.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
381
+ "transformer.visual.transformer.resblocks.16.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
382
+ "transformer.visual.transformer.resblocks.16.ln_1.bias": "pytorch_model-00002-of-00002.bin",
383
+ "transformer.visual.transformer.resblocks.16.ln_1.weight": "pytorch_model-00002-of-00002.bin",
384
+ "transformer.visual.transformer.resblocks.16.ln_2.bias": "pytorch_model-00002-of-00002.bin",
385
+ "transformer.visual.transformer.resblocks.16.ln_2.weight": "pytorch_model-00002-of-00002.bin",
386
+ "transformer.visual.transformer.resblocks.16.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
387
+ "transformer.visual.transformer.resblocks.16.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
388
+ "transformer.visual.transformer.resblocks.16.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
389
+ "transformer.visual.transformer.resblocks.16.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
390
+ "transformer.visual.transformer.resblocks.17.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
391
+ "transformer.visual.transformer.resblocks.17.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
392
+ "transformer.visual.transformer.resblocks.17.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
393
+ "transformer.visual.transformer.resblocks.17.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
394
+ "transformer.visual.transformer.resblocks.17.ln_1.bias": "pytorch_model-00002-of-00002.bin",
395
+ "transformer.visual.transformer.resblocks.17.ln_1.weight": "pytorch_model-00002-of-00002.bin",
396
+ "transformer.visual.transformer.resblocks.17.ln_2.bias": "pytorch_model-00002-of-00002.bin",
397
+ "transformer.visual.transformer.resblocks.17.ln_2.weight": "pytorch_model-00002-of-00002.bin",
398
+ "transformer.visual.transformer.resblocks.17.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
399
+ "transformer.visual.transformer.resblocks.17.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
400
+ "transformer.visual.transformer.resblocks.17.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
401
+ "transformer.visual.transformer.resblocks.17.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
402
+ "transformer.visual.transformer.resblocks.18.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
403
+ "transformer.visual.transformer.resblocks.18.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
404
+ "transformer.visual.transformer.resblocks.18.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
405
+ "transformer.visual.transformer.resblocks.18.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
406
+ "transformer.visual.transformer.resblocks.18.ln_1.bias": "pytorch_model-00002-of-00002.bin",
407
+ "transformer.visual.transformer.resblocks.18.ln_1.weight": "pytorch_model-00002-of-00002.bin",
408
+ "transformer.visual.transformer.resblocks.18.ln_2.bias": "pytorch_model-00002-of-00002.bin",
409
+ "transformer.visual.transformer.resblocks.18.ln_2.weight": "pytorch_model-00002-of-00002.bin",
410
+ "transformer.visual.transformer.resblocks.18.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
411
+ "transformer.visual.transformer.resblocks.18.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
412
+ "transformer.visual.transformer.resblocks.18.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
413
+ "transformer.visual.transformer.resblocks.18.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
414
+ "transformer.visual.transformer.resblocks.19.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
415
+ "transformer.visual.transformer.resblocks.19.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
416
+ "transformer.visual.transformer.resblocks.19.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
417
+ "transformer.visual.transformer.resblocks.19.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
418
+ "transformer.visual.transformer.resblocks.19.ln_1.bias": "pytorch_model-00002-of-00002.bin",
419
+ "transformer.visual.transformer.resblocks.19.ln_1.weight": "pytorch_model-00002-of-00002.bin",
420
+ "transformer.visual.transformer.resblocks.19.ln_2.bias": "pytorch_model-00002-of-00002.bin",
421
+ "transformer.visual.transformer.resblocks.19.ln_2.weight": "pytorch_model-00002-of-00002.bin",
422
+ "transformer.visual.transformer.resblocks.19.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
423
+ "transformer.visual.transformer.resblocks.19.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
424
+ "transformer.visual.transformer.resblocks.19.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
425
+ "transformer.visual.transformer.resblocks.19.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
426
+ "transformer.visual.transformer.resblocks.2.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
427
+ "transformer.visual.transformer.resblocks.2.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
428
+ "transformer.visual.transformer.resblocks.2.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
429
+ "transformer.visual.transformer.resblocks.2.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
430
+ "transformer.visual.transformer.resblocks.2.ln_1.bias": "pytorch_model-00002-of-00002.bin",
431
+ "transformer.visual.transformer.resblocks.2.ln_1.weight": "pytorch_model-00002-of-00002.bin",
432
+ "transformer.visual.transformer.resblocks.2.ln_2.bias": "pytorch_model-00002-of-00002.bin",
433
+ "transformer.visual.transformer.resblocks.2.ln_2.weight": "pytorch_model-00002-of-00002.bin",
434
+ "transformer.visual.transformer.resblocks.2.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
435
+ "transformer.visual.transformer.resblocks.2.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
436
+ "transformer.visual.transformer.resblocks.2.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
437
+ "transformer.visual.transformer.resblocks.2.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
438
+ "transformer.visual.transformer.resblocks.20.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
439
+ "transformer.visual.transformer.resblocks.20.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
440
+ "transformer.visual.transformer.resblocks.20.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
441
+ "transformer.visual.transformer.resblocks.20.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
442
+ "transformer.visual.transformer.resblocks.20.ln_1.bias": "pytorch_model-00002-of-00002.bin",
443
+ "transformer.visual.transformer.resblocks.20.ln_1.weight": "pytorch_model-00002-of-00002.bin",
444
+ "transformer.visual.transformer.resblocks.20.ln_2.bias": "pytorch_model-00002-of-00002.bin",
445
+ "transformer.visual.transformer.resblocks.20.ln_2.weight": "pytorch_model-00002-of-00002.bin",
446
+ "transformer.visual.transformer.resblocks.20.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
447
+ "transformer.visual.transformer.resblocks.20.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
448
+ "transformer.visual.transformer.resblocks.20.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
449
+ "transformer.visual.transformer.resblocks.20.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
450
+ "transformer.visual.transformer.resblocks.21.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
451
+ "transformer.visual.transformer.resblocks.21.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
452
+ "transformer.visual.transformer.resblocks.21.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
453
+ "transformer.visual.transformer.resblocks.21.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
454
+ "transformer.visual.transformer.resblocks.21.ln_1.bias": "pytorch_model-00002-of-00002.bin",
455
+ "transformer.visual.transformer.resblocks.21.ln_1.weight": "pytorch_model-00002-of-00002.bin",
456
+ "transformer.visual.transformer.resblocks.21.ln_2.bias": "pytorch_model-00002-of-00002.bin",
457
+ "transformer.visual.transformer.resblocks.21.ln_2.weight": "pytorch_model-00002-of-00002.bin",
458
+ "transformer.visual.transformer.resblocks.21.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
459
+ "transformer.visual.transformer.resblocks.21.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
460
+ "transformer.visual.transformer.resblocks.21.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
461
+ "transformer.visual.transformer.resblocks.21.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
462
+ "transformer.visual.transformer.resblocks.22.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
463
+ "transformer.visual.transformer.resblocks.22.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
464
+ "transformer.visual.transformer.resblocks.22.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
465
+ "transformer.visual.transformer.resblocks.22.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
466
+ "transformer.visual.transformer.resblocks.22.ln_1.bias": "pytorch_model-00002-of-00002.bin",
467
+ "transformer.visual.transformer.resblocks.22.ln_1.weight": "pytorch_model-00002-of-00002.bin",
468
+ "transformer.visual.transformer.resblocks.22.ln_2.bias": "pytorch_model-00002-of-00002.bin",
469
+ "transformer.visual.transformer.resblocks.22.ln_2.weight": "pytorch_model-00002-of-00002.bin",
470
+ "transformer.visual.transformer.resblocks.22.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
471
+ "transformer.visual.transformer.resblocks.22.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
472
+ "transformer.visual.transformer.resblocks.22.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
473
+ "transformer.visual.transformer.resblocks.22.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
474
+ "transformer.visual.transformer.resblocks.23.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
475
+ "transformer.visual.transformer.resblocks.23.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
476
+ "transformer.visual.transformer.resblocks.23.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
477
+ "transformer.visual.transformer.resblocks.23.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
478
+ "transformer.visual.transformer.resblocks.23.ln_1.bias": "pytorch_model-00002-of-00002.bin",
479
+ "transformer.visual.transformer.resblocks.23.ln_1.weight": "pytorch_model-00002-of-00002.bin",
480
+ "transformer.visual.transformer.resblocks.23.ln_2.bias": "pytorch_model-00002-of-00002.bin",
481
+ "transformer.visual.transformer.resblocks.23.ln_2.weight": "pytorch_model-00002-of-00002.bin",
482
+ "transformer.visual.transformer.resblocks.23.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
483
+ "transformer.visual.transformer.resblocks.23.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
484
+ "transformer.visual.transformer.resblocks.23.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
485
+ "transformer.visual.transformer.resblocks.23.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
486
+ "transformer.visual.transformer.resblocks.24.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
487
+ "transformer.visual.transformer.resblocks.24.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
488
+ "transformer.visual.transformer.resblocks.24.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
489
+ "transformer.visual.transformer.resblocks.24.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
490
+ "transformer.visual.transformer.resblocks.24.ln_1.bias": "pytorch_model-00002-of-00002.bin",
491
+ "transformer.visual.transformer.resblocks.24.ln_1.weight": "pytorch_model-00002-of-00002.bin",
492
+ "transformer.visual.transformer.resblocks.24.ln_2.bias": "pytorch_model-00002-of-00002.bin",
493
+ "transformer.visual.transformer.resblocks.24.ln_2.weight": "pytorch_model-00002-of-00002.bin",
494
+ "transformer.visual.transformer.resblocks.24.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
495
+ "transformer.visual.transformer.resblocks.24.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
496
+ "transformer.visual.transformer.resblocks.24.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
497
+ "transformer.visual.transformer.resblocks.24.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
498
+ "transformer.visual.transformer.resblocks.25.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
499
+ "transformer.visual.transformer.resblocks.25.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
500
+ "transformer.visual.transformer.resblocks.25.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
501
+ "transformer.visual.transformer.resblocks.25.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
502
+ "transformer.visual.transformer.resblocks.25.ln_1.bias": "pytorch_model-00002-of-00002.bin",
503
+ "transformer.visual.transformer.resblocks.25.ln_1.weight": "pytorch_model-00002-of-00002.bin",
504
+ "transformer.visual.transformer.resblocks.25.ln_2.bias": "pytorch_model-00002-of-00002.bin",
505
+ "transformer.visual.transformer.resblocks.25.ln_2.weight": "pytorch_model-00002-of-00002.bin",
506
+ "transformer.visual.transformer.resblocks.25.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
507
+ "transformer.visual.transformer.resblocks.25.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
508
+ "transformer.visual.transformer.resblocks.25.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
509
+ "transformer.visual.transformer.resblocks.25.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
510
+ "transformer.visual.transformer.resblocks.26.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
511
+ "transformer.visual.transformer.resblocks.26.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
512
+ "transformer.visual.transformer.resblocks.26.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
513
+ "transformer.visual.transformer.resblocks.26.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
514
+ "transformer.visual.transformer.resblocks.26.ln_1.bias": "pytorch_model-00002-of-00002.bin",
515
+ "transformer.visual.transformer.resblocks.26.ln_1.weight": "pytorch_model-00002-of-00002.bin",
516
+ "transformer.visual.transformer.resblocks.26.ln_2.bias": "pytorch_model-00002-of-00002.bin",
517
+ "transformer.visual.transformer.resblocks.26.ln_2.weight": "pytorch_model-00002-of-00002.bin",
518
+ "transformer.visual.transformer.resblocks.26.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
519
+ "transformer.visual.transformer.resblocks.26.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
520
+ "transformer.visual.transformer.resblocks.26.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
521
+ "transformer.visual.transformer.resblocks.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
522
+ "transformer.visual.transformer.resblocks.27.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
523
+ "transformer.visual.transformer.resblocks.27.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
524
+ "transformer.visual.transformer.resblocks.27.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
525
+ "transformer.visual.transformer.resblocks.27.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
526
+ "transformer.visual.transformer.resblocks.27.ln_1.bias": "pytorch_model-00002-of-00002.bin",
527
+ "transformer.visual.transformer.resblocks.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
528
+ "transformer.visual.transformer.resblocks.27.ln_2.bias": "pytorch_model-00002-of-00002.bin",
529
+ "transformer.visual.transformer.resblocks.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
530
+ "transformer.visual.transformer.resblocks.27.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
531
+ "transformer.visual.transformer.resblocks.27.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
532
+ "transformer.visual.transformer.resblocks.27.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
533
+ "transformer.visual.transformer.resblocks.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
534
+ "transformer.visual.transformer.resblocks.28.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
535
+ "transformer.visual.transformer.resblocks.28.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
536
+ "transformer.visual.transformer.resblocks.28.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
537
+ "transformer.visual.transformer.resblocks.28.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
538
+ "transformer.visual.transformer.resblocks.28.ln_1.bias": "pytorch_model-00002-of-00002.bin",
539
+ "transformer.visual.transformer.resblocks.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
540
+ "transformer.visual.transformer.resblocks.28.ln_2.bias": "pytorch_model-00002-of-00002.bin",
541
+ "transformer.visual.transformer.resblocks.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
542
+ "transformer.visual.transformer.resblocks.28.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
543
+ "transformer.visual.transformer.resblocks.28.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
544
+ "transformer.visual.transformer.resblocks.28.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
545
+ "transformer.visual.transformer.resblocks.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
546
+ "transformer.visual.transformer.resblocks.29.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
547
+ "transformer.visual.transformer.resblocks.29.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
548
+ "transformer.visual.transformer.resblocks.29.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
549
+ "transformer.visual.transformer.resblocks.29.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
550
+ "transformer.visual.transformer.resblocks.29.ln_1.bias": "pytorch_model-00002-of-00002.bin",
551
+ "transformer.visual.transformer.resblocks.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
552
+ "transformer.visual.transformer.resblocks.29.ln_2.bias": "pytorch_model-00002-of-00002.bin",
553
+ "transformer.visual.transformer.resblocks.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
554
+ "transformer.visual.transformer.resblocks.29.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
555
+ "transformer.visual.transformer.resblocks.29.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
556
+ "transformer.visual.transformer.resblocks.29.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
557
+ "transformer.visual.transformer.resblocks.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
558
+ "transformer.visual.transformer.resblocks.3.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
559
+ "transformer.visual.transformer.resblocks.3.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
560
+ "transformer.visual.transformer.resblocks.3.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
561
+ "transformer.visual.transformer.resblocks.3.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
562
+ "transformer.visual.transformer.resblocks.3.ln_1.bias": "pytorch_model-00002-of-00002.bin",
563
+ "transformer.visual.transformer.resblocks.3.ln_1.weight": "pytorch_model-00002-of-00002.bin",
564
+ "transformer.visual.transformer.resblocks.3.ln_2.bias": "pytorch_model-00002-of-00002.bin",
565
+ "transformer.visual.transformer.resblocks.3.ln_2.weight": "pytorch_model-00002-of-00002.bin",
566
+ "transformer.visual.transformer.resblocks.3.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
567
+ "transformer.visual.transformer.resblocks.3.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
568
+ "transformer.visual.transformer.resblocks.3.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
569
+ "transformer.visual.transformer.resblocks.3.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
570
+ "transformer.visual.transformer.resblocks.30.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
571
+ "transformer.visual.transformer.resblocks.30.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
572
+ "transformer.visual.transformer.resblocks.30.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
573
+ "transformer.visual.transformer.resblocks.30.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
574
+ "transformer.visual.transformer.resblocks.30.ln_1.bias": "pytorch_model-00002-of-00002.bin",
575
+ "transformer.visual.transformer.resblocks.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
576
+ "transformer.visual.transformer.resblocks.30.ln_2.bias": "pytorch_model-00002-of-00002.bin",
577
+ "transformer.visual.transformer.resblocks.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
578
+ "transformer.visual.transformer.resblocks.30.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
579
+ "transformer.visual.transformer.resblocks.30.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
580
+ "transformer.visual.transformer.resblocks.30.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
581
+ "transformer.visual.transformer.resblocks.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
582
+ "transformer.visual.transformer.resblocks.31.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
583
+ "transformer.visual.transformer.resblocks.31.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
584
+ "transformer.visual.transformer.resblocks.31.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
585
+ "transformer.visual.transformer.resblocks.31.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
586
+ "transformer.visual.transformer.resblocks.31.ln_1.bias": "pytorch_model-00002-of-00002.bin",
587
+ "transformer.visual.transformer.resblocks.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
588
+ "transformer.visual.transformer.resblocks.31.ln_2.bias": "pytorch_model-00002-of-00002.bin",
589
+ "transformer.visual.transformer.resblocks.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
590
+ "transformer.visual.transformer.resblocks.31.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
591
+ "transformer.visual.transformer.resblocks.31.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
592
+ "transformer.visual.transformer.resblocks.31.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
593
+ "transformer.visual.transformer.resblocks.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
594
+ "transformer.visual.transformer.resblocks.32.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
595
+ "transformer.visual.transformer.resblocks.32.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
596
+ "transformer.visual.transformer.resblocks.32.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
597
+ "transformer.visual.transformer.resblocks.32.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
598
+ "transformer.visual.transformer.resblocks.32.ln_1.bias": "pytorch_model-00002-of-00002.bin",
599
+ "transformer.visual.transformer.resblocks.32.ln_1.weight": "pytorch_model-00002-of-00002.bin",
600
+ "transformer.visual.transformer.resblocks.32.ln_2.bias": "pytorch_model-00002-of-00002.bin",
601
+ "transformer.visual.transformer.resblocks.32.ln_2.weight": "pytorch_model-00002-of-00002.bin",
602
+ "transformer.visual.transformer.resblocks.32.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
603
+ "transformer.visual.transformer.resblocks.32.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
604
+ "transformer.visual.transformer.resblocks.32.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
605
+ "transformer.visual.transformer.resblocks.32.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
606
+ "transformer.visual.transformer.resblocks.33.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
607
+ "transformer.visual.transformer.resblocks.33.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
608
+ "transformer.visual.transformer.resblocks.33.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
609
+ "transformer.visual.transformer.resblocks.33.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
610
+ "transformer.visual.transformer.resblocks.33.ln_1.bias": "pytorch_model-00002-of-00002.bin",
611
+ "transformer.visual.transformer.resblocks.33.ln_1.weight": "pytorch_model-00002-of-00002.bin",
612
+ "transformer.visual.transformer.resblocks.33.ln_2.bias": "pytorch_model-00002-of-00002.bin",
613
+ "transformer.visual.transformer.resblocks.33.ln_2.weight": "pytorch_model-00002-of-00002.bin",
614
+ "transformer.visual.transformer.resblocks.33.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
615
+ "transformer.visual.transformer.resblocks.33.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
616
+ "transformer.visual.transformer.resblocks.33.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
617
+ "transformer.visual.transformer.resblocks.33.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
618
+ "transformer.visual.transformer.resblocks.34.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
619
+ "transformer.visual.transformer.resblocks.34.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
620
+ "transformer.visual.transformer.resblocks.34.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
621
+ "transformer.visual.transformer.resblocks.34.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
622
+ "transformer.visual.transformer.resblocks.34.ln_1.bias": "pytorch_model-00002-of-00002.bin",
623
+ "transformer.visual.transformer.resblocks.34.ln_1.weight": "pytorch_model-00002-of-00002.bin",
624
+ "transformer.visual.transformer.resblocks.34.ln_2.bias": "pytorch_model-00002-of-00002.bin",
625
+ "transformer.visual.transformer.resblocks.34.ln_2.weight": "pytorch_model-00002-of-00002.bin",
626
+ "transformer.visual.transformer.resblocks.34.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
627
+ "transformer.visual.transformer.resblocks.34.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
628
+ "transformer.visual.transformer.resblocks.34.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
629
+ "transformer.visual.transformer.resblocks.34.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
630
+ "transformer.visual.transformer.resblocks.35.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
631
+ "transformer.visual.transformer.resblocks.35.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
632
+ "transformer.visual.transformer.resblocks.35.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
633
+ "transformer.visual.transformer.resblocks.35.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
634
+ "transformer.visual.transformer.resblocks.35.ln_1.bias": "pytorch_model-00002-of-00002.bin",
635
+ "transformer.visual.transformer.resblocks.35.ln_1.weight": "pytorch_model-00002-of-00002.bin",
636
+ "transformer.visual.transformer.resblocks.35.ln_2.bias": "pytorch_model-00002-of-00002.bin",
637
+ "transformer.visual.transformer.resblocks.35.ln_2.weight": "pytorch_model-00002-of-00002.bin",
638
+ "transformer.visual.transformer.resblocks.35.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
639
+ "transformer.visual.transformer.resblocks.35.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
640
+ "transformer.visual.transformer.resblocks.35.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
641
+ "transformer.visual.transformer.resblocks.35.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
642
+ "transformer.visual.transformer.resblocks.36.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
643
+ "transformer.visual.transformer.resblocks.36.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
644
+ "transformer.visual.transformer.resblocks.36.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
645
+ "transformer.visual.transformer.resblocks.36.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
646
+ "transformer.visual.transformer.resblocks.36.ln_1.bias": "pytorch_model-00002-of-00002.bin",
647
+ "transformer.visual.transformer.resblocks.36.ln_1.weight": "pytorch_model-00002-of-00002.bin",
648
+ "transformer.visual.transformer.resblocks.36.ln_2.bias": "pytorch_model-00002-of-00002.bin",
649
+ "transformer.visual.transformer.resblocks.36.ln_2.weight": "pytorch_model-00002-of-00002.bin",
650
+ "transformer.visual.transformer.resblocks.36.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
651
+ "transformer.visual.transformer.resblocks.36.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
652
+ "transformer.visual.transformer.resblocks.36.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
653
+ "transformer.visual.transformer.resblocks.36.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
654
+ "transformer.visual.transformer.resblocks.37.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
655
+ "transformer.visual.transformer.resblocks.37.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
656
+ "transformer.visual.transformer.resblocks.37.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
657
+ "transformer.visual.transformer.resblocks.37.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
658
+ "transformer.visual.transformer.resblocks.37.ln_1.bias": "pytorch_model-00002-of-00002.bin",
659
+ "transformer.visual.transformer.resblocks.37.ln_1.weight": "pytorch_model-00002-of-00002.bin",
660
+ "transformer.visual.transformer.resblocks.37.ln_2.bias": "pytorch_model-00002-of-00002.bin",
661
+ "transformer.visual.transformer.resblocks.37.ln_2.weight": "pytorch_model-00002-of-00002.bin",
662
+ "transformer.visual.transformer.resblocks.37.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
663
+ "transformer.visual.transformer.resblocks.37.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
664
+ "transformer.visual.transformer.resblocks.37.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
665
+ "transformer.visual.transformer.resblocks.37.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
666
+ "transformer.visual.transformer.resblocks.38.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
667
+ "transformer.visual.transformer.resblocks.38.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
668
+ "transformer.visual.transformer.resblocks.38.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
669
+ "transformer.visual.transformer.resblocks.38.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
670
+ "transformer.visual.transformer.resblocks.38.ln_1.bias": "pytorch_model-00002-of-00002.bin",
671
+ "transformer.visual.transformer.resblocks.38.ln_1.weight": "pytorch_model-00002-of-00002.bin",
672
+ "transformer.visual.transformer.resblocks.38.ln_2.bias": "pytorch_model-00002-of-00002.bin",
673
+ "transformer.visual.transformer.resblocks.38.ln_2.weight": "pytorch_model-00002-of-00002.bin",
674
+ "transformer.visual.transformer.resblocks.38.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
675
+ "transformer.visual.transformer.resblocks.38.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
676
+ "transformer.visual.transformer.resblocks.38.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
677
+ "transformer.visual.transformer.resblocks.38.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
678
+ "transformer.visual.transformer.resblocks.39.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
679
+ "transformer.visual.transformer.resblocks.39.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
680
+ "transformer.visual.transformer.resblocks.39.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
681
+ "transformer.visual.transformer.resblocks.39.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
682
+ "transformer.visual.transformer.resblocks.39.ln_1.bias": "pytorch_model-00002-of-00002.bin",
683
+ "transformer.visual.transformer.resblocks.39.ln_1.weight": "pytorch_model-00002-of-00002.bin",
684
+ "transformer.visual.transformer.resblocks.39.ln_2.bias": "pytorch_model-00002-of-00002.bin",
685
+ "transformer.visual.transformer.resblocks.39.ln_2.weight": "pytorch_model-00002-of-00002.bin",
686
+ "transformer.visual.transformer.resblocks.39.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
687
+ "transformer.visual.transformer.resblocks.39.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
688
+ "transformer.visual.transformer.resblocks.39.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
689
+ "transformer.visual.transformer.resblocks.39.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
690
+ "transformer.visual.transformer.resblocks.4.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
691
+ "transformer.visual.transformer.resblocks.4.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
692
+ "transformer.visual.transformer.resblocks.4.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
693
+ "transformer.visual.transformer.resblocks.4.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
694
+ "transformer.visual.transformer.resblocks.4.ln_1.bias": "pytorch_model-00002-of-00002.bin",
695
+ "transformer.visual.transformer.resblocks.4.ln_1.weight": "pytorch_model-00002-of-00002.bin",
696
+ "transformer.visual.transformer.resblocks.4.ln_2.bias": "pytorch_model-00002-of-00002.bin",
697
+ "transformer.visual.transformer.resblocks.4.ln_2.weight": "pytorch_model-00002-of-00002.bin",
698
+ "transformer.visual.transformer.resblocks.4.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
699
+ "transformer.visual.transformer.resblocks.4.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
700
+ "transformer.visual.transformer.resblocks.4.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
701
+ "transformer.visual.transformer.resblocks.4.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
702
+ "transformer.visual.transformer.resblocks.40.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
703
+ "transformer.visual.transformer.resblocks.40.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
704
+ "transformer.visual.transformer.resblocks.40.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
705
+ "transformer.visual.transformer.resblocks.40.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
706
+ "transformer.visual.transformer.resblocks.40.ln_1.bias": "pytorch_model-00002-of-00002.bin",
707
+ "transformer.visual.transformer.resblocks.40.ln_1.weight": "pytorch_model-00002-of-00002.bin",
708
+ "transformer.visual.transformer.resblocks.40.ln_2.bias": "pytorch_model-00002-of-00002.bin",
709
+ "transformer.visual.transformer.resblocks.40.ln_2.weight": "pytorch_model-00002-of-00002.bin",
710
+ "transformer.visual.transformer.resblocks.40.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
711
+ "transformer.visual.transformer.resblocks.40.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
712
+ "transformer.visual.transformer.resblocks.40.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
713
+ "transformer.visual.transformer.resblocks.40.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
714
+ "transformer.visual.transformer.resblocks.41.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
715
+ "transformer.visual.transformer.resblocks.41.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
716
+ "transformer.visual.transformer.resblocks.41.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
717
+ "transformer.visual.transformer.resblocks.41.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
718
+ "transformer.visual.transformer.resblocks.41.ln_1.bias": "pytorch_model-00002-of-00002.bin",
719
+ "transformer.visual.transformer.resblocks.41.ln_1.weight": "pytorch_model-00002-of-00002.bin",
720
+ "transformer.visual.transformer.resblocks.41.ln_2.bias": "pytorch_model-00002-of-00002.bin",
721
+ "transformer.visual.transformer.resblocks.41.ln_2.weight": "pytorch_model-00002-of-00002.bin",
722
+ "transformer.visual.transformer.resblocks.41.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
723
+ "transformer.visual.transformer.resblocks.41.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
724
+ "transformer.visual.transformer.resblocks.41.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
725
+ "transformer.visual.transformer.resblocks.41.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
726
+ "transformer.visual.transformer.resblocks.42.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
727
+ "transformer.visual.transformer.resblocks.42.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
728
+ "transformer.visual.transformer.resblocks.42.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
729
+ "transformer.visual.transformer.resblocks.42.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
730
+ "transformer.visual.transformer.resblocks.42.ln_1.bias": "pytorch_model-00002-of-00002.bin",
731
+ "transformer.visual.transformer.resblocks.42.ln_1.weight": "pytorch_model-00002-of-00002.bin",
732
+ "transformer.visual.transformer.resblocks.42.ln_2.bias": "pytorch_model-00002-of-00002.bin",
733
+ "transformer.visual.transformer.resblocks.42.ln_2.weight": "pytorch_model-00002-of-00002.bin",
734
+ "transformer.visual.transformer.resblocks.42.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
735
+ "transformer.visual.transformer.resblocks.42.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
736
+ "transformer.visual.transformer.resblocks.42.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
737
+ "transformer.visual.transformer.resblocks.42.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
738
+ "transformer.visual.transformer.resblocks.43.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
739
+ "transformer.visual.transformer.resblocks.43.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
740
+ "transformer.visual.transformer.resblocks.43.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
741
+ "transformer.visual.transformer.resblocks.43.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
742
+ "transformer.visual.transformer.resblocks.43.ln_1.bias": "pytorch_model-00002-of-00002.bin",
743
+ "transformer.visual.transformer.resblocks.43.ln_1.weight": "pytorch_model-00002-of-00002.bin",
744
+ "transformer.visual.transformer.resblocks.43.ln_2.bias": "pytorch_model-00002-of-00002.bin",
745
+ "transformer.visual.transformer.resblocks.43.ln_2.weight": "pytorch_model-00002-of-00002.bin",
746
+ "transformer.visual.transformer.resblocks.43.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
747
+ "transformer.visual.transformer.resblocks.43.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
748
+ "transformer.visual.transformer.resblocks.43.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
749
+ "transformer.visual.transformer.resblocks.43.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
750
+ "transformer.visual.transformer.resblocks.44.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
751
+ "transformer.visual.transformer.resblocks.44.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
752
+ "transformer.visual.transformer.resblocks.44.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
753
+ "transformer.visual.transformer.resblocks.44.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
754
+ "transformer.visual.transformer.resblocks.44.ln_1.bias": "pytorch_model-00002-of-00002.bin",
755
+ "transformer.visual.transformer.resblocks.44.ln_1.weight": "pytorch_model-00002-of-00002.bin",
756
+ "transformer.visual.transformer.resblocks.44.ln_2.bias": "pytorch_model-00002-of-00002.bin",
757
+ "transformer.visual.transformer.resblocks.44.ln_2.weight": "pytorch_model-00002-of-00002.bin",
758
+ "transformer.visual.transformer.resblocks.44.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
759
+ "transformer.visual.transformer.resblocks.44.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
760
+ "transformer.visual.transformer.resblocks.44.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
761
+ "transformer.visual.transformer.resblocks.44.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
762
+ "transformer.visual.transformer.resblocks.45.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
763
+ "transformer.visual.transformer.resblocks.45.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
764
+ "transformer.visual.transformer.resblocks.45.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
765
+ "transformer.visual.transformer.resblocks.45.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
766
+ "transformer.visual.transformer.resblocks.45.ln_1.bias": "pytorch_model-00002-of-00002.bin",
767
+ "transformer.visual.transformer.resblocks.45.ln_1.weight": "pytorch_model-00002-of-00002.bin",
768
+ "transformer.visual.transformer.resblocks.45.ln_2.bias": "pytorch_model-00002-of-00002.bin",
769
+ "transformer.visual.transformer.resblocks.45.ln_2.weight": "pytorch_model-00002-of-00002.bin",
770
+ "transformer.visual.transformer.resblocks.45.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
771
+ "transformer.visual.transformer.resblocks.45.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
772
+ "transformer.visual.transformer.resblocks.45.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
773
+ "transformer.visual.transformer.resblocks.45.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
774
+ "transformer.visual.transformer.resblocks.46.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
775
+ "transformer.visual.transformer.resblocks.46.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
776
+ "transformer.visual.transformer.resblocks.46.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
777
+ "transformer.visual.transformer.resblocks.46.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
778
+ "transformer.visual.transformer.resblocks.46.ln_1.bias": "pytorch_model-00002-of-00002.bin",
779
+ "transformer.visual.transformer.resblocks.46.ln_1.weight": "pytorch_model-00002-of-00002.bin",
780
+ "transformer.visual.transformer.resblocks.46.ln_2.bias": "pytorch_model-00002-of-00002.bin",
781
+ "transformer.visual.transformer.resblocks.46.ln_2.weight": "pytorch_model-00002-of-00002.bin",
782
+ "transformer.visual.transformer.resblocks.46.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
783
+ "transformer.visual.transformer.resblocks.46.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
784
+ "transformer.visual.transformer.resblocks.46.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
785
+ "transformer.visual.transformer.resblocks.46.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
786
+ "transformer.visual.transformer.resblocks.47.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
787
+ "transformer.visual.transformer.resblocks.47.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
788
+ "transformer.visual.transformer.resblocks.47.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
789
+ "transformer.visual.transformer.resblocks.47.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
790
+ "transformer.visual.transformer.resblocks.47.ln_1.bias": "pytorch_model-00002-of-00002.bin",
791
+ "transformer.visual.transformer.resblocks.47.ln_1.weight": "pytorch_model-00002-of-00002.bin",
792
+ "transformer.visual.transformer.resblocks.47.ln_2.bias": "pytorch_model-00002-of-00002.bin",
793
+ "transformer.visual.transformer.resblocks.47.ln_2.weight": "pytorch_model-00002-of-00002.bin",
794
+ "transformer.visual.transformer.resblocks.47.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
795
+ "transformer.visual.transformer.resblocks.47.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
796
+ "transformer.visual.transformer.resblocks.47.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
797
+ "transformer.visual.transformer.resblocks.47.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
798
+ "transformer.visual.transformer.resblocks.5.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
799
+ "transformer.visual.transformer.resblocks.5.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
800
+ "transformer.visual.transformer.resblocks.5.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
801
+ "transformer.visual.transformer.resblocks.5.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
802
+ "transformer.visual.transformer.resblocks.5.ln_1.bias": "pytorch_model-00002-of-00002.bin",
803
+ "transformer.visual.transformer.resblocks.5.ln_1.weight": "pytorch_model-00002-of-00002.bin",
804
+ "transformer.visual.transformer.resblocks.5.ln_2.bias": "pytorch_model-00002-of-00002.bin",
805
+ "transformer.visual.transformer.resblocks.5.ln_2.weight": "pytorch_model-00002-of-00002.bin",
806
+ "transformer.visual.transformer.resblocks.5.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
807
+ "transformer.visual.transformer.resblocks.5.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
808
+ "transformer.visual.transformer.resblocks.5.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
809
+ "transformer.visual.transformer.resblocks.5.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
810
+ "transformer.visual.transformer.resblocks.6.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
811
+ "transformer.visual.transformer.resblocks.6.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
812
+ "transformer.visual.transformer.resblocks.6.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
813
+ "transformer.visual.transformer.resblocks.6.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
814
+ "transformer.visual.transformer.resblocks.6.ln_1.bias": "pytorch_model-00002-of-00002.bin",
815
+ "transformer.visual.transformer.resblocks.6.ln_1.weight": "pytorch_model-00002-of-00002.bin",
816
+ "transformer.visual.transformer.resblocks.6.ln_2.bias": "pytorch_model-00002-of-00002.bin",
817
+ "transformer.visual.transformer.resblocks.6.ln_2.weight": "pytorch_model-00002-of-00002.bin",
818
+ "transformer.visual.transformer.resblocks.6.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
819
+ "transformer.visual.transformer.resblocks.6.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
820
+ "transformer.visual.transformer.resblocks.6.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
821
+ "transformer.visual.transformer.resblocks.6.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
822
+ "transformer.visual.transformer.resblocks.7.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
823
+ "transformer.visual.transformer.resblocks.7.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
824
+ "transformer.visual.transformer.resblocks.7.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
825
+ "transformer.visual.transformer.resblocks.7.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
826
+ "transformer.visual.transformer.resblocks.7.ln_1.bias": "pytorch_model-00002-of-00002.bin",
827
+ "transformer.visual.transformer.resblocks.7.ln_1.weight": "pytorch_model-00002-of-00002.bin",
828
+ "transformer.visual.transformer.resblocks.7.ln_2.bias": "pytorch_model-00002-of-00002.bin",
829
+ "transformer.visual.transformer.resblocks.7.ln_2.weight": "pytorch_model-00002-of-00002.bin",
830
+ "transformer.visual.transformer.resblocks.7.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
831
+ "transformer.visual.transformer.resblocks.7.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
832
+ "transformer.visual.transformer.resblocks.7.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
833
+ "transformer.visual.transformer.resblocks.7.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
834
+ "transformer.visual.transformer.resblocks.8.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
835
+ "transformer.visual.transformer.resblocks.8.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
836
+ "transformer.visual.transformer.resblocks.8.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
837
+ "transformer.visual.transformer.resblocks.8.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
838
+ "transformer.visual.transformer.resblocks.8.ln_1.bias": "pytorch_model-00002-of-00002.bin",
839
+ "transformer.visual.transformer.resblocks.8.ln_1.weight": "pytorch_model-00002-of-00002.bin",
840
+ "transformer.visual.transformer.resblocks.8.ln_2.bias": "pytorch_model-00002-of-00002.bin",
841
+ "transformer.visual.transformer.resblocks.8.ln_2.weight": "pytorch_model-00002-of-00002.bin",
842
+ "transformer.visual.transformer.resblocks.8.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
843
+ "transformer.visual.transformer.resblocks.8.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
844
+ "transformer.visual.transformer.resblocks.8.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
845
+ "transformer.visual.transformer.resblocks.8.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
846
+ "transformer.visual.transformer.resblocks.9.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
847
+ "transformer.visual.transformer.resblocks.9.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
848
+ "transformer.visual.transformer.resblocks.9.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
849
+ "transformer.visual.transformer.resblocks.9.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
850
+ "transformer.visual.transformer.resblocks.9.ln_1.bias": "pytorch_model-00002-of-00002.bin",
851
+ "transformer.visual.transformer.resblocks.9.ln_1.weight": "pytorch_model-00002-of-00002.bin",
852
+ "transformer.visual.transformer.resblocks.9.ln_2.bias": "pytorch_model-00002-of-00002.bin",
853
+ "transformer.visual.transformer.resblocks.9.ln_2.weight": "pytorch_model-00002-of-00002.bin",
854
+ "transformer.visual.transformer.resblocks.9.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
855
+ "transformer.visual.transformer.resblocks.9.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
856
+ "transformer.visual.transformer.resblocks.9.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
857
+ "transformer.visual.transformer.resblocks.9.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
858
+ "transformer.wte.weight": "pytorch_model-00001-of-00002.bin"
859
+ }
860
+ }
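The weight map above assigns every parameter name to one of the two shard files. As a minimal sketch (the local path below is an assumption based on this repo layout), the index can be used to locate a single tensor without loading both shards:

import json
import torch

index_path = "weights/model-base/pytorch_model.bin.index.json"  # assumed checkout path
with open(index_path) as f:
    weight_map = json.load(f)["weight_map"]

name = "transformer.visual.transformer.resblocks.47.mlp.c_proj.weight"
shard_file = weight_map[name]  # e.g. "pytorch_model-00002-of-00002.bin"
shard = torch.load(f"weights/model-base/{shard_file}", map_location="cpu")
print(name, tuple(shard[name].shape))

When the directory is passed to transformers' from_pretrained, this resolution happens automatically.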
weights/model-base/qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-base/qwen_generation_utils.py ADDED
@@ -0,0 +1,420 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Generation support."""
7
+
8
+ from typing import Tuple, List, Union, Iterable
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from transformers import PreTrainedTokenizer
14
+ from transformers import logging
15
+ from transformers.generation import LogitsProcessor
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+ # Types.
20
+ HistoryType = List[Tuple[str, str]]
21
+ TokensType = List[int]
22
+ BatchTokensType = List[List[int]]
23
+
24
+
25
+ def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> BatchTokensType:
26
+ for tokens in batch:
27
+ context_length = len(tokens)
28
+ if context_length < seq_length:
29
+ tokens.extend([pad_id] * (seq_length - context_length))
30
+ return batch
31
+
32
+
33
+ def get_ltor_masks_and_position_ids(
34
+ data,
35
+ eod_token,
36
+ reset_position_ids,
37
+ reset_attention_mask,
38
+ eod_mask_loss,
39
+ ):
40
+ """Build masks and position id for left to right model."""
41
+
42
+ # Extract batch size and sequence length.
43
+ micro_batch_size, seq_length = data.size()
44
+
45
+ # Attention mask (lower triangular).
46
+ if reset_attention_mask:
47
+ att_mask_batch = micro_batch_size
48
+ else:
49
+ att_mask_batch = 1
50
+ attention_mask = torch.tril(
51
+ torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
52
+ ).view(att_mask_batch, 1, seq_length, seq_length)
53
+
54
+ # Loss mask.
55
+ loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
56
+ if eod_mask_loss:
57
+ loss_mask[data == eod_token] = 0.0
58
+
59
+ # Position ids.
60
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device)
61
+ position_ids = position_ids.unsqueeze(0).expand_as(data)
62
+ # We need to clone as the ids will be modified based on batch index.
63
+ if reset_position_ids:
64
+ position_ids = position_ids.clone()
65
+
66
+ if reset_position_ids or reset_attention_mask:
67
+ # Loop through the batches:
68
+ for b in range(micro_batch_size):
69
+
70
+ # Find indices where the EOD token is.
71
+ eod_index = position_ids[b, data[b] == eod_token]
72
+ # Detach indices from positions if going to modify positions.
73
+ if reset_position_ids:
74
+ eod_index = eod_index.clone()
75
+
76
+ # Loop through EOD indices:
77
+ prev_index = 0
78
+ for j in range(eod_index.size()[0]):
79
+ i = eod_index[j]
80
+ # Mask attention across the EOD boundary.
81
+ if reset_attention_mask:
82
+ attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0
83
+ # Reset positions.
84
+ if reset_position_ids:
85
+ position_ids[b, (i + 1) :] -= i + 1 - prev_index
86
+ prev_index = i + 1
87
+
88
+ # Convert attention mask to binary:
89
+ attention_mask = attention_mask < 0.5
90
+
91
+ return attention_mask, loss_mask, position_ids
92
+
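A toy illustration of what get_ltor_masks_and_position_ids returns (the EOD id below is arbitrary and chosen only for this example):

import torch

EOD = 0  # hypothetical EOD token id for the toy batch
data = torch.tensor([[5, 6, EOD, 7, 8]])
attn_mask, loss_mask, pos_ids = get_ltor_masks_and_position_ids(
    data, EOD, reset_position_ids=False, reset_attention_mask=False, eod_mask_loss=True
)
# attn_mask: shape (1, 1, 5, 5), boolean; True marks masked (future) positions
# loss_mask: [[1., 1., 0., 1., 1.]]  (EOD positions excluded from the loss)
# pos_ids:   [[0, 1, 2, 3, 4]]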
93
+
94
+ def get_batch(context_tokens: torch.LongTensor, eod_id: int):
95
+ """Generate batch from context tokens."""
96
+ # Keep tokens contiguous on their current device.
97
+ tokens = context_tokens.contiguous().to(context_tokens.device)
98
+ # Get the attention mask and position ids.
99
+ attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
100
+ tokens,
101
+ eod_id,
102
+ reset_position_ids=False,
103
+ reset_attention_mask=False,
104
+ eod_mask_loss=False,
105
+ )
106
+ return tokens, attention_mask, position_ids
107
+
108
+
109
+ def get_stop_words_ids(chat_format, tokenizer):
110
+ if chat_format == "raw":
111
+ stop_words_ids = [tokenizer.encode("Human:"), [tokenizer.eod_id]]
112
+ elif chat_format == "chatml":
113
+ stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
114
+ else:
115
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
116
+ return stop_words_ids
117
+
118
+
119
+ def make_context(
120
+ tokenizer: PreTrainedTokenizer,
121
+ query: str,
122
+ history: List[Tuple[str, str]] = None,
123
+ system: str = "",
124
+ max_window_size: int = 6144,
125
+ chat_format: str = "chatml",
126
+ ):
127
+ if history is None:
128
+ history = []
129
+
130
+ if chat_format == "chatml":
131
+ im_start, im_end = "<|im_start|>", "<|im_end|>"
132
+ im_start_tokens = [tokenizer.im_start_id]
133
+ im_end_tokens = [tokenizer.im_end_id]
134
+ nl_tokens = tokenizer.encode("\n")
135
+
136
+ def _tokenize_str(role, content):
137
+ return f"{role}\n{content}", tokenizer.encode(
138
+ role, allowed_special=set(tokenizer.IMAGE_ST)
139
+ ) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST))
140
+
141
+ system_text, system_tokens_part = _tokenize_str("system", system)
142
+ system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
143
+
144
+ raw_text = ""
145
+ context_tokens = []
146
+
147
+ for turn_query, turn_response in reversed(history):
148
+ query_text, query_tokens_part = _tokenize_str("user", turn_query)
149
+ query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
150
+ if turn_response is not None:
151
+ response_text, response_tokens_part = _tokenize_str(
152
+ "assistant", turn_response
153
+ )
154
+ response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
155
+
156
+ next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
157
+ prev_chat = (
158
+ f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
159
+ )
160
+ else:
161
+ next_context_tokens = nl_tokens + query_tokens + nl_tokens
162
+ prev_chat = f"\n{im_start}{query_text}{im_end}\n"
163
+
164
+ current_context_size = (
165
+ len(system_tokens) + len(next_context_tokens) + len(context_tokens)
166
+ )
167
+ if current_context_size < max_window_size:
168
+ context_tokens = next_context_tokens + context_tokens
169
+ raw_text = prev_chat + raw_text
170
+ else:
171
+ break
172
+
173
+ context_tokens = system_tokens + context_tokens
174
+ raw_text = f"{im_start}{system_text}{im_end}" + raw_text
175
+ context_tokens += (
176
+ nl_tokens
177
+ + im_start_tokens
178
+ + _tokenize_str("user", query)[1]
179
+ + im_end_tokens
180
+ + nl_tokens
181
+ + im_start_tokens
182
+ + tokenizer.encode("assistant")
183
+ + nl_tokens
184
+ )
185
+ raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
186
+
187
+ elif chat_format == "raw":
188
+ raw_text = query
189
+ context_tokens = tokenizer.encode(raw_text)
190
+ else:
191
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
192
+
193
+ return raw_text, context_tokens
194
+
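A hedged usage sketch for make_context; the repo path and prompts are illustrative assumptions:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("weights/model-base", trust_remote_code=True)
raw_text, context_tokens = make_context(
    tokenizer,
    query="Describe the picture.",
    history=[("Hello", "Hi, how can I help?")],
    system="You are a helpful assistant.",
    chat_format="chatml",
)
# raw_text starts with "<|im_start|>system\n..." and ends with "<|im_start|>assistant\n";
# context_tokens holds the matching token ids, ready to feed to model.generate.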
195
+
196
+ def _decode_default(
197
+ tokens: List[int],
198
+ *,
199
+ stop_words: List[str],
200
+ eod_words: List[str],
201
+ tokenizer: PreTrainedTokenizer,
202
+ raw_text_len: int,
203
+ verbose: bool = False,
204
+ return_end_reason: bool = False,
205
+ errors: str='replace',
206
+ ):
207
+ trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
208
+ if verbose:
209
+ print("\nRaw Generate: ", trim_decode_tokens)
210
+
211
+ end_reason = f"Gen length {len(tokens)}"
212
+ for stop_word in stop_words:
213
+ trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
214
+ for eod_word in eod_words:
215
+ if eod_word in trim_decode_tokens:
216
+ end_reason = f"Gen {eod_word!r}"
217
+ trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
218
+ trim_decode_tokens = trim_decode_tokens.strip()
219
+ if verbose:
220
+ print("\nEnd Reason:", end_reason)
221
+ print("\nGenerate: ", trim_decode_tokens)
222
+
223
+ if return_end_reason:
224
+ return trim_decode_tokens, end_reason
225
+ else:
226
+ return trim_decode_tokens
227
+
228
+
229
+ def _decode_chatml(
230
+ tokens: List[int],
231
+ *,
232
+ stop_words: List[str],
233
+ eod_token_ids: List[int],
234
+ tokenizer: PreTrainedTokenizer,
235
+ raw_text_len: int,
236
+ context_length: int,
237
+ verbose: bool = False,
238
+ return_end_reason: bool = False,
239
+ errors: str='replace'
240
+ ):
241
+ end_reason = f"Gen length {len(tokens)}"
242
+ eod_token_idx = context_length
243
+ for eod_token_idx in range(context_length, len(tokens)):
244
+ if tokens[eod_token_idx] in eod_token_ids:
245
+ end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
246
+ break
247
+
248
+ trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:]
249
+ if verbose:
250
+ print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:])
251
+ print("\nRaw Generate:", trim_decode_tokens)
252
+ print("\nEnd Reason:", end_reason)
253
+ for stop_word in stop_words:
254
+ trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
255
+ trim_decode_tokens = trim_decode_tokens.strip()
256
+ if verbose:
257
+ print("\nGenerate:", trim_decode_tokens)
258
+
259
+ if return_end_reason:
260
+ return trim_decode_tokens, end_reason
261
+ else:
262
+ return trim_decode_tokens
263
+
264
+
265
+ def decode_tokens(
266
+ tokens: Union[torch.LongTensor, TokensType],
267
+ tokenizer: PreTrainedTokenizer,
268
+ raw_text_len: int,
269
+ context_length: int,
270
+ chat_format: str,
271
+ verbose: bool = False,
272
+ return_end_reason: bool = False,
273
+ errors: str="replace",
274
+ ) -> str:
275
+ if torch.is_tensor(tokens):
276
+ tokens = tokens.cpu().numpy().tolist()
277
+
278
+ if chat_format == "chatml":
279
+ return _decode_chatml(
280
+ tokens,
281
+ stop_words=[],
282
+ eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
283
+ tokenizer=tokenizer,
284
+ raw_text_len=raw_text_len,
285
+ context_length=context_length,
286
+ verbose=verbose,
287
+ return_end_reason=return_end_reason,
288
+ errors=errors,
289
+ )
290
+ elif chat_format == "raw":
291
+ return _decode_default(
292
+ tokens,
293
+ stop_words=["<|endoftext|>"],
294
+ eod_words=["<|endoftext|>"],
295
+ tokenizer=tokenizer,
296
+ raw_text_len=raw_text_len,
297
+ verbose=verbose,
298
+ return_end_reason=return_end_reason,
299
+ errors=errors,
300
+ )
301
+ else:
302
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
303
+
304
+
305
+ class StopWordsLogitsProcessor(LogitsProcessor):
306
+ """
307
+ :class:`transformers.LogitsProcessor` that forces generation to stop once any of the specified stop sequences appears.
308
+
309
+ Args:
310
+ stop_words_ids (:obj:`List[List[int]]`):
311
+ List of lists of stop-sequence token ids. In order to get the token ids of the words
312
+ that should not appear in the generated text, use :obj:`tokenizer(bad_word,
313
+ add_prefix_space=True).input_ids`.
314
+ eos_token_id (:obj:`int`):
315
+ The id of the `end-of-sequence` token.
316
+ """
317
+
318
+ def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
319
+
320
+ if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0:
321
+ raise ValueError(
322
+ f"`stop_words_ids` has to be a non-empty list, but is {stop_words_ids}."
323
+ )
324
+ if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids):
325
+ raise ValueError(
326
+ f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
327
+ )
328
+ if any(
329
+ any(
330
+ (not isinstance(token_id, (int, np.integer)) or token_id < 0)
331
+ for token_id in stop_word_ids
332
+ )
333
+ for stop_word_ids in stop_words_ids
334
+ ):
335
+ raise ValueError(
336
+ f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
337
+ )
338
+
339
+ self.stop_words_ids = list(
340
+ filter(
341
+ lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids
342
+ )
343
+ )
344
+ self.eos_token_id = eos_token_id
345
+ for stop_token_seq in self.stop_words_ids:
346
+ assert (
347
+ len(stop_token_seq) > 0
348
+ ), "Stop words token sequences {} cannot have an empty list".format(
349
+ stop_words_ids
350
+ )
351
+
352
+ def __call__(
353
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor
354
+ ) -> torch.FloatTensor:
355
+ stopped_samples = self._calc_stopped_samples(input_ids)
356
+ for i, should_stop in enumerate(stopped_samples):
357
+ if should_stop:
358
+ scores[i, self.eos_token_id] = float(2**15)
359
+ return scores
360
+
361
+ def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
362
+ if len(tokens) == 0:
363
+ # an empty stop sequence matches trivially
364
+ return True
365
+ elif len(tokens) > len(prev_tokens):
366
+ # if the stop sequence is longer than prev_tokens, it cannot match
367
+ return False
368
+ elif prev_tokens[-len(tokens) :].tolist() == tokens:
369
+ # if tokens match
370
+ return True
371
+ else:
372
+ return False
373
+
374
+ def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]:
375
+ stopped_samples = []
376
+ for prev_input_ids_slice in prev_input_ids:
377
+ match = False
378
+ for stop_token_seq in self.stop_words_ids:
379
+ if self._tokens_match(prev_input_ids_slice, stop_token_seq):
380
+ # a stop sequence matched; mark this sample as stopped
381
+ match = True
382
+ break
383
+ stopped_samples.append(match)
384
+
385
+ return stopped_samples
386
+
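A sketch of how this processor is typically attached to generation; loading of model, tokenizer and input_ids is assumed, and the stop words come from get_stop_words_ids above:

from transformers import LogitsProcessorList

stop_words_ids = get_stop_words_ids("chatml", tokenizer)  # [[im_end_id], [im_start_id]]
processor = StopWordsLogitsProcessor(stop_words_ids=stop_words_ids, eos_token_id=tokenizer.eod_id)
outputs = model.generate(input_ids, logits_processor=LogitsProcessorList([processor]))
# Once a stop sequence is produced, the processor boosts the EOS logit so decoding ends.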
387
+
388
+ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
389
+ """This function has been mostly taken from huggingface conversational
390
+ ai code at
391
+ https://medium.com/huggingface/how-to-build-a-state-of-the-art-
392
+ conversational-ai-with-transfer-learning-2d818ac26313"""
393
+
394
+ if top_k > 0:
395
+ # Remove all tokens with a probability less than the
396
+ # last token of the top-k
397
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
398
+ logits[indices_to_remove] = filter_value
399
+
400
+ if top_p > 0.0:
401
+ # Sort logits in descending order
402
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
403
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
404
+
405
+ # Remove tokens with cumulative probability above the threshold
406
+ sorted_indices_to_remove = cumulative_probs > top_p
407
+ # Shift the indices to the right to keep also the first token
408
+ # above the threshold
409
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
410
+ sorted_indices_to_remove[..., 0] = 0
411
+ for i in range(sorted_indices.size(0)):
412
+ indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
413
+ logits[i][indices_to_remove] = filter_value
414
+
415
+ return logits
416
+
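A small worked example of top_k_logits on dummy values (chosen arbitrarily):

import torch

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
filtered = top_k_logits(logits.clone(), top_k=2)
# the two largest logits survive, the rest become -inf:
# [[2.0, 1.0, -inf, -inf]]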
417
+
418
+ def switch(val1, val2, boolean):
419
+ boolean = boolean.type_as(val1)
420
+ return (1 - boolean) * val1 + boolean * val2
weights/model-base/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "pad_token": "<|endoftext|>"
3
+ }
weights/model-base/tokenization_qwen.py ADDED
@@ -0,0 +1,598 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Tokenization classes for QWen."""
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ import requests
12
+ import unicodedata
13
+ from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Optional
14
+
15
+ import tiktoken
16
+ import numpy as np
17
+ from PIL import Image
18
+ from PIL import ImageFont
19
+ from PIL import ImageDraw
20
+ from transformers import PreTrainedTokenizer, AddedToken
21
+ from transformers.utils import try_to_load_from_cache
22
+
23
+ import matplotlib.colors as mcolors
24
+ from matplotlib.font_manager import FontProperties
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
30
+ FONT_PATH = try_to_load_from_cache("Qwen/Qwen-VL-Chat", "SimSun.ttf")
31
+ if FONT_PATH is None:
32
+ if not os.path.exists("SimSun.ttf"):
33
+ ttf = requests.get("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/SimSun.ttf")
34
+ open("SimSun.ttf", "wb").write(ttf.content)
35
+ FONT_PATH = "SimSun.ttf"
36
+
37
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
38
+ ENDOFTEXT = "<|endoftext|>"
39
+ IMSTART = "<|im_start|>"
40
+ IMEND = "<|im_end|>"
41
+ # as the default behavior is changed to allow special tokens in
42
+ # regular texts, the surface forms of special tokens need to be
43
+ # as different as possible to minimize the impact
44
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
45
+ SPECIAL_TOKENS = (
46
+ ENDOFTEXT,
47
+ IMSTART,
48
+ IMEND,
49
+ ) + EXTRAS
50
+ IMG_TOKEN_SPAN = 256
51
+
52
+
53
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
54
+ with open(tiktoken_bpe_file, "rb") as f:
55
+ contents = f.read()
56
+ return {
57
+ base64.b64decode(token): int(rank)
58
+ for token, rank in (line.split() for line in contents.splitlines() if line)
59
+ }
60
+
61
+ def _list_find(
62
+ input_list: List[Any],
63
+ candidates: Tuple[Any],
64
+ start: int = 0,
65
+ ):
66
+ for i in range(start, len(input_list)):
67
+ if input_list[i] in candidates:
68
+ return i
69
+ return -1
70
+
71
+ def _replace_closed_tag(
72
+ input_tokens: List[Any],
73
+ start_tags: Union[Any, Tuple[Any]],
74
+ end_tags: Union[Any, Tuple[Any]],
75
+ inclusive_replace_func: Callable,
76
+ exclusive_replace_func: Callable = lambda x: x,
77
+ ):
78
+ if isinstance(start_tags, (str, int)):
79
+ start_tags = (start_tags,)
80
+ if isinstance(end_tags, (str, int)):
81
+ end_tags = (end_tags,)
82
+ assert len(start_tags) == len(end_tags)
83
+
84
+ output_tokens = []
85
+ end = 0
86
+ while True:
87
+ start = _list_find(input_tokens, start_tags, end)
88
+ if start == -1:
89
+ break
90
+ output_tokens.extend(exclusive_replace_func(input_tokens[end : start]))
91
+ tag_idx = start_tags.index(input_tokens[start])
92
+ end = _list_find(input_tokens, (end_tags[tag_idx],), start)
93
+ if end == -1:
94
+ raise ValueError("Unclosed image token")
95
+ output_tokens.extend(inclusive_replace_func(input_tokens[start : end + 1]))
96
+ end += 1
97
+ output_tokens.extend(exclusive_replace_func(input_tokens[end : ]))
98
+ return output_tokens
99
+
100
+ class QWenTokenizer(PreTrainedTokenizer):
101
+ """QWen tokenizer."""
102
+
103
+ vocab_files_names = VOCAB_FILES_NAMES
104
+
105
+ def __init__(
106
+ self,
107
+ vocab_file,
108
+ errors="replace",
109
+ image_start_tag='<img>',
110
+ image_end_tag='</img>',
111
+ image_pad_tag='<imgpad>',
112
+ ref_start_tag='<ref>',
113
+ ref_end_tag='</ref>',
114
+ box_start_tag='<box>',
115
+ box_end_tag='</box>',
116
+ quad_start_tag='<quad>',
117
+ quad_end_tag='</quad>',
118
+ **kwargs,
119
+ ):
120
+ super().__init__(**kwargs)
121
+ self.image_start_tag = image_start_tag
122
+ self.image_end_tag = image_end_tag
123
+ self.image_pad_tag = image_pad_tag
124
+ self.ref_start_tag = ref_start_tag
125
+ self.ref_end_tag = ref_end_tag
126
+ self.box_start_tag = box_start_tag
127
+ self.box_end_tag = box_end_tag
128
+ self.quad_start_tag = quad_start_tag
129
+ self.quad_end_tag = quad_end_tag
130
+ self.IMAGE_ST = (
131
+ ref_start_tag, ref_end_tag,
132
+ box_start_tag, box_end_tag,
133
+ quad_start_tag, quad_end_tag,
134
+ image_start_tag, image_end_tag,
135
+ image_pad_tag
136
+ )
137
+
138
+ self.errors = errors # how to handle errors in decoding
139
+
140
+ self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
141
+ self.special_tokens = {
142
+ token: index
143
+ for index, token in enumerate(
144
+ SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
145
+ )
146
+ }
147
+ self.img_start_id = self.special_tokens[self.image_start_tag]
148
+ self.img_end_id = self.special_tokens[self.image_end_tag]
149
+ self.img_pad_id = self.special_tokens[self.image_pad_tag]
150
+ self.ref_start_id = self.special_tokens[self.ref_start_tag]
151
+ self.ref_end_id = self.special_tokens[self.ref_end_tag]
152
+ self.box_start_id = self.special_tokens[self.box_start_tag]
153
+ self.box_end_id = self.special_tokens[self.box_end_tag]
154
+ self.quad_start_id = self.special_tokens[self.quad_start_tag]
155
+ self.quad_end_id = self.special_tokens[self.quad_end_tag]
156
+ self.image_special_tokens = set([
157
+ self.ref_start_id, self.ref_end_id, self.box_start_id, self.box_end_id,
158
+ self.quad_start_id, self.quad_end_id,
159
+ ])
160
+
161
+ enc = tiktoken.Encoding(
162
+ "Qwen",
163
+ pat_str=PAT_STR,
164
+ mergeable_ranks=self.mergeable_ranks,
165
+ special_tokens=self.special_tokens,
166
+ )
167
+ assert (
168
+ len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
169
+ ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
170
+
171
+ self.decoder = {
172
+ v: k for k, v in self.mergeable_ranks.items()
173
+ } # type: dict[int, bytes|str]
174
+ self.decoder.update({v: k for k, v in self.special_tokens.items()})
175
+
176
+ self.tokenizer = enc # type: tiktoken.Encoding
177
+
178
+ self.eod_id = self.tokenizer.eot_token
179
+ self.im_start_id = self.special_tokens[IMSTART]
180
+ self.im_end_id = self.special_tokens[IMEND]
181
+
182
+ def __getstate__(self):
183
+ # for pickle lovers
184
+ state = self.__dict__.copy()
185
+ del state['tokenizer']
186
+ return state
187
+
188
+ def __setstate__(self, state):
189
+ # tokenizer is not python native; don't pass it; rebuild it
190
+ self.__dict__.update(state)
191
+ enc = tiktoken.Encoding(
192
+ "Qwen",
193
+ pat_str=PAT_STR,
194
+ mergeable_ranks=self.mergeable_ranks,
195
+ special_tokens=self.special_tokens,
196
+ )
197
+ self.tokenizer = enc
198
+
199
+
200
+ def __len__(self) -> int:
201
+ return self.tokenizer.n_vocab
202
+
203
+ def get_vocab(self) -> Dict[bytes, int]:
204
+ return self.mergeable_ranks
205
+
206
+ def convert_tokens_to_ids(
207
+ self, tokens: Union[bytes, str, List[Union[bytes, str]]]
208
+ ) -> List[int]:
209
+ ids = []
210
+ if isinstance(tokens, (str, bytes)):
211
+ if tokens in self.special_tokens:
212
+ return self.special_tokens[tokens]
213
+ else:
214
+ return self.mergeable_ranks.get(tokens)
215
+ for token in tokens:
216
+ if token in self.special_tokens:
217
+ ids.append(self.special_tokens[token])
218
+ else:
219
+ ids.append(self.mergeable_ranks.get(token))
220
+ return ids
221
+
222
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
223
+ if not special_tokens and new_tokens:
224
+ raise ValueError('Adding regular tokens is not supported')
225
+ for token in new_tokens:
226
+ surface_form = token.content if isinstance(token, AddedToken) else token
227
+ if surface_form not in SPECIAL_TOKENS + self.IMAGE_ST:
228
+ raise ValueError('Adding unknown special tokens is not supported')
229
+ return 0
230
+
231
+ def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
232
+ """
233
+ Save only the vocabulary of the tokenizer (vocabulary).
234
+
235
+ Returns:
236
+ `Tuple(str)`: Paths to the files saved.
237
+ """
238
+ file_path = os.path.join(save_directory, "qwen.tiktoken")
239
+ with open(file_path, "w", encoding="utf8") as w:
240
+ for k, v in self.mergeable_ranks.items():
241
+ line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
242
+ w.write(line)
243
+ return (file_path,)
244
+
245
+ def tokenize(
246
+ self,
247
+ text: str,
248
+ allowed_special: Union[Set, str] = "all",
249
+ disallowed_special: Union[Collection, str] = (),
250
+ **kwargs,
251
+ ) -> List[Union[bytes, str]]:
252
+ """
253
+ Converts a string in a sequence of tokens.
254
+
255
+ Args:
256
+ text (`str`):
257
+ The sequence to be encoded.
258
+ allowed_special (`Literal["all"]` or `set`):
259
+ The surface forms of the tokens to be encoded as special tokens in regular texts.
260
+ Default to "all".
261
+ disallowed_special (`Literal["all"]` or `Collection`):
262
+ The surface forms of the tokens that should not be in regular texts and trigger errors.
263
+ Default to an empty tuple.
264
+
265
+ kwargs (additional keyword arguments, *optional*):
266
+ Will be passed to the underlying model specific encode method.
267
+
268
+ Returns:
269
+ `List[bytes|str]`: The list of tokens.
270
+ """
271
+ tokens = []
272
+ text = unicodedata.normalize("NFC", text)
273
+
274
+ # this implementation takes a detour: text -> token id -> token surface forms
275
+ for t in self.tokenizer.encode(
276
+ text, allowed_special=allowed_special, disallowed_special=disallowed_special
277
+ ):
278
+ tokens.append(self.decoder[t])
279
+
280
+ def _encode_imgurl(img_tokens):
281
+ assert img_tokens[0] == self.image_start_tag and img_tokens[-1] == self.image_end_tag
282
+ img_tokens = img_tokens[1:-1]
283
+ img_url = b''.join(img_tokens)
284
+ out_img_tokens = list(map(self.decoder.get, img_url))
285
+ if len(out_img_tokens) > IMG_TOKEN_SPAN:
286
+ raise ValueError("The content in {}..{} is too long".format(
287
+ self.image_start_tag, self.image_end_tag))
288
+ out_img_tokens.extend([self.image_pad_tag] * (IMG_TOKEN_SPAN - len(out_img_tokens)))
289
+ out_img_tokens = [self.image_start_tag] + out_img_tokens + [self.image_end_tag]
290
+ return out_img_tokens
291
+
292
+ return _replace_closed_tag(tokens, self.image_start_tag, self.image_end_tag, _encode_imgurl)
293
+
294
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
295
+ """
296
+ Converts a sequence of tokens in a single string.
297
+ """
298
+ text = ""
299
+ temp = b""
300
+ for t in tokens:
301
+ if isinstance(t, str):
302
+ if temp:
303
+ text += temp.decode("utf-8", errors=self.errors)
304
+ temp = b""
305
+ text += t
306
+ elif isinstance(t, bytes):
307
+ temp += t
308
+ else:
309
+ raise TypeError("token should only be of type types or str")
310
+ if temp:
311
+ text += temp.decode("utf-8", errors=self.errors)
312
+ return text
313
+
314
+ @property
315
+ def vocab_size(self):
316
+ return self.tokenizer.n_vocab
317
+
318
+ def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
319
+ """Converts an id to a token, special tokens included"""
320
+ if index in self.decoder:
321
+ return self.decoder[index]
322
+ raise ValueError("unknown ids")
323
+
324
+ def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
325
+ """Converts a token to an id using the vocab, special tokens included"""
326
+ if token in self.special_tokens:
327
+ return self.special_tokens[token]
328
+ if token in self.mergeable_ranks:
329
+ return self.mergeable_ranks[token]
330
+ raise ValueError("unknown token")
331
+
332
+ def _tokenize(self, text: str, **kwargs):
333
+ """
334
+ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
335
+ vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
336
+
337
+ Do NOT take care of added tokens.
338
+ """
339
+ raise NotImplementedError
340
+
341
+ def _decode(
342
+ self,
343
+ token_ids: Union[int, List[int]],
344
+ skip_special_tokens: bool = False,
345
+ errors: str = None,
346
+ **kwargs,
347
+ ) -> str:
348
+ if isinstance(token_ids, int):
349
+ token_ids = [token_ids]
350
+
351
+ def _decode_imgurl(img_token_ids):
352
+ assert img_token_ids[0] == self.img_start_id and img_token_ids[-1] == self.img_end_id
353
+ img_token_ids = img_token_ids[1:-1]
354
+ img_token_ids = img_token_ids[ : img_token_ids.index(self.img_pad_id)]
355
+ img_url = bytes(img_token_ids).decode('utf-8')
356
+ return [self.img_start_id] + self.tokenizer.encode(img_url) + [self.img_end_id]
357
+
358
+ token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
359
+
360
+ if skip_special_tokens:
361
+ if kwargs.get('keep_image_special', False):
362
+ token_ids = [i for i in token_ids if i < self.eod_id
363
+ or i in self.image_special_tokens]
364
+ else:
365
+ token_ids = [i for i in token_ids if i < self.eod_id]
366
+ return self.tokenizer.decode(token_ids, errors=errors or self.errors)
367
+
368
+ def to_list_format(self, text: str):
369
+ text = unicodedata.normalize("NFC", text)
370
+ token_ids = self.tokenizer.encode(
371
+ text, allowed_special=set(self.IMAGE_ST + (ENDOFTEXT,)))
372
+
373
+ def _encode_vl_info(tokens):
374
+ if len(tokens) == 0:
375
+ return []
376
+ if tokens[0] == self.img_start_id and tokens[-1] == self.img_end_id:
377
+ key = 'image'
378
+ elif tokens[0] == self.ref_start_id and tokens[-1] == self.ref_end_id:
379
+ key = 'ref'
380
+ elif tokens[0] == self.box_start_id and tokens[-1] == self.box_end_id:
381
+ key = 'box'
382
+ elif tokens[0] == self.quad_start_id and tokens[-1] == self.quad_end_id:
383
+ key = 'quad'
384
+ else:
385
+ _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
386
+ return [{'text': b''.join(map(_tobytes, map(self.decoder.get, tokens))).decode('utf-8')}]
387
+ _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
388
+ val = b''.join(map(_tobytes, map(self.decoder.get, tokens[1:-1]))).decode('utf-8')
389
+ return [{key: val}]
390
+
391
+ return _replace_closed_tag(
392
+ token_ids,
393
+ (self.img_start_id, self.ref_start_id, self.box_start_id, self.quad_start_id),
394
+ (self.img_end_id, self.ref_end_id, self.box_end_id, self.quad_end_id),
395
+ _encode_vl_info,
396
+ _encode_vl_info,
397
+ )
398
+
399
+ def from_list_format(self, list_format: List[Dict]):
400
+ text = ''
401
+ num_images = 0
402
+ for ele in list_format:
403
+ if 'image' in ele:
404
+ num_images += 1
405
+ text += f'Picture {num_images}: '
406
+ text += self.image_start_tag + ele['image'] + self.image_end_tag
407
+ text += '\n'
408
+ elif 'text' in ele:
409
+ text += ele['text']
410
+ elif 'box' in ele:
411
+ if 'ref' in ele:
412
+ text += self.ref_start_tag + ele['ref'] + self.ref_end_tag
413
+ for box in ele['box']:
414
+ text += self.box_start_tag + '(%d,%d),(%d,%d)' % (box[0], box[1], box[2], box[3]) + self.box_end_tag
415
+ else:
416
+ raise ValueError("Unsupport element: " + str(ele))
417
+ return text
418
+
419
+ def _fetch_latest_picture(self, response, history):
420
+ if history is None:
421
+ history = []
422
+ _history = history + [(response, None)]
423
+ for q, r in _history[::-1]:
424
+ for ele in self.to_list_format(q)[::-1]:
425
+ if 'image' in ele:
426
+ return ele['image']
427
+ return None
428
+
429
+ def _fetch_all_box_with_ref(self, text):
430
+ list_format = self.to_list_format(text)
431
+ output = []
432
+ for i, ele in enumerate(list_format):
433
+ if 'box' in ele:
434
+ bbox = tuple(map(int, ele['box'].replace('(', '').replace(')', '').split(',')))
435
+ assert len(bbox) == 4
436
+ output.append({'box': bbox})
437
+ if i > 0 and 'ref' in list_format[i-1]:
438
+ output[-1]['ref'] = list_format[i-1]['ref'].strip()
439
+ return output
440
+
441
+ def draw_bbox_on_latest_picture(
442
+ self,
443
+ response,
444
+ history=None,
445
+ ) -> Optional[Image.Image]:
446
+ image = self._fetch_latest_picture(response, history)
447
+ if image is None:
448
+ return None
449
+ if image.startswith("http://") or image.startswith("https://"):
450
+ image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
451
+ h, w = image.height, image.width
452
+ else:
453
+ image = np.asarray(Image.open(image).convert("RGB"))
454
+ h, w = image.shape[0], image.shape[1]
455
+ visualizer = Visualizer(image)
456
+
457
+ boxes = self._fetch_all_box_with_ref(response)
458
+ if not boxes:
459
+ return None
460
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()]) # init color
461
+ for box in boxes:
462
+ if 'ref' in box: # random new color for new refexps
463
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()])
464
+ x1, y1, x2, y2 = box['box']
465
+ x1, y1, x2, y2 = (int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h))
466
+ visualizer.draw_box((x1, y1, x2, y2), alpha=1, edge_color=color)
467
+ if 'ref' in box:
468
+ visualizer.draw_text(box['ref'], (x1, y1), color=color, horizontal_alignment="left")
469
+ return visualizer.output
470
+
471
+
472
+ import colorsys
473
+ import logging
474
+ import math
475
+ import numpy as np
476
+ import matplotlib as mpl
477
+ import matplotlib.colors as mplc
478
+ import matplotlib.figure as mplfigure
479
+ import torch
480
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
481
+ from PIL import Image
482
+ import random
483
+
484
+ logger = logging.getLogger(__name__)
485
+
486
+
487
+ class VisImage:
488
+ def __init__(self, img, scale=1.0):
489
+ self.img = img
490
+ self.scale = scale
491
+ self.width, self.height = img.shape[1], img.shape[0]
492
+ self._setup_figure(img)
493
+
494
+ def _setup_figure(self, img):
495
+ fig = mplfigure.Figure(frameon=False)
496
+ self.dpi = fig.get_dpi()
497
+ # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
498
+ # (https://github.com/matplotlib/matplotlib/issues/15363)
499
+ fig.set_size_inches(
500
+ (self.width * self.scale + 1e-2) / self.dpi,
501
+ (self.height * self.scale + 1e-2) / self.dpi,
502
+ )
503
+ self.canvas = FigureCanvasAgg(fig)
504
+ # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
505
+ ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
506
+ ax.axis("off")
507
+ self.fig = fig
508
+ self.ax = ax
509
+ self.reset_image(img)
510
+
511
+ def reset_image(self, img):
512
+ img = img.astype("uint8")
513
+ self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
514
+
515
+ def save(self, filepath):
516
+ self.fig.savefig(filepath)
517
+
518
+ def get_image(self):
519
+ canvas = self.canvas
520
+ s, (width, height) = canvas.print_to_buffer()
521
+
522
+ buffer = np.frombuffer(s, dtype="uint8")
523
+
524
+ img_rgba = buffer.reshape(height, width, 4)
525
+ rgb, alpha = np.split(img_rgba, [3], axis=2)
526
+ return rgb.astype("uint8")
527
+
528
+
529
+ class Visualizer:
530
+ def __init__(self, img_rgb, metadata=None, scale=1.0):
531
+ self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
532
+ self.font_path = FONT_PATH
533
+ self.output = VisImage(self.img, scale=scale)
534
+ self.cpu_device = torch.device("cpu")
535
+
536
+ # too small texts are useless, therefore clamp to 14
537
+ self._default_font_size = max(
538
+ np.sqrt(self.output.height * self.output.width) // 30, 15 // scale
539
+ )
540
+
541
+ def draw_text(
542
+ self,
543
+ text,
544
+ position,
545
+ *,
546
+ font_size=None,
547
+ color="g",
548
+ horizontal_alignment="center",
549
+ rotation=0,
550
+ ):
551
+ if not font_size:
552
+ font_size = self._default_font_size
553
+
554
+ # since the text background is dark, we don't want the text to be dark
555
+ color = np.maximum(list(mplc.to_rgb(color)), 0.2)
556
+ color[np.argmax(color)] = max(0.8, np.max(color))
557
+
558
+ x, y = position
559
+ self.output.ax.text(
560
+ x,
561
+ y,
562
+ text,
563
+ size=font_size * self.output.scale,
564
+ fontproperties=FontProperties(fname=self.font_path),
565
+ bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
566
+ verticalalignment="top",
567
+ horizontalalignment=horizontal_alignment,
568
+ color=color,
569
+ zorder=10,
570
+ rotation=rotation,
571
+ )
572
+ return self.output
573
+
574
+ def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
575
+
576
+ x0, y0, x1, y1 = box_coord
577
+ width = x1 - x0
578
+ height = y1 - y0
579
+
580
+ linewidth = max(self._default_font_size / 4, 1)
581
+
582
+ self.output.ax.add_patch(
583
+ mpl.patches.Rectangle(
584
+ (x0, y0),
585
+ width,
586
+ height,
587
+ fill=False,
588
+ edgecolor=edge_color,
589
+ linewidth=linewidth * self.output.scale,
590
+ alpha=alpha,
591
+ linestyle=line_style,
592
+ )
593
+ )
594
+ return self.output
595
+
596
+ def get_output(self):
597
+
598
+ return self.output
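The two methods at the top of this hunk are the tail of QWenLMHeadModel's grounding helpers: _fetch_all_box_with_ref pulls ref/box spans out of a response via to_list_format, and draw_bbox_on_latest_picture rescales the 0-1000 normalized coordinates to the pixel size of the most recent image and renders them with the appended Visualizer. A minimal usage sketch follows; it assumes the weights/model-base directory from this commit loads through transformers' trust_remote_code path, and that demo.jpg and the query text are placeholders, not files shipped in this repo.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "weights/model-base"
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
# torch_dtype="auto" follows the bf16 setting in config.json; move to GPU as needed.
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype="auto", trust_remote_code=True).eval()

query = tokenizer.from_list_format([
    {"image": "demo.jpg"},                     # placeholder image path
    {"text": "Locate the dog in the image."},  # grounding-style request
])
response, history = model.chat(tokenizer, query=query, history=None)

# If the response contains <ref>...</ref><box>(x1,y1),(x2,y2)</box> spans, this returns a
# VisImage whose boxes were rescaled from the 0-1000 grid to the image's pixel size.
vis = model.draw_bbox_on_latest_picture(response, history)
if vis is not None:
    vis.save("grounded.jpg")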
weights/model-base/tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_qwen.QWenTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 2048,
+   "padding_side": "right",
+   "tokenizer_class": "QWenTokenizer"
+ }
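Because auto_map points AutoTokenizer at the bundled tokenization_qwen.QWenTokenizer, loading has to go through transformers' remote-code path; the tokenizer class, the 2048-token model_max_length, and right padding all come from this file. A short sketch, assuming transformers 4.32 as pinned in config.json:

from transformers import AutoTokenizer

# auto_map above routes AutoTokenizer to tokenization_qwen.QWenTokenizer,
# so trust_remote_code=True is required when loading from this directory.
tokenizer = AutoTokenizer.from_pretrained("weights/model-base", trust_remote_code=True)
print(tokenizer.__class__.__name__)   # expected: QWenTokenizer
print(tokenizer.model_max_length)     # 2048, as set in tokenizer_config.json
print(tokenizer.padding_side)         # "right"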
weights/model-base/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-base/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cff5574e35478085cb9eb62d37ff481a661783b79e9bb13705e6b8c52ebf52bb
+ size 6840
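This is only a Git LFS pointer; the 6.8 KB binary it references is, by the usual Trainer convention, a pickled TrainingArguments object. A hedged inspection sketch, assuming the LFS object has actually been fetched and that the file follows that convention:

import torch

# training_args.bin is normally the pickled transformers.TrainingArguments saved by Trainer.
# weights_only=False because this is an arbitrary pickled object, not a tensor state_dict.
args = torch.load("weights/model-base/training_args.bin", weights_only=False)
print(type(args).__name__)                        # typically TrainingArguments (or a subclass)
print(args.learning_rate, args.num_train_epochs)  # hyperparameters recorded at training time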
weights/model-base/visual.py ADDED
@@ -0,0 +1,545 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import OrderedDict
7
+ import math
8
+ import requests
9
+ from io import BytesIO
10
+ from functools import partial
11
+ from PIL import Image
12
+ from typing import Callable, Optional, Sequence, Tuple, List
13
+ import numpy as np
14
+
15
+ import torch
16
+ from torch import nn
17
+ from torch.nn import functional as F
18
+ from torch.nn.init import normal_
19
+ from torchvision import transforms
20
+ from torchvision.transforms import InterpolationMode
21
+
22
+
23
+ def get_abs_pos(abs_pos, tgt_size):
24
+ # abs_pos: L, C
25
+ # tgt_size: M
26
+ # return: M, C
27
+ src_size = int(math.sqrt(abs_pos.size(0)))
28
+ tgt_size = int(math.sqrt(tgt_size))
29
+ dtype = abs_pos.dtype
30
+
31
+ if src_size != tgt_size:
32
+ return F.interpolate(
33
+ abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
34
+ size=(tgt_size, tgt_size),
35
+ mode="bicubic",
36
+ align_corners=False,
37
+ ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
38
+ else:
39
+ return abs_pos
40
+
41
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
42
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
43
+ """
44
+ grid_size: int of the grid height and width
45
+ return:
46
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
47
+ """
48
+ grid_h = np.arange(grid_size, dtype=np.float32)
49
+ grid_w = np.arange(grid_size, dtype=np.float32)
50
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
51
+ grid = np.stack(grid, axis=0)
52
+
53
+ grid = grid.reshape([2, 1, grid_size, grid_size])
54
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
55
+ if cls_token:
56
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
57
+ return pos_embed
58
+
59
+
60
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
61
+ assert embed_dim % 2 == 0
62
+
63
+ # use half of dimensions to encode grid_h
64
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
65
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
66
+
67
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
68
+ return emb
69
+
70
+
71
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
72
+ """
73
+ embed_dim: output dimension for each position
74
+ pos: a list of positions to be encoded: size (M,)
75
+ out: (M, D)
76
+ """
77
+ assert embed_dim % 2 == 0
78
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
79
+ omega /= embed_dim / 2.
80
+ omega = 1. / 10000**omega # (D/2,)
81
+
82
+ pos = pos.reshape(-1) # (M,)
83
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
84
+
85
+ emb_sin = np.sin(out) # (M, D/2)
86
+ emb_cos = np.cos(out) # (M, D/2)
87
+
88
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
89
+ return emb
90
+
91
+
92
+ class Resampler(nn.Module):
93
+ """
94
+ A 2D perceiver-resampler network with one cross attention layers by
95
+ (grid_size**2) learnable queries and 2d sincos pos_emb
96
+ Outputs:
97
+ A tensor with the shape of (grid_size**2, embed_dim)
98
+ """
99
+ def __init__(
100
+ self,
101
+ grid_size,
102
+ embed_dim,
103
+ num_heads,
104
+ kv_dim=None,
105
+ norm_layer=nn.LayerNorm
106
+ ):
107
+ super().__init__()
108
+ self.num_queries = grid_size ** 2
109
+ self.embed_dim = embed_dim
110
+ self.num_heads = num_heads
111
+
112
+ self.pos_embed = nn.Parameter(
113
+ torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
114
+ ).requires_grad_(False)
115
+
116
+ self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
117
+ normal_(self.query, std=.02)
118
+
119
+ if kv_dim is not None and kv_dim != embed_dim:
120
+ self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
121
+ else:
122
+ self.kv_proj = nn.Identity()
123
+
124
+ self.attn = nn.MultiheadAttention(embed_dim, num_heads) # only out_proj
125
+ self.ln_q = norm_layer(embed_dim)
126
+ self.ln_kv = norm_layer(embed_dim)
127
+
128
+ # new
129
+ # self.attn1 = nn.MultiheadAttention(embed_dim, num_heads)
130
+ # self.attn1.apply(self._init_weights_zero)
131
+ # self.id_query_fc = nn.Linear(embed_dim, embed_dim, bias=False)
132
+ # self.test_feat_fc = nn.Linear(embed_dim, embed_dim, bias=False)
133
+ # self.id_query_fc.apply(self._init_weights_zero)
134
+ # self.test_feat_fc.apply(self._init_weights_zero)
135
+
136
+ # self.apply(self._init_weights)
137
+
138
+ def _init_weights(self, m):
139
+ if isinstance(m, nn.Linear):
140
+ normal_(m.weight, std=.02)
141
+ if isinstance(m, nn.Linear) and m.bias is not None:
142
+ nn.init.constant_(m.bias, 0)
143
+ elif isinstance(m, nn.LayerNorm):
144
+ nn.init.constant_(m.bias, 0)
145
+ nn.init.constant_(m.weight, 1.0)
146
+
147
+ def _init_weights_zero(self, m):
148
+ if isinstance(m, nn.Linear):
149
+ nn.init.constant_(m.weight, 0)
150
+ if isinstance(m, nn.Linear) and m.bias is not None:
151
+ nn.init.constant_(m.bias, 0)
152
+ elif isinstance(m, nn.LayerNorm):
153
+ nn.init.constant_(m.bias, 0)
154
+ nn.init.constant_(m.weight, 1.0)
155
+
156
+ # def forward(self, x, images_flag, attn_mask=None): # idadapter3
157
+ # pos_embed = get_abs_pos(self.pos_embed, x.size(1))
158
+
159
+ # x = self.kv_proj(x)
160
+ # x = self.ln_kv(x).permute(1, 0, 2)
161
+
162
+ # N = x.shape[1]
163
+ # q = self.ln_q(self.query)
164
+ # q_ = self._repeat(q, N)
165
+
166
+ # out_list = []
167
+ # ind = 0
168
+ # while ind < len(images_flag):
169
+ # if images_flag[ind] == 2:
170
+ # end_ind = ind+1
171
+ # while True:
172
+ # if end_ind >= len(images_flag):
173
+ # break
174
+ # if images_flag[end_ind] == 2:
175
+ # end_ind += 1
176
+ # else:
177
+ # break
178
+ # fake_out = self.attn1(
179
+ # q_[:,ind:end_ind,:] + self.pos_embed.unsqueeze(1),
180
+ # x[:,ind:end_ind,:] + pos_embed.unsqueeze(1),
181
+ # x[:,ind:end_ind,:],
182
+ # attn_mask=attn_mask)[0]
183
+
184
+ # llava_out = self.attn(
185
+ # q_[:,ind:end_ind,:] + self.pos_embed.unsqueeze(1),
186
+ # x[:,ind:end_ind,:] + pos_embed.unsqueeze(1),
187
+ # x[:,ind:end_ind,:],
188
+ # attn_mask=attn_mask)[0]
189
+ # llava_out = llava_out + 0 * fake_out
190
+ # ind = end_ind
191
+ # out_list.append(llava_out)
192
+
193
+ # elif images_flag[ind] == 0:
194
+ # id_end_ind = ind+1
195
+ # while True:
196
+ # if id_end_ind >= len(images_flag):
197
+ # break
198
+ # if images_flag[id_end_ind] == 0:
199
+ # id_end_ind += 1
200
+ # else:
201
+ # break
202
+ # id_out = self.attn(
203
+ # q_[:,ind:id_end_ind,:] + self.pos_embed.unsqueeze(1),
204
+ # x[:,ind:id_end_ind,:] + pos_embed.unsqueeze(1),
205
+ # x[:,ind:id_end_ind,:],
206
+ # attn_mask=attn_mask)[0]
207
+ # ind = id_end_ind
208
+ # out_list.append(id_out)
209
+ # test_end_ind = ind+1
210
+ # while True:
211
+ # if test_end_ind >= len(images_flag):
212
+ # break
213
+ # if images_flag[test_end_ind] == 1:
214
+ # test_end_ind += 1
215
+ # else:
216
+ # break
217
+ # id_query = id_out.detach()
218
+ # id_query = id_query.permute(1,0,2).reshape([-1, id_query.shape[-1]])
219
+ # id_query = self._repeat(id_query, test_end_ind-ind)
220
+ # test_feats = x[:,ind:test_end_ind,:]
221
+ # test_out1 = self.attn1(
222
+ # test_feats + pos_embed.unsqueeze(1),
223
+ # id_query,
224
+ # id_query,
225
+ # attn_mask=attn_mask)[0]
226
+ # test_out1 = test_out1 + test_feats # residual
227
+ # test_out2 = self.attn(
228
+ # q_[:,ind:test_end_ind,:] + self.pos_embed.unsqueeze(1),
229
+ # test_out1 + pos_embed.unsqueeze(1),
230
+ # test_out1,
231
+ # attn_mask=attn_mask)[0]
232
+ # ind = test_end_ind
233
+ # out_list.append(test_out2)
234
+
235
+ # else:
236
+ # print('error')
237
+
238
+ # out = torch.cat(out_list, 1)
239
+ # # except:
240
+ # # fake_out = self.attn1(
241
+ # # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
242
+ # # x + pos_embed.unsqueeze(1),
243
+ # # x,
244
+ # # attn_mask=attn_mask)[0]
245
+ # # out = self.attn(
246
+ # # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
247
+ # # x + pos_embed.unsqueeze(1),
248
+ # # x,
249
+ # # attn_mask=attn_mask)[0]
250
+ # # out = out + 0 * fake_out
251
+ # # t = time.time()
252
+ # # with open('/mnt/bn/automl-aigc/yatai/Qwen-VL/bug2.txt', 'a') as f:
253
+ # # f.write(f"{t}: visual id-former error\n")
254
+ # return out.permute(1,0,2)
255
+
256
+ def forward(self, x, images_flag=None, attn_mask=None): # base
257
+
258
+ pos_embed = get_abs_pos(self.pos_embed, x.size(1))
259
+
260
+ x = self.kv_proj(x)
261
+ x = self.ln_kv(x).permute(1, 0, 2)
262
+
263
+ N = x.shape[1]
264
+ q = self.ln_q(self.query)
265
+
266
+ out = self.attn(
267
+ self._repeat(q, N) + self.pos_embed.unsqueeze(1),
268
+ x + pos_embed.unsqueeze(1),
269
+ x,
270
+ attn_mask=attn_mask)[0]
271
+
272
+ return out.permute(1, 0, 2)
273
+
274
+ def _repeat(self, query, N: int):
275
+ return query.unsqueeze(1).repeat(1, N, 1)
276
+
277
+
278
+ class VisualAttention(nn.Module):
279
+ """self-attention layer class.
280
+
281
+ Self-attention layer takes input with size [s, b, h]
282
+ and returns output of the same size.
283
+ """
284
+
285
+ def __init__(self, embed_dim, num_heads,
286
+ bias=True, kdim=None, vdim=None):
287
+ super(VisualAttention, self).__init__()
288
+ self.embed_dim = embed_dim
289
+ self.kdim = kdim if kdim is not None else embed_dim
290
+ self.vdim = vdim if vdim is not None else embed_dim
291
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
292
+
293
+ self.num_heads = num_heads
294
+
295
+ # Per attention head and per partition values.
296
+ assert embed_dim % num_heads == 0
297
+ self.hidden_size_per_attention_head = embed_dim // num_heads
298
+ self.num_attention_heads_per_partition = num_heads
299
+ self.hidden_size_per_partition = embed_dim
300
+
301
+ # Strided linear layer.
302
+ assert self._qkv_same_embed_dim, 'Only Support SelfAttention Currently'
303
+ self.in_proj = nn.Linear(embed_dim, 3 * embed_dim)
304
+ self.out_proj = nn.Linear(embed_dim, embed_dim)
305
+ self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
306
+
307
+ def forward(self, query, key, value, attn_mask = None):
308
+ # query/key/value: [sq, b, h]
309
+ sq, b, _ = query.size()
310
+
311
+ assert torch.allclose(query, key), 'Only Support Self-Attention Currently'
312
+ sk = sq
313
+ mixed_x_layer = self.in_proj(query)
314
+
315
+ # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
316
+ new_tensor_shape = mixed_x_layer.size()[:-1] + \
317
+ (self.num_attention_heads_per_partition,
318
+ 3 * self.hidden_size_per_attention_head)
319
+ mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
320
+
321
+ # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
322
+ query_layer, key_layer, value_layer = mixed_x_layer.split(
323
+ self.hidden_size_per_attention_head, dim=-1)
324
+
325
+ # [sq, b, np, hn] -> [sq, b * np, hn]
326
+ query_layer = query_layer.view(sq,
327
+ b * self.num_attention_heads_per_partition,
328
+ self.hidden_size_per_attention_head).transpose(0, 1)
329
+ # [sk, b, np, hn] -> [sk, b * np, hn]
330
+ key_layer = key_layer.view(sk,
331
+ b * self.num_attention_heads_per_partition,
332
+ self.hidden_size_per_attention_head).transpose(0, 1)
333
+
334
+ q_scaled = query_layer / self.norm_factor
335
+ if attn_mask is not None:
336
+ attention_probs = torch.baddbmm(attn_mask, q_scaled, key_layer.transpose(-2, -1))
337
+ else:
338
+ attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
339
+ attention_probs = attention_probs.softmax(dim=-1)
340
+
341
+ value_layer = value_layer.view(sk,
342
+ b * self.num_attention_heads_per_partition,
343
+ self.hidden_size_per_attention_head).transpose(0, 1)
344
+
345
+ # matmul: [b * np, sq, hn]
346
+ context_layer = torch.bmm(attention_probs, value_layer)
347
+
348
+ # change view [b, np, sq, hn]
349
+ context_layer = context_layer.view(b,
350
+ self.num_attention_heads_per_partition,
351
+ sq, self.hidden_size_per_attention_head)
352
+
353
+ # [b, np, sq, hn] --> [sq, b, np, hn]
354
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
355
+
356
+ # [sq, b, np, hn] --> [sq, b, hp]
357
+ new_context_layer_shape = context_layer.size()[:-2] + \
358
+ (self.hidden_size_per_partition,)
359
+ context_layer = context_layer.view(*new_context_layer_shape)
360
+
361
+ output = self.out_proj(context_layer)
362
+
363
+ return output
364
+
365
+
366
+ class VisualAttentionBlock(nn.Module):
367
+ def __init__(
368
+ self,
369
+ d_model: int,
370
+ n_head: int,
371
+ mlp_ratio: float = 4.0,
372
+ act_layer: Callable = nn.GELU,
373
+ norm_layer: Callable = nn.LayerNorm,
374
+ is_cross_attention: bool = False,
375
+ ):
376
+ super().__init__()
377
+
378
+ self.ln_1 = norm_layer(d_model)
379
+ if is_cross_attention:
380
+ self.ln_1_kv = norm_layer(d_model)
381
+
382
+ self.ln_2 = norm_layer(d_model)
383
+ mlp_width = int(d_model * mlp_ratio)
384
+ self.attn = VisualAttention(d_model, n_head)
385
+ self.mlp = nn.Sequential(OrderedDict([
386
+ ("c_fc", nn.Linear(d_model, mlp_width)),
387
+ ("gelu", act_layer()),
388
+ ("c_proj", nn.Linear(mlp_width, d_model))
389
+ ]))
390
+
391
+ def attention(
392
+ self,
393
+ q_x: torch.Tensor,
394
+ k_x: Optional[torch.Tensor] = None,
395
+ v_x: Optional[torch.Tensor] = None,
396
+ attn_mask: Optional[torch.Tensor] = None,
397
+ ):
398
+ k_x = k_x if k_x is not None else q_x
399
+ v_x = v_x if v_x is not None else q_x
400
+
401
+ attn_mask = attn_mask.to(q_x.dtype) if attn_mask is not None else None
402
+ return self.attn(q_x, k_x, v_x, attn_mask=attn_mask)
403
+
404
+ def forward(
405
+ self,
406
+ q_x: torch.Tensor,
407
+ k_x: Optional[torch.Tensor] = None,
408
+ v_x: Optional[torch.Tensor] = None,
409
+ attn_mask: Optional[torch.Tensor] = None,
410
+ ):
411
+ k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
412
+ v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
413
+
414
+ x = q_x + self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)
415
+ x = x + self.mlp(self.ln_2(x))
416
+ return x
417
+
418
+
419
+ class TransformerBlock(nn.Module):
420
+ def __init__(
421
+ self,
422
+ width: int,
423
+ layers: int,
424
+ heads: int,
425
+ mlp_ratio: float = 4.0,
426
+ act_layer: Callable = nn.GELU,
427
+ norm_layer: Callable = nn.LayerNorm,
428
+ ):
429
+ super().__init__()
430
+ self.width = width
431
+ self.layers = layers
432
+
433
+ self.resblocks = nn.ModuleList([
434
+ VisualAttentionBlock(
435
+ width, heads, mlp_ratio, act_layer=act_layer, norm_layer=norm_layer)
436
+ for _ in range(layers)
437
+ ])
438
+
439
+ def get_cast_dtype(self) -> torch.dtype:
440
+ return self.resblocks[0].mlp.c_fc.weight.dtype
441
+
442
+ def get_cast_device(self) -> torch.device:
443
+ return self.resblocks[0].mlp.c_fc.weight.device
444
+
445
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
446
+ for r in self.resblocks:
447
+ x = r(x, attn_mask=attn_mask)
448
+ return x
449
+
450
+
451
+ class VisionTransformer(nn.Module):
452
+
453
+ def __init__(
454
+ self,
455
+ image_size: int,
456
+ patch_size: int,
457
+ width: int,
458
+ layers: int,
459
+ heads: int,
460
+ mlp_ratio: float,
461
+ n_queries: int = 256,
462
+ output_dim: int = 512,
463
+ **kwargs
464
+ ):
465
+ super().__init__()
466
+ image_height, image_width = self.image_size = (image_size, image_size)
467
+ patch_height, patch_width = self.patch_size = (patch_size, patch_size)
468
+ self.grid_size = (image_height // patch_height, image_width // patch_width)
469
+ self.output_dim = output_dim
470
+
471
+ mean = (0.48145466, 0.4578275, 0.40821073)
472
+ std = (0.26862954, 0.26130258, 0.27577711)
473
+ self.image_transform = transforms.Compose([
474
+ transforms.Resize(
475
+ (image_size, image_size),
476
+ interpolation=InterpolationMode.BICUBIC
477
+ ),
478
+ transforms.ToTensor(),
479
+ transforms.Normalize(mean=mean, std=std),
480
+ ])
481
+
482
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
483
+
484
+ # class embeddings and positional embeddings
485
+ scale = width ** -0.5
486
+ self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
487
+
488
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
489
+ act_layer = nn.GELU
490
+
491
+ self.ln_pre = norm_layer(width)
492
+ self.transformer = TransformerBlock(
493
+ width,
494
+ layers,
495
+ heads,
496
+ mlp_ratio,
497
+ act_layer=act_layer,
498
+ norm_layer=norm_layer,
499
+ )
500
+
501
+ self.attn_pool = Resampler(
502
+ grid_size=int(math.sqrt(n_queries)),
503
+ embed_dim=output_dim,
504
+ num_heads=output_dim // 128,
505
+ kv_dim=width,
506
+ norm_layer=norm_layer,
507
+ )
508
+ self.ln_post = norm_layer(output_dim)
509
+ self.proj = nn.Parameter((output_dim** -0.5) * torch.randn(output_dim, output_dim))
510
+
511
+ def forward(self, x: torch.Tensor, images_flag):
512
+ x = x.to(
513
+ dtype=self.transformer.get_cast_dtype(),
514
+ device=self.transformer.get_cast_device(),
515
+ )
516
+ # to patches
517
+ x = self.conv1(x) # shape = [*, width, grid, grid]
518
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
519
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
520
+
521
+ x = x + get_abs_pos(self.positional_embedding, x.size(1))
522
+
523
+ x = self.ln_pre(x)
524
+
525
+ x = x.permute(1, 0, 2) # NLD -> LND
526
+ x = self.transformer(x)
527
+ x = x.permute(1, 0, 2) # LND -> NLD
528
+
529
+ x = self.attn_pool(x, images_flag)
530
+ x = self.ln_post(x)
531
+ x = x @ self.proj
532
+
533
+ return x
534
+
535
+ def encode(self, image_paths: List[str], images_flag):
536
+ images = []
537
+ for image_path in image_paths:
538
+ if image_path.startswith("http://") or image_path.startswith("https://"):
539
+ image = Image.open(requests.get(image_path, stream=True).raw)
540
+ else:
541
+ image = Image.open(image_path)
542
+ image = image.convert("RGB")
543
+ images.append(self.image_transform(image))
544
+ images = torch.stack(images, dim=0)
545
+ return self(images, images_flag)
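visual.py defines the full vision tower: a ViT (conv1 patchification plus TransformerBlock) whose 2D sine-cosine positional embeddings are resized by get_abs_pos, followed by the Resampler cross-attention pool that maps the patch tokens onto n_queries learned queries of width output_dim. Below is a shape-check sketch; it deliberately uses scaled-down toy hyperparameters (width 64, 2 layers) so it runs quickly with random weights, whereas the shipped checkpoint uses the config.json values (width 1664, 48 layers, 448x448 images, 256 queries, output_dim 4096). The sys.path line is just one convenient way to make the module importable from the checkpoint folder.

import sys
import torch

sys.path.insert(0, "weights/model-base")   # make visual.py importable from the checkpoint dir
from visual import VisionTransformer

# Toy geometry: 56x56 image / 14px patches -> 4x4 = 16 patch tokens, pooled to 16 queries.
vit = VisionTransformer(
    image_size=56, patch_size=14, width=64, layers=2, heads=4,
    mlp_ratio=4.0, n_queries=16, output_dim=128,
).eval()

with torch.no_grad():
    feats = vit(torch.randn(1, 3, 56, 56), images_flag=None)  # the base Resampler ignores images_flag
print(feats.shape)  # torch.Size([1, 16, 128]): n_queries tokens of width output_dim per image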
weights/model-base/zero_to_fp32.py ADDED
@@ -0,0 +1,587 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
25
+ # DeepSpeed data structures it has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
+ @dataclass
33
+ class zero_model_state:
34
+ buffers: dict()
35
+ param_shapes: dict()
36
+ shared_params: list
37
+ ds_version: int
38
+ frozen_param_shapes: dict()
39
+ frozen_param_fragments: dict()
40
+
41
+
42
+ debug = 0
43
+
44
+ # load to cpu
45
+ device = torch.device('cpu')
46
+
47
+
48
+ def atoi(text):
49
+ return int(text) if text.isdigit() else text
50
+
51
+
52
+ def natural_keys(text):
53
+ '''
54
+ alist.sort(key=natural_keys) sorts in human order
55
+ http://nedbatchelder.com/blog/200712/human_sorting.html
56
+ (See Toothy's implementation in the comments)
57
+ '''
58
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
59
+
60
+
61
+ def get_model_state_file(checkpoint_dir, zero_stage):
62
+ if not os.path.isdir(checkpoint_dir):
63
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
64
+
65
+ # there should be only one file
66
+ if zero_stage <= 2:
67
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
68
+ elif zero_stage == 3:
69
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
70
+
71
+ if not os.path.exists(file):
72
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
73
+
74
+ return file
75
+
76
+
77
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
78
+ # XXX: need to test that this simple glob rule works for multi-node setup too
79
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
80
+
81
+ if len(ckpt_files) == 0:
82
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
83
+
84
+ return ckpt_files
85
+
86
+
87
+ def get_optim_files(checkpoint_dir):
88
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
89
+
90
+
91
+ def get_model_state_files(checkpoint_dir):
92
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
93
+
94
+
95
+ def parse_model_states(files):
96
+ zero_model_states = []
97
+ for file in files:
98
+ state_dict = torch.load(file, map_location=device)
99
+
100
+ if BUFFER_NAMES not in state_dict:
101
+ raise ValueError(f"{file} is not a model state checkpoint")
102
+ buffer_names = state_dict[BUFFER_NAMES]
103
+ if debug:
104
+ print("Found buffers:", buffer_names)
105
+
106
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
107
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
108
+ param_shapes = state_dict[PARAM_SHAPES]
109
+
110
+ # collect parameters that are included in param_shapes
111
+ param_names = []
112
+ for s in param_shapes:
113
+ for name in s.keys():
114
+ param_names.append(name)
115
+
116
+ # update with frozen parameters
117
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
118
+ if frozen_param_shapes is not None:
119
+ if debug:
120
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
121
+ param_names += list(frozen_param_shapes.keys())
122
+
123
+ # handle shared params
124
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
125
+
126
+ ds_version = state_dict.get(DS_VERSION, None)
127
+
128
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
129
+
130
+ z_model_state = zero_model_state(buffers=buffers,
131
+ param_shapes=param_shapes,
132
+ shared_params=shared_params,
133
+ ds_version=ds_version,
134
+ frozen_param_shapes=frozen_param_shapes,
135
+ frozen_param_fragments=frozen_param_fragments)
136
+ zero_model_states.append(z_model_state)
137
+
138
+ return zero_model_states
139
+
140
+
141
+ def parse_optim_states(files, ds_checkpoint_dir):
142
+
143
+ total_files = len(files)
144
+ state_dicts = []
145
+ for f in files:
146
+ state_dict = torch.load(f, map_location=device)
147
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
148
+ # and also handle the case where it was already removed by another helper script
149
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
150
+ state_dicts.append(state_dict)
151
+
152
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
153
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
154
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
155
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
156
+
157
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
158
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
159
+ # use the max of the partition_count to get the dp world_size.
160
+
161
+ if type(world_size) is list:
162
+ world_size = max(world_size)
163
+
164
+ if world_size != total_files:
165
+ raise ValueError(
166
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
167
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
168
+ )
169
+
170
+ # the groups are named differently in each stage
171
+ if zero_stage <= 2:
172
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
173
+ elif zero_stage == 3:
174
+ fp32_groups_key = FP32_FLAT_GROUPS
175
+ else:
176
+ raise ValueError(f"unknown zero stage {zero_stage}")
177
+
178
+ if zero_stage <= 2:
179
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
180
+ elif zero_stage == 3:
181
+ # if there is more than one param group, there will be multiple flattened tensors - one
182
+ # flattened tensor per group - for simplicity merge them into a single tensor
183
+ #
184
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
185
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
186
+
187
+ fp32_flat_groups = [
188
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
189
+ ]
190
+
191
+ return zero_stage, world_size, fp32_flat_groups
192
+
193
+
194
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
195
+ """
196
+ Returns fp32 state_dict reconstructed from ds checkpoint
197
+
198
+ Args:
199
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
200
+
201
+ """
202
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
203
+
204
+ optim_files = get_optim_files(ds_checkpoint_dir)
205
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
206
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
207
+
208
+ model_files = get_model_state_files(ds_checkpoint_dir)
209
+
210
+ zero_model_states = parse_model_states(model_files)
211
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
212
+
213
+ if zero_stage <= 2:
214
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
215
+ elif zero_stage == 3:
216
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
217
+
218
+
219
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
220
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
221
+ return
222
+
223
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
224
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
225
+
226
+ if debug:
227
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
228
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
229
+
230
+ wanted_params = len(frozen_param_shapes)
231
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
232
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
233
+ print(f'Frozen params: Have {avail_numel} numels to process.')
234
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
235
+
236
+ total_params = 0
237
+ total_numel = 0
238
+ for name, shape in frozen_param_shapes.items():
239
+ total_params += 1
240
+ unpartitioned_numel = shape.numel()
241
+ total_numel += unpartitioned_numel
242
+
243
+ state_dict[name] = frozen_param_fragments[name]
244
+
245
+ if debug:
246
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
247
+
248
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
249
+
250
+
251
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
252
+ param_shapes = zero_model_states[0].param_shapes
253
+
254
+ # Reconstruction protocol:
255
+ #
256
+ # XXX: document this
257
+
258
+ if debug:
259
+ for i in range(world_size):
260
+ for j in range(len(fp32_flat_groups[0])):
261
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
262
+
263
+ # XXX: memory usage doubles here (zero2)
264
+ num_param_groups = len(fp32_flat_groups[0])
265
+ merged_single_partition_of_fp32_groups = []
266
+ for i in range(num_param_groups):
267
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
268
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
269
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
270
+ avail_numel = sum(
271
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
272
+
273
+ if debug:
274
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
275
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
276
+ # not asserting if there is a mismatch due to possible padding
277
+ print(f"Have {avail_numel} numels to process.")
278
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
279
+
280
+ # params
281
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
282
+ # out-of-core computing solution
283
+ total_numel = 0
284
+ total_params = 0
285
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
286
+ offset = 0
287
+ avail_numel = full_single_fp32_vector.numel()
288
+ for name, shape in shapes.items():
289
+
290
+ unpartitioned_numel = shape.numel()
291
+ total_numel += unpartitioned_numel
292
+ total_params += 1
293
+
294
+ if debug:
295
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
296
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
297
+ offset += unpartitioned_numel
298
+
299
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
300
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
301
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
302
+ # live optimizer object, so we are checking that the numbers are within the right range
303
+ align_to = 2 * world_size
304
+
305
+ def zero2_align(x):
306
+ return align_to * math.ceil(x / align_to)
307
+
308
+ if debug:
309
+ print(f"original offset={offset}, avail_numel={avail_numel}")
310
+
311
+ offset = zero2_align(offset)
312
+ avail_numel = zero2_align(avail_numel)
313
+
314
+ if debug:
315
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
316
+
317
+ # Sanity check
318
+ if offset != avail_numel:
319
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
320
+
321
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
322
+
323
+
324
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
325
+ state_dict = OrderedDict()
326
+
327
+ # buffers
328
+ buffers = zero_model_states[0].buffers
329
+ state_dict.update(buffers)
330
+ if debug:
331
+ print(f"added {len(buffers)} buffers")
332
+
333
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
334
+
335
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
336
+
337
+ # recover shared parameters
338
+ for pair in zero_model_states[0].shared_params:
339
+ if pair[1] in state_dict:
340
+ state_dict[pair[0]] = state_dict[pair[1]]
341
+
342
+ return state_dict
343
+
344
+
345
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
346
+ remainder = unpartitioned_numel % world_size
347
+ padding_numel = (world_size - remainder) if remainder else 0
348
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
349
+ return partitioned_numel, padding_numel
350
+
351
+
352
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
353
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
354
+ return
355
+
356
+ if debug:
357
+ for i in range(world_size):
358
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
359
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
360
+
361
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
362
+ wanted_params = len(frozen_param_shapes)
363
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
364
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
365
+ print(f'Frozen params: Have {avail_numel} numels to process.')
366
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
367
+
368
+ total_params = 0
369
+ total_numel = 0
370
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
371
+ total_params += 1
372
+ unpartitioned_numel = shape.numel()
373
+ total_numel += unpartitioned_numel
374
+
375
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
376
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
377
+
378
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
379
+
380
+ if debug:
381
+ print(
382
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
383
+ )
384
+
385
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
386
+
387
+
388
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
389
+ param_shapes = zero_model_states[0].param_shapes
390
+ avail_numel = fp32_flat_groups[0].numel() * world_size
391
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
392
+ # param, re-consolidating each param, while dealing with padding if any
393
+
394
+ # merge list of dicts, preserving order
395
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
396
+
397
+ if debug:
398
+ for i in range(world_size):
399
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
400
+
401
+ wanted_params = len(param_shapes)
402
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
403
+ # not asserting if there is a mismatch due to possible padding
404
+ avail_numel = fp32_flat_groups[0].numel() * world_size
405
+ print(f"Trainable params: Have {avail_numel} numels to process.")
406
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
407
+
408
+ # params
409
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
410
+ # out-of-core computing solution
411
+ offset = 0
412
+ total_numel = 0
413
+ total_params = 0
414
+ for name, shape in param_shapes.items():
415
+
416
+ unpartitioned_numel = shape.numel()
417
+ total_numel += unpartitioned_numel
418
+ total_params += 1
419
+
420
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
421
+
422
+ if debug:
423
+ print(
424
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
425
+ )
426
+
427
+ # XXX: memory usage doubles here
428
+ state_dict[name] = torch.cat(
429
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
430
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
431
+ offset += partitioned_numel
432
+
433
+ offset *= world_size
434
+
435
+ # Sanity check
436
+ if offset != avail_numel:
437
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
438
+
439
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
440
+
441
+
442
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
443
+ state_dict = OrderedDict()
444
+
445
+ # buffers
446
+ buffers = zero_model_states[0].buffers
447
+ state_dict.update(buffers)
448
+ if debug:
449
+ print(f"added {len(buffers)} buffers")
450
+
451
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
452
+
453
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
454
+
455
+ # recover shared parameters
456
+ for pair in zero_model_states[0].shared_params:
457
+ if pair[1] in state_dict:
458
+ state_dict[pair[0]] = state_dict[pair[1]]
459
+
460
+ return state_dict
461
+
462
+
463
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
464
+ """
465
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
466
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
467
+ via a model hub.
468
+
469
+ Args:
470
+ - ``checkpoint_dir``: path to the desired checkpoint folder
471
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
472
+
473
+ Returns:
474
+ - pytorch ``state_dict``
475
+
476
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
477
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
478
+ the checkpoint.
479
+
480
+ A typical usage might be ::
481
+
482
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
483
+ # do the training and checkpoint saving
484
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
485
+ model = model.cpu() # move to cpu
486
+ model.load_state_dict(state_dict)
487
+ # submit to model hub or save the model to share with others
488
+
489
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
490
+ application. i.e. you will need to re-initialize the deepspeed engine, since
491
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
492
+
493
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
494
+
495
+ """
496
+ if tag is None:
497
+ latest_path = os.path.join(checkpoint_dir, 'latest')
498
+ if os.path.isfile(latest_path):
499
+ with open(latest_path, 'r') as fd:
500
+ tag = fd.read().strip()
501
+ else:
502
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
503
+
504
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
505
+
506
+ if not os.path.isdir(ds_checkpoint_dir):
507
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
508
+
509
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
510
+
511
+
512
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
513
+ """
514
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
515
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
516
+
517
+ Args:
518
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
519
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
520
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
521
+ """
522
+
523
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
524
+ print(f"Saving fp32 state dict to {output_file}")
525
+ torch.save(state_dict, output_file)
526
+
527
+
528
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
529
+ """
530
+ 1. Put the provided model to cpu
531
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
532
+ 3. Load it into the provided model
533
+
534
+ Args:
535
+ - ``model``: the model object to update
536
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
537
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
538
+
539
+ Returns:
540
+ - ``model``: modified model
541
+
542
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
543
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
544
+ conveniently placed for you in the checkpoint folder.
545
+
546
+ A typical usage might be ::
547
+
548
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
549
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
550
+ # submit to model hub or save the model to share with others
551
+
552
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
553
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
554
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
555
+
556
+ """
557
+ logger.info(f"Extracting fp32 weights")
558
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
559
+
560
+ logger.info(f"Overwriting model with fp32 weights")
561
+ model = model.cpu()
562
+ model.load_state_dict(state_dict, strict=False)
563
+
564
+ return model
565
+
566
+
567
+ if __name__ == "__main__":
568
+
569
+ parser = argparse.ArgumentParser()
570
+ parser.add_argument("checkpoint_dir",
571
+ type=str,
572
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
573
+ parser.add_argument(
574
+ "output_file",
575
+ type=str,
576
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
577
+ parser.add_argument("-t",
578
+ "--tag",
579
+ type=str,
580
+ default=None,
581
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
582
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
583
+ args = parser.parse_args()
584
+
585
+ debug = args.debug
586
+
587
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
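The script above rebuilds a single fp32 state_dict from ZeRO-partitioned shards, either from the command line (as in its own header comment, e.g. python zero_to_fp32.py path/checkpoint-3000 path/checkpoint-3000/pytorch_model.bin) or programmatically. A sketch of the programmatic route, assuming a DeepSpeed checkpoint folder path/checkpoint-3000 (a placeholder) that contains the usual "latest" tag file:

import sys

sys.path.insert(0, "weights/model-base")   # one way to make zero_to_fp32.py importable
from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict(
    "path/checkpoint-3000",                     # placeholder: folder holding the tag sub-folder
    "path/checkpoint-3000/pytorch_model.bin",   # consolidated fp32 output file
    tag=None,                                   # None -> read the tag from the "latest" file
)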
weights/model-idf/config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "_name_or_path": "/mnt/bn/automl-aigc/yatai/Qwen-VL/result/qwen_alpha_full_llava_mini_1/checkpoint-3000",
+   "architectures": [
+     "QWenLMHeadModel"
+   ],
+   "attn_dropout_prob": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_qwen.QWenConfig",
+     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
+   },
+   "bf16": true,
+   "emb_dropout_prob": 0.0,
+   "fp16": false,
+   "fp32": false,
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 22016,
+   "kv_channels": 128,
+   "layer_norm_epsilon": 1e-06,
+   "max_position_embeddings": 8192,
+   "model_type": "qwen",
+   "no_bias": true,
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "onnx_safe": null,
+   "rotary_emb_base": 10000,
+   "rotary_pct": 1.0,
+   "scale_attn_weights": true,
+   "seq_length": 2048,
+   "tie_word_embeddings": false,
+   "tokenizer_type": "QWenTokenizer",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.32.0",
+   "use_cache": false,
+   "use_dynamic_ntk": true,
+   "use_flash_attn": false,
+   "use_logn_attn": true,
+   "visual": {
+     "heads": 16,
+     "image_size": 448,
+     "image_start_id": 151857,
+     "layers": 48,
+     "mlp_ratio": 4.9231,
+     "output_dim": 4096,
+     "patch_size": 14,
+     "width": 1664
+   },
+   "vocab_size": 151936
+ }
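The "visual" block fixes the vision-tower geometry that visual.py consumes: 448x448 inputs with 14x14 patches give a 32x32 grid of 1024 patch tokens, which the resampler compresses to 256 query tokens of width output_dim before they are spliced into the text stream at image_start_id. A small arithmetic check, reading the file at the path used in this commit:

import json

# Sanity-check the vision-tower geometry declared in the "visual" block above.
with open("weights/model-idf/config.json") as f:
    visual = json.load(f)["visual"]

grid = visual["image_size"] // visual["patch_size"]   # 448 // 14 = 32
patch_tokens = grid * grid                            # 32 * 32 = 1024 patch tokens per image
print(grid, patch_tokens, visual["output_dim"])       # 32 1024 4096
# visual.py's Resampler then pools the 1024 patch tokens down to its learned queries
# (n_queries defaults to 256) before they enter the language model.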
weights/model-idf/configuration_qwen.py ADDED
@@ -0,0 +1,65 @@
+ # Copyright (c) Alibaba Cloud.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from transformers import PretrainedConfig
+
+
+ class QWenConfig(PretrainedConfig):
+     model_type = "qwen"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=151936,
+         hidden_size=4096,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         emb_dropout_prob=0.0,
+         attn_dropout_prob=0.0,
+         layer_norm_epsilon=1e-6,
+         initializer_range=0.02,
+         max_position_embeddings=8192,
+         scale_attn_weights=True,
+         use_cache=True,
+         bf16=False,
+         fp16=False,
+         fp32=False,
+         kv_channels=128,
+         rotary_pct=1.0,
+         rotary_emb_base=10000,
+         use_dynamic_ntk=True,
+         use_logn_attn=True,
+         use_flash_attn="auto",
+         intermediate_size=22016,
+         no_bias=True,
+         tie_word_embeddings=False,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.emb_dropout_prob = emb_dropout_prob
+         self.attn_dropout_prob = attn_dropout_prob
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_range = initializer_range
+         self.scale_attn_weights = scale_attn_weights
+         self.use_cache = use_cache
+         self.max_position_embeddings = max_position_embeddings
+         self.bf16 = bf16
+         self.fp16 = fp16
+         self.fp32 = fp32
+         self.kv_channels = kv_channels
+         self.rotary_pct = rotary_pct
+         self.rotary_emb_base = rotary_emb_base
+         self.use_dynamic_ntk = use_dynamic_ntk
+         self.use_logn_attn = use_logn_attn
+         self.use_flash_attn = use_flash_attn
+         self.no_bias = no_bias
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs
+         )
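QWenConfig only records hyperparameters; every keyword default matches the checkpoint shipped here, so config.json needs to override just the deployment-specific flags (bf16, use_cache, use_flash_attn, and so on). A standalone construction sketch, again using a sys.path insert as one convenient way to import the module from the checkpoint folder:

import sys

sys.path.insert(0, "weights/model-idf")   # make configuration_qwen.py importable
from configuration_qwen import QWenConfig

cfg = QWenConfig()                        # defaults: 151936 vocab, 4096 hidden, 32 layers/heads
small = QWenConfig(hidden_size=1024, num_hidden_layers=4, num_attention_heads=8)

print(cfg.hidden_size, cfg.kv_channels, cfg.use_dynamic_ntk)   # 4096 128 True
print(small.hidden_size, small.num_hidden_layers)              # 1024 4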
weights/model-idf/generation_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "chat_format": "chatml",
+   "do_sample": true,
+   "eos_token_id": 151643,
+   "max_new_tokens": 512,
+   "max_window_size": 6144,
+   "pad_token_id": 151643,
+   "top_k": 0,
+   "top_p": 0.3,
+   "transformers_version": "4.32.0"
+ }
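These generation defaults mean model.chat() samples with nucleus top_p=0.3 (top_k=0 disables top-k filtering), stops on token 151643, and emits at most 512 new tokens inside a 6144-token chatml window. They can be loaded and overridden per call without touching the file; a sketch (the stricter greedy settings are purely illustrative):

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("weights/model-idf")
print(gen_cfg.chat_format, gen_cfg.top_p, gen_cfg.max_new_tokens)   # chatml 0.3 512

# Keyword overrides take precedence over the file's values, e.g. deterministic short answers:
greedy = GenerationConfig.from_pretrained(
    "weights/model-idf", do_sample=False, max_new_tokens=128
)
print(greedy.do_sample, greedy.max_new_tokens)   # False 128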
weights/model-idf/modeling_qwen.py ADDED
@@ -0,0 +1,1182 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import importlib
7
+ import math
8
+ from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ import torch.utils.checkpoint
13
+ from torch.cuda.amp import autocast
14
+
15
+ from torch.nn import CrossEntropyLoss
16
+ from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList
17
+ from transformers.generation.logits_process import LogitsProcessorList
18
+
19
+ if TYPE_CHECKING:
20
+ from transformers.generation.streamers import BaseStreamer
21
+ from transformers.generation.utils import GenerateOutput
22
+ from transformers.modeling_outputs import (
23
+ BaseModelOutputWithPast,
24
+ CausalLMOutputWithPast,
25
+ )
26
+ from transformers.modeling_utils import PreTrainedModel
27
+ from transformers.utils import logging
28
+
29
+ try:
30
+ from einops import rearrange
31
+ except ImportError:
32
+ rearrange = None
33
+ from torch import nn
34
+
35
+ SUPPORT_CUDA = torch.cuda.is_available()
36
+ SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
37
+ SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
38
+
39
+ from .configuration_qwen import QWenConfig
40
+ from .qwen_generation_utils import (
41
+ HistoryType,
42
+ make_context,
43
+ decode_tokens,
44
+ get_stop_words_ids,
45
+ StopWordsLogitsProcessor,
46
+ )
47
+ from .visual import VisionTransformer
48
+
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+ _CHECKPOINT_FOR_DOC = "qwen"
53
+ _CONFIG_FOR_DOC = "QWenConfig"
54
+
55
+ QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"]
56
+
57
+ _ERROR_BAD_CHAT_FORMAT = """\
58
+ We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml".
59
+ If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat().
60
+ 我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。
61
+ 如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。
62
+ """
63
+
64
+ _SENTINEL = object()
65
+ _ERROR_STREAM_IN_CHAT = """\
66
+ Pass argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True).
67
+ 向model.chat()传入参数stream的用法可能存在Bug,该用法已被废弃,将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。
68
+ """
69
+
70
+ apply_rotary_emb_func = None
71
+ rms_norm = None
72
+
73
+
74
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
75
+ def _make_causal_mask(
76
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
77
+ ):
78
+ """
79
+ Make causal mask used for bi-directional self-attention.
80
+ """
81
+ bsz, tgt_len = input_ids_shape
82
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
83
+ mask_cond = torch.arange(mask.size(-1), device=device)
84
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
85
+ mask = mask.to(dtype)
86
+
87
+ if past_key_values_length > 0:
88
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
89
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
90
+
91
+
92
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
93
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
94
+ """
95
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
96
+ """
97
+ bsz, src_len = mask.size()
98
+ tgt_len = tgt_len if tgt_len is not None else src_len
99
+
100
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
101
+
102
+ inverted_mask = 1.0 - expanded_mask
103
+
104
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
105
+
106
+
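A minimal sketch of how the two mask helpers above behave, assuming torch is imported and _make_causal_mask / _expand_mask are in scope; the point is the shapes and the additive-mask convention, not the exact values:

    import torch

    # causal part: (bsz, 1, tgt_len, tgt_len); 0 where attending is allowed,
    # a large negative number where a position would see the future
    causal = _make_causal_mask(torch.Size([1, 4]), torch.float32, torch.device("cpu"))
    assert causal.shape == (1, 1, 4, 4)

    # padding part: a [bsz, seq_len] 0/1 mask expanded to the same 4-D shape,
    # with large negative values at the masked (0) positions
    padding = torch.tensor([[1, 1, 1, 0]], dtype=torch.float32)
    expanded = _expand_mask(padding, torch.float32, tgt_len=4)
    assert expanded.shape == (1, 1, 4, 4)

    # _prepare_decoder_attention_mask below simply adds the two, so attention
    # receives a single additive bias
    combined = causal + expanded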
107
+ class QWenAttention(nn.Module):
108
+ def __init__(self, config):
109
+ super().__init__()
110
+
111
+ self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
112
+ self.seq_length = config.seq_length
113
+
114
+ self.hidden_size = config.hidden_size
115
+ self.split_size = config.hidden_size
116
+ self.num_heads = config.num_attention_heads
117
+ self.head_dim = self.hidden_size // self.num_heads
118
+
119
+ self.scale_attn_weights = True
120
+
121
+ self.projection_size = config.kv_channels * config.num_attention_heads
122
+
123
+ assert self.projection_size % config.num_attention_heads == 0
124
+ self.hidden_size_per_attention_head = (
125
+ self.projection_size // config.num_attention_heads
126
+ )
127
+
128
+ self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size)
129
+
130
+ self.c_proj = nn.Linear(
131
+ config.hidden_size, self.projection_size, bias=not config.no_bias
132
+ )
133
+
134
+ self.is_fp32 = not (config.bf16 or config.fp16)
135
+ self.bf16 = config.bf16
136
+
137
+ self.use_dynamic_ntk = config.use_dynamic_ntk
138
+ self.use_logn_attn = config.use_logn_attn
139
+
140
+ logn_list = [
141
+ math.log(i, self.seq_length) if i > self.seq_length else 1
142
+ for i in range(1, 32768)
143
+ ]
144
+ self.logn_tensor = torch.tensor(logn_list)[None, :, None, None]
145
+
146
+ self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
147
+
148
+ def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
149
+ attn_weights = torch.matmul(query, key.transpose(-1, -2))
150
+
151
+ if self.scale_attn_weights:
152
+ attn_weights = attn_weights / torch.full(
153
+ [],
154
+ value.size(-1) ** 0.5,
155
+ dtype=attn_weights.dtype,
156
+ device=attn_weights.device,
157
+ )
158
+
159
+ query_length, key_length = query.size(-2), key.size(-2)
160
+ # causal_mask = self.bias[
161
+ # :, :, key_length - query_length : key_length, :key_length
162
+ # ]
163
+ # mask_value = torch.finfo(attn_weights.dtype).min
164
+ # mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(
165
+ # attn_weights.device
166
+ # )
167
+ # attn_weights = torch.where(
168
+ # causal_mask, attn_weights.to(attn_weights.dtype), mask_value
169
+ # )
170
+ attn_weights = attn_weights + attention_mask
171
+
172
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
173
+
174
+ attn_weights = attn_weights.type(value.dtype)
175
+ attn_weights = self.attn_dropout(attn_weights)
176
+
177
+ if head_mask is not None:
178
+ attn_weights = attn_weights * head_mask
179
+
180
+ attn_output = torch.matmul(attn_weights, value)
181
+ attn_output = attn_output.transpose(1, 2)
182
+
183
+ return attn_output, attn_weights
184
+
185
+ def _upcast_and_reordered_attn(
186
+ self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
187
+ ):
188
+ bsz, num_heads, q_seq_len, dk = query.size()
189
+ _, _, k_seq_len, _ = key.size()
190
+
191
+ attn_weights = torch.empty(
192
+ bsz * num_heads,
193
+ q_seq_len,
194
+ k_seq_len,
195
+ dtype=torch.float32,
196
+ device=query.device,
197
+ )
198
+
199
+ scale_factor = 1.0
200
+ if self.scale_attn_weights:
201
+ scale_factor /= float(value.size(-1)) ** 0.5
202
+
203
+ with autocast(enabled=False):
204
+ q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(
205
+ -1, dk, k_seq_len
206
+ )
207
+ attn_weights = torch.baddbmm(
208
+ attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor
209
+ )
210
+ attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
211
+
212
+ query_length, key_length = query.size(-2), key.size(-2)
213
+ causal_mask = registered_causal_mask[
214
+ :, :, key_length - query_length : key_length, :key_length
215
+ ]
216
+ mask_value = torch.finfo(attn_weights.dtype).min
217
+ mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
218
+ attn_weights.device
219
+ )
220
+ attn_weights = torch.where(causal_mask, attn_weights, mask_value)
221
+
222
+ if attention_mask is not None:
223
+ attn_weights = attn_weights + attention_mask
224
+
225
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
226
+
227
+ if attn_weights.dtype != torch.float32:
228
+ raise RuntimeError(
229
+ "Error with upcasting, attn_weights does not have dtype torch.float32"
230
+ )
231
+ attn_weights = attn_weights.type(value.dtype)
232
+ attn_weights = self.attn_dropout(attn_weights)
233
+
234
+ if head_mask is not None:
235
+ attn_weights = attn_weights * head_mask
236
+
237
+ attn_output = torch.matmul(attn_weights, value)
238
+
239
+ return attn_output, attn_weights
240
+
241
+ def _split_heads(self, tensor, num_heads, attn_head_size):
242
+ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
243
+ tensor = tensor.view(new_shape)
244
+ return tensor
245
+
246
+ def _merge_heads(self, tensor, num_heads, attn_head_size):
247
+ tensor = tensor.contiguous()
248
+ new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
249
+ return tensor.view(new_shape)
250
+
251
+ def forward(
252
+ self,
253
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
254
+ rotary_pos_emb: Optional[List[torch.Tensor]] = None,
255
+ registered_causal_mask: Optional[torch.Tensor] = None,
256
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
257
+ attention_mask: Optional[torch.FloatTensor] = None,
258
+ head_mask: Optional[torch.FloatTensor] = None,
259
+ encoder_hidden_states: Optional[torch.Tensor] = None,
260
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
261
+ output_attentions: Optional[bool] = False,
262
+ use_cache: Optional[bool] = False,
263
+ ):
264
+
265
+ mixed_x_layer = self.c_attn(hidden_states)
266
+
267
+ query, key, value = mixed_x_layer.split(self.split_size, dim=2)
268
+
269
+ query = self._split_heads(query, self.num_heads, self.head_dim)
270
+ key = self._split_heads(key, self.num_heads, self.head_dim)
271
+ value = self._split_heads(value, self.num_heads, self.head_dim)
272
+
273
+ if rotary_pos_emb is not None:
274
+ cur_len = query.shape[1]
275
+ rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb]
276
+ rotary_pos_emb = (rotary_pos_emb,) * 2
277
+ q_pos_emb, k_pos_emb = rotary_pos_emb
278
+ # Slice the pos emb for current inference
279
+ query = apply_rotary_pos_emb(query, q_pos_emb)
280
+ key = apply_rotary_pos_emb(key, k_pos_emb)
281
+
282
+ if layer_past is not None:
283
+ past_key, past_value = layer_past[0], layer_past[1]
284
+ key = torch.cat((past_key, key), dim=1)
285
+ value = torch.cat((past_value, value), dim=1)
286
+
287
+ if use_cache:
288
+ present = (key, value)
289
+ else:
290
+ present = None
291
+
292
+ if self.use_logn_attn and not self.training:
293
+ if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype:
294
+ self.logn_tensor = self.logn_tensor.to(query.device).type_as(query)
295
+ seq_start = key.size(1) - query.size(1)
296
+ seq_end = key.size(1)
297
+ logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
298
+ query = query * logn_tensor.expand_as(query)
299
+
300
+ query = query.permute(0, 2, 1, 3)
301
+ key = key.permute(0, 2, 1, 3)
302
+ value = value.permute(0, 2, 1, 3)
303
+ attn_output, attn_weight = self._attn(
304
+ query, key, value, registered_causal_mask, attention_mask, head_mask
305
+ )
306
+ context_layer = self._merge_heads(
307
+ attn_output, self.num_heads, self.head_dim
308
+ )
309
+
310
+ attn_output = self.c_proj(context_layer)
311
+
312
+ outputs = (attn_output, present)
313
+ if output_attentions:
314
+ outputs += (attn_weight,)
315
+
316
+ return outputs
317
+
318
+
319
+ class QWenMLP(nn.Module):
320
+ def __init__(self, config):
321
+ super().__init__()
322
+ self.w1 = nn.Linear(
323
+ config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias
324
+ )
325
+ self.w2 = nn.Linear(
326
+ config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias
327
+ )
328
+ ff_dim_in = config.intermediate_size // 2
329
+ self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias)
330
+
331
+ def forward(self, hidden_states):
332
+ a1 = self.w1(hidden_states)
333
+ a2 = self.w2(hidden_states)
334
+ intermediate_parallel = a1 * F.silu(a2)
335
+ output = self.c_proj(intermediate_parallel)
336
+ return output
337
+
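QWenMLP above is a SwiGLU-style feed-forward block: w1(x) gated by silu(w2(x)), then projected back by c_proj. A minimal shape-check sketch, assuming torch and the class above are in scope; the tiny config stand-in is hypothetical (real values come from config.json):

    import torch

    class _TinyCfg:                      # hypothetical stand-in for QWenConfig
        hidden_size = 64
        intermediate_size = 256          # feed-forward width is intermediate_size // 2 = 128
        no_bias = True

    mlp = QWenMLP(_TinyCfg())
    x = torch.randn(2, 5, 64)            # (batch, seq, hidden)
    y = mlp(x)                           # c_proj(w1(x) * silu(w2(x)))
    assert y.shape == (2, 5, 64)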
338
+ class QWenBlock(nn.Module):
339
+ def __init__(self, config):
340
+ super().__init__()
341
+ hidden_size = config.hidden_size
342
+ self.bf16 = config.bf16
343
+
344
+ self.ln_1 = RMSNorm(
345
+ hidden_size,
346
+ eps=config.layer_norm_epsilon,
347
+ )
348
+ self.attn = QWenAttention(config)
349
+ self.ln_2 = RMSNorm(
350
+ hidden_size,
351
+ eps=config.layer_norm_epsilon,
352
+ )
353
+
354
+ self.mlp = QWenMLP(config)
355
+
356
+ def forward(
357
+ self,
358
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
359
+ rotary_pos_emb: Optional[List[torch.Tensor]] = None,
360
+ registered_causal_mask: Optional[torch.Tensor] = None,
361
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
362
+ attention_mask: Optional[torch.FloatTensor] = None,
363
+ head_mask: Optional[torch.FloatTensor] = None,
364
+ encoder_hidden_states: Optional[torch.Tensor] = None,
365
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
366
+ use_cache: Optional[bool] = False,
367
+ output_attentions: Optional[bool] = False,
368
+ ):
369
+ layernorm_output = self.ln_1(hidden_states)
370
+
371
+ attn_outputs = self.attn(
372
+ layernorm_output,
373
+ rotary_pos_emb,
374
+ registered_causal_mask=registered_causal_mask,
375
+ layer_past=layer_past,
376
+ attention_mask=attention_mask,
377
+ head_mask=head_mask,
378
+ use_cache=use_cache,
379
+ output_attentions=output_attentions,
380
+ )
381
+ attn_output = attn_outputs[0]
382
+
383
+ outputs = attn_outputs[1:]
384
+
385
+ residual = hidden_states
386
+ layernorm_input = attn_output + residual
387
+
388
+ layernorm_output = self.ln_2(layernorm_input)
389
+
390
+ residual = layernorm_input
391
+ mlp_output = self.mlp(layernorm_output)
392
+ hidden_states = residual + mlp_output
393
+
394
+ if use_cache:
395
+ outputs = (hidden_states,) + outputs
396
+ else:
397
+ outputs = (hidden_states,) + outputs[1:]
398
+
399
+ return outputs
400
+
401
+
402
+ class QWenPreTrainedModel(PreTrainedModel):
403
+ config_class = QWenConfig
404
+ base_model_prefix = "transformer"
405
+ is_parallelizable = False
406
+ supports_gradient_checkpointing = True
407
+ _no_split_modules = ["QWenBlock"]
408
+
409
+ def __init__(self, *inputs, **kwargs):
410
+ super().__init__(*inputs, **kwargs)
411
+
412
+ def _init_weights(self, module):
413
+ """Initialize the weights."""
414
+ if isinstance(module, nn.Linear):
415
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
416
+ if module.bias is not None:
417
+ module.bias.data.zero_()
418
+ elif isinstance(module, nn.Embedding):
419
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
420
+ if module.padding_idx is not None:
421
+ module.weight.data[module.padding_idx].zero_()
422
+ elif isinstance(module, RMSNorm):
423
+ module.weight.data.fill_(1.0)
424
+
425
+ for name, p in module.named_parameters():
426
+ if name == "c_proj.weight":
427
+ p.data.normal_(
428
+ mean=0.0,
429
+ std=(
430
+ self.config.initializer_range
431
+ / math.sqrt(2 * self.config.num_hidden_layers)
432
+ ),
433
+ )
434
+
435
+ def _set_gradient_checkpointing(self, module, value=False):
436
+ if isinstance(module, QWenModel):
437
+ module.gradient_checkpointing = value
438
+
439
+
440
+ class QWenModel(QWenPreTrainedModel):
441
+ _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
442
+
443
+ def __init__(self, config):
444
+ super().__init__(config)
445
+ self.vocab_size = config.vocab_size
446
+ self.num_hidden_layers = config.num_hidden_layers
447
+ self.embed_dim = config.hidden_size
448
+
449
+ self.gradient_checkpointing = False
450
+ self.use_dynamic_ntk = config.use_dynamic_ntk
451
+ self.seq_length = config.seq_length
452
+
453
+ self.wte = nn.Embedding(self.vocab_size, self.embed_dim)
454
+
455
+ self.drop = nn.Dropout(config.emb_dropout_prob)
456
+
457
+ if config.rotary_pct == 1.0:
458
+ self.rotary_ndims = None
459
+ else:
460
+ assert config.rotary_pct < 1
461
+ self.rotary_ndims = int(
462
+ config.kv_channels * config.rotary_pct
463
+ )
464
+ dim = (
465
+ self.rotary_ndims
466
+ if self.rotary_ndims is not None
467
+ else config.kv_channels
468
+ )
469
+ self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base)
470
+
471
+ self.use_flash_attn = config.use_flash_attn
472
+ self.is_fp32 = not (config.bf16 or config.fp16)
473
+ self.registered_causal_mask = None
474
+ # if (
475
+ # self.use_flash_attn
476
+ # and flash_attn_unpadded_func is not None
477
+ # and not self.is_fp32
478
+ # ):
479
+ # self.registered_causal_mask = None
480
+ # else:
481
+ # max_positions = config.max_position_embeddings
482
+ # self.register_buffer(
483
+ # "registered_causal_mask",
484
+ # torch.tril(
485
+ # torch.ones((max_positions, max_positions), dtype=torch.bool)
486
+ # ).view(1, 1, max_positions, max_positions),
487
+ # persistent=False,
488
+ # )
489
+
490
+ self.h = nn.ModuleList(
491
+ [
492
+ QWenBlock(
493
+ config
494
+ )
495
+ for i in range(config.num_hidden_layers)
496
+ ]
497
+ )
498
+ self.ln_f = RMSNorm(
499
+ self.embed_dim,
500
+ eps=config.layer_norm_epsilon,
501
+ )
502
+
503
+ self.visual = VisionTransformer(**config.visual) # vit + resampler
504
+
505
+ self.post_init()
506
+
507
+ def get_input_embeddings(self):
508
+ return self.wte
509
+
510
+ def set_input_embeddings(self, new_embeddings):
511
+ self.wte = new_embeddings
512
+
513
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
514
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
515
+ # create causal mask
516
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
517
+ combined_attention_mask = None
518
+ if input_shape[-1] > 1:
519
+ combined_attention_mask = _make_causal_mask(
520
+ input_shape,
521
+ inputs_embeds.dtype,
522
+ device=inputs_embeds.device,
523
+ past_key_values_length=past_key_values_length,
524
+ )
525
+
526
+ if attention_mask is not None:
527
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
528
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
529
+ inputs_embeds.device
530
+ )
531
+ combined_attention_mask = (
532
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
533
+ )
534
+
535
+ return combined_attention_mask
536
+
537
+
538
+ def forward(
539
+ self,
540
+ input_ids: Optional[torch.LongTensor] = None,
541
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
542
+ attention_mask: Optional[torch.FloatTensor] = None,
543
+ token_type_ids: Optional[torch.LongTensor] = None,
544
+ position_ids: Optional[torch.LongTensor] = None,
545
+ head_mask: Optional[torch.FloatTensor] = None,
546
+ inputs_embeds: Optional[torch.FloatTensor] = None,
547
+ encoder_hidden_states: Optional[torch.Tensor] = None,
548
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
549
+ use_cache: Optional[bool] = None,
550
+ output_attentions: Optional[bool] = None,
551
+ output_hidden_states: Optional[bool] = None,
552
+ return_dict: Optional[bool] = None,
553
+ ):
554
+ if past_key_values is None and torch.any(input_ids == self.config.visual['image_start_id']):
555
+ bos_pos = torch.where(input_ids == self.config.visual['image_start_id'])
556
+ eos_pos = torch.where(input_ids == self.config.visual['image_start_id'] + 1)
557
+ assert (bos_pos[0] == eos_pos[0]).all()
558
+ img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
559
+ images = []
560
+ # for i, a, b in img_pos:
561
+ # image = input_ids[i][a + 1 : b - 1].tolist()
562
+ # image = image[ : image.index(self.config.visual['image_start_id'] + 2)]
563
+ # images.append(bytes(image).decode('utf-8'))
564
+
565
+ old_i = -1
566
+ images_flag = []
567
+ id_test_flag = 0
568
+ for i, a, b in img_pos:
569
+ image = input_ids[i][a + 1 : b - 1].tolist()
570
+ image = image[ : image.index(self.config.visual['image_start_id'] + 2)]
571
+ images.append(bytes(image).decode('utf-8'))
572
+ if i != old_i:
573
+ if input_ids[i][a-2] == 374:
574
+ id_test_flag = 1
575
+ else:
576
+ id_test_flag = 0
577
+ old_i = i
578
+ if input_ids[i][a-2] == 374:
579
+ images_flag.append(0)
580
+ elif id_test_flag == 1:
581
+ images_flag.append(1)
582
+ else:
583
+ images_flag.append(2)
584
+
585
+ images = self.visual.encode(images, images_flag)
586
+ assert images.shape[0] == len(images)
587
+ fake_images = None
588
+ elif self.training:
589
+ fake_images=torch.zeros(1,3,224,224).to(
590
+ dtype=self.visual.conv1.weight.dtype, device=self.visual.conv1.weight.device)
591
+ images = self.visual(fake_images)
592
+ else:
593
+ fake_images = None
594
+ images = None
595
+
596
+ output_attentions = (
597
+ output_attentions
598
+ if output_attentions is not None
599
+ else self.config.output_attentions
600
+ )
601
+ output_hidden_states = (
602
+ output_hidden_states
603
+ if output_hidden_states is not None
604
+ else self.config.output_hidden_states
605
+ )
606
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
607
+ return_dict = (
608
+ return_dict if return_dict is not None else self.config.use_return_dict
609
+ )
610
+
611
+ if input_ids is not None and inputs_embeds is not None:
612
+ raise ValueError(
613
+ "You cannot specify both input_ids and inputs_embeds at the same time"
614
+ )
615
+ elif input_ids is not None:
616
+ input_shape = input_ids.size()
617
+ input_ids = input_ids.view(-1, input_shape[-1])
618
+ batch_size = input_ids.shape[0]
619
+ elif inputs_embeds is not None:
620
+ input_shape = inputs_embeds.size()[:-1]
621
+ batch_size = inputs_embeds.shape[0]
622
+ else:
623
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
624
+
625
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
626
+
627
+ if token_type_ids is not None:
628
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
629
+ if position_ids is not None:
630
+ position_ids = position_ids.view(-1, input_shape[-1])
631
+
632
+ if past_key_values is None:
633
+ past_length = 0
634
+ past_key_values = tuple([None] * len(self.h))
635
+ else:
636
+ past_length = past_key_values[0][0].size(-2)
637
+
638
+ if position_ids is None:
639
+ position_ids = torch.arange(
640
+ past_length,
641
+ input_shape[-1] + past_length,
642
+ dtype=torch.long,
643
+ device=device,
644
+ )
645
+ position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
646
+
647
+ encoder_attention_mask = None
648
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
649
+
650
+ if inputs_embeds is None:
651
+ inputs_embeds = self.wte(input_ids)
652
+
653
+ if batch_size <= 0:
654
+ raise ValueError("batch_size has to be defined and > 0")
655
+ attention_mask = self._prepare_decoder_attention_mask(
656
+ attention_mask, input_shape, inputs_embeds, past_length
657
+ )
658
+
659
+ hidden_states = inputs_embeds
660
+
661
+ kv_seq_len = hidden_states.size()[1]
662
+ if past_key_values[0] is not None:
663
+ # past key values[0][0] shape: bs * seq_len * head_num * dim
664
+ kv_seq_len += past_key_values[0][0].shape[1]
665
+ if (
666
+ self.use_dynamic_ntk
667
+ and kv_seq_len == hidden_states.size()[1]
668
+ and not self.training
669
+ ):
670
+ context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
671
+ ntk_alpha = 2 ** math.ceil(context_value) - 1
672
+ ntk_alpha = max(ntk_alpha, 1)
673
+ else:
674
+ ntk_alpha = self.rotary_emb._ntk_alpha_cached
675
+
676
+ rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)
677
+ for idx in range(len(rotary_pos_emb)):
678
+ rotary_pos_emb[idx] = rotary_pos_emb[idx].to(hidden_states.device)
679
+
680
+ hidden_states = self.drop(hidden_states).clone()
681
+ if fake_images is not None:
682
+ hidden_states = hidden_states + images.mean()*0
683
+ elif images is not None:
684
+ for idx, (i, a, b) in enumerate(img_pos):
685
+ hidden_states[i][a + 1 : b] = images[idx]
686
+ output_shape = input_shape + (hidden_states.size(-1),)
687
+
688
+ if self.gradient_checkpointing and self.training:
689
+ if use_cache:
690
+ logger.warning_once(
691
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
692
+ )
693
+ use_cache = False
694
+
695
+ presents = () if use_cache else None
696
+ all_self_attentions = () if output_attentions else None
697
+ all_hidden_states = () if output_hidden_states else None
698
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
699
+
700
+ if output_hidden_states:
701
+ all_hidden_states = all_hidden_states + (hidden_states,)
702
+
703
+ if self.gradient_checkpointing and self.training:
704
+
705
+ def create_custom_forward(module):
706
+ def custom_forward(*inputs):
707
+ # None for past_key_value
708
+ return module(*inputs, use_cache, output_attentions)
709
+
710
+ return custom_forward
711
+
712
+ outputs = torch.utils.checkpoint.checkpoint(
713
+ create_custom_forward(block),
714
+ hidden_states,
715
+ rotary_pos_emb,
716
+ self.registered_causal_mask,
717
+ None,
718
+ attention_mask,
719
+ head_mask[i],
720
+ encoder_hidden_states,
721
+ encoder_attention_mask,
722
+ )
723
+ else:
724
+ outputs = block(
725
+ hidden_states,
726
+ layer_past=layer_past,
727
+ rotary_pos_emb=rotary_pos_emb,
728
+ registered_causal_mask=self.registered_causal_mask,
729
+ attention_mask=attention_mask,
730
+ head_mask=head_mask[i],
731
+ encoder_hidden_states=encoder_hidden_states,
732
+ encoder_attention_mask=encoder_attention_mask,
733
+ use_cache=use_cache,
734
+ output_attentions=output_attentions,
735
+ )
736
+
737
+ hidden_states = outputs[0]
738
+ if use_cache is True:
739
+ presents = presents + (outputs[1],)
740
+
741
+ if output_attentions:
742
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
743
+
744
+ hidden_states = self.ln_f(hidden_states)
745
+ hidden_states = hidden_states.view(output_shape)
746
+ # Add last hidden state
747
+ if output_hidden_states:
748
+ all_hidden_states = all_hidden_states + (hidden_states,)
749
+
750
+ if not return_dict:
751
+ return tuple(
752
+ v for v in [hidden_states, presents, all_hidden_states] if v is not None
753
+ )
754
+
755
+ return BaseModelOutputWithPast(
756
+ last_hidden_state=hidden_states,
757
+ past_key_values=presents,
758
+ hidden_states=all_hidden_states,
759
+ attentions=all_self_attentions,
760
+ )
761
+
762
+
763
+ class QWenLMHeadModel(QWenPreTrainedModel):
764
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"]
765
+ _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"]
766
+
767
+ def __init__(self, config):
768
+ super().__init__(config)
769
+ assert (
770
+ config.bf16 + config.fp16 + config.fp32 <= 1
771
+ ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true"
772
+
773
+ autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0
774
+
775
+ if autoset_precision:
776
+ if SUPPORT_BF16:
777
+ logger.warn(
778
+ "The model is automatically converting to bf16 for faster inference. "
779
+ "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
780
+ )
781
+ config.bf16 = True
782
+ elif SUPPORT_FP16:
783
+ logger.warn(
784
+ "The model is automatically converting to fp16 for faster inference. "
785
+ "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
786
+ )
787
+ config.fp16 = True
788
+ else:
789
+ config.fp32 = True
790
+
791
+ if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16:
792
+ logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".")
793
+ if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16:
794
+ logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster")
795
+ if config.fp32:
796
+ if SUPPORT_BF16:
797
+ logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
798
+ elif SUPPORT_FP16:
799
+ logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
800
+
801
+ self.transformer = QWenModel(config)
802
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
803
+
804
+ if config.bf16:
805
+ self.transformer.bfloat16()
806
+ self.lm_head.bfloat16()
807
+ if config.fp16:
808
+ self.transformer.half()
809
+ self.lm_head.half()
810
+ self.post_init()
811
+
812
+ def get_output_embeddings(self):
813
+ return self.lm_head
814
+
815
+ def set_output_embeddings(self, new_embeddings):
816
+ self.lm_head = new_embeddings
817
+
818
+ def prepare_inputs_for_generation(
819
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
820
+ ):
821
+ token_type_ids = kwargs.get("token_type_ids", None)
822
+ if past_key_values:
823
+ input_ids = input_ids[:, -1].unsqueeze(-1)
824
+ if token_type_ids is not None:
825
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
826
+
827
+ attention_mask = kwargs.get("attention_mask", None)
828
+ position_ids = kwargs.get("position_ids", None)
829
+
830
+ if attention_mask is not None and position_ids is None:
831
+ position_ids = attention_mask.long().cumsum(-1) - 1
832
+ position_ids.masked_fill_(attention_mask == 0, 1)
833
+ if past_key_values:
834
+ position_ids = position_ids[:, -1].unsqueeze(-1)
835
+ else:
836
+ position_ids = None
837
+
838
+ if inputs_embeds is not None and past_key_values is None:
839
+ model_inputs = {"inputs_embeds": inputs_embeds}
840
+ else:
841
+ model_inputs = {"input_ids": input_ids}
842
+
843
+ model_inputs.update(
844
+ {
845
+ "past_key_values": past_key_values,
846
+ "use_cache": kwargs.get("use_cache"),
847
+ "position_ids": position_ids,
848
+ "attention_mask": attention_mask,
849
+ "token_type_ids": token_type_ids,
850
+ }
851
+ )
852
+ return model_inputs
853
+
854
+ def forward(
855
+ self,
856
+ input_ids: Optional[torch.LongTensor] = None,
857
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
858
+ attention_mask: Optional[torch.FloatTensor] = None,
859
+ token_type_ids: Optional[torch.LongTensor] = None,
860
+ position_ids: Optional[torch.LongTensor] = None,
861
+ head_mask: Optional[torch.FloatTensor] = None,
862
+ inputs_embeds: Optional[torch.FloatTensor] = None,
863
+ encoder_hidden_states: Optional[torch.Tensor] = None,
864
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
865
+ labels: Optional[torch.LongTensor] = None,
866
+ use_cache: Optional[bool] = None,
867
+ output_attentions: Optional[bool] = None,
868
+ output_hidden_states: Optional[bool] = None,
869
+ return_dict: Optional[bool] = None,
870
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
871
+
872
+ return_dict = (
873
+ return_dict if return_dict is not None else self.config.use_return_dict
874
+ )
875
+
876
+ transformer_outputs = self.transformer(
877
+ input_ids,
878
+ past_key_values=past_key_values,
879
+ attention_mask=attention_mask,
880
+ token_type_ids=token_type_ids,
881
+ position_ids=position_ids,
882
+ head_mask=head_mask,
883
+ inputs_embeds=inputs_embeds,
884
+ encoder_hidden_states=encoder_hidden_states,
885
+ encoder_attention_mask=encoder_attention_mask,
886
+ use_cache=use_cache,
887
+ output_attentions=output_attentions,
888
+ output_hidden_states=output_hidden_states,
889
+ return_dict=return_dict,
890
+ )
891
+ hidden_states = transformer_outputs[0]
892
+
893
+ lm_logits = self.lm_head(hidden_states)
894
+
895
+ loss = None
896
+ if labels is not None:
897
+ labels = labels.to(lm_logits.device)
898
+ shift_logits = lm_logits[..., :-1, :].contiguous()
899
+ shift_labels = labels[..., 1:].contiguous()
900
+ loss_fct = CrossEntropyLoss()
901
+ loss = loss_fct(
902
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
903
+ )
904
+
905
+ if not return_dict:
906
+ output = (lm_logits,) + transformer_outputs[1:]
907
+ return ((loss,) + output) if loss is not None else output
908
+
909
+ return CausalLMOutputWithPast(
910
+ loss=loss,
911
+ logits=lm_logits,
912
+ past_key_values=transformer_outputs.past_key_values,
913
+ hidden_states=transformer_outputs.hidden_states,
914
+ attentions=transformer_outputs.attentions,
915
+ )
916
+
917
+ @staticmethod
918
+ def _reorder_cache(
919
+ past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
920
+ ) -> Tuple[Tuple[torch.Tensor]]:
921
+
922
+ return tuple(
923
+ tuple(
924
+ past_state.index_select(0, beam_idx.to(past_state.device))
925
+ for past_state in layer_past
926
+ )
927
+ for layer_past in past_key_values
928
+ )
929
+
930
+ def chat(
931
+ self,
932
+ tokenizer: PreTrainedTokenizer,
933
+ query: str,
934
+ history: Optional[HistoryType],
935
+ system: str = "You are a helpful assistant.",
936
+ append_history: bool = True,
937
+ stream: Optional[bool] = _SENTINEL,
938
+ stop_words_ids: Optional[List[List[int]]] = None,
939
+ generation_config: Optional[GenerationConfig] = None,
940
+ **kwargs,
941
+ ) -> Tuple[str, HistoryType]:
942
+ generation_config = generation_config if generation_config is not None else self.generation_config
943
+
944
+ assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT
945
+ assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
946
+ if history is None:
947
+ history = []
948
+ if stop_words_ids is None:
949
+ stop_words_ids = []
950
+
951
+ max_window_size = kwargs.get('max_window_size', None)
952
+ if max_window_size is None:
953
+ max_window_size = generation_config.max_window_size
954
+ raw_text, context_tokens = make_context(
955
+ tokenizer,
956
+ query,
957
+ history=history,
958
+ system=system,
959
+ max_window_size=max_window_size,
960
+ chat_format=generation_config.chat_format,
961
+ )
962
+
963
+ stop_words_ids.extend(get_stop_words_ids(
964
+ generation_config.chat_format, tokenizer
965
+ ))
966
+ input_ids = torch.tensor([context_tokens]).to(self.device)
967
+ outputs = self.generate(
968
+ input_ids,
969
+ stop_words_ids=stop_words_ids,
970
+ return_dict_in_generate=False,
971
+ generation_config=generation_config,
972
+ **kwargs,
973
+ )
974
+
975
+ response = decode_tokens(
976
+ outputs[0],
977
+ tokenizer,
978
+ raw_text_len=len(raw_text),
979
+ context_length=len(context_tokens),
980
+ chat_format=generation_config.chat_format,
981
+ verbose=False,
982
+ errors='replace'
983
+ )
984
+
985
+ if append_history:
986
+ history.append((query, response))
987
+
988
+ return response, history
989
+
990
+ def chat_stream(
991
+ self,
992
+ tokenizer: PreTrainedTokenizer,
993
+ query: str,
994
+ history: Optional[HistoryType],
995
+ system: str = "You are a helpful assistant.",
996
+ stop_words_ids: Optional[List[List[int]]] = None,
997
+ logits_processor: Optional[LogitsProcessorList] = None,
998
+ generation_config: Optional[GenerationConfig] = None,
999
+ **kwargs,
1000
+ ) -> Generator[str, Any, None]:
1001
+ generation_config = generation_config if generation_config is not None else self.generation_config
1002
+ assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
1003
+ if history is None:
1004
+ history = []
1005
+ if stop_words_ids is None:
1006
+ stop_words_ids = []
1007
+
1008
+ max_window_size = kwargs.get('max_window_size', None)
1009
+ if max_window_size is None:
1010
+ max_window_size = generation_config.max_window_size
1011
+ raw_text, context_tokens = make_context(
1012
+ tokenizer,
1013
+ query,
1014
+ history=history,
1015
+ system=system,
1016
+ max_window_size=max_window_size,
1017
+ chat_format=generation_config.chat_format,
1018
+ )
1019
+
1020
+ stop_words_ids.extend(get_stop_words_ids(
1021
+ generation_config.chat_format, tokenizer
1022
+ ))
1023
+ if stop_words_ids is not None:
1024
+ stop_words_logits_processor = StopWordsLogitsProcessor(
1025
+ stop_words_ids=stop_words_ids,
1026
+ eos_token_id=generation_config.eos_token_id,
1027
+ )
1028
+ if logits_processor is None:
1029
+ logits_processor = LogitsProcessorList([stop_words_logits_processor])
1030
+ else:
1031
+ logits_processor.append(stop_words_logits_processor)
1032
+ input_ids = torch.tensor([context_tokens]).to(self.device)
1033
+
1034
+ from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig
1035
+ self.__class__.generate_stream = NewGenerationMixin.generate
1036
+ self.__class__.sample_stream = NewGenerationMixin.sample_stream
1037
+ stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True)
1038
+
1039
+ def stream_generator():
1040
+ outputs = []
1041
+ for token in self.generate_stream(
1042
+ input_ids,
1043
+ return_dict_in_generate=False,
1044
+ generation_config=stream_config,
1045
+ logits_processor=logits_processor,
1046
+ seed=-1,
1047
+ **kwargs):
1048
+ outputs.append(token.item())
1049
+ yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore', keep_image_special=True)
1050
+
1051
+ return stream_generator()
1052
+
1053
+ def generate(
1054
+ self,
1055
+ inputs: Optional[torch.Tensor] = None,
1056
+ generation_config: Optional[GenerationConfig] = None,
1057
+ logits_processor: Optional[LogitsProcessorList] = None,
1058
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
1059
+ prefix_allowed_tokens_fn: Optional[
1060
+ Callable[[int, torch.Tensor], List[int]]
1061
+ ] = None,
1062
+ synced_gpus: Optional[bool] = None,
1063
+ assistant_model: Optional["PreTrainedModel"] = None,
1064
+ streamer: Optional["BaseStreamer"] = None,
1065
+ **kwargs,
1066
+ ) -> Union[GenerateOutput, torch.LongTensor]:
1067
+ generation_config = generation_config if generation_config is not None else self.generation_config
1068
+
1069
+ # Process stop_words_ids.
1070
+ stop_words_ids = kwargs.pop("stop_words_ids", None)
1071
+ if stop_words_ids is None and generation_config is not None:
1072
+ stop_words_ids = getattr(generation_config, "stop_words_ids", None)
1073
+ if stop_words_ids is None:
1074
+ stop_words_ids = getattr(self.generation_config, "stop_words_ids", None)
1075
+
1076
+ if stop_words_ids is not None:
1077
+ stop_words_logits_processor = StopWordsLogitsProcessor(
1078
+ stop_words_ids=stop_words_ids,
1079
+ eos_token_id=generation_config.eos_token_id,
1080
+ )
1081
+ if logits_processor is None:
1082
+ logits_processor = LogitsProcessorList([stop_words_logits_processor])
1083
+ else:
1084
+ logits_processor.append(stop_words_logits_processor)
1085
+
1086
+ return super().generate(
1087
+ inputs,
1088
+ generation_config=generation_config,
1089
+ logits_processor=logits_processor,
1090
+ stopping_criteria=stopping_criteria,
1091
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
1092
+ synced_gpus=synced_gpus,
1093
+ assistant_model=assistant_model,
1094
+ streamer=streamer,
1095
+ **kwargs,
1096
+ )
1097
+
1098
+
1099
+ class RotaryEmbedding(torch.nn.Module):
1100
+ def __init__(self, dim, base=10000):
1101
+ super().__init__()
1102
+ self.dim = dim
1103
+ self.base = base
1104
+ self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
1105
+ if importlib.util.find_spec("einops") is None:
1106
+ raise RuntimeError("einops is required for Rotary Embedding")
1107
+
1108
+ self._rotary_pos_emb_cache = None
1109
+ self._seq_len_cached = 0
1110
+ self._ntk_alpha_cached = 1.0
1111
+
1112
+ def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0):
1113
+ seqlen = max_seq_len + offset
1114
+ if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached:
1115
+ base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
1116
+ self.inv_freq = 1.0 / (
1117
+ base
1118
+ ** (
1119
+ torch.arange(0, self.dim, 2, device=self.inv_freq.device).float()
1120
+ / self.dim
1121
+ )
1122
+ )
1123
+ self._seq_len_cached = max(2 * seqlen, 16)
1124
+ self._ntk_alpha_cached = ntk_alpha
1125
+ seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device)
1126
+ freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq)
1127
+
1128
+ emb = torch.cat((freqs, freqs), dim=-1)
1129
+ from einops import rearrange
1130
+
1131
+ emb = rearrange(emb, "n d -> 1 n 1 d")
1132
+
1133
+ cos, sin = emb.cos(), emb.sin()
1134
+ self._rotary_pos_emb_cache = [cos, sin]
1135
+
1136
+ def forward(self, max_seq_len, offset=0, ntk_alpha=1.0):
1137
+ self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha)
1138
+ cos, sin = self._rotary_pos_emb_cache
1139
+ return [cos[:, offset : offset + max_seq_len], sin[:, offset : offset + max_seq_len]]
1140
+
1141
+
1142
+ def _rotate_half(x):
1143
+ from einops import rearrange
1144
+
1145
+ x = rearrange(x, "... (j d) -> ... j d", j=2)
1146
+ x1, x2 = x.unbind(dim=-2)
1147
+ return torch.cat((-x2, x1), dim=-1)
1148
+
1149
+
1150
+ def apply_rotary_pos_emb(t, freqs):
1151
+ cos, sin = freqs
1152
+ if apply_rotary_emb_func is not None and t.is_cuda:
1153
+ t_ = t.float()
1154
+ cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
1155
+ sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
1156
+ output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
1157
+ return output
1158
+ else:
1159
+ rot_dim = freqs[0].shape[-1]
1160
+ cos, sin = freqs
1161
+ t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
1162
+ t_ = t_.float()
1163
+ t_pass_ = t_pass_.float()
1164
+ t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1165
+ return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1166
+
1167
+
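A minimal sketch of the rotary-embedding path above (the fallback branch, since apply_rotary_emb_func is None in this file), assuming torch and einops are installed and RotaryEmbedding / apply_rotary_pos_emb are in scope; dim=128 is just an example per-head dimension, not necessarily this checkpoint's value:

    import torch

    rope = RotaryEmbedding(dim=128)            # dim plays the role of config.kv_channels (per-head dim)
    cos_sin = rope(max_seq_len=8)              # [cos, sin], each of shape (1, 8, 1, 128)

    q = torch.randn(1, 8, 32, 128)             # (batch, seq, num_heads, head_dim), as produced by _split_heads
    q_rot = apply_rotary_pos_emb(q, cos_sin)   # rotates each pair of channels by a position-dependent angle
    assert q_rot.shape == q.shape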
1168
+ class RMSNorm(torch.nn.Module):
1169
+ def __init__(self, dim: int, eps: float = 1e-6):
1170
+ super().__init__()
1171
+ self.eps = eps
1172
+ self.weight = nn.Parameter(torch.ones(dim))
1173
+
1174
+ def _norm(self, x):
1175
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
1176
+
1177
+ def forward(self, x):
1178
+ if rms_norm is not None and x.is_cuda:
1179
+ return rms_norm(x, self.weight, self.eps)
1180
+ else:
1181
+ output = self._norm(x.float()).type_as(x)
1182
+ return output * self.weight
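A minimal sketch of how a checkpoint built from this modeling code might be loaded and queried through chat(); the local directory and image filename are hypothetical, trust_remote_code=True is assumed so that transformers picks up the custom model and tokenizer classes shipped with the weights, and images are referenced inline with <img>...</img> tags, following the convention visible in the prediction files below:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    path = "weights/model-idf"                  # hypothetical local checkpoint directory
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        path, trust_remote_code=True
    ).eval()                                    # move to GPU (e.g. .cuda()) as appropriate

    query = "Julia is <img>julia.jpg</img>\nDescribe the image."   # hypothetical image path
    response, history = model.chat(tokenizer, query, history=None)
    print(response)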
weights/model-idf/prediction.json ADDED
The diff for this file is too large to render. See raw diff
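The prediction files added in this commit share a simple schema: a JSON list of records with an "id", a "question" string containing <img>...</img> placeholders, and the model's "prediction" (prediction_mi.json below shows the format; prediction.json is too large to render here). A minimal sketch for loading and inspecting them, assuming the hypothetical local path:

    import json

    with open("weights/model-idf/prediction_mi.json") as f:   # hypothetical local path
        records = json.load(f)

    for rec in records[:3]:
        print(rec["id"], "|", rec["question"][:60], "->", rec["prediction"][:60])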
 
weights/model-idf/prediction_mi.json ADDED
@@ -0,0 +1,797 @@
1
+ [
2
+ {
3
+ "id": 0,
4
+ "question": "Julia is <img>image_1</img>\n Victoria is <img>image_2</img>\n Watching the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n and describe their content.",
5
+ "prediction": "Julia, lying down, is given water by Victoria, who stands beside her. As the interaction progresses, Victoria remains close to Julia, holding the glass of water, indicating a comforting and attentive care given to her."
6
+ },
7
+ {
8
+ "id": 1,
9
+ "question": "Julia is <img>image_1</img>\n Jack is <img>image_2</img>\n In these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What action does Julia take? ",
10
+ "prediction": "Julia is initially focused on something in front of her, then she turns to talk to Jack, and finally, she seems to be walking away from Jack."
11
+ },
12
+ {
13
+ "id": 2,
14
+ "question": "Jack is <img>image_1</img>\n Beech is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n. What is the interaction between Jack and Beech through the images?",
15
+ "prediction": "Jack and Beech appear to have a tense conversation, with Beech at first facing Jack, then turning away, and finally facing him again from a different angle."
16
+ },
17
+ {
18
+ "id": 3,
19
+ "question": "Jack is <img>image_1</img>\n Julia is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
20
+ "prediction": "Jack, dressed in a grey outfit, rides a motorcycle while Julia stands nearby. A scuffle ensues as Jack confronts Julia, and subsequently, both are seen walking side by side through a barren landscape, suggesting a partnership or shared journey."
21
+ },
22
+ {
23
+ "id": 4,
24
+ "question": "Julia is <img>image_1</img>\n Jack is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
25
+ "prediction": "Julia and Jack stand together, overlooking a damaged landscape. They face each other in a close, intimate conversation. Later, Jack listens as Julia uses a telescope, indicating a shared moment of exploration or contemplation."
26
+ },
27
+ {
28
+ "id": 5,
29
+ "question": "Julia is <img>image_1</img>\n Jack is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
30
+ "prediction": "Julia and Jack start by walking together on a bridge, then Jack is seen adjusting the view of a telescope while Julia watches. They later stand close, facing each other with the telescope between them, and finally, Jack seems to be explaining something to Julia, gesturing towards the view."
31
+ },
32
+ {
33
+ "id": 6,
34
+ "question": "Glen McCreavy is <img>image_1</img>\n Nicholas Hardiment is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
35
+ "prediction": "Glen McCreery and Nicholas Hardiment start relaxed, lounging side by side. They move on to an outdoor gathering, where Glen is seen engaged in conversation while Nicholas is accept a plate of food. Later, they stand face to face in a more serious exchange. Finally, they are seen in a group setting, still conversing, now with more people involved."
36
+ },
37
+ {
38
+ "id": 7,
39
+ "question": "Glen McCreavy is <img>image_1</img>\n Andy Cobb is <img>image_2</img>\n In these images <img>image_3</img>\n<img>image_4</img>\n. What are in the background?",
40
+ "prediction": "There are trees and a field in the background."
41
+ },
42
+ {
43
+ "id": 8,
44
+ "question": "Tamara Drewe is <img>image_1</img>\n Beth Hardiment is <img>image_2</img>\n In the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n. What is Tamara Drewe doing?",
45
+ "prediction": "Tamara Drewe is walking across a field and having a conversation with Beth Hardiment."
46
+ },
47
+ {
48
+ "id": 9,
49
+ "question": "Ben Sergeant is <img>image_1</img>\n Tamara Drewe is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
50
+ "prediction": "Ben Sergeant and Tamara Drewe stand outside, facing each other in a conversation. They move indoors, where Ben listens to Tamara, who seems to be explaining something. The discussion continues, and it becomes more intense, with Tamara gesturing emphatically towards Ben. Finally, they come close enough to embrace, indicating a moment of resolution or comfort."
51
+ },
52
+ {
53
+ "id": 10,
54
+ "question": "Beth Hardiment is <img>image_1</img>\n Glen McCreavy is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
55
+ "prediction": "Beth Hardiment stands beside a riding lawn mower as Glen McCreavy sits on it, ready to mow the grass. She then steps away from the mower, watching while he continues with the task. Throughout, they appear to be in a conversation, possibly discussing the work at hand or sharing responsibilities."
56
+ },
57
+ {
58
+ "id": 11,
59
+ "question": "Beth Hardiment is <img>image_1</img>\n Andy Cobb is <img>image_2</img>\n In the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Beth Hardiment doing?",
60
+ "prediction": "Beth Hardiment is holding a chicken and talking to Andy Cobb."
61
+ },
62
+ {
63
+ "id": 12,
64
+ "question": "Nicholas Hardiment is <img>image_1</img>\n Beth Hardiment is <img>image_2</img>\n Ben Sergeant is <img>image_3</img>\n Can you provide a description of the following images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
65
+ "prediction": "Nicholas Hardiment stands alone by a wooden fence, then is joined by Beth Hardiment who appears to be in conversation with him. In the next moment, Ben Sergeant joins them, and they all seem to be engaged in a group discussion."
66
+ },
67
+ {
68
+ "id": 13,
69
+ "question": "Ben Sergeant is <img>image_1</img>\n Tamara Drewe is <img>image_2</img>\n Nicholas Hardiment is <img>image_3</img>\n Please explain what's happening in the photos <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
70
+ "prediction": "Ben Sergeant, wearing a black leather jacket, stands close to Tamara Drewe and Nicholas Hardiment in a bookshop. The scene shifts, and now Nicholas is engrossed in a conversation with Ben while Tamara watches from a distance. Later, only the backs of their heads are visible as they move through the bustling shop. Eventually, Ben and Tamara sit opposite each other at a table, engrossed in a book signing, while Nicholas stands beside them."
71
+ },
72
+ {
73
+ "id": 14,
74
+ "question": "Andy Cobb is <img>image_1</img>\n Ben Sergeant is <img>image_2</img>\n In these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What interaction occurs between Andy Cobb and Ben Sergeant?",
75
+ "prediction": "Andy Cobb is seen initially squatting near a structure, then standing and facing Ben Sergeant, who approaches him. Finally, they appear to be in a close conversation or argument."
76
+ },
77
+ {
78
+ "id": 15,
79
+ "question": "Jody Long is <img>image_1</img>\n Tell me about the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
80
+ "prediction": "Jody Long sits at a desk in a dimly lit room, engaged with her computer. Later, she is seen from a different angle, appearing to be in a conversation while standing. Subsequently, Jody is depicted with another individual; they seem to be in the midst of a discussion, with Jody gesturing towards her chest."
81
+ },
82
+ {
83
+ "id": 16,
84
+ "question": "Tamara Drewe is <img>image_1</img>\n Andy Cobb is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
85
+ "prediction": "Andy Cobb stands by the door as Tamara Drewe looks on from a distance. They exit the building together, with Tamara trailing slightly behind Andy. Later, Tamara watches from the street as Andy interacts with a fence. Throughout, the tension between them is palpable, suggesting a story of unspoken words and silently felt emotions."
86
+ },
87
+ {
88
+ "id": 17,
89
+ "question": "Jody Long is <img>image_1</img>\n Ben Sergeant is <img>image_2</img>\n Give a description for these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
90
+ "prediction": "Jody Long and Ben Sergeant are first seen in the kitchen where Jody is pouring a drink. They then move to the table, with Jody looking thoughtful while Ben stands behind her. The pair walk towards a door, and eventually find themselves outside, standing close and facing each other by a tree."
91
+ },
92
+ {
93
+ "id": 18,
94
+ "question": "Gavin Nichols is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Gavin Nichols doing in these scenes?",
95
+ "prediction": "Gavin Nichols is shown standing in a doorway, then in an office facing someone, later he is seen sitting down, and finally bending over a desk."
96
+ },
97
+ {
98
+ "id": 19,
99
+ "question": "Shana Harris is <img>image_1</img>\n Gavin Nichols is <img>image_2</img>\n Could you interpret the scene from these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
100
+ "prediction": "Shana Harris, wearing a white shirt and red vest, stands to the left as she converses with Gavin Nichols, who is sitting on the edge of a desk in a white shirt and blue tie. Their discussion appears to evolve as Shana turns slightly towards Gavin, who then leans back in his chair, looking up at her."
101
+ },
102
+ {
103
+ "id": 20,
104
+ "question": "Chris (as Chris Gorham) is <img>image_1</img>\n Gavin Nichols is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
105
+ "prediction": "Gavin Nichols and Chris Gorham are in a kitchen, preparing food side by side on a table. Initially, Chris is on the left and Gavin on the right, both focused on their tasks. Then, Chris moves to the right as Gavin shifts slightly towards the center, both continuing to work on their dishes. Finally, Chris stands in the center with his back to the camera, while Gavin is on the left, facing towards Chris, engaged in the task at hand."
106
+ },
107
+ {
108
+ "id": 21,
109
+ "question": "Shana Harris is <img>image_1</img>\n Gavin Nichols is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
110
+ "prediction": "Shana Harris is busy with kitchen work while Gavin Nichols stands by the counter holding a bottle. As she continues her task, Gavin moves closer, drink in hand, engaging her with what seems like a conversation. They appear to be in a kitchen, possibly preparing for a meal together."
111
+ },
112
+ {
113
+ "id": 22,
114
+ "question": "Gavin Nichols is <img>image_1</img>\n Joe Harris is <img>image_2</img>\n Shana Harris is <img>image_3</img>\n Tell me about the pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
115
+ "prediction": "Gavin Nichols, Joe Harris, and Shana Harris walk together down a city street. They engage in conversation, with Gavin and Joe facing each other while Shana walks alongside them. Eventually, they stop and have a meal together at a dining table, with Gavin and Joe seated and Shana standing nearby."
116
+ },
117
+ {
118
+ "id": 23,
119
+ "question": "Gavin Nichols is <img>image_1</img>\n Shana Harris is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
120
+ "prediction": "Gavin Nichols and Shana Harris are first seen in a tense conversation across a table; Shana looks on as Gavin drinks from a can. The situation escalates as Gavin stands up on the patio, holding Shana's hand, and then they are lying down together, seemingly in a confrontational or exhausted manner."
121
+ },
122
+ {
123
+ "id": 24,
124
+ "question": "Det. Hollis Lucetti is <img>image_1</img>\n Describe the scene in these pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n.",
125
+ "prediction": "Det. Hollis Lucetti stands in a dining room, initially alone, then seen leaning down to speak to a young girl at the table. He later sits down with the family, engaging in a meal together."
126
+ },
127
+ {
128
+ "id": 25,
129
+ "question": "Freddie Quell is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n. What is Freddie Quell doing in these images? ",
130
+ "prediction": "Freddie Quell is seen having a close interaction with a woman, then sitting alone at a table possibly in a restaurant, and finally operating a camera in a different setting."
131
+ },
132
+ {
133
+ "id": 26,
134
+ "question": "Lancaster Dodd is <img>image_1</img>\n Analyze the contents of the following pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
135
+ "prediction": "Lancaster Dodd is initially seen joyously raising his arms with others at a celebration. The scene shifts to a more intimate setting where he is seated at a wedding table, initially appearing to engage in conversation. His mood shifts as he is captured standing, gesturing emphatically with his hand raised, suggesting an emotional or animated moment."
136
+ },
137
+ {
138
+ "id": 27,
139
+ "question": "Peggy Dodd is <img>image_1</img>\n Freddie Quell is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
140
+ "prediction": "Peggy Dodd and Freddie Quell sit opposite each other at a table, engaging in conversation. The setting changes to a dimly lit room where they stand close together, facing forward. Later, Freddie appears to be intently observing something while Peggy is seated in the background, looking on."
141
+ },
142
+ {
143
+ "id": 28,
144
+ "question": "Lancaster Dodd is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n What happens to Lancaster Dodd?",
145
+ "prediction": "Lancaster Dodd stands on a balcony, is then shoved by someone, walks outside with his head down, and finally appears to be led away by his arm."
146
+ },
147
+ {
148
+ "id": 29,
149
+ "question": "Cal is <img>image_1</img>\n Jacob is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
150
+ "prediction": "Cal and Jacob engage in a conversation while walking together outdoors. As they continue, Jacob stops and gestures towards Cal, explaining something. They then stand facing each other in front of a shoe display, with Jacob still animatedly talking."
151
+ },
152
+ {
153
+ "id": 30,
154
+ "question": "Cal is <img>image_1</img>\n Jacob is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
155
+ "prediction": "Cal, in a striped yellow shirt, is seen shopping inside a store where Jacob, dressed in a suit, is positioned near a mannequin. Jacob approaches Cal, and they interact closely, with Jacob adjusting Cal's shirt. Cal, now holding his hat, continues to engage with Jacob as they converse."
156
+ },
157
+ {
158
+ "id": 31,
159
+ "question": "Jacob is <img>image_1</img>\n Cal is <img>image_2</img>\n Kate is <img>image_3</img>\n Analyze the contents of the following pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
160
+ "prediction": "Jacob and Cal are seen shaking hands in a bar, while Kate sits nearby, looking surprised. The scene shifts to a more relaxed setting where Jacob and Cal engage in a one-on-one conversation over drinks at a table. Their interaction becomes more intense as they stand up, with Cal gesturing emphatically towards Jacob. The evening ends with Jacob measuring Cal against a door frame, seemingly checking his height."
161
+ },
162
+ {
163
+ "id": 32,
164
+ "question": "Jessica is <img>image_1</img>\n Robbie is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
165
+ "prediction": "Jessica and Robbie are first seen amidst a crowd, with Robbie slightly behind Jessica. In the next moment, they are standing apart, with Robbie looking on as Jessica walks away. Finally, they are close to each other, standing side by side, facing the same direction with onlookers in the background."
166
+ },
167
+ {
168
+ "id": 33,
169
+ "question": "Cal is <img>image_1</img>\n Bernie is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
170
+ "prediction": "Cal, holding a paint can, exchanges a high-five with Bernie who's wearing a red vest in a store. Later, Cal, with his back towards the camera, seems to be in a discussion with Bernie and other workers in a warehouse. Finally, Cal is seen leaning on a cart while Bernie gestures as if explaining something to him in a aisle of the store."
171
+ },
172
+ {
173
+ "id": 34,
174
+ "question": "Claire is <img>image_1</img>\n Jessica is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n and describe their content.",
175
+ "prediction": "Jessica and Claire stand closely in a kitchen, then they are seen outside with Claire talking to Jessica as they stand by a car. Finally, Jessica is driving the car with Claire as a passenger, looking distressed."
176
+ },
177
+ {
178
+ "id": 35,
179
+ "question": "Jacob is <img>image_1</img>\n Emily is <img>image_2</img>\n Hannah is <img>image_3</img>\n Could you interpret the scene from these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
180
+ "prediction": "Jacob, Emily, and Hannah are initially seated together, engaged in different conversations. They eventually clasp their hands in applause, indicating a shared moment. The sequence concludes with Emily and Hannah hugging each other, while Jacob stands at a distance, observing them."
181
+ },
182
+ {
183
+ "id": 36,
184
+ "question": "Cal is <img>image_1</img>\n Robbie is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
185
+ "prediction": "Cal, dressed in a suit, stands at a podium while Robbie, in a striped suit, sits beside him. Robbie takes the microphone, and Cal leans in to listen as Robbie speaks. Cal then gestures while looking down, continuing the interaction with Robbie, who holds the microphone throughout their conversation."
186
+ },
187
+ {
188
+ "id": 37,
189
+ "question": "Lynn Childers is <img>image_1</img>\n Sam Childers is <img>image_2</img>\n Analyze the contents of the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
190
+ "prediction": "Lynn and Sam Childers stand close together in a kitchen, facing each other. They seem to be in a serious conversation, with Sam turning slightly towards Lynn, who is looking back at him. The tension between them builds as they stand in close proximity, culminating in a passionate embrace."
191
+ },
192
+ {
193
+ "id": 38,
194
+ "question": "Sam Childers is <img>image_1</img>\n Lynn Childers is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
195
+ "prediction": "Sam and Lynn Childers sit together in a church, initially looking forward. Their attention shifts as they stand and face each other, with Lynn reaching out to Sam. They then turn, standing side by side, facing a gathering, with Sam gesturing emphatically towards Lynn."
196
+ },
197
+ {
198
+ "id": 39,
199
+ "question": "Sam Childers is <img>image_1</img>\n Lynn Childers is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n?",
200
+ "prediction": "Sam Childers, in a checkered shirt, is seen carrying Lynn Childers, wearing a black dress, as they walk through a forest trail. They maintain close proximity throughout the sequence, indicating a moment of intimacy or support between them."
201
+ },
202
+ {
203
+ "id": 40,
204
+ "question": "Sam Childers is <img>image_1</img>\n Deng is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
205
+ "prediction": "Sam Childers stands under a tree shade, watching as Deng sits on a chair in a seemingly organized outdoor area. Later, both Sam and Deng are seen inside a vehicle, with Sam in the driver's seat and Deng as the passenger, looking ahead. Finally, Sam and Deng walk side by side among a large encampment, with Sam leading the way."
206
+ },
207
+ {
208
+ "id": 41,
209
+ "question": "Deng is <img>image_1</img>\n Sam Childers is <img>image_2</img>\n In the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What are Sam Childers doing throughout these images? ",
210
+ "prediction": "Sam Childers is having a seated conversation with Deng, thenSam plays with a gun while Deng watches."
211
+ },
212
+ {
213
+ "id": 42,
214
+ "question": "Paige is <img>image_1</img>\n Sam Childers is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
215
+ "prediction": "Sam Childers, visible in a reflection in the mirror, is close to Paige in a bedroom. He then sits on the bed facing Paige, talking to her. Lastly, they appear to share a moment together as he shows her something on his arm while seated on the bed."
216
+ },
217
+ {
218
+ "id": 43,
219
+ "question": "Sam Childers is <img>image_1</img>\n In the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n. What is Sam Childers doing?",
220
+ "prediction": "Sam Childers is standing in front of a group of people, then he is seen walking through a puddle of water, and finally, he is standing in a room with a microphone in front of him."
221
+ },
222
+ {
223
+ "id": 44,
224
+ "question": "Sam Childers is <img>image_1</img>\n Describe the scene in these pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
225
+ "prediction": "Sam Childers, engaged and active, interacts with a group of children in a dirt-covered area. He begins by playing soccer with them, then demonstrates a bat for the kids, and finally, he engages in a running game, leading a child in front of a house. The children follow him enthusiastically, participating in the playful activities."
226
+ },
227
+ {
228
+ "id": 45,
229
+ "question": "Sam Childers is <img>image_1</img>\n Lynn Childers is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
230
+ "prediction": "Sam Childers and Lynn Childers are initially seen standing at the entrance of a room, Sam holding a drink while Lynn looks on. They then move closer together, engaging in a conversation with Lynn turning towards Sam as they talk. Finally, Sam and Lynn stand side by side, continuing their interaction with Lynn facing Sam, partially turned away from the camera."
231
+ },
232
+ {
233
+ "id": 46,
234
+ "question": "Lynn Childers is <img>image_1</img>\n Paige is <img>image_2</img>\n Sam Childers is <img>image_3</img>\n Please provide an overview of the visual information in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
235
+ "prediction": "Lynn Childers, wearing a blue dress, and Paige walk together beside a house, with Sam Childers following slightly behind them. They continue walking outside, passing by a convertible. Lynn and Paige stop to sit at an outdoor table, where Lynn appears to be in conversation with Sam, who is standing and gesturing."
236
+ },
237
+ {
238
+ "id": 47,
239
+ "question": "Sam Childers is <img>image_1</img>\n Deng is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
240
+ "prediction": "Sam Childers and Deng are seen together inside a vehicle, with Sam in the driver's seat and Deng as the front passenger. They appear to be engaged in a conversation or a journey. In the following scenes, they switch positions, with Deng taking the driver's seat and Sam as the front passenger, indicating a possible role change during their travel. Throughout their journey, they maintain a dialogue, as seen by the consistent orientation towards each other."
241
+ },
242
+ {
243
+ "id": 48,
244
+ "question": "Sam Childers is <img>image_1</img>\n Deng is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
245
+ "prediction": "Sam Childers and Deng are seen interacting with a group of children in a dry, open field. Initially, Sam is helping a child while Deng observes. As the scene progresses, they face each other among the children, with Sam turning towards Deng, possibly in conversation, while Deng looks on from a distance."
246
+ },
247
+ {
248
+ "id": 49,
249
+ "question": "Josh Lambert is <img>image_1</img>\n Renai Lambert is <img>image_2</img>\n Dalton Lambert is <img>image_3</img>\n What is displayed in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
250
+ "prediction": "Josh Lambert is seen adjusting his tie with Renai Lambert standing beside him, while Dalton Lambert is seated nearby. In the next moment, Josh is busy moving boxes in the kitchen with Renai looking on, and Dalton is playing with a toy on the floor. Finally, Josh bends down to interact with Dalton, who is standing by a table, as Renai watches from a closer distance."
251
+ },
252
+ {
253
+ "id": 50,
254
+ "question": "Josh Lambert is <img>image_1</img>\n Renai Lambert is <img>image_2</img>\n In these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n. What is Josh Lambert doing?",
255
+ "prediction": "Josh Lambert is first standing by the piano, then he bends down to interact with a child, and finally he is seen sitting on the couch."
256
+ },
257
+ {
258
+ "id": 51,
259
+ "question": "Sparrow is <img>image_1</img>\n For the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
260
+ "prediction": "Sparrow, facing a group of people, appears to be the center of attention. He then turns to converse with someone off-camera, later standing with his back to the camera, and finally, he gets into a vehicle, sitting in the passenger seat."
261
+ },
262
+ {
263
+ "id": 52,
264
+ "question": "Sparrow is <img>image_1</img>\n Roast Pork is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
265
+ "prediction": "Sparrow and Roast Pork, both in suits, appear to be having a heated exchange among a group of onlookers. They move through the crowd, exchanging intense looks and gestures. Eventually, their confrontation escalates as they stand face-to-face, before conclusion with a physical altercation, shoving each other around a table."
266
+ },
267
+ {
268
+ "id": 53,
269
+ "question": "Roast Pork is <img>image_1</img>\n Explain the content of these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
270
+ "prediction": "Roast pork walks through a restaurant, casually observing his surroundings. He pauses to interact with a tabled guest, seemingly in a light, casual exchange. The moment progresses to a more intense interaction as he confronts another individual, with his arm extended. Finally, he is seen walking away from the area, carrying his own plate, amongst other diners."
271
+ },
272
+ {
273
+ "id": 54,
274
+ "question": "Roast Pork is <img>image_1</img>\n Kerosene is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
275
+ "prediction": "Roast pork, wearing a white jacket, exchanges a red object with Kerosene, who is dressed in a grey shirt. They hold hands over the item, then roast pork turns his back to Kerosene, who now wears a green jacket. Finally, they appear to have a close conversation while facing each other."
276
+ },
277
+ {
278
+ "id": 55,
279
+ "question": "Roast Pork is <img>image_1</img>\n Nancy is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
280
+ "prediction": "Roast Pork and Nancy appear to be engaged in a tense conversation, facing each other closely. Subsequently, they are seen together with two children, focusing on an object on the table, suggesting a shared family moment. Their interaction seems to be part of a larger, ongoing dialogue in a home setting."
281
+ },
282
+ {
283
+ "id": 56,
284
+ "question": "Peter Sullivan is <img>image_1</img>\n Seth Bregman is <img>image_2</img>\n Could you interpret the scene from these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
285
+ "prediction": "Seth Bregman, wearing a dark suit, is seated while Peter Sullivan, in a light blue shirt, reads a document. As the scene progresses, Seth leans back in his chair while Peter, still holding the document, appears to be speaking to Seth. They seem to be engaged in a work-related discussion in an office-like setting."
286
+ },
287
+ {
288
+ "id": 57,
289
+ "question": "Will Emerson is <img>image_1</img>\n Seth Bregman is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
290
+ "prediction": "Will Emerson and Seth Bregman are first seen in a car, with Will driving and Seth as the passenger. They then switch positions, with Seth taking the wheel and Will as the passenger, as they continue their journey. Eventually, Seth steps out of the car, while Will remains inside, looking towards him as he walks away."
291
+ },
292
+ {
293
+ "id": 58,
294
+ "question": "John Tuld is <img>image_1</img>\n Sam Rogers is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
295
+ "prediction": "Sam Rogers stands at a distance in a room, facing towards John Tuld, who is slightly turned away. In the following moments, John is seen with his back towards Sam, appearing to be in deep thought at the sink, while Sam seems to be observing him from a closer distance. Their interaction suggests a tense atmosphere between them."
296
+ },
297
+ {
298
+ "id": 59,
299
+ "question": "Sam Rogers is <img>image_1</img>\n Peter Sullivan is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
300
+ "prediction": "Sam Rogers, dressed in a suit, is seen in conversation with Peter Sullivan, who is holding a coffee cup. They appear to be engaged in a serious discussion while walking outside. Throughout their interaction, they maintain a steady pace, indicating the importance of their dialogue."
301
+ },
302
+ {
303
+ "id": 60,
304
+ "question": "Sarah Robertson is <img>image_1</img>\n Eric Dale is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
305
+ "prediction": "Sarah Robertson stands by a window with a city view as Eric Dale, initially out of view, seems to be engaged in a conversation with her. They are in a well-lit room with a modern decor. Sarah then sits on a couch, facing Eric who is now standing close by. They continue their interaction, with Eric slightly leaning towards Sarah, suggesting a deepening dialogue between them."
306
+ },
307
+ {
308
+ "id": 61,
309
+ "question": "John Bennett is <img>image_1</img>\n In the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is the sequence of actions John Bennett performs? ",
310
+ "prediction": "John Bennett starts by standing among a crowd, then drinks from a water bottle, walks through the crowd, and finally seems to be in a physical altercation with someone."
311
+ },
312
+ {
313
+ "id": 62,
314
+ "question": "John Bennett is <img>image_1</img>\n Lori Collins is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
315
+ "prediction": "Lori Collins stands by the entrance while John Bennett sits at the counter with his back to her. They face each other from opposite ends of the counter, appearing to be in a conversation. John then stands up, turning towards Lori, continuing their interaction. The sequence ends with John sitting back down, still engaged in dialogue with Lori."
316
+ },
317
+ {
318
+ "id": 63,
319
+ "question": "Elite Guard No. 1 is <img>image_1</img>\n Perseus is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
320
+ "prediction": "Perseus and Elite Guard No. 1 stand close, looking out towards the sea, with Perseus' back partially turned to the viewer. The scene shifts as they watch a Pegasus take flight, Perseus turning his back to No. 1, who seems to be looking at the horse. Subsequently, Perseus is seen close up, conversing with No. 1, while the horse's attention is drawn towards something off-camera. Finally, Perseus mounts the horse, engaging with No. 1 one last time before setting off."
321
+ },
322
+ {
323
+ "id": 64,
324
+ "question": "Driss is <img>image_1</img>\n Philippe is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
325
+ "prediction": "Driss leans in towards Philippe who is seated, appearing to have a close conversation. Later, they are seen in a bustling environment with Driss leading Philippe, who seems to be in a wheelchair, through a crowded space. Eventually, they join a group of people, sitting side by side, with Driss turned towards Philippe, continuing their interaction."
326
+ },
327
+ {
328
+ "id": 65,
329
+ "question": "Driss is <img>image_1</img>\n Philippe is <img>image_2</img>\n Yvonne is <img>image_3</img>\n Analyze the contents of the following pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
330
+ "prediction": "Driss stands to the right of Philippe, who is seated in a wheelchair, while Yvonne is to the left of Philippe, looking their way. Driss seems to be engaging with them, possibly in conversation. Yvonne moves closer to Philippe, gesturing as if explaining something. Driss leans forward, listening intently, and finally, Driss stands upright, facing the camera with both Yvonne and Philippe behind him."
331
+ },
332
+ {
333
+ "id": 66,
334
+ "question": "Magalie is <img>image_1</img>\n Driss is <img>image_2</img>\n Yvonne is <img>image_3</img>\n Examine the following images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n and describe their content.",
335
+ "prediction": "Magalie and Driss engage in a close conversation while Yvonne observes. Subsequently, Driss walks away with a pink backpack as Magalie watches him leave. Yvonne remains behind, looking in their direction. Later, Magalie and Yvonne stand at the bottom of a staircase, facing each other, suggesting a continuation of their discussion."
336
+ },
337
+ {
338
+ "id": 67,
339
+ "question": "Fantine is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n What is happening to Fantine in these images?",
340
+ "prediction": "Fantine is involved in a physical altercation with other individuals, where she appears to be in a defensive stance against attack."
341
+ },
342
+ {
343
+ "id": 68,
344
+ "question": "Fantine is <img>image_1</img>\n In the following images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
345
+ "prediction": "Fantine appears in close proximity to another person, then moves through a crowded space, later she is seen crouching on a street with a fire behind her, and finally, she is kneeling on the ground with a distressed expression."
346
+ },
347
+ {
348
+ "id": 69,
349
+ "question": "Enjolras is <img>image_1</img>\n Marius is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
350
+ "prediction": "Enjolras and Marius lead a successful rally in a crowded square, with Enjolras gesturing passionately and Marius supporting a man on a podium. The atmosphere is joyous and energetic as they stand among the gathered crowd. Later, the two are seen close together, with Enjolras extending his arm towards a statue on a gate, and Marius leaning over a barrier amidst the crowd, the mood shift from celebration to a more serious, perhaps contemplative tone. They then move indoors, Enjolras supports a wounded man while Marius looks on, indicating a transition from public engagement to a more private, possibly strategic gathering."
351
+ },
352
+ {
353
+ "id": 70,
354
+ "question": "Cosette is <img>image_1</img>\n Marius is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
355
+ "prediction": "Marius and Cosette stand close to each other in a warm embrace, sharing an intimate moment. They then move through a bustling crowd, with Marius leading and Cosette following, creating a sense of connection between them amidst the chaos. Finally, Marius turns to face Cosette, suggesting a moment of pause or a significant exchange in their interaction."
356
+ },
357
+ {
358
+ "id": 71,
359
+ "question": "Tom Sawyer is <img>image_1</img>\n Huck Finn is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
360
+ "prediction": "Tom Sawyer and Huck Finn are engaged in an activity together, with Tom holding a pair of glasses and a small cup and Huck focusing on something in Tom's hands. They are surrounded by various items, indicating a casual, outdoor setting. As the scene progresses, Huck stands to get a better view, emphasizing their involvement in the same activity."
361
+ },
362
+ {
363
+ "id": 72,
364
+ "question": "Tante Polly is <img>image_1</img>\n Tom Sawyer is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
365
+ "prediction": "Tante Polly and Tom Sawyer are first seen outdoors, with Polly sitting on a fence and Tom facing her from a distance. They move closer for a more intimate interaction, with Sawyer's hand gently touching Polly's shoulder in a comforting gesture. Finally, they sit together at a dining table, where Polly appears to be explaining something to Sawyer as he listens attentively."
366
+ },
367
+ {
368
+ "id": 73,
369
+ "question": "Indianer Joe is <img>image_1</img>\n For the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, What does Indianer Joe do during the sequence of images?",
370
+ "prediction": "Indianer Joe starts by sitting at the bar with a drink, then interacts with someone across the table, and finally leans in for a close conversation."
371
+ },
372
+ {
373
+ "id": 74,
374
+ "question": "Tom Sawyer is <img>image_1</img>\n Huck Finn is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
375
+ "prediction": "Tom Sawyer and Huck Finn are first seen walking side by side with Tom holding a book. They then run energetically through a wooden structure area. Next, they face each other in a close conversation. Finally, they are peeking through a barred window, engrossed in something out of view."
376
+ },
377
+ {
378
+ "id": 75,
379
+ "question": "Tom Sawyer is <img>image_1</img>\n In the following images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
380
+ "prediction": "Tom Sawyer sits among his classmates in a rustic classroom, initially looking forward. He then turns his head to the right, appearing to be in conversation with someone off-camera. Following that, Tom faces forward again before leaning over a table as if to read or write something, engaging fully with the activity in front of him."
381
+ },
382
+ {
383
+ "id": 76,
384
+ "question": "Tom Sawyer is <img>image_1</img>\n Richter Thatcher is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
385
+ "prediction": "Tom Sawyer and Richter Thatcher are engaged in a tea party outdoors. They sit across from each other, with a table filled with tea and cake between them. In one moment, Richter leans in closely towards Tom, while in another, Tom leans in towards Richter, suggesting an intimate or serious exchange. Later, Richter enjoys a drink with a smoke, still seated beside Tom."
386
+ },
387
+ {
388
+ "id": 77,
389
+ "question": "Tante Polly is <img>image_1</img>\n Richter Thatcher is <img>image_2</img>\n Tom Sawyer is <img>image_3</img>\n Can you provide a description of the following images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
390
+ "prediction": "Tante Polly stands solemnly amidst a crowd, Richter Thatcher is by her side, and Tom Sawyer is seen sitting down, all dressed in black. As the scene progresses, Tom turns his head towards Polly and Thatcher, while Polly leans forward, appearing to be in a conversation with someone next to her. The interaction seems heartfelt, with gestures of comfort and care amidst a sea of other mourners."
391
+ },
392
+ {
393
+ "id": 78,
394
+ "question": "Richard is <img>image_1</img>\n Olivia is <img>image_2</img>\n Kate is <img>image_3</img>\n Analyze the contents of the following pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
395
+ "prediction": "Kate, Olivia, and Richard are together at a social gathering. Kate and Olivia engage in a conversation across the table, while Richard seems to be listening intently to Olivia. As the night progresses, Kate turns towards Richard, possibly making a point or sharing a laugh, while Olivia looks on. The atmosphere appears convivial, with Richard's attention primarily on Olivia."
396
+ },
397
+ {
398
+ "id": 79,
399
+ "question": "Kate is <img>image_1</img>\n Richard is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
400
+ "prediction": "Kate stands in the doorway while Richard is lounging on the couch with a glass of wine and a stuffed animal. She moves closer to Richard, who is busy reading a book. Finally, Kate confronts Richard face-to-face, standing right in front of him."
401
+ },
402
+ {
403
+ "id": 80,
404
+ "question": "Richard is <img>image_1</img>\n Olivia is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
405
+ "prediction": "Richard, wearing a blue suit, stands beside a classic car as Olivia, in a black dress, sits in the driver's seat. They then drive away with Richard in the passenger seat and Olivia behind the wheel. After reaching their destination, they exit the car and walk side by side, engaging in conversation."
406
+ },
407
+ {
408
+ "id": 81,
409
+ "question": "Jake is <img>image_1</img>\n Richard is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
410
+ "prediction": "Richard, dressed in a white suit, is seen close to Jake in various settings, initially indoors. They appear to be sharing an affectionate moment, with Richard's focus solely on Jake. As the sequence progresses, Richard seems to be engaging more actively with Jake, perhaps in a playful or instructive manner, while Jake listens attentively."
411
+ },
412
+ {
413
+ "id": 82,
414
+ "question": "Richard is <img>image_1</img>\n Gordon is <img>image_2</img>\n Analyze the contents of the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
415
+ "prediction": "Gordon, dressed in a blue suit, stands facing Richard, who is in a blue blazer and white pants. They appear to have a conversation as Gordon's back is initially turned to Richard. Then, Gordon is seen embracing a woman, Richard looks on. Later, they walk together towards an indoor setting."
416
+ },
417
+ {
418
+ "id": 83,
419
+ "question": "Chris is <img>image_1</img>\n Leon is <img>image_2</img>\n Marie is <img>image_3</img>\n Describe the scene in these pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
420
+ "prediction": "Chris, Leon, and Marie walk down a hospital corridor with Leon leading the way. They encounter an older man lying on the floor, and Chris and Marie stop to interact with him while Leon watches from a distance. Eventually, Chris and Marie embrace the older man, showing signs of comfort and affection."
421
+ },
422
+ {
423
+ "id": 84,
424
+ "question": "Frank is <img>image_1</img>\n Chris is <img>image_2</img>\n Leon is <img>image_3</img>\n What is displayed in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
425
+ "prediction": "Frank, Chris, and Leon are in a living room with Frank sitting on a stool and Chris and Leon lounging on a couch. They seem to be in a casual conversation, with Frank gesturing while talking to Chris. Their discussion appears ongoing as they remain seated, with Leon occasionally joining in from across the room."
426
+ },
427
+ {
428
+ "id": 85,
429
+ "question": "Chris is <img>image_1</img>\n Natalie is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
430
+ "prediction": "Natalie, in a red outfit, sits closely beside Chris, engaging in conversation in a cozy room with a Christmas tree in the background. They shift positions, maintaining their interaction, before Chris stands up, turning away slightly as if in mid-discussion. The sequence ends with Chris leaning against a wall and Natalie sitting on the floor, both appearing to be in a serious exchange."
431
+ },
432
+ {
433
+ "id": 86,
434
+ "question": "Monica is <img>image_1</img>\n In the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What happens to Monica during the sequence of these images?",
435
+ "prediction": "Monica appears to be in a conversation, then becomes visibly distressed, and is finally escorted out of the room by a man."
436
+ },
437
+ {
438
+ "id": 87,
439
+ "question": "Abbey is <img>image_1</img>\n Anthony is <img>image_2</img>\n Tom is <img>image_3</img>\n What is displayed in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
440
+ "prediction": "Tom sits across from Anthony, with Abbey seated next to Tom. They appear to be engaged in a serious conversation. Anthony stands emphatically, seemingly making a point to Tom, while Abbey looks on. The interaction intensifies as Abbey leans in closer, indicating a moment of urgency or importance in their dialogue."
441
+ },
442
+ {
443
+ "id": 88,
444
+ "question": "Tom is <img>image_1</img>\n Anthony is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
445
+ "prediction": "Tom and Anthony engage in a conversation while sitting on a bench, with Tom appearing to be the more animated speaker. They then stand facing each other, continuing their discussion, before Tom seems to listen attentively to Anthony, who is now talking with a gesture indicating emphasis or explanation."
446
+ },
447
+ {
448
+ "id": 89,
449
+ "question": "Haruta Kamijou is <img>image_1</img>\n Chika Homura is <img>image_2</img>\n Analyze the contents of the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
450
+ "prediction": "Haruta Kamijou and Chika Homura start by standing indoors, facing each other with Chika holding a saxophone. They move outside where they walk side by side, both holding papers. Next, they are seen from behind, walking among other students with Chika leading the way. Finally, they stand in front of a table with a poster, engaging in conversation with a group of people around them."
451
+ },
452
+ {
453
+ "id": 90,
454
+ "question": "Chika Homura is <img>image_1</img>\n Can you provide a description of the following images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
455
+ "prediction": "Chika Homura, wearing a school uniform with a tie, is seen in a classroom setting. She appears focused on her task initially, then is shown playing a wind instrument. Later, she is amongst a group of students moving around the room, and finally, she is seen sitting, likely still engaged in a class activity."
456
+ },
457
+ {
458
+ "id": 91,
459
+ "question": "Ji is <img>image_1</img>\n Wang Yi-Chi is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
460
+ "prediction": "Ji and Wang Yi-Chi are initially seen discussing something in a room. The scene shifts, showing Wang looking at a document while Ji appears to be in the midst of a conversation with him. The setting changes to an outdoor auto shop where they continue their interaction; first, under the hood of a car, then standing beside it."
461
+ },
462
+ {
463
+ "id": 92,
464
+ "question": "Maggie is <img>image_1</img>\n Wang Yi-Chi is <img>image_2</img>\n In the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Maggie doing?",
465
+ "prediction": "Maggie is having a conversation with Wang Yi-Chi."
466
+ },
467
+ {
468
+ "id": 93,
469
+ "question": "Maggie is <img>image_1</img>\n Please describe the following pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n.",
470
+ "prediction": "Maggie, seated at a table with a drink in hand, appears to be engaged in a meeting or a discussion. She seems attentive and involved in the ongoing conversation or activity, with her position and posture maintaining throughout the sequence of images."
471
+ },
472
+ {
473
+ "id": 94,
474
+ "question": "Jeff is <img>image_1</img>\n Shawn is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
475
+ "prediction": "Jeff, wearing a purple shirt, is in a room talking on the phone. Meanwhile, Shawn, not far behind, seems to be looking at Jeff, holding a sandwich in his hand. The situation escalates as Jeff sits on a couch while Shawn leans closely towards him from behind, indicating a tense or serious exchange between them."
476
+ },
477
+ {
478
+ "id": 95,
479
+ "question": "Tonya is <img>image_1</img>\n Diane Rawlinson is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
480
+ "prediction": "Tonya, with her back to the camera, appears to be speaking at a press conference. Diane Rawlinson stands beside her, slightly out of focus. The crowd, capturing the attention of the media and onlookers, watches Tonya from behind as she addresses them."
481
+ },
482
+ {
483
+ "id": 96,
484
+ "question": "Momo Adachi is <img>image_1</img>\n Sae Kashiwagi is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
485
+ "prediction": "Momo Adachi and Sae Kashiwagi are in the school cafeteria; Sae is sitting while leaning on her hand, and Momo is sitting across from her. They appear to be engaged in a conversation. Momo then gestures emphatically towards Sae, who looks surprised or amused. Eventually, Sae leans forward, resting her head on her crossed arms, while Momo reaches out to her, maintaining the interaction."
486
+ },
487
+ {
488
+ "id": 97,
489
+ "question": "Momo Adachi is <img>image_1</img>\n Kairi Okayasu is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
490
+ "prediction": "Momo Adachi and Kairi Okayasu are first seen hurriedly walking down a hallway together. They then pause in a classroom, facing each other, with a tension between them palpable. Momo reaches out to touch Kairi's necklace, before the two share a close, intimate moment, with Kairi slightly leaning in towards Momo."
491
+ },
492
+ {
493
+ "id": 98,
494
+ "question": "Momo Adachi is <img>image_1</img>\n Sae Kashiwagi is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
495
+ "prediction": "Momo Adachi, holding an umbrella, walks past Sae Kashiwagi. They exchange looks as Momo continues walking. Eventually, Momo turns to face Sae, presenting something in her hands, while Sae seems to be examining it closely."
496
+ },
497
+ {
498
+ "id": 99,
499
+ "question": "Sae Kashiwagi is <img>image_1</img>\n Toji is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
500
+ "prediction": "Sae Kashiwagi and Toji are initially at a distance, with Sae sitting and Toji standing across from her. Toji then moves closer to Sae, until they are face to face in close proximity. They appear to be engaged in an intense, personal conversation."
501
+ },
502
+ {
503
+ "id": 100,
504
+ "question": "Toji is <img>image_1</img>\n Momo Adachi is <img>image_2</img>\n Kairi Okayasu is <img>image_3</img>\n Tell me about the pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
505
+ "prediction": "Toji, Kairi, and Momo Adachi walk together, with Toji placed between Kairi and Momo. As they continue, Toji looks back, possibly addressing Kairi, who turns her body towards Toji while engaging with him. Eventually, Momo steps forward, moving closer to Toji as they converse."
506
+ },
507
+ {
508
+ "id": 101,
509
+ "question": "Toji is <img>image_1</img>\n Sae Kashiwagi is <img>image_2</img>\n For the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
510
+ "prediction": "Sae Kashiwagi, wearing a school uniform, is first seen sitting while looking to her side. Then, she is observed from behind, standing and walking away from her table. In the final image, Sae is seen walking briskly across the room, moving away from the table where Toji, also in a uniform, remains standing."
511
+ },
512
+ {
513
+ "id": 102,
514
+ "question": "Kairi Okayasu is <img>image_1</img>\n Momo Adachi is <img>image_2</img>\n For the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
515
+ "prediction": "Kairi Okayasu and Momo Adachi are in a kitchen, engaging in a conversation while Momo eats a piece of cake. Kairi, wearing a cooking apron, seems to be in the middle of a conversation as Momo listens and enjoys her cake. The interaction appears to be a casual and friendly conversation over a meal."
516
+ },
517
+ {
518
+ "id": 103,
519
+ "question": "Kairi Okayasu is <img>image_1</img>\n Momo Adachi is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
520
+ "prediction": "Kairi Okayasu and Momo Adachi walk through a crowded area, with Kairi leading the way. They stop their conversation in a park, with Kairi gesturing while talking to Momo. Eventually, they sit together among the flowers, continuing their interaction with Kairi still animatedly speaking."
521
+ },
522
+ {
523
+ "id": 104,
524
+ "question": "Kairi Okayasu is <img>image_1</img>\n Momo Adachi is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
525
+ "prediction": "Kairi Okayasu and Momo Adachi are first seen sitting opposite each other at a cafeteria table, engaged in conversation. They then hold hands while walking through a school campus, showing a close bond. Later, they walk together in a crowded area, still in their school uniforms, continuing their interaction."
526
+ },
527
+ {
528
+ "id": 105,
529
+ "question": "Elise Rainier is <img>image_1</img>\n Imogen Rainier is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
530
+ "prediction": "Elise Rainier stands facing Imogen Rainier, who is initially seen from behind. They appear to be in a serious conversation. Then, Imogen is shown from the front, continuing the dialogue with Elise. The scene concludes with both women, now older, standing in the background as emergency responders attend to an incident in the foreground."
531
+ },
532
+ {
533
+ "id": 106,
534
+ "question": "Dan Zhang is <img>image_1</img>\n Red Ye is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
535
+ "prediction": "Dan Zhang, standing beside a motorcycle, presents a document to Red Ye, who is facing away from him. Red then turns to accept the paper, and they continue to engage with each other closely, before Red rides away on the motorcycle while Dan watches."
536
+ },
537
+ {
538
+ "id": 107,
539
+ "question": "Dan Zhang is <img>image_1</img>\n Po Chen is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, What actions does Dan Zhang take?",
540
+ "prediction": "Dan Zhang starts by walking alongside Po Chen, then he is seen waiting at a bus stop, and finally, he rides the subway."
541
+ },
542
+ {
543
+ "id": 108,
544
+ "question": "King Kong is <img>image_1</img>\n Dan Zhang is <img>image_2</img>\n Can you provide a description of the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
545
+ "prediction": "Dan Zhang faces King Kong in a tense confrontation. Kong stands confidently while Dan is more reserved. As the situation escalates, Dan seems to be arguing or presenting a case to a back turned Kong. The final moment shows Dan taking a defensive stance with a gun pointed at Kong, indicating a standoff."
546
+ },
547
+ {
548
+ "id": 109,
549
+ "question": "Po Chen is <img>image_1</img>\n Red Ye is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
550
+ "prediction": "Po Chen and Red Ye start off standing side by side on a bridge, then Po turns to face Red as they engage in a conversation. The interaction becomes more intimate as they lean in close, and eventually, they sit side by side again, continuing their dialogue."
551
+ },
552
+ {
553
+ "id": 110,
554
+ "question": "Arata Kaizaki is <img>image_1</img>\n Chizuru Hishiro is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n, give a description.",
555
+ "prediction": "Arata Kaizaki and Chizuru Hishiro are in a classroom with Arata walking in from the back, approaching Chizuru who is sitting at her desk. They then have a conversation, with Arata leaning towards Chizuru, who is listening attentively while holding a red paper."
556
+ },
557
+ {
558
+ "id": 111,
559
+ "question": "Arata Kaizaki is <img>image_1</img>\n Rena Kariu is <img>image_2</img>\n Kazuomi Oga is <img>image_3</img>\n Please give a summary of the images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
560
+ "prediction": "Arata Kaizaki is seen sitting at a desk, turning around to face Rena Kariu and Kazuomi Oga who are sitting across from him. They seem to be engaged in a conversation as Arata gestures with his hands while talking to Rena. The interaction appears to be in a classroom setting with other students around them."
561
+ },
562
+ {
563
+ "id": 112,
564
+ "question": "Chizuru Hishiro is <img>image_1</img>\n Arata Kaizaki is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
565
+ "prediction": "Chizuru Hishiro and Arata Kaizaki are seen walking through a school cafeteria. They face each other, engaging in a conversation. Arata hands something to Chizuru, who looks surprised. They continue to interact, with Chizuru appearing to be explaining something to Arata."
566
+ },
567
+ {
568
+ "id": 113,
569
+ "question": "Arata Kaizaki is <img>image_1</img>\n Chizuru Hishiro is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
570
+ "prediction": "Arata Kaizaki and Chizuru Hishiro are first seen at lockers; Arata is busying himself while Chizuru stands by. They move to a park where they sit on a bench; Arata looks away, seemingly preoccupied. Chizuru stands upright, facing Arata. Finally, Arata stands, gesturing towards Chizuru, who stands at attention."
571
+ },
572
+ {
573
+ "id": 114,
574
+ "question": "Rena Kariu is <img>image_1</img>\n Kazuomi Oga is <img>image_2</img>\n Chizuru Hishiro is <img>image_3</img>\n What do you see in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
575
+ "prediction": "Rena Kariu stands at the bottom of a staircase, looking up, while Chizuru Hishiro and Kazuomi Oga are on the stairs ahead of her. Chizuru and Oga engage in a conversation as Rena observes them. Then, Rena joins them, walking up the stairs to join Chizuru and Oga who wait for her."
576
+ },
577
+ {
578
+ "id": 115,
579
+ "question": "Chizuru Hishiro is <img>image_1</img>\n Arata Kaizaki is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
580
+ "prediction": "Chizuru Hishiro, dressed in a school uniform, is seen close to Arata Kaizaki. Initially, she is behind him, but then she moves closer, even holding a bowl while facing him. Their interaction suggests a service or care environment, with Chizuru in a attentive stance towards Arata."
581
+ },
582
+ {
583
+ "id": 116,
584
+ "question": "Arata Kaizaki is <img>image_1</img>\n Kazuomi Oga is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
585
+ "prediction": "Arata Kaizaki and Kazuomi Oga are first seen in a study setting where Arata is focused on writing, and Kazuomi appears to be taking a photo or checking his phone. They later sit across from each other at a table in a caf\u00e9, engaging in conversation. Finally, they face each other in a more formal setting, continuing their interaction."
586
+ },
587
+ {
588
+ "id": 117,
589
+ "question": "Arata Kaizaki is <img>image_1</img>\n Chizuru Hishiro is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
590
+ "prediction": "Arata Kaizaki and Chizuru Hishiro start inside a building, standing in a line with others, seemingly waiting. They move outdoors, where Arata sits on the ground among a group, engaging in a picnic atmosphere. Finally, they stand alone, facing forward with a trees background, indicating a shift from group to individual moments."
591
+ },
592
+ {
593
+ "id": 118,
594
+ "question": "Anna Karenina is <img>image_1</img>\n Kitty is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
595
+ "prediction": "Anna Karenina and Kitty engage in a conversation, with Anna listening intently to Kitty. The setting shifts to a grand staircase where Kitty, in a white dress, seems to instruct Anna, leading her by the hand. They move closer, with Anna holding onto Kitty's wrist, suggesting a moment of guidance or preparation for a dance or social event."
596
+ },
597
+ {
598
+ "id": 119,
599
+ "question": "Oblonsky is <img>image_1</img>\n Anna Karenina is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
600
+ "prediction": "Oblonsky and Anna Karenina are seen together in different settings, initially close to each other under a fur-lined umbrella, then walking hand in hand into a grand event. At the event, they are initially apart, with Anna appearing to call attention to herself. However, they soon find each other again, Anna sitting while Oblonsky stands beside her, engaging her with a gesture."
601
+ },
602
+ {
603
+ "id": 120,
604
+ "question": "Vronsky is <img>image_1</img>\n Kitty is <img>image_2</img>\n For the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
605
+ "prediction": "Vronsky, dressed in a white military uniform, stands on a balcony with other officers, while Kitty, wearing a light dress, stands nearby. They move through a crowded ballroom, where Vronsky gently takes Kitty's hand, guiding her among the dancing guests. They are seen in close proximity, seemingly engaged in a private moment amidst the bustling event."
606
+ },
607
+ {
608
+ "id": 121,
609
+ "question": "Irving Rosenfeld is <img>image_1</img>\n Sydney Prosser is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
610
+ "prediction": "Sydney Prosser and Irving Rosenfeld are perusing through a selection of fur coats in a warehouse. They move closer to each other, with Sydney examining a coat and Irving observing her choice. Then, they stand side by side, with Sydney adjusting a coat for Irving to try on. They appear to be engaged in a collaboration related to the coats."
611
+ },
612
+ {
613
+ "id": 122,
614
+ "question": "Rosalyn Rosenfeld is <img>image_1</img>\n Irving Rosenfeld is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
615
+ "prediction": "Rosalyn and Irving Rosenfeld engage in a conversation in a kitchen; Rosalyn appears animated while Irving listens intently. They switch positions, with Irving now speaking into a phone held to his ear and Rosalyn watching him. Their conversation continues, with Rosalyn turning towards Irving, who is still on the phone, seemingly deep in discussion."
616
+ },
617
+ {
618
+ "id": 123,
619
+ "question": "Richie DiMaso is <img>image_1</img>\n Irving Rosenfeld is <img>image_2</img>\n Stoddard Thorsen is <img>image_3</img>\n Analyze the contents of the following pictures <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
620
+ "prediction": "Richie DiMaso and Irving Rosenfeld are engaged in a discussion while Stoddard Thorsen listens. As the conversation progresses, Richie becomes more animated, gesturing with his hands. Eventually, Richie leans in towards Stoddard, emphasizing his point, and then seen physically restraining Stoddard against a wall."
621
+ },
622
+ {
623
+ "id": 124,
624
+ "question": "Razieh is <img>image_1</img>\n Nader is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
625
+ "prediction": "Nader is in the kitchen with Razieh, who seems to be caring for someone. They are engaged in a daily routine, possibly preparing food or cleaning up. As they work, Razieh's attention shifts towards Nader, who is standing by observing. Their interaction appears to be casual, with Nader checking in or providing guidance as Razieh performs her tasks."
626
+ },
627
+ {
628
+ "id": 125,
629
+ "question": "Simin is <img>image_1</img>\n Nader is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
630
+ "prediction": "Simin is busy chopping vegetables in the kitchen while Nader is observed from a distance. Shortly after, Nader moves closer and appears to be peeking around a corner. The two then stand close together, facing a common direction, and eventually, Simin is seen engaged in a phone conversation with Nader sitting beside her."
631
+ },
632
+ {
633
+ "id": 126,
634
+ "question": "Simin is <img>image_1</img>\n Nader is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
635
+ "prediction": "Simin and Nader are together in a kitchen; Simin is turned away from Nader, who seems to be working at the sink. In the next moment, Nader is seen from behind, facing a dish rack near the sink while Simin is not in the frame. Then, they are seen through a window, sitting opposite each other with Simin appearing engaged in a task and Nader gesturing with his hand. Finally, they are indoors, with Simin focused on writing or drawing at a table while Nader is partially visible in the foreground, seemingly in conversation with Simin."
636
+ },
637
+ {
638
+ "id": 127,
639
+ "question": "Tris is <img>image_1</img>\n Christina is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n and describe their content.",
640
+ "prediction": "Tris and Christina exit a train, moving quickly onto a rooftop. They then navigate cautiously over the rooftop, with Tris leading the way and Christina following close behind. Eventually, they face each other, with Tris turning to include Christina in a group discussion."
641
+ },
642
+ {
643
+ "id": 128,
644
+ "question": "Tris is <img>image_1</img>\n Peter is <img>image_2</img>\n Please give a summary of the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
645
+ "prediction": "Peter and Tris are initially seen side by side, seemingly in a serious conversation. The scene shifts to a training session where both are engaged in a physical activity, displaying a focus on their interaction. Later, they are seen walking together with a group, suggesting a sense of belonging and coordination among them."
646
+ },
647
+ {
648
+ "id": 129,
649
+ "question": "Tris is <img>image_1</img>\n Christina is <img>image_2</img>\n Peter is <img>image_3</img>\n Please provide an overview of the visual information in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
650
+ "prediction": "Initially, Tris, Christina, and Peter stand side by side, with Tris and Peter mostly in profile views and Christina facing forward. The scene shifts to a more intimate setting where Tris is lying down while Peter is seated beside her, looking at Tris. Finally, the group is seen again, with Tris standing in the center and Peter to her left, both facing forward. Christina is not in the foreground but is present in the background."
651
+ },
652
+ {
653
+ "id": 130,
654
+ "question": "Four is <img>image_1</img>\n Tris is <img>image_2</img>\n Eric is <img>image_3</img>\n What do you see in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n?",
655
+ "prediction": "Eric, Tris, and Four stand together amidst a crowd, with Eric on the left and Four to the right of Tris. They seem to be in a tense situation, as Eric is later seen holding a gun, with Four closely watching from the side. In the final scene, they stand side by side, surrounded by others, displaying a sense of unity and possibly preparation for an event."
656
+ },
657
+ {
658
+ "id": 131,
659
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n Please provide an overview of the visual information in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
660
+ "prediction": "Imamura stands on a rooftop, looking down, while Wakaba, seated further back, looks towards him. In the following moments, Wakaba gets closer to the edge, seemingly in a precarious position, as Imamura watches from afar, maintaining his focus on her."
661
+ },
662
+ {
663
+ "id": 132,
664
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
665
+ "prediction": "Wakaba, seated with hands clasped, converses with Imamura who is sitting beside her. Later, they walk together through a store with other patrons in the background. Eventually, they sit together at a table with Imamura turning his head towards Wakaba, continuing their interaction."
666
+ },
667
+ {
668
+ "id": 133,
669
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n Examine the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n and describe their content.",
670
+ "prediction": "Imamura is first seen crouched down, seemingly focused on an activity on the floor. Soon after, Wakaba appears in the room, sitting at a table while Imamura is still on the floor. The sequence concludes with both Imamura and Wakaba sitting side by side, engaged in a conversation."
671
+ },
672
+ {
673
+ "id": 134,
674
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
675
+ "prediction": "Imamura and Wakaba walk through a hallway together; Wakaba holds a piece of paper. Imamura looks at her, then points at something above him as they continue their walk."
676
+ },
677
+ {
678
+ "id": 135,
679
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n For the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
680
+ "prediction": "Wakaba and Imamura are seated in a car with another person. Initially, Wakaba is focused on reading a book while Imamura appears to be looking at her. Then, Imamura is seen eating chips and conversing with Wakaba, who remains attentive to her book. They continue their journey, with Imamura occasionally looking away but generally engaged in the interaction."
681
+ },
682
+ {
683
+ "id": 136,
684
+ "question": "Imamura is <img>image_1</img>\n Wakaba is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
685
+ "prediction": "Imamura and Wakaba are seated next to each other among a crowd, initially looking forward. Wakaba then claps her hands enthusiastically while Imamura looks on. Subsequently, both turn their heads towards each other, engaging in a conversation or interaction, with Wakaba holding a red program."
686
+ },
687
+ {
688
+ "id": 137,
689
+ "question": "Utah is <img>image_1</img>\n In the following pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Utah doing?",
690
+ "prediction": "Utah starts by steering the boat, then he prepares to jump on a jet ski, and finally, he is seen holding onto a rope, likely waiting to ride the jet ski."
691
+ },
692
+ {
693
+ "id": 138,
694
+ "question": "Utah is <img>image_1</img>\n Bodhi is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
695
+ "prediction": "Utah and Bodhi stand side by side in a rugged landscape, with a vehicle behind them. They then appear inside a car, with Utah in the driver's seat and Bodhi as the passenger, driving through what seems to be a stormy environment. Subsequently, they are seen together with a sense of urgency, as Bodhi appears to support Utah, who seems to be in distress, while moving off the vehicle."
696
+ },
697
+ {
698
+ "id": 139,
699
+ "question": "Utah is <img>image_1</img>\n Pappas is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
700
+ "prediction": "Utah and Pappas are initially inside, with Utah in the foreground and Pappas behind. They move outside, where Utah is seen being pushed out of a car while Pappas stands by. Subsequently, both are shown sitting in a car with Pappas driving and Utah as a passenger. The sequence ends with Utah leaning out of the car window, apparently using a phone while Pappas watches."
701
+ },
702
+ {
703
+ "id": 140,
704
+ "question": "Zhang Xuening is <img>image_1</img>\n In the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Zhang Xuening doing?",
705
+ "prediction": "Zhang Xuening is socializing at a party, initially standing alone, then she is seen conversing with a man, and finally, she is kneeling down while talking to another guest."
706
+ },
707
+ {
708
+ "id": 141,
709
+ "question": "Zhang Xuening is <img>image_1</img>\n Provide a description for the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
710
+ "prediction": "Zhang Xuening, dressed in a red dress, is seen at a social gathering. Initially, she is engaged in a conversation with guests. Later, she is captured from behind, walking away from the group. Eventually, Zhang is shown in a close-up, facing someone, possibly continuing an exchange."
711
+ },
712
+ {
713
+ "id": 142,
714
+ "question": "He Bing is <img>image_1</img>\n Zhang Xuening is <img>image_2</img>\n Please explain what's happening in the photos <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
715
+ "prediction": "He Bing and Zhang Xuening are seated opposite each other in a train compartment, engaged in a conversation. He extends his hand towards her, possibly making a point or gesturing during their discussion. Later, the setting shifts and they appear in a dimly lit room, with He Bing facing towards Zhang Xuening, suggesting a continuation of their dialogue or a more intense exchange. Eventually, they stand in a crowded space, maintaining a distance from each other, indicating a shift in their interaction or environment."
716
+ },
717
+ {
718
+ "id": 143,
719
+ "question": "He Bing is <img>image_1</img>\n Zhang Xuening is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
720
+ "prediction": "He Bing sits beside Zhang Xuening's bed, looking at her as she lies asleep. The next day, he stands by her bedside, watching her while she lies propped up by pillows. Eventually, Zhang Xuening is seen holding a glass, with He Bing standing at her doorway, looking back at her. He then leans closer, talking to her as he remains standing by the bedside."
721
+ },
722
+ {
723
+ "id": 144,
724
+ "question": "Zhang Xuening is <img>image_1</img>\n Please provide an overview of the visual information in these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
725
+ "prediction": "Zhang Xuening, dressed in a brown suit, is initially seated and engaged in a conversation. She then stands up, facing a standing man, seemingly continuing the discussion. Eventually, she is seen with her hand over her mouth, indicating a change in the interaction's intensity or a moment of reflection."
726
+ },
727
+ {
728
+ "id": 145,
729
+ "question": "Shen Jing is <img>image_1</img>\n He Bing is <img>image_2</img>\n Provide a description for the images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
730
+ "prediction": "Shen Jing stands facing He Bing, who is seated and appears to be in a conversation with her. They then sit opposite each other, continuing their interaction. The scene progresses with them sitting closely on a bed, seemingly in a serious discussion."
731
+ },
732
+ {
733
+ "id": 146,
734
+ "question": "Claude Verneuil is <img>image_1</img>\n Marie Verneuil is <img>image_2</img>\n In the following images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n, give a description.",
735
+ "prediction": "Claude and Marie Verneuil are initially seen conversing with drinks in hand, standing in a room. The setting then shifts to a dining scene where Claude is seen speaking while gesturing with his hands, while Marie listens attentively. The sequence ends with the couple in a car, suggesting they have left the house together."
736
+ },
737
+ {
738
+ "id": 147,
739
+ "question": "David Benichou is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n What is David doing?",
740
+ "prediction": "David is initially seen extending his arm out, then walking beside a woman, later leaning slightly towards her, and finally crouching or kneeling beside a baby carriage."
741
+ },
742
+ {
743
+ "id": 148,
744
+ "question": "Claude Verneuil is <img>image_1</img>\n Marie Verneuil is <img>image_2</img>\n Please describe the following pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
745
+ "prediction": "Claude Verneuil, dressed in a shirt and tie, engages in a conversation with Marie Verneuil, who is holding a plate of food. As they talk, Marie listens and gestures with the plate, while Claude responds. Eventually, they walk together towards a building in the distance."
746
+ },
747
+ {
748
+ "id": 149,
749
+ "question": "Claude Verneuil is <img>image_1</img>\n Marie Verneuil is <img>image_2</img>\n What is displayed in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n?",
750
+ "prediction": "Claude Verneuil is initially seated at a desk, facing away from Marie Verneuil, who stands in the doorway. Claude then turns to greet Marie with a handshake, welcoming her inside. Subsequently, Claude and Marie sit closely together on a bench outside, appearing to have a conversation."
751
+ },
752
+ {
753
+ "id": 150,
754
+ "question": "Claude Verneuil is <img>image_1</img>\n Marie Verneuil is <img>image_2</img>\n Explain the content of these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
755
+ "prediction": "Claude Verneuil and Marie Verneuil are initially engaged in a conversation over food in a warmly lit kitchen. They then move through a hallway where Claude is adjusting a picture frame, with Marie looking on. Subsequently, they appear together with Claude holding a bottle, suggesting a social gathering. Finally, they stand among a group of people, continuing to converse."
756
+ },
757
+ {
758
+ "id": 151,
759
+ "question": "Marie Verneuil is <img>image_1</img>\n Claude Verneuil is <img>image_2</img>\n Tell me about the pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
760
+ "prediction": "Marie and Claude Verneuil are first seen sitting on a couch with Marie reading a magazine and Claude looking relaxed. Their positions shift slightly as they continue their interaction, but they remain on the couch. Later, they stand together, engaging in conversation with a visiting couple, before returning to the couch for a more formal setting with teacups in sight, maintaining their earlier relaxed demeanor."
761
+ },
762
+ {
763
+ "id": 152,
764
+ "question": "Claude Verneuil is <img>image_1</img>\n In these images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n. What is Claude doing?",
765
+ "prediction": "Claude is initially carrying a chicken, then he walks down the aisle of a train, seems to be arguing with someone, and finally appears to be pushing that person."
766
+ },
767
+ {
768
+ "id": 153,
769
+ "question": "Anna Howland-Jones is <img>image_1</img>\n Alice Howland is <img>image_2</img>\n John Howland is <img>image_3</img>\n Please provide an overview of the visual information in these images <img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n<img>image_7</img>\n.",
770
+ "prediction": "Alice and Anna Howland stand close, conversing intimately, while John is seen in the background. The atmosphere changes as John moves into the background, appearing to prepare something as Alice and Anna look on. Later, all three are focused on a kitchen counter, with John cooking and Anna and Alice leaning over the table. The scene shifts to a hospital room, where John stands near a seated Anna and Alice, who is holding a baby."
771
+ },
772
+ {
773
+ "id": 154,
774
+ "question": "Gail Harris is <img>image_1</img>\n Tell me about the pictures <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n.",
775
+ "prediction": "Gail Harris starts in the kitchen, preparing at the table with children nearby. She continues her task, moving around the kitchen as the children seem engaged in conversation or play. The scene shifts to a street where Gail, carrying something, walks with purpose amidst vintage cars. Finally, she is seen inside a vehicle, perhaps ready to drive away with the children."
776
+ },
777
+ {
778
+ "id": 155,
779
+ "question": "Gail Harris is <img>image_1</img>\n J. Paul Getty is <img>image_2</img>\n What do you see in these images <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n?",
780
+ "prediction": "Gail Harris and J. Paul Getty engage in a conversation, facing each other. The setting shifts, showing them from a different angle as they continue their discussion. They remain in close proximity throughout, indicating a serious and intimate exchange between the two."
781
+ },
782
+ {
783
+ "id": 156,
784
+ "question": "J. Paul Getty is <img>image_1</img>\n Provide a description for the images <img>image_2</img>\n<img>image_3</img>\n<img>image_4</img>\n.",
785
+ "prediction": "J. Paul Getty is first seen in a leisurely setting, standing by a table filled with food and drinks. He then appears in a stark, outdoor scene, standing dominant in the frame. Finally, Getty is shown in a touching moment, holding hands with an unnamed child, as they walk together amidst the cold."
786
+ },
787
+ {
788
+ "id": 157,
789
+ "question": "J. Paul Getty is <img>image_1</img>\n Fletcher Chace is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
790
+ "prediction": "J. Paul Getty and Fletcher Chace are first seen seated indoors with Chace's hands clasped on a table while Getty sits with his hands together. They are then outside, with Getty standing and Chace partially obscured behind him. In the following scene, Chace walks ahead while Getty sits on a bench with a cane. Finally, they are seen together outdoors, with Chace looking ahead and Getty in front of him. Throughout these moments, their positions relative to each other suggest a dynamic conversation or interaction, starting indoors and moving outside."
791
+ },
792
+ {
793
+ "id": 158,
794
+ "question": "Gail Harris is <img>image_1</img>\n Fletcher Chace is <img>image_2</img>\n Describe the scene in these pictures <img>image_3</img>\n<img>image_4</img>\n<img>image_5</img>\n<img>image_6</img>\n.",
795
+ "prediction": "Gail Harris and Fletcher Chace walk through an airport arrival area together, followed by a media scrum. Chace is seen speaking to the media as Harris stands beside him. In a subsequent moment, Chace seems to be leading Harris away from the cameras, suggesting a departure or an end to the media interaction."
796
+ }
797
+ ]
weights/model-idf/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e43a8429bf7629454f0d530e5112fea1dab265c3d8bb046bb91e8f2aa81c07a4
3
+ size 9969772092
weights/model-idf/pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56d7dfbdffabe1fb21e008bca646991286f0438aaa2aed2cd5925ed269d72480
3
+ size 9478651117
weights/model-idf/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,864 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 19448120832
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00002-of-00002.bin",
7
+ "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
8
+ "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
9
+ "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
10
+ "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00002.bin",
11
+ "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00002.bin",
12
+ "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
13
+ "transformer.h.0.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
14
+ "transformer.h.0.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
15
+ "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
16
+ "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
17
+ "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
18
+ "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00002.bin",
19
+ "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00002.bin",
20
+ "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
21
+ "transformer.h.1.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
22
+ "transformer.h.1.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
23
+ "transformer.h.10.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
24
+ "transformer.h.10.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
25
+ "transformer.h.10.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
26
+ "transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00002.bin",
27
+ "transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00002.bin",
28
+ "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
29
+ "transformer.h.10.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
30
+ "transformer.h.10.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
31
+ "transformer.h.11.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
32
+ "transformer.h.11.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
33
+ "transformer.h.11.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
34
+ "transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00002.bin",
35
+ "transformer.h.11.ln_2.weight": "pytorch_model-00001-of-00002.bin",
36
+ "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
37
+ "transformer.h.11.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
38
+ "transformer.h.11.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
39
+ "transformer.h.12.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
40
+ "transformer.h.12.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
41
+ "transformer.h.12.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
42
+ "transformer.h.12.ln_1.weight": "pytorch_model-00001-of-00002.bin",
43
+ "transformer.h.12.ln_2.weight": "pytorch_model-00001-of-00002.bin",
44
+ "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
45
+ "transformer.h.12.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
46
+ "transformer.h.12.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
47
+ "transformer.h.13.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
48
+ "transformer.h.13.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
49
+ "transformer.h.13.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
50
+ "transformer.h.13.ln_1.weight": "pytorch_model-00001-of-00002.bin",
51
+ "transformer.h.13.ln_2.weight": "pytorch_model-00001-of-00002.bin",
52
+ "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
53
+ "transformer.h.13.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
54
+ "transformer.h.13.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
55
+ "transformer.h.14.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
56
+ "transformer.h.14.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
57
+ "transformer.h.14.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
58
+ "transformer.h.14.ln_1.weight": "pytorch_model-00001-of-00002.bin",
59
+ "transformer.h.14.ln_2.weight": "pytorch_model-00001-of-00002.bin",
60
+ "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
61
+ "transformer.h.14.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
62
+ "transformer.h.14.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
63
+ "transformer.h.15.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
64
+ "transformer.h.15.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
65
+ "transformer.h.15.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
66
+ "transformer.h.15.ln_1.weight": "pytorch_model-00001-of-00002.bin",
67
+ "transformer.h.15.ln_2.weight": "pytorch_model-00001-of-00002.bin",
68
+ "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
69
+ "transformer.h.15.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
70
+ "transformer.h.15.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
71
+ "transformer.h.16.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
72
+ "transformer.h.16.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
73
+ "transformer.h.16.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
74
+ "transformer.h.16.ln_1.weight": "pytorch_model-00001-of-00002.bin",
75
+ "transformer.h.16.ln_2.weight": "pytorch_model-00001-of-00002.bin",
76
+ "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
77
+ "transformer.h.16.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
78
+ "transformer.h.16.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
79
+ "transformer.h.17.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
80
+ "transformer.h.17.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
81
+ "transformer.h.17.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
82
+ "transformer.h.17.ln_1.weight": "pytorch_model-00001-of-00002.bin",
83
+ "transformer.h.17.ln_2.weight": "pytorch_model-00001-of-00002.bin",
84
+ "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
85
+ "transformer.h.17.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
86
+ "transformer.h.17.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
87
+ "transformer.h.18.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
88
+ "transformer.h.18.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
89
+ "transformer.h.18.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
90
+ "transformer.h.18.ln_1.weight": "pytorch_model-00001-of-00002.bin",
91
+ "transformer.h.18.ln_2.weight": "pytorch_model-00001-of-00002.bin",
92
+ "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
93
+ "transformer.h.18.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
94
+ "transformer.h.18.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
95
+ "transformer.h.19.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
96
+ "transformer.h.19.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
97
+ "transformer.h.19.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "transformer.h.19.ln_1.weight": "pytorch_model-00001-of-00002.bin",
99
+ "transformer.h.19.ln_2.weight": "pytorch_model-00001-of-00002.bin",
100
+ "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
101
+ "transformer.h.19.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
102
+ "transformer.h.19.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
103
+ "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
104
+ "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
105
+ "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
106
+ "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00002.bin",
107
+ "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00002.bin",
108
+ "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
109
+ "transformer.h.2.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
110
+ "transformer.h.2.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
111
+ "transformer.h.20.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
112
+ "transformer.h.20.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
113
+ "transformer.h.20.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
114
+ "transformer.h.20.ln_1.weight": "pytorch_model-00001-of-00002.bin",
115
+ "transformer.h.20.ln_2.weight": "pytorch_model-00001-of-00002.bin",
116
+ "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
117
+ "transformer.h.20.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
118
+ "transformer.h.20.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
119
+ "transformer.h.21.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
120
+ "transformer.h.21.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
121
+ "transformer.h.21.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
122
+ "transformer.h.21.ln_1.weight": "pytorch_model-00001-of-00002.bin",
123
+ "transformer.h.21.ln_2.weight": "pytorch_model-00001-of-00002.bin",
124
+ "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
125
+ "transformer.h.21.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
126
+ "transformer.h.21.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
127
+ "transformer.h.22.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
128
+ "transformer.h.22.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
129
+ "transformer.h.22.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
130
+ "transformer.h.22.ln_1.weight": "pytorch_model-00002-of-00002.bin",
131
+ "transformer.h.22.ln_2.weight": "pytorch_model-00002-of-00002.bin",
132
+ "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
133
+ "transformer.h.22.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
134
+ "transformer.h.22.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
135
+ "transformer.h.23.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
136
+ "transformer.h.23.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
137
+ "transformer.h.23.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
138
+ "transformer.h.23.ln_1.weight": "pytorch_model-00002-of-00002.bin",
139
+ "transformer.h.23.ln_2.weight": "pytorch_model-00002-of-00002.bin",
140
+ "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
141
+ "transformer.h.23.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
142
+ "transformer.h.23.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
143
+ "transformer.h.24.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
144
+ "transformer.h.24.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
145
+ "transformer.h.24.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
146
+ "transformer.h.24.ln_1.weight": "pytorch_model-00002-of-00002.bin",
147
+ "transformer.h.24.ln_2.weight": "pytorch_model-00002-of-00002.bin",
148
+ "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
149
+ "transformer.h.24.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
150
+ "transformer.h.24.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
151
+ "transformer.h.25.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
152
+ "transformer.h.25.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
153
+ "transformer.h.25.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
154
+ "transformer.h.25.ln_1.weight": "pytorch_model-00002-of-00002.bin",
155
+ "transformer.h.25.ln_2.weight": "pytorch_model-00002-of-00002.bin",
156
+ "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
157
+ "transformer.h.25.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
158
+ "transformer.h.25.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
159
+ "transformer.h.26.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
160
+ "transformer.h.26.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
161
+ "transformer.h.26.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
162
+ "transformer.h.26.ln_1.weight": "pytorch_model-00002-of-00002.bin",
163
+ "transformer.h.26.ln_2.weight": "pytorch_model-00002-of-00002.bin",
164
+ "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
165
+ "transformer.h.26.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
166
+ "transformer.h.26.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
167
+ "transformer.h.27.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
168
+ "transformer.h.27.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
169
+ "transformer.h.27.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
170
+ "transformer.h.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
171
+ "transformer.h.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
172
+ "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
173
+ "transformer.h.27.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
174
+ "transformer.h.27.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
175
+ "transformer.h.28.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
176
+ "transformer.h.28.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
177
+ "transformer.h.28.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
178
+ "transformer.h.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
179
+ "transformer.h.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
180
+ "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
181
+ "transformer.h.28.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
182
+ "transformer.h.28.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
183
+ "transformer.h.29.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
184
+ "transformer.h.29.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
185
+ "transformer.h.29.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
186
+ "transformer.h.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
187
+ "transformer.h.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
188
+ "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
189
+ "transformer.h.29.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
190
+ "transformer.h.29.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
191
+ "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
192
+ "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
193
+ "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
194
+ "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00002.bin",
195
+ "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00002.bin",
196
+ "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
197
+ "transformer.h.3.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
198
+ "transformer.h.3.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
199
+ "transformer.h.30.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
200
+ "transformer.h.30.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
201
+ "transformer.h.30.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
202
+ "transformer.h.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
203
+ "transformer.h.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
204
+ "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
205
+ "transformer.h.30.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
206
+ "transformer.h.30.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
207
+ "transformer.h.31.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
208
+ "transformer.h.31.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
209
+ "transformer.h.31.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
210
+ "transformer.h.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
211
+ "transformer.h.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
212
+ "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
213
+ "transformer.h.31.mlp.w1.weight": "pytorch_model-00002-of-00002.bin",
214
+ "transformer.h.31.mlp.w2.weight": "pytorch_model-00002-of-00002.bin",
215
+ "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
216
+ "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
217
+ "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
218
+ "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00002.bin",
219
+ "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00002.bin",
220
+ "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
221
+ "transformer.h.4.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
222
+ "transformer.h.4.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
223
+ "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
224
+ "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
225
+ "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
226
+ "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00002.bin",
227
+ "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00002.bin",
228
+ "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
229
+ "transformer.h.5.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
230
+ "transformer.h.5.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
231
+ "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
232
+ "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
233
+ "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
234
+ "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00002.bin",
235
+ "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00002.bin",
236
+ "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
237
+ "transformer.h.6.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
238
+ "transformer.h.6.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
239
+ "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
240
+ "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
241
+ "transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
242
+ "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00002.bin",
243
+ "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00002.bin",
244
+ "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "transformer.h.7.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
246
+ "transformer.h.7.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
247
+ "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
248
+ "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
249
+ "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
250
+ "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00002.bin",
251
+ "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00002.bin",
252
+ "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
253
+ "transformer.h.8.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
254
+ "transformer.h.8.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
255
+ "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
256
+ "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
257
+ "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
258
+ "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00002.bin",
259
+ "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00002.bin",
260
+ "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
261
+ "transformer.h.9.mlp.w1.weight": "pytorch_model-00001-of-00002.bin",
262
+ "transformer.h.9.mlp.w2.weight": "pytorch_model-00001-of-00002.bin",
263
+ "transformer.ln_f.weight": "pytorch_model-00002-of-00002.bin",
264
+ "transformer.visual.attn_pool.attn.in_proj_bias": "pytorch_model-00002-of-00002.bin",
265
+ "transformer.visual.attn_pool.attn.in_proj_weight": "pytorch_model-00002-of-00002.bin",
266
+ "transformer.visual.attn_pool.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
267
+ "transformer.visual.attn_pool.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
268
+ "transformer.visual.attn_pool.attn1.in_proj_bias": "pytorch_model-00002-of-00002.bin",
269
+ "transformer.visual.attn_pool.attn1.in_proj_weight": "pytorch_model-00002-of-00002.bin",
270
+ "transformer.visual.attn_pool.attn1.out_proj.bias": "pytorch_model-00002-of-00002.bin",
271
+ "transformer.visual.attn_pool.attn1.out_proj.weight": "pytorch_model-00002-of-00002.bin",
272
+ "transformer.visual.attn_pool.kv_proj.weight": "pytorch_model-00002-of-00002.bin",
273
+ "transformer.visual.attn_pool.ln_kv.bias": "pytorch_model-00002-of-00002.bin",
274
+ "transformer.visual.attn_pool.ln_kv.weight": "pytorch_model-00002-of-00002.bin",
275
+ "transformer.visual.attn_pool.ln_q.bias": "pytorch_model-00002-of-00002.bin",
276
+ "transformer.visual.attn_pool.ln_q.weight": "pytorch_model-00002-of-00002.bin",
277
+ "transformer.visual.attn_pool.pos_embed": "pytorch_model-00002-of-00002.bin",
278
+ "transformer.visual.attn_pool.query": "pytorch_model-00002-of-00002.bin",
279
+ "transformer.visual.conv1.weight": "pytorch_model-00002-of-00002.bin",
280
+ "transformer.visual.ln_post.bias": "pytorch_model-00002-of-00002.bin",
281
+ "transformer.visual.ln_post.weight": "pytorch_model-00002-of-00002.bin",
282
+ "transformer.visual.ln_pre.bias": "pytorch_model-00002-of-00002.bin",
283
+ "transformer.visual.ln_pre.weight": "pytorch_model-00002-of-00002.bin",
284
+ "transformer.visual.positional_embedding": "pytorch_model-00002-of-00002.bin",
285
+ "transformer.visual.proj": "pytorch_model-00002-of-00002.bin",
286
+ "transformer.visual.transformer.resblocks.0.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
287
+ "transformer.visual.transformer.resblocks.0.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
288
+ "transformer.visual.transformer.resblocks.0.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
289
+ "transformer.visual.transformer.resblocks.0.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
290
+ "transformer.visual.transformer.resblocks.0.ln_1.bias": "pytorch_model-00002-of-00002.bin",
291
+ "transformer.visual.transformer.resblocks.0.ln_1.weight": "pytorch_model-00002-of-00002.bin",
292
+ "transformer.visual.transformer.resblocks.0.ln_2.bias": "pytorch_model-00002-of-00002.bin",
293
+ "transformer.visual.transformer.resblocks.0.ln_2.weight": "pytorch_model-00002-of-00002.bin",
294
+ "transformer.visual.transformer.resblocks.0.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
295
+ "transformer.visual.transformer.resblocks.0.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
296
+ "transformer.visual.transformer.resblocks.0.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
297
+ "transformer.visual.transformer.resblocks.0.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
298
+ "transformer.visual.transformer.resblocks.1.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
299
+ "transformer.visual.transformer.resblocks.1.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
300
+ "transformer.visual.transformer.resblocks.1.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
301
+ "transformer.visual.transformer.resblocks.1.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
302
+ "transformer.visual.transformer.resblocks.1.ln_1.bias": "pytorch_model-00002-of-00002.bin",
303
+ "transformer.visual.transformer.resblocks.1.ln_1.weight": "pytorch_model-00002-of-00002.bin",
304
+ "transformer.visual.transformer.resblocks.1.ln_2.bias": "pytorch_model-00002-of-00002.bin",
305
+ "transformer.visual.transformer.resblocks.1.ln_2.weight": "pytorch_model-00002-of-00002.bin",
306
+ "transformer.visual.transformer.resblocks.1.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
307
+ "transformer.visual.transformer.resblocks.1.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
308
+ "transformer.visual.transformer.resblocks.1.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
309
+ "transformer.visual.transformer.resblocks.1.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
310
+ "transformer.visual.transformer.resblocks.10.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
311
+ "transformer.visual.transformer.resblocks.10.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
312
+ "transformer.visual.transformer.resblocks.10.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
313
+ "transformer.visual.transformer.resblocks.10.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
314
+ "transformer.visual.transformer.resblocks.10.ln_1.bias": "pytorch_model-00002-of-00002.bin",
315
+ "transformer.visual.transformer.resblocks.10.ln_1.weight": "pytorch_model-00002-of-00002.bin",
316
+ "transformer.visual.transformer.resblocks.10.ln_2.bias": "pytorch_model-00002-of-00002.bin",
317
+ "transformer.visual.transformer.resblocks.10.ln_2.weight": "pytorch_model-00002-of-00002.bin",
318
+ "transformer.visual.transformer.resblocks.10.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
319
+ "transformer.visual.transformer.resblocks.10.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
320
+ "transformer.visual.transformer.resblocks.10.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
321
+ "transformer.visual.transformer.resblocks.10.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
322
+ "transformer.visual.transformer.resblocks.11.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
323
+ "transformer.visual.transformer.resblocks.11.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
324
+ "transformer.visual.transformer.resblocks.11.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
325
+ "transformer.visual.transformer.resblocks.11.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
326
+ "transformer.visual.transformer.resblocks.11.ln_1.bias": "pytorch_model-00002-of-00002.bin",
327
+ "transformer.visual.transformer.resblocks.11.ln_1.weight": "pytorch_model-00002-of-00002.bin",
328
+ "transformer.visual.transformer.resblocks.11.ln_2.bias": "pytorch_model-00002-of-00002.bin",
329
+ "transformer.visual.transformer.resblocks.11.ln_2.weight": "pytorch_model-00002-of-00002.bin",
330
+ "transformer.visual.transformer.resblocks.11.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
331
+ "transformer.visual.transformer.resblocks.11.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
332
+ "transformer.visual.transformer.resblocks.11.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
333
+ "transformer.visual.transformer.resblocks.11.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
334
+ "transformer.visual.transformer.resblocks.12.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
335
+ "transformer.visual.transformer.resblocks.12.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
336
+ "transformer.visual.transformer.resblocks.12.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
337
+ "transformer.visual.transformer.resblocks.12.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
338
+ "transformer.visual.transformer.resblocks.12.ln_1.bias": "pytorch_model-00002-of-00002.bin",
339
+ "transformer.visual.transformer.resblocks.12.ln_1.weight": "pytorch_model-00002-of-00002.bin",
340
+ "transformer.visual.transformer.resblocks.12.ln_2.bias": "pytorch_model-00002-of-00002.bin",
341
+ "transformer.visual.transformer.resblocks.12.ln_2.weight": "pytorch_model-00002-of-00002.bin",
342
+ "transformer.visual.transformer.resblocks.12.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
343
+ "transformer.visual.transformer.resblocks.12.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
344
+ "transformer.visual.transformer.resblocks.12.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
345
+ "transformer.visual.transformer.resblocks.12.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
346
+ "transformer.visual.transformer.resblocks.13.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
347
+ "transformer.visual.transformer.resblocks.13.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
348
+ "transformer.visual.transformer.resblocks.13.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
349
+ "transformer.visual.transformer.resblocks.13.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
350
+ "transformer.visual.transformer.resblocks.13.ln_1.bias": "pytorch_model-00002-of-00002.bin",
351
+ "transformer.visual.transformer.resblocks.13.ln_1.weight": "pytorch_model-00002-of-00002.bin",
352
+ "transformer.visual.transformer.resblocks.13.ln_2.bias": "pytorch_model-00002-of-00002.bin",
353
+ "transformer.visual.transformer.resblocks.13.ln_2.weight": "pytorch_model-00002-of-00002.bin",
354
+ "transformer.visual.transformer.resblocks.13.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
355
+ "transformer.visual.transformer.resblocks.13.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
356
+ "transformer.visual.transformer.resblocks.13.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
357
+ "transformer.visual.transformer.resblocks.13.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
358
+ "transformer.visual.transformer.resblocks.14.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
359
+ "transformer.visual.transformer.resblocks.14.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
360
+ "transformer.visual.transformer.resblocks.14.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
361
+ "transformer.visual.transformer.resblocks.14.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
362
+ "transformer.visual.transformer.resblocks.14.ln_1.bias": "pytorch_model-00002-of-00002.bin",
363
+ "transformer.visual.transformer.resblocks.14.ln_1.weight": "pytorch_model-00002-of-00002.bin",
364
+ "transformer.visual.transformer.resblocks.14.ln_2.bias": "pytorch_model-00002-of-00002.bin",
365
+ "transformer.visual.transformer.resblocks.14.ln_2.weight": "pytorch_model-00002-of-00002.bin",
366
+ "transformer.visual.transformer.resblocks.14.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
367
+ "transformer.visual.transformer.resblocks.14.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
368
+ "transformer.visual.transformer.resblocks.14.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
369
+ "transformer.visual.transformer.resblocks.14.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
370
+ "transformer.visual.transformer.resblocks.15.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
371
+ "transformer.visual.transformer.resblocks.15.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
372
+ "transformer.visual.transformer.resblocks.15.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
373
+ "transformer.visual.transformer.resblocks.15.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
374
+ "transformer.visual.transformer.resblocks.15.ln_1.bias": "pytorch_model-00002-of-00002.bin",
375
+ "transformer.visual.transformer.resblocks.15.ln_1.weight": "pytorch_model-00002-of-00002.bin",
376
+ "transformer.visual.transformer.resblocks.15.ln_2.bias": "pytorch_model-00002-of-00002.bin",
377
+ "transformer.visual.transformer.resblocks.15.ln_2.weight": "pytorch_model-00002-of-00002.bin",
378
+ "transformer.visual.transformer.resblocks.15.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
379
+ "transformer.visual.transformer.resblocks.15.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
380
+ "transformer.visual.transformer.resblocks.15.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
381
+ "transformer.visual.transformer.resblocks.15.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
382
+ "transformer.visual.transformer.resblocks.16.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
383
+ "transformer.visual.transformer.resblocks.16.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
384
+ "transformer.visual.transformer.resblocks.16.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
385
+ "transformer.visual.transformer.resblocks.16.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
386
+ "transformer.visual.transformer.resblocks.16.ln_1.bias": "pytorch_model-00002-of-00002.bin",
387
+ "transformer.visual.transformer.resblocks.16.ln_1.weight": "pytorch_model-00002-of-00002.bin",
388
+ "transformer.visual.transformer.resblocks.16.ln_2.bias": "pytorch_model-00002-of-00002.bin",
389
+ "transformer.visual.transformer.resblocks.16.ln_2.weight": "pytorch_model-00002-of-00002.bin",
390
+ "transformer.visual.transformer.resblocks.16.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
391
+ "transformer.visual.transformer.resblocks.16.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
392
+ "transformer.visual.transformer.resblocks.16.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
393
+ "transformer.visual.transformer.resblocks.16.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
394
+ "transformer.visual.transformer.resblocks.17.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
395
+ "transformer.visual.transformer.resblocks.17.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
396
+ "transformer.visual.transformer.resblocks.17.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
397
+ "transformer.visual.transformer.resblocks.17.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
398
+ "transformer.visual.transformer.resblocks.17.ln_1.bias": "pytorch_model-00002-of-00002.bin",
399
+ "transformer.visual.transformer.resblocks.17.ln_1.weight": "pytorch_model-00002-of-00002.bin",
400
+ "transformer.visual.transformer.resblocks.17.ln_2.bias": "pytorch_model-00002-of-00002.bin",
401
+ "transformer.visual.transformer.resblocks.17.ln_2.weight": "pytorch_model-00002-of-00002.bin",
402
+ "transformer.visual.transformer.resblocks.17.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
403
+ "transformer.visual.transformer.resblocks.17.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
404
+ "transformer.visual.transformer.resblocks.17.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
405
+ "transformer.visual.transformer.resblocks.17.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
406
+ "transformer.visual.transformer.resblocks.18.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
407
+ "transformer.visual.transformer.resblocks.18.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
408
+ "transformer.visual.transformer.resblocks.18.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
409
+ "transformer.visual.transformer.resblocks.18.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
410
+ "transformer.visual.transformer.resblocks.18.ln_1.bias": "pytorch_model-00002-of-00002.bin",
411
+ "transformer.visual.transformer.resblocks.18.ln_1.weight": "pytorch_model-00002-of-00002.bin",
412
+ "transformer.visual.transformer.resblocks.18.ln_2.bias": "pytorch_model-00002-of-00002.bin",
413
+ "transformer.visual.transformer.resblocks.18.ln_2.weight": "pytorch_model-00002-of-00002.bin",
414
+ "transformer.visual.transformer.resblocks.18.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
415
+ "transformer.visual.transformer.resblocks.18.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
416
+ "transformer.visual.transformer.resblocks.18.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
417
+ "transformer.visual.transformer.resblocks.18.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
418
+ "transformer.visual.transformer.resblocks.19.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
419
+ "transformer.visual.transformer.resblocks.19.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
420
+ "transformer.visual.transformer.resblocks.19.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
421
+ "transformer.visual.transformer.resblocks.19.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
422
+ "transformer.visual.transformer.resblocks.19.ln_1.bias": "pytorch_model-00002-of-00002.bin",
423
+ "transformer.visual.transformer.resblocks.19.ln_1.weight": "pytorch_model-00002-of-00002.bin",
424
+ "transformer.visual.transformer.resblocks.19.ln_2.bias": "pytorch_model-00002-of-00002.bin",
425
+ "transformer.visual.transformer.resblocks.19.ln_2.weight": "pytorch_model-00002-of-00002.bin",
426
+ "transformer.visual.transformer.resblocks.19.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
427
+ "transformer.visual.transformer.resblocks.19.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
428
+ "transformer.visual.transformer.resblocks.19.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
429
+ "transformer.visual.transformer.resblocks.19.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
430
+ "transformer.visual.transformer.resblocks.2.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
431
+ "transformer.visual.transformer.resblocks.2.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
432
+ "transformer.visual.transformer.resblocks.2.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
433
+ "transformer.visual.transformer.resblocks.2.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
434
+ "transformer.visual.transformer.resblocks.2.ln_1.bias": "pytorch_model-00002-of-00002.bin",
435
+ "transformer.visual.transformer.resblocks.2.ln_1.weight": "pytorch_model-00002-of-00002.bin",
436
+ "transformer.visual.transformer.resblocks.2.ln_2.bias": "pytorch_model-00002-of-00002.bin",
437
+ "transformer.visual.transformer.resblocks.2.ln_2.weight": "pytorch_model-00002-of-00002.bin",
438
+ "transformer.visual.transformer.resblocks.2.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
439
+ "transformer.visual.transformer.resblocks.2.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
440
+ "transformer.visual.transformer.resblocks.2.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
441
+ "transformer.visual.transformer.resblocks.2.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
442
+ "transformer.visual.transformer.resblocks.20.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
443
+ "transformer.visual.transformer.resblocks.20.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
444
+ "transformer.visual.transformer.resblocks.20.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
445
+ "transformer.visual.transformer.resblocks.20.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
446
+ "transformer.visual.transformer.resblocks.20.ln_1.bias": "pytorch_model-00002-of-00002.bin",
447
+ "transformer.visual.transformer.resblocks.20.ln_1.weight": "pytorch_model-00002-of-00002.bin",
448
+ "transformer.visual.transformer.resblocks.20.ln_2.bias": "pytorch_model-00002-of-00002.bin",
449
+ "transformer.visual.transformer.resblocks.20.ln_2.weight": "pytorch_model-00002-of-00002.bin",
450
+ "transformer.visual.transformer.resblocks.20.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
451
+ "transformer.visual.transformer.resblocks.20.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
452
+ "transformer.visual.transformer.resblocks.20.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
453
+ "transformer.visual.transformer.resblocks.20.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
454
+ "transformer.visual.transformer.resblocks.21.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
455
+ "transformer.visual.transformer.resblocks.21.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
456
+ "transformer.visual.transformer.resblocks.21.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
457
+ "transformer.visual.transformer.resblocks.21.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
458
+ "transformer.visual.transformer.resblocks.21.ln_1.bias": "pytorch_model-00002-of-00002.bin",
459
+ "transformer.visual.transformer.resblocks.21.ln_1.weight": "pytorch_model-00002-of-00002.bin",
460
+ "transformer.visual.transformer.resblocks.21.ln_2.bias": "pytorch_model-00002-of-00002.bin",
461
+ "transformer.visual.transformer.resblocks.21.ln_2.weight": "pytorch_model-00002-of-00002.bin",
462
+ "transformer.visual.transformer.resblocks.21.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
463
+ "transformer.visual.transformer.resblocks.21.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
464
+ "transformer.visual.transformer.resblocks.21.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
465
+ "transformer.visual.transformer.resblocks.21.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
466
+ "transformer.visual.transformer.resblocks.22.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
467
+ "transformer.visual.transformer.resblocks.22.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
468
+ "transformer.visual.transformer.resblocks.22.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
469
+ "transformer.visual.transformer.resblocks.22.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
470
+ "transformer.visual.transformer.resblocks.22.ln_1.bias": "pytorch_model-00002-of-00002.bin",
471
+ "transformer.visual.transformer.resblocks.22.ln_1.weight": "pytorch_model-00002-of-00002.bin",
472
+ "transformer.visual.transformer.resblocks.22.ln_2.bias": "pytorch_model-00002-of-00002.bin",
473
+ "transformer.visual.transformer.resblocks.22.ln_2.weight": "pytorch_model-00002-of-00002.bin",
474
+ "transformer.visual.transformer.resblocks.22.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
475
+ "transformer.visual.transformer.resblocks.22.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
476
+ "transformer.visual.transformer.resblocks.22.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
477
+ "transformer.visual.transformer.resblocks.22.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
478
+ "transformer.visual.transformer.resblocks.23.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
479
+ "transformer.visual.transformer.resblocks.23.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
480
+ "transformer.visual.transformer.resblocks.23.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
481
+ "transformer.visual.transformer.resblocks.23.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
482
+ "transformer.visual.transformer.resblocks.23.ln_1.bias": "pytorch_model-00002-of-00002.bin",
483
+ "transformer.visual.transformer.resblocks.23.ln_1.weight": "pytorch_model-00002-of-00002.bin",
484
+ "transformer.visual.transformer.resblocks.23.ln_2.bias": "pytorch_model-00002-of-00002.bin",
485
+ "transformer.visual.transformer.resblocks.23.ln_2.weight": "pytorch_model-00002-of-00002.bin",
486
+ "transformer.visual.transformer.resblocks.23.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
487
+ "transformer.visual.transformer.resblocks.23.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
488
+ "transformer.visual.transformer.resblocks.23.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
489
+ "transformer.visual.transformer.resblocks.23.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
490
+ "transformer.visual.transformer.resblocks.24.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
491
+ "transformer.visual.transformer.resblocks.24.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
492
+ "transformer.visual.transformer.resblocks.24.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
493
+ "transformer.visual.transformer.resblocks.24.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
494
+ "transformer.visual.transformer.resblocks.24.ln_1.bias": "pytorch_model-00002-of-00002.bin",
495
+ "transformer.visual.transformer.resblocks.24.ln_1.weight": "pytorch_model-00002-of-00002.bin",
496
+ "transformer.visual.transformer.resblocks.24.ln_2.bias": "pytorch_model-00002-of-00002.bin",
497
+ "transformer.visual.transformer.resblocks.24.ln_2.weight": "pytorch_model-00002-of-00002.bin",
498
+ "transformer.visual.transformer.resblocks.24.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
499
+ "transformer.visual.transformer.resblocks.24.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
500
+ "transformer.visual.transformer.resblocks.24.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
501
+ "transformer.visual.transformer.resblocks.24.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
502
+ "transformer.visual.transformer.resblocks.25.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
503
+ "transformer.visual.transformer.resblocks.25.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
504
+ "transformer.visual.transformer.resblocks.25.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
505
+ "transformer.visual.transformer.resblocks.25.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
506
+ "transformer.visual.transformer.resblocks.25.ln_1.bias": "pytorch_model-00002-of-00002.bin",
507
+ "transformer.visual.transformer.resblocks.25.ln_1.weight": "pytorch_model-00002-of-00002.bin",
508
+ "transformer.visual.transformer.resblocks.25.ln_2.bias": "pytorch_model-00002-of-00002.bin",
509
+ "transformer.visual.transformer.resblocks.25.ln_2.weight": "pytorch_model-00002-of-00002.bin",
510
+ "transformer.visual.transformer.resblocks.25.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
511
+ "transformer.visual.transformer.resblocks.25.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
512
+ "transformer.visual.transformer.resblocks.25.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
513
+ "transformer.visual.transformer.resblocks.25.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
514
+ "transformer.visual.transformer.resblocks.26.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
515
+ "transformer.visual.transformer.resblocks.26.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
516
+ "transformer.visual.transformer.resblocks.26.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
517
+ "transformer.visual.transformer.resblocks.26.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
518
+ "transformer.visual.transformer.resblocks.26.ln_1.bias": "pytorch_model-00002-of-00002.bin",
519
+ "transformer.visual.transformer.resblocks.26.ln_1.weight": "pytorch_model-00002-of-00002.bin",
520
+ "transformer.visual.transformer.resblocks.26.ln_2.bias": "pytorch_model-00002-of-00002.bin",
521
+ "transformer.visual.transformer.resblocks.26.ln_2.weight": "pytorch_model-00002-of-00002.bin",
522
+ "transformer.visual.transformer.resblocks.26.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
523
+ "transformer.visual.transformer.resblocks.26.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
524
+ "transformer.visual.transformer.resblocks.26.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
525
+ "transformer.visual.transformer.resblocks.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
526
+ "transformer.visual.transformer.resblocks.27.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
527
+ "transformer.visual.transformer.resblocks.27.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
528
+ "transformer.visual.transformer.resblocks.27.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
529
+ "transformer.visual.transformer.resblocks.27.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
530
+ "transformer.visual.transformer.resblocks.27.ln_1.bias": "pytorch_model-00002-of-00002.bin",
531
+ "transformer.visual.transformer.resblocks.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
532
+ "transformer.visual.transformer.resblocks.27.ln_2.bias": "pytorch_model-00002-of-00002.bin",
533
+ "transformer.visual.transformer.resblocks.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
534
+ "transformer.visual.transformer.resblocks.27.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
535
+ "transformer.visual.transformer.resblocks.27.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
536
+ "transformer.visual.transformer.resblocks.27.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
537
+ "transformer.visual.transformer.resblocks.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
538
+ "transformer.visual.transformer.resblocks.28.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
539
+ "transformer.visual.transformer.resblocks.28.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
540
+ "transformer.visual.transformer.resblocks.28.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
541
+ "transformer.visual.transformer.resblocks.28.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
542
+ "transformer.visual.transformer.resblocks.28.ln_1.bias": "pytorch_model-00002-of-00002.bin",
543
+ "transformer.visual.transformer.resblocks.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
544
+ "transformer.visual.transformer.resblocks.28.ln_2.bias": "pytorch_model-00002-of-00002.bin",
545
+ "transformer.visual.transformer.resblocks.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
546
+ "transformer.visual.transformer.resblocks.28.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
547
+ "transformer.visual.transformer.resblocks.28.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
548
+ "transformer.visual.transformer.resblocks.28.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
549
+ "transformer.visual.transformer.resblocks.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
550
+ "transformer.visual.transformer.resblocks.29.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
551
+ "transformer.visual.transformer.resblocks.29.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
552
+ "transformer.visual.transformer.resblocks.29.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
553
+ "transformer.visual.transformer.resblocks.29.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
554
+ "transformer.visual.transformer.resblocks.29.ln_1.bias": "pytorch_model-00002-of-00002.bin",
555
+ "transformer.visual.transformer.resblocks.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
556
+ "transformer.visual.transformer.resblocks.29.ln_2.bias": "pytorch_model-00002-of-00002.bin",
557
+ "transformer.visual.transformer.resblocks.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
558
+ "transformer.visual.transformer.resblocks.29.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
559
+ "transformer.visual.transformer.resblocks.29.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
560
+ "transformer.visual.transformer.resblocks.29.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
561
+ "transformer.visual.transformer.resblocks.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
562
+ "transformer.visual.transformer.resblocks.3.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
563
+ "transformer.visual.transformer.resblocks.3.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
564
+ "transformer.visual.transformer.resblocks.3.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
565
+ "transformer.visual.transformer.resblocks.3.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
566
+ "transformer.visual.transformer.resblocks.3.ln_1.bias": "pytorch_model-00002-of-00002.bin",
567
+ "transformer.visual.transformer.resblocks.3.ln_1.weight": "pytorch_model-00002-of-00002.bin",
568
+ "transformer.visual.transformer.resblocks.3.ln_2.bias": "pytorch_model-00002-of-00002.bin",
569
+ "transformer.visual.transformer.resblocks.3.ln_2.weight": "pytorch_model-00002-of-00002.bin",
570
+ "transformer.visual.transformer.resblocks.3.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
571
+ "transformer.visual.transformer.resblocks.3.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
572
+ "transformer.visual.transformer.resblocks.3.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
573
+ "transformer.visual.transformer.resblocks.3.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
574
+ "transformer.visual.transformer.resblocks.30.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
575
+ "transformer.visual.transformer.resblocks.30.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
576
+ "transformer.visual.transformer.resblocks.30.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
577
+ "transformer.visual.transformer.resblocks.30.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
578
+ "transformer.visual.transformer.resblocks.30.ln_1.bias": "pytorch_model-00002-of-00002.bin",
579
+ "transformer.visual.transformer.resblocks.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
580
+ "transformer.visual.transformer.resblocks.30.ln_2.bias": "pytorch_model-00002-of-00002.bin",
581
+ "transformer.visual.transformer.resblocks.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
582
+ "transformer.visual.transformer.resblocks.30.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
583
+ "transformer.visual.transformer.resblocks.30.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
584
+ "transformer.visual.transformer.resblocks.30.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
585
+ "transformer.visual.transformer.resblocks.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
586
+ "transformer.visual.transformer.resblocks.31.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
587
+ "transformer.visual.transformer.resblocks.31.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
588
+ "transformer.visual.transformer.resblocks.31.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
589
+ "transformer.visual.transformer.resblocks.31.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
590
+ "transformer.visual.transformer.resblocks.31.ln_1.bias": "pytorch_model-00002-of-00002.bin",
591
+ "transformer.visual.transformer.resblocks.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
592
+ "transformer.visual.transformer.resblocks.31.ln_2.bias": "pytorch_model-00002-of-00002.bin",
593
+ "transformer.visual.transformer.resblocks.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
594
+ "transformer.visual.transformer.resblocks.31.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
595
+ "transformer.visual.transformer.resblocks.31.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
596
+ "transformer.visual.transformer.resblocks.31.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
597
+ "transformer.visual.transformer.resblocks.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
598
+ "transformer.visual.transformer.resblocks.32.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
599
+ "transformer.visual.transformer.resblocks.32.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
600
+ "transformer.visual.transformer.resblocks.32.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
601
+ "transformer.visual.transformer.resblocks.32.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
602
+ "transformer.visual.transformer.resblocks.32.ln_1.bias": "pytorch_model-00002-of-00002.bin",
603
+ "transformer.visual.transformer.resblocks.32.ln_1.weight": "pytorch_model-00002-of-00002.bin",
604
+ "transformer.visual.transformer.resblocks.32.ln_2.bias": "pytorch_model-00002-of-00002.bin",
605
+ "transformer.visual.transformer.resblocks.32.ln_2.weight": "pytorch_model-00002-of-00002.bin",
606
+ "transformer.visual.transformer.resblocks.32.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
607
+ "transformer.visual.transformer.resblocks.32.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
608
+ "transformer.visual.transformer.resblocks.32.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
609
+ "transformer.visual.transformer.resblocks.32.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
610
+ "transformer.visual.transformer.resblocks.33.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
611
+ "transformer.visual.transformer.resblocks.33.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
612
+ "transformer.visual.transformer.resblocks.33.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
613
+ "transformer.visual.transformer.resblocks.33.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
614
+ "transformer.visual.transformer.resblocks.33.ln_1.bias": "pytorch_model-00002-of-00002.bin",
615
+ "transformer.visual.transformer.resblocks.33.ln_1.weight": "pytorch_model-00002-of-00002.bin",
616
+ "transformer.visual.transformer.resblocks.33.ln_2.bias": "pytorch_model-00002-of-00002.bin",
617
+ "transformer.visual.transformer.resblocks.33.ln_2.weight": "pytorch_model-00002-of-00002.bin",
618
+ "transformer.visual.transformer.resblocks.33.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
619
+ "transformer.visual.transformer.resblocks.33.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
620
+ "transformer.visual.transformer.resblocks.33.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
621
+ "transformer.visual.transformer.resblocks.33.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
622
+ "transformer.visual.transformer.resblocks.34.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
623
+ "transformer.visual.transformer.resblocks.34.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
624
+ "transformer.visual.transformer.resblocks.34.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
625
+ "transformer.visual.transformer.resblocks.34.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
626
+ "transformer.visual.transformer.resblocks.34.ln_1.bias": "pytorch_model-00002-of-00002.bin",
627
+ "transformer.visual.transformer.resblocks.34.ln_1.weight": "pytorch_model-00002-of-00002.bin",
628
+ "transformer.visual.transformer.resblocks.34.ln_2.bias": "pytorch_model-00002-of-00002.bin",
629
+ "transformer.visual.transformer.resblocks.34.ln_2.weight": "pytorch_model-00002-of-00002.bin",
630
+ "transformer.visual.transformer.resblocks.34.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
631
+ "transformer.visual.transformer.resblocks.34.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
632
+ "transformer.visual.transformer.resblocks.34.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
633
+ "transformer.visual.transformer.resblocks.34.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
634
+ "transformer.visual.transformer.resblocks.35.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
635
+ "transformer.visual.transformer.resblocks.35.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
636
+ "transformer.visual.transformer.resblocks.35.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
637
+ "transformer.visual.transformer.resblocks.35.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
638
+ "transformer.visual.transformer.resblocks.35.ln_1.bias": "pytorch_model-00002-of-00002.bin",
639
+ "transformer.visual.transformer.resblocks.35.ln_1.weight": "pytorch_model-00002-of-00002.bin",
640
+ "transformer.visual.transformer.resblocks.35.ln_2.bias": "pytorch_model-00002-of-00002.bin",
641
+ "transformer.visual.transformer.resblocks.35.ln_2.weight": "pytorch_model-00002-of-00002.bin",
642
+ "transformer.visual.transformer.resblocks.35.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
643
+ "transformer.visual.transformer.resblocks.35.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
644
+ "transformer.visual.transformer.resblocks.35.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
645
+ "transformer.visual.transformer.resblocks.35.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
646
+ "transformer.visual.transformer.resblocks.36.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
647
+ "transformer.visual.transformer.resblocks.36.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
648
+ "transformer.visual.transformer.resblocks.36.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
649
+ "transformer.visual.transformer.resblocks.36.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
650
+ "transformer.visual.transformer.resblocks.36.ln_1.bias": "pytorch_model-00002-of-00002.bin",
651
+ "transformer.visual.transformer.resblocks.36.ln_1.weight": "pytorch_model-00002-of-00002.bin",
652
+ "transformer.visual.transformer.resblocks.36.ln_2.bias": "pytorch_model-00002-of-00002.bin",
653
+ "transformer.visual.transformer.resblocks.36.ln_2.weight": "pytorch_model-00002-of-00002.bin",
654
+ "transformer.visual.transformer.resblocks.36.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
655
+ "transformer.visual.transformer.resblocks.36.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
656
+ "transformer.visual.transformer.resblocks.36.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
657
+ "transformer.visual.transformer.resblocks.36.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
658
+ "transformer.visual.transformer.resblocks.37.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
659
+ "transformer.visual.transformer.resblocks.37.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
660
+ "transformer.visual.transformer.resblocks.37.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
661
+ "transformer.visual.transformer.resblocks.37.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
662
+ "transformer.visual.transformer.resblocks.37.ln_1.bias": "pytorch_model-00002-of-00002.bin",
663
+ "transformer.visual.transformer.resblocks.37.ln_1.weight": "pytorch_model-00002-of-00002.bin",
664
+ "transformer.visual.transformer.resblocks.37.ln_2.bias": "pytorch_model-00002-of-00002.bin",
665
+ "transformer.visual.transformer.resblocks.37.ln_2.weight": "pytorch_model-00002-of-00002.bin",
666
+ "transformer.visual.transformer.resblocks.37.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
667
+ "transformer.visual.transformer.resblocks.37.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
668
+ "transformer.visual.transformer.resblocks.37.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
669
+ "transformer.visual.transformer.resblocks.37.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
670
+ "transformer.visual.transformer.resblocks.38.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
671
+ "transformer.visual.transformer.resblocks.38.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
672
+ "transformer.visual.transformer.resblocks.38.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
673
+ "transformer.visual.transformer.resblocks.38.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
674
+ "transformer.visual.transformer.resblocks.38.ln_1.bias": "pytorch_model-00002-of-00002.bin",
675
+ "transformer.visual.transformer.resblocks.38.ln_1.weight": "pytorch_model-00002-of-00002.bin",
676
+ "transformer.visual.transformer.resblocks.38.ln_2.bias": "pytorch_model-00002-of-00002.bin",
677
+ "transformer.visual.transformer.resblocks.38.ln_2.weight": "pytorch_model-00002-of-00002.bin",
678
+ "transformer.visual.transformer.resblocks.38.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
679
+ "transformer.visual.transformer.resblocks.38.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
680
+ "transformer.visual.transformer.resblocks.38.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
681
+ "transformer.visual.transformer.resblocks.38.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
682
+ "transformer.visual.transformer.resblocks.39.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
683
+ "transformer.visual.transformer.resblocks.39.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
684
+ "transformer.visual.transformer.resblocks.39.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
685
+ "transformer.visual.transformer.resblocks.39.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
686
+ "transformer.visual.transformer.resblocks.39.ln_1.bias": "pytorch_model-00002-of-00002.bin",
687
+ "transformer.visual.transformer.resblocks.39.ln_1.weight": "pytorch_model-00002-of-00002.bin",
688
+ "transformer.visual.transformer.resblocks.39.ln_2.bias": "pytorch_model-00002-of-00002.bin",
689
+ "transformer.visual.transformer.resblocks.39.ln_2.weight": "pytorch_model-00002-of-00002.bin",
690
+ "transformer.visual.transformer.resblocks.39.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
691
+ "transformer.visual.transformer.resblocks.39.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
692
+ "transformer.visual.transformer.resblocks.39.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
693
+ "transformer.visual.transformer.resblocks.39.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
694
+ "transformer.visual.transformer.resblocks.4.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
695
+ "transformer.visual.transformer.resblocks.4.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
696
+ "transformer.visual.transformer.resblocks.4.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
697
+ "transformer.visual.transformer.resblocks.4.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
698
+ "transformer.visual.transformer.resblocks.4.ln_1.bias": "pytorch_model-00002-of-00002.bin",
699
+ "transformer.visual.transformer.resblocks.4.ln_1.weight": "pytorch_model-00002-of-00002.bin",
700
+ "transformer.visual.transformer.resblocks.4.ln_2.bias": "pytorch_model-00002-of-00002.bin",
701
+ "transformer.visual.transformer.resblocks.4.ln_2.weight": "pytorch_model-00002-of-00002.bin",
702
+ "transformer.visual.transformer.resblocks.4.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
703
+ "transformer.visual.transformer.resblocks.4.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
704
+ "transformer.visual.transformer.resblocks.4.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
705
+ "transformer.visual.transformer.resblocks.4.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
706
+ "transformer.visual.transformer.resblocks.40.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
707
+ "transformer.visual.transformer.resblocks.40.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
708
+ "transformer.visual.transformer.resblocks.40.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
709
+ "transformer.visual.transformer.resblocks.40.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
710
+ "transformer.visual.transformer.resblocks.40.ln_1.bias": "pytorch_model-00002-of-00002.bin",
711
+ "transformer.visual.transformer.resblocks.40.ln_1.weight": "pytorch_model-00002-of-00002.bin",
712
+ "transformer.visual.transformer.resblocks.40.ln_2.bias": "pytorch_model-00002-of-00002.bin",
713
+ "transformer.visual.transformer.resblocks.40.ln_2.weight": "pytorch_model-00002-of-00002.bin",
714
+ "transformer.visual.transformer.resblocks.40.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
715
+ "transformer.visual.transformer.resblocks.40.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
716
+ "transformer.visual.transformer.resblocks.40.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
717
+ "transformer.visual.transformer.resblocks.40.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
718
+ "transformer.visual.transformer.resblocks.41.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
719
+ "transformer.visual.transformer.resblocks.41.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
720
+ "transformer.visual.transformer.resblocks.41.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
721
+ "transformer.visual.transformer.resblocks.41.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
722
+ "transformer.visual.transformer.resblocks.41.ln_1.bias": "pytorch_model-00002-of-00002.bin",
723
+ "transformer.visual.transformer.resblocks.41.ln_1.weight": "pytorch_model-00002-of-00002.bin",
724
+ "transformer.visual.transformer.resblocks.41.ln_2.bias": "pytorch_model-00002-of-00002.bin",
725
+ "transformer.visual.transformer.resblocks.41.ln_2.weight": "pytorch_model-00002-of-00002.bin",
726
+ "transformer.visual.transformer.resblocks.41.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
727
+ "transformer.visual.transformer.resblocks.41.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
728
+ "transformer.visual.transformer.resblocks.41.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
729
+ "transformer.visual.transformer.resblocks.41.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
730
+ "transformer.visual.transformer.resblocks.42.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
731
+ "transformer.visual.transformer.resblocks.42.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
732
+ "transformer.visual.transformer.resblocks.42.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
733
+ "transformer.visual.transformer.resblocks.42.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
734
+ "transformer.visual.transformer.resblocks.42.ln_1.bias": "pytorch_model-00002-of-00002.bin",
735
+ "transformer.visual.transformer.resblocks.42.ln_1.weight": "pytorch_model-00002-of-00002.bin",
736
+ "transformer.visual.transformer.resblocks.42.ln_2.bias": "pytorch_model-00002-of-00002.bin",
737
+ "transformer.visual.transformer.resblocks.42.ln_2.weight": "pytorch_model-00002-of-00002.bin",
738
+ "transformer.visual.transformer.resblocks.42.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
739
+ "transformer.visual.transformer.resblocks.42.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
740
+ "transformer.visual.transformer.resblocks.42.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
741
+ "transformer.visual.transformer.resblocks.42.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
742
+ "transformer.visual.transformer.resblocks.43.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
743
+ "transformer.visual.transformer.resblocks.43.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
744
+ "transformer.visual.transformer.resblocks.43.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
745
+ "transformer.visual.transformer.resblocks.43.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
746
+ "transformer.visual.transformer.resblocks.43.ln_1.bias": "pytorch_model-00002-of-00002.bin",
747
+ "transformer.visual.transformer.resblocks.43.ln_1.weight": "pytorch_model-00002-of-00002.bin",
748
+ "transformer.visual.transformer.resblocks.43.ln_2.bias": "pytorch_model-00002-of-00002.bin",
749
+ "transformer.visual.transformer.resblocks.43.ln_2.weight": "pytorch_model-00002-of-00002.bin",
750
+ "transformer.visual.transformer.resblocks.43.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
751
+ "transformer.visual.transformer.resblocks.43.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
752
+ "transformer.visual.transformer.resblocks.43.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
753
+ "transformer.visual.transformer.resblocks.43.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
754
+ "transformer.visual.transformer.resblocks.44.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
755
+ "transformer.visual.transformer.resblocks.44.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
756
+ "transformer.visual.transformer.resblocks.44.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
757
+ "transformer.visual.transformer.resblocks.44.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
758
+ "transformer.visual.transformer.resblocks.44.ln_1.bias": "pytorch_model-00002-of-00002.bin",
759
+ "transformer.visual.transformer.resblocks.44.ln_1.weight": "pytorch_model-00002-of-00002.bin",
760
+ "transformer.visual.transformer.resblocks.44.ln_2.bias": "pytorch_model-00002-of-00002.bin",
761
+ "transformer.visual.transformer.resblocks.44.ln_2.weight": "pytorch_model-00002-of-00002.bin",
762
+ "transformer.visual.transformer.resblocks.44.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
763
+ "transformer.visual.transformer.resblocks.44.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
764
+ "transformer.visual.transformer.resblocks.44.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
765
+ "transformer.visual.transformer.resblocks.44.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
766
+ "transformer.visual.transformer.resblocks.45.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
767
+ "transformer.visual.transformer.resblocks.45.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
768
+ "transformer.visual.transformer.resblocks.45.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
769
+ "transformer.visual.transformer.resblocks.45.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
770
+ "transformer.visual.transformer.resblocks.45.ln_1.bias": "pytorch_model-00002-of-00002.bin",
771
+ "transformer.visual.transformer.resblocks.45.ln_1.weight": "pytorch_model-00002-of-00002.bin",
772
+ "transformer.visual.transformer.resblocks.45.ln_2.bias": "pytorch_model-00002-of-00002.bin",
773
+ "transformer.visual.transformer.resblocks.45.ln_2.weight": "pytorch_model-00002-of-00002.bin",
774
+ "transformer.visual.transformer.resblocks.45.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
775
+ "transformer.visual.transformer.resblocks.45.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
776
+ "transformer.visual.transformer.resblocks.45.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
777
+ "transformer.visual.transformer.resblocks.45.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
778
+ "transformer.visual.transformer.resblocks.46.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
779
+ "transformer.visual.transformer.resblocks.46.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
780
+ "transformer.visual.transformer.resblocks.46.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
781
+ "transformer.visual.transformer.resblocks.46.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
782
+ "transformer.visual.transformer.resblocks.46.ln_1.bias": "pytorch_model-00002-of-00002.bin",
783
+ "transformer.visual.transformer.resblocks.46.ln_1.weight": "pytorch_model-00002-of-00002.bin",
784
+ "transformer.visual.transformer.resblocks.46.ln_2.bias": "pytorch_model-00002-of-00002.bin",
785
+ "transformer.visual.transformer.resblocks.46.ln_2.weight": "pytorch_model-00002-of-00002.bin",
786
+ "transformer.visual.transformer.resblocks.46.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
787
+ "transformer.visual.transformer.resblocks.46.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
788
+ "transformer.visual.transformer.resblocks.46.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
789
+ "transformer.visual.transformer.resblocks.46.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
790
+ "transformer.visual.transformer.resblocks.47.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
791
+ "transformer.visual.transformer.resblocks.47.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
792
+ "transformer.visual.transformer.resblocks.47.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
793
+ "transformer.visual.transformer.resblocks.47.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
794
+ "transformer.visual.transformer.resblocks.47.ln_1.bias": "pytorch_model-00002-of-00002.bin",
795
+ "transformer.visual.transformer.resblocks.47.ln_1.weight": "pytorch_model-00002-of-00002.bin",
796
+ "transformer.visual.transformer.resblocks.47.ln_2.bias": "pytorch_model-00002-of-00002.bin",
797
+ "transformer.visual.transformer.resblocks.47.ln_2.weight": "pytorch_model-00002-of-00002.bin",
798
+ "transformer.visual.transformer.resblocks.47.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
799
+ "transformer.visual.transformer.resblocks.47.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
800
+ "transformer.visual.transformer.resblocks.47.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
801
+ "transformer.visual.transformer.resblocks.47.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
802
+ "transformer.visual.transformer.resblocks.5.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
803
+ "transformer.visual.transformer.resblocks.5.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
804
+ "transformer.visual.transformer.resblocks.5.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
805
+ "transformer.visual.transformer.resblocks.5.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
806
+ "transformer.visual.transformer.resblocks.5.ln_1.bias": "pytorch_model-00002-of-00002.bin",
807
+ "transformer.visual.transformer.resblocks.5.ln_1.weight": "pytorch_model-00002-of-00002.bin",
808
+ "transformer.visual.transformer.resblocks.5.ln_2.bias": "pytorch_model-00002-of-00002.bin",
809
+ "transformer.visual.transformer.resblocks.5.ln_2.weight": "pytorch_model-00002-of-00002.bin",
810
+ "transformer.visual.transformer.resblocks.5.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
811
+ "transformer.visual.transformer.resblocks.5.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
812
+ "transformer.visual.transformer.resblocks.5.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
813
+ "transformer.visual.transformer.resblocks.5.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
814
+ "transformer.visual.transformer.resblocks.6.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
815
+ "transformer.visual.transformer.resblocks.6.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
816
+ "transformer.visual.transformer.resblocks.6.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
817
+ "transformer.visual.transformer.resblocks.6.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
818
+ "transformer.visual.transformer.resblocks.6.ln_1.bias": "pytorch_model-00002-of-00002.bin",
819
+ "transformer.visual.transformer.resblocks.6.ln_1.weight": "pytorch_model-00002-of-00002.bin",
820
+ "transformer.visual.transformer.resblocks.6.ln_2.bias": "pytorch_model-00002-of-00002.bin",
821
+ "transformer.visual.transformer.resblocks.6.ln_2.weight": "pytorch_model-00002-of-00002.bin",
822
+ "transformer.visual.transformer.resblocks.6.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
823
+ "transformer.visual.transformer.resblocks.6.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
824
+ "transformer.visual.transformer.resblocks.6.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
825
+ "transformer.visual.transformer.resblocks.6.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
826
+ "transformer.visual.transformer.resblocks.7.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
827
+ "transformer.visual.transformer.resblocks.7.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
828
+ "transformer.visual.transformer.resblocks.7.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
829
+ "transformer.visual.transformer.resblocks.7.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
830
+ "transformer.visual.transformer.resblocks.7.ln_1.bias": "pytorch_model-00002-of-00002.bin",
831
+ "transformer.visual.transformer.resblocks.7.ln_1.weight": "pytorch_model-00002-of-00002.bin",
832
+ "transformer.visual.transformer.resblocks.7.ln_2.bias": "pytorch_model-00002-of-00002.bin",
833
+ "transformer.visual.transformer.resblocks.7.ln_2.weight": "pytorch_model-00002-of-00002.bin",
834
+ "transformer.visual.transformer.resblocks.7.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
835
+ "transformer.visual.transformer.resblocks.7.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
836
+ "transformer.visual.transformer.resblocks.7.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
837
+ "transformer.visual.transformer.resblocks.7.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
838
+ "transformer.visual.transformer.resblocks.8.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
839
+ "transformer.visual.transformer.resblocks.8.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
840
+ "transformer.visual.transformer.resblocks.8.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
841
+ "transformer.visual.transformer.resblocks.8.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
842
+ "transformer.visual.transformer.resblocks.8.ln_1.bias": "pytorch_model-00002-of-00002.bin",
843
+ "transformer.visual.transformer.resblocks.8.ln_1.weight": "pytorch_model-00002-of-00002.bin",
844
+ "transformer.visual.transformer.resblocks.8.ln_2.bias": "pytorch_model-00002-of-00002.bin",
845
+ "transformer.visual.transformer.resblocks.8.ln_2.weight": "pytorch_model-00002-of-00002.bin",
846
+ "transformer.visual.transformer.resblocks.8.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
847
+ "transformer.visual.transformer.resblocks.8.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
848
+ "transformer.visual.transformer.resblocks.8.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
849
+ "transformer.visual.transformer.resblocks.8.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
850
+ "transformer.visual.transformer.resblocks.9.attn.in_proj.bias": "pytorch_model-00002-of-00002.bin",
851
+ "transformer.visual.transformer.resblocks.9.attn.in_proj.weight": "pytorch_model-00002-of-00002.bin",
852
+ "transformer.visual.transformer.resblocks.9.attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
853
+ "transformer.visual.transformer.resblocks.9.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
854
+ "transformer.visual.transformer.resblocks.9.ln_1.bias": "pytorch_model-00002-of-00002.bin",
855
+ "transformer.visual.transformer.resblocks.9.ln_1.weight": "pytorch_model-00002-of-00002.bin",
856
+ "transformer.visual.transformer.resblocks.9.ln_2.bias": "pytorch_model-00002-of-00002.bin",
857
+ "transformer.visual.transformer.resblocks.9.ln_2.weight": "pytorch_model-00002-of-00002.bin",
858
+ "transformer.visual.transformer.resblocks.9.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
859
+ "transformer.visual.transformer.resblocks.9.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
860
+ "transformer.visual.transformer.resblocks.9.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
861
+ "transformer.visual.transformer.resblocks.9.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
862
+ "transformer.wte.weight": "pytorch_model-00001-of-00002.bin"
863
+ }
864
+ }
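Note: the index above is the standard sharded-checkpoint map, pairing each parameter name with the .bin shard that stores it. A minimal sketch of inspecting it by hand follows; the local path is hypothetical, and in normal use the shards are resolved automatically by AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True).

import json
import torch

ckpt_dir = "weights/model-idf"  # hypothetical local copy of this folder
with open(f"{ckpt_dir}/pytorch_model.bin.index.json") as f:
    index = json.load(f)

# Look up which shard holds the token-embedding matrix.
shard_file = index["weight_map"]["transformer.wte.weight"]

# Load just that shard and pull out the single tensor.
shard = torch.load(f"{ckpt_dir}/{shard_file}", map_location="cpu")
print(shard_file, tuple(shard["transformer.wte.weight"].shape))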
weights/model-idf/qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-idf/qwen_generation_utils.py ADDED
@@ -0,0 +1,420 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Generation support."""
7
+
8
+ from typing import Tuple, List, Union, Iterable
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from transformers import PreTrainedTokenizer
14
+ from transformers import logging
15
+ from transformers.generation import LogitsProcessor
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+ # Types.
20
+ HistoryType = List[Tuple[str, str]]
21
+ TokensType = List[int]
22
+ BatchTokensType = List[List[int]]
23
+
24
+
25
+ def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> BatchTokensType:
26
+ for tokens in batch:
27
+ context_length = len(tokens)
28
+ if context_length < seq_length:
29
+ tokens.extend([pad_id] * (seq_length - context_length))
30
+ return batch
31
+
32
+
33
+ def get_ltor_masks_and_position_ids(
34
+ data,
35
+ eod_token,
36
+ reset_position_ids,
37
+ reset_attention_mask,
38
+ eod_mask_loss,
39
+ ):
40
+ """Build masks and position id for left to right model."""
41
+
42
+ # Extract batch size and sequence length.
43
+ micro_batch_size, seq_length = data.size()
44
+
45
+ # Attention mask (lower triangular).
46
+ if reset_attention_mask:
47
+ att_mask_batch = micro_batch_size
48
+ else:
49
+ att_mask_batch = 1
50
+ attention_mask = torch.tril(
51
+ torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
52
+ ).view(att_mask_batch, 1, seq_length, seq_length)
53
+
54
+ # Loss mask.
55
+ loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
56
+ if eod_mask_loss:
57
+ loss_mask[data == eod_token] = 0.0
58
+
59
+ # Position ids.
60
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device)
61
+ position_ids = position_ids.unsqueeze(0).expand_as(data)
62
+ # We need to clone as the ids will be modified based on batch index.
63
+ if reset_position_ids:
64
+ position_ids = position_ids.clone()
65
+
66
+ if reset_position_ids or reset_attention_mask:
67
+ # Loop through the batches:
68
+ for b in range(micro_batch_size):
69
+
70
+ # Find indices where the EOD token is.
71
+ eod_index = position_ids[b, data[b] == eod_token]
72
+ # Detach indices from positions if going to modify positions.
73
+ if reset_position_ids:
74
+ eod_index = eod_index.clone()
75
+
76
+ # Loop through EOD indices:
77
+ prev_index = 0
78
+ for j in range(eod_index.size()[0]):
79
+ i = eod_index[j]
80
+ # Mask attention loss.
81
+ if reset_attention_mask:
82
+ attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0
83
+ # Reset positions.
84
+ if reset_position_ids:
85
+ position_ids[b, (i + 1) :] -= i + 1 - prev_index
86
+ prev_index = i + 1
87
+
88
+ # Convert attention mask to binary:
89
+ attention_mask = attention_mask < 0.5
90
+
91
+ return attention_mask, loss_mask, position_ids
92
+
93
+
94
+ def get_batch(context_tokens: torch.LongTensor, eod_id: int):
95
+ """Generate batch from context tokens."""
96
+ # Keep tokens contiguous on their current device.
97
+ tokens = context_tokens.contiguous().to(context_tokens.device)
98
+ # Get the attention mask and position ids.
99
+ attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
100
+ tokens,
101
+ eod_id,
102
+ reset_position_ids=False,
103
+ reset_attention_mask=False,
104
+ eod_mask_loss=False,
105
+ )
106
+ return tokens, attention_mask, position_ids
107
+
108
+
109
+ def get_stop_words_ids(chat_format, tokenizer):
110
+ if chat_format == "raw":
111
+ stop_words_ids = [tokenizer.encode("Human:"), [tokenizer.eod_id]]
112
+ elif chat_format == "chatml":
113
+ stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
114
+ else:
115
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
116
+ return stop_words_ids
117
+
118
+
119
+ def make_context(
120
+ tokenizer: PreTrainedTokenizer,
121
+ query: str,
122
+ history: List[Tuple[str, str]] = None,
123
+ system: str = "",
124
+ max_window_size: int = 6144,
125
+ chat_format: str = "chatml",
126
+ ):
127
+ if history is None:
128
+ history = []
129
+
130
+ if chat_format == "chatml":
131
+ im_start, im_end = "<|im_start|>", "<|im_end|>"
132
+ im_start_tokens = [tokenizer.im_start_id]
133
+ im_end_tokens = [tokenizer.im_end_id]
134
+ nl_tokens = tokenizer.encode("\n")
135
+
136
+ def _tokenize_str(role, content):
137
+ return f"{role}\n{content}", tokenizer.encode(
138
+ role, allowed_special=set(tokenizer.IMAGE_ST)
139
+ ) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST))
140
+
141
+ system_text, system_tokens_part = _tokenize_str("system", system)
142
+ system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
143
+
144
+ raw_text = ""
145
+ context_tokens = []
146
+
147
+ for turn_query, turn_response in reversed(history):
148
+ query_text, query_tokens_part = _tokenize_str("user", turn_query)
149
+ query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
150
+ if turn_response is not None:
151
+ response_text, response_tokens_part = _tokenize_str(
152
+ "assistant", turn_response
153
+ )
154
+ response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
155
+
156
+ next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
157
+ prev_chat = (
158
+ f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
159
+ )
160
+ else:
161
+ next_context_tokens = nl_tokens + query_tokens + nl_tokens
162
+ prev_chat = f"\n{im_start}{query_text}{im_end}\n"
163
+
164
+ current_context_size = (
165
+ len(system_tokens) + len(next_context_tokens) + len(context_tokens)
166
+ )
167
+ if current_context_size < max_window_size:
168
+ context_tokens = next_context_tokens + context_tokens
169
+ raw_text = prev_chat + raw_text
170
+ else:
171
+ break
172
+
173
+ context_tokens = system_tokens + context_tokens
174
+ raw_text = f"{im_start}{system_text}{im_end}" + raw_text
175
+ context_tokens += (
176
+ nl_tokens
177
+ + im_start_tokens
178
+ + _tokenize_str("user", query)[1]
179
+ + im_end_tokens
180
+ + nl_tokens
181
+ + im_start_tokens
182
+ + tokenizer.encode("assistant")
183
+ + nl_tokens
184
+ )
185
+ raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
186
+
187
+ elif chat_format == "raw":
188
+ raw_text = query
189
+ context_tokens = tokenizer.encode(raw_text)
190
+ else:
191
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
192
+
193
+ return raw_text, context_tokens
194
+
195
+
196
+ def _decode_default(
197
+ tokens: List[int],
198
+ *,
199
+ stop_words: List[str],
200
+ eod_words: List[str],
201
+ tokenizer: PreTrainedTokenizer,
202
+ raw_text_len: int,
203
+ verbose: bool = False,
204
+ return_end_reason: bool = False,
205
+ errors: str='replace',
206
+ ):
207
+ trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
208
+ if verbose:
209
+ print("\nRaw Generate: ", trim_decode_tokens)
210
+
211
+ end_reason = f"Gen length {len(tokens)}"
212
+ for stop_word in stop_words:
213
+ trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
214
+ for eod_word in eod_words:
215
+ if eod_word in trim_decode_tokens:
216
+ end_reason = f"Gen {eod_word!r}"
217
+ trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
218
+ trim_decode_tokens = trim_decode_tokens.strip()
219
+ if verbose:
220
+ print("\nEnd Reason:", end_reason)
221
+ print("\nGenerate: ", trim_decode_tokens)
222
+
223
+ if return_end_reason:
224
+ return trim_decode_tokens, end_reason
225
+ else:
226
+ return trim_decode_tokens
227
+
228
+
229
+ def _decode_chatml(
230
+ tokens: List[int],
231
+ *,
232
+ stop_words: List[str],
233
+ eod_token_ids: List[int],
234
+ tokenizer: PreTrainedTokenizer,
235
+ raw_text_len: int,
236
+ context_length: int,
237
+ verbose: bool = False,
238
+ return_end_reason: bool = False,
239
+ errors: str='replace'
240
+ ):
241
+ end_reason = f"Gen length {len(tokens)}"
242
+ eod_token_idx = context_length
243
+ for eod_token_idx in range(context_length, len(tokens)):
244
+ if tokens[eod_token_idx] in eod_token_ids:
245
+ end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
246
+ break
247
+
248
+ trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:]
249
+ if verbose:
250
+ print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:])
251
+ print("\nRaw Generate:", trim_decode_tokens)
252
+ print("\nEnd Reason:", end_reason)
253
+ for stop_word in stop_words:
254
+ trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
255
+ trim_decode_tokens = trim_decode_tokens.strip()
256
+ if verbose:
257
+ print("\nGenerate:", trim_decode_tokens)
258
+
259
+ if return_end_reason:
260
+ return trim_decode_tokens, end_reason
261
+ else:
262
+ return trim_decode_tokens
263
+
264
+
265
+ def decode_tokens(
266
+ tokens: Union[torch.LongTensor, TokensType],
267
+ tokenizer: PreTrainedTokenizer,
268
+ raw_text_len: int,
269
+ context_length: int,
270
+ chat_format: str,
271
+ verbose: bool = False,
272
+ return_end_reason: bool = False,
273
+ errors: str="replace",
274
+ ) -> str:
275
+ if torch.is_tensor(tokens):
276
+ tokens = tokens.cpu().numpy().tolist()
277
+
278
+ if chat_format == "chatml":
279
+ return _decode_chatml(
280
+ tokens,
281
+ stop_words=[],
282
+ eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
283
+ tokenizer=tokenizer,
284
+ raw_text_len=raw_text_len,
285
+ context_length=context_length,
286
+ verbose=verbose,
287
+ return_end_reason=return_end_reason,
288
+ errors=errors,
289
+ )
290
+ elif chat_format == "raw":
291
+ return _decode_default(
292
+ tokens,
293
+ stop_words=["<|endoftext|>"],
294
+ eod_words=["<|endoftext|>"],
295
+ tokenizer=tokenizer,
296
+ raw_text_len=raw_text_len,
297
+ verbose=verbose,
298
+ return_end_reason=return_end_reason,
299
+ errors=errors,
300
+ )
301
+ else:
302
+ raise NotImplementedError(f"Unknown chat format {chat_format!r}")
303
+
304
+
305
+ class StopWordsLogitsProcessor(LogitsProcessor):
306
+ """
307
+ :class:`transformers.LogitsProcessor` that stops generation once any of the specified token sequences appears.
308
+
309
+ Args:
310
+ stop_words_ids (:obj:`List[List[int]]`):
311
+ List of lists of token ids for the stop sequences. In order to get the tokens of the words
312
+ that should not appear in the generated text, use :obj:`tokenizer(bad_word,
313
+ add_prefix_space=True).input_ids`.
314
+ eos_token_id (:obj:`int`):
315
+ The id of the `end-of-sequence` token.
316
+ """
317
+
318
+ def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
319
+
320
+ if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0:
321
+ raise ValueError(
322
+ f"`stop_words_ids` has to be a non-empty list, but is {stop_words_ids}."
323
+ )
324
+ if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids):
325
+ raise ValueError(
326
+ f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
327
+ )
328
+ if any(
329
+ any(
330
+ (not isinstance(token_id, (int, np.integer)) or token_id < 0)
331
+ for token_id in stop_word_ids
332
+ )
333
+ for stop_word_ids in stop_words_ids
334
+ ):
335
+ raise ValueError(
336
+ f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
337
+ )
338
+
339
+ self.stop_words_ids = list(
340
+ filter(
341
+ lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids
342
+ )
343
+ )
344
+ self.eos_token_id = eos_token_id
345
+ for stop_token_seq in self.stop_words_ids:
346
+ assert (
347
+ len(stop_token_seq) > 0
348
+ ), "Stop words token sequences {} cannot have an empty list".format(
349
+ stop_words_ids
350
+ )
351
+
352
+ def __call__(
353
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor
354
+ ) -> torch.FloatTensor:
355
+ stopped_samples = self._calc_stopped_samples(input_ids)
356
+ for i, should_stop in enumerate(stopped_samples):
357
+ if should_stop:
358
+ scores[i, self.eos_token_id] = float(2**15)
359
+ return scores
360
+
361
+ def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
362
+ if len(tokens) == 0:
363
+ # an empty stop sequence matches anything
364
+ return True
365
+ elif len(tokens) > len(prev_tokens):
366
+ # if the stop sequence is longer than prev_tokens, it cannot match
367
+ return False
368
+ elif prev_tokens[-len(tokens) :].tolist() == tokens:
369
+ # if tokens match
370
+ return True
371
+ else:
372
+ return False
373
+
374
+ def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]:
375
+ stopped_samples = []
376
+ for prev_input_ids_slice in prev_input_ids:
377
+ match = False
378
+ for stop_token_seq in self.stop_words_ids:
379
+ if self._tokens_match(prev_input_ids_slice, stop_token_seq):
380
+ # this stop-word sequence matched, so mark the sample as stopped
381
+ match = True
382
+ break
383
+ stopped_samples.append(match)
384
+
385
+ return stopped_samples
386
+
387
+
388
+ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
389
+ """This function has been mostly taken from huggingface conversational
390
+ ai code at
391
+ https://medium.com/huggingface/how-to-build-a-state-of-the-art-
392
+ conversational-ai-with-transfer-learning-2d818ac26313"""
393
+
394
+ if top_k > 0:
395
+ # Remove all tokens with a probability less than the
396
+ # last token of the top-k
397
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
398
+ logits[indices_to_remove] = filter_value
399
+
400
+ if top_p > 0.0:
401
+ # Sort logits in descending order for nucleus (top-p) filtering
402
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
403
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
404
+
405
+ # Remove tokens with cumulative probability above the threshold
406
+ sorted_indices_to_remove = cumulative_probs > top_p
407
+ # Shift the indices to the right to keep also the first token
408
+ # above the threshold
409
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
410
+ sorted_indices_to_remove[..., 0] = 0
411
+ for i in range(sorted_indices.size(0)):
412
+ indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
413
+ logits[i][indices_to_remove] = filter_value
414
+
415
+ return logits
416
+
417
+
418
+ def switch(val1, val2, boolean):
419
+ boolean = boolean.type_as(val1)
420
+ return (1 - boolean) * val1 + boolean * val2
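A minimal usage sketch for the sampling helpers above, assuming torch and the top_k_logits function from this file are in scope (the vocabulary size and batch size are illustrative):

    import torch

    # fake next-token logits for a batch of two sequences
    logits = torch.randn(2, 151936)
    # keep the 50 most likely tokens, then apply nucleus (top-p) filtering at 0.9
    filtered = top_k_logits(logits.clone(), top_k=50, top_p=0.9)
    probs = torch.softmax(filtered, dim=-1)               # filtered positions get probability 0
    next_token = torch.multinomial(probs, num_samples=1)  # one sampled id per sequence

StopWordsLogitsProcessor composes with Hugging Face generation in the usual way: wrap it in a transformers.LogitsProcessorList and pass it to model.generate via the logits_processor argument, then trim the output with decode_tokens for the chosen chat format.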
weights/model-idf/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "pad_token": "<|endoftext|>"
3
+ }
weights/model-idf/tokenization_qwen.py ADDED
@@ -0,0 +1,598 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Tokenization classes for QWen."""
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ import requests
12
+ import unicodedata
13
+ from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Optional
14
+
15
+ import tiktoken
16
+ import numpy as np
17
+ from PIL import Image
18
+ from PIL import ImageFont
19
+ from PIL import ImageDraw
20
+ from transformers import PreTrainedTokenizer, AddedToken
21
+ from transformers.utils import try_to_load_from_cache
22
+
23
+ import matplotlib.colors as mcolors
24
+ from matplotlib.font_manager import FontProperties
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
30
+ FONT_PATH = try_to_load_from_cache("Qwen/Qwen-VL-Chat", "SimSun.ttf")
31
+ if FONT_PATH is None:
32
+ if not os.path.exists("SimSun.ttf"):
33
+ ttf = requests.get("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/SimSun.ttf")
34
+ open("SimSun.ttf", "wb").write(ttf.content)
35
+ FONT_PATH = "SimSun.ttf"
36
+
37
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
38
+ ENDOFTEXT = "<|endoftext|>"
39
+ IMSTART = "<|im_start|>"
40
+ IMEND = "<|im_end|>"
41
+ # as the default behavior is changed to allow special tokens in
42
+ # regular texts, the surface forms of special tokens need to be
43
+ # as different as possible to minimize the impact
44
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
45
+ SPECIAL_TOKENS = (
46
+ ENDOFTEXT,
47
+ IMSTART,
48
+ IMEND,
49
+ ) + EXTRAS
50
+ IMG_TOKEN_SPAN = 256
51
+
52
+
53
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
54
+ with open(tiktoken_bpe_file, "rb") as f:
55
+ contents = f.read()
56
+ return {
57
+ base64.b64decode(token): int(rank)
58
+ for token, rank in (line.split() for line in contents.splitlines() if line)
59
+ }
60
+
61
+ def _list_find(
62
+ input_list: List[Any],
63
+ candidates: Tuple[Any],
64
+ start: int = 0,
65
+ ):
66
+ for i in range(start, len(input_list)):
67
+ if input_list[i] in candidates:
68
+ return i
69
+ return -1
70
+
71
+ def _replace_closed_tag(
72
+ input_tokens: List[Any],
73
+ start_tags: Union[Any, Tuple[Any]],
74
+ end_tags: Union[Any, Tuple[Any]],
75
+ inclusive_replace_func: Callable,
76
+ exclusive_replace_func: Callable = lambda x: x,
77
+ ):
78
+ if isinstance(start_tags, (str, int)):
79
+ start_tags = (start_tags,)
80
+ if isinstance(end_tags, (str, int)):
81
+ end_tags = (end_tags,)
82
+ assert len(start_tags) == len(end_tags)
83
+
84
+ output_tokens = []
85
+ end = 0
86
+ while True:
87
+ start = _list_find(input_tokens, start_tags, end)
88
+ if start == -1:
89
+ break
90
+ output_tokens.extend(exclusive_replace_func(input_tokens[end : start]))
91
+ tag_idx = start_tags.index(input_tokens[start])
92
+ end = _list_find(input_tokens, (end_tags[tag_idx],), start)
93
+ if end == -1:
94
+ raise ValueError("Unclosed image token")
95
+ output_tokens.extend(inclusive_replace_func(input_tokens[start : end + 1]))
96
+ end += 1
97
+ output_tokens.extend(exclusive_replace_func(input_tokens[end : ]))
98
+ return output_tokens
99
+
100
+ class QWenTokenizer(PreTrainedTokenizer):
101
+ """QWen tokenizer."""
102
+
103
+ vocab_files_names = VOCAB_FILES_NAMES
104
+
105
+ def __init__(
106
+ self,
107
+ vocab_file,
108
+ errors="replace",
109
+ image_start_tag='<img>',
110
+ image_end_tag='</img>',
111
+ image_pad_tag='<imgpad>',
112
+ ref_start_tag='<ref>',
113
+ ref_end_tag='</ref>',
114
+ box_start_tag='<box>',
115
+ box_end_tag='</box>',
116
+ quad_start_tag='<quad>',
117
+ quad_end_tag='</quad>',
118
+ **kwargs,
119
+ ):
120
+ super().__init__(**kwargs)
121
+ self.image_start_tag = image_start_tag
122
+ self.image_end_tag = image_end_tag
123
+ self.image_pad_tag = image_pad_tag
124
+ self.ref_start_tag = ref_start_tag
125
+ self.ref_end_tag = ref_end_tag
126
+ self.box_start_tag = box_start_tag
127
+ self.box_end_tag = box_end_tag
128
+ self.quad_start_tag = quad_start_tag
129
+ self.quad_end_tag = quad_end_tag
130
+ self.IMAGE_ST = (
131
+ ref_start_tag, ref_end_tag,
132
+ box_start_tag, box_end_tag,
133
+ quad_start_tag, quad_end_tag,
134
+ image_start_tag, image_end_tag,
135
+ image_pad_tag
136
+ )
137
+
138
+ self.errors = errors # how to handle errors in decoding
139
+
140
+ self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
141
+ self.special_tokens = {
142
+ token: index
143
+ for index, token in enumerate(
144
+ SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
145
+ )
146
+ }
147
+ self.img_start_id = self.special_tokens[self.image_start_tag]
148
+ self.img_end_id = self.special_tokens[self.image_end_tag]
149
+ self.img_pad_id = self.special_tokens[self.image_pad_tag]
150
+ self.ref_start_id = self.special_tokens[self.ref_start_tag]
151
+ self.ref_end_id = self.special_tokens[self.ref_end_tag]
152
+ self.box_start_id = self.special_tokens[self.box_start_tag]
153
+ self.box_end_id = self.special_tokens[self.box_end_tag]
154
+ self.quad_start_id = self.special_tokens[self.quad_start_tag]
155
+ self.quad_end_id = self.special_tokens[self.quad_end_tag]
156
+ self.image_special_tokens = set([
157
+ self.ref_start_id, self.ref_end_id, self.box_start_id, self.box_end_id,
158
+ self.quad_start_id, self.quad_end_id,
159
+ ])
160
+
161
+ enc = tiktoken.Encoding(
162
+ "Qwen",
163
+ pat_str=PAT_STR,
164
+ mergeable_ranks=self.mergeable_ranks,
165
+ special_tokens=self.special_tokens,
166
+ )
167
+ assert (
168
+ len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
169
+ ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
170
+
171
+ self.decoder = {
172
+ v: k for k, v in self.mergeable_ranks.items()
173
+ } # type: dict[int, bytes|str]
174
+ self.decoder.update({v: k for k, v in self.special_tokens.items()})
175
+
176
+ self.tokenizer = enc # type: tiktoken.Encoding
177
+
178
+ self.eod_id = self.tokenizer.eot_token
179
+ self.im_start_id = self.special_tokens[IMSTART]
180
+ self.im_end_id = self.special_tokens[IMEND]
181
+
182
+ def __getstate__(self):
183
+ # for pickle lovers
184
+ state = self.__dict__.copy()
185
+ del state['tokenizer']
186
+ return state
187
+
188
+ def __setstate__(self, state):
189
+ # tokenizer is not python native; don't pass it; rebuild it
190
+ self.__dict__.update(state)
191
+ enc = tiktoken.Encoding(
192
+ "Qwen",
193
+ pat_str=PAT_STR,
194
+ mergeable_ranks=self.mergeable_ranks,
195
+ special_tokens=self.special_tokens,
196
+ )
197
+ self.tokenizer = enc
198
+
199
+
200
+ def __len__(self) -> int:
201
+ return self.tokenizer.n_vocab
202
+
203
+ def get_vocab(self) -> Dict[bytes, int]:
204
+ return self.mergeable_ranks
205
+
206
+ def convert_tokens_to_ids(
207
+ self, tokens: Union[bytes, str, List[Union[bytes, str]]]
208
+ ) -> List[int]:
209
+ ids = []
210
+ if isinstance(tokens, (str, bytes)):
211
+ if tokens in self.special_tokens:
212
+ return self.special_tokens[tokens]
213
+ else:
214
+ return self.mergeable_ranks.get(tokens)
215
+ for token in tokens:
216
+ if token in self.special_tokens:
217
+ ids.append(self.special_tokens[token])
218
+ else:
219
+ ids.append(self.mergeable_ranks.get(token))
220
+ return ids
221
+
222
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
223
+ if not special_tokens and new_tokens:
224
+ raise ValueError('Adding regular tokens is not supported')
225
+ for token in new_tokens:
226
+ surface_form = token.content if isinstance(token, AddedToken) else token
227
+ if surface_form not in SPECIAL_TOKENS + self.IMAGE_ST:
228
+ raise ValueError('Adding unknown special tokens is not supported')
229
+ return 0
230
+
231
+ def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
232
+ """
233
+ Save only the vocabulary of the tokenizer (vocabulary).
234
+
235
+ Returns:
236
+ `Tuple(str)`: Paths to the files saved.
237
+ """
238
+ file_path = os.path.join(save_directory, "qwen.tiktoken")
239
+ with open(file_path, "w", encoding="utf8") as w:
240
+ for k, v in self.mergeable_ranks.items():
241
+ line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
242
+ w.write(line)
243
+ return (file_path,)
244
+
245
+ def tokenize(
246
+ self,
247
+ text: str,
248
+ allowed_special: Union[Set, str] = "all",
249
+ disallowed_special: Union[Collection, str] = (),
250
+ **kwargs,
251
+ ) -> List[Union[bytes, str]]:
252
+ """
253
+ Converts a string into a sequence of tokens.
254
+
255
+ Args:
256
+ text (`str`):
257
+ The sequence to be encoded.
258
+ allowed_special (`Literal["all"]` or `set`):
259
+ The surface forms of the tokens to be encoded as special tokens in regular texts.
260
+ Default to "all".
261
+ disallowed_special (`Literal["all"]` or `Collection`):
262
+ The surface forms of the tokens that should not be in regular texts and trigger errors.
263
+ Defaults to an empty tuple.
264
+
265
+ kwargs (additional keyword arguments, *optional*):
266
+ Will be passed to the underlying model specific encode method.
267
+
268
+ Returns:
269
+ `List[bytes|str]`: The list of tokens.
270
+ """
271
+ tokens = []
272
+ text = unicodedata.normalize("NFC", text)
273
+
274
+ # this implementation takes a detour: text -> token id -> token surface forms
275
+ for t in self.tokenizer.encode(
276
+ text, allowed_special=allowed_special, disallowed_special=disallowed_special
277
+ ):
278
+ tokens.append(self.decoder[t])
279
+
280
+ def _encode_imgurl(img_tokens):
281
+ assert img_tokens[0] == self.image_start_tag and img_tokens[-1] == self.image_end_tag
282
+ img_tokens = img_tokens[1:-1]
283
+ img_url = b''.join(img_tokens)
284
+ out_img_tokens = list(map(self.decoder.get, img_url))
285
+ if len(out_img_tokens) > IMG_TOKEN_SPAN:
286
+ raise ValueError("The content in {}..{} is too long".format(
287
+ self.image_start_tag, self.image_end_tag))
288
+ out_img_tokens.extend([self.image_pad_tag] * (IMG_TOKEN_SPAN - len(out_img_tokens)))
289
+ out_img_tokens = [self.image_start_tag] + out_img_tokens + [self.image_end_tag]
290
+ return out_img_tokens
291
+
292
+ return _replace_closed_tag(tokens, self.image_start_tag, self.image_end_tag, _encode_imgurl)
293
+
294
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
295
+ """
296
+ Converts a sequence of tokens into a single string.
297
+ """
298
+ text = ""
299
+ temp = b""
300
+ for t in tokens:
301
+ if isinstance(t, str):
302
+ if temp:
303
+ text += temp.decode("utf-8", errors=self.errors)
304
+ temp = b""
305
+ text += t
306
+ elif isinstance(t, bytes):
307
+ temp += t
308
+ else:
309
+ raise TypeError("token should only be of type types or str")
310
+ if temp:
311
+ text += temp.decode("utf-8", errors=self.errors)
312
+ return text
313
+
314
+ @property
315
+ def vocab_size(self):
316
+ return self.tokenizer.n_vocab
317
+
318
+ def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
319
+ """Converts an id to a token, special tokens included"""
320
+ if index in self.decoder:
321
+ return self.decoder[index]
322
+ raise ValueError("unknown ids")
323
+
324
+ def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
325
+ """Converts a token to an id using the vocab, special tokens included"""
326
+ if token in self.special_tokens:
327
+ return self.special_tokens[token]
328
+ if token in self.mergeable_ranks:
329
+ return self.mergeable_ranks[token]
330
+ raise ValueError("unknown token")
331
+
332
+ def _tokenize(self, text: str, **kwargs):
333
+ """
334
+ Converts a string into a sequence of tokens (string), using the tokenizer. Split into words for word-based
335
+ vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
336
+
337
+ Do NOT take care of added tokens.
338
+ """
339
+ raise NotImplementedError
340
+
341
+ def _decode(
342
+ self,
343
+ token_ids: Union[int, List[int]],
344
+ skip_special_tokens: bool = False,
345
+ errors: str = None,
346
+ **kwargs,
347
+ ) -> str:
348
+ if isinstance(token_ids, int):
349
+ token_ids = [token_ids]
350
+
351
+ def _decode_imgurl(img_token_ids):
352
+ assert img_token_ids[0] == self.img_start_id and img_token_ids[-1] == self.img_end_id
353
+ img_token_ids = img_token_ids[1:-1]
354
+ img_token_ids = img_token_ids[ : img_token_ids.index(self.img_pad_id)]
355
+ img_url = bytes(img_token_ids).decode('utf-8')
356
+ return [self.img_start_id] + self.tokenizer.encode(img_url) + [self.img_end_id]
357
+
358
+ token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
359
+
360
+ if skip_special_tokens:
361
+ if kwargs.get('keep_image_special', False):
362
+ token_ids = [i for i in token_ids if i < self.eod_id
363
+ or i in self.image_special_tokens]
364
+ else:
365
+ token_ids = [i for i in token_ids if i < self.eod_id]
366
+ return self.tokenizer.decode(token_ids, errors=errors or self.errors)
367
+
368
+ def to_list_format(self, text: str):
369
+ text = unicodedata.normalize("NFC", text)
370
+ token_ids = self.tokenizer.encode(
371
+ text, allowed_special=set(self.IMAGE_ST + (ENDOFTEXT,)))
372
+
373
+ def _encode_vl_info(tokens):
374
+ if len(tokens) == 0:
375
+ return []
376
+ if tokens[0] == self.img_start_id and tokens[-1] == self.img_end_id:
377
+ key = 'image'
378
+ elif tokens[0] == self.ref_start_id and tokens[-1] == self.ref_end_id:
379
+ key = 'ref'
380
+ elif tokens[0] == self.box_start_id and tokens[-1] == self.box_end_id:
381
+ key = 'box'
382
+ elif tokens[0] == self.quad_start_id and tokens[-1] == self.quad_end_id:
383
+ key = 'quad'
384
+ else:
385
+ _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
386
+ return [{'text': b''.join(map(_tobytes, map(self.decoder.get, tokens))).decode('utf-8')}]
387
+ _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
388
+ val = b''.join(map(_tobytes, map(self.decoder.get, tokens[1:-1]))).decode('utf-8')
389
+ return [{key: val}]
390
+
391
+ return _replace_closed_tag(
392
+ token_ids,
393
+ (self.img_start_id, self.ref_start_id, self.box_start_id, self.quad_start_id),
394
+ (self.img_end_id, self.ref_end_id, self.box_end_id, self.quad_end_id),
395
+ _encode_vl_info,
396
+ _encode_vl_info,
397
+ )
398
+
399
+ def from_list_format(self, list_format: List[Dict]):
400
+ text = ''
401
+ num_images = 0
402
+ for ele in list_format:
403
+ if 'image' in ele:
404
+ num_images += 1
405
+ text += f'Picture {num_images}: '
406
+ text += self.image_start_tag + ele['image'] + self.image_end_tag
407
+ text += '\n'
408
+ elif 'text' in ele:
409
+ text += ele['text']
410
+ elif 'box' in ele:
411
+ if 'ref' in ele:
412
+ text += self.ref_start_tag + ele['ref'] + self.ref_end_tag
413
+ for box in ele['box']:
414
+ text += self.box_start_tag + '(%d,%d),(%d,%d)' % (box[0], box[1], box[2], box[3]) + self.box_end_tag
415
+ else:
416
+ raise ValueError("Unsupport element: " + str(ele))
417
+ return text
418
+
419
+ def _fetch_latest_picture(self, response, history):
420
+ if history is None:
421
+ history = []
422
+ _history = history + [(response, None)]
423
+ for q, r in _history[::-1]:
424
+ for ele in self.to_list_format(q)[::-1]:
425
+ if 'image' in ele:
426
+ return ele['image']
427
+ return None
428
+
429
+ def _fetch_all_box_with_ref(self, text):
430
+ list_format = self.to_list_format(text)
431
+ output = []
432
+ for i, ele in enumerate(list_format):
433
+ if 'box' in ele:
434
+ bbox = tuple(map(int, ele['box'].replace('(', '').replace(')', '').split(',')))
435
+ assert len(bbox) == 4
436
+ output.append({'box': bbox})
437
+ if i > 0 and 'ref' in list_format[i-1]:
438
+ output[-1]['ref'] = list_format[i-1]['ref'].strip()
439
+ return output
440
+
441
+ def draw_bbox_on_latest_picture(
442
+ self,
443
+ response,
444
+ history=None,
445
+ ) -> Optional[Image.Image]:
446
+ image = self._fetch_latest_picture(response, history)
447
+ if image is None:
448
+ return None
449
+ if image.startswith("http://") or image.startswith("https://"):
450
+ image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
451
+ h, w = image.height, image.width
452
+ else:
453
+ image = np.asarray(Image.open(image).convert("RGB"))
454
+ h, w = image.shape[0], image.shape[1]
455
+ visualizer = Visualizer(image)
456
+
457
+ boxes = self._fetch_all_box_with_ref(response)
458
+ if not boxes:
459
+ return None
460
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()]) # init color
461
+ for box in boxes:
462
+ if 'ref' in box: # random new color for new refexps
463
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()])
464
+ x1, y1, x2, y2 = box['box']
465
+ x1, y1, x2, y2 = (int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h))
466
+ visualizer.draw_box((x1, y1, x2, y2), alpha=1, edge_color=color)
467
+ if 'ref' in box:
468
+ visualizer.draw_text(box['ref'], (x1, y1), color=color, horizontal_alignment="left")
469
+ return visualizer.output
470
+
471
+
472
+ import colorsys
473
+ import logging
474
+ import math
475
+ import numpy as np
476
+ import matplotlib as mpl
477
+ import matplotlib.colors as mplc
478
+ import matplotlib.figure as mplfigure
479
+ import torch
480
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
481
+ from PIL import Image
482
+ import random
483
+
484
+ logger = logging.getLogger(__name__)
485
+
486
+
487
+ class VisImage:
488
+ def __init__(self, img, scale=1.0):
489
+ self.img = img
490
+ self.scale = scale
491
+ self.width, self.height = img.shape[1], img.shape[0]
492
+ self._setup_figure(img)
493
+
494
+ def _setup_figure(self, img):
495
+ fig = mplfigure.Figure(frameon=False)
496
+ self.dpi = fig.get_dpi()
497
+ # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
498
+ # (https://github.com/matplotlib/matplotlib/issues/15363)
499
+ fig.set_size_inches(
500
+ (self.width * self.scale + 1e-2) / self.dpi,
501
+ (self.height * self.scale + 1e-2) / self.dpi,
502
+ )
503
+ self.canvas = FigureCanvasAgg(fig)
504
+ # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
505
+ ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
506
+ ax.axis("off")
507
+ self.fig = fig
508
+ self.ax = ax
509
+ self.reset_image(img)
510
+
511
+ def reset_image(self, img):
512
+ img = img.astype("uint8")
513
+ self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
514
+
515
+ def save(self, filepath):
516
+ self.fig.savefig(filepath)
517
+
518
+ def get_image(self):
519
+ canvas = self.canvas
520
+ s, (width, height) = canvas.print_to_buffer()
521
+
522
+ buffer = np.frombuffer(s, dtype="uint8")
523
+
524
+ img_rgba = buffer.reshape(height, width, 4)
525
+ rgb, alpha = np.split(img_rgba, [3], axis=2)
526
+ return rgb.astype("uint8")
527
+
528
+
529
+ class Visualizer:
530
+ def __init__(self, img_rgb, metadata=None, scale=1.0):
531
+ self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
532
+ self.font_path = FONT_PATH
533
+ self.output = VisImage(self.img, scale=scale)
534
+ self.cpu_device = torch.device("cpu")
535
+
536
+ # very small text is unreadable, so clamp the font size to a sensible minimum
537
+ self._default_font_size = max(
538
+ np.sqrt(self.output.height * self.output.width) // 30, 15 // scale
539
+ )
540
+
541
+ def draw_text(
542
+ self,
543
+ text,
544
+ position,
545
+ *,
546
+ font_size=None,
547
+ color="g",
548
+ horizontal_alignment="center",
549
+ rotation=0,
550
+ ):
551
+ if not font_size:
552
+ font_size = self._default_font_size
553
+
554
+ # since the text background is dark, we don't want the text to be dark
555
+ color = np.maximum(list(mplc.to_rgb(color)), 0.2)
556
+ color[np.argmax(color)] = max(0.8, np.max(color))
557
+
558
+ x, y = position
559
+ self.output.ax.text(
560
+ x,
561
+ y,
562
+ text,
563
+ size=font_size * self.output.scale,
564
+ fontproperties=FontProperties(fname=self.font_path),
565
+ bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
566
+ verticalalignment="top",
567
+ horizontalalignment=horizontal_alignment,
568
+ color=color,
569
+ zorder=10,
570
+ rotation=rotation,
571
+ )
572
+ return self.output
573
+
574
+ def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
575
+
576
+ x0, y0, x1, y1 = box_coord
577
+ width = x1 - x0
578
+ height = y1 - y0
579
+
580
+ linewidth = max(self._default_font_size / 4, 1)
581
+
582
+ self.output.ax.add_patch(
583
+ mpl.patches.Rectangle(
584
+ (x0, y0),
585
+ width,
586
+ height,
587
+ fill=False,
588
+ edgecolor=edge_color,
589
+ linewidth=linewidth * self.output.scale,
590
+ alpha=alpha,
591
+ linestyle=line_style,
592
+ )
593
+ )
594
+ return self.output
595
+
596
+ def get_output(self):
597
+
598
+ return self.output
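A minimal sketch of the multimodal helpers defined in tokenization_qwen.py, assuming a QWenTokenizer instance named tokenizer has already been loaded for this checkpoint (e.g. via AutoTokenizer.from_pretrained with trust_remote_code=True); the image path and the grounded response string are illustrative:

    # build a query that interleaves an image reference with text
    query = tokenizer.from_list_format([
        {"image": "demo.jpeg"},                        # illustrative local image path
        {"text": "Frame the person in the picture."},
    ])
    # query now begins with 'Picture 1: <img>demo.jpeg</img>\n' followed by the text

    # parse a grounded response back into structured elements
    response = "<ref> person</ref><box>(120,160),(480,900)</box>"
    elements = tokenizer.to_list_format(response)
    # -> [{'ref': ' person'}, {'box': '(120,160),(480,900)'}]

    # draw the predicted box onto the latest picture in the history (needs the image file to exist)
    vis = tokenizer.draw_bbox_on_latest_picture(response, history=[(query, response)])
    if vis is not None:
        vis.save("boxed.jpg")                          # VisImage.save writes the rendered figure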
weights/model-idf/tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_qwen.QWenTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": true,
9
+ "model_max_length": 2048,
10
+ "padding_side": "right",
11
+ "tokenizer_class": "QWenTokenizer"
12
+ }
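The auto_map entry above is what lets transformers resolve the custom QWenTokenizer class shipped next to the weights. A hedged loading sketch (the checkpoint path is illustrative, and trust_remote_code=True is required because the bundled tokenization_qwen.py is executed from the checkpoint directory):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "weights/model-idf",        # illustrative path to this checkpoint directory
        trust_remote_code=True,     # run the bundled tokenization_qwen.py
    )
    ids = tokenizer("Hello, Qwen-VL!")["input_ids"]
    round_trip = tokenizer.decode(ids)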
weights/model-idf/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
weights/model-idf/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af8991e433f62470b81e72354c2330cd5f430895d2ee3ab60779f7dc22cfc953
3
+ size 6395
weights/model-idf/visual.py ADDED
@@ -0,0 +1,545 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import OrderedDict
7
+ import math
8
+ import requests
9
+ from io import BytesIO
10
+ from functools import partial
11
+ from PIL import Image
12
+ from typing import Callable, Optional, Sequence, Tuple, List
13
+ import numpy as np
14
+
15
+ import torch
16
+ from torch import nn
17
+ from torch.nn import functional as F
18
+ from torch.nn.init import normal_
19
+ from torchvision import transforms
20
+ from torchvision.transforms import InterpolationMode
21
+
22
+
23
+ def get_abs_pos(abs_pos, tgt_size):
24
+ # abs_pos: L, C
25
+ # tgt_size: M
26
+ # return: M, C
27
+ src_size = int(math.sqrt(abs_pos.size(0)))
28
+ tgt_size = int(math.sqrt(tgt_size))
29
+ dtype = abs_pos.dtype
30
+
31
+ if src_size != tgt_size:
32
+ return F.interpolate(
33
+ abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
34
+ size=(tgt_size, tgt_size),
35
+ mode="bicubic",
36
+ align_corners=False,
37
+ ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
38
+ else:
39
+ return abs_pos
40
+
41
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
42
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
43
+ """
44
+ grid_size: int of the grid height and width
45
+ return:
46
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
47
+ """
48
+ grid_h = np.arange(grid_size, dtype=np.float32)
49
+ grid_w = np.arange(grid_size, dtype=np.float32)
50
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
51
+ grid = np.stack(grid, axis=0)
52
+
53
+ grid = grid.reshape([2, 1, grid_size, grid_size])
54
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
55
+ if cls_token:
56
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
57
+ return pos_embed
58
+
59
+
60
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
61
+ assert embed_dim % 2 == 0
62
+
63
+ # use half of dimensions to encode grid_h
64
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
65
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
66
+
67
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
68
+ return emb
69
+
70
+
71
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
72
+ """
73
+ embed_dim: output dimension for each position
74
+ pos: a list of positions to be encoded: size (M,)
75
+ out: (M, D)
76
+ """
77
+ assert embed_dim % 2 == 0
78
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
79
+ omega /= embed_dim / 2.
80
+ omega = 1. / 10000**omega # (D/2,)
81
+
82
+ pos = pos.reshape(-1) # (M,)
83
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
84
+
85
+ emb_sin = np.sin(out) # (M, D/2)
86
+ emb_cos = np.cos(out) # (M, D/2)
87
+
88
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
89
+ return emb
90
+
91
+
92
+ class Resampler(nn.Module):
93
+ """
94
+ A 2D perceiver-resampler network with one cross-attention layer, driven by
95
+ (grid_size**2) learnable queries and 2d sincos pos_emb
96
+ Outputs:
97
+ A tensor with the shape of (grid_size**2, embed_dim)
98
+ """
99
+ def __init__(
100
+ self,
101
+ grid_size,
102
+ embed_dim,
103
+ num_heads,
104
+ kv_dim=None,
105
+ norm_layer=nn.LayerNorm
106
+ ):
107
+ super().__init__()
108
+ self.num_queries = grid_size ** 2
109
+ self.embed_dim = embed_dim
110
+ self.num_heads = num_heads
111
+
112
+ self.pos_embed = nn.Parameter(
113
+ torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
114
+ ).requires_grad_(False)
115
+
116
+ self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
117
+ normal_(self.query, std=.02)
118
+
119
+ if kv_dim is not None and kv_dim != embed_dim:
120
+ self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
121
+ else:
122
+ self.kv_proj = nn.Identity()
123
+
124
+ self.attn = nn.MultiheadAttention(embed_dim, num_heads) # only out_proj
125
+ self.ln_q = norm_layer(embed_dim)
126
+ self.ln_kv = norm_layer(embed_dim)
127
+
128
+ # new
129
+ self.attn1 = nn.MultiheadAttention(embed_dim, num_heads)
130
+ self.attn1.apply(self._init_weights_zero)
131
+ # self.id_query_fc = nn.Linear(embed_dim, embed_dim, bias=False)
132
+ # self.test_feat_fc = nn.Linear(embed_dim, embed_dim, bias=False)
133
+ # self.id_query_fc.apply(self._init_weights_zero)
134
+ # self.test_feat_fc.apply(self._init_weights_zero)
135
+
136
+ # self.apply(self._init_weights)
137
+
138
+ def _init_weights(self, m):
139
+ if isinstance(m, nn.Linear):
140
+ normal_(m.weight, std=.02)
141
+ if isinstance(m, nn.Linear) and m.bias is not None:
142
+ nn.init.constant_(m.bias, 0)
143
+ elif isinstance(m, nn.LayerNorm):
144
+ nn.init.constant_(m.bias, 0)
145
+ nn.init.constant_(m.weight, 1.0)
146
+
147
+ def _init_weights_zero(self, m):
148
+ if isinstance(m, nn.Linear):
149
+ nn.init.constant_(m.weight, 0)
150
+ if isinstance(m, nn.Linear) and m.bias is not None:
151
+ nn.init.constant_(m.bias, 0)
152
+ elif isinstance(m, nn.LayerNorm):
153
+ nn.init.constant_(m.bias, 0)
154
+ nn.init.constant_(m.weight, 1.0)
155
+
156
+ def forward(self, x, images_flag, attn_mask=None): # idadapter3
157
+ pos_embed = get_abs_pos(self.pos_embed, x.size(1))
158
+
159
+ x = self.kv_proj(x)
160
+ x = self.ln_kv(x).permute(1, 0, 2)
161
+
162
+ N = x.shape[1]
163
+ q = self.ln_q(self.query)
164
+ q_ = self._repeat(q, N)
165
+
166
+ out_list = []
167
+ ind = 0
168
+ while ind < len(images_flag):
169
+ if images_flag[ind] == 2:
170
+ end_ind = ind+1
171
+ while True:
172
+ if end_ind >= len(images_flag):
173
+ break
174
+ if images_flag[end_ind] == 2:
175
+ end_ind += 1
176
+ else:
177
+ break
178
+ fake_out = self.attn1(
179
+ q_[:,ind:end_ind,:] + self.pos_embed.unsqueeze(1),
180
+ x[:,ind:end_ind,:] + pos_embed.unsqueeze(1),
181
+ x[:,ind:end_ind,:],
182
+ attn_mask=attn_mask)[0]
183
+
184
+ llava_out = self.attn(
185
+ q_[:,ind:end_ind,:] + self.pos_embed.unsqueeze(1),
186
+ x[:,ind:end_ind,:] + pos_embed.unsqueeze(1),
187
+ x[:,ind:end_ind,:],
188
+ attn_mask=attn_mask)[0]
189
+ llava_out = llava_out + 0 * fake_out
190
+ ind = end_ind
191
+ out_list.append(llava_out)
192
+
193
+ elif images_flag[ind] == 0:
194
+ id_end_ind = ind+1
195
+ while True:
196
+ if id_end_ind >= len(images_flag):
197
+ break
198
+ if images_flag[id_end_ind] == 0:
199
+ id_end_ind += 1
200
+ else:
201
+ break
202
+ id_out = self.attn(
203
+ q_[:,ind:id_end_ind,:] + self.pos_embed.unsqueeze(1),
204
+ x[:,ind:id_end_ind,:] + pos_embed.unsqueeze(1),
205
+ x[:,ind:id_end_ind,:],
206
+ attn_mask=attn_mask)[0]
207
+ ind = id_end_ind
208
+ out_list.append(id_out)
209
+ test_end_ind = ind+1
210
+ while True:
211
+ if test_end_ind >= len(images_flag):
212
+ break
213
+ if images_flag[test_end_ind] == 1:
214
+ test_end_ind += 1
215
+ else:
216
+ break
217
+ id_query = id_out.detach()
218
+ id_query = id_query.permute(1,0,2).reshape([-1, id_query.shape[-1]])
219
+ id_query = self._repeat(id_query, test_end_ind-ind)
220
+ test_feats = x[:,ind:test_end_ind,:]
221
+ test_out1 = self.attn1(
222
+ test_feats + pos_embed.unsqueeze(1),
223
+ id_query,
224
+ id_query,
225
+ attn_mask=attn_mask)[0]
226
+ test_out1 = test_out1 + test_feats # residual
227
+ test_out2 = self.attn(
228
+ q_[:,ind:test_end_ind,:] + self.pos_embed.unsqueeze(1),
229
+ test_out1 + pos_embed.unsqueeze(1),
230
+ test_out1,
231
+ attn_mask=attn_mask)[0]
232
+ ind = test_end_ind
233
+ out_list.append(test_out2)
234
+
235
+ else:
236
+ print(f'Resampler.forward: unexpected images_flag value {images_flag[ind]}')
+ ind += 1 # advance to avoid an infinite loop on unexpected flags
237
+
238
+ out = torch.cat(out_list, 1)
239
+ # except:
240
+ # fake_out = self.attn1(
241
+ # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
242
+ # x + pos_embed.unsqueeze(1),
243
+ # x,
244
+ # attn_mask=attn_mask)[0]
245
+ # out = self.attn(
246
+ # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
247
+ # x + pos_embed.unsqueeze(1),
248
+ # x,
249
+ # attn_mask=attn_mask)[0]
250
+ # out = out + 0 * fake_out
251
+ # t = time.time()
252
+ # with open('/mnt/bn/automl-aigc/yatai/Qwen-VL/bug2.txt', 'a') as f:
253
+ # f.write(f"{t}: visual id-former error\n")
254
+ return out.permute(1,0,2)
255
+
256
+ # def forward(self, x, images_flag=None, attn_mask=None):
257
+
258
+ # pos_embed = get_abs_pos(self.pos_embed, x.size(1))
259
+
260
+ # x = self.kv_proj(x)
261
+ # x = self.ln_kv(x).permute(1, 0, 2)
262
+
263
+ # N = x.shape[1]
264
+ # q = self.ln_q(self.query)
265
+
266
+ # out = self.attn(
267
+ # self._repeat(q, N) + self.pos_embed.unsqueeze(1),
268
+ # x + pos_embed.unsqueeze(1),
269
+ # x,
270
+ # attn_mask=attn_mask)[0]
271
+
272
+ # return out.permute(1, 0, 2)
273
+
274
+ def _repeat(self, query, N: int):
275
+ return query.unsqueeze(1).repeat(1, N, 1)
276
+
277
+
278
+ class VisualAttention(nn.Module):
279
+ """self-attention layer class.
280
+
281
+ Self-attention layer takes input with size [s, b, h]
282
+ and returns output of the same size.
283
+ """
284
+
285
+ def __init__(self, embed_dim, num_heads,
286
+ bias=True, kdim=None, vdim=None):
287
+ super(VisualAttention, self).__init__()
288
+ self.embed_dim = embed_dim
289
+ self.kdim = kdim if kdim is not None else embed_dim
290
+ self.vdim = vdim if vdim is not None else embed_dim
291
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
292
+
293
+ self.num_heads = num_heads
294
+
295
+ # Per attention head and per partition values.
296
+ assert embed_dim % num_heads == 0
297
+ self.hidden_size_per_attention_head = embed_dim // num_heads
298
+ self.num_attention_heads_per_partition = num_heads
299
+ self.hidden_size_per_partition = embed_dim
300
+
301
+ # Strided linear layer.
302
+ assert self._qkv_same_embed_dim, 'Only Support SelfAttention Currently'
303
+ self.in_proj = nn.Linear(embed_dim, 3 * embed_dim)
304
+ self.out_proj = nn.Linear(embed_dim, embed_dim)
305
+ self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
306
+
307
+ def forward(self, query, key, value, attn_mask = None):
308
+ # query/key/value: [sq, b, h]
309
+ sq, b, _ = query.size()
310
+
311
+ assert torch.allclose(query, key), 'Only Support Self-Attention Currently'
312
+ sk = sq
313
+ mixed_x_layer = self.in_proj(query)
314
+
315
+ # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
316
+ new_tensor_shape = mixed_x_layer.size()[:-1] + \
317
+ (self.num_attention_heads_per_partition,
318
+ 3 * self.hidden_size_per_attention_head)
319
+ mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
320
+
321
+ # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
322
+ query_layer, key_layer, value_layer = mixed_x_layer.split(
323
+ self.hidden_size_per_attention_head, dim=-1)
324
+
325
+ # [sq, b, np, hn] -> [sq, b * np, hn]
326
+ query_layer = query_layer.view(sq,
327
+ b * self.num_attention_heads_per_partition,
328
+ self.hidden_size_per_attention_head).transpose(0, 1)
329
+ # [sk, b, np, hn] -> [sk, b * np, hn]
330
+ key_layer = key_layer.view(sk,
331
+ b * self.num_attention_heads_per_partition,
332
+ self.hidden_size_per_attention_head).transpose(0, 1)
333
+
334
+ q_scaled = query_layer / self.norm_factor
335
+ if attn_mask is not None:
336
+ attention_probs = torch.baddbmm(attn_mask, q_scaled, key_layer.transpose(-2, -1))
337
+ else:
338
+ attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
339
+ attention_probs = attention_probs.softmax(dim=-1)
340
+
341
+ value_layer = value_layer.view(sk,
342
+ b * self.num_attention_heads_per_partition,
343
+ self.hidden_size_per_attention_head).transpose(0, 1)
344
+
345
+ # matmul: [b * np, sq, hn]
346
+ context_layer = torch.bmm(attention_probs, value_layer)
347
+
348
+ # change view [b, np, sq, hn]
349
+ context_layer = context_layer.view(b,
350
+ self.num_attention_heads_per_partition,
351
+ sq, self.hidden_size_per_attention_head)
352
+
353
+ # [b, np, sq, hn] --> [sq, b, np, hn]
354
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
355
+
356
+ # [sq, b, np, hn] --> [sq, b, hp]
357
+ new_context_layer_shape = context_layer.size()[:-2] + \
358
+ (self.hidden_size_per_partition,)
359
+ context_layer = context_layer.view(*new_context_layer_shape)
360
+
361
+ output = self.out_proj(context_layer)
362
+
363
+ return output
364
+
365
+
366
+ class VisualAttentionBlock(nn.Module):
367
+ def __init__(
368
+ self,
369
+ d_model: int,
370
+ n_head: int,
371
+ mlp_ratio: float = 4.0,
372
+ act_layer: Callable = nn.GELU,
373
+ norm_layer: Callable = nn.LayerNorm,
374
+ is_cross_attention: bool = False,
375
+ ):
376
+ super().__init__()
377
+
378
+ self.ln_1 = norm_layer(d_model)
379
+ if is_cross_attention:
380
+ self.ln_1_kv = norm_layer(d_model)
381
+
382
+ self.ln_2 = norm_layer(d_model)
383
+ mlp_width = int(d_model * mlp_ratio)
384
+ self.attn = VisualAttention(d_model, n_head)
385
+ self.mlp = nn.Sequential(OrderedDict([
386
+ ("c_fc", nn.Linear(d_model, mlp_width)),
387
+ ("gelu", act_layer()),
388
+ ("c_proj", nn.Linear(mlp_width, d_model))
389
+ ]))
390
+
391
+ def attention(
392
+ self,
393
+ q_x: torch.Tensor,
394
+ k_x: Optional[torch.Tensor] = None,
395
+ v_x: Optional[torch.Tensor] = None,
396
+ attn_mask: Optional[torch.Tensor] = None,
397
+ ):
398
+ k_x = k_x if k_x is not None else q_x
399
+ v_x = v_x if v_x is not None else q_x
400
+
401
+ attn_mask = attn_mask.to(q_x.dtype) if attn_mask is not None else None
402
+ return self.attn(q_x, k_x, v_x, attn_mask=attn_mask)
403
+
404
+ def forward(
405
+ self,
406
+ q_x: torch.Tensor,
407
+ k_x: Optional[torch.Tensor] = None,
408
+ v_x: Optional[torch.Tensor] = None,
409
+ attn_mask: Optional[torch.Tensor] = None,
410
+ ):
411
+ k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
412
+ v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
413
+
414
+ x = q_x + self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)
415
+ x = x + self.mlp(self.ln_2(x))
416
+ return x
417
+
418
+
419
+ class TransformerBlock(nn.Module):
420
+ def __init__(
421
+ self,
422
+ width: int,
423
+ layers: int,
424
+ heads: int,
425
+ mlp_ratio: float = 4.0,
426
+ act_layer: Callable = nn.GELU,
427
+ norm_layer: Callable = nn.LayerNorm,
428
+ ):
429
+ super().__init__()
430
+ self.width = width
431
+ self.layers = layers
432
+
433
+ self.resblocks = nn.ModuleList([
434
+ VisualAttentionBlock(
435
+ width, heads, mlp_ratio, act_layer=act_layer, norm_layer=norm_layer)
436
+ for _ in range(layers)
437
+ ])
438
+
439
+ def get_cast_dtype(self) -> torch.dtype:
440
+ return self.resblocks[0].mlp.c_fc.weight.dtype
441
+
442
+ def get_cast_device(self) -> torch.device:
443
+ return self.resblocks[0].mlp.c_fc.weight.device
444
+
445
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
446
+ for r in self.resblocks:
447
+ x = r(x, attn_mask=attn_mask)
448
+ return x
449
+
450
+
451
+ class VisionTransformer(nn.Module):
452
+
453
+ def __init__(
454
+ self,
455
+ image_size: int,
456
+ patch_size: int,
457
+ width: int,
458
+ layers: int,
459
+ heads: int,
460
+ mlp_ratio: float,
461
+ n_queries: int = 256,
462
+ output_dim: int = 512,
463
+ **kwargs
464
+ ):
465
+ super().__init__()
466
+ image_height, image_width = self.image_size = (image_size, image_size)
467
+ patch_height, patch_width = self.patch_size = (patch_size, patch_size)
468
+ self.grid_size = (image_height // patch_height, image_width // patch_width)
469
+ self.output_dim = output_dim
470
+
471
+ mean = (0.48145466, 0.4578275, 0.40821073)
472
+ std = (0.26862954, 0.26130258, 0.27577711)
473
+ self.image_transform = transforms.Compose([
474
+ transforms.Resize(
475
+ (image_size, image_size),
476
+ interpolation=InterpolationMode.BICUBIC
477
+ ),
478
+ transforms.ToTensor(),
479
+ transforms.Normalize(mean=mean, std=std),
480
+ ])
481
+
482
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
483
+
484
+ # class embeddings and positional embeddings
485
+ scale = width ** -0.5
486
+ self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
487
+
488
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
489
+ act_layer = nn.GELU
490
+
491
+ self.ln_pre = norm_layer(width)
492
+ self.transformer = TransformerBlock(
493
+ width,
494
+ layers,
495
+ heads,
496
+ mlp_ratio,
497
+ act_layer=act_layer,
498
+ norm_layer=norm_layer,
499
+ )
500
+
501
+ self.attn_pool = Resampler(
502
+ grid_size=int(math.sqrt(n_queries)),
503
+ embed_dim=output_dim,
504
+ num_heads=output_dim // 128,
505
+ kv_dim=width,
506
+ norm_layer=norm_layer,
507
+ )
508
+ self.ln_post = norm_layer(output_dim)
509
+ self.proj = nn.Parameter((output_dim** -0.5) * torch.randn(output_dim, output_dim))
510
+
511
+ def forward(self, x: torch.Tensor, images_flag):
512
+ x = x.to(
513
+ dtype=self.transformer.get_cast_dtype(),
514
+ device=self.transformer.get_cast_device(),
515
+ )
516
+ # to patches
517
+ x = self.conv1(x) # shape = [*, width, grid, grid]
518
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
519
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
520
+
521
+ x = x + get_abs_pos(self.positional_embedding, x.size(1))
522
+
523
+ x = self.ln_pre(x)
524
+
525
+ x = x.permute(1, 0, 2) # NLD -> LND
526
+ x = self.transformer(x)
527
+ x = x.permute(1, 0, 2) # LND -> NLD
528
+
529
+ x = self.attn_pool(x, images_flag)
530
+ x = self.ln_post(x)
531
+ x = x @ self.proj
532
+
533
+ return x
534
+
535
+ def encode(self, image_paths: List[str], images_flag):
536
+ images = []
537
+ for image_path in image_paths:
538
+ if image_path.startswith("http://") or image_path.startswith("https://"):
539
+ image = Image.open(requests.get(image_path, stream=True).raw)
540
+ else:
541
+ image = Image.open(image_path)
542
+ image = image.convert("RGB")
543
+ images.append(self.image_transform(image))
544
+ images = torch.stack(images, dim=0)
545
+ return self(images, images_flag)
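A small, self-contained sketch of the Resampler defined above, assuming the classes and helpers in this visual.py are importable. The dimensions are deliberately tiny compared with the 1664-wide ViT / 4096-dim output configured for this checkpoint, and the images_flag convention is the one read off forward(): 2 marks an ordinary image, 0 an identity/reference image, and 1 a test image attended against the id queries.

    import torch

    # 16x16 grid of fixed sin-cos position codes -> (256, 512) numpy array
    pos = get_2d_sincos_pos_embed(embed_dim=512, grid_size=16)

    resampler = Resampler(grid_size=16, embed_dim=512, num_heads=4, kv_dim=256)
    vit_feats = torch.randn(3, 256, 256)          # (batch, patches, kv_dim)
    out = resampler(vit_feats, images_flag=[2, 2, 2])
    print(out.shape)                              # torch.Size([3, 256, 512])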
weights/model-idf/zero_to_fp32.py ADDED
@@ -0,0 +1,578 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
25
+ # DeepSpeed data structures it has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
+ @dataclass
33
+ class zero_model_state:
34
+ buffers: dict()
35
+ param_shapes: dict()
36
+ shared_params: list
37
+ ds_version: int
38
+ frozen_param_shapes: dict()
39
+ frozen_param_fragments: dict()
40
+
41
+
42
+ debug = 0
43
+
44
+ # load to cpu
45
+ device = torch.device('cpu')
46
+
47
+
48
+ def atoi(text):
49
+ return int(text) if text.isdigit() else text
50
+
51
+
52
+ def natural_keys(text):
53
+ '''
54
+ alist.sort(key=natural_keys) sorts in human order
55
+ http://nedbatchelder.com/blog/200712/human_sorting.html
56
+ (See Toothy's implementation in the comments)
57
+ '''
58
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
59
+
60
+
61
+ def get_model_state_file(checkpoint_dir, zero_stage):
62
+ if not os.path.isdir(checkpoint_dir):
63
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
64
+
65
+ # there should be only one file
66
+ if zero_stage == 2:
67
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
68
+ elif zero_stage == 3:
69
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
70
+
71
+ if not os.path.exists(file):
72
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
73
+
74
+ return file
75
+
76
+
77
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
78
+ # XXX: need to test that this simple glob rule works for multi-node setup too
79
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
80
+
81
+ if len(ckpt_files) == 0:
82
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
83
+
84
+ return ckpt_files
85
+
86
+
87
+ def get_optim_files(checkpoint_dir):
88
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
89
+
90
+
91
+ def get_model_state_files(checkpoint_dir):
92
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
93
+
94
+
95
+ def parse_model_states(files):
96
+ zero_model_states = []
97
+ for file in files:
98
+ state_dict = torch.load(file, map_location=device)
99
+
100
+ if BUFFER_NAMES not in state_dict:
101
+ raise ValueError(f"{file} is not a model state checkpoint")
102
+ buffer_names = state_dict[BUFFER_NAMES]
103
+ if debug:
104
+ print("Found buffers:", buffer_names)
105
+
106
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
107
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
108
+ param_shapes = state_dict[PARAM_SHAPES]
109
+
110
+ # collect parameters that are included in param_shapes
111
+ param_names = []
112
+ for s in param_shapes:
113
+ for name in s.keys():
114
+ param_names.append(name)
115
+
116
+ # update with frozen parameters
117
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
118
+ if frozen_param_shapes is not None:
119
+ if debug:
120
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
121
+ param_names += list(frozen_param_shapes.keys())
122
+
123
+ # handle shared params
124
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
125
+
126
+ ds_version = state_dict.get(DS_VERSION, None)
127
+
128
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
129
+
130
+ z_model_state = zero_model_state(buffers=buffers,
131
+ param_shapes=param_shapes,
132
+ shared_params=shared_params,
133
+ ds_version=ds_version,
134
+ frozen_param_shapes=frozen_param_shapes,
135
+ frozen_param_fragments=frozen_param_fragments)
136
+ zero_model_states.append(z_model_state)
137
+
138
+ return zero_model_states
139
+
140
+
141
+ def parse_optim_states(files, ds_checkpoint_dir):
142
+
143
+ total_files = len(files)
144
+ state_dicts = []
145
+ for f in files:
146
+ state_dicts.append(torch.load(f, map_location=device))
147
+
148
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
149
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
150
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
151
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
152
+
153
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
154
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
155
+ # use the max of the partition_count to get the dp world_size.
156
+
157
+ if type(world_size) is list:
158
+ world_size = max(world_size)
159
+
160
+ if world_size != total_files:
161
+ raise ValueError(
162
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
163
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
164
+ )
165
+
166
+ # the groups are named differently in each stage
167
+ if zero_stage == 2:
168
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
169
+ elif zero_stage == 3:
170
+ fp32_groups_key = FP32_FLAT_GROUPS
171
+ else:
172
+ raise ValueError(f"unknown zero stage {zero_stage}")
173
+
174
+ if zero_stage == 2:
175
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
176
+ elif zero_stage == 3:
177
+ # if there is more than one param group, there will be multiple flattened tensors - one
178
+ # flattened tensor per group - for simplicity merge them into a single tensor
179
+ #
180
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
181
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
182
+
183
+ fp32_flat_groups = [
184
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
185
+ ]
186
+
187
+ return zero_stage, world_size, fp32_flat_groups
188
+
189
+
190
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
191
+ """
192
+ Returns fp32 state_dict reconstructed from ds checkpoint
193
+
194
+ Args:
195
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
196
+
197
+ """
198
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
199
+
200
+ optim_files = get_optim_files(ds_checkpoint_dir)
201
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
202
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
203
+
204
+ model_files = get_model_state_files(ds_checkpoint_dir)
205
+
206
+ zero_model_states = parse_model_states(model_files)
207
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
208
+
209
+ if zero_stage == 2:
210
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
211
+ elif zero_stage == 3:
212
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
248
+ param_shapes = zero_model_states[0].param_shapes
249
+
250
+ # Reconstruction protocol:
251
+ #
252
+ # XXX: document this
253
+
+     if debug:
+         for i in range(world_size):
+             for j in range(len(fp32_flat_groups[0])):
+                 print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+     # XXX: memory usage doubles here (zero2)
+     num_param_groups = len(fp32_flat_groups[0])
+     merged_single_partition_of_fp32_groups = []
+     for i in range(num_param_groups):
+         merged_partitions = [sd[i] for sd in fp32_flat_groups]
+         full_single_fp32_vector = torch.cat(merged_partitions, 0)
+         merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+     avail_numel = sum(
+         [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+     if debug:
+         wanted_params = sum([len(shapes) for shapes in param_shapes])
+         wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+         # not asserting if there is a mismatch due to possible padding
+         print(f"Have {avail_numel} numels to process.")
+         print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # an out-of-core computing solution
+     total_numel = 0
+     total_params = 0
+     for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+         offset = 0
+         avail_numel = full_single_fp32_vector.numel()
+         for name, shape in shapes.items():
+
+             unpartitioned_numel = shape.numel()
+             total_numel += unpartitioned_numel
+             total_params += 1
+
+             if debug:
+                 print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+             state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+             offset += unpartitioned_numel
+
+         # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+         # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+         # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+         # live optimizer object, so we are checking that the numbers are within the right range
+         align_to = 2 * world_size
+
+         def zero2_align(x):
+             return align_to * math.ceil(x / align_to)
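+         # e.g. with world_size=4, align_to is 8, so zero2_align(37) == 40 and zero2_align(40) == 40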
303
+
304
+ if debug:
305
+ print(f"original offset={offset}, avail_numel={avail_numel}")
306
+
307
+ offset = zero2_align(offset)
308
+ avail_numel = zero2_align(avail_numel)
309
+
310
+ if debug:
311
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
312
+
313
+ # Sanity check
314
+ if offset != avail_numel:
315
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
316
+
317
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
318
+
319
+
320
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
321
+ state_dict = OrderedDict()
322
+
323
+ # buffers
324
+ buffers = zero_model_states[0].buffers
325
+ state_dict.update(buffers)
326
+ if debug:
327
+ print(f"added {len(buffers)} buffers")
328
+
329
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
330
+
331
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
332
+
333
+ # recover shared parameters
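+     # (tied weights are stored only once; shared_params lists (alias_name, source_name)
+     #  pairs, so the alias is re-pointed at the tensor saved under the source name)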
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+     remainder = unpartitioned_numel % world_size
+     padding_numel = (world_size - remainder) if remainder else 0
+     partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+     return partitioned_numel, padding_numel
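+     # e.g. a 10-element param on world_size=4 is stored as 4 slices of ceil(10/4)=3
+     # elements each, the last slice carrying 2 elements of padding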
+
+
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+         return
+
+     if debug:
+         for i in range(world_size):
+             num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+             print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+     wanted_params = len(frozen_param_shapes)
+     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+     avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+     print(f'Frozen params: Have {avail_numel} numels to process.')
+     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+     total_params = 0
+     total_numel = 0
+     for name, shape in zero_model_states[0].frozen_param_shapes.items():
+         total_params += 1
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+
+         param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+         state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+     param_shapes = zero_model_states[0].param_shapes
+     avail_numel = fp32_flat_groups[0].numel() * world_size
+     # Reconstruction protocol: For zero3 we need to zip the partitions together at the boundary of each
+     # param, re-consolidating each param, while dealing with padding if any
+
+     # merge list of dicts, preserving order
+     param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+     if debug:
+         for i in range(world_size):
+             print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+     wanted_params = len(param_shapes)
+     wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+     # not asserting if there is a mismatch due to possible padding
+     avail_numel = fp32_flat_groups[0].numel() * world_size
+     print(f"Trainable params: Have {avail_numel} numels to process.")
+     print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # an out-of-core computing solution
+     offset = 0
+     total_numel = 0
+     total_params = 0
+     for name, shape in param_shapes.items():
+
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+         total_params += 1
+
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+         # XXX: memory usage doubles here
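+         # (each rank's flat buffer contributes a partitioned_numel-sized slice at the same
+         #  offset; concatenating the world_size slices and trimming the padding restores
+         #  the full flattened param)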
+         state_dict[name] = torch.cat(
+             tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+             0).narrow(0, 0, unpartitioned_numel).view(shape)
+         offset += partitioned_numel
+
+     offset *= world_size
+
+     # Sanity check
+     if offset != avail_numel:
+         raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+     print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+     state_dict = OrderedDict()
+
+     # buffers
+     buffers = zero_model_states[0].buffers
+     state_dict.update(buffers)
+     if debug:
+         print(f"added {len(buffers)} buffers")
+
+     _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+     _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+     # recover shared parameters
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+     ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+     via a model hub.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+
+     Returns:
+         - pytorch ``state_dict``
+
+     Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+     you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+     the checkpoint.
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+         # do the training and checkpoint saving
+         state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+         model = model.cpu() # move to cpu
+         model.load_state_dict(state_dict)
+         # submit to model hub or save the model to share with others
+
+     In this example the ``model`` will no longer be usable in the deepspeed context of the same
+     application, i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+     """
+     if tag is None:
+         latest_path = os.path.join(checkpoint_dir, 'latest')
+         if os.path.isfile(latest_path):
+             with open(latest_path, 'r') as fd:
+                 tag = fd.read().strip()
+         else:
+             raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+     ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+     if not os.path.isdir(ds_checkpoint_dir):
+         raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+     return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+
+
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+     loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+     """
+
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+     print(f"Saving fp32 state dict to {output_file}")
+     torch.save(state_dict, output_file)
+
+
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+     """
+     1. Put the provided model to cpu
+     2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+     3. Load it into the provided model
+
+     Args:
+         - ``model``: the model object to update
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+     Returns:
+         - ``model``: modified model
+
+     Make sure you have plenty of CPU memory available before you call this function. If you don't
+     have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+     conveniently placed for you in the checkpoint folder.
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+         model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+         # submit to model hub or save the model to share with others
+
+     Note that once this was run, the ``model`` will no longer be usable in the deepspeed context
+     of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     """
+     logger.info(f"Extracting fp32 weights")
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+     logger.info(f"Overwriting model with fp32 weights")
+     model = model.cpu()
+     model.load_state_dict(state_dict, strict=False)
+
+     return model
+
+
+ if __name__ == "__main__":
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("checkpoint_dir",
+                         type=str,
+                         help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+     parser.add_argument(
+         "output_file",
+         type=str,
+         help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+     args = parser.parse_args()
+
+     debug = args.debug
+
+     convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
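+     # typical invocation (illustrative paths, matching the argparse help above):
+     #   python zero_to_fp32.py path/checkpoint-12 path/checkpoint-12/pytorch_model.bin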