lokinfey commited on 7 days ago

Commit

9735b74

verified ·

1 Parent(s): 4223804

Upload folder using huggingface_hub

Browse files

Files changed (31) hide show

.gitattributes +1 -0
added_tokens.json +12 -0
config.json +2542 -0
configuration_phi4mm.py +235 -0
generation_config.json +10 -0
merges.txt +0 -0
openvino_audio_embeddings_model.bin +3 -0
openvino_audio_embeddings_model.xml +144 -0
openvino_audio_encoder_model.bin +3 -0
openvino_audio_encoder_model.xml +0 -0
openvino_audio_forward_embeddings_model.bin +3 -0
openvino_audio_forward_embeddings_model.xml +906 -0
openvino_audio_text_projection_model.bin +3 -0
openvino_audio_text_projection_model.xml +264 -0
openvino_audio_vision_projection_model.bin +3 -0
openvino_audio_vision_projection_model.xml +264 -0
openvino_language_model.bin +3 -0
openvino_language_model.xml +0 -0
openvino_text_embeddings_model.bin +3 -0
openvino_text_embeddings_model.xml +107 -0
openvino_vision_embeddings_model.bin +3 -0
openvino_vision_embeddings_model.xml +0 -0
openvino_vision_projection_model.bin +3 -0
openvino_vision_projection_model.xml +264 -0
preprocessor_config.json +14 -0
processing_phi4mm.py +733 -0
processor_config.json +6 -0
special_tokens_map.json +30 -0
tokenizer.json +3 -0
tokenizer_config.json +126 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "<|/tool_call|>": 200026,
+  "<|/tool|>": 200024,
+  "<|assistant|>": 200019,
+  "<|end|>": 200020,
+  "<|system|>": 200022,
+  "<|tag|>": 200028,
+  "<|tool_call|>": 200025,
+  "<|tool_response|>": 200027,
+  "<|tool|>": 200023,
+  "<|user|>": 200021
+}

config.json ADDED Viewed

	@@ -0,0 +1,2542 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "C:\\Users\\kinfeylo\\Desktop\\AOAI\\Tools\\ORTModel\\Phi-4-multimodal",
+  "architectures": [
+    "Phi4MMForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "audio_processor": {
+    "config": {
+      "activation": "swish",
+      "activation_checkpointing": "",
+      "attention_dim": 1024,
+      "attention_heads": 16,
+      "batch_norm": false,
+      "bias_in_glu": true,
+      "causal": true,
+      "chunk_size": -1,
+      "cnn_layer_norm": true,
+      "conv_activation": "swish",
+      "conv_glu_type": "swish",
+      "depthwise_multiplier": 1,
+      "depthwise_seperable_out_channel": 1024,
+      "dropout_rate": 0.0,
+      "encoder_embedding_config": {
+        "input_size": 80
+      },
+      "ext_pw_kernel_size": 1,
+      "ext_pw_out_channel": 1024,
+      "input_layer": "nemo_conv",
+      "input_size": 80,
+      "kernel_size": 3,
+      "left_chunk": 18,
+      "linear_units": 1536,
+      "nemo_conv_settings": {
+        "conv_channels": 1024
+      },
+      "num_blocks": 24,
+      "relative_attention_bias_args": {
+        "t5_bias_max_distance": 500,
+        "type": "t5"
+      },
+      "time_reduction": 8
+    },
+    "name": "cascades"
+  },
+  "auto_map": {
+    "AutoConfig": "configuration_phi4mm.Phi4MMConfig",
+    "AutoModelForCausalLM": "modeling_phi4mm.Phi4MMForCausalLM",
+    "AutoTokenizer": "Xenova/gpt-4o"
+  },
+  "base_vision_feat_height_reduction": 1,
+  "base_vision_feat_height_target": 16,
+  "bos_token_id": 199999,
+  "crop_size": 448,
+  "embd_layer": {
+    "audio_embd_layer": {
+      "compression_rate": 8,
+      "downsample_rate": 1,
+      "embedding_cls": "audio",
+      "enable_gradient_checkpointing": true,
+      "projection_cls": "mlp",
+      "use_conv_downsample": false,
+      "use_qformer": false
+    },
+    "embedding_cls": "image_audio",
+    "image_embd_layer": {
+      "crop_size": 448,
+      "embedding_cls": "tune_image",
+      "enable_gradient_checkpointing": true,
+      "hd_transform_order": "sub_glb",
+      "image_token_compression_cls": "avg_pool_2d",
+      "projection_cls": "mlp",
+      "use_hd_transform": true,
+      "with_learnable_separator": true
+    }
+  },
+  "embd_pdrop": 0.0,
+  "eos_token_id": 199999,
+  "full_attn_mod": 1,
+  "glb_GN": [
+    [
+      [
+        0.0240478515625,
+        -0.03466796875,
+        -0.0201416015625,
+        0.0208740234375,
+        -0.0042724609375,
+        -0.034423828125,
+        0.01043701171875,
+        -0.03955078125,
+        -0.0103759765625,
+        0.0791015625,
+        -0.0225830078125,
+        0.0174560546875,
+        0.006622314453125,
+        -0.003143310546875,
+        0.0272216796875,
+        0.0400390625,
+        0.0166015625,
+        -0.034912109375,
+        0.015869140625,
+        -0.021728515625,
+        -0.0106201171875,
+        0.0400390625,
+        0.0081787109375,
+        -0.009521484375,
+        0.0107421875,
+        0.000499725341796875,
+        0.0439453125,
+        -0.02734375,
+        0.0179443359375,
+        -0.012451171875,
+        0.042724609375,
+        0.00043487548828125,
+        -0.00213623046875,
+        -0.0164794921875,
+        0.0152587890625,
+        0.034912109375,
+        0.0111083984375,
+        -0.0732421875,
+        -0.017822265625,
+        -0.02783203125,
+        -0.024658203125,
+        -0.0126953125,
+        0.00433349609375,
+        -0.0225830078125,
+        -0.0294189453125,
+        -0.006561279296875,
+        0.027587890625,
+        0.0286865234375,
+        0.0164794921875,
+        -0.048583984375,
+        -0.061279296875,
+        0.006927490234375,
+        -0.0225830078125,
+        0.01434326171875,
+        0.00130462646484375,
+        -0.07080078125,
+        -0.006011962890625,
+        -0.0228271484375,
+        0.01300048828125,
+        0.00225830078125,
+        -0.0052490234375,
+        -0.0218505859375,
+        -0.0025177001953125,
+        0.05078125,
+        -0.0283203125,
+        -0.033203125,
+        -0.0279541015625,
+        0.01025390625,
+        -0.011962890625,
+        0.015625,
+        -0.0021514892578125,
+        0.013671875,
+        0.0634765625,
+        -0.0014190673828125,
+        0.006256103515625,
+        0.000865936279296875,
+        0.03662109375,
+        -0.024169921875,
+        0.030517578125,
+        0.035888671875,
+        0.00396728515625,
+        -0.035400390625,
+        0.0311279296875,
+        -0.015869140625,
+        -0.00531005859375,
+        0.0235595703125,
+        0.003143310546875,
+        -0.02099609375,
+        -0.07177734375,
+        -0.035888671875,
+        -0.03125,
+        0.021240234375,
+        -0.04833984375,
+        -0.0299072265625,
+        -0.10791015625,
+        0.023681640625,
+        0.0291748046875,
+        0.003936767578125,
+        -0.0255126953125,
+        0.018310546875,
+        0.005767822265625,
+        0.01422119140625,
+        0.00787353515625,
+        -0.0030059814453125,
+        0.053466796875,
+        0.02734375,
+        0.024658203125,
+        -0.0081787109375,
+        0.0419921875,
+        -0.0240478515625,
+        -0.0208740234375,
+        0.004058837890625,
+        -0.03369140625,
+        0.0439453125,
+        -0.0625,
+        0.003082275390625,
+        0.01007080078125,
+        -0.047119140625,
+        -0.0224609375,
+        0.0181884765625,
+        0.0196533203125,
+        -0.004608154296875,
+        -0.0458984375,
+        0.04736328125,
+        -0.01513671875,
+        -0.08349609375,
+        -0.0576171875,
+        -0.0263671875,
+        -0.0341796875,
+        -0.017578125,
+        0.0145263671875,
+        0.06884765625,
+        0.0291748046875,
+        -0.0164794921875,
+        0.0859375,
+        -0.02685546875,
+        0.003021240234375,
+        -0.0181884765625,
+        0.041015625,
+        0.018310546875,
+        -0.04638671875,
+        -0.08056640625,
+        -0.03759765625,
+        0.0086669921875,
+        -0.0244140625,
+        0.01385498046875,
+        -0.050048828125,
+        -0.037841796875,
+        -0.014404296875,
+        0.0196533203125,
+        0.048095703125,
+        -0.05029296875,
+        0.000946044921875,
+        -0.003875732421875,
+        0.0078125,
+        -0.00726318359375,
+        -0.01275634765625,
+        0.00193023681640625,
+        -0.01556396484375,
+        -0.03857421875,
+        -0.024169921875,
+        -0.009765625,
+        -0.0208740234375,
+        -0.01141357421875,
+        -0.043701171875,
+        -0.005096435546875,
+        -0.045654296875,
+        0.064453125,
+        0.038818359375,
+        0.0004215240478515625,
+        0.0274658203125,
+        0.00299072265625,
+        -0.003265380859375,
+        -0.00811767578125,
+        -0.034912109375,
+        -0.023681640625,
+        -0.0238037109375,
+        -0.0015106201171875,
+        -0.0225830078125,
+        0.005706787109375,
+        0.040283203125,
+        0.047119140625,
+        0.00872802734375,
+        -0.00933837890625,
+        -0.0546875,
+        -0.007476806640625,
+        -0.02099609375,
+        0.056396484375,
+        0.0189208984375,
+        0.0184326171875,
+        -0.0400390625,
+        -0.0142822265625,
+        -0.0703125,
+        -0.035400390625,
+        -0.0086669921875,
+        -0.0517578125,
+        -0.0289306640625,
+        0.04736328125,
+        0.0028533935546875,
+        0.0439453125,
+        0.0301513671875,
+        0.019287109375,
+        -0.0185546875,
+        -0.0185546875,
+        -0.033935546875,
+        0.0159912109375,
+        0.01434326171875,
+        -0.0128173828125,
+        -0.0225830078125,
+        0.056884765625,
+        0.0556640625,
+        -0.03466796875,
+        0.0135498046875,
+        0.0137939453125,
+        0.0732421875,
+        -0.01116943359375,
+        -0.0128173828125,
+        -0.0004100799560546875,
+        0.01434326171875,
+        0.0299072265625,
+        -0.01446533203125,
+        -0.050048828125,
+        -0.036376953125,
+        -0.00775146484375,
+        0.00439453125,
+        0.00811767578125,
+        0.0147705078125,
+        0.01019287109375,
+        0.0019683837890625,
+        -0.00830078125,
+        -0.007659912109375,
+        0.029541015625,
+        -0.003509521484375,
+        0.043701171875,
+        -0.007781982421875,
+        0.0211181640625,
+        -0.0208740234375,
+        0.039794921875,
+        -0.03759765625,
+        0.0045166015625,
+        0.050048828125,
+        0.0196533203125,
+        0.043701171875,
+        0.00848388671875,
+        -0.043212890625,
+        -0.049560546875,
+        -0.062255859375,
+        0.0272216796875,
+        0.03662109375,
+        -0.034912109375,
+        -0.01336669921875,
+        0.05419921875,
+        -0.042236328125,
+        0.000705718994140625,
+        0.003753662109375,
+        0.0225830078125,
+        0.021240234375,
+        -0.0181884765625,
+        0.0257568359375,
+        0.0238037109375,
+        0.0034332275390625,
+        0.045166015625,
+        0.021728515625,
+        -0.0037384033203125,
+        -0.000598907470703125,
+        0.017578125,
+        -0.012939453125,
+        0.040771484375,
+        -0.05419921875,
+        -0.015380859375,
+        -0.040771484375,
+        -0.004974365234375,
+        -0.06689453125,
+        0.0419921875,
+        -0.00043487548828125,
+        0.042724609375,
+        0.01361083984375,
+        -0.013671875,
+        -0.048095703125,
+        -0.00787353515625,
+        -0.03076171875,
+        0.05078125,
+        0.0269775390625,
+        0.0028076171875,
+        -0.0233154296875,
+        -0.0023956298828125,
+        -0.02294921875,
+        -0.0517578125,
+        0.04541015625,
+        0.0035247802734375,
+        -0.004302978515625,
+        0.019775390625,
+        0.002777099609375,
+        -0.04150390625,
+        0.0150146484375,
+        0.0166015625,
+        0.01104736328125,
+        0.0252685546875,
+        0.02587890625,
+        -0.0079345703125,
+        -0.00347900390625,
+        -0.01171875,
+        -0.06298828125,
+        -0.023193359375,
+        0.0233154296875,
+        -0.0311279296875,
+        0.016845703125,
+        -0.006561279296875,
+        0.0257568359375,
+        0.048583984375,
+        -0.00567626953125,
+        -0.049072265625,
+        0.00119781494140625,
+        0.01416015625,
+        -0.0111083984375,
+        -0.01556396484375,
+        -0.022705078125,
+        -0.0184326171875,
+        -0.044189453125,
+        0.00469970703125,
+        -0.0281982421875,
+        0.031494140625,
+        0.00970458984375,
+        -0.00604248046875,
+        -0.00023937225341796875,
+        0.00732421875,
+        -0.032958984375,
+        -0.0361328125,
+        -0.00909423828125,
+        0.03857421875,
+        -0.06201171875,
+        -0.0283203125,
+        0.0791015625,
+        -0.0108642578125,
+        -0.049072265625,
+        0.01068115234375,
+        -0.049072265625,
+        -0.0380859375,
+        -0.048583984375,
+        -0.026123046875,
+        -0.00872802734375,
+        0.0021209716796875,
+        0.00140380859375,
+        -0.0260009765625,
+        0.0050048828125,
+        0.010986328125,
+        -0.0028228759765625,
+        0.0390625,
+        -0.0205078125,
+        -0.00543212890625,
+        -0.0113525390625,
+        0.045166015625,
+        0.00762939453125,
+        -0.029541015625,
+        -0.0106201171875,
+        -0.021484375,
+        -0.000362396240234375,
+        -0.025146484375,
+        -0.0419921875,
+        -0.04736328125,
+        -0.0186767578125,
+        -0.0029144287109375,
+        -0.04052734375,
+        -0.02734375,
+        -0.009521484375,
+        0.0189208984375,
+        0.033935546875,
+        -0.031982421875,
+        -0.044189453125,
+        -0.036376953125,
+        -0.0035400390625,
+        -0.0191650390625,
+        0.0184326171875,
+        -0.0133056640625,
+        -0.0240478515625,
+        -0.05712890625,
+        -0.005157470703125,
+        0.0208740234375,
+        0.0172119140625,
+        -0.0034332275390625,
+        0.068359375,
+        -0.0191650390625,
+        0.004425048828125,
+        0.04150390625,
+        -0.06689453125,
+        -0.0224609375,
+        -0.002899169921875,
+        0.0167236328125,
+        -0.032958984375,
+        0.037353515625,
+        -0.0184326171875,
+        -0.053466796875,
+        -0.0125732421875,
+        -0.04296875,
+        -0.003143310546875,
+        -0.05810546875,
+        0.068359375,
+        -0.04150390625,
+        -0.01275634765625,
+        -0.017333984375,
+        -0.06787109375,
+        -0.03466796875,
+        0.01806640625,
+        -0.00408935546875,
+        0.0294189453125,
+        -0.0498046875,
+        0.038330078125,
+        -0.0615234375,
+        0.072265625,
+        0.0267333984375,
+        -0.055908203125,
+        0.0284423828125,
+        -0.0159912109375,
+        -0.016845703125,
+        0.051513671875,
+        -0.002105712890625,
+        0.0023193359375,
+        -0.00592041015625,
+        -0.00012874603271484375,
+        0.0247802734375,
+        -0.024169921875,
+        -0.031982421875,
+        -0.0020294189453125,
+        -0.06787109375,
+        -0.0128173828125,
+        0.0057373046875,
+        0.034912109375,
+        -0.01416015625,
+        0.004638671875,
+        0.0032806396484375,
+        -0.022705078125,
+        -0.015625,
+        0.03564453125,
+        -0.0272216796875,
+        -0.042724609375,
+        -0.03271484375,
+        0.035400390625,
+        0.0419921875,
+        0.00787353515625,
+        0.0281982421875,
+        -0.0037841796875,
+        -0.01177978515625,
+        -0.03857421875,
+        0.056884765625,
+        -0.0189208984375,
+        0.061767578125,
+        -0.036865234375,
+        0.04638671875,
+        0.060302734375,
+        -0.0537109375,
+        0.0439453125,
+        0.00799560546875,
+        -0.0196533203125,
+        0.0010528564453125,
+        0.0036468505859375,
+        -0.021728515625,
+        0.0032806396484375,
+        -0.006256103515625,
+        0.017822265625,
+        -0.045166015625,
+        -0.0380859375,
+        0.0140380859375,
+        0.016357421875,
+        -0.109375,
+        -0.05859375,
+        0.047607421875,
+        0.01031494140625,
+        -0.01348876953125,
+        0.03466796875,
+        -0.01177978515625,
+        -0.013916015625,
+        -0.0205078125,
+        -0.0439453125,
+        -0.01214599609375,
+        0.035400390625,
+        -0.0184326171875,
+        -0.017822265625,
+        0.0361328125,
+        -0.03662109375,
+        0.0257568359375,
+        0.0022430419921875,
+        -0.03125,
+        -0.0267333984375,
+        -0.03271484375,
+        -0.0260009765625,
+        0.0216064453125,
+        0.04443359375,
+        -0.007293701171875,
+        -0.0177001953125,
+        -0.00286865234375,
+        -0.0017242431640625,
+        -0.0927734375,
+        -0.0164794921875,
+        0.029052734375,
+        0.0242919921875,
+        0.0040283203125,
+        0.012939453125,
+        0.03857421875,
+        0.020263671875,
+        -0.041015625,
+        -0.0169677734375,
+        -0.0301513671875,
+        0.043212890625,
+        0.045654296875,
+        0.01708984375,
+        0.036376953125,
+        0.0125732421875,
+        -0.07177734375,
+        0.006011962890625,
+        -0.01239013671875,
+        -0.0029296875,
+        0.035888671875,
+        -0.03173828125,
+        0.028564453125,
+        0.0308837890625,
+        -0.0517578125,
+        0.021728515625,
+        -0.0179443359375,
+        0.044189453125,
+        0.02783203125,
+        -0.0007476806640625,
+        0.0026397705078125,
+        0.02587890625,
+        0.0625,
+        0.06640625,
+        0.0113525390625,
+        0.027099609375,
+        0.00119781494140625,
+        -0.021484375,
+        0.0296630859375,
+        -0.0106201171875,
+        -0.023193359375,
+        0.0322265625,
+        0.03515625,
+        0.00083160400390625,
+        -0.0238037109375,
+        0.04443359375,
+        0.013671875,
+        0.011474609375,
+        -0.0205078125,
+        -0.0191650390625,
+        0.04443359375,
+        -0.0225830078125,
+        -0.017822265625,
+        -0.0341796875,
+        0.06494140625,
+        0.0294189453125,
+        -0.040771484375,
+        -0.0235595703125,
+        0.043701171875,
+        0.01318359375,
+        -0.0277099609375,
+        0.01055908203125,
+        -0.0081787109375,
+        -0.00714111328125,
+        0.030029296875,
+        -0.032470703125,
+        -0.0030364990234375,
+        0.01031494140625,
+        0.0211181640625,
+        -0.095703125,
+        -0.0003795623779296875,
+        -0.01611328125,
+        0.0205078125,
+        0.004302978515625,
+        0.00457763671875,
+        0.0281982421875,
+        -0.03955078125,
+        0.03369140625,
+        -0.011962890625,
+        -0.01348876953125,
+        0.0081787109375,
+        0.053955078125,
+        -0.02197265625,
+        -0.08935546875,
+        -0.0205078125,
+        0.0269775390625,
+        -8.153915405273438e-05,
+        -0.0296630859375,
+        0.034912109375,
+        -0.03369140625,
+        -0.001007080078125,
+        -0.045166015625,
+        -0.0093994140625,
+        0.020263671875,
+        0.0291748046875,
+        -0.026611328125,
+        -0.002197265625,
+        -0.030517578125,
+        0.0244140625,
+        0.0166015625,
+        0.0272216796875,
+        -0.001312255859375,
+        -0.034912109375,
+        0.035400390625,
+        0.0257568359375,
+        0.005279541015625,
+        0.029052734375,
+        -0.0196533203125,
+        -0.0166015625,
+        -0.0002613067626953125,
+        -0.000545501708984375,
+        0.0849609375,
+        -0.006103515625,
+        0.0390625,
+        -0.0296630859375,
+        0.041259765625,
+        0.025634765625,
+        0.01513671875,
+        -0.00555419921875,
+        0.01348876953125,
+        0.035400390625,
+        0.01409912109375,
+        -0.01806640625,
+        -0.0302734375,
+        -0.060302734375,
+        -0.016845703125,
+        -0.016845703125,
+        0.0189208984375,
+        -0.0311279296875,
+        -0.0537109375,
+        -0.0235595703125,
+        0.0269775390625,
+        -0.0010223388671875,
+        0.0299072265625,
+        0.00140380859375,
+        0.004974365234375,
+        0.00982666015625,
+        0.0028839111328125,
+        -0.0135498046875,
+        0.0203857421875,
+        -0.0235595703125,
+        -0.0283203125,
+        0.0018157958984375,
+        0.01348876953125,
+        -0.0252685546875,
+        0.0186767578125,
+        0.04052734375,
+        -0.01324462890625,
+        0.006866455078125,
+        0.022705078125,
+        0.0255126953125,
+        0.012451171875,
+        -0.0189208984375,
+        -0.007476806640625,
+        0.004425048828125,
+        0.047607421875,
+        0.0140380859375,
+        -0.06689453125,
+        0.008056640625,
+        -0.0201416015625,
+        -0.034423828125,
+        0.023193359375,
+        0.0693359375,
+        0.03125,
+        0.0245361328125,
+        -0.029052734375,
+        0.0252685546875,
+        -0.04150390625,
+        -0.007171630859375,
+        -0.0400390625,
+        0.0166015625,
+        -0.025146484375,
+        -0.0162353515625,
+        -0.019287109375,
+        -0.0223388671875,
+        -0.0089111328125,
+        0.02685546875,
+        -0.0634765625,
+        0.050537109375,
+        0.023193359375,
+        0.04931640625,
+        0.0111083984375,
+        0.01275634765625,
+        0.0380859375,
+        0.05419921875,
+        -0.05859375,
+        -0.0208740234375,
+        -0.046142578125,
+        0.01385498046875,
+        0.0081787109375,
+        0.0240478515625,
+        0.0081787109375,
+        0.04443359375,
+        -0.04736328125,
+        0.021240234375,
+        -0.0084228515625,
+        -0.005767822265625,
+        0.0140380859375,
+        -0.02587890625,
+        0.0014190673828125,
+        -0.0179443359375,
+        -0.0267333984375,
+        -0.0322265625,
+        0.036376953125,
+        -0.049560546875,
+        -0.005340576171875,
+        0.021240234375,
+        0.004913330078125,
+        0.02490234375,
+        0.007293701171875,
+        -0.0517578125,
+        0.00799560546875,
+        -0.040771484375,
+        -0.03857421875,
+        -0.040283203125,
+        -0.007568359375,
+        -0.0250244140625,
+        -0.0230712890625,
+        0.042724609375,
+        0.0172119140625,
+        -0.0185546875,
+        -0.01446533203125,
+        0.0296630859375,
+        0.02099609375,
+        0.030029296875,
+        0.03515625,
+        -0.0277099609375,
+        -0.05029296875,
+        0.031494140625,
+        -0.00262451171875,
+        -0.02001953125,
+        0.033447265625,
+        0.06103515625,
+        -0.0179443359375,
+        -0.03564453125,
+        -0.0194091796875,
+        -0.062255859375,
+        0.0037994384765625,
+        0.038330078125,
+        0.0712890625,
+        -0.0380859375,
+        0.00051116943359375,
+        0.033203125,
+        0.025634765625,
+        -0.02294921875,
+        0.0247802734375,
+        0.033935546875,
+        0.03955078125,
+        -0.01397705078125,
+        -0.006103515625,
+        -0.062255859375,
+        -0.0322265625,
+        -0.004119873046875,
+        -0.017822265625,
+        0.017333984375,
+        0.04345703125,
+        -0.002471923828125,
+        0.0277099609375,
+        -0.0162353515625,
+        0.0751953125,
+        -0.005828857421875,
+        -0.017578125,
+        -0.0220947265625,
+        -0.0439453125,
+        -0.022705078125,
+        -0.028076171875,
+        -0.0164794921875,
+        0.0260009765625,
+        -0.014892578125,
+        -0.01806640625,
+        -0.01141357421875,
+        -0.04248046875,
+        -0.0693359375,
+        0.01141357421875,
+        0.0211181640625,
+        0.007415771484375,
+        -0.03466796875,
+        0.024658203125,
+        0.016357421875,
+        0.04443359375,
+        0.00830078125,
+        -0.033447265625,
+        0.0012359619140625,
+        -0.036865234375,
+        0.0286865234375,
+        -0.04150390625,
+        -0.0308837890625,
+        0.059326171875,
+        -0.0213623046875,
+        0.0140380859375,
+        0.060302734375,
+        0.0101318359375,
+        0.052490234375,
+        0.0242919921875,
+        -0.0213623046875,
+        0.03857421875,
+        -0.000690460205078125,
+        0.048583984375,
+        -0.01300048828125,
+        0.006439208984375,
+        0.005950927734375,
+        -0.06884765625,
+        -0.004364013671875,
+        0.0302734375,
+        0.021728515625,
+        0.029541015625,
+        0.0196533203125,
+        -0.0048828125,
+        -0.0172119140625,
+        0.0009002685546875,
+        -0.0419921875,
+        -0.0185546875,
+        0.06396484375,
+        -0.0028839111328125,
+        0.0272216796875,
+        0.0247802734375,
+        -0.018310546875,
+        0.04052734375,
+        0.06494140625,
+        0.0233154296875,
+        -0.0001506805419921875,
+        -0.0250244140625,
+        -0.06103515625,
+        0.00286865234375,
+        -0.00927734375,
+        -0.01025390625,
+        -0.03466796875,
+        -0.00116729736328125,
+        0.029052734375,
+        0.0150146484375,
+        0.0130615234375,
+        0.068359375,
+        0.054931640625,
+        0.037109375,
+        0.025634765625,
+        -0.02587890625,
+        0.0458984375,
+        0.06591796875,
+        0.01239013671875,
+        -0.0262451171875,
+        0.10693359375,
+        -0.07421875,
+        -0.0174560546875,
+        -0.00604248046875,
+        -0.017578125,
+        0.06103515625,
+        0.0322265625,
+        -0.040771484375,
+        -0.0026397705078125,
+        0.0037841796875,
+        -0.05859375,
+        -0.03662109375,
+        0.0029449462890625,
+        -0.0245361328125,
+        0.0179443359375,
+        0.0220947265625,
+        0.00726318359375,
+        -0.01458740234375,
+        0.0054931640625,
+        0.036376953125,
+        0.02099609375,
+        0.0162353515625,
+        -0.0250244140625,
+        0.109375,
+        -0.024658203125,
+        -0.0206298828125,
+        -0.0269775390625,
+        -0.01043701171875,
+        -0.00994873046875,
+        -0.007720947265625,
+        -0.0002593994140625,
+        -0.01385498046875,
+        0.01153564453125,
+        0.0250244140625,
+        -0.017333984375,
+        -0.034912109375,
+        -0.004913330078125,
+        -0.0223388671875,
+        0.053955078125,
+        0.033447265625,
+        -0.01123046875,
+        -0.0213623046875,
+        0.02880859375,
+        -0.0059814453125,
+        0.00909423828125,
+        0.0021820068359375,
+        -0.050048828125,
+        0.044677734375,
+        -0.025390625,
+        -0.032958984375,
+        -0.033447265625,
+        -0.0250244140625,
+        -0.047607421875,
+        -0.02197265625,
+        -0.017333984375,
+        -0.00897216796875,
+        -0.037353515625,
+        -0.047607421875,
+        -0.006866455078125,
+        0.0145263671875,
+        0.0245361328125,
+        0.0262451171875,
+        0.01953125,
+        0.036376953125,
+        0.0859375,
+        -0.01177978515625,
+        -0.00994873046875,
+        -0.047119140625,
+        0.0166015625,
+        -0.01025390625,
+        0.0093994140625,
+        -0.0274658203125,
+        -0.0220947265625,
+        -0.03369140625,
+        -0.00518798828125,
+        -0.03466796875,
+        0.00179290771484375,
+        0.03173828125,
+        -0.0032958984375,
+        0.036376953125,
+        0.0927734375,
+        -0.01531982421875,
+        -0.037109375,
+        -0.0380859375,
+        -0.0147705078125,
+        0.026611328125,
+        -0.01165771484375,
+        -0.0322265625,
+        0.031005859375,
+        -0.0147705078125,
+        0.00885009765625,
+        0.0262451171875,
+        -0.01239013671875,
+        0.01226806640625,
+        -0.0179443359375,
+        0.030029296875,
+        -0.0234375,
+        0.0028076171875,
+        -0.00665283203125,
+        -0.0230712890625,
+        -0.0029296875,
+        -0.02783203125,
+        -0.01190185546875,
+        0.00299072265625,
+        -0.031982421875,
+        -0.021728515625,
+        0.0262451171875,
+        0.04541015625,
+        0.00189208984375,
+        0.00811767578125,
+        -0.030029296875,
+        -0.0211181640625,
+        0.05615234375,
+        0.00994873046875,
+        -0.0157470703125,
+        0.03369140625,
+        0.006683349609375,
+        0.000865936279296875,
+        -0.0059814453125,
+        -0.007476806640625,
+        -0.0238037109375,
+        0.0458984375,
+        -0.004119873046875,
+        0.0230712890625,
+        0.00732421875,
+        0.0225830078125,
+        0.0294189453125,
+        -0.0302734375,
+        -0.023681640625,
+        0.026123046875,
+        0.05029296875,
+        0.056640625,
+        0.00860595703125,
+        0.01104736328125,
+        -0.01129150390625,
+        -0.00092315673828125,
+        0.007293701171875,
+        0.040771484375,
+        0.002655029296875,
+        0.0174560546875,
+        -0.0162353515625,
+        0.045166015625,
+        -0.026123046875,
+        0.0022125244140625,
+        0.02685546875,
+        0.03173828125,
+        0.00830078125,
+        -0.0556640625,
+        -0.037109375,
+        0.0693359375,
+        0.0291748046875,
+        0.052490234375,
+        0.038818359375,
+        0.0152587890625,
+        -0.03369140625,
+        -0.0218505859375,
+        0.0157470703125,
+        -0.0260009765625,
+        0.005706787109375,
+        0.005462646484375,
+        0.00494384765625,
+        0.00885009765625,
+        0.002044677734375,
+        0.057861328125,
+        0.029296875,
+        -0.0311279296875,
+        -0.03662109375,
+        -0.01416015625,
+        0.007293701171875,
+        0.018798828125,
+        -0.043701171875,
+        0.011962890625,
+        0.0296630859375,
+        0.00299072265625,
+        -0.023681640625,
+        -0.04443359375,
+        0.0233154296875,
+        -0.031005859375,
+        0.0181884765625,
+        0.05517578125,
+        -0.0010528564453125,
+        -0.00075531005859375,
+        0.0157470703125,
+        0.015869140625,
+        -0.0419921875,
+        0.00775146484375,
+        -0.0159912109375,
+        0.0186767578125,
+        -0.03857421875,
+        0.00115966796875,
+        -0.01336669921875,
+        0.00933837890625,
+        -0.01080322265625,
+        -0.0556640625,
+        0.00433349609375,
+        -0.0147705078125,
+        0.03466796875,
+        -0.0308837890625,
+        -0.00162506103515625,
+        0.050048828125,
+        -0.04150390625,
+        -0.0198974609375,
+        -0.0155029296875,
+        0.0267333984375,
+        0.034423828125,
+        0.03466796875,
+        -0.037841796875,
+        0.034912109375,
+        0.0017547607421875,
+        0.0260009765625,
+        -0.0174560546875,
+        -0.046630859375,
+        -0.0159912109375,
+        -0.0238037109375,
+        0.04150390625,
+        -0.03759765625,
+        0.0093994140625,
+        0.0196533203125,
+        -0.019287109375,
+        0.01214599609375,
+        0.01318359375,
+        -0.0203857421875,
+        -0.01318359375,
+        -0.01904296875,
+        0.0235595703125,
+        0.0101318359375,
+        0.003326416015625,
+        -0.04345703125,
+        -0.003265380859375,
+        0.050537109375,
+        -0.021240234375,
+        0.0281982421875,
+        -0.004302978515625,
+        0.0595703125,
+        -0.0062255859375,
+        0.0145263671875,
+        0.01214599609375,
+        0.00250244140625,
+        -0.00909423828125,
+        -0.01519775390625,
+        -0.018310546875,
+        0.00946044921875,
+        -0.064453125,
+        0.052490234375,
+        0.037353515625,
+        0.00823974609375,
+        -0.0074462890625,
+        -0.044189453125,
+        0.023193359375,
+        0.0400390625,
+        0.003143310546875,
+        -0.00012493133544921875,
+        -0.0230712890625,
+        -0.0169677734375,
+        -0.0032806396484375,
+        -0.0269775390625,
+        0.01495361328125,
+        0.033203125,
+        -0.0390625,
+        0.024169921875,
+        -0.05517578125,
+        0.01416015625,
+        -0.0057373046875,
+        0.052490234375,
+        -0.00439453125,
+        -0.039306640625,
+        -0.08056640625,
+        0.049072265625,
+        0.002227783203125,
+        0.02197265625,
+        -0.052978515625,
+        -0.0203857421875,
+        0.034423828125,
+        -0.0096435546875,
+        0.043212890625,
+        0.0361328125,
+        -0.03662109375,
+        0.038330078125,
+        -0.0380859375,
+        -0.040283203125,
+        0.0213623046875,
+        0.02294921875,
+        -0.00152587890625,
+        -0.04296875,
+        0.0400390625,
+        -0.01361083984375,
+        -0.00872802734375,
+        -0.03125,
+        -0.007476806640625,
+        0.0267333984375,
+        -0.000583648681640625,
+        -0.06201171875,
+        -0.048828125,
+        0.041015625,
+        -0.000545501708984375,
+        0.041015625,
+        0.052001953125,
+        0.019287109375,
+        -0.014892578125,
+        0.01434326171875,
+        0.0120849609375,
+        0.0059814453125,
+        -0.0186767578125,
+        0.01483154296875,
+        -0.02978515625,
+        -0.024658203125,
+        -0.0322265625,
+        0.056396484375,
+        0.061279296875,
+        -0.02099609375,
+        -0.0172119140625,
+        0.0279541015625,
+        0.02294921875,
+        -0.02099609375,
+        -0.04541015625,
+        -0.00897216796875,
+        -0.032470703125,
+        0.040283203125,
+        -0.040283203125,
+        -0.040771484375,
+        -0.06787109375
+      ]
+    ]
+  ],
+  "hd_transform_order": "sub_glb",
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "image_dim_out": 1152,
+  "img_processor": null,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "interpolate_factor": 1,
+  "lm_head_bias": false,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "phi4mm",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 32,
+  "num_img_tokens": 256,
+  "num_key_value_heads": 8,
+  "original_max_position_embeddings": 4096,
+  "pad_token_id": 199999,
+  "partial_rotary_factor": 0.75,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "long_factor": [
+      1,
+      1.118320672,
+      1.250641126,
+      1.398617824,
+      1.564103225,
+      1.74916897,
+      1.956131817,
+      2.187582649,
+      2.446418898,
+      2.735880826,
+      3.059592084,
+      3.421605075,
+      3.826451687,
+      4.279200023,
+      4.785517845,
+      5.351743533,
+      5.984965424,
+      6.693110555,
+      7.485043894,
+      8.370679318,
+      9.36110372,
+      10.4687158,
+      11.70738129,
+      13.09260651,
+      14.64173252,
+      16.37415215,
+      18.31155283,
+      20.47818807,
+      22.90118105,
+      25.61086418,
+      28.64115884,
+      32.03,
+      32.1,
+      32.13,
+      32.23,
+      32.6,
+      32.61,
+      32.64,
+      32.66,
+      32.7,
+      32.71,
+      32.93,
+      32.97,
+      33.28,
+      33.49,
+      33.5,
+      44.16,
+      47.77
+    ],
+    "short_factor": [
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0,
+      1.0
+    ],
+    "type": "longrope"
+  },
+  "rope_theta": 10000.0,
+  "sliding_window": 262144,
+  "speech_lora": {
+    "dp": 0.01,
+    "layer": "((layers.*self_attn\\.(qkv|o)_proj)|(layers.*mlp\\.(gate_up|down)_proj))",
+    "lora_alpha": 640,
+    "r": 320
+  },
+  "sub_GN": [
+    [
+      [
+        [
+          0.01287841796875,
+          0.01202392578125,
+          -0.0006866455078125,
+          -0.004180908203125,
+          -3.743171691894531e-05,
+          -0.000934600830078125,
+          0.001434326171875,
+          0.007476806640625,
+          -0.0035400390625,
+          -0.0196533203125,
+          0.00775146484375,
+          0.00098419189453125,
+          0.00921630859375,
+          3.218650817871094e-05,
+          0.009765625,
+          -0.0120849609375,
+          -0.004241943359375,
+          0.00994873046875,
+          0.0013580322265625,
+          0.0012054443359375,
+          0.0047607421875,
+          -0.00185394287109375,
+          -0.0242919921875,
+          0.01214599609375,
+          -0.0101318359375,
+          -0.00070953369140625,
+          -0.005126953125,
+          -0.004425048828125,
+          -0.01251220703125,
+          0.004119873046875,
+          -0.00274658203125,
+          -0.01055908203125,
+          0.00494384765625,
+          -0.0028228759765625,
+          0.0024261474609375,
+          0.0064697265625,
+          0.000865936279296875,
+          -0.00103759765625,
+          -0.0025787353515625,
+          0.0166015625,
+          -0.000675201416015625,
+          0.01177978515625,
+          -0.00018024444580078125,
+          0.00238037109375,
+          -0.003326416015625,
+          0.00153350830078125,
+          -0.00086212158203125,
+          -0.00628662109375,
+          -6.079673767089844e-05,
+          0.005828857421875,
+          0.001495361328125,
+          -0.01275634765625,
+          -0.00909423828125,
+          0.00592041015625,
+          4.863739013671875e-05,
+          0.0067138671875,
+          -0.003631591796875,
+          0.0024871826171875,
+          -8.106231689453125e-05,
+          -0.00148773193359375,
+          -1.2993812561035156e-05,
+          0.00982666015625,
+          0.004669189453125,
+          -0.003570556640625,
+          0.01092529296875,
+          0.0174560546875,
+          -0.005645751953125,
+          0.01263427734375,
+          0.00909423828125,
+          -0.00494384765625,
+          0.00604248046875,
+          -0.0164794921875,
+          -0.0016326904296875,
+          -0.00112152099609375,
+          0.00177764892578125,
+          -0.00139617919921875,
+          -0.00653076171875,
+          0.00982666015625,
+          0.000370025634765625,
+          -0.0159912109375,
+          0.00171661376953125,
+          0.0164794921875,
+          -0.0074462890625,
+          -0.004638671875,
+          -0.01007080078125,
+          -0.004913330078125,
+          0.0177001953125,
+          -0.00689697265625,
+          0.0059814453125,
+          0.014892578125,
+          -0.00927734375,
+          0.025146484375,
+          0.0042724609375,
+          -0.00060272216796875,
+          0.0189208984375,
+          0.007232666015625,
+          -0.002349853515625,
+          0.01483154296875,
+          -0.005279541015625,
+          -0.00933837890625,
+          -0.000530242919921875,
+          -0.00811767578125,
+          0.00848388671875,
+          0.00225830078125,
+          -0.0026702880859375,
+          -0.016357421875,
+          0.0034027099609375,
+          -0.006317138671875,
+          -0.00830078125,
+          -0.007476806640625,
+          0.016357421875,
+          0.00408935546875,
+          -0.0016632080078125,
+          -0.00872802734375,
+          -0.00787353515625,
+          -0.0021820068359375,
+          0.00185394287109375,
+          -0.002685546875,
+          -0.013427734375,
+          -0.006744384765625,
+          4.267692565917969e-05,
+          0.00372314453125,
+          -0.005340576171875,
+          0.0010223388671875,
+          -0.0078125,
+          -0.0021209716796875,
+          0.00994873046875,
+          0.00616455078125,
+          0.0277099609375,
+          -0.0096435546875,
+          -0.01300048828125,
+          -0.0167236328125,
+          -0.01220703125,
+          -0.01214599609375,
+          -0.0016326904296875,
+          -0.002685546875,
+          0.0016632080078125,
+          -0.0177001953125,
+          -0.01080322265625,
+          -0.009521484375,
+          0.009765625,
+          0.0107421875,
+          0.007171630859375,
+          -0.0030364990234375,
+          0.01141357421875,
+          -0.012451171875,
+          -0.004608154296875,
+          0.004669189453125,
+          -0.003265380859375,
+          -0.00970458984375,
+          -0.00860595703125,
+          -0.0103759765625,
+          0.003326416015625,
+          0.0167236328125,
+          0.0084228515625,
+          0.000736236572265625,
+          -0.0032806396484375,
+          0.0125732421875,
+          -0.004241943359375,
+          0.0123291015625,
+          -0.0057373046875,
+          0.0081787109375,
+          0.0029296875,
+          -0.00872802734375,
+          -0.00150299072265625,
+          0.01275634765625,
+          0.0016937255859375,
+          -0.00616455078125,
+          0.01275634765625,
+          -0.0007171630859375,
+          -0.0220947265625,
+          -0.0042724609375,
+          -0.000949859619140625,
+          0.004486083984375,
+          0.0029754638671875,
+          -0.004638671875,
+          0.0076904296875,
+          0.00070953369140625,
+          0.0029449462890625,
+          0.002227783203125,
+          -0.01544189453125,
+          -0.01080322265625,
+          -0.00057220458984375,
+          0.00021648406982421875,
+          0.019775390625,
+          -0.006317138671875,
+          -0.017333984375,
+          -0.015869140625,
+          -0.0032958984375,
+          0.0120849609375,
+          0.00518798828125,
+          0.004669189453125,
+          0.0164794921875,
+          0.004119873046875,
+          -0.0007476806640625,
+          -0.0036773681640625,
+          -0.001953125,
+          -0.006805419921875,
+          0.007537841796875,
+          0.003265380859375,
+          -0.017822265625,
+          -0.00592041015625,
+          -0.00131988525390625,
+          0.00714111328125,
+          0.0079345703125,
+          -0.0015106201171875,
+          0.004119873046875,
+          0.0027008056640625,
+          0.01531982421875,
+          -0.00537109375,
+          -0.00225830078125,
+          -0.0001583099365234375,
+          -0.005828857421875,
+          0.01336669921875,
+          -0.0069580078125,
+          0.01312255859375,
+          0.0262451171875,
+          -0.0027923583984375,
+          0.006103515625,
+          -0.0166015625,
+          0.0074462890625,
+          0.01092529296875,
+          0.005859375,
+          -0.00921630859375,
+          0.00640869140625,
+          -0.01007080078125,
+          0.002105712890625,
+          0.006072998046875,
+          -0.0093994140625,
+          0.006011962890625,
+          -0.004425048828125,
+          -0.0164794921875,
+          -0.00909423828125,
+          -0.017333984375,
+          0.00823974609375,
+          -0.007293701171875,
+          0.006744384765625,
+          -0.005340576171875,
+          -0.004241943359375,
+          0.00799560546875,
+          -0.0048828125,
+          -0.01513671875,
+          -0.011474609375,
+          -0.00897216796875,
+          0.017578125,
+          -0.006683349609375,
+          0.01025390625,
+          -0.0059814453125,
+          -8.153915405273438e-05,
+          0.00750732421875,
+          0.0020294189453125,
+          -0.0033721923828125,
+          0.00250244140625,
+          0.005523681640625,
+          -0.00150299072265625,
+          -0.00994873046875,
+          0.00110626220703125,
+          0.0084228515625,
+          -0.0098876953125,
+          -0.0245361328125,
+          -0.01495361328125,
+          -0.0078125,
+          -0.0137939453125,
+          -0.00093841552734375,
+          -0.00811767578125,
+          -0.003631591796875,
+          -0.010009765625,
+          -0.01519775390625,
+          0.00677490234375,
+          0.0140380859375,
+          -0.0064697265625,
+          -0.002349853515625,
+          0.003021240234375,
+          -0.0032501220703125,
+          -0.001434326171875,
+          -0.0120849609375,
+          0.00421142578125,
+          -0.0130615234375,
+          -0.001068115234375,
+          -0.0126953125,
+          0.0022125244140625,
+          -0.000629425048828125,
+          -0.00140380859375,
+          0.004669189453125,
+          0.0062255859375,
+          0.005584716796875,
+          0.0018463134765625,
+          0.01116943359375,
+          -0.0062255859375,
+          0.0009918212890625,
+          0.00122833251953125,
+          0.01141357421875,
+          -0.009521484375,
+          0.017578125,
+          0.006561279296875,
+          0.003875732421875,
+          -0.0107421875,
+          -0.00994873046875,
+          -0.0069580078125,
+          0.01470947265625,
+          -0.00421142578125,
+          0.006103515625,
+          0.000392913818359375,
+          0.004119873046875,
+          0.0052490234375,
+          -0.00060272216796875,
+          -0.01080322265625,
+          -0.01068115234375,
+          -0.000774383544921875,
+          -0.0172119140625,
+          -0.000835418701171875,
+          -0.0096435546875,
+          0.0022735595703125,
+          -0.001434326171875,
+          0.003692626953125,
+          -0.00119781494140625,
+          0.0026092529296875,
+          0.02490234375,
+          0.015380859375,
+          -0.0201416015625,
+          0.0238037109375,
+          -0.0103759765625,
+          -0.009033203125,
+          -0.01348876953125,
+          0.00125885009765625,
+          0.016845703125,
+          -0.0028533935546875,
+          -0.005126953125,
+          -0.0130615234375,
+          -0.00970458984375,
+          0.00933837890625,
+          0.01611328125,
+          -0.0076904296875,
+          -0.002197265625,
+          0.006988525390625,
+          -0.0223388671875,
+          0.00445556640625,
+          -0.00433349609375,
+          0.0084228515625,
+          -0.00762939453125,
+          -0.0064697265625,
+          0.0150146484375,
+          0.0150146484375,
+          -0.017333984375,
+          0.017822265625,
+          0.00177764892578125,
+          0.00921630859375,
+          -0.00927734375,
+          0.0028533935546875,
+          -2.2411346435546875e-05,
+          -0.00130462646484375,
+          -0.00433349609375,
+          -0.0013580322265625,
+          0.01202392578125,
+          -0.0029754638671875,
+          -0.000385284423828125,
+          -0.004608154296875,
+          -0.0037841796875,
+          0.002166748046875,
+          0.01068115234375,
+          -0.00506591796875,
+          0.001617431640625,
+          -0.0107421875,
+          -7.724761962890625e-05,
+          -0.005523681640625,
+          0.012451171875,
+          -0.00341796875,
+          0.00286865234375,
+          0.0244140625,
+          0.0032196044921875,
+          0.0048828125,
+          0.0177001953125,
+          -0.006072998046875,
+          0.0087890625,
+          0.00017833709716796875,
+          -0.00799560546875,
+          -0.0250244140625,
+          0.003326416015625,
+          0.0017242431640625,
+          0.004791259765625,
+          -0.0159912109375,
+          -0.00177764892578125,
+          0.019775390625,
+          -0.0086669921875,
+          0.01422119140625,
+          -0.005950927734375,
+          0.005035400390625,
+          -0.011474609375,
+          0.00238037109375,
+          -0.004547119140625,
+          0.01177978515625,
+          0.0115966796875,
+          0.0030517578125,
+          -8.7738037109375e-05,
+          -0.00335693359375,
+          0.00592041015625,
+          0.009033203125,
+          0.00139617919921875,
+          -0.0185546875,
+          -0.004547119140625,
+          0.00543212890625,
+          0.02001953125,
+          -0.01019287109375,
+          -0.01275634765625,
+          0.005950927734375,
+          0.00921630859375,
+          0.00131988525390625,
+          2.2530555725097656e-05,
+          -0.00604248046875,
+          0.00885009765625,
+          -0.000335693359375,
+          -0.00848388671875,
+          -0.0072021484375,
+          0.0037841796875,
+          0.00177764892578125,
+          -0.0113525390625,
+          -0.00909423828125,
+          0.004669189453125,
+          -0.01153564453125,
+          0.00390625,
+          0.01116943359375,
+          -0.002288818359375,
+          -0.005615234375,
+          -0.00051116943359375,
+          0.0029144287109375,
+          0.0159912109375,
+          -0.017578125,
+          -0.01416015625,
+          0.0017547607421875,
+          0.00933837890625,
+          0.000835418701171875,
+          0.0064697265625,
+          -0.01080322265625,
+          0.0172119140625,
+          -0.007659912109375,
+          0.00159454345703125,
+          0.006500244140625,
+          -0.00750732421875,
+          0.002532958984375,
+          -0.00909423828125,
+          0.006744384765625,
+          -0.0133056640625,
+          0.002288818359375,
+          -0.00101470947265625,
+          0.003753662109375,
+          -0.0128173828125,
+          0.0081787109375,
+          0.000247955322265625,
+          -0.004302978515625,
+          0.01300048828125,
+          -0.0019989013671875,
+          0.01031494140625,
+          0.0015869140625,
+          0.0135498046875,
+          -0.00323486328125,
+          -0.00021648406982421875,
+          0.00927734375,
+          -0.01226806640625,
+          -0.00946044921875,
+          0.011474609375,
+          -0.01031494140625,
+          -0.006927490234375,
+          -0.0118408203125,
+          0.004913330078125,
+          0.01446533203125,
+          0.0174560546875,
+          -0.00153350830078125,
+          0.005126953125,
+          0.00113677978515625,
+          -0.000141143798828125,
+          0.01373291015625,
+          0.00738525390625,
+          -0.007415771484375,
+          -0.005615234375,
+          -0.00927734375,
+          0.012939453125,
+          0.00173187255859375,
+          -0.00043487548828125,
+          -0.012451171875,
+          0.0101318359375,
+          -0.00150299072265625,
+          -0.006591796875,
+          0.0107421875,
+          0.025634765625,
+          0.0003414154052734375,
+          -0.00017070770263671875,
+          -0.01171875,
+          0.01806640625,
+          0.006256103515625,
+          0.00982666015625,
+          -0.0030670166015625,
+          -0.0091552734375,
+          -0.0179443359375,
+          0.0020751953125,
+          0.006744384765625,
+          -0.00445556640625,
+          -0.00335693359375,
+          -0.00543212890625,
+          -0.015869140625,
+          -0.005523681640625,
+          0.0118408203125,
+          0.0011138916015625,
+          -0.00543212890625,
+          -0.00013637542724609375,
+          -0.001617431640625,
+          0.001617431640625,
+          0.004150390625,
+          0.00074005126953125,
+          -0.019287109375,
+          -0.0078125,
+          -0.016357421875,
+          -0.0146484375,
+          -0.003143310546875,
+          0.0025787353515625,
+          -0.019287109375,
+          -0.005218505859375,
+          -0.00830078125,
+          0.01080322265625,
+          -0.004180908203125,
+          -0.009765625,
+          -0.006927490234375,
+          -0.00823974609375,
+          -0.005035400390625,
+          -0.0185546875,
+          -0.019775390625,
+          0.00011396408081054688,
+          -0.0020751953125,
+          -0.00927734375,
+          -0.006622314453125,
+          0.0037078857421875,
+          -0.0027923583984375,
+          0.0017242431640625,
+          0.001983642578125,
+          -0.007080078125,
+          -0.00640869140625,
+          -0.007659912109375,
+          0.0072021484375,
+          0.002044677734375,
+          -0.01214599609375,
+          0.00171661376953125,
+          -0.0003204345703125,
+          -0.0002765655517578125,
+          0.00921630859375,
+          0.00738525390625,
+          0.00958251953125,
+          -0.000583648681640625,
+          -0.0169677734375,
+          0.000453948974609375,
+          0.006317138671875,
+          -0.0137939453125,
+          -0.018798828125,
+          0.0196533203125,
+          0.01434326171875,
+          0.0030059814453125,
+          0.006195068359375,
+          0.01025390625,
+          0.015625,
+          -0.00897216796875,
+          0.004638671875,
+          -0.03466796875,
+          -0.0008697509765625,
+          -0.000835418701171875,
+          0.0024261474609375,
+          -0.012939453125,
+          0.00848388671875,
+          -0.000820159912109375,
+          -0.00927734375,
+          -0.015625,
+          0.00567626953125,
+          -0.0016632080078125,
+          -0.0019989013671875,
+          -0.0028533935546875,
+          -0.002777099609375,
+          0.0025482177734375,
+          0.01055908203125,
+          0.00714111328125,
+          -0.01055908203125,
+          0.00162506103515625,
+          0.0098876953125,
+          -0.00421142578125,
+          0.0024261474609375,
+          0.01373291015625,
+          0.01611328125,
+          -0.0106201171875,
+          -0.0004405975341796875,
+          -0.0045166015625,
+          -0.0038909912109375,
+          0.00145721435546875,
+          0.01123046875,
+          0.0022430419921875,
+          -0.0078125,
+          0.01177978515625,
+          -0.00142669677734375,
+          -0.000701904296875,
+          -0.0009613037109375,
+          0.01556396484375,
+          0.01019287109375,
+          -0.0155029296875,
+          -0.00537109375,
+          0.01483154296875,
+          -0.01043701171875,
+          0.01165771484375,
+          -0.00799560546875,
+          -0.00390625,
+          -0.00174713134765625,
+          0.009033203125,
+          0.00372314453125,
+          -0.004852294921875,
+          -0.003082275390625,
+          0.012939453125,
+          -0.01055908203125,
+          -0.0052490234375,
+          0.0022125244140625,
+          0.001556396484375,
+          -0.010498046875,
+          0.0020599365234375,
+          0.01611328125,
+          -0.00994873046875,
+          -0.0189208984375,
+          -0.007537841796875,
+          -0.00150299072265625,
+          1.0192394256591797e-05,
+          -0.007598876953125,
+          0.0047607421875,
+          -0.0096435546875,
+          -0.0166015625,
+          0.0126953125,
+          -0.004547119140625,
+          -0.005828857421875,
+          0.0007781982421875,
+          -0.0074462890625,
+          0.000701904296875,
+          0.0018768310546875,
+          0.00396728515625,
+          0.0107421875,
+          -0.0062255859375,
+          0.0211181640625,
+          -0.0194091796875,
+          0.004058837890625,
+          -0.005096435546875,
+          0.0036773681640625,
+          0.00726318359375,
+          -0.003662109375,
+          0.00885009765625,
+          -0.008056640625,
+          0.01446533203125,
+          -0.010009765625,
+          0.002288818359375,
+          0.000629425048828125,
+          0.003814697265625,
+          7.581710815429688e-05,
+          0.001739501953125,
+          -0.0068359375,
+          0.00640869140625,
+          0.002655029296875,
+          0.0115966796875,
+          -0.0062255859375,
+          -0.0032806396484375,
+          0.01116943359375,
+          0.000690460205078125,
+          -0.0062255859375,
+          -0.01043701171875,
+          0.0003662109375,
+          0.01519775390625,
+          -0.00384521484375,
+          0.002227783203125,
+          -0.0027618408203125,
+          -0.01171875,
+          0.00286865234375,
+          -0.001495361328125,
+          0.00177764892578125,
+          -0.009033203125,
+          -0.006744384765625,
+          -0.0184326171875,
+          0.0023193359375,
+          -0.01190185546875,
+          0.006103515625,
+          0.005218505859375,
+          5.3882598876953125e-05,
+          0.0013427734375,
+          0.00360107421875,
+          -0.0031585693359375,
+          0.0068359375,
+          0.00156402587890625,
+          0.0050048828125,
+          0.02001953125,
+          -0.00323486328125,
+          -0.01165771484375,
+          -0.01275634765625,
+          0.0002269744873046875,
+          0.00104522705078125,
+          -0.0004177093505859375,
+          -0.006500244140625,
+          0.0008087158203125,
+          -0.01123046875,
+          0.00823974609375,
+          0.00738525390625,
+          -0.0019683837890625,
+          -0.005340576171875,
+          -0.01214599609375,
+          -0.0027008056640625,
+          0.0040283203125,
+          0.01220703125,
+          -0.006988525390625,
+          -0.00579833984375,
+          0.00372314453125,
+          -0.002197265625,
+          -0.007720947265625,
+          -0.005157470703125,
+          -0.003448486328125,
+          -0.011962890625,
+          0.0125732421875,
+          -0.00125885009765625,
+          0.0010223388671875,
+          0.0012054443359375,
+          -0.0150146484375,
+          -0.00127410888671875,
+          0.01007080078125,
+          0.00445556640625,
+          -0.001190185546875,
+          0.006866455078125,
+          0.0164794921875,
+          -0.018310546875,
+          -0.00408935546875,
+          -0.0001392364501953125,
+          0.00543212890625,
+          0.0020294189453125,
+          0.0003986358642578125,
+          0.010498046875,
+          -0.0189208984375,
+          -0.01263427734375,
+          -0.000972747802734375,
+          -0.00787353515625,
+          0.00811767578125,
+          -0.01263427734375,
+          -0.006500244140625,
+          -0.00689697265625,
+          0.01263427734375,
+          -0.0024566650390625,
+          0.0198974609375,
+          -0.006805419921875,
+          0.00958251953125,
+          -0.0107421875,
+          -0.0031585693359375,
+          0.021484375,
+          -0.0118408203125,
+          -0.001708984375,
+          0.00982666015625,
+          -0.0022430419921875,
+          -0.01025390625,
+          -0.00762939453125,
+          -0.0162353515625,
+          -0.00057220458984375,
+          0.00286865234375,
+          -0.0020904541015625,
+          -0.000255584716796875,
+          0.01104736328125,
+          -0.006683349609375,
+          0.0020751953125,
+          0.000362396240234375,
+          -0.0052490234375,
+          0.0011444091796875,
+          -0.021484375,
+          -0.00026702880859375,
+          0.010009765625,
+          -0.0057373046875,
+          0.0140380859375,
+          -0.00946044921875,
+          0.0072021484375,
+          0.0028076171875,
+          -0.0159912109375,
+          -0.00335693359375,
+          0.0177001953125,
+          0.0027923583984375,
+          0.005706787109375,
+          0.005584716796875,
+          0.0084228515625,
+          -0.001434326171875,
+          -0.00958251953125,
+          -0.00848388671875,
+          -0.0093994140625,
+          -0.0093994140625,
+          0.01214599609375,
+          -0.01312255859375,
+          -0.01287841796875,
+          -0.004638671875,
+          -0.002410888671875,
+          0.005828857421875,
+          -0.004669189453125,
+          -0.006927490234375,
+          0.002716064453125,
+          -0.0089111328125,
+          0.004730224609375,
+          0.0157470703125,
+          -0.00173187255859375,
+          0.00823974609375,
+          -0.00106048583984375,
+          -0.01953125,
+          0.0009918212890625,
+          0.0026397705078125,
+          0.01397705078125,
+          0.003265380859375,
+          0.001556396484375,
+          -0.00116729736328125,
+          -0.001617431640625,
+          0.009033203125,
+          -0.00823974609375,
+          0.00732421875,
+          -0.002197265625,
+          -0.01495361328125,
+          -0.019775390625,
+          0.004058837890625,
+          0.01513671875,
+          0.008056640625,
+          -0.0111083984375,
+          0.0068359375,
+          0.004669189453125,
+          0.01409912109375,
+          0.0001277923583984375,
+          -0.0036773681640625,
+          -0.00555419921875,
+          0.00408935546875,
+          -0.01531982421875,
+          0.00081634521484375,
+          0.007080078125,
+          -0.01080322265625,
+          0.00665283203125,
+          -0.005584716796875,
+          -0.00457763671875,
+          -0.0125732421875,
+          0.01141357421875,
+          -0.0108642578125,
+          0.0277099609375,
+          -0.016845703125,
+          -0.01385498046875,
+          -0.0107421875,
+          -0.0123291015625,
+          -0.01483154296875,
+          -0.0005035400390625,
+          -0.00677490234375,
+          -0.006805419921875,
+          0.0301513671875,
+          0.00982666015625,
+          -0.00194549560546875,
+          0.01519775390625,
+          0.0028076171875,
+          -0.01531982421875,
+          -0.0076904296875,
+          0.0048828125,
+          0.00726318359375,
+          -0.004119873046875,
+          -0.008056640625,
+          0.0037689208984375,
+          0.01556396484375,
+          -0.022216796875,
+          -0.0079345703125,
+          0.01446533203125,
+          0.00933837890625,
+          0.01129150390625,
+          -0.021240234375,
+          0.0038604736328125,
+          -0.00396728515625,
+          -0.001678466796875,
+          0.005706787109375,
+          -0.006683349609375,
+          -0.0009002685546875,
+          -0.00075531005859375,
+          -0.00020122528076171875,
+          -0.00127410888671875,
+          -0.016845703125,
+          0.0011138916015625,
+          0.0145263671875,
+          -0.002593994140625,
+          0.00262451171875,
+          0.0034027099609375,
+          -0.0010528564453125,
+          -0.0040283203125,
+          0.0008392333984375,
+          -0.00054168701171875,
+          0.005950927734375,
+          0.0155029296875,
+          0.0050048828125,
+          0.000873565673828125,
+          0.007476806640625,
+          -0.0206298828125,
+          0.00135040283203125,
+          0.000751495361328125,
+          0.0057373046875,
+          0.0016021728515625,
+          0.0098876953125,
+          0.0093994140625,
+          0.00408935546875,
+          -0.0174560546875,
+          -0.01495361328125,
+          0.00244140625,
+          0.00836181640625,
+          -0.00213623046875,
+          0.0004024505615234375,
+          0.00640869140625,
+          -0.001953125,
+          0.0089111328125,
+          -0.005584716796875,
+          0.006591796875,
+          0.004730224609375,
+          0.0010223388671875,
+          0.0125732421875,
+          0.007476806640625,
+          -0.00058746337890625,
+          0.004974365234375,
+          0.01531982421875,
+          0.003936767578125,
+          -0.005706787109375,
+          0.005157470703125,
+          -0.00156402587890625,
+          0.001983642578125,
+          0.0115966796875,
+          -0.0272216796875,
+          -0.01953125,
+          -0.00025177001953125,
+          -0.003173828125,
+          -0.003173828125,
+          0.00897216796875,
+          -0.01202392578125,
+          -0.002471923828125,
+          0.01556396484375,
+          0.001190185546875,
+          -0.0218505859375,
+          -2.9802322387695312e-05,
+          -0.015869140625,
+          0.0118408203125,
+          -0.004974365234375,
+          -0.00347900390625,
+          -0.003997802734375,
+          -0.0029296875,
+          -0.00390625,
+          0.0150146484375,
+          0.00457763671875,
+          -0.00020313262939453125,
+          -0.005157470703125,
+          -0.010009765625,
+          -0.0022735595703125,
+          0.006561279296875,
+          -0.0103759765625,
+          -0.01239013671875,
+          0.0045166015625,
+          -0.0030670166015625,
+          -0.00933837890625,
+          -0.00616455078125,
+          -0.00250244140625,
+          0.01031494140625,
+          0.00193023681640625,
+          -0.0035247802734375,
+          0.001251220703125,
+          0.0022735595703125,
+          -0.006378173828125,
+          -0.00787353515625,
+          -0.0263671875,
+          -0.007537841796875,
+          -0.001953125,
+          0.01177978515625,
+          -0.0037078857421875,
+          -0.01556396484375,
+          -0.00897216796875,
+          -0.0032958984375,
+          0.00860595703125,
+          -0.002288818359375,
+          -0.002105712890625,
+          -0.0042724609375,
+          -0.0205078125,
+          0.0069580078125,
+          -0.0028076171875,
+          0.004302978515625,
+          -0.0146484375,
+          0.00665283203125,
+          -0.0004367828369140625,
+          -0.01275634765625,
+          -0.001068115234375,
+          -0.007720947265625,
+          0.01544189453125,
+          0.0218505859375,
+          -0.01953125,
+          -0.00897216796875,
+          -0.0186767578125,
+          0.0081787109375,
+          -0.001495361328125,
+          0.007110595703125,
+          0.01202392578125,
+          -0.0118408203125,
+          -0.007568359375,
+          -0.007080078125,
+          -0.00848388671875,
+          -0.004669189453125,
+          0.00469970703125,
+          -0.0008392333984375,
+          0.0022125244140625,
+          0.0032958984375,
+          -0.01025390625,
+          0.006072998046875,
+          0.0030975341796875,
+          0.002349853515625,
+          0.00762939453125,
+          0.0079345703125,
+          -0.0013427734375,
+          -0.00238037109375,
+          -0.003814697265625,
+          -0.001983642578125,
+          0.0025177001953125,
+          -0.01513671875,
+          0.005645751953125,
+          -0.00013065338134765625,
+          -0.0113525390625,
+          -0.0038299560546875,
+          -0.00927734375,
+          -0.0125732421875,
+          -0.004669189453125,
+          -0.0033416748046875,
+          -0.0035552978515625,
+          0.0093994140625,
+          0.00189971923828125,
+          -9.250640869140625e-05,
+          0.000164031982421875,
+          0.000568389892578125,
+          0.00537109375,
+          -0.005523681640625,
+          0.002899169921875,
+          -0.0098876953125,
+          -0.0137939453125,
+          -0.0030059814453125,
+          -0.00701904296875,
+          -0.0084228515625,
+          -0.000823974609375,
+          0.00799560546875,
+          -0.005706787109375,
+          0.00823974609375,
+          -0.00946044921875,
+          -0.0030517578125,
+          -0.0169677734375,
+          0.006378173828125,
+          0.0024566650390625,
+          0.00775146484375,
+          0.00101470947265625,
+          -0.00848388671875,
+          -0.003265380859375,
+          -0.004608154296875,
+          -0.004364013671875,
+          0.001312255859375,
+          0.0111083984375,
+          0.001312255859375,
+          -0.0078125,
+          0.0003509521484375,
+          -0.00131988525390625,
+          -0.0024261474609375,
+          0.0047607421875,
+          -0.01129150390625,
+          0.005645751953125,
+          -0.0103759765625,
+          0.007232666015625,
+          0.000408172607421875,
+          0.006011962890625,
+          0.004547119140625,
+          0.00136566162109375,
+          -0.01361083984375,
+          -0.01055908203125,
+          -0.000904083251953125,
+          0.003509521484375,
+          0.0037689208984375,
+          -0.024658203125,
+          0.00909423828125,
+          0.0034942626953125,
+          0.0113525390625,
+          0.005859375,
+          -0.0027313232421875,
+          0.0010528564453125,
+          0.0164794921875,
+          -0.01226806640625,
+          -0.013427734375,
+          0.00023746490478515625,
+          0.01409912109375,
+          0.01123046875,
+          -0.00872802734375,
+          -0.0010528564453125,
+          0.006011962890625,
+          -0.004608154296875,
+          0.00738525390625,
+          -0.00341796875,
+          -0.00482177734375,
+          0.0024261474609375,
+          0.0089111328125,
+          0.0048828125,
+          0.007110595703125,
+          0.002899169921875,
+          -0.004302978515625,
+          0.004486083984375,
+          0.00714111328125,
+          0.0035858154296875,
+          -0.01092529296875,
+          0.0045166015625,
+          0.00148773193359375,
+          0.00118255615234375,
+          0.00439453125,
+          -0.0135498046875,
+          0.005523681640625,
+          -0.01055908203125,
+          -0.004364013671875,
+          -0.00567626953125,
+          -0.0050048828125,
+          -0.006011962890625,
+          -0.00848388671875,
+          -0.000545501708984375,
+          -0.01153564453125,
+          0.00579833984375,
+          0.0064697265625,
+          -0.004180908203125,
+          -0.00311279296875,
+          -0.000888824462890625,
+          0.0025177001953125,
+          0.0012054443359375,
+          0.0087890625,
+          -0.005401611328125,
+          0.0032806396484375,
+          -0.01190185546875,
+          -0.009033203125,
+          -0.0111083984375,
+          -0.000640869140625,
+          -0.009765625,
+          -0.0167236328125,
+          -0.0023956298828125,
+          0.00023937225341796875,
+          -0.0189208984375,
+          -0.007080078125,
+          -0.00014019012451171875,
+          -0.00958251953125,
+          -0.0076904296875,
+          -0.0027008056640625,
+          0.0047607421875,
+          0.0087890625,
+          -0.0047607421875,
+          -1.0967254638671875e-05,
+          0.010009765625,
+          0.003387451171875,
+          0.015869140625,
+          0.0096435546875,
+          0.010009765625,
+          1.1861324310302734e-05,
+          0.001678466796875,
+          -0.00055694580078125,
+          -0.00140380859375,
+          -0.0031280517578125,
+          -0.005645751953125,
+          -0.00162506103515625,
+          -0.003326416015625,
+          0.0181884765625
+        ]
+      ]
+    ]
+  ],
+  "tie_word_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "vision_lora": {
+    "dp": 0.0,
+    "layer": "layers.*((self_attn\\.(qkv_proj|o_proj))|(mlp\\.(gate_up|down)_proj))",
+    "lora_alpha": 512,
+    "r": 256
+  },
+  "vocab_size": 200064
+}

configuration_phi4mm.py ADDED Viewed

	@@ -0,0 +1,235 @@

+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Phi-4-MM model configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Phi4MMConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Phi4MMModel`]. It is used to instantiate a Phi-4-MM
+    model according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 200064):
+            Vocabulary size of the Phi-4-MM model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Phi4MMModel`].
+        hidden_size (`int`, *optional*, defaults to 3072):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            Dropout probability for mlp outputs.
+        embd_pdrop (`int`, *optional*, defaults to 0.0):
+            The dropout ratio for the embeddings.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio after computing the attention scores.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model was trained with. This is used to determine the size of the
+            original RoPE embeddings when using long scaling.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value used for the RMSNorm.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
+            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
+            the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
+            divided by the number of attention heads divided by 2.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            Percentage of the query and keys which will have rotary embedding.
+        bos_token_id (`int`, *optional*, defaults to 199999):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 199999):
+            The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*, defaults to 199999):
+            The id of the padding token.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If `None`, no sliding window is applied.
+    Example:
+    ```python
+    >>> from transformers import Phi4MMModel, Phi4MMConfig
+    >>> # Initializing a Phi-4-MM style configuration
+    >>> configuration = Phi4MMConfig.from_pretrained("TBA")
+    >>> # Initializing a model from the configuration
+    >>> model = Phi4MMModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "phi4mm"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=200064,
+        hidden_size=3072,
+        intermediate_size=8192,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attention_dropout=0.0,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        original_max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        partial_rotary_factor=1,
+        bos_token_id=199999,
+        eos_token_id=199999,
+        pad_token_id=199999,
+        sliding_window=None,
+        embd_layer: str = "default",
+        img_processor=None,
+        audio_processor=None,
+        vision_lora=None,
+        speech_lora=None,
+        **kwargs,
+    ):
+        self.embd_layer = embd_layer
+        self.img_processor = img_processor
+        self.audio_processor = audio_processor
+        self.vision_lora = vision_lora
+        self.speech_lora = speech_lora
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attention_dropout = attention_dropout
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.partial_rotary_factor = partial_rotary_factor
+        self._rope_scaling_adjustment()
+        self._rope_scaling_validation()
+        self.sliding_window = sliding_window
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    def _rope_scaling_adjustment(self):
+        """
+        Adjust the `type` of the `rope_scaling` configuration for backward compatibility.
+        """
+        if self.rope_scaling is None:
+            return
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        # For backward compatibility if previous version used "su" or "yarn"
+        if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]:
+            self.rope_scaling["type"] = "longrope"
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["longrope"]:
+            raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}")
+        if not (
+            isinstance(rope_scaling_short_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+            )
+        rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor)
+        if not len(rope_scaling_short_factor) == rotary_ndims // 2:
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_short_factor)}"
+            )
+        if not (
+            isinstance(rope_scaling_long_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+            )
+        if not len(rope_scaling_long_factor) == rotary_ndims // 2:
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}"
+            )

generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 199999,
+  "eos_token_id": [
+    200020,
+    199999
+  ],
+  "pad_token_id": 199999,
+  "transformers_version": "4.49.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

openvino_audio_embeddings_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43b1464773066a6585f88ac7445481d772fedd3f39aa3bcf26e749ef07eb151d
+size 320

openvino_audio_embeddings_model.xml ADDED Viewed

	@@ -0,0 +1,144 @@

+<?xml version="1.0"?>
+<net name="Model16683" version="11">
+	<layers>
+		<layer id="0" name="input_" type="Parameter" version="opset1">
+			<data shape="?,?,?" element_type="f32" />
+			<output>
+				<port id="0" precision="FP32" names="input_">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="1" name="Constant_3240972_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1, 80" offset="0" size="160" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>80</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="2" name="Constant_3240972" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>80</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>80</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="3" name="Multiply_3240960" type="Multiply" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>80</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>80</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="4" name="Constant_3240973_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1, 80" offset="160" size="160" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>80</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="5" name="Constant_3240973" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>80</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>80</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="6" name="aten::mul/Multiply" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>80</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>80</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>80</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="7" name="Result_3239208" type="Result" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>80</dim>
+				</port>
+			</input>
+		</layer>
+	</layers>
+	<edges>
+		<edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+		<edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
+		<edge from-layer="2" from-port="1" to-layer="3" to-port="1" />
+		<edge from-layer="3" from-port="2" to-layer="6" to-port="0" />
+		<edge from-layer="4" from-port="0" to-layer="5" to-port="0" />
+		<edge from-layer="5" from-port="1" to-layer="6" to-port="1" />
+		<edge from-layer="6" from-port="2" to-layer="7" to-port="0" />
+	</edges>
+	<rt_info>
+		<Runtime_version value="2025.1.0-18311-da00e90afb7" />
+		<conversion_parameters>
+			<framework value="pytorch" />
+			<is_python_object value="True" />
+		</conversion_parameters>
+	</rt_info>
+</net>

openvino_audio_encoder_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f8c8a1b1579a4db3292236a88a12650e8eee9833b40629e8e665f61d45c8535
+size 432016128

openvino_audio_encoder_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

openvino_audio_forward_embeddings_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e430beb598d4d854bc527fbe6c34b1a435cb4c1249325718ac92546ab82b6441
+size 25233456

openvino_audio_forward_embeddings_model.xml ADDED Viewed

	@@ -0,0 +1,906 @@

+<?xml version="1.0"?>
+<net name="Model16686" version="11">
+	<layers>
+		<layer id="0" name="input_tensor" type="Parameter" version="opset1">
+			<data shape="?,?,?" element_type="f32" />
+			<output>
+				<port id="0" precision="FP32" names="input_tensor">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="1" name="10" type="Const" version="opset1">
+			<data element_type="i64" shape="" offset="0" size="8" />
+			<output>
+				<port id="0" precision="I64" names="10" />
+			</output>
+		</layer>
+		<layer id="2" name="__module.embed/aten::unsqueeze/Unsqueeze" type="Unsqueeze" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="I64" />
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="13">
+					<dim>-1</dim>
+					<dim>1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="3" name="self.embed.conv.0.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1024, 1, 3, 3" offset="8" size="18432" />
+			<output>
+				<port id="0" precision="FP16" names="self.embed.conv.0.weight">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="4" name="self.embed.conv.0.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="5" name="__module.embed.conv.0/aten::_convolution/Convolution" type="Convolution" version="opset1">
+			<data strides="2, 2" dilations="1, 1" pads_begin="1, 1" pads_end="1, 1" auto_pad="explicit" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="6" name="__module.embed.conv.0/aten::_convolution/Reshape_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1024, 1, 1" offset="18440" size="2048" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="7" name="__module.embed.conv.0/aten::_convolution/Reshape" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="8" name="__module.embed.conv.0/aten::_convolution/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="25">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="9" name="__module.embed.conv.1/aten::relu/Relu" type="ReLU" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32" names="26">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="10" name="__module.embed.conv.2/aten::_convolution/Reshape_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1024, 1, 1, 3, 3" offset="20488" size="18432" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="11" name="__module.embed.conv.2/aten::_convolution/Reshape" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="12" name="__module.embed.conv.2/aten::_convolution/GroupConvolution" type="GroupConvolution" version="opset1">
+			<data strides="2, 2" pads_begin="1, 1" pads_end="1, 1" dilations="1, 1" auto_pad="explicit" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="13" name="__module.embed.conv.2/aten::_convolution/Reshape_1_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1024, 1, 1" offset="38920" size="2048" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="14" name="__module.embed.conv.2/aten::_convolution/Reshape_1" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="15" name="__module.embed.conv.2/aten::_convolution/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="33">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="16" name="self.embed.conv.3.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1024, 1024, 1, 1" offset="40968" size="2097152" />
+			<output>
+				<port id="0" precision="FP16" names="self.embed.conv.3.weight">
+					<dim>1024</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="17" name="self.embed.conv.3.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1024</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="18" name="__module.embed.conv.3/aten::_convolution/Convolution" type="Convolution" version="opset1">
+			<data strides="1, 1" dilations="1, 1" pads_begin="0, 0" pads_end="0, 0" auto_pad="explicit" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="19" name="__module.embed.conv.3/aten::_convolution/Reshape_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1024, 1, 1" offset="2138120" size="2048" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="20" name="__module.embed.conv.3/aten::_convolution/Reshape" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="21" name="__module.embed.conv.3/aten::_convolution/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="40">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="22" name="__module.embed.conv.1/aten::relu/Relu_1" type="ReLU" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32" names="41">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="23" name="__module.embed.conv.5/aten::_convolution/Reshape_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1024, 1, 1, 3, 3" offset="2140168" size="18432" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="24" name="__module.embed.conv.5/aten::_convolution/Reshape" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="25" name="__module.embed.conv.5/aten::_convolution/GroupConvolution" type="GroupConvolution" version="opset1">
+			<data strides="2, 2" pads_begin="1, 1" pads_end="1, 1" dilations="1, 1" auto_pad="explicit" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3</dim>
+					<dim>3</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="26" name="__module.embed.conv.5/aten::_convolution/Reshape_1_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1024, 1, 1" offset="2158600" size="2048" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="27" name="__module.embed.conv.5/aten::_convolution/Reshape_1" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="28" name="__module.embed.conv.5/aten::_convolution/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="48">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="29" name="self.embed.conv.6.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1024, 1024, 1, 1" offset="2160648" size="2097152" />
+			<output>
+				<port id="0" precision="FP16" names="self.embed.conv.6.weight">
+					<dim>1024</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="30" name="self.embed.conv.6.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1024</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="31" name="__module.embed.conv.6/aten::_convolution/Convolution" type="Convolution" version="opset1">
+			<data strides="1, 1" dilations="1, 1" pads_begin="0, 0" pads_end="0, 0" auto_pad="explicit" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="32" name="__module.embed.conv.6/aten::_convolution/Reshape_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1024, 1, 1" offset="4257800" size="2048" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="33" name="__module.embed.conv.6/aten::_convolution/Reshape" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="34" name="__module.embed.conv.6/aten::_convolution/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1024</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="55">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="35" name="__module.embed.conv.1/aten::relu/Relu_2" type="ReLU" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32" names="56,x">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="36" name="__module.embed/aten::transpose/Constant" type="Const" version="opset1">
+			<data element_type="i32" shape="4" offset="4259848" size="16" />
+			<output>
+				<port id="0" precision="I32">
+					<dim>4</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="37" name="__module.embed/aten::transpose/Transpose" type="Transpose" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="I32">
+					<dim>4</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="59">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="38" name="Constant_3243459" type="Const" version="opset1">
+			<data element_type="i64" shape="3" offset="4259864" size="24" />
+			<rt_info>
+				<attribute name="precise" version="0" />
+			</rt_info>
+			<output>
+				<port id="0" precision="I64">
+					<dim>3</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="39" name="__module.embed/aten::reshape/Reshape" type="Reshape" version="opset1">
+			<data special_zero="true" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>1024</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="I64">
+					<dim>3</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="61">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="40" name="self.embed.out.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1024, 10240" offset="4259888" size="20971520" />
+			<output>
+				<port id="0" precision="FP16" names="self.embed.out.weight">
+					<dim>1024</dim>
+					<dim>10240</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="41" name="self.embed.out.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1024</dim>
+					<dim>10240</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>10240</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="42" name="__module.embed.out/aten::linear/MatMul" type="MatMul" version="opset1">
+			<data transpose_a="false" transpose_b="true" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1024</dim>
+					<dim>10240</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>1024</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="43" name="Constant_3243449_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1, 1024" offset="25231408" size="2048" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>1024</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="44" name="Constant_3243449" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>1024</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>1024</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="45" name="__module.embed.out/aten::linear/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>1024</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>1024</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>1024</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="46" name="Result_3241509" type="Result" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>1024</dim>
+				</port>
+			</input>
+		</layer>
+	</layers>
+	<edges>
+		<edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+		<edge from-layer="1" from-port="0" to-layer="2" to-port="1" />
+		<edge from-layer="2" from-port="2" to-layer="5" to-port="0" />
+		<edge from-layer="3" from-port="0" to-layer="4" to-port="0" />
+		<edge from-layer="4" from-port="1" to-layer="5" to-port="1" />
+		<edge from-layer="5" from-port="2" to-layer="8" to-port="0" />
+		<edge from-layer="6" from-port="0" to-layer="7" to-port="0" />
+		<edge from-layer="7" from-port="1" to-layer="8" to-port="1" />
+		<edge from-layer="8" from-port="2" to-layer="9" to-port="0" />
+		<edge from-layer="9" from-port="1" to-layer="12" to-port="0" />
+		<edge from-layer="10" from-port="0" to-layer="11" to-port="0" />
+		<edge from-layer="11" from-port="1" to-layer="12" to-port="1" />
+		<edge from-layer="12" from-port="2" to-layer="15" to-port="0" />
+		<edge from-layer="13" from-port="0" to-layer="14" to-port="0" />
+		<edge from-layer="14" from-port="1" to-layer="15" to-port="1" />
+		<edge from-layer="15" from-port="2" to-layer="18" to-port="0" />
+		<edge from-layer="16" from-port="0" to-layer="17" to-port="0" />
+		<edge from-layer="17" from-port="1" to-layer="18" to-port="1" />
+		<edge from-layer="18" from-port="2" to-layer="21" to-port="0" />
+		<edge from-layer="19" from-port="0" to-layer="20" to-port="0" />
+		<edge from-layer="20" from-port="1" to-layer="21" to-port="1" />
+		<edge from-layer="21" from-port="2" to-layer="22" to-port="0" />
+		<edge from-layer="22" from-port="1" to-layer="25" to-port="0" />
+		<edge from-layer="23" from-port="0" to-layer="24" to-port="0" />
+		<edge from-layer="24" from-port="1" to-layer="25" to-port="1" />
+		<edge from-layer="25" from-port="2" to-layer="28" to-port="0" />
+		<edge from-layer="26" from-port="0" to-layer="27" to-port="0" />
+		<edge from-layer="27" from-port="1" to-layer="28" to-port="1" />
+		<edge from-layer="28" from-port="2" to-layer="31" to-port="0" />
+		<edge from-layer="29" from-port="0" to-layer="30" to-port="0" />
+		<edge from-layer="30" from-port="1" to-layer="31" to-port="1" />
+		<edge from-layer="31" from-port="2" to-layer="34" to-port="0" />
+		<edge from-layer="32" from-port="0" to-layer="33" to-port="0" />
+		<edge from-layer="33" from-port="1" to-layer="34" to-port="1" />
+		<edge from-layer="34" from-port="2" to-layer="35" to-port="0" />
+		<edge from-layer="35" from-port="1" to-layer="37" to-port="0" />
+		<edge from-layer="36" from-port="0" to-layer="37" to-port="1" />
+		<edge from-layer="37" from-port="2" to-layer="39" to-port="0" />
+		<edge from-layer="38" from-port="0" to-layer="39" to-port="1" />
+		<edge from-layer="39" from-port="2" to-layer="42" to-port="0" />
+		<edge from-layer="40" from-port="0" to-layer="41" to-port="0" />
+		<edge from-layer="41" from-port="1" to-layer="42" to-port="1" />
+		<edge from-layer="42" from-port="2" to-layer="45" to-port="0" />
+		<edge from-layer="43" from-port="0" to-layer="44" to-port="0" />
+		<edge from-layer="44" from-port="1" to-layer="45" to-port="1" />
+		<edge from-layer="45" from-port="2" to-layer="46" to-port="0" />
+	</edges>
+	<rt_info>
+		<Runtime_version value="2025.1.0-18311-da00e90afb7" />
+		<conversion_parameters>
+			<framework value="pytorch" />
+			<is_python_object value="True" />
+		</conversion_parameters>
+	</rt_info>
+</net>

openvino_audio_text_projection_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f01e541b7df78261c7f24db84a60d13c0cc36b93cfdb4598af83a5cc0016276
+size 25178112

openvino_audio_text_projection_model.xml ADDED Viewed

	@@ -0,0 +1,264 @@

+<?xml version="1.0"?>
+<net name="Model27385" version="11">
+	<layers>
+		<layer id="0" name="input" type="Parameter" version="opset1">
+			<data shape="?,?,?" element_type="f32" />
+			<output>
+				<port id="0" precision="FP32" names="input">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="1" name="self.0.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="3072, 1024" offset="0" size="6291456" />
+			<output>
+				<port id="0" precision="FP16" names="self.0.weight">
+					<dim>3072</dim>
+					<dim>1024</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="2" name="self.0.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>3072</dim>
+					<dim>1024</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>1024</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="3" name="__module.0/aten::linear/MatMul" type="MatMul" version="opset1">
+			<data transpose_a="false" transpose_b="true" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>1024</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="4" name="Constant_4741778_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1, 3072" offset="6291456" size="6144" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="5" name="Constant_4741778" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="6" name="__module.0/aten::linear/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="10">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="7" name="__module.1/aten::gelu/Gelu" type="Gelu" version="opset7">
+			<data approximation_mode="ERF" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32" names="12">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="8" name="self.2.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="3072, 3072" offset="6297600" size="18874368" />
+			<output>
+				<port id="0" precision="FP16" names="self.2.weight">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="9" name="self.2.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="10" name="__module.2/aten::linear/MatMul" type="MatMul" version="opset1">
+			<data transpose_a="false" transpose_b="true" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="11" name="Constant_4741779_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1, 3072" offset="25171968" size="6144" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="12" name="Constant_4741779" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="13" name="__module.2/aten::linear/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="14" name="Result_4740047" type="Result" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+		</layer>
+	</layers>
+	<edges>
+		<edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+		<edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
+		<edge from-layer="2" from-port="1" to-layer="3" to-port="1" />
+		<edge from-layer="3" from-port="2" to-layer="6" to-port="0" />
+		<edge from-layer="4" from-port="0" to-layer="5" to-port="0" />
+		<edge from-layer="5" from-port="1" to-layer="6" to-port="1" />
+		<edge from-layer="6" from-port="2" to-layer="7" to-port="0" />
+		<edge from-layer="7" from-port="1" to-layer="10" to-port="0" />
+		<edge from-layer="8" from-port="0" to-layer="9" to-port="0" />
+		<edge from-layer="9" from-port="1" to-layer="10" to-port="1" />
+		<edge from-layer="10" from-port="2" to-layer="13" to-port="0" />
+		<edge from-layer="11" from-port="0" to-layer="12" to-port="0" />
+		<edge from-layer="12" from-port="1" to-layer="13" to-port="1" />
+		<edge from-layer="13" from-port="2" to-layer="14" to-port="0" />
+	</edges>
+	<rt_info>
+		<Runtime_version value="2025.1.0-18311-da00e90afb7" />
+		<conversion_parameters>
+			<framework value="pytorch" />
+			<is_python_object value="True" />
+		</conversion_parameters>
+	</rt_info>
+</net>

openvino_audio_vision_projection_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f01e541b7df78261c7f24db84a60d13c0cc36b93cfdb4598af83a5cc0016276
+size 25178112

openvino_audio_vision_projection_model.xml ADDED Viewed

	@@ -0,0 +1,264 @@

+<?xml version="1.0"?>
+<net name="Model27388" version="11">
+	<layers>
+		<layer id="0" name="input" type="Parameter" version="opset1">
+			<data shape="?,?,?" element_type="f32" />
+			<output>
+				<port id="0" precision="FP32" names="input">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="1" name="self.0.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="3072, 1024" offset="0" size="6291456" />
+			<output>
+				<port id="0" precision="FP16" names="self.0.weight">
+					<dim>3072</dim>
+					<dim>1024</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="2" name="self.0.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>3072</dim>
+					<dim>1024</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>1024</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="3" name="__module.0/aten::linear/MatMul" type="MatMul" version="opset1">
+			<data transpose_a="false" transpose_b="true" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>1024</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="4" name="Constant_4743673_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1, 3072" offset="6291456" size="6144" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="5" name="Constant_4743673" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="6" name="__module.0/aten::linear/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="10">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="7" name="__module.1/aten::gelu/Gelu" type="Gelu" version="opset7">
+			<data approximation_mode="ERF" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32" names="12">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="8" name="self.2.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="3072, 3072" offset="6297600" size="18874368" />
+			<output>
+				<port id="0" precision="FP16" names="self.2.weight">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="9" name="self.2.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="10" name="__module.2/aten::linear/MatMul" type="MatMul" version="opset1">
+			<data transpose_a="false" transpose_b="true" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="11" name="Constant_4743674_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1, 3072" offset="25171968" size="6144" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="12" name="Constant_4743674" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="13" name="__module.2/aten::linear/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="14" name="Result_4741942" type="Result" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+		</layer>
+	</layers>
+	<edges>
+		<edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+		<edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
+		<edge from-layer="2" from-port="1" to-layer="3" to-port="1" />
+		<edge from-layer="3" from-port="2" to-layer="6" to-port="0" />
+		<edge from-layer="4" from-port="0" to-layer="5" to-port="0" />
+		<edge from-layer="5" from-port="1" to-layer="6" to-port="1" />
+		<edge from-layer="6" from-port="2" to-layer="7" to-port="0" />
+		<edge from-layer="7" from-port="1" to-layer="10" to-port="0" />
+		<edge from-layer="8" from-port="0" to-layer="9" to-port="0" />
+		<edge from-layer="9" from-port="1" to-layer="10" to-port="1" />
+		<edge from-layer="10" from-port="2" to-layer="13" to-port="0" />
+		<edge from-layer="11" from-port="0" to-layer="12" to-port="0" />
+		<edge from-layer="12" from-port="1" to-layer="13" to-port="1" />
+		<edge from-layer="13" from-port="2" to-layer="14" to-port="0" />
+	</edges>
+	<rt_info>
+		<Runtime_version value="2025.1.0-18311-da00e90afb7" />
+		<conversion_parameters>
+			<framework value="pytorch" />
+			<is_python_object value="True" />
+		</conversion_parameters>
+	</rt_info>
+</net>

openvino_language_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c023c393dd796ef9d1d16e4e03e5dc059a5d9bc99da25845c256d2edfa679c33
+size 2768460624

openvino_language_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

openvino_text_embeddings_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:464ddd70c11b2bc5d66f3da60e7edf051e848b6c520088768759ad9bd6645153
+size 1229193220

openvino_text_embeddings_model.xml ADDED Viewed

	@@ -0,0 +1,107 @@

+<?xml version="1.0"?>
+<net name="Model0" version="11">
+	<layers>
+		<layer id="0" name="input" type="Parameter" version="opset1">
+			<data shape="?,?" element_type="i64" />
+			<output>
+				<port id="0" precision="I64" names="input">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="1" name="self.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="200064, 3072" offset="0" size="1229193216" />
+			<output>
+				<port id="0" precision="FP16" names="self.weight">
+					<dim>200064</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="2" name="self.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>200064</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>200064</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="3" name="aten::embedding/Convert" type="Convert" version="opset1">
+			<data destination_type="i32" />
+			<input>
+				<port id="0" precision="I64">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="I32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="4" name="aten::embedding/Constant" type="Const" version="opset1">
+			<data element_type="i32" shape="" offset="1229193216" size="4" />
+			<output>
+				<port id="0" precision="I32" />
+			</output>
+		</layer>
+		<layer id="5" name="aten::embedding/Gather" type="Gather" version="opset8">
+			<data batch_dims="0" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>200064</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="I32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="2" precision="I32" />
+			</input>
+			<output>
+				<port id="3" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="6" name="Result_11" type="Result" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+		</layer>
+	</layers>
+	<edges>
+		<edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+		<edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
+		<edge from-layer="2" from-port="1" to-layer="5" to-port="0" />
+		<edge from-layer="3" from-port="1" to-layer="5" to-port="1" />
+		<edge from-layer="4" from-port="0" to-layer="5" to-port="2" />
+		<edge from-layer="5" from-port="3" to-layer="6" to-port="0" />
+	</edges>
+	<rt_info>
+		<Runtime_version value="2025.1.0-18311-da00e90afb7" />
+		<conversion_parameters>
+			<framework value="pytorch" />
+			<is_python_object value="True" />
+		</conversion_parameters>
+	</rt_info>
+</net>

openvino_vision_embeddings_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b70404777806200c0769a018ec95acf1e4fa7055dfbf03ff517bf13a8c30723c
+size 400024908

openvino_vision_embeddings_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

openvino_vision_projection_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f7e2f3f03b477a0a699a740ea365ef61c015142a03ceebb291a6d2ab9b2d4d7
+size 25964544

openvino_vision_projection_model.xml ADDED Viewed

	@@ -0,0 +1,264 @@

+<?xml version="1.0"?>
+<net name="Model33240" version="11">
+	<layers>
+		<layer id="0" name="input" type="Parameter" version="opset1">
+			<data shape="?,?,?" element_type="f32" />
+			<output>
+				<port id="0" precision="FP32" names="input">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="1" name="self.0.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="3072, 1152" offset="0" size="7077888" />
+			<output>
+				<port id="0" precision="FP16" names="self.0.weight">
+					<dim>3072</dim>
+					<dim>1152</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="2" name="self.0.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>3072</dim>
+					<dim>1152</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>1152</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="3" name="__module.0/aten::linear/MatMul" type="MatMul" version="opset1">
+			<data transpose_a="false" transpose_b="true" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>1152</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="4" name="Constant_5534472_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1, 3072" offset="7077888" size="6144" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="5" name="Constant_5534472" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="6" name="__module.0/aten::linear/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="10">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="7" name="__module.1/aten::gelu/Gelu" type="Gelu" version="opset7">
+			<data approximation_mode="ERF" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32" names="12">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="8" name="self.2.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="3072, 3072" offset="7084032" size="18874368" />
+			<output>
+				<port id="0" precision="FP16" names="self.2.weight">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="9" name="self.2.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="10" name="__module.2/aten::linear/MatMul" type="MatMul" version="opset1">
+			<data transpose_a="false" transpose_b="true" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>3072</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="11" name="Constant_5534473_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1, 1, 3072" offset="25958400" size="6144" />
+			<output>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="12" name="Constant_5534473" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="13" name="__module.2/aten::linear/Add" type="Add" version="opset1">
+			<data auto_broadcast="numpy" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="14" name="Result_5532741" type="Result" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3072</dim>
+				</port>
+			</input>
+		</layer>
+	</layers>
+	<edges>
+		<edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+		<edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
+		<edge from-layer="2" from-port="1" to-layer="3" to-port="1" />
+		<edge from-layer="3" from-port="2" to-layer="6" to-port="0" />
+		<edge from-layer="4" from-port="0" to-layer="5" to-port="0" />
+		<edge from-layer="5" from-port="1" to-layer="6" to-port="1" />
+		<edge from-layer="6" from-port="2" to-layer="7" to-port="0" />
+		<edge from-layer="7" from-port="1" to-layer="10" to-port="0" />
+		<edge from-layer="8" from-port="0" to-layer="9" to-port="0" />
+		<edge from-layer="9" from-port="1" to-layer="10" to-port="1" />
+		<edge from-layer="10" from-port="2" to-layer="13" to-port="0" />
+		<edge from-layer="11" from-port="0" to-layer="12" to-port="0" />
+		<edge from-layer="12" from-port="1" to-layer="13" to-port="1" />
+		<edge from-layer="13" from-port="2" to-layer="14" to-port="0" />
+	</edges>
+	<rt_info>
+		<Runtime_version value="2025.1.0-18311-da00e90afb7" />
+		<conversion_parameters>
+			<framework value="pytorch" />
+			<is_python_object value="True" />
+		</conversion_parameters>
+	</rt_info>
+</net>

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_phi4mm.Phi4MMProcessor",
+    "AutoImageProcessor": "processing_phi4mm.Phi4MMImageProcessor",
+    "AutoFeatureExtractor": "processing_phi4mm.Phi4MMAudioFeatureExtractor"
+  },
+  "image_processor_type": "Phi4MMImageProcessor",
+  "processor_class": "Phi4MMProcessor",
+  "feature_extractor_type": "Phi4MMAudioFeatureExtractor",
+  "audio_compression_rate": 8,
+  "audio_downsample_rate": 1,
+  "audio_feat_stride": 1,
+  "dynamic_hd": 36
+}

processing_phi4mm.py ADDED Viewed

	@@ -0,0 +1,733 @@

+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Phi4MM
+"""
+import re
+from typing import List, Optional, Tuple, Union
+import math
+from enum import Enum
+import numpy as np
+import scipy
+import torch
+import torchvision
+from transformers import AutoFeatureExtractor, AutoImageProcessor
+from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from transformers.image_utils import (
+    ImageInput,
+    make_list_of_images,
+    valid_images,
+)
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
+from transformers.utils import TensorType, logging
+from torch.nn.utils.rnn import pad_sequence
+logger = logging.get_logger(__name__)
+# Special tokens
+# The two patterns below match numbered placeholders of the form `<|image_N|>` and `<|audio_N|>` (N = one or more digits).
+_COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN = r'<\|image_\d+\|>'  # For backward compatibility
+_COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN = r'<\|audio_\d+\|>'  # For backward compatibility
+# Token strings and ids used for image / audio positions.
+# NOTE(review): the ids are hard-coded here; presumably they must agree with the tokenizer vocabulary — confirm.
+_IMAGE_SPECIAL_TOKEN = '<|endoftext10|>'
+_AUDIO_SPECIAL_TOKEN = '<|endoftext11|>'
+_IMAGE_SPECIAL_TOKEN_ID = 200010  # '<|endoftext10|>', or we can better name it (in `tokenizer_config.json`)
+_AUDIO_SPECIAL_TOKEN_ID = 200011  # '<|endoftext11|>'
+class InputMode(Enum):
+    """Named integer codes (0-3) for four input modes: LANGUAGE, VISION, SPEECH and VISION_SPEECH.
+    Not referenced in the visible part of this file; presumably selects the modality path downstream - TODO confirm.
+    """
+    LANGUAGE = 0
+    VISION = 1
+    SPEECH = 2
+    VISION_SPEECH = 3
+class Phi4MMImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Phi4MM image processor.
+    """
+    model_input_names = ["input_image_embeds", "image_sizes", "image_attention_mask"]
+    def __init__(
+        self,
+        dynamic_hd,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.dynamic_hd = dynamic_hd
+    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
+        best_ratio_diff = float('inf')
+        best_ratio = (1, 1)
+        area = width * height
+        for ratio in target_ratios:
+            target_aspect_ratio = ratio[0] / ratio[1]
+            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+            if ratio_diff < best_ratio_diff:
+                best_ratio_diff = ratio_diff
+                best_ratio = ratio
+            elif ratio_diff == best_ratio_diff:
+                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                    best_ratio = ratio
+        return best_ratio
+    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=384, mask_size=27, use_thumbnail=True):
+        orig_width, orig_height = image.size
+        w_crop_num = math.ceil(orig_width/float(image_size))
+        h_crop_num = math.ceil(orig_height/float(image_size))
+        if w_crop_num * h_crop_num > max_num:
+            aspect_ratio = orig_width / orig_height
+            # calculate the existing image aspect ratio
+            target_ratios = set(
+                (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+                i * j <= max_num and i * j >= min_num)
+            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+            # find the closest aspect ratio to the target
+            target_aspect_ratio = self.find_closest_aspect_ratio(
+                aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+            # calculate the target width and height
+            target_width = image_size * target_aspect_ratio[0]
+            target_height = image_size * target_aspect_ratio[1]
+        else:
+            target_width = image_size * w_crop_num
+            target_height = image_size * h_crop_num
+            target_aspect_ratio = (w_crop_num, h_crop_num)
+        # Calculate the ratio
+        ratio_width = target_width / orig_width
+        ratio_height = target_height / orig_height
+        if ratio_width < ratio_height:
+            new_size = (target_width, int(orig_height * ratio_width))
+            padding_width = 0
+            padding_height = target_height - int(orig_height * ratio_width)
+        else:
+            new_size = (int(orig_width * ratio_height), target_height)
+            padding_width = target_width - int(orig_width * ratio_height)
+            padding_height = 0
+        attention_mask = torch.ones((int(mask_size*target_aspect_ratio[1]), int(mask_size*target_aspect_ratio[0])))
+        if padding_width >= 14:
+            attention_mask[:, -math.floor(padding_width/14):] = 0
+        if padding_height >= 14:
+            attention_mask[-math.floor(padding_height/14):,:] = 0
+        assert attention_mask.sum() > 0
+        if min(new_size[1], target_height) < 10 or min(new_size[0], target_width) < 10:
+            raise ValueError(f'the aspect ratio is very extreme {new_size}')
+        image = torchvision.transforms.functional.resize(image, [new_size[1], new_size[0]],)
+        resized_img = torchvision.transforms.functional.pad(image, [0, 0, padding_width, padding_height], fill=[255,255,255])
+        return resized_img, attention_mask
+    def pad_to_max_num_crops(self, images, max_crops=5):
+        """
+        images: B x 3 x H x W, B<=max_crops
+        """
+        B, _, H, W = images.shape
+        if B < max_crops:
+            pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
+            images = torch.cat([images, pad], dim=0)
+        return images
+    def pad_mask_to_max_num_crops(self, masks, max_crops=5):
+        B, H, W = masks.shape
+        if B < max_crops:
+            pad = torch.ones(max_crops - B, H, W, dtype=masks.dtype, device=masks.device)
+            masks = torch.cat([masks, pad], dim=0)
+        return masks
+    def preprocess(
+        self,
+        images: ImageInput,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ):
+        """
+        Convert images into padded crop tensors, attention masks and per-image token counts.
+        Args:
+            images (`ImageInput`):
+                Image or list of images to preprocess. Every image is passed through
+                `.convert('RGB')`, so PIL images are expected here.
+            return_tensors (`str` or `TensorType`, *optional*):
+                Forwarded unchanged to `BatchFeature` as `tensor_type`.
+        Returns:
+            `BatchFeature` with the keys `input_image_embeds`, `image_sizes`,
+            `image_attention_mask` and `num_img_tokens`.
+        Raises:
+            ValueError: if `valid_images` rejects the input.
+        """
+        images = make_list_of_images(images)
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        # Basic settings.
+        # ToTensor followed by a per-channel Normalize with mean 0.5 and std 0.5.
+        img_processor = torchvision.transforms.Compose([
+            torchvision.transforms.ToTensor(),
+            torchvision.transforms.Normalize(
+                (0.5, 0.5, 0.5),
+                (0.5, 0.5, 0.5)
+            ),
+        ])
+        dyhd_base_resolution = 448
+        # Dynamic HD
+        base_resolution = dyhd_base_resolution
+        images = [image.convert('RGB') for image in images]
+        # cover 384 and 448 resolution
+        mask_resolution = base_resolution // 14
+        elems, image_attention_masks = [], []
+        for im in images:
+            elem, attention_mask = self.dynamic_preprocess(im, max_num=self.dynamic_hd, image_size=base_resolution, mask_size=mask_resolution)
+            elems.append(elem)
+            image_attention_masks.append(attention_mask)
+        hd_images = [img_processor(im) for im in elems]
+        # Global view: each padded image resized to a single base_resolution x base_resolution crop.
+        global_image = [torch.nn.functional.interpolate(im.unsqueeze(0).float(), size=(base_resolution, base_resolution), mode='bicubic',).to(im.dtype) for im in hd_images]
+        shapes = [[im.size(1), im.size(2)] for im in hd_images]
+        mask_shapes = [[mask.size(0), mask.size(1)] for mask in image_attention_masks]
+        global_attention_mask = [torch.ones((1, mask_resolution, mask_resolution)) for _ in hd_images]
+        # Cut every padded image into (h // base_resolution) x (w // base_resolution) crops, row by row.
+        hd_images_reshape = [im.reshape(1, 3,
+                                            h//base_resolution,
+                                            base_resolution,
+                                            w//base_resolution,
+                                            base_resolution
+                                            ).permute(0,2,4,1,3,5).reshape(-1, 3, base_resolution, base_resolution).contiguous() for im, (h, w) in zip(hd_images, shapes)]
+        # Same tiling for the attention masks, in units of mask_resolution cells.
+        attention_masks_reshape = [mask.reshape(1,
+                                            h//mask_resolution,
+                                            mask_resolution,
+                                            w//mask_resolution,
+                                            mask_resolution
+                                            ).permute(0,1,3,2,4).reshape(-1, mask_resolution, mask_resolution).contiguous() for mask, (h, w) in zip(image_attention_masks, mask_shapes)]
+        # Keep every second row/column of each crop mask, then stitch the crops back into one 2D mask per image.
+        downsample_attention_masks = [mask[:,0::2,0::2].reshape(1,
+                                            h//mask_resolution,
+                                            w//mask_resolution,
+                                            mask_resolution//2+mask_resolution%2,
+                                            mask_resolution//2+mask_resolution%2
+                                            ).permute(0,1,3,2,4) for mask, (h,w) in zip(attention_masks_reshape, mask_shapes)]
+        downsample_attention_masks = [mask.reshape(mask.size(1)*mask.size(2), mask.size(3)*mask.size(4))for mask in downsample_attention_masks]
+        # Per-image token count: fixed constants plus the number of active cells and of active rows
+        # in the downsampled mask. NOTE(review): the meaning of 256, 1 and 16 is model-specific - confirm.
+        num_img_tokens = [256 + 1 + int(mask.sum().item()) + int(mask[:,0].sum().item()) + 16 for mask in downsample_attention_masks]
+        # Put the global view first, followed by the crops of the same image.
+        hd_images_reshape = [torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)]
+        hd_masks_reshape = [torch.cat([_global_mask] + [_mask], dim=0) for _global_mask, _mask in zip(global_attention_mask, attention_masks_reshape)]
+        # Pad every image to the largest crop count in the batch so the results can be stacked.
+        max_crops = max([img.size(0) for img in hd_images_reshape])
+        image_transformed = [self.pad_to_max_num_crops(im, max_crops) for im in hd_images_reshape]
+        image_transformed = torch.stack(image_transformed, dim=0)
+        mask_transformed = [self.pad_mask_to_max_num_crops(mask, max_crops) for mask in hd_masks_reshape]
+        mask_transformed = torch.stack(mask_transformed, dim=0)
+        returned_input_image_embeds = image_transformed
+        returned_image_sizes = torch.tensor(shapes, dtype=torch.long)
+        returned_image_attention_mask = mask_transformed
+        returned_num_img_tokens = num_img_tokens
+        data = {
+            "input_image_embeds": returned_input_image_embeds,
+            "image_sizes": returned_image_sizes,
+            "image_attention_mask": returned_image_attention_mask,
+            "num_img_tokens": returned_num_img_tokens,
+        }
+        return BatchFeature(data=data, tensor_type=return_tensors)
+AudioInput = Tuple[Union[np.ndarray, torch.Tensor], int]  # (waveform, sample rate) pair, as unpacked in Phi4MMAudioFeatureExtractor.__call__
+AudioInputs = List[AudioInput]  # a batch of such pairs
+def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
+    """Create a Mel filter-bank the same as SpeechLib FbankFC.
+    Args:
+        sample_rate (int): Sample rate in Hz. number > 0 [scalar]
+        n_fft (int): FFT size. int > 0 [scalar]
+        n_mel (int): Mel filter size. int > 0 [scalar]
+        fmin (float): lowest frequency (in Hz). If None use 0.0.
+            float >= 0 [scalar]
+        fmax: highest frequency (in Hz). If None use sample_rate / 2.
+            float >= 0 [scalar]
+    Returns
+        out (numpy.ndarray): Mel transform matrix
+            [shape=(n_mels, 1 + n_fft/2)]
+    """
+    bank_width = int(n_fft // 2 + 1)
+    if fmax is None:
+        fmax = sample_rate / 2
+    if fmin is None:
+        fmin = 0
+    assert fmin >= 0, "fmin cannot be negtive"
+    assert fmin < fmax <= sample_rate / 2, "fmax must be between (fmin, samplerate / 2]"
+    def mel(f):
+        return 1127.0 * np.log(1.0 + f / 700.0)
+    def bin2mel(fft_bin):
+        return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0))
+    def f2bin(f):
+        return int((f * n_fft / sample_rate) + 0.5)
+    # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1]
+    klo = f2bin(fmin) + 1
+    khi = f2bin(fmax)
+    khi = max(khi, klo)
+    # Spec 2: SpeechLib uses trianges in Mel space
+    mlo = mel(fmin)
+    mhi = mel(fmax)
+    m_centers = np.linspace(mlo, mhi, n_mels + 2)
+    ms = (mhi - mlo) / (n_mels + 1)
+    matrix = np.zeros((n_mels, bank_width), dtype=np.float32)
+    for m in range(0, n_mels):
+        left = m_centers[m]
+        center = m_centers[m + 1]
+        right = m_centers[m + 2]
+        for fft_bin in range(klo, khi):
+            mbin = bin2mel(fft_bin)
+            if left < mbin < right:
+                matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms
+    return matrix
+class Phi4MMAudioFeatureExtractor(SequenceFeatureExtractor):
+    model_input_names = ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]
+    def __init__(self, audio_compression_rate, audio_downsample_rate, audio_feat_stride, **kwargs):
+        feature_size = 80
+        sampling_rate = 16000
+        padding_value = 0.0
+        super().__init__(feature_size, sampling_rate, padding_value, **kwargs)
+        self.compression_rate = audio_compression_rate
+        self.qformer_compression_rate = audio_downsample_rate
+        self.feat_stride = audio_feat_stride
+        self._eightk_method = "fillzero"
+        self._mel = speechlib_mel(16000, 512, 80, fmin=None, fmax=7690).T
+        self._hamming400 = np.hamming(400)  # for 16k audio
+        self._hamming200 = np.hamming(200)  # for 8k audio
+    def duration_to_frames(self, duration):
+        """duration in s, estimated frames"""
+        frame_rate = 10
+        num_frames = duration * 1000 // frame_rate
+        return num_frames
+    def __call__(
+        self,
+        audios: List[AudioInput],
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ):
+        # Ref: https://github.com/huggingface/transformers/blob/v4.47.0/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py#L161
+        returned_input_audio_embeds = []
+        returned_audio_embed_sizes = []
+        audio_frames_list = []
+        for audio_data, sample_rate in audios:
+            audio_embeds = self._extract_features(audio_data, sample_rate)
+            audio_frames = len(audio_embeds) * self.feat_stride
+            audio_embed_size = self._compute_audio_embed_size(audio_frames)
+            returned_input_audio_embeds.append(torch.tensor(audio_embeds))
+            returned_audio_embed_sizes.append(torch.tensor(audio_embed_size).long())
+            audio_frames_list.append(audio_frames)
+        returned_input_audio_embeds = pad_sequence(
+            returned_input_audio_embeds, batch_first=True
+        )
+        returned_audio_embed_sizes = torch.stack(returned_audio_embed_sizes, dim=0)
+        audio_frames = torch.tensor(audio_frames_list)
+        returned_audio_attention_mask = torch.arange(0, audio_frames.max()).unsqueeze(0) < audio_frames.unsqueeze(1) if len(audios) > 1 else None
+        data = {
+            "input_audio_embeds": returned_input_audio_embeds,
+            "audio_embed_sizes": returned_audio_embed_sizes,
+        }
+        if returned_audio_attention_mask is not None:
+            data["audio_attention_mask"] = returned_audio_attention_mask
+        return BatchFeature(data=data, tensor_type=return_tensors)
+    def _extract_spectrogram(self, wav, fs):
+        """Extract spectrogram features from waveform.
+        Args:
+            wav (1D array): waveform of the input
+            fs (int): sampling rate of the waveform, 16000 or 8000.
+                If fs=8000, the waveform will be resampled to 16000Hz.
+        Output:
+            log_fbank (2D array): a TxD matrix of log Mel filterbank features.
+                D=80, and T is the number of frames.
+        """
+        if wav.ndim > 1:
+            wav = np.squeeze(wav)
+        # by default, we extract the mean if stereo
+        if len(wav.shape) == 2:
+            wav = wav.mean(1)
+        # Resample to 16000 or 8000 if needed
+        if fs > 16000:
+            wav = scipy.signal.resample_poly(wav, 1, fs // 16000)
+            fs = 16000
+        elif 8000 < fs < 16000:
+            wav = scipy.signal.resample_poly(wav, 1, fs // 8000)
+            fs = 8000
+        elif fs < 8000:
+            raise RuntimeError(f"Unsupported sample rate {fs}")
+        if fs == 8000:
+            if self._eightk_method == "resample":
+                # Input audio is 8 kHz. Convert to 16 kHz before feature
+                # extraction
+                wav = scipy.signal.resample_poly(wav, 2, 1)
+                fs = 16000
+            # Do nothing here for fillzero method
+        elif fs != 16000:
+            # Input audio is not a supported sample rate.
+            raise RuntimeError(f"Input data using an unsupported sample rate: {fs}")
+        preemphasis = 0.97
+        if fs == 8000:
+            n_fft = 256
+            win_length = 200
+            hop_length = 80
+            fft_window = self._hamming200
+        elif fs == 16000:
+            n_fft = 512
+            win_length = 400
+            hop_length = 160
+            fft_window = self._hamming400
+        # Spec 1: SpeechLib cut remaining sample insufficient for a hop
+        n_batch = (wav.shape[0] - win_length) // hop_length + 1
+        # Here we don't use stride_tricks since the input array may not satisfy
+        # memory layout requirement and we need writeable output
+        # Here we only use list of views before copy to desination
+        # so it is more efficient than broadcasting
+        y_frames = np.array(
+            [wav[_stride : _stride + win_length] for _stride in range(0, hop_length * n_batch, hop_length)],
+            dtype=np.float32,
+        )
+        # Spec 2: SpeechLib applies preemphasis within each batch
+        y_frames_prev = np.roll(y_frames, 1, axis=1)
+        y_frames_prev[:, 0] = y_frames_prev[:, 1]
+        y_frames = (y_frames - preemphasis * y_frames_prev) * 32768
+        S = np.fft.rfft(fft_window * y_frames, n=n_fft, axis=1).astype(np.complex64)
+        if fs == 8000:
+            # Need to pad the output to look like 16 kHz data but with zeros in
+            # the 4 to 8 kHz bins.
+            frames, bins = S.shape
+            padarray = np.zeros((frames, bins))
+            S = np.concatenate((S[:, 0:-1], padarray), axis=1)  # Nyquist bin gets set to zero
+        spec = np.abs(S).astype(np.float32)
+        return spec
+    def _extract_features(self, wav, fs):
+        """Extract log filterbank features from waveform.
+        Args:
+            wav (1D array): waveform of the input
+            fs (int): sampling rate of the waveform, 16000 or 8000.
+                If fs=8000, the waveform will be resampled to 16000Hz.
+        Output:
+            log_fbank (2D array): a TxD matrix of log Mel filterbank features.
+                D=80, and T is the number of frames.
+        """
+        spec = self._extract_spectrogram(wav, fs)
+        spec_power = spec**2
+        fbank_power = np.clip(spec_power.dot(self._mel), 1.0, None)
+        log_fbank = np.log(fbank_power).astype(np.float32)
+        return log_fbank
+    def _compute_audio_embed_size(self, audio_frames):
+        integer = audio_frames // self.compression_rate
+        remainder = audio_frames % self.compression_rate
+        result = integer if remainder == 0 else integer + 1
+        integer = result // self.qformer_compression_rate
+        remainder = result % self.qformer_compression_rate
+        result = integer if remainder == 0 else integer + 1  # qformer compression
+        return result
+class Phi4MMProcessor(ProcessorMixin):
+    r"""
+    Constructs a Phi4MM processor which raps an image processor, a audio processor, and a GPT tokenizer into a single processor.
+    [`Phi4MMProcessor`] offers all the functionalities of [`Phi4MMImageProcessor`] and [`GPT2Tokenizer`]. See the
+    [`~Phi4MMProcessor.__call__`] and [`~Phi4MMProcessor.decode`] for more information.
+    Args:
+        image_processor ([`Phi4MMImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`GPT2Tokenizer`], *optional*):
+            The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "audio_processor", "tokenizer"]
+    tokenizer_class = "GPT2TokenizerFast"
+    image_processor_class = "AutoImageProcessor"  # Phi4MMImageProcessor will be registered later
+    audio_processor_class = "AutoFeatureExtractor"  # Phi4MMAudioFeatureExtractor will be registered later
+    def __init__(self, image_processor, audio_processor, tokenizer):
+        self.image_processor = image_processor
+        self.audio_processor = audio_processor
+        self.tokenizer = tokenizer
+    def __call__(
+        self,
+        text: Union[TextInput, List[TextInput]],
+        images: Optional[ImageInput] = None,
+        audios: Optional[AudioInputs] = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
+        max_length=None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forards the `text`
+        and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        Phi4MMImageProcessor's [`~Phi4MMImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model.
+            - **input_image_embeds** -- Pixel values to be fed to a model.
+            - **image_sizes** -- List of tuples specifying the size of each image in `input_image_embeds`.
+            - **image_attention_mask** -- List of attention masks for each image in `input_image_embeds`.
+            - **input_audio_embeds** -- Audio embeddings to be fed to a model.
+            - **audio_embed_sizes** -- List of integers specifying the size of each audio in `input_audio_embeds`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
+        """
+        image_inputs = self.image_processor(images, return_tensors=return_tensors) if images is not None else {}
+        audio_inputs = self.audio_processor(audios, return_tensors=return_tensors) if audios is not None else {}
+        inputs = self._convert_images_audios_text_to_inputs(
+            image_inputs,
+            audio_inputs,
+            text,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+        )
+        # idenfity the input mode
+        if len(image_inputs) > 0 and len(audio_inputs) > 0:
+            input_mode = InputMode.VISION_SPEECH
+        elif len(image_inputs) > 0:
+            input_mode = InputMode.VISION
+        elif len(audio_inputs) > 0:
+            input_mode = InputMode.SPEECH
+        else:
+            input_mode = InputMode.LANGUAGE
+        inputs["input_mode"] = torch.tensor([input_mode.value], dtype=torch.long)
+        return inputs
+    @property
+    def special_image_token_id(self):
+        return self.tokenizer.convert_tokens_to_ids(self.special_image_token)
+    def get_special_image_token_id(self):
+        return self.tokenizer.convert_tokens_to_ids(self.special_image_token)
+    @property
+    def chat_template(self):
+        return self.tokenizer.chat_template
+    def _convert_images_audios_text_to_inputs(
+        self, images, audios, text, padding=False, truncation=None, max_length=None, return_tensors=None
+    ):
+        # prepare image id to image input ids
+        if len(images) > 0:
+            input_image_embeds = images["input_image_embeds"]
+            image_sizes = images["image_sizes"]
+            image_attention_mask = images["image_attention_mask"]
+            num_img_tokens = images['num_img_tokens']
+        else:
+            input_image_embeds = torch.tensor([])
+            image_sizes = torch.tensor([])
+            image_attention_mask = torch.tensor([])
+            num_img_tokens = []
+        # prepare audio id to audio input ids
+        if len(audios) > 0:
+            input_audio_embeds = audios["input_audio_embeds"]
+            audio_embed_sizes = audios["audio_embed_sizes"]
+            audio_attention_mask = audios.get("audio_attention_mask", None)
+        else:
+            input_audio_embeds = torch.tensor([])
+            audio_embed_sizes = torch.tensor([])
+            audio_attention_mask = None
+        # Replace certain special tokens for compatibility
+        # Ref: https://stackoverflow.com/questions/11475885/python-replace-regex
+        if isinstance(text, str):
+            text = [text]
+        assert isinstance(text, list)
+        processed_text = [re.sub(_COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN, _IMAGE_SPECIAL_TOKEN, t) for t in text]
+        processed_text = [re.sub(_COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN, _AUDIO_SPECIAL_TOKEN, t) for t in processed_text]
+        input_ids_list = [self.tokenizer(t).input_ids for t in processed_text]
+        img_cnt, audio_cnt = 0, 0  # only needed for later assertion
+        image_token_count_iter = iter(num_img_tokens)
+        audio_embed_size_iter = iter(audio_embed_sizes.tolist())
+        new_input_ids_list = []
+        for input_ids in input_ids_list:
+            i = 0
+            while i < len(input_ids):
+                token_id = input_ids[i]
+                if token_id == _AUDIO_SPECIAL_TOKEN_ID:
+                    token_count = next(audio_embed_size_iter)
+                    audio_cnt += 1
+                elif token_id == _IMAGE_SPECIAL_TOKEN_ID:
+                    token_count = next(image_token_count_iter)
+                    img_cnt += 1
+                else:
+                    i += 1
+                    continue
+                tokens = [token_id] * token_count
+                input_ids = input_ids[:i] + tokens + input_ids[i + 1:]
+                i += token_count
+            input_ids = torch.tensor(input_ids, dtype=torch.long)
+            new_input_ids_list.append(input_ids)
+        lengths = torch.tensor([len(input_ids) for input_ids in new_input_ids_list])
+        max_len = lengths.max()
+        input_ids = input_ids.new_full((len(new_input_ids_list), max_len), self.tokenizer.pad_token_id)
+        # batched inference requires left padding
+        for i in range(len(new_input_ids_list)):
+            input_ids[i, max_len - len(new_input_ids_list[i]):] = new_input_ids_list[i]
+        # If the below assertion fails, it might be that input pure-text
+        # messages contain image/audio special tokens literally
+        # (<|endoftext10|>, <|endoftext11|>).
+        assert (
+            img_cnt == len(num_img_tokens)
+        ), (
+            f"Number of image tokens in prompt_token_ids ({img_cnt}) "
+            f"does not match number of images ({len(num_img_tokens)})"
+        )
+        assert (
+            audio_cnt == len(audio_embed_sizes)
+        ), (
+            f"Number of audio tokens in prompt_token_ids ({audio_cnt}) "
+            f"does not match number of audios ({len(audio_embed_sizes)})"
+        )
+        # prepare attention mask
+        seq_range = torch.arange(max_len - 1, -1, -1)
+        attention_mask = seq_range.unsqueeze(0) < lengths.unsqueeze(1)
+        # prepare batch feature
+        data = {
+            "input_ids": input_ids,
+            "input_image_embeds": input_image_embeds,
+            "image_sizes": image_sizes,
+            "image_attention_mask": image_attention_mask,
+            "input_audio_embeds": input_audio_embeds,
+            "audio_embed_sizes": audio_embed_sizes,
+            "audio_attention_mask": audio_attention_mask,
+            "attention_mask": attention_mask,
+        }
+        return BatchFeature(
+            data=data
+        )
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        audio_processor_input_names = self.audio_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
+AutoImageProcessor.register("Phi4MMImageProcessor", Phi4MMImageProcessor)
+AutoFeatureExtractor.register("Phi4MMAudioFeatureExtractor", Phi4MMAudioFeatureExtractor)

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_phi4mm.Phi4MMProcessor"
+  },
+  "processor_class": "Phi4MMProcessor"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c1b9f641d4f8b7247b8d5007dd3b6a9f6a87cb5123134fe0d326f14d10c0585
+size 15524479

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,126 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "199999": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200010": {
+      "content": "<|endoftext10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200011": {
+      "content": "<|endoftext11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200018": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200019": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "200020": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "200021": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "200022": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "200023": {
+      "content": "<|tool|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200024": {
+      "content": "<|/tool|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200025": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200026": {
+      "content": "<|/tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200027": {
+      "content": "<|tool_response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200028": {
+      "content": "<|tag|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff