Maximum2000 committed on 4 days ago

Commit

9fb1a94

verified ·

1 Parent(s): c4be67f

Upload 19 files

Browse files

Files changed (20) hide show

.gitattributes +7 -0
added_tokens.json +12 -0
genai_config.json +90 -0
merges.txt +0 -0
phi-4-mm-embedding.onnx +3 -0
phi-4-mm-embedding.onnx.data +3 -0
phi-4-mm-speech.onnx +3 -0
phi-4-mm-speech.onnx.data +3 -0
phi-4-mm-speech.onnx_adapter +3 -0
phi-4-mm-text.onnx +3 -0
phi-4-mm-text.onnx.data +3 -0
phi-4-mm-vision.onnx +3 -0
phi-4-mm-vision.onnx.data +3 -0
phi-4-mm-vision.onnx_adapter +3 -0
special_tokens_map.json +30 -0
speech_processor.json +48 -0
tokenizer.json +3 -0
tokenizer_config.json +126 -0
vision_processor.json +68 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+phi-4-mm-embedding.onnx.data filter=lfs diff=lfs merge=lfs -text
+phi-4-mm-speech.onnx_adapter filter=lfs diff=lfs merge=lfs -text
+phi-4-mm-speech.onnx.data filter=lfs diff=lfs merge=lfs -text
+phi-4-mm-text.onnx.data filter=lfs diff=lfs merge=lfs -text
+phi-4-mm-vision.onnx_adapter filter=lfs diff=lfs merge=lfs -text
+phi-4-mm-vision.onnx.data filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "<|/tool_call|>": 200026,
+  "<|/tool|>": 200024,
+  "<|assistant|>": 200019,
+  "<|end|>": 200020,
+  "<|system|>": 200022,
+  "<|tag|>": 200028,
+  "<|tool_call|>": 200025,
+  "<|tool_response|>": 200027,
+  "<|tool|>": 200023,
+  "<|user|>": 200021
+}

genai_config.json ADDED Viewed

	@@ -0,0 +1,90 @@

+{
+    "model": {
+        "bos_token_id": 199999,
+        "context_length": 131072,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": []
+            },
+            "filename": "phi-4-mm-text.onnx",
+            "head_size": 128,
+            "hidden_size": 3072,
+            "inputs": {
+                "inputs_embeds": "inputs_embeds",
+                "attention_mask": "attention_mask",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 24,
+            "num_hidden_layers": 32,
+            "num_key_value_heads": 8
+        },
+        "vision": {
+            "filename": "phi-4-mm-vision.onnx",
+            "config_filename": "vision_processor.json",
+            "adapter_filename": "phi-4-mm-vision.onnx_adapter",
+            "inputs": {
+                "pixel_values": "pixel_values",
+                "attention_mask": "image_attention_mask",
+                "image_sizes": "image_sizes"
+            },
+            "outputs": {
+                "image_features": "image_features"
+            }
+        },
+        "speech": {
+            "filename": "phi-4-mm-speech.onnx",
+            "config_filename": "speech_processor.json",
+            "adapter_filename": "phi-4-mm-speech.onnx_adapter",
+            "inputs": {
+                "audio_embeds": "audio_embeds",
+                "attention_mask": "audio_attention_mask",
+                "audio_sizes": "audio_sizes",
+                "audio_projection_mode": "audio_projection_mode"
+            },
+            "outputs": {
+                "audio_features": "audio_features"
+            }
+        },
+        "embedding": {
+            "filename": "phi-4-mm-embedding.onnx",
+            "inputs": {
+                "input_ids": "input_ids",
+                "image_features": "image_features",
+                "audio_features": "audio_features"
+            },
+            "outputs": {
+                "inputs_embeds": "inputs_embeds"
+            }
+        },
+        "eos_token_id": [
+            200020,
+            199999
+        ],
+        "pad_token_id": 199999,
+        "type": "phi4mm",
+        "vocab_size": 200064
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": false,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 131072,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": true,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 1,
+        "top_p": 1.0
+    }
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

phi-4-mm-embedding.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e201c5981608c06f93b15860993bf6029cfc3f60401290926c404ab84946dddd
+size 12008

phi-4-mm-embedding.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85e0389fd112f00584016e1429588410e07f548cbedc5111c84bafabb14c825b
+size 1229193216

phi-4-mm-speech.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:716d5b322e623e85e3051fd8e384d9c19efb335b1d2813077f91b03a067e400f
+size 392720

phi-4-mm-speech.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b88f9973d4882f50c34a3356678488e27bca2c986e053099356feed8f5f4060
+size 932832512

phi-4-mm-speech.onnx_adapter ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55b8ff5adb3d30e696617fc1d1b306dd93c30bfee61a98ac1833210ce5a4d18b
+size 1111538200

phi-4-mm-text.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff1c7af0c0dbada3e05ac194158aadcfd6e9ae536f36443165dd213c925c6a6e
+size 471015871

phi-4-mm-text.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dad7c5ab3dd37e47a8163d00e0796580ca23860455ec58d29b8652e5011f53da
+size 7672043520

phi-4-mm-vision.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:586dc996160c16b438f2249606b104c2e87a9269ce88c3f72804f0d9fa91b127
+size 358656

phi-4-mm-vision.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db9415c47e96318fbaf65352e14487c7b7a2454b4e71d57ae1bafe19e1368716
+size 822139712

phi-4-mm-vision.onnx_adapter ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e333ae744261958e83dbeda556d8860760b732bc9a00892c8988e69a2f20094e
+size 889240088

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

speech_processor.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+    "feature_extraction": {
+        "sequence": [
+            {
+                "operation": {
+                    "name": "audio_decoder",
+                    "type": "AudioDecoderEx",
+                    "attrs": {
+                        "target_sample_rates": [
+                            8000,
+                            16000
+                        ]
+                    }
+                }
+            },
+            {
+                "operation": {
+                    "name": "phi_4_audio_embed",
+                    "type": "Phi4AudioEmbed",
+                    "attrs": {
+                        "audio_compression_rate": 8,
+                        "stft_normal/n_fft": 512,
+                        "stft_normal/frame_length": 400,
+                        "stft_normal/hop_length": 160,
+                        "stft_normal/win_fn": "hamming",
+                        "logmel/chunk_size": 30,
+                        "logmel/hop_length": 160,
+                        "logmel/n_fft": 512,
+                        "logmel/n_mel": 80,
+                        "logmel/feature_first": 0,
+                        "logmel/no_padding": 1,
+                        "stft_normal_8k/n_fft": 256,
+                        "stft_normal_8k/frame_length": 200,
+                        "stft_normal_8k/hop_length": 80,
+                        "stft_normal_8k/win_fn": "hamming",
+                        "logmel_8k/chunk_size": 30,
+                        "logmel_8k/hop_length": 80,
+                        "logmel_8k/n_fft": 512,
+                        "logmel_8k/n_mel": 80,
+                        "logmel_8k/feature_first": 0,
+                        "logmel_8k/no_padding": 1
+                    }
+                }
+            }
+        ],
+        "output_aligner": "phi4-audio-aligner"
+    }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c1b9f641d4f8b7247b8d5007dd3b6a9f6a87cb5123134fe0d326f14d10c0585
+size 15524479

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,126 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "199999": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200010": {
+      "content": "<|endoftext10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200011": {
+      "content": "<|endoftext11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200018": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200019": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "200020": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "200021": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "200022": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "200023": {
+      "content": "<|tool|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200024": {
+      "content": "<|/tool|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200025": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200026": {
+      "content": "<|/tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200027": {
+      "content": "<|tool_response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "200028": {
+      "content": "<|tag|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

vision_processor.json ADDED Viewed

	@@ -0,0 +1,68 @@

+{
+    "processor": {
+        "name": "phi_4_vision_processor",
+        "transforms": [
+            {
+                "operation": {
+                    "name": "decode_image",
+                    "type": "DecodeImage",
+                    "attrs": {
+                        "color_space": "RGB"
+                    }
+                }
+            },
+            {
+                "operation": {
+                    "name": "phi4_vision_dynamic_preprocess",
+                    "type": "Phi4VisionDynamicPreprocess",
+                    "attrs": {
+                        "dynamic_hd": 36,
+                        "dyhd_base_resolution": 448
+                    }
+                }
+            },
+            {
+                "operation": {
+                    "name": "rescale",
+                    "type": "Rescale",
+                    "inputs": [
+                        ":0"
+                    ]
+                }
+            },
+            {
+                "operation": {
+                    "name": "normalize",
+                    "type": "Normalize",
+                    "attrs": {
+                        "mean": [
+                            0.5,
+                            0.5,
+                            0.5
+                        ],
+                        "std": [
+                            0.5,
+                            0.5,
+                            0.5
+                        ]
+                    }
+                }
+            },
+            {
+                "operation": {
+                    "name": "phi4_vision_processor",
+                    "type": "Phi4VisionProcessor",
+                    "inputs": [
+                        ":0",
+                        "phi4_vision_dynamic_preprocess:1"
+                    ],
+                    "attrs": {
+                        "dyhd_base_resolution": 448,
+                        "interpolation": "CUBIC"
+                    }
+                }
+            }
+        ],
+        "output_aligner": "phi4-vision-aligner"
+    }
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff