daekeun-ml committed on
Commit
67c143c
·
verified ·
1 Parent(s): 29afb89

Upload phi4-mm-gradio-demo.ipynb

Files changed (1)
  1. demos/phi4-mm-gradio-demo.ipynb +167 -0
demos/phi4-mm-gradio-demo.ipynb ADDED
@@ -0,0 +1,167 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "9dc4dd97-1a2a-409f-8db5-4e0f2f07d49d",
+ "metadata": {},
+ "source": [
+ "# Phi-4-multimodal simple demo\n",
+ "\n",
+ "Make sure to install `gradio`, `soundfile`, and `pillow` first:\n",
+ "\n",
+ "- `pip install gradio transformers torch soundfile pillow`\n",
+ "- Adapted from https://www.datacamp.com/tutorial/phi-4-multimodal"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ffc47b0-a12b-4b8a-9066-6f15acfc9210",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import gradio as gr\n",
+ "import torch\n",
+ "import soundfile as sf\n",
+ "from PIL import Image\n",
+ "from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig\n",
+ "\n",
+ "\n",
+ "max_new_tokens = 256\n",
+ "orig_model_path = \"microsoft/Phi-4-multimodal-instruct\"\n",
+ "ft_model_path = \"daekeun-ml/Phi-4-multimodal-finetune-ko-speech\"\n",
+ "generation_config = GenerationConfig.from_pretrained(ft_model_path, 'generation_config.json')\n",
+ "processor = AutoProcessor.from_pretrained(orig_model_path, trust_remote_code=True)\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ "    ft_model_path,\n",
+ "    trust_remote_code=True,\n",
+ "    torch_dtype='auto',\n",
+ "    _attn_implementation='flash_attention_2',\n",
+ ").cuda()\n",
+ "\n",
+ "user_prompt = '<|user|>'\n",
+ "assistant_prompt = '<|assistant|>'\n",
+ "prompt_suffix = '<|end|>'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4058364b-d041-4168-b8d7-26813467f454",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def clean_response(response, instruction_keywords):\n",
+ "    \"\"\"Removes the prompt text dynamically based on instruction keywords.\"\"\"\n",
+ "    for keyword in instruction_keywords:\n",
+ "        if response.lower().startswith(keyword.lower()):\n",
+ "            response = response[len(keyword):].strip()\n",
+ "    return response\n",
+ "\n",
+ "# Task prompts are taken from the technical report\n",
+ "asr_prompt = f'{user_prompt}<|audio_1|>Transcribe the audio clip into text.{prompt_suffix}{assistant_prompt}'\n",
+ "ast_ko_prompt = f'{user_prompt}<|audio_1|>Translate the audio to Korean.{prompt_suffix}{assistant_prompt}'\n",
+ "ast_cot_ko_prompt = f'{user_prompt}<|audio_1|>Transcribe the audio to text, and then translate the audio to Korean. Use <sep> as a separator between the original transcript and the translation.{prompt_suffix}{assistant_prompt}'\n",
+ "ast_en_prompt = f'{user_prompt}<|audio_1|>Translate the audio to English.{prompt_suffix}{assistant_prompt}'\n",
+ "ast_cot_en_prompt = f'{user_prompt}<|audio_1|>Transcribe the audio to text, and then translate the audio to English. Use <sep> as a separator between the original transcript and the translation.{prompt_suffix}{assistant_prompt}'\n",
+ "\n",
+ "def process_input(file, input_type, question):\n",
+ "    user_prompt = \"<|user|>\"\n",
+ "    assistant_prompt = \"<|assistant|>\"\n",
+ "    prompt_suffix = \"<|end|>\"\n",
+ "\n",
+ "    if input_type == \"Image\":\n",
+ "        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'\n",
+ "        image = Image.open(file)\n",
+ "        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)\n",
+ "    elif input_type == \"Audio\":\n",
+ "        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'\n",
+ "        audio, samplerate = sf.read(file)\n",
+ "        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)\n",
+ "    elif input_type == \"Text\":\n",
+ "        prompt = f'{user_prompt}{question} \"{file}\"{prompt_suffix}{assistant_prompt}'\n",
+ "        inputs = processor(text=prompt, return_tensors='pt').to(model.device)\n",
+ "    else:\n",
+ "        return \"Invalid input type\"\n",
+ " \n",
+ "    generate_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, generation_config=generation_config)\n",
+ "    response = processor.batch_decode(generate_ids, skip_special_tokens=True)[0]\n",
+ "    return clean_response(response, [question])\n",
+ "\n",
+ "def process_text_translate(text, target_language):\n",
+ "    prompt = f'Translate the following text to {target_language}:'\n",
+ "    return process_input(text, \"Text\", prompt)\n",
+ "\n",
+ "def process_text_grammar(text):\n",
+ "    prompt = 'Check the grammar and provide corrections if needed for the following text:'\n",
+ "    return process_input(text, \"Text\", prompt)\n",
+ "\n",
+ "def gradio_interface():\n",
+ "    with gr.Blocks() as demo:\n",
+ "        gr.Markdown(\"# Phi-4-Powered Multimodal Language Tutor\")\n",
+ "        with gr.Tab(\"Text-Based Learning\"):\n",
+ "            text_input = gr.Textbox(label=\"Enter Text\")\n",
+ "            language_input = gr.Textbox(label=\"Target Language\", value=\"Korean\")\n",
+ "            text_output = gr.Textbox(label=\"Response\")\n",
+ "            text_translate_btn = gr.Button(\"Translate\")\n",
+ "            text_grammar_btn = gr.Button(\"Check Grammar\")\n",
+ "            text_clear_btn = gr.Button(\"Clear\")\n",
+ "            text_translate_btn.click(process_text_translate, inputs=[text_input, language_input], outputs=text_output)\n",
+ "            text_grammar_btn.click(process_text_grammar, inputs=[text_input], outputs=text_output)\n",
+ "            text_clear_btn.click(lambda: (\"\", \"\", \"\"), outputs=[text_input, language_input, text_output])\n",
+ "        with gr.Tab(\"Image-Based Learning\"):\n",
+ "            image_input = gr.Image(type=\"filepath\", label=\"Upload Image\")\n",
+ "            language_input_image = gr.Textbox(label=\"Target Language for Translation\", value=\"English\")\n",
+ "            image_output = gr.Textbox(label=\"Response\")\n",
+ "            image_clear_btn = gr.Button(\"Clear\")\n",
+ "            image_translate_btn = gr.Button(\"Translate Text in Image\")\n",
+ "            image_summarize_btn = gr.Button(\"Summarize Image\")\n",
+ "            image_translate_btn.click(lambda f, lang: process_input(f, \"Image\", f\"Extract the text in the image and translate it to {lang}.\"), inputs=[image_input, language_input_image], outputs=image_output)\n",
+ "            image_summarize_btn.click(lambda f: process_input(f, \"Image\", \"Summarize this image\"), inputs=[image_input], outputs=image_output)\n",
+ "            image_clear_btn.click(lambda: (None, \"\", \"\"), outputs=[image_input, language_input_image, image_output])\n",
+ "        with gr.Tab(\"Audio-Based Learning\"):\n",
+ "            audio_input = gr.Audio(type=\"filepath\", label=\"Upload Audio\")\n",
+ "            language_input_audio = gr.Textbox(label=\"Target Language for Translation\", value=\"English\")\n",
+ "            transcript_output = gr.Textbox(label=\"Transcribed Text\")\n",
+ "            translated_output = gr.Textbox(label=\"Translated Text\")\n",
+ "            audio_clear_btn = gr.Button(\"Clear\")\n",
+ "            audio_transcribe_btn = gr.Button(\"Transcribe & Translate\")\n",
+ "            audio_transcribe_btn.click(lambda f: process_input(f, \"Audio\", \"Transcribe the audio clip into text.\"), inputs=[audio_input], outputs=transcript_output)\n",
+ "            audio_transcribe_btn.click(lambda f, lang: process_input(f, \"Audio\", f\"Translate the audio to {lang}.\"), inputs=[audio_input, language_input_audio], outputs=translated_output)\n",
+ "            audio_clear_btn.click(lambda: (None, \"\", \"\", \"\"), outputs=[audio_input, language_input_audio, transcript_output, translated_output])\n",
+ "    demo.launch(debug=True, share=True)\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ "    gradio_interface()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.10 - SDK v2",
+ "language": "python",
+ "name": "python310-sdkv2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }