{ "cells": [ { "cell_type": "markdown", "id": "9dc4dd97-1a2a-409f-8db5-4e0f2f07d49d", "metadata": {}, "source": [ "# Phi-4-multimodal simple demo\n", "\n", "Make sure that you must install `gradio, soundfile, and pillow`.\n", "\n", "- `pip install gradio transformers torch soundfile pillow`\n", "- Retrieved from https://www.datacamp.com/tutorial/phi-4-multimodal" ] }, { "cell_type": "code", "execution_count": null, "id": "7ffc47b0-a12b-4b8a-9066-6f15acfc9210", "metadata": { "tags": [] }, "outputs": [], "source": [ "import gradio as gr\n", "import torch\n", "import requests\n", "import io\n", "import os\n", "import soundfile as sf\n", "from PIL import Image\n", "from datasets import load_dataset\n", "from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig\n", "\n", "\n", "max_new_tokens = 256\n", "orig_model_path = \"microsoft/Phi-4-multimodal-instruct\"\n", "ft_model_path = \"daekeun-ml/Phi-4-multimodal-finetune-ko-speech\"\n", "generation_config = GenerationConfig.from_pretrained(ft_model_path, 'generation_config.json')\n", "processor = AutoProcessor.from_pretrained(orig_model_path, trust_remote_code=True)\n", "model = AutoModelForCausalLM.from_pretrained(\n", " ft_model_path,\n", " trust_remote_code=True,\n", " torch_dtype='auto',\n", " _attn_implementation='flash_attention_2',\n", ").cuda()\n", "\n", "user_prompt = '<|user|>'\n", "assistant_prompt = '<|assistant|>'\n", "prompt_suffix = '<|end|>'" ] }, { "cell_type": "code", "execution_count": null, "id": "4058364b-d041-4168-b8d7-26813467f454", "metadata": { "tags": [] }, "outputs": [], "source": [ "def clean_response(response, instruction_keywords):\n", " \"\"\"Removes the prompt text dynamically based on instruction keywords.\"\"\"\n", " for keyword in instruction_keywords:\n", " if response.lower().startswith(keyword.lower()):\n", " response = response[len(keyword):].strip()\n", " return response\n", "\n", "# task prompt is from technical report\n", "asr_prompt = f'{user_prompt}<|audio_1|>Transcribe the audio clip into text.{prompt_suffix}{assistant_prompt}'\n", "ast_ko_prompt = f'{user_prompt}<|audio_1|>Translate the audio to Korean.{prompt_suffix}{assistant_prompt}'\n", "ast_cot_ko_prompt = f'{user_prompt}<|audio_1|>Transcribe the audio to text, and then translate the audio to Korean. Use as a separator between the original transcript and the translation.{prompt_suffix}{assistant_prompt}'\n", "ast_en_prompt = f'{user_prompt}<|audio_1|>Translate the audio to English.{prompt_suffix}{assistant_prompt}'\n", "ast_cot_en_prompt = f'{user_prompt}<|audio_1|>Transcribe the audio to text, and then translate the audio to English. Use as a separator between the original transcript and the translation.{prompt_suffix}{assistant_prompt}'\n", "\n", "def process_input(file, input_type, question):\n", " user_prompt = \"<|user|>\"\n", " assistant_prompt = \"<|assistant|>\"\n", " prompt_suffix = \"<|end|>\"\n", " \n", " if input_type == \"Image\":\n", " prompt= f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'\n", " image = Image.open(file)\n", " inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)\n", " elif input_type == \"Audio\":\n", " prompt= f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'\n", " audio, samplerate = sf.read(file)\n", " inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)\n", " elif input_type == \"Text\":\n", " prompt = f'{user_prompt}{question} \"{file}\"{prompt_suffix}{assistant_prompt}'\n", " inputs = processor(text=prompt, return_tensors='pt').to(model.device)\n", " else:\n", " return \"Invalid input type\" \n", " \n", " generate_ids = model.generate(**inputs, max_new_tokens=1000, generation_config=generation_config)\n", " response = processor.batch_decode(generate_ids, skip_special_tokens=True)[0]\n", " return clean_response(response, [question])\n", "\n", "def process_text_translate(text, target_language):\n", " prompt = f'Transcribe the audio to text, and then Translate the following text to {target_language}: \"{text}\"'\n", " return process_input(text, \"Text\", prompt)\n", "def process_text_grammar(text):\n", " prompt = f'Check the grammar and provide corrections if needed for the following text: \"{text}\"'\n", " return process_input(text, \"Text\", prompt)\n", "\n", "def gradio_interface():\n", " with gr.Blocks() as demo:\n", " gr.Markdown(\"# Phi 4 Powered - Multimodal Language Tutor\") \n", " with gr.Tab(\"Text-Based Learning\"):\n", " text_input = gr.Textbox(label=\"Enter Text\")\n", " language_input = gr.Textbox(label=\"Target Language\", value=\"Korean\")\n", " text_output = gr.Textbox(label=\"Response\")\n", " text_translate_btn = gr.Button(\"Translate\")\n", " text_grammar_btn = gr.Button(\"Check Grammar\")\n", " text_clear_btn = gr.Button(\"Clear\")\n", " text_translate_btn.click(process_text_translate, inputs=[text_input, language_input], outputs=text_output)\n", " text_grammar_btn.click(process_text_grammar, inputs=[text_input], outputs=text_output)\n", " text_clear_btn.click(lambda: (\"\", \"\", \"\"), outputs=[text_input, language_input, text_output]) \n", " with gr.Tab(\"Image-Based Learning\"):\n", " image_input = gr.Image(type=\"filepath\", label=\"Upload Image\")\n", " language_input_image = gr.Textbox(label=\"Target Language for Translation\", value=\"English\")\n", " image_output = gr.Textbox(label=\"Response\")\n", " image_clear_btn = gr.Button(\"Clear\")\n", " image_translate_btn = gr.Button(\"Translate Text in Image\")\n", " image_summarize_btn = gr.Button(\"Summarize Image\")\n", " image_translate_btn.click(process_input, inputs=[image_input, gr.Textbox(value=\"Image\", visible=False), gr.Textbox(value=\"Extract and translate text\", visible=False)], outputs=image_output)\n", " image_summarize_btn.click(process_input, inputs=[image_input, gr.Textbox(value=\"Image\", visible=False), gr.Textbox(value=\"Summarize this image\", visible=False)], outputs=image_output)\n", " image_clear_btn.click(lambda: (None, \"\", \"\"), outputs=[image_input, language_input_image, image_output])\n", " with gr.Tab(\"Audio-Based Learning\"):\n", " audio_input = gr.Audio(type=\"filepath\", label=\"Upload Audio\")\n", " language_input_audio = gr.Textbox(label=\"Target Language for Translation\", value=\"English\")\n", " transcript_output = gr.Textbox(label=\"Transcribed Text\")\n", " translated_output = gr.Textbox(label=\"Translated Text\")\n", " audio_clear_btn = gr.Button(\"Clear\")\n", " audio_transcribe_btn = gr.Button(\"Transcribe & Translate\")\n", " audio_transcribe_btn.click(process_input, inputs=[audio_input, gr.Textbox(value=\"Audio\", visible=False), gr.Textbox(value=\"Transcribe this audio\", visible=False)], outputs=transcript_output)\n", " audio_transcribe_btn.click(process_input, inputs=[audio_input, gr.Textbox(value=\"Audio\", visible=False), language_input_audio], outputs=translated_output)\n", " audio_clear_btn.click(lambda: (None, \"\", \"\", \"\"), outputs=[audio_input, language_input_audio, transcript_output, translated_output]) \n", " demo.launch(debug=True, share=True)\n", "\n", "if __name__ == \"__main__\":\n", " gradio_interface()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10 - SDK v2", "language": "python", "name": "python310-sdkv2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 5 }