---
language:
- id
- en
base_model:
- openai/whisper-small
- unsloth/Llama-3.2-1B-Instruct
tags:
- speech to text
- speech recognition
---

# FusedWhisperLlama Model

This model is a fusion of Whisper and LLaMA for a speech-to-text-to-LLM pipeline.

## Model Description

- **Model Type:** FusedWhisperLlama
- **Language:** Indonesian & English
- **Tasks:** Speech Recognition, Text Generation
- **Base Models:**
  - Whisper: openai/whisper-small
  - LLaMA: unsloth/Llama-3.2-1B-Instruct

## Usage

```python
import torch
import torch.nn as nn
import librosa
import json
import os
from typing import Dict, Any

from transformers import (
    WhisperConfig,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    LlamaConfig,
    LlamaForCausalLM,
    AutoTokenizer,
)
from huggingface_hub import hf_hub_download


def download_model_files(repo_id: str, local_dir: str):
    os.makedirs(local_dir, exist_ok=True)
    config_dir = os.path.join(local_dir, "configs")
    os.makedirs(config_dir, exist_ok=True)

    # Download the model weights
    print("Downloading model file...")
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename="pytorch_model.bin",
        local_dir=local_dir
    )

    # Download config and tokenizer files
    print("Downloading config files...")
    config_files = [
        "config.json",
        "configs/config_whisper.json",
        "configs/config_llama.json",
        # Whisper tokenizer files
        "configs/tokenizer_whisper/added_tokens.json",
        "configs/tokenizer_whisper/merges.txt",
        "configs/tokenizer_whisper/normalizer.json",
        "configs/tokenizer_whisper/preprocessor_config.json",
        "configs/tokenizer_whisper/special_tokens_map.json",
        "configs/tokenizer_whisper/tokenizer_config.json",
        "configs/tokenizer_whisper/vocab.json",
        # LLaMA tokenizer files
        "configs/tokenizer_llama/special_tokens_map.json",
        "configs/tokenizer_llama/tokenizer.json",
        "configs/tokenizer_llama/tokenizer_config.json"
    ]

    for file in config_files:
        try:
            hf_hub_download(
                repo_id=repo_id,
                filename=file,
                local_dir=local_dir
            )
            print(f"Downloaded {file}")
        except Exception as e:
            print(f"Warning: Could not download {file}: {e}")

    return model_path


class StandaloneFusionInference:
    def __init__(self, model_path: str, config_dir: str = None, device: str = None):
        if config_dir is None:
            config_dir = os.path.join(os.path.dirname(model_path), "configs")

        # Select device
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)
        print(f"Using device: {self.device}")

        # Load configs
        with open(os.path.join(config_dir, "config_whisper.json"), "r") as f:
            self.whisper_config = json.load(f)
        with open(os.path.join(config_dir, "config_llama.json"), "r") as f:
            self.llama_config = json.load(f)

        print("Loading Whisper model...")
        whisper_config = WhisperConfig(**self.whisper_config["whisper_config"])
        self.whisper = WhisperForConditionalGeneration(whisper_config)
        self.processor = WhisperProcessor.from_pretrained(
            os.path.join(config_dir, "tokenizer_whisper")
        )

        print("Loading LLaMA model...")
        llama_config = LlamaConfig(**self.llama_config["llama_config"])
        self.llm = LlamaForCausalLM(llama_config)

        # Load LLM tokenizer
        tokenizer_path = os.path.join(config_dir, "tokenizer_llama")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_path,
                trust_remote_code=True
            )
            print("Loaded local LLaMA tokenizer")
        except (OSError, ValueError) as e:
            print(f"Warning: Could not load local tokenizer ({e}), using default")
            self.tokenizer = AutoTokenizer.from_pretrained(
                "unsloth/Llama-3.2-1B-Instruct",
                trust_remote_code=True
            )

        # Fusion layer
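        # A small projection over Whisper's hidden size (d_model):
        # Linear -> ReLU -> LayerNorm. Its weights are restored from the
        # checkpoint below; note that the text-level generate() pipeline in
        # this example does not invoke it directly.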
        self.fusion_layer = nn.Sequential(
            nn.Linear(
                self.whisper.config.d_model,
                self.whisper.config.d_model
            ),
            nn.ReLU(),
            nn.LayerNorm(self.whisper.config.d_model)
        )

        print("Loading model weights...")
        weights = torch.load(model_path, map_location=self.device)
        self.whisper.load_state_dict(weights["whisper_model"])
        self.llm.load_state_dict(weights["llm_model"])
        self.fusion_layer.load_state_dict(weights["fusion_layer"])

        # Set to eval mode
        self.whisper.eval()
        self.llm.eval()
        self.fusion_layer.eval()

        # Move to device
        self.whisper = self.whisper.to(self.device)
        self.llm = self.llm.to(self.device)
        self.fusion_layer = self.fusion_layer.to(self.device)

        self.system_prompt = self.whisper_config["system_prompt"]
        print("Model loaded successfully!")

    def generate(self, audio_path: str) -> Dict[str, Any]:
        # Load and preprocess the audio
        speech, _ = librosa.load(audio_path, sr=16000, mono=True)
        speech = librosa.util.normalize(speech)

        # Extract features with the Whisper processor
        inputs = self.processor(
            speech,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features.to(self.device)

        with torch.no_grad():
            # Transcribe with Whisper (beam search, so no sampling temperature)
            outputs = self.whisper.generate(
                inputs,
                max_length=448,
                num_beams=5,
                no_repeat_ngram_size=3,
                return_timestamps=False
            )
            transcription = self.processor.batch_decode(
                outputs,
                skip_special_tokens=True,
                normalize=True
            )[0].strip()

            # Build the prompt for the LLM
            prompt = f"System: {self.system_prompt}\nUser: {transcription}"
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            # Generate the response
            outputs = self.llm.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True
            )
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return {
            "transcription": transcription,
            "response": response
        }


if __name__ == "__main__":
    # Download the model from the Hugging Face Hub
    repo_id = "johaness14/fused-whisper-llama"
    local_dir = "downloaded_model"
    model_path = download_model_files(repo_id, local_dir)

    # Initialize inference
    inference = StandaloneFusionInference(
        model_path,
        config_dir=os.path.join(local_dir, "configs"),
        device="cuda"  # or "cpu" for CPU-only
    )

    # Run inference
    audio_path = "path/to/your/audio.wav"
    output = inference.generate(audio_path)

    print("\nTranscription:")
    print(output["transcription"])
    print("\nResponse:")
    print(output["response"])
```

## Training Details

This model combines Whisper's speech-recognition capability with LLaMA's text-generation capability, connected through a fusion layer.

### Training Procedure

- **Speech Recognition:** Uses the Whisper small model
- **Text Generation:** Uses the LLaMA 3.2 1B model
- **Fusion:** A custom fusion layer connects the two models (a rough sketch appears at the end of this card)

## Limitations and Biases

- The model may inherit biases from its base models
- Performance depends on the quality of the input audio
- The length of generated text is limited (capped at 512 new tokens in the example above)
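The inference example above chains the models at the text level and never calls the fusion layer, so this card does not show where that layer is applied. As a rough, unverified sketch only: the snippet below builds the same `Linear -> ReLU -> LayerNorm` block over `d_model` and applies it to Whisper encoder hidden states, which is one plausible place such a layer could sit during training. The wiring here is an assumption, not this checkpoint's actual training code, and `example.wav` is a placeholder path.

```python
import torch
import torch.nn as nn
import librosa
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Illustrative sketch (assumption): rebuild the fusion-layer shape from the
# Usage code above and apply it to Whisper encoder states.
whisper = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
d_model = whisper.config.d_model  # 768 for whisper-small

fusion_layer = nn.Sequential(
    nn.Linear(d_model, d_model),
    nn.ReLU(),
    nn.LayerNorm(d_model),
)

# "example.wav" is a placeholder audio file
speech, _ = librosa.load("example.wav", sr=16000, mono=True)
features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features

with torch.no_grad():
    # The encoder yields (batch, n_frames, d_model); the fusion layer
    # transforms the features without changing their shape
    encoder_states = whisper.get_encoder()(features).last_hidden_state
    fused = fusion_layer(encoder_states)

print(fused.shape)  # torch.Size([1, 1500, 768])
```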