import functools
import glob
import os

import h5py
import torch
import numpy as np
import torchaudio
import soundfile as sf

from utils.g2p.symbols import symbols
from utils.g2p import PhonemeBpeTokenizer
from utils.prompt_making import make_prompt, make_transcript
from data.collation import get_text_token_collater
from data.dataset import create_dataloader
from data.tokenizer import (
    AudioTokenizer,
    tokenize_audio,
)

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

tokenizer_path = "./utils/g2p/bpe_175.json"
tokenizer = PhonemeBpeTokenizer(tokenizer_path)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Longest audio prompt, in seconds, that make_prompts accepts.
_MAX_PROMPT_SECONDS = 15


@functools.lru_cache(maxsize=1)
def _get_codec():
    """Build the audio codec on first use and reuse it on later calls.

    NOTE(review): assumes an AudioTokenizer instance can be shared between
    calls (i.e. it keeps no per-utterance state) -- confirm.
    """
    return AudioTokenizer(device)


def make_prompts(name, audio_prompt_path, transcript=None):
    """Tokenize one audio prompt and its transcript.

    Args:
        name: Identifier forwarded to ``make_transcript``.
        audio_prompt_path: Path of the audio file, loaded with
            ``torchaudio.load``.
        transcript: Optional transcript forwarded to ``make_transcript``.

    Returns:
        A tuple ``(audio_tokens, text_tokens, langs, text_pr)``:
        ``audio_tokens`` is a NumPy array produced from the codec output,
        ``text_tokens`` is the first value returned by the text collater,
        ``langs`` is the second value returned by the text tokenizer, and
        ``text_pr`` is the transcript returned by ``make_transcript``.

    Raises:
        ValueError: If the audio is longer than 15 seconds.
    """
    # Reuse the module-level tokenizer (same BPE file) instead of rebuilding
    # one per call.
    # NOTE(review): assumes PhonemeBpeTokenizer.tokenize does not depend on
    # per-instance mutable state -- confirm.
    text_tokenizer = tokenizer
    text_collater = get_text_token_collater()
    codec = _get_codec()

    wav_pr, sr = torchaudio.load(audio_prompt_path)

    # Check length. Report the duration itself; the waveform tensor divided
    # by the sample rate is not a meaningful value for the message.
    duration = wav_pr.size(-1) / sr
    if duration > _MAX_PROMPT_SECONDS:
        raise ValueError(
            f"Prompt too long, expect length below {_MAX_PROMPT_SECONDS} "
            f"seconds, got {duration} seconds."
        )

    # Average the two channels of a stereo file into a single channel.
    if wav_pr.size(0) == 2:
        wav_pr = wav_pr.mean(0, keepdim=True)

    text_pr, lang_pr = make_transcript(name, wav_pr, sr, transcript)

    # Tokenize audio.
    encoded_frames = tokenize_audio(codec, (wav_pr, sr))
    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()

    # Tokenize text; the second value returned by the collater is not needed.
    phonemes, langs = text_tokenizer.tokenize(text=f"{text_pr}".strip())
    text_tokens, _ = text_collater([phonemes])

    return audio_tokens, text_tokens, langs, text_pr


def create_dataset(data_dir, dataloader_process_only):
    """Build the dataset files for ``data_dir`` or return a dataloader.

    Args:
        data_dir: Directory that holds the ``*.wav`` files and receives the
            generated ``audio_sum.hdf5`` and ``audio_ann_sum.txt`` files.
        dataloader_process_only: If truthy, tokenize every ``*.wav`` file in
            ``data_dir`` and write the HDF5 and annotation files. If falsy,
            create and return a dataloader for ``data_dir``.

    Returns:
        ``None`` when ``dataloader_process_only`` is truthy; otherwise the
        result of ``create_dataloader(data_dir=data_dir, max_duration=20)``.
    """
    if not dataloader_process_only:
        return create_dataloader(data_dir=data_dir, max_duration=20)

    h5_output_path = f"{data_dir}/audio_sum.hdf5"
    ann_output_path = f"{data_dir}/audio_ann_sum.txt"
    # Change the pattern to match your audio file extension. Sorting makes
    # the order of the annotation lines reproducible between runs.
    audio_paths = sorted(glob.glob(f"{data_dir}/*.wav"))

    # Both outputs are truncated together so they always describe the same
    # run; appending to the annotation file while recreating the HDF5 file
    # would duplicate every line on a rerun.
    with h5py.File(h5_output_path, 'w') as h5_file, \
            open(ann_output_path, 'w', encoding='utf-8') as ann_file:
        for audio_path in audio_paths:
            stem = os.path.splitext(os.path.basename(audio_path))[0]
            # Text tokens are returned but not stored in the HDF5 file.
            audio_tokens, _text_tokens, langs, text = make_prompts(
                name=stem, audio_prompt_path=audio_path
            )

            # One group per stem, holding the audio tokens as a dataset.
            grp = h5_file.create_group(stem)
            grp.create_dataset('audio', data=audio_tokens)

            # Best effort: a failure on one annotation line is reported and
            # the remaining files are still processed.
            try:
                # The header already carries the duration, so the samples
                # do not need to be decoded a second time.
                duration = sf.info(audio_path).duration
                # One "stem|duration|language|text" record per line.
                ann_file.write(f'{stem}|{duration}|{langs[0]}|{text}\n')
            except Exception as e:
                print(f"An error occurred: {e}")
            else:
                print(f"Successfully wrote to {ann_output_path}")

    return None