{ "feature_extraction": { "sequence": [ { "operation": { "name": "audio_decoder", "type": "AudioDecoderEx", "attrs": { "target_sample_rates": [ 8000, 16000 ] } } }, { "operation": { "name": "phi_4_audio_embed", "type": "Phi4AudioEmbed", "attrs": { "audio_compression_rate": 8, "stft_normal/n_fft": 512, "stft_normal/frame_length": 400, "stft_normal/hop_length": 160, "stft_normal/win_fn": "hamming", "logmel/chunk_size": 30, "logmel/hop_length": 160, "logmel/n_fft": 512, "logmel/n_mel": 80, "logmel/feature_first": 0, "logmel/no_padding": 1, "stft_normal_8k/n_fft": 256, "stft_normal_8k/frame_length": 200, "stft_normal_8k/hop_length": 80, "stft_normal_8k/win_fn": "hamming", "logmel_8k/chunk_size": 30, "logmel_8k/hop_length": 80, "logmel_8k/n_fft": 512, "logmel_8k/n_mel": 80, "logmel_8k/feature_first": 0, "logmel_8k/no_padding": 1 } } } ], "output_aligner": "phi4-audio-aligner" } }