"""LLaMA -> Kokoro TTS service.

POST /tts takes a question, asks meta-llama/Llama-3.2-11B-Vision-Instruct for a
one-line answer via the Hugging Face Inference API, synthesizes that answer to
speech with the selected Kokoro model/voice, and returns the waveform as JSON.
"""

# import gradio as gr
import os

import torch
import uvicorn
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
from huggingface_hub import InferenceClient
from pydantic import BaseModel

# Import eSpeak TTS pipeline
from tts_cli import (
    build_model as build_model_espeak,
    generate_long_form_tts as generate_long_form_tts_espeak,
)

# Import OpenPhonemizer TTS pipeline
from tts_cli_op import (
    build_model as build_model_open,
    generate_long_form_tts as generate_long_form_tts_open,
)

from pretrained_models import Kokoro

# ---------------------------------------------------------------------
# Paths to models and voicepacks
# ---------------------------------------------------------------------
MODELS_DIR = "pretrained_models/Kokoro"
VOICES_DIR = "pretrained_models/Kokoro/voices"

HF_TOKEN = os.getenv("HF_TOKEN")
client = InferenceClient(api_key=HF_TOKEN)


# ---------------------------------------------------------------------
# List the models (.pth) and voices (.pt)
# ---------------------------------------------------------------------
def get_models():
    """Return the sorted list of model checkpoints (*.pth) in MODELS_DIR."""
    return sorted(f for f in os.listdir(MODELS_DIR) if f.endswith(".pth"))


def get_voices():
    """Return the sorted list of voicepacks (*.pt) in VOICES_DIR."""
    return sorted(f for f in os.listdir(VOICES_DIR) if f.endswith(".pt"))


# ---------------------------------------------------------------------
# Map engine selection -> (build_model_func, generate_long_form_tts_func)
# ---------------------------------------------------------------------
ENGINES = {
    "espeak": (build_model_espeak, generate_long_form_tts_espeak),
    "openphonemizer": (build_model_open, generate_long_form_tts_open),
}


def tts_inference(text, engine, model_file, voice_file, speed=1.0):
    """Answer *text* with LLaMA, then synthesize the answer to speech.

    Args:
        text: The user's question/prompt.
        engine: "espeak" or "openphonemizer" (a key of ENGINES).
        model_file: Name of a .pth checkpoint inside MODELS_DIR.
        voice_file: Name of a .pt voicepack inside VOICES_DIR.
        speed: Speech-speed multiplier.

    Returns:
        (sample_rate, audio) — the synthesized waveform.

    Raises:
        KeyError: if *engine* is not a key of ENGINES.
    """
    # 0) Ask LLaMA for a short answer to the user's query.
    #    A space now separates the question from the instruction — previously
    #    they fused ("What is AI?describe in one line only").
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": text + " describe in one line only"},
                # The vision model also accepts image inputs, unused here:
                # {"type": "image_url", "image_url": {"url": "https://..."}},
            ],
        }
    ]
    response_from_llama = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        messages=messages,
        max_tokens=500,
    )
    # Attribute access is the documented huggingface_hub API; dict-style
    # access (message['content']) is version-dependent.
    answer = response_from_llama.choices[0].message.content

    # 1) Map engine to the correct build_model + generate_long_form_tts.
    build_fn, gen_fn = ENGINES[engine]

    # 2) Prepare checkpoint paths.
    model_path = os.path.join(MODELS_DIR, model_file)
    voice_path = os.path.join(VOICES_DIR, voice_file)

    # 3) Decide device.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # 4) Load model and put every sub-module in eval mode.
    #    NOTE(review): the model is reloaded on every call — consider caching
    #    if request throughput matters.
    model = build_fn(model_path, device=device)
    for submodule in model.values():
        if hasattr(submodule, "eval"):
            submodule.eval()

    # 5) Load voicepack.
    voicepack = torch.load(voice_path, map_location=device)
    if hasattr(voicepack, "eval"):
        voicepack.eval()

    # 6) Generate TTS from the LLaMA answer.
    audio, phonemes = gen_fn(model, answer, voicepack, speed=speed)

    sr = 22050  # assumed pipeline sample rate — TODO confirm against tts_cli
    return (sr, audio)


# ------------------------------------------
# FastAPI
# ------------------------------------------
app = FastAPI()


class TTSRequest(BaseModel):
    # Request body for POST /tts.
    text: str
    engine: str
    model_file: str
    voice_file: str
    speed: float = 1.0


@app.post("/tts")
def generate_tts(request: TTSRequest):
    """Run the LLaMA -> TTS pipeline for the posted request.

    Fix: the request body is now actually used — previously the endpoint
    ignored it and always synthesized a hard-coded debug prompt with
    hard-coded engine/model/voice values.
    """
    # Reject unknown engines with a client error instead of a 500.
    if request.engine not in ENGINES:
        return JSONResponse(
            content={"error": f"unknown engine: {request.engine}"},
            status_code=400,
        )
    try:
        sr, audio = tts_inference(
            text=request.text,
            engine=request.engine,
            model_file=request.model_file,
            voice_file=request.voice_file,
            speed=request.speed,
        )
        return JSONResponse(
            content={
                "sample_rate": sr,
                "audio_tensor": audio.tolist(),
            }
        )
    except Exception as e:  # service boundary: report failure to the client
        return JSONResponse(content={"error": str(e)}, status_code=500)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


###############################
# # ---------------------------------------------------------------------
# # Build Gradio App
# # ---------------------------------------------------------------------
# def create_gradio_app():
#     model_list = get_models()
#     voice_list = get_voices()
#     css = """
#     h4 { text-align: center; display: block; }
#     h2 { text-align: center; display: block; }
#     """
#     with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
#         gr.Markdown("## LLAMA TTS DEMO - API - GRADIO VISUAL")
#         # Row 1: Text input
#         text_input = gr.Textbox(
#             label="Enter your question",
#             value="What is AI?",
#             lines=2,
#         )
#         # Rows 2-4 (engine/model/voice dropdowns) were disabled in favor of
#         # the fixed gr.State values passed to generate_btn.click below:
#         # engine_dropdown = gr.Dropdown(
#         #     choices=["espeak", "openphonemizer"],
#         #     value="openphonemizer", label="Phonemizer")
#         # model_dropdown = gr.Dropdown(
#         #     choices=model_list,
#         #     value=model_list[0] if model_list else None, label="Model (.pth)")
#         # voice_dropdown = gr.Dropdown(
#         #     choices=voice_list,
#         #     value=voice_list[0] if voice_list else None, label="Voice (.pt)")
#         # Row 5: Speed slider
#         speed_slider = gr.Slider(
#             minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speech Speed"
#         )
#         # Generate button + audio output
#         generate_btn = gr.Button("Generate")
#         tts_output = gr.Audio(label="TTS Output")
#         # Connect the button to our inference function
#         generate_btn.click(
#             fn=tts_inference,
#             inputs=[
#                 text_input,
#                 gr.State("openphonemizer"),    # engine_dropdown,
#                 gr.State("kokoro-v0_19.pth"),  # model_dropdown,
#                 gr.State("af_bella.pt"),       # voice_dropdown,
#                 speed_slider,
#             ],
#             outputs=tts_output,
#         )
#         gr.Markdown("#### LLAMA - TTS")
#     return demo
#
# # ---------------------------------------------------------------------
# # Main
# # ---------------------------------------------------------------------
# if __name__ == "__main__":
#     app = create_gradio_app()
#     app.launch()