# pip install llama-cpp-python --prefer-binary --extra-index-url=https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/AVX2/cu118
"""Run a single prompt through a local GGUF Llama model and print the completion.

Requires the ``llama-cpp-python`` package (see the install line above for the
cuBLAS/AVX2 wheel index) and the model file ``recurv_llama_13B.gguf`` in the
working directory.
"""

from llama_cpp import Llama


def main() -> None:
    """Load the model, generate one completion for a fixed prompt, and print it."""
    llm = Llama(
        model_path="recurv_llama_13B.gguf",
        n_ctx=2048,   # Context window
        n_threads=4,  # Number of CPU threads to use
    )

    prompt = "What is Paracetamol?"
    output = llm(
        prompt,
        max_tokens=256,   # Maximum number of tokens to generate
        temperature=0.5,  # Controls randomness (0.0 = deterministic, 1.0 = creative)
        top_p=0.95,       # Nucleus sampling parameter
        stop=["###"],     # Optional stop words
        echo=True,        # Include prompt in the output
    )

    # Print the generated text (the prompt is included because echo=True).
    print(output['choices'][0]['text'])


# Entry guard: importing this module must not trigger a 13B-model load / inference.
if __name__ == "__main__":
    main()