# pip install llama-cpp-python --prefer-binary --extra-index-url=https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/AVX2/cu118
"""Run a single prompt through a local GGUF Llama model and print the completion.

Requires the ``llama-cpp-python`` package (see the install line above for the
cuBLAS/AVX2 wheel index) and the model file ``recurv_llama_13B.gguf`` in the
working directory.
"""

from llama_cpp import Llama


def main() -> None:
    """Load the model, generate one completion for a fixed prompt, and print it."""
    llm = Llama(
        model_path="recurv_llama_13B.gguf",
        n_ctx=2048,   # Context window
        n_threads=4,  # Number of CPU threads to use
    )

    prompt = "What is Paracetamol?"
    output = llm(
        prompt,
        max_tokens=256,   # Maximum number of tokens to generate
        temperature=0.5,  # Controls randomness (0.0 = deterministic, 1.0 = creative)
        top_p=0.95,       # Nucleus sampling parameter
        stop=["###"],     # Optional stop words
        echo=True,        # Include prompt in the output
    )

    # Print the generated text (the prompt is included because echo=True).
    print(output['choices'][0]['text'])


# Entry guard: importing this module must not trigger a 13B-model load / inference.
if __name__ == "__main__":
    main()