import streamlit as st
import subprocess
import shutil
from huggingface_hub import snapshot_download, login, HfApi
from pathlib import Path
import tempfile

# llama.cpp tooling comes from the KBaba7/llama.cpp Space
# (https://huggingface.co/spaces/KBaba7/llama.cpp); the paths below are
# relative to the working directory after that Space has been downloaded.
LLAMA_CPP_REPO = "KBaba7/llama.cpp"
LLAMA_CPP_BIN = "build/bin"               # location of the llama-quantize binary
CONVERT_SCRIPT = "convert-hf-to-gguf.py"  # HF -> GGUF conversion script (adjust if it lives elsewhere)

def run_command(command):
    """ Run a shell command and return its output. """
    result = subprocess.run(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, text=True
    )
    return result.stdout, result.stderr

st.title("LLAMA Quantization Pipeline")
st.markdown(
    """
    This tool downloads a model from Hugging Face, converts it to GGUF format, quantizes it, and provides an option to download the final model.
    """
)
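
# Pipeline stages (executed when "Run Pipeline" is clicked):
#   1. download the llama.cpp tooling and the requested model from the Hub
#   2. convert the model to GGUF with convert-hf-to-gguf.py
#   3. quantize the GGUF file with llama-quantize
#   4. offer the result for download and, optionally, upload it to the Hub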

st.sidebar.header("Settings")
st.sidebar.write("Log in with a Hugging Face access token if you want to upload the quantized model to the Hub.")
hf_token = st.sidebar.text_input("Hugging Face Access Token", type="password")
model_repo_id = st.sidebar.text_input("Model Repository ID", "Qwen/Qwen2.5-3B")
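# llama.cpp quantization presets offered in the UI: q4_k_m (4-bit k-quant,
# "medium" quality/size trade-off) plus the legacy q4_0 and q4_1 formats.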
quantization_options = ["q4_k_m", "q4_0", "q4_1"]
quantization_type = st.sidebar.selectbox("Select Quantization Type", quantization_options)
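# Output type of the intermediate GGUF produced by the conversion script:
# full/half-precision floats, 8-bit (q8_0), or "auto" to let the script decide.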
quant_options = ["f32", "f16", "bf16", "q8_0", "auto"]
quant_type = st.sidebar.selectbox("Select GGUF Output Type", quant_options)
upload_option = st.sidebar.checkbox("Upload quantized model to Hugging Face?", value=False)
run_button = st.button("Run Pipeline")

if run_button:
    st.info("Starting the pipeline. Please be patient...")
    log_area = st.empty()
    logs = []
    
    def log(message):
        logs.append(message)
        log_area.text("\n".join(logs))
    
    # Create temporary directories for the original and quantized models.
    # Doing this before the try block guarantees the cleanup in `finally`
    # always has valid paths to remove, even if an early step fails.
    temp_path = Path(tempfile.gettempdir())
    original_model_dir = temp_path / "original_model"
    quantized_model_dir = temp_path / "quantized_model"
    original_model_dir.mkdir(parents=True, exist_ok=True)
    quantized_model_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Download the llama.cpp repository (Space) with the conversion script and binaries
        log("Downloading llama.cpp tooling...")
        snapshot_download(repo_id=LLAMA_CPP_REPO, local_dir="llama.cpp", repo_type="space")
        
        log("Downloading model from Hugging Face...")
        snapshot_download(repo_id=model_repo_id, local_dir=str(original_model_dir), local_dir_use_symlinks=False)
        log(f"Model downloaded to: {original_model_dir}")
        
        log("Converting model to GGUF format...")
        conversion_outfile = quantized_model_dir / "model_converted.gguf"
        conversion_cmd = (
            f"python3 {CONVERT_SCRIPT} {original_model_dir} --outtype {quant_type} "
            f"--outfile {conversion_outfile}"
        )
        conv_stdout, conv_stderr = run_command(conversion_cmd)
        log(conv_stdout + conv_stderr)
        
        if not conversion_outfile.exists():
            log("Error: GGUF conversion failed! No output file found.")
            st.error("GGUF conversion failed. Check logs.")
            st.stop()

        log("Quantizing the model...")
        quantized_model_outfile = quantized_model_dir / f"model_quantized_{quantization_type}.gguf"
        quantize_cmd = f"{LLAMA_CPP_BIN}/llama-quantize {conversion_outfile} {quantized_model_outfile} {quantization_type}"
        quant_stdout, quant_stderr = run_command(quantize_cmd)
        log(quant_stdout + quant_stderr)
        
        if not quantized_model_outfile.exists():
            log("Error: Quantization failed! No output file found.")
            st.error("Quantization failed. Check logs.")
            st.stop()
        
        log("Pipeline completed successfully!")
        st.success("Quantized model ready for download.")
        with open(quantized_model_outfile, "rb") as file:
            st.download_button(label="Download Quantized Model", data=file, file_name=quantized_model_outfile.name)
        
        # Upload if selected
        if upload_option:
            log("Uploading quantized model to Hugging Face...")
            login(token=hf_token)  # huggingface_hub authenticates with an access token
            api = HfApi()
            target_repo = f"automated-quantization/{quantized_model_outfile.stem}"
            api.create_repo(target_repo, exist_ok=True, repo_type="model")
            api.upload_file(
                path_or_fileobj=str(quantized_model_outfile),
                path_in_repo=quantized_model_outfile.name,
                repo_id=target_repo,  # upload_file requires the target repository
            )
            log("Upload complete!")
    except Exception as e:
        log(f"An error occurred: {e}")
    finally:
        # Remove temporary directories; rmdir() only works on empty directories,
        # so delete the whole trees (downloaded and converted model files included).
        shutil.rmtree(original_model_dir, ignore_errors=True)
        shutil.rmtree(quantized_model_dir, ignore_errors=True)
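
# A minimal way to try the app locally (assuming this script is saved as
# app.py and the llama.cpp conversion script and llama-quantize binary are
# reachable at the paths configured above):
#   streamlit run app.py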