MekkCyber committed
Commit 40d5657 · 1 Parent(s): 5b7e792

first push

Files changed (3)
  1. README.md +13 -3
  2. app.py +251 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,22 @@
 ---
 title: BitsAndBytes
-emoji: 🏢
+emoji: 💻
 colorFrom: blue
-colorTo: yellow
+colorTo: red
 sdk: gradio
-sdk_version: 5.11.0
+sdk_version: 5.1.0
 app_file: app.py
 pinned: false
+
+hf_oauth: true
+# optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
+hf_oauth_expiration_minutes: 480
+# optional, see "Scopes" below. "openid profile" is always included.
+hf_oauth_scopes:
+  - read-repos
+  - write-repos
+  - manage-repos
+  - inference-api
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
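A note on the OAuth settings added above: with `hf_oauth: true`, the Space renders a "Sign in with Hugging Face" flow, and Gradio fills in any event-handler parameter typed `gr.OAuthProfile` or `gr.OAuthToken` from the session (both are `None` for anonymous visitors). A minimal sketch of that mechanism, independent of the app below:

```python
import gradio as gr

def greet(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects this parameter from the OAuth session;
    # it is None when the visitor has not signed in.
    if profile is None:
        return "Hello!"
    return f"Hello {profile.name}!"

with gr.Blocks() as demo:
    gr.LoginButton()       # the Hugging Face sign-in button
    greeting = gr.Markdown()
    demo.load(greet, inputs=None, outputs=greeting)

demo.launch()
```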
app.py ADDED
@@ -0,0 +1,251 @@
+import gradio as gr
+from transformers import AutoModel, BitsAndBytesConfig
+import tempfile
+from huggingface_hub import HfApi, list_models
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
+import os
+import spaces
+
+
+def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
+    # ^ expect a gr.OAuthProfile object as input to get the user's profile;
+    # if the user is not logged in, profile will be None
+    if profile is None:
+        return "Hello!"
+    return f"Hello {profile.name}!"
+
+def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization_type, model_name, quantized_model_name):
+    """Check if a model already exists in the user's Hugging Face repositories."""
+    try:
+        models = list_models(author=username, token=oauth_token.token)
+        model_names = [model.id for model in models]
+        if quantized_model_name:
+            repo_name = f"{username}/{quantized_model_name}"
+        else:
+            repo_name = f"{username}/{model_name.split('/')[-1]}-BNB-{quantization_type}"
+
+        if repo_name in model_names:
+            return f"Model '{repo_name}' already exists in your repository."
+        else:
+            return None  # Model does not exist
+    except Exception as e:
+        return f"Error checking model existence: {str(e)}"
+
+def create_model_card(model_name, quantization_type, threshold, quant_type_4, double_quant_4):
+    model_card = f"""---
+base_model:
+- {model_name}
+---
+
+# {model_name} (Quantized)
+
+## Description
+This model is a quantized version of the original model `{model_name}`. It has been quantized using {quantization_type} quantization with bitsandbytes.
+
+## Quantization Details
+- **Quantization Type**: {quantization_type}
+- **Threshold**: {threshold if quantization_type == "int8" else None}
+- **bnb_4bit_quant_type**: {quant_type_4 if quantization_type == "int4" else None}
+- **bnb_4bit_use_double_quant**: {double_quant_4 if quantization_type == "int4" else None}
+
+## Usage
+You can use this model in your applications by loading it directly from the Hugging Face Hub:
+
+```python
+from transformers import AutoModel
+
+model = AutoModel.from_pretrained("{model_name}")
+```"""
+
+    return model_card
+
+def load_model(model_name, quantization_config, auth_token):
+    return AutoModel.from_pretrained(model_name, quantization_config=quantization_config, device_map="cuda", token=auth_token.token)
+
+def load_model_cpu(model_name, quantization_config, auth_token):
+    return AutoModel.from_pretrained(model_name, quantization_config=quantization_config, token=auth_token.token)
+
+@spaces.GPU  # requests a GPU for this call on ZeroGPU Spaces; no-op elsewhere
+def quantize_model(model_name, quantization_type, threshold, quant_type_4, double_quant_4, auth_token=None, username=None):
+    print(f"Quantizing model: {quantization_type}")
+    if quantization_type == "int4":
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type=quant_type_4,
+            bnb_4bit_use_double_quant=(double_quant_4 == "True"),
+        )
+    else:
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+            llm_int8_threshold=threshold,  # outliers above this magnitude are kept in fp16
+        )
+    model = load_model(model_name, quantization_config=quantization_config, auth_token=auth_token)
+
+    return model
+
+def save_model(model, model_name, quantization_type, threshold, quant_type_4, double_quant_4, username=None, auth_token=None, quantized_model_name=None):
+    print("Saving quantized model")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        model.save_pretrained(tmpdirname, safe_serialization=False)
+        if quantized_model_name:
+            repo_name = f"{username}/{quantized_model_name}"
+        else:
+            repo_name = f"{username}/{model_name.split('/')[-1]}-BNB-{quantization_type}"
+
+        model_card = create_model_card(repo_name, quantization_type, threshold, quant_type_4, double_quant_4)
+        with open(os.path.join(tmpdirname, "README.md"), "w") as f:
+            f.write(model_card)
+        # Push to Hub
+        api = HfApi(token=auth_token.token)
+        api.create_repo(repo_name, exist_ok=True)
+        api.upload_folder(
+            folder_path=tmpdirname,
+            repo_id=repo_name,
+            repo_type="model",
+        )
+    return f'<h1> 🤗 DONE</h1><br/>Find your repo here: <a href="https://huggingface.co/{repo_name}" target="_blank" style="text-decoration:underline">{repo_name}</a>'
+
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, threshold, quant_type_4, double_quant_4, quantized_model_name):
+    if oauth_token is None or profile is None:
+        return "Error: Please sign in to your Hugging Face account to use the quantizer"
+    exists_message = check_model_exists(oauth_token, profile.username, quantization_type, model_name, quantized_model_name)
+    if exists_message:
+        return exists_message
+
+    if not threshold.isdigit():
+        return "Threshold must be a number"
+    threshold = int(threshold)
+
+    try:
+        quantized_model = quantize_model(model_name, quantization_type, threshold, quant_type_4, double_quant_4, oauth_token, profile.username)
+        return save_model(quantized_model, model_name, quantization_type, threshold, quant_type_4, double_quant_4, profile.username, oauth_token, quantized_model_name)
+    except Exception as e:
+        # Return the error as text so it shows up in the output Markdown
+        return str(e)
+
+
+css = """/* Custom CSS to allow scrolling */
+.gradio-container {overflow-y: auto;}
+"""
+
+with gr.Blocks(theme=gr.themes.Ocean(), css=css) as app:
+    gr.Markdown(
+        """
+        # 🤗 LLM Model BitsAndBytes Quantization App
+
+        Quantize your favorite Hugging Face models using BitsAndBytes and save them to your profile!
+        """
+    )
+
+    gr.LoginButton(elem_id="login-button", elem_classes="center-button", min_width=250)
+
+    m1 = gr.Markdown()
+    app.load(hello, inputs=None, outputs=m1)
+
+    radio = gr.Radio(["show", "hide"], label="Show Instructions")
+    instructions = gr.Markdown(
+        """
+        ## Instructions
+        1. Login to your Hugging Face account.
+        2. Enter the name of the Hugging Face LLM model you want to quantize (make sure you have access to it).
+        3. Choose the quantization type.
+        4. Optionally, adjust the quantization parameters (outlier threshold for int8; quant type and double quantization for int4).
+        5. Optionally, choose a custom name for the quantized model.
+        6. Click "Quantize and Save Model" to start the process.
+        7. Once complete, you'll receive a link to the quantized model on Hugging Face.
+
+        Note: This process may take some time depending on the model size and your hardware; you can check the container logs to see how far along you are!
+        """,
+        visible=False
+    )
+
+    def update_instructions_visibility(choice):
+        # Show or hide the instructions Markdown based on the radio selection
+        return gr.update(visible=(choice == "show"))
+    radio.change(update_instructions_visibility, radio, instructions)
+
+    with gr.Row():
+        with gr.Column():
+            with gr.Row():
+                model_name = HuggingfaceHubSearch(
+                    label="Hub Model ID",
+                    placeholder="Search for model id on Huggingface",
+                    search_type="model",
+                )
+            with gr.Row():
+                with gr.Column():
+                    quantization_type = gr.Dropdown(
+                        info="Quantization Type",
+                        choices=["int4", "int8"],
+                        value="int8",
+                        filterable=False,
+                        show_label=False,
+                    )
+                    threshold_8 = gr.Textbox(
+                        info="Outlier threshold",
+                        value="6",
+                        interactive=True,
+                        show_label=False,
+                        visible=False
+                    )
+                    quant_type_4 = gr.Dropdown(
+                        info="The quantization data type in the bnb.nn.Linear4Bit layers",
+                        choices=["fp4", "nf4"],
+                        value="fp4",
+                        visible=False,
+                        show_label=False
+                    )
+                    radio_4 = gr.Radio(["False", "True"], label="Use Double Quant", visible=False, value="False")
+
+                    def update_param_visibility(quantization_type):
+                        # int8 exposes the outlier threshold; int4 exposes the quant type and double-quant options
+                        return gr.update(visible=(quantization_type == "int8")), gr.update(visible=(quantization_type == "int4")), gr.update(visible=(quantization_type == "int4"))
+
+                    quantization_type.change(fn=update_param_visibility, inputs=quantization_type, outputs=[threshold_8, quant_type_4, radio_4])
+
+                    quantized_model_name = gr.Textbox(
+                        info="Model Name (optional: overrides the default)",
+                        value="",
+                        interactive=True,
+                        show_label=False
+                    )
+        with gr.Column():
+            quantize_button = gr.Button("Quantize and Save Model", variant="primary")
+            output_link = gr.Markdown(label="Quantized Model Link", container=True, min_height=40)
+
+    # Custom CSS for the username box and the centered login button.
+    # (A second plain assignment would overwrite the first, so append instead.)
+    app.css += """
+    #username-box {
+        background-color: #f0f8ff; /* Light color */
+        border-radius: 8px;
+        padding: 10px;
+    }
+    .center-button {
+        display: flex;
+        justify-content: center;
+        align-items: center;
+        margin: 0 auto; /* Center horizontally */
+    }
+    """
+
+    # profile and oauth_token are typed gr.OAuthProfile / gr.OAuthToken, so
+    # Gradio injects them from the OAuth session; they are not listed in `inputs`.
+    quantize_button.click(
+        fn=quantize_and_save,
+        inputs=[model_name, quantization_type, threshold_8, quant_type_4, radio_4, quantized_model_name],
+        outputs=[output_link]
+    )
+
+
+# Launch the app
+app.launch()
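For reference, transformers records the bitsandbytes quantization settings in the saved `config.json`, so a repo pushed by this app loads back already quantized. A minimal sketch (the repo id is hypothetical, standing in for whatever `save_model` produced):

```python
from transformers import AutoModel

# Hypothetical repo id following the app's "<user>/<model>-BNB-<type>" naming.
# bitsandbytes must be installed, and a GPU is needed for the int8/int4 kernels.
model = AutoModel.from_pretrained("your-username/opt-125m-BNB-int8", device_map="auto")
```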
requirements.txt ADDED
@@ -0,0 +1,5 @@
+git+https://github.com/huggingface/transformers.git@main#egg=transformers
+accelerate
+bitsandbytes
+huggingface-hub
+gradio-huggingfacehub-search