khurrameycon committed on
Commit 11fa80d · verified · 1 Parent(s): 837a126

Update app.py

Files changed (1)
  1. app.py +121 -121
app.py CHANGED
@@ -1,99 +1,3 @@
-# from fastapi import FastAPI, HTTPException
-# from pydantic import BaseModel
-# from transformers import AutoModelForCausalLM, AutoTokenizer
-# import torch
-# from huggingface_hub import snapshot_download
-# from safetensors.torch import load_file
-
-# class ModelInput(BaseModel):
-#     prompt: str
-#     max_new_tokens: int = 50
-
-# app = FastAPI()
-
-# # Define model paths
-# base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
-# adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
-
-# try:
-#     # First load the base model
-#     print("Loading base model...")
-#     model = AutoModelForCausalLM.from_pretrained(
-#         base_model_path,
-#         torch_dtype=torch.float16,
-#         trust_remote_code=True,
-#         device_map="auto"
-#     )
-
-#     # Load tokenizer from base model
-#     print("Loading tokenizer...")
-#     tokenizer = AutoTokenizer.from_pretrained(base_model_path)
-
-#     # Download adapter weights
-#     print("Downloading adapter weights...")
-#     adapter_path_local = snapshot_download(adapter_path)
-
-#     # Load the safetensors file
-#     print("Loading adapter weights...")
-#     state_dict = load_file(f"{adapter_path_local}/adapter_model.safetensors")
-
-#     # Load state dict into model
-#     model.load_state_dict(state_dict, strict=False)
-
-#     print("Model and adapter loaded successfully!")
-
-# except Exception as e:
-#     print(f"Error during model loading: {e}")
-#     raise
-
-# def generate_response(model, tokenizer, instruction, max_new_tokens=128):
-#     """Generate a response from the model based on an instruction."""
-#     try:
-#         messages = [{"role": "user", "content": instruction}]
-#         input_text = tokenizer.apply_chat_template(
-#             messages, tokenize=False, add_generation_prompt=True
-#         )
-
-#         inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
-#         outputs = model.generate(
-#             inputs,
-#             max_new_tokens=max_new_tokens,
-#             temperature=0.2,
-#             top_p=0.9,
-#             do_sample=True,
-#         )
-
-#         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-#         return response
-
-#     except Exception as e:
-#         raise ValueError(f"Error generating response: {e}")
-
-# @app.post("/generate")
-# async def generate_text(input: ModelInput):
-#     try:
-#         response = generate_response(
-#             model=model,
-#             tokenizer=tokenizer,
-#             instruction=input.prompt,
-#             max_new_tokens=input.max_new_tokens
-#         )
-#         return {"generated_text": response}
-
-#     except Exception as e:
-#         raise HTTPException(status_code=500, detail=str(e))
-
-# @app.get("/")
-# async def root():
-#     return {"message": "Welcome to the Model API!"}
-
-
-
-
-
-
-
-# //////////////////////////////////////////
-
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -103,7 +7,7 @@ from safetensors.torch import load_file
 
 class ModelInput(BaseModel):
     prompt: str
-    max_new_tokens: int = 2048
+    max_new_tokens: int = 50
 
 app = FastAPI()
 
@@ -112,59 +16,56 @@ base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
 adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
 
 try:
-    # Load the base model
+    # First load the base model
     print("Loading base model...")
     model = AutoModelForCausalLM.from_pretrained(
         base_model_path,
         torch_dtype=torch.float16,
-        device_map="cpu",  # Explicitly set CPU
-        # load_in_8bit=True # Enable int8 quantization
        trust_remote_code=True,
-        # device_map="auto"
+        device_map="auto"
     )
-
-    # Load tokenizer
+
+    # Load tokenizer from base model
     print("Loading tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(base_model_path)
-
+
     # Download adapter weights
     print("Downloading adapter weights...")
-    adapter_path_local = snapshot_download(repo_id=adapter_path)
-
+    adapter_path_local = snapshot_download(adapter_path)
+
     # Load the safetensors file
     print("Loading adapter weights...")
-    adapter_file = f"{adapter_path_local}/adapter_model.safetensors"
-    state_dict = load_file(adapter_file)
-
+    state_dict = load_file(f"{adapter_path_local}/adapter_model.safetensors")
+
     # Load state dict into model
-    print("Applying adapter weights...")
     model.load_state_dict(state_dict, strict=False)
-
+
     print("Model and adapter loaded successfully!")
 
 except Exception as e:
     print(f"Error during model loading: {e}")
     raise
 
-def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
+def generate_response(model, tokenizer, instruction, max_new_tokens=128):
     """Generate a response from the model based on an instruction."""
     try:
-        # Format input for the model
-        inputs = tokenizer.encode(instruction, return_tensors="pt").to(model.device)
+        messages = [{"role": "user", "content": instruction}]
+        input_text = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
 
-        # Generate response
+        inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
         outputs = model.generate(
             inputs,
             max_new_tokens=max_new_tokens,
-            temperature=0.7,
+            temperature=0.2,
             top_p=0.9,
             do_sample=True,
         )
-
-        # Decode and return the output
+
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response
-
+
     except Exception as e:
         raise ValueError(f"Error generating response: {e}")
 
@@ -175,10 +76,10 @@ async def generate_text(input: ModelInput):
             model=model,
             tokenizer=tokenizer,
             instruction=input.prompt,
-            max_new_tokens=2048
+            max_new_tokens=input.max_new_tokens
         )
         return {"generated_text": response}
-
+
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
@@ -186,3 +87,102 @@ async def generate_text(input: ModelInput):
 async def root():
     return {"message": "Welcome to the Model API!"}
 
+
+
+
+
+
+# //////////////////////////////////////////
+
+# from fastapi import FastAPI, HTTPException
+# from pydantic import BaseModel
+# from transformers import AutoModelForCausalLM, AutoTokenizer
+# import torch
+# from huggingface_hub import snapshot_download
+# from safetensors.torch import load_file
+
+# class ModelInput(BaseModel):
+#     prompt: str
+#     max_new_tokens: int = 2048
+
+# app = FastAPI()
+
+# # Define model paths
+# base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
+# adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
+
+# try:
+#     # Load the base model
+#     print("Loading base model...")
+#     model = AutoModelForCausalLM.from_pretrained(
+#         base_model_path,
+#         torch_dtype=torch.float16,
+#         device_map="cpu",  # Explicitly set CPU
+#         # load_in_8bit=True # Enable int8 quantization
+#         trust_remote_code=True,
+#         # device_map="auto"
+#     )
+
+#     # Load tokenizer
+#     print("Loading tokenizer...")
+#     tokenizer = AutoTokenizer.from_pretrained(base_model_path)
+
+#     # Download adapter weights
+#     print("Downloading adapter weights...")
+#     adapter_path_local = snapshot_download(repo_id=adapter_path)
+
+#     # Load the safetensors file
+#     print("Loading adapter weights...")
+#     adapter_file = f"{adapter_path_local}/adapter_model.safetensors"
+#     state_dict = load_file(adapter_file)
+
+#     # Load state dict into model
+#     print("Applying adapter weights...")
+#     model.load_state_dict(state_dict, strict=False)
+
+#     print("Model and adapter loaded successfully!")
+
+# except Exception as e:
+#     print(f"Error during model loading: {e}")
+#     raise
+
+# def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
+#     """Generate a response from the model based on an instruction."""
+#     try:
+#         # Format input for the model
+#         inputs = tokenizer.encode(instruction, return_tensors="pt").to(model.device)
+
+#         # Generate response
+#         outputs = model.generate(
+#             inputs,
+#             max_new_tokens=max_new_tokens,
+#             temperature=0.7,
+#             top_p=0.9,
+#             do_sample=True,
+#         )
+
+#         # Decode and return the output
+#         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+#         return response
+
+#     except Exception as e:
+#         raise ValueError(f"Error generating response: {e}")
+
+# @app.post("/generate")
+# async def generate_text(input: ModelInput):
+#     try:
+#         response = generate_response(
+#             model=model,
+#             tokenizer=tokenizer,
+#             instruction=input.prompt,
+#             max_new_tokens=2048
+#         )
+#         return {"generated_text": response}
+
+#     except Exception as e:
+#         raise HTTPException(status_code=500, detail=str(e))
+
+# @app.get("/")
+# async def root():
+#     return {"message": "Welcome to the Model API!"}
+
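
After this commit, the active app.py serves the chat-template generation path, honors the client-supplied max_new_tokens, and lets accelerate place the model with device_map="auto". A minimal client-side smoke test of the /generate endpoint might look like the sketch below; the base URL and port are assumptions (they are not part of this commit) and should be adjusted to wherever the Space or local server is running.

# Hypothetical check of the /generate endpoint defined in app.py.
# Assumption: the server is reachable at localhost:7860 (typical Spaces port).
import requests

API_URL = "http://localhost:7860"  # assumption: not specified in this commit

# Request body matches the ModelInput schema (prompt, max_new_tokens).
payload = {"prompt": "What is SmolLM2?", "max_new_tokens": 50}
resp = requests.post(f"{API_URL}/generate", json=payload, timeout=120)
resp.raise_for_status()

# The endpoint returns {"generated_text": ...} on success.
print(resp.json()["generated_text"])

Locally, the app would typically be started with something like "uvicorn app:app --host 0.0.0.0 --port 7860"; the actual entrypoint for this Space is not shown in this commit.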