khurrameycon committed on
Commit 11fa80d · verified · 1 Parent(s): 837a126

Update app.py

Files changed (1)
  1. app.py +121 -121
app.py CHANGED
@@ -1,99 +1,3 @@
-# from fastapi import FastAPI, HTTPException
-# from pydantic import BaseModel
-# from transformers import AutoModelForCausalLM, AutoTokenizer
-# import torch
-# from huggingface_hub import snapshot_download
-# from safetensors.torch import load_file
-
-# class ModelInput(BaseModel):
-#     prompt: str
-#     max_new_tokens: int = 50
-
-# app = FastAPI()
-
-# # Define model paths
-# base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
-# adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
-
-# try:
-#     # First load the base model
-#     print("Loading base model...")
-#     model = AutoModelForCausalLM.from_pretrained(
-#         base_model_path,
-#         torch_dtype=torch.float16,
-#         trust_remote_code=True,
-#         device_map="auto"
-#     )
-
-#     # Load tokenizer from base model
-#     print("Loading tokenizer...")
-#     tokenizer = AutoTokenizer.from_pretrained(base_model_path)
-
-#     # Download adapter weights
-#     print("Downloading adapter weights...")
-#     adapter_path_local = snapshot_download(adapter_path)
-
-#     # Load the safetensors file
-#     print("Loading adapter weights...")
-#     state_dict = load_file(f"{adapter_path_local}/adapter_model.safetensors")
-
-#     # Load state dict into model
-#     model.load_state_dict(state_dict, strict=False)
-
-#     print("Model and adapter loaded successfully!")
-
-# except Exception as e:
-#     print(f"Error during model loading: {e}")
-#     raise
-
-# def generate_response(model, tokenizer, instruction, max_new_tokens=128):
-#     """Generate a response from the model based on an instruction."""
-#     try:
-#         messages = [{"role": "user", "content": instruction}]
-#         input_text = tokenizer.apply_chat_template(
-#             messages, tokenize=False, add_generation_prompt=True
-#         )
-
-#         inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
-#         outputs = model.generate(
-#             inputs,
-#             max_new_tokens=max_new_tokens,
-#             temperature=0.2,
-#             top_p=0.9,
-#             do_sample=True,
-#         )
-
-#         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-#         return response
-
-#     except Exception as e:
-#         raise ValueError(f"Error generating response: {e}")
-
-# @app.post("/generate")
-# async def generate_text(input: ModelInput):
-#     try:
-#         response = generate_response(
-#             model=model,
-#             tokenizer=tokenizer,
-#             instruction=input.prompt,
-#             max_new_tokens=input.max_new_tokens
-#         )
-#         return {"generated_text": response}
-
-#     except Exception as e:
-#         raise HTTPException(status_code=500, detail=str(e))
-
-# @app.get("/")
-# async def root():
-#     return {"message": "Welcome to the Model API!"}
-
-
-
-
-
-
-
-# //////////////////////////////////////////
-
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -103,7 +7,7 @@ from safetensors.torch import load_file
 
 class ModelInput(BaseModel):
     prompt: str
-    max_new_tokens: int = 2048
+    max_new_tokens: int = 50
 
 app = FastAPI()
 
@@ -112,59 +16,56 @@ base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
 adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
 
 try:
-    # Load the base model
+    # First load the base model
     print("Loading base model...")
     model = AutoModelForCausalLM.from_pretrained(
         base_model_path,
         torch_dtype=torch.float16,
-        device_map="cpu",  # Explicitly set CPU
-        # load_in_8bit=True # Enable int8 quantization
        trust_remote_code=True,
-        # device_map="auto"
+        device_map="auto"
     )
-
-    # Load tokenizer
+
+    # Load tokenizer from base model
     print("Loading tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(base_model_path)
-
+
     # Download adapter weights
     print("Downloading adapter weights...")
-    adapter_path_local = snapshot_download(repo_id=adapter_path)
-
+    adapter_path_local = snapshot_download(adapter_path)
+
     # Load the safetensors file
     print("Loading adapter weights...")
-    adapter_file = f"{adapter_path_local}/adapter_model.safetensors"
-    state_dict = load_file(adapter_file)
-
+    state_dict = load_file(f"{adapter_path_local}/adapter_model.safetensors")
+
     # Load state dict into model
-    print("Applying adapter weights...")
     model.load_state_dict(state_dict, strict=False)
-
+
     print("Model and adapter loaded successfully!")
 
 except Exception as e:
     print(f"Error during model loading: {e}")
     raise
 
-def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
+def generate_response(model, tokenizer, instruction, max_new_tokens=128):
     """Generate a response from the model based on an instruction."""
     try:
-        # Format input for the model
-        inputs = tokenizer.encode(instruction, return_tensors="pt").to(model.device)
+        messages = [{"role": "user", "content": instruction}]
+        input_text = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
 
-        # Generate response
+        inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
         outputs = model.generate(
             inputs,
             max_new_tokens=max_new_tokens,
-            temperature=0.7,
+            temperature=0.2,
             top_p=0.9,
             do_sample=True,
         )
-
-        # Decode and return the output
+
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response
-
+
     except Exception as e:
         raise ValueError(f"Error generating response: {e}")
 
@@ -175,10 +76,10 @@ async def generate_text(input: ModelInput):
             model=model,
             tokenizer=tokenizer,
             instruction=input.prompt,
-            max_new_tokens=2048
+            max_new_tokens=input.max_new_tokens
         )
         return {"generated_text": response}
-
+
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
@@ -186,3 +87,102 @@ async def generate_text(input: ModelInput):
 async def root():
     return {"message": "Welcome to the Model API!"}
 
+
+
+
+
+
+# //////////////////////////////////////////
+
+# from fastapi import FastAPI, HTTPException
+# from pydantic import BaseModel
+# from transformers import AutoModelForCausalLM, AutoTokenizer
+# import torch
+# from huggingface_hub import snapshot_download
+# from safetensors.torch import load_file
+
+# class ModelInput(BaseModel):
+#     prompt: str
+#     max_new_tokens: int = 2048
+
+# app = FastAPI()
+
+# # Define model paths
+# base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
+# adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"
+
+# try:
+#     # Load the base model
+#     print("Loading base model...")
+#     model = AutoModelForCausalLM.from_pretrained(
+#         base_model_path,
+#         torch_dtype=torch.float16,
+#         device_map="cpu",  # Explicitly set CPU
+#         # load_in_8bit=True # Enable int8 quantization
+#         trust_remote_code=True,
+#         # device_map="auto"
+#     )
+
+#     # Load tokenizer
+#     print("Loading tokenizer...")
+#     tokenizer = AutoTokenizer.from_pretrained(base_model_path)
+
+#     # Download adapter weights
+#     print("Downloading adapter weights...")
+#     adapter_path_local = snapshot_download(repo_id=adapter_path)
+
+#     # Load the safetensors file
+#     print("Loading adapter weights...")
+#     adapter_file = f"{adapter_path_local}/adapter_model.safetensors"
+#     state_dict = load_file(adapter_file)
+
+#     # Load state dict into model
+#     print("Applying adapter weights...")
+#     model.load_state_dict(state_dict, strict=False)
+
+#     print("Model and adapter loaded successfully!")
+
+# except Exception as e:
+#     print(f"Error during model loading: {e}")
+#     raise
+
+# def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
+#     """Generate a response from the model based on an instruction."""
+#     try:
+#         # Format input for the model
+#         inputs = tokenizer.encode(instruction, return_tensors="pt").to(model.device)
+
+#         # Generate response
+#         outputs = model.generate(
+#             inputs,
+#             max_new_tokens=max_new_tokens,
+#             temperature=0.7,
+#             top_p=0.9,
+#             do_sample=True,
+#         )
+
+#         # Decode and return the output
+#         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+#         return response
+
+#     except Exception as e:
+#         raise ValueError(f"Error generating response: {e}")
+
+# @app.post("/generate")
+# async def generate_text(input: ModelInput):
+#     try:
+#         response = generate_response(
+#             model=model,
+#             tokenizer=tokenizer,
+#             instruction=input.prompt,
+#             max_new_tokens=2048
+#         )
+#         return {"generated_text": response}
+
+#     except Exception as e:
+#         raise HTTPException(status_code=500, detail=str(e))
+
+# @app.get("/")
+# async def root():
+#     return {"message": "Welcome to the Model API!"}
+
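
After this commit, the active app.py serves the chat-template generation path, honors the client-supplied max_new_tokens, and lets accelerate place the model with device_map="auto". A minimal client-side smoke test of the /generate endpoint might look like the sketch below; the base URL and port are assumptions (they are not part of this commit) and should be adjusted to wherever the Space or local server is running.

# Hypothetical check of the /generate endpoint defined in app.py.
# Assumption: the server is reachable at localhost:7860 (typical Spaces port).
import requests

API_URL = "http://localhost:7860"  # assumption: not specified in this commit

# Request body matches the ModelInput schema (prompt, max_new_tokens).
payload = {"prompt": "What is SmolLM2?", "max_new_tokens": 50}
resp = requests.post(f"{API_URL}/generate", json=payload, timeout=120)
resp.raise_for_status()

# The endpoint returns {"generated_text": ...} on success.
print(resp.json()["generated_text"])

Locally, the app would typically be started with something like "uvicorn app:app --host 0.0.0.0 --port 7860"; the actual entrypoint for this Space is not shown in this commit.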