Fred808 committed
Commit 11ba705 · verified · 1 Parent(s): 2e59f2f

Update app.py

Files changed (1): app.py +16 -4
app.py CHANGED
@@ -11,13 +11,25 @@ app = FastAPI()
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Load the Google Gemma 2B model and tokenizer
-model_id = "google/gemma-2b"  # Use Google Gemma 2B
+# Load the Google Gemma 7B model and tokenizer
+model_id = "google/gemma-7b"  # Use Google Gemma 7B
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+
+# Load the model with 4-bit quantization to reduce VRAM usage
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16,  # Use half-precision for faster inference
+    device_map="auto",          # Automatically offload to available GPUs
+    load_in_4bit=True           # Enable 4-bit quantization
+)
 
 # Create a text generation pipeline
-pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device="cuda" if torch.cuda.is_available() else "cpu")
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    device="cuda" if torch.cuda.is_available() else "cpu"
+)
 
 # Define request body schema
 class TextGenerationRequest(BaseModel):
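
A note on the new code: in recent transformers releases, passing load_in_4bit=True directly to from_pretrained is deprecated in favor of a BitsAndBytesConfig, and passing a device= argument to pipeline() for a model loaded with device_map="auto" typically raises an error, since accelerate has already placed the model's layers. A minimal sketch of the equivalent load path, assuming transformers, accelerate, and bitsandbytes are installed and a CUDA GPU is available (this is an illustration of the current API, not part of the commit):

    # Sketch: 4-bit load via BitsAndBytesConfig, the documented way to
    # request quantization in current transformers versions.
    import torch
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        BitsAndBytesConfig,
        pipeline,
    )

    model_id = "google/gemma-7b"

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,                     # 4-bit weights to cut VRAM
        bnb_4bit_compute_dtype=torch.float16,  # half-precision compute
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",                 # accelerate places layers on available devices
        quantization_config=quant_config,  # replaces the deprecated load_in_4bit kwarg
    )

    # No device= argument here: with device_map="auto" the model is already
    # placed, and also passing a device to pipeline() raises a ValueError.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

With this setup, pipe("Hello, world", max_new_tokens=50) should run on the GPU without any manual device handling.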