Starchik committed on
Commit 44c69c7 · verified · 1 Parent(s): b2b3bd8

Update app.py

Files changed (1)
  1. app.py +10 -132
app.py CHANGED
@@ -1,132 +1,10 @@
- import os
- from threading import Thread
- from typing import Iterator
-
- import gradio as gr
- import spaces
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
- MAX_MAX_NEW_TOKENS = 2048
- DEFAULT_MAX_NEW_TOKENS = 1024
- total_count = 0
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
- DESCRIPTION = """\
- # DeepSeek-33B-Chat
- This space demonstrates model [DeepSeek-Coder](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) by DeepSeek, a code model with 33B parameters fine-tuned for chat instructions.
- **You can also try our 33B model in [official homepage](https://coder.deepseek.com/chat).**
- """
-
- # Check if CUDA is available
- if not torch.cuda.is_available():
-     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo might be slow on CPU.</p>"
-     device = torch.device("cpu")
- else:
-     device = torch.device("cuda")
-     model_id = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
-     model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
-     tokenizer = AutoTokenizer.from_pretrained(model_id)
-     tokenizer.use_default_system_prompt = False
-
- # Fallback to CPU for model loading if CUDA is unavailable
- if not torch.cuda.is_available():
-     model_id = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
-     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
-     tokenizer = AutoTokenizer.from_pretrained(model_id)
-
- @spaces.GPU
- def generate(
-     message: str,
-     chat_history: list[tuple[str, str]],
-     system_prompt: str,
-     max_new_tokens: int = 1024,
-     temperature: float = 0.6,
-     top_p: float = 0.9,
-     top_k: int = 50,
-     repetition_penalty: float = 1,
- ) -> Iterator[str]:
-     global total_count
-     total_count += 1
-     print(total_count)
-     os.system("nvidia-smi")
-     conversation = []
-     if system_prompt:
-         conversation.append({"role": "system", "content": system_prompt})
-     for user, assistant in chat_history:
-         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-     conversation.append({"role": "user", "content": message})
-
-     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
-     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-     input_ids = input_ids.to(device)
-
-     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-     generate_kwargs = dict(
-         {"input_ids": input_ids},
-         streamer=streamer,
-         max_new_tokens=max_new_tokens,
-         do_sample=False,
-         top_p=top_p,
-         top_k=top_k,
-         num_beams=1,
-         repetition_penalty=repetition_penalty,
-         eos_token_id=32021
-     )
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()
-
-     outputs = []
-     for text in streamer:
-         outputs.append(text)
-         yield "".join(outputs).replace("<|EOT|>", "")
-
- chat_interface = gr.ChatInterface(
-     fn=generate,
-     additional_inputs=[
-         gr.Textbox(label="System prompt", lines=6),
-         gr.Slider(
-             label="Max new tokens",
-             minimum=1,
-             maximum=MAX_MAX_NEW_TOKENS,
-             step=1,
-             value=DEFAULT_MAX_NEW_TOKENS,
-         ),
-         gr.Slider(
-             label="Top-p (nucleus sampling)",
-             minimum=0.05,
-             maximum=1.0,
-             step=0.05,
-             value=0.9,
-         ),
-         gr.Slider(
-             label="Top-k",
-             minimum=1,
-             maximum=1000,
-             step=1,
-             value=50,
-         ),
-         gr.Slider(
-             label="Repetition penalty",
-             minimum=1.0,
-             maximum=2.0,
-             step=0.05,
-             value=1,
-         ),
-     ],
-     stop_btn=gr.Button("Stop"),
-     examples=[
-         ["implement snake game using pygame"],
-         ["Can you explain briefly to me what is the Python programming language?"],
-         ["write a program to find the factorial of a number"],
-     ],
- )
-
- with gr.Blocks(css="style.css") as demo:
-     gr.Markdown(DESCRIPTION)
-     chat_interface.render()
-
- if __name__ == "__main__":
-     demo.queue(max_size=20).launch()
 
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-7b-instruct-v1.5", trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-7b-instruct-v1.5", trust_remote_code=True).cuda()
+ messages=[
+     { 'role': 'user', 'content': "write a quick sort algorithm in python."}
+ ]
+ inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
+
+ outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, top_k=50, top_p=0.95, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
+ print(tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True))
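
Note that the new app.py drops the Gradio interface entirely and runs a single hard-coded prompt, so the Space no longer serves a chat UI. If a web demo around the 7B model is still wanted, a wrapper along the following lines could be reintroduced. This is only a sketch: it assumes gradio is still listed in requirements.txt, it reuses the tokenizer and model loaded in the new script, and the respond function name is illustrative rather than part of the commit.

import gradio as gr

def respond(message, history):
    # Rebuild the conversation in the chat-template format used by the script above.
    conversation = []
    for user, assistant in history:
        conversation.append({'role': 'user', 'content': user})
        conversation.append({'role': 'assistant', 'content': assistant})
    conversation.append({'role': 'user', 'content': message})
    inputs = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, mirroring the print call above.
    return tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)

demo = gr.ChatInterface(fn=respond)

if __name__ == "__main__":
    demo.queue(max_size=20).launch()

Generation stays greedy (do_sample=False) to match the new script; the top_k and top_p arguments are omitted here because they have no effect when sampling is disabled.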