GilbertAkham committed on
Commit fc4efba · verified · 1 Parent(s): bc8874a

Update app.py


Change to ZeroGPU
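
For context, ZeroGPU Spaces attach a GPU only while a function decorated with `@spaces.GPU` is executing, which is why this commit adds `import spaces` and decorates the generation function. A minimal sketch of that pattern (illustrative only, not this Space's actual app.py) might look like:

```python
# Minimal ZeroGPU sketch (illustrative; assumes a Space running on ZeroGPU hardware)
import spaces          # Hugging Face Spaces package providing the @spaces.GPU decorator
import torch
import gradio as gr

@spaces.GPU            # a GPU is attached only for the duration of each call
def gpu_check(_: str) -> str:
    # Outside decorated calls the Space runs on CPU; inside, CUDA should be visible.
    return f"CUDA available: {torch.cuda.is_available()}"

demo = gr.Interface(fn=gpu_check, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()
```

Heavy model loading typically stays at module level on CPU; the decorated function is where the GPU work actually runs.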

Files changed (1)
  1. app.py +8 -8
app.py CHANGED
@@ -1,5 +1,7 @@
+# app.py
 import torch
 import gradio as gr
+import spaces  # 👈 Required for ZeroGPU
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
 
@@ -12,13 +14,13 @@ ADAPTER_MODEL = "GilbertAkham/deepseek-R1-multitask-lora"
 print("🔄 Loading base model and LoRA adapter...")
 
 bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,  # load in 4-bit for GPU memory efficiency
+    load_in_4bit=True,  # 4-bit quantization for GPU memory efficiency
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_compute_dtype=torch.float16,
 )
 
-tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 
@@ -37,15 +39,14 @@ print("✅ Model and tokenizer loaded successfully!")
 
 
 # -------------------------------------------------
-# CHAT / GENERATION FUNCTION
+# GPU INFERENCE FUNCTION
 # -------------------------------------------------
+@spaces.GPU  # 👈 Required for ZeroGPU runtime
 def generate_response(message, history, system_message, max_tokens, temperature, top_p):
     """
     Generates text using the multitask LoRA model.
-    Supports chat, reasoning, summarization, storytelling, etc.
+    Supports reasoning, chat, summarization, story continuation, etc.
     """
-
-    # Construct a conversation-style prompt
     prompt = f"{system_message}\n\n"
     for turn in history:
         prompt += f"User: {turn['content']}\nAssistant: {turn.get('response', '')}\n"
@@ -65,7 +66,6 @@ def generate_response(message, history, system_message, max_tokens, temperature,
     )
 
     text = tokenizer.decode(output[0], skip_special_tokens=True)
-    # Extract only the Assistant’s answer
    answer = text.split("Assistant:")[-1].strip()
 
     return answer
@@ -104,7 +104,7 @@ with gr.Blocks(title="Gilbert Multitask Reasoning AI") as demo:
         - **Base:** `deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B`
         - **Capabilities:**
           🧩 Reasoning, 🗣️ Chat, 📧 Email writing, 📚 Summarization, ✍️ Story continuation, 🧾 Report generation
-        - **Runs locally** (no Inference API required).
+        - **ZeroGPU Enabled:** GPU spins up only when generating responses.
         """
     )
     chatbot.render()
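
The hunks above only touch the quantization config, the tokenizer, and the decorator; the model-loading step they sit around is not shown in this diff. Roughly, a 4-bit base model with the LoRA adapter attached via PEFT would be loaded along these lines (the `device_map` and model-side `trust_remote_code` arguments are assumptions, not necessarily what app.py uses):

```python
# Sketch of the surrounding load step (assumed shape, not copied from app.py)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ADAPTER_MODEL = "GilbertAkham/deepseek-R1-multitask-lora"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the quantized base model, then attach the LoRA adapter on top of it.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",       # assumption: actual placement in app.py may differ
    trust_remote_code=True,  # assumption, mirroring the tokenizer change in this commit
)
model = PeftModel.from_pretrained(base, ADAPTER_MODEL)
model.eval()
```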