# app.py
import torch
import gradio as gr
import spaces  # 👈 Required for ZeroGPU
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
# -------------------------------------------------
# MODEL LOADING
# -------------------------------------------------
BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ADAPTER_MODEL = "GilbertAkham/deepseek-R1-multitask-lora"
print("πŸ”„ Loading base model and LoRA adapter...")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True, # 4-bit quantization for GPU memory efficiency
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)
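# Rough memory math (an estimate): ~1.5B parameters at 4 bits is about
# 0.75 GB of weights, versus ~3 GB in fp16, leaving headroom for the KV
# cache and activations on a shared ZeroGPU slice.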
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
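# Many causal-LM tokenizers ship without a pad token; fall back to EOS so
# generate() has a valid pad_token_id.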
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
model.eval()
print("βœ… Model and tokenizer loaded successfully!")
# -------------------------------------------------
# GPU INFERENCE FUNCTION
# -------------------------------------------------
@spaces.GPU  # 👈 Required for ZeroGPU runtime
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    """
    Generates text using the multitask LoRA model.
    Supports reasoning, chat, summarization, story continuation, etc.
    """
    # With type="messages", history is a flat list of {"role", "content"}
    # dicts, one per message (not one per user/assistant turn).
    prompt = f"{system_message}\n\n"
    for turn in history:
        speaker = "User" if turn["role"] == "user" else "Assistant"
        prompt += f"{speaker}: {turn['content']}\n"
    prompt += f"User: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Keep only the text after the final "Assistant:" marker.
    answer = text.split("Assistant:")[-1].strip()
    return answer
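# Optional streaming variant (a sketch, not wired into the UI below; history
# is omitted for brevity): gr.ChatInterface also accepts a generator fn, and
# transformers' TextIteratorStreamer yields decoded text as generate() runs
# on a background thread.
from threading import Thread
from transformers import TextIteratorStreamer

@spaces.GPU
def generate_response_stream(message, history, system_message, max_tokens, temperature, top_p):
    prompt = f"{system_message}\n\nUser: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        ),
    ).start()
    partial = ""
    for token in streamer:
        partial += token
        yield partial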
# -------------------------------------------------
# GRADIO CHAT INTERFACE
# -------------------------------------------------
chatbot = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are Chat-Bot, a helpful and logical assistant trained for "
                "reasoning, email, chatting, summarization, story continuation, and report writing."
            ),
            label="🧠 System Message",
        ),
        gr.Slider(64, 2048, value=512, step=16, label="📝 Max New Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="🌡️ Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="🎯 Top-p"),
    ],
)
# -------------------------------------------------
# UI LAYOUT
# -------------------------------------------------
with gr.Blocks(title="Gilbert Multitask Reasoning AI") as demo:
    with gr.Sidebar():
        gr.Markdown("## 💡 About This App")
        gr.Markdown(
            """
            - **Model:** `GilbertAkham/deepseek-R1-multitask-lora`
            - **Base:** `deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B`
            - **Capabilities:**
              🧩 Reasoning, 🗣️ Chat, 📧 Email writing, 📚 Summarization, ✍️ Story continuation, 🧾 Report generation
            - **ZeroGPU Enabled:** the GPU spins up only while generating responses.
            """
        )
    chatbot.render()
# -------------------------------------------------
# LAUNCH
# -------------------------------------------------
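# Note: on Spaces, launch() binds to the platform-provided host/port; for a
# local run you might pass server_name="0.0.0.0" or share=True (standard
# launch() kwargs, an optional tweak).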
if __name__ == "__main__":
    demo.launch()