# app.py
import torch
import gradio as gr
import spaces  # Required for ZeroGPU
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
# -------------------------------------------------
# MODEL LOADING
# -------------------------------------------------
BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ADAPTER_MODEL = "GilbertAkham/deepseek-R1-multitask-lora"
print("π Loading base model and LoRA adapter...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization for GPU memory efficiency
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
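# NF4 with double quantization shrinks the memory footprint of the base weights,
# while bnb_4bit_compute_dtype keeps the actual matmuls in fp16.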
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
model.eval()
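# PeftModel keeps the LoRA weights as a separate adapter and applies them on the fly
# during forward passes, so no explicit merge step is needed for inference here.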
print("β
Model and tokenizer loaded successfully!")
# -------------------------------------------------
# GPU INFERENCE FUNCTION
# -------------------------------------------------
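# The bare @spaces.GPU decorator requests the default ZeroGPU time slice; if long
# generations ever hit that limit, the decorator also accepts a duration in seconds,
# e.g. @spaces.GPU(duration=120).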
@spaces.GPU  # Required for ZeroGPU runtime
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
"""
Generates text using the multitask LoRA model.
Supports reasoning, chat, summarization, story continuation, etc.
"""
prompt = f"{system_message}\n\n"
for turn in history:
prompt += f"User: {turn['content']}\nAssistant: {turn.get('response', '')}\n"
prompt += f"User: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    answer = text.split("Assistant:")[-1].strip()
    return answer
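# Optional local smoke test (a sketch, not used by the Space itself): it shows the
# {"role": ..., "content": ...} history format that ChatInterface(type="messages")
# passes in. RUN_SMOKE_TEST is a hypothetical flag; flip it to True only when running
# this file locally with a GPU, since on ZeroGPU the GPU exists only inside a request.
RUN_SMOKE_TEST = False
if RUN_SMOKE_TEST:
    print(generate_response(
        message="Summarize: the meeting covered Q3 results and next quarter's hiring plan.",
        history=[
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hello! How can I help?"},
        ],
        system_message="You are a helpful assistant.",
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
    ))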
# -------------------------------------------------
# GRADIO CHAT INTERFACE
# -------------------------------------------------
chatbot = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are Chat-Bot, a helpful and logical assistant trained for "
                "reasoning, email, chatting, summarization, story continuation, and report writing."
            ),
            label="System Message",
        ),
        gr.Slider(64, 2048, value=512, step=16, label="Max New Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
    ],
)
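# By default, ChatInterface renders these additional_inputs in a collapsible
# "Additional Inputs" accordion below the chat box, so sampling settings can be
# adjusted per conversation without code changes.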
# -------------------------------------------------
# UI LAYOUT
# -------------------------------------------------
with gr.Blocks(title="Gilbert Multitask Reasoning AI") as demo:
    with gr.Sidebar():
        gr.Markdown("## About This App")
        gr.Markdown(
            """
- **Model:** `GilbertAkham/deepseek-R1-multitask-lora`
- **Base:** `deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B`
- **Capabilities:** reasoning, chat, email writing, summarization, story continuation, report generation
- **ZeroGPU Enabled:** the GPU spins up only when generating a response.
            """
        )
    chatbot.render()
# -------------------------------------------------
# LAUNCH
# -------------------------------------------------
if __name__ == "__main__":
    demo.launch()