# app.py
import torch
import gradio as gr
import spaces  # πŸ‘ˆ Required for ZeroGPU
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# -------------------------------------------------
# MODEL LOADING
# -------------------------------------------------
BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ADAPTER_MODEL = "GilbertAkham/deepseek-R1-multitask-lora"

print("πŸ”„ Loading base model and LoRA adapter...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                   # 4-bit quantization for GPU memory efficiency
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
model.eval()

print("βœ… Model and tokenizer loaded successfully!")


# -------------------------------------------------
# GPU INFERENCE FUNCTION
# -------------------------------------------------
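# Note: on ZeroGPU Spaces a GPU is attached only while a @spaces.GPU-decorated
# function is running; the model loading above happens once at startup.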
@spaces.GPU  # πŸ‘ˆ Required for ZeroGPU runtime
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    """
    Generates text using the multitask LoRA model.
    Supports reasoning, chat, summarization, story continuation, etc.
    """
    prompt = f"{system_message}\n\n"
    for turn in history:
        prompt += f"User: {turn['content']}\nAssistant: {turn.get('response', '')}\n"
    prompt += f"User: {message}\nAssistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )

    text = tokenizer.decode(output[0], skip_special_tokens=True)
    answer = text.split("Assistant:")[-1].strip()

    return answer


# -------------------------------------------------
# GRADIO CHAT INTERFACE
# -------------------------------------------------
chatbot = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are Chat-Bot, a helpful and logical assistant trained for "
                "reasoning, email, chatting, summarization, story continuation, and report writing."
            ),
            label="🧠 System Message",
        ),
        gr.Slider(64, 2048, value=512, step=16, label="πŸ“ Max New Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="🌑️ Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="🎯 Top-p"),
    ],
)


# -------------------------------------------------
# UI LAYOUT
# -------------------------------------------------
with gr.Blocks(title="Gilbert Multitask Reasoning AI") as demo:
    with gr.Sidebar():
        gr.Markdown("## πŸ’‘ About This App")
        gr.Markdown(
            """
            - **Model:** `GilbertAkham/deepseek-R1-multitask-lora`  
            - **Base:** `deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B`  
            - **Capabilities:**  
              🧩 Reasoning, πŸ—£οΈ Chat, πŸ“§ Email writing, πŸ“š Summarization, ✍️ Story continuation, 🧾 Report generation  
            - **ZeroGPU Enabled:** GPU spins up only when generating responses.
            """
        )
    chatbot.render()


# -------------------------------------------------
# LAUNCH
# -------------------------------------------------
if __name__ == "__main__":
    demo.launch()