# app.py
import torch
import gradio as gr
import spaces  # Required for ZeroGPU
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
# -------------------------------------------------
# MODEL LOADING
# -------------------------------------------------
BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ADAPTER_MODEL = "GilbertAkham/deepseek-R1-multitask-lora"

print("Loading base model and LoRA adapter...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization for GPU memory efficiency
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
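# NOTE: NF4 4-bit quantization (with double quantization) keeps the 1.5B base model's
# weights small enough to fit comfortably in the GPU memory a ZeroGPU Space provides,
# while matrix multiplications still run in float16 via bnb_4bit_compute_dtype.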
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
model.eval()
print("Model and tokenizer loaded successfully!")
# -------------------------------------------------
# GPU INFERENCE FUNCTION
# -------------------------------------------------
@spaces.GPU  # Required for ZeroGPU: a GPU is attached only while this function runs
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    """
    Generates text using the multitask LoRA model.
    Supports reasoning, chat, summarization, story continuation, etc.
    """
    # Rebuild the conversation as a plain-text prompt. With type="messages",
    # each history entry is a dict with "role" and "content" keys.
    prompt = f"{system_message}\n\n"
    for turn in history:
        role = "User" if turn["role"] == "user" else "Assistant"
        prompt += f"{role}: {turn['content']}\n"
    prompt += f"User: {message}\nAssistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )

    text = tokenizer.decode(output[0], skip_special_tokens=True)
    answer = text.split("Assistant:")[-1].strip()
    return answer
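# Hypothetical local smoke test (example values only, not used by the Space itself):
#   reply = generate_response("Summarize LoRA in one sentence.", [],
#                             "You are a helpful assistant.", 128, 0.7, 0.9)
#   print(reply)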
# -------------------------------------------------
# GRADIO CHAT INTERFACE
# -------------------------------------------------
chatbot = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are Chat-Bot, a helpful and logical assistant trained for "
                "reasoning, email, chatting, summarization, story continuation, and report writing."
            ),
            label="System Message",
        ),
        gr.Slider(64, 2048, value=512, step=16, label="Max New Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
    ],
)
# -------------------------------------------------
# UI LAYOUT
# -------------------------------------------------
with gr.Blocks(title="Gilbert Multitask Reasoning AI") as demo:
    with gr.Sidebar():
        gr.Markdown("## About This App")
        gr.Markdown(
            """
            - **Model:** `GilbertAkham/deepseek-R1-multitask-lora`
            - **Base:** `deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B`
            - **Capabilities:** Reasoning, Chat, Email writing, Summarization,
              Story continuation, Report generation
            - **ZeroGPU Enabled:** the GPU spins up only when generating responses.
            """
        )
    chatbot.render()
# -------------------------------------------------
# LAUNCH
# -------------------------------------------------
if __name__ == "__main__":
    demo.launch()
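# Assumed Space dependencies (verify against the actual requirements.txt):
#   gradio, spaces, torch, transformers, peft, bitsandbytes, accelerate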