HandsomeSB committed on
Commit 6a50f6f · 0 Parent(s):
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,7 @@
+ Prompt,Max Tokens,Gamma (draft lookahead),Confidence Threshold,Speculative Decoding Visualization,timestamp
+ def fibonacci(n):,10,5,0.5,"<div style='font-family: monospace;'><div style='margin-bottom: 20px; padding: 10px; background: #f0f0f0; border-radius: 5px;'><b>Final Output:</b><br/><|im_start|>system
+ You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
+ <|im_start|>user
+ def fibonacci(n):<|im_end|>
+ <|im_start|>assistant
+ I understand that you're looking for a Python function</div><div style='margin-bottom: 20px; padding: 10px; background: #e0e0e0; border-radius: 5px;'><b>Acceptance Rate:</b> 8/15 = 53.3%</div><div style='margin-bottom: 10px;'><b>Decoding Steps:</b></div><div style='margin: 10px 0; padding: 10px; border: 1px solid #ccc; border-radius: 5px;'><b>Step 1:</b> <span style='background: #FFB6C1; padding: 2px 4px; margin: 2px; text-decoration: line-through; border-radius: 3px;'>Certainly</span><span style='background: #FFB6C1; padding: 2px 4px; margin: 2px; text-decoration: line-through; border-radius: 3px;'>!</span> → <span style='background: #87CEEB; padding: 2px 4px; border-radius: 3px;'> I</span></div><div style='margin: 10px 0; padding: 10px; border: 1px solid #ccc; border-radius: 5px;'><b>Step 2:</b> <span style='background: #FFB6C1; padding: 2px 4px; margin: 2px; text-decoration: line-through; border-radius: 3px;'>'m</span><span style='background: #FFB6C1; padding: 2px 4px; margin: 2px; text-decoration: line-through; border-radius: 3px;'> sorry</span><span style='background: #FFB6C1; padding: 2px 4px; margin: 2px; text-decoration: line-through; border-radius: 3px;'>,</span><span style='background: #FFB6C1; padding: 2px 4px; margin: 2px; text-decoration: line-through; border-radius: 3px;'> but</span><span style='background: #FFB6C1; padding: 2px 4px; margin: 2px; text-decoration: line-through; border-radius: 3px;'> I</span> → <span style='background: #87CEEB; padding: 2px 4px; border-radius: 3px;'> understand</span></div><div style='margin: 10px 0; padding: 10px; border: 1px solid #ccc; border-radius: 5px;'><b>Step 3:</b> <span style='background: #90EE90; padding: 2px 4px; margin: 2px; border-radius: 3px;'> that</span><span style='background: #90EE90; padding: 2px 4px; margin: 2px; border-radius: 3px;'> you</span><span style='background: #90EE90; padding: 2px 4px; margin: 2px; border-radius: 3px;'>'re</span></div><div style='margin: 10px 0; padding: 10px; border: 1px solid #ccc; border-radius: 5px;'><b>Step 4:</b> <span style='background: #90EE90; padding: 2px 4px; margin: 2px; border-radius: 3px;'> looking</span><span style='background: #90EE90; padding: 2px 4px; margin: 2px; border-radius: 3px;'> for</span><span style='background: #90EE90; padding: 2px 4px; margin: 2px; border-radius: 3px;'> a</span><span style='background: #90EE90; padding: 2px 4px; margin: 2px; border-radius: 3px;'> Python</span><span style='background: #90EE90; padding: 2px 4px; margin: 2px; border-radius: 3px;'> function</span></div></div>",2025-12-14 13:54:04.330173
README.md ADDED
@@ -0,0 +1,3 @@
+ # Speculative Decoding
+
+ A project implementing speculative decoding: a small Qwen2.5-Coder-0.5B draft model proposes tokens that a larger Qwen2.5-Coder-3B verify model accepts or resamples, with a Gradio demo that visualizes each decoding step.
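Both `main.ipynb` and `main.py` below implement the same speculative-sampling accept/reject rule inside `verify()`: a drafted token `x` is kept outright when `q(x) <= p(x)`, kept with probability `p(x)/q(x)` otherwise, and on rejection a replacement is drawn from the normalized residual `max(0, p - q)`. A minimal sketch of that rule in isolation (the function name and the standalone `p`, `q`, `x` arguments are illustrative, not part of this repo):

```python
import torch

def accept_or_resample(p: torch.Tensor, q: torch.Tensor, x: int) -> tuple[int, bool]:
    """One speculative-decoding decision for a single drafted token.

    p: verify-model probabilities over the vocabulary (1-D tensor)
    q: draft-model probabilities over the vocabulary (1-D tensor)
    x: token id proposed by the draft model
    """
    # Accept outright when the verifier is at least as confident as the draft;
    # otherwise accept with probability p(x)/q(x).
    if q[x] <= p[x] or torch.rand(1).item() < (p[x] / q[x]).item():
        return x, True
    # Rejected: resample from the normalized residual max(0, p - q), which
    # keeps the overall output distribution identical to sampling from p.
    residual = torch.clamp(p - q, min=0)
    residual = residual / residual.sum()
    return int(torch.multinomial(residual, num_samples=1).item()), False
```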
__pycache__/main.cpython-313.pyc ADDED
Binary file (9.49 kB).
main.ipynb ADDED
@@ -0,0 +1,458 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e0ef8d28",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/yiyunzhu/Library/Caches/pypoetry/virtualenvs/speculativedecoding-B0TTdUOs-py3.13/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "`torch_dtype` is deprecated! Use `dtype` instead!\n",
+ "Fetching 2 files: 100%|██████████| 2/2 [00:58<00:00, 29.32s/it]\n",
+ "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 27.91it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed\n",
+ "\n",
+ "set_seed(67)\n",
+ "\n",
+ "device = \"mps\"\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-Coder-0.5B-Instruct\") # HuggingFaceTB/SmolLM2-135M-Instruct\n",
+ "draft_model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen2.5-Coder-0.5B-Instruct\", dtype=torch.bfloat16).to(device)\n",
+ "verify_model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen2.5-Coder-3B-Instruct\", dtype=torch.bfloat16).to(device) # HuggingFaceTB/SmolLM2-1.7B-Instruct"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "30d81505",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'The quick brown fox jumps over the lazy dog'"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "prompt = \"The quick brown fox\"\n",
+ "inputs = tokenizer(prompt, return_tensors=\"pt\").to(device)\n",
+ "input_ids = inputs[\"input_ids\"]\n",
+ "\n",
+ "generated = input_ids.clone() # [1, seq_len]\n",
+ "draft_probs = []\n",
+ "for _ in range(5): # gamma = 5\n",
+ "    with torch.no_grad():\n",
+ "        outputs = draft_model(generated)\n",
+ "        logits = outputs.logits[:, -1, :] # batch, seq_len, vocab_size\n",
+ "\n",
+ "    probs = torch.softmax(logits, dim=-1) # [1, vocab_size]\n",
+ "    next_token = torch.multinomial(probs, num_samples=1)\n",
+ "\n",
+ "    draft_probs.append(probs)\n",
+ "    generated = torch.cat([generated, next_token], dim=-1)\n",
+ "\n",
+ "tokenizer.decode(generated[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "2492ee58",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Token: ' jumps', q(x)=0.9727, p(x)=0.9023\n",
+ " -> Accepted\n",
+ "Token: ' over', q(x)=1.0000, p(x)=0.9961\n",
+ " -> Accepted\n",
+ "Token: ' the', q(x)=0.9023, p(x)=0.9102\n",
+ " -> Accepted\n",
+ "Token: ' lazy', q(x)=1.0000, p(x)=0.9922\n",
+ " -> Accepted\n",
+ "Token: ' dog', q(x)=0.9922, p(x)=0.9844\n",
+ " -> Accepted\n",
+ " jumps over the lazy dog\n",
+ "[34208, 916, 279, 15678, 5562]\n"
+ ]
+ }
+ ],
+ "source": [
+ "with torch.no_grad():\n",
+ "    target_outputs = verify_model(generated)\n",
+ "    target_logits = target_outputs.logits[:, -6:-1, :]\n",
+ "\n",
+ "target_probs = torch.softmax(target_logits, dim=-1) # [1, 5, vocab_size]\n",
+ "accepted_tokens = []\n",
+ "for i in range(5):\n",
+ "    # if q(x) <= p(x), keep\n",
+ "    # if q(x) > p(x), reject with 1-p(x)/q(x) chance\n",
+ "    # if rejected, we sample from norm(max(0, p(x) - q(x)))\n",
+ "    q = draft_probs[i] # [1, vocab_size]\n",
+ "    p = target_probs[:, i, :] # [1, vocab_size]\n",
+ "    token = generated[:, i - 5] # [1]\n",
+ "    # assume unbatched for now\n",
+ "    x = token[0].item()\n",
+ "    q_x = q[0, x].item()\n",
+ "    p_x = p[0, x].item()\n",
+ "\n",
+ "    print(f\"Token: '{tokenizer.decode(x)}', q(x)={q_x:.4f}, p(x)={p_x:.4f}\")\n",
+ "    if q_x <= p_x:\n",
+ "        print(\" -> Accepted\")\n",
+ "        accepted_tokens.append(x)\n",
+ "    else:\n",
+ "        r = torch.rand(1).item()\n",
+ "        acceptance_rate = p_x / q_x\n",
+ "\n",
+ "        if r < acceptance_rate:\n",
+ "            print(\" -> Accepted\")\n",
+ "            accepted_tokens.append(x)\n",
+ "        else:\n",
+ "            print(\" -> Rejected\")\n",
+ "            adjusted = torch.clamp(p - q, min=0)\n",
+ "            adjusted = adjusted / adjusted.sum()\n",
+ "            new_token = torch.multinomial(adjusted, num_samples=1)[0].item()\n",
+ "            accepted_tokens.append(new_token)\n",
+ "            break\n",
+ "\n",
+ "print(tokenizer.decode(accepted_tokens))\n",
+ "print(accepted_tokens)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "2df0e2a9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def draft(input_ids, gamma, confidence_threshold, eos_token, past_kv):\n",
+ "    generated = input_ids.clone() # [1, seq_len]\n",
+ "    draft_probs = []\n",
+ "    for _ in range(gamma):\n",
+ "        with torch.no_grad():\n",
+ "            outputs = draft_model(\n",
+ "                generated if past_kv is None else generated[:, -1:],\n",
+ "                past_key_values=past_kv,\n",
+ "                use_cache=True\n",
+ "            )\n",
+ "            logits = outputs.logits[:, -1, :] # batch, seq_len, vocab_size\n",
+ "            past_kv = outputs.past_key_values\n",
+ "\n",
+ "        probs = torch.softmax(logits, dim=-1) # [1, vocab_size]\n",
+ "\n",
+ "        confidence = probs.max().item() # dynamic speculative decoding\n",
+ "        if confidence < confidence_threshold and len(draft_probs) > 0:\n",
+ "            break\n",
+ "\n",
+ "        next_token = torch.argmax(probs, dim=-1, keepdim=True)\n",
+ "\n",
+ "        draft_probs.append(probs)\n",
+ "        generated = torch.cat([generated, next_token], dim=-1)\n",
+ "\n",
+ "        if next_token.item() == eos_token:\n",
+ "            break\n",
+ "\n",
+ "    return generated, draft_probs, past_kv\n",
+ "\n",
+ "def verify(drafted, drafted_probs, eos_token, past_kv):\n",
+ "    draft_len = len(drafted_probs) # number of new drafted tokens\n",
+ "    with torch.no_grad():\n",
+ "        if past_kv is None:\n",
+ "            target_outputs = verify_model(drafted, use_cache=True)\n",
+ "            target_logits = target_outputs.logits[:, -draft_len - 1:-1, :]\n",
+ "        else:\n",
+ "            target_outputs = verify_model(\n",
+ "                drafted[:, -(draft_len + 1):], # extra token\n",
+ "                past_key_values=past_kv,\n",
+ "                use_cache=True\n",
+ "            )\n",
+ "            target_logits = target_outputs.logits[:, :-1, :] # Drop last (predicts bonus token)\n",
+ "\n",
+ "        past_kv = target_outputs.past_key_values\n",
+ "\n",
+ "    target_probs = torch.softmax(target_logits, dim=-1) # [1, draft_len, vocab_size]\n",
+ "    accepted_tokens = []\n",
+ "    num_accepted = 0 # number of drafted tokens that were accepted\n",
+ "    for i in range(draft_len):\n",
+ "        # if q(x) <= p(x), keep\n",
+ "        # if q(x) > p(x), reject with 1-p(x)/q(x) chance\n",
+ "        # if rejected, we sample from norm(max(0, p(x) - q(x)))\n",
+ "        q = drafted_probs[i] # [1, vocab_size]\n",
+ "        p = target_probs[:, i, :] # [1, vocab_size]\n",
+ "        token = drafted[:, i - draft_len] # [1]\n",
+ "        # assume unbatched for now\n",
+ "        x = token[0].item()\n",
+ "        q_x = q[0, x].item()\n",
+ "        p_x = p[0, x].item()\n",
+ "\n",
+ "        print(f\"Token: '{tokenizer.decode(x)}'\", end=\"\")\n",
+ "        if q_x <= p_x:\n",
+ "            print(\" -> Accepted\")\n",
+ "            accepted_tokens.append(x)\n",
+ "            num_accepted += 1\n",
+ "        else:\n",
+ "            r = torch.rand(1).item()\n",
+ "            acceptance_rate = p_x / q_x\n",
+ "\n",
+ "            if r < acceptance_rate:\n",
+ "                print(\" -> Accepted\")\n",
+ "                accepted_tokens.append(x)\n",
+ "                num_accepted += 1\n",
+ "            else:\n",
+ "                print(\" -> Rejected\", end=\"\")\n",
+ "                adjusted = torch.clamp(p - q, min=0)\n",
+ "                adjusted = adjusted / adjusted.sum()\n",
+ "                new_token = torch.multinomial(adjusted, num_samples=1)[0].item()\n",
+ "                accepted_tokens.append(new_token)\n",
+ "                print(tokenizer.decode(new_token))\n",
+ "                break\n",
+ "        if accepted_tokens[-1] == eos_token:\n",
+ "            break\n",
+ "\n",
+ "    return accepted_tokens, num_accepted, past_kv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "5378f5d5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Token: 'A' -> Accepted\n",
+ "Token: ' deal' -> Accepted\n",
+ "Token: ' flow' -> Accepted\n",
+ "Token: ' in' -> Accepted\n",
+ "Token: ' a' -> Accepted\n",
+ "Token: ' VC' -> Rejected venture\n",
+ "Token: ' capital' -> Accepted\n",
+ "Token: ' fund' -> Rejected (\n",
+ "Token: 'VC' -> Accepted\n",
+ "Token: ')' -> Accepted\n",
+ "Token: ' fund' -> Accepted\n",
+ "Token: ' is' -> Rejected refers\n",
+ "Token: ' to' -> Accepted\n",
+ "Token: ' the' -> Accepted\n",
+ "Token: ' process' -> Accepted\n",
+ "Token: ' of' -> Accepted\n",
+ "Token: ' setting' -> Rejected screening\n",
+ "Token: ',' -> Accepted\n",
+ "Token: ' evaluating' -> Accepted\n",
+ "Token: ',' -> Accepted\n",
+ "Token: ' and' -> Accepted\n",
+ "Token: ' selecting' -> Accepted\n",
+ "Token: ' investors' -> Rejected potential\n",
+ "Token: ' investment' -> Accepted\n",
+ "Token: ' in' -> Rejected opportunities\n",
+ "Token: ' for' -> Rejected.\n",
+ "Token: ' It' -> Accepted\n",
+ "Token: ' involves' -> Accepted\n",
+ "Token: ' several' -> Rejected identifying\n",
+ "Token: ' potential' -> Accepted\n",
+ "Token: ' investors' -> Rejected companies\n",
+ "Token: ' that' -> Accepted\n",
+ "Token: ' could' -> Rejected the\n",
+ "Token: ' VC' -> Accepted\n",
+ "Token: ' fund' -> Accepted\n",
+ "Token: 'ers' -> Rejected is\n",
+ "Token: ' interested' -> Accepted\n",
+ "Token: ' in' -> Accepted\n",
+ "Token: ',' -> Accepted\n",
+ "Token: ' assessing' -> Accepted\n",
+ "Token: ' their' -> Accepted\n",
+ "Token: ' financial' -> Rejected growth\n",
+ "Token: ' potential' -> Accepted\n",
+ "Token: ',' -> Accepted\n",
+ "Token: ' and' -> Accepted\n",
+ "Token: ' evaluating' -> Rejected business\n",
+ "Token: ' model' -> Accepted\n",
+ "Token: ',' -> Accepted\n",
+ "Token: ' and' -> Accepted\n",
+ "Token: ',' -> Rejected market\n",
+ "Token: ' demand' -> Accepted\n",
+ "Token: ',' -> Accepted\n",
+ "Token: ' and' -> Accepted\n",
+ "Token: ' then' -> Accepted\n",
+ "Token: ' making' -> Accepted\n",
+ "Token: ' a' -> Accepted\n",
+ "Token: ' decision' -> Accepted\n",
+ "Token: ' on' -> Accepted\n",
+ "Token: ' whether' -> Accepted\n",
+ "Token: ' to' -> Accepted\n",
+ "Token: ' invest' -> Accepted\n",
+ "Token: ' in' -> Accepted\n",
+ "Token: ' those' -> Rejected the\n",
+ "Token: ' VC' -> Rejected fund\n",
+ "Token: ' in' -> Rejected’s\n",
+ "Token: ' capital' -> Accepted\n",
+ "Token: '.' -> Accepted\n",
+ "Token: '<|im_end|>' -> Accepted\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n<|im_start|>user\\nWhat is a deal flow in a VC fund?<|im_end|>\\n<|im_start|>assistant\\nA deal flow in a venture capital (VC) fund refers to the process of screening, evaluating, and selecting potential investment opportunities. It involves identifying potential companies that the VC fund is interested in, assessing their growth potential, and business model, and market demand, and then making a decision on whether to invest in the fund’s capital.<|im_end|>'"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "messages = [{\"role\": \"user\", \"content\": \"What is a deal flow in a VC fund?\"}]\n",
+ "prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
+ "\n",
+ "max_tokens = 80\n",
+ "eos_token = tokenizer.eos_token_id\n",
+ "im_end_token = tokenizer.convert_tokens_to_ids(\"<|im_end|>\")\n",
+ "gamma = 15\n",
+ "confidence_threshold = 0.5\n",
+ "\n",
+ "inputs = tokenizer(prompt, return_tensors=\"pt\").to(device)\n",
+ "result = inputs[\"input_ids\"].clone()\n",
+ "\n",
+ "draft_kv = None\n",
+ "verify_kv = None\n",
+ "\n",
+ "total_drafted = 0\n",
+ "total_accepted = 0\n",
+ "\n",
+ "while result.shape[-1] - inputs[\"input_ids\"].shape[-1] < max_tokens:\n",
+ "    drafted, drafted_probs, draft_kv = draft(result, gamma, confidence_threshold, eos_token, draft_kv)\n",
+ "    accepted_tokens, num_accepted, verify_kv = verify(drafted, drafted_probs, eos_token, verify_kv)\n",
+ "\n",
+ "    total_drafted += len(drafted_probs)\n",
+ "    total_accepted += num_accepted\n",
+ "\n",
+ "    valid_len = result.shape[-1] + num_accepted\n",
+ "    result = torch.cat([result, torch.tensor([accepted_tokens], device=device)], dim=-1)\n",
+ "\n",
+ "    if draft_kv is not None:\n",
+ "        draft_kv.crop(max_length=valid_len)\n",
+ "    if verify_kv is not None:\n",
+ "        verify_kv.crop(max_length=valid_len)\n",
+ "\n",
+ "    if eos_token in accepted_tokens or im_end_token in accepted_tokens:\n",
+ "        break\n",
+ "\n",
+ "tokenizer.decode(result[0])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "40c92741",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6071428571428571"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "total_accepted / total_drafted"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "0c661940",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n<|im_start|>user\\nWhat is a deal flow in a VC fund?<|im_end|>\\n<|im_start|>assistant\\nA deal flow in a VC fund refers to the collection and processing of new investment opportunities presented to the fund for screening, evaluation, and ultimately investment.\\n\\nHere’s a basic overview:\\n\\n* **Deal Flow Collection**: Fund managers typically collect deals through email, cold calls, calendars, meeting notes, investor referrals, and referrals by other investors. They utilize various channels including online searches, investor networks, and industry'"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inputs = tokenizer(prompt, return_tensors=\"pt\").to(device)\n",
+ "result = inputs[\"input_ids\"].clone()\n",
+ "\n",
+ "past_kv = None\n",
+ "\n",
+ "while result.shape[-1] - inputs[\"input_ids\"].shape[-1] < max_tokens:\n",
+ "    with torch.no_grad():\n",
+ "        output = verify_model(\n",
+ "            result if past_kv is None else result[:, -1:],\n",
+ "            past_key_values=past_kv,\n",
+ "            use_cache=True\n",
+ "        )\n",
+ "        logits = output.logits[:, -1, :] # batch, vocab\n",
+ "\n",
+ "    past_kv = output.past_key_values\n",
+ "    probs = torch.softmax(logits, dim=-1)\n",
+ "    next_token = torch.multinomial(probs, num_samples=1)\n",
+ "\n",
+ "    result = torch.cat([result, next_token], dim=-1)\n",
+ "    if eos_token in next_token or im_end_token in next_token:\n",
+ "        break\n",
+ "\n",
+ "tokenizer.decode(result[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c7a26417",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "speculativedecoding-B0TTdUOs-py3.13",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
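The acceptance-rate cell above (execution count 25) reports `total_accepted / total_drafted ≈ 0.607` for this run. As a back-of-envelope sanity check only (it assumes every drafted token is accepted i.i.d. with a fixed probability α, which real runs do not strictly satisfy), the analysis in Leviathan et al., "Fast Inference from Transformers via Speculative Decoding" (2023), gives the expected number of tokens produced per verify-model forward pass with lookahead γ as (1 − α^(γ+1)) / (1 − α):

```python
# Rough estimate under an i.i.d.-acceptance assumption, not a benchmark.
alpha = 0.6071  # measured: total_accepted / total_drafted
gamma = 15      # draft lookahead used in the notebook
expected_tokens = (1 - alpha ** (gamma + 1)) / (1 - alpha)
print(f"~{expected_tokens:.2f} tokens per verify forward pass")  # ~2.54
```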
main.py ADDED
@@ -0,0 +1,202 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+ import gradio as gr
+
+ set_seed(67)
+
+ device = "mps"
+
+ # Initialize models and tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct")
+ draft_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct", dtype=torch.bfloat16).to(device)
+ verify_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-3B-Instruct", dtype=torch.bfloat16).to(device)
+
+ def draft(input_ids, gamma, confidence_threshold, eos_token, past_kv):
+     generated = input_ids.clone()
+     draft_probs = []
+     for _ in range(gamma):
+         with torch.no_grad():
+             outputs = draft_model(
+                 generated if past_kv is None else generated[:, -1:],
+                 past_key_values=past_kv,
+                 use_cache=True
+             )
+             logits = outputs.logits[:, -1, :]
+             past_kv = outputs.past_key_values
+
+         probs = torch.softmax(logits, dim=-1)
+
+         confidence = probs.max().item()
+         if confidence < confidence_threshold and len(draft_probs) > 0:
+             break
+
+         next_token = torch.argmax(probs, dim=-1, keepdim=True)
+
+         draft_probs.append(probs)
+         generated = torch.cat([generated, next_token], dim=-1)
+
+         if next_token.item() == eos_token:
+             break
+
+     return generated, draft_probs, past_kv
+
+ def verify(drafted, drafted_probs, eos_token, past_kv):
+     draft_len = len(drafted_probs)
+     with torch.no_grad():
+         if past_kv is None:
+             target_outputs = verify_model(drafted, use_cache=True)
+             target_logits = target_outputs.logits[:, -draft_len - 1:-1, :]
+         else:
+             target_outputs = verify_model(
+                 drafted[:, -(draft_len + 1):],
+                 past_key_values=past_kv,
+                 use_cache=True
+             )
+             target_logits = target_outputs.logits[:, :-1, :]
+
+         past_kv = target_outputs.past_key_values
+
+     target_probs = torch.softmax(target_logits, dim=-1)
+     accepted_tokens = []
+     num_accepted = 0
+     for i in range(draft_len):
+         q = drafted_probs[i]
+         p = target_probs[:, i, :]
+         token = drafted[:, i - draft_len]
+         x = token[0].item()
+         q_x = q[0, x].item()
+         p_x = p[0, x].item()
+
+         if q_x <= p_x:
+             accepted_tokens.append(x)
+             num_accepted += 1
+         else:
+             r = torch.rand(1).item()
+             acceptance_rate = p_x / q_x
+
+             if r < acceptance_rate:
+                 accepted_tokens.append(x)
+                 num_accepted += 1
+             else:
+                 adjusted = torch.clamp(p - q, min=0)
+                 adjusted = adjusted / adjusted.sum()
+                 new_token = torch.multinomial(adjusted, num_samples=1)[0].item()
+                 accepted_tokens.append(new_token)
+                 break
+         if accepted_tokens[-1] == eos_token:
+             break
+
+     return accepted_tokens, num_accepted, past_kv
+
+ def generate_visual(prompt, max_tokens=50, gamma=15, confidence_threshold=0.5):
+     # Prepare input
+     messages = [{"role": "user", "content": prompt}]
+     formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+     eos_token = tokenizer.eos_token_id
+     im_end_token = tokenizer.convert_tokens_to_ids("<|im_end|>")
+
+     inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
+     result = inputs["input_ids"].clone()
+
+     draft_kv = None
+     verify_kv = None
+
+     total_drafted = 0
+     total_accepted = 0
+
+     steps = []
+
+     while result.shape[-1] - inputs["input_ids"].shape[-1] < max_tokens:
+         # print(steps)  # leftover debug output
+         drafted, drafted_probs, draft_kv = draft(result, gamma, confidence_threshold, eos_token, draft_kv)
+         accepted_tokens, num_accepted, verify_kv = verify(drafted, drafted_probs, eos_token, verify_kv)
+
+         total_drafted += len(drafted_probs)
+         total_accepted += num_accepted
+
+         # Extract token IDs for visualization
+         drafted_token_ids = drafted[0, -len(drafted_probs):].tolist()
+
+         step = {
+             "drafted": [tokenizer.decode([t]) for t in drafted_token_ids],
+             "accepted": num_accepted,
+             "resampled": tokenizer.decode([accepted_tokens[-1]]) if num_accepted < len(accepted_tokens) else None
+         }
+         steps.append(step)
+
+         valid_len = result.shape[-1] + num_accepted
+         result = torch.cat([result, torch.tensor([accepted_tokens], device=device)], dim=-1)
+
+         if draft_kv is not None:
+             draft_kv.crop(max_length=valid_len)
+         if verify_kv is not None:
+             verify_kv.crop(max_length=valid_len)
+
+         if eos_token in accepted_tokens or im_end_token in accepted_tokens:
+             break
+
+     # Extract final output
+     final_output = tokenizer.decode(result[0])
+
+     # Build HTML visualization
+     html = "<div style='font-family: monospace;'>"
+     html += "<div style='margin-bottom: 20px; padding: 10px; background: transparent; border: 2px solid white; border-radius: 5px;'>"
+     html += f"<b>Final Output:</b><br/>{final_output}"
+     html += "</div>"
+     html += "<div style='margin-bottom: 20px; padding: 10px; background: transparent; border: 2px solid white; border-radius: 5px;'>"
+     html += f"<b>Acceptance Rate:</b> {total_accepted}/{total_drafted} = {total_accepted/total_drafted*100:.1f}%"
+     html += "</div>"
+     html += "<div style='margin-bottom: 10px;'><b>Decoding Steps:</b></div>"
+
+     for i, step in enumerate(steps):
+         html += "<div style='margin: 10px 0; padding: 10px; border: 1px solid #ccc; border-radius: 5px;'>"
+         html += f"<b>Step {i+1}:</b> "
+
+         for j, token in enumerate(step["drafted"]):
+             # Escape HTML special characters
+             token_display = token.replace("<", "&lt;").replace(">", "&gt;")
+             if j < step["accepted"]:
+                 html += f"<span style='background: #66CC66; padding: 2px 4px; margin: 2px; border-radius: 3px;'>{token_display}</span>"
+             else:
+                 html += f"<span style='background: #FF8B9A; padding: 2px 4px; margin: 2px; text-decoration: line-through; border-radius: 3px;'>{token_display}</span>"
+
+         if step["resampled"]:
+             resampled_display = step["resampled"].replace("<", "&lt;").replace(">", "&gt;")
+             html += f" → <span style='background: #5AADCC; padding: 2px 4px; border-radius: 3px;'>{resampled_display}</span>"
+
+         html += "</div>"
+     html += "</div>"
+
+     return html
+
+ demo = gr.Interface(
+     fn=generate_visual,
+     inputs=[
+         gr.Textbox(label="Prompt", value="What is a deal flow in a VC fund?", lines=3),
+         gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Max Tokens"),
+         gr.Slider(minimum=1, maximum=30, value=15, step=1, label="Gamma (draft lookahead)"),
+         gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="Confidence Threshold")
+     ],
+     outputs=gr.HTML(label="Speculative Decoding Visualization"),
+     title="🚀 Speculative Decoding Demo",
+     description="""
+ **Speculative Decoding Visualization** using Qwen2.5-Coder models
+
+ - **Draft Model**: Qwen2.5-Coder-0.5B-Instruct (fast)
+ - **Verify Model**: Qwen2.5-Coder-3B-Instruct (accurate)
+
+ **Color Legend:**
+ - 🟢 Green = Accepted tokens from draft model
+ - 🔴 Red = Rejected tokens (with strikethrough)
+ - 🔵 Blue = Resampled tokens from verify model
+ """,
+     examples=[
+         ["What is a deal flow in a VC fund?", 80, 15, 0.5],
+         ["def fibonacci(n):", 50, 15, 0.5],
+         ["Explain the concept of attention in transformers", 60, 10, 0.6]
+     ]
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
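Because `demo.launch()` is guarded by `if __name__ == "__main__":`, `generate_visual` can also be exercised without starting the Gradio UI. A hypothetical smoke test, not part of the repo (it assumes both Qwen checkpoints can be downloaded and that the hard-coded `device = "mps"` matches your machine; edit `main.py` to use `"cuda"` or `"cpu"` otherwise):

```python
# Run from the project root, e.g. inside `poetry run python`.
# Importing main loads both models, which is slow on first use.
from main import generate_visual

html = generate_visual(
    "def fibonacci(n):",   # same prompt as one of the Gradio examples
    max_tokens=50,
    gamma=15,
    confidence_threshold=0.5,
)
print(html[:300])  # preview of the HTML the Gradio app would render
```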
poetry.lock ADDED
The diff for this file is too large to render.
pyproject.toml ADDED
@@ -0,0 +1,19 @@
+ [tool.poetry]
+ name = "speculativedecoding"
+ version = "0.1.0"
+ description = ""
+ authors = ["Harrison <zhuyiyun060209@gmail.com>"]
+ readme = "README.md"
+ package-mode = false
+
+ [tool.poetry.dependencies]
+ python = ">=3.13"
+ transformers = ">=4.57.3,<5.0.0"
+ torch = ">=2.0.0"
+ ipykernel = "^7.1.0"
+ gradio = "^6.1.0"
+
+
+ [build-system]
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
+ build-backend = "poetry.core.masonry.api"