import os
from dataclasses import dataclass
from typing import Tuple, List

import gradio as gr
import spaces
import transformers
from huggingface_hub import HfApi, CommitOperationAdd
from transformers import AutoProcessor
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class

@dataclass
class CommitInfo:
    repo_url: str

def parse_ignore_list(ignore_str: str) -> List[str]:
    """Parse a comma-separated ignore-list string into a list of patterns,
    e.g. "re:.*lm_head,re:vision_model.*" -> ["re:.*lm_head", "re:vision_model.*"]
    """
    return [item.strip() for item in ignore_str.split(',') if item.strip()]

def create_quantized_model(
    model_id: str,
    work_dir: str,
    ignore_list: List[str],
    model_class_name: str
) -> Tuple[str, List[Tuple[str, Exception]]]:
    """Quantize a model to FP8 and save it to disk"""
    errors = []
    try:
        # Resolve the requested model class by name, e.g. "AutoModelForCausalLM"
        model_class = getattr(transformers, model_class_name)
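        # wrap_hf_model_class adapts the transformers class for llmcompressor's
        # oneshot flow; this is what lets non-default classes (e.g. the
        # vision-language classes listed in the description below) be quantized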
        wrapped_model_class = wrap_hf_model_class(model_class)
        # Load model with ZeroGPU
        model = wrapped_model_class.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype="auto",
            trust_remote_code=True,
            _attn_implementation="eager"
        )
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        # Configure quantization
        recipe = QuantizationModifier(
            targets="Linear",
            scheme="FP8_DYNAMIC",
            ignore=ignore_list,
        )
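        # FP8_DYNAMIC = static per-channel FP8 weights plus dynamic per-token
        # FP8 activations; layers matching the `ignore` patterns keep their
        # original precision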
        # Apply quantization
        save_dir = os.path.join(work_dir, f"{model_id.split('/')[-1]}-FP8-dynamic")
        oneshot(model=model, recipe=recipe, output_dir=save_dir)
        processor.save_pretrained(save_dir)
        return save_dir, errors
    except Exception as e:
        errors.append((model_id, e))
        raise

def push_to_hub(
    api: HfApi,
    model_id: str,
    quantized_path: str,
    token: str,
    ignore_list: List[str],
    model_class_name: str,
) -> CommitInfo:
    """Create a new repository with the quantized model"""
    # Name the new repo after the source model
    new_model_name = f"{model_id.split('/')[-1]}-fp8"
    # Get the token owner's username
    token_owner = api.whoami(token)["name"]
    # Create the new repo under the token owner's account
    target_repo = f"{token_owner}/{new_model_name}"
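    # e.g. "org/Some-Model" is published as "<token-owner>/Some-Model-fp8"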
    # Create model card content
    model_card = f"""---
language:
- en
license: apache-2.0
tags:
- fp8
- quantized
- llmcompressor
base_model: {model_id}
quantization_config:
  ignored_layers: {ignore_list}
  model_class: {model_class_name}
---

# {new_model_name}

This is an FP8-quantized version of [{model_id}](https://huggingface.co/{model_id}) using [LLM Compressor](https://github.com/vllm-project/llm-compressor).

## Quantization Details
- Weights quantized to FP8 with static per-channel PTQ
- Activations quantized to FP8 with dynamic per-token quantization
- Linear layers targeted for quantization
- Ignored layers: {ignore_list}
- Model class: {model_class_name}

## Usage

```python
from transformers import {model_class_name}, AutoProcessor

model = {model_class_name}.from_pretrained("{target_repo}")
processor = AutoProcessor.from_pretrained("{target_repo}")
```
"""
    # Create new repository
    api.create_repo(
        repo_id=target_repo,
        private=False,
        exist_ok=True,
    )

    # Prepare operations for upload
    operations = [
        CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=model_card.encode("utf-8")),
    ]

    # Add all files from the quantized model
    for root, _, files in os.walk(quantized_path):
        for file in files:
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, quantized_path)
            operations.append(
                CommitOperationAdd(
                    path_in_repo=relative_path,
                    path_or_fileobj=file_path
                )
            )
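    # A single create_commit keeps the upload atomic; for very large
    # checkpoints, HfApi.upload_folder on the save directory is a simpler
    # alternative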
    # Upload files
    api.create_commit(
        repo_id=target_repo,
        operations=operations,
        commit_message=f"Add FP8 quantized version of {model_id}",
    )

    return CommitInfo(repo_url=f"https://huggingface.co/{target_repo}")

# 15 minutes timeout for large models
@spaces.GPU(duration=900)
def run(
    model_id: str,
    token: str,
    ignore_str: str,
    model_class_name: str
) -> str:
| """Main function to handle quantization and model upload""" | |
| if not token or model_id == "": | |
| return """ | |
| ### Invalid input π | |
| Please provide both a token and model_id. | |
| """ | |
    try:
        # Parse ignore list
        ignore_list = parse_ignore_list(ignore_str)

        # Set up API with the user's token
        api = HfApi(token=token)

        print("Processing model:", model_id)
        print("Ignore list:", ignore_list)
        print("Model class:", model_class_name)

        # Create working directory
        work_dir = "quantized_models"
        os.makedirs(work_dir, exist_ok=True)

        # Quantize model
        quantized_path, errors = create_quantized_model(
            model_id,
            work_dir,
            ignore_list,
            model_class_name
        )
        # Upload quantized model to a new repository
        commit_info = push_to_hub(
            api,
            model_id,
            quantized_path,
            token,
            ignore_list,
            model_class_name
        )

        response = f"""
### Success 🔥

Your model has been successfully quantized to FP8 and uploaded to a new repository:

[{commit_info.repo_url}]({commit_info.repo_url})

Configuration:
- Ignored layers: {ignore_list}
- Model class: {model_class_name}

You can use this model directly with the transformers library!
"""
        if errors:
            response += "\nWarnings during quantization:\n"
            response += "\n".join(f"Warning for {model}: {e}" for model, e in errors)

        return response
    except Exception as e:
        return f"""
### Error 😢

An error occurred during processing:

{str(e)}
"""

# Gradio Interface
DESCRIPTION = """
# Convert any model to FP8 using LLM Compressor

This Space quantizes your model to FP8 with LLM Compressor and creates a new model repository under your account.

The steps are:
1. Paste your HuggingFace token (from hf.co/settings/tokens) - it needs write access
2. Enter the ID of the model you want to quantize
3. (Optional) Customize the ignored layers and model class
4. Click "Submit"
5. You'll get a link to your new quantized model repository on your profile! 🔥

## Advanced Options:
- **Ignore List**: Comma-separated list of layer patterns to ignore during quantization. Examples:
  - Llama: `lm_head`
  - Phi3v: `re:.*lm_head,re:model.vision_embed_tokens.*`
  - Llama Vision: `re:.*lm_head,re:multi_modal_projector.*,re:vision_model.*`
- **Model Class**: Specific model class from transformers (default: AutoModelForCausalLM). Examples:
  - `AutoModelForCausalLM`
  - `MllamaForConditionalGeneration`
  - `LlavaForConditionalGeneration`

Note:
- Processing may take several minutes depending on the model size
- The quantized model is created as a new public repository under your account
- Your token needs write access to create the new repository
"""

title = "FP8 Quantization with LLM Compressor"

with gr.Blocks(title=title) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column():
            model_id = gr.Text(
                max_lines=1,
                label="model_id",
                placeholder="huggingface/model-name"
            )
            token = gr.Text(
                max_lines=1,
                label="your_hf_token (requires write access)",
                placeholder="hf_..."
            )
            ignore_str = gr.Text(
                max_lines=1,
                label="ignore_list (comma-separated)",
                placeholder="re:.*lm_head,re:vision_model.*",
                value="re:.*lm_head"
            )
            model_class_name = gr.Text(
                max_lines=1,
                label="model_class_name (optional)",
                placeholder="AutoModelForCausalLM",
                value="AutoModelForCausalLM"
            )
            with gr.Row():
                clean = gr.ClearButton([model_id, token, ignore_str, model_class_name])
                submit = gr.Button("Submit", variant="primary")
        with gr.Column():
            output = gr.Markdown()
    submit.click(
        run,
        inputs=[model_id, token, ignore_str, model_class_name],
        outputs=output,
        concurrency_limit=1
    )

demo.queue(max_size=10).launch(show_api=True)
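
# When run locally with `python app.py` rather than on ZeroGPU hardware, the
# @spaces.GPU decorator is a no-op and the model loads on whatever device
# device_map="auto" selects.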