| { |
| "performance_profiles": { |
| "max_throughput": { |
| "description": "Optimized for maximum throughput with batching", |
| "use_case": "High-volume production serving", |
| "settings": { |
| "batch_size": 32, |
| "max_batch_total_tokens": 8192, |
| "tensor_parallel_size": 4, |
| "pipeline_parallel_size": 1, |
| "gpu_memory_utilization": 0.95, |
| "max_num_seqs": 256, |
| "max_num_batched_tokens": 8192, |
| "enable_prefix_caching": true, |
| "enable_chunked_prefill": true, |
| "max_prefill_tokens": 4096 |
| }, |
| "expected_performance": { |
| "throughput_tokens_per_second": "80-120", |
| "latency_p50_ms": "200-400", |
| "latency_p95_ms": "400-800", |
| "concurrent_requests": "64-128" |
| } |
| }, |
| "low_latency": { |
| "description": "Optimized for lowest latency with small batches", |
| "use_case": "Interactive applications, real-time responses", |
| "settings": { |
| "batch_size": 1, |
| "max_batch_total_tokens": 4096, |
| "tensor_parallel_size": 4, |
| "pipeline_parallel_size": 1, |
| "gpu_memory_utilization": 0.90, |
| "max_num_seqs": 32, |
| "max_num_batched_tokens": 4096, |
| "enable_prefix_caching": false, |
| "enable_chunked_prefill": false, |
| "use_flash_attention": true |
| }, |
| "expected_performance": { |
| "throughput_tokens_per_second": "30-50", |
| "latency_p50_ms": "80-150", |
| "latency_p95_ms": "150-300", |
| "concurrent_requests": "8-16" |
| } |
| }, |
| "balanced": { |
| "description": "Balanced configuration for general use", |
| "use_case": "General purpose inference", |
| "settings": { |
| "batch_size": 8, |
| "max_batch_total_tokens": 4096, |
| "tensor_parallel_size": 2, |
| "pipeline_parallel_size": 1, |
| "gpu_memory_utilization": 0.90, |
| "max_num_seqs": 64, |
| "max_num_batched_tokens": 4096, |
| "enable_prefix_caching": true, |
| "enable_chunked_prefill": true |
| }, |
| "expected_performance": { |
| "throughput_tokens_per_second": "50-80", |
| "latency_p50_ms": "150-250", |
| "latency_p95_ms": "250-500", |
| "concurrent_requests": "32-64" |
| } |
| }, |
| "memory_efficient": { |
| "description": "Optimized for lower memory usage", |
| "use_case": "Limited GPU memory, smaller deployments", |
| "settings": { |
| "batch_size": 4, |
| "max_batch_total_tokens": 2048, |
| "tensor_parallel_size": 2, |
| "pipeline_parallel_size": 1, |
| "gpu_memory_utilization": 0.80, |
| "max_num_seqs": 32, |
| "max_num_batched_tokens": 2048, |
| "enable_prefix_caching": false, |
| "enable_chunked_prefill": false, |
| "swap_space": 8 |
| }, |
| "expected_performance": { |
| "throughput_tokens_per_second": "20-40", |
| "latency_p50_ms": "200-350", |
| "latency_p95_ms": "350-600", |
| "concurrent_requests": "16-32" |
| } |
| } |
| }, |
| "hardware_optimizations": { |
| "nvidia_a100": { |
| "recommended_profile": "max_throughput", |
| "gpu_count": 2, |
| "optimizations": [ |
| "Enable Flash Attention 2", |
| "Use tensor parallelism", |
| "Enable prefix caching", |
| "Optimize batch sizes" |
| ], |
| "settings": { |
| "tensor_parallel_size": 2, |
| "gpu_memory_utilization": 0.95, |
| "enable_cuda_graph": true |
| } |
| }, |
| "nvidia_h100": { |
| "recommended_profile": "max_throughput", |
| "gpu_count": 4, |
| "optimizations": [ |
| "Enable FP8 computation", |
| "Use larger batch sizes", |
| "Enable advanced caching", |
| "Utilize higher memory bandwidth" |
| ], |
| "settings": { |
| "tensor_parallel_size": 4, |
| "gpu_memory_utilization": 0.95, |
| "enable_cuda_graph": true, |
| "max_batch_size": 64 |
| } |
| }, |
| "nvidia_v100": { |
| "recommended_profile": "memory_efficient", |
| "gpu_count": 4, |
| "optimizations": [ |
| "Reduce batch sizes", |
| "Enable memory swapping", |
| "Use FP16 precision", |
| "Optimize tensor parallelism" |
| ], |
| "settings": { |
| "tensor_parallel_size": 4, |
| "gpu_memory_utilization": 0.85, |
| "swap_space": 16, |
| "max_batch_size": 8 |
| } |
| } |
| }, |
| "context_length_optimizations": { |
| "short_context": { |
| "description": "Optimized for contexts under 4K tokens", |
| "max_tokens": 4096, |
| "settings": { |
| "max_model_len": 4096, |
| "block_size": 16, |
| "enable_prefix_caching": false |
| }, |
| "throughput_multiplier": 2.0 |
| }, |
| "medium_context": { |
| "description": "Optimized for contexts 4K-32K tokens", |
| "max_tokens": 32768, |
| "settings": { |
| "max_model_len": 32768, |
| "block_size": 32, |
| "enable_prefix_caching": true, |
| "enable_chunked_prefill": true |
| }, |
| "throughput_multiplier": 1.0 |
| }, |
| "long_context": { |
| "description": "Optimized for contexts 32K-131K tokens", |
| "max_tokens": 131072, |
| "settings": { |
| "max_model_len": 131072, |
| "block_size": 64, |
| "enable_prefix_caching": true, |
| "enable_chunked_prefill": true, |
| "max_num_batched_tokens": 4096 |
| }, |
| "throughput_multiplier": 0.5 |
| } |
| }, |
| "workload_patterns": { |
| "batch_processing": { |
| "description": "Offline batch processing workloads", |
| "characteristics": { |
| "latency_sensitive": false, |
| "throughput_priority": "high", |
| "batch_sizes": "large" |
| }, |
| "recommended_settings": { |
| "profile": "max_throughput", |
| "batch_size": 32, |
| "concurrent_requests": 128, |
| "enable_async": true |
| } |
| }, |
| "interactive": { |
| "description": "Real-time interactive applications", |
| "characteristics": { |
| "latency_sensitive": true, |
| "throughput_priority": "medium", |
| "batch_sizes": "small" |
| }, |
| "recommended_settings": { |
| "profile": "low_latency", |
| "batch_size": 1, |
| "concurrent_requests": 16, |
| "enable_streaming": true |
| } |
| }, |
| "api_serving": { |
| "description": "Production API serving", |
| "characteristics": { |
| "latency_sensitive": true, |
| "throughput_priority": "high", |
| "batch_sizes": "medium" |
| }, |
| "recommended_settings": { |
| "profile": "balanced", |
| "batch_size": 8, |
| "concurrent_requests": 64, |
| "enable_auto_scaling": true |
| } |
| } |
| }, |
| "monitoring_metrics": { |
| "critical": [ |
| "requests_per_second", |
| "tokens_per_second", |
| "p95_latency_ms", |
| "error_rate", |
| "gpu_memory_utilization" |
| ], |
| "important": [ |
| "p50_latency_ms", |
| "p99_latency_ms", |
| "queue_depth", |
| "cache_hit_rate", |
| "active_requests" |
| ], |
| "optional": [ |
| "gpu_temperature", |
| "power_usage", |
| "batch_size_distribution", |
| "context_length_distribution" |
| ] |
| }, |
| "auto_tuning": { |
| "enabled": false, |
| "parameters": [ |
| "batch_size", |
| "tensor_parallel_size", |
| "gpu_memory_utilization" |
| ], |
| "optimization_goal": "maximize_throughput", |
| "constraints": { |
| "max_latency_ms": 1000, |
| "min_throughput_tps": 30 |
| }, |
| "tuning_duration_minutes": 30 |
| }, |
| "troubleshooting": { |
| "high_latency": { |
| "possible_causes": [ |
| "Large batch sizes", |
| "Long context lengths", |
| "Insufficient GPU memory", |
| "Network bottlenecks" |
| ], |
| "solutions": [ |
| "Reduce batch size", |
| "Enable prefix caching", |
| "Increase tensor parallelism", |
| "Optimize network configuration" |
| ] |
| }, |
| "low_throughput": { |
| "possible_causes": [ |
| "Small batch sizes", |
| "Underutilized GPUs", |
| "Disabled optimizations", |
| "Suboptimal parallelism" |
| ], |
| "solutions": [ |
| "Increase batch size", |
| "Enable chunked prefill", |
| "Adjust tensor parallelism", |
| "Enable prefix caching" |
| ] |
| }, |
| "out_of_memory": { |
| "possible_causes": [ |
| "Batch size too large", |
| "Context length too long", |
| "GPU memory fragmentation", |
| "Insufficient tensor parallelism" |
| ], |
| "solutions": [ |
| "Reduce batch size", |
| "Increase tensor parallelism", |
| "Reduce max_model_len", |
| "Enable memory swapping" |
| ] |
| } |
| } |
| } |