| { |
| "performance_profiles": { |
| "max_throughput": { |
| "description": "Optimized for maximum throughput with batching", |
| "use_case": "High-volume production serving", |
| "settings": { |
| "batch_size": 32, |
| "max_batch_total_tokens": 8192, |
| "tensor_parallel_size": 4, |
| "pipeline_parallel_size": 1, |
| "gpu_memory_utilization": 0.95, |
| "max_num_seqs": 256, |
| "max_num_batched_tokens": 8192, |
| "enable_prefix_caching": true, |
| "enable_chunked_prefill": true, |
| "max_prefill_tokens": 4096 |
| }, |
| "expected_performance": { |
| "throughput_tokens_per_second": "80-120", |
| "latency_p50_ms": "200-400", |
| "latency_p95_ms": "400-800", |
| "concurrent_requests": "64-128" |
| } |
| }, |
| "low_latency": { |
| "description": "Optimized for lowest latency with small batches", |
| "use_case": "Interactive applications, real-time responses", |
| "settings": { |
| "batch_size": 1, |
| "max_batch_total_tokens": 4096, |
| "tensor_parallel_size": 4, |
| "pipeline_parallel_size": 1, |
| "gpu_memory_utilization": 0.90, |
| "max_num_seqs": 32, |
| "max_num_batched_tokens": 4096, |
| "enable_prefix_caching": false, |
| "enable_chunked_prefill": false, |
| "use_flash_attention": true |
| }, |
| "expected_performance": { |
| "throughput_tokens_per_second": "30-50", |
| "latency_p50_ms": "80-150", |
| "latency_p95_ms": "150-300", |
| "concurrent_requests": "8-16" |
| } |
| }, |
| "balanced": { |
| "description": "Balanced configuration for general use", |
| "use_case": "General purpose inference", |
| "settings": { |
| "batch_size": 8, |
| "max_batch_total_tokens": 4096, |
| "tensor_parallel_size": 2, |
| "pipeline_parallel_size": 1, |
| "gpu_memory_utilization": 0.90, |
| "max_num_seqs": 64, |
| "max_num_batched_tokens": 4096, |
| "enable_prefix_caching": true, |
| "enable_chunked_prefill": true |
| }, |
| "expected_performance": { |
| "throughput_tokens_per_second": "50-80", |
| "latency_p50_ms": "150-250", |
| "latency_p95_ms": "250-500", |
| "concurrent_requests": "32-64" |
| } |
| }, |
| "memory_efficient": { |
| "description": "Optimized for lower memory usage", |
| "use_case": "Limited GPU memory, smaller deployments", |
| "settings": { |
| "batch_size": 4, |
| "max_batch_total_tokens": 2048, |
| "tensor_parallel_size": 2, |
| "pipeline_parallel_size": 1, |
| "gpu_memory_utilization": 0.80, |
| "max_num_seqs": 32, |
| "max_num_batched_tokens": 2048, |
| "enable_prefix_caching": false, |
| "enable_chunked_prefill": false, |
| "swap_space": 8 |
| }, |
| "expected_performance": { |
| "throughput_tokens_per_second": "20-40", |
| "latency_p50_ms": "200-350", |
| "latency_p95_ms": "350-600", |
| "concurrent_requests": "16-32" |
| } |
| } |
| }, |
| "hardware_optimizations": { |
| "nvidia_a100": { |
| "recommended_profile": "max_throughput", |
| "gpu_count": 2, |
| "optimizations": [ |
| "Enable Flash Attention 2", |
| "Use tensor parallelism", |
| "Enable prefix caching", |
| "Optimize batch sizes" |
| ], |
| "settings": { |
| "tensor_parallel_size": 2, |
| "gpu_memory_utilization": 0.95, |
| "enable_cuda_graph": true |
| } |
| }, |
| "nvidia_h100": { |
| "recommended_profile": "max_throughput", |
| "gpu_count": 4, |
| "optimizations": [ |
| "Enable FP8 computation", |
| "Use larger batch sizes", |
| "Enable advanced caching", |
| "Utilize higher memory bandwidth" |
| ], |
| "settings": { |
| "tensor_parallel_size": 4, |
| "gpu_memory_utilization": 0.95, |
| "enable_cuda_graph": true, |
| "max_batch_size": 64 |
| } |
| }, |
| "nvidia_v100": { |
| "recommended_profile": "memory_efficient", |
| "gpu_count": 4, |
| "optimizations": [ |
| "Reduce batch sizes", |
| "Enable memory swapping", |
| "Use FP16 precision", |
| "Optimize tensor parallelism" |
| ], |
| "settings": { |
| "tensor_parallel_size": 4, |
| "gpu_memory_utilization": 0.85, |
| "swap_space": 16, |
| "max_batch_size": 8 |
| } |
| } |
| }, |
| "context_length_optimizations": { |
| "short_context": { |
| "description": "Optimized for contexts under 4K tokens", |
| "max_tokens": 4096, |
| "settings": { |
| "max_model_len": 4096, |
| "block_size": 16, |
| "enable_prefix_caching": false |
| }, |
| "throughput_multiplier": 2.0 |
| }, |
| "medium_context": { |
| "description": "Optimized for contexts 4K-32K tokens", |
| "max_tokens": 32768, |
| "settings": { |
| "max_model_len": 32768, |
| "block_size": 32, |
| "enable_prefix_caching": true, |
| "enable_chunked_prefill": true |
| }, |
| "throughput_multiplier": 1.0 |
| }, |
| "long_context": { |
| "description": "Optimized for contexts 32K-131K tokens", |
| "max_tokens": 131072, |
| "settings": { |
| "max_model_len": 131072, |
| "block_size": 64, |
| "enable_prefix_caching": true, |
| "enable_chunked_prefill": true, |
| "max_num_batched_tokens": 4096 |
| }, |
| "throughput_multiplier": 0.5 |
| } |
| }, |
| "workload_patterns": { |
| "batch_processing": { |
| "description": "Offline batch processing workloads", |
| "characteristics": { |
| "latency_sensitive": false, |
| "throughput_priority": "high", |
| "batch_sizes": "large" |
| }, |
| "recommended_settings": { |
| "profile": "max_throughput", |
| "batch_size": 32, |
| "concurrent_requests": 128, |
| "enable_async": true |
| } |
| }, |
| "interactive": { |
| "description": "Real-time interactive applications", |
| "characteristics": { |
| "latency_sensitive": true, |
| "throughput_priority": "medium", |
| "batch_sizes": "small" |
| }, |
| "recommended_settings": { |
| "profile": "low_latency", |
| "batch_size": 1, |
| "concurrent_requests": 16, |
| "enable_streaming": true |
| } |
| }, |
| "api_serving": { |
| "description": "Production API serving", |
| "characteristics": { |
| "latency_sensitive": true, |
| "throughput_priority": "high", |
| "batch_sizes": "medium" |
| }, |
| "recommended_settings": { |
| "profile": "balanced", |
| "batch_size": 8, |
| "concurrent_requests": 64, |
| "enable_auto_scaling": true |
| } |
| } |
| }, |
| "monitoring_metrics": { |
| "critical": [ |
| "requests_per_second", |
| "tokens_per_second", |
| "p95_latency_ms", |
| "error_rate", |
| "gpu_memory_utilization" |
| ], |
| "important": [ |
| "p50_latency_ms", |
| "p99_latency_ms", |
| "queue_depth", |
| "cache_hit_rate", |
| "active_requests" |
| ], |
| "optional": [ |
| "gpu_temperature", |
| "power_usage", |
| "batch_size_distribution", |
| "context_length_distribution" |
| ] |
| }, |
| "auto_tuning": { |
| "enabled": false, |
| "parameters": [ |
| "batch_size", |
| "tensor_parallel_size", |
| "gpu_memory_utilization" |
| ], |
| "optimization_goal": "maximize_throughput", |
| "constraints": { |
| "max_latency_ms": 1000, |
| "min_throughput_tps": 30 |
| }, |
| "tuning_duration_minutes": 30 |
| }, |
| "troubleshooting": { |
| "high_latency": { |
| "possible_causes": [ |
| "Large batch sizes", |
| "Long context lengths", |
| "Insufficient GPU memory", |
| "Network bottlenecks" |
| ], |
| "solutions": [ |
| "Reduce batch size", |
| "Enable prefix caching", |
| "Increase tensor parallelism", |
| "Optimize network configuration" |
| ] |
| }, |
| "low_throughput": { |
| "possible_causes": [ |
| "Small batch sizes", |
| "Underutilized GPUs", |
| "Disabled optimizations", |
| "Suboptimal parallelism" |
| ], |
| "solutions": [ |
| "Increase batch size", |
| "Enable chunked prefill", |
| "Adjust tensor parallelism", |
| "Enable prefix caching" |
| ] |
| }, |
| "out_of_memory": { |
| "possible_causes": [ |
| "Batch size too large", |
| "Context length too long", |
| "GPU memory fragmentation", |
| "Insufficient tensor parallelism" |
| ], |
| "solutions": [ |
| "Reduce batch size", |
| "Increase tensor parallelism", |
| "Reduce max_model_len", |
| "Enable memory swapping" |
| ] |
| } |
| } |
| } |