DeepXR
/

Helion-V2.5-Rnd

Text Generation

text-generation-inference

Model card | Files and versions

Helion-V2.5-Rnd / monitoring_config.json

Trouter-Library's picture

Trouter-Library

Create monitoring_config.json

b7062e4 verified 4 months ago

4.38 kB

	{
	"monitoring": {
	"enabled": true,
	"interval_seconds": 15,
	"retention_days": 30
	},
	"metrics": {
	"system": {
	"enabled": true,
	"collect": [
	"cpu_usage",
	"memory_usage",
	"disk_usage",
	"network_io"
	]
	},
	"gpu": {
	"enabled": true,
	"collect": [
	"gpu_utilization",
	"gpu_memory_used",
	"gpu_memory_total",
	"gpu_temperature",
	"gpu_power_usage"
	],
	"alert_thresholds": {
	"temperature_celsius": 85,
	"memory_utilization_percent": 95,
	"power_watts": 400
	}
	},
	"model": {
	"enabled": true,
	"collect": [
	"requests_per_second",
	"tokens_per_second",
	"average_latency_ms",
	"p50_latency_ms",
	"p95_latency_ms",
	"p99_latency_ms",
	"error_rate",
	"active_connections",
	"queue_depth"
	]
	},
	"inference": {
	"enabled": true,
	"collect": [
	"prompt_tokens",
	"completion_tokens",
	"total_tokens",
	"generation_time_ms",
	"preprocessing_time_ms",
	"postprocessing_time_ms"
	]
	}
	},
	"alerts": {
	"enabled": true,
	"channels": [
	"email",
	"slack",
	"pagerduty"
	],
	"rules": [
	{
	"name": "high_error_rate",
	"condition": "error_rate > 0.05",
	"duration_seconds": 300,
	"severity": "critical",
	"message": "Error rate exceeded 5% for 5 minutes"
	},
	{
	"name": "high_latency",
	"condition": "p95_latency_ms > 5000",
	"duration_seconds": 180,
	"severity": "warning",
	"message": "P95 latency exceeded 5 seconds"
	},
	{
	"name": "gpu_temperature_high",
	"condition": "gpu_temperature > 85",
	"duration_seconds": 60,
	"severity": "critical",
	"message": "GPU temperature critically high"
	},
	{
	"name": "memory_pressure",
	"condition": "gpu_memory_used / gpu_memory_total > 0.95",
	"duration_seconds": 300,
	"severity": "warning",
	"message": "GPU memory utilization above 95%"
	},
	{
	"name": "low_throughput",
	"condition": "tokens_per_second < 10",
	"duration_seconds": 600,
	"severity": "warning",
	"message": "Throughput below 10 tokens/second"
	}
	]
	},
	"logging": {
	"level": "INFO",
	"format": "json",
	"outputs": [
	{
	"type": "file",
	"path": "./logs/monitoring.log",
	"rotation": "daily",
	"retention_days": 30
	},
	{
	"type": "stdout",
	"enabled": true
	},
	{
	"type": "elasticsearch",
	"enabled": false,
	"host": "localhost:9200",
	"index": "helion-metrics"
	}
	]
	},
	"prometheus": {
	"enabled": true,
	"port": 8001,
	"path": "/metrics",
	"namespace": "helion",
	"subsystem": "inference",
	"labels": {
	"model": "Helion-2.5-Rnd",
	"version": "2.5.0-rnd",
	"environment": "production"
	}
	},
	"grafana": {
	"enabled": true,
	"dashboards": [
	{
	"name": "Helion Overview",
	"file": "./monitoring/dashboards/overview.json",
	"refresh": "30s"
	},
	{
	"name": "GPU Metrics",
	"file": "./monitoring/dashboards/gpu.json",
	"refresh": "15s"
	},
	{
	"name": "Inference Performance",
	"file": "./monitoring/dashboards/inference.json",
	"refresh": "30s"
	}
	]
	},
	"health_checks": {
	"enabled": true,
	"endpoint": "/health",
	"interval_seconds": 30,
	"timeout_seconds": 10,
	"checks": [
	{
	"name": "model_loaded",
	"type": "internal",
	"critical": true
	},
	{
	"name": "gpu_available",
	"type": "internal",
	"critical": true
	},
	{
	"name": "inference_responsive",
	"type": "endpoint",
	"url": "http://localhost:8000/v1/models",
	"critical": false
	}
	]
	},
	"tracing": {
	"enabled": true,
	"sample_rate": 0.1,
	"exporter": "jaeger",
	"endpoint": "http://localhost:14268/api/traces"
	},
	"profiling": {
	"enabled": false,
	"interval_seconds": 3600,
	"duration_seconds": 300,
	"output_dir": "./profiling"
	}
	}