Codette-Reasoning / tests / test_phase5_e2e.py

Jonathan Harrison

Full Codette codebase sync — transparency release

74f2af5 1 day ago

11.3 kB

	#!/usr/bin/env python3
	"""Phase 5 End-to-End Integration Tests

	Tests the complete Phase 5 system:
	1. ReinforcementConfig tunable coefficients
	2. AdapterRouter with MemoryWeighting integration
	3. CodetteOrchestrator routing with memory context
	4. Gamma stabilization field health monitoring
	5. RoutingMetrics observability

	Run with: python test_phase5_e2e.py
	"""

	import sys
	import os
	from pathlib import Path

	# Add parent directory to path
	sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

	from reasoning_forge.memory_weighting import MemoryWeighting, ReinforcementConfig
	from reasoning_forge.coherence_field import CoherenceFieldGamma, GammaHealthMetrics, InterventionType
	from reasoning_forge.routing_metrics import RoutingMetrics, AdapterSelectionRecord
	from inference.adapter_router import AdapterRouter, RouteResult


	def test_reinforcement_config():
	    """Exercise ReinforcementConfig defaults, dict loading and dict export.

	    Asserts that a default-constructed config exposes 0.08 / 0.08 / 0.03 for
	    boost_successful / penalize_failed / reward_soft_consensus, that
	    from_dict() applies the values it is given, that to_dict() exports them,
	    and that a key missing from the input dict keeps its default value.

	    Returns:
	        True once every assertion has passed; main() treats a truthy return
	        as a passing test.
	    """
	    print("\n=== Test 1: Reinforcement Config ===")

	    # A bare constructor must expose the default coefficients expected here.
	    defaults = ReinforcementConfig()
	    expected_defaults = (
	        ("boost_successful", 0.08, "Default boost should be 0.08"),
	        ("penalize_failed", 0.08, "Default penalize should be 0.08"),
	        ("reward_soft_consensus", 0.03, "Default soft_consensus should be 0.03"),
	    )
	    for field_name, expected, failure_message in expected_defaults:
	        assert getattr(defaults, field_name) == expected, failure_message
	    print("[OK] Default coefficients loaded")

	    # from_dict() with every coefficient supplied explicitly.
	    tuned = ReinforcementConfig.from_dict(
	        {
	            "boost_successful": 0.12,
	            "penalize_failed": 0.10,
	            "reward_soft_consensus": 0.05,
	        }
	    )
	    assert tuned.boost_successful == 0.12, "Custom boost not applied"
	    assert tuned.penalize_failed == 0.10, "Custom penalize not applied"
	    print("[OK] Custom coefficients loaded from dict")

	    # to_dict() must hand the overridden value back.
	    round_tripped = tuned.to_dict()
	    assert round_tripped["boost_successful"] == 0.12, "Export failed"
	    print("[OK] Coefficients exported to dict")

	    # A key absent from the dict is expected to fall back to its default.
	    sparse = ReinforcementConfig.from_dict({"boost_successful": 0.15})
	    assert sparse.boost_successful == 0.15, "Partial override failed"
	    assert sparse.penalize_failed == 0.08, "Default not used for missing key"
	    print("[OK] Partial config with defaults works")

	    return True


	def test_adapter_router_with_memory():
	    """Check AdapterRouter behaviour when no memory weighting is attached.

	    Builds a router from three adapter names, verifies that its
	    memory_weighting attribute is None, routes a physics question with the
	    "keyword" strategy, and inspects the dict returned by explain_routing().

	    Returns:
	        True once every assertion has passed; main() treats a truthy return
	        as a passing test.
	    """
	    print("\n=== Test 2: AdapterRouter with Memory ===")

	    # Router constructed without any memory weighting.
	    plain_router = AdapterRouter(available_adapters=["newton", "davinci", "empathy"])
	    assert plain_router.memory_weighting is None, "Router should not have memory"
	    print("[OK] Router created without memory")

	    # Keyword routing of a physics question is expected to pick "newton".
	    route = plain_router.route("Explain the physics of gravity", strategy="keyword")
	    assert route.primary == "newton", "Should select newton for physics query"
	    assert route.confidence > 0.0, "Confidence should be set"
	    print(f"[OK] Routed to {route.primary} with confidence {route.confidence:.2f}")

	    # The explanation must name the primary adapter and report no memory.
	    details = plain_router.explain_routing(route)
	    assert "primary" in details, "Explanation missing primary"
	    assert details["memory_aware"] is False, "Should show memory not available"
	    print("[OK] Routing explanation works without memory")

	    return True


	def test_gamma_health_monitoring():
	    """Check CoherenceFieldGamma health scoring and intervention lookup.

	    First scores a three-adapter debate with two conflicts of strength 0.25
	    and equal adapter weights, expecting a gamma within [0, 1], one entry in
	    health_history and a stable status. Then scores a single-adapter debate
	    with no conflicts; collapse detection is asserted only when that gamma is
	    below 0.4, and the intervention type only when one is returned.

	    Returns:
	        True once every assertion has passed; main() treats a truthy return
	        as a passing test.
	    """
	    print("\n=== Test 3: Gamma Health Monitoring ===")

	    field = CoherenceFieldGamma()

	    class MockConflict:
	        """Minimal conflict stand-in carrying only a strength value."""

	        def __init__(self):
	            self.strength = 0.25  # Productive zone

	    # --- Healthy debate: three perspectives, equally weighted ---
	    responses = {
	        "newton": "Physics perspective",
	        "davinci": "Creative perspective",
	        "empathy": "Emotional perspective",
	    }
	    health = field.compute_health(
	        conflicts=[MockConflict() for _ in range(2)],
	        responses=responses,
	        adapter_weights=dict.fromkeys(responses, 1.0),
	    )

	    assert 0.0 <= health.gamma <= 1.0, "Gamma should be in [0, 1]"
	    assert len(field.health_history) == 1, "Should record health metric"
	    print(f"[OK] Healthy state: Gamma = {health.gamma:.3f}")
	    assert health.is_stable(), "Should be in stable zone"
	    print("[OK] Status correctly identified as stable")

	    # --- Collapse scenario: one voice, no conflicts, all weight on it ---
	    health_collapse = field.compute_health(
	        conflicts=[],
	        responses={"newton": "Only newton perspective"},
	        adapter_weights={"newton": 2.0},
	    )

	    print(f"[OK] Collapsed state: Gamma = {health_collapse.gamma:.3f}")
	    # Collapse detection is only checked when gamma fell under 0.4.
	    if health_collapse.gamma < 0.4:
	        assert health_collapse.is_collapsing(), "Should detect collapse"
	        print("[OK] Collapse correctly detected")

	    # An intervention, when one is returned, must be a diversity injection.
	    intervention = field.get_intervention(health_collapse, ["davinci", "empathy"])
	    if intervention:
	        injected = intervention.intervention_type == InterventionType.DIVERSITY_INJECTION
	        assert injected, "Should inject diversity on collapse"
	        print(f"[OK] Intervention recommended: {intervention.intervention_type.value}")

	    return True


	def test_routing_metrics():
	    """Check RoutingMetrics counters, summary, per-adapter stats and history.

	    Records one memory-boosted "quantum" route followed by four un-boosted
	    "newton" routes, then verifies the running counters, the summary dict,
	    the per-adapter statistics for "newton", and the recent-routes listing.

	    Returns:
	        True once every assertion has passed; main() treats a truthy return
	        as a passing test.
	    """
	    print("\n=== Test 4: Routing Metrics ===")

	    metrics = RoutingMetrics()
	    assert metrics.total_queries == 0, "Should start at 0"
	    print("[OK] RoutingMetrics initialized")

	    # One boosted route to "quantum".
	    record1 = RoutingMetrics.create_record(
	        query="What is quantum mechanics?",
	        primary_adapter="quantum",
	        secondary_adapters=["physics"],
	        strategy="keyword",
	        confidence_before_boost=0.75,
	        confidence_after_boost=0.85,
	        memory_boost_applied=True,
	    )
	    metrics.record_route(record1)

	    assert metrics.total_queries == 1, "Should count query"
	    assert metrics.adapter_selection_counts["quantum"] == 1, "Should count selection"
	    assert metrics.memory_boost_count == 1, "Should count boost"
	    print("[OK] Route recorded and metrics updated")

	    # Four more routes, all to "newton" and without a memory boost. The loop
	    # index is not needed, so it is bound to "_". A new record is created on
	    # every iteration, exactly as before.
	    for _ in range(4):
	        record = RoutingMetrics.create_record(
	            query="Another query",
	            primary_adapter="newton",
	            secondary_adapters=[],
	            strategy="keyword",
	            confidence_before_boost=0.6,
	            confidence_after_boost=0.6,
	            memory_boost_applied=False,
	        )
	        metrics.record_route(record)

	    assert metrics.total_queries == 5, "Should have 5 queries"
	    assert metrics.adapter_selection_counts["newton"] == 4, "Newton selected 4 times"
	    # Plain string: the message has no placeholders, so no f-prefix is needed.
	    print("[OK] Recorded 5 queries total")

	    # The summary must cover both adapters seen so far.
	    summary = metrics.get_summary()
	    assert summary["total_queries"] == 5, "Summary should show total queries"
	    assert "quantum" in summary["adapter_stats"], "Should have quantum stats"
	    assert "newton" in summary["adapter_stats"], "Should have newton stats"
	    print(f"[OK] Summary generated with {len(summary['adapter_stats'])} adapters")

	    # Per-adapter statistics for "newton".
	    newton_stats = metrics.get_adapter_stats("newton")
	    assert newton_stats["total_selections"] == 4, "Newton should have 4 selections"
	    assert newton_stats["memory_boost_hits"] == 0, "Newton had no boosts"
	    print(f"[OK] Adapter stats: {newton_stats['total_selections']} selections")

	    # The recent-routes listing is expected to start with a "newton" route.
	    recent = metrics.get_recent_routes(limit=3)
	    assert len(recent) == 3, "Should return 3 recent routes"
	    assert recent[0]["primary"] == "newton", "Most recent should be newton"
	    print("[OK] Recent routes retrieved")

	    return True


	def test_phase5_integration():
	    """Run the router, the Gamma field and the metrics tracker together.

	    Routes one query through an AdapterRouter built with
	    memory_weighting=None, records the routing decision in RoutingMetrics,
	    scores a mock three-response debate with CoherenceFieldGamma, and checks
	    that the metrics summary and the exported Gamma data have the expected
	    keys and counts.

	    Returns:
	        True once every assertion has passed; main() treats a truthy return
	        as a passing test.
	    """
	    print("\n=== Test 5: Phase 5 Complete Integration ===")

	    # The router accepts a memory_weighting argument; none is loaded here.
	    router = AdapterRouter(
	        available_adapters=["newton", "davinci", "empathy", "philosophy"],
	        memory_weighting=None,
	    )
	    print("[OK] Router created with Phase 5 integration ready")

	    field = CoherenceFieldGamma()
	    print("[OK] Gamma stabilization field initialized")

	    tracker = RoutingMetrics()
	    print("[OK] Routing metrics tracker initialized")

	    # One complete routing cycle: route the query, then record the decision.
	    query = "How should society balance freedom and security?"
	    route = router.route(query, strategy="keyword", max_adapters=2)
	    tracker.record_route(
	        RoutingMetrics.create_record(
	            query=query,
	            primary_adapter=route.primary,
	            secondary_adapters=route.secondary,
	            strategy=route.strategy,
	            confidence_before_boost=0.7,
	            confidence_after_boost=0.7,
	            memory_boost_applied=False,
	        )
	    )

	    class MockConflict:
	        """Conflict stand-in naming two agents, with a fixed strength of 0.15."""

	        def __init__(self, agent_a, agent_b):
	            self.agent_a = agent_a
	            self.agent_b = agent_b
	            self.strength = 0.15

	    # Mock debate: one conflict and three adapter responses.
	    conflicts = [MockConflict("newton", "philosophy")]
	    responses = {
	        "newton": "Mathematical security metrics",
	        "philosophy": "Ethical freedom considerations",
	        "davinci": "Innovative balance approaches",
	    }

	    health = field.compute_health(conflicts, responses)
	    # Collapse is checked first, then groupthink; anything else is "stable".
	    status = (
	        "collapsing"
	        if health.is_collapsing()
	        else "groupthinking"
	        if health.is_groupthinking()
	        else "stable"
	    )
	    print(f"[OK] Health computed: Gamma = {health.gamma:.3f} ({status})")

	    # Collect the metrics from both components before asserting on them.
	    summary = tracker.get_summary()
	    gamma_data = field.export_metrics()

	    assert summary["total_queries"] == 1, "Should have recorded 1 query"
	    assert "health_history" in gamma_data, "Should export health history"
	    print("[OK] All Phase 5 components working together")

	    return True


	def main():
	    """Run every Phase 5 test in order and print a pass/fail report.

	    A test counts as passed when it returns a truthy value. A falsy return
	    or any raised Exception counts as failed; for an exception the traceback
	    is printed as well.

	    Returns:
	        0 when no test failed, otherwise 1 (used as the process exit code).
	    """
	    import traceback

	    banner = "=" * 70
	    print(banner)
	    print("PHASE 5 END-TO-END INTEGRATION TESTS")
	    print(banner)

	    suite = [
	        ("Reinforcement Config", test_reinforcement_config),
	        ("AdapterRouter Memory", test_adapter_router_with_memory),
	        ("Gamma Health Monitoring", test_gamma_health_monitoring),
	        ("Routing Metrics", test_routing_metrics),
	        ("Phase 5 Integration", test_phase5_integration),
	    ]

	    passed = failed = 0

	    for label, run_test in suite:
	        try:
	            if not run_test():
	                failed += 1
	                print(f"\n[FAIL] {label} FAILED")
	            else:
	                passed += 1
	                print(f"\n[PASS] {label} PASSED")
	        except Exception as exc:
	            # Keep going after a failure so every test is still reported.
	            failed += 1
	            print(f"\n[FAIL] {label} ERROR: {exc}")
	            traceback.print_exc()

	    print("\n" + banner)
	    print(f"RESULTS: {passed} passed, {failed} failed")
	    print(banner)

	    return 1 if failed else 0


	# Script entry point: exit with the status code returned by main().
	if __name__ == "__main__":
	    raise SystemExit(main())