"""
Workflow Orchestrator - Integrates all phases into complete pipeline.
Phase 5: Complete end-to-end workflow with all MCP integrations.
"""
import os
import logging
import asyncio
from typing import Callable, Dict, List, Optional
from pathlib import Path
# Phase 1-2: Classification
from src.agents.classifier import CodeClassifier
from src.agents.pattern_integration import PatternMatcherIntegration
from src.utils.file_handler import FileHandler
# Phase 3: Search
from src.search.vector_store import CodeSearchEngine
# Phase 4: Analysis & Transformation
from src.agents.analyzer import CodeAnalyzer
from src.agents.transformer import CodeTransformer
# Phase 5: Testing & GitHub
from src.agents.test_generator import CodeTestGenerator
from src.sandbox.validator import ModalSandboxValidator
# Lazy import to avoid circular dependency
GitHubMCPClient = None
logger = logging.getLogger(__name__)
class ModernizationOrchestrator:
"""
Orchestrates the complete code modernization workflow.
Integrates all 5 phases into a seamless pipeline.
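A minimal usage sketch (illustrative only; assumes the agent, search, and
sandbox dependencies imported above are installed and configured, and that
the call runs inside an asyncio event loop):

    orchestrator = ModernizationOrchestrator(use_intelligent_matcher=True)
    results = await orchestrator.modernize_repository("legacy_repo.zip")
    print(orchestrator.generate_report(results))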
"""
def __init__(self, use_intelligent_matcher: bool = True):
"""Initialize orchestrator with all components."""
logger.info("Initializing ModernizationOrchestrator")
# Phase 1-2 components
self.use_intelligent_matcher = use_intelligent_matcher
if use_intelligent_matcher:
self.pattern_integration = PatternMatcherIntegration(
use_intelligent_matcher=True,
cache_dir=".pattern_cache"
)
logger.info("Using IntelligentPatternMatcher")
else:
self.classifier = CodeClassifier()
logger.info("Using legacy CodeClassifier")
self.file_handler = FileHandler()
# Phase 3 components
self.search_engine = None # Initialized per repo
# Phase 4 components
self.analyzer = CodeAnalyzer()
self.transformer = CodeTransformer()
# Phase 5 components
self.test_generator = CodeTestGenerator()
self.validator = ModalSandboxValidator()
# Lazy load GitHub client to avoid circular import
self.github_client = None
logger.info("ModernizationOrchestrator initialized successfully")
async def modernize_repository(
self,
repo_path: str,
target_version: str = "Python 3.14",
create_pr: bool = False,
repo_url: Optional[str] = None,
github_token: Optional[str] = None,
progress_callback: Optional[Callable] = None
) -> Dict:
"""
Complete modernization workflow for a repository.
Args:
repo_path: Path to repository (ZIP or directory)
target_version: Target language/framework version
create_pr: Whether to create GitHub PR
repo_url: GitHub repository URL (required if create_pr=True)
github_token: GitHub personal access token (optional, uses .env if not provided)
progress_callback: Optional callback function for progress updates
Returns:
Dictionary with complete modernization results
"""
logger.info(f"Starting modernization for {repo_path}")
def update_progress(phase: str, message: str):
"""Helper to call progress callback if provided."""
if progress_callback:
progress_callback(phase, message)
results = {
"success": False,
"phases": {},
"statistics": {},
"errors": []
}
try:
# Phase 1: Extract and discover files
logger.info("Phase 1: File discovery")
update_progress("Phase 1", "Extracting and discovering files...")
if repo_path.endswith('.zip'):
extract_path = self.file_handler.extract_repo(repo_path)
else:
extract_path = repo_path
files = self.file_handler.list_code_files(extract_path)
logger.info(f"Discovered {len(files)} code files")
update_progress("Phase 1", f"Discovered {len(files)} code files")
results['phases']['discovery'] = {
"files_found": len(files),
"repo_path": extract_path
}
# Phase 2: Classify files
logger.info("Phase 2: File classification")
update_progress("Phase 2", "Classifying files with AI pattern detection...")
# Read file contents for intelligent matching
file_contents = {}
if self.use_intelligent_matcher:
logger.info("Reading file contents for intelligent pattern matching...")
for file_path in files[:50]: # Limit to 50 files for demo
try:
full_path = os.path.join(extract_path, file_path)
content = self.file_handler.read_file(full_path)
if content:
file_contents[file_path] = content
except Exception as e:
logger.warning(f"Could not read {file_path}: {e}")
classifications = self.pattern_integration.classify_files(
list(file_contents.keys()),
file_contents
)
# Get detailed statistics
analyses = self.pattern_integration.pattern_matcher.analyze_batch(file_contents)
stats = self.pattern_integration.generate_statistics(analyses)
logger.info(f"Intelligent classification: {stats['modernize_high']} high, "
f"{stats['modernize_low']} low, {stats['skip']} skip")
logger.info(f"Detected {stats['patterns_detected']} patterns across {stats['total_files']} files")
else:
classifications = self.classifier.classify_files(files)
stats = None
modernize_high = [f for f, c in classifications.items() if c == 'modernize_high']
modernize_low = [f for f, c in classifications.items() if c == 'modernize_low']
skip_files = [f for f, c in classifications.items() if c == 'skip']
logger.info(f"Classification: {len(modernize_high)} high, {len(modernize_low)} low, {len(skip_files)} skip")
results['phases']['classification'] = {
"modernize_high": len(modernize_high),
"modernize_low": len(modernize_low),
"skip": len(skip_files),
"classifications": classifications,
"intelligent_stats": stats if self.use_intelligent_matcher else None
}
# Phase 3: Semantic search and pattern grouping
logger.info("Phase 3: Semantic search")
update_progress("Phase 3", "Building semantic index with LlamaIndex...")
self.search_engine = CodeSearchEngine(persist_dir=None)
# Build index for high-priority files
files_to_modernize = modernize_high + modernize_low
if files_to_modernize:
self.search_engine.build_index(extract_path) # Build index from repo
# Find pattern groups
pattern_groups = self._find_pattern_groups(files_to_modernize[:20])
logger.info(f"Found {len(pattern_groups)} pattern groups")
results['phases']['search'] = {
"indexed_files": min(len(files_to_modernize), 100),
"pattern_groups": len(pattern_groups)
}
else:
pattern_groups = []
results['phases']['search'] = {"message": "No files to modernize"}
# Phase 4: Analysis and transformation
logger.info("Phase 4: Code transformation")
update_progress("Phase 4", "Analyzing and transforming code...")
transformations = []
# Use intelligent pattern data if available
if self.use_intelligent_matcher and file_contents:
logger.info("Using intelligent pattern analysis for transformation")
# Get prioritized files from intelligent matcher
prioritized = self.pattern_integration.pattern_matcher.prioritize_files(analyses)
# Process top priority files
files_to_transform = [
(fp, analysis) for fp, analysis in prioritized
if analysis.requires_modernization
][:10] # Limit to 10 files for demo
logger.info(f"Processing {len(files_to_transform)} high-priority files with detailed pattern data")
total_files = len(files_to_transform)
for idx, (file_path, file_analysis) in enumerate(files_to_transform, 1):
try:
update_progress("Phase 4", f"Transforming file {idx}/{total_files}: {Path(file_path).name}")
original_code = file_contents.get(file_path, "")
if not original_code:
continue
# Convert intelligent pattern analysis to transformation plan
transformation_plan = self.pattern_integration.get_transformation_plan(file_analysis)
# Transform using detailed pattern information
modernized_code = await self.transformer.transform_code(
file_path,
original_code,
transformation_plan
)
transformations.append({
"file_path": file_path,
"original_code": original_code,
"modernized_code": modernized_code,
"analysis": transformation_plan,
"patterns_addressed": [p['pattern'] for p in transformation_plan['steps']],
"pattern_details": file_analysis.patterns # Include detailed pattern info
})
except Exception as e:
logger.error(f"Error transforming {file_path}: {e}")
results['errors'].append(f"Transformation error for {file_path}: {e}")
else:
# Fallback to legacy pattern grouping
logger.info("Using legacy pattern grouping for transformation")
file_to_patterns = {}
for group in pattern_groups[:5]: # Limit to 5 groups for demo
for file_path in group['files'][:3]:
if file_path not in file_to_patterns:
file_to_patterns[file_path] = []
file_to_patterns[file_path].append(group['pattern_name'])
logger.info(f"Processing {len(file_to_patterns)} unique files")
total_files = len(file_to_patterns)
for idx, (file_path, patterns) in enumerate(file_to_patterns.items(), 1):
try:
update_progress("Phase 4", f"Transforming file {idx}/{total_files}: {Path(file_path).name}")
full_path = os.path.join(extract_path, file_path)
original_code = self.file_handler.read_file(full_path)
if not original_code:
continue
# Analyze patterns
combined_pattern = " AND ".join(patterns)
analysis = await self.analyzer.analyze_pattern(
[file_path],
combined_pattern,
{file_path: original_code}
)
# Transform file
modernized_code = await self.transformer.transform_code(
file_path,
original_code,
analysis
)
transformations.append({
"file_path": file_path,
"original_code": original_code,
"modernized_code": modernized_code,
"analysis": analysis,
"patterns_addressed": patterns
})
except Exception as e:
logger.error(f"Error transforming {file_path}: {e}")
results['errors'].append(f"Transformation error for {file_path}: {e}")
logger.info(f"Transformed {len(transformations)} files")
# Save transformed files to output directory
output_dir = Path("modernized_output")
output_dir.mkdir(exist_ok=True)
for t in transformations:
try:
# Create subdirectories if needed
output_file = output_dir / t['file_path']
output_file.parent.mkdir(parents=True, exist_ok=True)
# Save modernized code
output_file.write_text(t['modernized_code'])
logger.info(f"Saved: {output_file}")
# Also save original for comparison
original_file = output_dir / "original" / t['file_path']
original_file.parent.mkdir(parents=True, exist_ok=True)
original_file.write_text(t['original_code'])
except Exception as e:
logger.error(f"Error saving {t['file_path']}: {e}")
logger.info(f"Output saved to: {output_dir.absolute()}")
results['phases']['transformation'] = {
"files_transformed": len(transformations),
"output_directory": str(output_dir.absolute())
}
# Store transformations for zip file creation
results['transformations'] = transformations
# Phase 5: Test generation and validation
logger.info("Phase 5: Test generation and validation")
update_progress("Phase 5", "Generating tests and validating in Modal sandbox...")
validation_results = []
# Create tests directory
tests_dir = output_dir / "tests"
tests_dir.mkdir(exist_ok=True)
total_tests = min(len(transformations), 10)
for idx, t in enumerate(transformations[:10], 1): # Limit to 10 for demo
try:
# Update progress
update_progress("Phase 5", f"Testing file {idx}/{total_tests}: {Path(t['file_path']).name}")
# Generate tests
tests = self.test_generator.generate_tests(
t['original_code'],
t['modernized_code'],
t['file_path']
)
# Validate and auto-fix export issues
if tests:
from src.agents.code_validator import validate_and_fix_code
# Detect language from file extension
file_ext = Path(t['file_path']).suffix.lower()
language_map = {
'.ts': 'typescript',
'.js': 'javascript',
'.py': 'python',
'.java': 'java'
}
language = language_map.get(file_ext, 'unknown')
# Validate and fix
fixed_code, is_valid, issues = validate_and_fix_code(
t['modernized_code'],
tests,
language
)
if not is_valid:
logger.warning(f"Code validation issues for {t['file_path']}: {issues}")
if fixed_code != t['modernized_code']:
logger.info(f"Auto-fixed export issues in {t['file_path']}")
t['modernized_code'] = fixed_code
# Re-save the fixed source file at the same relative path used above
output_file = output_dir / t['file_path']
output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(fixed_code)
# Save test file
if tests:
test_file = tests_dir / f"test_{Path(t['file_path']).name}"
test_file.write_text(tests)
logger.info(f"Saved test: {test_file}")
# Validate in sandbox
validation = self.validator.validate_transformation(
t['original_code'],
t['modernized_code'],
tests,
file_path=t['file_path']
)
validation['file_path'] = t['file_path']
validation_results.append(validation)
except Exception as e:
logger.error(f"Error validating {t['file_path']}: {e}")
results['errors'].append(f"Validation error: {e}")
# Calculate aggregate test results
total_tests = sum(v.get('tests_run', 0) for v in validation_results)
total_passed = sum(v.get('tests_passed', 0) for v in validation_results)
# Fix: Only average coverage for files that have coverage data
coverage_values = [v.get('coverage_percent', 0) for v in validation_results if v.get('coverage_percent', 0) > 0]
avg_coverage = sum(coverage_values) / len(coverage_values) if coverage_values else 0.0
logger.info(f"Validation: {total_passed}/{total_tests} tests passed, {avg_coverage:.1f}% coverage")
results['phases']['validation'] = {
"files_validated": len(validation_results),
"total_tests": total_tests,
"tests_passed": total_passed,
"tests_failed": total_tests - total_passed,
"average_coverage": round(avg_coverage, 2),
"pass_rate": round(total_passed / max(total_tests, 1) * 100, 2)
}
# Phase 5b: GitHub PR creation (optional)
if create_pr and repo_url:
logger.info("Phase 5b: Creating GitHub PR")
# Lazy load GitHub client
if self.github_client is None:
from src.mcp.github_client import GitHubMCPClient
self.github_client = GitHubMCPClient(github_token=github_token)
# Prepare changed files
changed_files = {
t['file_path']: t['modernized_code']
for t in transformations
}
# Generate PR summary
pr_summary = self._generate_pr_summary(results, target_version)
# Create PR
pr_result = await self.github_client.create_pr(
repo_url=repo_url,
changed_files=changed_files,
pr_summary=pr_summary,
test_results=results['phases']['validation']
)
results['phases']['github_pr'] = pr_result
logger.info(f"PR creation: {pr_result.get('success', False)}")
# Calculate final statistics
results['statistics'] = {
"total_files": len(files),
"files_modernized": len(transformations),
"tests_generated": total_tests,
"test_pass_rate": round(total_passed / max(total_tests, 1) * 100, 2),
"average_coverage": round(avg_coverage, 2)
}
# Add output locations
results['output'] = {
"modernized_files": str(output_dir.absolute()),
"original_files": str((output_dir / "original").absolute()),
"test_files": str((output_dir / "tests").absolute())
}
results['success'] = True
logger.info("Modernization workflow completed successfully")
logger.info(f"π Modernized files: {output_dir.absolute()}")
logger.info(f"π Test files: {output_dir / 'tests'}")
except Exception as e:
logger.error(f"Workflow error: {e}")
results['errors'].append(f"Workflow error: {e}")
results['success'] = False
return results
def _find_pattern_groups(self, files: List[str]) -> List[Dict]:
"""
Find groups of files with similar legacy patterns.
Detects file languages and uses appropriate pattern queries.
Args:
files: List of file paths
Returns:
List of pattern group dictionaries
"""
# Detect languages present in the files
languages = self._detect_languages_in_files(files)
# Build language-specific pattern queries
pattern_queries = self._get_pattern_queries_for_languages(languages)
groups = []
for query in pattern_queries:
try:
similar_files = self.search_engine.find_similar_patterns(query, top_k=10)
if similar_files:
groups.append({
"pattern_name": query,
"files": [f['file_path'] for f in similar_files],
"similarity_scores": [f['score'] for f in similar_files]
})
except Exception as e:
logger.error(f"Error searching for pattern '{query}': {e}")
return groups
def _detect_languages_in_files(self, files: List[str]) -> set:
"""Detect programming languages from file extensions."""
extension_to_language = {
'.py': 'python',
'.java': 'java',
'.js': 'javascript',
'.ts': 'typescript',
'.jsx': 'javascript',
'.tsx': 'typescript',
'.cpp': 'cpp',
'.c': 'c',
'.h': 'c',
'.cs': 'csharp',
'.go': 'go',
'.rb': 'ruby',
'.php': 'php',
'.kt': 'kotlin',
'.scala': 'scala',
'.rs': 'rust',
'.swift': 'swift'
}
languages = set()
for file_path in files:
ext = Path(file_path).suffix.lower()
if ext in extension_to_language:
languages.add(extension_to_language[ext])
return languages if languages else {'python'} # Default to Python if no recognized extensions
def _get_pattern_queries_for_languages(self, languages: set) -> List[str]:
"""Get pattern queries appropriate for the detected languages."""
# Common patterns for all languages
common_patterns = [
"Files with SQL injection vulnerabilities",
"Files with hardcoded credentials or secrets",
"Files with security vulnerabilities",
"Files with deprecated API usage"
]
# Language-specific patterns
language_patterns = {
'python': [
"Files using deprecated database libraries like MySQLdb",
"Files using Python 2 print statements",
"Files using deprecated urllib2 library",
"Files missing type hints",
"Files using old-style string formatting"
],
'java': [
"Files using deprecated Java APIs like Vector or Hashtable",
"Files using raw JDBC without prepared statements",
"Files missing try-with-resources for AutoCloseable",
"Files using pre-Java 8 patterns without lambdas or streams",
"Files using deprecated Date and Calendar APIs",
"Files with missing null checks or Optional usage"
],
'javascript': [
"Files using var instead of let or const",
"Files using callback patterns instead of Promises or async/await",
"Files using jQuery for DOM manipulation",
"Files with eval() usage",
"Files using prototype-based inheritance"
],
'typescript': [
"Files with excessive any type usage",
"Files missing strict null checks",
"Files using old module syntax"
],
'cpp': [
"Files using raw pointers instead of smart pointers",
"Files with manual memory management",
"Files using C-style casts",
"Files missing RAII patterns"
],
'csharp': [
"Files using deprecated .NET APIs",
"Files missing async/await patterns",
"Files using old collection types"
],
'go': [
"Files missing error handling",
"Files with goroutine leaks",
"Files missing context usage"
],
'ruby': [
"Files using deprecated Ruby syntax",
"Files missing proper error handling"
],
'php': [
"Files using deprecated mysql_* functions",
"Files missing prepared statements",
"Files with register_globals usage"
]
}
queries = common_patterns.copy()
for lang in languages:
if lang in language_patterns:
queries.extend(language_patterns[lang])
return queries
def _generate_pr_summary(self, results: Dict, target_version: str) -> str:
"""Generate PR summary from results."""
stats = results['statistics']
# Build coverage line only if coverage > 0
coverage_line = ""
if stats.get('average_coverage', 0) > 0:
coverage_line = f"**Code Coverage**: {stats['average_coverage']:.1f}%\n"
summary = f"""Automated migration to {target_version} with security fixes and performance improvements.
**Files Modernized**: {stats['files_modernized']} / {stats['total_files']}
**Tests Generated**: {stats['tests_generated']}
**Test Pass Rate**: {stats['test_pass_rate']:.1f}%
{coverage_line}
This PR includes:
- Syntax modernization to {target_version}
- Security vulnerability fixes
- Deprecated library replacements
- Comprehensive test suite
- Performance optimizations
All changes have been validated in an isolated sandbox environment.
"""
return summary
def generate_report(self, results: Dict) -> str:
"""
Generate human-readable report from results.
Args:
results: Workflow results dictionary
Returns:
Formatted report string
"""
report = []
report.append("=" * 60)
report.append("LEGACY CODE MODERNIZATION REPORT")
report.append("=" * 60)
report.append("")
if results['success']:
report.append("β
Status: SUCCESS")
else:
report.append("β Status: FAILED")
report.append("")
report.append("STATISTICS:")
report.append("-" * 60)
stats = results.get('statistics', {})
for key, value in stats.items():
# Skip average_coverage if it's 0
if key == 'average_coverage' and value == 0:
continue
report.append(f" {key.replace('_', ' ').title()}: {value}")
# Add intelligent pattern statistics if available
classification_data = results.get('phases', {}).get('classification', {})
intelligent_stats = classification_data.get('intelligent_stats')
if intelligent_stats:
report.append("")
report.append("INTELLIGENT PATTERN ANALYSIS:")
report.append("-" * 60)
report.append(f" Patterns Detected: {intelligent_stats.get('patterns_detected', 0)}")
report.append(f" Average Modernization Score: {intelligent_stats.get('average_modernization_score', 0)}/100")
report.append(f" Total Estimated Effort: {intelligent_stats.get('total_estimated_effort_hours', 0)}h")
severity_counts = intelligent_stats.get('severity_counts', {})
if severity_counts:
report.append(" Severity Breakdown:")
for severity, count in severity_counts.items():
if count > 0:
report.append(f" {severity.upper()}: {count}")
report.append("")
report.append("PHASE RESULTS:")
report.append("-" * 60)
for phase, data in results.get('phases', {}).items():
report.append(f"\n {phase.upper()}:")
if isinstance(data, dict):
for k, v in data.items():
if k not in ['classifications', 'intelligent_stats']: # Skip large data
report.append(f" {k}: {v}")
# Add output locations
if results.get('output'):
report.append("")
report.append("OUTPUT LOCATIONS:")
report.append("-" * 60)
for key, path in results['output'].items():
report.append(f" π {key.replace('_', ' ').title()}: {path}")
if results.get('errors'):
report.append("")
report.append("ERRORS:")
report.append("-" * 60)
for error in results['errors']:
report.append(f" β οΈ {error}")
report.append("")
report.append("=" * 60)
return "\n".join(report)