"""
String processing pipeline functions for testing function analysis.
"""

import re


def normalize_whitespace(text):
    """Normalize whitespace by removing extra spaces and newlines."""
    # Replace multiple whitespace with single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing whitespace
    return text.strip()


def remove_special_characters(text, keep_chars=""):
    """Remove special characters, optionally keeping specified characters."""
    # Keep alphanumeric, spaces, and specified characters
    pattern = fr"[^a-zA-Z0-9\s{re.escape(keep_chars)}]"
    return re.sub(pattern, '', text)


def convert_to_lowercase(text):
    """Convert text to lowercase."""
    return text.lower()


def remove_stopwords(text, stopwords=None):
    """Remove common stopwords from text."""
    if stopwords is None:
        stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must'
        }
    
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(filtered_words)


def extract_keywords(text, min_length=3):
    """Extract keywords (words longer than min_length)."""
    words = text.split()
    keywords = [word for word in words if len(word) >= min_length]
    return keywords


def count_word_frequency(text):
    """Count frequency of each word in text."""
    words = text.split()
    frequency = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1
    return frequency


def capitalize_words(text, exceptions=None):
    """Capitalize first letter of each word, with exceptions."""
    if exceptions is None:
        exceptions = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
    
    words = text.split()
    capitalized = []
    
    for i, word in enumerate(words):
        if i == 0 or word.lower() not in exceptions:
            capitalized.append(word.capitalize())
        else:
            capitalized.append(word.lower())
    
    return ' '.join(capitalized)


def truncate_text(text, max_length=100, suffix="..."):
    """Truncate text to specified length with suffix."""
    if len(text) <= max_length:
        return text
    
    truncated = text[:max_length - len(suffix)]
    # Prefer breaking at the last word boundary, but only when that keeps
    # at least 80% of max_length; otherwise cut mid-word.
    last_space = truncated.rfind(' ')
    if last_space > max_length * 0.8:
        truncated = truncated[:last_space]
    
    return truncated + suffix


def text_processing_pipeline(text, operations=None):
    """Process text through a pipeline of operations."""
    if operations is None:
        operations = [
            'normalize_whitespace',
            'remove_special_characters', 
            'convert_to_lowercase',
            'remove_stopwords'
        ]
    
    # Map operation names to functions
    operation_map = {
        'normalize_whitespace': normalize_whitespace,
        'remove_special_characters': remove_special_characters,
        'convert_to_lowercase': convert_to_lowercase,
        'remove_stopwords': remove_stopwords,
        'capitalize_words': capitalize_words,
        'truncate_text': truncate_text
    }
    
    result = text
    processing_steps = []
    
    for operation in operations:
        if operation in operation_map:  # unknown operation names are skipped silently
            before = result
            result = operation_map[operation](result)
            processing_steps.append({
                'operation': operation,
                'before': before[:50] + "..." if len(before) > 50 else before,
                'after': result[:50] + "..." if len(result) > 50 else result
            })
    
    return result, processing_steps


def analyze_text_statistics(text):
    """Analyze various statistics about the text."""
    words = text.split()
    
    stats = {
        'character_count': len(text),
        'word_count': len(words),
        'sentence_count': len(re.findall(r'[.!?]+', text)),
        'average_word_length': sum(len(word) for word in words) / len(words) if words else 0,
        'longest_word': max(words, key=len) if words else "",
        'shortest_word': min(words, key=len) if words else ""
    }
    
    return stats


if __name__ == "__main__":
    sample_text = """
    This is a SAMPLE text with various   formatting issues!!! 
    It has multiple    spaces, special @#$% characters, and 
    needs some serious cleaning & processing...
    """
    
    print("Original text:")
    print(repr(sample_text))
    
    processed_text, steps = text_processing_pipeline(sample_text)
    
    print("\nProcessing steps:")
    for step in steps:
        print(f"After {step['operation']}:")
        print(f"  {step['after']}")
    
    print(f"\nFinal result: {processed_text}")
    
    stats = analyze_text_statistics(processed_text)
    print(f"\nText statistics: {stats}")
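
    # Illustrative extras (not exercised by the default pipeline): keyword
    # extraction and word-frequency counting on the processed text.
    keywords = extract_keywords(processed_text)
    print(f"\nKeywords: {keywords}")

    frequency = count_word_frequency(processed_text)
    print(f"Word frequency: {frequency}")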