"""
String processing pipeline functions for testing function analysis.
"""

import re


def normalize_whitespace(text):
    """Normalize whitespace by removing extra spaces and newlines."""
    # Replace multiple whitespace with single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing whitespace
    return text.strip()


def remove_special_characters(text, keep_chars=""):
    """Remove special characters, optionally keeping specified characters."""
    # Keep alphanumeric, spaces, and specified characters
    pattern = fr"[^a-zA-Z0-9\s{re.escape(keep_chars)}]"
    return re.sub(pattern, '', text)


def convert_to_lowercase(text):
    """Convert text to lowercase."""
    return text.lower()


def remove_stopwords(text, stopwords=None):
    """Remove common stopwords from text."""
    if stopwords is None:
        stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must'
        }
    
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(filtered_words)


def extract_keywords(text, min_length=3):
    """Extract keywords (words longer than min_length)."""
    words = text.split()
    keywords = [word for word in words if len(word) >= min_length]
    return keywords


def count_word_frequency(text):
    """Count frequency of each word in text."""
    words = text.split()
    frequency = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1
    return frequency


def capitalize_words(text, exceptions=None):
    """Capitalize first letter of each word, with exceptions."""
    if exceptions is None:
        exceptions = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
    
    words = text.split()
    capitalized = []
    
    for i, word in enumerate(words):
        if i == 0 or word.lower() not in exceptions:
            capitalized.append(word.capitalize())
        else:
            capitalized.append(word.lower())
    
    return ' '.join(capitalized)


def truncate_text(text, max_length=100, suffix="..."):
    """Truncate text to specified length with suffix."""
    if len(text) <= max_length:
        return text
    
    truncated = text[:max_length - len(suffix)]
    # Prefer breaking at the last word boundary, but only when that keeps
    # at least 80% of max_length; otherwise cut mid-word.
    last_space = truncated.rfind(' ')
    if last_space > max_length * 0.8:
        truncated = truncated[:last_space]
    
    return truncated + suffix


def text_processing_pipeline(text, operations=None):
    """Process text through a pipeline of operations."""
    if operations is None:
        operations = [
            'normalize_whitespace',
            'remove_special_characters', 
            'convert_to_lowercase',
            'remove_stopwords'
        ]
    
    # Map operation names to functions
    operation_map = {
        'normalize_whitespace': normalize_whitespace,
        'remove_special_characters': remove_special_characters,
        'convert_to_lowercase': convert_to_lowercase,
        'remove_stopwords': remove_stopwords,
        'capitalize_words': capitalize_words,
        'truncate_text': truncate_text
    }
    
    result = text
    processing_steps = []
    
    for operation in operations:
        if operation in operation_map:  # unknown operation names are skipped silently
            before = result
            result = operation_map[operation](result)
            processing_steps.append({
                'operation': operation,
                'before': before[:50] + "..." if len(before) > 50 else before,
                'after': result[:50] + "..." if len(result) > 50 else result
            })
    
    return result, processing_steps


def analyze_text_statistics(text):
    """Analyze various statistics about the text."""
    words = text.split()
    
    stats = {
        'character_count': len(text),
        'word_count': len(words),
        'sentence_count': len(re.findall(r'[.!?]+', text)),
        'average_word_length': sum(len(word) for word in words) / len(words) if words else 0,
        'longest_word': max(words, key=len) if words else "",
        'shortest_word': min(words, key=len) if words else ""
    }
    
    return stats


if __name__ == "__main__":
    sample_text = """
    This is a SAMPLE text with various   formatting issues!!! 
    It has multiple    spaces, special @#$% characters, and 
    needs some serious cleaning & processing...
    """
    
    print("Original text:")
    print(repr(sample_text))
    
    processed_text, steps = text_processing_pipeline(sample_text)
    
    print("\nProcessing steps:")
    for step in steps:
        print(f"After {step['operation']}:")
        print(f"  {step['after']}")
    
    print(f"\nFinal result: {processed_text}")
    
    stats = analyze_text_statistics(processed_text)
    print(f"\nText statistics: {stats}")
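
    # Illustrative extras (not exercised by the default pipeline): keyword
    # extraction and word-frequency counting on the processed text.
    keywords = extract_keywords(processed_text)
    print(f"\nKeywords: {keywords}")

    frequency = count_word_frequency(processed_text)
    print(f"Word frequency: {frequency}")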