"""String processing pipeline functions for testing function analysis."""

import re
from typing import Dict, List, Optional, Set, Tuple


def normalize_whitespace(text: str) -> str:
    """Normalize whitespace by removing extra spaces and newlines."""
    # Collapse any run of whitespace (spaces, tabs, newlines) to a single space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
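
# Illustrative usage:
#     normalize_whitespace("  hello \n\t world  ")  ->  "hello world"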


def remove_special_characters(text: str, keep_chars: str = "") -> str:
    """Remove special characters, optionally keeping specified characters."""
    # Keep letters, digits, whitespace, and any explicitly whitelisted characters.
    pattern = fr"[^a-zA-Z0-9\s{re.escape(keep_chars)}]"
    return re.sub(pattern, '', text)
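
# Illustrative usage:
#     remove_special_characters("hello, world!")                  ->  "hello world"
#     remove_special_characters("hello, world!", keep_chars=",")  ->  "hello, world"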


def convert_to_lowercase(text: str) -> str:
    """Convert text to lowercase."""
    return text.lower()


def remove_stopwords(text: str, stopwords: Optional[Set[str]] = None) -> str:
    """Remove common stopwords from text."""
    if stopwords is None:
        stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must'
        }

    words = text.split()
    # Match stopwords case-insensitively while preserving the original casing.
    filtered_words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(filtered_words)
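
# Illustrative usage:
#     remove_stopwords("The cat sat on the mat")  ->  "cat sat mat"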


def extract_keywords(text: str, min_length: int = 3) -> List[str]:
    """Extract keywords (words with at least min_length characters)."""
    words = text.split()
    keywords = [word for word in words if len(word) >= min_length]
    return keywords
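
# Illustrative usage:
#     extract_keywords("a big red elephant")  ->  ['big', 'red', 'elephant']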


def count_word_frequency(text: str) -> Dict[str, int]:
    """Count frequency of each word in text."""
    # Note: counting is case-sensitive and punctuation counts as part of a word.
    words = text.split()
    frequency = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1
    return frequency
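
# Illustrative usage:
#     count_word_frequency("to be or not to be")  ->  {'to': 2, 'be': 2, 'or': 1, 'not': 1}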


def capitalize_words(text: str, exceptions: Optional[Set[str]] = None) -> str:
    """Capitalize the first letter of each word, with exceptions."""
    if exceptions is None:
        exceptions = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at',
                      'to', 'for', 'of', 'with', 'by'}

    words = text.split()
    capitalized = []

    for i, word in enumerate(words):
        # The first word is always capitalized; exception words stay lowercase.
        if i == 0 or word.lower() not in exceptions:
            capitalized.append(word.capitalize())
        else:
            capitalized.append(word.lower())

    return ' '.join(capitalized)
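
# Illustrative usage (title-case with small words left lowercase):
#     capitalize_words("the lord of the rings")  ->  "The Lord of the Rings"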


def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Truncate text to the specified length, appending a suffix."""
    if len(text) <= max_length:
        return text

    # Reserve room for the suffix so the result never exceeds max_length.
    truncated = text[:max_length - len(suffix)]

    # Back up to the last word boundary, but only if it falls in the final
    # 20% of the allowed length; otherwise too much text would be lost.
    last_space = truncated.rfind(' ')
    if last_space > max_length * 0.8:
        truncated = truncated[:last_space]

    return truncated + suffix
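
# Illustrative usage (the cut falls back to a word boundary when one is near):
#     truncate_text("The quick brown fox jumps over the lazy dog", max_length=30)
#         ->  "The quick brown fox jumps..."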


def text_processing_pipeline(
    text: str, operations: Optional[List[str]] = None
) -> Tuple[str, List[Dict[str, str]]]:
    """Process text through a pipeline of operations."""
    if operations is None:
        operations = [
            'normalize_whitespace',
            'remove_special_characters',
            'convert_to_lowercase',
            'remove_stopwords'
        ]

    # Only str -> str operations can participate in the pipeline, so
    # extract_keywords and count_word_frequency are deliberately absent.
    operation_map = {
        'normalize_whitespace': normalize_whitespace,
        'remove_special_characters': remove_special_characters,
        'convert_to_lowercase': convert_to_lowercase,
        'remove_stopwords': remove_stopwords,
        'capitalize_words': capitalize_words,
        'truncate_text': truncate_text
    }

    result = text
    processing_steps = []

    for operation in operations:
        # Unknown operation names are silently skipped.
        if operation in operation_map:
            before = result
            result = operation_map[operation](result)
            # Record a trace of each step, truncating long snapshots to 50 chars.
            processing_steps.append({
                'operation': operation,
                'before': before[:50] + "..." if len(before) > 50 else before,
                'after': result[:50] + "..." if len(result) > 50 else result
            })

    return result, processing_steps
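
# Illustrative usage with a custom operation order:
#     result, steps = text_processing_pipeline(
#         "  Hello, World!  ",
#         operations=['normalize_whitespace', 'remove_special_characters'],
#     )
#     # result  ->  "Hello World"
#     # steps   ->  one {'operation', 'before', 'after'} dict per applied step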


def analyze_text_statistics(text: str) -> Dict[str, object]:
    """Analyze various statistics about the text."""
    words = text.split()

    stats = {
        'character_count': len(text),
        'word_count': len(words),
        # Sentences are approximated by runs of terminal punctuation.
        'sentence_count': len(re.findall(r'[.!?]+', text)),
        'average_word_length': sum(len(word) for word in words) / len(words) if words else 0,
        'longest_word': max(words, key=len) if words else "",
        'shortest_word': min(words, key=len) if words else ""
    }

    return stats
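
# Illustrative usage (words are raw whitespace tokens, so punctuation counts
# toward word length):
#     analyze_text_statistics("Hi there. Bye!")
#         ->  {'character_count': 14, 'word_count': 3, 'sentence_count': 2,
#              'average_word_length': 4.0, 'longest_word': 'there.', 'shortest_word': 'Hi'}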


if __name__ == "__main__":
    sample_text = """
    This is a SAMPLE text with various formatting issues!!!
    It has multiple spaces, special @#$% characters, and
    needs some serious cleaning & processing...
    """

    print("Original text:")
    print(repr(sample_text))

    processed_text, steps = text_processing_pipeline(sample_text)

    print("\nProcessing steps:")
    for step in steps:
        print(f"After {step['operation']}:")
        print(f"  {step['after']}")

    print(f"\nFinal result: {processed_text}")

    stats = analyze_text_statistics(processed_text)
    print(f"\nText statistics: {stats}")