"""
String processing pipeline functions for testing function analysis.
"""
import re
def normalize_whitespace(text):
"""Normalize whitespace by removing extra spaces and newlines."""
# Replace multiple whitespace with single space
text = re.sub(r'\s+', ' ', text)
# Strip leading and trailing whitespace
return text.strip()
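
# Illustrative example (my own input, not from the source):
#   normalize_whitespace("  hello \n  world  ")  ->  'hello world'
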
def remove_special_characters(text, keep_chars=""):
"""Remove special characters, optionally keeping specified characters."""
# Keep alphanumeric, spaces, and specified characters
pattern = fr"[^a-zA-Z0-9\s{re.escape(keep_chars)}]"
return re.sub(pattern, '', text)
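
# Illustrative example: keep_chars preserves characters that would otherwise be stripped.
#   remove_special_characters("user@example.com!", keep_chars="@.")  ->  'user@example.com'
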
def convert_to_lowercase(text):
"""Convert text to lowercase."""
return text.lower()
def remove_stopwords(text, stopwords=None):
"""Remove common stopwords from text."""
if stopwords is None:
stopwords = {
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
'will', 'would', 'could', 'should', 'may', 'might', 'must'
}
words = text.split()
filtered_words = [word for word in words if word.lower() not in stopwords]
return ' '.join(filtered_words)
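
# Illustrative example (default stopword set); kept words retain their original case:
#   remove_stopwords("The quick brown fox is fast")  ->  'quick brown fox fast'
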
def extract_keywords(text, min_length=3):
"""Extract keywords (words longer than min_length)."""
words = text.split()
keywords = [word for word in words if len(word) >= min_length]
return keywords
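
# Illustrative example (default min_length=3):
#   extract_keywords("an old oak tree")  ->  ['old', 'oak', 'tree']
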
def count_word_frequency(text):
"""Count frequency of each word in text."""
words = text.split()
frequency = {}
for word in words:
frequency[word] = frequency.get(word, 0) + 1
return frequency
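
# Illustrative example; counting is case- and punctuation-sensitive:
#   count_word_frequency("to be or not to be")  ->  {'to': 2, 'be': 2, 'or': 1, 'not': 1}
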
def capitalize_words(text, exceptions=None):
"""Capitalize first letter of each word, with exceptions."""
if exceptions is None:
exceptions = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
words = text.split()
capitalized = []
for i, word in enumerate(words):
if i == 0 or word.lower() not in exceptions:
capitalized.append(word.capitalize())
else:
capitalized.append(word.lower())
return ' '.join(capitalized)
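
# Illustrative example: the first word is always capitalized, later exception words stay lowercase:
#   capitalize_words("the lord of the rings")  ->  'The Lord of the Rings'
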
def truncate_text(text, max_length=100, suffix="..."):
"""Truncate text to specified length with suffix."""
if len(text) <= max_length:
return text
truncated = text[:max_length - len(suffix)]
# Try to break at last complete word
last_space = truncated.rfind(' ')
    if last_space > max_length * 0.8:  # Break at the word boundary only if it keeps most of the allowed length
truncated = truncated[:last_space]
return truncated + suffix
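
# Illustrative example: with max_length=23 the cut lands near a space, so the
# text is trimmed back to the last complete word before the suffix is added:
#   truncate_text("The quick brown fox jumps over the lazy dog", max_length=23)
#       ->  'The quick brown fox...'
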
def text_processing_pipeline(text, operations=None):
"""Process text through a pipeline of operations."""
if operations is None:
operations = [
'normalize_whitespace',
'remove_special_characters',
'convert_to_lowercase',
'remove_stopwords'
]
# Map operation names to functions
operation_map = {
'normalize_whitespace': normalize_whitespace,
'remove_special_characters': remove_special_characters,
'convert_to_lowercase': convert_to_lowercase,
'remove_stopwords': remove_stopwords,
'capitalize_words': capitalize_words,
'truncate_text': truncate_text
}
result = text
processing_steps = []
for operation in operations:
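        # Unknown operation names are silently skipped.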
if operation in operation_map:
before = result
result = operation_map[operation](result)
processing_steps.append({
'operation': operation,
                'before': (before[:50] + "...") if len(before) > 50 else before,
                'after': (result[:50] + "...") if len(result) > 50 else result
})
return result, processing_steps
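
# Illustrative example: with the default operations the pipeline normalizes whitespace,
# strips punctuation, lowercases, and drops stopwords; the second return value logs each step.
#   text_processing_pipeline("The Quick BROWN Fox!")[0]  ->  'quick brown fox'
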
def analyze_text_statistics(text):
"""Analyze various statistics about the text."""
words = text.split()
stats = {
'character_count': len(text),
'word_count': len(words),
'sentence_count': len(re.findall(r'[.!?]+', text)),
'average_word_length': sum(len(word) for word in words) / len(words) if words else 0,
'longest_word': max(words, key=len) if words else "",
'shortest_word': min(words, key=len) if words else ""
}
return stats
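
# Illustrative example; words are split on whitespace only, so punctuation stays attached:
#   analyze_text_statistics("Nice day. Great weather!")
#       ->  {'character_count': 24, 'word_count': 4, 'sentence_count': 2,
#            'average_word_length': 5.25, 'longest_word': 'weather!', 'shortest_word': 'Nice'}
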
if __name__ == "__main__":
sample_text = """
This is a SAMPLE text with various formatting issues!!!
    It has   multiple    spaces, special @#$% characters, and
needs some serious cleaning & processing...
"""
print("Original text:")
print(repr(sample_text))
processed_text, steps = text_processing_pipeline(sample_text)
print("\nProcessing steps:")
for step in steps:
print(f"After {step['operation']}:")
print(f" {step['after']}")
print(f"\nFinal result: {processed_text}")
stats = analyze_text_statistics(processed_text)
print(f"\nText statistics: {stats}")