| import re |
|
|
def clean_asterisks(text):
    """Remove every asterisk from ``text`` and normalise its whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all ``*`` characters deleted, every run of whitespace
        (spaces, tabs, newlines) collapsed to a single space, and leading
        and trailing whitespace removed.
    """
    # Deleting the character outright already covers paired markers such as
    # ``*word*`` or ``**word**``, so no separate regex pass is required.
    without_stars = text.replace('*', '')

    # split() with no argument splits on any whitespace run and discards
    # empty leading/trailing pieces, so the join yields single-spaced text.
    return ' '.join(without_stars.split())
|
|
def remove_dialog_formatting(text):
    """Remove speaker labels and bracketed asides from dialog text.

    Three things are stripped:

    * a speaker label at the start of a line, either upper-case/digits/
      square brackets (``HOST:``, ``[GUEST2]:``) or a single capitalised
      word (``Alice:``), together with the spaces/tabs that follow it;
    * anything enclosed in parentheses, e.g. ``(laughs)``;
    * anything enclosed in square brackets, e.g. ``[music]``.

    Args:
        text: Dialog text, possibly spanning several lines.

    Returns:
        The text with those markers removed. Newlines are preserved, and
        whitespace that surrounded a removed aside is left in place.
    """
    # One label per line. ``(?!\d)`` keeps a leading clock time such as
    # "10:30" from being treated as a label, and ``[ \t]*`` (rather than
    # ``\s*``) consumes the gap after the label without eating a newline.
    text = re.sub(
        r'^(?:[A-Z0-9\[\]]+|[A-Z][a-z]+):(?!\d)[ \t]*',
        '',
        text,
        flags=re.MULTILINE,
    )

    # Stage directions / asides.
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)

    return text
|
|
def remove_breakthrough_formatting(text):
    """Remove LLM formatting residue that survived the prompts.

    The following are deleted, in this order:

    1. a short label at the start of a line (at most four
       whitespace-separated tokens containing none of ``: . , ; ! ?``)
       followed by a colon, plus the spaces/tabs after the colon;
    2. ``[...]`` spans;
    3. ``(...)`` spans;
    4. a double-quoted word ending in a colon, e.g. ``"Host:"``;
    5. ``<...>`` tags;
    6. ``---...---`` spans;
    7. a ``#`` together with the word that follows it (note that for a
       markdown heading this removes the marker and its first word).

    Args:
        text: Text to clean, possibly spanning several lines.

    Returns:
        The text with the patterns above removed. Newlines are preserved.
    """
    patterns = (
        # Bounded label instead of "everything up to the first colon":
        # an unbounded prefix would delete URLs ("https://..."), clock
        # times ("7:30") and whole clauses. ``(?![/\d])`` rejects colons
        # that belong to a URL scheme or a time, and ``[ \t]*`` avoids
        # swallowing the newline(s) after a label-only line.
        r'^[ \t]*(?:[^\s:.,;!?]+[ \t]+){0,3}[^\s:.,;!?]+[ \t]*:(?![/\d])[ \t]*',
        r'\[.*?\]',
        r'\(.*?\)',
        r'"\w+:"',
        r'<.*?>',
        r'---.*?---',
        r'#\s*\w+',
    )

    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text
|
|
def _decapitalize(sentence):
    """Lower-case the first letter of a non-empty ``sentence`` when safe.

    The sentence is returned unchanged when it starts with the pronoun "I"
    (``I``, ``I'm``, ``I, ...``) or with a word whose second character is
    also upper-case (an acronym such as ``NASA``).
    """
    if re.match(r"I\b", sentence) or sentence[1:2].isupper():
        return sentence
    return sentence[0].lower() + sentence[1:]


def convert_to_monologue(text):
    """Convert multi-party dialog into a single flowing narrative.

    The text is processed line by line. Surrounding whitespace and a
    leading speaker label (``HOST:``, ``[GUEST1]:``, ``Alice:``) are
    stripped from each line; lines that are empty afterwards are skipped.
    Every line after the first kept one that starts with an upper-case
    letter is prefixed with a transition phrase ("Then, ", "After that, ",
    ...), cycling through the list, and has its first letter lower-cased.

    Args:
        text: Dialog text with one utterance per line.

    Returns:
        All kept lines joined with single spaces; ``""`` if nothing is kept.
    """
    transitions = (
        "Then", "After that", "Next", "Following that",
        "Subsequently", "Moving on", "Additionally",
    )

    narrative = []
    transition_index = 0

    for raw_line in text.split('\n'):
        # Strip first so that indented labels are still anchored at ``^``.
        sentence = re.sub(r'^[A-Z0-9\[\]]+:\s*', '', raw_line.strip())
        sentence = re.sub(r'^[A-Z][a-z]+:\s*', '', sentence).strip()

        # A blank or label-only line ("GUEST:") has nothing to narrate;
        # skipping it also guards the ``sentence[0]`` access below.
        if not sentence:
            continue

        if narrative and sentence[0].isupper():
            lead_in = transitions[transition_index]
            # Only the first letter is lower-cased so that names, acronyms
            # and the rest of the sentence keep their original casing.
            narrative.append(f"{lead_in}, {_decapitalize(sentence)}")
            transition_index = (transition_index + 1) % len(transitions)
        else:
            narrative.append(sentence)

    return ' '.join(narrative)
|
|
def clean_formatting(text):
    """Remove markdown and other formatting symbols.

    Steps, in order:

    1. ``clean_asterisks`` deletes every ``*`` and collapses whitespace,
       which already handles ``*italic*`` and ``**bold**`` markers.
    2. ``_emphasis_`` and ``~~strikethrough~~`` markers are dropped while
       the enclosed text is kept.
    3. Fenced (triple-backtick) code blocks and inline backtick code
       spans are removed together with their content.
    4. Whitespace is collapsed again so removed spans leave no double
       spaces behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    text = clean_asterisks(text)

    # Underscore emphasis only when the underscores are not inside a word,
    # so identifiers such as ``my_var_name`` are left intact.
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    # Code is dropped entirely, fenced blocks first so that their backticks
    # are not half-consumed by the inline rule.
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'`[^`]*`', '', text)

    # Tidy the gaps left where code spans were removed.
    return ' '.join(text.split())
|
|
def process_for_podcast(text):
    """Process raw LLM/dialog text into a single narration string.

    Line structure is kept intact until ``convert_to_monologue`` has run,
    because the label-removal steps and the monologue conversion all work
    line by line. The helpers that collapse whitespace (``clean_asterisks``
    and ``clean_formatting``) are therefore applied to each line separately
    instead of to the whole text.

    Args:
        text: The raw text to clean.

    Returns:
        A single-line, single-spaced string with formatting markers and
        speaker labels removed and all asterisks stripped.
    """
    def per_line(func, value):
        # Apply ``func`` to every line while keeping the line breaks.
        return '\n'.join(func(line) for line in value.split('\n'))

    # Fenced code blocks span several lines, so they must be removed from
    # the whole text before any per-line processing.
    text = re.sub(r'```[\s\S]*?```', '', text)

    # Asterisks go first so that "**HOST:** hi" exposes its label.
    text = per_line(clean_asterisks, text)
    text = remove_dialog_formatting(text)
    text = per_line(clean_formatting, text)
    text = remove_breakthrough_formatting(text)

    # Removed labels/asides can leave stray leading or trailing blanks;
    # trim them so the monologue step sees each line's real first letter.
    text = per_line(str.strip, text)
    text = convert_to_monologue(text)

    # Final safety pass: strips any remaining asterisks and collapses all
    # whitespace (including newlines) to single spaces.
    return clean_asterisks(text)
|
|