import os
import subprocess
import sys

import spacy
import stanza

# ===============================
# 🔧 Safe SpaCy + Stanza Downloads
# ===============================
def safe_load_spacy():
    """Prefer the transformer English model; fall back to the small model,
    downloading it on first run if neither is installed."""
    try:
        return spacy.load("en_core_web_trf")
    except OSError:
        try:
            return spacy.load("en_core_web_sm")
        except OSError:
            # Use the running interpreter rather than whatever "python" is on PATH
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
            return spacy.load("en_core_web_sm")
# ✅ Initialize English SpaCy safely
nlp_en = safe_load_spacy()

# Ensure Stanza models exist (coarse check: the directory can exist without
# containing the 'hi'/'ta' models, in which case Pipeline() below will raise)
stanza_dir = os.path.expanduser("~/.stanza_resources")
if not os.path.exists(stanza_dir):
    stanza.download('hi')
    stanza.download('ta')
# ===============================
# 1️⃣ Imports
# ===============================
import pandas as pd
import re
import docx  # python-docx
from collections import Counter
from transformers import pipeline
import torch
from langdetect import detect
import streamlit as st
import io
# ===============================
# 2️⃣ Pre-download Stanza models
# ===============================
# (already ensured above; repeating stanza.download() unconditionally here
# would re-check the models on every startup)
# ===============================
# 3️⃣ Initialize Stanza for Hindi/Tamil
# ===============================
nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
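# Stanza usage sketch (illustrative, not output from a real run): a processed
# doc exposes .sentences, each holding .words with .text and .upos attributes;
# this is the shape relied on in get_pos_tags below, e.g.:
#   nlp_hi("राम घर गया।").sentences[0].words[0].upos  ->  "PROPN"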
# ===============================
# 4️⃣ Language-Aware Pipeline Loader
# ===============================
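# Note (a suggestion, not in the original): decorating this loader with
# @st.cache_resource (Streamlit >= 1.18) would keep the models loaded across
# reruns instead of reloading them on every "Analyze" click.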
def load_pipelines(language_code):
    lang = language_code.upper()
    device = 0 if torch.cuda.is_available() else -1
    st.write(f"🌐 Language detected: {lang}")
    st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
    # Emotion model
    if lang == "EN":
        emo_model = "SamLowe/roberta-base-go_emotions"
    elif lang in ["HI", "TA"]:
        # NOTE: this is an English GoEmotions model, so emotion scores on
        # Hindi/Tamil text should be treated as rough at best
        emo_model = "bhadresh-savani/bert-base-go-emotion"
    else:
        emo_model = "SamLowe/roberta-base-go_emotions"
    emotion_pipeline = pipeline(
        "text-classification",
        model=emo_model,
        tokenizer=emo_model,
        return_all_scores=True,  # deprecated in newer transformers; top_k=None is the replacement
        device=device
    )
    # Sentiment model
    if lang == "EN":
        sent_model = "distilbert-base-uncased-finetuned-sst-2-english"
    else:
        sent_model = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
    sentiment_pipeline = pipeline(
        "text-classification",
        model=sent_model,
        tokenizer=sent_model,
        return_all_scores=True,
        device=device
    )
    return emotion_pipeline, sentiment_pipeline
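# Output shape with return_all_scores=True, as consumed below (illustrative
# values, not a real run):
#   emotion_pipeline("What a pleasant surprise!")[0]
#     -> [{"label": "surprise", "score": 0.71}, {"label": "joy", "score": 0.12}, ...]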
# ===============================
# 5️⃣ Read DOCX and split articles
# ===============================
def read_and_split_articles(file_path):
    doc = docx.Document(file_path)
    paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    return paragraphs  # ✅ each non-empty docx paragraph separately
# ===============================
# 6️⃣ Utility – Filter Neutral
# ===============================
def filter_neutral(emotion_results, neutral_threshold=0.75):
    """Sort emotion scores descending and drop 'neutral' when it dominates."""
    scores = {r["label"]: round(r["score"], 3)
              for r in sorted(emotion_results, key=lambda x: x["score"], reverse=True)}
    if "neutral" in scores and scores["neutral"] > neutral_threshold:
        scores.pop("neutral")
    return scores
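# e.g. filter_neutral([{"label": "neutral", "score": 0.91}, {"label": "joy", "score": 0.05}])
#      -> {"joy": 0.05}   (illustrative: the dominant neutral score is dropped)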
# ===============================
# 7️⃣ Sentence Splitter
# ===============================
def split_sentences(text, lang):
    if lang == "hi":
        sentences = re.split(r'।', text)  # split on the Devanagari danda
    elif lang == "ta":
        sentences = re.split(r'\.', text)
    else:
        doc = nlp_en(text)
        sentences = [sent.text.strip() for sent in doc.sents]
    return [s.strip() for s in sentences if s.strip()]
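# e.g. split_sentences("राम घर गया। वह सो गया।", "hi")
#      -> ["राम घर गया", "वह सो गया"]   (the danda itself is consumed by the split)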
# ===============================
# 8️⃣ PoS Tagger
# ===============================
def get_pos_tags(sentence, lang):
    if lang == "en":
        doc = nlp_en(sentence)
        return [(token.text, token.pos_) for token in doc]
    elif lang == "hi":
        doc = nlp_hi(sentence)
        return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
    elif lang == "ta":
        doc = nlp_ta(sentence)
        return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
    else:
        return []
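# e.g. get_pos_tags("Dogs bark loudly.", "en") typically yields
#      [("Dogs", "NOUN"), ("bark", "VERB"), ("loudly", "ADV"), (".", "PUNCT")]
# (exact tags depend on the loaded spaCy model)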
# ===============================
# 9️⃣ Analysis Function
# ===============================
def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs):
    results_summary = []
    export_rows = []
    para_counters = []
    paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
    # -------------------------------
    # ✅ Weighted overall results
    weighted_scores = {}
    total_length = 0
    all_sentiments = []
    for para in paragraphs:
        sentences = split_sentences(para, lang[:2])
        for sentence in sentences:
            emo_results = emotion_pipeline(sentence[:512])[0]  # [:512] is a rough character-level truncation guard
            filtered = filter_neutral(emo_results)
            length = len(sentence.split())
            total_length += length
            for emo, score in filtered.items():
                weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
            sentiment_results = sentiment_pipeline(sentence[:512])[0]
            all_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
    if total_length > 0:
        weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
    # Overall sentiment = the single most confident sentence-level prediction
    overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
    st.subheader("📊 OVERALL (Weighted)")
    st.write("Emotions →", weighted_scores)
    st.write("Sentiment →", overall_sentiment)
    export_rows.append({
        "Type": "Overall",
        "Text": "Weighted across article",
        "Emotions": weighted_scores,
        "Sentiment": overall_sentiment
    })
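    # The block above computes, for each emotion e, a length-weighted average:
    #   weighted_scores[e] = sum_i(score_i(e) * len_i) / sum_i(len_i)
    # where len_i is the word count of sentence i, so longer sentences contribute more.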
    # -------------------------------
    # Paragraph-level
    for p_idx, para in enumerate(paragraphs, start=1):
        para_counter = Counter()
        sentences = split_sentences(para, lang[:2])
        for sentence in sentences:
            results = emotion_pipeline(sentence[:512])[0]
            filtered = filter_neutral(results, neutral_threshold=0.75)
            for emo, score in filtered.items():
                para_counter[emo] += score
        if normalize_paragraphs:
            # ✅ Normalize scores so they sum ≤ 1
            total = sum(para_counter.values())
            if total > 0:
                para_counter = {emo: round(val / total, 3) for emo, val in para_counter.items()}
        para_counters.append((para, dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))))
        st.write(f"\n📄 Paragraph {p_idx}: {para}")
        st.write("Emotions →", para_counters[-1][1])
        export_rows.append({
            "Type": "Paragraph",
            "Text": para,
            "Emotions": para_counters[-1][1],
            "Sentiment": ""
        })
    # -------------------------------
    # Sentence-level
    st.subheader("📝 SENTENCES")
    for para in paragraphs:
        sentences = split_sentences(para, lang[:2])
        for sentence in sentences:
            pos_tags = get_pos_tags(sentence, lang[:2])
            results = emotion_pipeline(sentence[:512])[0]
            filtered = filter_neutral(results, neutral_threshold=0.75)
            sentiment_results = sentiment_pipeline(sentence[:512])[0]
            best_sentiment = max(sentiment_results, key=lambda x: x["score"])
            results_summary.append({
                "sentence": sentence,
                "pos_tags": pos_tags,
                "emotions": filtered,
                "sentiment": best_sentiment
            })
            st.write(f"Sentence: {sentence}")
            st.write(f"POS Tags → {pos_tags}")
            st.write(f"Emotions → {filtered}")
            st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'], 4)})\n")
            export_rows.append({
                "Type": "Sentence",
                "Text": sentence,
                "Emotions": filtered,
                "Sentiment": best_sentiment
            })
    return results_summary, export_rows
# ===============================
# 🚀 Streamlit App
# ===============================
st.title("🌍 Multilingual Text Emotion + Sentiment Analyzer")
uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
text_input = st.text_area("Or paste text here")
# ✅ Checkbox for paragraph normalization
normalize_paragraphs = st.checkbox("Normalize paragraph emotion scores", value=True)
# ✅ Placeholder for download buttons at the top
download_placeholder = st.empty()
if st.button("🔍 Analyze"):
    with st.spinner("Running analysis... ⏳"):
        if uploaded_file:
            articles = read_and_split_articles(uploaded_file)
            text_to_analyze = "\n\n".join(articles) if articles else ""
        elif text_input.strip():
            text_to_analyze = text_input
        else:
            st.warning("Please upload a DOCX file or paste text to analyze.")
            st.stop()
        # langdetect can be unstable on very short inputs; 200 chars is a pragmatic sample
        detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
        emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
        results, export_rows = analyze_article(
            text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs
        )
        # ✅ Show download buttons at the TOP (rendered into the placeholder created above)
        df_export = pd.DataFrame(export_rows)
        csv = df_export.to_csv(index=False).encode("utf-8")
        with download_placeholder.container():
            st.download_button(
                label="⬇️ Download CSV",
                data=csv,
                file_name="analysis_results.csv",
                mime="text/csv",
            )
            excel_buffer = io.BytesIO()
            df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
            st.download_button(
                label="⬇️ Download Excel",
                data=excel_buffer.getvalue(),  # pass bytes: the buffer's cursor sits at the end after to_excel
                file_name="analysis_results.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )
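# Dependencies implied by the imports above (unpinned; exact versions are an
# assumption): streamlit, transformers, torch, spacy, stanza, python-docx,
# langdetect, pandas, xlsxwriter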