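# app.py: Multilingual Text Emotion + Sentiment Analyzer (Streamlit)
# Analyzes DOCX uploads or pasted text in English, Hindi, or Tamil at the
# article, paragraph, and sentence level.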
import os
import spacy
# ===============================
# πŸ”§ Safe SpaCy Model Loader
# ===============================
def safe_load_spacy():
    # Prefer the transformer model; fall back to the small model, downloading
    # it on first run if neither is installed locally.
    try:
        return spacy.load("en_core_web_trf")
    except OSError:
        try:
            return spacy.load("en_core_web_sm")
        except OSError:
            os.system("python -m spacy download en_core_web_sm")
            return spacy.load("en_core_web_sm")

# βœ… Initialize English SpaCy safely
nlp_en = safe_load_spacy()
# ===============================
# 1️⃣ Imports
# ===============================
import pandas as pd
import re
import docx
from collections import Counter
import stanza
from transformers import pipeline
import torch
from langdetect import detect
import streamlit as st
import io
# ===============================
# 2️⃣ Pre-download Stanza models (first run only)
# ===============================
# Stanza caches models under ~/stanza_resources by default; download the
# Hindi and Tamil models only when that cache is missing.
stanza_dir = os.path.expanduser("~/stanza_resources")
if not os.path.exists(stanza_dir):
    stanza.download('hi')
    stanza.download('ta')
# ===============================
# 3️⃣ Initialize Stanza for Hindi/Tamil
# ===============================
nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
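# Illustrative Stanza usage (mirrors get_pos_tags below):
#   doc = nlp_hi("ΰ€―ΰ€Ή ΰ€ΰ€• ΰ€΅ΰ€Ύΰ€•ΰ₯ΰ€― ΰ€Ήΰ₯ˆΰ₯€")
#   pairs = [(w.text, w.upos) for s in doc.sentences for w in s.words]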
# ===============================
# 4️⃣ Language-Aware Pipeline Loader
# ===============================
def load_pipelines(language_code):
    lang = language_code.upper()
    device = 0 if torch.cuda.is_available() else -1
    st.write(f"🌍 Language detected: {lang}")
    st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
    # Emotion model
    if lang == "EN":
        emo_model = "SamLowe/roberta-base-go_emotions"
    elif lang in ["HI", "TA"]:
        emo_model = "bhadresh-savani/bert-base-go-emotion"
    else:
        emo_model = "SamLowe/roberta-base-go_emotions"
    emotion_pipeline = pipeline(
        "text-classification",
        model=emo_model,
        tokenizer=emo_model,
        return_all_scores=True,
        device=device
    )
    # Sentiment model
    if lang == "EN":
        sent_model = "distilbert-base-uncased-finetuned-sst-2-english"
    else:
        sent_model = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
    sentiment_pipeline = pipeline(
        "text-classification",
        model=sent_model,
        tokenizer=sent_model,
        return_all_scores=True,
        device=device
    )
    return emotion_pipeline, sentiment_pipeline
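# With return_all_scores=True, a single-string call returns a nested list of
# per-label dicts, e.g. emotion_pipeline("I love this")[0] yields
# [{'label': ..., 'score': ...}, ...]; hence the [0] indexing used below.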
# ===============================
# 5️⃣ Read DOCX and split articles
# ===============================
def read_and_split_articles(docx_file):
    # python-docx accepts a path or a file-like object (e.g. a Streamlit upload)
    doc = docx.Document(docx_file)
    paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    return paragraphs  # βœ… Each non-empty docx paragraph, in order
# ===============================
# 6️⃣ Utility – Filter Neutral
# ===============================
def filter_neutral(emotion_results, neutral_threshold=0.75):
    # Sort labels by score (descending) and drop "neutral" when it dominates,
    # so the weaker emotions remain visible.
    scores = {r["label"]: round(r["score"], 3)
              for r in sorted(emotion_results, key=lambda x: x["score"], reverse=True)}
    if "neutral" in scores and scores["neutral"] > neutral_threshold:
        scores.pop("neutral")
    return scores
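# e.g. filter_neutral([{'label': 'neutral', 'score': 0.9},
#                      {'label': 'joy', 'score': 0.1}]) β†’ {'joy': 0.1}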
# ===============================
# 7️⃣ Sentence Splitter
# ===============================
def split_sentences(text, lang):
    if lang == "hi":
        sentences = re.split(r'ΰ₯€', text)  # split on the Devanagari danda
    elif lang == "ta":
        sentences = re.split(r'\.', text)
    else:
        doc = nlp_en(text)
        sentences = [sent.text.strip() for sent in doc.sents]
    return [s.strip() for s in sentences if s.strip()]
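# e.g. split_sentences("ΰ€―ΰ€Ή ΰ€ͺΰ€Ήΰ€²ΰ€Ύ ΰ€΅ΰ€Ύΰ€•ΰ₯ΰ€― ΰ€Ήΰ₯ˆΰ₯€ ΰ€―ΰ€Ή ΰ€¦ΰ₯‚ΰ€Έΰ€°ΰ€Ύ ΰ€Ήΰ₯ˆΰ₯€", "hi")
#      β†’ ['ΰ€―ΰ€Ή ΰ€ͺΰ€Ήΰ€²ΰ€Ύ ΰ€΅ΰ€Ύΰ€•ΰ₯ΰ€― ΰ€Ήΰ₯ˆ', 'ΰ€―ΰ€Ή ΰ€¦ΰ₯‚ΰ€Έΰ€°ΰ€Ύ ΰ€Ήΰ₯ˆ']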
# ===============================
# 8️⃣ PoS Tagger
# ===============================
def get_pos_tags(sentence, lang):
    if lang == "en":
        doc = nlp_en(sentence)
        return [(token.text, token.pos_) for token in doc]
    elif lang == "hi":
        doc = nlp_hi(sentence)
        return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
    elif lang == "ta":
        doc = nlp_ta(sentence)
        return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
    else:
        return []
# ===============================
# 9️⃣ Analysis Function
# ===============================
def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs):
    results_summary = []
    export_rows = []
    para_counters = []
    paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]

    # -------------------------------
    # βœ… Weighted overall results: each sentence's emotion scores are weighted
    # by its word count, so longer sentences contribute proportionally more.
    weighted_scores = {}
    total_length = 0
    all_sentiments = []
    for para in paragraphs:
        sentences = split_sentences(para, lang[:2])
        for sentence in sentences:
            emo_results = emotion_pipeline(sentence[:512])[0]  # rough character cap for long sentences
            filtered = filter_neutral(emo_results)
            length = len(sentence.split())
            total_length += length
            for emo, score in filtered.items():
                weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
            sentiment_results = sentiment_pipeline(sentence[:512])[0]
            all_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
    if total_length > 0:
        weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
    # Overall sentiment = the single most confident sentence-level prediction
    overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}

    st.subheader("πŸ“Š OVERALL (Weighted)")
    st.write("Emotions β†’", weighted_scores)
    st.write("Sentiment β†’", overall_sentiment)
    export_rows.append({
        "Type": "Overall",
        "Text": "Weighted across article",
        "Emotions": weighted_scores,
        "Sentiment": overall_sentiment
    })

    # -------------------------------
    # Paragraph-level
    for p_idx, para in enumerate(paragraphs, start=1):
        para_counter = Counter()
        sentences = split_sentences(para, lang[:2])
        for sentence in sentences:
            results = emotion_pipeline(sentence[:512])[0]
            filtered = filter_neutral(results, neutral_threshold=0.75)
            for emo, score in filtered.items():
                para_counter[emo] += score
        if normalize_paragraphs:
            # βœ… Normalize scores so they sum to 1 within the paragraph
            total = sum(para_counter.values())
            if total > 0:
                para_counter = {emo: round(val / total, 3) for emo, val in para_counter.items()}
        para_counters.append((para, dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))))
        st.write(f"\nπŸ“‘ Paragraph {p_idx}: {para}")
        st.write("Emotions β†’", para_counters[-1][1])
        export_rows.append({
            "Type": "Paragraph",
            "Text": para,
            "Emotions": para_counters[-1][1],
            "Sentiment": ""
        })

    # -------------------------------
    # Sentence-level
    st.subheader("πŸ“ SENTENCES")
    for para in paragraphs:
        sentences = split_sentences(para, lang[:2])
        for sentence in sentences:
            pos_tags = get_pos_tags(sentence, lang[:2])
            results = emotion_pipeline(sentence[:512])[0]
            filtered = filter_neutral(results, neutral_threshold=0.75)
            sentiment_results = sentiment_pipeline(sentence[:512])[0]
            best_sentiment = max(sentiment_results, key=lambda x: x["score"])
            results_summary.append({
                "sentence": sentence,
                "pos_tags": pos_tags,
                "emotions": filtered,
                "sentiment": best_sentiment
            })
            st.write(f"Sentence: {sentence}")
            st.write(f"POS Tags β†’ {pos_tags}")
            st.write(f"Emotions β†’ {filtered}")
            st.write(f"Sentiment β†’ {best_sentiment['label']} ({round(best_sentiment['score'], 4)})\n")
            export_rows.append({
                "Type": "Sentence",
                "Text": sentence,
                "Emotions": filtered,
                "Sentiment": best_sentiment
            })
    return results_summary, export_rows
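# Every export row shares one schema, e.g.:
#   {"Type": "Sentence", "Text": "...", "Emotions": {"joy": 0.8, ...},
#    "Sentiment": {"label": "POSITIVE", "score": 0.99}}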
# ===============================
# πŸ”Ÿ Streamlit App
# ===============================
st.title("πŸ“‘ Multilingual Text Emotion + Sentiment Analyzer")
uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
text_input = st.text_area("Or paste text here")
# βœ… Checkbox for paragraph normalization
normalize_paragraphs = st.checkbox("Normalize paragraph emotion scores", value=True)
# βœ… Placeholder for download buttons at the top
download_placeholder = st.empty()
if st.button("πŸ” Analyze"):
    with st.spinner("Running analysis... ⏳"):
        if uploaded_file:
            articles = read_and_split_articles(uploaded_file)
            text_to_analyze = "\n\n".join(articles) if articles else ""
        elif text_input.strip():
            text_to_analyze = text_input
        else:
            st.warning("Please upload a DOCX file or paste text to analyze.")
            st.stop()
        detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
        emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
        results, export_rows = analyze_article(
            text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs
        )
        # βœ… Show download buttons at the TOP
        df_export = pd.DataFrame(export_rows)
        csv = df_export.to_csv(index=False).encode("utf-8")
        with download_placeholder.container():
            st.download_button(
                label="⬇️ Download CSV",
                data=csv,
                file_name="analysis_results.csv",
                mime="text/csv",
            )
            excel_buffer = io.BytesIO()
            df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
            excel_buffer.seek(0)  # rewind so the full workbook is served
            st.download_button(
                label="⬇️ Download Excel",
                data=excel_buffer,
                file_name="analysis_results.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )
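# ===============================
# Usage
# ===============================
# Launch with Streamlit (filename assumed; adjust to your local copy):
#   streamlit run "app copy.py"
# Requires: streamlit, transformers, torch, spacy, stanza, langdetect,
# python-docx, pandas, xlsxwriter (see requirements.txt).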