Spaces:

Kartik2204
/

Sentiment-Analysis

Sleeping

File size: 10,372 Bytes

14d6a4f

import os
import spacy
import stanza

# ===============================
# 🔧 Safe SpaCy + Stanza Downloads
# ===============================
def safe_load_spacy():
    """Load an English spaCy pipeline, installing a fallback model if needed.

    Tries the transformer model ``en_core_web_trf`` first, then the small
    ``en_core_web_sm`` model. If neither is installed, ``en_core_web_sm`` is
    downloaded with the interpreter that is running this script and then
    loaded.

    Returns:
        The loaded spaCy ``Language`` pipeline.

    Raises:
        OSError: If the download command fails, or the model still cannot be
            loaded after a successful download.
    """
    # Local imports keep this block self-contained; the file's top-level
    # import section is left untouched.
    import importlib
    import subprocess
    import sys

    for model_name in ("en_core_web_trf", "en_core_web_sm"):
        try:
            return spacy.load(model_name)
        except OSError:
            # Model not installed; try the next candidate.
            continue

    # Use sys.executable rather than whatever "python" is on PATH so the model
    # is installed into the environment this process actually runs in, and pass
    # an argument list so no shell is involved.
    completed = subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=False,
    )
    if completed.returncode != 0:
        raise OSError(
            "Downloading spaCy model 'en_core_web_sm' failed "
            f"(exit code {completed.returncode})."
        )

    # The package was installed after interpreter start-up; refresh the import
    # system's caches so it can be discovered in this process.
    importlib.invalidate_caches()
    return spacy.load("en_core_web_sm")

# ✅ Initialize English SpaCy safely
nlp_en = safe_load_spacy()

# Ensure Stanza models exist.
# Stanza stores models under $STANZA_RESOURCES_DIR, defaulting to
# ~/stanza_resources (no leading dot), with one sub-directory per language.
# Checking per language avoids skipping a download just because some other
# language's models created the root directory.
stanza_dir = os.getenv(
    "STANZA_RESOURCES_DIR",
    os.path.join(os.path.expanduser("~"), "stanza_resources"),
)
for _lang in ("hi", "ta"):
    if not os.path.isdir(os.path.join(stanza_dir, _lang)):
        stanza.download(_lang)

# ===============================
# 1️⃣ Imports
# ===============================
import pandas as pd
import re
import docx
from collections import Counter
import stanza
from transformers import pipeline
import torch
from langdetect import detect
import streamlit as st
import io

# ===============================
# 2️⃣ Pre-download Stanza models
# ===============================
# Fetch the Hindi model first, then the Tamil model.
for _stanza_lang in ('hi', 'ta'):
    stanza.download(_stanza_lang)

# ===============================
# 3️⃣ Initialize Stanza for Hindi/Tamil
# ===============================
# Both pipelines only tokenize and POS-tag; each asks torch whether CUDA is
# available to decide on GPU use.
_stanza_processors = 'tokenize,pos'
nlp_hi = stanza.Pipeline(
    'hi',
    processors=_stanza_processors,
    use_gpu=torch.cuda.is_available(),
)
nlp_ta = stanza.Pipeline(
    'ta',
    processors=_stanza_processors,
    use_gpu=torch.cuda.is_available(),
)

# ===============================
# 4️⃣ Language-Aware Pipeline Loader
# ===============================
def load_pipelines(language_code):
    """Build the emotion and sentiment classifiers for a language.

    The language code is upper-cased before comparison. Hindi ("HI") and
    Tamil ("TA") use the BERT go-emotion checkpoint; every other language
    uses the RoBERTa go_emotions checkpoint. English ("EN") uses the SST-2
    DistilBERT sentiment checkpoint; every other language uses the
    multilingual XLM-R sentiment checkpoint. The detected language and the
    chosen device are written to the Streamlit page.

    Args:
        language_code: Language code string such as "en", "hi" or "ta".

    Returns:
        A ``(emotion_pipeline, sentiment_pipeline)`` tuple of transformers
        text-classification pipelines configured to return all label scores.
    """
    lang = language_code.upper()
    device = 0 if torch.cuda.is_available() else -1
    st.write(f"🌍 Language detected: {lang}")
    st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")

    # Emotion model: only Hindi/Tamil deviate from the default checkpoint.
    if lang in ("HI", "TA"):
        emo_model = "bhadresh-savani/bert-base-go-emotion"
    else:
        emo_model = "SamLowe/roberta-base-go_emotions"

    # Sentiment model: English-specific checkpoint, multilingual otherwise.
    sent_model = (
        "distilbert-base-uncased-finetuned-sst-2-english"
        if lang == "EN"
        else "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
    )

    def build_classifier(checkpoint):
        # The same checkpoint name supplies both the model and its tokenizer.
        return pipeline(
            "text-classification",
            model=checkpoint,
            tokenizer=checkpoint,
            return_all_scores=True,
            device=device,
        )

    emotion_pipeline = build_classifier(emo_model)
    sentiment_pipeline = build_classifier(sent_model)
    return emotion_pipeline, sentiment_pipeline

# ===============================
# 5️⃣ Read DOCX and split articles
# ===============================
def read_and_split_articles(file_path):
    """Return the non-empty paragraphs of a DOCX document.

    Args:
        file_path: Value passed straight to ``docx.Document``.

    Returns:
        A list with the whitespace-stripped text of every paragraph whose
        stripped text is non-empty, in document order.
    """
    document = docx.Document(file_path)
    kept = []
    for para in document.paragraphs:
        stripped = para.text.strip()
        if stripped:
            kept.append(stripped)
    return kept

# ===============================
# 6️⃣ Utility – Filter Neutral
# ===============================
def filter_neutral(emotion_results, neutral_threshold=0.75):
    """Map labels to rounded scores, dropping a dominant "neutral" label.

    Args:
        emotion_results: Iterable of dicts with "label" and "score" keys.
        neutral_threshold: The "neutral" entry is removed when its rounded
            score is strictly greater than this value.

    Returns:
        A dict of label -> score rounded to 3 decimals, ordered from the
        highest to the lowest score.
    """
    ranked = sorted(emotion_results, key=lambda item: item["score"], reverse=True)

    scores = {}
    for item in ranked:
        scores[item["label"]] = round(item["score"], 3)

    # The comparison deliberately uses the rounded value stored in the dict.
    neutral_score = scores.get("neutral")
    if neutral_score is not None and neutral_score > neutral_threshold:
        del scores["neutral"]
    return scores

# ===============================
# 7️⃣ Sentence Splitter
# ===============================
# Hindi sentences end with a danda or double danda; modern prose also uses
# "?" and "!".
_HI_SENTENCE_END = re.compile(r'[।॥?!]+')
# Tamil uses Latin punctuation. A period with a digit on both sides is treated
# as a decimal point rather than a sentence boundary.
_TA_SENTENCE_END = re.compile(r'(?<!\d)\.|\.(?!\d)|[?!]')


def split_sentences(text, lang):
    """Split text into sentences using language-specific rules.

    Args:
        text: The text to split.
        lang: Two-letter language code. "hi" and "ta" use punctuation-based
            regular expressions; any other value is segmented with the
            English spaCy pipeline ``nlp_en``.

    Returns:
        A list of stripped, non-empty sentences. For "hi" and "ta" the
        terminating punctuation is removed; for the spaCy path the sentence
        text is returned as spaCy segments it.
    """
    if lang == "hi":
        sentences = _HI_SENTENCE_END.split(text)
    elif lang == "ta":
        sentences = _TA_SENTENCE_END.split(text)
    else:
        sentences = [sent.text for sent in nlp_en(text).sents]
    # Splitting can leave empty or whitespace-only fragments; drop them.
    return [s.strip() for s in sentences if s.strip()]

# ===============================
# 8️⃣ PoS Tagger
# ===============================
def get_pos_tags(sentence, lang):
    """Return ``(token, part-of-speech)`` pairs for a sentence.

    Args:
        sentence: The text to tag.
        lang: "en" uses the spaCy pipeline ``nlp_en``; "hi" and "ta" use the
            Stanza pipelines ``nlp_hi`` and ``nlp_ta``.

    Returns:
        A list of ``(text, tag)`` tuples, or an empty list for any other
        language code.
    """
    if lang == "en":
        return [(tok.text, tok.pos_) for tok in nlp_en(sentence)]

    if lang not in ("hi", "ta"):
        return []

    # Hindi and Tamil share the same Stanza document structure.
    stanza_nlp = nlp_hi if lang == "hi" else nlp_ta
    parsed = stanza_nlp(sentence)
    tagged = []
    for sent in parsed.sentences:
        for word in sent.words:
            tagged.append((word.text, word.upos))
    return tagged

# ===============================
# 9️⃣ Analysis Function
# ===============================
def _analyze_sentence(sentence, lang_code, emotion_pipeline, sentiment_pipeline):
    """Run both classifiers and the POS tagger once on a single sentence."""
    clipped = sentence[:512]  # cap the input at 512 characters
    emotions = filter_neutral(emotion_pipeline(clipped)[0], neutral_threshold=0.75)
    sentiment_scores = sentiment_pipeline(clipped)[0]
    record = {
        "sentence": sentence,
        "pos_tags": get_pos_tags(sentence, lang_code),
        "emotions": emotions,
        "sentiment": max(sentiment_scores, key=lambda x: x["score"]),
    }
    return record, sentiment_scores


def _pick_overall_sentiment(label_totals, total_length):
    """Return the label with the highest length-weighted mean score, or {}."""
    if not label_totals or total_length <= 0:
        return {}
    best_label = max(label_totals, key=label_totals.get)
    return {
        "label": best_label,
        "score": round(label_totals[best_label] / total_length, 4),
    }


def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs):
    """Analyze emotions and sentiment at article, paragraph and sentence level.

    Paragraphs are the non-empty chunks of ``article_text`` separated by a
    blank line. Every sentence is passed through each model exactly once and
    the scores are reused for all three levels of the report, which is written
    to the Streamlit page in the order overall, paragraphs, sentences.

    Args:
        article_text: Full text to analyze.
        lang: Language code; only its first two characters are used.
        emotion_pipeline: Callable returning, for one string, a list whose
            first element is a list of ``{"label", "score"}`` dicts.
        sentiment_pipeline: Callable with the same return shape.
        normalize_paragraphs: When true, each paragraph's summed emotion
            scores are divided by their total and rounded to 3 decimals.

    Returns:
        A ``(results_summary, export_rows)`` tuple. ``results_summary`` holds
        one dict per sentence with keys "sentence", "pos_tags", "emotions" and
        "sentiment". ``export_rows`` holds one "Overall" row, then one
        "Paragraph" row per paragraph, then one "Sentence" row per sentence,
        each with keys "Type", "Text", "Emotions" and "Sentiment".
    """
    results_summary = []
    export_rows = []
    lang_code = lang[:2]

    paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]

    # -------------------------------
    # Single scoring pass: every sentence is sent to each model once and the
    # results are reused below.
    analysed = []  # (paragraph text, [sentence records]) in document order
    emotion_totals = {}
    sentiment_totals = {}
    total_length = 0

    for para in paragraphs:
        records = []
        for sentence in split_sentences(para, lang_code):
            record, sentiment_scores = _analyze_sentence(
                sentence, lang_code, emotion_pipeline, sentiment_pipeline
            )
            # Weight by word count so long sentences count for more.
            length = len(sentence.split())
            total_length += length
            for emo, score in record["emotions"].items():
                emotion_totals[emo] = emotion_totals.get(emo, 0) + score * length
            for entry in sentiment_scores:
                label = entry["label"]
                sentiment_totals[label] = (
                    sentiment_totals.get(label, 0) + entry["score"] * length
                )
            records.append(record)
        analysed.append((para, records))

    # -------------------------------
    # ✅ Weighted Overall results
    weighted_scores = {}
    if total_length > 0:
        weighted_scores = {
            emo: round(val / total_length, 3) for emo, val in emotion_totals.items()
        }

    # Aggregate across all sentences instead of reporting whichever single
    # sentence happened to have the most confident prediction.
    overall_sentiment = _pick_overall_sentiment(sentiment_totals, total_length)

    st.subheader("📊 OVERALL (Weighted)")
    st.write("Emotions →", weighted_scores)
    st.write("Sentiment →", overall_sentiment)

    export_rows.append({
        "Type": "Overall",
        "Text": "Weighted across article",
        "Emotions": weighted_scores,
        "Sentiment": overall_sentiment
    })

    # -------------------------------
    # Paragraph-level
    for p_idx, (para, records) in enumerate(analysed, start=1):
        para_counter = Counter()
        for record in records:
            for emo, score in record["emotions"].items():
                para_counter[emo] += score

        para_scores = dict(para_counter)
        if normalize_paragraphs:
            # ✅ Normalize scores so they sum ≤ 1
            total = sum(para_scores.values())
            if total > 0:
                para_scores = {emo: round(val / total, 3) for emo, val in para_scores.items()}

        ranked = dict(sorted(para_scores.items(), key=lambda x: x[1], reverse=True))
        st.write(f"\n📑 Paragraph {p_idx}: {para}")
        st.write("Emotions →", ranked)

        export_rows.append({
            "Type": "Paragraph",
            "Text": para,
            "Emotions": ranked,
            "Sentiment": ""
        })

    # -------------------------------
    # Sentence-level
    st.subheader("📝 SENTENCES")
    for _, records in analysed:
        for record in records:
            best_sentiment = record["sentiment"]
            results_summary.append(record)
            st.write(f"Sentence: {record['sentence']}")
            st.write(f"POS Tags → {record['pos_tags']}")
            st.write(f"Emotions → {record['emotions']}")
            st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'],4)})\n")

            export_rows.append({
                "Type": "Sentence",
                "Text": record["sentence"],
                "Emotions": record["emotions"],
                "Sentiment": best_sentiment
            })

    return results_summary, export_rows

# ===============================
# 🔟 Streamlit App
# ===============================
# langdetect is already a dependency of this script; these two names are only
# needed by the app section below.
from langdetect import DetectorFactory, LangDetectException

# langdetect is non-deterministic unless seeded; fix the seed so the same text
# always yields the same language (and therefore the same models).
DetectorFactory.seed = 0

st.title("📑 Multilingual Text Emotion + Sentiment Analyzer")

uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
text_input = st.text_area("Or paste text here")

# ✅ Checkbox for paragraph normalization
normalize_paragraphs = st.checkbox("Normalize paragraph emotion scores", value=True)

# ✅ Placeholder for download buttons at the top
download_placeholder = st.empty()

if st.button("🔍 Analyze"):
    with st.spinner("Running analysis... ⏳"):
        # An uploaded file takes precedence over pasted text.
        if uploaded_file:
            articles = read_and_split_articles(uploaded_file)
            text_to_analyze = "\n\n".join(articles)
        elif text_input.strip():
            text_to_analyze = text_input
        else:
            st.warning("Please upload a DOCX file or paste text to analyze.")
            st.stop()

        # A DOCX with no text would otherwise run an empty analysis.
        if not text_to_analyze.strip():
            st.warning("The uploaded document contains no text to analyze.")
            st.stop()

        # Detection raises when the sample has no usable features (for example
        # only digits or punctuation); fall back to English in that case.
        try:
            detected_lang = detect(text_to_analyze[:200])
        except LangDetectException:
            detected_lang = "en"

        emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
        results, export_rows = analyze_article(
            text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs
        )

        # ✅ Show download buttons at the TOP
        df_export = pd.DataFrame(export_rows)
        csv = df_export.to_csv(index=False).encode("utf-8")

        with download_placeholder.container():
            st.download_button(
                label="⬇️ Download CSV",
                data=csv,
                file_name="analysis_results.csv",
                mime="text/csv",
            )

            excel_buffer = io.BytesIO()
            df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
            st.download_button(
                label="⬇️ Download Excel",
                # Pass the bytes directly so the buffer's position is irrelevant.
                data=excel_buffer.getvalue(),
                file_name="analysis_results.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )