import os
import subprocess
import sys

import spacy
import stanza

# ===============================
# 🔧 Safe SpaCy + Stanza Downloads
# ===============================
def safe_load_spacy():
    """Prefer the transformer English model; fall back to the small model,
    downloading it on first run if neither is installed."""
    try:
        return spacy.load("en_core_web_trf")
    except OSError:
        try:
            return spacy.load("en_core_web_sm")
        except OSError:
            # Use the running interpreter rather than whatever "python" is on PATH
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
            return spacy.load("en_core_web_sm")
# ✅ Initialize English SpaCy safely
nlp_en = safe_load_spacy()

# Ensure Stanza models exist (coarse check: the directory can exist without
# containing the 'hi'/'ta' models, in which case Pipeline() below will raise)
stanza_dir = os.path.expanduser("~/.stanza_resources")
if not os.path.exists(stanza_dir):
    stanza.download('hi')
    stanza.download('ta')
# ===============================
# 1️⃣ Imports
# ===============================
import pandas as pd
import re
import docx  # python-docx
from collections import Counter
from transformers import pipeline
import torch
from langdetect import detect
import streamlit as st
import io
# ===============================
# 2️⃣ Pre-download Stanza models
# ===============================
# (already ensured above; repeating stanza.download() unconditionally here
# would re-check the models on every startup)
# ===============================
# 3️⃣ Initialize Stanza for Hindi/Tamil
# ===============================
nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
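# Stanza usage sketch (illustrative, not output from a real run): a processed
# doc exposes .sentences, each holding .words with .text and .upos attributes;
# this is the shape relied on in get_pos_tags below, e.g.:
#   nlp_hi("राम घर गया।").sentences[0].words[0].upos  ->  "PROPN"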
# ===============================
# 4️⃣ Language-Aware Pipeline Loader
# ===============================
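# Note (a suggestion, not in the original): decorating this loader with
# @st.cache_resource (Streamlit >= 1.18) would keep the models loaded across
# reruns instead of reloading them on every "Analyze" click.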
def load_pipelines(language_code):
    lang = language_code.upper()
    device = 0 if torch.cuda.is_available() else -1
    st.write(f"🌐 Language detected: {lang}")
    st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
    # Emotion model
    if lang == "EN":
        emo_model = "SamLowe/roberta-base-go_emotions"
    elif lang in ["HI", "TA"]:
        # NOTE: this is an English GoEmotions model, so emotion scores on
        # Hindi/Tamil text should be treated as rough at best
        emo_model = "bhadresh-savani/bert-base-go-emotion"
    else:
        emo_model = "SamLowe/roberta-base-go_emotions"
    emotion_pipeline = pipeline(
        "text-classification",
        model=emo_model,
        tokenizer=emo_model,
        return_all_scores=True,  # deprecated in newer transformers; top_k=None is the replacement
        device=device
    )
    # Sentiment model
    if lang == "EN":
        sent_model = "distilbert-base-uncased-finetuned-sst-2-english"
    else:
        sent_model = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
    sentiment_pipeline = pipeline(
        "text-classification",
        model=sent_model,
        tokenizer=sent_model,
        return_all_scores=True,
        device=device
    )
    return emotion_pipeline, sentiment_pipeline
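# Output shape with return_all_scores=True, as consumed below (illustrative
# values, not a real run):
#   emotion_pipeline("What a pleasant surprise!")[0]
#     -> [{"label": "surprise", "score": 0.71}, {"label": "joy", "score": 0.12}, ...]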
# ===============================
# 5️⃣ Read DOCX and split articles
# ===============================
def read_and_split_articles(file_path):
    doc = docx.Document(file_path)
    paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    return paragraphs  # ✅ each non-empty docx paragraph separately
# ===============================
# 6️⃣ Utility – Filter Neutral
# ===============================
def filter_neutral(emotion_results, neutral_threshold=0.75):
    """Sort emotion scores descending and drop 'neutral' when it dominates."""
    scores = {r["label"]: round(r["score"], 3)
              for r in sorted(emotion_results, key=lambda x: x["score"], reverse=True)}
    if "neutral" in scores and scores["neutral"] > neutral_threshold:
        scores.pop("neutral")
    return scores
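# e.g. filter_neutral([{"label": "neutral", "score": 0.91}, {"label": "joy", "score": 0.05}])
#      -> {"joy": 0.05}   (illustrative: the dominant neutral score is dropped)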
# ===============================
# 7️⃣ Sentence Splitter
# ===============================
def split_sentences(text, lang):
    if lang == "hi":
        sentences = re.split(r'।', text)  # split on the Devanagari danda
    elif lang == "ta":
        sentences = re.split(r'\.', text)
    else:
        doc = nlp_en(text)
        sentences = [sent.text.strip() for sent in doc.sents]
    return [s.strip() for s in sentences if s.strip()]
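# e.g. split_sentences("राम घर गया। वह सो गया।", "hi")
#      -> ["राम घर गया", "वह सो गया"]   (the danda itself is consumed by the split)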
# ===============================
# 8️⃣ PoS Tagger
# ===============================
def get_pos_tags(sentence, lang):
    if lang == "en":
        doc = nlp_en(sentence)
        return [(token.text, token.pos_) for token in doc]
    elif lang == "hi":
        doc = nlp_hi(sentence)
        return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
    elif lang == "ta":
        doc = nlp_ta(sentence)
        return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
    else:
        return []
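# e.g. get_pos_tags("Dogs bark loudly.", "en") typically yields
#      [("Dogs", "NOUN"), ("bark", "VERB"), ("loudly", "ADV"), (".", "PUNCT")]
# (exact tags depend on the loaded spaCy model)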
# ===============================
# 9️⃣ Analysis Function
# ===============================
def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs):
    results_summary = []
    export_rows = []
    para_counters = []
    paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
    # -------------------------------
    # ✅ Weighted overall results
    weighted_scores = {}
    total_length = 0
    all_sentiments = []
    for para in paragraphs:
        sentences = split_sentences(para, lang[:2])
        for sentence in sentences:
            emo_results = emotion_pipeline(sentence[:512])[0]  # [:512] is a rough character-level truncation guard
            filtered = filter_neutral(emo_results)
            length = len(sentence.split())
            total_length += length
            for emo, score in filtered.items():
                weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
            sentiment_results = sentiment_pipeline(sentence[:512])[0]
            all_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
    if total_length > 0:
        weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
    # Overall sentiment = the single most confident sentence-level prediction
    overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
    st.subheader("📊 OVERALL (Weighted)")
    st.write("Emotions →", weighted_scores)
    st.write("Sentiment →", overall_sentiment)
    export_rows.append({
        "Type": "Overall",
        "Text": "Weighted across article",
        "Emotions": weighted_scores,
        "Sentiment": overall_sentiment
    })
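    # The block above computes, for each emotion e, a length-weighted average:
    #   weighted_scores[e] = sum_i(score_i(e) * len_i) / sum_i(len_i)
    # where len_i is the word count of sentence i, so longer sentences contribute more.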
    # -------------------------------
    # Paragraph-level
    for p_idx, para in enumerate(paragraphs, start=1):
        para_counter = Counter()
        sentences = split_sentences(para, lang[:2])
        for sentence in sentences:
            results = emotion_pipeline(sentence[:512])[0]
            filtered = filter_neutral(results, neutral_threshold=0.75)
            for emo, score in filtered.items():
                para_counter[emo] += score
        if normalize_paragraphs:
            # ✅ Normalize scores so they sum ≤ 1
            total = sum(para_counter.values())
            if total > 0:
                para_counter = {emo: round(val / total, 3) for emo, val in para_counter.items()}
        para_counters.append((para, dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))))
        st.write(f"\n📄 Paragraph {p_idx}: {para}")
        st.write("Emotions →", para_counters[-1][1])
        export_rows.append({
            "Type": "Paragraph",
            "Text": para,
            "Emotions": para_counters[-1][1],
            "Sentiment": ""
        })
    # -------------------------------
    # Sentence-level
    st.subheader("📝 SENTENCES")
    for para in paragraphs:
        sentences = split_sentences(para, lang[:2])
        for sentence in sentences:
            pos_tags = get_pos_tags(sentence, lang[:2])
            results = emotion_pipeline(sentence[:512])[0]
            filtered = filter_neutral(results, neutral_threshold=0.75)
            sentiment_results = sentiment_pipeline(sentence[:512])[0]
            best_sentiment = max(sentiment_results, key=lambda x: x["score"])
            results_summary.append({
                "sentence": sentence,
                "pos_tags": pos_tags,
                "emotions": filtered,
                "sentiment": best_sentiment
            })
            st.write(f"Sentence: {sentence}")
            st.write(f"POS Tags → {pos_tags}")
            st.write(f"Emotions → {filtered}")
            st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'], 4)})\n")
            export_rows.append({
                "Type": "Sentence",
                "Text": sentence,
                "Emotions": filtered,
                "Sentiment": best_sentiment
            })
    return results_summary, export_rows
# ===============================
# 🚀 Streamlit App
# ===============================
st.title("🌍 Multilingual Text Emotion + Sentiment Analyzer")
uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
text_input = st.text_area("Or paste text here")
# ✅ Checkbox for paragraph normalization
normalize_paragraphs = st.checkbox("Normalize paragraph emotion scores", value=True)
# ✅ Placeholder for download buttons at the top
download_placeholder = st.empty()
if st.button("🔍 Analyze"):
    with st.spinner("Running analysis... ⏳"):
        if uploaded_file:
            articles = read_and_split_articles(uploaded_file)
            text_to_analyze = "\n\n".join(articles) if articles else ""
        elif text_input.strip():
            text_to_analyze = text_input
        else:
            st.warning("Please upload a DOCX file or paste text to analyze.")
            st.stop()
        # langdetect can be unstable on very short inputs; 200 chars is a pragmatic sample
        detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
        emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
        results, export_rows = analyze_article(
            text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs
        )
        # ✅ Show download buttons at the TOP (rendered into the placeholder created above)
        df_export = pd.DataFrame(export_rows)
        csv = df_export.to_csv(index=False).encode("utf-8")
        with download_placeholder.container():
            st.download_button(
                label="⬇️ Download CSV",
                data=csv,
                file_name="analysis_results.csv",
                mime="text/csv",
            )
            excel_buffer = io.BytesIO()
            df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
            st.download_button(
                label="⬇️ Download Excel",
                data=excel_buffer.getvalue(),  # pass bytes: the buffer's cursor sits at the end after to_excel
                file_name="analysis_results.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )
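# Dependencies implied by the imports above (unpinned; exact versions are an
# assumption): streamlit, transformers, torch, spacy, stanza, python-docx,
# langdetect, pandas, xlsxwriter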