Kartikay Khosla committed · 00bb6d2 · 1 parent: afd3717
Add SAURL.py as main entrypoint with Gemini insights
SAURL.py
CHANGED
@@ -12,6 +12,7 @@ from langdetect import detect
 import streamlit as st
 import io
 from newspaper import Article  # ✅ for URL input
+import google.generativeai as genai  # ✅ Gemini for insights
 
 # ===============================
 # 🧠 Safe SpaCy + Stanza Downloads
@@ -39,6 +40,13 @@ stanza.download('ta')
 nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
 nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
 
+# ===============================
+# Gemini setup
+# ===============================
+api_key = os.getenv("GEMINI_API_KEY")
+if not api_key:
+    raise ValueError("❌ Missing GEMINI_API_KEY. Please set it in Hugging Face secrets or locally.")
+genai.configure(api_key=api_key)
 
 # ===============================
 # Language-Aware Pipeline Loader
@@ -79,7 +87,6 @@ def load_pipelines(language_code):
 
     return emotion_pipeline, sentiment_pipeline
 
-
 # ===============================
 # DOCX Reader → keep paras separate
 # ===============================
@@ -88,7 +95,6 @@ def read_and_split_articles(file_path):
     paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
     return paragraphs
 
-
 # ===============================
 # URL Reader → title + main body
 # ===============================
@@ -101,7 +107,6 @@ def read_article_from_url(url):
     full_text = f"{title}\n\n{body}"
     return full_text
 
-
 # ===============================
 # Filter Neutral
 # ===============================
@@ -112,7 +117,6 @@ def filter_neutral(emotion_results, neutral_threshold=0.75):
         scores.pop("neutral")
     return scores
 
-
 # ===============================
 # Sentence Splitter
 # ===============================
@@ -126,7 +130,6 @@ def split_sentences(text, lang):
         sentences = [sent.text.strip() for sent in doc.sents]
     return [s.strip() for s in sentences if s.strip()]
 
-
 # ===============================
 # POS Tagger
 # ===============================
@@ -143,16 +146,47 @@ def get_pos_tags(sentence, lang):
     else:
         return []
 
+# ===============================
+# Gemini → Generate Insight
+# ===============================
+def generate_insight(paragraph, emotions, sentiment):
+    """Use Gemini to suggest improvements with Top 3 emotions only"""
+    try:
+        top_emotions = sorted(emotions.items(), key=lambda x: x[1], reverse=True)[:3]
+        emo_text = ", ".join([f"{k}: {v}" for k, v in top_emotions])
+        sent_text = f"{sentiment['label']} ({round(sentiment['score'], 3)})" if sentiment else "N/A"
+
+        prompt = (
+            f"Here is a paragraph:\n\n{paragraph}\n\n"
+            f"Top 3 detected emotions: {emo_text}\n"
+            f"Overall sentiment: {sent_text}\n\n"
+            "👉 Please suggest how to rewrite or improve this paragraph for better clarity, "
+            "engagement, and emotional impact. Provide constructive, content-specific insights."
+        )
+
+        model = genai.GenerativeModel("gemini-1.5-flash")
+        response = model.generate_content(prompt)
+
+        return response.text.strip() if response and response.text else "No insight generated."
+    except Exception as e:
+        return f"⚠️ Insight generation failed: {str(e)}"
+
+# ===============================
+# Normalize Scores (scale to 1)
+# ===============================
+def normalize_scores(scores: dict):
+    if not scores:
+        return scores
+    max_val = max(scores.values())
+    if max_val == 0:
+        return scores
+    return {k: round(v / max_val, 3) for k, v in scores.items()}
 
 # ===============================
 # Analysis Function
 # ===============================
 def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
-    results_summary = []
     export_rows = []
-    para_counters = []
-    emotion_to_sentences = {}
-
     paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
     if len(paragraphs) <= 1:
         paragraphs = [p.strip() for p in article_text.split("\n") if p.strip()]
@@ -175,7 +209,8 @@ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
         all_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
 
     if total_length > 0:
-        weighted_scores = {emo:
+        weighted_scores = {emo: val / total_length for emo, val in weighted_scores.items()}
+        weighted_scores = normalize_scores(weighted_scores)  # ✅ normalize to scale of 1
 
     overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
 
@@ -193,63 +228,36 @@ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
     # Paragraph-level
     for p_idx, para in enumerate(paragraphs, start=1):
         para_counter = Counter()
+        all_para_sentiments = []
         sentences = split_sentences(para, lang[:2])
         for sentence in sentences:
             results = emotion_pipeline(sentence[:512])[0]
             filtered = filter_neutral(results, neutral_threshold=0.75)
             for emo, score in filtered.items():
                 para_counter[emo] += score
-
-
-
-
+            sentiment_results = sentiment_pipeline(sentence[:512])[0]
+            all_para_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
+
+        para_emotions = dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))
+        para_emotions = normalize_scores(para_emotions)  # ✅ normalize to scale of 1
+        para_sentiment = max(all_para_sentiments, key=lambda x: x["score"]) if all_para_sentiments else {}
 
-        para_counters.append((para, dict(sorted(para_counter.items(), key=lambda x:x[1], reverse=True))))
         st.write(f"\n📌 Paragraph {p_idx}: {para}")
-        st.write("Emotions →",
+        st.write("Emotions →", para_emotions)
+        st.write("Sentiment →", para_sentiment)
+
+        insight = generate_insight(para, para_emotions, para_sentiment)
+        st.write("💡 Insight →", insight)
 
         export_rows.append({
             "Type": "Paragraph",
             "Text": para,
-            "Emotions":
-            "Sentiment":
+            "Emotions": para_emotions,
+            "Sentiment": para_sentiment,
+            "Insight": insight
         })
 
-
-    st.subheader("🔍 SENTENCES")
-    for para in paragraphs:
-        sentences = split_sentences(para, lang[:2])
-        for sentence in sentences:
-            pos_tags = get_pos_tags(sentence, lang[:2])
-            results = emotion_pipeline(sentence[:512])[0]
-            filtered = filter_neutral(results, neutral_threshold=0.75)
-            sentiment_results = sentiment_pipeline(sentence[:512])[0]
-            best_sentiment = max(sentiment_results, key=lambda x: x["score"])
-            results_summary.append({
-                "sentence": sentence,
-                "pos_tags": pos_tags,
-                "emotions": filtered,
-                "sentiment": best_sentiment
-            })
-            st.write(f"Sentence: {sentence}")
-            st.write(f"POS Tags → {pos_tags}")
-            st.write(f"Emotions → {filtered}")
-            st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'],4)})\n")
-
-            for emo in sorted(filtered, key=filtered.get, reverse=True)[:5]:
-                if emo not in emotion_to_sentences:
-                    emotion_to_sentences[emo] = []
-                emotion_to_sentences[emo].append(f"(Sentence) {sentence}")
-
-            export_rows.append({
-                "Type": "Sentence",
-                "Text": sentence,
-                "Emotions": filtered,
-                "Sentiment": best_sentiment
-            })
-
-    return results_summary, export_rows, emotion_to_sentences
-
+    return export_rows
 
 # ===============================
 # Streamlit App
@@ -275,9 +283,7 @@ if st.button("🔍 Analyze"):
 
     detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
    emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
-    results_summary, export_rows, emotion_to_sentences = analyze_article(
-        text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline
-    )
+    export_rows = analyze_article(text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline)
 
     # ✅ Download buttons FIRST
     df_export = pd.DataFrame(export_rows)
@@ -298,16 +304,3 @@ if st.button("🔍 Analyze"):
         file_name="analysis_results.xlsx",
         mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
     )
-
-    # ✅ Emotion filter tabs at the end
-    if emotion_to_sentences and len(emotion_to_sentences) > 0:
-        st.subheader("🔍 Explore by Emotion (Top 5 only)")
-        emotion_list = list(emotion_to_sentences.keys())
-        tabs = st.tabs(emotion_list)
-        for idx, emo in enumerate(emotion_list):
-            with tabs[idx]:
-                st.write(f"### 🔹 {emo.upper()}")
-                for text in emotion_to_sentences[emo]:
-                    st.write(f"- {text}")
-    else:
-        st.info("No emotions strong enough to show in Top 5 filters.")
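A quick way to sanity-check the new helpers outside Streamlit is sketched below. This snippet is not part of the commit: it assumes the google-generativeai package is installed and GEMINI_API_KEY is exported (the same variable the new setup block checks), and the sample paragraph and raw emotion scores are invented for illustration.

# Minimal smoke test for the committed helpers (illustrative only).
import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])  # same env var SAURL.py requires

def normalize_scores(scores: dict):
    # Same scaling as the committed helper: divide by the max score.
    if not scores:
        return scores
    max_val = max(scores.values())
    if max_val == 0:
        return scores
    return {k: round(v / max_val, 3) for k, v in scores.items()}

# Made-up raw scores; normalization yields relative weights, not probabilities.
emotions = normalize_scores({"joy": 2.4, "anger": 1.2, "fear": 0.6})
print(emotions)  # {'joy': 1.0, 'anger': 0.5, 'fear': 0.25}

# Same model and call pattern as generate_insight().
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content(
    "Here is a paragraph:\n\nThe launch went better than anyone expected.\n\n"
    f"Top 3 detected emotions: {emotions}\n"
    "Suggest how to rewrite this paragraph for clarity and emotional impact."
)
print(response.text.strip() if response and response.text else "No insight generated.")

Because normalize_scores divides every score by the maximum, the Emotions values in the Streamlit output and the CSV/XLSX export are relative weights on a 0-1 scale (the top emotion is pinned at 1.0), not summed model probabilities.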