Kartikay Khosla committed on
Commit
00bb6d2
·
1 Parent(s): afd3717

Add SAURL.py as main entrypoint with Gemini insights

Browse files
Files changed (1) hide show
  1. SAURL.py +62 -69
SAURL.py CHANGED
@@ -12,6 +12,7 @@ from langdetect import detect
12
  import streamlit as st
13
  import io
14
  from newspaper import Article # ✅ for URL input
 
15
 
16
  # ===============================
17
  # 🔧 Safe SpaCy + Stanza Downloads
@@ -39,6 +40,13 @@ stanza.download('ta')
39
  nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
40
  nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
41
 
 
 
 
 
 
 
 
42
 
43
  # ===============================
44
  # Language-Aware Pipeline Loader
@@ -79,7 +87,6 @@ def load_pipelines(language_code):
79
 
80
  return emotion_pipeline, sentiment_pipeline
81
 
82
-
83
  # ===============================
84
  # DOCX Reader – keep paras separate
85
  # ===============================
@@ -88,7 +95,6 @@ def read_and_split_articles(file_path):
88
  paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
89
  return paragraphs
90
 
91
-
92
  # ===============================
93
  # URL Reader – title + main body
94
  # ===============================
@@ -101,7 +107,6 @@ def read_article_from_url(url):
101
  full_text = f"{title}\n\n{body}"
102
  return full_text
103
 
104
-
105
  # ===============================
106
  # Filter Neutral
107
  # ===============================
@@ -112,7 +117,6 @@ def filter_neutral(emotion_results, neutral_threshold=0.75):
112
  scores.pop("neutral")
113
  return scores
114
 
115
-
116
  # ===============================
117
  # Sentence Splitter
118
  # ===============================
@@ -126,7 +130,6 @@ def split_sentences(text, lang):
126
  sentences = [sent.text.strip() for sent in doc.sents]
127
  return [s.strip() for s in sentences if s.strip()]
128
 
129
-
130
  # ===============================
131
  # POS Tagger
132
  # ===============================
@@ -143,16 +146,47 @@ def get_pos_tags(sentence, lang):
143
  else:
144
  return []
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  # ===============================
148
  # Analysis Function
149
  # ===============================
150
  def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
151
- results_summary = []
152
  export_rows = []
153
- para_counters = []
154
- emotion_to_sentences = {}
155
-
156
  paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
157
  if len(paragraphs) <= 1:
158
  paragraphs = [p.strip() for p in article_text.split("\n") if p.strip()]
@@ -175,7 +209,8 @@ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
175
  all_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
176
 
177
  if total_length > 0:
178
- weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
 
179
 
180
  overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
181
 
@@ -193,63 +228,36 @@ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
193
  # Paragraph-level
194
  for p_idx, para in enumerate(paragraphs, start=1):
195
  para_counter = Counter()
 
196
  sentences = split_sentences(para, lang[:2])
197
  for sentence in sentences:
198
  results = emotion_pipeline(sentence[:512])[0]
199
  filtered = filter_neutral(results, neutral_threshold=0.75)
200
  for emo, score in filtered.items():
201
  para_counter[emo] += score
202
- if emo not in emotion_to_sentences:
203
- emotion_to_sentences[emo] = []
204
- if emo in sorted(filtered, key=filtered.get, reverse=True)[:5]:
205
- emotion_to_sentences[emo].append(f"(Para {p_idx}) {sentence}")
 
 
206
 
207
- para_counters.append((para, dict(sorted(para_counter.items(), key=lambda x:x[1], reverse=True))))
208
  st.write(f"\n📑 Paragraph {p_idx}: {para}")
209
- st.write("Emotions →", para_counters[-1][1])
 
 
 
 
210
 
211
  export_rows.append({
212
  "Type": "Paragraph",
213
  "Text": para,
214
- "Emotions": para_counters[-1][1],
215
- "Sentiment": ""
 
216
  })
217
 
218
- # Sentence-level
219
- st.subheader("📝 SENTENCES")
220
- for para in paragraphs:
221
- sentences = split_sentences(para, lang[:2])
222
- for sentence in sentences:
223
- pos_tags = get_pos_tags(sentence, lang[:2])
224
- results = emotion_pipeline(sentence[:512])[0]
225
- filtered = filter_neutral(results, neutral_threshold=0.75)
226
- sentiment_results = sentiment_pipeline(sentence[:512])[0]
227
- best_sentiment = max(sentiment_results, key=lambda x: x["score"])
228
- results_summary.append({
229
- "sentence": sentence,
230
- "pos_tags": pos_tags,
231
- "emotions": filtered,
232
- "sentiment": best_sentiment
233
- })
234
- st.write(f"Sentence: {sentence}")
235
- st.write(f"POS Tags → {pos_tags}")
236
- st.write(f"Emotions → {filtered}")
237
- st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'],4)})\n")
238
-
239
- for emo in sorted(filtered, key=filtered.get, reverse=True)[:5]:
240
- if emo not in emotion_to_sentences:
241
- emotion_to_sentences[emo] = []
242
- emotion_to_sentences[emo].append(f"(Sentence) {sentence}")
243
-
244
- export_rows.append({
245
- "Type": "Sentence",
246
- "Text": sentence,
247
- "Emotions": filtered,
248
- "Sentiment": best_sentiment
249
- })
250
-
251
- return results_summary, export_rows, emotion_to_sentences
252
-
253
 
254
  # ===============================
255
  # Streamlit App
@@ -275,9 +283,7 @@ if st.button("🔍 Analyze"):
275
 
276
  detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
277
  emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
278
- results, export_rows, emotion_to_sentences = analyze_article(
279
- text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline
280
- )
281
 
282
  # ✅ Download buttons FIRST
283
  df_export = pd.DataFrame(export_rows)
@@ -298,16 +304,3 @@ if st.button("πŸ” Analyze"):
298
  file_name="analysis_results.xlsx",
299
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
300
  )
301
-
302
- # ✅ Emotion filter tabs at the end
303
- if emotion_to_sentences and len(emotion_to_sentences) > 0:
304
- st.subheader("🎭 Explore by Emotion (Top 5 only)")
305
- emotion_list = list(emotion_to_sentences.keys())
306
- tabs = st.tabs(emotion_list)
307
- for idx, emo in enumerate(emotion_list):
308
- with tabs[idx]:
309
- st.write(f"### 🔹 {emo.upper()}")
310
- for text in emotion_to_sentences[emo]:
311
- st.write(f"- {text}")
312
- else:
313
- st.info("No emotions strong enough to show in Top 5 filters.")
 
12
  import streamlit as st
13
  import io
14
  from newspaper import Article # ✅ for URL input
15
+ import google.generativeai as genai # ✅ Gemini for insights
16
 
17
  # ===============================
18
  # 🔧 Safe SpaCy + Stanza Downloads
 
40
  nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
41
  nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
42
 
43
+ # ===============================
44
+ # Gemini setup
45
+ # ===============================
46
+ api_key = os.getenv("GEMINI_API_KEY")
47
+ if not api_key:
48
+ raise ValueError("❌ Missing GEMINI_API_KEY. Please set it in Hugging Face secrets or locally.")
49
+ genai.configure(api_key=api_key)
50
 
51
  # ===============================
52
  # Language-Aware Pipeline Loader
 
87
 
88
  return emotion_pipeline, sentiment_pipeline
89
 
 
90
  # ===============================
91
  # DOCX Reader – keep paras separate
92
  # ===============================
 
95
  paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
96
  return paragraphs
97
 
 
98
  # ===============================
99
  # URL Reader – title + main body
100
  # ===============================
 
107
  full_text = f"{title}\n\n{body}"
108
  return full_text
109
 
 
110
  # ===============================
111
  # Filter Neutral
112
  # ===============================
 
117
  scores.pop("neutral")
118
  return scores
119
 
 
120
  # ===============================
121
  # Sentence Splitter
122
  # ===============================
 
130
  sentences = [sent.text.strip() for sent in doc.sents]
131
  return [s.strip() for s in sentences if s.strip()]
132
 
 
133
  # ===============================
134
  # POS Tagger
135
  # ===============================
 
146
  else:
147
  return []
148
 
149
+ # ===============================
150
+ # Gemini – Generate Insight
151
+ # ===============================
152
+ def generate_insight(paragraph, emotions, sentiment):
153
+ """Use Gemini to suggest improvements with Top 3 emotions only"""
154
+ try:
155
+ top_emotions = sorted(emotions.items(), key=lambda x: x[1], reverse=True)[:3]
156
+ emo_text = ", ".join([f"{k}: {v}" for k, v in top_emotions])
157
+ sent_text = f"{sentiment['label']} ({round(sentiment['score'], 3)})" if sentiment else "N/A"
158
+
159
+ prompt = (
160
+ f"Here is a paragraph:\n\n{paragraph}\n\n"
161
+ f"Top 3 detected emotions: {emo_text}\n"
162
+ f"Overall sentiment: {sent_text}\n\n"
163
+ "👉 Please suggest how to rewrite or improve this paragraph for better clarity, "
164
+ "engagement, and emotional impact. Provide constructive, content-specific insights."
165
+ )
166
+
167
+ model = genai.GenerativeModel("gemini-1.5-flash")
168
+ response = model.generate_content(prompt)
169
+
170
+ return response.text.strip() if response and response.text else "No insight generated."
171
+ except Exception as e:
172
+ return f"⚠️ Insight generation failed: {str(e)}"
173
+
174
+ # ===============================
175
+ # Normalize Scores (scale to 1)
176
+ # ===============================
177
+ def normalize_scores(scores: dict):
178
+ if not scores:
179
+ return scores
180
+ max_val = max(scores.values())
181
+ if max_val == 0:
182
+ return scores
183
+ return {k: round(v / max_val, 3) for k, v in scores.items()}
184
 
185
  # ===============================
186
  # Analysis Function
187
  # ===============================
188
  def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
 
189
  export_rows = []
 
 
 
190
  paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
191
  if len(paragraphs) <= 1:
192
  paragraphs = [p.strip() for p in article_text.split("\n") if p.strip()]
 
209
  all_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
210
 
211
  if total_length > 0:
212
+ weighted_scores = {emo: val / total_length for emo, val in weighted_scores.items()}
213
+ weighted_scores = normalize_scores(weighted_scores) # ✅ normalize to scale of 1
214
 
215
  overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
216
 
 
228
  # Paragraph-level
229
  for p_idx, para in enumerate(paragraphs, start=1):
230
  para_counter = Counter()
231
+ all_para_sentiments = []
232
  sentences = split_sentences(para, lang[:2])
233
  for sentence in sentences:
234
  results = emotion_pipeline(sentence[:512])[0]
235
  filtered = filter_neutral(results, neutral_threshold=0.75)
236
  for emo, score in filtered.items():
237
  para_counter[emo] += score
238
+ sentiment_results = sentiment_pipeline(sentence[:512])[0]
239
+ all_para_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
240
+
241
+ para_emotions = dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))
242
+ para_emotions = normalize_scores(para_emotions) # ✅ normalize to scale of 1
243
+ para_sentiment = max(all_para_sentiments, key=lambda x: x["score"]) if all_para_sentiments else {}
244
 
 
245
  st.write(f"\n📑 Paragraph {p_idx}: {para}")
246
+ st.write("Emotions →", para_emotions)
247
+ st.write("Sentiment →", para_sentiment)
248
+
249
+ insight = generate_insight(para, para_emotions, para_sentiment)
250
+ st.write("💡 Insight →", insight)
251
 
252
  export_rows.append({
253
  "Type": "Paragraph",
254
  "Text": para,
255
+ "Emotions": para_emotions,
256
+ "Sentiment": para_sentiment,
257
+ "Insight": insight
258
  })
259
 
260
+ return export_rows
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  # ===============================
263
  # Streamlit App
 
283
 
284
  detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
285
  emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
286
+ export_rows = analyze_article(text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline)
 
 
287
 
288
  # ✅ Download buttons FIRST
289
  df_export = pd.DataFrame(export_rows)
 
304
  file_name="analysis_results.xlsx",
305
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
306
  )