Kartik2204 committed
Commit 846f6aa · verified · 1 Parent(s): 0b2c8c6

Update app.py

Files changed (1):
  1. app.py +72 -250

app.py CHANGED
@@ -1,4 +1,3 @@
-# app.py
 import os
 import spacy
 import stanza
@@ -13,8 +12,6 @@ import streamlit as st
 import io
 from newspaper import Article
 import concurrent.futures
-import json
-import tempfile
 
 # ===============================
 # 🔑 Vertex AI Setup
@@ -22,6 +19,9 @@ import tempfile
 import vertexai
 from vertexai.preview.generative_models import GenerativeModel
 
+import json
+import tempfile
+
 # Ensure GCP credentials exist
 if "GCP_SERVICE_ACCOUNT_JSON" not in os.environ:
     raise RuntimeError("❌ GCP_SERVICE_ACCOUNT_JSON secret not found in Hugging Face Space")
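
The json and tempfile imports move down next to the Vertex AI setup because that is where the secret gets used. The initialization itself sits below this hunk and is not shown; a minimal sketch of the usual pattern, with the region as a placeholder assumption and the project ID read from the secret itself:

# Not part of this commit: sketch of how the secret is typically materialized.
import json
import os
import tempfile

import vertexai

creds_json = os.environ["GCP_SERVICE_ACCOUNT_JSON"]
# Write the in-memory secret to a file so google-auth can pick it up.
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
    f.write(creds_json)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = f.name

# "us-central1" is a placeholder region, not a value taken from this repo.
vertexai.init(project=json.loads(creds_json)["project_id"], location="us-central1")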
@@ -119,19 +119,14 @@ def load_pipelines(language_code):
     return emotion_pipeline, sentiment_pipeline
 
 # ===============================
-# DOCX Reader (enhanced: detect bold runs as sub-headlines)
+# DOCX Reader
 # ===============================
 def read_and_split_articles(file_path):
     doc = docx.Document(file_path)
     paragraphs = []
     for para in doc.paragraphs:
-        text = para.text.strip()
-        if not text:
-            continue
-
-        # If this paragraph contains bold run(s), keep it as its own paragraph (likely a subhead)
-        is_bold = any([r.bold for r in para.runs]) if para.runs else False
-        paragraphs.append(text if not is_bold else text)
+        if para.text.strip():
+            paragraphs.append(para.text.strip())
 
     headline = paragraphs[0] if paragraphs else ""
     body_paragraphs = paragraphs[1:] if len(paragraphs) > 1 else []
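
With the bold-run heuristic removed (the old version appended the same text either way, so the branch was dead code), the reader simply keeps every non-empty paragraph and treats the first one as the headline. Illustrative usage; the path is a placeholder, and docx.Document also accepts the file-like object Streamlit's uploader returns:

headline, body_paragraphs = read_and_split_articles("sample_article.docx")  # placeholder path
print("Headline:", headline)
print("Body paragraphs:", len(body_paragraphs))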
@@ -139,24 +134,18 @@ def read_and_split_articles(file_path):
     return headline, body_paragraphs
 
 # ===============================
-# Robust URL Reader
+# URL Reader
 # ===============================
 def read_article_from_url(url):
-    # safe-guard trailing spaces and encoded spaces
-    if not url or not isinstance(url, str):
-        return "", []
-    url = url.strip()
-    # try to download & parse
-    try:
-        article = Article(url)
-        article.download()
-        article.parse()
-    except Exception as e:
-        st.warning(f"⚠️ Could not download article: {e}")
-        return "", []
+    article = Article(url)
+    article.download()
+    article.parse()
+
     headline = article.title.strip() if article.title else ""
     text_body = article.text.strip() if article.text else ""
+
     body_paragraphs = [p.strip() for p in text_body.split("\n") if p.strip()]
+
     return headline, body_paragraphs
 
 # ===============================
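
Dropping the guard clauses and the try/except changes the failure mode: a bad URL now raises instead of returning ("", []) with a warning. Callers that want the old soft-fail behavior would need something like the sketch below; the URL is a placeholder, and the exception import assumes newspaper3k's usual layout:

from newspaper import ArticleException  # raised on failed downloads/parses

try:
    headline, body = read_article_from_url("https://example.com/story")  # placeholder URL
except ArticleException as e:
    headline, body = "", []
    print(f"⚠️ Could not download article: {e}")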
@@ -164,19 +153,20 @@ def read_article_from_url(url):
 # ===============================
 def filter_neutral(emotion_results, neutral_threshold=0.75):
     sorted_results = sorted(emotion_results, key=lambda x: x["score"], reverse=True)
+
     scores = {}
     for r in sorted_results:
         scores[r["label"]] = round(r["score"], 3)
+
     if "neutral" in scores and scores["neutral"] > neutral_threshold:
         scores.pop("neutral")
+
     return scores
 
 # ===============================
 # Split Sentences
 # ===============================
 def split_sentences(text, lang):
-    if not text:
-        return []
     if lang == "hi":
         sentences = re.split(r"।", text)
         return [s.strip() for s in sentences if s.strip()]
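
filter_neutral is unchanged in substance here; for reference, a worked example with made-up pipeline scores:

sample = [
    {"label": "neutral", "score": 0.81},
    {"label": "joy", "score": 0.12},
    {"label": "approval", "score": 0.07},
]
print(filter_neutral(sample))
# -> {'joy': 0.12, 'approval': 0.07}; neutral exceeds the 0.75 threshold and is dropped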
@@ -216,203 +206,82 @@ def get_pos_tags(sentence, lang):
 def normalize_scores(scores: dict):
     if not scores:
         return scores
+
     max_val = max(scores.values())
     if max_val == 0:
         return scores
+
     normalized = {}
     for k, v in scores.items():
         normalized[k] = round(v / max_val, 3)
+
     return normalized
 
 # ===============================
-# Enhanced Keyword extraction for SEO suggestions
-# ===============================
-COMMON_STOPWORDS = set([
-    "the","a","an","and","or","in","on","to","of","for","is","it","that","this","with","its","as",
-    "are","be","was","were","by","from","at","have","has","had","but","not","which","who","what",
-    "when","where","why","how","will","can","should","our","we","you","your","I","they","their","his","her"
-])
-
-def extract_keywords(text, top_n=6):
-    # simple heuristic: words length > 3, not stopwords, prefer words appearing earlier & freq > 1
-    if not text:
-        return []
-    tokens = re.findall(r"\w+", text.lower())
-    tokens = [t for t in tokens if t not in COMMON_STOPWORDS and len(t) > 3]
-    freq = Counter(tokens)
-    if not freq:
-        return []
-    # prefer words that appear in first 80 chars (title/head) or start of article
-    head_tokens = set(re.findall(r"\w+", text[:200].lower()))
-    scored = []
-    for word, count in freq.items():
-        score = count
-        if word in head_tokens:
-            score += 1.25
-        scored.append((word, score))
-    scored.sort(key=lambda x: x[1], reverse=True)
-    keywords = [w for w,_ in scored[:top_n]]
-    return keywords
-
-def shorten_headline_variants(headline):
-    # provide 1-3 short headline ideas (heuristic)
-    words = headline.split()
-    variants = []
-    # variant 1: take first 6-8 words
-    variants.append(" ".join(words[:8]) + ("..." if len(words) > 8 else ""))
-    # variant 2: main noun + core keyword (if any)
-    kws = extract_keywords(headline, top_n=3)
-    if kws:
-        variants.append(f"{kws[0].capitalize()}: {words[0]} {' '.join(words[1:4])}")
-    # unique variant fallback: remove stopwords from headline
-    variants.append(" ".join([w for w in words if w.lower() not in COMMON_STOPWORDS])[:70])
-    # dedupe & cleanup
-    clean = []
-    for v in variants:
-        v = v.strip()
-        if v and v not in clean:
-            clean.append(v)
-    return clean[:3]
-
-# ===============================
-# Compute SEO suggestions (single place; lightweight heuristics)
-# ===============================
-def compute_seo_suggestions_enhanced(headline, paragraphs, top_n_keywords=5):
-    """
-    Returns small dict:
-    {
-      "keywords": [...],
-      "keyword_density": {...},
-      "suggestions": [...],
-      "headline_suggestions": [...]
-    }
-    Show main SEO block only once (headline area).
-    """
-    text = (headline or "") + " " + " ".join(paragraphs or [])
-    tokens = re.findall(r"\w+", text.lower())
-    tokens = [t for t in tokens if t not in COMMON_STOPWORDS and len(t) > 3]
-    freq = Counter(tokens)
-    total = sum(freq.values()) or 1
-    # pick top keywords but filter random garbage by requiring either freq>1 or appearing in headline
-    top = []
-    for w, c in freq.most_common(30):
-        if c > 1 or (headline and w in headline.lower()):
-            top.append((w, c))
-        if len(top) >= top_n_keywords:
-            break
-    keywords = [k for k,_ in top]
-    keyword_density = {k: round(freq[k] / total, 4) for k in keywords}
-    suggestions = []
-    # headline length advice
-    if headline:
-        if len(headline) > 70:
-            suggestions.append("Headline is long (>70 chars). Consider shortening to 50–65 chars for better CTR.")
-        elif len(headline) < 30:
-            suggestions.append("Headline is short (<30 chars). Consider adding a descriptive keyword for clarity/SEO.")
-    # keyword placement
-    if keywords:
-        suggestions.append(f"Primary keywords to consider: {', '.join(keywords)}.")
-        # ensure at least one primary keyword in first 100 chars
-        head_sample = text[:100].lower()
-        if not any(k in head_sample for k in keywords[:2]):
-            suggestions.append("Consider including 1–2 primary keywords in the headline or first 100 words.")
-    # density advice (only sensible extremes)
-    for k, d in keyword_density.items():
-        if d < 0.003:
-            suggestions.append(f"Keyword '{k}' has low density ({d}). Consider using it once in the first 100 words.")
-        elif d > 0.06:
-            suggestions.append(f"Keyword '{k}' has high density ({d}). Review for possible keyword stuffing.")
-    # meta draft
-    body_tokens = [t for t in tokens]
-    meta = " ".join(body_tokens[:25])[:155].strip()
-    suggestions.append(f"Suggested meta (draft): {meta}...")
-    # headline ideas
-    headline_suggestions = shorten_headline_variants(headline) if headline else []
-    return {"keywords": keywords, "keyword_density": keyword_density, "suggestions": suggestions, "headline_suggestions": headline_suggestions}
-
-# ===============================
-# Improved Paragraph Cleaner
+# Clean Paragraphs (remove embeds/promos)
 # ===============================
 def clean_paragraphs(paragraphs):
-    """
-    - Merge bullets and numbered lists with previous paragraphs.
-    - Remove promotional or repetitive boilerplate.
-    - Detect and merge short fragments into previous paragraph.
-    """
     cleaned = []
-    prev = None
 
-    for raw_para in paragraphs:
-        if raw_para is None:
-            continue
-        text = raw_para.strip()
+    for para in paragraphs:
+        text = para.strip()
         if not text:
             continue
 
         upper_text = text.upper()
 
-        # skip known promo patterns
         if upper_text.startswith(("ALSO READ", "READ ALSO", "TRENDING", "MUST READ")):
             continue
-        if "और पढ़ें" in text or "यह भी पढ़ें" in text or "पूरा पढ़ें" in text:
-            continue
-
-        # skip obvious single-word labels like "PHOTO" or "VIDEO"
-        if len(text.split()) <= 2 and text.isupper():
-            continue
-
-        # if line looks like a bullet or numbered list or very short fragment,
-        # merge with previous paragraph instead of treating as its own paragraph
-        is_bullet = bool(re.match(r"^(\-|\•|\*|\d+[\.\)]\s)", text))
-        short_fragment = len(text.split()) < 6 and not text.endswith((".", "?", "!", ":"))
 
-        if (is_bullet or short_fragment) and prev is not None:
-            # merge into previous paragraph with a space
-            prev = prev.rstrip() + " " + text
-            cleaned[-1] = prev
+        if "और पढ़ें" in text or "यह भी पढ़ें" in text or "पूरा पढ़ें" in text:
             continue
 
-        # skip tiny key:value lines (promos)
-        if len(text.split()) < 5 and ":" in text and not text.endswith("?"):
+        if len(text.split()) < 5 and ":" in text:
             continue
 
-        # otherwise treat as a normal paragraph
         cleaned.append(text)
-        prev = text
 
     return cleaned
 
 # ===============================
-# Gemini Insight Generation (humanized prompts + gemini scoring + guardrails)
+# Gemini Insight Generation (patched with guardrails + snippet rewrites)
+# ===============================
+# ===============================
+# Gemini Insight Generation (patched with guardrails + snippet rewrites + Gemini emotions/sentiment)
+# ===============================
+# ===============================
+# Gemini Insight Generation (no Top 3 emotions, skip Gemini scoring if no rewrite)
+# ===============================
+# ===============================
+# Gemini Insight Generation (only Gemini sentiment + top 3 emotions)
+# ===============================
+# ===============================
+# Gemini Insight Generation (only Gemini sentiment + top 3 emotions, with context scoring)
 # ===============================
 def generate_insight(text, emotions, sentiment, level="Paragraph", emotion_pipeline=None, sentiment_pipeline=None):
-    """
-    - Calls Gemini to propose a *snippet* rewrite (word/phrase/sentence).
-    - Humanizes Gemini prompt and asks for Original → Rewrite → Why.
-    - Re-scores Gemini's rewrite using local pipelines (emotion + sentiment).
-    - Applies guardrails based on Gemini output (NOT original).
-    - Returns gemini_emotions (top-3 dict) and final_text string for display.
-    """
     try:
-        # Human-like Gemini prompt: ask for a specific snippet and human-sounding rewrite
+        # Always ask Gemini
         prompt = f"""
-You are a seasoned human editor. Use a natural, conversational tone — not robotic.
-
+You are a seasoned human editor with a natural, conversational tone — not robotic or formulaic.
 Text to review:
 {text}
-
 Task:
-- Identify the *specific phrase or sentence* that can be improved (only the smallest necessary span).
-- Output exactly in this format:
-Original → <the exact part to change>
-Rewrite → <a short, natural, human-sounding rewrite>
-Why → <one short sentence explaining the edit>
-
-If no rewrite is needed, output exactly:
+- Identify the *specific phrase or sentence* that can be improved for clarity, tone, or impact.
+- Present it as:
+Original → [the exact part]
+Rewrite → [a natural, human-sounding rewrite — avoid over-polishing or AI tone]
+Why → [briefly explain the edit as if giving human feedback — e.g., “This reads more fluidly” or “Helps it sound more direct.”]
+Guidelines:
+- Use everyday phrasing and mild imperfections that feel authentic.
+- Avoid mechanical transitions like “Overall,” “In summary,” or “This small change.”
+- Vary sentence rhythm and tone to mimic human writing.
+- Keep rewrites short and organic, not overly polished.
+- If the text is already fine, say exactly:
 No rewrite needed. The {level.lower()} reads naturally and clearly.
 """
+
+
         response_text = None
-        # Try Pro first, then Flash fallback
         for model_id, timeout in [
             ("publishers/google/models/gemini-2.5-pro", 40),
             ("publishers/google/models/gemini-2.5-flash", 25),
@@ -435,60 +304,46 @@ No rewrite needed. The {level.lower()} reads naturally and clearly.
         if not response_text:
             return {}, f"⚠️ No insight generated."
 
-        # If Gemini declines rewrite
+        # If Gemini says no rewrite → just show that (no extra scoring)
         if response_text.startswith("No rewrite needed"):
-            # return clear "no rewrite" phrasing so editorial doesn't get scary warnings
-            return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
+            return {}, f" {response_text}"
 
-        # Re-score Gemini output using a context (Original + Rewrite) so that emotion+sentiment reflect the suggested change
-        gemini_emotions = {}
-        gemini_sentiment = {}
+        # Otherwise, re-score Gemini rewrite using context (Original + Rewrite)
+        gemini_emotions, gemini_sentiment = {}, {}
         if emotion_pipeline is not None and sentiment_pipeline is not None:
             context_for_scoring = f"Original: {text}\nRewrite: {response_text}"
+
             emo_res_new = emotion_pipeline(context_for_scoring[:512])[0]
             gemini_emotions = filter_neutral(emo_res_new)
-            # keep top 3 emotions with scores
             sorted_emotions = sorted(gemini_emotions.items(), key=lambda x: x[1], reverse=True)
-            gemini_emotions = dict(sorted_emotions[:3])
+            gemini_emotions = dict(sorted_emotions[:3])  # keep top 3
 
             senti_res_new = sentiment_pipeline(context_for_scoring[:512])[0]
             gemini_sentiment = max(senti_res_new, key=lambda x: x["score"])
 
-        # Guardrails on GEMINI output:
-        # If Gemini's suggested rewrite itself is strongly negative, we skip (treat as no rewrite)
+        # Guardrails on Gemini output
         if gemini_sentiment["label"].upper() == "NEGATIVE" and gemini_sentiment["score"] >= 0.8:
-            return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
+            return {}, f"✅ No rewrite needed. The {level.lower()} is clear and well written."
 
         negative_emotions = ["disapproval", "anger", "sadness", "fear", "disgust", "annoyance", "grief", "remorse"]
         for emo, score in gemini_emotions.items():
             if emo.lower() in negative_emotions and score >= 0.8:
-                return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
+                return {}, f"✅ No rewrite needed. The {level.lower()} is clear and well written."
 
-        # If both approval and disapproval are high in the gemini re-score, skip as ambiguous
         if gemini_emotions.get("approval", 0) > 0.6 and gemini_emotions.get("disapproval", 0) > 0.6:
-            return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
+            return {}, f"✅ No rewrite needed. The {level.lower()} is clear and well written."
 
-        # Attach SEO suggestions (lightweight) if possible (but minimal)
-        seo_tips = []
-        try:
-            seo_data = compute_seo_suggestions_enhanced(text, [text])
-            seo_tips = seo_data.get("suggestions", [])[:2]
-        except Exception:
-            seo_tips = []
+        # Badge indicator
+        badge = "✍️"
 
-        # Format the final output: show the Gemini rewrite + its sentiment + top-3 emotions
+        # Format Gemini insight with rewrite emotions & sentiment
         gem_emo_text = ", ".join([f"{k}: {v}" for k, v in gemini_emotions.items()]) if gemini_emotions else "N/A"
         gem_sent_text = f"{gemini_sentiment.get('label','N/A')} ({round(gemini_sentiment.get('score',0),3)})" if gemini_sentiment else "N/A"
 
-        seo_text = ""
-        if seo_tips:
-            seo_text = "\n\n💡 SEO Suggestions:\n- " + "\n- ".join(seo_tips)
-
         final_text = (
-            f"✍️ {response_text}\n\n"
+            f"{badge} {response_text}\n\n"
             f"✨ Gemini Rewrite Sentiment: {gem_sent_text}\n"
             f"✨ Gemini Rewrite Top Emotions: {gem_emo_text}"
-            f"{seo_text}"
         )
         return gemini_emotions, final_text
 
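A small caveat on the guardrails above: gemini_sentiment starts as an empty dict, so the NEGATIVE check would raise a KeyError if the pipelines were ever passed as None; in this app both are always supplied by load_pipelines. The suppression thresholds themselves are easy to trace with fabricated scores:

# Fabricated re-scores, not real pipeline output.
gemini_sentiment = {"label": "NEGATIVE", "score": 0.91}
gemini_emotions = {"anger": 0.84, "disapproval": 0.41, "sadness": 0.12}

# The sentiment guardrail fires (NEGATIVE at >= 0.8) ...
assert gemini_sentiment["label"].upper() == "NEGATIVE" and gemini_sentiment["score"] >= 0.8
# ... and even without it, "anger" at 0.84 would trip the emotion guardrail,
# so the caller sees "✅ No rewrite needed." instead of the rewrite.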
@@ -519,18 +374,6 @@ def analyze_article(headline, paragraphs, lang, emotion_pipeline, sentiment_pipe
     st.write("Headline →", headline)
     st.write("Emotions →", headline_emotions)
     st.write("Sentiment →", headline_sentiment)
-
-    # Show SEO suggestions only once here
-    seo_data = compute_seo_suggestions_enhanced(headline, paragraphs)
-    if seo_data.get("suggestions"):
-        st.markdown("### 💡 SEO Suggestions (headline)")
-        for s in seo_data["suggestions"][:3]:
-            st.write("-", s)
-    if seo_data.get("headline_suggestions"):
-        st.markdown("### 📝 Headline ideas:")
-        for hs in seo_data["headline_suggestions"]:
-            st.write("-", hs)
-
     top3_headline, headline_insight = generate_insight(
         headline, headline_emotions, headline_sentiment, "Headline",
         emotion_pipeline=emotion_pipeline, sentiment_pipeline=sentiment_pipeline
@@ -543,7 +386,7 @@
     })
 
     # -----------------------
-    # Overall Article Analysis (compute weighted emotions across cleaned paragraphs)
+    # Overall Article Analysis
    # -----------------------
     if paragraphs:
         for p in paragraphs:
@@ -578,17 +421,9 @@ def analyze_article(headline, paragraphs, lang, emotion_pipeline, sentiment_pipe
         })
 
     # -----------------------
-    # Paragraph Analysis (detect sub-headings and avoid SEO spam)
+    # Paragraph Analysis
     # -----------------------
     for p_idx, para in enumerate(paragraphs, start=1):
-        # subheading heuristics
-        is_subheading = (
-            para.strip().endswith("?")
-            or len(para.split()) <= 8
-            or bool(re.match(r"^\d+[\.\)]", para.strip()))
-            or (sum(1 for w in para.split() if w.isupper()) >= 2 and len(para.split()) <= 10)
-        )
-
         para_counter, para_sentiments = Counter(), []
         sentences = split_sentences(para, lang[:2])
         for sentence in sentences:
@@ -603,29 +438,17 @@ def analyze_article(headline, paragraphs, lang, emotion_pipeline, sentiment_pipe
         sorted_para = sorted(para_emotions.items(), key=lambda x: x[1], reverse=True)
         para_emotions = dict(sorted_para[:10])
         para_sentiment = max(para_sentiments, key=lambda x: x["score"]) if para_sentiments else {}
-
-        st.subheader(f"{'🧩 Sub-heading' if is_subheading else '📑 Paragraph'} {p_idx}")
+        st.subheader(f"📑 Paragraph {p_idx}")
         st.write(para)
         st.write("Emotions →", para_emotions)
         st.write("Sentiment →", para_sentiment)
-
-        # Show limited SEO only if NOT a sub-heading and only one focused tip
-        if not is_subheading:
-            try:
-                seo_data = compute_seo_suggestions_enhanced("", [para], top_n_keywords=3)
-                uniq_suggestion = seo_data["suggestions"][0] if seo_data.get("suggestions") else None
-                if uniq_suggestion:
-                    st.markdown(f"💡 SEO Tip: {uniq_suggestion}")
-            except Exception:
-                pass
-
         top3_para, insight = generate_insight(
-            para, para_emotions, para_sentiment, "Sub-heading" if is_subheading else "Paragraph",
+            para, para_emotions, para_sentiment, "Paragraph",
             emotion_pipeline=emotion_pipeline, sentiment_pipeline=sentiment_pipeline
         )
         st.write(insight)
         export_rows.append({
-            "Type": "Sub-heading" if is_subheading else "Paragraph","Text": para,
+            "Type": "Paragraph","Text": para,
             "Emotions": para_emotions,"Sentiment": para_sentiment,
             "Top3": dict(top3_para),"Insight": insight
         })
@@ -644,7 +467,6 @@ text_input = st.text_area("Or paste text here")
 if st.button("🔍 Analyze"):
     with st.spinner("Running analysis... ⏳"):
         if uploaded_file:
-            # streamlit FileUploader returns a BytesIO-like object; docx.Document accepts file-like
             headline, paragraphs = read_and_split_articles(uploaded_file)
         elif url_input.strip():
             headline, paragraphs = read_article_from_url(url_input)
@@ -655,7 +477,6 @@ if st.button("🔍 Analyze"):
         else:
             st.warning("Please provide text input.")
             st.stop()
-
         detected_lang = detect((headline + " " + " ".join(paragraphs))[:200]) if (headline or paragraphs) else "en"
         emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
         export_rows = analyze_article(headline, paragraphs, detected_lang, emotion_pipeline, sentiment_pipeline)
@@ -666,3 +487,4 @@
         excel_buffer = io.BytesIO()
         df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
         st.download_button("⬇️ Download Excel", excel_buffer, "analysis_results.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", use_container_width=True)
+