Kartik2204 committed on
Commit 0b2c8c6 · verified · 1 Parent(s): 90232bf

Update app.py

Files changed (1)
  1. app.py +178 -315
app.py CHANGED
@@ -5,7 +5,7 @@ import stanza
5
  import pandas as pd
6
  import re
7
  import docx
8
- from collections import Counter, defaultdict
9
  from transformers import pipeline
10
  import torch
11
  from langdetect import detect
@@ -13,22 +13,8 @@ import streamlit as st
13
  import io
14
  from newspaper import Article
15
  import concurrent.futures
16
- import urllib.parse
17
- import requests
18
-
19
- # optional TF-IDF for better SEO keyword extraction
20
- try:
21
- from sklearn.feature_extraction.text import TfidfVectorizer
22
- SKLEARN_AVAILABLE = True
23
- except Exception:
24
- SKLEARN_AVAILABLE = False
25
-
26
- # optional: pytrends (for Google Trends signals). Best-effort.
27
- try:
28
- from pytrends.request import TrendReq
29
- PYTRENDS_AVAILABLE = True
30
- except Exception:
31
- PYTRENDS_AVAILABLE = False
32
 
33
  # ===============================
34
  # 🔑 Vertex AI Setup
@@ -36,9 +22,6 @@ except Exception:
36
  import vertexai
37
  from vertexai.preview.generative_models import GenerativeModel
38
 
39
- import json
40
- import tempfile
41
-
42
  # Ensure GCP credentials exist
43
  if "GCP_SERVICE_ACCOUNT_JSON" not in os.environ:
44
  raise RuntimeError("❌ GCP_SERVICE_ACCOUNT_JSON secret not found in Hugging Face Space")
@@ -146,12 +129,9 @@ def read_and_split_articles(file_path):
146
  if not text:
147
  continue
148
 
149
- # If this paragraph contains bold run(s), keep as separate paragraph/subhead
150
- try:
151
- is_bold = any([r.bold for r in para.runs]) if para.runs else False
152
- except Exception:
153
- is_bold = False
154
- paragraphs.append(text)
155
 
156
  headline = paragraphs[0] if paragraphs else ""
157
  body_paragraphs = paragraphs[1:] if len(paragraphs) > 1 else []
@@ -159,54 +139,23 @@ def read_and_split_articles(file_path):
159
  return headline, body_paragraphs
160
 
161
  # ===============================
162
- # URL Reader (robust)
163
  # ===============================
164
  def read_article_from_url(url):
165
- """
166
- Robust article fetcher:
167
- - sanitizes/unquotes the URL
168
- - tries a direct requests.get() with a browser UA and feeds HTML to newspaper.Article
169
- - falls back to Article.download() only if necessary
170
- - returns (headline, body_paragraphs) or ("", []) on failure
171
- """
172
  if not url or not isinstance(url, str):
173
  return "", []
174
-
175
- # sanitize and normalize url
176
  url = url.strip()
177
- url = urllib.parse.unquote(url)
178
- url = url.rstrip(" \t\n\r")
179
- url = url.replace(" ", "")
180
-
181
- headers = {
182
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
183
- "(KHTML, like Gecko) Chrome/115.0 Safari/537.36"
184
- }
185
-
186
  try:
187
- # try direct fetch first (some sites block Article.download())
188
- resp = requests.get(url, headers=headers, timeout=10)
189
- if resp.status_code == 200 and resp.text.strip():
190
- art = Article(url)
191
- # feed html to newspaper if possible
192
- try:
193
- art.set_html(resp.text)
194
- art.parse()
195
- except Exception:
196
- # fallback to normal download
197
- art.download()
198
- art.parse()
199
- else:
200
- # fallback to library download (may raise)
201
- art = Article(url)
202
- art.download()
203
- art.parse()
204
  except Exception as e:
205
- st.warning(f"⚠️ Could not download/parse article ({e}). Returning empty result.")
206
  return "", []
207
-
208
- headline = art.title.strip() if art.title else ""
209
- text_body = art.text.strip() if art.text else ""
210
  body_paragraphs = [p.strip() for p in text_body.split("\n") if p.strip()]
211
  return headline, body_paragraphs
212
 
@@ -215,20 +164,19 @@ def read_article_from_url(url):
215
  # ===============================
216
  def filter_neutral(emotion_results, neutral_threshold=0.75):
217
  sorted_results = sorted(emotion_results, key=lambda x: x["score"], reverse=True)
218
-
219
  scores = {}
220
  for r in sorted_results:
221
  scores[r["label"]] = round(r["score"], 3)
222
-
223
  if "neutral" in scores and scores["neutral"] > neutral_threshold:
224
  scores.pop("neutral")
225
-
226
  return scores
227
 
228
  # ===============================
229
  # Split Sentences
230
  # ===============================
231
  def split_sentences(text, lang):
 
 
232
  if lang == "hi":
233
  sentences = re.split(r"।", text)
234
  return [s.strip() for s in sentences if s.strip()]
@@ -268,21 +216,129 @@ def get_pos_tags(sentence, lang):
268
  def normalize_scores(scores: dict):
269
  if not scores:
270
  return scores
271
-
272
  max_val = max(scores.values())
273
  if max_val == 0:
274
  return scores
275
-
276
  normalized = {}
277
  for k, v in scores.items():
278
  normalized[k] = round(v / max_val, 3)
279
-
280
  return normalized
281
 
282
  # ===============================
283
- # Stronger paragraph cleaning (merge bullets/numbers & skip promos)
284
  # ===============================
285
  def clean_paragraphs(paragraphs):
286
  cleaned = []
287
  prev = None
288
 
@@ -326,213 +382,6 @@ def clean_paragraphs(paragraphs):
326
 
327
  return cleaned
328
 
329
- # ===============================
330
- # SEO / Keyword extraction helpers (TF-IDF + spaCy NER/POS)
331
- # ===============================
332
- SEO_STOPWORDS = set([
333
- "the","a","an","and","or","in","on","to","of","for","is","it","that","this","with","its","as",
334
- "are","was","were","be","by","from","at","has","have","but","not","will","can","which","also",
335
- "we","you","i","they","their","our","us"
336
- ])
337
-
338
- def extract_candidate_keywords(headline, paragraphs, top_k=10, ngram_range=(1,2)):
339
- """
340
- Return a ranked list of candidate keywords using TF-IDF + POS/NER filtering.
341
- Prioritizes named entities and multi-word noun phrases.
342
- """
343
- text = " ".join([headline] + paragraphs)
344
- if not text.strip():
345
- return []
346
-
347
- if not SKLEARN_AVAILABLE:
348
- # fallback: naive frequency filtered by spaCy nouns/ents
349
- try:
350
- doc_sp = nlp_en(text)
351
- except Exception:
352
- tokens = re.findall(r"\w+", text.lower())
353
- tokens = [t for t in tokens if t not in SEO_STOPWORDS and len(t) > 2]
354
- freq = Counter(tokens)
355
- return [(k, v) for k, v in freq.most_common(top_k)]
356
-
357
- ne_set = set(ent.text.strip() for ent in doc_sp.ents if len(ent.text.strip()) > 2)
358
- candidate_terms = defaultdict(float)
359
- for ne in ne_set:
360
- candidate_terms[ne] += 3.0
361
- for chunk in doc_sp.noun_chunks:
362
- txt = chunk.text.strip()
363
- if len(txt) > 2:
364
- candidate_terms[txt] += 0.5
365
- for tok in doc_sp:
366
- if tok.pos_ in ("NOUN", "PROPN") and not tok.is_stop and tok.is_alpha:
367
- candidate_terms[tok.text] += 0.1
368
- cand_list = sorted(candidate_terms.items(), key=lambda x: x[1], reverse=True)[:top_k]
369
- return cand_list
370
-
371
- # Use TF-IDF
372
- vectorizer = TfidfVectorizer(
373
- ngram_range=ngram_range,
374
- stop_words='english',
375
- max_df=0.9,
376
- min_df=1,
377
- token_pattern=r"(?u)\b\w\w+\b"
378
- )
379
- try:
380
- X = vectorizer.fit_transform([text])
381
- feature_names = vectorizer.get_feature_names_out()
382
- scores = X.toarray().sum(axis=0)
383
- terms_scores = {feature_names[i]: float(scores[i]) for i in range(len(feature_names))}
384
- except Exception:
385
- return []
386
-
387
- # spaCy analysis
388
- try:
389
- doc_sp = nlp_en(text)
390
- except Exception:
391
- # fallback to top tfidf raw
392
- cleaned = [(t, s) for t, s in terms_scores.items() if t.lower() not in SEO_STOPWORDS and len(t) > 2]
393
- cleaned = sorted(cleaned, key=lambda x: x[1], reverse=True)[:top_k]
394
- return cleaned
395
-
396
- candidate_terms = defaultdict(float)
397
-
398
- # prioritize named entities
399
- for ent in doc_sp.ents:
400
- ent_text = ent.text.strip()
401
- if len(ent_text) > 2:
402
- key = ent_text
403
- # try to find matching tfidf key(s)
404
- tf_score = 0.0
405
- for tfk, sc in terms_scores.items():
406
- if ent_text.lower() in tfk:
407
- tf_score += sc
408
- candidate_terms[key] += tf_score + 2.0
409
-
410
- # noun chunks and noun/proper tokens
411
- for chunk in doc_sp.noun_chunks:
412
- chunk_text = chunk.text.strip()
413
- if len(chunk_text) > 2 and not all(w.lower() in SEO_STOPWORDS for w in chunk_text.split()):
414
- candidate_terms[chunk_text] += terms_scores.get(chunk_text.lower(), 0.0) + 0.2
415
-
416
- for token in doc_sp:
417
- if token.pos_ in ("NOUN", "PROPN") and not token.is_stop and token.is_alpha and len(token.text) > 2:
418
- t = token.text.strip()
419
- candidate_terms[t] += terms_scores.get(t.lower(), 0.0) + 0.05
420
-
421
- if not candidate_terms:
422
- cleaned = [(t, s) for t, s in terms_scores.items() if t.lower() not in SEO_STOPWORDS and len(t) > 2]
423
- cleaned = sorted(cleaned, key=lambda x: x[1], reverse=True)[:top_k]
424
- return cleaned
425
-
426
- cand_list = sorted(candidate_terms.items(), key=lambda x: x[1], reverse=True)
427
- # dedupe case-insensitive and prune
428
- seen = set()
429
- final = []
430
- for term, score in cand_list:
431
- low = term.lower()
432
- if low in seen:
433
- continue
434
- if len(low) < 3:
435
- continue
436
- if all(tok in SEO_STOPWORDS for tok in low.split()):
437
- continue
438
- final.append((term, round(score, 4)))
439
- seen.add(low)
440
- if len(final) >= top_k:
441
- break
442
-
443
- return final
444
-
445
- def query_google_trends(keyword):
446
- """
447
- Best-effort Google Trends interest check for a keyword.
448
- Returns a float score (0-100) or None if not available.
449
- Requires pytrends and network.
450
- """
451
- if not PYTRENDS_AVAILABLE:
452
- return None
453
- try:
454
- py = TrendReq(hl='en-US', tz=360)
455
- kw_list = [keyword]
456
- py.build_payload(kw_list, timeframe='today 12-m')
457
- data = py.interest_over_time()
458
- if data.empty:
459
- return None
460
- mean_interest = float(data[keyword].mean())
461
- return mean_interest
462
- except Exception:
463
- return None
464
-
465
- def compute_seo_suggestions_enhanced(headline, paragraphs, top_k=6):
466
- """
467
- Returns:
468
- {
469
- keywords: [{'term':str, 'score':float, 'trend':float|None}, ...],
470
- headline_suggestions: [<string>, ...],
471
- suggestions: [<human-friendly advice>...]
472
- }
473
- """
474
- body_text = " ".join(paragraphs)
475
- candidates = extract_candidate_keywords(headline, paragraphs, top_k=20)
476
-
477
- keywords = []
478
- for term, score in candidates:
479
- trend = None
480
- if PYTRENDS_AVAILABLE:
481
- try:
482
- trend = query_google_trends(term if isinstance(term, str) else term[0])
483
- except Exception:
484
- trend = None
485
- keywords.append({"term": term, "score": score, "trend": trend})
486
-
487
- suggestions = []
488
- # headline length advice
489
- hlen = len(headline)
490
- if hlen > 70:
491
- suggestions.append("Headline is long (>70 chars). Consider shortening to 50–65 characters for better CTR.")
492
- elif hlen < 30:
493
- suggestions.append("Headline is short (<30 chars). Consider adding a descriptive primary keyword for clarity.")
494
-
495
- prioritized = sorted(keywords, key=lambda x: (x["trend"] if x["trend"] is not None else -1, x["score"]), reverse=True)
496
- top_terms = [k["term"] for k in prioritized[:3]]
497
-
498
- if top_terms:
499
- suggestions.append(f"Primary keywords to consider: {', '.join(top_terms)}. Try to include 1 in the headline and 1–2 in the first 100 words.")
500
-
501
- # lightweight density checks
502
- body_tokens = re.findall(r"\w+", body_text.lower())
503
- total = max(1, len(body_tokens))
504
- for k in top_terms:
505
- freq = body_tokens.count(k.lower())
506
- dens = freq / total
507
- if dens < 0.002:
508
- suggestions.append(f"Keyword '{k}' appears {freq} times — consider using it once near the top of the article to improve relevance.")
509
- elif dens > 0.06:
510
- suggestions.append(f"Keyword '{k}' density is high ({round(dens,3)}). Watch for keyword-stuffing.")
511
-
512
- # headline rewrite suggestions (2 variations)
513
- headline_suggestions = []
514
- if top_terms:
515
- primary = top_terms[0]
516
- if primary.lower() not in headline.lower():
517
- if ":" in headline:
518
- h1 = headline.replace(":", f": {primary} —", 1)
519
- else:
520
- h1 = f"{headline} — {primary}"
521
- if len(h1) <= 90:
522
- headline_suggestions.append(h1)
523
- else:
524
- short = f"{primary}: {headline}"
525
- headline_suggestions.append(short[:90])
526
- if top_terms and not any(w in headline.lower() for w in ["how", "why", "what", "when", "where", "which", "?"]):
527
- headline_suggestions.append(f"How {str(top_terms[0]).title()} Shapes the Story — {headline[:50]}")
528
-
529
- result = {
530
- "keywords": keywords[:top_k],
531
- "headline_suggestions": headline_suggestions,
532
- "suggestions": suggestions
533
- }
534
- return result
535
-
536
  # ===============================
537
  # Gemini Insight Generation (humanized prompts + gemini scoring + guardrails)
538
  # ===============================
@@ -542,7 +391,6 @@ def generate_insight(text, emotions, sentiment, level="Paragraph", emotion_pipel
542
  - Humanizes Gemini prompt and asks for Original → Rewrite → Why.
543
  - Re-scores Gemini's rewrite using local pipelines (emotion + sentiment).
544
  - Applies guardrails based on Gemini output (NOT original).
545
- - Attaches lightweight SEO suggestions where helpful.
546
  - Returns gemini_emotions (top-3 dict) and final_text string for display.
547
  """
548
  try:
@@ -589,15 +437,17 @@ No rewrite needed. The {level.lower()} reads naturally and clearly.
589
 
590
  # If Gemini declines rewrite
591
  if response_text.startswith("No rewrite needed"):
 
592
  return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
593
 
594
- # Re-score Gemini output using a context (Original + Rewrite)
595
  gemini_emotions = {}
596
  gemini_sentiment = {}
597
  if emotion_pipeline is not None and sentiment_pipeline is not None:
598
  context_for_scoring = f"Original: {text}\nRewrite: {response_text}"
599
  emo_res_new = emotion_pipeline(context_for_scoring[:512])[0]
600
  gemini_emotions = filter_neutral(emo_res_new)
 
601
  sorted_emotions = sorted(gemini_emotions.items(), key=lambda x: x[1], reverse=True)
602
  gemini_emotions = dict(sorted_emotions[:3])
603
 
@@ -605,6 +455,7 @@ No rewrite needed. The {level.lower()} reads naturally and clearly.
605
  gemini_sentiment = max(senti_res_new, key=lambda x: x["score"])
606
 
607
  # Guardrails on GEMINI output:
 
608
  if gemini_sentiment["label"].upper() == "NEGATIVE" and gemini_sentiment["score"] >= 0.8:
609
  return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
610
 
@@ -613,21 +464,26 @@ No rewrite needed. The {level.lower()} reads naturally and clearly.
613
  if emo.lower() in negative_emotions and score >= 0.8:
614
  return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
615
 
 
616
  if gemini_emotions.get("approval", 0) > 0.6 and gemini_emotions.get("disapproval", 0) > 0.6:
617
  return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
618
 
619
- # Attach SEO suggestions (lightweight) if possible for headline/paragraphs
620
- seo_text = ""
621
  try:
622
- seo_data = compute_seo_suggestions_enhanced(text if level != "Headline" else text, [text])
623
- if seo_data and seo_data.get("suggestions"):
624
- seo_text = "\n\n💡 SEO Suggestions:\n- " + "\n- ".join(seo_data.get("suggestions", [])[:4])
625
  except Exception:
626
- seo_text = ""
627
 
 
628
  gem_emo_text = ", ".join([f"{k}: {v}" for k, v in gemini_emotions.items()]) if gemini_emotions else "N/A"
629
  gem_sent_text = f"{gemini_sentiment.get('label','N/A')} ({round(gemini_sentiment.get('score',0),3)})" if gemini_sentiment else "N/A"
630
 
631
  final_text = (
632
  f"✍️ {response_text}\n\n"
633
  f"✨ Gemini Rewrite Sentiment: {gem_sent_text}\n"
@@ -664,19 +520,16 @@ def analyze_article(headline, paragraphs, lang, emotion_pipeline, sentiment_pipe
664
  st.write("Emotions →", headline_emotions)
665
  st.write("Sentiment →", headline_sentiment)
666
 
667
- # enhanced SEO for headline
668
- try:
669
- seo_head = compute_seo_suggestions_enhanced(headline, paragraphs)
670
- if seo_head.get("suggestions"):
671
- st.write("💡 SEO Suggestions (headline):")
672
- for s in seo_head["suggestions"]:
673
- st.write("-", s)
674
- if seo_head.get("headline_suggestions"):
675
- st.write("📝 Headline ideas:")
676
- for hs in seo_head["headline_suggestions"]:
677
- st.write("-", hs)
678
- except Exception:
679
- pass
680
 
681
  top3_headline, headline_insight = generate_insight(
682
  headline, headline_emotions, headline_sentiment, "Headline",
@@ -690,7 +543,7 @@ def analyze_article(headline, paragraphs, lang, emotion_pipeline, sentiment_pipe
690
  })
691
 
692
  # -----------------------
693
- # Overall Article Analysis
694
  # -----------------------
695
  if paragraphs:
696
  for p in paragraphs:
@@ -725,9 +578,17 @@ def analyze_article(headline, paragraphs, lang, emotion_pipeline, sentiment_pipe
725
  })
726
 
727
  # -----------------------
728
- # Paragraph Analysis
729
  # -----------------------
730
  for p_idx, para in enumerate(paragraphs, start=1):
 
  para_counter, para_sentiments = Counter(), []
732
  sentences = split_sentences(para, lang[:2])
733
  for sentence in sentences:
@@ -742,28 +603,29 @@ def analyze_article(headline, paragraphs, lang, emotion_pipeline, sentiment_pipe
742
  sorted_para = sorted(para_emotions.items(), key=lambda x: x[1], reverse=True)
743
  para_emotions = dict(sorted_para[:10])
744
  para_sentiment = max(para_sentiments, key=lambda x: x["score"]) if para_sentiments else {}
745
- st.subheader(f"📑 Paragraph {p_idx}")
 
746
  st.write(para)
747
  st.write("Emotions →", para_emotions)
748
  st.write("Sentiment →", para_sentiment)
749
 
750
- # paragraph-level SEO: show lightweight suggestions
751
- try:
752
- seo_para = compute_seo_suggestions_enhanced(para, [para])
753
- if seo_para.get("suggestions"):
754
- st.write("💡 SEO (paragraph):")
755
- for s in seo_para["suggestions"]:
756
- st.write("-", s)
757
- except Exception:
758
- pass
759
 
760
  top3_para, insight = generate_insight(
761
- para, para_emotions, para_sentiment, "Paragraph",
762
  emotion_pipeline=emotion_pipeline, sentiment_pipeline=sentiment_pipeline
763
  )
764
  st.write(insight)
765
  export_rows.append({
766
- "Type": "Paragraph","Text": para,
767
  "Emotions": para_emotions,"Sentiment": para_sentiment,
768
  "Top3": dict(top3_para),"Insight": insight
769
  })
@@ -782,6 +644,7 @@ text_input = st.text_area("Or paste text here")
782
  if st.button("🔍 Analyze"):
783
  with st.spinner("Running analysis... ⏳"):
784
  if uploaded_file:
 
785
  headline, paragraphs = read_and_split_articles(uploaded_file)
786
  elif url_input.strip():
787
  headline, paragraphs = read_article_from_url(url_input)
 
5
  import pandas as pd
6
  import re
7
  import docx
8
+ from collections import Counter
9
  from transformers import pipeline
10
  import torch
11
  from langdetect import detect
 
13
  import io
14
  from newspaper import Article
15
  import concurrent.futures
16
+ import json
17
+ import tempfile
18
 
19
  # ===============================
20
  # 🔑 Vertex AI Setup
 
22
  import vertexai
23
  from vertexai.preview.generative_models import GenerativeModel
24
 
25
  # Ensure GCP credentials exist
26
  if "GCP_SERVICE_ACCOUNT_JSON" not in os.environ:
27
  raise RuntimeError("❌ GCP_SERVICE_ACCOUNT_JSON secret not found in Hugging Face Space")
 
129
  if not text:
130
  continue
131
 
132
+ # Paragraphs with bold run(s) are kept as their own entries (likely subheads)
133
+ is_bold = any(r.bold for r in para.runs) if para.runs else False
134
+ paragraphs.append(text)
135
 
136
  headline = paragraphs[0] if paragraphs else ""
137
  body_paragraphs = paragraphs[1:] if len(paragraphs) > 1 else []
 
139
  return headline, body_paragraphs
140
 
141
  # ===============================
142
+ # Robust URL Reader
143
  # ===============================
144
  def read_article_from_url(url):
145
+ # guard against stray leading/trailing whitespace in the URL
146
  if not url or not isinstance(url, str):
147
  return "", []
 
 
148
  url = url.strip()
149
+ # try to download & parse
150
  try:
151
+ article = Article(url)
152
+ article.download()
153
+ article.parse()
154
  except Exception as e:
155
+ st.warning(f"⚠️ Could not download article: {e}")
156
  return "", []
157
+ headline = article.title.strip() if article.title else ""
158
+ text_body = article.text.strip() if article.text else ""
 
159
  body_paragraphs = [p.strip() for p in text_body.split("\n") if p.strip()]
160
  return headline, body_paragraphs
161
 
 
164
  # ===============================
165
  def filter_neutral(emotion_results, neutral_threshold=0.75):
166
  sorted_results = sorted(emotion_results, key=lambda x: x["score"], reverse=True)
 
167
  scores = {}
168
  for r in sorted_results:
169
  scores[r["label"]] = round(r["score"], 3)
 
170
  if "neutral" in scores and scores["neutral"] > neutral_threshold:
171
  scores.pop("neutral")
 
172
  return scores
173
 
174
  # ===============================
175
  # Split Sentences
176
  # ===============================
177
  def split_sentences(text, lang):
178
+ if not text:
179
+ return []
180
  if lang == "hi":
181
  sentences = re.split(r"।", text)
182
  return [s.strip() for s in sentences if s.strip()]
 
216
  def normalize_scores(scores: dict):
217
  if not scores:
218
  return scores
 
219
  max_val = max(scores.values())
220
  if max_val == 0:
221
  return scores
 
222
  normalized = {}
223
  for k, v in scores.items():
224
  normalized[k] = round(v / max_val, 3)
 
225
  return normalized
226
 
227
  # ===============================
228
+ # Enhanced Keyword extraction for SEO suggestions
229
+ # ===============================
230
+ COMMON_STOPWORDS = set([
231
+ "the","a","an","and","or","in","on","to","of","for","is","it","that","this","with","its","as",
232
+ "are","be","was","were","by","from","at","have","has","had","but","not","which","who","what",
233
+ "when","where","why","how","will","can","should","our","we","you","your","i","they","their","his","her"
234
+ ])
235
+
236
+ def extract_keywords(text, top_n=6):
237
+ # simple heuristic: keep words longer than 3 chars that are not stopwords; favour words that appear early or more than once
238
+ if not text:
239
+ return []
240
+ tokens = re.findall(r"\w+", text.lower())
241
+ tokens = [t for t in tokens if t not in COMMON_STOPWORDS and len(t) > 3]
242
+ freq = Counter(tokens)
243
+ if not freq:
244
+ return []
245
+ # prefer words that also appear in the first 200 chars (roughly the title/lead)
246
+ head_tokens = set(re.findall(r"\w+", text[:200].lower()))
247
+ scored = []
248
+ for word, count in freq.items():
249
+ score = count
250
+ if word in head_tokens:
251
+ score += 1.25
252
+ scored.append((word, score))
253
+ scored.sort(key=lambda x: x[1], reverse=True)
254
+ keywords = [w for w,_ in scored[:top_n]]
255
+ return keywords
256
+
257
+ def shorten_headline_variants(headline):
258
+ # provide 1-3 short headline ideas (heuristic)
259
+ words = headline.split()
260
+ variants = []
261
+ # variant 1: first 8 words (ellipsis if the headline is longer)
262
+ variants.append(" ".join(words[:8]) + ("..." if len(words) > 8 else ""))
263
+ # variant 2: main noun + core keyword (if any)
264
+ kws = extract_keywords(headline, top_n=3)
265
+ if kws:
266
+ variants.append(f"{kws[0].capitalize()}: {words[0]} {' '.join(words[1:4])}")
267
+ # unique variant fallback: remove stopwords from headline
268
+ variants.append(" ".join([w for w in words if w.lower() not in COMMON_STOPWORDS])[:70])
269
+ # dedupe & cleanup
270
+ clean = []
271
+ for v in variants:
272
+ v = v.strip()
273
+ if v and v not in clean:
274
+ clean.append(v)
275
+ return clean[:3]
276
+
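For orientation, a minimal illustrative sketch of how these two heuristics might be exercised on their own; it is not part of the commit, and the import path and sample headline are assumptions:

    # hypothetical usage sketch, assuming the helpers above live in app.py
    from app import extract_keywords, shorten_headline_variants

    sample = "Government announces new monsoon flood relief package for coastal districts"
    print(extract_keywords(sample, top_n=3))   # highest-scoring non-stopword tokens; early words get a bonus
    print(shorten_headline_variants(sample))   # up to 3 variants: 8-word cut, keyword-led form, stopword-stripped cut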
277
+ # ===============================
278
+ # Compute SEO suggestions (single place; lightweight heuristics)
279
+ # ===============================
280
+ def compute_seo_suggestions_enhanced(headline, paragraphs, top_n_keywords=5):
281
+ """
282
+ Returns small dict:
283
+ {
284
+ "keywords": [...],
285
+ "keyword_density": {...},
286
+ "suggestions": [...],
287
+ "headline_suggestions": [...]
288
+ }
289
+ Show main SEO block only once (headline area).
290
+ """
291
+ text = (headline or "") + " " + " ".join(paragraphs or [])
292
+ tokens = re.findall(r"\w+", text.lower())
293
+ tokens = [t for t in tokens if t not in COMMON_STOPWORDS and len(t) > 3]
294
+ freq = Counter(tokens)
295
+ total = sum(freq.values()) or 1
296
+ # pick top keywords but filter random garbage by requiring either freq>1 or appearing in headline
297
+ top = []
298
+ for w, c in freq.most_common(30):
299
+ if c > 1 or (headline and w in headline.lower()):
300
+ top.append((w, c))
301
+ if len(top) >= top_n_keywords:
302
+ break
303
+ keywords = [k for k,_ in top]
304
+ keyword_density = {k: round(freq[k] / total, 4) for k in keywords}
305
+ suggestions = []
306
+ # headline length advice
307
+ if headline:
308
+ if len(headline) > 70:
309
+ suggestions.append("Headline is long (>70 chars). Consider shortening to 50–65 chars for better CTR.")
310
+ elif len(headline) < 30:
311
+ suggestions.append("Headline is short (<30 chars). Consider adding a descriptive keyword for clarity/SEO.")
312
+ # keyword placement
313
+ if keywords:
314
+ suggestions.append(f"Primary keywords to consider: {', '.join(keywords)}.")
315
+ # ensure at least one primary keyword in first 100 chars
316
+ head_sample = text[:100].lower()
317
+ if not any(k in head_sample for k in keywords[:2]):
318
+ suggestions.append("Consider including 1–2 primary keywords in the headline or first 100 words.")
319
+ # density advice (only sensible extremes)
320
+ for k, d in keyword_density.items():
321
+ if d < 0.003:
322
+ suggestions.append(f"Keyword '{k}' has low density ({d}). Consider using it once in the first 100 words.")
323
+ elif d > 0.06:
324
+ suggestions.append(f"Keyword '{k}' has high density ({d}). Review for possible keyword stuffing.")
325
+ # meta draft
326
+ body_tokens = [t for t in tokens]
327
+ meta = " ".join(body_tokens[:25])[:155].strip()
328
+ suggestions.append(f"Suggested meta (draft): {meta}...")
329
+ # headline ideas
330
+ headline_suggestions = shorten_headline_variants(headline) if headline else []
331
+ return {"keywords": keywords, "keyword_density": keyword_density, "suggestions": suggestions, "headline_suggestions": headline_suggestions}
332
+
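Similarly, a small sketch (not from the diff; the inputs are invented) of the dict this helper returns; the suggestions and headline_suggestions keys are what the calling code later in this diff displays:

    # hypothetical usage sketch for the committed helper
    seo = compute_seo_suggestions_enhanced(
        "Budget 2025: what the new tax slabs mean for salaried workers",
        ["The finance minister presented the budget on Saturday ...",
         "Analysts expect the new slabs to benefit middle-income earners ..."],
        top_n_keywords=3,
    )
    seo["keywords"]              # up to 3 terms with freq > 1 or present in the headline
    seo["keyword_density"]       # term -> share of filtered tokens, rounded to 4 places
    seo["suggestions"]           # headline-length advice, keyword placement, density flags, a meta draft
    seo["headline_suggestions"]  # up to 3 heuristic rewrites from shorten_headline_variants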
333
+ # ===============================
334
+ # Improved Paragraph Cleaner
335
  # ===============================
336
  def clean_paragraphs(paragraphs):
337
+ """
338
+ - Merge bullets and numbered lists with previous paragraphs.
339
+ - Remove promotional or repetitive boilerplate.
340
+ - Detect and merge short fragments into previous paragraph.
341
+ """
342
  cleaned = []
343
  prev = None
344
 
 
382
 
383
  return cleaned
384
 
385
  # ===============================
386
  # Gemini Insight Generation (humanized prompts + gemini scoring + guardrails)
387
  # ===============================
 
391
  - Humanizes Gemini prompt and asks for Original → Rewrite → Why.
392
  - Re-scores Gemini's rewrite using local pipelines (emotion + sentiment).
393
  - Applies guardrails based on Gemini output (NOT original).
 
394
  - Returns gemini_emotions (top-3 dict) and final_text string for display.
395
  """
396
  try:
 
437
 
438
  # If Gemini declines rewrite
439
  if response_text.startswith("No rewrite needed"):
440
+ # return a clear "no rewrite" message so editors don't see alarming warnings
441
  return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
442
 
443
+ # Re-score Gemini output using a context (Original + Rewrite) so that emotion+sentiment reflect the suggested change
444
  gemini_emotions = {}
445
  gemini_sentiment = {}
446
  if emotion_pipeline is not None and sentiment_pipeline is not None:
447
  context_for_scoring = f"Original: {text}\nRewrite: {response_text}"
448
  emo_res_new = emotion_pipeline(context_for_scoring[:512])[0]
449
  gemini_emotions = filter_neutral(emo_res_new)
450
+ # keep top 3 emotions with scores
451
  sorted_emotions = sorted(gemini_emotions.items(), key=lambda x: x[1], reverse=True)
452
  gemini_emotions = dict(sorted_emotions[:3])
453
 
 
455
  gemini_sentiment = max(senti_res_new, key=lambda x: x["score"])
456
 
457
  # Guardrails on GEMINI output:
458
+ # If Gemini's suggested rewrite itself is strongly negative, we skip (treat as no rewrite)
459
  if gemini_sentiment["label"].upper() == "NEGATIVE" and gemini_sentiment["score"] >= 0.8:
460
  return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
461
 
 
464
  if emo.lower() in negative_emotions and score >= 0.8:
465
  return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
466
 
467
+ # If both approval and disapproval are high in the gemini re-score, skip as ambiguous
468
  if gemini_emotions.get("approval", 0) > 0.6 and gemini_emotions.get("disapproval", 0) > 0.6:
469
  return {}, f"✅ No rewrite needed. The {level.lower()} reads naturally and clearly."
470
 
471
+ # Attach SEO suggestions (lightweight) if possible (but minimal)
472
+ seo_tips = []
473
  try:
474
+ seo_data = compute_seo_suggestions_enhanced(text, [text])
475
+ seo_tips = seo_data.get("suggestions", [])[:2]
 
476
  except Exception:
477
+ seo_tips = []
478
 
479
+ # Format the final output: show the Gemini rewrite + its sentiment + top-3 emotions
480
  gem_emo_text = ", ".join([f"{k}: {v}" for k, v in gemini_emotions.items()]) if gemini_emotions else "N/A"
481
  gem_sent_text = f"{gemini_sentiment.get('label','N/A')} ({round(gemini_sentiment.get('score',0),3)})" if gemini_sentiment else "N/A"
482
 
483
+ seo_text = ""
484
+ if seo_tips:
485
+ seo_text = "\n\n💡 SEO Suggestions:\n- " + "\n- ".join(seo_tips)
486
+
487
  final_text = (
488
  f"✍️ {response_text}\n\n"
489
  f"✨ Gemini Rewrite Sentiment: {gem_sent_text}\n"
 
520
  st.write("Emotions →", headline_emotions)
521
  st.write("Sentiment →", headline_sentiment)
522
 
523
+ # Show SEO suggestions only once here
524
+ seo_data = compute_seo_suggestions_enhanced(headline, paragraphs)
525
+ if seo_data.get("suggestions"):
526
+ st.markdown("### 💡 SEO Suggestions (headline)")
527
+ for s in seo_data["suggestions"][:3]:
528
+ st.write("-", s)
529
+ if seo_data.get("headline_suggestions"):
530
+ st.markdown("### 📝 Headline ideas:")
531
+ for hs in seo_data["headline_suggestions"]:
532
+ st.write("-", hs)
533
 
534
  top3_headline, headline_insight = generate_insight(
535
  headline, headline_emotions, headline_sentiment, "Headline",
 
543
  })
544
 
545
  # -----------------------
546
+ # Overall Article Analysis (compute weighted emotions across cleaned paragraphs)
547
  # -----------------------
548
  if paragraphs:
549
  for p in paragraphs:
 
578
  })
579
 
580
  # -----------------------
581
+ # Paragraph Analysis (detect sub-headings and avoid SEO spam)
582
  # -----------------------
583
  for p_idx, para in enumerate(paragraphs, start=1):
584
+ # subheading heuristics
585
+ is_subheading = (
586
+ para.strip().endswith("?")
587
+ or len(para.split()) <= 8
588
+ or bool(re.match(r"^\d+[\.\)]", para.strip()))
589
+ or (sum(1 for w in para.split() if w.isupper()) >= 2 and len(para.split()) <= 10)
590
+ )
591
+
592
  para_counter, para_sentiments = Counter(), []
593
  sentences = split_sentences(para, lang[:2])
594
  for sentence in sentences:
 
603
  sorted_para = sorted(para_emotions.items(), key=lambda x: x[1], reverse=True)
604
  para_emotions = dict(sorted_para[:10])
605
  para_sentiment = max(para_sentiments, key=lambda x: x["score"]) if para_sentiments else {}
606
+
607
+ st.subheader(f"{'🧩 Sub-heading' if is_subheading else '📑 Paragraph'} {p_idx}")
608
  st.write(para)
609
  st.write("Emotions →", para_emotions)
610
  st.write("Sentiment →", para_sentiment)
611
 
612
+ # Show limited SEO only if NOT a sub-heading and only one focused tip
613
+ if not is_subheading:
614
+ try:
615
+ seo_data = compute_seo_suggestions_enhanced("", [para], top_n_keywords=3)
616
+ uniq_suggestion = seo_data["suggestions"][0] if seo_data.get("suggestions") else None
617
+ if uniq_suggestion:
618
+ st.markdown(f"💡 SEO Tip: {uniq_suggestion}")
619
+ except Exception:
620
+ pass
621
 
622
  top3_para, insight = generate_insight(
623
+ para, para_emotions, para_sentiment, "Sub-heading" if is_subheading else "Paragraph",
624
  emotion_pipeline=emotion_pipeline, sentiment_pipeline=sentiment_pipeline
625
  )
626
  st.write(insight)
627
  export_rows.append({
628
+ "Type": "Sub-heading" if is_subheading else "Paragraph","Text": para,
629
  "Emotions": para_emotions,"Sentiment": para_sentiment,
630
  "Top3": dict(top3_para),"Insight": insight
631
  })
 
644
  if st.button("🔍 Analyze"):
645
  with st.spinner("Running analysis... ⏳"):
646
  if uploaded_file:
647
+ # Streamlit's file uploader returns a BytesIO-like object; docx.Document accepts file-like objects
648
  headline, paragraphs = read_and_split_articles(uploaded_file)
649
  elif url_input.strip():
650
  headline, paragraphs = read_article_from_url(url_input)