# app/fetch/fetcher.py
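"""Fetch web pages and extract readable paragraphs.

Pipeline: fetch_html() downloads a page with browser-like headers and
timeouts, extract_main_text() pulls the article body (trafilatura first,
then readability-lxml, then raw soup text), and _split_paragraphs()
chunks, filters, and deduplicates the result.
"""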
from __future__ import annotations
import re
from typing import List, Optional
from urllib.parse import urlparse
import httpx
from bs4 import BeautifulSoup
from readability import Document
import trafilatura

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en;q=0.9",
}
TIMEOUT = httpx.Timeout(10.0, connect=5.0)
BLOCKED_SCHEMES = {"javascript", "data"}
BLOCKED_EXTS = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg"}

def _looks_blocked(url: str) -> bool:
    """Return True for URLs we refuse to fetch: blocked schemes or binary/image assets."""
    try:
        p = urlparse(url)
        if p.scheme in BLOCKED_SCHEMES:
            return True
        for ext in BLOCKED_EXTS:
            if p.path.lower().endswith(ext):
                return True
    except Exception:
        return True
    return False

async def fetch_html(url: str) -> Optional[str]:
    """Fetch a URL and return its HTML body, or None if blocked, non-HTML, or failed."""
    if _looks_blocked(url):
        return None
    try:
        async with httpx.AsyncClient(headers=HEADERS, timeout=TIMEOUT, follow_redirects=True) as client:
            resp = await client.get(url)
            # Raise on HTTP error statuses before trusting the body.
            resp.raise_for_status()
            ct = resp.headers.get("Content-Type", "")
            if "text/html" not in ct and "application/xhtml+xml" not in ct:
                return None
            return resp.text
    except httpx.HTTPError:
        # Treat network failures and error statuses as a miss, matching the Optional return.
        return None

def _clean_text(txt: str) -> str:
    txt = re.sub(r"\r\n|\r", "\n", txt)
    txt = re.sub(r"[ \t]+", " ", txt)
    txt = re.sub(r"\n{3,}", "\n\n", txt)
    return txt.strip()

def _bs4_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    # Drop nav, footer, script, style
    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside"]):
        tag.decompose()
    return _clean_text(soup.get_text("\n"))

def extract_main_text(html: str, base_url: str | None = None) -> Optional[str]:
    """Extract readable article text, trying extractors from strictest to loosest."""
    # Try trafilatura first
    try:
        txt = trafilatura.extract(html, url=base_url, include_comments=False, include_tables=False)
        if txt and len(txt) >= 400:
            return _clean_text(txt)
    except Exception:
        pass
    # Fallback to readability-lxml
    try:
        doc = Document(html)
        summary_html = doc.summary() or ""
        txt = _bs4_text(summary_html)
        if txt and len(txt) >= 300:
            return _clean_text(txt)
    except Exception:
        pass
    # Last resort: whole page text
    try:
        txt = _bs4_text(html)
        if txt and len(txt) >= 200:
            return _clean_text(txt)
    except Exception:
        pass
    return None

def _split_paragraphs(text: str) -> List[str]:
    # Split on blank lines first
    parts = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
    out: list[str] = []
    for p in parts:
        # Further split very long paragraphs by sentence groups
        if len(p) > 1200:
            chunks = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9])", p)
            buf: list[str] = []
            cur = ""
            for s in chunks:
                cur = (cur + " " + s).strip()
                if len(cur) >= 400:
                    buf.append(cur)
                    cur = ""
            if cur:
                buf.append(cur)
            out.extend(buf)
        else:
            out.append(p)
    # Filter short or junky lines
    out = [p for p in out if len(p) >= 160]
    # Deduplicate
    seen: set[str] = set()
    deduped: list[str] = []
    for p in out:
        key = re.sub(r"\W+", " ", p).strip().casefold()
        if key not in seen:
            seen.add(key)
            deduped.append(p)
    return deduped[:12]  # cap

async def get_paragraphs_for_url(url: str) -> List[str]:
    html = await fetch_html(url)
    if not html:
        return []
    text = extract_main_text(html, base_url=url)
    if not text:
        return []
    return _split_paragraphs(text)

async def get_paragraphs_with_fallback(url: str, snippet: str | None) -> List[str]:
    paras = await get_paragraphs_for_url(url)
    if paras:
        return paras
    return [snippet] if snippet else []
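
# Minimal usage sketch, not part of the original module: run the file directly
# to fetch one URL and print its extracted paragraphs. The command-line
# handling and the fallback URL below are assumptions for illustration only.
if __name__ == "__main__":
    import asyncio
    import sys

    async def _demo() -> None:
        # Hypothetical entry point: first CLI argument, else a placeholder URL.
        url = sys.argv[1] if len(sys.argv) > 1 else "https://example.com"
        paras = await get_paragraphs_with_fallback(url, snippet=None)
        for i, p in enumerate(paras, 1):
            print(f"[{i}] {p[:200]}")

    asyncio.run(_demo())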