# Page header captured along with this file ("Spaces: Sleeping") - not part of the code.
import asyncio
import time
from typing import Any, Dict, Optional

from playwright.async_api import async_playwright
class HTMLLoader:
    """Async context manager that loads rendered HTML through Playwright.

    Entering the context starts Playwright, launches headless Firefox and
    opens one browser context; leaving it releases all three. Typical use::

        async with HTMLLoader() as loader:
            result = await loader.load_page("https://example.com")

    The tunables below are plain instance attributes and may be changed
    after construction.
    """

    def __init__(self):
        # Playwright handles: set by __aenter__, released by __aexit__.
        self.playwright = None
        self.browser = None
        self.context = None
        # Total number of attempts load_page makes before giving up.
        self.max_retries = 3
        # Timeout passed to page.goto (milliseconds).
        self.timeout = 30000
        # Selector that must appear before the page counts as loaded, and
        # the timeout passed to that wait (milliseconds).
        self.wait_for_selector = "body"
        self.selector_timeout = 10000
        # Extra fixed wait after the selector appears, giving dynamic
        # content time to render (milliseconds).
        self.dynamic_content_wait = 2000
        # Pause between failed attempts (seconds).
        self.delay_between_requests = 1.0

    async def __aenter__(self):
        """Start Playwright, launch headless Firefox and open a context.

        Returns:
            This loader, ready for ``load_page`` calls.
        """
        try:
            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.firefox.launch(headless=True)
            self.context = await self.browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )
        except BaseException:
            # __aexit__ is not invoked when __aenter__ raises, so release
            # whatever was already started before propagating the error.
            await self.__aexit__(None, None, None)
            raise
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the context, the browser and Playwright, in that order.

        Each step runs even if an earlier one raises. Exceptions from the
        ``async with`` body are never suppressed.
        """
        context, browser, playwright = self.context, self.browser, self.playwright
        # Drop the references first so a repeated exit is a harmless no-op.
        self.context = self.browser = self.playwright = None
        try:
            if context:
                await context.close()
        finally:
            try:
                if browser:
                    await browser.close()
            finally:
                if playwright:
                    await playwright.stop()

    async def load_page(self, url: str) -> Dict[str, Any]:
        """Load HTML content from URL handling both static and dynamic sites.

        Makes up to ``max_retries`` attempts (at least one), sleeping
        ``delay_between_requests`` seconds between failed attempts.

        Args:
            url: Address to navigate to.

        Returns:
            A dict with ``"html"`` (page markup), ``"title"``, ``"url"``
            (the page URL after navigation) and ``"timestamp"`` (integer
            Unix time).

        Raises:
            RuntimeError: If the loader has not been entered with
                ``async with`` and therefore has no browser context.
            Exception: If every attempt fails; the last underlying error is
                attached as ``__cause__``.
        """
        if self.context is None:
            # Fail fast instead of burning every retry on a None attribute.
            raise RuntimeError(
                "HTMLLoader is not started; use 'async with HTMLLoader() as loader'"
            )

        attempts = max(1, self.max_retries)
        last_error = None
        for attempt in range(1, attempts + 1):
            try:
                return await self._load_once(url)
            except Exception as e:
                last_error = e
                if attempt < attempts:
                    await asyncio.sleep(self.delay_between_requests)
        raise Exception(f"Failed to load {url}: {str(last_error)}") from last_error

    async def _load_once(self, url: str) -> Dict[str, Any]:
        """Make one load attempt in a fresh page, closing it on every path."""
        page = await self.context.new_page()
        try:
            await page.goto(url, timeout=self.timeout)
            # Wait for the configured selector (the document body by default).
            await page.wait_for_selector(
                self.wait_for_selector,
                timeout=self.selector_timeout,
            )
            # Additional wait for dynamic content.
            await page.wait_for_timeout(self.dynamic_content_wait)
            html_content = await page.content()
            title = await page.title()
            return {
                "html": html_content,
                "title": title,
                "url": page.url,
                "timestamp": int(time.time()),
            }
        finally:
            # Close the page even when an attempt fails, so retries do not
            # accumulate open pages in the browser context.
            await page.close()