etukurudinesh's picture
test3
8b441ff
import asyncio
from playwright.async_api import async_playwright
from typing import Dict, Optional
import time
class HTMLLoader:
    """Fetch rendered HTML from a URL using a headless Playwright Firefox browser.

    Intended to be used as an async context manager so the browser is always
    torn down::

        async with HTMLLoader() as loader:
            result = await loader.load_page("https://example.com")

    Tunable attributes (set on the instance before calling ``load_page``):

    * ``max_retries`` -- number of attempts ``load_page`` makes per URL.
    * ``timeout`` -- navigation timeout passed to ``page.goto`` (milliseconds).
    * ``wait_for_selector`` -- selector that must appear before the page is
      considered loaded.
    * ``delay_between_requests`` -- pause between failed attempts (seconds,
      passed to ``asyncio.sleep``).
    """

    def __init__(self):
        # Playwright handles; populated by __aenter__ and cleared on shutdown.
        # All three are initialised here so teardown is safe on an instance
        # that was never (or only partially) entered.
        self.playwright = None
        self.browser = None
        self.context = None
        self.max_retries = 3
        self.timeout = 30000
        self.wait_for_selector = "body"
        self.delay_between_requests = 1.0

    async def __aenter__(self):
        """Start Playwright, launch headless Firefox and open a browser context.

        Returns:
            This loader, ready for ``load_page`` calls.
        """
        self.playwright = await async_playwright().start()
        try:
            self.browser = await self.playwright.firefox.launch(headless=True)
            self.context = await self.browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )
        except BaseException:
            # __aexit__ is not invoked when __aenter__ raises, so release
            # whatever was already started before propagating the error.
            await self._shutdown()
            raise
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the context, the browser and the Playwright driver.

        Returns ``None``, so an exception raised inside the ``async with``
        body is never suppressed.
        """
        await self._shutdown()

    async def _shutdown(self):
        """Release every handle that is set, even if an earlier close fails."""
        context, browser, playwright = self.context, self.browser, self.playwright
        # Drop the references first so a repeated shutdown is a no-op.
        self.context = self.browser = self.playwright = None
        try:
            if context:
                await context.close()
        finally:
            try:
                if browser:
                    await browser.close()
            finally:
                if playwright:
                    await playwright.stop()

    async def _load_once(self, url: str) -> Dict[str, object]:
        """Make a single load attempt; the page is closed whatever happens."""
        page = await self.context.new_page()
        try:
            await page.goto(url, timeout=self.timeout)
            # Wait for the configured selector (the body by default) to exist.
            await page.wait_for_selector(
                self.wait_for_selector,
                timeout=10000
            )
            # Fixed extra wait so scripts can finish rendering dynamic content.
            await page.wait_for_timeout(2000)
            html_content = await page.content()
            title = await page.title()
            url_final = page.url
        finally:
            # Close on failure too; otherwise every failed attempt would
            # leave a page open in the shared context.
            await page.close()
        return {
            "html": html_content,
            "title": title,
            "url": url_final,
            "timestamp": int(time.time())
        }

    async def load_page(self, url: str) -> Optional[Dict[str, object]]:
        """Load HTML content from URL handling both static and dynamic sites.

        Args:
            url: Address to navigate to.

        Returns:
            A dict with keys ``"html"`` (page content), ``"title"``, ``"url"``
            (the page URL after navigation) and ``"timestamp"`` (integer
            seconds from ``time.time()``). ``None`` is returned only when
            ``max_retries`` is less than 1, i.e. no attempt is made.

        Raises:
            RuntimeError: If the loader has not been entered, so there is no
                browser context to open pages in.
            Exception: If every attempt fails; the last underlying error is
                attached as ``__cause__``.
        """
        if self.context is None:
            # Fail fast rather than spending every retry (and delay) on an
            # AttributeError from calling new_page() on None.
            raise RuntimeError(
                "HTMLLoader has no browser context; use 'async with HTMLLoader()' "
                "before calling load_page()"
            )

        for attempt in range(self.max_retries):
            try:
                return await self._load_once(url)
            except Exception as e:
                if attempt == self.max_retries - 1:
                    raise Exception(f"Failed to load {url}: {str(e)}") from e
                await asyncio.sleep(self.delay_between_requests)
        # Only reachable when max_retries < 1.
        return None