File size: 2,322 Bytes
feea636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b441ff
 
feea636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import asyncio
from playwright.async_api import async_playwright
from typing import Dict, Optional
import time

class HTMLLoader:
    """Async context manager that fetches fully-rendered HTML via Playwright.

    Launches a headless Firefox browser once, then loads pages through a
    shared browser context, retrying transient failures so both static and
    JavaScript-heavy sites can be captured.
    """

    def __init__(self):
        # Playwright handles; populated by __aenter__, released by __aexit__.
        self.playwright = None  # initialized here so __aexit__ is safe even if __aenter__ failed
        self.browser = None
        self.context = None
        self.max_retries = 3                 # attempts per URL before giving up (was assigned twice)
        self.timeout = 30000                 # page.goto timeout, milliseconds
        self.wait_for_selector = "body"      # selector that must exist before capture
        self.delay_between_requests = 1.0    # seconds to sleep between retry attempts

    async def __aenter__(self):
        """Start Playwright and open a headless Firefox browser context."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.firefox.launch(headless=True)

        # A desktop UA avoids trivially bot-blocked / mobile-variant pages.
        self.context = await self.browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Release context, browser, and the Playwright driver, in that order."""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def load_page(self, url: str) -> Dict[str, object]:
        """Load HTML content from *url*, handling static and dynamic sites.

        Retries up to ``max_retries`` times, sleeping
        ``delay_between_requests`` seconds between attempts.

        Returns:
            Dict with keys ``html`` (str), ``title`` (str), ``url`` (str,
            final URL after redirects) and ``timestamp`` (int epoch seconds).

        Raises:
            RuntimeError: when every attempt fails; chained from the last
                underlying error (subclass of Exception, so existing callers
                catching Exception still work).
        """
        for attempt in range(self.max_retries):
            page = None
            try:
                page = await self.context.new_page()
                await page.goto(url, timeout=self.timeout)

                # Wait for the configured selector (default: body) to appear.
                await page.wait_for_selector(
                    self.wait_for_selector,
                    timeout=10000
                )

                # Grace period so late-running JS can populate the DOM.
                await page.wait_for_timeout(2000)

                return {
                    "html": await page.content(),
                    "title": await page.title(),
                    "url": page.url,  # final URL, after any redirects
                    "timestamp": int(time.time()),
                }

            except Exception as e:
                if attempt == self.max_retries - 1:
                    raise RuntimeError(f"Failed to load {url}: {str(e)}") from e
                await asyncio.sleep(self.delay_between_requests)
            finally:
                # Always release the page — the original leaked it on failure.
                if page is not None:
                    await page.close()