"""
GODDESS CRAWLER - High-performance async web scraper
Sellable tool - £20 one-time

Usage: python goddess_crawler.py seeds.txt output_dir/ --workers 100
"""
import argparse
import asyncio
import contextlib
import os
import re
import time
from pathlib import Path
from typing import Set

import aiohttp
| |
|
class GoddessCrawler:
    """Async crawler: fetches seed URLs, strips them to plain text, and writes
    one numbered .txt file per page into an output directory."""

    def __init__(self, output_dir: str, workers: int = 100, timeout: int = 15):
        self.output = Path(output_dir)
        # parents=True: a nested output path (e.g. runs/2024/out) must not crash.
        self.output.mkdir(parents=True, exist_ok=True)
        self.workers = workers
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.visited: Set[str] = set()  # URLs already requested (dedup guard)
        self.count = 0    # pages saved so far (also used for file numbering)
        self.bytes = 0    # total characters written (reported as GB/MB/s)
        self.errors = 0   # failed fetches
        self.start = time.time()

    def strip_html(self, html: str) -> str:
        """Return the visible text of *html*: remove script/style blocks and
        all tags, then collapse whitespace."""
        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.I)
        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.I)
        html = re.sub(r'<[^>]+>', ' ', html)  # each remaining tag becomes a space
        html = re.sub(r'\s+', ' ', html)      # collapse runs of whitespace
        return html.strip()

    async def fetch(self, session: aiohttp.ClientSession, url: str) -> str | None:
        """Fetch *url* at most once; return its stripped text, or None for a
        duplicate, a non-text/non-200 response, or a network error."""
        if url in self.visited:
            return None
        self.visited.add(url)
        try:
            async with session.get(url) as r:
                if r.status == 200 and 'text' in r.content_type:
                    html = await r.text()
                    return self.strip_html(html)
        except asyncio.CancelledError:
            # The original bare `except:` swallowed cancellation, making the
            # crawler un-cancellable. Cancellation must always propagate.
            raise
        except Exception:
            self.errors += 1
        return None

    async def process(self, session: aiohttp.ClientSession, url: str):
        """Fetch one URL and persist its text when substantial (> 200 chars)."""
        text = await self.fetch(session, url)
        if text and len(text) > 200:
            self.count += 1
            path = self.output / f"p_{self.count:08d}.txt"
            content = f"URL: {url}\n\n{text}"
            # Explicit UTF-8: the locale default can raise UnicodeEncodeError
            # on non-ASCII pages (common on Windows).
            # NOTE(review): this write is blocking inside the event loop;
            # acceptable for small files, revisit if pages get large.
            path.write_text(content, encoding="utf-8")
            self.bytes += len(content)

    async def crawl(self, seeds: list[str]):
        """Crawl every URL in *seeds* with bounded concurrency, printing a
        progress line every 15 seconds."""
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/121.0.0.0"}
        conn = aiohttp.TCPConnector(limit=self.workers, limit_per_host=10)
        async with aiohttp.ClientSession(connector=conn, timeout=self.timeout, headers=headers) as session:
            sem = asyncio.Semaphore(self.workers)

            async def bounded(url):
                # Semaphore caps in-flight requests at self.workers.
                async with sem:
                    await self.process(session, url)

            async def status():
                # Periodic progress report; cancelled once the crawl finishes.
                while True:
                    await asyncio.sleep(15)
                    elapsed = time.time() - self.start
                    rate = self.bytes / elapsed / 1e6
                    print(f"[{elapsed:.0f}s] {self.count} pages | {self.bytes/1e9:.2f}GB | {self.errors} err | {rate:.1f}MB/s")

            status_task = asyncio.create_task(status())
            try:
                await asyncio.gather(*[bounded(url) for url in seeds])
            finally:
                # Tear the reporter down even if gather raises, and await it so
                # the task is not leaked with an unretrieved CancelledError.
                status_task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await status_task

        print(f"\nDone! {self.count} pages, {self.bytes/1e9:.2f}GB")
| |
|
def main():
    """CLI entry point: parse arguments, load seed URLs, run the crawler."""
    p = argparse.ArgumentParser(description="High-performance web crawler")
    p.add_argument("seeds", help="File with URLs, one per line")
    p.add_argument("output", help="Output directory")
    p.add_argument("--workers", type=int, default=100, help="Concurrent connections")
    p.add_argument("--timeout", type=int, default=15, help="Request timeout")
    args = p.parse_args()

    # splitlines() + filter: the original split('\n') turned blank interior
    # lines into empty-string "URLs" and left \r on every line of CRLF files.
    raw = Path(args.seeds).read_text(encoding="utf-8").splitlines()
    seeds = [line.strip() for line in raw if line.strip()]
    print(f"Loaded {len(seeds)} seeds, {args.workers} workers")

    crawler = GoddessCrawler(args.output, args.workers, args.timeout)
    asyncio.run(crawler.crawl(seeds))
| |
|
| | if __name__ == "__main__": |
| | main() |
| |
|