"""
GODDESS CRAWLER - High-performance async web scraper
Sellable tool - £20 one-time

Usage: python goddess_crawler.py seeds.txt output_dir/ --workers 100
"""
import argparse
import asyncio
import contextlib
import os
import re
import time
from pathlib import Path
from typing import Set

import aiohttp
| |
|
class GoddessCrawler:
    """Async crawler: fetches seed URLs, strips them to plain text, and writes
    one numbered .txt file per page into an output directory."""

    def __init__(self, output_dir: str, workers: int = 100, timeout: int = 15):
        self.output = Path(output_dir)
        # parents=True: a nested output path (e.g. runs/2024/out) must not crash.
        self.output.mkdir(parents=True, exist_ok=True)
        self.workers = workers
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.visited: Set[str] = set()  # URLs already requested (dedup guard)
        self.count = 0    # pages saved so far (also used for file numbering)
        self.bytes = 0    # total characters written (reported as GB/MB/s)
        self.errors = 0   # failed fetches
        self.start = time.time()

    def strip_html(self, html: str) -> str:
        """Return the visible text of *html*: remove script/style blocks and
        all tags, then collapse whitespace."""
        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.I)
        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.I)
        html = re.sub(r'<[^>]+>', ' ', html)  # each remaining tag becomes a space
        html = re.sub(r'\s+', ' ', html)      # collapse runs of whitespace
        return html.strip()

    async def fetch(self, session: aiohttp.ClientSession, url: str) -> str | None:
        """Fetch *url* at most once; return its stripped text, or None for a
        duplicate, a non-text/non-200 response, or a network error."""
        if url in self.visited:
            return None
        self.visited.add(url)
        try:
            async with session.get(url) as r:
                if r.status == 200 and 'text' in r.content_type:
                    html = await r.text()
                    return self.strip_html(html)
        except asyncio.CancelledError:
            # The original bare `except:` swallowed cancellation, making the
            # crawler un-cancellable. Cancellation must always propagate.
            raise
        except Exception:
            self.errors += 1
        return None

    async def process(self, session: aiohttp.ClientSession, url: str):
        """Fetch one URL and persist its text when substantial (> 200 chars)."""
        text = await self.fetch(session, url)
        if text and len(text) > 200:
            self.count += 1
            path = self.output / f"p_{self.count:08d}.txt"
            content = f"URL: {url}\n\n{text}"
            # Explicit UTF-8: the locale default can raise UnicodeEncodeError
            # on non-ASCII pages (common on Windows).
            # NOTE(review): this write is blocking inside the event loop;
            # acceptable for small files, revisit if pages get large.
            path.write_text(content, encoding="utf-8")
            self.bytes += len(content)

    async def crawl(self, seeds: list[str]):
        """Crawl every URL in *seeds* with bounded concurrency, printing a
        progress line every 15 seconds."""
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/121.0.0.0"}
        conn = aiohttp.TCPConnector(limit=self.workers, limit_per_host=10)
        async with aiohttp.ClientSession(connector=conn, timeout=self.timeout, headers=headers) as session:
            sem = asyncio.Semaphore(self.workers)

            async def bounded(url):
                # Semaphore caps in-flight requests at self.workers.
                async with sem:
                    await self.process(session, url)

            async def status():
                # Periodic progress report; cancelled once the crawl finishes.
                while True:
                    await asyncio.sleep(15)
                    elapsed = time.time() - self.start
                    rate = self.bytes / elapsed / 1e6
                    print(f"[{elapsed:.0f}s] {self.count} pages | {self.bytes/1e9:.2f}GB | {self.errors} err | {rate:.1f}MB/s")

            status_task = asyncio.create_task(status())
            try:
                await asyncio.gather(*[bounded(url) for url in seeds])
            finally:
                # Tear the reporter down even if gather raises, and await it so
                # the task is not leaked with an unretrieved CancelledError.
                status_task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await status_task

        print(f"\nDone! {self.count} pages, {self.bytes/1e9:.2f}GB")
| |
|
def main():
    """CLI entry point: parse arguments, load seed URLs, run the crawler."""
    p = argparse.ArgumentParser(description="High-performance web crawler")
    p.add_argument("seeds", help="File with URLs, one per line")
    p.add_argument("output", help="Output directory")
    p.add_argument("--workers", type=int, default=100, help="Concurrent connections")
    p.add_argument("--timeout", type=int, default=15, help="Request timeout")
    args = p.parse_args()

    # splitlines() + filter: the original split('\n') turned blank interior
    # lines into empty-string "URLs" and left \r on every line of CRLF files.
    raw = Path(args.seeds).read_text(encoding="utf-8").splitlines()
    seeds = [line.strip() for line in raw if line.strip()]
    print(f"Loaded {len(seeds)} seeds, {args.workers} workers")

    crawler = GoddessCrawler(args.output, args.workers, args.timeout)
    asyncio.run(crawler.crawl(seeds))
| |
|
| | if __name__ == "__main__": |
| | main() |
| |
|