# Gradio front-end for the MCP web-scraping pipeline.
import gradio as gr
import asyncio
from main import WebScrapingOrchestrator
import os
# Single shared orchestrator instance, reused across all Gradio requests.
orchestrator = WebScrapingOrchestrator()
async def scrape_async(url):
    """Scrape *url* through the orchestrator and shape the result for display.

    Args:
        url (str): The webpage URL to scrape.

    Returns:
        dict: ``{"Error": <message>}`` when the orchestrator reports a
        failure, otherwise a JSON-compatible summary with the keys
        ``URL``, ``Title``, ``Text Length``, ``Headings``,
        ``Main Topics`` and ``Summary (Short)``.
    """
    result = await orchestrator.process_url(url)
    if "error" in result:
        # Bug fix: the original wrapped the message in a set literal
        # ({result['error']}), producing a set that gr.JSON cannot
        # serialize. Return the plain message string instead.
        return {"Error": result["error"]}
    summary_text = result["llm_ready_data"]["text_summary"]
    # Only append the ellipsis when the summary was actually truncated.
    if len(summary_text) > 800:
        summary_text = summary_text[:800] + "..."
    return {
        "URL": result.get("url"),
        "Title": result.get("title"),
        "Text Length": result["summary"]["text_length"],
        "Headings": result["llm_ready_data"]["key_headings"],
        "Main Topics": result["llm_ready_data"]["main_topics"],
        "Summary (Short)": summary_text,
    }
def scrape(url):
    """Synchronous wrapper around :func:`scrape_async` for Gradio callbacks.

    Gradio's click handler is synchronous, so this spins up a fresh event
    loop via ``asyncio.run`` to drive the async scraping pipeline.

    Args:
        url (str): The URL of the webpage to scrape
            (e.g. ``'https://example.com'``).

    Returns:
        dict: The JSON-compatible payload produced by ``scrape_async`` —
        either an ``{"Error": ...}`` mapping on failure, or the scraped,
        LLM-ready fields: ``URL``, ``Title``, ``Text Length``,
        ``Headings``, ``Main Topics`` and ``Summary (Short)`` (truncated
        to 800 characters).

    Example:
        >>> scrape("https://example.com")  # doctest: +SKIP
        {"URL": "https://example.com", "Title": "Example Page", ...}
    """
    return asyncio.run(scrape_async(url))
# Assemble the Gradio interface: a URL textbox in, a JSON summary panel out.
with gr.Blocks(title="MCP Web Scraper") as demo:
    gr.Markdown("### ๐Ÿ” MCP LLM Web Scraper")
    url_box = gr.Textbox(label="Enter a webpage URL", placeholder="https://...")
    result_view = gr.JSON(label="Scraped & LLM-ready Content")
    run_button = gr.Button("Scrape Page")
    # Wire the button to the synchronous scrape wrapper.
    run_button.click(scrape, inputs=url_box, outputs=result_view)
if __name__ == "__main__":
    # Launch with Gradio's MCP server enabled so the scrape tool is
    # exposed over the Model Context Protocol in addition to the web UI.
    demo.launch(mcp_server=True)