# Gradio front-end for the MCP web-scraping pipeline.
import gradio as gr
import asyncio
from main import WebScrapingOrchestrator
import os
# Single shared orchestrator instance, reused across all Gradio requests.
orchestrator = WebScrapingOrchestrator()
async def scrape_async(url):
    """Scrape *url* through the orchestrator and shape the result for display.

    Args:
        url (str): The webpage URL to scrape.

    Returns:
        dict: ``{"Error": <message>}`` when the orchestrator reports a
        failure, otherwise a JSON-compatible summary with the keys
        ``URL``, ``Title``, ``Text Length``, ``Headings``,
        ``Main Topics`` and ``Summary (Short)``.
    """
    result = await orchestrator.process_url(url)
    if "error" in result:
        # Bug fix: the original wrapped the message in a set literal
        # ({result['error']}), producing a set that gr.JSON cannot
        # serialize. Return the plain message string instead.
        return {"Error": result["error"]}
    summary_text = result["llm_ready_data"]["text_summary"]
    # Only append the ellipsis when the summary was actually truncated.
    if len(summary_text) > 800:
        summary_text = summary_text[:800] + "..."
    return {
        "URL": result.get("url"),
        "Title": result.get("title"),
        "Text Length": result["summary"]["text_length"],
        "Headings": result["llm_ready_data"]["key_headings"],
        "Main Topics": result["llm_ready_data"]["main_topics"],
        "Summary (Short)": summary_text,
    }
def scrape(url):
    """Synchronous wrapper around :func:`scrape_async` for Gradio callbacks.

    Gradio's click handler is synchronous, so this spins up a fresh event
    loop via ``asyncio.run`` to drive the async scraping pipeline.

    Args:
        url (str): The URL of the webpage to scrape
            (e.g. ``'https://example.com'``).

    Returns:
        dict: The JSON-compatible payload produced by ``scrape_async`` —
        either an ``{"Error": ...}`` mapping on failure, or the scraped,
        LLM-ready fields: ``URL``, ``Title``, ``Text Length``,
        ``Headings``, ``Main Topics`` and ``Summary (Short)`` (truncated
        to 800 characters).

    Example:
        >>> scrape("https://example.com")  # doctest: +SKIP
        {"URL": "https://example.com", "Title": "Example Page", ...}
    """
    return asyncio.run(scrape_async(url))
# Assemble the Gradio interface: a URL textbox in, a JSON summary panel out.
with gr.Blocks(title="MCP Web Scraper") as demo:
    gr.Markdown("### ๐Ÿ” MCP LLM Web Scraper")
    url_box = gr.Textbox(label="Enter a webpage URL", placeholder="https://...")
    result_view = gr.JSON(label="Scraped & LLM-ready Content")
    run_button = gr.Button("Scrape Page")
    # Wire the button to the synchronous scrape wrapper.
    run_button.click(scrape, inputs=url_box, outputs=result_view)
if __name__ == "__main__":
    # Launch with Gradio's MCP server enabled so the scrape tool is
    # exposed over the Model Context Protocol in addition to the web UI.
    demo.launch(mcp_server=True)