Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import asyncio | |
| from main import WebScrapingOrchestrator | |
| import os | |
# Single shared orchestrator instance, reused for every scrape request.
orchestrator = WebScrapingOrchestrator()
async def scrape_async(url):
    """Scrape *url* via the shared orchestrator and shape the result for display.

    Args:
        url (str): The webpage URL to scrape (e.g. "https://example.com").

    Returns:
        dict: ``{"Error": <message>}`` when the orchestrator reports a failure,
        otherwise a flat summary dict (URL, title, text length, headings,
        main topics, and a short text summary) suitable for ``gr.JSON``.
    """
    result = await orchestrator.process_url(url)
    if "error" in result:
        # Fix: the original wrote {result['error']}, which builds a one-element
        # *set* — not JSON-serializable. Return the message string directly.
        return {"Error": result["error"]}
    summary_text = result["llm_ready_data"]["text_summary"]
    # Only append an ellipsis when the summary was actually truncated.
    short_summary = summary_text[:800] + ("..." if len(summary_text) > 800 else "")
    return {
        "URL": result.get("url"),
        "Title": result.get("title"),
        "Text Length": result["summary"]["text_length"],
        "Headings": result["llm_ready_data"]["key_headings"],
        "Main Topics": result["llm_ready_data"]["main_topics"],
        "Summary (Short)": short_summary,
    }
def scrape(url):
    """Synchronous Gradio entry point: run ``scrape_async`` to completion.

    Gradio click handlers here are plain callables, so this wrapper drives the
    async scrape with ``asyncio.run`` and returns its result unchanged.

    Args:
        url (str): The webpage URL to scrape (e.g. "https://example.com").

    Returns:
        dict: The JSON-ready dict produced by ``scrape_async`` — either an
        ``{"Error": ...}`` payload or the scraped-content summary
        (URL, title, text length, headings, main topics, short summary).

    NOTE(review): the previous docstring claimed Playwright rendering and
    MongoDB persistence; neither is visible from this module — any such
    behavior lives inside ``WebScrapingOrchestrator.process_url`` and should
    be documented there.
    """
    return asyncio.run(scrape_async(url))
# --- Gradio UI --------------------------------------------------------------
# One text input, one JSON output, and a button wired to the sync `scrape`
# wrapper. `demo` is the Blocks app launched from the __main__ guard.
with gr.Blocks(title="MCP Web Scraper") as demo:
    gr.Markdown("### ๐ MCP LLM Web Scraper")
    page_url = gr.Textbox(label="Enter a webpage URL", placeholder="https://...")
    scraped_view = gr.JSON(label="Scraped & LLM-ready Content")
    run_button = gr.Button("Scrape Page")
    run_button.click(scrape, inputs=page_url, outputs=scraped_view)
if __name__ == "__main__":
    # Launch with MCP server support so the scrape tool is exposed over MCP
    # in addition to the regular web UI.
    demo.launch(mcp_server=True)