Stephen Zweibel: Add initial implementation of FormatReview tool with core features and configurations (bb869fd)
import asyncio
import nest_asyncio
import logging
import json
from pprint import pprint
from config import settings
from pydantic import BaseModel, Field

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("crawl4ai_test")


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")


async def test_crawl():
    """Test crawl4ai functionality"""
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    url = "https://journal.code4lib.org/article-guidelines"

    # Configure the browser
    browser_config = BrowserConfig(verbose=True)

    # Configure the LLM extraction
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider=f"{settings.llm_provider}/{settings.llm_model_name}",
            api_token=settings.openrouter_api_key
        ),
        schema=FormattingRules.schema(),
        extraction_type="schema",
        instruction="""
        From the crawled content, extract all formatting rules for manuscript submissions.
        Focus on requirements for margins, font, line spacing, citations, section structure,
        and any other formatting guidelines. Provide a comprehensive extraction of all
        formatting-related information.
        """
    )

    # Configure the crawler
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        exclude_external_links=True,
        process_iframes=True,
        remove_overlay_elements=True,
        exclude_social_media_links=True,
        check_robots_txt=True,
        semaphore_count=3,
        extraction_strategy=extraction_strategy
    )
    # Initialize the crawler with the browser configuration and run the crawl
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config
        )
        # Print all attributes of the result object
        logger.info(f"Result object type: {type(result)}")
        logger.info(f"Result object dir: {dir(result)}")

        # Check for success
        logger.info(f"Success: {result.success}")

        # Check for markdown
        if hasattr(result, 'markdown'):
            logger.info(f"Has markdown: {bool(result.markdown)}")
            logger.info(f"Markdown type: {type(result.markdown)}")
            logger.info(f"Markdown preview: {str(result.markdown)[:200]}...")
        else:
            logger.info("No markdown attribute")

        # Check for extracted_data
        if hasattr(result, 'extracted_data'):
            logger.info(f"Has extracted_data: {bool(result.extracted_data)}")
            logger.info(f"Extracted data: {result.extracted_data}")
        else:
            logger.info("No extracted_data attribute")

        # Check for other potential attributes
        for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
            if hasattr(result, attr):
                logger.info(f"Has {attr}: {bool(getattr(result, attr))}")
                logger.info(f"{attr} preview: {str(getattr(result, attr))[:200]}...")

        # Try to access _results directly
        if hasattr(result, '_results'):
            logger.info(f"Has _results: {bool(result._results)}")
            if result._results:
                first_result = result._results[0]
                logger.info(f"First result type: {type(first_result)}")
                logger.info(f"First result dir: {dir(first_result)}")

                # Check if first result has extracted_data
                if hasattr(first_result, 'extracted_data'):
                    logger.info(f"First result has extracted_data: {bool(first_result.extracted_data)}")
                    logger.info(f"First result extracted_data: {first_result.extracted_data}")

                # Check for other attributes in first result
                for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
                    if hasattr(first_result, attr):
                        logger.info(f"First result has {attr}: {bool(getattr(first_result, attr))}")
                        logger.info(f"First result {attr} preview: {str(getattr(first_result, attr))[:200]}...")

    return result

def main():
    """Main function"""
    # Apply nest_asyncio
    nest_asyncio.apply()

    # Create a new event loop and run the async function
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result = loop.run_until_complete(test_crawl())
        logger.info("Test completed successfully")
    finally:
        loop.close()


if __name__ == "__main__":
    main()
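
Note that the script imports settings from a local config module that is not shown in this file, and reads llm_provider, llm_model_name, and openrouter_api_key from it. For reference, a minimal sketch of such a module might look like the following; the attribute names come from the script above, while the default values, the environment variable name, and the use of plain Pydantic are assumptions rather than part of this commit:

# config.py (hypothetical sketch, assumed structure; not part of this commit)
import os
from pydantic import BaseModel

class Settings(BaseModel):
    # Assumed provider prefix, since an OpenRouter API key is passed in the script above
    llm_provider: str = "openrouter"
    # Placeholder model identifier; replace with the model actually used
    llm_model_name: str = "your-model-name"
    # Assumed environment variable name for the API key
    openrouter_api_key: str = os.getenv("OPENROUTER_API_KEY", "")

settings = Settings()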