Stephen Zweibel: Add initial implementation of FormatReview tool with core features and configurations (bb869fd)
import asyncio
import nest_asyncio
import logging
import json
from pprint import pprint
from config import settings
from pydantic import BaseModel, Field

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("crawl4ai_test")


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")


async def test_crawl():
    """Test crawl4ai functionality"""
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    url = "https://journal.code4lib.org/article-guidelines"

    # Configure the browser
    browser_config = BrowserConfig(verbose=True)

    # Configure the LLM extraction
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider=f"{settings.llm_provider}/{settings.llm_model_name}",
            api_token=settings.openrouter_api_key
        ),
        schema=FormattingRules.schema(),
        extraction_type="schema",
        instruction="""
        From the crawled content, extract all formatting rules for manuscript submissions.
        Focus on requirements for margins, font, line spacing, citations, section structure,
        and any other formatting guidelines. Provide a comprehensive extraction of all
        formatting-related information.
        """
    )

    # Configure the crawler
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        exclude_external_links=True,
        process_iframes=True,
        remove_overlay_elements=True,
        exclude_social_media_links=True,
        check_robots_txt=True,
        semaphore_count=3,
        extraction_strategy=extraction_strategy
    )
    # Initialize the crawler with the browser configuration and run the crawl
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config
        )
        # Print all attributes of the result object
        logger.info(f"Result object type: {type(result)}")
        logger.info(f"Result object dir: {dir(result)}")

        # Check for success
        logger.info(f"Success: {result.success}")

        # Check for markdown
        if hasattr(result, 'markdown'):
            logger.info(f"Has markdown: {bool(result.markdown)}")
            logger.info(f"Markdown type: {type(result.markdown)}")
            logger.info(f"Markdown preview: {str(result.markdown)[:200]}...")
        else:
            logger.info("No markdown attribute")

        # Check for extracted_data
        if hasattr(result, 'extracted_data'):
            logger.info(f"Has extracted_data: {bool(result.extracted_data)}")
            logger.info(f"Extracted data: {result.extracted_data}")
        else:
            logger.info("No extracted_data attribute")

        # Check for other potential attributes
        for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
            if hasattr(result, attr):
                logger.info(f"Has {attr}: {bool(getattr(result, attr))}")
                logger.info(f"{attr} preview: {str(getattr(result, attr))[:200]}...")

        # Try to access _results directly
        if hasattr(result, '_results'):
            logger.info(f"Has _results: {bool(result._results)}")
            if result._results:
                first_result = result._results[0]
                logger.info(f"First result type: {type(first_result)}")
                logger.info(f"First result dir: {dir(first_result)}")

                # Check if first result has extracted_data
                if hasattr(first_result, 'extracted_data'):
                    logger.info(f"First result has extracted_data: {bool(first_result.extracted_data)}")
                    logger.info(f"First result extracted_data: {first_result.extracted_data}")

                # Check for other attributes in first result
                for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
                    if hasattr(first_result, attr):
                        logger.info(f"First result has {attr}: {bool(getattr(first_result, attr))}")
                        logger.info(f"First result {attr} preview: {str(getattr(first_result, attr))[:200]}...")

    return result

def main():
    """Main function"""
    # Apply nest_asyncio
    nest_asyncio.apply()

    # Create a new event loop and run the async function
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result = loop.run_until_complete(test_crawl())
        logger.info("Test completed successfully")
    finally:
        loop.close()


if __name__ == "__main__":
    main()
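
Note that the script imports settings from a local config module that is not shown in this file, and reads llm_provider, llm_model_name, and openrouter_api_key from it. For reference, a minimal sketch of such a module might look like the following; the attribute names come from the script above, while the default values, the environment variable name, and the use of plain Pydantic are assumptions rather than part of this commit:

# config.py (hypothetical sketch, assumed structure; not part of this commit)
import os
from pydantic import BaseModel

class Settings(BaseModel):
    # Assumed provider prefix, since an OpenRouter API key is passed in the script above
    llm_provider: str = "openrouter"
    # Placeholder model identifier; replace with the model actually used
    llm_model_name: str = "your-model-name"
    # Assumed environment variable name for the API key
    openrouter_api_key: str = os.getenv("OPENROUTER_API_KEY", "")

settings = Settings()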