|
|
import ast
import json
import os
from datetime import datetime, date
from typing import Union, Type, Any, List, Dict, get_origin, get_args
from uuid import uuid4

import regex
import yaml
from pydantic import BaseModel
from pydantic_core import PydanticUndefined, ValidationError

from .logging import logger
|
|
|
|
|
def make_parent_folder(path: str):
    """Ensure the parent directory of ``path`` exists, creating it if needed.

    A bare filename (no directory component) requires no action.
    """
    parent = os.path.dirname(path)
    if not parent.strip():
        # No directory part: nothing to create.
        return
    if not os.path.exists(parent):
        os.makedirs(parent, exist_ok=True)
|
|
|
|
|
def generate_id():
    """Return a random 32-character lowercase hexadecimal identifier."""
    # Equivalent to uuid4().hex: the canonical string minus its dashes.
    return str(uuid4()).replace("-", "")
|
|
|
|
|
def get_timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    # format(dt, spec) delegates to dt.__format__, i.e. strftime.
    return format(datetime.now(), "%Y-%m-%d %H:%M:%S")
|
|
|
|
|
def load_json(path: str, type: str="json"):
    """
    Load data from a JSON or JSON Lines file.

    Args:
        path(str): Path of the file to read.
        type(str): File format, chosen from ["json", "jsonl"].

    Returns:
        The parsed data: a JSON value for type=="json", a list of parsed
        objects for type=="jsonl", or None if the file is missing or the
        "json" file cannot be parsed.
    """
    assert type in ["json", "jsonl"]
    if not os.path.exists(path):
        logger.error(f"File \"{path}\" does not exist!")
        # BUG FIX: previously execution fell through and crashed later with
        # an unrelated UnboundLocalError / FileNotFoundError.
        return None

    if type == "json":
        # BUG FIX: initialize so a failed parse no longer raises
        # UnboundLocalError at the return statement below.
        outputs = None
        try:
            with open(path, "r", encoding="utf-8") as file:
                outputs = json.loads(file.read())
        except Exception:
            logger.error(f"File \"{path}\" is not a valid json file!")
    else:  # type == "jsonl" (guaranteed by the assert above)
        outputs = []
        with open(path, "r", encoding="utf-8") as fin:
            for line in fin:
                # Skip blank lines so a trailing newline does not crash.
                if line.strip():
                    outputs.append(json.loads(line))

    return outputs
|
|
|
|
|
def save_json(data, path: str, type: str="json", use_indent: bool=True) -> str:
    """
    save data to a json file

    Args:
        data: The json data to be saved. It can be a JSON str or a Serializable object when type=="json" or a list of JSON str or Serializable object when type=="jsonl".
        path(str): The path of the saved json file.
        type(str): The type of the json file, chosen from ["json" or "jsonl"].
        use_indent: Whether to use indent when saving the json file.

    Returns:
        path: the path where the json data is saved.
    """
    assert type in ["json", "jsonl"]

    # Create the parent directory if it is missing, so the write succeeds.
    parent = os.path.dirname(path)
    if parent.strip() and not os.path.exists(parent):
        os.makedirs(parent, exist_ok=True)

    def render(obj, indent=None):
        # Strings are assumed to be JSON already; everything else is dumped.
        return obj if isinstance(obj, str) else json.dumps(obj, indent=indent)

    if type == "json":
        with open(path, "w", encoding="utf-8") as fout:
            fout.write(render(data, indent=4 if use_indent else None))
    elif type == "jsonl":
        with open(path, "w", encoding="utf-8") as fout:
            fout.writelines(f"{render(item)}\n" for item in data)

    return path
|
|
|
|
|
def escape_json_values(string: str) -> str:
    """Attempt to repair a JSON string whose values contain unescaped quotes
    or raw newlines.

    Returns the input unchanged if it already parses as JSON, the repaired
    string if the repair produces valid JSON, or the partially transformed
    string if the repair attempt still fails.
    """

    def escape_value(match):
        # Re-quote a matched simple value, escaping raw newlines inside it.
        raw_value = match.group(1)
        raw_value = raw_value.replace('\n', '\\n')
        return f'"{raw_value}"'

    def fix_json(match):
        # Re-quote a key/value pair whose value embeds nested JSON braces,
        # escaping newlines and any still-unescaped quotes in the value.
        raw_key = match.group(1)
        raw_value = match.group(2)
        raw_value = raw_value.replace("\n", "\\n")
        raw_value = regex.sub(r'(?<!\\)"', '\\\"', raw_value)
        return f'"{raw_key}": "{raw_value}"'

    # Fast path: already valid JSON, nothing to repair.
    try:
        json.loads(string)
        return string
    except json.JSONDecodeError:
        pass

    # Repair strategy: escape every quote, then selectively restore the
    # quotes that actually delimit keys and values. Step order matters.
    try:
        # 1) Escape all quotes that are not already escaped.
        string = regex.sub(r'(?<!\\)"', '\\\"', string)
        # 2) Restore quotes around keys (a quoted token followed by a colon).
        pattern_key = r'\\"([^"]+)\\"(?=\s*:\s*)'
        string = regex.sub(pattern_key, r'"\1"', string)
        # 3) Restore quotes around simple values (preceded by a colon).
        pattern_value = r'(?<=:\s*)\\"((?:\\.|[^"\\])*)\\"'
        string = regex.sub(pattern_value, escape_value, string, flags=regex.DOTALL)
        # 4) Handle values that themselves contain a JSON object in braces.
        pattern_nested_json = r'"([^"]+)"\s*:\s*\\"([^"]*\{+[\S\s]*?\}+)[\r\n\\n]*"'
        string = regex.sub(pattern_nested_json, fix_json, string, flags=regex.DOTALL)
        json.loads(string)
        return string
    except json.JSONDecodeError:
        pass

    # Repair failed: hand back the transformed string as a best effort.
    return string
|
|
|
|
|
def fix_json_booleans(string: str) -> str:
    r"""Lowercase Python-style booleans so the string conforms to JSON.

    Replaces every standalone ``True``/``False`` token with ``true``/``false``.
    The ``\b`` word boundaries in the patterns ensure only whole words match,
    so substrings such as "True" inside "IsTrue" are left untouched.

    Args:
        string (str): The input JSON-like string.

    Returns:
        str: The modified JSON string with booleans in lowercase.
    """
    result = string
    for python_literal, json_literal in (("True", "true"), ("False", "false")):
        result = regex.sub(rf'\b{python_literal}\b', json_literal, result)
    return result
|
|
|
|
|
|
|
|
def fix_json(string: str) -> str:
    """Normalize a JSON-like string: lowercase booleans, then repair quoting."""
    return escape_json_values(fix_json_booleans(string))
|
|
|
|
|
|
|
|
def parse_json_from_text(text: str) -> List[str]:
    """
    Autoregressively extract JSON object from text

    Args:
        text (str): a text that includes JSON data

    Returns:
        List[str]: a list of parsed JSON data
    """
    # Recursive pattern ((?R), a `regex`-module feature): matches balanced
    # {...} objects or [...] arrays, including arbitrarily nested ones.
    json_pattern = r"""(?:\{(?:[^{}]*|(?R))*\}|\[(?:[^\[\]]*|(?R))*\])"""
    compiled = regex.compile(json_pattern, regex.VERBOSE)
    return [fix_json(candidate) for candidate in compiled.findall(text)]
|
|
|
|
|
|
|
|
def parse_xml_from_text(text: str, label: str) -> List[str]:
    """Extract the stripped contents of every ``<label>...</label>`` pair.

    Matching is non-greedy and spans newlines (DOTALL). Returns an empty
    list when no tagged span is found.
    """
    tag_pattern = rf"<{label}>(.*?)</{label}>"
    found: List[str] = regex.findall(tag_pattern, text, regex.DOTALL)
    return [item.strip() for item in found] if found else []
|
|
|
|
|
def parse_data_from_text(text: str, datatype: str):
    """Convert ``text`` to a Python value of the requested ``datatype``.

    Args:
        text (str): The raw textual value.
        datatype (str): One of "str", "int", "float", "bool", "list", "dict".
            Any other value returns ``text`` unchanged.

    Returns:
        The converted value, or ``text`` itself for unknown datatypes.

    Raises:
        ValueError: If ``text`` cannot be parsed as the requested type.
    """
    if datatype == "str":
        return text
    if datatype == "int":
        return int(text)
    if datatype == "float":
        return float(text)
    if datatype == "bool":
        # Accept common truthy spellings, case-insensitively. The original
        # tuple also listed "True", which text.lower() can never equal.
        return text.lower() in ("true", "yes", "1", "on")
    if datatype in ("list", "dict"):
        # SECURITY FIX: ast.literal_eval only evaluates Python literals,
        # unlike eval() which would execute arbitrary code from potentially
        # untrusted (e.g. model-generated) text.
        return ast.literal_eval(text)
    return text
|
|
|
|
|
def parse_json_from_llm_output(text: str) -> dict:
    """
    Extract JSON str from LLM outputs and convert it to dict.

    Takes the first JSON candidate found in ``text``.

    Raises:
        ValueError: If ``text`` contains no JSON string, or the extracted
            candidate cannot be parsed.
    """
    json_list = parse_json_from_text(text=text)
    if not json_list:
        # BUG FIX: corrected the typo "follwoing" in the error message.
        raise ValueError(f"The following generated text does not contain JSON string!\n{text}")
    json_text = json_list[0]
    try:
        # yaml.safe_load tolerates minor JSON deviations (e.g. single quotes)
        # while still parsing strict JSON correctly.
        data = yaml.safe_load(json_text)
    except Exception:
        raise ValueError(f"The following generated text is not a valid JSON string!\n{json_text}")
    return data
|
|
|
|
|
def extract_code_blocks(text: str, return_type: bool = False) -> Union[List[str], List[tuple]]:
    """
    Extract code blocks from text enclosed in triple backticks.

    Args:
        text (str): The text containing code blocks
        return_type (bool): If True, returns tuples of (language, code), otherwise just code

    Returns:
        Union[List[str], List[tuple]]: Either list of code blocks or list of (language, code) tuples
    """
    fence_pattern = r"```((?:[a-zA-Z]*)?)\n*(.*?)\n*```"
    found = regex.findall(fence_pattern, text, regex.DOTALL)

    # No fenced block at all: treat the whole text as one anonymous block.
    if not found:
        stripped = text.strip()
        return [(None, stripped)] if return_type else [stripped]

    if not return_type:
        return [body.strip() for _, body in found]
    # An empty language tag is normalized to None.
    return [(lang.strip() or None, body.strip()) for lang, body in found]
|
|
|
|
|
def remove_repr_quotes(json_string):
    """Strip surrounding quotes from repr-style values such as ``"Foo(...)"``.

    Turns e.g. ``"Point(1, 2)"`` into ``Point(1, 2)`` so object reprs
    embedded by the serializer are not rendered as JSON strings.
    """
    repr_pattern = r'"([A-Za-z_]\w*\(.*\))"'
    return regex.sub(repr_pattern, r'\1', json_string)
|
|
|
|
|
def custom_serializer(obj: Any):
    """Fallback serializer for ``json.dumps(default=custom_serializer)``.

    Handles bytes, datetimes/dates, sets, file-like objects, and callables;
    any other object falls back to its repr().
    """
    if isinstance(obj, (bytes, bytearray)):
        return obj.decode()
    if isinstance(obj, (datetime, date)):
        return obj.strftime("%Y-%m-%d %H:%M:%S")
    if isinstance(obj, set):
        return list(obj)
    if hasattr(obj, "read") and hasattr(obj, "name"):
        # Duck-typed file object: render a short placeholder, not contents.
        return f"<FileObject name={getattr(obj, 'name', 'unknown')}>"
    if callable(obj):
        return obj.__name__
    # BUG FIX: the original guarded this with hasattr(obj, "__class__") and
    # hasattr(obj, "__repr__"), both of which are always true in Python, so
    # the trailing TypeError was unreachable. Fall back to repr() explicitly.
    return repr(obj)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_type_name(typ):
    """Render a (possibly generic) type annotation as a readable string.

    Handles plain classes, ``Union`` (rendered with " | "), ``Type[...]``,
    ``list``/``tuple`` and ``dict`` generics; any other parameterized type
    falls back to ``str(origin)``.
    """
    origin = get_origin(typ)
    if origin is None:
        # Plain class or special form with no generic origin.
        return getattr(typ, "__name__", str(typ))

    # BUG FIX: fetch args once, up front. Previously `args` was assigned
    # only inside the Union branch, so the `origin is type` branch raised
    # UnboundLocalError for annotations like Type[int].
    args = get_args(typ)

    if origin is Union:
        return " | ".join(get_type_name(arg) for arg in args)

    if origin is type:
        return f"Type[{get_type_name(args[0])}]" if args else "Type[Any]"

    if origin in (list, tuple):
        return f"{origin.__name__}[{', '.join(get_type_name(arg) for arg in args)}]"

    if origin is dict:
        key_type, value_type = get_args(typ)
        return f"dict[{get_type_name(key_type)}, {get_type_name(value_type)}]"

    return str(origin)
|
|
|
|
|
def get_pydantic_field_types(model: Type[BaseModel]) -> Dict[str, Union[str, dict]]:
    """Map each field of a pydantic model to a readable type-name string.

    Fields whose annotation is itself a pydantic model are expanded
    recursively into nested dicts.
    """
    resolved: Dict[str, Union[str, dict]] = {}
    for name, info in model.model_fields.items():
        annotation = info.annotation
        # A "model_fields" attribute marks a nested pydantic model.
        if hasattr(annotation, "model_fields"):
            resolved[name] = get_pydantic_field_types(annotation)
        else:
            resolved[name] = get_type_name(annotation)
    return resolved
|
|
|
|
|
def get_pydantic_required_field_types(model: Type[BaseModel]) -> Dict[str, str]:
    """Map each *required* field of a pydantic model to its type-name string.

    Fields that are optional, or that carry a default value or a default
    factory, are skipped.
    """
    required: Dict[str, str] = {}
    for name, info in model.model_fields.items():
        if not info.is_required():
            continue
        has_default = info.default is not PydanticUndefined or info.default_factory is not None
        if has_default:
            continue
        required[name] = get_type_name(info.annotation)
    return required
|
|
|
|
|
def format_pydantic_field_types(field_types: Dict[str, str]) -> str:
    """Render a field-name → type-name mapping as a one-line JSON-like object.

    Note that only the keys are quoted; type names appear bare, e.g.
    ``{"age": int, "name": str}``.
    """
    parts = [f'"{name}": {type_name}' for name, type_name in field_types.items()]
    return "{" + ", ".join(parts) + "}"
|
|
|
|
|
def get_error_message(errors: List[Union[ValidationError, Exception]]) -> str:
    """Format a mixed list of pydantic ValidationErrors and plain exceptions
    into one human-readable report, grouped by kind.

    A single error (not wrapped in a list) is accepted as well.
    """
    if not isinstance(errors, list):
        errors = [errors]

    validation_errors = [e for e in errors if isinstance(e, ValidationError)]
    exceptions = [e for e in errors if not isinstance(e, ValidationError)]

    sections = []
    if validation_errors:
        header = f" >>>>>>>> {len(validation_errors)} Validation Errors: <<<<<<<<\n\n"
        sections.append(header + "\n\n".join(str(e) for e in validation_errors))
    if exceptions:
        header = f">>>>>>>> {len(exceptions)} Exception Errors: <<<<<<<<\n\n"
        sections.append(header + "\n\n".join(f"{type(e).__name__}: {e}" for e in exceptions))
    return "\n\n".join(sections)
|
|
|
|
|
def get_base_module_init_error_message(cls, data: Dict[str, Any], errors: List[Union[ValidationError, Exception]]) -> str:
    """Build a detailed message for a failed instantiation of ``cls``,
    showing the offending input data followed by the grouped errors."""
    if not isinstance(errors, list):
        errors = [errors]

    # Pretty-print the data; repr-style values are unquoted for readability.
    rendered = remove_repr_quotes(json.dumps(data, indent=4, default=custom_serializer))
    return (
        f"Can not instantiate {cls.__name__} from: "
        + rendered
        + "\n\n"
        + get_error_message(errors)
    )
|
|
|
|
|
|