Spaces:

Usmansafder
/

Engchain

Sleeping

Engchain / evaluation / engineering_parser.py

usmansafdarktk

Initial commit for Hugging Face Space

a03bf1f about 2 months ago

4.53 kB

	import re
	from typing import List, Tuple, Optional


	def convert_to_float_eng(text: str) -> Optional[float]:
	    """
	    Convert a string holding a number (and optional units) to a float.

	    The first number found in the string is used. Integers, decimals and
	    scientific notation (e.g. '1.23e-4', '-0.98E+2') are recognised; any
	    surrounding text such as units ("7.65 L") is ignored.

	    Args:
	        text: The input string to parse.

	    Returns:
	        The extracted number as a float, or None if ``text`` is not a
	        string or contains no number.
	    """
	    if not isinstance(text, str):
	        return None

	    # Commas are removed so "1,234.5" reads as 1234.5. The Unicode minus sign
	    # (U+2212) is mapped to ASCII '-' so a negative value keeps its sign.
	    cleaned = text.replace(',', '').replace('\u2212', '-')

	    # Optional sign, digits with an optional decimal point, optional exponent.
	    match = re.search(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?', cleaned)
	    if match is None:
	        return None

	    try:
	        return float(match.group(0))
	    except ValueError:
	        # Defensive only: the pattern should always yield a valid literal.
	        return None


	def extract_final_answer_eng(text: str) -> Optional[float]:
	    """
	    Extract the most likely final numerical answer from a text block.

	    Candidates are tried in priority order:
	      1. the first number following an "Answer:" tag (case-insensitive,
	         optionally wrapped in markdown bold, e.g. "**Answer:**");
	      2. the last number that directly follows an equals sign;
	      3. the last standalone number anywhere in the text.

	    Args:
	        text: The text to search.

	    Returns:
	        The chosen number as a float, or None if ``text`` is not a string
	        or no number is found.
	    """
	    if not isinstance(text, str):
	        return None

	    # Map the Unicode minus sign to ASCII '-' so negative values keep their
	    # sign, then collapse all whitespace so the patterns see one flat line.
	    flat = re.sub(r'\s+', ' ', text.replace('\u2212', '-')).strip()

	    # Optional sign, digits with optional decimal point, optional exponent.
	    number = r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?'
	    # A number that does not start inside a word or another number, so the
	    # "0" in "F_A0" is skipped. A lookbehind is used instead of \b because
	    # \b cannot match in front of a sign and would drop the minus of "-5".
	    standalone = r'(?<![\w.])' + number

	    # The captures below are already bare numeric literals, so float() is
	    # sufficient and no second parsing pass is needed.

	    # Priority 1: the first number after the "Answer:" tag.
	    tagged = re.search(r'Answer\**\s*:\**.*?(' + standalone + r')', flat, re.IGNORECASE)
	    if tagged:
	        return float(tagged.group(1))

	    # Priority 2: the last number that directly follows an equals sign.
	    after_equals = re.findall(r'=\s*(' + number + r')', flat)
	    if after_equals:
	        return float(after_equals[-1])

	    # Priority 3: as a last resort, the last standalone number in the text.
	    loose = re.findall(standalone, flat)
	    if loose:
	        return float(loose[-1])

	    return None


	def extract_steps(full_text: str) -> Tuple[List[str], List[Optional[float]], Optional[float]]:
	    """
	    Parse a full solution text into its constituent parts.

	    A step begins on a line that starts with "Step N" or "N." (optionally
	    preceded by markdown bold "**" or heading "#" markers). Text before the
	    first step marker is not included in any step. If no marker is found,
	    the whole text is treated as a single step.

	    Args:
	        full_text: The complete solution text.

	    Returns:
	        A tuple ``(step_texts, step_answers, overall_final_answer)``:
	        the step texts with their "Step N:" / "N." prefixes removed, the
	        number extracted from each step (None where nothing was found), and
	        the number extracted from the whole text, falling back to the last
	        step that produced one.
	    """
	    # "Step 3", "step3", or a list marker such as "3.". The (?!\d) guard keeps
	    # a line that starts with a decimal like "7.65" from looking like "7.".
	    label = r'(?:Step\s*\d+|\d+\.(?!\d))'

	    # A step starts at the beginning of the text or of a line, after optional
	    # indentation and optional markdown decoration.
	    step_start_re = re.compile(r'(?:^|\n)\s*(?:\*\*|#+)?\s*' + label, re.IGNORECASE)

	    # Prefix to strip from each step so the step number itself is never
	    # mistaken for the step's answer. The first branch handles a fully bold
	    # label ("**Step 1:**", "**Step 1**:") and the second handles plain or
	    # heading labels ("Step 1:", "## Step 1", "1.").
	    prefix_re = re.compile(
	        r'^(?:'
	        r'\*\*\s*' + label + r'\s*:?\s*\*\*\s*:?'
	        r'|'
	        r'(?:\*\*|#+)?\s*' + label + r'\s*:?'
	        r')\s*',
	        re.IGNORECASE,
	    )

	    step_starts = [match.start() for match in step_start_re.finditer(full_text)]

	    if step_starts:
	        # Each step runs up to the start of the next one (or the end of text).
	        step_ends = step_starts[1:] + [len(full_text)]
	        step_texts = [
	            full_text[begin:end].strip()
	            for begin, end in zip(step_starts, step_ends)
	        ]
	    else:
	        step_texts = [full_text.strip()]

	    cleaned_step_texts = [
	        prefix_re.sub('', step_text, count=1).strip()
	        for step_text in step_texts
	    ]

	    step_answers = [extract_final_answer_eng(step_text) for step_text in cleaned_step_texts]

	    overall_final_answer = extract_final_answer_eng(full_text)
	    if overall_final_answer is None:
	        # Fall back to the most recent step that yielded a number.
	        overall_final_answer = next(
	            (answer for answer in reversed(step_answers) if answer is not None),
	            None,
	        )

	    return cleaned_step_texts, step_answers, overall_final_answer


	if __name__ == '__main__':
	    # Demo: run the parser on a sample solution and print what it extracts.

	    # --- Test Case 1: Simple CSTR Problem (from before) ---
	    sample_solution_cstr = """
	Step 1: State the CSTR design equation.
	V = (F_A0 - F_A) / (-r_A)

	Step 2: Substitute the given values into the equation.
	V = (1.26 mol/s - 0.99 mol/s) / (0.0353 mol/(L·s))
	V = 0.27 / 0.0353 = 7.6487 L

	Step 3: Calculate the final volume.
	V = 7.65 L

	Answer: The required reactor volume is 7.65 liters.
	"""

	    print("--- Test Case 1: Simple CSTR Problem ---")
	    steps, step_answers, final_answer = extract_steps(sample_solution_cstr)
	    print(f"\nOverall Final Answer Extracted: {final_answer}\n")

	    # Show each step next to the number extracted from it, numbered from 1.
	    for i, (text, answer) in enumerate(zip(steps, step_answers), start=1):
	        print(f"--- Step {i} ---")
	        print(f"Text: {text}")
	        print(f"Answer found in step: {answer}")
	        print("-" * 15)

	    print("\n" + "=" * 50 + "\n")  # Separator for clarity