Spaces:

Usmansafder
/

Engchain

Sleeping

Engchain / evaluation /aggregate_results.py

usmansafdarktk

Initial commit for Hugging Face Space

a03bf1f about 2 months ago

4.75 kB

	import os
	import json
	import numpy as np
	import pickle
	import argparse
	from tqdm import tqdm

	# Argument Parser Setup
	parser = argparse.ArgumentParser(
	description="Summarize model evaluation results from .jsonl files."
	)
	# Add an argument for the input directory
	parser.add_argument(
	"evals_dir",
	type=str,
	help="The source directory containing the evaluation .jsonl files."
	)
	# Add an argument for the output directory
	parser.add_argument(
	"output_dir",
	type=str,
	help="The destination directory to save the summary .pkl and .txt files."
	)
	args = parser.parse_args()


	# Configuration
	EVALS_DIR = args.evals_dir
	OUTPUT_DIR = args.output_dir
	os.makedirs(OUTPUT_DIR, exist_ok=True)


	# Create a file to store the LaTeX formatted results
	mean_sdv_results_file = open(os.path.join(OUTPUT_DIR, 'mean_sdv_results.txt'), 'w')


	# Create dictionaries for your specific categories
	branch_wise_results = {}
	domain_wise_results = {}
	area_wise_results = {}
	level_wise_results = {}


	# 2. Data Collection
	# Find all evaluation files in the directory
	eval_files = [f for f in os.listdir(EVALS_DIR) if f.endswith('_evals_2.jsonl')]

	for model_file in eval_files:
	# Initialize dictionaries for the current model
	model_name = model_file.replace('_evals.jsonl', '')
	branch_wise_results[model_name] = {}
	domain_wise_results[model_name] = {}
	area_wise_results[model_name] = {}
	level_wise_results[model_name] = {}

	all_metric_vals = []

	file_path = os.path.join(EVALS_DIR, model_file)
	with open(file_path, 'r') as f_pred:
	print(f"Processing {model_file}...")
	for line in tqdm(f_pred):
	json_line = json.loads(line)

	# Skip if there's an error or missing data
	if not json_line.get('scores'):
	continue
	scores = json_line['scores']
	if any(s is None for s in scores.values()):
	continue

	# Extract scores
	recall, precision, step_f1, final_answer_match, bertscore, rouge2 = \
	scores['recall'], scores['precision'], scores['step_f1'], scores['final_answer_match'], scores['bertscore'], scores['rouge2']

	score_list = [final_answer_match, recall, precision, step_f1, bertscore, rouge2]
	all_metric_vals.append(score_list)

	# Collect results grouped by your categories
	branch = json_line['branch']
	domain = json_line['domain']
	area = json_line['area']
	level = json_line['level'].lower()

	# Group by branch
	branch_wise_results[model_name].setdefault(branch, []).append(score_list)
	# Group by domain
	domain_wise_results[model_name].setdefault(domain, []).append(score_list)
	# Group by area
	area_wise_results[model_name].setdefault(area, []).append(score_list)
	# Group by level
	level_wise_results[model_name].setdefault(level, []).append(score_list)

	# 3. Calculation and Output Generation

	# Calculate overall mean and standard deviation for the LaTeX output
	mean_vals = np.mean(all_metric_vals, axis=0)
	std_vals = np.std(all_metric_vals, axis=0)

	# Write the formatted string to the text file
	header = "Model & Final Answer & Recall & Precision & Step F1 & BERTScore & ROUGE-2 \\\\\n"
	if mean_sdv_results_file.tell() == 0: # Write header only once
	mean_sdv_results_file.write(header)

	formatted_scores = " & ".join([f"${m:.3f}_{{{s:.3f}}}$" for m, s in zip(mean_vals, std_vals)])
	mean_sdv_results_file.write(f"{model_name} & {formatted_scores} \\\\\n")

	# Calculate final mean scores for each category
	for category_results in [branch_wise_results, domain_wise_results, area_wise_results, level_wise_results]:
	for model in category_results:
	for category_key, scores_list in category_results[model].items():
	category_results[model][category_key] = np.mean(scores_list, axis=0)

	# 4. Save Dictionaries to Pickle Files
	print("\nSaving aggregated results to pickle files...")
	pickle.dump(branch_wise_results, open(os.path.join(OUTPUT_DIR, 'branch_wise_results.pkl'), 'wb'))
	pickle.dump(domain_wise_results, open(os.path.join(OUTPUT_DIR, 'domain_wise_results.pkl'), 'wb'))
	pickle.dump(area_wise_results, open(os.path.join(OUTPUT_DIR, 'area_wise_results.pkl'), 'wb'))
	pickle.dump(level_wise_results, open(os.path.join(OUTPUT_DIR, 'level_wise_results.pkl'), 'wb'))

	mean_sdv_results_file.close()
	print("Analysis complete.")