import os
import json
import argparse
from tqdm import tqdm
from dotenv import load_dotenv
# Import both evaluation functions but give them unique names (aliases)
from evaluation.bi_encoders_evaluation import evaluate_trace_eng as evaluate_bi_encoder
from evaluation.cross_encoders_llm_judge_evaluation import evaluate_trace_eng as evaluate_cross_llm
# 1. Argument Parsing
# Set up a parser to accept command-line arguments
parser = argparse.ArgumentParser(description="Run evaluation on model inference results.")
parser.add_argument(
"evaluator_type",
type=str,
choices=['bi', 'cross-llm'],
help="The type of evaluator to use: 'bi' for Bi-Encoders or 'cross-llm' for Cross-Encoders + LLM Judge."
)
args = parser.parse_args()
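# Example invocation (the script filename below is illustrative, not taken from the repo):
#   python evaluate.py bi
#   python evaluate.py cross-llm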
# 2. Configuration
load_dotenv()
# Get the model name from the .env file
MODEL_NAME = os.getenv("MODEL_NAME")
# Define the input file (this is the same for both)
INPUT_FILE = f"inference_results/{MODEL_NAME}_inference_results.jsonl"
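# Each line of the input JSONL is expected to be a JSON object containing the keys
# read further below: 'solution', 'generation', 'seed', 'id', 'level', 'branch',
# 'domain', and 'area'. Illustrative line (all values are placeholders):
#   {"id": "...", "seed": "...", "level": "...", "branch": "...", "domain": "...",
#    "area": "...", "solution": "...", "generation": "..."}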
# Dynamically set the output file and evaluation function based on the argument
if args.evaluator_type == 'bi':
    print("Using Bi-Encoder evaluator.")
    OUTPUT_FILE = f"evaluation_results/{MODEL_NAME}_evals.jsonl"
    evaluation_function = evaluate_bi_encoder
elif args.evaluator_type == 'cross-llm':
    print("Using Cross-Encoder + LLM Judge evaluator.")
    OUTPUT_FILE = f"evaluation_results/{MODEL_NAME}_evals_2.jsonl"
    evaluation_function = evaluate_cross_llm
# 3. Main Evaluation Logic
if __name__ == "__main__":
    # Check if the input file from the inference step exists
    if not os.path.exists(INPUT_FILE):
        print(f"Error: Input file not found at '{INPUT_FILE}'")
        print("Please make sure you have run the inference step first.")
    else:
        print(f"Reading inference results from '{INPUT_FILE}'...")

        # Open the input and output files
        with open(INPUT_FILE, 'r', encoding='utf-8') as f_in, \
             open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:

            problems_with_generations = [json.loads(line) for line in f_in]
            print(f"Starting evaluation for {len(problems_with_generations)} problems...")

            # Loop through each problem using tqdm for a progress bar
            for problem_data in tqdm(problems_with_generations, desc=f"Evaluating with {args.evaluator_type}"):
                solution = problem_data.get('solution')
                generation = problem_data.get('generation')

                if args.evaluator_type == 'bi':
                    # Call the bi-encoder function with two arguments
                    scores = evaluation_function(solution, generation)
                elif args.evaluator_type == 'cross-llm':
                    # For the cross-encoder, first build the context string
                    problem_context = (
                        f"This is a {problem_data.get('level')} level problem in "
                        f"{problem_data.get('branch')} engineering, specifically in the "
                        f"domain of {problem_data.get('domain')} and the area of {problem_data.get('area')}."
                    )
                    # Then call its function with three arguments
                    scores = evaluation_function(solution, generation, problem_context)

                # Prepare the final JSON object for output
                output_entry = {
                    'seed': problem_data.get('seed'),
                    'id': problem_data.get('id'),
                    'level': problem_data.get('level'),
                    'branch': problem_data.get('branch'),
                    'domain': problem_data.get('domain'),
                    'area': problem_data.get('area'),
                    'model': MODEL_NAME,
                    'scores': scores  # Nest the scores dictionary
                }

                # Write the scored result to the new file
                f_out.write(json.dumps(output_entry) + '\n')

        print(f"\nEvaluation complete. Scored results saved to '{OUTPUT_FILE}'.")