File size: 4,754 Bytes
a03bf1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import json
import numpy as np
import pickle
import argparse  
from tqdm import tqdm

# Suffix that marks a file in the evals directory as an evaluation result.
# The same constant both selects the files and is stripped to derive the
# model name, so the two can no longer drift apart.
EVAL_FILE_SUFFIX = '_evals_2.jsonl'

# Keys read from each record's ``scores`` dict, in the column order used for
# every score vector and for the LaTeX table.
SCORE_KEYS = ('final_answer_match', 'recall', 'precision', 'step_f1', 'bertscore', 'rouge2')

# Record fields by which score vectors are grouped; each one yields a
# ``<field>_wise_results.pkl`` output file.
CATEGORY_FIELDS = ('branch', 'domain', 'area', 'level')

TABLE_HEADER = "Model & Final Answer & Recall & Precision & Step F1 & BERTScore & ROUGE-2 \\\\\n"


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``evals_dir`` and ``output_dir`` string attributes.
    """
    parser = argparse.ArgumentParser(
        description="Summarize model evaluation results from .jsonl files."
    )
    parser.add_argument(
        "evals_dir",
        type=str,
        help="The source directory containing the evaluation .jsonl files."
    )
    parser.add_argument(
        "output_dir",
        type=str,
        help="The destination directory to save the summary .pkl and .txt files."
    )
    return parser.parse_args(argv)


def model_name_from_file(model_file):
    """Return ``model_file`` with the evaluation-file suffix removed.

    A name that does not end with ``EVAL_FILE_SUFFIX`` is returned unchanged.
    """
    if model_file.endswith(EVAL_FILE_SUFFIX):
        return model_file[:-len(EVAL_FILE_SUFFIX)]
    return model_file


def collect_scores(lines):
    """Collect score vectors from an iterable of JSONL lines.

    Blank lines are ignored. A record is skipped when its ``scores`` entry is
    missing or empty, or when any value inside ``scores`` is ``None``.

    Args:
        lines: Iterable of strings, each holding one JSON object.

    Returns:
        A ``(all_metric_vals, grouped)`` tuple. ``all_metric_vals`` is a list
        of score vectors ordered as ``SCORE_KEYS``. ``grouped`` maps each name
        in ``CATEGORY_FIELDS`` to a dict from category value to the list of
        score vectors of the records in that category (``level`` values are
        lower-cased).

    Raises:
        KeyError: If a scored record lacks one of ``SCORE_KEYS`` or one of
            ``CATEGORY_FIELDS``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    all_metric_vals = []
    grouped = {field: {} for field in CATEGORY_FIELDS}
    for line in lines:
        # A trailing or stray empty line must not abort the whole run.
        if not line.strip():
            continue
        record = json.loads(line)

        scores = record.get('scores')
        if not scores:
            continue
        if any(value is None for value in scores.values()):
            continue

        score_list = [scores[key] for key in SCORE_KEYS]
        all_metric_vals.append(score_list)

        for field in CATEGORY_FIELDS:
            category = record[field]
            if field == 'level':
                category = category.lower()
            grouped[field].setdefault(category, []).append(score_list)
    return all_metric_vals, grouped


def mean_per_category(scores_by_category):
    """Return a dict mapping each category to its column-wise mean vector."""
    return {
        category: np.mean(rows, axis=0)
        for category, rows in scores_by_category.items()
    }


def format_table_row(model_name, all_metric_vals):
    """Format one LaTeX table row of ``mean_{std}`` cells for a model.

    Args:
        model_name: Label written in the first column.
        all_metric_vals: Non-empty list of score vectors.

    Returns:
        The row text, terminated by a LaTeX line break and a newline.
    """
    mean_vals = np.mean(all_metric_vals, axis=0)
    std_vals = np.std(all_metric_vals, axis=0)
    formatted_scores = " & ".join(
        f"${m:.3f}_{{{s:.3f}}}$" for m, s in zip(mean_vals, std_vals)
    )
    return f"{model_name} & {formatted_scores} \\\\\n"


def main(argv=None):
    """Summarize every evaluation file in a directory.

    Writes ``mean_sdv_results.txt`` (one LaTeX row per model with overall
    mean and standard deviation of each score) and one pickle per entry of
    ``CATEGORY_FIELDS`` holding ``{model_name: {category: mean_vector}}``.

    Args:
        argv: Argument list (``[evals_dir, output_dir]``); ``None`` reads the
            process command line.
    """
    args = parse_args(argv)
    evals_dir = args.evals_dir
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # results[field][model_name][category] -> mean score vector
    results = {field: {} for field in CATEGORY_FIELDS}

    # Sorted so the row order of the table does not depend on directory order.
    eval_files = sorted(
        f for f in os.listdir(evals_dir) if f.endswith(EVAL_FILE_SUFFIX)
    )

    table_path = os.path.join(output_dir, 'mean_sdv_results.txt')
    with open(table_path, 'w', encoding='utf-8') as table_file:
        header_written = False
        for model_file in eval_files:
            model_name = model_name_from_file(model_file)

            file_path = os.path.join(evals_dir, model_file)
            with open(file_path, 'r', encoding='utf-8') as f_pred:
                print(f"Processing {model_file}...")
                all_metric_vals, grouped = collect_scores(tqdm(f_pred))

            # Average only the model just read: averaging an already-averaged
            # 1-D vector along axis 0 would collapse it to a single scalar.
            for field in CATEGORY_FIELDS:
                results[field][model_name] = mean_per_category(grouped[field])

            if not all_metric_vals:
                # The mean of zero rows is a scalar NaN and cannot be laid
                # out as table cells, so no row is written for this model.
                print(f"No valid scored records in {model_file}; skipping table row.")
                continue

            if not header_written:
                table_file.write(TABLE_HEADER)
                header_written = True
            table_file.write(format_table_row(model_name, all_metric_vals))

    print("\nSaving aggregated results to pickle files...")
    for field in CATEGORY_FIELDS:
        pickle_path = os.path.join(output_dir, f'{field}_wise_results.pkl')
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(results[field], pickle_file)

    print("Analysis complete.")


if __name__ == "__main__":
    main()