| | |
| |
|
| | from datetime import datetime |
| | from dataclasses import dataclass |
| |
|
| | from datasets import load_dataset |
| |
|
| |
|
@dataclass
class CodeExecutionProblem:
    """One code-execution benchmark sample plus result-serialization helpers.

    The loader in this file builds instances with ``CodeExecutionProblem(**row)``,
    so the field names below must match the keys of a dataset row exactly.
    """

    # Identification / provenance of the sample.
    question_id: str
    contest_id: str
    contest_date: datetime
    difficulty: str
    function_name: str
    # Payload returned by get_evaluation_sample().
    # presumably: program text, the input fed to it, and the expected
    # output -- confirm against the dataset card.
    code: str
    input: str
    output: str
    # Additional identifiers and metadata carried through to the result dict.
    id: str
    problem_id: str
    numsteps: int

    def __post_init__(self):
        """Hook run by the generated ``__init__``; intentionally a no-op."""
        pass

    def insert_output(self, output_list: list[str], pred_list: list[str]) -> dict:
        """Return every field of the sample plus the supplied model results.

        Args:
            output_list: Stored under the ``"output_list"`` key as-is.
            pred_list: Stored under the ``"pred_list"`` key as-is.

        Returns:
            A new dict holding all dataclass fields, with ``contest_date``
            rendered as a string, followed by the two lists (not copied).
        """
        contest_date = self.contest_date
        # Dataclasses do not enforce annotations, so the date may arrive as
        # something other than a datetime (e.g. already an ISO string).
        # Serialize via isoformat() when available, otherwise fall back to
        # str() instead of raising AttributeError.
        if hasattr(contest_date, "isoformat"):
            contest_date_text = contest_date.isoformat()
        else:
            contest_date_text = str(contest_date)

        return {
            "question_id": self.question_id,
            "contest_id": self.contest_id,
            "contest_date": contest_date_text,
            "difficulty": self.difficulty,
            "function_name": self.function_name,
            "code": self.code,
            "input": self.input,
            "output": self.output,
            "id": self.id,
            "problem_id": self.problem_id,
            "numsteps": self.numsteps,
            "output_list": output_list,
            "pred_list": pred_list,
        }

    def insert_output_evaluation(
        self, output_list: list[str], code_list: list[str], graded_list: list[bool]
    ) -> dict:
        """Return ``insert_output`` data extended with grading results.

        Args:
            output_list: Forwarded to ``insert_output`` as ``output_list``.
            code_list: Forwarded to ``insert_output`` as ``pred_list``.
            graded_list: Per-attempt verdicts; stored under ``"graded_list"``.

        Returns:
            The ``insert_output`` dict with two extra keys: ``"graded_list"``
            and ``"pass@1"``, the fraction of entries equal to ``True``.
            ``"pass@1"`` is ``0.0`` when ``graded_list`` is empty.
        """
        result = self.insert_output(output_list, code_list)
        result["graded_list"] = graded_list
        # Guard the division: with no graded attempts nothing passed, so
        # report 0.0 rather than raising ZeroDivisionError.
        if graded_list:
            result["pass@1"] = graded_list.count(True) / len(graded_list)
        else:
            result["pass@1"] = 0.0
        return result

    def get_evaluation_sample(self) -> dict:
        """Return only the ``code``, ``input`` and ``output`` fields as a dict."""
        return {
            "code": self.code,
            "input": self.input,
            "output": self.output,
        }
| |
|
| |
|
def load_code_execution_dataset(release_version="release_v1", cache_dir: str = None) -> list[CodeExecutionProblem]:
    """Load the ``test`` split of ``livecodebench/execution-v2`` as problem objects.

    Args:
        release_version: Accepted for interface compatibility; this function
            does not read it.
        cache_dir: Forwarded unchanged to ``load_dataset`` as ``cache_dir``.

    Returns:
        A list with one ``CodeExecutionProblem`` per dataset row, in the
        order the rows are iterated. Each row is expanded as keyword
        arguments, so its keys must match the dataclass fields.
    """
    # NOTE(review): trust_remote_code=True is passed through to the datasets
    # library -- confirm this is still required for this dataset.
    rows = load_dataset(
        "livecodebench/execution-v2",
        split="test",
        trust_remote_code=True,
        cache_dir=cache_dir,
    )

    problems = []
    for row in rows:
        problems.append(CodeExecutionProblem(**row))
    return problems
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: load the dataset once with default arguments.
    # The result is only bound to a name; nothing is printed or saved.
    dataset = load_code_execution_dataset()
| |
|