import json
import re

from datasets import load_dataset

import verifiers as vf


def load_environment(
    num_train_examples: int = 7000,
    num_eval_examples: int = 1000,
    **kwargs
) -> vf.SingleTurnEnv:
    """
    Environment for verifying complex JSON output from models.

    The task requires models to:
    1. Parse multi-question prompts
    2. Generate valid JSON responses
    3. Match the expected structure with correct keys and values

    Rewards (no penalties, only positive rewards):
    - Formatting (valid JSON dict): 0.33 if pass, 0 if fail
    - All keys match: 0.33 if pass, 0 if fail
    - Answer values match: 0.33 if pass, 0 if fail
    Total max reward: 0.99 (~1.0)
    """
    dataset = load_dataset("Delta-Vector/Tauri-Complex-JSON-Formatting", split="train")

    # Reshape each row into the {question, info} schema the environment expects.
    def format_example(example):
        return {
            "question": example["prompt"],
            "info": {"verification_info": example["verification_info"]},
        }

    dataset = dataset.map(format_example, remove_columns=dataset.column_names)
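
    # NOTE: the split below assumes the dataset contains at least
    # num_train_examples + num_eval_examples rows; `select` raises otherwise.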
    train_dataset = dataset.select(range(num_train_examples))
    eval_dataset = dataset.select(
        range(num_train_examples, num_train_examples + num_eval_examples)
    )

    def extract_json_from_completion(completion):
        """Extract JSON from a completion, handling fenced code blocks."""
        if not completion:
            return ""

        # Chat-style completions arrive as a list of message dicts; take the
        # final message's content. Anything else is coerced to a string.
        if isinstance(completion, list) and len(completion) > 0:
            content = completion[-1].get("content", "")
        else:
            content = str(completion)

        # Prefer the last fenced ``` / ```json block, if any.
        code_block_pattern = r"```(?:json)?\s*\n(.*?)\n```"
        matches = re.findall(code_block_pattern, content, re.DOTALL)
        if matches:
            return matches[-1].strip()

        return content.strip()

    parser = vf.Parser(extract_fn=extract_json_from_completion)
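
    # Illustrative example of the extraction above: a completion of
    #   [{"role": "assistant", "content": '```json\n{"a": 1}\n```'}]
    # yields the string '{"a": 1}'.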

    def format_reward(completion, **kwargs) -> float:
        """
        Reward for valid JSON formatting.
        Returns 0.33 for a valid JSON dict, 0.0 otherwise.
        """
        try:
            response = (parser.parse_answer(completion) or "").strip()
            if not response:
                return 0.0

            parsed = json.loads(response)

            # Valid JSON that is not an object (e.g. a bare list or string)
            # does not count.
            if not isinstance(parsed, dict):
                return 0.0

            return 0.33
        except (json.JSONDecodeError, ValueError, TypeError):
            return 0.0
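
    # Example: '{"q1": "yes"}' parses to a dict -> 0.33, while '["yes"]'
    # parses but is a list -> 0.0.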

    def keys_match_reward(completion, info, **kwargs) -> float:
        """
        Reward for matching keys in the JSON structure.
        Returns 0.33 if all (nested) keys match, 0.0 otherwise.
        """
        try:
            response = (parser.parse_answer(completion) or "").strip()
            parsed_response = json.loads(response)

            verification_info = json.loads(info["verification_info"])
            ground_truth = verification_info["ground_truth"]

            if not isinstance(parsed_response, dict):
                return 0.0

            # Collect the full set of dotted key paths from a nested dict.
            def get_all_keys(d, prefix=""):
                keys = set()
                if isinstance(d, dict):
                    for k, v in d.items():
                        full_key = f"{prefix}.{k}" if prefix else k
                        keys.add(full_key)
                        keys.update(get_all_keys(v, full_key))
                return keys

            expected_keys = get_all_keys(ground_truth)
            actual_keys = get_all_keys(parsed_response)

            return 0.33 if expected_keys == actual_keys else 0.0
        except (json.JSONDecodeError, ValueError, AttributeError, TypeError, KeyError):
            return 0.0
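
    # Example: a ground truth of {"name": "x", "meta": {"id": 1}} yields the
    # key paths {"name", "meta", "meta.id"}, which the response must match
    # exactly.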

    def values_match_reward(completion, info, **kwargs) -> float:
        """
        Reward for matching values in the JSON structure.
        Returns 0.33 if the response deep-equals the ground truth, 0.0 otherwise.
        """
        try:
            response = (parser.parse_answer(completion) or "").strip()
            parsed_response = json.loads(response)

            verification_info = json.loads(info["verification_info"])
            ground_truth = verification_info["ground_truth"]

            # Strict structural equality: types, dict key sets, list lengths,
            # and list order must all match.
            def deep_compare(a, b):
                if type(a) is not type(b):
                    return False
                if isinstance(a, dict):
                    if set(a.keys()) != set(b.keys()):
                        return False
                    return all(deep_compare(a[k], b[k]) for k in a)
                elif isinstance(a, list):
                    if len(a) != len(b):
                        return False
                    return all(deep_compare(x, y) for x, y in zip(a, b))
                else:
                    return a == b

            return 0.33 if deep_compare(parsed_response, ground_truth) else 0.0
        except (json.JSONDecodeError, ValueError, AttributeError, TypeError, KeyError):
            return 0.0
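
    # Note: the comparison is intentionally strict; 1 vs 1.0 or "1" vs 1 fail
    # the type check, and lists must match element-for-element, in order.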

    rubric = vf.Rubric(
        parser=parser,
        funcs=[
            format_reward,
            keys_match_reward,
            values_match_reward,
        ],
        weights=[1.0, 1.0, 1.0],
    )

    vf_env = vf.SingleTurnEnv(
        dataset=train_dataset,
        eval_dataset=eval_dataset,
        parser=parser,
        rubric=rubric,
    )

    return vf_env
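

# Minimal smoke test (illustrative; assumes network access to the Hugging Face
# dataset and an installed `verifiers` package):
if __name__ == "__main__":
    env = load_environment(num_train_examples=32, num_eval_examples=8)
    print(env)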