Evaluate
ReasonScape Evaluator (evaluate.py)
evaluate.py groups results/ data into point and aggregation buckets, performs statistical processing on them, and writes the output to a bucket.json file.
usage: evaluate.py [-h] --interview INTERVIEW [--output OUTPUT] [--offline] [--histogram SIZE COUNT] [--tokenizer TOKENIZER] [--fftsamples FFTSAMPLES]
Evaluate LLM interview results
options:
-h, --help show this help message and exit
--interview INTERVIEW
Path, glob pattern, or comma-separated list of NDJSON files
--output OUTPUT Write evaluation results to JSON file
--offline Enable answer re-processing
--histogram SIZE COUNT
Create token histograms with bucket SIZE and COUNT
--tokenizer TOKENIZER
Tokenizer name for FFT analysis
--fftsamples FFTSAMPLES
Number of samples to use for FFT calculation (default: 128)
Use --output to specify the bucket.json filename.
Evaluate Output Schema
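The schema of these files is rich. The example below shows a single bucket entry; long histograms and arrays are abbreviated with ellipses, so the snippet is illustrative rather than directly parseable JSON: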
{
"buckets": {
"Phi-4-mini-instruct-fp16+zerocot-nosys+greedy-4k+movies+003_movies_choice_count-12_reference_count-3": {
"model": "Phi-4-mini-instruct-fp16",
"template": "zerocot-nosys",
"param_name": "greedy-4k",
"scenario": "Phi-4-mini-instruct-fp16+zerocot-nosys+greedy-4k",
"base_task": "movies",
"task": "003_movies_choice_count-12_reference_count-3",
"btype": "point",
"correct": 337,
"invalid": 6,
"invalid_ratio": 0.006756756756756757,
"total": 888,
"truncated": 8,
"truncated_ratio": 0.008928571428571428,
"hard_terminated": 0,
"params": {
"reference_count": 3,
"choice_count": 12,
"count": 128
},
"adjusted_accuracy": 0.3230958230958231,
"adjusted_successes": 263.0,
"adjusted_trials": 814.0,
"adjusted_center": 0.3239267848444002,
"adjusted_margin": 0.03206244563179326,
"completion_tokens_mean": 388.35247747747746,
"completion_tokens_correct_mean": 377.8991097922849,
"completion_tokens_incorrect_mean": 394.7459165154265,
"prompt_tokens_mean": 132.36936936936937,
"total_tokens": 376857,
"histogram": {
"correct": {
"0": 0.0,
"50": 0.0,
"100": 0.0,
"150": 1.1869436201780414,
"200": 2.373887240356083,
"250": 10.385756676557863,
...
"1250": 0.0,
"1300": 0.0,
"1350": 0.0,
"1400": 0.0,
"1450": 0.0
},
"incorrect": {
"0": 0.0,
"50": 0.17889087656529518,
"100": 1.4311270125223614,
"150": 1.9677996422182469,
"200": 1.9677996422182469,
"250": 6.797853309481217,
...
"1250": 0.0,
"1300": 0.0,
"1350": 0.0,
"1400": 0.0,
"1450": 1.6100178890876566
}
},
"fft": {
"avg_spectrum": [
29.47362002266355,
27.00455019270718,
...
25.047122306471827,
25.047146221892223
],
"std_spectrum": [
0.2159618862893598,
0.6655491629616215,
...
1.1467863230041124,
1.1467736389641212
],
"frequencies": [
0.0,
0.00641025641025641,
...
0.49358974358974356,
0.5
]
}
},