LICENSE
README.md
pyproject.toml
setup.py
Evalution.egg-info/PKG-INFO
Evalution.egg-info/SOURCES.txt
Evalution.egg-info/dependency_links.txt
Evalution.egg-info/entry_points.txt
Evalution.egg-info/requires.txt
Evalution.egg-info/top_level.txt
evalution/__init__.py
evalution/_banner.py
evalution/cli.py
evalution/compare.py
evalution/config.py
evalution/logbar.py
evalution/results.py
evalution/runtime.py
evalution/version.py
evalution/yaml.py
evalution/benchmarks/__init__.py
evalution/benchmarks/aexams.py
evalution/benchmarks/afrimgsm.py
evalution/benchmarks/afrimmlu.py
evalution/benchmarks/afrixnli.py
evalution/benchmarks/agieval.py
evalution/benchmarks/aime.py
evalution/benchmarks/alghafa.py
evalution/benchmarks/anli.py
evalution/benchmarks/arabic_subject_mmlu.py
evalution/benchmarks/arabicmmlu.py
evalution/benchmarks/arc_challenge.py
evalution/benchmarks/arc_easy.py
evalution/benchmarks/arc_exam.py
evalution/benchmarks/arc_mt.py
evalution/benchmarks/arithmetic.py
evalution/benchmarks/asdiv.py
evalution/benchmarks/assin.py
evalution/benchmarks/babi.py
evalution/benchmarks/babilong.py
evalution/benchmarks/bangla.py
evalution/benchmarks/base.py
evalution/benchmarks/bbh.py
evalution/benchmarks/bbq.py
evalution/benchmarks/bear.py
evalution/benchmarks/belebele.py
evalution/benchmarks/blimp.py
evalution/benchmarks/boolq.py
evalution/benchmarks/c4.py
evalution/benchmarks/cabbq.py
evalution/benchmarks/careqa.py
evalution/benchmarks/cb.py
evalution/benchmarks/ceval.py
evalution/benchmarks/click.py
evalution/benchmarks/cmmlu.py
evalution/benchmarks/cnn_dailymail.py
evalution/benchmarks/cocoteros_es.py
evalution/benchmarks/code_x_glue.py
evalution/benchmarks/cola.py
evalution/benchmarks/commonsense_qa.py
evalution/benchmarks/copa.py
evalution/benchmarks/copal_id.py
evalution/benchmarks/coqa.py
evalution/benchmarks/crows_pairs.py
evalution/benchmarks/darijahellaswag.py
evalution/benchmarks/darijammlu.py
evalution/benchmarks/data.py
evalution/benchmarks/drop.py
evalution/benchmarks/egyhellaswag.py
evalution/benchmarks/egymmlu.py
evalution/benchmarks/esbbq.py
evalution/benchmarks/eus_exams.py
evalution/benchmarks/eus_proficiency.py
evalution/benchmarks/eus_reading.py
evalution/benchmarks/eus_trivia.py
evalution/benchmarks/execution.py
evalution/benchmarks/fda.py
evalution/benchmarks/fewshot_multiple_choice.py
evalution/benchmarks/fld.py
evalution/benchmarks/flores_es.py
evalution/benchmarks/flores_pt.py
evalution/benchmarks/french_bench_arc_challenge.py
evalution/benchmarks/gpqa.py
evalution/benchmarks/graphwalks.py
evalution/benchmarks/groundcocoa.py
evalution/benchmarks/gsm8k.py
evalution/benchmarks/gsm8k_common.py
evalution/benchmarks/gsm8k_fr.py
evalution/benchmarks/gsm8k_ko.py
evalution/benchmarks/gsm8k_platinum.py
evalution/benchmarks/gsm_plus.py
evalution/benchmarks/haerae.py
evalution/benchmarks/headqa.py
evalution/benchmarks/hellaswag.py
evalution/benchmarks/hendrycks_ethics.py
evalution/benchmarks/hendrycks_math.py
evalution/benchmarks/histoires_morales.py
evalution/benchmarks/humaneval.py
evalution/benchmarks/icelandic_winogrande.py
evalution/benchmarks/ifeval.py
evalution/benchmarks/ifeval_pt.py
evalution/benchmarks/inverse_scaling.py
evalution/benchmarks/kmmlu.py
evalution/benchmarks/kobest.py
evalution/benchmarks/kormedmcqa.py
evalution/benchmarks/lambada.py
evalution/benchmarks/lambada_cloze.py
evalution/benchmarks/lambada_multilingual.py
evalution/benchmarks/lambada_multilingual_stablelm.py
evalution/benchmarks/localized_bbq.py
evalution/benchmarks/logiqa.py
evalution/benchmarks/logiqa2.py
evalution/benchmarks/longbench.py
evalution/benchmarks/longbench2.py
evalution/benchmarks/mastermind.py
evalution/benchmarks/mathqa.py
evalution/benchmarks/mbpp.py
evalution/benchmarks/mc_taco.py
evalution/benchmarks/mediqa_qa2019.py
evalution/benchmarks/medmcqa.py
evalution/benchmarks/medqa.py
evalution/benchmarks/meqsum.py
evalution/benchmarks/mgsm.py
evalution/benchmarks/mgsm_direct_es_spanish_bench.py
evalution/benchmarks/mlqa.py
evalution/benchmarks/mmlu.py
evalution/benchmarks/mmlu_cf.py
evalution/benchmarks/mmlu_pro.py
evalution/benchmarks/mmlu_pro_plus.py
evalution/benchmarks/mmlu_redux.py
evalution/benchmarks/mnli.py
evalution/benchmarks/moral_stories.py
evalution/benchmarks/mrpc.py
evalution/benchmarks/multiple_choice.py
evalution/benchmarks/multiple_choice_utils.py
evalution/benchmarks/multirc.py
evalution/benchmarks/mutual.py
evalution/benchmarks/noticia.py
evalution/benchmarks/nq_open.py
evalution/benchmarks/openbookqa.py
evalution/benchmarks/paloma.py
evalution/benchmarks/paws_x.py
evalution/benchmarks/phrases_es.py
evalution/benchmarks/pile_10k.py
evalution/benchmarks/piqa.py
evalution/benchmarks/polemo2.py
evalution/benchmarks/prost.py
evalution/benchmarks/pubmedqa.py
evalution/benchmarks/qa4mre.py
evalution/benchmarks/qasper.py
evalution/benchmarks/qnli.py
evalution/benchmarks/qqp.py
evalution/benchmarks/race.py
evalution/benchmarks/record.py
evalution/benchmarks/rolling_perplexity.py
evalution/benchmarks/rte.py
evalution/benchmarks/ruler.py
evalution/benchmarks/sciq.py
evalution/benchmarks/scrolls.py
evalution/benchmarks/simple_cooccurrence_bias.py
evalution/benchmarks/single_continuation.py
evalution/benchmarks/siqa.py
evalution/benchmarks/spanish_bench.py
evalution/benchmarks/squad_completion.py
evalution/benchmarks/squadv2.py
evalution/benchmarks/sst2.py
evalution/benchmarks/storycloze.py
evalution/benchmarks/subsets.py
evalution/benchmarks/swag.py
evalution/benchmarks/swde.py
evalution/benchmarks/toxigen.py
evalution/benchmarks/translated_hellaswag.py
evalution/benchmarks/triviaqa.py
evalution/benchmarks/truthfulqa.py
evalution/benchmarks/webqs.py
evalution/benchmarks/wic.py
evalution/benchmarks/wikitext.py
evalution/benchmarks/winogender.py
evalution/benchmarks/winogrande.py
evalution/benchmarks/wmdp.py
evalution/benchmarks/wnli.py
evalution/benchmarks/wsc.py
evalution/benchmarks/wsc273.py
evalution/benchmarks/xcopa.py
evalution/benchmarks/xlsum_es.py
evalution/benchmarks/xnli.py
evalution/benchmarks/xnli_eu.py
evalution/benchmarks/xquad.py
evalution/benchmarks/xstorycloze.py
evalution/benchmarks/xwinograd.py
evalution/datasets/__init__.py
evalution/datasets/flores200.py
evalution/datasets/mediqa_qa.py
evalution/datasets/meqsum.py
evalution/datasets/wnli_es.py
evalution/datasets/xlsum.py
evalution/engines/__init__.py
evalution/engines/base.py
evalution/engines/continuous.py
evalution/engines/gptqmodel_engine.py
evalution/engines/memory.py
evalution/engines/openvino_engine.py
evalution/engines/sglang_engine.py
evalution/engines/tensorrt_llm_engine.py
evalution/engines/transformers.py
evalution/engines/transformers_common.py
evalution/engines/transformers_compat.py
evalution/engines/vllm_engine.py
evalution/scorers/__init__.py
evalution/scorers/bleu.py
evalution/scorers/choice_label.py
evalution/scorers/classification.py
evalution/scorers/gsm8k.py
evalution/scorers/longbench.py
evalution/scorers/math_exact_match.py
evalution/scorers/multiple_choice.py
evalution/scorers/qa_text.py
evalution/scorers/summary_rouge.py
evalution/scorers/translation.py
tests/test_aexams.py
tests/test_afrimgsm.py
tests/test_afrimmlu.py
tests/test_afrixnli.py
tests/test_agieval.py
tests/test_aime.py
tests/test_alghafa.py
tests/test_anli.py
tests/test_api.py
tests/test_arabicmmlu.py
tests/test_arc_challenge.py
tests/test_arc_easy.py
tests/test_arc_mt.py
tests/test_arithmetic.py
tests/test_asdiv.py
tests/test_assin.py
tests/test_babi.py
tests/test_babilong.py
tests/test_bangla.py
tests/test_bbh.py
tests/test_bbq.py
tests/test_bear.py
tests/test_belebele.py
tests/test_benchmark_order.py
tests/test_benchmark_stream_defaults.py
tests/test_blimp.py
tests/test_boolq.py
tests/test_c4.py
tests/test_cabbq.py
tests/test_careqa.py
tests/test_cb.py
tests/test_ceval.py
tests/test_cli.py
tests/test_click.py
tests/test_cmmlu.py
tests/test_cnn_dailymail.py
tests/test_cocoteros_es.py
tests/test_code_x_glue.py
tests/test_cola.py
tests/test_commonsense_qa.py
tests/test_compare.py
tests/test_continuous.py
tests/test_copa.py
tests/test_copal_id.py
tests/test_coqa.py
tests/test_crows_pairs.py
tests/test_darijahellaswag.py
tests/test_darijammlu.py
tests/test_data.py
tests/test_dataset_streaming_nogil.py
tests/test_drop.py
tests/test_egyhellaswag.py
tests/test_egymmlu.py
tests/test_environment.py
tests/test_esbbq.py
tests/test_eus_exams.py
tests/test_eus_proficiency.py
tests/test_eus_reading.py
tests/test_eus_trivia.py
tests/test_fda.py
tests/test_fld.py
tests/test_flores_es.py
tests/test_flores_pt.py
tests/test_french_bench_arc_challenge.py
tests/test_gpqa.py
tests/test_gptqmodel_engine.py
tests/test_graphwalks.py
tests/test_groundcocoa.py
tests/test_gsm8k.py
tests/test_gsm8k_fr.py
tests/test_gsm8k_ko.py
tests/test_gsm8k_platinum.py
tests/test_gsm_plus.py
tests/test_haerae.py
tests/test_headqa.py
tests/test_hellaswag.py
tests/test_hendrycks_ethics.py
tests/test_hendrycks_math.py
tests/test_histoires_morales.py
tests/test_humaneval.py
tests/test_icelandic_winogrande.py
tests/test_ifeval_pt.py
tests/test_import_safety.py
tests/test_inverse_scaling.py
tests/test_kmmlu.py
tests/test_kobest.py
tests/test_kormedmcqa.py
tests/test_lambada.py
tests/test_lambada_cloze.py
tests/test_lambada_multilingual.py
tests/test_lambada_multilingual_stablelm.py
tests/test_logbar_context.py
tests/test_logbar_tables.py
tests/test_logiqa.py
tests/test_logiqa2.py
tests/test_longbench.py
tests/test_longbench2.py
tests/test_mastermind.py
tests/test_mathqa.py
tests/test_mbpp.py
tests/test_mc_taco.py
tests/test_mediqa_qa2019.py
tests/test_medmcqa.py
tests/test_medqa.py
tests/test_meqsum.py
tests/test_mgsm.py
tests/test_mgsm_direct_es_spanish_bench.py
tests/test_mlqa.py
tests/test_mmlu.py
tests/test_mmlu_cf.py
tests/test_mmlu_pro.py
tests/test_mmlu_pro_plus.py
tests/test_mmlu_redux.py
tests/test_mnli.py
tests/test_moral_stories.py
tests/test_mrpc.py
tests/test_multirc.py
tests/test_mutual.py
tests/test_noticia.py
tests/test_nq_open.py
tests/test_openbookqa.py
tests/test_openvino_engine.py
tests/test_package.py
tests/test_paloma.py
tests/test_paws_x.py
tests/test_pcre_compat.py
tests/test_phrases_es.py
tests/test_pile_10k.py
tests/test_piqa.py
tests/test_polemo2.py
tests/test_prost.py
tests/test_pubmedqa.py
tests/test_qa4mre.py
tests/test_qasper.py
tests/test_qnli.py
tests/test_qqp.py
tests/test_race.py
tests/test_record.py
tests/test_regex_backend.py
tests/test_rte.py
tests/test_ruler.py
tests/test_runtime_tables.py
tests/test_sciq.py
tests/test_scorers.py
tests/test_scrolls.py
tests/test_sglang_engine.py
tests/test_simple_cooccurrence_bias.py
tests/test_siqa.py
tests/test_spanish_bench.py
tests/test_squad_completion.py
tests/test_squadv2.py
tests/test_sst2.py
tests/test_startup_banner.py
tests/test_storycloze.py
tests/test_swag.py
tests/test_swde.py
tests/test_tensorrt_llm_engine.py
tests/test_tokenicer_integration.py
tests/test_toxigen.py
tests/test_transformer.py
tests/test_transformer_compat.py
tests/test_triviaqa.py
tests/test_truthfulqa.py
tests/test_vllm_engine.py
tests/test_webqs.py
tests/test_wic.py
tests/test_wikitext.py
tests/test_winogender.py
tests/test_winogrande.py
tests/test_wmdp.py
tests/test_wnli.py
tests/test_wnli_es_dataset.py
tests/test_wsc.py
tests/test_wsc273.py
tests/test_xcopa.py
tests/test_xlsum_es.py
tests/test_xnli.py
tests/test_xnli_eu.py
tests/test_xquad.py
tests/test_xstorycloze.py
tests/test_xwinograd.py
tests/test_yaml.py