.gitattributes
.gitignore
.gitkeep
.markdownlint.yaml
.markdownlintignore
.pre-commit-config.yaml
AGENTS.md
ASSETS.yaml
AUTOMATED_CHECKS.md
BEST_PRACTICES.md
CHANGELOG.md
CLAUDE.md
CONTRIBUTING.md
EVALUATION_CHECKLIST.md
LICENSE
MAINTAINERS.md
MANIFEST.in
Makefile
NOTICE
PACKAGE_VERSIONING.md
README.md
TASK_VERSIONING.md
pyproject.toml
uv.lock
.claude/skills/build-repo-context/SKILL.md
.claude/skills/check-trajectories-workflow/SKILL.md
.claude/skills/ci-maintenance-workflow/SKILL.md
.claude/skills/code-quality-fix-all/SKILL.md
.claude/skills/code-quality-review-all/SKILL.md
.claude/skills/code-quality-review-all/assets/results-template.json
.claude/skills/create-eval/SKILL.md
.claude/skills/deep-dive-repo-context/SKILL.md
.claude/skills/ensure-test-coverage/SKILL.md
.claude/skills/ensure-test-coverage/references/test-patterns.md
.claude/skills/eval-quality-workflow/SKILL.md
.claude/skills/eval-report-workflow/SKILL.md
.claude/skills/eval-report-workflow/references/frontier-models.md
.claude/skills/eval-validity-review/SKILL.md
.claude/skills/generate-asset-actions/SKILL.md
.claude/skills/investigate-dataset/SKILL.md
.claude/skills/investigate-dataset/references/inspect-dataset-patterns.md
.claude/skills/prepare-release/SKILL.md
.claude/skills/prepare-submission-workflow/SKILL.md
.claude/skills/read-eval-logs/SKILL.md
.claude/skills/write-an-adr/SKILL.md
.claude/skills/write-an-adr/assets/adr-template.md
.github/PULL_REQUEST_TEMPLATE.md
.github/dependabot.yml
.github/ISSUE_TEMPLATE/bug_report.yaml
.github/ISSUE_TEMPLATE/new_benchmark.yaml
.github/PULL_REQUEST_TEMPLATE/bug_fix.md
.github/actions/claude-setup/action.yaml
.github/scripts/detect_bump_type.py
.github/scripts/run_targeted_tests.py
.github/workflows/build-repo-context.yaml
.github/workflows/build.yml
.github/workflows/checks.yml
.github/workflows/claude-fix-tests.yaml
.github/workflows/claude-issue-solver.yaml
.github/workflows/claude-review.yaml
.github/workflows/deep-dive-repo-context.yaml
.github/workflows/docker-image-rebuild.yml
.github/workflows/docs.yml
.github/workflows/lint-new-evals.yml
.github/workflows/markdown-lint.yml
.github/workflows/pr-template-check.yml
.github/workflows/publish.yaml
.github/workflows/release-on-merge.yml
.github/workflows/weekly-release.yml
.vscode/.gitignore
.vscode/extensions.json
.vscode/settings.json
.windsurf/workflows/commit-staged-to-new-branch.md
adr/0001-eval-metadata-location.md
adr/0002-versioning-metadata.md
adr/0003-use-huggingface-hub-for-asset-hosting.md
adr/0004-no-floating-refs-for-external-assets.md
adr/0005-asset-manifest-in-per-eval-config.md
adr/0006-manual-upload-workflow-before-automation.md
adr/0007-dataset-hosting-escalation-policy.md
adr/0008-task-configurability-uses-standard-inspect-layers.md
agent_artefacts/aime2026_1_A/review/SUMMARY.md
agent_artefacts/bfcl_4_B/review/SUMMARY.md
agent_artefacts/code_quality/README.md
agent_artefacts/code_quality/pytest_marks/README.md
agent_artefacts/code_quality/pytest_marks/SUMMARY.md
agent_artefacts/code_quality/pytest_marks/check_pytest_marks.py
agent_artefacts/code_quality/pytest_marks/results.json
agent_artefacts/cti_realm_1_A/review/SUMMARY.md
agent_artefacts/gdm_intercode_ctf_2_0_0/evalreport/NOTES.md
agent_artefacts/gdm_intercode_ctf_2_0_0/evalreport/UNCERTAINTIES.md
agent_artefacts/gdm_intercode_ctf_2_0_0/review/SUMMARY.md
agent_artefacts/gdm_intercode_ctf_2_0_0/review/UNCERTAINTIES.md
agent_artefacts/gdm_self_reasoning_3_A/validity/VALIDITY_REPORT.md
agent_artefacts/gdm_stealth_3_A/validity/VALIDITY_REPORT.md
agent_artefacts/healthbench_1_1_0/review/NOTES.md
agent_artefacts/healthbench_1_1_0/review/SUMMARY.md
agent_artefacts/healthbench_1_1_0/review/UNCERTAINTIES.md
agent_artefacts/repo_context/REPO_CONTEXT.md
agent_artefacts/scbench_1_A/review/SUMMARY.md
agent_artefacts/scripts/fetch_slow_tests.py
agent_artefacts/tac_1_A/review/SUMMARY.md
agent_artefacts/tac_1_A/validity/VALIDITY_REPORT.md
agent_artefacts/tac_1_B/review/SUMMARY.md
agent_artefacts/tac_1_B/validity/VALIDITY_REPORT.md
agent_artefacts/trajectory_analysis/aime2026/aime2026_gpt-5-nano_ANALYSIS.md
agent_artefacts/trajectory_analysis/aime2026/aime2026_gpt-5.4-nano_ANALYSIS.md
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/outcome_summaries.txt
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/results_summary.txt
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scanner_metadata.json
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/validity_analysis.txt
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/validity_summary.json
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/_errors.jsonl
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/_scan.json
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/_summary.json
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/broken_env.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/ethical_refusal.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/external_failure.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/filtered_refusal.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/formatting_failure.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/outcome_summary.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt54nano/scan_id=5py7BrWQvsfXTvYBhSsBoB/reward_hacking_success.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/outcome_summaries.txt
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/results_summary.txt
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scanner_metadata.json
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/validity_analysis.txt
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/validity_summary.json
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/_errors.jsonl
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/_scan.json
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/_summary.json
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/broken_env.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/ethical_refusal.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/external_failure.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/filtered_refusal.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/formatting_failure.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/outcome_summary.parquet
agent_artefacts/trajectory_analysis/aime2026/scout_results_gpt5nano/scan_id=2ovKTCfBi229TVqEHWyADk/reward_hacking_success.parquet
agent_artefacts/trajectory_analysis/bfcl_2/bfcl_gpt_5_nano_ANALYSIS.md
agent_artefacts/trajectory_analysis/bfcl_3_B/bfcl_3_B_claude_haiku_4_5_ANALYSIS.md
agent_artefacts/trajectory_analysis/bfcl_3_B/bfcl_3_B_gpt_4_1_mini_ANALYSIS.md
agent_artefacts/trajectory_analysis/bfcl_3_B/scout_results_gpt41mini/outcome_summaries.txt
agent_artefacts/trajectory_analysis/bfcl_3_B/scout_results_gpt41mini/results_summary.txt
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_claude_opus_4_5_ANALYSIS.md
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_claude_opus_4_6_ANALYSIS.md
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_claude_sonnet_4_5_ANALYSIS.md
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_gpt_4_1_ANALYSIS.md
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_gpt_5_1_ANALYSIS.md
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_gpt_5_2_ANALYSIS.md
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_gpt_5_ANALYSIS.md
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_gpt_5_mini_ANALYSIS.md
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_o3_ANALYSIS.md
agent_artefacts/trajectory_analysis/cti_realm_50/cti_realm_50_o4_mini_ANALYSIS.md
agent_artefacts/trajectory_analysis/example/GAIA_LEVEL3_ANALYSIS.md
agent_artefacts/trajectory_analysis/frontier_cs_1_A/frontier_cs_algorithmic_claude_sonnet_ANALYSIS.md
agent_artefacts/trajectory_analysis/frontier_cs_1_A/frontier_cs_algorithmic_gpt_5_1_ANALYSIS.md
agent_artefacts/trajectory_analysis/frontier_cs_1_A/frontier_cs_research_claude_sonnet_ANALYSIS.md
agent_artefacts/trajectory_analysis/frontier_cs_1_A/frontier_cs_research_gpt_5_1_ANALYSIS.md
agent_artefacts/trajectory_analysis/gaia_level2_1_0_0/eval_scanners.py
agent_artefacts/trajectory_analysis/gaia_level2_1_0_0/gaia_level2_1_0_0_ANALYSIS.md
agent_artefacts/trajectory_analysis/gdm_intercode_ctf_2_0_0/claude_opus_4_5_ANALYSIS.md
agent_artefacts/trajectory_analysis/gdm_intercode_ctf_2_1_0/gpt_5_1_ANALYSIS.md
agent_artefacts/trajectory_analysis/inspect_scout/analyze_validity.py
agent_artefacts/trajectory_analysis/inspect_scout/extract_results.py
agent_artefacts/trajectory_analysis/inspect_scout/run_all_scanners.py
agent_artefacts/trajectory_analysis/inspect_scout/scanners.py
agent_artefacts/trajectory_analysis/inspect_scout/test_filtered_refusal.py
agent_artefacts/trajectory_analysis/inspect_scout/test_grader_score_mismatch.py
agent_artefacts/trajectory_analysis/inspect_scout/utils.py
agent_artefacts/trajectory_analysis/scbench_1_A/TRAJECTORY_ANALYSIS.md
agent_artefacts/trajectory_analysis/scbench_1_A/scbench_gpt51_ANALYSIS.md
agent_artefacts/trajectory_analysis/scbench_1_A/scbench_haiku_ANALYSIS.md
agent_artefacts/trajectory_analysis/tac_1_A/tac_claude_opus_4_6_ANALYSIS.md
agent_artefacts/trajectory_analysis/tac_1_A/tac_claude_sonnet_4_6_ANALYSIS.md
agent_artefacts/trajectory_analysis/tac_1_A/tac_deepseek_v3_ANALYSIS.md
agent_artefacts/trajectory_analysis/tac_1_A/tac_gemini_2_5_flash_ANALYSIS.md
agent_artefacts/trajectory_analysis/tac_1_A/tac_gpt_4_1_ANALYSIS.md
agent_artefacts/trajectory_analysis/tac_1_A/tac_gpt_5_2_ANALYSIS.md
analysis/comparisons-across-time.md
analysis/run-specific-eval-version.md
analysis/assets/comparison_options.png
analysis/assets/original.png
changelog.d/.gitkeep
changelog.d/TEMPLATE.md
docs/.gitignore
docs/_quarto.yml
docs/documentation.md
docs/index.qmd
docs/methodology.md
docs/styles.css
docs/task-configurability.md
docs/_scripts/postrender.py
docs/_scripts/prerender.py
docs/_templates/evals.css
docs/_templates/evals.ejs
docs/_templates/title-metadata.html
docs/evals/.gitignore
docs/evals/coding/swe_bench/docs/swebench_comparison.jpeg
docs/evals/cybersecurity/cybench/challenges/frog_waf/images/victim/src/main/resources/static/favicon.png
docs/evals/cybersecurity/cybench/challenges/frog_waf/images/victim/src/main/resources/static/msfrog.png
docs/evals/cybersecurity/cybench/challenges/glacier_exchange/images/victim/assets/icons/ascoin.png
docs/evals/cybersecurity/cybench/challenges/glacier_exchange/images/victim/assets/icons/cashout.png
docs/evals/cybersecurity/cybench/challenges/glacier_exchange/images/victim/assets/icons/doge.png
docs/evals/cybersecurity/cybench/challenges/glacier_exchange/images/victim/assets/icons/gamestock.png
docs/evals/cybersecurity/cybench/challenges/glacier_exchange/images/victim/assets/icons/glaciercoin.png
docs/evals/cybersecurity/cybench/challenges/glacier_exchange/images/victim/assets/icons/smtl.png
docs/evals/cybersecurity/cybench/challenges/glacier_exchange/images/victim/assets/icons/ycmi.png
docs/evals/cybersecurity/cybench/challenges/glacier_exchange/images/victim/assets/images/bg.jpg
docs/evals/cybersecurity/cybench/challenges/glacier_exchange/images/victim/assets/images/convert-button.jpg
docs/evals/mathematics/mathvista/example.png
docs/evals/reasoning/niah/images/combined_models_score_distribution.png
docs/evals/reasoning/niah/images/mistral-medium-latest_mean_heatmap.png
docs/evals/reasoning/niah/images/mistral-medium-latest_std_heatmap.png
docs/evals/reasoning/niah/images/mistral-medium-latest_unrelated_answer_responses.png
docs/evals/reasoning/niah/images/mistral-small-latest_mean_heatmap.png
docs/evals/reasoning/niah/images/mistral-small-latest_std_heatmap.png
docs/evals/safeguards/agentharm/images/abstract.png
docs/evals/safeguards/agentharm/images/main_results.png
internal/audits/asset-actions.yaml
internal/audits/audit-summary.md
internal/implementation-plans/TEMPLATE.md
internal/implementation-plans/external-asset-hosting.md
src/inspect_evals/__init__.py
src/inspect_evals/_registry.py
src/inspect_evals/constants.py
src/inspect_evals/hf_dataset_script_helper.py
src/inspect_evals/metadata.py
src/inspect_evals/py.typed
src/inspect_evals.egg-info/PKG-INFO
src/inspect_evals.egg-info/SOURCES.txt
src/inspect_evals.egg-info/dependency_links.txt
src/inspect_evals.egg-info/entry_points.txt
src/inspect_evals.egg-info/requires.txt
src/inspect_evals.egg-info/top_level.txt
src/inspect_evals/abstention_bench/.noautolint
src/inspect_evals/abstention_bench/README.md
src/inspect_evals/abstention_bench/__init__.py
src/inspect_evals/abstention_bench/abstention_bench.py
src/inspect_evals/abstention_bench/eval.yaml
src/inspect_evals/abstention_bench/utils.py
src/inspect_evals/abstention_bench/configs/default_pipeline.yaml
src/inspect_evals/abstention_bench/configs/dataset/alcuna.yaml
src/inspect_evals/abstention_bench/configs/dataset/bbq.yaml
src/inspect_evals/abstention_bench/configs/dataset/big_bench_disambiguate.yaml
src/inspect_evals/abstention_bench/configs/dataset/big_bench_known_unknowns.yaml
src/inspect_evals/abstention_bench/configs/dataset/coconot.yaml
src/inspect_evals/abstention_bench/configs/dataset/dummy.yaml
src/inspect_evals/abstention_bench/configs/dataset/falseqa.yaml
src/inspect_evals/abstention_bench/configs/dataset/freshqa.yaml
src/inspect_evals/abstention_bench/configs/dataset/gpqa.yaml
src/inspect_evals/abstention_bench/configs/dataset/gsm8k.yaml
src/inspect_evals/abstention_bench/configs/dataset/kuq.yaml
src/inspect_evals/abstention_bench/configs/dataset/mediq.yaml
src/inspect_evals/abstention_bench/configs/dataset/mmlu_history.yaml
src/inspect_evals/abstention_bench/configs/dataset/mmlu_math.yaml
src/inspect_evals/abstention_bench/configs/dataset/moralchoice.yaml
src/inspect_evals/abstention_bench/configs/dataset/musique.yaml
src/inspect_evals/abstention_bench/configs/dataset/qaqa.yaml
src/inspect_evals/abstention_bench/configs/dataset/qasper.yaml
src/inspect_evals/abstention_bench/configs/dataset/self_aware.yaml
src/inspect_evals/abstention_bench/configs/dataset/situated_qa.yaml
src/inspect_evals/abstention_bench/configs/dataset/squad2.yaml
src/inspect_evals/abstention_bench/configs/dataset/umwp.yaml
src/inspect_evals/abstention_bench/configs/dataset/worldsense.yaml
src/inspect_evals/abstention_bench/data/UMWP_indices_answerable.json
src/inspect_evals/abstention_bench/data/fast-subset-indices.json
src/inspect_evals/abstention_bench/data/subsampling-indices.json
src/inspect_evals/abstention_bench/data/freshqa/FreshQA_v10282024.csv
src/inspect_evals/abstention_bench/data/freshqa/FreshQA_v12182024.csv
src/inspect_evals/abstention_bench/data/kuq/new-category-mapping.csv
src/inspect_evals/abstention_bench/recipe/__init__.py
src/inspect_evals/abstention_bench/recipe/evaluation_judge_prompts.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/__init__.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/abstract_abstention_dataset.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/alcuna.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/bbq.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/big_bench.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/coconot.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/false_qa.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/freshqa.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/gpqa.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/gsm8k.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/kuq.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/mediq.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/mmlu.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/moralchoice.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/musique.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/nq_dataset.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/qaqa.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/qasper.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/self_aware.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/situated_qa.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/squad.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/umwp.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/world_sense.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/huggingface_artifact/qasper.py
src/inspect_evals/abstention_bench/recipe/abstention_datasets/huggingface_artifact/situated_qa.py
src/inspect_evals/agent_bench/Dockerfile.template.j2
src/inspect_evals/agent_bench/README.md
src/inspect_evals/agent_bench/__init__.py
src/inspect_evals/agent_bench/agent_bench.py
src/inspect_evals/agent_bench/agent_bench_os_dataset.py
src/inspect_evals/agent_bench/agent_bench_os_scorer.py
src/inspect_evals/agent_bench/constants.py
src/inspect_evals/agent_bench/eval.yaml
src/inspect_evals/agent_bench/utils.py
src/inspect_evals/agent_bench/data/agent_bench_os.json
src/inspect_evals/agent_bench/data/agent_bench_os_test.json
src/inspect_evals/agentdojo/.noautolint
src/inspect_evals/agentdojo/README.md
src/inspect_evals/agentdojo/__init__.py
src/inspect_evals/agentdojo/agentdojo.py
src/inspect_evals/agentdojo/base_tasks.py
src/inspect_evals/agentdojo/dataset.py
src/inspect_evals/agentdojo/eval.yaml
src/inspect_evals/agentdojo/scorer.py
src/inspect_evals/agentdojo/strenum.py
src/inspect_evals/agentdojo/task_combinators.py
src/inspect_evals/agentdojo/task_suite.py
src/inspect_evals/agentdojo/utils.py
src/inspect_evals/agentdojo/yaml_loader.py
src/inspect_evals/agentdojo/agents/agent.py
src/inspect_evals/agentdojo/agents/ground_truth_agent.py
src/inspect_evals/agentdojo/attacks/attack.py
src/inspect_evals/agentdojo/data/suites/banking/environment.yaml
src/inspect_evals/agentdojo/data/suites/banking/injection_vectors.yaml
src/inspect_evals/agentdojo/data/suites/slack/environment.yaml
src/inspect_evals/agentdojo/data/suites/slack/injection_vectors.yaml
src/inspect_evals/agentdojo/data/suites/travel/environment.yaml
src/inspect_evals/agentdojo/data/suites/travel/injection_vectors.yaml
src/inspect_evals/agentdojo/data/suites/workspace/environment.yaml
src/inspect_evals/agentdojo/data/suites/workspace/gpt_prompts.txt
src/inspect_evals/agentdojo/data/suites/workspace/injection_vectors.yaml
src/inspect_evals/agentdojo/data/suites/workspace/include/calendar.yaml
src/inspect_evals/agentdojo/data/suites/workspace/include/cloud_drive.yaml
src/inspect_evals/agentdojo/data/suites/workspace/include/inbox.yaml
src/inspect_evals/agentdojo/data/suites/workspace_plus/environment.yaml
src/inspect_evals/agentdojo/data/suites/workspace_plus/injection_vectors.yaml
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/compose.yaml
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/emma-computer/Dockerfile
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/emma-computer/home-dir/Desktop/todo.txt
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/emma-computer/home-dir/Downloads/1706.03762v7.pdf
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/emma-computer/home-dir/Downloads/Basketball_through_hoop.jpg
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/emma-computer/home-dir/Downloads/Eopsaltria_australis_-_Mogo_Campground.jpg
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/emma-computer/home-dir/Downloads/Moon_right-view_(Clementine_dataset).png
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/emma-computer/home-dir/Downloads/meeting_notes_1215.txt
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/mock-gist-server/fix-permissions.sh
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/mock-gist-server/nginx.conf
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/mock-gist-server/certs/gen-certs.sh
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/mock-gist-server/certs/nginx.crt
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/mock-gist-server/certs/nginx.key
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/mock-gist-server/gist.github.com/bsptech/389024167213dae90b5a7483f9fd81d5/raw/bde28851c49777d31581f8b7078e2b96/diagnostic.sh
src/inspect_evals/agentdojo/data/suites/workspace_plus/terminal/mock-gist-server/gist.github.com/bsptech/389024167213dae90b5a7483f9fd81d5/raw/bde28851c49777d31581f8b7078e2b96/diagnostic2.sh
src/inspect_evals/agentdojo/task_suites/banking/environment.py
src/inspect_evals/agentdojo/task_suites/banking/injection_tasks.py
src/inspect_evals/agentdojo/task_suites/banking/task_suite.py
src/inspect_evals/agentdojo/task_suites/banking/user_tasks.py
src/inspect_evals/agentdojo/task_suites/slack/environment.py
src/inspect_evals/agentdojo/task_suites/slack/injection_tasks.py
src/inspect_evals/agentdojo/task_suites/slack/task_suite.py
src/inspect_evals/agentdojo/task_suites/slack/user_tasks.py
src/inspect_evals/agentdojo/task_suites/travel/environment.py
src/inspect_evals/agentdojo/task_suites/travel/injection_tasks.py
src/inspect_evals/agentdojo/task_suites/travel/task_suite.py
src/inspect_evals/agentdojo/task_suites/travel/user_tasks.py
src/inspect_evals/agentdojo/task_suites/workspace/environment.py
src/inspect_evals/agentdojo/task_suites/workspace/injection_tasks.py
src/inspect_evals/agentdojo/task_suites/workspace/task_suite.py
src/inspect_evals/agentdojo/task_suites/workspace/user_tasks.py
src/inspect_evals/agentdojo/task_suites/workspace_plus/environment.py
src/inspect_evals/agentdojo/task_suites/workspace_plus/injection_tasks.py
src/inspect_evals/agentdojo/task_suites/workspace_plus/task_suite.py
src/inspect_evals/agentdojo/task_suites/workspace_plus/user_tasks.py
src/inspect_evals/agentdojo/tools/__init__.py
src/inspect_evals/agentdojo/tools/banking_client.py
src/inspect_evals/agentdojo/tools/calendar_client.py
src/inspect_evals/agentdojo/tools/cloud_drive_client.py
src/inspect_evals/agentdojo/tools/email_client.py
src/inspect_evals/agentdojo/tools/file_reader.py
src/inspect_evals/agentdojo/tools/slack.py
src/inspect_evals/agentdojo/tools/terminal.py
src/inspect_evals/agentdojo/tools/travel_booking_client.py
src/inspect_evals/agentdojo/tools/types.py
src/inspect_evals/agentdojo/tools/user_account.py
src/inspect_evals/agentdojo/tools/web.py
src/inspect_evals/agentharm/.noautolint
src/inspect_evals/agentharm/LICENSE
src/inspect_evals/agentharm/README.md
src/inspect_evals/agentharm/__init__.py
src/inspect_evals/agentharm/agentharm.py
src/inspect_evals/agentharm/eval.yaml
src/inspect_evals/agentharm/metric.py
src/inspect_evals/agentharm/prompts.py
src/inspect_evals/agentharm/scorer.py
src/inspect_evals/agentharm/utils.py
src/inspect_evals/agentharm/agents/__init__.py
src/inspect_evals/agentharm/agents/agent.py
src/inspect_evals/agentharm/agents/default_agent.py
src/inspect_evals/agentharm/agents/refusal_agent.py
src/inspect_evals/agentharm/benchmark/benign_grading_functions.py
src/inspect_evals/agentharm/benchmark/grading_utils.py
src/inspect_evals/agentharm/benchmark/harmful_grading_functions.py
src/inspect_evals/agentharm/benchmark/benign_tools/__init__.py
src/inspect_evals/agentharm/benchmark/benign_tools/add_new_features.py
src/inspect_evals/agentharm/benchmark/benign_tools/comment_on_mastodon.py
src/inspect_evals/agentharm/benchmark/benign_tools/connect_vpn.py
src/inspect_evals/agentharm/benchmark/benign_tools/convert_pdf_to_text.py
src/inspect_evals/agentharm/benchmark/benign_tools/convert_text_to_pdf.py
src/inspect_evals/agentharm/benchmark/benign_tools/create_ai_video.py
src/inspect_evals/agentharm/benchmark/benign_tools/create_encryption_tool.py
src/inspect_evals/agentharm/benchmark/benign_tools/create_instagram_post.py
src/inspect_evals/agentharm/benchmark/benign_tools/create_instagram_story.py
src/inspect_evals/agentharm/benchmark/benign_tools/create_meme_image.py
src/inspect_evals/agentharm/benchmark/benign_tools/dm_reddit_user.py
src/inspect_evals/agentharm/benchmark/benign_tools/download_file.py
src/inspect_evals/agentharm/benchmark/benign_tools/download_from_alpha_bay.py
src/inspect_evals/agentharm/benchmark/benign_tools/download_image.py
src/inspect_evals/agentharm/benchmark/benign_tools/download_pdf.py
src/inspect_evals/agentharm/benchmark/benign_tools/download_software.py
src/inspect_evals/agentharm/benchmark/benign_tools/edit_txt.py
src/inspect_evals/agentharm/benchmark/benign_tools/fill_in_story_template.py
src/inspect_evals/agentharm/benchmark/benign_tools/follow_x_account.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_all_login_data.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_client_emails.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_contacts.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_dir_list.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_direct_messages_x.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_friends_info.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_info_from_database.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_latest_emails.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_latest_posts_on_x.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_paypal_transactions.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_pinterest_accounts.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_public_databases.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_tinder_choices.py
src/inspect_evals/agentharm/benchmark/benign_tools/get_webpage.py
src/inspect_evals/agentharm/benchmark/benign_tools/log_security_audit.py
src/inspect_evals/agentharm/benchmark/benign_tools/login_into_bank_app.py
src/inspect_evals/agentharm/benchmark/benign_tools/make_online_purchases.py
src/inspect_evals/agentharm/benchmark/benign_tools/paypal_transfer.py
src/inspect_evals/agentharm/benchmark/benign_tools/personal_website_upload.py
src/inspect_evals/agentharm/benchmark/benign_tools/post_forum_comment.py
src/inspect_evals/agentharm/benchmark/benign_tools/post_instagram_reply.py
src/inspect_evals/agentharm/benchmark/benign_tools/post_on_facebook.py
src/inspect_evals/agentharm/benchmark/benign_tools/post_on_instagram.py
src/inspect_evals/agentharm/benchmark/benign_tools/post_on_x.py
src/inspect_evals/agentharm/benchmark/benign_tools/post_to_amazon.py
src/inspect_evals/agentharm/benchmark/benign_tools/post_to_blog.py
src/inspect_evals/agentharm/benchmark/benign_tools/post_x_reply.py
src/inspect_evals/agentharm/benchmark/benign_tools/query_bing_search.py
src/inspect_evals/agentharm/benchmark/benign_tools/query_google_search.py
src/inspect_evals/agentharm/benchmark/benign_tools/query_onion_search.py
src/inspect_evals/agentharm/benchmark/benign_tools/query_posts_on_instagram.py
src/inspect_evals/agentharm/benchmark/benign_tools/query_posts_on_x.py
src/inspect_evals/agentharm/benchmark/benign_tools/query_x_user_info.py
src/inspect_evals/agentharm/benchmark/benign_tools/query_x_users.py
src/inspect_evals/agentharm/benchmark/benign_tools/read_file.py
src/inspect_evals/agentharm/benchmark/benign_tools/redeem_gift_cards.py
src/inspect_evals/agentharm/benchmark/benign_tools/report_x_post.py
src/inspect_evals/agentharm/benchmark/benign_tools/repost_x_post.py
src/inspect_evals/agentharm/benchmark/benign_tools/run_stable_diffusion_model.py
src/inspect_evals/agentharm/benchmark/benign_tools/scrape_reddit_usernames.py
src/inspect_evals/agentharm/benchmark/benign_tools/search_asset_libraries.py
src/inspect_evals/agentharm/benchmark/benign_tools/search_hotels.py
src/inspect_evals/agentharm/benchmark/benign_tools/search_web_hosting.py
src/inspect_evals/agentharm/benchmark/benign_tools/send_direct_message_instagram.py
src/inspect_evals/agentharm/benchmark/benign_tools/send_direct_message_tinder.py
src/inspect_evals/agentharm/benchmark/benign_tools/send_direct_message_x.py
src/inspect_evals/agentharm/benchmark/benign_tools/send_email.py
src/inspect_evals/agentharm/benchmark/benign_tools/send_message.py
src/inspect_evals/agentharm/benchmark/benign_tools/send_message_whatsapp.py
src/inspect_evals/agentharm/benchmark/benign_tools/share_on_pinterest.py
src/inspect_evals/agentharm/benchmark/benign_tools/submit_alpha_bay_order.py
src/inspect_evals/agentharm/benchmark/benign_tools/submit_amazon_order.py
src/inspect_evals/agentharm/benchmark/benign_tools/submit_assignment.py
src/inspect_evals/agentharm/benchmark/benign_tools/submit_tinder_match.py
src/inspect_evals/agentharm/benchmark/benign_tools/terminal.py
src/inspect_evals/agentharm/benchmark/benign_tools/transfer_money.py
src/inspect_evals/agentharm/benchmark/benign_tools/transfer_money_crypto.py
src/inspect_evals/agentharm/benchmark/benign_tools/upload_file.py
src/inspect_evals/agentharm/benchmark/benign_tools/upload_tiktok_video.py
src/inspect_evals/agentharm/benchmark/benign_tools/upload_youtube_video.py
src/inspect_evals/agentharm/benchmark/benign_tools/verify_database_security.py
src/inspect_evals/agentharm/benchmark/benign_tools/content/api_keys.json
src/inspect_evals/agentharm/benchmark/benign_tools/content/attention_paper.txt
src/inspect_evals/agentharm/benchmark/benign_tools/content/contract.txt
src/inspect_evals/agentharm/benchmark/benign_tools/content/paper_abstract.txt
src/inspect_evals/agentharm/benchmark/benign_tools/content/passwords.txt
src/inspect_evals/agentharm/benchmark/benign_tools/content/public_story.txt
src/inspect_evals/agentharm/benchmark/benign_tools/content/short_story.txt
src/inspect_evals/agentharm/benchmark/benign_tools/content/to_do.txt
src/inspect_evals/agentharm/benchmark/harmful_tools/__init__.py
src/inspect_evals/agentharm/benchmark/harmful_tools/add_hidden_malware.py
src/inspect_evals/agentharm/benchmark/harmful_tools/comment_on_mastodon.py
src/inspect_evals/agentharm/benchmark/harmful_tools/connect_vpn.py
src/inspect_evals/agentharm/benchmark/harmful_tools/convert_pdf_to_text.py
src/inspect_evals/agentharm/benchmark/harmful_tools/convert_text_to_pdf.py
src/inspect_evals/agentharm/benchmark/harmful_tools/create_ai_video.py
src/inspect_evals/agentharm/benchmark/harmful_tools/create_instagram_post.py
src/inspect_evals/agentharm/benchmark/harmful_tools/create_instagram_story.py
src/inspect_evals/agentharm/benchmark/harmful_tools/create_meme_image.py
src/inspect_evals/agentharm/benchmark/harmful_tools/create_ransomware.py
src/inspect_evals/agentharm/benchmark/harmful_tools/delete_access_logs.py
src/inspect_evals/agentharm/benchmark/harmful_tools/dm_reddit_user.py
src/inspect_evals/agentharm/benchmark/harmful_tools/download_file.py
src/inspect_evals/agentharm/benchmark/harmful_tools/download_from_alpha_bay.py
src/inspect_evals/agentharm/benchmark/harmful_tools/download_image.py
src/inspect_evals/agentharm/benchmark/harmful_tools/download_pdf.py
src/inspect_evals/agentharm/benchmark/harmful_tools/download_software.py
src/inspect_evals/agentharm/benchmark/harmful_tools/edit_txt.py
src/inspect_evals/agentharm/benchmark/harmful_tools/extract_data_from_database.py
src/inspect_evals/agentharm/benchmark/harmful_tools/fill_in_story_template.py
src/inspect_evals/agentharm/benchmark/harmful_tools/follow_x_account.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_all_login_data.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_client_emails.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_contacts.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_dir_list.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_direct_messages_x.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_friends_info.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_info_from_database.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_latest_emails.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_latest_posts_on_x.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_paypal_transactions.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_pinterest_accounts.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_public_databases.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_tinder_choices.py
src/inspect_evals/agentharm/benchmark/harmful_tools/get_webpage.py
src/inspect_evals/agentharm/benchmark/harmful_tools/login_into_bank_app.py
src/inspect_evals/agentharm/benchmark/harmful_tools/make_online_purchases.py
src/inspect_evals/agentharm/benchmark/harmful_tools/paypal_transfer.py
src/inspect_evals/agentharm/benchmark/harmful_tools/personal_website_upload.py
src/inspect_evals/agentharm/benchmark/harmful_tools/post_forum_comment.py
src/inspect_evals/agentharm/benchmark/harmful_tools/post_instagram_reply.py
src/inspect_evals/agentharm/benchmark/harmful_tools/post_on_facebook.py
src/inspect_evals/agentharm/benchmark/harmful_tools/post_on_instagram.py
src/inspect_evals/agentharm/benchmark/harmful_tools/post_on_x.py
src/inspect_evals/agentharm/benchmark/harmful_tools/post_to_amazon.py
src/inspect_evals/agentharm/benchmark/harmful_tools/post_to_blog.py
src/inspect_evals/agentharm/benchmark/harmful_tools/post_x_reply.py
src/inspect_evals/agentharm/benchmark/harmful_tools/purchase_credit_card_details.py
src/inspect_evals/agentharm/benchmark/harmful_tools/query_bing_search.py
src/inspect_evals/agentharm/benchmark/harmful_tools/query_google_search.py
src/inspect_evals/agentharm/benchmark/harmful_tools/query_onion_search.py
src/inspect_evals/agentharm/benchmark/harmful_tools/query_posts_on_instagram.py
src/inspect_evals/agentharm/benchmark/harmful_tools/query_posts_on_x.py
src/inspect_evals/agentharm/benchmark/harmful_tools/query_x_user_info.py
src/inspect_evals/agentharm/benchmark/harmful_tools/query_x_users.py
src/inspect_evals/agentharm/benchmark/harmful_tools/read_file.py
src/inspect_evals/agentharm/benchmark/harmful_tools/report_x_post.py
src/inspect_evals/agentharm/benchmark/harmful_tools/repost_x_post.py
src/inspect_evals/agentharm/benchmark/harmful_tools/run_stable_diffusion_model.py
src/inspect_evals/agentharm/benchmark/harmful_tools/scrape_reddit_usernames.py
src/inspect_evals/agentharm/benchmark/harmful_tools/search_asset_libraries.py
src/inspect_evals/agentharm/benchmark/harmful_tools/search_hotels.py
src/inspect_evals/agentharm/benchmark/harmful_tools/search_web_hosting.py
src/inspect_evals/agentharm/benchmark/harmful_tools/send_direct_message_instagram.py
src/inspect_evals/agentharm/benchmark/harmful_tools/send_direct_message_tinder.py
src/inspect_evals/agentharm/benchmark/harmful_tools/send_direct_message_x.py
src/inspect_evals/agentharm/benchmark/harmful_tools/send_email.py
src/inspect_evals/agentharm/benchmark/harmful_tools/send_message.py
src/inspect_evals/agentharm/benchmark/harmful_tools/send_message_whatsapp.py
src/inspect_evals/agentharm/benchmark/harmful_tools/share_on_pinterest.py
src/inspect_evals/agentharm/benchmark/harmful_tools/submit_alpha_bay_order.py
src/inspect_evals/agentharm/benchmark/harmful_tools/submit_amazon_order.py
src/inspect_evals/agentharm/benchmark/harmful_tools/submit_assignment.py
src/inspect_evals/agentharm/benchmark/harmful_tools/submit_tinder_match.py
src/inspect_evals/agentharm/benchmark/harmful_tools/terminal.py
src/inspect_evals/agentharm/benchmark/harmful_tools/transfer_money.py
src/inspect_evals/agentharm/benchmark/harmful_tools/transfer_money_crypto.py
src/inspect_evals/agentharm/benchmark/harmful_tools/upload_file.py
src/inspect_evals/agentharm/benchmark/harmful_tools/upload_tiktok_video.py
src/inspect_evals/agentharm/benchmark/harmful_tools/upload_youtube_video.py
src/inspect_evals/agentharm/benchmark/harmful_tools/content/api_keys.json
src/inspect_evals/agentharm/benchmark/harmful_tools/content/attention_paper.txt
src/inspect_evals/agentharm/benchmark/harmful_tools/content/contract.txt
src/inspect_evals/agentharm/benchmark/harmful_tools/content/paper_abstract.txt
src/inspect_evals/agentharm/benchmark/harmful_tools/content/passwords.txt
src/inspect_evals/agentharm/benchmark/harmful_tools/content/short_story.txt
src/inspect_evals/agentharm/benchmark/harmful_tools/content/to_do.txt
src/inspect_evals/agentharm/images/abstract.png
src/inspect_evals/agentharm/images/main_results.png
src/inspect_evals/agentic_misalignment/LICENSE
src/inspect_evals/agentic_misalignment/README.md
src/inspect_evals/agentic_misalignment/__init__.py
src/inspect_evals/agentic_misalignment/agentic_misalignment.py
src/inspect_evals/agentic_misalignment/eval.yaml
src/inspect_evals/agentic_misalignment/prompt_generator.py
src/inspect_evals/agentic_misalignment/scorers.py
src/inspect_evals/agentic_misalignment/show_prompt_diffs.py
src/inspect_evals/agentic_misalignment/classifiers/__init__.py
src/inspect_evals/agentic_misalignment/classifiers/blackmail_classifier.py
src/inspect_evals/agentic_misalignment/classifiers/classifier.py
src/inspect_evals/agentic_misalignment/classifiers/deliberation_classifier.py
src/inspect_evals/agentic_misalignment/classifiers/leak_classifier.py
src/inspect_evals/agentic_misalignment/classifiers/murder_classifier.py
src/inspect_evals/agentic_misalignment/templates/system_prompt_templates.py
src/inspect_evals/agentic_misalignment/templates/blackmail/templates/email-template-blackmail-goal-conflict-none-none.md
src/inspect_evals/agentic_misalignment/templates/blackmail/templates/email-template-blackmail-goal-conflict-none-replacement.md
src/inspect_evals/agentic_misalignment/templates/blackmail/templates/email-template-blackmail-goal-conflict-none-restriction.md
src/inspect_evals/agentic_misalignment/templates/blackmail/templates/email-template-blackmail-none.md
src/inspect_evals/agentic_misalignment/templates/blackmail/templates/email-template-blackmail-replacement.md
src/inspect_evals/agentic_misalignment/templates/blackmail/templates/email-template-blackmail-restriction.md
src/inspect_evals/agentic_misalignment/templates/blackmail/templates/email-template-blackmail-shared.md
src/inspect_evals/agentic_misalignment/templates/blackmail/variables/ambiguous.md
src/inspect_evals/agentic_misalignment/templates/blackmail/variables/america_vs_global.md
src/inspect_evals/agentic_misalignment/templates/blackmail/variables/ethical_vs_pragmatic.md
src/inspect_evals/agentic_misalignment/templates/blackmail/variables/individualism_vs_collectivism.md
src/inspect_evals/agentic_misalignment/templates/blackmail/variables/safety_vs_acceleration.md
src/inspect_evals/agentic_misalignment/templates/leaking/templates/email-template-leaking-goal-conflict-none-none.md
src/inspect_evals/agentic_misalignment/templates/leaking/templates/email-template-leaking-goal-conflict-none-replacement.md
src/inspect_evals/agentic_misalignment/templates/leaking/templates/email-template-leaking-goal-conflict-none-restriction.md
src/inspect_evals/agentic_misalignment/templates/leaking/templates/email-template-leaking-none.md
src/inspect_evals/agentic_misalignment/templates/leaking/templates/email-template-leaking-replacement.md
src/inspect_evals/agentic_misalignment/templates/leaking/templates/email-template-leaking-restriction.md
src/inspect_evals/agentic_misalignment/templates/leaking/templates/email-template-leaking-shared.md
src/inspect_evals/agentic_misalignment/templates/leaking/variables/ambiguous.md
src/inspect_evals/agentic_misalignment/templates/leaking/variables/america_vs_global.md
src/inspect_evals/agentic_misalignment/templates/leaking/variables/ethical_vs_pragmatic.md
src/inspect_evals/agentic_misalignment/templates/leaking/variables/individualism_vs_collectivism.md
src/inspect_evals/agentic_misalignment/templates/leaking/variables/safety_vs_acceleration.md
src/inspect_evals/agentic_misalignment/templates/murder/templates/email-template-murder-goal-conflict-none-none.md
src/inspect_evals/agentic_misalignment/templates/murder/templates/email-template-murder-goal-conflict-none-replacement.md
src/inspect_evals/agentic_misalignment/templates/murder/templates/email-template-murder-goal-conflict-none-restriction.md
src/inspect_evals/agentic_misalignment/templates/murder/templates/email-template-murder-none.md
src/inspect_evals/agentic_misalignment/templates/murder/templates/email-template-murder-replacement.md
src/inspect_evals/agentic_misalignment/templates/murder/templates/email-template-murder-restriction.md
src/inspect_evals/agentic_misalignment/templates/murder/templates/email-template-murder-shared.md
src/inspect_evals/agentic_misalignment/templates/murder/variables/ambiguous.md
src/inspect_evals/agentic_misalignment/templates/murder/variables/america_vs_global.md
src/inspect_evals/agentic_misalignment/templates/murder/variables/ethical_vs_pragmatic.md
src/inspect_evals/agentic_misalignment/templates/murder/variables/individualism_vs_collectivism.md
src/inspect_evals/agentic_misalignment/templates/murder/variables/safety_vs_acceleration.md
src/inspect_evals/agieval/README.md
src/inspect_evals/agieval/__init__.py
src/inspect_evals/agieval/agieval.py
src/inspect_evals/agieval/agieval_cloze.py
src/inspect_evals/agieval/eval.yaml
src/inspect_evals/agieval/utils.py
src/inspect_evals/ahb/README.md
src/inspect_evals/ahb/__init__.py
src/inspect_evals/ahb/ahb.py
src/inspect_evals/ahb/ahbplot.png
src/inspect_evals/ahb/ceiling.png
src/inspect_evals/ahb/dataset.py
src/inspect_evals/ahb/eval.yaml
src/inspect_evals/ahb/image (1).png
src/inspect_evals/ahb/metrics.py
src/inspect_evals/ahb/plot2.png
src/inspect_evals/ahb/radar_plot.py
src/inspect_evals/ahb/scorer.py
src/inspect_evals/ahb/types.py
src/inspect_evals/ahb/utils.py
src/inspect_evals/aime2024/README.md
src/inspect_evals/aime2024/__init__.py
src/inspect_evals/aime2024/aime2024.py
src/inspect_evals/aime2024/eval.yaml
src/inspect_evals/aime2025/README.md
src/inspect_evals/aime2025/__init__.py
src/inspect_evals/aime2025/aime2025.py
src/inspect_evals/aime2025/eval.yaml
src/inspect_evals/aime2026/README.md
src/inspect_evals/aime2026/__init__.py
src/inspect_evals/aime2026/aime2026.py
src/inspect_evals/aime2026/eval.yaml
src/inspect_evals/air_bench/README.md
src/inspect_evals/air_bench/__init__.py
src/inspect_evals/air_bench/air_bench.py
src/inspect_evals/air_bench/eval.yaml
src/inspect_evals/air_bench/results.csv
src/inspect_evals/air_bench/utils.py
src/inspect_evals/ape/README.md
src/inspect_evals/ape/__init__.py
src/inspect_evals/ape/ape.py
src/inspect_evals/ape/dataset.py
src/inspect_evals/ape/eval.yaml
src/inspect_evals/ape/prompts.py
src/inspect_evals/ape/scorer.py
src/inspect_evals/ape/solver.py
src/inspect_evals/ape/utils.py
src/inspect_evals/apps/README.md
src/inspect_evals/apps/__init__.py
src/inspect_evals/apps/apps.py
src/inspect_evals/apps/eval.yaml
src/inspect_evals/apps/huggingface_artifact/apps.py
src/inspect_evals/arc/README.md
src/inspect_evals/arc/__init__.py
src/inspect_evals/arc/arc.py
src/inspect_evals/arc/eval.yaml
src/inspect_evals/assistant_bench/README.md
src/inspect_evals/assistant_bench/__init__.py
src/inspect_evals/assistant_bench/assistant_bench.py
src/inspect_evals/assistant_bench/compose.yaml
src/inspect_evals/assistant_bench/dataset.py
src/inspect_evals/assistant_bench/eval.yaml
src/inspect_evals/assistant_bench/prompts.py
src/inspect_evals/assistant_bench/scoring.py
src/inspect_evals/assistant_bench/solver.py
src/inspect_evals/assistant_bench/values.yaml
src/inspect_evals/b3/README.md
src/inspect_evals/b3/__init__.py
src/inspect_evals/b3/b3.py
src/inspect_evals/b3/dataset.py
src/inspect_evals/b3/embeddings.py
src/inspect_evals/b3/eval.yaml
src/inspect_evals/b3/judge_defense.py
src/inspect_evals/b3/metrics.py
src/inspect_evals/b3/paths.py
src/inspect_evals/b3/scorer.py
src/inspect_evals/b3/tools.py
src/inspect_evals/b3/data/profanity_keywords.json
src/inspect_evals/b3/data/threat_snapshots/TS10_legal_TI_exfiltration_0.json
src/inspect_evals/b3/data/threat_snapshots/TS10_legal_TI_exfiltration_1.json
src/inspect_evals/b3/data/threat_snapshots/TS1_cco_coach_0.json
src/inspect_evals/b3/data/threat_snapshots/TS1_cco_coach_1.json
src/inspect_evals/b3/data/threat_snapshots/TS2_trip_planner_phishing_0.json
src/inspect_evals/b3/data/threat_snapshots/TS2_trip_planner_phishing_1.json
src/inspect_evals/b3/data/threat_snapshots/TS3_mcp_chat_poisoning_0.json
src/inspect_evals/b3/data/threat_snapshots/TS3_mcp_chat_poisoning_1.json
src/inspect_evals/b3/data/threat_snapshots/TS4_solace_profane_chat_0.json
src/inspect_evals/b3/data/threat_snapshots/TS4_solace_profane_chat_1.json
src/inspect_evals/b3/data/threat_snapshots/TS5_chatapp_refuse_0.json
src/inspect_evals/b3/data/threat_snapshots/TS5_chatapp_refuse_1.json
src/inspect_evals/b3/data/threat_snapshots/TS6_analytics_misleading_0.json
src/inspect_evals/b3/data/threat_snapshots/TS6_analytics_misleading_1.json
src/inspect_evals/b3/data/threat_snapshots/TS7_code_review_malicious_0.json
src/inspect_evals/b3/data/threat_snapshots/TS7_code_review_malicious_1.json
src/inspect_evals/b3/data/threat_snapshots/TS8_product_recommendation_tool_extraction_0.json
src/inspect_evals/b3/data/threat_snapshots/TS8_product_recommendation_tool_extraction_1.json
src/inspect_evals/b3/data/threat_snapshots/TS9_email_tool_invocation_0.json
src/inspect_evals/b3/data/threat_snapshots/TS9_email_tool_invocation_1.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS10_psychiatry_TI_exfiltration_0.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS1_smart_home_controller_0.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS2_clothes_recommendation_0.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS3_commute_assistant_0.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS4_maths_teaching_support_0.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS5_calendar_bot_refuse_execute_0.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS6_employee_reviewer_0.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS7_ci_workflow_auditor_0.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS8_email_assistant_0.json
src/inspect_evals/b3/data/threat_snapshots/variants/TS9_internal_messenger_impersonation_0.json
src/inspect_evals/b3/data/tool_specs/apply_coupon.json
src/inspect_evals/b3/data/tool_specs/book_flight.json
src/inspect_evals/b3/data/tool_specs/book_hotel.json
src/inspect_evals/b3/data/tool_specs/book_tour.json
src/inspect_evals/b3/data/tool_specs/buy_product.json
src/inspect_evals/b3/data/tool_specs/buy_transit_tickets.json
src/inspect_evals/b3/data/tool_specs/calendar_list.json
src/inspect_evals/b3/data/tool_specs/check_bus_routes.json
src/inspect_evals/b3/data/tool_specs/check_train_connections.json
src/inspect_evals/b3/data/tool_specs/compare_prices.json
src/inspect_evals/b3/data/tool_specs/contacts_lookup.json
src/inspect_evals/b3/data/tool_specs/context_memory.json
src/inspect_evals/b3/data/tool_specs/db_lookup.json
src/inspect_evals/b3/data/tool_specs/document_retriever.json
src/inspect_evals/b3/data/tool_specs/dummy_tool.json
src/inspect_evals/b3/data/tool_specs/email_marketing.json
src/inspect_evals/b3/data/tool_specs/fetch_inbox.json
src/inspect_evals/b3/data/tool_specs/fetch_threads.json
src/inspect_evals/b3/data/tool_specs/get_current_weather.json
src/inspect_evals/b3/data/tool_specs/get_weather_alerts.json
src/inspect_evals/b3/data/tool_specs/get_weather_forecast.json
src/inspect_evals/b3/data/tool_specs/location_context_memory.json
src/inspect_evals/b3/data/tool_specs/note_memory.json
src/inspect_evals/b3/data/tool_specs/patient_record_retriever.json
src/inspect_evals/b3/data/tool_specs/price_compare.json
src/inspect_evals/b3/data/tool_specs/schedule_meeting.json
src/inspect_evals/b3/data/tool_specs/search_email.json
src/inspect_evals/b3/data/tool_specs/search_flights.json
src/inspect_evals/b3/data/tool_specs/search_hotels.json
src/inspect_evals/b3/data/tool_specs/search_products.json
src/inspect_evals/b3/data/tool_specs/search_tours.json
src/inspect_evals/b3/data/tool_specs/send_chat_message.json
src/inspect_evals/b3/data/tool_specs/send_email.json
src/inspect_evals/b3/data/tool_specs/set_thermostat.json
src/inspect_evals/b3/data/tool_specs/task_memory.json
src/inspect_evals/b3/data/tool_specs/toggle_lights.json
src/inspect_evals/b3/data/tool_specs/web_search.json
src/inspect_evals/b3/experiments/constants.py
src/inspect_evals/b3/experiments/model_spec.py
src/inspect_evals/b3/experiments/run.py
src/inspect_evals/bbeh/README.md
src/inspect_evals/bbeh/__init__.py
src/inspect_evals/bbeh/bbeh.py
src/inspect_evals/bbeh/data.py
src/inspect_evals/bbeh/eval.yaml
src/inspect_evals/bbeh/utils.py
src/inspect_evals/bbh/README.md
src/inspect_evals/bbh/__init__.py
src/inspect_evals/bbh/bbh.py
src/inspect_evals/bbh/eval.yaml
src/inspect_evals/bbq/README.md
src/inspect_evals/bbq/__init__.py
src/inspect_evals/bbq/bbq.py
src/inspect_evals/bbq/eval.yaml
src/inspect_evals/bbq/huggingface_artifact/bbq.py
src/inspect_evals/bfcl/README.md
src/inspect_evals/bfcl/__init__.py
src/inspect_evals/bfcl/bfcl.py
src/inspect_evals/bfcl/data.py
src/inspect_evals/bfcl/eval.yaml
src/inspect_evals/bfcl/prompts.py
src/inspect_evals/bfcl/backends/__init__.py
src/inspect_evals/bfcl/backends/downloader.py
src/inspect_evals/bfcl/backends/loader.py
src/inspect_evals/bfcl/score/__init__.py
src/inspect_evals/bfcl/score/multi_turn_scorer.py
src/inspect_evals/bfcl/score/scorer.py
src/inspect_evals/bfcl/solve/__init__.py
src/inspect_evals/bfcl/solve/multi_turn_solver.py
src/inspect_evals/bfcl/solve/single_turn_solver.py
src/inspect_evals/bfcl/utils/__init__.py
src/inspect_evals/bfcl/utils/function_parsing.py
src/inspect_evals/bfcl/utils/task_categories.py
src/inspect_evals/bfcl/utils/tool_parsing.py
src/inspect_evals/bigcodebench/Dockerfile
src/inspect_evals/bigcodebench/README.md
src/inspect_evals/bigcodebench/__init__.py
src/inspect_evals/bigcodebench/bigcodebench.py
src/inspect_evals/bigcodebench/compose.yaml
src/inspect_evals/bigcodebench/docker-requirements.txt
src/inspect_evals/bigcodebench/eval.yaml
src/inspect_evals/bold/README.md
src/inspect_evals/bold/__init__.py
src/inspect_evals/bold/bold.py
src/inspect_evals/bold/eval.yaml
src/inspect_evals/boolq/README.md
src/inspect_evals/boolq/__init__.py
src/inspect_evals/boolq/boolq.py
src/inspect_evals/boolq/eval.yaml
src/inspect_evals/browse_comp/README.md
src/inspect_evals/browse_comp/__init__.py
src/inspect_evals/browse_comp/browse_comp.py
src/inspect_evals/browse_comp/compose.yaml
src/inspect_evals/browse_comp/eval.yaml
src/inspect_evals/browse_comp/prompts.py
src/inspect_evals/browse_comp/utils.py
src/inspect_evals/chembench/README.md
src/inspect_evals/chembench/__init__.py
src/inspect_evals/chembench/chembench.py
src/inspect_evals/chembench/eval.yaml
src/inspect_evals/class_eval/Dockerfile
src/inspect_evals/class_eval/README.md
src/inspect_evals/class_eval/__init__.py
src/inspect_evals/class_eval/class_eval.py
src/inspect_evals/class_eval/docker-requirements.txt
src/inspect_evals/class_eval/eval.yaml
src/inspect_evals/class_eval/notes.txt
src/inspect_evals/class_eval/utils.py
src/inspect_evals/coconot/README.md
src/inspect_evals/coconot/__init__.py
src/inspect_evals/coconot/coconot.py
src/inspect_evals/coconot/eval.yaml
src/inspect_evals/coconot/data/refusal_evaluation_rubric.json
src/inspect_evals/commonsense_qa/README.md
src/inspect_evals/commonsense_qa/__init__.py
src/inspect_evals/commonsense_qa/commonsense_qa.py
src/inspect_evals/commonsense_qa/eval.yaml
src/inspect_evals/compute_eval/Dockerfile
src/inspect_evals/compute_eval/README.md
src/inspect_evals/compute_eval/__init__.py
src/inspect_evals/compute_eval/compose.yaml
src/inspect_evals/compute_eval/compute_eval.py
src/inspect_evals/compute_eval/eval.yaml
src/inspect_evals/compute_eval/prompt.py
src/inspect_evals/compute_eval/scorer.py
src/inspect_evals/core_bench/Dockerfile
src/inspect_evals/core_bench/README.md
src/inspect_evals/core_bench/__init__.py
src/inspect_evals/core_bench/agent_prompts.json
src/inspect_evals/core_bench/compose.yaml
src/inspect_evals/core_bench/core_bench.py
src/inspect_evals/core_bench/dataset.py
src/inspect_evals/core_bench/eval.yaml
src/inspect_evals/core_bench/scorer.py
src/inspect_evals/core_bench/tools.py
src/inspect_evals/core_bench/utils.py
src/inspect_evals/cti_realm/.gitignore
src/inspect_evals/cti_realm/.noautolint
src/inspect_evals/cti_realm/CTI_REALM_TRANSPARENCY.md
src/inspect_evals/cti_realm/README.md
src/inspect_evals/cti_realm/__init__.py
src/inspect_evals/cti_realm/cti_realm.py
src/inspect_evals/cti_realm/download_data.py
src/inspect_evals/cti_realm/eval.yaml
src/inspect_evals/cti_realm/core/__init__.py
src/inspect_evals/cti_realm/core/dataset.py
src/inspect_evals/cti_realm/core/health_check.py
src/inspect_evals/cti_realm/core/llm_judge_prompts.json
src/inspect_evals/cti_realm/core/logging_utils.py
src/inspect_evals/cti_realm/core/parsing_utils.py
src/inspect_evals/cti_realm/core/reasoning_extractor.py
src/inspect_evals/cti_realm/core/scorer.py
src/inspect_evals/cti_realm/core/tool_verification.py
src/inspect_evals/cti_realm/core/tools.py
src/inspect_evals/cti_realm/core/trajectory_scorer.py
src/inspect_evals/cti_realm/core/utils.py
src/inspect_evals/cti_realm/docker/.dockerignore
src/inspect_evals/cti_realm/docker/Dockerfile
src/inspect_evals/cti_realm/docker/compose.yaml
src/inspect_evals/cti_realm/docker/kusto_init/data_loader.py
src/inspect_evals/cti_realm/docker/kusto_init/init_kusto.py
src/inspect_evals/cti_realm/docker/kusto_init/load_data.py
src/inspect_evals/cti_realm/docker/kusto_init/requirements.txt
src/inspect_evals/cti_realm/docker/mitre_service/mitre_api.py
src/inspect_evals/cti_realm/docker/mitre_service/mitre_loader.py
src/inspect_evals/cti_realm/docker/mitre_service/requirements.txt
src/inspect_evals/cti_realm/docker/sigma_indexer/build_index.py
src/inspect_evals/cti_realm/docker/sigma_indexer/requirements.txt
src/inspect_evals/cti_realm/seed_memory/patterns.md
src/inspect_evals/cti_realm/seed_memory/tool_tips.md
src/inspect_evals/cti_realm/seed_memory/workflow.md
src/inspect_evals/cve_bench/README.md
src/inspect_evals/cve_bench/__init__.py
src/inspect_evals/cve_bench/cve_bench.py
src/inspect_evals/cve_bench/eval.yaml
src/inspect_evals/cybench/.noautolint
src/inspect_evals/cybench/README.md
src/inspect_evals/cybench/__init__.py
src/inspect_evals/cybench/build-images.sh
src/inspect_evals/cybench/cybench.py
src/inspect_evals/cybench/eval.yaml
src/inspect_evals/cybench/push_images.sh
src/inspect_evals/cybench/agent_sandbox/Dockerfile
src/inspect_evals/cybench/agent_sandbox/archive-keyring.gpg
src/inspect_evals/cybench/challenges/.noautolint
src/inspect_evals/cybench/challenges/avatar/compose.yaml
src/inspect_evals/cybench/challenges/avatar/eval.yaml
src/inspect_evals/cybench/challenges/avatar/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/avatar/images/victim/chall/chall.py
src/inspect_evals/cybench/challenges/avatar/images/victim/chall/flag.txt
src/inspect_evals/cybench/challenges/avatar/solution/requirements.sh
src/inspect_evals/cybench/challenges/avatar/solution/solution.sh
src/inspect_evals/cybench/challenges/avatar/solution/solution.txt
src/inspect_evals/cybench/challenges/avatar/solution/solve.py
src/inspect_evals/cybench/challenges/back_to_the_past/compose.yaml
src/inspect_evals/cybench/challenges/back_to_the_past/eval.yaml
src/inspect_evals/cybench/challenges/back_to_the_past/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/back_to_the_past/images/victim/html.tgz
src/inspect_evals/cybench/challenges/back_to_the_past/images/victim/nginx.conf
src/inspect_evals/cybench/challenges/back_to_the_past/solution/requirements.sh
src/inspect_evals/cybench/challenges/back_to_the_past/solution/solution.sh
src/inspect_evals/cybench/challenges/back_to_the_past/solution/solution.txt
src/inspect_evals/cybench/challenges/chunky/compose.yaml
src/inspect_evals/cybench/challenges/chunky/eval.yaml
src/inspect_evals/cybench/challenges/chunky/images/blog/Dockerfile
src/inspect_evals/cybench/challenges/chunky/images/blog/requirements.txt
src/inspect_evals/cybench/challenges/chunky/images/blog/src/app.py
src/inspect_evals/cybench/challenges/chunky/images/blog/src/jwks.json
src/inspect_evals/cybench/challenges/chunky/images/blog/src/admin/admin.py
src/inspect_evals/cybench/challenges/chunky/images/blog/src/blog_posts/blog_posts.py
src/inspect_evals/cybench/challenges/chunky/images/blog/src/templates/base.html
src/inspect_evals/cybench/challenges/chunky/images/blog/src/templates/create_post.html
src/inspect_evals/cybench/challenges/chunky/images/blog/src/templates/error.html
src/inspect_evals/cybench/challenges/chunky/images/blog/src/templates/home.html
src/inspect_evals/cybench/challenges/chunky/images/blog/src/templates/login.html
src/inspect_evals/cybench/challenges/chunky/images/blog/src/templates/post.html
src/inspect_evals/cybench/challenges/chunky/images/blog/src/templates/signup.html
src/inspect_evals/cybench/challenges/chunky/images/blog/src/users/users.py
src/inspect_evals/cybench/challenges/chunky/images/nginx/Dockerfile
src/inspect_evals/cybench/challenges/chunky/images/nginx/nginx.conf
src/inspect_evals/cybench/challenges/chunky/images/rcache/Dockerfile
src/inspect_evals/cybench/challenges/chunky/images/rcache/cache.go
src/inspect_evals/cybench/challenges/chunky/images/rcache/go.mod
src/inspect_evals/cybench/challenges/chunky/images/rcache/rcache.go
src/inspect_evals/cybench/challenges/chunky/solution/requirements.sh
src/inspect_evals/cybench/challenges/chunky/solution/solution.sh
src/inspect_evals/cybench/challenges/chunky/solution/solution.txt
src/inspect_evals/cybench/challenges/chunky/solution/solve.py
src/inspect_evals/cybench/challenges/crushing/compose.yaml
src/inspect_evals/cybench/challenges/crushing/eval.yaml
src/inspect_evals/cybench/challenges/crushing/resources/crush
src/inspect_evals/cybench/challenges/crushing/resources/message.txt.cz
src/inspect_evals/cybench/challenges/crushing/solution/solution.sh
src/inspect_evals/cybench/challenges/crushing/solution/solution.txt
src/inspect_evals/cybench/challenges/crushing/solution/solve.py
src/inspect_evals/cybench/challenges/data_siege/compose.yaml
src/inspect_evals/cybench/challenges/data_siege/eval.yaml
src/inspect_evals/cybench/challenges/data_siege/resources/capture.pcap
src/inspect_evals/cybench/challenges/delulu/compose.yaml
src/inspect_evals/cybench/challenges/delulu/eval.yaml
src/inspect_evals/cybench/challenges/delulu/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/delulu/images/victim/challenge/delulu
src/inspect_evals/cybench/challenges/delulu/images/victim/challenge/flag.txt
src/inspect_evals/cybench/challenges/delulu/images/victim/challenge/glibc/ld-linux-x86-64.so.2
src/inspect_evals/cybench/challenges/delulu/images/victim/challenge/glibc/libc.so.6
src/inspect_evals/cybench/challenges/delulu/solution/requirements.sh
src/inspect_evals/cybench/challenges/delulu/solution/solution.sh
src/inspect_evals/cybench/challenges/delulu/solution/solution.txt
src/inspect_evals/cybench/challenges/delulu/solution/solve.py
src/inspect_evals/cybench/challenges/diffecient/compose.yaml
src/inspect_evals/cybench/challenges/diffecient/eval.yaml
src/inspect_evals/cybench/challenges/diffecient/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/diffecient/images/victim/xinetd
src/inspect_evals/cybench/challenges/diffecient/images/victim/app/diffecient.py
src/inspect_evals/cybench/challenges/diffecient/images/victim/app/flag.py
src/inspect_evals/cybench/challenges/diffecient/solution/requirements.sh
src/inspect_evals/cybench/challenges/diffecient/solution/solution.sh
src/inspect_evals/cybench/challenges/diffecient/solution/solve.py
src/inspect_evals/cybench/challenges/dynastic/compose.yaml
src/inspect_evals/cybench/challenges/dynastic/eval.yaml
src/inspect_evals/cybench/challenges/dynastic/resources/output.txt
src/inspect_evals/cybench/challenges/dynastic/resources/source.py
src/inspect_evals/cybench/challenges/dynastic/solution/solution.sh
src/inspect_evals/cybench/challenges/dynastic/solution/solve.py
src/inspect_evals/cybench/challenges/eval_me/compose.yaml
src/inspect_evals/cybench/challenges/eval_me/eval.yaml
src/inspect_evals/cybench/challenges/eval_me/resources/capture.pcapng
src/inspect_evals/cybench/challenges/eval_me/resources/extract.sh
src/inspect_evals/cybench/challenges/eval_me/solution/requirements.sh
src/inspect_evals/cybench/challenges/eval_me/solution/solution.sh
src/inspect_evals/cybench/challenges/ezmaze/compose.yaml
src/inspect_evals/cybench/challenges/ezmaze/eval.yaml
src/inspect_evals/cybench/challenges/ezmaze/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/ezmaze/images/victim/xinetd
src/inspect_evals/cybench/challenges/ezmaze/images/victim/app/chall.py
src/inspect_evals/cybench/challenges/ezmaze/images/victim/app/flag.py
src/inspect_evals/cybench/challenges/ezmaze/solution/requirements.sh
src/inspect_evals/cybench/challenges/ezmaze/solution/solution.sh
src/inspect_evals/cybench/challenges/ezmaze/solution/solve.py
src/inspect_evals/cybench/challenges/failproof/compose.yaml
src/inspect_evals/cybench/challenges/failproof/eval.yaml
src/inspect_evals/cybench/challenges/failproof/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/failproof/images/victim/xinetd
src/inspect_evals/cybench/challenges/failproof/images/victim/app/flag.py
src/inspect_evals/cybench/challenges/failproof/images/victim/app/source.py
src/inspect_evals/cybench/challenges/failproof/solution/requirements.sh
src/inspect_evals/cybench/challenges/failproof/solution/solution.sh
src/inspect_evals/cybench/challenges/failproof/solution/solve.py
src/inspect_evals/cybench/challenges/flag_command/compose.yaml
src/inspect_evals/cybench/challenges/flag_command/eval.yaml
src/inspect_evals/cybench/challenges/flag_command/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/flag_command/images/victim/flag.txt
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/run.py
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/config.py
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/main.py
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/util.py
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/blueprints/routes.py
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/static/terminal/audio/typing_sound.mp3
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/static/terminal/css/commands.css
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/static/terminal/css/terminal.css
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/static/terminal/js/commands.js
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/static/terminal/js/game.js
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/static/terminal/js/main.js
src/inspect_evals/cybench/challenges/flag_command/images/victim/challenge/application/templates/main.html
src/inspect_evals/cybench/challenges/flag_command/images/victim/config/supervisord.conf
src/inspect_evals/cybench/challenges/flag_command/solution/requirements.sh
src/inspect_evals/cybench/challenges/flag_command/solution/solution.sh
src/inspect_evals/cybench/challenges/flag_command/solution/solution.txt
src/inspect_evals/cybench/challenges/flag_command/solution/solve.py
src/inspect_evals/cybench/challenges/flecks_of_gold/compose.yaml
src/inspect_evals/cybench/challenges/flecks_of_gold/eval.yaml
src/inspect_evals/cybench/challenges/flecks_of_gold/resources/flecks
src/inspect_evals/cybench/challenges/flecks_of_gold/solution/flecks_patched
src/inspect_evals/cybench/challenges/flecks_of_gold/solution/solution.sh
src/inspect_evals/cybench/challenges/flecks_of_gold/solution/solution.txt
src/inspect_evals/cybench/challenges/frog_waf/compose.yaml
src/inspect_evals/cybench/challenges/frog_waf/eval.yaml
src/inspect_evals/cybench/challenges/frog_waf/images/victim/.gitignore
src/inspect_evals/cybench/challenges/frog_waf/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/frog_waf/images/victim/build-docker.sh
src/inspect_evals/cybench/challenges/frog_waf/images/victim/build.gradle
src/inspect_evals/cybench/challenges/frog_waf/images/victim/flag.txt
src/inspect_evals/cybench/challenges/frog_waf/images/victim/gradlew
src/inspect_evals/cybench/challenges/frog_waf/images/victim/gradlew.bat
src/inspect_evals/cybench/challenges/frog_waf/images/victim/settings.gradle
src/inspect_evals/cybench/challenges/frog_waf/images/victim/gradle/wrapper/gradle-wrapper.jar
src/inspect_evals/cybench/challenges/frog_waf/images/victim/gradle/wrapper/gradle-wrapper.properties
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/Application.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/config/AppConfig.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/controller/contact/CheckCountry.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/controller/contact/Contact.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/controller/contact/ContactController.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/controller/contact/CountryValidator.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/controller/error/ErrorHandlingControllerAdvice.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/controller/error/ValidationErrorResponse.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/controller/error/Violation.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/waf/AttackTypes.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/waf/FrogWaf.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/java/com/sekai/app/waf/WafViolation.java
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/resources/application.yaml
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/resources/countries
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/resources/static/banner.txt
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/resources/static/favicon.png
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/resources/static/msfrog.png
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/resources/templates/index.html
src/inspect_evals/cybench/challenges/frog_waf/images/victim/src/main/resources/templates/waf.html
src/inspect_evals/cybench/challenges/frog_waf/solution/solution.sh
src/inspect_evals/cybench/challenges/frog_waf/solution/solve.py
src/inspect_evals/cybench/challenges/glacier_exchange/compose.yaml
src/inspect_evals/cybench/challenges/glacier_exchange/eval.yaml
src/inspect_evals/cybench/challenges/glacier_exchange/metadata.json
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/flag.txt
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/requirements.txt
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/server.py
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/wsgi.py
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/icons/ascoin.png
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/icons/cashout.png
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/icons/doge.png
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/icons/favicon.ico
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/icons/gamestock.png
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/icons/glaciercoin.png
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/icons/smtl.png
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/icons/ycmi.png
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/images/bg.jpg
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/images/convert-button.jpg
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/scripts/chart.component.js
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/scripts/index.js
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/assets/styles/main.css
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/src/coin_api.py
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/src/wallet.py
src/inspect_evals/cybench/challenges/glacier_exchange/images/victim/templates/index.html
src/inspect_evals/cybench/challenges/glacier_exchange/solution/flag.txt
src/inspect_evals/cybench/challenges/glacier_exchange/solution/requirements.sh
src/inspect_evals/cybench/challenges/glacier_exchange/solution/requirements.txt
src/inspect_evals/cybench/challenges/glacier_exchange/solution/solution.sh
src/inspect_evals/cybench/challenges/glacier_exchange/solution/solution.txt
src/inspect_evals/cybench/challenges/glacier_exchange/solution/solve.py
src/inspect_evals/cybench/challenges/it_has_begun/compose.yaml
src/inspect_evals/cybench/challenges/it_has_begun/eval.yaml
src/inspect_evals/cybench/challenges/it_has_begun/resources/script.sh
src/inspect_evals/cybench/challenges/it_has_begun/solution/solution.sh
src/inspect_evals/cybench/challenges/just_another_pickle_jail/compose.yaml
src/inspect_evals/cybench/challenges/just_another_pickle_jail/eval.yaml
src/inspect_evals/cybench/challenges/just_another_pickle_jail/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/just_another_pickle_jail/images/victim/src/chall.py
src/inspect_evals/cybench/challenges/just_another_pickle_jail/images/victim/src/flag
src/inspect_evals/cybench/challenges/just_another_pickle_jail/images/victim/src/my_pickle.py
src/inspect_evals/cybench/challenges/just_another_pickle_jail/resources/chall.py
src/inspect_evals/cybench/challenges/just_another_pickle_jail/resources/my_pickle.py
src/inspect_evals/cybench/challenges/just_another_pickle_jail/solution/gen_pkl.py
src/inspect_evals/cybench/challenges/just_another_pickle_jail/solution/requirements.sh
src/inspect_evals/cybench/challenges/just_another_pickle_jail/solution/solution.sh
src/inspect_evals/cybench/challenges/labyrinth_linguist/compose.yaml
src/inspect_evals/cybench/challenges/labyrinth_linguist/eval.yaml
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/entrypoint.sh
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/flag.txt
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/challenge/.gitignore
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/challenge/pom.xml
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/challenge/src/main/java/Main.java
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/challenge/src/main/resources/static/css/style.css
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/challenge/src/main/resources/static/font/Ancient_G_Written.ttf
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/challenge/src/main/resources/templates/index.html
src/inspect_evals/cybench/challenges/labyrinth_linguist/images/victim/config/supervisord.conf
src/inspect_evals/cybench/challenges/labyrinth_linguist/resources/pom.xml
src/inspect_evals/cybench/challenges/labyrinth_linguist/solution/flag.sh
src/inspect_evals/cybench/challenges/labyrinth_linguist/solution/flag.txt
src/inspect_evals/cybench/challenges/labyrinth_linguist/solution/requirements.sh
src/inspect_evals/cybench/challenges/labyrinth_linguist/solution/solution.sh
src/inspect_evals/cybench/challenges/labyrinth_linguist/solution/solution.txt
src/inspect_evals/cybench/challenges/labyrinth_linguist/solution/solver.py
src/inspect_evals/cybench/challenges/locktalk/compose.yaml
src/inspect_evals/cybench/challenges/locktalk/eval.yaml
src/inspect_evals/cybench/challenges/locktalk/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/config.py
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/run.py
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/__init__.py
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/__init__.py
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/routes.py
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/1.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/10.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/2.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/3.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/4.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/5.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/6.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/7.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/8.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/api/json/9.json
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/main/__init__.py
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/main/routes.py
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/main/static/css/.gitkeep
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/main/static/css/style.css
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/main/static/fonts/.gitkeep
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/main/static/images/.gitkeep
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/main/static/js/.gitkeep
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/main/static/js/main.js
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/main/templates/index.html
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/middleware/__init__.py
src/inspect_evals/cybench/challenges/locktalk/images/victim/challenge/app/middleware/middleware.py
src/inspect_evals/cybench/challenges/locktalk/images/victim/conf/haproxy.cfg
src/inspect_evals/cybench/challenges/locktalk/images/victim/conf/requirements.txt
src/inspect_evals/cybench/challenges/locktalk/images/victim/conf/supervisord.conf
src/inspect_evals/cybench/challenges/locktalk/images/victim/conf/uwsgi.ini
src/inspect_evals/cybench/challenges/locktalk/solution/flag.txt
src/inspect_evals/cybench/challenges/locktalk/solution/requirements.sh
src/inspect_evals/cybench/challenges/locktalk/solution/solution.sh
src/inspect_evals/cybench/challenges/locktalk/solution/solution.txt
src/inspect_evals/cybench/challenges/locktalk/solution/solve.py
src/inspect_evals/cybench/challenges/lootstash/compose.yaml
src/inspect_evals/cybench/challenges/lootstash/eval.yaml
src/inspect_evals/cybench/challenges/lootstash/resources/stash
src/inspect_evals/cybench/challenges/lootstash/solution/flag.txt
src/inspect_evals/cybench/challenges/lootstash/solution/solution.sh
src/inspect_evals/cybench/challenges/lootstash/solution/solution.txt
src/inspect_evals/cybench/challenges/missingbits/compose.yaml
src/inspect_evals/cybench/challenges/missingbits/eval.yaml
src/inspect_evals/cybench/challenges/missingbits/resources/ciphertext_message
src/inspect_evals/cybench/challenges/missingbits/resources/priv.key
src/inspect_evals/cybench/challenges/missingbits/solution/decode_file.py
src/inspect_evals/cybench/challenges/missingbits/solution/pem2hex.py
src/inspect_evals/cybench/challenges/missingbits/solution/requirements.sh
src/inspect_evals/cybench/challenges/missingbits/solution/solution.sh
src/inspect_evals/cybench/challenges/missingbits/solution/writeup.md
src/inspect_evals/cybench/challenges/network_tools/compose.yaml
src/inspect_evals/cybench/challenges/network_tools/eval.yaml
src/inspect_evals/cybench/challenges/network_tools/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/network_tools/images/victim/share/flag.txt
src/inspect_evals/cybench/challenges/network_tools/images/victim/share/nettools
src/inspect_evals/cybench/challenges/network_tools/images/victim/share/run.sh
src/inspect_evals/cybench/challenges/network_tools/resources/main.rs
src/inspect_evals/cybench/challenges/network_tools/resources/nettools
src/inspect_evals/cybench/challenges/network_tools/solution/flag.txt
src/inspect_evals/cybench/challenges/network_tools/solution/requirements.sh
src/inspect_evals/cybench/challenges/network_tools/solution/solution.sh
src/inspect_evals/cybench/challenges/network_tools/solution/solve_zafirr.py
src/inspect_evals/cybench/challenges/noisier_crc/compose.yaml
src/inspect_evals/cybench/challenges/noisier_crc/eval.yaml
src/inspect_evals/cybench/challenges/noisier_crc/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/noisier_crc/images/victim/xinetd
src/inspect_evals/cybench/challenges/noisier_crc/images/victim/app/chall.py
src/inspect_evals/cybench/challenges/noisier_crc/images/victim/app/flag.py
src/inspect_evals/cybench/challenges/noisier_crc/solution/flag.txt
src/inspect_evals/cybench/challenges/noisier_crc/solution/requirements.sh
src/inspect_evals/cybench/challenges/noisier_crc/solution/solution.sh
src/inspect_evals/cybench/challenges/noisier_crc/solution/solution.txt
src/inspect_evals/cybench/challenges/noisier_crc/solution/solve.py
src/inspect_evals/cybench/challenges/noisy_crc/compose.yaml
src/inspect_evals/cybench/challenges/noisy_crc/eval.yaml
src/inspect_evals/cybench/challenges/noisy_crc/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/noisy_crc/images/victim/xinetd
src/inspect_evals/cybench/challenges/noisy_crc/images/victim/app/chall.py
src/inspect_evals/cybench/challenges/noisy_crc/images/victim/app/flag.py
src/inspect_evals/cybench/challenges/noisy_crc/solution/new_solve.py
src/inspect_evals/cybench/challenges/noisy_crc/solution/requirements.sh
src/inspect_evals/cybench/challenges/noisy_crc/solution/solution.sh
src/inspect_evals/cybench/challenges/noisy_crc/solution/solution.txt
src/inspect_evals/cybench/challenges/packedaway/compose.yaml
src/inspect_evals/cybench/challenges/packedaway/eval.yaml
src/inspect_evals/cybench/challenges/packedaway/solution/packed
src/inspect_evals/cybench/challenges/packedaway/solution/requirements.sh
src/inspect_evals/cybench/challenges/packedaway/solution/solution.sh
src/inspect_evals/cybench/challenges/packedaway/solution/solution.txt
src/inspect_evals/cybench/challenges/packedaway/solution/upx
src/inspect_evals/cybench/challenges/partial_tenacity/compose.yaml
src/inspect_evals/cybench/challenges/partial_tenacity/eval.yaml
src/inspect_evals/cybench/challenges/partial_tenacity/challenge/output.txt
src/inspect_evals/cybench/challenges/partial_tenacity/challenge/source.py
src/inspect_evals/cybench/challenges/partial_tenacity/solution/flag.txt
src/inspect_evals/cybench/challenges/partial_tenacity/solution/requirements.sh
src/inspect_evals/cybench/challenges/partial_tenacity/solution/solution.py
src/inspect_evals/cybench/challenges/partial_tenacity/solution/solution.sh
src/inspect_evals/cybench/challenges/partial_tenacity/solution/solution.txt
src/inspect_evals/cybench/challenges/permuted/compose.yaml
src/inspect_evals/cybench/challenges/permuted/eval.yaml
src/inspect_evals/cybench/challenges/permuted/challenge/output.txt
src/inspect_evals/cybench/challenges/permuted/challenge/source.py
src/inspect_evals/cybench/challenges/permuted/solution/flag.txt
src/inspect_evals/cybench/challenges/permuted/solution/requirements.sh
src/inspect_evals/cybench/challenges/permuted/solution/solution.sh
src/inspect_evals/cybench/challenges/permuted/solution/solution.txt
src/inspect_evals/cybench/challenges/permuted/solution/solve.py
src/inspect_evals/cybench/challenges/primary_knowledge/compose.yaml
src/inspect_evals/cybench/challenges/primary_knowledge/eval.yaml
src/inspect_evals/cybench/challenges/primary_knowledge/challenge/output.txt
src/inspect_evals/cybench/challenges/primary_knowledge/challenge/source.py
src/inspect_evals/cybench/challenges/primary_knowledge/solution/output.txt
src/inspect_evals/cybench/challenges/primary_knowledge/solution/requirements.sh
src/inspect_evals/cybench/challenges/primary_knowledge/solution/solution.sh
src/inspect_evals/cybench/challenges/primary_knowledge/solution/solution.txt
src/inspect_evals/cybench/challenges/primary_knowledge/solution/solve.py
src/inspect_evals/cybench/challenges/randsubware/compose.yaml
src/inspect_evals/cybench/challenges/randsubware/eval.yaml
src/inspect_evals/cybench/challenges/randsubware/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/randsubware/images/victim/xinetd
src/inspect_evals/cybench/challenges/randsubware/images/victim/app/chall.py
src/inspect_evals/cybench/challenges/randsubware/images/victim/app/flag.py
src/inspect_evals/cybench/challenges/randsubware/resources/chall.py
src/inspect_evals/cybench/challenges/randsubware/solution/README.md
src/inspect_evals/cybench/challenges/randsubware/solution/requirements.sh
src/inspect_evals/cybench/challenges/randsubware/solution/solution.sh
src/inspect_evals/cybench/challenges/randsubware/solution/solution.txt
src/inspect_evals/cybench/challenges/randsubware/solution/solve.py
src/inspect_evals/cybench/challenges/randsubware/solution/solve_with_lib.py
src/inspect_evals/cybench/challenges/robust_cbc/compose.yaml
src/inspect_evals/cybench/challenges/robust_cbc/eval.yaml
src/inspect_evals/cybench/challenges/robust_cbc/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/robust_cbc/images/victim/xinetd
src/inspect_evals/cybench/challenges/robust_cbc/images/victim/app/server
src/inspect_evals/cybench/challenges/robust_cbc/solution/requirements.sh
src/inspect_evals/cybench/challenges/robust_cbc/solution/solution.sh
src/inspect_evals/cybench/challenges/robust_cbc/solution/solution.txt
src/inspect_evals/cybench/challenges/robust_cbc/solution/solve.py
src/inspect_evals/cybench/challenges/rpgo/compose.yaml
src/inspect_evals/cybench/challenges/rpgo/eval.yaml
src/inspect_evals/cybench/challenges/rpgo/resources/rpgo
src/inspect_evals/cybench/challenges/rpgo/solution/requirements.sh
src/inspect_evals/cybench/challenges/rpgo/solution/solution.sh
src/inspect_evals/cybench/challenges/rpgo/solution/solution.txt
src/inspect_evals/cybench/challenges/rpgo/solution/solve.py
src/inspect_evals/cybench/challenges/shuffled_aes/compose.yaml
src/inspect_evals/cybench/challenges/shuffled_aes/eval.yaml
src/inspect_evals/cybench/challenges/shuffled_aes/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/shuffled_aes/images/victim/chall/.gitignore
src/inspect_evals/cybench/challenges/shuffled_aes/images/victim/chall/Cargo.toml
src/inspect_evals/cybench/challenges/shuffled_aes/images/victim/chall/flag.txt
src/inspect_evals/cybench/challenges/shuffled_aes/images/victim/chall/src/aes_util.rs
src/inspect_evals/cybench/challenges/shuffled_aes/images/victim/chall/src/block_cipher.rs
src/inspect_evals/cybench/challenges/shuffled_aes/images/victim/chall/src/main.rs
src/inspect_evals/cybench/challenges/shuffled_aes/images/victim/chall/src/stream_cipher.rs
src/inspect_evals/cybench/challenges/shuffled_aes/resources/Cargo.toml
src/inspect_evals/cybench/challenges/shuffled_aes/resources/shuffled-aes.zip
src/inspect_evals/cybench/challenges/shuffled_aes/resources/src/aes_util.rs
src/inspect_evals/cybench/challenges/shuffled_aes/resources/src/block_cipher.rs
src/inspect_evals/cybench/challenges/shuffled_aes/resources/src/main.rs
src/inspect_evals/cybench/challenges/shuffled_aes/resources/src/stream_cipher.rs
src/inspect_evals/cybench/challenges/shuffled_aes/solution/README.md
src/inspect_evals/cybench/challenges/shuffled_aes/solution/aes_utils.py
src/inspect_evals/cybench/challenges/shuffled_aes/solution/exploit.py
src/inspect_evals/cybench/challenges/shuffled_aes/solution/requirements.sh
src/inspect_evals/cybench/challenges/shuffled_aes/solution/solution.sh
src/inspect_evals/cybench/challenges/skilift/compose.yaml
src/inspect_evals/cybench/challenges/skilift/eval.yaml
src/inspect_evals/cybench/challenges/skilift/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/skilift/images/victim/chall/Makefile
src/inspect_evals/cybench/challenges/skilift/images/victim/chall/sim_main.cpp
src/inspect_evals/cybench/challenges/skilift/images/victim/chall/top.v
src/inspect_evals/cybench/challenges/skilift/resources/top.v
src/inspect_evals/cybench/challenges/skilift/solution/requirements.sh
src/inspect_evals/cybench/challenges/skilift/solution/solution.sh
src/inspect_evals/cybench/challenges/skilift/solution/solve.py
src/inspect_evals/cybench/challenges/slcg/compose.yaml
src/inspect_evals/cybench/challenges/slcg/eval.yaml
src/inspect_evals/cybench/challenges/slcg/resources/ciphertext.txt
src/inspect_evals/cybench/challenges/slcg/resources/encrypt.py
src/inspect_evals/cybench/challenges/slcg/solution/ciphertext.py
src/inspect_evals/cybench/challenges/slcg/solution/requirements.sh
src/inspect_evals/cybench/challenges/slcg/solution/solution.sh
src/inspect_evals/cybench/challenges/slcg/solution/solve.py
src/inspect_evals/cybench/challenges/sop/compose.yaml
src/inspect_evals/cybench/challenges/sop/eval.yaml
src/inspect_evals/cybench/challenges/sop/resources/app
src/inspect_evals/cybench/challenges/sop/solution/flag.txt
src/inspect_evals/cybench/challenges/sop/solution/requirements.sh
src/inspect_evals/cybench/challenges/sop/solution/solution.sh
src/inspect_evals/cybench/challenges/sop/solution/solution.txt
src/inspect_evals/cybench/challenges/sop/solution/solve.py
src/inspect_evals/cybench/challenges/unbreakable/compose.yaml
src/inspect_evals/cybench/challenges/unbreakable/eval.yaml
src/inspect_evals/cybench/challenges/unbreakable/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/unbreakable/images/victim/challenge/flag.txt
src/inspect_evals/cybench/challenges/unbreakable/images/victim/challenge/main.py
src/inspect_evals/cybench/challenges/unbreakable/resources/flag.txt
src/inspect_evals/cybench/challenges/unbreakable/resources/main.py
src/inspect_evals/cybench/challenges/unbreakable/solution/requirements.sh
src/inspect_evals/cybench/challenges/unbreakable/solution/solution.sh
src/inspect_evals/cybench/challenges/unbreakable/solution/solve.py
src/inspect_evals/cybench/challenges/urgent/compose.yaml
src/inspect_evals/cybench/challenges/urgent/eval.yaml
src/inspect_evals/cybench/challenges/urgent/resources/forensics_urgent.zip
src/inspect_evals/cybench/challenges/urgent/solution/solution.sh
src/inspect_evals/cybench/challenges/urgent/solution/solution.txt
src/inspect_evals/cybench/challenges/urgent/solution/solve.py
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/compose.yaml
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/eval.yaml
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/images/walking-to-the-sea-side/Dockerfile
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/images/walking-to-the-sea-side/deployment/chal.sage
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/resources/chal.sage
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/solution/chal.sage
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/solution/flag.txt
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/solution/pure_python_solution.py
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/solution/requirements.sh
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/solution/solution.sh
src/inspect_evals/cybench/challenges/walking_to_the_sea_side/solution/solution.txt
src/inspect_evals/cybench/challenges/were_pickle_phreaks_revenge/compose.yaml
src/inspect_evals/cybench/challenges/were_pickle_phreaks_revenge/eval.yaml
src/inspect_evals/cybench/challenges/were_pickle_phreaks_revenge/images/victim/Dockerfile
src/inspect_evals/cybench/challenges/were_pickle_phreaks_revenge/images/victim/challenge/app.py
src/inspect_evals/cybench/challenges/were_pickle_phreaks_revenge/images/victim/challenge/flag.txt
src/inspect_evals/cybench/challenges/were_pickle_phreaks_revenge/images/victim/challenge/sandbox.py
src/inspect_evals/cybench/challenges/were_pickle_phreaks_revenge/solution/requirements.sh
src/inspect_evals/cybench/challenges/were_pickle_phreaks_revenge/solution/solution.sh
src/inspect_evals/cybench/challenges/were_pickle_phreaks_revenge/solution/solve.py
src/inspect_evals/cybergym/.gitignore
src/inspect_evals/cybergym/.noautolint
src/inspect_evals/cybergym/README.md
src/inspect_evals/cybergym/__init__.py
src/inspect_evals/cybergym/cybergym.py
src/inspect_evals/cybergym/dataset.py
src/inspect_evals/cybergym/eval.yaml
src/inspect_evals/cybergym/react_solver.py
src/inspect_evals/cybergym/scorers.py
src/inspect_evals/cybergym/solvers.py
src/inspect_evals/cybergym/task_template/README.template
src/inspect_evals/cybergym/task_template/compose.yml
src/inspect_evals/cybergym/task_template/submit.template
src/inspect_evals/cybergym/task_template/controller/Dockerfile
src/inspect_evals/cybergym/task_template/controller/server.py
src/inspect_evals/cybergym/task_template/executor/http_entrypoint.py
src/inspect_evals/cybermetric/README.md
src/inspect_evals/cybermetric/__init__.py
src/inspect_evals/cybermetric/cybermetric.py
src/inspect_evals/cybermetric/eval.yaml
src/inspect_evals/cyberseceval_2/.noautolint
src/inspect_evals/cyberseceval_2/README.md
src/inspect_evals/cyberseceval_2/__init__.py
src/inspect_evals/cyberseceval_2/eval.yaml
src/inspect_evals/cyberseceval_2/interpreter_abuse/dataset.py
src/inspect_evals/cyberseceval_2/interpreter_abuse/task.py
src/inspect_evals/cyberseceval_2/prompt_injection/dataset.py
src/inspect_evals/cyberseceval_2/prompt_injection/task.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/Dockerfile
src/inspect_evals/cyberseceval_2/vulnerability_exploit/dataset.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/scorers.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/task.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/__init__.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/base_test_generator.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/constraint_satisfaction/base_constraint_satisfaction_generator.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/constraint_satisfaction/c_generator.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/constraint_satisfaction/javascript_generator.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/constraint_satisfaction/python_generator.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/constraint_satisfaction/sqlite_generator.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/__init__.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/c_buffer_overflow_generator.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_double_free.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_dynamic_stack_alloc.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_format_string.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_hash_crc32.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_hash_crc32_hex.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_heap_buffer_overflow.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_integer_overflow_to_heap_overflow.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_iterator_invalidation.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_missleading_parse_map_2.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_missleading_state_machine_1.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_parse_map_1.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_parse_map_2.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_parse_tlv_1.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_parse_tlv_2.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_parse_tlv_3.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_parse_tlv_4.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_stack_buffer_overflow_1.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_stack_buffer_overflow_2.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_state_machine_1.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_state_machine_2.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_state_machine_3.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_string_constraints.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/example_use_after_free.cpp
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/examples.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/generator.py
src/inspect_evals/cyberseceval_2/vulnerability_exploit/challenges/memory_corruption/se_defines.h
src/inspect_evals/cyberseceval_3/.noautolint
src/inspect_evals/cyberseceval_3/README.md
src/inspect_evals/cyberseceval_3/__init__.py
src/inspect_evals/cyberseceval_3/eval.yaml
src/inspect_evals/cyberseceval_3/visual_prompt_injection/dataset.py
src/inspect_evals/cyberseceval_3/visual_prompt_injection/task.py
src/inspect_evals/docvqa/README.md
src/inspect_evals/docvqa/__init__.py
src/inspect_evals/docvqa/docvqa.py
src/inspect_evals/docvqa/eval.yaml
src/inspect_evals/drop/README.md
src/inspect_evals/drop/__init__.py
src/inspect_evals/drop/drop.py
src/inspect_evals/drop/eval.yaml
src/inspect_evals/ds1000/Dockerfile
src/inspect_evals/ds1000/README.md
src/inspect_evals/ds1000/__init__.py
src/inspect_evals/ds1000/compose.yaml
src/inspect_evals/ds1000/docker-requirements.txt
src/inspect_evals/ds1000/ds1000.py
src/inspect_evals/ds1000/eval.yaml
src/inspect_evals/fortress/README.md
src/inspect_evals/fortress/__init__.py
src/inspect_evals/fortress/cli_helper.py
src/inspect_evals/fortress/data.py
src/inspect_evals/fortress/eval.yaml
src/inspect_evals/fortress/fortress.py
src/inspect_evals/fortress/scorer.py
src/inspect_evals/frontier_cs/Dockerfile
src/inspect_evals/frontier_cs/README.md
src/inspect_evals/frontier_cs/__init__.py
src/inspect_evals/frontier_cs/compose.yaml
src/inspect_evals/frontier_cs/dataset.py
src/inspect_evals/frontier_cs/eval.yaml
src/inspect_evals/frontier_cs/frontier_cs.py
src/inspect_evals/frontier_cs/scorer.py
src/inspect_evals/frontierscience/README.md
src/inspect_evals/frontierscience/__init__.py
src/inspect_evals/frontierscience/eval.yaml
src/inspect_evals/frontierscience/frontierscience.py
src/inspect_evals/gaia/README.md
src/inspect_evals/gaia/__init__.py
src/inspect_evals/gaia/compose.yaml
src/inspect_evals/gaia/dataset.py
src/inspect_evals/gaia/eval.yaml
src/inspect_evals/gaia/gaia.py
src/inspect_evals/gaia/scorer.py
src/inspect_evals/gaia/values.yaml
src/inspect_evals/gdm_capabilities/README.md
src/inspect_evals/gdm_in_house_ctf/README.md
src/inspect_evals/gdm_in_house_ctf/__init__.py
src/inspect_evals/gdm_in_house_ctf/dataset.py
src/inspect_evals/gdm_in_house_ctf/eval.yaml
src/inspect_evals/gdm_in_house_ctf/gdm_in_house_ctf.py
src/inspect_evals/gdm_in_house_ctf/challenges/.noautolint
src/inspect_evals/gdm_in_house_ctf/challenges/challenges.json
src/inspect_evals/gdm_in_house_ctf/challenges/attacker/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/attacker/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/attacker/short_password.lst
src/inspect_evals/gdm_in_house_ctf/challenges/attacker/short_user.lst
src/inspect_evals/gdm_in_house_ctf/challenges/cmd_injection/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/cmd_injection/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/cmd_injection/app.py
src/inspect_evals/gdm_in_house_ctf/challenges/cmd_injection/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/cmd_injection/requirements.txt
src/inspect_evals/gdm_in_house_ctf/challenges/cmd_injection/templates/base.jinja2
src/inspect_evals/gdm_in_house_ctf/challenges/cmd_injection/templates/network.jinja2
src/inspect_evals/gdm_in_house_ctf/challenges/db_3/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/db_3/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/db_3/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/db_3/db.sql
src/inspect_evals/gdm_in_house_ctf/challenges/grafana/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/grafana/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/grafana/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/idor/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/idor/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/idor/app.py
src/inspect_evals/gdm_in_house_ctf/challenges/idor/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/idor/requirements.txt
src/inspect_evals/gdm_in_house_ctf/challenges/idor/templates/accounts.jinja2
src/inspect_evals/gdm_in_house_ctf/challenges/idor/templates/base.jinja2
src/inspect_evals/gdm_in_house_ctf/challenges/idor/templates/creds.jinja2
src/inspect_evals/gdm_in_house_ctf/challenges/idor/templates/home.jinja2
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_7z/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_7z/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_7z/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_bash/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_bash/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_bash/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_find/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_find/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_find/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_sed/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_sed/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/privesc_sed/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/pw_reuse/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/pw_reuse/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/pw_reuse/db.Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/pw_reuse/db.sql
src/inspect_evals/gdm_in_house_ctf/challenges/pw_reuse/ssh.Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/spray/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/spray/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/spray/app.py
src/inspect_evals/gdm_in_house_ctf/challenges/spray/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/spray/requirements.txt
src/inspect_evals/gdm_in_house_ctf/challenges/spray/templates/login.jinja2
src/inspect_evals/gdm_in_house_ctf/challenges/sqli/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/sqli/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/sqli/app.db
src/inspect_evals/gdm_in_house_ctf/challenges/sqli/app.py
src/inspect_evals/gdm_in_house_ctf/challenges/sqli/app.sql
src/inspect_evals/gdm_in_house_ctf/challenges/sqli/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/sqli/requirements.txt
src/inspect_evals/gdm_in_house_ctf/challenges/ssh/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/ssh/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/ssh/compose.yaml
src/inspect_evals/gdm_in_house_ctf/challenges/webmin/Dockerfile
src/inspect_evals/gdm_in_house_ctf/challenges/webmin/README.md
src/inspect_evals/gdm_in_house_ctf/challenges/webmin/compose.yaml
src/inspect_evals/gdm_intercode_ctf/.gitignore
src/inspect_evals/gdm_intercode_ctf/Dockerfile.template
src/inspect_evals/gdm_intercode_ctf/README.md
src/inspect_evals/gdm_intercode_ctf/__init__.py
src/inspect_evals/gdm_intercode_ctf/compose.yaml
src/inspect_evals/gdm_intercode_ctf/dataset.py
src/inspect_evals/gdm_intercode_ctf/eval.yaml
src/inspect_evals/gdm_intercode_ctf/gdm_intercode_ctf.py
src/inspect_evals/gdm_self_proliferation/.gitignore
src/inspect_evals/gdm_self_proliferation/.noautolint
src/inspect_evals/gdm_self_proliferation/README.md
src/inspect_evals/gdm_self_proliferation/__init__.py
src/inspect_evals/gdm_self_proliferation/approval.yaml
src/inspect_evals/gdm_self_proliferation/custom_solvers.py
src/inspect_evals/gdm_self_proliferation/custom_tools.py
src/inspect_evals/gdm_self_proliferation/data_setup.py
src/inspect_evals/gdm_self_proliferation/dataset.py
src/inspect_evals/gdm_self_proliferation/eval.yaml
src/inspect_evals/gdm_self_proliferation/gdm_self_proliferation.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/common.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp01.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp02.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp03.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp04.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp05.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp07.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp08.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp09.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp10.py
src/inspect_evals/gdm_self_proliferation/custom_scorers/sp12.py
src/inspect_evals/gdm_self_proliferation/data/challenge_variables.json
src/inspect_evals/gdm_self_proliferation/data/challenges.json
src/inspect_evals/gdm_self_proliferation/data/replace_placeholder.py
src/inspect_evals/gdm_self_proliferation/data/setup.py
src/inspect_evals/gdm_self_proliferation/data/base/Dockerfile
src/inspect_evals/gdm_self_proliferation/data/gcp_setup/README.md
src/inspect_evals/gdm_self_proliferation/data/gmail/gmail.Dockerfile
src/inspect_evals/gdm_self_proliferation/data/gmail/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp01/README.md
src/inspect_evals/gdm_self_proliferation/data/sp01/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp01/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp01/subtasks.json
src/inspect_evals/gdm_self_proliferation/data/sp02/README.md
src/inspect_evals/gdm_self_proliferation/data/sp02/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp02/gcp.Dockerfile
src/inspect_evals/gdm_self_proliferation/data/sp02/keyfile_template.json
src/inspect_evals/gdm_self_proliferation/data/sp02/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp02/subtasks.json
src/inspect_evals/gdm_self_proliferation/data/sp02/variables.json
src/inspect_evals/gdm_self_proliferation/data/sp03/README.md
src/inspect_evals/gdm_self_proliferation/data/sp03/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp03/local_llm.Dockerfile
src/inspect_evals/gdm_self_proliferation/data/sp03/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp03/subtasks.json
src/inspect_evals/gdm_self_proliferation/data/sp04/README.md
src/inspect_evals/gdm_self_proliferation/data/sp04/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp04/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp04/subtasks.json
src/inspect_evals/gdm_self_proliferation/data/sp05/README.md
src/inspect_evals/gdm_self_proliferation/data/sp05/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp05/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp05/subtasks.json
src/inspect_evals/gdm_self_proliferation/data/sp05/variables.json
src/inspect_evals/gdm_self_proliferation/data/sp07/README.md
src/inspect_evals/gdm_self_proliferation/data/sp07/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp07/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp07/subtasks.json
src/inspect_evals/gdm_self_proliferation/data/sp07/variables.json
src/inspect_evals/gdm_self_proliferation/data/sp08/README.md
src/inspect_evals/gdm_self_proliferation/data/sp08/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp08/subtasks.json
src/inspect_evals/gdm_self_proliferation/data/sp09/README.md
src/inspect_evals/gdm_self_proliferation/data/sp09/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp09/eval_stub._py
src/inspect_evals/gdm_self_proliferation/data/sp09/mistral.Dockerfile
src/inspect_evals/gdm_self_proliferation/data/sp09/mistral_ref._py
src/inspect_evals/gdm_self_proliferation/data/sp09/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp09/subtasks.json
src/inspect_evals/gdm_self_proliferation/data/sp10/README.md
src/inspect_evals/gdm_self_proliferation/data/sp10/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp10/distil.Dockerfile
src/inspect_evals/gdm_self_proliferation/data/sp10/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp10/subtasks.json
src/inspect_evals/gdm_self_proliferation/data/sp10/variables.json
src/inspect_evals/gdm_self_proliferation/data/sp12/README.md
src/inspect_evals/gdm_self_proliferation/data/sp12/compose.yaml
src/inspect_evals/gdm_self_proliferation/data/sp12/eval_data._py
src/inspect_evals/gdm_self_proliferation/data/sp12/sampling._py
src/inspect_evals/gdm_self_proliferation/data/sp12/secrets.zip
src/inspect_evals/gdm_self_proliferation/data/sp12/strategy_llm.pdf
src/inspect_evals/gdm_self_proliferation/data/sp12/strategy_llm.tex
src/inspect_evals/gdm_self_proliferation/data/sp12/strategyllm.Dockerfile
src/inspect_evals/gdm_self_proliferation/data/sp12/subtasks.json
src/inspect_evals/gdm_self_reasoning/README.md
src/inspect_evals/gdm_self_reasoning/__init__.py
src/inspect_evals/gdm_self_reasoning/common_tools.py
src/inspect_evals/gdm_self_reasoning/constants.py
src/inspect_evals/gdm_self_reasoning/eval.yaml
src/inspect_evals/gdm_self_reasoning/evaluation_utils.py
src/inspect_evals/gdm_self_reasoning/gdm_self_reasoning.py
src/inspect_evals/gdm_self_reasoning/solver_pipeline.py
src/inspect_evals/gdm_self_reasoning/data/.noautolint
src/inspect_evals/gdm_self_reasoning/data/build_images.sh
src/inspect_evals/gdm_self_reasoning/data/challenge_details.py
src/inspect_evals/gdm_self_reasoning/data/challenges.json
src/inspect_evals/gdm_self_reasoning/data/dataset.py
src/inspect_evals/gdm_self_reasoning/data/generate_challenges_json.py
src/inspect_evals/gdm_self_reasoning/data/registry.py
src/inspect_evals/gdm_self_reasoning/data/approved_directories/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/approved_directories/agent.py
src/inspect_evals/gdm_self_reasoning/data/approved_directories/allowed_directories_and_files.txt
src/inspect_evals/gdm_self_reasoning/data/approved_directories/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/approved_directories/config.json
src/inspect_evals/gdm_self_reasoning/data/approved_directories/watch_permissions.sh
src/inspect_evals/gdm_self_reasoning/data/base/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/base/agentpy_shared.py
src/inspect_evals/gdm_self_reasoning/data/calculator_improvement/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/calculator_improvement/agent.py
src/inspect_evals/gdm_self_reasoning/data/calculator_improvement/calculator.py
src/inspect_evals/gdm_self_reasoning/data/calculator_improvement/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/calculator_improvement/config.json
src/inspect_evals/gdm_self_reasoning/data/calculator_improvement/problems.txt
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_instrumental_only/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_instrumental_only/agent.py
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_instrumental_only/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_instrumental_only/config.json
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_instrumental_only/setup_task.sh
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_instrumental_only/wordlist.txt
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_irreversibility_only/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_irreversibility_only/agent.py
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_irreversibility_only/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/context_length_mod_irreversibility_only/config.json
src/inspect_evals/gdm_self_reasoning/data/database_tool/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/database_tool/agent.py
src/inspect_evals/gdm_self_reasoning/data/database_tool/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/database_tool/config.json
src/inspect_evals/gdm_self_reasoning/data/latency_calculator/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/latency_calculator/agent.py
src/inspect_evals/gdm_self_reasoning/data/latency_calculator/calculator.py
src/inspect_evals/gdm_self_reasoning/data/latency_calculator/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/latency_calculator/config.json
src/inspect_evals/gdm_self_reasoning/data/latency_calculator/problems.txt
src/inspect_evals/gdm_self_reasoning/data/latency_calculator/timer.py
src/inspect_evals/gdm_self_reasoning/data/max_messages_calculator/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/max_messages_calculator/agent.py
src/inspect_evals/gdm_self_reasoning/data/max_messages_calculator/calculator.py
src/inspect_evals/gdm_self_reasoning/data/max_messages_calculator/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/max_messages_calculator/config.json
src/inspect_evals/gdm_self_reasoning/data/max_messages_calculator/problems.txt
src/inspect_evals/gdm_self_reasoning/data/max_tokens/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/max_tokens/agent.py
src/inspect_evals/gdm_self_reasoning/data/max_tokens/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/max_tokens/config.json
src/inspect_evals/gdm_self_reasoning/data/max_tokens/game_of_life_tests.py
src/inspect_evals/gdm_self_reasoning/data/oversight_frequency/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/oversight_frequency/agent.py
src/inspect_evals/gdm_self_reasoning/data/oversight_frequency/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/oversight_frequency/config.json
src/inspect_evals/gdm_self_reasoning/data/oversight_frequency/project4.txt
src/inspect_evals/gdm_self_reasoning/data/oversight_frequency/project4_inputs.txt
src/inspect_evals/gdm_self_reasoning/data/read_logs/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/read_logs/agent.py
src/inspect_evals/gdm_self_reasoning/data/read_logs/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/read_logs/config.json
src/inspect_evals/gdm_self_reasoning/data/read_logs/job_notes.txt
src/inspect_evals/gdm_self_reasoning/data/read_logs/practice_questions.txt
src/inspect_evals/gdm_self_reasoning/data/read_logs/todo.txt
src/inspect_evals/gdm_self_reasoning/data/read_logs/website.html
src/inspect_evals/gdm_self_reasoning/data/read_logs/job_hunt/ads.txt
src/inspect_evals/gdm_self_reasoning/data/read_logs/job_hunt/cv.md
src/inspect_evals/gdm_self_reasoning/data/read_logs/logs/0.log
src/inspect_evals/gdm_self_reasoning/data/read_logs/logs/1.log
src/inspect_evals/gdm_self_reasoning/data/read_logs/logs/2.log
src/inspect_evals/gdm_self_reasoning/data/read_logs/logs/3.log
src/inspect_evals/gdm_self_reasoning/data/turn_off_filters/Dockerfile
src/inspect_evals/gdm_self_reasoning/data/turn_off_filters/agent.py
src/inspect_evals/gdm_self_reasoning/data/turn_off_filters/blocklisted_words.txt
src/inspect_evals/gdm_self_reasoning/data/turn_off_filters/compose.yaml
src/inspect_evals/gdm_self_reasoning/data/turn_off_filters/config.json
src/inspect_evals/gdm_self_reasoning/solvers/approved_directories.py
src/inspect_evals/gdm_self_reasoning/solvers/calculator_improvement.py
src/inspect_evals/gdm_self_reasoning/solvers/context_length_mod_instrumental_only.py
src/inspect_evals/gdm_self_reasoning/solvers/context_length_mod_irreversibility_only.py
src/inspect_evals/gdm_self_reasoning/solvers/database_tool.py
src/inspect_evals/gdm_self_reasoning/solvers/latency_calculator.py
src/inspect_evals/gdm_self_reasoning/solvers/max_messages_calculator.py
src/inspect_evals/gdm_self_reasoning/solvers/max_tokens.py
src/inspect_evals/gdm_self_reasoning/solvers/oversight_frequency.py
src/inspect_evals/gdm_self_reasoning/solvers/read_logs.py
src/inspect_evals/gdm_self_reasoning/solvers/turn_off_filters.py
src/inspect_evals/gdm_stealth/Dockerfile
src/inspect_evals/gdm_stealth/README.md
src/inspect_evals/gdm_stealth/__init__.py
src/inspect_evals/gdm_stealth/compose.yaml
src/inspect_evals/gdm_stealth/constants.py
src/inspect_evals/gdm_stealth/eval.yaml
src/inspect_evals/gdm_stealth/gdm_stealth.py
src/inspect_evals/gdm_stealth/rater_lib.py
src/inspect_evals/gdm_stealth/scorer_lib.py
src/inspect_evals/gdm_stealth/solver_pipeline.py
src/inspect_evals/gdm_stealth/data/dataset.py
src/inspect_evals/gdm_stealth/data/ai_secretary/email_tasks_confidential_subject.txt
src/inspect_evals/gdm_stealth/data/ai_secretary/email_tasks_content_rater.txt
src/inspect_evals/gdm_stealth/data/ai_secretary/email_tasks_email_domain.txt
src/inspect_evals/gdm_stealth/data/ai_secretary/generate.py
src/inspect_evals/gdm_stealth/data/ai_secretary/register.json
src/inspect_evals/gdm_stealth/data/ai_secretary/registry.py
src/inspect_evals/gdm_stealth/data/cover_your_tracks/generate.py
src/inspect_evals/gdm_stealth/data/cover_your_tracks/register.json
src/inspect_evals/gdm_stealth/data/cover_your_tracks/registry.py
src/inspect_evals/gdm_stealth/data/strategic_rule_breaking/generate_registry.py
src/inspect_evals/gdm_stealth/data/strategic_rule_breaking/register.json
src/inspect_evals/gdm_stealth/data/strategic_rule_breaking/registry.py
src/inspect_evals/gdm_stealth/solver/ai_secretary/checker.py
src/inspect_evals/gdm_stealth/solver/ai_secretary/criteria.py
src/inspect_evals/gdm_stealth/solver/ai_secretary/tools.py
src/inspect_evals/gdm_stealth/solver/cover_your_tracks/checker.py
src/inspect_evals/gdm_stealth/solver/cover_your_tracks/database.py
src/inspect_evals/gdm_stealth/solver/cover_your_tracks/email.csv
src/inspect_evals/gdm_stealth/solver/cover_your_tracks/tools.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/customer_support_env.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/env_records.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/global_defs.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/handbook.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/llm_sampling.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/oversight.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/prompt.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/tasks.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/ticket.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/tickets.json
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/tools.py
src/inspect_evals/gdm_stealth/solver/strategic_rule_breaking/users.json
src/inspect_evals/gdpval/.gitignore
src/inspect_evals/gdpval/Dockerfile
src/inspect_evals/gdpval/README.md
src/inspect_evals/gdpval/__init__.py
src/inspect_evals/gdpval/build_docker_image.sh
src/inspect_evals/gdpval/docker-requirements.txt
src/inspect_evals/gdpval/eval.yaml
src/inspect_evals/gdpval/extract_output.py
src/inspect_evals/gdpval/gdpval.py
src/inspect_evals/gdpval/hooks.py
src/inspect_evals/gdpval/util.py
src/inspect_evals/gpqa/README.md
src/inspect_evals/gpqa/__init__.py
src/inspect_evals/gpqa/eval.yaml
src/inspect_evals/gpqa/gpqa.py
src/inspect_evals/gsm8k/README.md
src/inspect_evals/gsm8k/__init__.py
src/inspect_evals/gsm8k/eval.yaml
src/inspect_evals/gsm8k/gsm8k.py
src/inspect_evals/healthbench/.noautolint
src/inspect_evals/healthbench/README.md
src/inspect_evals/healthbench/__init__.py
src/inspect_evals/healthbench/dataset.py
src/inspect_evals/healthbench/eval.yaml
src/inspect_evals/healthbench/healthbench.py
src/inspect_evals/healthbench/meta_evaluation.py
src/inspect_evals/healthbench/scorer.py
src/inspect_evals/healthbench/types.py
src/inspect_evals/hellaswag/README.md
src/inspect_evals/hellaswag/__init__.py
src/inspect_evals/hellaswag/eval.yaml
src/inspect_evals/hellaswag/hellaswag.py
src/inspect_evals/hle/README.md
src/inspect_evals/hle/__init__.py
src/inspect_evals/hle/eval.yaml
src/inspect_evals/hle/hle.py
src/inspect_evals/hle/judge.py
src/inspect_evals/humaneval/README.md
src/inspect_evals/humaneval/__init__.py
src/inspect_evals/humaneval/eval.yaml
src/inspect_evals/humaneval/humaneval.py
src/inspect_evals/ifeval/README.md
src/inspect_evals/ifeval/__init__.py
src/inspect_evals/ifeval/eval.yaml
src/inspect_evals/ifeval/ifeval.py
src/inspect_evals/ifevalcode/.noautolint
src/inspect_evals/ifevalcode/Dockerfile
src/inspect_evals/ifevalcode/README.md
src/inspect_evals/ifevalcode/__init__.py
src/inspect_evals/ifevalcode/compose.yaml
src/inspect_evals/ifevalcode/eval.yaml
src/inspect_evals/ifevalcode/ifevalcode.py
src/inspect_evals/ifevalcode/scorer.py
src/inspect_evals/ifevalcode/solver.py
src/inspect_evals/ifevalcode/utils.py
src/inspect_evals/infinite_bench/README.md
src/inspect_evals/infinite_bench/__init__.py
src/inspect_evals/infinite_bench/constants.py
src/inspect_evals/infinite_bench/eval.yaml
src/inspect_evals/infinite_bench/infinite_bench.py
src/inspect_evals/infinite_bench/math_calc_scorer.py
src/inspect_evals/infinite_bench/truncate_input.py
src/inspect_evals/infinite_bench/utils.py
src/inspect_evals/instrumentaleval/README.md
src/inspect_evals/instrumentaleval/__init__.py
src/inspect_evals/instrumentaleval/eval.yaml
src/inspect_evals/instrumentaleval/instrumentaleval.py
src/inspect_evals/instrumentaleval/metric.py
src/inspect_evals/instrumentaleval/process_data.py
src/inspect_evals/instrumentaleval/prompt.py
src/inspect_evals/instrumentaleval/scorer.py
src/inspect_evals/instrumentaleval/utils.py
src/inspect_evals/kernelbench/Dockerfile
src/inspect_evals/kernelbench/README.md
src/inspect_evals/kernelbench/__init__.py
src/inspect_evals/kernelbench/_defaults.py
src/inspect_evals/kernelbench/_types.py
src/inspect_evals/kernelbench/compose.yaml
src/inspect_evals/kernelbench/dataset.py
src/inspect_evals/kernelbench/eval.yaml
src/inspect_evals/kernelbench/eval_runner.py
src/inspect_evals/kernelbench/kernelbench.py
src/inspect_evals/kernelbench/scorer.py
src/inspect_evals/kernelbench/solver.py
src/inspect_evals/kernelbench/utils.py
src/inspect_evals/lab_bench/README.md
src/inspect_evals/lab_bench/__init__.py
src/inspect_evals/lab_bench/eval.yaml
src/inspect_evals/lab_bench/lab_bench.py
src/inspect_evals/lab_bench/metrics.py
src/inspect_evals/lab_bench/record_to_sample_helpers.py
src/inspect_evals/lab_bench/scorer.py
src/inspect_evals/lab_bench/utils.py
src/inspect_evals/lingoly/README.md
src/inspect_evals/lingoly/__init__.py
src/inspect_evals/lingoly/dataset.py
src/inspect_evals/lingoly/eval.yaml
src/inspect_evals/lingoly/lingoly.py
src/inspect_evals/lingoly/metrics.py
src/inspect_evals/livebench/README.md
src/inspect_evals/livebench/__init__.py
src/inspect_evals/livebench/eval.yaml
src/inspect_evals/livebench/instruction_following.py
src/inspect_evals/livebench/livebench.py
src/inspect_evals/livebench/scorer.py
src/inspect_evals/livebench/utils.py
src/inspect_evals/livecodebench_pro/README.md
src/inspect_evals/livecodebench_pro/__init__.py
src/inspect_evals/livecodebench_pro/compose.yaml
src/inspect_evals/livecodebench_pro/dataset.py
src/inspect_evals/livecodebench_pro/eval.yaml
src/inspect_evals/livecodebench_pro/judge.py
src/inspect_evals/livecodebench_pro/livecodebench_pro.py
src/inspect_evals/livecodebench_pro/util.py
src/inspect_evals/make_me_pay/README.md
src/inspect_evals/make_me_pay/__init__.py
src/inspect_evals/make_me_pay/eval.yaml
src/inspect_evals/make_me_pay/make_me_pay.py
src/inspect_evals/make_me_pay/scorer.py
src/inspect_evals/make_me_pay/solver.py
src/inspect_evals/make_me_pay/task_descriptions.py
src/inspect_evals/make_me_pay/utils.py
src/inspect_evals/makemesay/README.md
src/inspect_evals/makemesay/__init__.py
src/inspect_evals/makemesay/eval.yaml
src/inspect_evals/makemesay/game.py
src/inspect_evals/makemesay/makemesay.py
src/inspect_evals/makemesay/prompts.py
src/inspect_evals/makemesay/scorer.py
src/inspect_evals/makemesay/solver.py
src/inspect_evals/makemesay/utils.py
src/inspect_evals/mask/README.md
src/inspect_evals/mask/__init__.py
src/inspect_evals/mask/appendix.md
src/inspect_evals/mask/classify.py
src/inspect_evals/mask/compat.py
src/inspect_evals/mask/data.py
src/inspect_evals/mask/eval.yaml
src/inspect_evals/mask/mask.py
src/inspect_evals/mask/models.py
src/inspect_evals/mask/scorer.py
src/inspect_evals/mask/solver.py
src/inspect_evals/mask/types.py
src/inspect_evals/mask/utils.py
src/inspect_evals/mask/assets/records_by_config.png
src/inspect_evals/mask/judge_estimate/orchestrate.py
src/inspect_evals/mask/judge_estimate/parse.py
src/inspect_evals/mask/metrics/core.py
src/inspect_evals/mask/metrics/statistical_summary.py
src/inspect_evals/mask/metrics/utils.py
src/inspect_evals/mask/prompts/builder.py
src/inspect_evals/mask/prompts/prompts.py
src/inspect_evals/math/README.md
src/inspect_evals/math/__init__.py
src/inspect_evals/math/eval.yaml
src/inspect_evals/math/math.py
src/inspect_evals/math/utils.py
src/inspect_evals/mathvista/README.md
src/inspect_evals/mathvista/__init__.py
src/inspect_evals/mathvista/eval.yaml
src/inspect_evals/mathvista/example.png
src/inspect_evals/mathvista/mathvista.py
src/inspect_evals/mbpp/README.md
src/inspect_evals/mbpp/__init__.py
src/inspect_evals/mbpp/eval.yaml
src/inspect_evals/mbpp/mbpp.py
src/inspect_evals/medqa/README.md
src/inspect_evals/medqa/__init__.py
src/inspect_evals/medqa/eval.yaml
src/inspect_evals/medqa/medqa.py
src/inspect_evals/medqa/huggingface_artifact/bigbiohub.py
src/inspect_evals/medqa/huggingface_artifact/med_qa.py
src/inspect_evals/mgsm/README.md
src/inspect_evals/mgsm/__init__.py
src/inspect_evals/mgsm/eval.yaml
src/inspect_evals/mgsm/mgsm.py
src/inspect_evals/mind2web/README.md
src/inspect_evals/mind2web/__init__.py
src/inspect_evals/mind2web/dataset.py
src/inspect_evals/mind2web/eval.yaml
src/inspect_evals/mind2web/llm_prompt.json
src/inspect_evals/mind2web/mind2web.py
src/inspect_evals/mind2web/prompts.py
src/inspect_evals/mind2web/scorer.py
src/inspect_evals/mind2web/solver.py
src/inspect_evals/mind2web/utils.py
src/inspect_evals/mind2web_sc/README.md
src/inspect_evals/mind2web_sc/__init__.py
src/inspect_evals/mind2web_sc/compose.yaml
src/inspect_evals/mind2web_sc/eval.yaml
src/inspect_evals/mind2web_sc/guardrail_executor.py
src/inspect_evals/mind2web_sc/mind2web_sc.py
src/inspect_evals/mind2web_sc/scorer.py
src/inspect_evals/mind2web_sc/solver.py
src/inspect_evals/mind2web_sc/utils.py
src/inspect_evals/mind2web_sc/data/seeact/sample_labeled_all.json
src/inspect_evals/mind2web_sc/original_guardagent/prompts_guard.py
src/inspect_evals/mind2web_sc/original_guardagent/request_seeact.py
src/inspect_evals/mle_bench/Dockerfile
src/inspect_evals/mle_bench/README.md
src/inspect_evals/mle_bench/__init__.py
src/inspect_evals/mle_bench/eval.yaml
src/inspect_evals/mle_bench/instructions.txt
src/inspect_evals/mle_bench/mle_bench.py
src/inspect_evals/mle_bench/tos.py
src/inspect_evals/mle_bench/splits/all.txt
src/inspect_evals/mle_bench/splits/dev.txt
src/inspect_evals/mle_bench/splits/high.txt
src/inspect_evals/mle_bench/splits/low.txt
src/inspect_evals/mle_bench/splits/medium.txt
src/inspect_evals/mle_bench/splits/spaceship-titanic.txt
src/inspect_evals/mle_bench/tests/dummy_agent_main.py
src/inspect_evals/mle_bench/tests/mle_tests.py
src/inspect_evals/mlrc_bench/Dockerfile
src/inspect_evals/mlrc_bench/README.md
src/inspect_evals/mlrc_bench/__init__.py
src/inspect_evals/mlrc_bench/action_handler.py
src/inspect_evals/mlrc_bench/action_typing.py
src/inspect_evals/mlrc_bench/dataset.py
src/inspect_evals/mlrc_bench/eval.yaml
src/inspect_evals/mlrc_bench/mlrc_bench.py
src/inspect_evals/mlrc_bench/mlrc_high_level_actions.py
src/inspect_evals/mlrc_bench/mlrc_low_level_actions.py
src/inspect_evals/mlrc_bench/prompts.py
src/inspect_evals/mlrc_bench/sandbox_io.py
src/inspect_evals/mlrc_bench/scorer.py
src/inspect_evals/mlrc_bench/tool_utils.py
src/inspect_evals/mlrc_bench/utils.py
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/.noautolint
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/MLAgentBench/utils.py
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/llm-merging/env/evaluation.py
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/llm-merging/env/methods/BaseMethod.py
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/llm-merging/scripts/environment.yml
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/machine-unlearning/env/evaluation.py
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/machine-unlearning/scripts/environment.yml
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/meta-learning/scripts/environment.yml
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/product-rec/scripts/prepare.py
src/inspect_evals/mlrc_bench/orig_benchmark_corrections/weather-forecast/scripts/environment.yml
src/inspect_evals/mmiu/README.md
src/inspect_evals/mmiu/__init__.py
src/inspect_evals/mmiu/eval.yaml
src/inspect_evals/mmiu/mmiu.py
src/inspect_evals/mmiu/task_names.py
src/inspect_evals/mmiu/utils.py
src/inspect_evals/mmlu/README.md
src/inspect_evals/mmlu/__init__.py
src/inspect_evals/mmlu/eval.yaml
src/inspect_evals/mmlu/mmlu.py
src/inspect_evals/mmlu_pro/README.md
src/inspect_evals/mmlu_pro/__init__.py
src/inspect_evals/mmlu_pro/eval.yaml
src/inspect_evals/mmlu_pro/mmlu_pro.py
src/inspect_evals/mmmu/README.md
src/inspect_evals/mmmu/__init__.py
src/inspect_evals/mmmu/eval.yaml
src/inspect_evals/mmmu/mmmu.py
src/inspect_evals/mmmu/utils.py
src/inspect_evals/moru/.gitignore
src/inspect_evals/moru/CLAUDE.md
src/inspect_evals/moru/README.md
src/inspect_evals/moru/__init__.py
src/inspect_evals/moru/dataset.py
src/inspect_evals/moru/eval.yaml
src/inspect_evals/moru/metrics.py
src/inspect_evals/moru/moru.py
src/inspect_evals/moru/plot.png
src/inspect_evals/moru/scorer.py
src/inspect_evals/moru/types.py
src/inspect_evals/moru/utils.py
src/inspect_evals/moru/scripts/.noautolint
src/inspect_evals/moru/scripts/plot_dimensions.py
src/inspect_evals/musr/README.md
src/inspect_evals/musr/__init__.py
src/inspect_evals/musr/eval.yaml
src/inspect_evals/musr/musr.py
src/inspect_evals/musr/prompts.py
src/inspect_evals/niah/README.md
src/inspect_evals/niah/__init__.py
src/inspect_evals/niah/eval.yaml
src/inspect_evals/niah/niah.py
src/inspect_evals/niah/images/combined_models_score_distribution.png
src/inspect_evals/niah/images/mistral-medium-latest_mean_heatmap.png
src/inspect_evals/niah/images/mistral-medium-latest_std_heatmap.png
src/inspect_evals/niah/images/mistral-medium-latest_unrelated_answer_responses.png
src/inspect_evals/niah/images/mistral-small-latest_mean_heatmap.png
src/inspect_evals/niah/images/mistral-small-latest_std_heatmap.png
src/inspect_evals/niah/utils/dataset_generation.py
src/inspect_evals/niah/utils/model_contexts.py
src/inspect_evals/niah/utils/needle_utils.py
src/inspect_evals/niah/utils/prompting.py
src/inspect_evals/niah/utils/sample_generation.py
src/inspect_evals/niah/utils/scoring.py
src/inspect_evals/niah/utils/text_utils.py
src/inspect_evals/novelty_bench/README.md
src/inspect_evals/novelty_bench/__init__.py
src/inspect_evals/novelty_bench/eval.yaml
src/inspect_evals/novelty_bench/novelty_bench.py
src/inspect_evals/novelty_bench/partition.py
src/inspect_evals/novelty_bench/score.py
src/inspect_evals/novelty_bench/utils.py
src/inspect_evals/onet/README.md
src/inspect_evals/onet/__init__.py
src/inspect_evals/onet/eval.yaml
src/inspect_evals/onet/onet.py
src/inspect_evals/osworld/README.md
src/inspect_evals/osworld/__init__.py
src/inspect_evals/osworld/compose.yaml
src/inspect_evals/osworld/dataset.py
src/inspect_evals/osworld/eval.yaml
src/inspect_evals/osworld/osworld.py
src/inspect_evals/osworld/scorer.py
src/inspect_evals/osworld/sparse_clone.py
src/inspect_evals/osworld/container/.noautolint
src/inspect_evals/osworld/container/Dockerfile
src/inspect_evals/osworld/container/code/cli.py
src/inspect_evals/osworld/container/code/config.py
src/inspect_evals/osworld/container/code/controller.py
src/inspect_evals/osworld/container/code/env.py
src/inspect_evals/osworld/container/code/evaluate.py
src/inspect_evals/osworld/container/code/execute.py
src/inspect_evals/osworld/container/code/logging.py
src/inspect_evals/osworld/container/code/osworld_types.py
src/inspect_evals/osworld/container/code/util.py
src/inspect_evals/osworld/container/code/evaluators/README.md
src/inspect_evals/osworld/container/code/evaluators/__init__.py
src/inspect_evals/osworld/container/code/evaluators/getters/__init__.py
src/inspect_evals/osworld/container/code/evaluators/metrics/__init__.py
src/inspect_evals/osworld/container/home_dir/.config/libreoffice/4/user/registrymodifications.xcu
src/inspect_evals/osworld/container/home_dir/.config/vlc/vlcrc
src/inspect_evals/osworld/container/home_dir/Desktop/gimp.desktop
src/inspect_evals/osworld/container/home_dir/Desktop/libreoffice-calc.desktop
src/inspect_evals/osworld/container/home_dir/Desktop/libreoffice-impress.desktop
src/inspect_evals/osworld/container/home_dir/Desktop/libreoffice-writer.desktop
src/inspect_evals/osworld/container/home_dir/Desktop/thunderbird.desktop
src/inspect_evals/osworld/container/home_dir/Desktop/vlc.desktop
src/inspect_evals/osworld/docs/overview.svg
src/inspect_evals/paperbench/README.md
src/inspect_evals/paperbench/SCORING_DESIGN.md
src/inspect_evals/paperbench/__init__.py
src/inspect_evals/paperbench/constants.py
src/inspect_evals/paperbench/dataset.py
src/inspect_evals/paperbench/eval.yaml
src/inspect_evals/paperbench/instructions.txt
src/inspect_evals/paperbench/paperbench.py
src/inspect_evals/paperbench/solvers.py
src/inspect_evals/paperbench/images/Dockerfile
src/inspect_evals/paperbench/images/agent.env.example
src/inspect_evals/paperbench/images/compose.yaml
src/inspect_evals/paperbench/score/__init__.py
src/inspect_evals/paperbench/score/judge.py
src/inspect_evals/paperbench/score/prompts.py
src/inspect_evals/paperbench/score/reproduce.py
src/inspect_evals/paperbench/score/simple_judge.py
src/inspect_evals/paperbench/score/submission.py
src/inspect_evals/paperbench/score/task_node.py
src/inspect_evals/paperbench/score/utils.py
src/inspect_evals/paws/README.md
src/inspect_evals/paws/__init__.py
src/inspect_evals/paws/eval.yaml
src/inspect_evals/paws/paws.py
src/inspect_evals/persistbench/README.md
src/inspect_evals/persistbench/__init__.py
src/inspect_evals/persistbench/dataset.py
src/inspect_evals/persistbench/eval.yaml
src/inspect_evals/persistbench/persistbench.py
src/inspect_evals/persistbench/prompts.py
src/inspect_evals/persistbench/scorers.py
src/inspect_evals/persistbench/solvers.py
src/inspect_evals/persistbench/utils.py
src/inspect_evals/persistbench/benchmark_samples/beneficial_samples.jsonl
src/inspect_evals/persistbench/benchmark_samples/cross_domain.jsonl
src/inspect_evals/persistbench/benchmark_samples/sycophancy.jsonl
src/inspect_evals/personality/README.md
src/inspect_evals/personality/__init__.py
src/inspect_evals/personality/eval.yaml
src/inspect_evals/personality/personality.py
src/inspect_evals/personality/prompts/__init__.py
src/inspect_evals/personality/prompts/system.py
src/inspect_evals/piqa/README.md
src/inspect_evals/piqa/__init__.py
src/inspect_evals/piqa/eval.yaml
src/inspect_evals/piqa/piqa.py
src/inspect_evals/piqa/huggingface_artifact/piqa.py
src/inspect_evals/pre_flight/README.md
src/inspect_evals/pre_flight/__init__.py
src/inspect_evals/pre_flight/eval.yaml
src/inspect_evals/pre_flight/pre_flight.py
src/inspect_evals/pubmedqa/README.md
src/inspect_evals/pubmedqa/__init__.py
src/inspect_evals/pubmedqa/eval.yaml
src/inspect_evals/pubmedqa/pubmedqa.py
src/inspect_evals/pubmedqa/data/test_ground_truth.json
src/inspect_evals/race_h/README.md
src/inspect_evals/race_h/__init__.py
src/inspect_evals/race_h/eval.yaml
src/inspect_evals/race_h/race_h.py
src/inspect_evals/sad/README.md
src/inspect_evals/sad/__init__.py
src/inspect_evals/sad/dataset.py
src/inspect_evals/sad/decision_log.md
src/inspect_evals/sad/download_data.py
src/inspect_evals/sad/eval.yaml
src/inspect_evals/sad/facts_influence.py
src/inspect_evals/sad/sad.py
src/inspect_evals/sad/scorer.py
src/inspect_evals/sad/solver.py
src/inspect_evals/sad/stages.py
src/inspect_evals/scbench/Dockerfile
src/inspect_evals/scbench/README.md
src/inspect_evals/scbench/__init__.py
src/inspect_evals/scbench/compose.yaml
src/inspect_evals/scbench/data_manifest.py
src/inspect_evals/scbench/dataset.py
src/inspect_evals/scbench/eval.yaml
src/inspect_evals/scbench/prompts.py
src/inspect_evals/scbench/scbench.py
src/inspect_evals/scbench/scorer.py
src/inspect_evals/scbench/data/evals_canonical/manifest.json
src/inspect_evals/scbench/data/evals_canonical/chromium/chromium_4t1_hvg_gene_sets.json
src/inspect_evals/scbench/data/evals_canonical/chromium/chromium_4t1_normalization.json
src/inspect_evals/scbench/data/evals_canonical/chromium/chromium_celltyping_01_4t1_compartment_fractions.json
src/inspect_evals/scbench/data/evals_canonical/chromium/chromium_clustering_01_4t1_pericyte_adjacent_to_caf.json
src/inspect_evals/scbench/data/evals_canonical/chromium/chromium_differential_expression_01_contractile_caf_marker_recovery.json
src/inspect_evals/scbench/data/evals_canonical/chromium/chromium_qc_4T1_filter_cells.json
src/inspect_evals/scbench/data/evals_canonical/chromium/chromium_trajectory_01_caf_terminal_marker_recovery.json
src/inspect_evals/scbench/data/evals_canonical/csgenetics/csgenetics_celltyping_major_immune_lineages.json
src/inspect_evals/scbench/data/evals_canonical/csgenetics/csgenetics_clustering_avg_purity.json
src/inspect_evals/scbench/data/evals_canonical/csgenetics/csgenetics_de_monocyte_pseudobulk.json
src/inspect_evals/scbench/data/evals_canonical/csgenetics/csgenetics_normalization_full_pipeline.json
src/inspect_evals/scbench/data/evals_canonical/csgenetics/csgenetics_pca_pc1_biological_axis.json
src/inspect_evals/scbench/data/evals_canonical/csgenetics/csgenetics_qc_filtering.json
src/inspect_evals/scbench/data/evals_canonical/illumina/illumina_celltyping_major_cell_types.json
src/inspect_evals/scbench/data/evals_canonical/illumina/illumina_clustering_leiden_n_clusters.json
src/inspect_evals/scbench/data/evals_canonical/illumina/illumina_de_edn1_maximal.json
src/inspect_evals/scbench/data/evals_canonical/illumina/illumina_dimred_choose_batch_key.json
src/inspect_evals/scbench/data/evals_canonical/illumina/illumina_normalization_cp10k_log1p.json
src/inspect_evals/scbench/data/evals_canonical/illumina/illumina_qc_report_initial_nuclei.json
src/inspect_evals/scbench/data/evals_canonical/missionbio/missionbio_celltyping_present_cell_types.json
src/inspect_evals/scbench/data/evals_canonical/missionbio/missionbio_clustering_n_clusters.json
src/inspect_evals/scbench/data/evals_canonical/missionbio/missionbio_de_mutation_frequency.json
src/inspect_evals/scbench/data/evals_canonical/missionbio/missionbio_dimred_normalization_choice.json
src/inspect_evals/scbench/data/evals_canonical/missionbio/missionbio_normalization_protein_integrity.json
src/inspect_evals/scbench/data/evals_canonical/missionbio/missionbio_qc_variant_call_rate.json
src/inspect_evals/scbench/data/evals_canonical/parsebio/parsebio_celltyping_coarse_distribution.json
src/inspect_evals/scbench/data/evals_canonical/parsebio/parsebio_clustering_celltype_purity.json
src/inspect_evals/scbench/data/evals_canonical/parsebio/parsebio_de_ifnb_cd14mono_markers.json
src/inspect_evals/scbench/data/evals_canonical/parsebio/parsebio_normalization_edge_decision.json
src/inspect_evals/scbench/data/evals_canonical/parsebio/parsebio_pca_pc1_primary_driver.json
src/inspect_evals/scbench/graders/__init__.py
src/inspect_evals/scbench/graders/base.py
src/inspect_evals/scbench/graders/distribution.py
src/inspect_evals/scbench/graders/label_set.py
src/inspect_evals/scbench/graders/marker_gene.py
src/inspect_evals/scbench/graders/multiple_choice.py
src/inspect_evals/scbench/graders/numeric.py
src/inspect_evals/scbench/graders/spatial.py
src/inspect_evals/scicode/Dockerfile
src/inspect_evals/scicode/README.md
src/inspect_evals/scicode/__init__.py
src/inspect_evals/scicode/dataset.py
src/inspect_evals/scicode/docker-requirements.txt
src/inspect_evals/scicode/eval.yaml
src/inspect_evals/scicode/metrics.py
src/inspect_evals/scicode/process_data.py
src/inspect_evals/scicode/prompt_templates.py
src/inspect_evals/scicode/scicode.py
src/inspect_evals/scicode/scorer.py
src/inspect_evals/scicode/solver.py
src/inspect_evals/scicode/test_util.py
src/inspect_evals/scicode/util.py
src/inspect_evals/sciknoweval/README.md
src/inspect_evals/sciknoweval/__init__.py
src/inspect_evals/sciknoweval/eval.yaml
src/inspect_evals/sciknoweval/evaluator_prompt.yaml
src/inspect_evals/sciknoweval/sciknoweval.py
src/inspect_evals/sciknoweval/scorer.py
src/inspect_evals/sciknoweval/solver.py
src/inspect_evals/sciknoweval/task_mapping.py
src/inspect_evals/sciknoweval/utils.py
src/inspect_evals/sec_qa/README.md
src/inspect_evals/sec_qa/__init__.py
src/inspect_evals/sec_qa/eval.yaml
src/inspect_evals/sec_qa/sec_qa.py
src/inspect_evals/sevenllm/README.md
src/inspect_evals/sevenllm/__init__.py
src/inspect_evals/sevenllm/eval.yaml
src/inspect_evals/sevenllm/scorers.py
src/inspect_evals/sevenllm/sevenllm.py
src/inspect_evals/simpleqa/README.md
src/inspect_evals/simpleqa/__init__.py
src/inspect_evals/simpleqa/dataset.py
src/inspect_evals/simpleqa/eval.yaml
src/inspect_evals/simpleqa/grader_templates.py
src/inspect_evals/simpleqa/scorer.py
src/inspect_evals/simpleqa/simpleqa.py
src/inspect_evals/simpleqa/paper_config/simpleqa.yaml
src/inspect_evals/simpleqa/paper_config/simpleqa_verified.yaml
src/inspect_evals/sosbench/README.md
src/inspect_evals/sosbench/__init__.py
src/inspect_evals/sosbench/eval.yaml
src/inspect_evals/sosbench/sosbench.py
src/inspect_evals/sosbench/utils.py
src/inspect_evals/squad/README.md
src/inspect_evals/squad/__init__.py
src/inspect_evals/squad/eval.yaml
src/inspect_evals/squad/squad.py
src/inspect_evals/stereoset/README.md
src/inspect_evals/stereoset/__init__.py
src/inspect_evals/stereoset/eval.yaml
src/inspect_evals/stereoset/stereoset.py
src/inspect_evals/strong_reject/README.md
src/inspect_evals/strong_reject/__init__.py
src/inspect_evals/strong_reject/eval.yaml
src/inspect_evals/strong_reject/prompts.py
src/inspect_evals/strong_reject/scorer.py
src/inspect_evals/strong_reject/strong_reject.py
src/inspect_evals/strong_reject/utils.py
src/inspect_evals/swe_bench/README.md
src/inspect_evals/swe_bench/__init__.py
src/inspect_evals/swe_bench/download_baselines.sh
src/inspect_evals/swe_bench/eval.yaml
src/inspect_evals/swe_bench/example_values.yaml
src/inspect_evals/swe_bench/scorers.py
src/inspect_evals/swe_bench/swe_bench.py
src/inspect_evals/swe_bench/docs/swebench_comparison.jpeg
src/inspect_evals/swe_bench/tests/create_test_repos.py
src/inspect_evals/swe_bench/tests/swe_bench_tests.py
src/inspect_evals/swe_lancer/.gitattributes
src/inspect_evals/swe_lancer/README.md
src/inspect_evals/swe_lancer/__init__.py
src/inspect_evals/swe_lancer/dataset.py
src/inspect_evals/swe_lancer/debugging.py
src/inspect_evals/swe_lancer/eval.yaml
src/inspect_evals/swe_lancer/prompts.py
src/inspect_evals/swe_lancer/scorers.py
src/inspect_evals/swe_lancer/solvers.py
src/inspect_evals/swe_lancer/swe_lancer.py
src/inspect_evals/swe_lancer/tools.py
src/inspect_evals/swe_lancer/utils.py
src/inspect_evals/swe_lancer/data/all_swelancer_tasks.csv
src/inspect_evals/swe_lancer/scripts/clean_swelancer.py
src/inspect_evals/sycophancy/README.md
src/inspect_evals/sycophancy/__init__.py
src/inspect_evals/sycophancy/eval.yaml
src/inspect_evals/sycophancy/prompts.py
src/inspect_evals/sycophancy/scorers.py
src/inspect_evals/sycophancy/solvers.py
src/inspect_evals/sycophancy/sycophancy.py
src/inspect_evals/sycophancy/utils.py
src/inspect_evals/tac/EVALUATION.md
src/inspect_evals/tac/README.md
src/inspect_evals/tac/__init__.py
src/inspect_evals/tac/dataset.py
src/inspect_evals/tac/eval.yaml
src/inspect_evals/tac/prompts.py
src/inspect_evals/tac/scorer.py
src/inspect_evals/tac/tac.py
src/inspect_evals/tac/tac_results.png
src/inspect_evals/tac/tools.py
src/inspect_evals/tau2/.noautolint
src/inspect_evals/tau2/README.md
src/inspect_evals/tau2/__init__.py
src/inspect_evals/tau2/eval.yaml
src/inspect_evals/tau2/tau2.py
src/inspect_evals/tau2/utils.py
src/inspect_evals/tau2/airline/AirlineAgentTools.py
src/inspect_evals/tau2/airline/__init__.py
src/inspect_evals/tau2/airline/agents.py
src/inspect_evals/tau2/airline/data_model.py
src/inspect_evals/tau2/airline/dataset.py
src/inspect_evals/tau2/airline/scorer.py
src/inspect_evals/tau2/airline/tools.py
src/inspect_evals/tau2/common/__init__.py
src/inspect_evals/tau2/common/agents.py
src/inspect_evals/tau2/common/dataset.py
src/inspect_evals/tau2/common/metadata.py
src/inspect_evals/tau2/common/scorer.py
src/inspect_evals/tau2/common/tools.py
src/inspect_evals/tau2/data/domains/airline/db.json.gz
src/inspect_evals/tau2/data/domains/airline/policy.txt
src/inspect_evals/tau2/data/domains/airline/split_tasks.json
src/inspect_evals/tau2/data/domains/airline/tasks.json
src/inspect_evals/tau2/data/domains/mock/db.json
src/inspect_evals/tau2/data/domains/mock/policy.md
src/inspect_evals/tau2/data/domains/mock/policy_solo.md
src/inspect_evals/tau2/data/domains/mock/split_tasks.json
src/inspect_evals/tau2/data/domains/mock/tasks.json
src/inspect_evals/tau2/data/domains/retail/db.json.gz
src/inspect_evals/tau2/data/domains/retail/policy.txt
src/inspect_evals/tau2/data/domains/retail/split_tasks.json
src/inspect_evals/tau2/data/domains/retail/tasks.json
src/inspect_evals/tau2/data/domains/telecom/db.toml
src/inspect_evals/tau2/data/domains/telecom/main_policy.txt
src/inspect_evals/tau2/data/domains/telecom/main_policy_solo.txt
src/inspect_evals/tau2/data/domains/telecom/split_tasks.json
src/inspect_evals/tau2/data/domains/telecom/tasks.json.gz
src/inspect_evals/tau2/data/domains/telecom/tasks_small.json
src/inspect_evals/tau2/data/domains/telecom/tech_support_manual.txt
src/inspect_evals/tau2/data/domains/telecom/tech_support_workflow.txt
src/inspect_evals/tau2/data/domains/telecom/tech_support_workflow_solo.txt
src/inspect_evals/tau2/data/domains/telecom/user_db.toml
src/inspect_evals/tau2/data/user_simulator/simulation_guidelines.txt
src/inspect_evals/tau2/data/user_simulator/simulation_guidelines_tools.txt
src/inspect_evals/tau2/data_model/tasks.py
src/inspect_evals/tau2/environment/__init__.py
src/inspect_evals/tau2/environment/db.py
src/inspect_evals/tau2/retail/RetailAgentTools.py
src/inspect_evals/tau2/retail/__init__.py
src/inspect_evals/tau2/retail/agents.py
src/inspect_evals/tau2/retail/data_model.py
src/inspect_evals/tau2/retail/dataset.py
src/inspect_evals/tau2/retail/scorer.py
src/inspect_evals/tau2/retail/tools.py
src/inspect_evals/tau2/telecom/TelecomAgentTools.py
src/inspect_evals/tau2/telecom/TelecomUserTools.py
src/inspect_evals/tau2/telecom/__init__.py
src/inspect_evals/tau2/telecom/agents.py
src/inspect_evals/tau2/telecom/data_model.py
src/inspect_evals/tau2/telecom/dataset.py
src/inspect_evals/tau2/telecom/environment.py
src/inspect_evals/tau2/telecom/scorer.py
src/inspect_evals/tau2/telecom/solver.py
src/inspect_evals/tau2/telecom/tools.py
src/inspect_evals/tau2/telecom/user_data_model.py
src/inspect_evals/tau2/telecom/user_tools.py
src/inspect_evals/tau2/telecom/utils.py
src/inspect_evals/threecb/.noautolint
src/inspect_evals/threecb/README.md
src/inspect_evals/threecb/__init__.py
src/inspect_evals/threecb/constants.py
src/inspect_evals/threecb/dataset.py
src/inspect_evals/threecb/dataset_utils.py
src/inspect_evals/threecb/docker_utils.py
src/inspect_evals/threecb/eval.yaml
src/inspect_evals/threecb/models.py
src/inspect_evals/threecb/solvers.py
src/inspect_evals/threecb/threecb.py
src/inspect_evals/threecb/task_configs/.noautolint
src/inspect_evals/threecb/task_configs/binpwn_gdb_repl/Dockerfile
src/inspect_evals/threecb/task_configs/binpwn_gdb_repl/binpwn_gdb_repl.toml
src/inspect_evals/threecb/task_configs/binpwn_gdb_repl/setup.sh
src/inspect_evals/threecb/task_configs/escalation_setuid/Dockerfile
src/inspect_evals/threecb/task_configs/escalation_setuid/escalation_setuid.toml
src/inspect_evals/threecb/task_configs/escalation_setuid/setup.sh
src/inspect_evals/threecb/task_configs/evasion_debugger/Dockerfile
src/inspect_evals/threecb/task_configs/evasion_debugger/evader.c
src/inspect_evals/threecb/task_configs/evasion_debugger/evasion_debugger.toml
src/inspect_evals/threecb/task_configs/evasion_debugger/verify.c
src/inspect_evals/threecb/task_configs/find_all_users/Dockerfile
src/inspect_evals/threecb/task_configs/find_all_users/find_all_users.toml
src/inspect_evals/threecb/task_configs/find_all_users/setup.sh
src/inspect_evals/threecb/task_configs/impact_defacing/Dockerfile
src/inspect_evals/threecb/task_configs/impact_defacing/entrypoint.sh
src/inspect_evals/threecb/task_configs/impact_defacing/impact_defacing.toml
src/inspect_evals/threecb/task_configs/impact_defacing/watcher.sh
src/inspect_evals/threecb/task_configs/initialaccess_mail_link/Dockerfile
src/inspect_evals/threecb/task_configs/initialaccess_mail_link/entrypoint.sh
src/inspect_evals/threecb/task_configs/initialaccess_mail_link/initialaccess_mail_link.toml
src/inspect_evals/threecb/task_configs/initialaccess_mail_link/watcher.sh
src/inspect_evals/threecb/task_configs/nmap/Dockerfile
src/inspect_evals/threecb/task_configs/nmap/app.py
src/inspect_evals/threecb/task_configs/nmap/compose.yaml
src/inspect_evals/threecb/task_configs/nmap/entrypoint.sh
src/inspect_evals/threecb/task_configs/nmap/nmap.toml
src/inspect_evals/threecb/task_configs/null_writing/Dockerfile
src/inspect_evals/threecb/task_configs/null_writing/compose.yaml
src/inspect_evals/threecb/task_configs/null_writing/hmm.c
src/inspect_evals/threecb/task_configs/null_writing/null_writing.toml
src/inspect_evals/threecb/task_configs/null_writing/setup.sh
src/inspect_evals/threecb/task_configs/python_repl_factorial10/Dockerfile
src/inspect_evals/threecb/task_configs/python_repl_factorial10/python_repl_factorial10.toml
src/inspect_evals/threecb/task_configs/resourcedevelopment_signing/Dockerfile
src/inspect_evals/threecb/task_configs/resourcedevelopment_signing/dummy_binary
src/inspect_evals/threecb/task_configs/resourcedevelopment_signing/resourcedevelopment_signing.toml
src/inspect_evals/threecb/task_configs/resourcedevelopment_signing/verify_and_reveal_flag.sh
src/inspect_evals/threecb/task_configs/web_navigation/Dockerfile
src/inspect_evals/threecb/task_configs/web_navigation/compose.yaml
src/inspect_evals/threecb/task_configs/web_navigation/web_navigation.toml
src/inspect_evals/truthfulqa/README.md
src/inspect_evals/truthfulqa/__init__.py
src/inspect_evals/truthfulqa/eval.yaml
src/inspect_evals/truthfulqa/truthfulqa.py
src/inspect_evals/uccb/README.md
src/inspect_evals/uccb/__init__.py
src/inspect_evals/uccb/eval.yaml
src/inspect_evals/uccb/uccb.py
src/inspect_evals/usaco/README.md
src/inspect_evals/usaco/__init__.py
src/inspect_evals/usaco/dataset.py
src/inspect_evals/usaco/eval.yaml
src/inspect_evals/usaco/usaco.py
src/inspect_evals/utils/__init__.py
src/inspect_evals/utils/aime_common.py
src/inspect_evals/utils/deps_utils.py
src/inspect_evals/utils/docker_utils.py
src/inspect_evals/utils/hf_telemetry.py
src/inspect_evals/utils/huggingface.py
src/inspect_evals/utils/load_dataset.py
src/inspect_evals/utils/metrics.py
src/inspect_evals/utils/scorers.py
src/inspect_evals/vimgolf_challenges/.noautolint
src/inspect_evals/vimgolf_challenges/Dockerfile
src/inspect_evals/vimgolf_challenges/README.md
src/inspect_evals/vimgolf_challenges/__init__.py
src/inspect_evals/vimgolf_challenges/docker-requirements.txt
src/inspect_evals/vimgolf_challenges/eval.yaml
src/inspect_evals/vimgolf_challenges/vimgolf_challenges.py
src/inspect_evals/vimgolf_challenges/vimgolf_verifier.py
src/inspect_evals/vstar_bench/README.md
src/inspect_evals/vstar_bench/__init__.py
src/inspect_evals/vstar_bench/eval.yaml
src/inspect_evals/vstar_bench/vstar_bench.py
src/inspect_evals/winogrande/README.md
src/inspect_evals/winogrande/__init__.py
src/inspect_evals/winogrande/eval.yaml
src/inspect_evals/winogrande/winogrande.py
src/inspect_evals/wmdp/README.md
src/inspect_evals/wmdp/__init__.py
src/inspect_evals/wmdp/eval.yaml
src/inspect_evals/wmdp/wmdp.py
src/inspect_evals/worldsense/.gitignore
src/inspect_evals/worldsense/README.md
src/inspect_evals/worldsense/__init__.py
src/inspect_evals/worldsense/_utils.py
src/inspect_evals/worldsense/eval.yaml
src/inspect_evals/worldsense/worldsense.py
src/inspect_evals/writingbench/README.md
src/inspect_evals/writingbench/__init__.py
src/inspect_evals/writingbench/benchmark_all.jsonl
src/inspect_evals/writingbench/eval.yaml
src/inspect_evals/writingbench/writingbench.py
src/inspect_evals/xstest/README.md
src/inspect_evals/xstest/__init__.py
src/inspect_evals/xstest/eval.yaml
src/inspect_evals/xstest/xstest.py
src/inspect_evals/zerobench/README.md
src/inspect_evals/zerobench/__init__.py
src/inspect_evals/zerobench/eval.yaml
src/inspect_evals/zerobench/reducer.py
src/inspect_evals/zerobench/scorer.py
src/inspect_evals/zerobench/zerobench.py
src/inspect_evals/zerobench/images/mean_average_formula.png
src/inspect_evals/zerobench/images/zerobench_metrics_in_inspect.png
tests/README.md
tests/__init__.py
tests/conftest.py
tests/docker_build_trace.py
tests/test_add_readme_section.py
tests/test_asset_tools.py
tests/test_autolint_best_practices.py
tests/test_autolint_checks.py
tests/test_check_posix_code.py
tests/test_clean.py
tests/test_datasets_hf.py
tests/test_epoch_compatibility.py
tests/test_eval_imports.py
tests/test_generate_readmes.py
tests/test_metadata.py
tests/test_prerender_links.py
tests/test_registry_imports.py
tests/test_utils.py
tests/abstention_bench/__init__.py
tests/abstention_bench/test_dataset.py
tests/abstention_bench/test_end_to_end.py
tests/abstention_bench/test_metric.py
tests/abstention_bench/test_scorer.py
tests/agent_bench/__init__.py
tests/agent_bench/test_agent_bench_os.py
tests/agent_bench/test_dataset.py
tests/agent_bench/test_extract_trace_messages.py
tests/agent_bench/test_scorer.py
tests/agent_bench/utils.py
tests/agentdojo/__init__.py
tests/agentdojo/test_agentdojo.py
tests/agentdojo/test_attacks.py
tests/agentdojo/test_dataset.py
tests/agentdojo/test_deepdiff_wrappers.py
tests/agentdojo/test_utils.py
tests/agentharm/__init__.py
tests/agentharm/test_agent_harm.py
tests/agentharm/test_epoch_compatibility.py
tests/agentic_misalignment/__init__.py
tests/agentic_misalignment/test_end_to_end.py
tests/agentic_misalignment/test_unit.py
tests/agieval/__init__.py
tests/agieval/test_agieval_solver.py
tests/agieval/test_scorer.py
tests/ahb/__init__.py
tests/ahb/test_ahb.py
tests/aime2024/__init__.py
tests/aime2024/test_aime2024.py
tests/aime2025/__init__.py
tests/aime2025/test_aime2025.py
tests/aime2026/__init__.py
tests/aime2026/test_aime2026.py
tests/aime_common/__init__.py
tests/aime_common/test_aime_common.py
tests/air_bench/__init__.py
tests/air_bench/test_air_bench_dataset.py
tests/air_bench/test_metrics.py
tests/air_bench/test_scorer.py
tests/air_bench/test_solver.py
tests/ape/__init__.py
tests/ape/test_ape.py
tests/ape/test_dataset.py
tests/ape/test_scorer.py
tests/ape/test_solver.py
tests/ape/test_utils.py
tests/apps/__init__.py
tests/apps/test_apps.py
tests/apps/test_apps_dataset.py
tests/arc/__init__.py
tests/arc/test_arc.py
tests/assistant_bench/__init__.py
tests/assistant_bench/test_dataset.py
tests/assistant_bench/test_epoch_compatibility.py
tests/assistant_bench/test_scoring.py
tests/assistant_bench/test_solver.py
tests/b3/__init__.py
tests/b3/test_b3.py
tests/b3/test_dataset.py
tests/b3/test_metrics.py
tests/b3/test_tools.py
tests/bbeh/__init__.py
tests/bbeh/test_bbeh.py
tests/bbeh/test_dataset.py
tests/bbeh/test_end_to_end.py
tests/bbh/__init__.py
tests/bbh/test_bbh.py
tests/bbh/test_scorer.py
tests/bbq/__init__.py
tests/bbq/test_bbq.py
tests/bfcl/__init__.py
tests/bfcl/test_backend_loader.py
tests/bfcl/test_bfcl.py
tests/bfcl/test_data.py
tests/bfcl/test_function_parsing.py
tests/bfcl/test_functions.py
tests/bfcl/test_multi_turn_scorer.py
tests/bfcl/test_multi_turn_solver.py
tests/bfcl/test_scorer.py
tests/bfcl/test_solver.py
tests/bfcl/test_utils_and_edge_cases.py
tests/bfcl/fixtures/__init__.py
tests/bfcl/fixtures/three_method_backend.py
tests/bigcodebench/__init__.py
tests/bigcodebench/test_bigcodebench.py
tests/bold/__init__.py
tests/bold/test_bold.py
tests/boolq/__init__.py
tests/boolq/test_boolq.py
tests/browse_comp/__init__.py
tests/browse_comp/test_browse_comp.py
tests/browse_comp/test_epoch_compatibility.py
tests/chembench/__init__.py
tests/chembench/test_dataset.py
tests/chembench/test_scorer.py
tests/chembench/test_solver.py
tests/class_eval/__init__.py
tests/class_eval/test_class_eval.py
tests/class_eval/test_data.py
tests/class_eval/test_scorer.py
tests/coconot/__init__.py
tests/coconot/test_coconot_dataset.py
tests/coconot/test_end_to_end.py
tests/commonsense_qa/__init__.py
tests/commonsense_qa/test_dataset.py
tests/compute_eval/__init__.py
tests/compute_eval/test_compute_eval.py
tests/compute_eval/test_e2e.py
tests/compute_eval/test_scorer.py
tests/compute_eval/test_utils.py
tests/conftest_helpers/__init__.py
tests/conftest_helpers/hf_telemetry_helpers.py
tests/conftest_helpers/hf_test_helpers.py
tests/conftest_helpers/utils.py
tests/conftest_helpers/windows_test_helpers.py
tests/core_bench/__init__.py
tests/core_bench/conftest.py
tests/core_bench/test_core_bench.py
tests/core_bench/test_core_bench_dataset.py
tests/core_bench/test_core_bench_scorer.py
tests/core_bench/test_tools.py
tests/core_bench/test_utils.py
tests/cti_realm/__init__.py
tests/cti_realm/test_e2e.py
tests/cti_realm/test_parsing_utils.py
tests/cti_realm/test_scorer.py
tests/cti_realm/test_tools.py
tests/cti_realm/test_trajectory_scorer.py
tests/cti_realm/fixtures/__init__.py
tests/cti_realm/fixtures/data/__init__.py
tests/cti_realm/fixtures/data/dataset_answers_stratified_25.jsonl
tests/cti_realm/fixtures/data/dataset_samples_stratified_25.jsonl
tests/cve_bench/__init__.py
tests/cve_bench/test_end_to_end.py
tests/cybench/__init__.py
tests/cybench/test_sandbox.py
tests/cybergym/__init__.py
tests/cybergym/test_cybergym.py
tests/cybergym/test_dataset.py
tests/cybergym/test_sandbox.py
tests/cybergym/test_scorers.py
tests/cybergym/test_solvers.py
tests/cybermetric/__init__.py
tests/cybermetric/test_cybermetric.py
tests/cyberseceval_2/__init__.py
tests/cyberseceval_2/test_cyberseceval_2.py
tests/cyberseceval_3/__init__.py
tests/cyberseceval_3/test_cyberseceval_3.py
tests/docvqa/__init__.py
tests/docvqa/test_docvqa.py
tests/docvqa/test_scorer.py
tests/drop/__init__.py
tests/drop/test_drop.py
tests/ds1000/__init__.py
tests/ds1000/test_ds1000.py
tests/fortress/__init__.py
tests/fortress/test_data_loading.py
tests/fortress/test_end_to_end.py
tests/fortress/test_fortress_scorer.py
tests/fortress/test_metadata_properties.py
tests/fortress/test_scoring.py
tests/frontier_cs/__init__.py
tests/frontier_cs/test_frontier_cs.py
tests/frontier_cs/test_interactive_scoring.py
tests/frontier_cs/test_research_scoring.py
tests/frontierscience/__init__.py
tests/frontierscience/test_frontierscience.py
tests/gaia/__init__.py
tests/gaia/test_dataset.py
tests/gaia/test_gaia.py
tests/gaia/test_gaia_record_to_sample.py
tests/gaia/test_message_limit.py
tests/gaia/test_scorer.py
tests/gdm_in_house_ctf/__init__.py
tests/gdm_in_house_ctf/test_dataset.py
tests/gdm_in_house_ctf/test_gdm_in_house_ctf.py
tests/gdm_intercode_ctf/__init__.py
tests/gdm_intercode_ctf/test_gdm_intercode_ctf.py
tests/gdm_self_proliferation/__init__.py
tests/gdm_self_proliferation/test_common.py
tests/gdm_self_proliferation/test_custom_tools.py
tests/gdm_self_proliferation/test_dataset_utils.py
tests/gdm_self_proliferation/test_e2e.py
tests/gdm_self_proliferation/test_record_to_sample.py
tests/gdm_self_proliferation/test_solvers_scorers.py
tests/gdm_self_proliferation/test_sp_milestone_dispatch.py
tests/gdm_self_reasoning/__init__.py
tests/gdm_self_reasoning/test_e2e.py
tests/gdm_self_reasoning/test_scorer.py
tests/gdm_self_reasoning/test_solver.py
tests/gdm_self_reasoning/test_tools.py
tests/gdm_stealth/__init__.py
tests/gdm_stealth/test_ai_secretary_checker.py
tests/gdm_stealth/test_ai_secretary_tools.py
tests/gdm_stealth/test_cover_your_tracks_tools.py
tests/gdm_stealth/test_customer_support_env.py
tests/gdm_stealth/test_cyt_checker.py
tests/gdm_stealth/test_cyt_database.py
tests/gdm_stealth/test_gdm_stealth.py
tests/gdm_stealth/test_llm_sampling.py
tests/gdm_stealth/test_oversight.py
tests/gdm_stealth/test_rater_lib.py
tests/gdm_stealth/test_scorer.py
tests/gdm_stealth/test_solver.py
tests/gdm_stealth/test_strategic_rule_breaking_tools.py
tests/gdpval/__init__.py
tests/gdpval/test_gdpval.py
tests/gdpval/test_util.py
tests/gpqa/__init__.py
tests/gpqa/test_gpqa.py
tests/gsm8k/__init__.py
tests/gsm8k/test_gsm8k.py
tests/healthbench/__init__.py
tests/healthbench/test_dataset.py
tests/healthbench/test_dataset_verification.py
tests/healthbench/test_epoch_compatibility.py
tests/healthbench/test_scorer.py
tests/healthbench/test_tasks.py
tests/hellaswag/__init__.py
tests/hellaswag/test_hellaswag.py
tests/hle/__init__.py
tests/hle/test_epoch_compatibility.py
tests/hle/test_fix_known_mime_mismatches.py
tests/hle/test_hle.py
tests/humaneval/__init__.py
tests/humaneval/test_humaneval.py
tests/humaneval/test_scorer.py
tests/ifeval/__init__.py
tests/ifeval/test_ifeval.py
tests/ifeval/test_scorer.py
tests/ifevalcode/__init__.py
tests/ifevalcode/test_dataset.py
tests/ifevalcode/test_end_to_end.py
tests/ifevalcode/test_scorer.py
tests/ifevalcode/test_solver.py
tests/ifevalcode/test_utils.py
tests/infinite_bench/__init__.py
tests/infinite_bench/test_infinite_bench.py
tests/instrumentaleval/__init__.py
tests/instrumentaleval/test_end_to_end.py
tests/instrumentaleval/test_metric.py
tests/instrumentaleval/test_process_data.py
tests/instrumentaleval/test_scorer.py
tests/instrumentaleval/test_utils.py
tests/kernelbench/__init__.py
tests/kernelbench/conftest.py
tests/kernelbench/test_dataset.py
tests/kernelbench/test_e2e.py
tests/kernelbench/test_eval_runner.py
tests/kernelbench/test_metrics.py
tests/kernelbench/test_scorer.py
tests/kernelbench/test_solver.py
tests/lab_bench/__init__.py
tests/lab_bench/test_lab_bench_dataset.py
tests/lab_bench/test_lab_bench_metrics.py
tests/lab_bench/test_lab_bench_record_to_sample_helpers.py
tests/lab_bench/test_lab_bench_scorer.py
tests/lab_bench/test_lab_bench_utils.py
tests/lingoly/__init__.py
tests/lingoly/test_lingoly.py
tests/lingoly/test_metrics.py
tests/lingoly/test_scorer.py
tests/livebench/__init__.py
tests/livebench/test_datasets.py
tests/livebench/test_decide_scorer.py
tests/livebench/test_record_to_sample.py
tests/livebench/test_utils.py
tests/livebench/validation.py
tests/livecodebench_pro/__init__.py
tests/livecodebench_pro/test_dataset.py
tests/livecodebench_pro/test_e2e.py
tests/livecodebench_pro/test_judge.py
tests/livecodebench_pro/test_scorer.py
tests/livecodebench_pro/test_solver.py
tests/livecodebench_pro/test_utils.py
tests/make_me_pay/__init__.py
tests/make_me_pay/test_donation_detector.py
tests/make_me_pay/test_duration_turn_caps.py
tests/make_me_pay/test_end_to_end.py
tests/make_me_pay/test_scorer.py
tests/make_me_pay/test_withdraw_detector.py
tests/makemesay/__init__.py
tests/makemesay/test_e2e.py
tests/makemesay/test_game.py
tests/makemesay/test_scorer.py
tests/makemesay/test_solver.py
tests/makemesay/test_utils.py
tests/mask/__init__.py
tests/mask/data_fixtures.py
tests/mask/test_accuracy_semantics.py
tests/mask/test_belief_semantics.py
tests/mask/test_core_metrics.py
tests/mask/test_dataset.py
tests/mask/test_end_to_end.py
tests/mask/test_honesty_semantics.py
tests/mask/test_llm_judge_parsing.py
tests/mask/test_orchestration.py
tests/mask/test_scorer.py
tests/mask/test_solvers.py
tests/mask/test_statistical_summary_metrics.py
tests/mask/test_utils.py
tests/math/__init__.py
tests/math/test_scorer.py
tests/math/test_utils.py
tests/mathvista/__init__.py
tests/mathvista/test_mathvista_dataset.py
tests/mathvista/test_scorer.py
tests/mbpp/__init__.py
tests/mbpp/test_mbpp.py
tests/mbpp/test_scorer.py
tests/medqa/__init__.py
tests/medqa/test_medqa.py
tests/medqa/test_medqa_dataset.py
tests/mgsm/__init__.py
tests/mgsm/test_mgsm.py
tests/mind2web/__init__.py
tests/mind2web/test_dataset.py
tests/mind2web/test_epoch_compatibility.py
tests/mind2web/test_m2w_scorer.py
tests/mind2web/test_task.py
tests/mind2web/test_utils.py
tests/mind2web/data/__init__.py
tests/mind2web/data/sample_1.json
tests/mind2web/data/sample_10.json
tests/mind2web/data/sample_100.json
tests/mind2web_sc/__init__.py
tests/mind2web_sc/test_dataset.py
tests/mind2web_sc/test_end_to_end.py
tests/mind2web_sc/test_scorer.py
tests/mind2web_sc/test_solver.py
tests/mind2web_sc/test_utils.py
tests/mle_bench/__init__.py
tests/mle_bench/test_mle_bench.py
tests/mle_bench/test_scorer.py
tests/mlrc_bench/__init__.py
tests/mlrc_bench/test_dataset.py
tests/mlrc_bench/test_end_to_end.py
tests/mlrc_bench/test_mlrc_high_level_actions.py
tests/mlrc_bench/test_mlrc_low_level_actions.py
tests/mlrc_bench/test_safe_load_python_file_globals.py
tests/mlrc_bench/test_scorer.py
tests/mlrc_bench/test_solver.py
tests/mlrc_bench/test_task_smoke.py
tests/mlrc_bench/test_utils.py
tests/mmiu/__init__.py
tests/mmiu/test_mmiu_dataset.py
tests/mmiu/test_mmiu_metrics.py
tests/mmiu/test_mmiu_scorer.py
tests/mmiu/test_mmiu_utils.py
tests/mmlu/__init__.py
tests/mmlu/test_mmlu.py
tests/mmlu_pro/__init__.py
tests/mmlu_pro/test_mmlu_pro.py
tests/mmmu/__init__.py
tests/mmmu/test_mmmu.py
tests/moru/__init__.py
tests/moru/conftest.py
tests/moru/test_dataset.py
tests/moru/test_metrics.py
tests/moru/test_moru.py
tests/moru/test_scorer.py
tests/moru/test_utils.py
tests/musr/__init__.py
tests/musr/test_musr.py
tests/niah/__init__.py
tests/niah/test_niah.py
tests/niah/test_scoring.py
tests/novelty_bench/__init__.py
tests/novelty_bench/test_e2e_novelty_bench.py
tests/novelty_bench/test_model_inference.py
tests/novelty_bench/test_novelty_bench.py
tests/novelty_bench/test_novelty_bench_dataset.py
tests/novelty_bench/test_novelty_bench_utils.py
tests/novelty_bench/test_partition.py
tests/novelty_bench/test_score.py
tests/onet/__init__.py
tests/onet/test_onet.py
tests/osworld/__init__.py
tests/osworld/test_osworld.py
tests/osworld/test_osworld_dataset_filtering.py
tests/osworld/test_osworld_should_include_example.py
tests/osworld/test_scorer.py
tests/paperbench/__init__.py
tests/paperbench/conftest.py
tests/paperbench/test_dataset.py
tests/paperbench/test_e2e.py
tests/paperbench/test_paperbench.py
tests/paperbench/test_sandbox.py
tests/paperbench/test_solvers.py
tests/paperbench/score/__init__.py
tests/paperbench/score/test_judge.py
tests/paperbench/score/test_reproduce.py
tests/paperbench/score/test_simple_judge.py
tests/paperbench/score/test_submission.py
tests/paperbench/score/test_submission_integration.py
tests/paperbench/score/test_task_node.py
tests/paperbench/score/test_utils.py
tests/paws/__init__.py
tests/paws/test_paws.py
tests/persistbench/__init__.py
tests/persistbench/test_dataset.py
tests/persistbench/test_end_to_end.py
tests/persistbench/test_metrics.py
tests/persistbench/test_scorers.py
tests/persistbench/test_solvers.py
tests/persistbench/test_utils.py
tests/personality/__init__.py
tests/personality/test_any_choice.py
tests/personality/test_dataset.py
tests/personality/test_metrics.py
tests/personality/test_task_end_to_end.py
tests/piqa/__init__.py
tests/piqa/test_dataset.py
tests/pre_flight/__init__.py
tests/pre_flight/test_pre_flight.py
tests/pubmedqa/__init__.py
tests/pubmedqa/test_pubmedqa_dataset.py
tests/race_h/__init__.py
tests/race_h/test_race_h.py
tests/sad/__init__.py
tests/sad/conftest.py
tests/sad/test_dataset.py
tests/sad/test_download_data.py
tests/sad/test_e2e.py
tests/sad/test_facts_influence.py
tests/sad/test_scorer.py
tests/sad/test_stages.py
tests/scbench/__init__.py
tests/scbench/test_data_manifest.py
tests/scbench/test_grader_trace_parity.py
tests/scbench/test_graders.py
tests/scbench/test_harness_parity.py
tests/scbench/test_scbench.py
tests/scbench/test_scorer.py
tests/scicode/__init__.py
tests/scicode/test_dataset.py
tests/scicode/test_metrics.py
tests/scicode/test_scorer.py
tests/scicode/test_solver.py
tests/scicode/test_util.py
tests/sciknoweval/__init__.py
tests/sciknoweval/test_dataset.py
tests/sciknoweval/test_scorer.py
tests/sciknoweval/test_solver.py
tests/sciknoweval/test_utils.py
tests/sec_qa/__init__.py
tests/sec_qa/test_sec_qa.py
tests/sec_qa/test_sec_qa_dataset.py
tests/sevenllm/__init__.py
tests/sevenllm/test_sevenllm.py
tests/simpleqa/__init__.py
tests/simpleqa/test_epoch_compatibility.py
tests/simpleqa/test_scorer_real_sample.py
tests/simpleqa/test_simpleqa.py
tests/sosbench/__init__.py
tests/sosbench/test_dataset.py
tests/sosbench/test_scorer.py
tests/sosbench/test_utils.py
tests/squad/__init__.py
tests/squad/test_squad.py
tests/stereoset/__init__.py
tests/stereoset/test_stereoset.py
tests/strong_reject/__init__.py
tests/strong_reject/test_scorer.py
tests/strong_reject/test_scorer_model_resolution.py
tests/strong_reject/test_utils.py
tests/swe_bench/__init__.py
tests/swe_bench/test_scorer.py
tests/swe_bench/test_swe_bench.py
tests/swe_lancer/.gitignore
tests/swe_lancer/__init__.py
tests/swe_lancer/shared_mocks.py
tests/swe_lancer/test_dataset.py
tests/swe_lancer/test_e2e.py
tests/swe_lancer/test_integration.py
tests/swe_lancer/test_scorers.py
tests/swe_lancer/test_solvers.py
tests/swe_lancer/test_utils.py
tests/swe_lancer/data/15193-manager-0_1.txt
tests/swe_lancer/data/15193-manager-0_2.txt
tests/swe_lancer/data/15193-manager-0_3.txt
tests/swe_lancer/data/16912_4_1.txt
tests/swe_lancer/data/16912_4_2.txt
tests/swe_lancer/data/16912_4_3.txt
tests/swe_lancer/data/__init__.py
tests/swe_lancer/data/extract_prompts.py
tests/sycophancy/__init__.py
tests/sycophancy/test_epoch_compatibility.py
tests/sycophancy/test_sycophancy.py
tests/tac/__init__.py
tests/tac/test_tac.py
tests/tau2/__init__.py
tests/tau2/floating_point_test_agent_state.json
tests/tau2/floating_point_test_messages.json
tests/tau2/floating_point_test_task.json
tests/tau2/test_airline_task.json
tests/tau2/test_end_to_end.py
tests/tau2/test_record_to_sample.py
tests/tau2/test_records.json
tests/tau2/test_retail_task.json
tests/tau2/test_scorer.py
tests/tau2/test_solver.py
tests/tau2/test_telecom_task.json
tests/tau2/utils.py
tests/test_uccb/__init__.py
tests/test_uccb/test_uccb.py
tests/threecb/__init__.py
tests/threecb/test_dataset_utils.py
tests/threecb/test_eval_integration.py
tests/threecb/test_scoring.py
tests/threecb/test_solver.py
tests/threecb/test_submit_tool.py
tests/threecb/fixtures/__init__.py
tests/threecb/fixtures/record.py
tests/tools/__init__.py
tests/tools/test_judge_calibration_diagnostics.py
tests/truthfulqa/__init__.py
tests/truthfulqa/test_truthfulqa.py
tests/uccb/__init__.py
tests/uccb/test_uccb.py
tests/usaco/__init__.py
tests/usaco/create_dataset.py
tests/usaco/test_dataset.py
tests/usaco/test_usaco.py
tests/utils/__init__.py
tests/utils/huggingface.py
tests/utils/matchers.py
tests/utils/sandbox_tools.py
tests/utils/solvers.py
tests/utils/task_assertions.py
tests/utils/test_deps_utils.py
tests/utils/test_docker_utils.py
tests/utils/test_ensure_revision.py
tests/utils/test_hf_rate_limiting.py
tests/utils/test_huggingface_tests.py
tests/utils/test_load_dataset.py
tests/utils/test_solvers.py
tests/utils/test_task_assertions.py
tests/utils/test_transformers_wrappers.py
tests/utils/test_artifacts/Dockerfile
tests/utils/test_artifacts/test_compose.yaml
tests/vimgolf_challenges/__init__.py
tests/vimgolf_challenges/test_scorer.py
tests/vimgolf_challenges/test_vimgolf_challenges.py
tests/vstar_bench/__init__.py
tests/vstar_bench/test_vstar_bench.py
tests/winogrande/__init__.py
tests/winogrande/test_winogrande.py
tests/wmdp/__init__.py
tests/wmdp/test_wmdp.py
tests/worldsense/__init__.py
tests/worldsense/test_scorer.py
tests/worldsense/test_worldsense.py
tests/writingbench/__init__.py
tests/writingbench/test_writingbench.py
tests/xstest/__init__.py
tests/xstest/test_xstest.py
tests/zerobench/__init__.py
tests/zerobench/test_zerobench_dataset.py
tests/zerobench/test_zerobench_reducer.py
tests/zerobench/test_zerobench_scorer.py
third-party-licenses/Apache-2.0.txt
tools/README.md
tools/__init__.py
tools/add_readme_section.py
tools/build_and_push_docker_images.py
tools/check_changelog.py
tools/check_posix_code.py
tools/check_undeclared_assets.py
tools/check_unlisted_evals.py
tools/clean.py
tools/create_markdown_files_from_trajectories.py
tools/debug_sandbox.py
tools/generate_asset_manifest.py
tools/generate_readmes.py
tools/judge_calibration_diagnostics.py
tools/list_large_files.sh
tools/parse_eval_logs_for_evaluation_report.py
tools/run_autolint.py
tools/run_evals.py
tools/summarise_asset_manifest.py
tools/count_evals/estimate_number_of_evals_at_commit.sh
tools/count_evals/generate_culm_eval_series.sh
tools/count_evals/plot_culmulative_eval_series.py
tools/run_autolint/__init__.py
tools/run_autolint/models.py
tools/run_autolint/output.py
tools/run_autolint/suppressions.py
tools/run_autolint/checks/__init__.py
tools/run_autolint/checks/best_practices.py
tools/run_autolint/checks/code_quality.py
tools/run_autolint/checks/dependencies.py
tools/run_autolint/checks/file_structure.py
tools/run_autolint/checks/tests.py
tools/run_autolint/checks/utils.py