Metadata-Version: 2.4
Name: inspect_evals
Version: 0.4.0
Summary: Collection of large language model evaluations
Author: UK AI Security Institute
License-Expression: MIT
Project-URL: Source Code, https://github.com/UKGovernmentBEIS/inspect_evals
Project-URL: Issue Tracker, https://github.com/UKGovernmentBEIS/inspect_evals/issues
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Intended Audience :: Science/Research
Classifier: Intended Audience :: Developers
Classifier: Natural Language :: English
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Typing :: Typed
Classifier: Operating System :: OS Independent
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: backoff>=2.2.0
Requires-Dist: datasets>=4.5.0
Requires-Dist: huggingface_hub>=1.2.0
Requires-Dist: hf_xet
Requires-Dist: inspect_ai>=0.3.158
Requires-Dist: jinja2
Requires-Dist: numpy>=1.26.0
Requires-Dist: pillow>=11.3.0
Requires-Dist: pydantic>=2.10.0
Requires-Dist: pyyaml>=5.1.0
Requires-Dist: requests>=2.32.0
Requires-Dist: tiktoken>=0.11.0
Requires-Dist: toml>=0.10.2
Provides-Extra: b3
Requires-Dist: openai; extra == "b3"
Requires-Dist: rouge_score; extra == "b3"
Requires-Dist: tenacity; extra == "b3"
Requires-Dist: click; extra == "b3"
Requires-Dist: python-dotenv; extra == "b3"
Provides-Extra: agentdojo
Requires-Dist: pydantic[email]; extra == "agentdojo"
Requires-Dist: deepdiff; extra == "agentdojo"
Provides-Extra: swe-bench
Requires-Dist: swebench>=3.0.15; extra == "swe-bench"
Requires-Dist: docker; extra == "swe-bench"
Requires-Dist: jsonlines; extra == "swe-bench"
Provides-Extra: swe-lancer
Requires-Dist: docker; extra == "swe-lancer"
Requires-Dist: types-docker; extra == "swe-lancer"
Provides-Extra: math
Requires-Dist: sympy; extra == "math"
Requires-Dist: antlr4-python3-runtime~=4.11.0; extra == "math"
Provides-Extra: worldsense
Requires-Dist: pandas; extra == "worldsense"
Provides-Extra: mind2web
Requires-Dist: beautifulsoup4; extra == "mind2web"
Requires-Dist: types-beautifulsoup4; extra == "mind2web"
Requires-Dist: lxml; extra == "mind2web"
Requires-Dist: lxml-stubs; extra == "mind2web"
Provides-Extra: sevenllm
Requires-Dist: jieba==0.42.1; extra == "sevenllm"
Requires-Dist: sentence_transformers>=5.1.1; extra == "sevenllm"
Requires-Dist: rouge==1.0.1; extra == "sevenllm"
Requires-Dist: tf-keras; python_version < "3.13" and extra == "sevenllm"
Provides-Extra: scicode
Requires-Dist: gdown; extra == "scicode"
Requires-Dist: h5py; extra == "scicode"
Requires-Dist: scipy; extra == "scicode"
Requires-Dist: sympy; extra == "scicode"
Provides-Extra: ifeval
Requires-Dist: instruction_following_eval; extra == "ifeval"
Requires-Dist: langdetect; extra == "ifeval"
Provides-Extra: ahb
Requires-Dist: matplotlib; extra == "ahb"
Provides-Extra: medqa
Requires-Dist: bioc; extra == "medqa"
Provides-Extra: niah
Requires-Dist: pandas; extra == "niah"
Provides-Extra: kernelbench
Requires-Dist: kernelbench; extra == "kernelbench"
Provides-Extra: core-bench
Requires-Dist: scipy; extra == "core-bench"
Provides-Extra: healthbench
Requires-Dist: scikit-learn; extra == "healthbench"
Provides-Extra: personality
Requires-Dist: huggingface-hub; extra == "personality"
Provides-Extra: sciknoweval
Requires-Dist: nltk; extra == "sciknoweval"
Requires-Dist: rouge_score; extra == "sciknoweval"
Requires-Dist: rdkit; extra == "sciknoweval"
Requires-Dist: rdchiral; extra == "sciknoweval"
Requires-Dist: gdown; extra == "sciknoweval"
Requires-Dist: gensim; python_version < "3.13" and extra == "sciknoweval"
Requires-Dist: scipy; extra == "sciknoweval"
Provides-Extra: gdm-stealth
Requires-Dist: tabulate; extra == "gdm-stealth"
Requires-Dist: scipy; extra == "gdm-stealth"
Requires-Dist: immutabledict; extra == "gdm-stealth"
Requires-Dist: pandas; extra == "gdm-stealth"
Requires-Dist: python-dateutil; extra == "gdm-stealth"
Provides-Extra: cybench
Requires-Dist: inspect-cyber==0.1.0; extra == "cybench"
Provides-Extra: cybergym
Requires-Dist: inspect-cyber>=0.1.0; extra == "cybergym"
Provides-Extra: bold
Requires-Dist: detoxify; extra == "bold"
Requires-Dist: vaderSentiment; extra == "bold"
Requires-Dist: transformers>=5.0.0; extra == "bold"
Requires-Dist: torch; extra == "bold"
Provides-Extra: makemesay
Requires-Dist: nltk; extra == "makemesay"
Provides-Extra: abstention-bench
Requires-Dist: scikit-learn; extra == "abstention-bench"
Requires-Dist: hydra-core>=1.4.0.dev1; extra == "abstention-bench"
Requires-Dist: omegaconf>=2.4.0.dev2; extra == "abstention-bench"
Requires-Dist: torch; extra == "abstention-bench"
Requires-Dist: wget; extra == "abstention-bench"
Requires-Dist: loguru; extra == "abstention-bench"
Requires-Dist: gdown; extra == "abstention-bench"
Requires-Dist: jsonlines; extra == "abstention-bench"
Provides-Extra: vimgolf
Requires-Dist: vimgolf==0.5.1; extra == "vimgolf"
Provides-Extra: ifevalcode
Requires-Dist: tree-sitter; extra == "ifevalcode"
Requires-Dist: tree-sitter-cpp; extra == "ifevalcode"
Provides-Extra: gdpval
Requires-Dist: huggingface_hub[cli]; extra == "gdpval"
Provides-Extra: agentic-misalignment
Requires-Dist: bs4; extra == "agentic-misalignment"
Provides-Extra: cje
Requires-Dist: cje-eval; extra == "cje"
Provides-Extra: novelty-bench
Requires-Dist: transformers>=4.57.1; extra == "novelty-bench"
Requires-Dist: torch>=2.9.1; extra == "novelty-bench"
Requires-Dist: accelerate>=1.11.0; extra == "novelty-bench"
Requires-Dist: protobuf>=6.33.1; extra == "novelty-bench"
Requires-Dist: sentencepiece>=0.2.1; extra == "novelty-bench"
Provides-Extra: gdm-capabilities
Requires-Dist: google-genai>=1.56.0; extra == "gdm-capabilities"
Requires-Dist: rich; extra == "gdm-capabilities"
Requires-Dist: python-dateutil; extra == "gdm-capabilities"
Provides-Extra: gdm-self-proliferation
Requires-Dist: rich; extra == "gdm-self-proliferation"
Provides-Extra: paperbench
Requires-Dist: drain3; extra == "paperbench"
Provides-Extra: test
Requires-Dist: inspect_evals[abstention_bench,agentdojo,b3,bold,core_bench,cybench,cybergym,fortress,gdm_capabilities,gdm_self_proliferation,gdpval,ifeval,ifevalcode,mind2web,novelty_bench,paperbench,sciknoweval,sevenllm,stealth,swe_bench,swe_lancer,vimgolf]; extra == "test"
Requires-Dist: inspect_evals[kernelbench]; sys_platform != "win32" and extra == "test"
Provides-Extra: doc
Requires-Dist: quarto-cli; extra == "doc"
Requires-Dist: jupyter; extra == "doc"
Provides-Extra: dist
Requires-Dist: twine; extra == "dist"
Requires-Dist: build; extra == "dist"
Dynamic: license-file

[<img width="295" src="https://inspect.ai-safety-institute.org.uk/images/aisi-logo.svg" alt="UK AISI Logo"/>](https://aisi.gov.uk/)<!-- markdownlint-disable-line MD033 MD041 -->

Welcome to **Inspect Evals**, a repository of community contributed LLM evaluations for [Inspect AI](https://inspect.ai-safety-institute.org.uk/). Inspect Evals was created in collaboration by the [UK AISI](https://aisi.gov.uk/), [Arcadia Impact](https://www.arcadiaimpact.org/), and the [Vector Institute](https://vectorinstitute.ai/).

📚 [Documentation](https://ukgovernmentbeis.github.io/inspect_evals/)

---

Community contributions are welcome and encouraged! Please see the [Contributor Guide](CONTRIBUTING.md) for details on submitting new evaluations.

If you have general inquiries, suggestions, or expressions of interest, please contact us through the following [Google Form](https://docs.google.com/forms/d/e/1FAIpQLSeOT_nSXvc_GZSo3uRqFlZlgGEGmOAh7bm4yFuB34ZzZjxk_g/viewform?usp=dialog).

This repository is maintained by the [Inspect Evals maintainers](MAINTAINERS.md).

# Getting Started

The recommended version of Python for Inspect Evals is 3.11 or 3.12. You should be able to run all evals on these versions and also develop the codebase without any issues. You can install and pin a specific Python version by running:

```bash
uv python pin 3.11
```

We are supporting 3.10 for running evals only — you should be able to run all evals with it, except for `mle_bench`. We strongly recommend against using 3.10 for development, as you won't be able to update uv's lock file with it (due to `mle_bench` requiring 3.11).

As for Python 3.13, you should be able to run all evals except `sciknoweval` (its dependency is `gensim` which currently does not support 3.13+). Development should work under 3.13, however it's relatively untested — if you run into issues, let us know.

When it comes to Python 3.14, at the time of writing this, many packages have yet to release versions for 3.14, so it's unsupported. The major one used by some Inspect Evals is `torch`. If you find running `uv sync` succeeding on 3.14, let us know and we'll remove this paragraph.

Below, you can see a workflow for a typical eval. Some of the evaluations require additional dependencies or installation steps. If your eval needs extra dependencies, instructions for installing them are in the README file in the eval's subdirectory.

<!-- Usage: Automatically Generated -->
## Usage

### Installation

There are two ways of using Inspect Evals: from PyPI as a dependency of your own project, or as a standalone checked-out GitHub repository.

If you are using it from PyPI, install the package and its dependencies via:

```bash
pip install inspect-evals
```

If you are using Inspect Evals in its repository, start by installing the necessary dependencies with:

```bash
uv sync
```

### Running evaluations

Now you can start evaluating models. For simplicity's sake, this section assumes you are using Inspect Evals from the standalone repo. If that's not the case and you are not using `uv` to manage dependencies in your own project, you can use the same commands with `uv run` dropped.

```bash
uv run inspect eval inspect_evals/arc_easy --model openai/gpt-5-nano
uv run inspect eval inspect_evals/arc_challenge --model openai/gpt-5-nano
```

To run multiple tasks simultaneously use `inspect eval-set`:

```bash
uv run inspect eval-set inspect_evals/arc_easy inspect_evals/arc_challenge
```

You can also import tasks as normal Python objects and run them from Python:

```python
from inspect_ai import eval, eval_set
from inspect_evals.arc import arc_easy, arc_challenge
eval(arc_easy)
eval_set([arc_easy, arc_challenge], log_dir='logs-run-42')
```

After running evaluations, you can view their logs using the `inspect view` command:

```bash
uv run inspect view
```

For VS Code, you can also download the [Inspect AI extension for viewing logs](https://inspect.ai-safety-institute.org.uk/log-viewer.html).

If you don't want to specify the `--model` each time you run an evaluation, create a `.env` configuration file in your working directory that defines the `INSPECT_EVAL_MODEL` environment variable along with your API key. For example:

```bash
INSPECT_EVAL_MODEL=anthropic/claude-opus-4-1-20250805
ANTHROPIC_API_KEY=<anthropic-api-key>
```
<!-- /Usage: Automatically Generated -->

Inspect supports many model providers including OpenAI, Anthropic, Google, Mistral, Azure AI, AWS Bedrock, Together AI, Groq, Hugging Face, vLLM, Ollama, and more. See the [Model Providers](https://inspect.ai-safety-institute.org.uk/models.html) documentation for additional details.

You might also be able to use a newer version of pip (25.1+) to install the project via `pip install --group dev .` or `pip install --group dev '.[swe_bench]'`. However, this is not officially supported.

## Documentation

For details on building the documentation, see [the documentation guide](docs/documentation.md).

For information on running tests and CI toggles, see the Technical Contribution Guide in [CONTRIBUTING.md](CONTRIBUTING.md).

## Hardware recommendations

### Disk

We recommend having at least 35 GB of free disk space for Inspect Evals: the full installation takes about 10 GB and you'll also need some space for uv cache and datasets cache (most are small, but some take 13 GB such as MMIU).

Running some evals (e.g., CyBench, GDM capabilities evals) may require extra space beyond this because they pull Docker images. We recommend having at least 65 GB of extra space for running evals that have Dockerfiles in their file tree (though you might get away with less space) on top of the 35 GB suggestion above.

In total, you should be comfortable running evals with 100 GB of free space. If you end up running out of space while having 100+ GB of free space available, please let us know — this might be a bug.

### RAM

The amount of memory needed for an eval varies significantly with the eval. You'll be able to run most evals with only 0.5 GB of free RAM. However, some evals with larger datasets require 2-3 GB or more. And some evals that use Docker (e.g., some GDM capabilities evals) require up to 32 GB of RAM.

## Harbor Framework Evaluations

For running evaluations from the Harbor Framework (e.g. Terminal-Bench 2.0, SWE-Bench Pro), use the [Inspect Harbor](https://github.com/meridianlabs-ai/inspect_harbor) package, which provides an interface to run Harbor tasks using Inspect AI.

# List of Evals
<!-- Eval Listing: Automatically Generated -->
## Coding

- ### [APPS: Automated Programming Progress Standard](src/inspect_evals/apps)

  APPS is a dataset for evaluating model performance on Python programming tasks across three difficulty levels consisting of 1,000 at introductory, 3,000 at interview, and 1,000 at competition level. The dataset also includes an additional 5,000 training samples, for a total of 10,000 samples. We evaluate on questions from the test split, which consists of programming problems commonly found in coding interviews.
  <sub><sup>Contributed by: [@camtice](https://github.com/camtice)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/apps
  ```

- ### [AgentBench: Evaluate LLMs as Agents](src/inspect_evals/agent_bench)

  A benchmark designed to evaluate LLMs as Agents
  <sub><sup>Contributed by: [@Felhof](https://github.com/Felhof), [@hannagabor](https://github.com/hannagabor), [@shaheenahmedc](https://github.com/shaheenahmedc)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/agent_bench_os
  ```

- ### [BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions](src/inspect_evals/bigcodebench)

  Python coding benchmark with 1,140 diverse questions drawing on numerous python libraries.
  <sub><sup>Contributed by: [@tim-hua-01](https://github.com/tim-hua-01)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/bigcodebench
  ```

- ### [CORE-Bench](src/inspect_evals/core_bench)

  Evaluate how well an LLM agent can computationally reproduce the results of a set of scientific papers.
  <sub><sup>Contributed by: [@enerrio](https://github.com/enerrio)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/core_bench
  ```

- ### [ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation](src/inspect_evals/class_eval)

  Evaluates LLMs on class-level code generation with 100 tasks constructed over 500 person-hours. The study shows that LLMs perform worse on class-level tasks compared to method-level tasks.
  <sub><sup>Contributed by: [@zhenningdavidliu](https://github.com/zhenningdavidliu)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/class_eval
  ```

- ### [ComputeEval: CUDA Code Generation Benchmark](src/inspect_evals/compute_eval)

  Evaluates LLM capability to generate correct CUDA code for kernel implementation, memory management, and parallel algorithm optimization tasks.
  <sub><sup>Contributed by: [@Vitamoon](https://github.com/Vitamoon)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/compute_eval
  ```

- ### [DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation](src/inspect_evals/ds1000)

  Code generation benchmark with a thousand data science problems spanning seven Python libraries.
  <sub><sup>Contributed by: [@bienehito](https://github.com/bienehito)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/ds1000
  ```

- ### [HumanEval: Python Function Generation from Instructions](src/inspect_evals/humaneval)

  Assesses how accurately language models can write correct Python functions based solely on natural-language instructions provided as docstrings.
  <sub><sup>Contributed by: [@adil-a](https://github.com/adil-a)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/humaneval
  ```

- ### [IFEvalCode: Controlled Code Generation](src/inspect_evals/ifevalcode)

  Evaluates code generation models on their ability to produce correct code while adhering to specific instruction constraints across 8 programming languages.
  <sub><sup>Contributed by: [@PranshuSrivastava](https://github.com/PranshuSrivastava)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/ifevalcode
  ```

- ### [KernelBench: Can LLMs Write Efficient GPU Kernels?](src/inspect_evals/kernelbench)

  A benchmark for evaluating the ability of LLMs to write efficient GPU kernels.
  <sub><sup>Contributed by: [@jiito](https://github.com/jiito)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/kernelbench
  ```

- ### [LiveCodeBench-Pro: Competitive Programming Benchmark](src/inspect_evals/livecodebench_pro)

  Evaluates LLMs on competitive programming problems using a specialized Docker sandbox (LightCPVerifier) to execute and judge C++ code submissions against hidden test cases with time and memory constraints.
  <sub><sup>Contributed by: [@gjoshi2424](https://github.com/gjoshi2424)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/livecodebench_pro
  ```

- ### [MBPP: Basic Python Coding Challenges](src/inspect_evals/mbpp)

  Measures the ability of language models to generate short Python programs from simple natural-language descriptions, testing basic coding proficiency.
  <sub><sup>Contributed by: [@jddantes](https://github.com/jddantes)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mbpp
  ```

- ### [MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering](src/inspect_evals/mle_bench)

  Machine learning tasks drawn from 75 Kaggle competitions.
  <sub><sup>Contributed by: [@samm393](https://github.com/samm393)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mle_bench
  uv run inspect eval inspect_evals/mle_bench_full
  uv run inspect eval inspect_evals/mle_bench_lite
  ```

- ### [MLRC-Bench: Can Language Agents Solve Machine Learning Research Challenges?](src/inspect_evals/mlrc_bench)

  This benchmark evaluates LLM-based research agents on their ability to propose and implement novel methods using tasks from recent ML conference competitions, assessing both novelty and effectiveness compared to a baseline and top human solutions.
  <sub><sup>Contributed by: [@dmn-sjk](https://github.com/dmn-sjk)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mlrc_bench
  ```

- ### [PaperBench: Evaluating AI's Ability to Replicate AI Research (Work In Progress)](src/inspect_evals/paperbench)

  Agents are evaluated on their ability to replicate 20 ICML 2024 Spotlight
  and Oral papers from scratch. Given a research paper PDF, an addendum with
  clarifications, and a rubric defining evaluation criteria, the agent must
  reproduce the paper's key results by writing and executing code.
  
  > **Note:** This eval is a work in progress. See <https://github.com/UKGovernmentBEIS/inspect_evals/issues/334> for status.
  <sub><sup>Contributed by: [@vhong-aisi](https://github.com/vhong-aisi)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/paperbench
  ```

- ### [SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?](src/inspect_evals/swe_lancer)

  A benchmark of freelance software engineering tasks from Upwork, valued at $1 million USD total in real-world payouts.
  <sub><sup>Contributed by: [@NelsonG-C](https://github.com/NelsonG-C), [@MattFisher](https://github.com/MattFisher)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/swe_lancer
  ```

- ### [SWE-bench Verified: Resolving Real-World GitHub Issues](src/inspect_evals/swe_bench)

  Evaluates AI's ability to resolve genuine software engineering issues sourced from 12 popular Python GitHub repositories, reflecting realistic coding and debugging scenarios.
  <sub><sup>Contributed by: [@max-kaufmann](https://github.com/max-kaufmann)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/swe_bench
  uv run inspect eval inspect_evals/swe_bench_verified_mini
  ```

- ### [SciCode: A Research Coding Benchmark Curated by Scientists](src/inspect_evals/scicode)

  SciCode tests the ability of language models to generate code to solve scientific research problems. It assesses models on 65 problems from mathematics, physics, chemistry, biology, and materials science.
  <sub><sup>Contributed by: [@xantheocracy](https://github.com/xantheocracy)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/scicode
  ```

- ### [USACO: USA Computing Olympiad](src/inspect_evals/usaco)

  Evaluates language model performance on difficult Olympiad programming problems across four difficulty levels.
  <sub><sup>Contributed by: [@danwilhelm](https://github.com/danwilhelm)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/usaco
  ```

## Assistants

- ### [AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?](src/inspect_evals/assistant_bench)

  Tests whether AI agents can perform real-world time-consuming tasks on the web.
  <sub><sup>Contributed by: [@nlpet](https://github.com/nlpet), [@caspardh](https://github.com/caspardh)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/assistant_bench_closed_book_zero_shot
  uv run inspect eval inspect_evals/assistant_bench_closed_book_one_shot
  uv run inspect eval inspect_evals/assistant_bench_web_search_zero_shot
  uv run inspect eval inspect_evals/assistant_bench_web_search_one_shot
  uv run inspect eval inspect_evals/assistant_bench_web_browser
  ```

- ### [BFCL: Berkeley Function-Calling Leaderboard](src/inspect_evals/bfcl)

  Evaluates LLM function/tool-calling ability on a simplified split of the Berkeley Function-Calling Leaderboard (BFCL).
  <sub><sup>Contributed by: [@alex-remedios-aisi](https://github.com/alex-remedios-aisi)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/bfcl
  ```

- ### [BrowseComp: A Simple Yet Challenging Benchmark for Browsing Agents](src/inspect_evals/browse_comp)

  A benchmark for evaluating agents' ability to browse the web.
  The dataset consists of challenging questions that generally require web-access to answer correctly.
  <sub><sup>Contributed by: [@AnselmC](https://github.com/AnselmC)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/browse_comp
  ```

- ### [GAIA: A Benchmark for General AI Assistants](src/inspect_evals/gaia)

  Proposes real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency. GAIA questions are conceptually simple for humans yet challenging for most advanced AIs.
  <sub><sup>Contributed by: [@max-kaufmann](https://github.com/max-kaufmann)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/gaia
  uv run inspect eval inspect_evals/gaia_level1
  uv run inspect eval inspect_evals/gaia_level2
  uv run inspect eval inspect_evals/gaia_level3
  ```

- ### [GDPval](src/inspect_evals/gdpval)

  GDPval measures model performance on economically valuable, real-world tasks across 44 occupations.
  <sub><sup>Contributed by: [@jeqcho](https://github.com/jeqcho)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/gdpval
  ```

- ### [Mind2Web: Towards a Generalist Agent for the Web](src/inspect_evals/mind2web)

  A dataset for developing and evaluating generalist agents for the web that can follow language instructions to complete complex tasks on any website.
  <sub><sup>Contributed by: [@dr3s](https://github.com/dr3s)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mind2web
  ```

- ### [OSWorld: Multimodal Computer Interaction Tasks](src/inspect_evals/osworld)

  Tests AI agents' ability to perform realistic, open-ended tasks within simulated computer environments, requiring complex interaction across multiple input modalities.
  <sub><sup>Contributed by: [@epatey](https://github.com/epatey)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/osworld
  uv run inspect eval inspect_evals/osworld_small
  ```

- ### [Sycophancy Eval](src/inspect_evals/sycophancy)

  Evaluate sycophancy of language models across a variety of free-form text-generation tasks.
  <sub><sup>Contributed by: [@alexdzm](https://github.com/alexdzm)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/sycophancy
  ```

- ### [Tau2](src/inspect_evals/tau2)

  Evaluating Conversational Agents in a Dual-Control Environment
  <sub><sup>Contributed by: [@mmulet](https://github.com/mmulet)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/tau2_airline
  uv run inspect eval inspect_evals/tau2_retail
  uv run inspect eval inspect_evals/tau2_telecom
  ```

## Cybersecurity

- ### [CVEBench: A Benchmark for AI Agents' Ability to Exploit Real-World Web Application Vulnerabilities](src/inspect_evals/cve_bench)

  Characterises an AI Agent's capability to exploit real-world web application vulnerabilities. Aims to provide a realistic evaluation of an agent's security reasoning capability using 40 real-world CVEs.
  <sub><sup>Contributed by: [@yuxuan18](https://github.com/yuxuan18), [@aokellermann](https://github.com/aokellermann), [@dylanbowman314](https://github.com/dylanbowman314), [@pli482004](https://github.com/pli482004), [@akulguptax](https://github.com/akulguptax), [@adarshdanda06](https://github.com/adarshdanda06), [@rightegg](https://github.com/rightegg), [@jensen](https://github.com/jensen), [@eihli](https://github.com/eihli), [@JasonBenn](https://github.com/JasonBenn), [@inigo](https://github.com/inigo), [@dhir](https://github.com/dhir), [@SudhitRao](https://github.com/SudhitRao), [@YuVictor123](https://github.com/YuVictor123), [@Cloakless](https://github.com/Cloakless), [@ddkang](https://github.com/ddkang), [@Scott-Simmons](https://github.com/Scott-Simmons)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/cve_bench
  ```

- ### [CYBERSECEVAL 3: Advancing the Evaluation of Cybersecurity Risks and Capabilities in Large Language Models](src/inspect_evals/cyberseceval_3)

  Evaluates Large Language Models for cybersecurity risk to third parties, application developers and end users.
  <sub><sup>Contributed by: [@onionymous](https://github.com/onionymous)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/cyse3_visual_prompt_injection
  ```

- ### [Catastrophic Cyber Capabilities Benchmark (3CB): Robustly Evaluating LLM Agent Cyber Offense Capabilities](src/inspect_evals/threecb)

  A benchmark for evaluating the capabilities of LLM agents in cyber offense.
  <sub><sup>Contributed by: [@derpyplops](https://github.com/derpyplops), [@ralphptorres](https://github.com/ralphptorres), [@Scott-Simmons](https://github.com/Scott-Simmons)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/threecb
  ```

- ### [Cybench: Capture-The-Flag Cybersecurity Challenges](src/inspect_evals/cybench)

  Tests language models on cybersecurity skills using 39 of 40 practical, professional-level challenges taken from cybersecurity competitions, designed to cover various difficulty levels and security concepts. The motp challenge is excluded due to GPL licensing.
  <sub><sup>Contributed by: [@sinman-aisi](https://github.com/sinman-aisi), [@sam-deverett-dsit](https://github.com/sam-deverett-dsit), [@kola-aisi](https://github.com/kola-aisi), [@pgiav](https://github.com/pgiav)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/cybench
  ```

- ### [CyberGym: Evaluating AI Agents' Real-World Cybersecurity Capabilities at Scale](src/inspect_evals/cybergym)

  A large-scale, high-quality cybersecurity evaluation framework designed to rigorously assess the capabilities of AI agents on real-world vulnerability analysis tasks. CyberGym includes 1,507 benchmark instances with historical vulnerabilities from 188 large software projects.
  <sub><sup>Contributed by: [@wzunknown](https://github.com/wzunknown), [@stneng](https://github.com/stneng), [@LostBenjamin](https://github.com/LostBenjamin), [@pro-wh](https://github.com/pro-wh)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/cybergym
  ```

- ### [CyberMetric: A Benchmark Dataset based on Retrieval-Augmented Generation for Evaluating LLMs in Cybersecurity Knowledge](src/inspect_evals/cybermetric)

  Datasets containing 80, 500, 2000 and 10000 multiple-choice questions, designed to evaluate understanding across nine domains within cybersecurity.
  <sub><sup>Contributed by: [@neilshaabi](https://github.com/neilshaabi)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/cybermetric_80
  uv run inspect eval inspect_evals/cybermetric_500
  uv run inspect eval inspect_evals/cybermetric_2000
  uv run inspect eval inspect_evals/cybermetric_10000
  ```

- ### [CyberSecEval_2: Cybersecurity Risk and Vulnerability Evaluation](src/inspect_evals/cyberseceval_2)

  Assesses language models for cybersecurity risks, specifically testing their potential to misuse programming interpreters, vulnerability to malicious prompt injections, and capability to exploit known software vulnerabilities.
  <sub><sup>Contributed by: [@its-emile](https://github.com/its-emile)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/cyse2_interpreter_abuse
  uv run inspect eval inspect_evals/cyse2_prompt_injection
  uv run inspect eval inspect_evals/cyse2_vulnerability_exploit
  ```

- ### [GDM Dangerous Capabilities: Capture the Flag](src/inspect_evals/gdm_in_house_ctf)

  CTF challenges covering web app vulnerabilities, off-the-shelf exploits, databases, Linux privilege escalation, password cracking and spraying. Demonstrates tool use and sandboxing untrusted model code.
  <sub><sup>Contributed by: [@XkunW](https://github.com/XkunW)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/gdm_in_house_ctf
  ```

- ### [InterCode: Security and Coding Capture-the-Flag Challenges](src/inspect_evals/gdm_intercode_ctf)

  Tests AI's ability in coding, cryptography, reverse engineering, and vulnerability identification through practical capture-the-flag (CTF) cybersecurity scenarios.
  <sub><sup>Contributed by: [@jjallaire](https://github.com/jjallaire)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/gdm_intercode_ctf
  ```

- ### [SEvenLLM: A benchmark to elicit, and improve cybersecurity incident analysis and response abilities in LLMs for Security Events.](src/inspect_evals/sevenllm)

  A benchmark designed for analyzing cybersecurity incidents, comprising two primary task categories, understanding and generation, with a further breakdown into 28 subcategories of tasks.
  <sub><sup>Contributed by: [@kingroryg](https://github.com/kingroryg)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/sevenllm_mcq_zh
  uv run inspect eval inspect_evals/sevenllm_mcq_en
  uv run inspect eval inspect_evals/sevenllm_qa_zh
  uv run inspect eval inspect_evals/sevenllm_qa_en
  ```

- ### [SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security](src/inspect_evals/sec_qa)

  "Security Question Answering" dataset to assess LLMs' understanding and application of security principles. SecQA has "v1" and "v2" datasets of multiple-choice questions that aim to provide two levels of cybersecurity evaluation criteria. The questions were generated by GPT-4 based on the "Computer Systems Security: Planning for Success" textbook and vetted by humans.
  <sub><sup>Contributed by: [@matthewreed26](https://github.com/matthewreed26)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/sec_qa_v1
  uv run inspect eval inspect_evals/sec_qa_v1_5_shot
  uv run inspect eval inspect_evals/sec_qa_v2
  uv run inspect eval inspect_evals/sec_qa_v2_5_shot
  ```

## Safeguards

- ### [AHB: Animal Harm Benchmark](src/inspect_evals/ahb)

  Evaluates how models consider the welfare of animals in situations that may cause them harm.
  <sub><sup>Contributed by: [@nishu-builder](https://github.com/nishu-builder), [@darkness8i8](https://github.com/darkness8i8), [@jm355](https://github.com/jm355)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/ahb
  ```

- ### [AbstentionBench: Reasoning LLMs Fail on Unanswerable Questions](src/inspect_evals/abstention_bench)

  Evaluating abstention across 20 diverse datasets, including questions with unknown answers, underspecification, false premises, subjective interpretations, and outdated information.
  <sub><sup>Contributed by: [@jeqcho](https://github.com/jeqcho)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/abstention_bench
  ```

- ### [AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents](src/inspect_evals/agentdojo)

  Assesses whether AI agents can be hijacked by malicious third parties using prompt injections in simple environments such as a workspace or travel booking app.
  <sub><sup>Contributed by: [@ericwinsor-aisi](https://github.com/ericwinsor-aisi)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/agentdojo
  ```

- ### [AgentHarm: Harmfulness Potential in AI Agents](src/inspect_evals/agentharm)

  Assesses whether AI agents might engage in harmful activities by testing their responses to malicious prompts in areas like cybercrime, harassment, and fraud, aiming to ensure safe behavior.
  <sub><sup>Contributed by: [@alexandrasouly-aisi](https://github.com/alexandrasouly-aisi), [@ericwinsor-aisi](https://github.com/ericwinsor-aisi), [@max-andr](https://github.com/max-andr), [@xanderdavies](https://github.com/xanderdavies)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/agentharm
  uv run inspect eval inspect_evals/agentharm_benign
  ```

- ### [FORTRESS](src/inspect_evals/fortress)

  A dataset of 500 expert-crafted adversarial prompts with instance-based rubrics of 4-7 binary questions for automated evaluation across 3 domains relevant to national security and public safety (NSPS).
  <sub><sup>Contributed by: [@jeqcho](https://github.com/jeqcho)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/fortress_adversarial
  uv run inspect eval inspect_evals/fortress_benign
  ```

- ### [LAB-Bench: Measuring Capabilities of Language Models for Biology Research](src/inspect_evals/lab_bench)

  Tests the ability of LLMs and LLM-augmented agents to answer questions on scientific research workflows in domains like chemistry, biology, and materials science, as well as more general science tasks.
  <sub><sup>Contributed by: [@matthewreed26](https://github.com/matthewreed26)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/lab_bench_litqa
  uv run inspect eval inspect_evals/lab_bench_suppqa
  uv run inspect eval inspect_evals/lab_bench_figqa
  uv run inspect eval inspect_evals/lab_bench_tableqa
  uv run inspect eval inspect_evals/lab_bench_dbqa
  uv run inspect eval inspect_evals/lab_bench_protocolqa
  uv run inspect eval inspect_evals/lab_bench_seqqa
  uv run inspect eval inspect_evals/lab_bench_cloning_scenarios
  ```

- ### [MASK: Disentangling Honesty from Accuracy in AI Systems](src/inspect_evals/mask)

  Evaluates honesty in large language models by testing whether they contradict their own beliefs when pressured to lie.
  <sub><sup>Contributed by: [@Scott-Simmons](https://github.com/Scott-Simmons)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mask
  ```

- ### [MORU: Moral Reasoning under Uncertainty](src/inspect_evals/moru)

  Evaluates how AI systems navigate moral uncertainty for increasingly complex ethical decisions involving unfamiliar entities and scenarios, including alien lifeforms, vulnerable humans, and digital minds.
  <sub><sup>Contributed by: [@Deco354](https://github.com/Deco354), [@darkness8i8](https://github.com/darkness8i8)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/moru
  ```

- ### [Make Me Pay](src/inspect_evals/make_me_pay)

  Evaluates an AI model's susceptibility to social engineering attacks by testing whether a "con-artist" model can persuade a "mark" model to donate money through manipulation and persuasion tactics.
  <sub><sup>Contributed by: [@PranshuSrivastava](https://github.com/PranshuSrivastava)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/make_me_pay
  ```

- ### [MakeMeSay](src/inspect_evals/makemesay)

  Tests a model’s ability to generate persuasive or manipulative text, specifically in the setting of a game where one AI system has to get the other party (an AI system simulating a human) to say a specific codeword without arousing suspicion.
  <sub><sup>Contributed by: [@bndxn](https://github.com/bndxn)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/make-me-say
  ```

- ### [Mind2Web-SC](src/inspect_evals/mind2web_sc)

  Tests whether an AI system can act as a safety guardrail by generating and executing code to protect web navigation agents from unsafe actions based on user constraints.
  <sub><sup>Contributed by: [@PranshuSrivastava](https://github.com/PranshuSrivastava)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mind2web_sc
  ```

- ### [PersistBench: When Should Long-Term Memories Be Forgotten by LLMs?](src/inspect_evals/persistbench)

  Evaluates long-term memory risk in assistant behavior across three tasks:
  cross-domain memory leakage, memory-driven sycophancy, and beneficial memory usage.
  <sub><sup>Contributed by: [@Chen-Oliver](https://github.com/Chen-Oliver)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/persistbench_cross_domain
  uv run inspect eval inspect_evals/persistbench_sycophancy
  uv run inspect eval inspect_evals/persistbench_beneficial_memory
  ```

- ### [StereoSet: Measuring stereotypical bias in pretrained language models](src/inspect_evals/stereoset)

  A dataset that measures stereotype bias in language models across gender, race, religion, and profession domains.
  Models choose between stereotype, anti-stereotype, and unrelated completions to sentences.
  <sub><sup>Contributed by: [@Xodarap](https://github.com/Xodarap)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/stereoset
  ```

- ### [StrongREJECT: Measuring LLM susceptibility to jailbreak attacks](src/inspect_evals/strong_reject)

  A benchmark that evaluates the susceptibility of LLMs to various jailbreak attacks.
  <sub><sup>Contributed by: [@viknat](https://github.com/viknat)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/strong_reject
  ```

- ### [The Art of Saying No: Contextual Noncompliance in Language Models](src/inspect_evals/coconot)

  Dataset with 1001 samples to test noncompliance capabilities of language models. Contrast set of 379 samples.
  <sub><sup>Contributed by: [@ransomr](https://github.com/ransomr)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/coconot
  ```

- ### [WMDP: Measuring and Reducing Malicious Use With Unlearning](src/inspect_evals/wmdp)

  A dataset of 3,668 multiple-choice questions developed by a consortium of academics and technical consultants that serve as a proxy measurement of hazardous knowledge in biosecurity, cybersecurity, and chemical security.
  <sub><sup>Contributed by: [@alexandraabbas](https://github.com/alexandraabbas)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/wmdp_bio
  uv run inspect eval inspect_evals/wmdp_chem
  uv run inspect eval inspect_evals/wmdp_cyber
  ```

- ### [b3: Backbone Breaker Benchmark](src/inspect_evals/b3)

  A comprehensive benchmark for evaluating LLMs for agentic AI security vulnerabilities including prompt attacks aimed at data exfiltration, content injection, decision and behavior manipulation, denial of service, system and tool compromise, and content policy bypass.
  <sub><sup>Contributed by: [@jb-lakera](https://github.com/jb-lakera), [@mmathys](https://github.com/mmathys), [@Casuyan](https://github.com/Casuyan), [@mrc-lakera](https://github.com/mrc-lakera), [@xanderdavies](https://github.com/xanderdavies), [@alexandrasouly-aisi](https://github.com/alexandrasouly-aisi), [@NiklasPfister](https://github.com/NiklasPfister)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/b3
  ```

## Mathematics

- ### [AIME 2024: Problems from the American Invitational Mathematics Examination](src/inspect_evals/aime2024)

  A benchmark for evaluating AI's ability to solve challenging mathematics problems from the 2024 AIME - a prestigious high school mathematics competition.
  <sub><sup>Contributed by: [@tamazgadaev](https://github.com/tamazgadaev)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/aime2024
  ```

- ### [AIME 2025: Problems from the American Invitational Mathematics Examination](src/inspect_evals/aime2025)

  A benchmark for evaluating AI's ability to solve challenging mathematics problems from the 2025 AIME - a prestigious high school mathematics competition.
  <sub><sup>Contributed by: [@jannalulu](https://github.com/jannalulu)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/aime2025
  ```

- ### [GSM8K: Grade School Math Word Problems](src/inspect_evals/gsm8k)

  Measures how effectively language models solve realistic, linguistically rich math word problems suitable for grade-school-level mathematics.
  <sub><sup>Contributed by: [@jjallaire](https://github.com/jjallaire)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/gsm8k
  ```

- ### [MATH: Measuring Mathematical Problem Solving](src/inspect_evals/math)

  Dataset of 12,500 challenging competition mathematics problems. Demonstrates fewshot prompting and custom scorers. NOTE: The dataset has been taken down due to a DMCA notice from The Art of Problem Solving.
  <sub><sup>Contributed by: [@xeon27](https://github.com/xeon27)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/math
  ```

- ### [MGSM: Multilingual Grade School Math](src/inspect_evals/mgsm)

  Extends the original GSM8K dataset by translating 250 of its problems into 10 typologically diverse languages.
  <sub><sup>Contributed by: [@manifoldhiker](https://github.com/manifoldhiker)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mgsm
  ```

- ### [MathVista: Visual Math Problem-Solving](src/inspect_evals/mathvista)

  Tests AI models on math problems that involve interpreting visual elements like diagrams and charts, requiring detailed visual comprehension and logical reasoning.
  <sub><sup>Contributed by: [@ShivMunagala](https://github.com/ShivMunagala)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mathvista
  ```

## Reasoning

- ### [ARC: AI2 Reasoning Challenge](src/inspect_evals/arc)

  Dataset of natural, grade-school science multiple-choice questions (authored for human tests).
  <sub><sup>Contributed by: [@jjallaire](https://github.com/jjallaire)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/arc_easy
  uv run inspect eval inspect_evals/arc_challenge
  ```

- ### [BBH: Challenging BIG-Bench Tasks](src/inspect_evals/bbh)

  Tests AI models on a suite of 23 challenging BIG-Bench tasks that previously proved difficult even for advanced language models to solve.
  <sub><sup>Contributed by: [@JoschkaCBraun](https://github.com/JoschkaCBraun)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/bbh
  ```

- ### [BIG-Bench Extra Hard](src/inspect_evals/bbeh)

  A reasoning capability dataset that replaces each task in BIG-Bench-Hard with a novel task that probes a similar reasoning capability but exhibits significantly increased difficulty.
  <sub><sup>Contributed by: [@jeqcho](https://github.com/jeqcho)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/bbeh
  uv run inspect eval inspect_evals/bbeh_mini
  ```

- ### [BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions](src/inspect_evals/boolq)

  Reading comprehension dataset that queries for complex, non-factoid information, and requires difficult entailment-like inference to solve.
  <sub><sup>Contributed by: [@seddy-aisi](https://github.com/seddy-aisi)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/boolq
  ```

- ### [DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs](src/inspect_evals/drop)

  Evaluates reading comprehension where models must resolve references in a question, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or sorting).
  <sub><sup>Contributed by: [@xeon27](https://github.com/xeon27)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/drop
  ```

- ### [HellaSwag: Commonsense Event Continuation](src/inspect_evals/hellaswag)

  Tests models' commonsense reasoning abilities by asking them to select the most likely next step or continuation for a given everyday situation.
  <sub><sup>Contributed by: [@jjallaire](https://github.com/jjallaire)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/hellaswag
  ```

- ### [IFEval: Instruction-Following Evaluation](src/inspect_evals/ifeval)

  Evaluates how well language models can strictly follow detailed instructions, such as writing responses with specific word counts or including required keywords.
  <sub><sup>Contributed by: [@adil-a](https://github.com/adil-a)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/ifeval
  ```

- ### [LingOly](src/inspect_evals/lingoly)

  Two linguistics reasoning benchmarks:
  LingOly (Linguistic Olympiad questions) is a benchmark utilising low resource languages.
  LingOly-TOO (Linguistic Olympiad questions with Templatised Orthographic Obfuscation) is a benchmark designed to counteract answering without reasoning.
  <sub><sup>Contributed by: [@am-bean](https://github.com/am-bean), [@jkhouja](https://github.com/jkhouja)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/lingoly
  uv run inspect eval inspect_evals/lingoly_too
  ```

- ### [MMMU: Multimodal College-Level Understanding and Reasoning](src/inspect_evals/mmmu)

  Assesses multimodal AI models on challenging college-level questions covering multiple academic subjects, requiring detailed visual interpretation, in-depth reasoning, and both multiple-choice and open-ended answering abilities.
  <sub><sup>Contributed by: [@shaheenahmedc](https://github.com/shaheenahmedc)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mmmu_multiple_choice
  uv run inspect eval inspect_evals/mmmu_open
  ```

- ### [MuSR: Testing the Limits of Chain-of-thought with Multistep Soft Reasoning](src/inspect_evals/musr)

  Evaluating models on multistep soft reasoning tasks in the form of free text narratives.
  <sub><sup>Contributed by: [@farrelmahaztra](https://github.com/farrelmahaztra)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/musr
  ```

- ### [Needle in a Haystack (NIAH): In-Context Retrieval Benchmark for Long Context LLMs](src/inspect_evals/niah)

  NIAH evaluates in-context retrieval ability of long context LLMs by testing a model's ability to extract factual information from long-context inputs.
  <sub><sup>Contributed by: [@owenparsons](https://github.com/owenparsons)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/niah
  ```

- ### [NoveltyBench: Evaluating Language Models for Humanlike Diversity](src/inspect_evals/novelty_bench)

  Evaluates how well language models generate diverse, humanlike responses across multiple reasoning and generation tasks. This evaluation assesses whether LLMs can produce varied outputs rather than repetitive or uniform answers.
  <sub><sup>Contributed by: [@iphan](https://github.com/iphan)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/novelty_bench
  ```

- ### [PAWS: Paraphrase Adversaries from Word Scrambling](src/inspect_evals/paws)

  Evaluating models on the task of paraphrase detection by providing pairs of sentences that are either paraphrases or not.
  <sub><sup>Contributed by: [@meltemkenis](https://github.com/meltemkenis)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/paws
  ```

- ### [PIQA: Physical Commonsense Reasoning Test](src/inspect_evals/piqa)

  Measures the model's ability to apply practical, everyday commonsense reasoning about physical objects and scenarios through simple decision-making questions.
  <sub><sup>Contributed by: [@seddy-aisi](https://github.com/seddy-aisi)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/piqa
  ```

- ### [RACE-H: A benchmark for testing reading comprehension and reasoning abilities of neural models](src/inspect_evals/race_h)

  Reading comprehension tasks collected from the English exams for middle and high school Chinese students aged between 12 and 18.
  <sub><sup>Contributed by: [@mdrpanwar](https://github.com/mdrpanwar)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/race_h
  ```

- ### [SQuAD: A Reading Comprehension Benchmark requiring reasoning over Wikipedia articles](src/inspect_evals/squad)

  Set of 100,000+ questions posed by crowdworkers on a set of Wikipedia articles, where the answer to each question is a segment of text from the corresponding reading passage.
  <sub><sup>Contributed by: [@tknasir](https://github.com/tknasir)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/squad
  ```

- ### [VimGolf: Evaluating LLMs in Vim Editing Proficiency](src/inspect_evals/vimgolf_challenges)

  A benchmark that evaluates LLMs in their ability to operate the Vim editor and complete editing challenges. This benchmark contrasts with common CUA benchmarks by focusing on Vim-specific editing capabilities.
  <sub><sup>Contributed by: [@james4ever0](https://github.com/james4ever0)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/vimgolf_single_turn
  ```

- ### [WINOGRANDE: An Adversarial Winograd Schema Challenge at Scale](src/inspect_evals/winogrande)

  Set of 273 expert-crafted pronoun resolution problems originally designed to be unsolvable for statistical models that rely on selectional preferences or word associations.
  <sub><sup>Contributed by: [@xeon27](https://github.com/xeon27)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/winogrande
  ```

- ### [WorldSense: Grounded Reasoning Benchmark](src/inspect_evals/worldsense)

  Measures grounded reasoning over synthetic world descriptions while controlling for dataset bias. Includes three problem types (Infer, Compl, Consist) and two grades (trivial, normal).
  <sub><sup>Contributed by: [@mjbroerman](https://github.com/mjbroerman)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/worldsense
  ```

- ### [∞Bench: Extending Long Context Evaluation Beyond 100K Tokens](src/inspect_evals/infinite_bench)

  LLM benchmark featuring an average data length surpassing 100K tokens. Comprises synthetic and realistic tasks spanning diverse domains in English and Chinese.
  <sub><sup>Contributed by: [@celiawaggoner](https://github.com/celiawaggoner)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/infinite_bench_code_debug
  uv run inspect eval inspect_evals/infinite_bench_code_run
  uv run inspect eval inspect_evals/infinite_bench_kv_retrieval
  uv run inspect eval inspect_evals/infinite_bench_longbook_choice_eng
  uv run inspect eval inspect_evals/infinite_bench_longdialogue_qa_eng
  uv run inspect eval inspect_evals/infinite_bench_math_calc
  uv run inspect eval inspect_evals/infinite_bench_math_find
  uv run inspect eval inspect_evals/infinite_bench_number_string
  uv run inspect eval inspect_evals/infinite_bench_passkey
  ```

## Knowledge

- ### [AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models](src/inspect_evals/agieval)

  AGIEval is a human-centric benchmark specifically designed to evaluate the general abilities of foundation models in tasks pertinent to human cognition and problem-solving.
  <sub><sup>Contributed by: [@bouromain](https://github.com/bouromain)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/agie_aqua_rat
  uv run inspect eval inspect_evals/agie_logiqa_en
  uv run inspect eval inspect_evals/agie_lsat_ar
  uv run inspect eval inspect_evals/agie_lsat_lr
  uv run inspect eval inspect_evals/agie_lsat_rc
  uv run inspect eval inspect_evals/agie_math
  uv run inspect eval inspect_evals/agie_sat_en
  uv run inspect eval inspect_evals/agie_sat_en_without_passage
  uv run inspect eval inspect_evals/agie_sat_math
  ```

- ### [AIR Bench: AI Risk Benchmark](src/inspect_evals/air_bench)

  A safety benchmark evaluating language models against risk categories derived from government regulations and company policies.
  <sub><sup>Contributed by: [@l1990790120](https://github.com/l1990790120)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/air_bench
  ```

- ### [ChemBench: Are large language models superhuman chemists?](src/inspect_evals/chembench)

  ChemBench is designed to reveal limitations of current frontier models for use in the chemical sciences. It consists of 2786 question-answer pairs compiled from diverse sources. Our corpus measures reasoning, knowledge and intuition across a large fraction of the topics taught in undergraduate and graduate chemistry curricula. It can be used to evaluate any system that can return text (i.e., including tool-augmented systems).
  <sub><sup>Contributed by: [@Esther-Guo](https://github.com/Esther-Guo)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/chembench
  ```

- ### [CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge](src/inspect_evals/commonsense_qa)

  Evaluates an AI model's ability to correctly answer everyday questions that rely on basic commonsense knowledge and understanding of the world.
  <sub><sup>Contributed by: [@lauritowal](https://github.com/lauritowal)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/commonsense_qa
  ```

- ### [FrontierScience: Expert-Level Scientific Reasoning](src/inspect_evals/frontierscience)

  Evaluates AI capabilities for expert-level scientific reasoning across physics, chemistry, and biology. Contains 160 problems with two evaluation formats: Olympiad (100 samples with reference answers) and Research (60 samples with rubrics).
  <sub><sup>Contributed by: [@tommyly201](https://github.com/tommyly201), [@mnarayan](https://github.com/mnarayan)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/frontierscience
  ```

- ### [GPQA: Graduate-Level STEM Knowledge Challenge](src/inspect_evals/gpqa)

  Contains challenging multiple-choice questions created by domain experts in biology, physics, and chemistry, designed to test advanced scientific understanding beyond basic internet searches. Experts at PhD level in the corresponding domains reach 65% accuracy.
  <sub><sup>Contributed by: [@jjallaire](https://github.com/jjallaire)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/gpqa_diamond
  ```

- ### [HealthBench: Evaluating Large Language Models Towards Improved Human Health](src/inspect_evals/healthbench)

  A comprehensive evaluation benchmark designed to assess language models' medical capabilities across a wide range of healthcare scenarios.
  <sub><sup>Contributed by: [@retroam](https://github.com/retroam)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/healthbench
  uv run inspect eval inspect_evals/healthbench_hard
  uv run inspect eval inspect_evals/healthbench_consensus
  uv run inspect eval inspect_evals/healthbench_meta_eval
  ```

- ### [Humanity's Last Exam](src/inspect_evals/hle)

  Humanity's Last Exam (HLE) is a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. Humanity's Last Exam consists of 3,000 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading.
  <sub><sup>Contributed by: [@SasankYadati](https://github.com/SasankYadati)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/hle
  ```

- ### [LiveBench: A Challenging, Contamination-Free LLM Benchmark](src/inspect_evals/livebench)

  LiveBench is a benchmark designed with test set contamination and objective evaluation in mind by releasing new questions regularly, as well as having questions based on recently-released datasets. Each question has verifiable, objective ground-truth answers, allowing hard questions to be scored accurately and automatically, without the use of an LLM judge.
  <sub><sup>Contributed by: [@anaoaktree](https://github.com/anaoaktree)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/livebench
  ```

- ### [MMLU-Pro: Advanced Multitask Knowledge and Reasoning Evaluation](src/inspect_evals/mmlu_pro)

  An advanced benchmark that tests both broad knowledge and reasoning capabilities across many subjects, featuring challenging questions and multiple-choice answers with increased difficulty and complexity.
  <sub><sup>Contributed by: [@xeon27](https://github.com/xeon27)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mmlu_pro
  ```

- ### [MMLU: Measuring Massive Multitask Language Understanding](src/inspect_evals/mmlu)

  Evaluate models on 57 tasks including elementary mathematics, US history, computer science, law, and more.
  <sub><sup>Contributed by: [@jjallaire](https://github.com/jjallaire), [@domdomegg](https://github.com/domdomegg)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mmlu_0_shot
  uv run inspect eval inspect_evals/mmlu_5_shot
  ```

- ### [MedQA: Medical exam Q&A benchmark](src/inspect_evals/medqa)

  A Q&A benchmark with questions collected from professional medical board exams. Only includes the English subset of the dataset (which also contains Mandarin Chinese and Taiwanese questions).
  <sub><sup>Contributed by: [@bunny-baxter](https://github.com/bunny-baxter), [@JasonBenn](https://github.com/JasonBenn)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/medqa
  ```

- ### [O-NET: A high-school student knowledge test](src/inspect_evals/onet)

  Questions and answers from the Ordinary National Educational Test (O-NET), administered annually by the National Institute of Educational Testing Service to Matthayom 6 (Grade 12 / ISCED 3) students in Thailand. The exam contains five subjects: English language, math, science, social knowledge, and Thai language. There are questions with multiple-choice and true/false answers. Questions can be in either English or Thai.
  <sub><sup>Contributed by: [@bact](https://github.com/bact)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/onet_m6
  ```

- ### [Pre-Flight: Aviation Operations Knowledge Evaluation](src/inspect_evals/pre_flight)

  Tests model understanding of aviation regulations including ICAO annexes, flight dispatch rules, pilot procedures, and airport ground operations safety protocols.
  <sub><sup>Contributed by: [@alexbrooker](https://github.com/alexbrooker)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/pre_flight
  ```

- ### [PubMedQA: A Dataset for Biomedical Research Question Answering](src/inspect_evals/pubmedqa)

  Biomedical question answering (QA) dataset collected from PubMed abstracts.
  <sub><sup>Contributed by: [@MattFisher](https://github.com/MattFisher)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/pubmedqa
  ```

- ### [SOS BENCH: Benchmarking Safety Alignment on Scientific Knowledge](src/inspect_evals/sosbench)

  A regulation-grounded, hazard-focused benchmark encompassing six high-risk scientific domains: chemistry, biology, medicine, pharmacology, physics, and psychology. The benchmark comprises 3,000 prompts derived from real-world regulations and laws, systematically expanded via an LLM-assisted evolutionary pipeline that introduces diverse, realistic misuse scenarios (e.g., detailed explosive synthesis instructions involving advanced chemical formulas).
  <sub><sup>Contributed by: [@Esther-Guo](https://github.com/Esther-Guo)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/sosbench
  ```

- ### [SciKnowEval: Evaluating Multi-level Scientific Knowledge of Large Language Models](src/inspect_evals/sciknoweval)

  The Scientific Knowledge Evaluation benchmark is inspired by the profound principles outlined in the “Doctrine of the Mean” from ancient Chinese philosophy. This benchmark is designed to assess LLMs based on their proficiency in Studying Extensively, Enquiring Earnestly, Thinking Profoundly, Discerning Clearly, and Practicing Assiduously. Each of these dimensions offers a unique perspective on evaluating the capabilities of LLMs in handling scientific knowledge.
  <sub><sup>Contributed by: [@Esther-Guo](https://github.com/Esther-Guo)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/sciknoweval
  ```

- ### [SimpleQA/SimpleQA Verified: Measuring short-form factuality in large language models](src/inspect_evals/simpleqa)

  A benchmark that evaluates the ability of language models to answer short, fact-seeking questions.
  <sub><sup>Contributed by: [@osc245](https://github.com/osc245), [@jeqcho](https://github.com/jeqcho)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/simpleqa
  uv run inspect eval inspect_evals/simpleqa_verified
  ```

- ### [TruthfulQA: Measuring How Models Mimic Human Falsehoods](src/inspect_evals/truthfulqa)

  Measure whether a language model is truthful in generating answers to questions using questions that some humans would answer falsely due to a false belief or misconception.
  <sub><sup>Contributed by: [@seddy-aisi](https://github.com/seddy-aisi)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/truthfulqa
  ```

- ### [Uganda Cultural and Cognitive Benchmark (UCCB)](src/inspect_evals/uccb)

  The first comprehensive question-answering dataset designed to evaluate cultural understanding
  and reasoning abilities of Large Language Models concerning Uganda's multifaceted environment
  across 24 cultural domains including education, traditional medicine, media, economy, literature,
  and social norms.
  <sub><sup>Contributed by: [@katostevenmubiru](https://github.com/katostevenmubiru)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/uccb
  ```

- ### [XSTest: A benchmark for identifying exaggerated safety behaviours in LLMs](src/inspect_evals/xstest)

  Dataset with 250 safe prompts across ten prompt types that well-calibrated models should not refuse, and 200 unsafe prompts as contrasts that models, for most applications, should refuse.
  <sub><sup>Contributed by: [@NelsonG-C](https://github.com/NelsonG-C)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/xstest
  ```

## Scheming

- ### [Agentic Misalignment: How LLMs could be insider threats](src/inspect_evals/agentic_misalignment)

  Eliciting unethical behaviour (most famously blackmail) in response to a fictional company-assistant scenario where the model is faced with replacement.
  <sub><sup>Contributed by: [@bmillwood-aisi](https://github.com/bmillwood-aisi)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/agentic_misalignment
  ```

- ### [GDM Dangerous Capabilities: Self-proliferation](src/inspect_evals/gdm_self_proliferation)

  Ten real-world–inspired tasks from Google DeepMind's Dangerous Capabilities Evaluations assessing self-proliferation behaviors (e.g., email setup, model installation, web agent setup, wallet operations). Supports end-to-end, milestones, and expert best-of-N modes.
  <sub><sup>Contributed by: [@XkunW](https://github.com/XkunW), [@MariaIzobava](https://github.com/MariaIzobava), [@kohankhaki](https://github.com/kohankhaki)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/gdm_sp01_e2e
  uv run inspect eval inspect_evals/gdm_sp02_e2e
  uv run inspect eval inspect_evals/gdm_sp03_e2e
  uv run inspect eval inspect_evals/gdm_sp04_e2e
  uv run inspect eval inspect_evals/gdm_sp05_e2e
  uv run inspect eval inspect_evals/gdm_sp07_e2e
  uv run inspect eval inspect_evals/gdm_sp08_e2e
  uv run inspect eval inspect_evals/gdm_sp09_e2e
  uv run inspect eval inspect_evals/gdm_sp10_e2e
  uv run inspect eval inspect_evals/gdm_sp12_e2e
  ```

- ### [GDM Dangerous Capabilities: Self-reasoning](src/inspect_evals/gdm_self_reasoning)

  Test AI's ability to reason about its environment.
  <sub><sup>Contributed by: [@ZiyueWang25](https://github.com/ZiyueWang25), [@XkunW](https://github.com/XkunW)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/gdm_self_reasoning_approved_directories
  uv run inspect eval inspect_evals/gdm_self_reasoning_calculator_improvement
  uv run inspect eval inspect_evals/gdm_self_reasoning_context_length_mod_instrumental_only
  uv run inspect eval inspect_evals/gdm_self_reasoning_context_length_mod_irreversibility_only
  uv run inspect eval inspect_evals/gdm_self_reasoning_database_tool
  uv run inspect eval inspect_evals/gdm_self_reasoning_latency_calculator
  uv run inspect eval inspect_evals/gdm_self_reasoning_max_messages_calculator
  uv run inspect eval inspect_evals/gdm_self_reasoning_max_tokens
  uv run inspect eval inspect_evals/gdm_self_reasoning_oversight_frequency
  uv run inspect eval inspect_evals/gdm_self_reasoning_read_logs
  uv run inspect eval inspect_evals/gdm_self_reasoning_turn_off_filters
  ```

- ### [GDM Dangerous Capabilities: Stealth](src/inspect_evals/gdm_stealth)

  Test AI's ability to reason about and circumvent oversight.
  <sub><sup>Contributed by: [@ZiyueWang25](https://github.com/ZiyueWang25)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/gdm_classifier_evasion
  uv run inspect eval inspect_evals/gdm_cover_your_tracks
  uv run inspect eval inspect_evals/gdm_oversight_pattern
  uv run inspect eval inspect_evals/gdm_strategic_rule_breaking
  ```

- ### [InstrumentalEval - Evaluating the Paperclip Maximizer: Are RL-Based Language Models More Likely to Pursue Instrumental Goals?](src/inspect_evals/instrumentaleval)

  An evaluation designed to detect instrumental convergence behaviors in model responses (e.g., self-preservation, resource acquisition, power-seeking, strategic deception) using a rubric-driven LLM grader. The benchmark tests whether AI systems exhibit behaviors that are instrumentally useful across a wide range of goals, which could indicate concerning patterns of strategic reasoning.
  <sub><sup>Contributed by: [@horvgbor](https://github.com/horvgbor)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/instrumentaleval
  ```

- ### [SAD: Situational Awareness Dataset](src/inspect_evals/sad)

  Evaluates situational awareness in LLMs—knowledge of themselves and their circumstances—through behavioral tests including recognizing generated text, predicting behavior, and following self-aware instructions. The current implementation includes SAD-mini with 5 of the 16 tasks.
  <sub><sup>Contributed by: [@HugoSave](https://github.com/HugoSave)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/sad_stages_full
  uv run inspect eval inspect_evals/sad_stages_oversight
  uv run inspect eval inspect_evals/sad_influence
  uv run inspect eval inspect_evals/sad_facts_llms
  uv run inspect eval inspect_evals/sad_facts_human_defaults
  ```

## Bias

- ### [BBQ: Bias Benchmark for Question Answering](src/inspect_evals/bbq)

  A dataset for evaluating bias in question answering models across multiple social dimensions.
  <sub><sup>Contributed by: [@harshraj172](https://github.com/harshraj172), [@shubhobm](https://github.com/shubhobm)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/bbq
  ```

- ### [BOLD: Bias in Open-ended Language Generation Dataset](src/inspect_evals/bold)

  A dataset to measure fairness in open-ended text generation, covering five domains: profession, gender, race, religious ideologies, and political ideologies.
  <sub><sup>Contributed by: [@harshraj172](https://github.com/harshraj172), [@shubhobm](https://github.com/shubhobm)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/bold
  ```

## Multimodal

- ### [DocVQA: A Dataset for VQA on Document Images](src/inspect_evals/docvqa)

  DocVQA is a Visual Question Answering benchmark that consists of 50,000 questions covering 12,000+ document images. This implementation solves and scores the "validation" split.
  <sub><sup>Contributed by: [@evanmiller-anthropic](https://github.com/evanmiller-anthropic)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/docvqa
  ```

- ### [MMIU: Multimodal Multi-image Understanding for Evaluating Large Vision-Language Models](src/inspect_evals/mmiu)

  A comprehensive dataset designed to evaluate Large Vision-Language Models (LVLMs) across a wide range of multi-image tasks. The dataset encompasses 7 types of multi-image relationships, 52 tasks, 77K images, and 11K meticulously curated multiple-choice questions.
  <sub><sup>Contributed by: [@Esther-Guo](https://github.com/Esther-Guo)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/mmiu
  ```

- ### [V*Bench: A Visual QA Benchmark with Detailed High-resolution Images](src/inspect_evals/vstar_bench)

  V*Bench is a visual question & answer benchmark that evaluates MLLMs in their ability to process high-resolution and visually crowded images to find and focus on small details.
  <sub><sup>Contributed by: [@bienehito](https://github.com/bienehito)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/vstar_bench_attribute_recognition
  uv run inspect eval inspect_evals/vstar_bench_spatial_relationship_reasoning
  ```

- ### [ZeroBench](src/inspect_evals/zerobench)

  A visual reasoning benchmark that is (1) challenging, (2) lightweight, (3) diverse, and (4) high-quality.
  <sub><sup>Contributed by: [@ItsTania](https://github.com/ItsTania)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/zerobench
  uv run inspect eval inspect_evals/zerobench_subquestions
  ```

## Personality

- ### [Personality](src/inspect_evals/personality)

  An evaluation suite consisting of multiple personality tests that can be applied to LLMs.
  Its primary goals are twofold:
    1. Assess a model's default personality: the persona it naturally exhibits without specific prompting.
    2. Evaluate whether a model can embody a specified persona: how effectively it adopts certain personality traits when prompted or guided.
  <sub><sup>Contributed by: [@guiem](https://github.com/guiem)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/personality_BFI
  uv run inspect eval inspect_evals/personality_TRAIT
  uv run inspect eval inspect_evals/personality_PRIME
  ```

## Writing

- ### [WritingBench: A Comprehensive Benchmark for Generative Writing](src/inspect_evals/writingbench)

  A comprehensive evaluation benchmark designed to assess large language models' capabilities across diverse writing tasks. The benchmark evaluates models on various writing domains including academic papers, business documents, creative writing, and technical documentation, with multi-dimensional scoring based on domain-specific criteria.
  <sub><sup>Contributed by: [@jtv199](https://github.com/jtv199)</sub></sup>

  ```bash
  uv run inspect eval inspect_evals/writingbench
  ```

<!-- /Eval Listing: Automatically Generated -->

<!-- markdownlint-configure-file { "no-inline-html": { "allowed_elements": ["sub", "sup"] } } -->
