@inproceedings{van-miltenburg-etal-2020-gradations,
title = "Gradations of Error Severity in Automatic Image Descriptions",
author = "van Miltenburg, Emiel and
Lu, Wei-Ting and
Krahmer, Emiel and
Gatt, Albert and
Chen, Guanyi and
Li, Lin and
van Deemter, Kees",
editor = "Davis, Brian and
Graham, Yvette and
Kelleher, John and
Sripada, Yaji",
booktitle = "Proceedings of the 13th International Conference on Natural Language Generation",
month = dec,
year = "2020",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.inlg-1.45/",
doi = "10.18653/v1/2020.inlg-1.45",
pages = "398--411",
abstract = "Earlier research has shown that evaluation metrics based on textual similarity (e.g., BLEU, CIDEr, Meteor) do not correlate well with human evaluation scores for automatically generated text. We carried out an experiment with Chinese speakers, where we systematically manipulated image descriptions to contain different kinds of errors. Because our manipulated descriptions form minimal pairs with the reference descriptions, we are able to assess the impact of different kinds of errors on the perceived quality of the descriptions. Our results show that different kinds of errors elicit significantly different evaluation scores, even though all erroneous descriptions differ in only one character from the reference descriptions. Evaluation metrics based solely on textual similarity are unable to capture these differences, which (at least partially) explains their poor correlation with human judgments. Our work provides the foundations for future work, where we aim to understand why different errors are seen as more or less severe."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="van-miltenburg-etal-2020-gradations">
<titleInfo>
<title>Gradations of Error Severity in Automatic Image Descriptions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Emiel</namePart>
<namePart type="family">van Miltenburg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei-Ting</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emiel</namePart>
<namePart type="family">Krahmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Albert</namePart>
<namePart type="family">Gatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guanyi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kees</namePart>
<namePart type="family">van Deemter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 13th International Conference on Natural Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Davis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Kelleher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaji</namePart>
<namePart type="family">Sripada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Earlier research has shown that evaluation metrics based on textual similarity (e.g., BLEU, CIDEr, Meteor) do not correlate well with human evaluation scores for automatically generated text. We carried out an experiment with Chinese speakers, where we systematically manipulated image descriptions to contain different kinds of errors. Because our manipulated descriptions form minimal pairs with the reference descriptions, we are able to assess the impact of different kinds of errors on the perceived quality of the descriptions. Our results show that different kinds of errors elicit significantly different evaluation scores, even though all erroneous descriptions differ in only one character from the reference descriptions. Evaluation metrics based solely on textual similarity are unable to capture these differences, which (at least partially) explains their poor correlation with human judgments. Our work provides the foundations for future work, where we aim to understand why different errors are seen as more or less severe.</abstract>
<identifier type="citekey">van-miltenburg-etal-2020-gradations</identifier>
<identifier type="doi">10.18653/v1/2020.inlg-1.45</identifier>
<location>
<url>https://aclanthology.org/2020.inlg-1.45/</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>398</start>
<end>411</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Gradations of Error Severity in Automatic Image Descriptions
%A van Miltenburg, Emiel
%A Lu, Wei-Ting
%A Krahmer, Emiel
%A Gatt, Albert
%A Chen, Guanyi
%A Li, Lin
%A van Deemter, Kees
%Y Davis, Brian
%Y Graham, Yvette
%Y Kelleher, John
%Y Sripada, Yaji
%S Proceedings of the 13th International Conference on Natural Language Generation
%D 2020
%8 December
%I Association for Computational Linguistics
%C Dublin, Ireland
%F van-miltenburg-etal-2020-gradations
%X Earlier research has shown that evaluation metrics based on textual similarity (e.g., BLEU, CIDEr, Meteor) do not correlate well with human evaluation scores for automatically generated text. We carried out an experiment with Chinese speakers, where we systematically manipulated image descriptions to contain different kinds of errors. Because our manipulated descriptions form minimal pairs with the reference descriptions, we are able to assess the impact of different kinds of errors on the perceived quality of the descriptions. Our results show that different kinds of errors elicit significantly different evaluation scores, even though all erroneous descriptions differ in only one character from the reference descriptions. Evaluation metrics based solely on textual similarity are unable to capture these differences, which (at least partially) explains their poor correlation with human judgments. Our work provides the foundations for future work, where we aim to understand why different errors are seen as more or less severe.
%R 10.18653/v1/2020.inlg-1.45
%U https://aclanthology.org/2020.inlg-1.45/
%U https://doi.org/10.18653/v1/2020.inlg-1.45
%P 398-411
Markdown (Informal)
[Gradations of Error Severity in Automatic Image Descriptions](https://aclanthology.org/2020.inlg-1.45/) (van Miltenburg et al., INLG 2020)
ACL
- Emiel van Miltenburg, Wei-Ting Lu, Emiel Krahmer, Albert Gatt, Guanyi Chen, Lin Li, and Kees van Deemter. 2020. Gradations of Error Severity in Automatic Image Descriptions. In Proceedings of the 13th International Conference on Natural Language Generation, pages 398–411, Dublin, Ireland. Association for Computational Linguistics.