@article{M35C51D04,
  author   = {Yoon, Sehwi and Shin, Youhyun},
  title    = {A Study on Automatic Metrics for {Korean} Text Abstractive Summarization},
  journal  = {The Transactions of the Korea Information Processing Society},
  year     = {2024},
  doi      = {10.3745/TKIPS.2024.13.12.691},
  keywords = {Natural Language Processing, Abstractive Summarization, Automatic Metric, Transformer, Large Language Model},
  abstract = {This study aims to analyze and validate automatic evaluation metrics for Korean abstractive summarization. The unique linguistic characteristics of each language require evaluation metrics designed for them, underscoring the importance of research focused on Korean. Research on summarization and its meta-evaluation is extremely limited, especially for Korean. Therefore, by validating reliable automatic evaluation metrics using Korean summarization data, this study contributes to future research on Korean models in the fields of natural language generation. Human evaluation, widely regarded as the most reliable metric, is time-consuming and costly. Thus, research into automatic evaluation metrics holds significant importance for efficiency. In this study, summaries from three models—T5, KoBART, and GPT-3.5 Turbo—were evaluated based on their fluency, consistency, and relevance using 10 Korean documents and their corresponding reference summaries. Correlation coefficients were calculated between human evaluations and automatic metrics for fluency, consistency, and relevance. The results showed that for T5 summaries, the correlation coefficients for consistency and relevance were 0.33 and 0.26, respectively, while for KoBART summaries, the coefficients for fluency and relevance were 0.33 and 0.40, respectively. BERTScore demonstrated the highest correlation, indicating its effectiveness for Korean summaries. Meanwhile, GPT-3.5 Turbo summaries showed significant correlations of 0.23 and 0.17 in consistency and relevance using HaRiM+, a metric developed to detect hallucinations in recent work. Additionally, the correlation analysis by document type revealed that T5 summaries showed high correlations with the BLEU metric for briefing and meeting minutes, KoBART summaries and GPT-3.5 Turbo summaries both demonstrated high correlations with BERTScore for narrative and editorial documents, respectively. These findings emphasize the importance of selecting evaluation metrics tailored to specific document types. Therefore, this study provides a basis for selecting appropriate evaluation metrics tailored to the objectives of specific tasks in future Korean summarization research.},
}