% Chen and Zhuge, EMNLP 2018 conference paper on text-image summarization.
% Review notes:
%   - year/month were 2020/jan (with a non-standard day of 1), which
%     contradicts the 2018 proceedings title and the conference dates given
%     in the note field (31-10-2018 through 04-11-2018). Set to 2018/oct,
%     the conference start month -- TODO confirm against the publisher record.
%   - The series field only repeated booktitle verbatim and was dropped.
%   - Acronyms are brace-protected so sentence-casing styles keep them.
@inproceedings{3d4cf4e815324123a2771f98bb2d57ee,
  author    = {Chen, Jingqiang and Zhuge, Hai},
  title     = {Abstractive Text-Image Summarization Using Multi-Modal Attentional Hierarchical {RNN}},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, {EMNLP} 2018},
  editor    = {Riloff, Ellen and Chiang, David and Hockenmaier, Julia and Tsujii, Jun'ichi},
  year      = {2018},
  month     = oct,
  pages     = {4046--4056},
  publisher = {Association for Computational Linguistics},
  language  = {English},
  note      = {2018 Conference on Empirical Methods in Natural Language Processing, EMNLP 2018 ; Conference date: 31-10-2018 Through 04-11-2018},
  abstract  = {Rapid growth of multi-modal documents on the Internet makes multi-modal summarization research necessary. Most previous research summarizes texts or images separately. Recent neural summarization research shows the strength of the Encoder-Decoder model in text summarization. This paper proposes an abstractive text-image summarization model using the attentional hierarchical Encoder-Decoder model to summarize a text document and its accompanying images simultaneously, and then to align the sentences and images in summaries. A multi-modal attentional mechanism is proposed to attend original sentences, images, and captions when decoding. The DailyMail dataset is extended by collecting images and captions from the Web. Experiments show our model outperforms the neural abstractive and extractive text summarization methods that do not consider images. In addition, our model can generate informative summaries of images.},
}