2023
Martínez-Sevilla, J. C.; Ríos-Vila, A.; Castellanos, F. J.; Calvo-Zaragoza, J.
A Holistic Approach for Aligned Music and Lyrics Transcription Conference
Document Analysis and Recognition - ICDAR 2023, vol. 1, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-41676-7.
Abstract | Links | BibTeX | Tags: REPERTORIUM
@inproceedings{MartinezSevilla:ICDAR:2023,
  title     = {A Holistic Approach for Aligned Music and Lyrics Transcription},
  author    = {Martínez-Sevilla, J. C. and Ríos-Vila, A. and Castellanos, F. J. and Calvo-Zaragoza, J.},
  editor    = {Fink, Gernot A. and Jain, Rajiv and Kise, Koichi and Zanibbi, Richard},
  doi       = {10.1007/978-3-031-41676-7_11},
  isbn      = {978-3-031-41676-7},
  year      = {2023},
  date      = {2023-08-28},
  urldate   = {2023-08-28},
  booktitle = {Document Analysis and Recognition - ICDAR 2023},
  volume    = {1},
  pages     = {185--201},
  publisher = {Springer Nature Switzerland},
  address   = {Cham},
  abstract  = {In this paper, we present the Aligned Music Notation and Lyrics Transcription (AMNLT) challenge, whose goal is to retrieve the content from document images of vocal music. This new research area arises from the need to automatically transcribe notes and lyrics from music scores and align both sources of information conveniently. Although existing methods are able to deal with music notation and text, they work without providing their proper alignment, which is crucial to actually retrieve the content of the piece of vocal music. To overcome this challenge, we consider holistic neural approaches that transcribe music and text in one step, along with an encoding that implicitly aligns the sources of information. The methodology is evaluated on a benchmark specifically designed for AMNLT. The results report that existing methods can obtain high-quality text and music transcriptions, but posterior alignment errors are inevitably found. However, our formulation achieves relative improvements of over 80\% in the metric that considers both transcription and alignment. We hope that this work will establish itself as a future reference for further research on AMNLT.},
  keywords  = {REPERTORIUM},
  pubstate  = {published},
  tppubtype = {conference}
}
In this paper, we present the Aligned Music Notation and Lyrics Transcription (AMNLT) challenge, whose goal is to retrieve the content from document images of vocal music. This new research area arises from the need to automatically transcribe notes and lyrics from music scores and align both sources of information conveniently. Although existing methods are able to deal with music notation and text, they work without providing their proper alignment, which is crucial to actually retrieve the content of the piece of vocal music. To overcome this challenge, we consider holistic neural approaches that transcribe music and text in one step, along with an encoding that implicitly aligns the sources of information. The methodology is evaluated on a benchmark specifically designed for AMNLT. The results report that existing methods can obtain high-quality text and music transcriptions, but posterior alignment errors are inevitably found. However, our formulation achieves relative improvements of over 80% in the metric that considers both transcription and alignment. We hope that this work will establish itself as a future reference for further research on AMNLT.
2023
Martínez-Sevilla, J. C.; Ríos-Vila, A.; Castellanos, F. J.; Calvo-Zaragoza, J.
A Holistic Approach for Aligned Music and Lyrics Transcription Conference
Document Analysis and Recognition - ICDAR 2023, vol. 1, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-41676-7.
Abstract | Links | BibTeX | Tags: REPERTORIUM
Duplicate of the entry MartinezSevilla:ICDAR:2023 defined earlier in this file
(identical citation key and fields); a repeated entry triggers BibTeX/Biber
warnings and ambiguous key resolution. Disabled by removing the leading "@" so
this block is ignored as inter-entry text; content kept for traceability.
conference{MartinezSevilla:ICDAR:2023,
title = {A Holistic Approach for Aligned Music and Lyrics Transcription},
author = {J.C. Martínez-Sevilla and A. Ríos-Vila and F. J. Castellanos and J. Calvo-Zaragoza },
editor = {Fink, Gernot A. and Jain, Rajiv and Kise, Koichi and Zanibbi, Richard},
doi = {https://doi.org/10.1007/978-3-031-41676-7_11},
isbn = {978-3-031-41676-7},
year = {2023},
date = {2023-08-28},
urldate = {2023-08-28},
booktitle = {Document Analysis and Recognition - ICDAR 2023},
volume = {1},
pages = {185--201},
publisher = {Springer Nature Switzerland},
address = {Cham},
keywords = {REPERTORIUM},
pubstate = {published},
tppubtype = {conference}
}
In this paper, we present the Aligned Music Notation and Lyrics Transcription (AMNLT) challenge, whose goal is to retrieve the content from document images of vocal music. This new research area arises from the need to automatically transcribe notes and lyrics from music scores and align both sources of information conveniently. Although existing methods are able to deal with music notation and text, they work without providing their proper alignment, which is crucial to actually retrieve the content of the piece of vocal music. To overcome this challenge, we consider holistic neural approaches that transcribe music and text in one step, along with an encoding that implicitly aligns the sources of information. The methodology is evaluated on a benchmark specifically designed for AMNLT. The results report that existing methods can obtain high-quality text and music transcriptions, but posterior alignment errors are inevitably found. However, our formulation achieves relative improvements of over 80% in the metric that considers both transcription and alignment. We hope that this work will establish itself as a future reference for further research on AMNLT.