@article{CHEN:2024:0736-2935:7237,
  title = "Realization of global audio telepresence via a learning-based model-matching approach with an acoustic array system",
  journal = "INTER-NOISE and NOISE-CON Congress and Conference Proceedings",
  parent_itemid = "infobike://ince/incecp",
  publishercode = "ince",
  year = "2024",
  volume = "270",
  number = "4",
  publication date = "2024-10-04T00:00:00",
  pages = "7237-7246",
  itemtype = "ARTICLE",
  issn = "0736-2935",
  url = "https://ince.publisher.ingentaconnect.com/content/ince/incecp/2024/00000270/00000004/art00028",
  doi = "10.3397/IN_2024_3935",
  author = "Chen, You-Siang and Chen, Sing-Yu and Bai, Mingsian",
  abstract = "A Global Audio Telepresence (GOAT) system requires a microphone array to capture the spatial audio signals at the far end and a loudspeaker array to reconstruct the sound field at the near end, seamlessly immersing near-end users in remote audio scenes with full ambience. In this paper, we present a learning-based GOAT system (L-GOAT) built on the model-matching principle, in which a deep neural network (DNN) serves as a set of nonlinear filters for the GOAT system. The network is trained to minimize the matching error between the signals reproduced by the DNN and the desired signals filtered by the far-end acoustic transfer functions (ATFs). Extensive simulations were carried out for multi-source scenarios in two rooms with different reverberation times. To implement the L-GOAT system, a five-microphone linear array was adopted in the far-end room, while a six-loudspeaker array was used in the near-end room. Objective evaluation metrics, including the Perceptual Evaluation of Speech Quality (PESQ), the Short-Time Objective Intelligibility (STOI), and the matching errors, were employed to validate the efficacy of the GOAT systems. The proposed learning-based approach demonstrated superior performance compared with a conventional digital signal processing (DSP)-based method.",
}