@article{LEE:2024:0736-2935:5216,
  title = "Development of automatic audio panning system for immersive sound through stage size-aware model and object-tracking",
  journal = "INTER-NOISE and NOISE-CON Congress and Conference Proceedings",
  parent_itemid = "infobike://ince/incecp",
  publishercode = "ince",
  year = "2024",
  volume = "270",
  number = "6",
  publicationdate = "2024-10-04T00:00:00",
  pages = "5216-5222",
  itemtype = "ARTICLE",
  issn = "0736-2935",
  url = "https://ince.publisher.ingentaconnect.com/content/ince/incecp/2024/00000270/00000006/art00025",
  doi = "10.3397/IN_2024_3562",
  author = "LEE, Kangeun and KIM, Sungyoung",
  abstract = "We investigated a novel AI-assisted, automated 'immersive' audio panning system designed to track audio-related objects within a video clip. The system comprises four sequential steps: Object-Tracking, Stage Dimension Estimation, XY-Coordinate Calculation, and Object Audio Rendering. It overcomes challenges arising from the rapid and frequent movement of target objects by employing a pre-trained object-tracking model and integrating depth information to ensure stability in subsequent tasks. Additionally, we introduce a stage size-aware model that estimates stage dimensions, trained on our manually collected dataset formatted as (Image, Width, Depth). The system then calculates XY-Coordinate pairs that serve as panning values for conventional audio mixers or decoders, enabling immersive audio reproduction. We anticipate that this video- and space-aware automatic panning system will be valuable for the rapid production of new media."
}
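
The abstract above describes a pipeline whose final step turns per-object XY coordinates into panning values for a mixer or decoder. The Python sketch below is only an illustrative example of that last step, assuming a standard constant-power stereo pan law; the coordinate convention, metre-based stage units, function names, and the stereo (rather than multichannel immersive) target are assumptions for illustration, not the authors' implementation.

import math

def xy_to_stereo_gains(x_norm: float) -> tuple[float, float]:
    # Constant-power stereo pan from a normalized X coordinate.
    # Assumed convention: 0.0 = far left, 1.0 = far right.
    theta = x_norm * math.pi / 2.0           # map [0, 1] -> [0, pi/2]
    return math.cos(theta), math.sin(theta)  # equal-power pan law

def track_to_pan(track_xy, stage_width_m, stage_depth_m):
    # Convert per-frame object positions (hypothetically in metres, stage
    # coordinates) into per-frame pan values, assuming the stage size-aware
    # model has already provided stage_width_m and stage_depth_m.
    pans = []
    for x_m, y_m in track_xy:
        x_norm = min(max(x_m / stage_width_m, 0.0), 1.0)
        y_norm = min(max(y_m / stage_depth_m, 0.0), 1.0)
        left, right = xy_to_stereo_gains(x_norm)
        pans.append({"x": x_norm, "y": y_norm, "L": left, "R": right})
    return pans

if __name__ == "__main__":
    # Example: an object crossing a 10 m wide, 6 m deep stage from left to right.
    trajectory = [(1.0, 3.0), (5.0, 3.0), (9.0, 3.0)]
    for frame in track_to_pan(trajectory, stage_width_m=10.0, stage_depth_m=6.0):
        print(frame)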