{"id":"https://openalex.org/W3196347085","doi":"https://doi.org/10.1109/tpami.2022.3155643","title":"Binaural SoundNet: Predicting Semantics, Depth and Motion With Binaural Sounds","display_name":"Binaural SoundNet: Predicting Semantics, Depth and Motion With Binaural Sounds","publication_year":2022,"publication_date":"2022-03-03","ids":{"openalex":"https://openalex.org/W3196347085","doi":"https://doi.org/10.1109/tpami.2022.3155643","mag":"3196347085","pmid":"https://pubmed.ncbi.nlm.nih.gov/35239475"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2022.3155643","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2022.3155643","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5078838951","display_name":"Dengxin Dai","orcid":"https://orcid.org/0000-0001-5440-9678"},"institutions":[{"id":"https://openalex.org/I4210109712","display_name":"Max Planck Institute for Informatics","ror":"https://ror.org/01w19ak89","country_code":"DE","type":"facility","lineage":["https://openalex.org/I149899117","https://openalex.org/I4210109712"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Dengxin Dai","raw_affiliation_strings":["Vision for Autonomous Systems Group, MPI for Informatics, Saarbr&#x00FC;cken, Germany"],"raw_orcid":"https://orcid.org/0000-0001-5440-9678","affiliations":[{"raw_affiliation_string":"Vision for Autonomous Systems Group, MPI for Informatics, Saarbr&#x00FC;cken, Germany","institution_ids":["https://openalex.org/I4210109712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027109026","display_name":"Arun Balajee Vasudevan","orcid":"https://orcid.org/0000-0002-5409-0780"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Arun Balajee Vasudevan","raw_affiliation_strings":["Computer Vision Lab, ETH Z&#x00FC;rich, Zrich, Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Computer Vision Lab, ETH Z&#x00FC;rich, Zrich, Switzerland","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007656938","display_name":"Ji\u0159\u0131\u0301 Matas","orcid":"https://orcid.org/0000-0003-0863-4844"},"institutions":[{"id":"https://openalex.org/I44504214","display_name":"Czech Technical University in Prague","ror":"https://ror.org/03kqpb082","country_code":"CZ","type":"education","lineage":["https://openalex.org/I44504214"]}],"countries":["CZ"],"is_corresponding":false,"raw_author_name":"Jiri Matas","raw_affiliation_strings":["Center for Machine Perception, Czech Technical University, Prague, Czechia"],"raw_orcid":"https://orcid.org/0000-0003-0863-4844","affiliations":[{"raw_affiliation_string":"Center for Machine Perception, Czech Technical University, Prague, Czechia","institution_ids":["https://openalex.org/I44504214"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001254143","display_name":"Luc Van Gool","orcid":"https://orcid.org/0000-0002-3445-5711"},"institutions":[{"id":"https://openalex.org/I99464096","display_name":"KU Leuven","ror":"https://ror.org/05f950310","country_code":"BE","type":"education","lineage":["https://openalex.org/I99464096"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Luc Van Gool","raw_affiliation_strings":["Computer Vision Lab, ETH Z&#x00FC;rich, Zrich, Switzerland","Department of Electrical Engineering, KU Leuven, Leuven, Belgium"],"raw_orcid":"https://orcid.org/0000-0002-3445-5711","affiliations":[{"raw_affiliation_string":"Computer Vision Lab, ETH Z&#x00FC;rich, Zrich, Switzerland","institution_ids":[]},{"raw_affiliation_string":"Department of Electrical Engineering, KU Leuven, Leuven, Belgium","institution_ids":["https://openalex.org/I99464096"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5078838951"],"corresponding_institution_ids":["https://openalex.org/I4210109712"],"apc_list":null,"apc_paid":null,"fwci":1.1893,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.76332694,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":"45","issue":"1","first_page":"123","last_page":"136"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/binaural-recording","display_name":"Binaural recording","score":0.8758665323257446},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8328785300254822},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5714050531387329},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5500820279121399},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5124431252479553},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.4958737790584564},{"id":"https://openalex.org/keywords/sound-localization","display_name":"Sound localization","score":0.4714523255825043},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.46655648946762085},{"id":"https://openalex.org/keywords/auditory-scene-analysis","display_name":"Auditory scene analysis","score":0.4520249366760254},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.42578524351119995},{"id":"https://openalex.org/keywords/orientation","display_name":"Orientation (vector space)","score":0.41889840364456177},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.2600449025630951},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.1329679787158966}],"concepts":[{"id":"https://openalex.org/C201247586","wikidata":"https://www.wikidata.org/wiki/Q5612967","display_name":"Binaural recording","level":2,"score":0.8758665323257446},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8328785300254822},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5714050531387329},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5500820279121399},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5124431252479553},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.4958737790584564},{"id":"https://openalex.org/C68236139","wikidata":"https://www.wikidata.org/wiki/Q765652","display_name":"Sound localization","level":2,"score":0.4714523255825043},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.46655648946762085},{"id":"https://openalex.org/C38129911","wikidata":"https://www.wikidata.org/wiki/Q4820038","display_name":"Auditory scene analysis","level":3,"score":0.4520249366760254},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.42578524351119995},{"id":"https://openalex.org/C16345878","wikidata":"https://www.wikidata.org/wiki/Q107472979","display_name":"Orientation (vector space)","level":2,"score":0.41889840364456177},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2600449025630951},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.1329679787158966},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0}],"mesh":[{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D003463","descriptor_name":"Cues","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D003463","descriptor_name":"Cues","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D003463","descriptor_name":"Cues","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007858","descriptor_name":"Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007858","descriptor_name":"Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007858","descriptor_name":"Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D012660","descriptor_name":"Semantics","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D012660","descriptor_name":"Semantics","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D012660","descriptor_name":"Semantics","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D013016","descriptor_name":"Sound","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013016","descriptor_name":"Sound","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013016","descriptor_name":"Sound","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false}],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2022.3155643","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2022.3155643","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:35239475","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/35239475","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7699999809265137,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":105,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1523224409","https://openalex.org/W1581848821","https://openalex.org/W1937507046","https://openalex.org/W1971920230","https://openalex.org/W2024787486","https://openalex.org/W2026401213","https://openalex.org/W2028184103","https://openalex.org/W2031489346","https://openalex.org/W2032337854","https://openalex.org/W2038484192","https://openalex.org/W2048237443","https://openalex.org/W2049193094","https://openalex.org/W2067503923","https://openalex.org/W2069057506","https://openalex.org/W2072604753","https://openalex.org/W2110764733","https://openalex.org/W2115579991","https://openalex.org/W2116101591","https://openalex.org/W2120847449","https://openalex.org/W2122497675","https://openalex.org/W2159346755","https://openalex.org/W2171819471","https://openalex.org/W2171834532","https://openalex.org/W2253728777","https://openalex.org/W2340897893","https://openalex.org/W2343077198","https://openalex.org/W2412782625","https://openalex.org/W2511428026","https://openalex.org/W2560474170","https://openalex.org/W2584505851","https://openalex.org/W2619697695","https://openalex.org/W2737712196","https://openalex.org/W2742299865","https://openalex.org/W2764198839","https://openalex.org/W2767709712","https://openalex.org/W2800288142","https://openalex.org/W2886300652","https://openalex.org/W2891715141","https://openalex.org/W2897451716","https://openalex.org/W2939726645","https://openalex.org/W2962703836","https://openalex.org/W2962723698","https://openalex.org/W2962756039","https://openalex.org/W2962865004","https://openalex.org/W2962866891","https://openalex.org/W2962960500","https://openalex.org/W2963061226","https://openalex.org/W2963115079","https://openalex.org/W2963218389","https://openalex.org/W2963680395","https://openalex.org/W2964109005","https://openalex.org/W2964121744","https://openalex.org/W2964309882","https://openalex.org/W2964345931","https://openalex.org/W2970603850","https://openalex.org/W2970746371","https://openalex.org/W2981851635","https://openalex.org/W2982624843","https://openalex.org/W2985775862","https://openalex.org/W2988200020","https://openalex.org/W2989980422","https://openalex.org/W2990408345","https://openalex.org/W2995233853","https://openalex.org/W2995254904","https://openalex.org/W3000389243","https://openalex.org/W3034742263","https://openalex.org/W3085046840","https://openalex.org/W3089887959","https://openalex.org/W3089944088","https://openalex.org/W3092237241","https://openalex.org/W3096411729","https://openalex.org/W3096780661","https://openalex.org/W3101537861","https://openalex.org/W3103348781","https://openalex.org/W3104529101","https://openalex.org/W3106362698","https://openalex.org/W3108332675","https://openalex.org/W3108655859","https://openalex.org/W3121780787","https://openalex.org/W3122808968","https://openalex.org/W3130951396","https://openalex.org/W3138953166","https://openalex.org/W3162322471","https://openalex.org/W3174854700","https://openalex.org/W3174856432","https://openalex.org/W3176232375","https://openalex.org/W3188558905","https://openalex.org/W3190580390","https://openalex.org/W3207649350","https://openalex.org/W4287608901","https://openalex.org/W4293665662","https://openalex.org/W6631190155","https://openalex.org/W6678516059","https://openalex.org/W6729831399","https://openalex.org/W6738607494","https://openalex.org/W6748481559","https://openalex.org/W6755528852","https://openalex.org/W6767784004","https://openalex.org/W6770805772","https://openalex.org/W6771599870","https://openalex.org/W6771763809","https://openalex.org/W6784119104","https://openalex.org/W6785011006","https://openalex.org/W6792340124"],"related_works":["https://openalex.org/W1991848873","https://openalex.org/W3004570917","https://openalex.org/W4389240440","https://openalex.org/W2084430325","https://openalex.org/W2041661331","https://openalex.org/W2539207221","https://openalex.org/W2045803470","https://openalex.org/W2242743481","https://openalex.org/W1571953124","https://openalex.org/W4389102442"],"abstract_inverted_index":{"Humans":[0],"can":[1,151],"robustly":[2],"recognize":[3],"and":[4,61,77,91,102,125,230,237,245,262],"localize":[5],"objects":[6],"by":[7,248],"using":[8,155],"visual":[9,22,101],"and/or":[10],"auditory":[11,149,257],"cues.":[12],"While":[13],"machines":[14],"are":[15,216,234,254,264],"able":[16],"to":[17,136,173,193],"do":[18],"the":[19,50,56,62,66,131,138,142,148,161,175,183,195,213,224,228,242,249,267],"same":[20,139],"with":[21,30,86],"data":[23,261],"already,":[24],"less":[25],"work":[26,33],"has":[27],"been":[28],"done":[29],"sounds.":[31,44,179],"This":[32,146],"develops":[34],"an":[35],"approach":[36],"for":[37,107,208,256],"scene":[38],"understanding":[39],"purely":[40],"based":[41],"on":[42,266],"binaural":[43,89],"The":[45,98,260],"considered":[46],"tasks":[47,185,215],"include":[48],"predicting":[49],"semantic":[51],"masks":[52],"of":[53,58,65,83,100,120,178,232],"sound-making":[54,59],"objects,":[55,60],"motion":[57],"depth":[63],"map":[64],"scene.":[67],"To":[68,158],"this":[69],"aim,":[70],"we":[71,112,163],"propose":[72,164],"a":[73,79,92,114,126],"novel":[74,166],"sensor":[75],"setup":[76],"record":[78],"new":[80],"audio-visual":[81],"dataset":[82],"street":[84],"scenes":[85],"eight":[87],"professional":[88],"microphones":[90,233],"360":[93],"<inline-formula><tex-math":[94],"notation=\"LaTeX\">$\\mathrm{^{\\circ":[95],"}}$</tex-math></inline-formula>":[96],"camera.":[97],"co-existence":[99],"audio":[103],"cues":[104],"is":[105,134],"leveraged":[106],"supervision":[108],"transfer.":[109],"In":[110],"particular,":[111],"employ":[113],"cross-modal":[115],"distillation":[116],"framework":[117],"that":[118,201],"consists":[119],"multiple":[121],"vision":[122],"\u2018teacher\u2019":[123],"methods":[124,144],"sound":[127],"\u2018student\u2019":[128],"method":[129,133,204],"\u2013":[130,219],"student":[132],"trained":[135,153],"generate":[137],"results":[140,199,207],"as":[141],"teacher":[143],"do.":[145],"way,":[147],"system":[150],"be":[152],"without":[154],"human":[156],"annotations.":[157],"further":[159],"boost":[160,194],"performance,":[162,226],"another":[165],"auxiliary":[167],"task,":[168],"coined":[169],"Spatial":[170],"Sound":[171],"Super-Resolution,":[172],"increase":[174],"directional":[176],"resolution":[177],"We":[180],"then":[181],"formulate":[182],"four":[184,210,214],"into":[186],"one":[187],"end-to-end":[188],"trainable":[189],"multi-tasking":[190],"network":[191],"aiming":[192],"overall":[196],"performance.":[197],"Experimental":[198],"show":[200],"1)":[202],"our":[203],"achieves":[205,223],"good":[206],"all":[209],"tasks,":[211],"2)":[212],"mutually":[217],"beneficial":[218],"training":[220],"them":[221],"together":[222],"best":[225],"3)":[227],"number":[229],"orientation":[231],"both":[235],"important,":[236],"4)":[238],"features":[239,246],"learned":[240],"from":[241],"standard":[243],"spectrogram":[244],"obtained":[247],"classic":[250],"signal":[251],"processing":[252],"pipeline":[253],"complementary":[255],"perception":[258],"tasks.":[259],"code":[263],"released":[265],"project":[268],"page:":[269],"<uri>https://www.trace.ethz.ch/publications/2020/sound_perception/index.html</uri>":[270],".":[271]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1}],"updated_date":"2026-05-16T08:24:45.110214","created_date":"2025-10-10T00:00:00"}
