{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,30]],"date-time":"2025-12-30T08:54:02Z","timestamp":1767084842256,"version":"build-2065373602"},"publisher-location":"Singapore","reference-count":47,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819609079"},{"type":"electronic","value":"9789819609086"}],"license":[{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0908-6_20","type":"book-chapter","created":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T19:26:21Z","timestamp":1733513181000},"page":"347-364","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["3D-Aware Instance Segmentation and\u00a0Tracking in\u00a0Egocentric Videos"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7775-6250","authenticated-orcid":false,"given":"Yash","family":"Bhalgat","sequence":"first","affiliation":[]},{"given":"Vadim","family":"Tschernezki","sequence":"additional","affiliation":[]},{"given":"Iro","family":"Laina","sequence":"additional","affiliation":[]},{"given":"Jo\u00e3o F.","family":"Henriques","sequence":"additional","affiliation":[]},{"given":"Andrea","family":"Vedaldi","sequence":"additional","affiliation":[]},{"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,7]]},"reference":[{"key":"20_CR1","unstructured":"Aharon, N., Orfaig, R., Bobrovsky, B.Z.: Bot-sort: Robust associations multi-pedestrian tracking. arXiv preprint arXiv:2206.14651 (2022)"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Athar, A., Luiten, J., Voigtlaender, P., Khurana, T., Dave, A., Leibe, B., Ramanan, D.: Burst: A benchmark for unifying object recognition, segmentation and tracking in video. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision. pp. 1674\u20131683 (2023)","DOI":"10.1109\/WACV56688.2023.00172"},{"key":"20_CR3","unstructured":"Bhalgat, Y., Laina, I., Henriques, J.F., Vedaldi, A., Zisserman, A.: Contrastive lift: 3d object instance segmentation by slow-fast contrastive fusion. Advances in Neural Information Processing Systems 36 (2024)"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Bhalgat, Y., Laina, I., Henriques, J.F., Zisserman, A., Vedaldi, A.: N2f2: Hierarchical scene understanding with nested neural feature fields. arXiv preprint arXiv:2403.10997 (2024)","DOI":"10.1007\/978-3-031-73202-7_12"},{"key":"20_CR5","unstructured":"Caelles, S., Pont-Tuset, J., Perazzi, F., Montes, A., Maninis, K.K., Van\u00a0Gool, L.: The 2019 davis challenge on vos: Unsupervised multi-object segmentation. arXiv preprint arXiv:1905.00737 (2019)"},{"key":"20_CR6","doi-asserted-by":"crossref","unstructured":"Cao, J., Pang, J., Weng, X., Khirodkar, R., Kitani, K.: Observation-centric sort: Rethinking sort for robust multi-object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 9686\u20139696 (2023)","DOI":"10.1109\/CVPR52729.2023.00934"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Chen, H., Blomqvist, K., Milano, F., Siegwart, R.: Panoptic vision-language feature fields. IEEE Robotics and Automation Letters (2024)","DOI":"10.1109\/IROS55552.2023.10342275"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Cheng, H.K., Oh, S.W., Price, B., Schwing, A., Lee, J.Y.: Tracking anything with decoupled video segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 1316\u20131326 (2023)","DOI":"10.1109\/ICCV51070.2023.00127"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Cheng, H.K., Schwing, A.G.: Xmem: Long-term video object segmentation with an atkinson-shiffrin memory model. In: European Conference on Computer Vision. pp. 640\u2013658. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Choudhuri, A., Chowdhary, G., Schwing, A.G.: Assignment-space-based multi-object tracking and segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 13598\u201313607 (2021)","DOI":"10.1109\/ICCV48922.2021.01334"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Choudhuri, A., Chowdhary, G., Schwing, A.G.: Context-aware relative object queries to unify video instance and panoptic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 6377\u20136386 (2023)","DOI":"10.1109\/CVPR52729.2023.00617"},{"issue":"11","key":"20_CR12","doi-asserted-by":"publisher","first-page":"4125","DOI":"10.1109\/TPAMI.2020.2991965","volume":"43","author":"D Damen","year":"2021","unstructured":"Damen, D., Doughty, H., Farinella, G.M., Fidler, S., Furnari, A., Kazakos, E., Moltisanti, D., Munro, J., Perrett, T., Price, W., Wray, M.: The epic-kitchens dataset: Collection, challenges and baselines. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI) 43(11), 4125\u20134141 (2021). https:\/\/doi.org\/10.1109\/TPAMI.2020.2991965","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"20_CR13","first-page":"13610","volume":"35","author":"C Doersch","year":"2022","unstructured":"Doersch, C., Gupta, A., Markeeva, L., Recasens, A., Smaira, L., Aytar, Y., Carreira, J., Zisserman, A., Yang, Y.: Tap-vid: A benchmark for tracking any point in a video. Adv. Neural. Inf. Process. Syst. 35, 13610\u201313626 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Doersch, C., Yang, Y., Vecerik, M., Gokay, D., Gupta, A., Aytar, Y., Carreira, J., Zisserman, A.: Tapir: Tracking any point with per-frame initialization and temporal refinement. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 10061\u201310072 (2023)","DOI":"10.1109\/ICCV51070.2023.00923"},{"key":"20_CR15","unstructured":"Grauman, K., Westbury, A., Byrne, E., Chavis, Z., Furnari, A., Girdhar, R., Hamburger, J., Jiang, H., Liu, M., Liu, X., et\u00a0al.: Ego4d: Around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18995\u201319012 (2022)"},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Gu, Q., Lv, Z., Frost, D., Green, S., Straub, J., Sweeney, C.: Egolifter: Open-world 3d segmentation for egocentric perception. arXiv preprint arXiv:2403.18118 (2024)","DOI":"10.1007\/978-3-031-72775-7_22"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Fang, Z., Fragkiadaki, K.: Particle video revisited: Tracking through occlusions using point trajectories. In: European Conference on Computer Vision. pp. 59\u201375. Springer (2022)","DOI":"10.1007\/978-3-031-20047-2_4"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Karaev, N., Rocco, I., Graham, B., Neverova, N., Vedaldi, A., Rupprecht, C.: Cotracker: It is better to track together. arXiv preprint arXiv:2307.07635 (2023)","DOI":"10.1007\/978-3-031-73033-7_2"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Kim, C.M., Wu, M., Kerr, J., Goldberg, K., Tancik, M., Kanazawa, A.: Garfield: Group anything with radiance fields. In: CVPR. pp. 21530\u201321539 (2024)","DOI":"10.1109\/CVPR52733.2024.02034"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A.C., Lo, W.Y., et\u00a0al.: Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Li, S., Ke, L., Danelljan, M., Piccinelli, L., Segu, M., Van\u00a0Gool, L., Yu, F.: Matching anything by segmenting anything. CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01794"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Liu, S., Zeng, Z., Ren, T., Li, F., Zhang, H., Yang, J., Li, C., Yang, J., Su, H., Zhu, J., et\u00a0al.: Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"20_CR23","doi-asserted-by":"publisher","first-page":"548","DOI":"10.1007\/s11263-020-01375-2","volume":"129","author":"J Luiten","year":"2021","unstructured":"Luiten, J., Osep, A., Dendorfer, P., Torr, P., Geiger, A., Leal-Taix\u00e9, L., Leibe, B.: Hota: A higher order metric for evaluating multi-object tracking. Int. J. Comput. Vision 129, 548\u2013578 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Maggiolino, G., Ahmad, A., Cao, J., Kitani, K.: Deep oc-sort: Multi-pedestrian tracking by adaptive re-identification. In: 2023 IEEE International Conference on Image Processing (ICIP). pp. 3025\u20133029. IEEE (2023)","DOI":"10.1109\/ICIP49359.2023.10222576"},{"key":"20_CR25","volume-title":"Alexey Gritsenko","author":"M Minderer","year":"2023","unstructured":"Minderer, M.: Alexey Gritsenko. Scaling open-vocabulary object detection. NeurIPS, N.H. (2023)"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Meinhardt, T., Kirillov, A., Leal-Taixe, L., Feichtenhofer, C.: Trackformer: Multi-object tracking with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 8844\u20138854 (2022)","DOI":"10.1109\/CVPR52688.2022.00864"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Oh, S.W., Lee, J.Y., Xu, N., Kim, S.J.: Video object segmentation using space-time memory networks. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp. 9226\u20139235 (2019)","DOI":"10.1109\/ICCV.2019.00932"},{"key":"20_CR28","unstructured":"Oquab, M., Darcet, T., Moutakanni, T., Vo, H.V., Szafraniec, M., Khalidov, V., Fernandez, P., Haziza, D., Massa, F., El-Nouby, A., Howes, R., Huang, P.Y., Xu, H., Sharma, V., Li, S.W., Galuba, W., Rabbat, M., Assran, M., Ballas, N., Synnaeve, G., Misra, I., Jegou, H., Mairal, J., Labatut, P., Joulin, A., Bojanowski, P.: Dinov2: Learning robust visual features without supervision (2023)"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van\u00a0Gool, L., Gross, M., Sorkine-Hornung, A.: A benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 724\u2013732 (2016)","DOI":"10.1109\/CVPR.2016.85"},{"key":"20_CR30","unstructured":"Plizzari, C., Goel, S., Perrett, T., Chalk, J., Kanazawa, A., Damen, D.: Spatial cognition from egocentric video: Out of sight, not out of mind. In: ArXiv (2024)"},{"key":"20_CR31","doi-asserted-by":"crossref","unstructured":"Qiao, S., Zhu, Y., Adam, H., Yuille, A., Chen, L.C.: Vip-deeplab: Learning visual perception with depth-aware video panoptic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3997\u20134008 (2021)","DOI":"10.1109\/CVPR46437.2021.00399"},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Rajasegaran, J., Pavlakos, G., Kanazawa, A., Malik, J.: Tracking people by predicting 3d appearance, location and pose. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 2740\u20132749 (June 2022)","DOI":"10.1109\/CVPR52688.2022.00276"},{"key":"20_CR33","unstructured":"Raji\u010d, F., Ke, L., Tai, Y.W., Tang, C.K., Danelljan, M., Yu, F.: Segment anything meets point tracking. arXiv preprint arXiv:2307.01197 (2023)"},{"key":"20_CR34","unstructured":"Santrock, J.W.: A topical approach to life-span development. McGraw Hill (2002)"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Siddiqui, Y., Porzi, L., Bul\u00f3, S.R., M\u00fcller, N., Nie\u00dfner, M., Dai, A., Kontschieder, P.: Panoptic lifting for 3d scene understanding with neural fields. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 9043\u20139052 (2023)","DOI":"10.1109\/CVPR52729.2023.00873"},{"key":"20_CR36","unstructured":"Tschernezki, V., Darkhalil, A., Zhu, Z., Fouhey, D., Larina, I., Larlus, D., Damen, D., Vedaldi, A.: EPIC Fields: Marrying 3D Geometry and Video Understanding. In: Proceedings of the Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"20_CR37","doi-asserted-by":"crossref","unstructured":"Voigtlaender, P., Chai, Y., Schroff, F., Adam, H., Leibe, B., Chen, L.C.: Feelvos: Fast end-to-end embedding learning for video object segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 9481\u20139490 (2019)","DOI":"10.1109\/CVPR.2019.00971"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xu, Z., Wang, X., Shen, C., Cheng, B., Shen, H., Xia, H.: End-to-end video instance segmentation with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 8741\u20138750 (2021)","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Wu, J., Jiang, Y., Bai, S., Zhang, W., Bai, X.: Seqformer: Sequential transformer for video instance segmentation. In: European Conference on Computer Vision. pp. 553\u2013569. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_32"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Yang, L., Fan, Y., Xu, N.: Video instance segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp. 5188\u20135197 (2019)","DOI":"10.1109\/ICCV.2019.00529"},{"key":"20_CR41","first-page":"2491","volume":"34","author":"Z Yang","year":"2021","unstructured":"Yang, Z., Wei, Y., Yang, Y.: Associating objects with transformers for video object segmentation. Adv. Neural. Inf. Process. Syst. 34, 2491\u20132502 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR42","doi-asserted-by":"crossref","unstructured":"Ye, M., Danelljan, M., Yu, F., Ke, L.: Gaussian grouping: Segment and edit anything in 3d scenes. In: ECCV (2024)","DOI":"10.1007\/978-3-031-73397-0_10"},{"key":"20_CR43","doi-asserted-by":"crossref","unstructured":"Ying, H., Yin, Y., Zhang, J., Wang, F., Yu, T., Huang, R., Fang, L.: Omniseg3d: Omniversal 3d segmentation via hierarchical contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 20612\u201320622 (2024)","DOI":"10.1109\/CVPR52733.2024.01948"},{"key":"20_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Sun, P., Jiang, Y., Yu, D., Weng, F., Yuan, Z., Luo, P., Liu, W., Wang, X.: Bytetrack: Multi-object tracking by associating every detection box. In: Proceedings of the European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-20047-2_1"},{"issue":"6","key":"20_CR45","doi-asserted-by":"publisher","first-page":"7099","DOI":"10.1109\/TPAMI.2022.3225573","volume":"45","author":"T Zhou","year":"2022","unstructured":"Zhou, T., Porikli, F., Crandall, D.J., Van Gool, L., Wang, W.: A survey on deep learning technique for video segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 45(6), 7099\u20137122 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"20_CR46","doi-asserted-by":"crossref","unstructured":"Zhou, X., Girdhar, R., Joulin, A., Kr\u00e4henb\u00fchl, P., Misra, I.: Detecting twenty-thousand classes using image-level supervision. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"20_CR47","doi-asserted-by":"crossref","unstructured":"Zhou, X., Koltun, V., Kr\u00e4henb\u00fchl, P.: Tracking objects as points. In: European conference on computer vision. pp. 474\u2013490. Springer (2020)","DOI":"10.1007\/978-3-030-58548-8_28"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0908-6_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T20:14:05Z","timestamp":1733516045000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0908-6_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,7]]},"ISBN":["9789819609079","9789819609086"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0908-6_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,7]]},"assertion":[{"value":"7 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}