{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:16:01Z","timestamp":1775578561190,"version":"3.50.1"},"reference-count":48,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100013096","name":"Science and Technology Project of State Grid","doi-asserted-by":"publisher","award":["5108-202218280A-2-249-XG"],"award-info":[{"award-number":["5108-202218280A-2-249-XG"]}],"id":[{"id":"10.13039\/501100013096","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Displays"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1016\/j.displa.2023.102553","type":"journal-article","created":{"date-parts":[[2023,10,10]],"date-time":"2023-10-10T13:10:08Z","timestamp":1696943408000},"page":"102553","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":12,"special_numbering":"C","title":["Self-supervised deep monocular visual odometry and depth estimation with observation variation"],"prefix":"10.1016","volume":"80","author":[{"given":"Wentao","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Yanbo","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zehao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Rui","family":"Li","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Xiao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1943-1535","authenticated-orcid":false,"given":"Jingchuan","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Rui","family":"Guo","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.displa.2023.102553_b1","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1851","article-title":"Unsupervised learning of depth and ego-motion from 
video","author":"Zhou","year":"2017"},{"issue":"9","key":"10.1016\/j.displa.2023.102553_b2","doi-asserted-by":"crossref","first-page":"2548","DOI":"10.1007\/s11263-021-01484-6","article-title":"Unsupervised scale-consistent depth learning from video","volume":"129","author":"Bian","year":"2021","journal-title":"Int. J. Comput. Vis."},{"issue":"5","key":"10.1016\/j.displa.2023.102553_b3","doi-asserted-by":"crossref","first-page":"1255","DOI":"10.1109\/TRO.2017.2705103","article-title":"Orb-SLAM2: An open-source SLAM system for monocular, stereo, and RGB-D cameras","volume":"33","author":"MurArtal","year":"2017","journal-title":"IEEE Trans. Robot."},{"key":"10.1016\/j.displa.2023.102553_b4","doi-asserted-by":"crossref","unstructured":"W. Zhao, S. Liu, Y. Shu, Y.-J. Liu, Towards better generalization: Joint depth-pose learning without posenet, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 9151\u20139161.","DOI":"10.1109\/CVPR42600.2020.00917"},{"issue":"5","key":"10.1016\/j.displa.2023.102553_b5","doi-asserted-by":"crossref","first-page":"3173","DOI":"10.1109\/TRO.2022.3164834","article-title":"Improving monocular visual odometry using learned depth","volume":"38","author":"Sun","year":"2022","journal-title":"IEEE Trans. Robot."},{"key":"10.1016\/j.displa.2023.102553_b6","unstructured":"S.F. Bhat, I. Alhashim, P. Wonka, Adabins: Depth estimation using adaptive bins, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 4009\u20134018."},{"issue":"1","key":"10.1016\/j.displa.2023.102553_b7","doi-asserted-by":"crossref","first-page":"36","DOI":"10.1109\/MRA.2022.3228492","article-title":"Long-term visual simultaneous localization and mapping: using a bayesian persistence filter-based global map prediction","volume":"30","author":"Deng","year":"2023","journal-title":"IEEE Robotics & Automation Magazine"},{"key":"10.1016\/j.displa.2023.102553_b8","series-title":"Advances in Neural Information Processing Systems, Vol. 32","article-title":"Unsupervised scale-consistent depth and ego-motion learning from monocular video","author":"Bian","year":"2019"},{"key":"10.1016\/j.displa.2023.102553_b9","doi-asserted-by":"crossref","unstructured":"T. Sattler, Q. Zhou, M. Pollefeys, L. Leal-Taixe, Understanding the limitations of cnn-based absolute camera pose regression, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 3302\u20133312.","DOI":"10.1109\/CVPR.2019.00342"},{"key":"10.1016\/j.displa.2023.102553_b10","series-title":"2017 IEEE International Conference on Robotics and Automation","first-page":"2043","article-title":"Deepvo: Towards end-to-end visual odometry with deep recurrent convolutional neural networks","author":"Wang","year":"2017"},{"key":"10.1016\/j.displa.2023.102553_b11","doi-asserted-by":"crossref","unstructured":"Z. Li, G. Wang, X. Ji, CDPN: Coordinates-based disentangled pose network for real-time rgb-based 6-dof object pose estimation, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 7678\u20137687.","DOI":"10.1109\/ICCV.2019.00777"},{"key":"10.1016\/j.displa.2023.102553_b12","series-title":"Advances in Neural Information Processing Systems. Vol. 
30","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"issue":"11","key":"10.1016\/j.displa.2023.102553_b13","doi-asserted-by":"crossref","first-page":"1231","DOI":"10.1177\/0278364913491297","article-title":"Vision meets robotics: The kitti dataset","volume":"32","author":"Geiger","year":"2013","journal-title":"Int. J. Robot. Res."},{"issue":"6","key":"10.1016\/j.displa.2023.102553_b14","doi-asserted-by":"crossref","first-page":"1874","DOI":"10.1109\/TRO.2021.3075644","article-title":"Orb-SLAM3: An accurate open-source library for visual, visual\u2013inertial, and multimap SLAM","volume":"37","author":"Campos","year":"2021","journal-title":"IEEE Trans. Robot."},{"key":"10.1016\/j.displa.2023.102553_b15","series-title":"Multiple View Geometry in Computer Vision","author":"Hartley","year":"2003"},{"key":"10.1016\/j.displa.2023.102553_b16","series-title":"Advances in Neural Information Processing Systems. Vol. 27","article-title":"Depth map prediction from a single image using a multi-scale deep network","author":"Eigen","year":"2014"},{"key":"10.1016\/j.displa.2023.102553_b17","series-title":"Advances in Neural Information Processing Systems. Vol. 34","first-page":"16558","article-title":"Droid-SLAM: Deep visual SLAM for monocular, stereo, and RGB-D cameras","author":"Teed","year":"2021"},{"key":"10.1016\/j.displa.2023.102553_b18","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part II. Vol. 16","first-page":"402","article-title":"Raft: Recurrent all-pairs field transforms for optical flow","author":"Teed","year":"2020"},{"key":"10.1016\/j.displa.2023.102553_b19","series-title":"2018 IEEE International Conference on Robotics and Automation","first-page":"7286","article-title":"Undeepvo: Monocular visual odometry through unsupervised deep learning","author":"Li","year":"2018"},{"key":"10.1016\/j.displa.2023.102553_b20","doi-asserted-by":"crossref","unstructured":"H. Zhan, R. Garg, C.S. Weerasekera, K. Li, H. Agarwal, I. Reid, Unsupervised learning of monocular depth estimation and visual odometry with deep feature reconstruction, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 340\u2013349.","DOI":"10.1109\/CVPR.2018.00043"},{"key":"10.1016\/j.displa.2023.102553_b21","series-title":"2022 International Conference on Robotics and Automation","first-page":"7605","article-title":"Self-supervised ego-motion estimation based on multi-layer fusion of RGB and inferred depth","author":"Jiang","year":"2022"},{"key":"10.1016\/j.displa.2023.102553_b22","doi-asserted-by":"crossref","unstructured":"Z. Yin, J. Shi, Geonet: Unsupervised learning of dense depth, optical flow and camera pose, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 1983\u20131992.","DOI":"10.1109\/CVPR.2018.00212"},{"key":"10.1016\/j.displa.2023.102553_b23","doi-asserted-by":"crossref","unstructured":"A. Ranjan, V. Jampani, L. Balles, K. Kim, D. Sun, J. Wulff, M.J. Black, Competitive collaboration: Joint unsupervised learning of depth, camera motion, optical flow and motion segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 
12240\u201312249.","DOI":"10.1109\/CVPR.2019.01252"},{"issue":"4","key":"10.1016\/j.displa.2023.102553_b24","doi-asserted-by":"crossref","first-page":"71","DOI":"10.1145\/3386569.3392377","article-title":"Consistent video depth estimation","volume":"39","author":"Luo","year":"2020","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"10.1016\/j.displa.2023.102553_b25","doi-asserted-by":"crossref","DOI":"10.1109\/TITS.2022.3182858","article-title":"Unsupervised learning of optical flow with non-occlusion from geometry","author":"Wang","year":"2022","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.displa.2023.102553_b26","doi-asserted-by":"crossref","unstructured":"W. Ye, X. Lan, S. Chen, Y. Ming, X. Yu, H. Bao, Z. Cui, G. Zhang, PVO: Panoptic visual odometry, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 9579\u20139589.","DOI":"10.1109\/CVPR52729.2023.00924"},{"key":"10.1016\/j.displa.2023.102553_b27","doi-asserted-by":"crossref","unstructured":"S. Lee, S. Im, S. Lin, I.S. Kweon, Learning monocular depth in dynamic scenes via instance-aware projection consistency, in: Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 35, 2021, pp. 1863\u20131872.","DOI":"10.1609\/aaai.v35i3.16281"},{"key":"10.1016\/j.displa.2023.102553_b28","series-title":"2023 IEEE International Conference on Robotics and Automation","first-page":"6131","article-title":"Deep unsupervised visual odometry via bundle adjusted pose graph optimization","author":"Lu","year":"2023"},{"key":"10.1016\/j.displa.2023.102553_b29","series-title":"2023 IEEE International Conference on Robotics and Automation","first-page":"6175","article-title":"MOFT: Monocular odometry based on deep depth and careful feature selection and tracking","author":"Koledi\u0107","year":"2023"},{"key":"10.1016\/j.displa.2023.102553_b30","doi-asserted-by":"crossref","unstructured":"T.-W. Hui, RM-depth: Unsupervised learning of recurrent monocular depth in dynamic scenes, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 1675\u20131684.","DOI":"10.1109\/CVPR52688.2022.00172"},{"key":"10.1016\/j.displa.2023.102553_b31","first-page":"1","article-title":"Attention mechanisms in computer vision: A survey","author":"Guo","year":"2022","journal-title":"Comput. Vis. Media"},{"key":"10.1016\/j.displa.2023.102553_b32","series-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"10.1016\/j.displa.2023.102553_b33","series-title":"Advances in Neural Information Processing Systems. Vol. 33","first-page":"1877","article-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"key":"10.1016\/j.displa.2023.102553_b34","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.displa.2023.102553_b35","series-title":"Deformable detr: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2020"},{"key":"10.1016\/j.displa.2023.102553_b36","series-title":"Mobilevit: Light-weight, general-purpose, and mobile-friendly vision transformer","author":"Mehta","year":"2021"},{"key":"10.1016\/j.displa.2023.102553_b37","doi-asserted-by":"crossref","unstructured":"Z. Liu, Y. Lin, Y. Cao, H. Hu, Y. Wei, Z. Zhang, S. Lin, B. 
Guo, Swin transformer: Hierarchical vision transformer using shifted windows, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.displa.2023.102553_b38","series-title":"Separable self-attention for mobile vision transformers","author":"Mehta","year":"2022"},{"key":"10.1016\/j.displa.2023.102553_b39","series-title":"2022 International Conference on 3D Vision","first-page":"668","article-title":"Monovit: Self-supervised monocular depth estimation with a vision transformer","author":"Zhao","year":"2022"},{"key":"10.1016\/j.displa.2023.102553_b40","doi-asserted-by":"crossref","unstructured":"J. Hu, L. Shen, G. Sun, Squeeze-and-excitation networks, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 7132\u20137141.","DOI":"10.1109\/CVPR.2018.00745"},{"key":"10.1016\/j.displa.2023.102553_b41","doi-asserted-by":"crossref","unstructured":"S. Woo, J. Park, J.-Y. Lee, I.S. Kweon, CBAM: Convolutional block attention module, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 3\u201319.","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"10.1016\/j.displa.2023.102553_b42","doi-asserted-by":"crossref","unstructured":"Y. Shavit, R. Ferens, Y. Keller, Learning multi-scene absolute pose regression with transformers, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 2733\u20132742.","DOI":"10.1109\/ICCV48922.2021.00273"},{"key":"10.1016\/j.displa.2023.102553_b43","doi-asserted-by":"crossref","unstructured":"V. Guizilini, R. Ambrus, S. Pillai, A. Raventos, A. Gaidon, 3D packing for self-supervised monocular depth estimation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 2485\u20132494.","DOI":"10.1109\/CVPR42600.2020.00256"},{"key":"10.1016\/j.displa.2023.102553_b44","doi-asserted-by":"crossref","unstructured":"C. Godard, O. Mac Aodha, M. Firman, G.J. Brostow, Digging into Self-Supervised Monocular Depth Prediction, in: The International Conference on Computer Vision, ICCV, 2019.","DOI":"10.1109\/ICCV.2019.00393"},{"key":"10.1016\/j.displa.2023.102553_b45","doi-asserted-by":"crossref","unstructured":"J. Bae, S. Moon, S. Im, Deep digging into the generalization of self-supervised monocular depth estimation, in: Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 37. No. 1, 2023, pp. 187\u2013196.","DOI":"10.1609\/aaai.v37i1.25090"},{"key":"10.1016\/j.displa.2023.102553_b46","doi-asserted-by":"crossref","unstructured":"R. Fan, M. Poggi, S. Mattoccia, Contrastive Learning for Depth Prediction, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 
3225\u20133236.","DOI":"10.1109\/CVPRW59228.2023.00325"},{"key":"10.1016\/j.displa.2023.102553_b47","series-title":"2019 International Conference on Robotics and Automation","first-page":"6359","article-title":"Beyond photometric loss for self-supervised ego-motion estimation","author":"Shen","year":"2019"},{"key":"10.1016\/j.displa.2023.102553_b48","series-title":"2011 IEEE Intelligent Vehicles Symposium","first-page":"963","article-title":"Stereoscan: Dense 3d reconstruction in real-time","author":"Geiger","year":"2011"}],"container-title":["Displays"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938223001865?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938223001865?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,9,26]],"date-time":"2025-09-26T11:54:06Z","timestamp":1758887646000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0141938223001865"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12]]},"references-count":48,"alternative-id":["S0141938223001865"],"URL":"https:\/\/doi.org\/10.1016\/j.displa.2023.102553","relation":{},"ISSN":["0141-9382"],"issn-type":[{"value":"0141-9382","type":"print"}],"subject":[],"published":{"date-parts":[[2023,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Self-supervised deep monocular visual odometry and depth estimation with observation variation","name":"articletitle","label":"Article Title"},{"value":"Displays","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.displa.2023.102553","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2023 Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"102553"}}
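The record above is the standard envelope returned by the public Crossref REST API (`https://api.crossref.org/works/{DOI}`): the work metadata sits under the `message` key. As a minimal sketch of how such a record can be fetched and a few of its fields read back, assuming only the `requests` library and the public endpoint (the contact address passed as `mailto` is a placeholder, recommended by Crossref's polite-pool convention):

```python
import requests

# Fetch the Crossref work record for the article shown above.
DOI = "10.1016/j.displa.2023.102553"
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.org"},  # placeholder contact for the polite pool
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]  # work metadata lives under "message"

# Read back a few fields exactly as they appear in the record above.
print(work["title"][0])                    # "title" is a list of strings
print(work["DOI"], "->", work["URL"])      # DOI and its doi.org URL
print("deposited references:", work["references-count"])
for ref in work.get("reference", [])[:3]:  # reference entries are heterogeneous
    print("  -", ref.get("article-title") or ref.get("unstructured") or ref.get("series-title"))
```

Note that reference entries mix structured fields (`article-title`, `DOI`, `volume`) with free-text `unstructured` strings, so any consumer should fall back across several keys as the loop above does.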