default search action

combined dblp search
author search
venue search
publication search

ask others

BibTeX records: Jifeng Dai

Name: dblp XML data dump
Creator: Schloss Dagstuhl - Leibniz Center for Informatics
Published: 1993
License: https://creativecommons.org/publicdomain/zero/1.0/
Keywords: dblp, XML, computer science, scholarly publications, metadata

> Home > Persons > Jifeng Dai

download as .bib file

@inproceedings{DBLP:conf/aaai/ZhangDHQDH26,
  author = {Tianyi Zhang and Haonan Duan and Haoran Hao and Yu Qiao and Jifeng Dai and Zhi Hou},
  editor = {Sven Koenig and Chad Jenkins and Matthew E. Taylor},
  title = {Grounding Actions in Camera Space: Observation-Centric Vision-Language-Action Policy},
  booktitle = {Fortieth {AAAI} Conference on Artificial Intelligence, Thirty-Eighth Conference on
    Innovative Applications of Artificial Intelligence, Sixteenth Symposium on Educational
    Advances in Artificial Intelligence, {AAAI} 2026, Singapore, January 20-27, 2026},
  pages = {18782--18790},
  publisher = {{AAAI} Press},
  year = {2026},
  url = {https://doi.org/10.1609/aaai.v40i22.38947},
  doi = {10.1609/AAAI.V40I22.38947},
  timestamp = {Fri, 27 Mar 2026 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/aaai/ZhangDHQDH26.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {MiroFlow} from being lowercased by sentence-casing styles.
@article{DBLP:journals/corr/abs-2602-22808,
  author       = {Shiqian Su and
                  Sen Xing and
                  Xuan Dong and
                  Muyan Zhong and
                  Bin Wang and
                  Xizhou Zhu and
                  Yuntao Chen and
                  Wenhai Wang and
                  Yue Deng and
                  Pengxiang Zhu and
                  Ziyuan Liu and
                  Tiantong Li and
                  Jiaheng Yu and
                  Zhe Chen and
                  Lidong Bing and
                  Jifeng Dai},
  title        = {{MiroFlow}: Towards High-Performance and Robust Open-Source Agent Framework
                  for General Deep Research Tasks},
  journal      = {CoRR},
  volume       = {abs/2602.22808},
  year         = {2026},
  url          = {https://doi.org/10.48550/arXiv.2602.22808},
  doi          = {10.48550/ARXIV.2602.22808},
  eprinttype   = {arXiv},
  eprint       = {2602.22808},
  timestamp    = {Sun, 29 Mar 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2602-22808.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2605-12622,
  author = {Pengfei Jing and Victor Shea{-}Jay Huang and Hengtong Lu and Jifeng Dai and
    Yan Xie and Benjin Zhu},
  title = {Action Emergence from Streaming Intent},
  journal = {CoRR},
  volume = {abs/2605.12622},
  year = {2026},
  url = {https://doi.org/10.48550/arXiv.2605.12622},
  doi = {10.48550/ARXIV.2605.12622},
  eprinttype = {arXiv},
  eprint = {2605.12622},
  timestamp = {Tue, 09 Jun 2026 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-2605-12622.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {MindVLA-U1}, {VLA} and {VA} from being lowercased by
% sentence-casing styles.
@article{DBLP:journals/corr/abs-2605-12624,
  author       = {Yuzhou Huang and
                  Benjin Zhu and
                  Hengtong Lu and
                  Victor Shea{-}Jay Huang and
                  Haiming Zhang and
                  Wei Chen and
                  Jifeng Dai and
                  Yan Xie and
                  Hongsheng Li},
  title        = {{MindVLA-U1}: {VLA} Beats {VA} with Unified Streaming Architecture for
                  Autonomous Driving},
  journal      = {CoRR},
  volume       = {abs/2605.12624},
  year         = {2026},
  url          = {https://doi.org/10.48550/arXiv.2605.12624},
  doi          = {10.48550/ARXIV.2605.12624},
  eprinttype   = {arXiv},
  eprint       = {2605.12624},
  timestamp    = {Tue, 09 Jun 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2605-12624.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2605-12625,
  author = {Hengtong Lu and Victor Shea{-}Jay Huang and Chengmin Yang and Pengfei Jing and
    Jifeng Dai and Yan Xie and Benjin Zhu},
  title = {Driving Intents Amplify Planning-Oriented Reinforcement Learning},
  journal = {CoRR},
  volume = {abs/2605.12625},
  year = {2026},
  url = {https://doi.org/10.48550/arXiv.2605.12625},
  doi = {10.48550/ARXIV.2605.12625},
  eprinttype = {arXiv},
  eprint = {2605.12625},
  timestamp = {Tue, 09 Jun 2026 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-2605-12625.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/csur/SunZXLCQXDLGWWCYRFHYLLL25,
  author = {Jiankai Sun and Chuanyang Zheng and Enze Xie and Zhengying Liu and Ruihang Chu and
    Jianing Qiu and Jiaqi Xu and Mingyu Ding and Hongyang Li and Mengzhe Geng and Yue Wu and
    Wenhai Wang and Junsong Chen and Zhangyue Yin and Xiaozhe Ren and Jie Fu and Junxian He and
    Wu Yuan and Qi Liu and Xihui Liu and Yu Li and Hao Dong and Yu Cheng and Ming Zhang and
    Pheng{-}Ann Heng and Jifeng Dai and Ping Luo and Jingdong Wang and Ji{-}Rong Wen and
    Xipeng Qiu and Yike Guo and Hui Xiong and Qun Liu and Zhenguo Li},
  title = {A Survey of Reasoning with Foundation Models: Concepts, Methodologies, and Outlook},
  journal = {{ACM} Comput. Surv.},
  volume = {57},
  number = {11},
  pages = {278:1--278:43},
  year = {2025},
  url = {https://doi.org/10.1145/3729218},
  doi = {10.1145/3729218},
  timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/csur/SunZXLCQXDLGWWCYRFHYLLL25.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {BEVFormer} and {LiDAR} from being lowercased by
% sentence-casing styles.
@article{DBLP:journals/pami/LiWLXSLQD25,
  author       = {Zhiqi Li and
                  Wenhai Wang and
                  Hongyang Li and
                  Enze Xie and
                  Chonghao Sima and
                  Tong Lu and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{BEVFormer}: Learning Bird's-Eye-View Representation From {LiDAR}-Camera
                  via Spatiotemporal Transformers},
  journal      = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume       = {47},
  number       = {3},
  pages        = {2020--2036},
  year         = {2025},
  url          = {https://doi.org/10.1109/TPAMI.2024.3515454},
  doi          = {10.1109/TPAMI.2024.3515454},
  timestamp    = {Wed, 19 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/pami/LiWLXSLQD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% Journal article, IEEE TPAMI 47(4), 2025. The seventh author is spelled Xizhou Zhu,
% matching the spelling used for this co-author in the other records of this file.
@article{DBLP:journals/pami/HuSWWXWZLZWQD25,
  author       = {Xiaowei Hu and
                  Min Shi and
                  Weiyun Wang and
                  Sitong Wu and
                  Linjie Xing and
                  Wenhai Wang and
                  Xizhou Zhu and
                  Lewei Lu and
                  Jie Zhou and
                  Xiaogang Wang and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {Demystify Transformers {\&} Convolutions in Modern Image Deep
                  Networks},
  journal      = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume       = {47},
  number       = {4},
  pages        = {2416--2428},
  year         = {2025},
  url          = {https://doi.org/10.1109/TPAMI.2024.3520508},
  doi          = {10.1109/TPAMI.2024.3520508},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/pami/HuSWWXWZLZWQD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/pami/ChenFGZD25,
  author = {Linwei Chen and Ying Fu and Lin Gu and Dezhi Zheng and Jifeng Dai},
  title = {Spatial Frequency Modulation for Semantic Segmentation},
  journal = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume = {47},
  number = {11},
  pages = {9767--9784},
  year = {2025},
  url = {https://doi.org/10.1109/TPAMI.2025.3592621},
  doi = {10.1109/TPAMI.2025.3592621},
  timestamp = {Wed, 15 Oct 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/pami/ChenFGZD25.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% IEEE TPAMI record. DBLP:journals/corr/abs-2501-07783 carries the same title and author
% list as a CoRR record; prefer citing only one of the two.
@article{DBLP:journals/pami/WangZYLLTDGLQD25,
  author = {Zhaokai Wang and Xizhou Zhu and Xue Yang and Gen Luo and Hao Li and
    Changyao Tian and Wenhan Dou and Junqi Ge and Lewei Lu and Yu Qiao and Jifeng Dai},
  title = {Parameter-Inverted Image Pyramid Networks for Visual Perception and Multimodal
    Understanding},
  journal = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume = {47},
  number = {11},
  pages = {10142--10159},
  year = {2025},
  url = {https://doi.org/10.1109/TPAMI.2025.3593283},
  doi = {10.1109/TPAMI.2025.3593283},
  timestamp = {Wed, 15 Oct 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/pami/WangZYLLTDGLQD25.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {DriveMLM} from being lowercased by sentence-casing styles.
@article{DBLP:journals/visintelligence/CuiWLXZDLLZD25,
  author       = {Erfei Cui and
                  Wenhai Wang and
                  Zhiqi Li and
                  Jiangwei Xie and
                  Haoming Zou and
                  Hanming Deng and
                  Gen Luo and
                  Lewei Lu and
                  Xizhou Zhu and
                  Jifeng Dai},
  title        = {{DriveMLM}: aligning multi-modal large language models with behavioral
                  planning states for autonomous driving},
  journal      = {Vis. Intell.},
  volume       = {3},
  number       = {1},
  year         = {2025},
  url          = {https://doi.org/10.1007/s44267-025-00095-w},
  doi          = {10.1007/S44267-025-00095-W},
  timestamp    = {Sat, 21 Mar 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/visintelligence/CuiWLXZDLLZD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/DuanCHWYSLHLLDW25,
  author = {Yuchen Duan and Zhe Chen and Yusong Hu and Weiyun Wang and Shenglong Ye and
    Botian Shi and Lewei Lu and Qibin Hou and Tong Lu and Hongsheng Li and Jifeng Dai and
    Wenhai Wang},
  title = {Docopilot: Improving Multimodal Models for Document-Level Understanding},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2025,
    Nashville, TN, USA, June 11-15, 2025},
  pages = {4026--4037},
  publisher = {Computer Vision Foundation / {IEEE}},
  year = {2025},
  url = {https://openaccess.thecvf.com/content/CVPR2025/html/Duan\_Docopilot\_Improving\_Multimodal\_Models\_for\_Document-Level\_Understanding\_CVPR\_2025\_paper.html},
  doi = {10.1109/CVPR52734.2025.00381},
  timestamp = {Sun, 04 Jan 2026 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/cvpr/DuanCHWYSLHLLDW25.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% CVPR 2025 record. DBLP:journals/corr/abs-2503-01463 carries the same title and author
% list as a CoRR record; prefer citing only one of the two.
@inproceedings{DBLP:conf/cvpr/NanLD025,
  author = {Zhixiong Nan and Xianghong Li and Jifeng Dai and Tao Xiang},
  title = {{MI-DETR:} An Object Detection Model with Multi-time Inquiries Mechanism},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2025,
    Nashville, TN, USA, June 11-15, 2025},
  pages = {4703--4712},
  publisher = {Computer Vision Foundation / {IEEE}},
  year = {2025},
  url = {https://openaccess.thecvf.com/content/CVPR2025/html/Nan\_MI-DETR\_An\_Object\_Detection\_Model\_with\_Multi-time\_Inquiries\_Mechanism\_CVPR\_2025\_paper.html},
  doi = {10.1109/CVPR52734.2025.00443},
  timestamp = {Sat, 06 Sep 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/cvpr/NanLD025.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {HoVLE} from being lowercased by sentence-casing styles.
@inproceedings{DBLP:conf/cvpr/TaoSZZ0LWL00D25,
  author       = {Chenxin Tao and
                  Shiqian Su and
                  Xizhou Zhu and
                  Chenyu Zhang and
                  Zhe Chen and
                  Jiawen Liu and
                  Wenhai Wang and
                  Lewei Lu and
                  Gao Huang and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{HoVLE}: Unleashing the Power of Monolithic Vision-Language Models with
                  Holistic Vision-Language Embedding},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2025, Nashville, TN, USA, June 11-15, 2025},
  pages        = {14559--14569},
  publisher    = {Computer Vision Foundation / {IEEE}},
  year         = {2025},
  url          = {https://openaccess.thecvf.com/content/CVPR2025/html/Tao\_HoVLE\_Unleashing\_the\_Power\_of\_Monolithic\_Vision-Language\_Models\_with\_Holistic\_CVPR\_2025\_paper.html},
  doi          = {10.1109/CVPR52734.2025.01357},
  timestamp    = {Wed, 20 Aug 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/TaoSZZ0LWL00D25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/YangDZ0W00WLD25,
  author = {Chenyu Yang and Xuan Dong and Xizhou Zhu and Weijie Su and Jiahao Wang and
    Hao Tian and Zhe Chen and Wenhai Wang and Lewei Lu and Jifeng Dai},
  title = {{PVC:} Progressive Visual Token Compression for Unified Image and Video Processing
    in Large Vision-Language Models},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2025,
    Nashville, TN, USA, June 11-15, 2025},
  pages = {24939--24949},
  publisher = {Computer Vision Foundation / {IEEE}},
  year = {2025},
  url = {https://openaccess.thecvf.com/content/CVPR2025/html/Yang\_PVC\_Progressive\_Visual\_Token\_Compression\_for\_Unified\_Image\_and\_Video\_CVPR\_2025\_paper.html},
  doi = {10.1109/CVPR52734.2025.02322},
  timestamp = {Wed, 20 Aug 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/cvpr/YangDZ0W00WLD25.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {Mono-InternVL} from being lowercased by sentence-casing styles.
@inproceedings{DBLP:conf/cvpr/LuoYDWLD0Z25,
  author       = {Gen Luo and
                  Xue Yang and
                  Wenhan Dou and
                  Zhaokai Wang and
                  Jiawen Liu and
                  Jifeng Dai and
                  Yu Qiao and
                  Xizhou Zhu},
  title        = {{Mono-InternVL}: Pushing the Boundaries of Monolithic Multimodal Large
                  Language Models with Endogenous Visual Pre-training},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2025, Nashville, TN, USA, June 11-15, 2025},
  pages        = {24960--24971},
  publisher    = {Computer Vision Foundation / {IEEE}},
  year         = {2025},
  url          = {https://openaccess.thecvf.com/content/CVPR2025/html/Luo\_Mono-InternVL\_Pushing\_the\_Boundaries\_of\_Monolithic\_Multimodal\_Large\_Language\_Models\_CVPR\_2025\_paper.html},
  doi          = {10.1109/CVPR52734.2025.02324},
  timestamp    = {Wed, 20 Aug 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/LuoYDWLD0Z25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {SynerGen-VL} from being lowercased by sentence-casing styles.
@inproceedings{DBLP:conf/cvpr/LiTSZWZDWLLD25,
  author       = {Hao Li and
                  Changyao Tian and
                  Jie Shao and
                  Xizhou Zhu and
                  Zhaokai Wang and
                  Jinguo Zhu and
                  Wenhan Dou and
                  Xiao{-}Gang Wang and
                  Hongsheng Li and
                  Lewei Lu and
                  Jifeng Dai},
  title        = {{SynerGen-VL}: Towards Synergistic Image Understanding and Generation
                  with Vision Experts and Token Folding},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2025, Nashville, TN, USA, June 11-15, 2025},
  pages        = {29767--29779},
  publisher    = {Computer Vision Foundation / {IEEE}},
  year         = {2025},
  url          = {https://openaccess.thecvf.com/content/CVPR2025/html/Li\_SynerGen-VL\_Towards\_Synergistic\_Image\_Understanding\_and\_Generation\_with\_Vision\_Experts\_CVPR\_2025\_paper.html},
  doi          = {10.1109/CVPR52734.2025.02771},
  timestamp    = {Wed, 20 Aug 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/LiTSZWZDWLLD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% ICCV 2025 record. DBLP:journals/corr/abs-2503-19757 carries the same title and author
% list as a CoRR record; prefer citing only one of the two.
@inproceedings{DBLP:conf/iccv/HouZXDPTZZQDC25,
  author = {Zhi Hou and Tianyi Zhang and Yuwen Xiong and Haonan Duan and Hengjun Pu and
    Ronglei Tong and Chengyang Zhao and Xizhou Zhu and Yu Qiao and Jifeng Dai and
    Yuntao Chen},
  title = {Dita: Scaling Diffusion Transformer for Generalist Vision-Language-Action Policy},
  booktitle = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2025, Honolulu,
    HI, USA, October 19-25, 2025},
  pages = {7686--7697},
  publisher = {{IEEE}},
  year = {2025},
  url = {https://doi.org/10.1109/ICCV51701.2025.00721},
  doi = {10.1109/ICCV51701.2025.00721},
  timestamp = {Wed, 13 May 2026 10:42:14 +0200},
  biburl = {https://dblp.org/rec/conf/iccv/HouZXDPTZZQDC25.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/iccv/FangDWLHTZZDLL25,
  author = {Rongyao Fang and Chengqi Duan and Kun Wang and Hao Li and Linjiang Huang and
    Hao Tian and Xingyu Zeng and Rui Zhao and Jifeng Dai and Hongsheng Li and Xihui Liu},
  title = {{PUMA:} Empowering Unified {MLLM} with Multi-Granular Visual Generation},
  booktitle = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2025, Honolulu,
    HI, USA, October 19-25, 2025},
  pages = {15447--15457},
  publisher = {{IEEE}},
  year = {2025},
  url = {https://doi.org/10.1109/ICCV51701.2025.01433},
  doi = {10.1109/ICCV51701.2025.01433},
  timestamp = {Wed, 13 May 2026 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/iccv/FangDWLHTZZDLL25.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/iccv/GeCLZLDZ25,
  author = {Junqi Ge and Ziyi Chen and Jintao Lin and Jinguo Zhu and Xihui Liu and
    Jifeng Dai and Xizhou Zhu},
  title = {{V2PE:} Improving Multimodal Long-Context Capability of Vision-Language Models with
    Variable Visual Position Encoding},
  booktitle = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2025, Honolulu,
    HI, USA, October 19-25, 2025},
  pages = {21070--21084},
  publisher = {{IEEE}},
  year = {2025},
  url = {https://doi.org/10.1109/ICCV51701.2025.01958},
  doi = {10.1109/ICCV51701.2025.01958},
  timestamp = {Wed, 13 May 2026 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/iccv/GeCLZLDZ25.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {LangBridge} from being lowercased by sentence-casing styles.
@inproceedings{DBLP:conf/iccv/LiaoNMLTDXLZYDC25,
  author       = {Jiaqi Liao and
                  Yuwei Niu and
                  Fanqing Meng and
                  Hao Li and
                  Changyao Tian and
                  Yinuo Du and
                  Yuwen Xiong and
                  Dianqi Li and
                  Xizhou Zhu and
                  Li Yuan and
                  Jifeng Dai and
                  Yu Cheng},
  title        = {{LangBridge}: Interpreting Image as a Combination of Language Embeddings},
  booktitle    = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2025,
                  Honolulu, HI, USA, October 19-25, 2025},
  pages        = {23752--23762},
  publisher    = {{IEEE}},
  year         = {2025},
  url          = {https://doi.org/10.1109/ICCV51701.2025.02205},
  doi          = {10.1109/ICCV51701.2025.02205},
  timestamp    = {Wed, 13 May 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/LiaoNMLTDXLZYDC25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {Vision-RWKV} and {RWKV} from being lowercased by
% sentence-casing styles.
@inproceedings{DBLP:conf/iclr/DuanWCZLLQ0DW25,
  author       = {Yuchen Duan and
                  Weiyun Wang and
                  Zhe Chen and
                  Xizhou Zhu and
                  Lewei Lu and
                  Tong Lu and
                  Yu Qiao and
                  Hongsheng Li and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {{Vision-RWKV}: Efficient and Scalable Visual Perception with {RWKV}-Like
                  Architectures},
  booktitle    = {The Thirteenth International Conference on Learning Representations,
                  {ICLR} 2025, Singapore, April 24-28, 2025},
  publisher    = {OpenReview.net},
  year         = {2025},
  url          = {https://openreview.net/forum?id=nGiGXLnKhl},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/iclr/DuanWCZLLQ0DW25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/iclr/MengW0L0YLZD00Z25,
  author = {Fanqing Meng and Jin Wang and Chuanhao Li and Quanfeng Lu and Hao Tian and
    Tianshuo Yang and Jiaqi Liao and Xizhou Zhu and Jifeng Dai and Yu Qiao and Ping Luo and
    Kaipeng Zhang and Wenqi Shao},
  title = {{MMIU:} Multimodal Multi-image Understanding for Evaluating Large Vision-Language
    Models},
  booktitle = {The Thirteenth International Conference on Learning Representations, {ICLR}
    2025, Singapore, April 24-28, 2025},
  publisher = {OpenReview.net},
  year = {2025},
  url = {https://openreview.net/forum?id=WsgEWL8i0K},
  timestamp = {Thu, 10 Jul 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/iclr/MengW0L0YLZD00Z25.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/iclr/SiW0XLDQ0025,
  author = {Chongjie Si and Xuehui Wang and Xue Yang and Zhengqin Xu and Qingyun Li and
    Jifeng Dai and Yu Qiao and Xiaokang Yang and Wei Shen},
  title = {Maintaining Structural Integrity in Parameter Spaces for Parameter Efficient
    Fine-tuning},
  booktitle = {The Thirteenth International Conference on Learning Representations, {ICLR}
    2025, Singapore, April 24-28, 2025},
  publisher = {OpenReview.net},
  year = {2025},
  url = {https://openreview.net/forum?id=OALIb8oNfl},
  timestamp = {Fri, 16 May 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/iclr/SiW0XLDQ0025.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {CoMemo} and {LVLMs} from being lowercased by sentence-casing
% styles. The volume is the PMLR volume number, taken from the v267 segment of the url.
@inproceedings{DBLP:conf/icml/Liu0ZWD25,
  author       = {Shi Liu and
                  Weijie Su and
                  Xizhou Zhu and
                  Wenhai Wang and
                  Jifeng Dai},
  editor       = {Aarti Singh and
                  Maryam Fazel and
                  Daniel Hsu and
                  Simon Lacoste{-}Julien and
                  Felix Berkenkamp and
                  Tegan Maharaj and
                  Kiri Wagstaff and
                  Jerry Zhu},
  title        = {{CoMemo}: {LVLMs} Need Image Context with Image Memory},
  booktitle    = {Forty-second International Conference on Machine Learning, {ICML}
                  2025, Vancouver, BC, Canada, July 13-19, 2025},
  series       = {Proceedings of Machine Learning Research},
  volume       = {267},
  publisher    = {{PMLR} / OpenReview.net},
  year         = {2025},
  url          = {https://proceedings.mlr.press/v267/liu25bn.html},
  timestamp    = {Wed, 04 Feb 2026 16:54:16 +0100},
  biburl       = {https://dblp.org/rec/conf/icml/Liu0ZWD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {MuLan} from being lowercased by sentence-casing styles.
% The volume is the PMLR volume number, taken from the v267 segment of the url.
@inproceedings{DBLP:conf/icml/XingZLLLWDW25,
  author       = {Sen Xing and
                  Muyan Zhong and
                  Zeqiang Lai and
                  Liangchen Li and
                  Jiawen Liu and
                  Yaohui Wang and
                  Jifeng Dai and
                  Wenhai Wang},
  editor       = {Aarti Singh and
                  Maryam Fazel and
                  Daniel Hsu and
                  Simon Lacoste{-}Julien and
                  Felix Berkenkamp and
                  Tegan Maharaj and
                  Kiri Wagstaff and
                  Jerry Zhu},
  title        = {{MuLan}: Adapting Multilingual Diffusion Models for Hundreds of Languages
                  with Negligible Cost},
  booktitle    = {Forty-second International Conference on Machine Learning, {ICML}
                  2025, Vancouver, BC, Canada, July 13-19, 2025},
  series       = {Proceedings of Machine Learning Research},
  volume       = {267},
  publisher    = {{PMLR} / OpenReview.net},
  year         = {2025},
  url          = {https://proceedings.mlr.press/v267/xing25d.html},
  timestamp    = {Wed, 04 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icml/XingZLLLWDW25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CoRR record. DBLP:journals/pami/WangZYLLTDGLQD25 carries the same title and author
% list as an IEEE TPAMI record; prefer citing only one of the two.
@article{DBLP:journals/corr/abs-2501-07783,
  author = {Zhaokai Wang and Xizhou Zhu and Xue Yang and Gen Luo and Hao Li and
    Changyao Tian and Wenhan Dou and Junqi Ge and Lewei Lu and Yu Qiao and Jifeng Dai},
  title = {Parameter-Inverted Image Pyramid Networks for Visual Perception and Multimodal
    Understanding},
  journal = {CoRR},
  volume = {abs/2501.07783},
  year = {2025},
  url = {https://doi.org/10.48550/arXiv.2501.07783},
  doi = {10.48550/ARXIV.2501.07783},
  eprinttype = {arXiv},
  eprint = {2501.07783},
  timestamp = {Mon, 24 Feb 2025 00:00:00 +0100},
  biburl = {https://dblp.org/rec/journals/corr/abs-2501-07783.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% CoRR record. DBLP:conf/cvpr/NanLD025 carries the same title and author list as a
% CVPR 2025 record; prefer citing only one of the two.
@article{DBLP:journals/corr/abs-2503-01463,
  author = {Zhixiong Nan and Xianghong Li and Jifeng Dai and Tao Xiang},
  title = {{MI-DETR:} An Object Detection Model with Multi-time Inquiries Mechanism},
  journal = {CoRR},
  volume = {abs/2503.01463},
  year = {2025},
  url = {https://doi.org/10.48550/arXiv.2503.01463},
  doi = {10.48550/ARXIV.2503.01463},
  eprinttype = {arXiv},
  eprint = {2503.01463},
  timestamp = {Tue, 08 Apr 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-2503-01463.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {VisualPRM} from being lowercased by sentence-casing styles.
@article{DBLP:journals/corr/abs-2503-10291,
  author       = {Weiyun Wang and
                  Zhangwei Gao and
                  Lianjie Chen and
                  Zhe Chen and
                  Jinguo Zhu and
                  Xiangyu Zhao and
                  Yangzhou Liu and
                  Yue Cao and
                  Shenglong Ye and
                  Xizhou Zhu and
                  Lewei Lu and
                  Haodong Duan and
                  Yu Qiao and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {{VisualPRM}: An Effective Process Reward Model for Multimodal Reasoning},
  journal      = {CoRR},
  volume       = {abs/2503.10291},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2503.10291},
  doi          = {10.48550/ARXIV.2503.10291},
  eprinttype   = {arXiv},
  eprint       = {2503.10291},
  timestamp    = {Sun, 13 Apr 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2503-10291.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% Braces in the title keep {GoT} from being lowercased by sentence-casing styles.
@article{DBLP:journals/corr/abs-2503-10639,
  author       = {Rongyao Fang and
                  Chengqi Duan and
                  Kun Wang and
                  Linjiang Huang and
                  Hao Li and
                  Shilin Yan and
                  Hao Tian and
                  Xingyu Zeng and
                  Rui Zhao and
                  Jifeng Dai and
                  Xihui Liu and
                  Hongsheng Li},
  title        = {{GoT}: Unleashing Reasoning Capability of Multimodal Large Language
                  Model for Visual Generation and Editing},
  journal      = {CoRR},
  volume       = {abs/2503.10639},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2503.10639},
  doi          = {10.48550/ARXIV.2503.10639},
  eprinttype   = {arXiv},
  eprint       = {2503.10639},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2503-10639.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2503-19404,
  author       = {Jiaqi Liao and
                  Yuwei Niu and
                  Fanqing Meng and
                  Hao Li and
                  Changyao Tian and
                  Yinuo Du and
                  Yuwen Xiong and
                  Dianqi Li and
                  Xizhou Zhu and
                  Li Yuan and
                  Jifeng Dai and
                  Yu Cheng},
  title        = {{LangBridge}: Interpreting Image as a Combination of Language Embeddings},
  journal      = {CoRR},
  volume       = {abs/2503.19404},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2503.19404},
  doi          = {10.48550/ARXIV.2503.19404},
  eprinttype   = {arXiv},
  eprint       = {2503.19404},
  timestamp    = {Wed, 23 Apr 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2503-19404.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2503-19757,
  author     = {Hou, Zhi and Zhang, Tianyi and Xiong, Yuwen and Duan, Haonan and
                Pu, Hengjun and Tong, Ronglei and Zhao, Chengyang and Zhu, Xizhou and
                Qiao, Yu and Dai, Jifeng and Chen, Yuntao},
  title      = {Dita: Scaling Diffusion Transformer for Generalist
                Vision-Language-Action Policy},
  journal    = {CoRR},
  volume     = {abs/2503.19757},
  year       = {2025},
  eprinttype = {arXiv},
  eprint     = {2503.19757},
  doi        = {10.48550/ARXIV.2503.19757},
  url        = {https://doi.org/10.48550/arXiv.2503.19757},
  timestamp  = {Sat, 31 May 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2503-19757.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2504-10479,
  author       = {Jinguo Zhu and
                  Weiyun Wang and
                  Zhe Chen and
                  Zhaoyang Liu and
                  Shenglong Ye and
                  Lixin Gu and
                  Hao Tian and
                  Yuchen Duan and
                  Weijie Su and
                  Jie Shao and
                  Zhangwei Gao and
                  Erfei Cui and
                  Xuehui Wang and
                  Yue Cao and
                  Yangzhou Liu and
                  Xingguang Wei and
                  Hongjie Zhang and
                  Haomin Wang and
                  Weiye Xu and
                  Hao Li and
                  Jiahao Wang and
                  Nianchen Deng and
                  Songze Li and
                  Yinan He and
                  Tan Jiang and
                  Jiapeng Luo and
                  Yi Wang and
                  Conghui He and
                  Botian Shi and
                  Xingcheng Zhang and
                  Wenqi Shao and
                  Junjun He and
                  Yingtong Xiong and
                  Wenwen Qu and
                  Peng Sun and
                  Penglong Jiao and
                  Han Lv and
                  Lijun Wu and
                  Kaipeng Zhang and
                  Huipeng Deng and
                  Jiaye Ge and
                  Kai Chen and
                  Limin Wang and
                  Min Dou and
                  Lewei Lu and
                  Xizhou Zhu and
                  Tong Lu and
                  Dahua Lin and
                  Yu Qiao and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {{InternVL3}: Exploring Advanced Training and Test-Time Recipes for Open-Source
                  Multimodal Models},
  journal      = {CoRR},
  volume       = {abs/2504.10479},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2504.10479},
  doi          = {10.48550/ARXIV.2504.10479},
  eprinttype   = {arXiv},
  eprint       = {2504.10479},
  timestamp    = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2504-10479.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2504-15279,
  author       = {Weiye Xu and
                  Jiahao Wang and
                  Weiyun Wang and
                  Zhe Chen and
                  Wengang Zhou and
                  Aijun Yang and
                  Lewei Lu and
                  Houqiang Li and
                  Xiaohua Wang and
                  Xizhou Zhu and
                  Wenhai Wang and
                  Jifeng Dai and
                  Jinguo Zhu},
  title        = {{VisuLogic}: {A} Benchmark for Evaluating Visual Reasoning in Multi-modal
                  Large Language Models},
  journal      = {CoRR},
  volume       = {abs/2504.15279},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2504.15279},
  doi          = {10.48550/ARXIV.2504.15279},
  eprinttype   = {arXiv},
  eprint       = {2504.15279},
  timestamp    = {Sun, 25 May 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2504-15279.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2505-04623,
  author       = {Zhenghao Xing and
                  Xiaowei Hu and
                  Chi{-}Wing Fu and
                  Wenhai Wang and
                  Jifeng Dai and
                  Pheng{-}Ann Heng},
  title        = {{EchoInk-R1}: Exploring Audio-Visual Reasoning in Multimodal {LLMs} via
                  Reinforcement Learning},
  journal      = {CoRR},
  volume       = {abs/2505.04623},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2505.04623},
  doi          = {10.48550/ARXIV.2505.04623},
  eprinttype   = {arXiv},
  eprint       = {2505.04623},
  timestamp    = {Sun, 29 Jun 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2505-04623.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2505-17011,
  author       = {Yan Li and
                  Changyao Tian and
                  Renqiu Xia and
                  Ning Liao and
                  Weiwei Guo and
                  Junchi Yan and
                  Hongsheng Li and
                  Jifeng Dai and
                  Hao Li and
                  Xue Yang},
  title        = {Learning Adaptive and Temporally Causal Video Tokenization in a {1D}
                  Latent Space},
  journal      = {CoRR},
  volume       = {abs/2505.17011},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2505.17011},
  doi          = {10.48550/ARXIV.2505.17011},
  eprinttype   = {arXiv},
  eprint       = {2505.17011},
  timestamp    = {Sun, 29 Jun 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2505-17011.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2505-23395,
  author     = {Wei, Xingguang and Wang, Haomin and Ye, Shenglong and Luo, Ruifeng and
                Zhang, Yanting and Gu, Lixin and Dai, Jifeng and Qiao, Yu and
                Wang, Wenhai and Zhang, Hongjie},
  title      = {Point or Line? Using Line-based Representation for
                Panoptic Symbol Spotting in {CAD} Drawings},
  journal    = {CoRR},
  volume     = {abs/2505.23395},
  year       = {2025},
  eprinttype = {arXiv},
  eprint     = {2505.23395},
  doi        = {10.48550/ARXIV.2505.23395},
  url        = {https://doi.org/10.48550/arXiv.2505.23395},
  timestamp  = {Fri, 04 Jul 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2505-23395.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2505-23762,
  author       = {Chenyu Yang and
                  Shiqian Su and
                  Shi Liu and
                  Xuan Dong and
                  Yue Yu and
                  Weijie Su and
                  Xuehui Wang and
                  Zhaoyang Liu and
                  Jinguo Zhu and
                  Hao Li and
                  Wenhai Wang and
                  Yu Qiao and
                  Xizhou Zhu and
                  Jifeng Dai},
  title        = {{ZeroGUI}: Automating Online {GUI} Learning at Zero Human Cost},
  journal      = {CoRR},
  volume       = {abs/2505.23762},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2505.23762},
  doi          = {10.48550/ARXIV.2505.23762},
  eprinttype   = {arXiv},
  eprint       = {2505.23762},
  timestamp    = {Wed, 17 Dec 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2505-23762.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2506-00123,
  author     = {Luo, Gen and Yang, Ganlin and Gong, Ziyang and Chen, Guanzhou and
                Duan, Haonan and Cui, Erfei and Tong, Ronglei and Hou, Zhi and
                Zhang, Tianyi and Chen, Zhe and Ye, Shenglong and Lu, Lewei and
                Wang, Jingbo and Wang, Wenhai and Dai, Jifeng and Qiao, Yu and
                Ji, Rongrong and Zhu, Xizhou},
  title      = {Visual Embodied Brain: Let Multimodal Large Language Models
                See, Think, and Control in Spaces},
  journal    = {CoRR},
  volume     = {abs/2506.00123},
  year       = {2025},
  eprinttype = {arXiv},
  eprint     = {2506.00123},
  doi        = {10.48550/ARXIV.2506.00123},
  url        = {https://doi.org/10.48550/arXiv.2506.00123},
  timestamp  = {Sun, 07 Dec 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2506-00123.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2506-04217,
  author       = {Junting Chen and
                  Haotian Liang and
                  Lingxiao Du and
                  Weiyun Wang and
                  Mengkang Hu and
                  Yao Mu and
                  Wenhai Wang and
                  Jifeng Dai and
                  Ping Luo and
                  Wenqi Shao and
                  Lin Shao},
  title        = {{OWMM-Agent}: Open World Mobile Manipulation With Multi-modal Agentic
                  Data Synthesis},
  journal      = {CoRR},
  volume       = {abs/2506.04217},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2506.04217},
  doi          = {10.48550/ARXIV.2506.04217},
  eprinttype   = {arXiv},
  eprint       = {2506.04217},
  timestamp    = {Sun, 06 Jul 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2506-04217.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2506-06279,
  author       = {Shi Liu and
                  Weijie Su and
                  Xizhou Zhu and
                  Wenhai Wang and
                  Jifeng Dai},
  title        = {{CoMemo}: {LVLMs} Need Image Context with Image Memory},
  journal      = {CoRR},
  volume       = {abs/2506.06279},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2506.06279},
  doi          = {10.48550/ARXIV.2506.06279},
  eprinttype   = {arXiv},
  eprint       = {2506.06279},
  timestamp    = {Mon, 07 Jul 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2506-06279.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2506-18385,
  author       = {Nianchen Deng and
                  Lixin Gu and
                  Shenglong Ye and
                  Yinan He and
                  Zhe Chen and
                  Songze Li and
                  Haomin Wang and
                  Xingguang Wei and
                  Tianshuo Yang and
                  Min Dou and
                  Tong He and
                  Wenqi Shao and
                  Kaipeng Zhang and
                  Yi Wang and
                  Botian Shi and
                  Yanting Zhang and
                  Jifeng Dai and
                  Yu Qiao and
                  Hongjie Zhang and
                  Wenhai Wang},
  title        = {{InternSpatial}: {A} Comprehensive Dataset for Spatial Reasoning in
                  Vision-Language Models},
  journal      = {CoRR},
  volume       = {abs/2506.18385},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2506.18385},
  doi          = {10.48550/ARXIV.2506.18385},
  eprinttype   = {arXiv},
  eprint       = {2506.18385},
  timestamp    = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2506-18385.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2507-11893,
  author     = {Chen, Linwei and Fu, Ying and Gu, Lin and Zheng, Dezhi and Dai, Jifeng},
  title      = {Spatial Frequency Modulation for Semantic Segmentation},
  journal    = {CoRR},
  volume     = {abs/2507.11893},
  year       = {2025},
  eprinttype = {arXiv},
  eprint     = {2507.11893},
  doi        = {10.48550/ARXIV.2507.11893},
  url        = {https://doi.org/10.48550/arXiv.2507.11893},
  timestamp  = {Wed, 20 May 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2507-11893.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2507-12566,
  author       = {Gen Luo and
                  Wenhan Dou and
                  Wenhao Li and
                  Zhaokai Wang and
                  Xue Yang and
                  Changyao Tian and
                  Hao Li and
                  Weiyun Wang and
                  Wenhai Wang and
                  Xizhou Zhu and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{Mono-InternVL-1.5}: Towards Cheaper and Faster Monolithic Multimodal
                  Large Language Models},
  journal      = {CoRR},
  volume       = {abs/2507.12566},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2507.12566},
  doi          = {10.48550/ARXIV.2507.12566},
  eprinttype   = {arXiv},
  eprint       = {2507.12566},
  timestamp    = {Mon, 18 Aug 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2507-12566.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2507-12841,
  author       = {Yiming Ren and
                  Zhiqiang Lin and
                  Yu Li and
                  Gao Meng and
                  Weiyun Wang and
                  Junjie Wang and
                  Zicheng Lin and
                  Jifeng Dai and
                  Yujiu Yang and
                  Wenhai Wang and
                  Ruihang Chu},
  title        = {{AnyCap} Project: {A} Unified Framework, Dataset, and Benchmark for
                  Controllable Omni-modal Captioning},
  journal      = {CoRR},
  volume       = {abs/2507.12841},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2507.12841},
  doi          = {10.48550/ARXIV.2507.12841},
  eprinttype   = {arXiv},
  eprint       = {2507.12841},
  timestamp    = {Thu, 23 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2507-12841.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2507-14675,
  author     = {Duan, Yuchen and Chen, Zhe and Hu, Yusong and Wang, Weiyun and
                Ye, Shenglong and Shi, Botian and Lu, Lewei and Hou, Qibin and
                Lu, Tong and Li, Hongsheng and Dai, Jifeng and Wang, Wenhai},
  title      = {Docopilot: Improving Multimodal Models for Document-Level Understanding},
  journal    = {CoRR},
  volume     = {abs/2507.14675},
  year       = {2025},
  eprinttype = {arXiv},
  eprint     = {2507.14675},
  doi        = {10.48550/ARXIV.2507.14675},
  url        = {https://doi.org/10.48550/arXiv.2507.14675},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2507-14675.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2507-19478,
  author       = {Xuehui Wang and
                  Zhenyu Wu and
                  JingJing Xie and
                  Zichen Ding and
                  Bowen Yang and
                  Zehao Li and
                  Zhaoyang Liu and
                  Qingyun Li and
                  Xuan Dong and
                  Zhe Chen and
                  Weiyun Wang and
                  Xiangyu Zhao and
                  Jixuan Chen and
                  Haodong Duan and
                  Tianbao Xie and
                  Chenyu Yang and
                  Shiqian Su and
                  Yue Yu and
                  Yuan Huang and
                  Yiqian Liu and
                  Xiao Zhang and
                  Yanting Zhang and
                  Xiangyu Yue and
                  Weijie Su and
                  Xizhou Zhu and
                  Wei Shen and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {{MMBench-GUI}: Hierarchical Multi-Platform Evaluation Framework for
                  {GUI} Agents},
  journal      = {CoRR},
  volume       = {abs/2507.19478},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2507.19478},
  doi          = {10.48550/ARXIV.2507.19478},
  eprinttype   = {arXiv},
  eprint       = {2507.19478},
  timestamp    = {Wed, 17 Dec 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2507-19478.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2508-13103,
  author     = {Zhang, Tianyi and Duan, Haonan and Hao, Haoran and
                Qiao, Yu and Dai, Jifeng and Hou, Zhi},
  title      = {Grounding Actions in Camera Space: Observation-Centric
                Vision-Language-Action Policy},
  journal    = {CoRR},
  volume     = {abs/2508.13103},
  year       = {2025},
  eprinttype = {arXiv},
  eprint     = {2508.13103},
  doi        = {10.48550/ARXIV.2508.13103},
  url        = {https://doi.org/10.48550/arXiv.2508.13103},
  timestamp  = {Sun, 07 Dec 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2508-13103.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2508-18265,
  author       = {Weiyun Wang and
                  Zhangwei Gao and
                  Lixin Gu and
                  Hengjun Pu and
                  Long Cui and
                  Xingguang Wei and
                  Zhaoyang Liu and
                  Linglin Jing and
                  Shenglong Ye and
                  Jie Shao and
                  Zhaokai Wang and
                  Zhe Chen and
                  Hongjie Zhang and
                  Ganlin Yang and
                  Haomin Wang and
                  Qi Wei and
                  Jinhui Yin and
                  Wenhao Li and
                  Erfei Cui and
                  Guanzhou Chen and
                  Zichen Ding and
                  Changyao Tian and
                  Zhenyu Wu and
                  JingJing Xie and
                  Zehao Li and
                  Bowen Yang and
                  Yuchen Duan and
                  Xuehui Wang and
                  Zhi Hou and
                  Haoran Hao and
                  Tianyi Zhang and
                  Songze Li and
                  Xiangyu Zhao and
                  Haodong Duan and
                  Nianchen Deng and
                  Bin Fu and
                  Yinan He and
                  Yi Wang and
                  Conghui He and
                  Botian Shi and
                  Junjun He and
                  Yingtong Xiong and
                  Han Lv and
                  Lijun Wu and
                  Wenqi Shao and
                  Kaipeng Zhang and
                  Huipeng Deng and
                  Biqing Qi and
                  Jiaye Ge and
                  Qipeng Guo and
                  Wenwei Zhang and
                  Songyang Zhang and
                  Maosong Cao and
                  Junyao Lin and
                  Kexian Tang and
                  Jianfei Gao and
                  Haian Huang and
                  Yuzhe Gu and
                  Chengqi Lyu and
                  Huanze Tang and
                  Rui Wang and
                  Haijun Lv and
                  Wanli Ouyang and
                  Limin Wang and
                  Min Dou and
                  Xizhou Zhu and
                  Tong Lu and
                  Dahua Lin and
                  Jifeng Dai and
                  Weijie Su and
                  Bowen Zhou and
                  Kai Chen and
                  Yu Qiao and
                  Wenhai Wang and
                  Gen Luo},
  title        = {{InternVL3.5}: Advancing Open-Source Multimodal Models in Versatility,
                  Reasoning, and Efficiency},
  journal      = {CoRR},
  volume       = {abs/2508.18265},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2508.18265},
  doi          = {10.48550/ARXIV.2508.18265},
  eprinttype   = {arXiv},
  eprint       = {2508.18265},
  timestamp    = {Tue, 07 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2508-18265.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2509-14232,
  author       = {Zhaokai Wang and
                  Penghao Yin and
                  Xiangyu Zhao and
                  Changyao Tian and
                  Yu Qiao and
                  Wenhai Wang and
                  Jifeng Dai and
                  Gen Luo},
  title        = {{GenExam}: {A} Multidisciplinary Text-to-Image Exam},
  journal      = {CoRR},
  volume       = {abs/2509.14232},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2509.14232},
  doi          = {10.48550/ARXIV.2509.14232},
  eprinttype   = {arXiv},
  eprint       = {2509.14232},
  timestamp    = {Thu, 16 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2509-14232.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2509-24007,
  author     = {Liu, Yangzhou and Cao, Yue and Li, Hao and Luo, Gen and
                Chen, Zhe and Wang, Weiyun and Liang, Xiaobo and Qi, Biqing and
                Wu, Lijun and Tian, Changyao and Zhang, Yanting and Li, Yuqiang and
                Lu, Tong and Qiao, Yu and Dai, Jifeng and Wang, Wenhai},
  title      = {Sequential Diffusion Language Models},
  journal    = {CoRR},
  volume     = {abs/2509.24007},
  year       = {2025},
  eprinttype = {arXiv},
  eprint     = {2509.24007},
  doi        = {10.48550/ARXIV.2509.24007},
  url        = {https://doi.org/10.48550/arXiv.2509.24007},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2509-24007.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2510-08565,
  author       = {Changyao Tian and
                  Hao Li and
                  Gen Luo and
                  Xizhou Zhu and
                  Weijie Su and
                  Hanming Deng and
                  Jinguo Zhu and
                  Jie Shao and
                  Ziran Zhu and
                  Yunpeng Liu and
                  Lewei Lu and
                  Wenhai Wang and
                  Hongsheng Li and
                  Jifeng Dai},
  title        = {{NaViL}: Rethinking Scaling Properties of Native Multimodal Large Language
                  Models under Data Constraints},
  journal      = {CoRR},
  volume       = {abs/2510.08565},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2510.08565},
  doi          = {10.48550/ARXIV.2510.08565},
  eprinttype   = {arXiv},
  eprint       = {2510.08565},
  timestamp    = {Tue, 11 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2510-08565.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2510-11027,
  author     = {Yang, Ganlin and Zhang, Tianyi and Hao, Haoran and Wang, Weiyun and
                Liu, Yibin and Wang, Dehui and Chen, Guanzhou and Cai, Zijian and
                Chen, Junting and Su, Weijie and Zhou, Wengang and Qiao, Yu and
                Dai, Jifeng and Pang, Jiangmiao and Luo, Gen and Wang, Wenhai and
                Mu, Yao and Hou, Zhi},
  title      = {Vlaser: Vision-Language-Action Model with Synergistic Embodied Reasoning},
  journal    = {CoRR},
  volume     = {abs/2510.11027},
  year       = {2025},
  eprinttype = {arXiv},
  eprint     = {2510.11027},
  doi        = {10.48550/ARXIV.2510.11027},
  url        = {https://doi.org/10.48550/arXiv.2510.11027},
  timestamp  = {Sun, 07 Dec 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2510-11027.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2511-11793,
  author       = {{MiroMind Team} and
                  Song Bai and
                  Lidong Bing and
                  Carson Chen and
                  Guanzheng Chen and
                  Yuntao Chen and
                  Zhe Chen and
                  Ziyi Chen and
                  Jifeng Dai and
                  Xuan Dong and
                  Wenhan Dou and
                  Yue Deng and
                  Yunjie Fu and
                  Junqi Ge and
                  Chenxia Han and
                  Tammy Huang and
                  Zhenhang Huang and
                  Jerry Jiao and
                  Shilei Jiang and
                  Tianyu Jiao and
                  Xiaoqi Jian and
                  Lei Lei and
                  Ruilin Li and
                  Ryan Luo and
                  Tiantong Li and
                  Xiang Lin and
                  Ziyuan Liu and
                  Zhiqi Li and
                  Jie Ni and
                  Qiang Ren and
                  Pax Sun and
                  Shiqian Su and
                  Chenxin Tao and
                  Bin Wang and
                  Hellen Wang and
                  Haonan Wang and
                  James Wang and
                  Jin Wang and
                  Jojo Wang and
                  Letian Wang and
                  Shizun Wang and
                  Weizhi Wang and
                  Zixuan Wang and
                  Jinfan Xu and
                  Sen Xing and
                  Chenyu Yang and
                  Hai Ye and
                  Jiaheng Yu and
                  Yue Yu and
                  Muyan Zhong and
                  Tianchen Zhao and
                  Xizhou Zhu and
                  Yanpeng Zhou and
                  Yifan Zhang and
                  Zhi Zhu},
  title        = {{MiroThinker}: Pushing the Performance Boundaries of Open-Source Research
                  Agents via Model, Context, and Interactive Scaling},
  journal      = {CoRR},
  volume       = {abs/2511.11793},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2511.11793},
  doi          = {10.48550/ARXIV.2511.11793},
  eprinttype   = {arXiv},
  eprint       = {2511.11793},
  timestamp    = {Sun, 19 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2511-11793.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/chinaf/ChenWTYGCTHLMMWDYGHSJXW24,
  author       = {Zhe Chen and
                  Weiyun Wang and
                  Hao Tian and
                  Shenglong Ye and
                  Zhangwei Gao and
                  Erfei Cui and
                  Wenwen Tong and
                  Kongzhi Hu and
                  Jiapeng Luo and
                  Zheng Ma and
                  Ji Ma and
                  Jiaqi Wang and
                  Xiaoyi Dong and
                  Hang Yan and
                  Hewei Guo and
                  Conghui He and
                  Botian Shi and
                  Zhenjiang Jin and
                  Chao Xu and
                  Bin Wang and
                  Xingjian Wei and
                  Wei Li and
                  Wenjian Zhang and
                  Bo Zhang and
                  Pinlong Cai and
                  Licheng Wen and
                  Xiangchao Yan and
                  Min Dou and
                  Lewei Lu and
                  Xizhou Zhu and
                  Tong Lu and
                  Dahua Lin and
                  Yu Qiao and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {How far are we to {GPT-4V}? Closing the gap to commercial multimodal
                  models with open-source suites},
  journal      = {Sci. China Inf. Sci.},
  volume       = {67},
  number       = {12},
  year         = {2024},
  url          = {https://doi.org/10.1007/s11432-024-4231-5},
  doi          = {10.1007/S11432-024-4231-5},
  timestamp    = {Fri, 15 May 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/chinaf/ChenWTYGCTHLMMWDYGHSJXW24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/chinaf/LiuCGWCWTLZLQD24,
  author       = {Yangzhou Liu and
                  Yue Cao and
                  Zhangwei Gao and
                  Weiyun Wang and
                  Zhe Chen and
                  Wenhai Wang and
                  Hao Tian and
                  Lewei Lu and
                  Xizhou Zhu and
                  Tong Lu and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{MMInstruct}: a high-quality multi-modal instruction tuning dataset
                  with extensive diversity},
  journal      = {Sci. China Inf. Sci.},
  volume       = {67},
  number       = {12},
  year         = {2024},
  url          = {https://doi.org/10.1007/s11432-024-4187-3},
  doi          = {10.1007/S11432-024-4187-3},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/chinaf/LiuCGWCWTLZLQD24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/cmpb/ChengDDLZLWZH24,
  author    = {Heming Cheng and Dongfang Ding and Jifeng Dai and Gen Li and
               Ke Zhang and Jianyun Li and Liuchuang Wei and Xue Zhang and
               Jie Hou},
  title     = {Effect of a reduced arterial axial pre-stretch ratio during
               aging on the cardiac output and cerebral blood flow in the
               healthy elders},
  journal   = {Comput. Methods Programs Biomed.},
  volume    = {257},
  pages     = {108468},
  year      = {2024},
  url       = {https://doi.org/10.1016/j.cmpb.2024.108468},
  doi       = {10.1016/J.CMPB.2024.108468},
  timestamp = {Mon, 09 Dec 2024 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/journals/cmpb/ChengDDLZLWZH24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/pami/LiSDWLWZLYDTXXCLLGJLSLQ24,
  author    = {Hongyang Li and Chonghao Sima and Jifeng Dai and Wenhai Wang and
               Lewei Lu and Huijie Wang and Jia Zeng and Zhiqi Li and
               Jiazhi Yang and Hanming Deng and Hao Tian and Enze Xie and
               Jiangwei Xie and Li Chen and Tianyu Li and Yang Li and
               Yulu Gao and Xiaosong Jia and Si Liu and Jianping Shi and
               Dahua Lin and Yu Qiao},
  title     = {Delving Into the Devils of Bird's-Eye-View Perception:
               {A} Review, Evaluation and Recipe},
  journal   = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume    = {46},
  number    = {4},
  pages     = {2151--2170},
  year      = {2024},
  url       = {https://doi.org/10.1109/TPAMI.2023.3333838},
  doi       = {10.1109/TPAMI.2023.3333838},
  timestamp = {Thu, 09 Oct 2025 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/journals/pami/LiSDWLWZLYDTXXCLLGJLSLQ24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/pami/FangGZCLDL24,
  author       = {Rongyao Fang and
                  Peng Gao and
                  Aojun Zhou and
                  Yingjie Cai and
                  Si Liu and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{FeatAug-DETR}: Enriching One-to-Many Matching for {DETRs} With Feature
                  Augmentation},
  journal      = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume       = {46},
  number       = {9},
  pages        = {6402--6415},
  year         = {2024},
  url          = {https://doi.org/10.1109/TPAMI.2024.3381961},
  doi          = {10.1109/TPAMI.2024.3381961},
  timestamp    = {Mon, 09 Dec 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/pami/FangGZCLDL24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/visintelligence/GaoCCRWZTYHZLLQDW24,
  author       = {Zhangwei Gao and
                  Zhe Chen and
                  Erfei Cui and
                  Yiming Ren and
                  Weiyun Wang and
                  Jinguo Zhu and
                  Hao Tian and
                  Shenglong Ye and
                  Junjun He and
                  Xizhou Zhu and
                  Lewei Lu and
                  Tong Lu and
                  Yu Qiao and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {{Mini-InternVL}: a flexible-transfer pocket multi-modal model with 5{\%}
                  parameters and 90{\%} performance},
  journal      = {Vis. Intell.},
  volume       = {2},
  number       = {1},
  pages        = {32},
  year         = {2024},
  url          = {https://doi.org/10.1007/s44267-024-00067-6},
  doi          = {10.1007/S44267-024-00067-6},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/visintelligence/GaoCCRWZTYHZLLQDW24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/XiongLCWZLWL00L24,
  author       = {Yuwen Xiong and
                  Zhiqi Li and
                  Yuntao Chen and
                  Feng Wang and
                  Xizhou Zhu and
                  Jiapeng Luo and
                  Wenhai Wang and
                  Tong Lu and
                  Hongsheng Li and
                  Yu Qiao and
                  Lewei Lu and
                  Jie Zhou and
                  Jifeng Dai},
  title        = {Efficient Deformable {ConvNets}: Rethinking Dynamic and Sparse Operator
                  for Vision Applications},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2024, Seattle, WA, USA, June 16-22, 2024},
  pages        = {5652--5661},
  publisher    = {{IEEE}},
  year         = {2024},
  url          = {https://doi.org/10.1109/CVPR52733.2024.00540},
  doi          = {10.1109/CVPR52733.2024.00540},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/XiongLCWZLWL00L24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/Li0WZZQWLLD24,
  author       = {Hao Li and
                  Xue Yang and
                  Zhaokai Wang and
                  Xizhou Zhu and
                  Jie Zhou and
                  Yu Qiao and
                  Xiaogang Wang and
                  Hongsheng Li and
                  Lewei Lu and
                  Jifeng Dai},
  title        = {Auto {MC-Reward}: Automated Dense Reward Design with Large Language
                  Models for {Minecraft}},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2024, Seattle, WA, USA, June 16-22, 2024},
  pages        = {16426--16435},
  publisher    = {{IEEE}},
  year         = {2024},
  url          = {https://doi.org/10.1109/CVPR52733.2024.01554},
  doi          = {10.1109/CVPR52733.2024.01554},
  timestamp    = {Mon, 03 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/Li0WZZQWLLD24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/Yu0LDD0Y24,
  author       = {Yi Yu and
                  Xue Yang and
                  Qingyun Li and
                  Feipeng Da and
                  Jifeng Dai and
                  Yu Qiao and
                  Junchi Yan},
  title        = {{Point2RBox}: Combine Knowledge from Synthetic Visual Patterns for End-to-End
                  Oriented Object Detection with Single Point Supervision},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2024, Seattle, WA, USA, June 16-22, 2024},
  pages        = {16783--16793},
  publisher    = {{IEEE}},
  year         = {2024},
  url          = {https://doi.org/10.1109/CVPR52733.2024.01588},
  doi          = {10.1109/CVPR52733.2024.01588},
  timestamp    = {Wed, 25 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/Yu0LDD0Y24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/ChenWW0CXZZZLLL24,
  author    = {Zhe Chen and Jiannan Wu and Wenhai Wang and Weijie Su and
               Guo Chen and Sen Xing and Muyan Zhong and Qinglong Zhang and
               Xizhou Zhu and Lewei Lu and Bin Li and Ping Luo and
               Tong Lu and Yu Qiao and Jifeng Dai},
  title     = {Intern {VL:} Scaling up Vision Foundation Models and
               Aligning for Generic Visual-Linguistic Tasks},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern
               Recognition, {CVPR} 2024, Seattle, WA, USA, June 16-22, 2024},
  pages     = {24185--24198},
  publisher = {{IEEE}},
  year      = {2024},
  url       = {https://doi.org/10.1109/CVPR52733.2024.02283},
  doi       = {10.1109/CVPR52733.2024.02283},
  timestamp = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/cvpr/ChenWW0CXZZZLLL24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/eccv/LiuLGCLZLCQDW24,
  author       = {Zhaoyang Liu and
                  Zeqiang Lai and
                  Zhangwei Gao and
                  Erfei Cui and
                  Ziheng Li and
                  Xizhou Zhu and
                  Lewei Lu and
                  Qifeng Chen and
                  Yu Qiao and
                  Jifeng Dai and
                  Wenhai Wang},
  editor       = {Ales Leonardis and
                  Elisa Ricci and
                  Stefan Roth and
                  Olga Russakovsky and
                  Torsten Sattler and
                  G{\"{u}}l Varol},
  title        = {{ControlLLM}: Augment Language Models with Tools by Searching on Graphs},
  booktitle    = {Computer Vision - {ECCV} 2024 - 18th European Conference, Milan, Italy,
                  September 29-October 4, 2024, Proceedings, Part {XII}},
  series       = {Lecture Notes in Computer Science},
  pages        = {89--105},
  publisher    = {Springer},
  year         = {2024},
  url          = {https://doi.org/10.1007/978-3-031-73254-6\_6},
  doi          = {10.1007/978-3-031-73254-6\_6},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/eccv/LiuLGCLZLCQDW24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/eccv/LiWLLYDQZ24,
  author    = {Gang Li and Wenhai Wang and Xiang Li and Ziheng Li and
               Jian Yang and Jifeng Dai and Yu Qiao and Shanshan Zhang},
  editor    = {Ales Leonardis and Elisa Ricci and Stefan Roth and
               Olga Russakovsky and Torsten Sattler and G{\"{u}}l Varol},
  title     = {Distilling Knowledge from Large-Scale Image Models for Object
               Detection},
  booktitle = {Computer Vision - {ECCV} 2024 - 18th European Conference,
               Milan, Italy, September 29-October 4, 2024, Proceedings,
               Part {LXXXIV}},
  series    = {Lecture Notes in Computer Science},
  pages     = {142--160},
  publisher = {Springer},
  year      = {2024},
  url       = {https://doi.org/10.1007/978-3-031-72907-2\_9},
  doi       = {10.1007/978-3-031-72907-2\_9},
  timestamp = {Wed, 25 Feb 2026 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/eccv/LiWLLYDQZ24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/eccv/WangRLLYCWLLZQD24,
  author    = {Weiyun Wang and Yiming Ren and Haowen Luo and Tiantong Li and
               Chenxiang Yan and Zhe Chen and Wenhai Wang and Qingyun Li and
               Lewei Lu and Xizhou Zhu and Yu Qiao and Jifeng Dai},
  editor    = {Ales Leonardis and Elisa Ricci and Stefan Roth and
               Olga Russakovsky and Torsten Sattler and G{\"{u}}l Varol},
  title     = {The All-Seeing Project {V2:} Towards General Relation
               Comprehension of the Open World},
  booktitle = {Computer Vision - {ECCV} 2024 - 18th European Conference,
               Milan, Italy, September 29-October 4, 2024, Proceedings,
               Part {XXXIII}},
  series    = {Lecture Notes in Computer Science},
  pages     = {471--490},
  publisher = {Springer},
  year      = {2024},
  url       = {https://doi.org/10.1007/978-3-031-73414-4\_27},
  doi       = {10.1007/978-3-031-73414-4\_27},
  timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/eccv/WangRLLYCWLLZQD24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/iclr/TianTD0LL00HZ24,
  author    = {Changyao Tian and Chenxin Tao and Jifeng Dai and Hao Li and
               Ziheng Li and Lewei Lu and Xiaogang Wang and Hongsheng Li and
               Gao Huang and Xizhou Zhu},
  title     = {{ADDP:} Learning General Representations for Image Recognition
               and Generation with Alternating Denoising Diffusion Process},
  booktitle = {The Twelfth International Conference on Learning
               Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
  publisher = {OpenReview.net},
  year      = {2024},
  url       = {https://openreview.net/forum?id=cMPm8YFXZe},
  timestamp = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/iclr/TianTD0LL00HZ24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/iclr/Wang0LWHXCLZ0CL24,
  author    = {Weiyun Wang and Min Shi and Qingyun Li and Wenhai Wang and
               Zhenhang Huang and Linjie Xing and Zhe Chen and Hao Li and
               Xizhou Zhu and Zhiguo Cao and Yushi Chen and Tong Lu and
               Jifeng Dai and Yu Qiao},
  title     = {The All-Seeing Project: Towards Panoptic Visual Recognition
               and Understanding of the Open World},
  booktitle = {The Twelfth International Conference on Learning
               Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
  publisher = {OpenReview.net},
  year      = {2024},
  url       = {https://openreview.net/forum?id=c2R7ajodcI},
  timestamp = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/iclr/Wang0LWHXCLZ0CL24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/iclr/YangWCD024,
  author    = {Yang Yang and Wenhai Wang and Zhe Chen and Jifeng Dai and
               Liang Zheng},
  title     = {Bounding Box Stability against Feature Dropout Reflects
               Detector Generalization across Environments},
  booktitle = {The Twelfth International Conference on Learning
               Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
  publisher = {OpenReview.net},
  year      = {2024},
  url       = {https://openreview.net/forum?id=lmM4Ecm4HJ},
  timestamp = {Mon, 18 Nov 2024 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/iclr/YangWCD024.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/icml/0001CZCYGCLHTSY24,
  author       = {Yao Mu and
                  Junting Chen and
                  Qinglong Zhang and
                  Shoufa Chen and
                  Qiaojun Yu and
                  Chongjian Ge and
                  Runjian Chen and
                  Zhixuan Liang and
                  Mengkang Hu and
                  Chaofan Tao and
                  Peize Sun and
                  Haibao Yu and
                  Chao Yang and
                  Wenqi Shao and
                  Wenhai Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Mingyu Ding and
                  Ping Luo},
  editor       = {Ruslan Salakhutdinov and
                  Zico Kolter and
                  Katherine A. Heller and
                  Adrian Weller and
                  Nuria Oliver and
                  Jonathan Scarlett and
                  Felix Berkenkamp},
  title        = {{RoboCodeX}: Multimodal Code Generation for Robotic Behavior Synthesis},
  booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024,
                  Vienna, Austria, July 21-27, 2024},
  series       = {Proceedings of Machine Learning Research},
  pages        = {36434--36454},
  publisher    = {{PMLR} / OpenReview.net},
  year         = {2024},
  url          = {https://proceedings.mlr.press/v235/mu24a.html},
  timestamp    = {Mon, 09 Feb 2026 15:35:36 +0100},
  biburl       = {https://dblp.org/rec/conf/icml/0001CZCYGCLHTSY24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/0004WX0WC00DP24,
  author       = {Jiawei Gao and
                  Ziqin Wang and
                  Zeqi Xiao and
                  Jingbo Wang and
                  Tai Wang and
                  Jinkun Cao and
                  Xiaolin Hu and
                  Si Liu and
                  Jifeng Dai and
                  Jiangmiao Pang},
  editor       = {Amir Globersons and
                  Lester Mackey and
                  Danielle Belgrave and
                  Angela Fan and
                  Ulrich Paquet and
                  Jakub M. Tomczak and
                  Cheng Zhang},
  title        = {{CooHOI}: Learning Cooperative Human-Object Interaction with Manipulated
                  Object Dynamics},
  booktitle    = {Advances in Neural Information Processing Systems 37: Annual Conference
                  on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver,
                  BC, Canada, December 10 - 15, 2024},
  year         = {2024},
  url          = {http://papers.nips.cc/paper\_files/paper/2024/hash/918b9487f8ea4661e8ba5a02b2126658-Abstract-Conference.html},
  timestamp    = {Tue, 26 May 2026 17:12:08 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/0004WX0WC00DP24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/DongZZCWOZDZLYG24,
  author       = {Xiaoyi Dong and
                  Pan Zhang and
                  Yuhang Zang and
                  Yuhang Cao and
                  Bin Wang and
                  Linke Ouyang and
                  Songyang Zhang and
                  Haodong Duan and
                  Wenwei Zhang and
                  Yining Li and
                  Hang Yan and
                  Yang Gao and
                  Zhe Chen and
                  Xinyue Zhang and
                  Wei Li and
                  Jingwen Li and
                  Wenhai Wang and
                  Kai Chen and
                  Conghui He and
                  Xingcheng Zhang and
                  Jifeng Dai and
                  Yu Qiao and
                  Dahua Lin and
                  Jiaqi Wang},
  editor       = {Amir Globersons and
                  Lester Mackey and
                  Danielle Belgrave and
                  Angela Fan and
                  Ulrich Paquet and
                  Jakub M. Tomczak and
                  Cheng Zhang},
  title        = {{InternLM-XComposer2-4KHD}: {A} Pioneering Large Vision-Language Model
                  Handling Resolutions from 336 Pixels to {4K} {HD}},
  booktitle    = {Advances in Neural Information Processing Systems 37: Annual Conference
                  on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver,
                  BC, Canada, December 10 - 15, 2024},
  year         = {2024},
  url          = {http://papers.nips.cc/paper\_files/paper/2024/hash/4b06cdddb1cde6624c0be1465c7b800f-Abstract-Conference.html},
  timestamp    = {Thu, 26 Mar 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/DongZZCWOZDZLYG24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/NanLXD24,
  author       = {Zhixiong Nan and
                  Xianghong Li and
                  Tao Xiang and
                  Jifeng Dai},
  editor       = {Amir Globersons and
                  Lester Mackey and
                  Danielle Belgrave and
                  Angela Fan and
                  Ulrich Paquet and
                  Jakub M. Tomczak and
                  Cheng Zhang},
  title        = {{DI-MaskDINO}: {A} Joint Object Detection and Instance Segmentation
                  Model},
  booktitle    = {Advances in Neural Information Processing Systems 37: Annual Conference
                  on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver,
                  BC, Canada, December 10 - 15, 2024},
  year         = {2024},
  url          = {http://papers.nips.cc/paper\_files/paper/2024/hash/6f1346bac8b02f76a631400e2799b24b-Abstract-Conference.html},
  timestamp    = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/NanLXD24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/TaoZSLTL0000D24,
  author       = {Chenxin Tao and
                  Xizhou Zhu and
                  Shiqian Su and
                  Lewei Lu and
                  Changyao Tian and
                  Xuan Luo and
                  Gao Huang and
                  Hongsheng Li and
                  Yu Qiao and
                  Jie Zhou and
                  Jifeng Dai},
  editor       = {Amir Globersons and
                  Lester Mackey and
                  Danielle Belgrave and
                  Angela Fan and
                  Ulrich Paquet and
                  Jakub M. Tomczak and
                  Cheng Zhang},
  title        = {Learning {1D} Causal Visual Representation with De-focus Attention Networks},
  booktitle    = {Advances in Neural Information Processing Systems 37: Annual Conference
                  on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver,
                  BC, Canada, December 10 - 15, 2024},
  year         = {2024},
  url          = {http://papers.nips.cc/paper\_files/paper/2024/hash/2d9c6cdb4cfe93869c090fea7375044b-Abstract-Conference.html},
  timestamp    = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/TaoZSLTL0000D24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/WangZRDLLH0ZLZL24,
  author    = {Weiyun Wang and Shuibo Zhang and Yiming Ren and Yuchen Duan and
               Tiantong Li and Shuo Liu and Mengkang Hu and Zhe Chen and
               Kaipeng Zhang and Lewei Lu and Xizhou Zhu and Ping Luo and
               Yu Qiao and Jifeng Dai and Wenqi Shao and Wenhai Wang},
  editor    = {Amir Globersons and Lester Mackey and Danielle Belgrave and
               Angela Fan and Ulrich Paquet and Jakub M. Tomczak and
               Cheng Zhang},
  title     = {Needle In {A} Multimodal Haystack},
  booktitle = {Advances in Neural Information Processing Systems 37: Annual
               Conference on Neural Information Processing Systems 2024,
               NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year      = {2024},
  url       = {http://papers.nips.cc/paper\_files/paper/2024/hash/24a8968affe71ffe4067d022b9d16566-Abstract-Datasets\_and\_Benchmarks\_Track.html},
  timestamp = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/nips/WangZRDLLH0ZLZL24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/WuZXL00WZLL0QD24,
  author       = {Jiannan Wu and
                  Muyan Zhong and
                  Sen Xing and
                  Zeqiang Lai and
                  Zhaoyang Liu and
                  Zhe Chen and
                  Wenhai Wang and
                  Xizhou Zhu and
                  Lewei Lu and
                  Tong Lu and
                  Ping Luo and
                  Yu Qiao and
                  Jifeng Dai},
  editor       = {Amir Globersons and
                  Lester Mackey and
                  Danielle Belgrave and
                  Angela Fan and
                  Ulrich Paquet and
                  Jakub M. Tomczak and
                  Cheng Zhang},
  title        = {{VisionLLM} v2: An End-to-End Generalist Multimodal Large Language Model
                  for Hundreds of Vision-Language Tasks},
  booktitle    = {Advances in Neural Information Processing Systems 37: Annual Conference
                  on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver,
                  BC, Canada, December 10 - 15, 2024},
  year         = {2024},
  url          = {http://papers.nips.cc/paper\_files/paper/2024/hash/81a60d18e010b27b36cd465c6604b915-Abstract-Conference.html},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/WuZXL00WZLL0QD24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/YangZZ0WDW0Z0D24,
  author    = {Chenyu Yang and Xizhou Zhu and Jinguo Zhu and Weijie Su and
               Junjie Wang and Xuan Dong and Wenhai Wang and Lewei Lu and
               Bin Li and Jie Zhou and Yu Qiao and Jifeng Dai},
  editor    = {Amir Globersons and Lester Mackey and Danielle Belgrave and
               Angela Fan and Ulrich Paquet and Jakub M. Tomczak and
               Cheng Zhang},
  title     = {Vision Model Pre-training on Interleaved Image-Text Data via
               Latent Compression Learning},
  booktitle = {Advances in Neural Information Processing Systems 37: Annual
               Conference on Neural Information Processing Systems 2024,
               NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year      = {2024},
  url       = {http://papers.nips.cc/paper\_files/paper/2024/hash/2a952768bb85041f95ed06a5b60cf4d5-Abstract-Conference.html},
  timestamp = {Sun, 15 Jun 2025 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/nips/YangZZ0WDW0Z0D24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/Zhu0W0DGL0D24,
  author    = {Xizhou Zhu and Xue Yang and Zhaokai Wang and Hao Li and
               Wenhan Dou and Junqi Ge and Lewei Lu and Yu Qiao and
               Jifeng Dai},
  editor    = {Amir Globersons and Lester Mackey and Danielle Belgrave and
               Angela Fan and Ulrich Paquet and Jakub M. Tomczak and
               Cheng Zhang},
  title     = {Parameter-Inverted Image Pyramid Networks},
  booktitle = {Advances in Neural Information Processing Systems 37: Annual
               Conference on Neural Information Processing Systems 2024,
               NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year      = {2024},
  url       = {http://papers.nips.cc/paper\_files/paper/2024/hash/ee81a23d6b83ac15fbeb5b7a30934e0b-Abstract-Conference.html},
  timestamp = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/nips/Zhu0W0DGL0D24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/siggraph/ShiHWBLZZCSQDL24,
  author       = {Xiaoyu Shi and
                  Zhaoyang Huang and
                  Fu{-}Yun Wang and
                  Weikang Bian and
                  Dasong Li and
                  Yi Zhang and
                  Manyuan Zhang and
                  Ka Chun Cheung and
                  Simon See and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  editor       = {Andres Burbano and
                  Denis Zorin and
                  Wojciech Jarosz},
  title        = {{Motion-I2V}: Consistent and Controllable Image-to-Video Generation
                  with Explicit Motion Modeling},
  booktitle    = {{ACM} {SIGGRAPH} 2024 Conference Papers, {SIGGRAPH} 2024, Denver,
                  CO, USA, 27 July 2024- 1 August 2024},
  pages        = {111},
  publisher    = {{ACM}},
  year         = {2024},
  url          = {https://doi.org/10.1145/3641519.3657497},
  doi          = {10.1145/3641519.3657497},
  timestamp    = {Mon, 07 Apr 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/siggraph/ShiHWBLZZCSQDL24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2401-06197,
  author       = {Yuwen Xiong and
                  Zhiqi Li and
                  Yuntao Chen and
                  Feng Wang and
                  Xizhou Zhu and
                  Jiapeng Luo and
                  Wenhai Wang and
                  Tong Lu and
                  Hongsheng Li and
                  Yu Qiao and
                  Lewei Lu and
                  Jie Zhou and
                  Jifeng Dai},
  title        = {Efficient Deformable {ConvNets}: Rethinking Dynamic and Sparse Operator
                  for Vision Applications},
  journal      = {CoRR},
  volume       = {abs/2401.06197},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2401.06197},
  doi          = {10.48550/ARXIV.2401.06197},
  eprinttype   = {arXiv},
  eprint       = {2401.06197},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2401-06197.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2401-10208,
  author       = {Changyao Tian and
                  Xizhou Zhu and
                  Yuwen Xiong and
                  Weiyun Wang and
                  Zhe Chen and
                  Wenhai Wang and
                  Yuntao Chen and
                  Lewei Lu and
                  Tong Lu and
                  Jie Zhou and
                  Hongsheng Li and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{MM-Interleaved}: Interleaved Image-Text Generative Modeling via Multi-modal
                  Feature Synchronizer},
  journal      = {CoRR},
  volume       = {abs/2401.10208},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2401.10208},
  doi          = {10.48550/ARXIV.2401.10208},
  eprinttype   = {arXiv},
  eprint       = {2401.10208},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2401-10208.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2401.15977 (listed under journal CoRR, 2024).
% The method name Motion-I2V is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2401-15977,
  author       = {Xiaoyu Shi and
                  Zhaoyang Huang and
                  Fu{-}Yun Wang and
                  Weikang Bian and
                  Dasong Li and
                  Yi Zhang and
                  Manyuan Zhang and
                  Ka Chun Cheung and
                  Simon See and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{Motion-I2V}: Consistent and Controllable Image-to-Video Generation
                  with Explicit Motion Modeling},
  journal      = {CoRR},
  volume       = {abs/2401.15977},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2401.15977},
  doi          = {10.48550/ARXIV.2401.15977},
  eprinttype   = {arXiv},
  eprint       = {2401.15977},
  timestamp    = {Mon, 07 Apr 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2401-15977.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2402.16117 (listed under journal CoRR, 2024).
% The system name RoboCodeX is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2402-16117,
  author       = {Yao Mu and
                  Junting Chen and
                  Qinglong Zhang and
                  Shoufa Chen and
                  Qiaojun Yu and
                  Chongjian Ge and
                  Runjian Chen and
                  Zhixuan Liang and
                  Mengkang Hu and
                  Chaofan Tao and
                  Peize Sun and
                  Haibao Yu and
                  Chao Yang and
                  Wenqi Shao and
                  Wenhai Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Mingyu Ding and
                  Ping Luo},
  title        = {{RoboCodeX}: Multimodal Code Generation for Robotic Behavior Synthesis},
  journal      = {CoRR},
  volume       = {abs/2402.16117},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2402.16117},
  doi          = {10.48550/ARXIV.2402.16117},
  eprinttype   = {arXiv},
  eprint       = {2402.16117},
  timestamp    = {Fri, 30 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2402-16117.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2402.19474 (listed under journal CoRR, 2024).
% The project name All-Seeing Project V2 is brace-protected as a unit, and the
% colon sits outside the braces so that sentence-casing styles still recognise
% the start of the subtitle and keep the capital on Towards.
@article{DBLP:journals/corr/abs-2402-19474,
  author       = {Weiyun Wang and
                  Yiming Ren and
                  Haowen Luo and
                  Tiantong Li and
                  Chenxiang Yan and
                  Zhe Chen and
                  Wenhai Wang and
                  Qingyun Li and
                  Lewei Lu and
                  Xizhou Zhu and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {The {All-Seeing Project V2}: Towards General Relation Comprehension
                  of the Open World},
  journal      = {CoRR},
  volume       = {abs/2402.19474},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2402.19474},
  doi          = {10.48550/ARXIV.2402.19474},
  eprinttype   = {arXiv},
  eprint       = {2402.19474},
  timestamp    = {Mon, 18 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2402-19474.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2403.02308 (listed under journal CoRR, 2024).
% Vision-RWKV and the acronym RWKV are brace-protected in the title so that
% sentence-casing bibliography styles keep their capitalization.
@article{DBLP:journals/corr/abs-2403-02308,
  author       = {Yuchen Duan and
                  Weiyun Wang and
                  Zhe Chen and
                  Xizhou Zhu and
                  Lewei Lu and
                  Tong Lu and
                  Yu Qiao and
                  Hongsheng Li and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {{Vision-RWKV}: Efficient and Scalable Visual Perception with {RWKV}-Like
                  Architectures},
  journal      = {CoRR},
  volume       = {abs/2403.02308},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2403.02308},
  doi          = {10.48550/ARXIV.2403.02308},
  eprinttype   = {arXiv},
  eprint       = {2403.02308},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2403-02308.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2403.13803 (listed under journal CoRR, 2024).
% eprinttype/eprint carry the arXiv identifier; url and doi both point at the
% arXiv-issued DOI for the same e-print.
@article{DBLP:journals/corr/abs-2403-13803,
  author       = {Yang Yang and
                  Wenhai Wang and
                  Zhe Chen and
                  Jifeng Dai and
                  Liang Zheng},
  title        = {Bounding Box Stability against Feature Dropout Reflects Detector Generalization
                  across Environments},
  journal      = {CoRR},
  volume       = {abs/2403.13803},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2403.13803},
  doi          = {10.48550/ARXIV.2403.13803},
  eprinttype   = {arXiv},
  eprint       = {2403.13803},
  timestamp    = {Mon, 18 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2403-13803.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2404.06512 (listed under journal CoRR, 2024).
% InternLM-XComposer2-4KHD and 4K are brace-protected in the title so that
% sentence-casing bibliography styles keep their capitalization.
@article{DBLP:journals/corr/abs-2404-06512,
  author       = {Xiaoyi Dong and
                  Pan Zhang and
                  Yuhang Zang and
                  Yuhang Cao and
                  Bin Wang and
                  Linke Ouyang and
                  Songyang Zhang and
                  Haodong Duan and
                  Wenwei Zhang and
                  Yining Li and
                  Hang Yan and
                  Yang Gao and
                  Zhe Chen and
                  Xinyue Zhang and
                  Wei Li and
                  Jingwen Li and
                  Wenhai Wang and
                  Kai Chen and
                  Conghui He and
                  Xingcheng Zhang and
                  Jifeng Dai and
                  Yu Qiao and
                  Dahua Lin and
                  Jiaqi Wang},
  title        = {{InternLM-XComposer2-4KHD}: {A} Pioneering Large Vision-Language Model
                  Handling Resolutions from 336 Pixels to {4K} {HD}},
  journal      = {CoRR},
  volume       = {abs/2404.06512},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2404.06512},
  doi          = {10.48550/ARXIV.2404.06512},
  eprinttype   = {arXiv},
  eprint       = {2404.06512},
  timestamp    = {Thu, 26 Mar 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2404-06512.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2404.16821 (listed under journal CoRR, 2024).
% GPT-4V is brace-protected so sentence-casing styles keep its capitalization.
% Closing is protected as well: it starts a new sentence after the question
% mark, which sentence-casing styles do not detect and would lowercase.
@article{DBLP:journals/corr/abs-2404-16821,
  author       = {Zhe Chen and
                  Weiyun Wang and
                  Hao Tian and
                  Shenglong Ye and
                  Zhangwei Gao and
                  Erfei Cui and
                  Wenwen Tong and
                  Kongzhi Hu and
                  Jiapeng Luo and
                  Zheng Ma and
                  Ji Ma and
                  Jiaqi Wang and
                  Xiaoyi Dong and
                  Hang Yan and
                  Hewei Guo and
                  Conghui He and
                  Botian Shi and
                  Zhenjiang Jin and
                  Chao Xu and
                  Bin Wang and
                  Xingjian Wei and
                  Wei Li and
                  Wenjian Zhang and
                  Bo Zhang and
                  Pinlong Cai and
                  Licheng Wen and
                  Xiangchao Yan and
                  Min Dou and
                  Lewei Lu and
                  Xizhou Zhu and
                  Tong Lu and
                  Dahua Lin and
                  Yu Qiao and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {How Far Are We to {GPT-4V}? {Closing} the Gap to Commercial Multimodal
                  Models with Open-Source Suites},
  journal      = {CoRR},
  volume       = {abs/2404.16821},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2404.16821},
  doi          = {10.48550/ARXIV.2404.16821},
  eprinttype   = {arXiv},
  eprint       = {2404.16821},
  timestamp    = {Fri, 15 May 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2404-16821.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2405.14739 (listed under journal CoRR, 2024).
% FLoRA and N-dimension are brace-protected in the title so that
% sentence-casing bibliography styles keep their capitalization.
@article{DBLP:journals/corr/abs-2405-14739,
  author       = {Chongjie Si and
                  Xuehui Wang and
                  Xue Yang and
                  Zhengqin Xu and
                  Qingyun Li and
                  Jifeng Dai and
                  Yu Qiao and
                  Xiaokang Yang and
                  Wei Shen},
  title        = {{FLoRA}: Low-Rank Core Space for {N-dimension}},
  journal      = {CoRR},
  volume       = {abs/2405.14739},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2405.14739},
  doi          = {10.48550/ARXIV.2405.14739},
  eprinttype   = {arXiv},
  eprint       = {2405.14739},
  timestamp    = {Thu, 06 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2405-14739.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2405.19334 (listed under journal CoRR, 2024).
% The acronym LLMs is brace-protected in the title so that sentence-casing
% bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2405-19334,
  author       = {Yingqing He and
                  Zhaoyang Liu and
                  Jingye Chen and
                  Zeyue Tian and
                  Hongyu Liu and
                  Xiaowei Chi and
                  Runtao Liu and
                  Ruibin Yuan and
                  Yazhou Xing and
                  Wenhai Wang and
                  Jifeng Dai and
                  Yong Zhang and
                  Wei Xue and
                  Qifeng Liu and
                  Yike Guo and
                  Qifeng Chen},
  title        = {{LLMs} Meet Multimodal Generation and Editing: {A} Survey},
  journal      = {CoRR},
  volume       = {abs/2405.19334},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2405.19334},
  doi          = {10.48550/ARXIV.2405.19334},
  eprinttype   = {arXiv},
  eprint       = {2405.19334},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2405-19334.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2406.04330 (listed under journal CoRR, 2024).
% eprinttype/eprint carry the arXiv identifier; url and doi both point at the
% arXiv-issued DOI for the same e-print.
@article{DBLP:journals/corr/abs-2406-04330,
  author       = {Xizhou Zhu and
                  Xue Yang and
                  Zhaokai Wang and
                  Hao Li and
                  Wenhan Dou and
                  Junqi Ge and
                  Lewei Lu and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {Parameter-Inverted Image Pyramid Networks},
  journal      = {CoRR},
  volume       = {abs/2406.04330},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.04330},
  doi          = {10.48550/ARXIV.2406.04330},
  eprinttype   = {arXiv},
  eprint       = {2406.04330},
  timestamp    = {Tue, 06 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-04330.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2406.04342 (listed under journal CoRR, 2024).
% The token 1D is brace-protected in the title so that sentence-casing
% bibliography styles do not render it as 1d.
@article{DBLP:journals/corr/abs-2406-04342,
  author       = {Chenxin Tao and
                  Xizhou Zhu and
                  Shiqian Su and
                  Lewei Lu and
                  Changyao Tian and
                  Xuan Luo and
                  Gao Huang and
                  Hongsheng Li and
                  Yu Qiao and
                  Jie Zhou and
                  Jifeng Dai},
  title        = {Learning {1D} Causal Visual Representation with De-focus Attention Networks},
  journal      = {CoRR},
  volume       = {abs/2406.04342},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.04342},
  doi          = {10.48550/ARXIV.2406.04342},
  eprinttype   = {arXiv},
  eprint       = {2406.04342},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-04342.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2406.07230 (listed under journal CoRR, 2024).
% The article A in the title is deliberately left unbraced: it is an ordinary
% word in mid-title, so sentence-casing styles should be free to lowercase it.
@article{DBLP:journals/corr/abs-2406-07230,
  author       = {Weiyun Wang and
                  Shuibo Zhang and
                  Yiming Ren and
                  Yuchen Duan and
                  Tiantong Li and
                  Shuo Liu and
                  Mengkang Hu and
                  Zhe Chen and
                  Kaipeng Zhang and
                  Lewei Lu and
                  Xizhou Zhu and
                  Ping Luo and
                  Yu Qiao and
                  Jifeng Dai and
                  Wenqi Shao and
                  Wenhai Wang},
  title        = {Needle In A Multimodal Haystack},
  journal      = {CoRR},
  volume       = {abs/2406.07230},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.07230},
  doi          = {10.48550/ARXIV.2406.07230},
  eprinttype   = {arXiv},
  eprint       = {2406.07230},
  timestamp    = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-07230.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2406.07543 (listed under journal CoRR, 2024).
% eprinttype/eprint carry the arXiv identifier; url and doi both point at the
% arXiv-issued DOI for the same e-print.
@article{DBLP:journals/corr/abs-2406-07543,
  author       = {Chenyu Yang and
                  Xizhou Zhu and
                  Jinguo Zhu and
                  Weijie Su and
                  Junjie Wang and
                  Xuan Dong and
                  Wenhai Wang and
                  Lewei Lu and
                  Bin Li and
                  Jie Zhou and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {Vision Model Pre-training on Interleaved Image-Text Data via Latent
                  Compression Learning},
  journal      = {CoRR},
  volume       = {abs/2406.07543},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.07543},
  doi          = {10.48550/ARXIV.2406.07543},
  eprinttype   = {arXiv},
  eprint       = {2406.07543},
  timestamp    = {Wed, 26 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-07543.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2406.08085 (listed under journal CoRR, 2024).
% The model name Flash-VStream is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2406-08085,
  author       = {Haoji Zhang and
                  Yiqin Wang and
                  Yansong Tang and
                  Yong Liu and
                  Jiashi Feng and
                  Jifeng Dai and
                  Xiaojie Jin},
  title        = {{Flash-VStream}: Memory-Based Real-Time Understanding for Long Video
                  Streams},
  journal      = {CoRR},
  volume       = {abs/2406.08085},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.08085},
  doi          = {10.48550/ARXIV.2406.08085},
  eprinttype   = {arXiv},
  eprint       = {2406.08085},
  timestamp    = {Thu, 07 May 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-08085.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2406.08394 (listed under journal CoRR, 2024).
% The model name VisionLLM is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2406-08394,
  author       = {Jiannan Wu and
                  Muyan Zhong and
                  Sen Xing and
                  Zeqiang Lai and
                  Zhaoyang Liu and
                  Wenhai Wang and
                  Zhe Chen and
                  Xizhou Zhu and
                  Lewei Lu and
                  Tong Lu and
                  Ping Luo and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{VisionLLM} v2: An End-to-End Generalist Multimodal Large Language Model
                  for Hundreds of Vision-Language Tasks},
  journal      = {CoRR},
  volume       = {abs/2406.08394},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.08394},
  doi          = {10.48550/ARXIV.2406.08394},
  eprinttype   = {arXiv},
  eprint       = {2406.08394},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-08394.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2406.08418 (listed under journal CoRR, 2024).
% The dataset name OmniCorpus is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2406-08418,
  author       = {Qingyun Li and
                  Zhe Chen and
                  Weiyun Wang and
                  Wenhai Wang and
                  Shenglong Ye and
                  Zhenjiang Jin and
                  Guanzhou Chen and
                  Yinan He and
                  Zhangwei Gao and
                  Erfei Cui and
                  Jiashuo Yu and
                  Hao Tian and
                  Jiasheng Zhou and
                  Chao Xu and
                  Bin Wang and
                  Xingjian Wei and
                  Wei Li and
                  Wenjian Zhang and
                  Bo Zhang and
                  Pinlong Cai and
                  Licheng Wen and
                  Xiangchao Yan and
                  Zhenxiang Li and
                  Pei Chu and
                  Yi Wang and
                  Min Dou and
                  Changyao Tian and
                  Xizhou Zhu and
                  Lewei Lu and
                  Yushi Chen and
                  Junjun He and
                  Zhongying Tu and
                  Tong Lu and
                  Yali Wang and
                  Limin Wang and
                  Dahua Lin and
                  Yu Qiao and
                  Botian Shi and
                  Conghui He and
                  Jifeng Dai},
  title        = {{OmniCorpus}: {A} Unified Multimodal Corpus of 10 Billion-Level Images
                  Interleaved with Text},
  journal      = {CoRR},
  volume       = {abs/2406.08418},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.08418},
  doi          = {10.48550/ARXIV.2406.08418},
  eprinttype   = {arXiv},
  eprint       = {2406.08418},
  timestamp    = {Thu, 26 Mar 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-08418.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2406.14558 (listed under journal CoRR, 2024).
% The method name CooHOI is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2406-14558,
  author       = {Jiawei Gao and
                  Ziqin Wang and
                  Zeqi Xiao and
                  Jingbo Wang and
                  Tai Wang and
                  Jinkun Cao and
                  Xiaolin Hu and
                  Si Liu and
                  Jifeng Dai and
                  Jiangmiao Pang},
  title        = {{CooHOI}: Learning Cooperative Human-Object Interaction with Manipulated
                  Object Dynamics},
  journal      = {CoRR},
  volume       = {abs/2406.14558},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.14558},
  doi          = {10.48550/ARXIV.2406.14558},
  eprinttype   = {arXiv},
  eprint       = {2406.14558},
  timestamp    = {Tue, 23 Jul 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-14558.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2407.00603 (listed under journal CoRR, 2024).
% The acronym QA in the title is already brace-protected against
% sentence-casing styles; eprinttype/eprint carry the arXiv identifier.
@article{DBLP:journals/corr/abs-2407-00603,
  author       = {Yiqin Wang and
                  Haoji Zhang and
                  Yansong Tang and
                  Yong Liu and
                  Jiashi Feng and
                  Jifeng Dai and
                  Xiaojie Jin},
  title        = {Hierarchical Memory for Long Video {QA}},
  journal      = {CoRR},
  volume       = {abs/2407.00603},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2407.00603},
  doi          = {10.48550/ARXIV.2407.00603},
  eprinttype   = {arXiv},
  eprint       = {2407.00603},
  timestamp    = {Thu, 07 May 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2407-00603.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2407.03320 (listed under journal CoRR, 2024).
% The model name InternLM-XComposer-2.5 is brace-protected in the title so
% that sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2407-03320,
  author       = {Pan Zhang and
                  Xiaoyi Dong and
                  Yuhang Zang and
                  Yuhang Cao and
                  Rui Qian and
                  Lin Chen and
                  Qipeng Guo and
                  Haodong Duan and
                  Bin Wang and
                  Linke Ouyang and
                  Songyang Zhang and
                  Wenwei Zhang and
                  Yining Li and
                  Yang Gao and
                  Peng Sun and
                  Xinyue Zhang and
                  Wei Li and
                  Jingwen Li and
                  Wenhai Wang and
                  Hang Yan and
                  Conghui He and
                  Xingcheng Zhang and
                  Kai Chen and
                  Jifeng Dai and
                  Yu Qiao and
                  Dahua Lin and
                  Jiaqi Wang},
  title        = {{InternLM-XComposer-2.5}: {A} Versatile Large Vision Language Model
                  Supporting Long-Contextual Input and Output},
  journal      = {CoRR},
  volume       = {abs/2407.03320},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2407.03320},
  doi          = {10.48550/ARXIV.2407.03320},
  eprinttype   = {arXiv},
  eprint       = {2407.03320},
  timestamp    = {Thu, 26 Mar 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2407-03320.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2407.15838 (listed under journal CoRR, 2024).
% The dataset name MMInstruct is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2407-15838,
  author       = {Yangzhou Liu and
                  Yue Cao and
                  Zhangwei Gao and
                  Weiyun Wang and
                  Zhe Chen and
                  Wenhai Wang and
                  Hao Tian and
                  Lewei Lu and
                  Xizhou Zhu and
                  Tong Lu and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{MMInstruct}: {A} High-Quality Multi-Modal Instruction Tuning Dataset
                  with Extensive Diversity},
  journal      = {CoRR},
  volume       = {abs/2407.15838},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2407.15838},
  doi          = {10.48550/ARXIV.2407.15838},
  eprinttype   = {arXiv},
  eprint       = {2407.15838},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2407-15838.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2408.02718 (listed under journal CoRR, 2024).
% Only the acronym MMIU is braced; the colon sits outside the braces so that
% sentence-casing styles still recognise the start of the subtitle and keep
% the capital on Multimodal.
@article{DBLP:journals/corr/abs-2408-02718,
  author       = {Fanqing Meng and
                  Jin Wang and
                  Chuanhao Li and
                  Quanfeng Lu and
                  Hao Tian and
                  Jiaqi Liao and
                  Xizhou Zhu and
                  Jifeng Dai and
                  Yu Qiao and
                  Ping Luo and
                  Kaipeng Zhang and
                  Wenqi Shao},
  title        = {{MMIU}: Multimodal Multi-image Understanding for Evaluating Large
                  Vision-Language Models},
  journal      = {CoRR},
  volume       = {abs/2408.02718},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2408.02718},
  doi          = {10.48550/ARXIV.2408.02718},
  eprinttype   = {arXiv},
  eprint       = {2408.02718},
  timestamp    = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2408-02718.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2410.08202 (listed under journal CoRR, 2024).
% The model name Mono-InternVL is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2410-08202,
  author       = {Gen Luo and
                  Xue Yang and
                  Wenhan Dou and
                  Zhaokai Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Xizhou Zhu},
  title        = {{Mono-InternVL}: Pushing the Boundaries of Monolithic Multimodal Large
                  Language Models with Endogenous Visual Pre-training},
  journal      = {CoRR},
  volume       = {abs/2410.08202},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2410.08202},
  doi          = {10.48550/ARXIV.2410.08202},
  eprinttype   = {arXiv},
  eprint       = {2410.08202},
  timestamp    = {Mon, 18 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2410-08202.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2410.10267 (listed under journal CoRR, 2024).
% The name big.LITTLE is brace-protected in the title: its mixed casing is
% intentional and sentence-casing styles would otherwise flatten it.
@article{DBLP:journals/corr/abs-2410-10267,
  author       = {He Guo and
                  Yulong Wang and
                  Zixuan Ye and
                  Jifeng Dai and
                  Yuwen Xiong},
  title        = {{big.LITTLE} Vision Transformer for Efficient Visual Recognition},
  journal      = {CoRR},
  volume       = {abs/2410.10267},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2410.10267},
  doi          = {10.48550/ARXIV.2410.10267},
  eprinttype   = {arXiv},
  eprint       = {2410.10267},
  timestamp    = {Mon, 25 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2410-10267.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2410.13861 (listed under journal CoRR, 2024).
% Only the acronym PUMA is braced; the colon sits outside the braces so that
% sentence-casing styles still recognise the start of the subtitle and keep
% the capital on Empowering.
@article{DBLP:journals/corr/abs-2410-13861,
  author       = {Rongyao Fang and
                  Chengqi Duan and
                  Kun Wang and
                  Hao Li and
                  Hao Tian and
                  Xingyu Zeng and
                  Rui Zhao and
                  Jifeng Dai and
                  Hongsheng Li and
                  Xihui Liu},
  title        = {{PUMA}: Empowering Unified {MLLM} with Multi-granular Visual Generation},
  journal      = {CoRR},
  volume       = {abs/2410.13861},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2410.13861},
  doi          = {10.48550/ARXIV.2410.13861},
  eprinttype   = {arXiv},
  eprint       = {2410.13861},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2410-13861.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2410.15959 (listed under journal CoRR, 2024).
% eprinttype/eprint carry the arXiv identifier; url and doi both point at the
% arXiv-issued DOI for the same e-print.
@article{DBLP:journals/corr/abs-2410-15959,
  author       = {Zhi Hou and
                  Tianyi Zhang and
                  Yuwen Xiong and
                  Hengjun Pu and
                  Chengyang Zhao and
                  Ronglei Tong and
                  Yu Qiao and
                  Jifeng Dai and
                  Yuntao Chen},
  title        = {Diffusion Transformer Policy},
  journal      = {CoRR},
  volume       = {abs/2410.15959},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2410.15959},
  doi          = {10.48550/ARXIV.2410.15959},
  eprinttype   = {arXiv},
  eprint       = {2410.15959},
  timestamp    = {Sat, 31 May 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2410-15959.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2410.16261 (listed under journal CoRR, 2024).
% The model name Mini-InternVL is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2410-16261,
  author       = {Zhangwei Gao and
                  Zhe Chen and
                  Erfei Cui and
                  Yiming Ren and
                  Weiyun Wang and
                  Jinguo Zhu and
                  Hao Tian and
                  Shenglong Ye and
                  Junjun He and
                  Xizhou Zhu and
                  Lewei Lu and
                  Tong Lu and
                  Yu Qiao and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {{Mini-InternVL}: {A} Flexible-Transfer Pocket Multimodal Model with
                  5{\%} Parameters and 90{\%} Performance},
  journal      = {CoRR},
  volume       = {abs/2410.16261},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2410.16261},
  doi          = {10.48550/ARXIV.2410.16261},
  eprinttype   = {arXiv},
  eprint       = {2410.16261},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2410-16261.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2410.16707 (listed under journal CoRR, 2024).
% The model name DI-MaskDINO is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2410-16707,
  author       = {Zhixiong Nan and
                  Xianghong Li and
                  Tao Xiang and
                  Jifeng Dai},
  title        = {{DI-MaskDINO}: {A} Joint Object Detection and Instance Segmentation
                  Model},
  journal      = {CoRR},
  volume       = {abs/2410.16707},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2410.16707},
  doi          = {10.48550/ARXIV.2410.16707},
  eprinttype   = {arXiv},
  eprint       = {2410.16707},
  timestamp    = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2410-16707.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2411.10442 (listed under journal CoRR, 2024).
% eprinttype/eprint carry the arXiv identifier; url and doi both point at the
% arXiv-issued DOI for the same e-print.
@article{DBLP:journals/corr/abs-2411-10442,
  author       = {Weiyun Wang and
                  Zhe Chen and
                  Wenhai Wang and
                  Yue Cao and
                  Yangzhou Liu and
                  Zhangwei Gao and
                  Jinguo Zhu and
                  Xizhou Zhu and
                  Lewei Lu and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {Enhancing the Reasoning Ability of Multimodal Large Language Models
                  via Mixed Preference Optimization},
  journal      = {CoRR},
  volume       = {abs/2411.10442},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2411.10442},
  doi          = {10.48550/ARXIV.2411.10442},
  eprinttype   = {arXiv},
  eprint       = {2411.10442},
  timestamp    = {Wed, 01 Jan 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2411-10442.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2412.01271 (listed under journal CoRR, 2024).
% The method name MuLan is brace-protected in the title so that
% sentence-casing bibliography styles keep its capitalization.
@article{DBLP:journals/corr/abs-2412-01271,
  author       = {Sen Xing and
                  Muyan Zhong and
                  Zeqiang Lai and
                  Liangchen Li and
                  Jiawen Liu and
                  Yaohui Wang and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {{MuLan}: Adapting Multilingual Diffusion Models for Hundreds of Languages
                  with Negligible Cost},
  journal      = {CoRR},
  volume       = {abs/2412.01271},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2412.01271},
  doi          = {10.48550/ARXIV.2412.01271},
  eprinttype   = {arXiv},
  eprint       = {2412.01271},
  timestamp    = {Sun, 12 Jan 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2412-01271.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% dblp record for arXiv e-print 2412.01407 (listed under journal CoRR, 2024).
% HoloDrive and 2D-3D are brace-protected in the title so that
% sentence-casing bibliography styles keep their capitalization.
@article{DBLP:journals/corr/abs-2412-01407,
  author       = {Zehuan Wu and
                  Jingcheng Ni and
                  Xiaodong Wang and
                  Yuxin Guo and
                  Rui Chen and
                  Lewei Lu and
                  Jifeng Dai and
                  Yuwen Xiong},
  title        = {{HoloDrive}: Holistic {2D-3D} Multi-Modal Street Scene Generation for
                  Autonomous Driving},
  journal      = {CoRR},
  volume       = {abs/2412.01407},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2412.01407},
  doi          = {10.48550/ARXIV.2412.01407},
  eprinttype   = {arXiv},
  eprint       = {2412.01407},
  timestamp    = {Sun, 12 Jan 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2412-01407.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2412-05271,
  author       = {Chen, Zhe and
                  Wang, Weiyun and
                  Cao, Yue and
                  Liu, Yangzhou and
                  Gao, Zhangwei and
                  Cui, Erfei and
                  Zhu, Jinguo and
                  Ye, Shenglong and
                  Tian, Hao and
                  Liu, Zhaoyang and
                  Gu, Lixin and
                  Wang, Xuehui and
                  Li, Qingyun and
                  Ren, Yimin and
                  Chen, Zixuan and
                  Luo, Jiapeng and
                  Wang, Jiahao and
                  Jiang, Tan and
                  Wang, Bo and
                  He, Conghui and
                  Shi, Botian and
                  Zhang, Xingcheng and
                  Lv, Han and
                  Wang, Yi and
                  Shao, Wenqi and
                  Chu, Pei and
                  Tu, Zhongying and
                  He, Tong and
                  Wu, Zhiyong and
                  Deng, Huipeng and
                  Ge, Jiaye and
                  Chen, Kai and
                  Dou, Min and
                  Lu, Lewei and
                  Zhu, Xizhou and
                  Lu, Tong and
                  Lin, Dahua and
                  Qiao, Yu and
                  Dai, Jifeng and
                  Wang, Wenhai},
  title        = {Expanding Performance Boundaries of Open-Source Multimodal Models
                  with Model, Data, and Test-Time Scaling},
  journal      = {CoRR},
  volume       = {abs/2412.05271},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2412.05271},
  doi          = {10.48550/ARXIV.2412.05271},
  eprinttype   = {arXiv},
  eprint       = {2412.05271},
  timestamp    = {Wed, 03 Jun 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2412-05271.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2412-09604,
  author       = {Hao Li and
                  Changyao Tian and
                  Jie Shao and
                  Xizhou Zhu and
                  Zhaokai Wang and
                  Jinguo Zhu and
                  Wenhan Dou and
                  Xiaogang Wang and
                  Hongsheng Li and
                  Lewei Lu and
                  Jifeng Dai},
  title        = {{SynerGen-VL}: Towards Synergistic Image Understanding and Generation
                  with Vision Experts and Token Folding},
  journal      = {CoRR},
  volume       = {abs/2412.09604},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2412.09604},
  doi          = {10.48550/ARXIV.2412.09604},
  eprinttype   = {arXiv},
  eprint       = {2412.09604},
  timestamp    = {Mon, 20 Jan 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2412-09604.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2412-09613,
  author       = {Chenyu Yang and
                  Xuan Dong and
                  Xizhou Zhu and
                  Weijie Su and
                  Jiahao Wang and
                  Hao Tian and
                  Zhe Chen and
                  Wenhai Wang and
                  Lewei Lu and
                  Jifeng Dai},
  title        = {{PVC}: Progressive Visual Token Compression for Unified Image and
                  Video Processing in Large Vision-Language Models},
  journal      = {CoRR},
  volume       = {abs/2412.09613},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2412.09613},
  doi          = {10.48550/ARXIV.2412.09613},
  eprinttype   = {arXiv},
  eprint       = {2412.09613},
  timestamp    = {Mon, 20 Jan 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2412-09613.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2412-09616,
  author       = {Junqi Ge and
                  Ziyi Chen and
                  Jintao Lin and
                  Jinguo Zhu and
                  Xihui Liu and
                  Jifeng Dai and
                  Xizhou Zhu},
  title        = {{V2PE}: Improving Multimodal Long-Context Capability of Vision-Language
                  Models with Variable Visual Position Encoding},
  journal      = {CoRR},
  volume       = {abs/2412.09616},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2412.09616},
  doi          = {10.48550/ARXIV.2412.09616},
  eprinttype   = {arXiv},
  eprint       = {2412.09616},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2412-09616.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2412-16158,
  author       = {Chenxin Tao and
                  Shiqian Su and
                  Xizhou Zhu and
                  Chenyu Zhang and
                  Zhe Chen and
                  Jiawen Liu and
                  Wenhai Wang and
                  Lewei Lu and
                  Gao Huang and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{HoVLE}: Unleashing the Power of Monolithic Vision-Language Models with
                  Holistic Vision-Language Embedding},
  journal      = {CoRR},
  volume       = {abs/2412.16158},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2412.16158},
  doi          = {10.48550/ARXIV.2412.16158},
  eprinttype   = {arXiv},
  eprint       = {2412.16158},
  timestamp    = {Thu, 23 Jan 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2412-16158.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/ShiHLZCSQDL23,
  author       = {Xiaoyu Shi and
                  Zhaoyang Huang and
                  Dasong Li and
                  Manyuan Zhang and
                  Ka Chun Cheung and
                  Simon See and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{FlowFormer++}: Masked Cost Volume Autoencoding for Pretraining Optical
                  Flow Estimation},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {1599--1610},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.00160},
  doi          = {10.1109/CVPR52729.2023.00160},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/ShiHLZCSQDL23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/TaoZ0HLZ00D23,
  author       = {Tao, Chenxin and
                  Zhu, Xizhou and
                  Su, Weijie and
                  Huang, Gao and
                  Li, Bin and
                  Zhou, Jie and
                  Qiao, Yu and
                  Wang, Xiaogang and
                  Dai, Jifeng},
  title        = {Siamese Image Modeling for Self-Supervised Vision Representation Learning},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {2132--2141},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.00212},
  doi          = {10.1109/CVPR52729.2023.00212},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/TaoZ0HLZ00D23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/LiZJZLYWQWWD23,
  author       = {Hao Li and
                  Jinguo Zhu and
                  Xiaohu Jiang and
                  Xizhou Zhu and
                  Hongsheng Li and
                  Chun Yuan and
                  Xiaohua Wang and
                  Yu Qiao and
                  Xiaogang Wang and
                  Wenhai Wang and
                  Jifeng Dai},
  title        = {{Uni-Perceiver} v2: {A} Generalist Model for Large-Scale Vision and
                  Vision-Language Tasks},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {2691--2700},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.00264},
  doi          = {10.1109/CVPR52729.2023.00264},
  timestamp    = {Thu, 29 Jan 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/LiZJZLYWQWWD23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/WangDCHLZHLLLWQ23,
  author       = {Wenhai Wang and
                  Jifeng Dai and
                  Zhe Chen and
                  Zhenhang Huang and
                  Zhiqi Li and
                  Xizhou Zhu and
                  Xiaowei Hu and
                  Tong Lu and
                  Lewei Lu and
                  Hongsheng Li and
                  Xiaogang Wang and
                  Yu Qiao},
  title        = {{InternImage}: Exploring Large-Scale Vision Foundation Models with Deformable
                  Convolutions},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {14408--14419},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.01385},
  doi          = {10.1109/CVPR52729.2023.01385},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/WangDCHLZHLLLWQ23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/0002ZTLLHQWZD23,
  author       = {Su, Weijie and
                  Zhu, Xizhou and
                  Tao, Chenxin and
                  Lu, Lewei and
                  Li, Bin and
                  Huang, Gao and
                  Qiao, Yu and
                  Wang, Xiaogang and
                  Zhou, Jie and
                  Dai, Jifeng},
  title        = {Towards All-in-One Pre-Training via Maximizing Multi-Modal Mutual
                  Information},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {15888--15899},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.01525},
  doi          = {10.1109/CVPR52729.2023.01525},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/0002ZTLLHQWZD23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/YangCTTZZHLQLZD23,
  author       = {Chenyu Yang and
                  Yuntao Chen and
                  Hao Tian and
                  Chenxin Tao and
                  Xizhou Zhu and
                  Zhaoxiang Zhang and
                  Gao Huang and
                  Hongyang Li and
                  Yu Qiao and
                  Lewei Lu and
                  Jie Zhou and
                  Jifeng Dai},
  title        = {{BEVFormer} v2: Adapting Modern Image Backbones to Bird's-Eye-View Recognition
                  via Perspective Supervision},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {17830--17839},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.01710},
  doi          = {10.1109/CVPR52729.2023.01710},
  timestamp    = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/YangCTTZZHLQLZD23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/HuYCLSZCDLWLJLD23,
  author       = {Hu, Yihan and
                  Yang, Jiazhi and
                  Chen, Li and
                  Li, Keyu and
                  Sima, Chonghao and
                  Zhu, Xizhou and
                  Chai, Siqi and
                  Du, Senyao and
                  Lin, Tianwei and
                  Wang, Wenhai and
                  Lu, Lewei and
                  Jia, Xiaosong and
                  Liu, Qiang and
                  Dai, Jifeng and
                  Qiao, Yu and
                  Li, Hongyang},
  title        = {Planning-oriented Autonomous Driving},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {17853--17862},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.01712},
  doi          = {10.1109/CVPR52729.2023.01712},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/HuYCLSZCDLWLJLD23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/XuH00D0H23,
  author       = {Xu, Jiaqi and
                  Hu, Xiaowei and
                  Zhu, Lei and
                  Dou, Qi and
                  Dai, Jifeng and
                  Qiao, Yu and
                  Heng, Pheng{-}Ann},
  title        = {Video Dehazing via a Multi-Range Temporal Alignment Network with Physical
                  Prior},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {18053--18062},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.01731},
  doi          = {10.1109/CVPR52729.2023.01731},
  timestamp    = {Mon, 03 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/XuH00D0H23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/cvpr/ZhuWFYGDQH23,
  author       = {Zhu, Yurui and
                  Wang, Tianyu and
                  Fu, Xueyang and
                  Yang, Xuanyu and
                  Guo, Xin and
                  Dai, Jifeng and
                  Qiao, Yu and
                  Hu, Xiaowei},
  title        = {Learning Weather-General and Weather-Specific Features for Image Restoration
                  Under Multiple Adverse Weather Conditions},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {21747--21758},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.02083},
  doi          = {10.1109/CVPR52729.2023.02083},
  timestamp    = {Wed, 17 Sep 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/ZhuWFYGDQH23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/iccv/ShiHBLZCSQDL23,
  author       = {Xiaoyu Shi and
                  Zhaoyang Huang and
                  Weikang Bian and
                  Dasong Li and
                  Manyuan Zhang and
                  Ka Chun Cheung and
                  Simon See and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{VideoFlow}: Exploiting Temporal Cues for Multi-frame Optical Flow Estimation},
  booktitle    = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2023,
                  Paris, France, October 1-6, 2023},
  pages        = {12435--12446},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/ICCV51070.2023.01146},
  doi          = {10.1109/ICCV51070.2023.01146},
  timestamp    = {Mon, 03 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/iccv/ShiHBLZCSQDL23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/iclr/ChenDWHLDQ23,
  author       = {Chen, Zhe and
                  Duan, Yuchen and
                  Wang, Wenhai and
                  He, Junjun and
                  Lu, Tong and
                  Dai, Jifeng and
                  Qiao, Yu},
  title        = {Vision Transformer Adapter for Dense Predictions},
  booktitle    = {The Eleventh International Conference on Learning Representations,
                  {ICLR} 2023, Kigali, Rwanda, May 1-5, 2023},
  publisher    = {OpenReview.net},
  year         = {2023},
  url          = {https://openreview.net/forum?id=plKu2GByCNW},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/iclr/ChenDWHLDQ23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/MuZHWDJWDQL23,
  author       = {Yao Mu and
                  Qinglong Zhang and
                  Mengkang Hu and
                  Wenhai Wang and
                  Mingyu Ding and
                  Jun Jin and
                  Bin Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Ping Luo},
  editor       = {Alice Oh and
                  Tristan Naumann and
                  Amir Globerson and
                  Kate Saenko and
                  Moritz Hardt and
                  Sergey Levine},
  title        = {{EmbodiedGPT}: Vision-Language Pre-Training via Embodied Chain of Thought},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/4ec43957eda1126ad4887995d05fae3b-Abstract-Conference.html},
  timestamp    = {Fri, 30 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/MuZHWDJWDQL23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/SunPGLDWZZQWDQW23,
  author       = {Keqiang Sun and
                  Junting Pan and
                  Yuying Ge and
                  Hao Li and
                  Haodong Duan and
                  Xiaoshi Wu and
                  Renrui Zhang and
                  Aojun Zhou and
                  Zipeng Qin and
                  Yi Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Limin Wang and
                  Hongsheng Li},
  editor       = {Alice Oh and
                  Tristan Naumann and
                  Amir Globerson and
                  Kate Saenko and
                  Moritz Hardt and
                  Sergey Levine},
  title        = {{JourneyDB}: {A} Benchmark for Generative Image Understanding},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/9bc59aff4685e39e1a8175d5303248a1-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp    = {Fri, 27 Dec 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/SunPGLDWZZQWDQW23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/nips/WangCCWZZLLZQD23,
  author       = {Wenhai Wang and
                  Zhe Chen and
                  Xiaokang Chen and
                  Jiannan Wu and
                  Xizhou Zhu and
                  Gang Zeng and
                  Ping Luo and
                  Tong Lu and
                  Jie Zhou and
                  Yu Qiao and
                  Jifeng Dai},
  editor       = {Alice Oh and
                  Tristan Naumann and
                  Amir Globerson and
                  Kate Saenko and
                  Moritz Hardt and
                  Sergey Levine},
  title        = {{VisionLLM}: Large Language Model is also an Open-Ended Decoder for
                  Vision-Centric Tasks},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/c1f7b1ed763e9c75e4db74b49b76db5f-Abstract-Conference.html},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/WangCCWZZLLZQD23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2303-01237,
  author       = {Xiaoyu Shi and
                  Zhaoyang Huang and
                  Dasong Li and
                  Manyuan Zhang and
                  Ka Chun Cheung and
                  Simon See and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{FlowFormer++}: Masked Cost Volume Autoencoding for Pretraining Optical
                  Flow Estimation},
  journal      = {CoRR},
  volume       = {abs/2303.01237},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2303.01237},
  doi          = {10.48550/ARXIV.2303.01237},
  eprinttype   = {arXiv},
  eprint       = {2303.01237},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2303-01237.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2303-01503,
  author       = {Rongyao Fang and
                  Peng Gao and
                  Aojun Zhou and
                  Yingjie Cai and
                  Si Liu and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{FeatAug-DETR}: Enriching One-to-Many Matching for {DETRs} with Feature
                  Augmentation},
  journal      = {CoRR},
  volume       = {abs/2303.01503},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2303.01503},
  doi          = {10.48550/ARXIV.2303.01503},
  eprinttype   = {arXiv},
  eprint       = {2303.01503},
  timestamp    = {Fri, 10 Nov 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2303-01503.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2303-08340,
  author       = {Xiaoyu Shi and
                  Zhaoyang Huang and
                  Weikang Bian and
                  Dasong Li and
                  Manyuan Zhang and
                  Ka Chun Cheung and
                  Simon See and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{VideoFlow}: Exploiting Temporal Cues for Multi-frame Optical Flow Estimation},
  journal      = {CoRR},
  volume       = {abs/2303.08340},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2303.08340},
  doi          = {10.48550/ARXIV.2303.08340},
  eprinttype   = {arXiv},
  eprint       = {2303.08340},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2303-08340.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2303-09757,
  author       = {Xu, Jiaqi and
                  Hu, Xiaowei and
                  Zhu, Lei and
                  Dou, Qi and
                  Dai, Jifeng and
                  Qiao, Yu and
                  Heng, Pheng{-}Ann},
  title        = {Video Dehazing via a Multi-Range Temporal Alignment Network with Physical
                  Prior},
  journal      = {CoRR},
  volume       = {abs/2303.09757},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2303.09757},
  doi          = {10.48550/ARXIV.2303.09757},
  eprinttype   = {arXiv},
  eprint       = {2303.09757},
  timestamp    = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2303-09757.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2305-05662,
  author       = {Zhaoyang Liu and
                  Yinan He and
                  Wenhai Wang and
                  Weiyun Wang and
                  Yi Wang and
                  Shoufa Chen and
                  Qinglong Zhang and
                  Zeqiang Lai and
                  Yang Yang and
                  Qingyun Li and
                  Jiashuo Yu and
                  Kunchang Li and
                  Zhe Chen and
                  Xue Yang and
                  Xizhou Zhu and
                  Yali Wang and
                  Limin Wang and
                  Ping Luo and
                  Jifeng Dai and
                  Yu Qiao},
  title        = {{InternGPT}: Solving Vision-Centric Tasks by Interacting with Chatbots
                  Beyond Language},
  journal      = {CoRR},
  volume       = {abs/2305.05662},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2305.05662},
  doi          = {10.48550/ARXIV.2305.05662},
  eprinttype   = {arXiv},
  eprint       = {2305.05662},
  timestamp    = {Fri, 27 Dec 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2305-05662.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2305-11175,
  author       = {Wenhai Wang and
                  Zhe Chen and
                  Xiaokang Chen and
                  Jiannan Wu and
                  Xizhou Zhu and
                  Gang Zeng and
                  Ping Luo and
                  Tong Lu and
                  Jie Zhou and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{VisionLLM}: Large Language Model is also an Open-Ended Decoder for
                  Vision-Centric Tasks},
  journal      = {CoRR},
  volume       = {abs/2305.11175},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2305.11175},
  doi          = {10.48550/ARXIV.2305.11175},
  eprinttype   = {arXiv},
  eprint       = {2305.11175},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2305-11175.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2305-15021,
  author       = {Yao Mu and
                  Qinglong Zhang and
                  Mengkang Hu and
                  Wenhai Wang and
                  Mingyu Ding and
                  Jun Jin and
                  Bin Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Ping Luo},
  title        = {{EmbodiedGPT}: Vision-Language Pre-Training via Embodied Chain of Thought},
  journal      = {CoRR},
  volume       = {abs/2305.15021},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2305.15021},
  doi          = {10.48550/ARXIV.2305.15021},
  eprinttype   = {arXiv},
  eprint       = {2305.15021},
  timestamp    = {Fri, 30 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2305-15021.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2305-17144,
  author       = {Xizhou Zhu and
                  Yuntao Chen and
                  Hao Tian and
                  Chenxin Tao and
                  Weijie Su and
                  Chenyu Yang and
                  Gao Huang and
                  Bin Li and
                  Lewei Lu and
                  Xiaogang Wang and
                  Yu Qiao and
                  Zhaoxiang Zhang and
                  Jifeng Dai},
  title        = {Ghost in the {Minecraft}: Generally Capable Agents for Open-World Environments
                  via Large Language Models with Text-based Knowledge and Memory},
  journal      = {CoRR},
  volume       = {abs/2305.17144},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2305.17144},
  doi          = {10.48550/ARXIV.2305.17144},
  eprinttype   = {arXiv},
  eprint       = {2305.17144},
  timestamp    = {Sat, 06 Sep 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2305-17144.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2306-01721,
  author       = {Lai, Zeqiang and
                  Duan, Yuchen and
                  Dai, Jifeng and
                  Li, Ziheng and
                  Fu, Ying and
                  Li, Hongsheng and
                  Qiao, Yu and
                  Wang, Wenhai},
  title        = {Denoising Diffusion Semantic Segmentation with Mask Prior Modeling},
  journal      = {CoRR},
  volume       = {abs/2306.01721},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2306.01721},
  doi          = {10.48550/ARXIV.2306.01721},
  eprinttype   = {arXiv},
  eprint       = {2306.01721},
  timestamp    = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2306-01721.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2306-05423,
  author       = {Changyao Tian and
                  Chenxin Tao and
                  Jifeng Dai and
                  Hao Li and
                  Ziheng Li and
                  Lewei Lu and
                  Xiaogang Wang and
                  Hongsheng Li and
                  Gao Huang and
                  Xizhou Zhu},
  title        = {{ADDP}: Learning General Representations for Image Recognition and
                  Generation with Alternating Denoising Diffusion Process},
  journal      = {CoRR},
  volume       = {abs/2306.05423},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2306.05423},
  doi          = {10.48550/ARXIV.2306.05423},
  eprinttype   = {arXiv},
  eprint       = {2306.05423},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2306-05423.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2306-05442,
  author       = {Zhaoyang Huang and
                  Xiaoyu Shi and
                  Chao Zhang and
                  Qiang Wang and
                  Yijin Li and
                  Hongwei Qin and
                  Jifeng Dai and
                  Xiaogang Wang and
                  Hongsheng Li},
  title        = {{FlowFormer}: {A} Transformer Architecture and Its Masked Cost Volume
                  Autoencoding for Optical Flow},
  journal      = {CoRR},
  volume       = {abs/2306.05442},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2306.05442},
  doi          = {10.48550/ARXIV.2306.05442},
  eprinttype   = {arXiv},
  eprint       = {2306.05442},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2306-05442.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2307.00716 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2307-00716,
  author     = {Pan, Junting and
                Sun, Keqiang and
                Ge, Yuying and
                Li, Hao and
                Duan, Haodong and
                Wu, Xiaoshi and
                Zhang, Renrui and
                Zhou, Aojun and
                Qin, Zipeng and
                Wang, Yi and
                Dai, Jifeng and
                Qiao, Yu and
                Wang, Limin and
                Li, Hongsheng},
  title      = {JourneyDB: {A} Benchmark for Generative Image Understanding},
  journal    = {CoRR},
  volume     = {abs/2307.00716},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2307.00716},
  doi        = {10.48550/ARXIV.2307.00716},
  eprinttype = {arXiv},
  eprint     = {2307.00716},
  timestamp  = {Sat, 14 Jun 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2307-00716.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2308.01907 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2308-01907,
  author     = {Wang, Weiyun and
                Shi, Min and
                Li, Qingyun and
                Wang, Wenhai and
                Huang, Zhenhang and
                Xing, Linjie and
                Chen, Zhe and
                Li, Hao and
                Zhu, Xizhou and
                Cao, Zhiguo and
                Chen, Yushi and
                Lu, Tong and
                Dai, Jifeng and
                Qiao, Yu},
  title      = {The All-Seeing Project: Towards Panoptic Visual Recognition and Understanding of the Open World},
  journal    = {CoRR},
  volume     = {abs/2308.01907},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2308.01907},
  doi        = {10.48550/ARXIV.2308.01907},
  eprinttype = {arXiv},
  eprint     = {2308.01907},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2308-01907.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2310.07653 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2310-07653,
  author     = {Lai, Zeqiang and
                Zhu, Xizhou and
                Dai, Jifeng and
                Qiao, Yu and
                Wang, Wenhai},
  title      = {Mini-DALLE3: Interactive Text to Image by Prompting Large Language Models},
  journal    = {CoRR},
  volume     = {abs/2310.07653},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2310.07653},
  doi        = {10.48550/ARXIV.2310.07653},
  eprinttype = {arXiv},
  eprint     = {2310.07653},
  timestamp  = {Wed, 24 Jul 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2310-07653.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2310.17796 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2310-17796,
  author     = {Liu, Zhaoyang and
                Lai, Zeqiang and
                Gao, Zhangwei and
                Cui, Erfei and
                Li, Zhiheng and
                Zhu, Xizhou and
                Lu, Lewei and
                Chen, Qifeng and
                Qiao, Yu and
                Dai, Jifeng and
                Wang, Wenhai},
  title      = {ControlLLM: Augment Language Models with Tools by Searching on Graphs},
  journal    = {CoRR},
  volume     = {abs/2310.17796},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2310.17796},
  doi        = {10.48550/ARXIV.2310.17796},
  eprinttype = {arXiv},
  eprint     = {2310.17796},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2310-17796.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2311.14758 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2311-14758,
  author     = {Yi, Yu and
                Yang, Xue and
                Li, Qingyun and
                Da, Feipeng and
                Yan, Junchi and
                Dai, Jifeng and
                Qiao, Yu},
  title      = {Point2RBox: Combine Knowledge from Synthetic Visual Patterns for End-to-end Oriented Object Detection with Single Point Supervision},
  journal    = {CoRR},
  volume     = {abs/2311.14758},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2311.14758},
  doi        = {10.48550/ARXIV.2311.14758},
  eprinttype = {arXiv},
  eprint     = {2311.14758},
  timestamp  = {Mon, 22 Jul 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2311-14758.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2311.18835 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2311-18835,
  author     = {Fang, Rongyao and
                Yan, Shilin and
                Huang, Zhaoyang and
                Zhou, Jingqiu and
                Tian, Hao and
                Dai, Jifeng and
                Li, Hongsheng},
  title      = {InstructSeq: Unifying Vision Tasks with Instruction-conditioned Multi-modal Sequence Generation},
  journal    = {CoRR},
  volume     = {abs/2311.18835},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2311.18835},
  doi        = {10.48550/ARXIV.2311.18835},
  eprinttype = {arXiv},
  eprint     = {2311.18835},
  timestamp  = {Wed, 20 Nov 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2311-18835.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2312.09238 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2312-09238,
  author     = {Li, Hao and
                Yang, Xue and
                Wang, Zhaokai and
                Zhu, Xizhou and
                Zhou, Jie and
                Qiao, Yu and
                Wang, Xiaogang and
                Li, Hongsheng and
                Lu, Lewei and
                Dai, Jifeng},
  title      = {Auto MC-Reward: Automated Dense Reward Design with Large Language Models for Minecraft},
  journal    = {CoRR},
  volume     = {abs/2312.09238},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2312.09238},
  doi        = {10.48550/ARXIV.2312.09238},
  eprinttype = {arXiv},
  eprint     = {2312.09238},
  timestamp  = {Tue, 06 Aug 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2312-09238.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2312.09245 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2312-09245,
  author     = {Wang, Wenhai and
                Xie, Jiangwei and
                Hu, Chuanyang and
                Zou, Haoming and
                Fan, Jianan and
                Tong, Wenwen and
                Wen, Yang and
                Wu, Silei and
                Deng, Hanming and
                Li, Zhiqi and
                Tian, Hao and
                Lu, Lewei and
                Zhu, Xizhou and
                Wang, Xiaogang and
                Qiao, Yu and
                Dai, Jifeng},
  title      = {DriveMLM: Aligning Multi-Modal Large Language Models with Behavioral Planning States for Autonomous Driving},
  journal    = {CoRR},
  volume     = {abs/2312.09245},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2312.09245},
  doi        = {10.48550/ARXIV.2312.09245},
  eprinttype = {arXiv},
  eprint     = {2312.09245},
  timestamp  = {Tue, 19 Nov 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2312-09245.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2312.11562 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2312-11562,
  author     = {Sun, Jiankai and
                Zheng, Chuanyang and
                Xie, Enze and
                Liu, Zhengying and
                Chu, Ruihang and
                Qiu, Jianing and
                Xu, Jiaqi and
                Ding, Mingyu and
                Li, Hongyang and
                Geng, Mengzhe and
                Wu, Yue and
                Wang, Wenhai and
                Chen, Junsong and
                Yin, Zhangyue and
                Ren, Xiaozhe and
                Fu, Jie and
                He, Junxian and
                Yuan, Wu and
                Liu, Qi and
                Liu, Xihui and
                Li, Yu and
                Dong, Hao and
                Cheng, Yu and
                Zhang, Ming and
                Heng, Pheng{-}Ann and
                Dai, Jifeng and
                Luo, Ping and
                Wang, Jingdong and
                Wen, Ji{-}Rong and
                Qiu, Xipeng and
                Guo, Yike and
                Xiong, Hui and
                Liu, Qun and
                Li, Zhenguo},
  title      = {A Survey of Reasoning with Foundation Models},
  journal    = {CoRR},
  volume     = {abs/2312.11562},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2312.11562},
  doi        = {10.48550/ARXIV.2312.11562},
  eprinttype = {arXiv},
  eprint     = {2312.11562},
  timestamp  = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2312-11562.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2312.14238 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2312-14238,
  author     = {Chen, Zhe and
                Wu, Jiannan and
                Wang, Wenhai and
                Su, Weijie and
                Chen, Guo and
                Xing, Sen and
                Zhong, Muyan and
                Zhang, Qinglong and
                Zhu, Xizhou and
                Lu, Lewei and
                Li, Bin and
                Luo, Ping and
                Lu, Tong and
                Qiao, Yu and
                Dai, Jifeng},
  title      = {InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks},
  journal    = {CoRR},
  volume     = {abs/2312.14238},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2312.14238},
  doi        = {10.48550/ARXIV.2312.14238},
  eprinttype = {arXiv},
  eprint     = {2312.14238},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2312-14238.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% CVPR 2022 conference paper.
@inproceedings{DBLP:conf/cvpr/LiFDLHZ22,
  author     = {Li, Hao and
                Fu, Tianwen and
                Dai, Jifeng and
                Li, Hongsheng and
                Huang, Gao and
                Zhu, Xizhou},
  title      = {AutoLoss-Zero: Searching Loss Functions from Scratch for Generic Tasks},
  booktitle  = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                {CVPR} 2022, New Orleans, LA, USA, June 18-24, 2022},
  pages      = {999--1008},
  publisher  = {{IEEE}},
  year       = {2022},
  url        = {https://doi.org/10.1109/CVPR52688.2022.00108},
  doi        = {10.1109/CVPR52688.2022.00108},
  timestamp  = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/cvpr/LiFDLHZ22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% CVPR 2022 conference paper.
@inproceedings{DBLP:conf/cvpr/TaoWZDSHD22,
  author     = {Tao, Chenxin and
                Wang, Honghui and
                Zhu, Xizhou and
                Dong, Jiahua and
                Song, Shiji and
                Huang, Gao and
                Dai, Jifeng},
  title      = {Exploring the Equivalence of Siamese Self-Supervised Learning via {A} Unified Gradient Framework},
  booktitle  = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                {CVPR} 2022, New Orleans, LA, USA, June 18-24, 2022},
  pages      = {14411--14420},
  publisher  = {{IEEE}},
  year       = {2022},
  url        = {https://doi.org/10.1109/CVPR52688.2022.01403},
  doi        = {10.1109/CVPR52688.2022.01403},
  timestamp  = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/cvpr/TaoWZDSHD22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% CVPR 2022 conference paper.
@inproceedings{DBLP:conf/cvpr/ZhuZLWLWD22,
  author     = {Zhu, Xizhou and
                Zhu, Jinguo and
                Li, Hao and
                Wu, Xiaoshi and
                Li, Hongsheng and
                Wang, Xiaohua and
                Dai, Jifeng},
  title      = {Uni-Perceiver: Pre-training Unified Architecture for Generic Perception for Zero-shot and Few-shot Tasks},
  booktitle  = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                {CVPR} 2022, New Orleans, LA, USA, June 18-24, 2022},
  pages      = {16783--16794},
  publisher  = {{IEEE}},
  year       = {2022},
  url        = {https://doi.org/10.1109/CVPR52688.2022.01630},
  doi        = {10.1109/CVPR52688.2022.01630},
  timestamp  = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/cvpr/ZhuZLWLWD22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% ECCV 2022 conference paper, Part IX of the Springer LNCS proceedings.
% The series field needs its volume number to render a complete LNCS
% citation; 13669 is the LNCS volume for Part IX (verify against Springer).
@inproceedings{DBLP:conf/eccv/LiWLXSLQD22,
  author     = {Li, Zhiqi and
                Wang, Wenhai and
                Li, Hongyang and
                Xie, Enze and
                Sima, Chonghao and
                Lu, Tong and
                Qiao, Yu and
                Dai, Jifeng},
  editor     = {Avidan, Shai and
                Brostow, Gabriel J. and
                Ciss{\'{e}}, Moustapha and
                Farinella, Giovanni Maria and
                Hassner, Tal},
  title      = {BEVFormer: Learning Bird's-Eye-View Representation from Multi-camera Images via Spatiotemporal Transformers},
  booktitle  = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                Israel, October 23-27, 2022, Proceedings, Part {IX}},
  series     = {Lecture Notes in Computer Science},
  volume     = {13669},
  pages      = {1--18},
  publisher  = {Springer},
  year       = {2022},
  url        = {https://doi.org/10.1007/978-3-031-20077-9\_1},
  doi        = {10.1007/978-3-031-20077-9\_1},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/eccv/LiWLXSLQD22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% ECCV 2022 conference paper, Part XXV of the Springer LNCS proceedings.
% The series field needs its volume number to render a complete LNCS
% citation; 13685 is the LNCS volume for Part XXV (verify against Springer).
@inproceedings{DBLP:conf/eccv/TianWZDQ22,
  author     = {Tian, Changyao and
                Wang, Wenhai and
                Zhu, Xizhou and
                Dai, Jifeng and
                Qiao, Yu},
  editor     = {Avidan, Shai and
                Brostow, Gabriel J. and
                Ciss{\'{e}}, Moustapha and
                Farinella, Giovanni Maria and
                Hassner, Tal},
  title      = {{VL-LTR:} Learning Class-wise Visual-Linguistic Representation for Long-Tailed Visual Recognition},
  booktitle  = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                Israel, October 23-27, 2022, Proceedings, Part {XXV}},
  series     = {Lecture Notes in Computer Science},
  volume     = {13685},
  pages      = {73--91},
  publisher  = {Springer},
  year       = {2022},
  url        = {https://doi.org/10.1007/978-3-031-19806-9\_5},
  doi        = {10.1007/978-3-031-19806-9\_5},
  timestamp  = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/eccv/TianWZDQ22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% ECCV 2022 conference paper, Part XXXV of the Springer LNCS proceedings.
% The series field needs its volume number to render a complete LNCS
% citation; 13695 is the LNCS volume for Part XXXV (verify against Springer).
@inproceedings{DBLP:conf/eccv/LinGZGMWDQL22,
  author     = {Lin, Ziyi and
                Geng, Shijie and
                Zhang, Renrui and
                Gao, Peng and
                de Melo, Gerard and
                Wang, Xiaogang and
                Dai, Jifeng and
                Qiao, Yu and
                Li, Hongsheng},
  editor     = {Avidan, Shai and
                Brostow, Gabriel J. and
                Ciss{\'{e}}, Moustapha and
                Farinella, Giovanni Maria and
                Hassner, Tal},
  title      = {Frozen {CLIP} Models are Efficient Video Learners},
  booktitle  = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                Israel, October 23-27, 2022, Proceedings, Part {XXXV}},
  series     = {Lecture Notes in Computer Science},
  volume     = {13695},
  pages      = {388--404},
  publisher  = {Springer},
  year       = {2022},
  url        = {https://doi.org/10.1007/978-3-031-19833-5\_23},
  doi        = {10.1007/978-3-031-19833-5\_23},
  timestamp  = {Mon, 14 Apr 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/eccv/LinGZGMWDQL22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% ECCV 2022 conference paper, Part XXXV of the Springer LNCS proceedings.
% The series field needs its volume number to render a complete LNCS
% citation; 13695 is the LNCS volume for Part XXXV (verify against Springer).
@inproceedings{DBLP:conf/eccv/ZhangZFGLDQL22,
  author     = {Zhang, Renrui and
                Zhang, Wei and
                Fang, Rongyao and
                Gao, Peng and
                Li, Kunchang and
                Dai, Jifeng and
                Qiao, Yu and
                Li, Hongsheng},
  editor     = {Avidan, Shai and
                Brostow, Gabriel J. and
                Ciss{\'{e}}, Moustapha and
                Farinella, Giovanni Maria and
                Hassner, Tal},
  title      = {Tip-Adapter: Training-Free Adaption of {CLIP} for Few-Shot Classification},
  booktitle  = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                Israel, October 23-27, 2022, Proceedings, Part {XXXV}},
  series     = {Lecture Notes in Computer Science},
  volume     = {13695},
  pages      = {493--510},
  publisher  = {Springer},
  year       = {2022},
  url        = {https://doi.org/10.1007/978-3-031-19833-5\_29},
  doi        = {10.1007/978-3-031-19833-5\_29},
  timestamp  = {Thu, 12 Feb 2026 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/eccv/ZhangZFGLDQL22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% ECCV 2022 conference paper, Part XVII of the Springer LNCS proceedings.
% The series field needs its volume number to render a complete LNCS
% citation; 13677 is the LNCS volume for Part XVII (verify against Springer).
@inproceedings{DBLP:conf/eccv/HuangSZWCQDL22,
  author     = {Huang, Zhaoyang and
                Shi, Xiaoyu and
                Zhang, Chao and
                Wang, Qiang and
                Cheung, Ka Chun and
                Qin, Hongwei and
                Dai, Jifeng and
                Li, Hongsheng},
  editor     = {Avidan, Shai and
                Brostow, Gabriel J. and
                Ciss{\'{e}}, Moustapha and
                Farinella, Giovanni Maria and
                Hassner, Tal},
  title      = {FlowFormer: {A} Transformer Architecture for Optical Flow},
  booktitle  = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                Israel, October 23-27, 2022, Proceedings, Part {XVII}},
  series     = {Lecture Notes in Computer Science},
  volume     = {13677},
  pages      = {668--685},
  publisher  = {Springer},
  year       = {2022},
  url        = {https://doi.org/10.1007/978-3-031-19790-1\_40},
  doi        = {10.1007/978-3-031-19790-1\_40},
  timestamp  = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/eccv/HuangSZWCQDL22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% NeurIPS 2022 conference paper.
@inproceedings{DBLP:conf/nips/GaoMLLDQ22,
  author     = {Gao, Peng and
                Ma, Teli and
                Li, Hongsheng and
                Lin, Ziyi and
                Dai, Jifeng and
                Qiao, Yu},
  editor     = {Koyejo, Sanmi and
                Mohamed, S. and
                Agarwal, A. and
                Belgrave, Danielle and
                Cho, K. and
                Oh, A.},
  title      = {{MCMAE:} Masked Convolution Meets Masked Autoencoders},
  booktitle  = {Advances in Neural Information Processing Systems 35: Annual Conference
                on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans,
                LA, USA, November 28 - December 9, 2022},
  year       = {2022},
  url        = {http://papers.nips.cc/paper\_files/paper/2022/hash/e7938ede51225b490bb69f7b361a9259-Abstract-Conference.html},
  timestamp  = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/nips/GaoMLLDQ22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% NeurIPS 2022 conference paper.
@inproceedings{DBLP:conf/nips/ZhuZWWLWD22,
  author     = {Zhu, Jinguo and
                Zhu, Xizhou and
                Wang, Wenhai and
                Wang, Xiaohua and
                Li, Hongsheng and
                Wang, Xiaogang and
                Dai, Jifeng},
  editor     = {Koyejo, Sanmi and
                Mohamed, S. and
                Agarwal, A. and
                Belgrave, Danielle and
                Cho, K. and
                Oh, A.},
  title      = {Uni-Perceiver-MoE: Learning Sparse Generalist Models with Conditional MoEs},
  booktitle  = {Advances in Neural Information Processing Systems 35: Annual Conference
                on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans,
                LA, USA, November 28 - December 9, 2022},
  year       = {2022},
  url        = {http://papers.nips.cc/paper\_files/paper/2022/hash/11fc8c98b46d4cbdfe8157267228f7d7-Abstract-Conference.html},
  timestamp  = {Mon, 08 Jan 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/nips/ZhuZWWLWD22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2203.16194 (CoRR, 2022).
@article{DBLP:journals/corr/abs-2203-16194,
  author     = {Huang, Zhaoyang and
                Shi, Xiaoyu and
                Zhang, Chao and
                Wang, Qiang and
                Cheung, Ka Chun and
                Qin, Hongwei and
                Dai, Jifeng and
                Li, Hongsheng},
  title      = {FlowFormer: {A} Transformer Architecture for Optical Flow},
  journal    = {CoRR},
  volume     = {abs/2203.16194},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2203.16194},
  doi        = {10.48550/ARXIV.2203.16194},
  eprinttype = {arXiv},
  eprint     = {2203.16194},
  timestamp  = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2203-16194.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2203.17270 (CoRR, 2022); same title and author list as the
% ECCV 2022 record DBLP:conf/eccv/LiWLXSLQD22.
% The seventh author is written family-name-first (Qiao, Yu) so it parses as
% the same person as "Yu Qiao" in the ECCV record and in the rest of this file.
@article{DBLP:journals/corr/abs-2203-17270,
  author     = {Li, Zhiqi and
                Wang, Wenhai and
                Li, Hongyang and
                Xie, Enze and
                Sima, Chonghao and
                Lu, Tong and
                Qiao, Yu and
                Dai, Jifeng},
  title      = {BEVFormer: Learning Bird's-Eye-View Representation from Multi-Camera Images via Spatiotemporal Transformers},
  journal    = {CoRR},
  volume     = {abs/2203.17270},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2203.17270},
  doi        = {10.48550/ARXIV.2203.17270},
  eprinttype = {arXiv},
  eprint     = {2203.17270},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2203-17270.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2205.03892 (CoRR, 2022).
@article{DBLP:journals/corr/abs-2205-03892,
  author     = {Gao, Peng and
                Ma, Teli and
                Li, Hongsheng and
                Lin, Ziyi and
                Dai, Jifeng and
                Qiao, Yu},
  title      = {ConvMAE: Masked Convolution Meets Masked Autoencoders},
  journal    = {CoRR},
  volume     = {abs/2205.03892},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2205.03892},
  doi        = {10.48550/ARXIV.2205.03892},
  eprinttype = {arXiv},
  eprint     = {2205.03892},
  timestamp  = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2205-03892.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2205.08534 (CoRR, 2022).
@article{DBLP:journals/corr/abs-2205-08534,
  author     = {Chen, Zhe and
                Duan, Yuchen and
                Wang, Wenhai and
                He, Junjun and
                Lu, Tong and
                Dai, Jifeng and
                Qiao, Yu},
  title      = {Vision Transformer Adapter for Dense Predictions},
  journal    = {CoRR},
  volume     = {abs/2205.08534},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2205.08534},
  doi        = {10.48550/ARXIV.2205.08534},
  eprinttype = {arXiv},
  eprint     = {2205.08534},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2205-08534.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2206.01204 (CoRR, 2022).
@article{DBLP:journals/corr/abs-2206-01204,
  author     = {Tao, Chenxin and
                Zhu, Xizhou and
                Huang, Gao and
                Qiao, Yu and
                Wang, Xiaogang and
                Dai, Jifeng},
  title      = {Siamese Image Modeling for Self-Supervised Vision Representation Learning},
  journal    = {CoRR},
  volume     = {abs/2206.01204},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2206.01204},
  doi        = {10.48550/ARXIV.2206.01204},
  eprinttype = {arXiv},
  eprint     = {2206.01204},
  timestamp  = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2206-01204.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2206.04674 (CoRR, 2022).
@article{DBLP:journals/corr/abs-2206-04674,
  author     = {Zhu, Jinguo and
                Zhu, Xizhou and
                Wang, Wenhai and
                Wang, Xiaohua and
                Li, Hongsheng and
                Wang, Xiaogang and
                Dai, Jifeng},
  title      = {Uni-Perceiver-MoE: Learning Sparse Generalist Models with Conditional MoEs},
  journal    = {CoRR},
  volume     = {abs/2206.04674},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2206.04674},
  doi        = {10.48550/ARXIV.2206.04674},
  eprinttype = {arXiv},
  eprint     = {2206.04674},
  timestamp  = {Fri, 03 Nov 2023 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2206-04674.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2207.09519 (CoRR, 2022); same title and author list as the
% ECCV 2022 record DBLP:conf/eccv/ZhangZFGLDQL22.
% The second author is written family-name-first (Zhang, Wei) so it parses as
% the same person as "Wei Zhang" in the ECCV record.
@article{DBLP:journals/corr/abs-2207-09519,
  author     = {Zhang, Renrui and
                Zhang, Wei and
                Fang, Rongyao and
                Gao, Peng and
                Li, Kunchang and
                Dai, Jifeng and
                Qiao, Yu and
                Li, Hongsheng},
  title      = {Tip-Adapter: Training-free Adaption of {CLIP} for Few-shot Classification},
  journal    = {CoRR},
  volume     = {abs/2207.09519},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2207.09519},
  doi        = {10.48550/ARXIV.2207.09519},
  eprinttype = {arXiv},
  eprint     = {2207.09519},
  timestamp  = {Tue, 16 Jul 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2207-09519.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2208.03550 (CoRR, 2022).
@article{DBLP:journals/corr/abs-2208-03550,
  author     = {Lin, Ziyi and
                Geng, Shijie and
                Zhang, Renrui and
                Gao, Peng and
                de Melo, Gerard and
                Wang, Xiaogang and
                Dai, Jifeng and
                Qiao, Yu and
                Li, Hongsheng},
  title      = {Frozen {CLIP} Models are Efficient Video Learners},
  journal    = {CoRR},
  volume     = {abs/2208.03550},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2208.03550},
  doi        = {10.48550/ARXIV.2208.03550},
  eprinttype = {arXiv},
  eprint     = {2208.03550},
  timestamp  = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2208-03550.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2209.05324 (CoRR, 2022).
@article{DBLP:journals/corr/abs-2209-05324,
  author     = {Li, Hongyang and
                Sima, Chonghao and
                Dai, Jifeng and
                Wang, Wenhai and
                Lu, Lewei and
                Wang, Huijie and
                Xie, Enze and
                Li, Zhiqi and
                Deng, Hanming and
                Tian, Hao and
                Zhu, Xizhou and
                Chen, Li and
                Gao, Yulu and
                Geng, Xiangwei and
                Zeng, Jia and
                Li, Yang and
                Yang, Jiazhi and
                Jia, Xiaosong and
                Yu, Bohan and
                Qiao, Yu and
                Lin, Dahua and
                Liu, Si and
                Yan, Junchi and
                Shi, Jianping and
                Luo, Ping},
  title      = {Delving into the Devils of Bird's-eye-view Perception: {A} Review, Evaluation and Recipe},
  journal    = {CoRR},
  volume     = {abs/2209.05324},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2209.05324},
  doi        = {10.48550/ARXIV.2209.05324},
  eprinttype = {arXiv},
  eprint     = {2209.05324},
  timestamp  = {Tue, 14 Jan 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2209-05324.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2211.05778 (CoRR, 2022).
@article{DBLP:journals/corr/abs-2211-05778,
  author     = {Wang, Wenhai and
                Dai, Jifeng and
                Chen, Zhe and
                Huang, Zhenhang and
                Li, Zhiqi and
                Zhu, Xizhou and
                Hu, Xiaowei and
                Lu, Tong and
                Lu, Lewei and
                Li, Hongsheng and
                Wang, Xiaogang and
                Qiao, Yu},
  title      = {InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions},
  journal    = {CoRR},
  volume     = {abs/2211.05778},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2211.05778},
  doi        = {10.48550/ARXIV.2211.05778},
  eprinttype = {arXiv},
  eprint     = {2211.05778},
  timestamp  = {Wed, 03 Jun 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2211-05778.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 2211.05781 (CoRR, 2022).
@article{DBLP:journals/corr/abs-2211-05781,
  author     = {Dai, Jifeng and
                Shi, Min and
                Wang, Weiyun and
                Wu, Sitong and
                Xing, Linjie and
                Wang, Wenhai and
                Zhu, Xizhou and
                Lu, Lewei and
                Zhou, Jie and
                Wang, Xiaogang and
                Qiao, Yu and
                Hu, Xiaowei},
  title      = {Demystify Transformers {\&} Convolutions in Modern Image Deep Networks},
  journal    = {CoRR},
  volume     = {abs/2211.05781},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2211.05781},
  doi        = {10.48550/ARXIV.2211.05781},
  eprinttype = {arXiv},
  eprint     = {2211.05781},
  timestamp  = {Mon, 29 Jul 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2211-05781.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% CoRR/arXiv preprint 2211.09807 (2022).
@article{DBLP:journals/corr/abs-2211-09807,
  author       = {Su, Weijie and
                  Zhu, Xizhou and
                  Tao, Chenxin and
                  Lu, Lewei and
                  Li, Bin and
                  Huang, Gao and
                  Qiao, Yu and
                  Wang, Xiaogang and
                  Zhou, Jie and
                  Dai, Jifeng},
  title        = {Towards All-in-one Pre-training via Maximizing Multi-modal Mutual Information},
  journal      = {CoRR},
  volume       = {abs/2211.09807},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2211.09807},
  doi          = {10.48550/ARXIV.2211.09807},
  eprinttype   = {arXiv},
  eprint       = {2211.09807},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2211-09807.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2211.09808 (2022).
% The model name in the title is brace-protected so sentence-casing styles keep "Uni-Perceiver" intact.
@article{DBLP:journals/corr/abs-2211-09808,
  author       = {Hao Li and
                  Jinguo Zhu and
                  Xiaohu Jiang and
                  Xizhou Zhu and
                  Hongsheng Li and
                  Chun Yuan and
                  Xiaohua Wang and
                  Yu Qiao and
                  Xiaogang Wang and
                  Wenhai Wang and
                  Jifeng Dai},
  title        = {{Uni-Perceiver} v2: {A} Generalist Model for Large-Scale Vision and
                  Vision-Language Tasks},
  journal      = {CoRR},
  volume       = {abs/2211.09808},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2211.09808},
  doi          = {10.48550/ARXIV.2211.09808},
  eprinttype   = {arXiv},
  eprint       = {2211.09808},
  timestamp    = {Tue, 24 Mar 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2211-09808.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CoRR/arXiv preprint 2211.10439 (2022).
% The model name in the title is brace-protected so sentence-casing styles keep "BEVFormer" intact.
@article{DBLP:journals/corr/abs-2211-10439,
  author       = {Chenyu Yang and
                  Yuntao Chen and
                  Hao Tian and
                  Chenxin Tao and
                  Xizhou Zhu and
                  Zhaoxiang Zhang and
                  Gao Huang and
                  Hongyang Li and
                  Yu Qiao and
                  Lewei Lu and
                  Jie Zhou and
                  Jifeng Dai},
  title        = {{BEVFormer} v2: Adapting Modern Image Backbones to Bird's-Eye-View Recognition
                  via Perspective Supervision},
  journal      = {CoRR},
  volume       = {abs/2211.10439},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2211.10439},
  doi          = {10.48550/ARXIV.2211.10439},
  eprinttype   = {arXiv},
  eprint       = {2211.10439},
  timestamp    = {Tue, 19 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2211-10439.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CoRR/arXiv preprint 2212.10156 (2022).
@article{DBLP:journals/corr/abs-2212-10156,
  author       = {Hu, Yihan and
                  Yang, Jiazhi and
                  Chen, Li and
                  Li, Keyu and
                  Sima, Chonghao and
                  Zhu, Xizhou and
                  Chai, Siqi and
                  Du, Senyao and
                  Lin, Tianwei and
                  Wang, Wenhai and
                  Lu, Lewei and
                  Jia, Xiaosong and
                  Liu, Qiang and
                  Dai, Jifeng and
                  Qiao, Yu and
                  Li, Hongyang},
  title        = {Goal-oriented Autonomous Driving},
  journal      = {CoRR},
  volume       = {abs/2212.10156},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2212.10156},
  doi          = {10.48550/ARXIV.2212.10156},
  eprinttype   = {arXiv},
  eprint       = {2212.10156},
  timestamp    = {Mon, 14 Apr 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2212-10156.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2021 conference paper, pages 5962--5972.
@inproceedings{DBLP:conf/cvpr/TianCDZZ21,
  author       = {Tian, Hao and
                  Chen, Yuntao and
                  Dai, Jifeng and
                  Zhang, Zhaoxiang and
                  Zhu, Xizhou},
  title        = {Unsupervised Object Detection With {LIDAR} Clues},
  booktitle    = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2021, virtual, June 19-25, 2021},
  pages        = {5962--5972},
  publisher    = {Computer Vision Foundation / {IEEE}},
  year         = {2021},
  url          = {https://openaccess.thecvf.com/content/CVPR2021/html/Tian\_Unsupervised\_Object\_Detection\_With\_LIDAR\_Clues\_CVPR\_2021\_paper.html},
  doi          = {10.1109/CVPR46437.2021.00590},
  timestamp    = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/TianCDZZ21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% ICCV 2021 conference paper, pages 3601--3610.
@inproceedings{DBLP:conf/iccv/0007Z0D021,
  author       = {Gao, Peng and
                  Zheng, Minghang and
                  Wang, Xiaogang and
                  Dai, Jifeng and
                  Li, Hongsheng},
  title        = {Fast Convergence of {DETR} with Spatially Modulated Co-Attention},
  booktitle    = {2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV} 2021, Montreal, QC, Canada, October 10-17, 2021},
  pages        = {3601--3610},
  publisher    = {{IEEE}},
  year         = {2021},
  url          = {https://doi.org/10.1109/ICCV48922.2021.00360},
  doi          = {10.1109/ICCV48922.2021.00360},
  timestamp    = {Mon, 14 Apr 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/0007Z0D021.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% ICCV 2021 conference paper, pages 7283--7293.
% Authors use the "Last, First" form so the compound surname "Van Gool" is parsed
% as the last name instead of "Van" being absorbed into the given names.
@inproceedings{DBLP:conf/iccv/WangZYDKG21,
  author       = {Wang, Wenguan and
                  Zhou, Tianfei and
                  Yu, Fisher and
                  Dai, Jifeng and
                  Konukoglu, Ender and
                  Van Gool, Luc},
  title        = {Exploring Cross-Image Pixel Contrast for Semantic Segmentation},
  booktitle    = {2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV}
                  2021, Montreal, QC, Canada, October 10-17, 2021},
  pages        = {7283--7293},
  publisher    = {{IEEE}},
  year         = {2021},
  url          = {https://doi.org/10.1109/ICCV48922.2021.00721},
  doi          = {10.1109/ICCV48922.2021.00721},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/WangZYDKG21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% ICCV 2021 conference paper, pages 9254--9263.
@inproceedings{DBLP:conf/iccv/LiuDZLDH21,
  author       = {Liu, Zhuoming and
                  Ding, Hao and
                  Zhong, Huaping and
                  Li, Weijia and
                  Dai, Jifeng and
                  He, Conghui},
  title        = {Influence Selection for Active Learning},
  booktitle    = {2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV} 2021, Montreal, QC, Canada, October 10-17, 2021},
  pages        = {9254--9263},
  publisher    = {{IEEE}},
  year         = {2021},
  url          = {https://doi.org/10.1109/ICCV48922.2021.00914},
  doi          = {10.1109/ICCV48922.2021.00914},
  timestamp    = {Thu, 07 May 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/LiuDZLDH21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% ICCV 2021 conference paper, pages 14020--14029.
% The model name in the title is brace-protected so sentence-casing styles keep "FuseFormer" intact.
@inproceedings{DBLP:conf/iccv/0019DHSLS0D021,
  author       = {Rui Liu and
                  Hanming Deng and
                  Yangyi Huang and
                  Xiaoyu Shi and
                  Lewei Lu and
                  Wenxiu Sun and
                  Xiaogang Wang and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{FuseFormer}: Fusing Fine-Grained Information in Transformers for Video
                  Inpainting},
  booktitle    = {2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV}
                  2021, Montreal, QC, Canada, October 10-17, 2021},
  pages        = {14020--14029},
  publisher    = {{IEEE}},
  year         = {2021},
  url          = {https://doi.org/10.1109/ICCV48922.2021.01378},
  doi          = {10.1109/ICCV48922.2021.01378},
  timestamp    = {Mon, 14 Apr 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/0019DHSLS0D021.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% ICLR 2021 conference paper (OpenReview).
@inproceedings{DBLP:conf/iclr/LiTZWHD21,
  author       = {Li, Hao and
                  Tao, Chenxin and
                  Zhu, Xizhou and
                  Wang, Xiaogang and
                  Huang, Gao and
                  Dai, Jifeng},
  title        = {Auto Seg-Loss: Searching Metric Surrogates for Semantic Segmentation},
  booktitle    = {9th International Conference on Learning Representations, {ICLR} 2021, Virtual Event, Austria, May 3-7, 2021},
  publisher    = {OpenReview.net},
  year         = {2021},
  url          = {https://openreview.net/forum?id=MJAqnaC2vO1},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iclr/LiTZWHD21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% ICLR 2021 conference paper (OpenReview).
@inproceedings{DBLP:conf/iclr/ZhuSLLWD21,
  author       = {Zhu, Xizhou and
                  Su, Weijie and
                  Lu, Lewei and
                  Li, Bin and
                  Wang, Xiaogang and
                  Dai, Jifeng},
  title        = {Deformable {DETR:} Deformable Transformers for End-to-End Object Detection},
  booktitle    = {9th International Conference on Learning Representations, {ICLR} 2021, Virtual Event, Austria, May 3-7, 2021},
  publisher    = {OpenReview.net},
  year         = {2021},
  url          = {https://openreview.net/forum?id=gZ9hCDWe6ke},
  timestamp    = {Tue, 15 Nov 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/iclr/ZhuSLLWD21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% NeurIPS 2021 conference paper, pages 22021--22033.
@inproceedings{DBLP:conf/nips/TaoLZHLD21,
  author       = {Tao, Chenxin and
                  Li, Zizhang and
                  Zhu, Xizhou and
                  Huang, Gao and
                  Liu, Yong and
                  Dai, Jifeng},
  editor       = {Ranzato, Marc'Aurelio and
                  Beygelzimer, Alina and
                  Dauphin, Yann N. and
                  Liang, Percy and
                  Vaughan, Jennifer Wortman},
  title        = {Searching Parameterized {AP} Loss for Object Detection},
  booktitle    = {Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual},
  pages        = {22021--22033},
  year         = {2021},
  url          = {https://proceedings.neurips.cc/paper/2021/hash/b9009beb804fa097c04d226a8ba5102e-Abstract.html},
  timestamp    = {Tue, 11 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/TaoLZHLD21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2101.07448 (2021).
@article{DBLP:journals/corr/abs-2101-07448,
  author       = {Gao, Peng and
                  Zheng, Minghang and
                  Wang, Xiaogang and
                  Dai, Jifeng and
                  Li, Hongsheng},
  title        = {Fast Convergence of {DETR} with Spatially Modulated Co-Attention},
  journal      = {CoRR},
  volume       = {abs/2101.07448},
  year         = {2021},
  url          = {https://arxiv.org/abs/2101.07448},
  eprinttype   = {arXiv},
  eprint       = {2101.07448},
  timestamp    = {Thu, 14 Jul 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2101-07448.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2101.11939 (2021).
% Authors use the "Last, First" form so the compound surname "Van Gool" is parsed
% as the last name instead of "Van" being absorbed into the given names.
@article{DBLP:journals/corr/abs-2101-11939,
  author       = {Wang, Wenguan and
                  Zhou, Tianfei and
                  Yu, Fisher and
                  Dai, Jifeng and
                  Konukoglu, Ender and
                  Van Gool, Luc},
  title        = {Exploring Cross-Image Pixel Contrast for Semantic Segmentation},
  journal      = {CoRR},
  volume       = {abs/2101.11939},
  year         = {2021},
  url          = {https://arxiv.org/abs/2101.11939},
  eprinttype   = {arXiv},
  eprint       = {2101.11939},
  timestamp    = {Mon, 18 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2101-11939.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CoRR/arXiv preprint 2103.14026 (2021).
% The method name in the title is brace-protected so sentence-casing styles keep "AutoLoss-Zero" intact.
@article{DBLP:journals/corr/abs-2103-14026,
  author       = {Hao Li and
                  Tianwen Fu and
                  Jifeng Dai and
                  Hongsheng Li and
                  Gao Huang and
                  Xizhou Zhu},
  title        = {{AutoLoss-Zero}: Searching Loss Functions from Scratch for Generic Tasks},
  journal      = {CoRR},
  volume       = {abs/2103.14026},
  year         = {2021},
  url          = {https://arxiv.org/abs/2103.14026},
  eprinttype   = {arXiv},
  eprint       = {2103.14026},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2103-14026.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CoRR/arXiv preprint 2104.06637 (2021).
@article{DBLP:journals/corr/abs-2104-06637,
  author       = {Liu, Rui and
                  Deng, Hanming and
                  Huang, Yangyi and
                  Shi, Xiaoyu and
                  Lu, Lewei and
                  Sun, Wenxiu and
                  Wang, Xiaogang and
                  Dai, Jifeng and
                  Li, Hongsheng},
  title        = {Decoupled Spatial-Temporal Transformer for Video Inpainting},
  journal      = {CoRR},
  volume       = {abs/2104.06637},
  year         = {2021},
  url          = {https://arxiv.org/abs/2104.06637},
  eprinttype   = {arXiv},
  eprint       = {2104.06637},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2104-06637.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2106.02242 (2021).
@article{DBLP:journals/corr/abs-2106-02242,
  author       = {Gao, Peng and
                  Geng, Shijie and
                  Qiao, Yu and
                  Wang, Xiaogang and
                  Dai, Jifeng and
                  Li, Hongsheng},
  title        = {Scalable Transformers for Neural Machine Translation},
  journal      = {CoRR},
  volume       = {abs/2106.02242},
  year         = {2021},
  url          = {https://arxiv.org/abs/2106.02242},
  eprinttype   = {arXiv},
  eprint       = {2106.02242},
  timestamp    = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2106-02242.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2107.01151 (2021).
@article{DBLP:journals/corr/abs-2107-01151,
  author       = {Wang, Haiyang and
                  Wang, Wenguan and
                  Zhu, Xizhou and
                  Dai, Jifeng and
                  Wang, Liwei},
  title        = {Collaborative Visual Navigation},
  journal      = {CoRR},
  volume       = {abs/2107.01151},
  year         = {2021},
  url          = {https://arxiv.org/abs/2107.01151},
  eprinttype   = {arXiv},
  eprint       = {2107.01151},
  timestamp    = {Tue, 12 Apr 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2107-01151.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2108.02404 (2021).
@article{DBLP:journals/corr/abs-2108-02404,
  author       = {Gao, Peng and
                  Zheng, Minghang and
                  Wang, Xiaogang and
                  Dai, Jifeng and
                  Li, Hongsheng},
  title        = {Fast Convergence of {DETR} with Spatially Modulated Co-Attention},
  journal      = {CoRR},
  volume       = {abs/2108.02404},
  year         = {2021},
  url          = {https://arxiv.org/abs/2108.02404},
  eprinttype   = {arXiv},
  eprint       = {2108.02404},
  timestamp    = {Thu, 14 Jul 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2108-02404.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2108.09331 (2021).
@article{DBLP:journals/corr/abs-2108-09331,
  author       = {Liu, Zhuoming and
                  Ding, Hao and
                  Zhong, Huaping and
                  Li, Weijia and
                  Dai, Jifeng and
                  He, Conghui},
  title        = {Influence Selection for Active Learning},
  journal      = {CoRR},
  volume       = {abs/2108.09331},
  year         = {2021},
  url          = {https://arxiv.org/abs/2108.09331},
  eprinttype   = {arXiv},
  eprint       = {2108.09331},
  timestamp    = {Thu, 11 Dec 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2108-09331.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2109.02974 (2021).
% The model name in the title is brace-protected so sentence-casing styles keep "FuseFormer" intact.
@article{DBLP:journals/corr/abs-2109-02974,
  author       = {Rui Liu and
                  Hanming Deng and
                  Yangyi Huang and
                  Xiaoyu Shi and
                  Lewei Lu and
                  Wenxiu Sun and
                  Xiaogang Wang and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{FuseFormer}: Fusing Fine-Grained Information in Transformers for Video
                  Inpainting},
  journal      = {CoRR},
  volume       = {abs/2109.02974},
  year         = {2021},
  url          = {https://arxiv.org/abs/2109.02974},
  eprinttype   = {arXiv},
  eprint       = {2109.02974},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2109-02974.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CoRR/arXiv preprint 2111.03930 (2021).
% "Tip-Adapter" and "CLIP-Adapter" are brace-protected so sentence-casing styles
% do not lowercase the method names or the CLIP acronym.
@article{DBLP:journals/corr/abs-2111-03930,
  author       = {Renrui Zhang and
                  Rongyao Fang and
                  Wei Zhang and
                  Peng Gao and
                  Kunchang Li and
                  Jifeng Dai and
                  Yu Qiao and
                  Hongsheng Li},
  title        = {{Tip-Adapter}: Training-free {CLIP-Adapter} for Better Vision-Language
                  Modeling},
  journal      = {CoRR},
  volume       = {abs/2111.03930},
  year         = {2021},
  url          = {https://arxiv.org/abs/2111.03930},
  eprinttype   = {arXiv},
  eprint       = {2111.03930},
  timestamp    = {Thu, 12 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2111-03930.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CoRR/arXiv preprint 2111.13579 (2021).
@article{DBLP:journals/corr/abs-2111-13579,
  author       = {Tian, Changyao and
                  Wang, Wenhai and
                  Zhu, Xizhou and
                  Wang, Xiaogang and
                  Dai, Jifeng and
                  Qiao, Yu},
  title        = {{VL-LTR:} Learning Class-wise Visual-Linguistic Representation for Long-Tailed Visual Recognition},
  journal      = {CoRR},
  volume       = {abs/2111.13579},
  year         = {2021},
  url          = {https://arxiv.org/abs/2111.13579},
  eprinttype   = {arXiv},
  eprint       = {2111.13579},
  timestamp    = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2111-13579.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2112.01522 (2021).
% The model name in the title is brace-protected so sentence-casing styles keep "Uni-Perceiver" intact.
@article{DBLP:journals/corr/abs-2112-01522,
  author       = {Xizhou Zhu and
                  Jinguo Zhu and
                  Hao Li and
                  Xiaoshi Wu and
                  Xiaogang Wang and
                  Hongsheng Li and
                  Xiaohua Wang and
                  Jifeng Dai},
  title        = {{Uni-Perceiver}: Pre-training Unified Architecture for Generic Perception
                  for Zero-shot and Few-shot Tasks},
  journal      = {CoRR},
  volume       = {abs/2112.01522},
  year         = {2021},
  url          = {https://arxiv.org/abs/2112.01522},
  eprinttype   = {arXiv},
  eprint       = {2112.01522},
  timestamp    = {Tue, 06 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2112-01522.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CoRR/arXiv preprint 2112.05138 (2021).
@article{DBLP:journals/corr/abs-2112-05138,
  author       = {Tao, Chenxin and
                  Li, Zizhang and
                  Zhu, Xizhou and
                  Huang, Gao and
                  Liu, Yong and
                  Dai, Jifeng},
  title        = {Searching Parameterized {AP} Loss for Object Detection},
  journal      = {CoRR},
  volume       = {abs/2112.05138},
  year         = {2021},
  url          = {https://arxiv.org/abs/2112.05138},
  eprinttype   = {arXiv},
  eprint       = {2112.05138},
  timestamp    = {Tue, 11 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2112-05138.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2112.05141 (2021).
@article{DBLP:journals/corr/abs-2112-05141,
  author       = {Tao, Chenxin and
                  Wang, Honghui and
                  Zhu, Xizhou and
                  Dong, Jiahua and
                  Song, Shiji and
                  Huang, Gao and
                  Dai, Jifeng},
  title        = {Exploring the Equivalence of Siamese Self-Supervised Learning via {A} Unified Gradient Framework},
  journal      = {CoRR},
  volume       = {abs/2112.05141},
  year         = {2021},
  url          = {https://arxiv.org/abs/2112.05141},
  eprinttype   = {arXiv},
  eprint       = {2112.05141},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2112-05141.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2020 conference paper, pages 2366--2375.
@inproceedings{DBLP:conf/cvpr/YangHCSDH20,
  author       = {Yang, Le and
                  Han, Yizeng and
                  Chen, Xi and
                  Song, Shiji and
                  Dai, Jifeng and
                  Huang, Gao},
  title        = {Resolution Adaptive Networks for Efficient Inference},
  booktitle    = {2020 {IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2020, Seattle, WA, USA, June 13-19, 2020},
  pages        = {2366--2375},
  publisher    = {Computer Vision Foundation / {IEEE}},
  year         = {2020},
  url          = {https://openaccess.thecvf.com/content\_CVPR\_2020/html/Yang\_Resolution\_Adaptive\_Networks\_for\_Efficient\_Inference\_CVPR\_2020\_paper.html},
  doi          = {10.1109/CVPR42600.2020.00244},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/YangHCSDH20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2020 conference paper, pages 8926--8936.
@inproceedings{DBLP:conf/cvpr/WangZDPS020,
  author       = {Wang, Wenguan and
                  Zhu, Hailong and
                  Dai, Jifeng and
                  Pang, Yanwei and
                  Shen, Jianbing and
                  Shao, Ling},
  title        = {Hierarchical Human Parsing With Typed Part-Relation Reasoning},
  booktitle    = {2020 {IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2020, Seattle, WA, USA, June 13-19, 2020},
  pages        = {8926--8936},
  publisher    = {Computer Vision Foundation / {IEEE}},
  year         = {2020},
  url          = {https://openaccess.thecvf.com/content\_CVPR\_2020/html/Wang\_Hierarchical\_Human\_Parsing\_With\_Typed\_Part-Relation\_Reasoning\_CVPR\_2020\_paper.html},
  doi          = {10.1109/CVPR42600.2020.00895},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/WangZDPS020.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% ECCV 2020 conference paper (Proceedings, Part II), pages 347--365.
% The LNCS volume number is supplied alongside the series name so styles can
% print a complete series reference.
% Authors use the "Last, First" form so the compound surname "Van Gool" is parsed
% as the last name instead of "Van" being absorbed into the given names.
@inproceedings{DBLP:conf/eccv/SunWDG20,
  author       = {Sun, Guolei and
                  Wang, Wenguan and
                  Dai, Jifeng and
                  Van Gool, Luc},
  editor       = {Andrea Vedaldi and
                  Horst Bischof and
                  Thomas Brox and
                  Jan{-}Michael Frahm},
  title        = {Mining Cross-Image Semantics for Weakly Supervised Semantic Segmentation},
  booktitle    = {Computer Vision - {ECCV} 2020 - 16th European Conference, Glasgow,
                  UK, August 23-28, 2020, Proceedings, Part {II}},
  series       = {Lecture Notes in Computer Science},
  volume       = {12347},
  pages        = {347--365},
  publisher    = {Springer},
  year         = {2020},
  url          = {https://doi.org/10.1007/978-3-030-58536-5\_21},
  doi          = {10.1007/978-3-030-58536-5\_21},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eccv/SunWDG20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% ICLR 2020 conference paper (OpenReview).
@inproceedings{DBLP:conf/iclr/GaoZLD20,
  author       = {Gao, Hang and
                  Zhu, Xizhou and
                  Lin, Stephen and
                  Dai, Jifeng},
  title        = {Deformable Kernels: Adapting Effective Receptive Fields for Object Deformation},
  booktitle    = {8th International Conference on Learning Representations, {ICLR} 2020, Addis Ababa, Ethiopia, April 26-30, 2020},
  publisher    = {OpenReview.net},
  year         = {2020},
  url          = {https://openreview.net/forum?id=SkxSv6VFvS},
  timestamp    = {Thu, 19 May 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iclr/GaoZLD20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% ICLR 2020 conference paper (OpenReview).
@inproceedings{DBLP:conf/iclr/SuZCLLWD20,
  author       = {Su, Weijie and
                  Zhu, Xizhou and
                  Cao, Yue and
                  Li, Bin and
                  Lu, Lewei and
                  Wei, Furu and
                  Dai, Jifeng},
  title        = {{VL-BERT:} Pre-training of Generic Visual-Linguistic Representations},
  booktitle    = {8th International Conference on Learning Representations, {ICLR} 2020, Addis Ababa, Ethiopia, April 26-30, 2020},
  publisher    = {OpenReview.net},
  year         = {2020},
  url          = {https://openreview.net/forum?id=SygXPaEYvH},
  timestamp    = {Tue, 12 Apr 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iclr/SuZCLLWD20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2003.04845 (2020).
@article{DBLP:journals/corr/abs-2003-04845,
  author       = {Wang, Wenguan and
                  Zhu, Hailong and
                  Dai, Jifeng and
                  Pang, Yanwei and
                  Shen, Jianbing and
                  Shao, Ling},
  title        = {Hierarchical Human Parsing with Typed Part-Relation Reasoning},
  journal      = {CoRR},
  volume       = {abs/2003.04845},
  year         = {2020},
  url          = {https://arxiv.org/abs/2003.04845},
  eprinttype   = {arXiv},
  eprint       = {2003.04845},
  timestamp    = {Tue, 17 Mar 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2003-04845.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2003.07326 (2020).
@article{DBLP:journals/corr/abs-2003-07326,
  author       = {Yang, Le and
                  Han, Yizeng and
                  Chen, Xi and
                  Song, Shiji and
                  Dai, Jifeng and
                  Huang, Gao},
  title        = {Resolution Adaptive Networks for Efficient Inference},
  journal      = {CoRR},
  volume       = {abs/2003.07326},
  year         = {2020},
  url          = {https://arxiv.org/abs/2003.07326},
  eprinttype   = {arXiv},
  eprint       = {2003.07326},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2003-07326.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2007.01947 (2020).
% Authors use the "Last, First" form so the compound surname "Van Gool" is parsed
% as the last name instead of "Van" being absorbed into the given names.
@article{DBLP:journals/corr/abs-2007-01947,
  author       = {Sun, Guolei and
                  Wang, Wenguan and
                  Dai, Jifeng and
                  Van Gool, Luc},
  title        = {Mining Cross-Image Semantics for Weakly Supervised Semantic Segmentation},
  journal      = {CoRR},
  volume       = {abs/2007.01947},
  year         = {2020},
  url          = {https://arxiv.org/abs/2007.01947},
  eprinttype   = {arXiv},
  eprint       = {2007.01947},
  timestamp    = {Fri, 17 Jul 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2007-01947.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CoRR/arXiv preprint 2009.01559 (2020).
@article{DBLP:journals/corr/abs-2009-01559,
  author       = {Tan, Jingru and
                  Zhang, Gang and
                  Deng, Hanming and
                  Wang, Changbao and
                  Lu, Lewei and
                  Li, Quanquan and
                  Dai, Jifeng},
  title        = {1st Place Solution of {LVIS} Challenge 2020: {A} Good Box is not a Guarantee of a Good Mask},
  journal      = {CoRR},
  volume       = {abs/2009.01559},
  year         = {2020},
  url          = {https://arxiv.org/abs/2009.01559},
  eprinttype   = {arXiv},
  eprint       = {2009.01559},
  timestamp    = {Wed, 16 Sep 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2009-01559.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2010.04159 (2020).
@article{DBLP:journals/corr/abs-2010-04159,
  author       = {Zhu, Xizhou and
                  Su, Weijie and
                  Lu, Lewei and
                  Li, Bin and
                  Wang, Xiaogang and
                  Dai, Jifeng},
  title        = {Deformable {DETR:} Deformable Transformers for End-to-End Object Detection},
  journal      = {CoRR},
  volume       = {abs/2010.04159},
  year         = {2020},
  url          = {https://arxiv.org/abs/2010.04159},
  eprinttype   = {arXiv},
  eprint       = {2010.04159},
  timestamp    = {Tue, 15 Nov 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2010-04159.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2010.07930 (2020).
@article{DBLP:journals/corr/abs-2010-07930,
  author       = {Li, Hao and
                  Tao, Chenxin and
                  Zhu, Xizhou and
                  Wang, Xiaogang and
                  Huang, Gao and
                  Dai, Jifeng},
  title        = {Auto Seg-Loss: Searching Metric Surrogates for Semantic Segmentation},
  journal      = {CoRR},
  volume       = {abs/2010.07930},
  year         = {2020},
  url          = {https://arxiv.org/abs/2010.07930},
  eprinttype   = {arXiv},
  eprint       = {2010.07930},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2010-07930.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org},
}

% CoRR/arXiv preprint 2011.12953 (2020).
% "LiDAR" is brace-protected so sentence-casing styles do not lowercase the acronym.
@article{DBLP:journals/corr/abs-2011-12953,
  author       = {Hao Tian and
                  Yuntao Chen and
                  Jifeng Dai and
                  Zhaoxiang Zhang and
                  Xizhou Zhu},
  title        = {Unsupervised Object Detection with {LiDAR} Clues},
  journal      = {CoRR},
  volume       = {abs/2011.12953},
  year         = {2020},
  url          = {https://arxiv.org/abs/2011.12953},
  eprinttype   = {arXiv},
  eprint       = {2011.12953},
  timestamp    = {Wed, 20 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2011-12953.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CVPR 2019 conference paper, pages 9308--9316.
% "ConvNets" is brace-protected so sentence-casing styles keep its internal capital.
% The open-access URL uses https, matching the other CVF links in this file.
@inproceedings{DBLP:conf/cvpr/ZhuHLD19,
  author       = {Xizhou Zhu and
                  Han Hu and
                  Stephen Lin and
                  Jifeng Dai},
  title        = {Deformable {ConvNets} {V2:} More Deformable, Better Results},
  booktitle    = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
                  2019, Long Beach, CA, USA, June 16-20, 2019},
  pages        = {9308--9316},
  publisher    = {Computer Vision Foundation / {IEEE}},
  year         = {2019},
  url          = {https://openaccess.thecvf.com/content\_CVPR\_2019/html/Zhu\_Deformable\_ConvNets\_V2\_More\_Deformable\_Better\_Results\_CVPR\_2019\_paper.html},
  doi          = {10.1109/CVPR.2019.00953},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/ZhuHLD19.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% ICCV 2019 paper; DBLP:journals/corr/abs-1904-05873 has the same title and authors (presumably its preprint).
@inproceedings{DBLP:conf/iccv/ZhuCZLD19,
  author = {Zhu, Xizhou and Cheng, Dazhi and Zhang, Zheng and Lin, Stephen and Dai, Jifeng},
  title = {An Empirical Study of Spatial Attention Mechanisms in Deep Networks},
  booktitle = {2019 {IEEE/CVF} International Conference on Computer Vision, {ICCV} 2019, Seoul, Korea (South), October 27 - November 2, 2019},
  pages = {6687--6696},
  publisher = {{IEEE}},
  year = {2019},
  url = {https://doi.org/10.1109/ICCV.2019.00679},
  doi = {10.1109/ICCV.2019.00679},
  timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/iccv/ZhuCZLD19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/iccv/ZhuCZLD19 has the same title and authors.
@article{DBLP:journals/corr/abs-1904-05873,
  author = {Zhu, Xizhou and Cheng, Dazhi and Zhang, Zheng and Lin, Stephen and Dai, Jifeng},
  title = {An Empirical Study of Spatial Attention Mechanisms in Deep Networks},
  journal = {CoRR},
  volume = {abs/1904.05873},
  year = {2019},
  url = {http://arxiv.org/abs/1904.05873},
  eprinttype = {arXiv},
  eprint = {1904.05873},
  timestamp = {Thu, 19 May 2022 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1904-05873.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv preprint (CoRR). {MMDetection} and {MMLab} are braced so that sentence-casing styles
% do not render them as "Mmdetection" / "mmlab".
@article{DBLP:journals/corr/abs-1906-07155,
  author       = {Kai Chen and
                  Jiaqi Wang and
                  Jiangmiao Pang and
                  Yuhang Cao and
                  Yu Xiong and
                  Xiaoxiao Li and
                  Shuyang Sun and
                  Wansen Feng and
                  Ziwei Liu and
                  Jiarui Xu and
                  Zheng Zhang and
                  Dazhi Cheng and
                  Chenchen Zhu and
                  Tianheng Cheng and
                  Qijie Zhao and
                  Buyu Li and
                  Xin Lu and
                  Rui Zhu and
                  Yue Wu and
                  Jifeng Dai and
                  Jingdong Wang and
                  Jianping Shi and
                  Wanli Ouyang and
                  Chen Change Loy and
                  Dahua Lin},
  title        = {{MMDetection}: Open {MMLab} Detection Toolbox and Benchmark},
  journal      = {CoRR},
  volume       = {abs/1906.07155},
  year         = {2019},
  url          = {http://arxiv.org/abs/1906.07155},
  eprinttype   = {arXiv},
  eprint       = {1906.07155},
  timestamp    = {Fri, 15 Dec 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1906-07155.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% arXiv (CoRR) record.
@article{DBLP:journals/corr/abs-1908-08530,
  author = {Su, Weijie and Zhu, Xizhou and Cao, Yue and Li, Bin and Lu, Lewei and
    Wei, Furu and Dai, Jifeng},
  title = {{VL-BERT:} Pre-training of Generic Visual-Linguistic Representations},
  journal = {CoRR},
  volume = {abs/1908.08530},
  year = {2019},
  url = {http://arxiv.org/abs/1908.08530},
  eprinttype = {arXiv},
  eprint = {1908.08530},
  timestamp = {Tue, 12 Apr 2022 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1908-08530.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record.
% NOTE(review): "Steve Lin" here may be the same person as "Stephen Lin" in neighbouring
% entries; confirm before normalising the name.
@article{DBLP:journals/corr/abs-1910-02940,
  author = {Gao, Hang and Zhu, Xizhou and Lin, Steve and Dai, Jifeng},
  title = {Deformable Kernels: Adapting Effective Receptive Fields for Object Deformation},
  journal = {CoRR},
  volume = {abs/1910.02940},
  year = {2019},
  url = {http://arxiv.org/abs/1910.02940},
  eprinttype = {arXiv},
  eprint = {1910.02940},
  timestamp = {Fri, 06 Jan 2023 00:00:00 +0100},
  biburl = {https://dblp.org/rec/journals/corr/abs-1910-02940.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2018 paper; DBLP:journals/corr/abs-1711-11575 has the same title and authors.
@inproceedings{DBLP:conf/cvpr/HuGZDW18,
  author = {Hu, Han and Gu, Jiayuan and Zhang, Zheng and Dai, Jifeng and Wei, Yichen},
  title = {Relation Networks for Object Detection},
  booktitle = {2018 {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2018, Salt Lake City, UT, USA, June 18-22, 2018},
  pages = {3588--3597},
  publisher = {Computer Vision Foundation / {IEEE} Computer Society},
  year = {2018},
  url = {http://openaccess.thecvf.com/content\_cvpr\_2018/html/Hu\_Relation\_Networks\_for\_CVPR\_2018\_paper.html},
  doi = {10.1109/CVPR.2018.00378},
  timestamp = {Sun, 06 Oct 2024 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/cvpr/HuGZDW18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2018 paper; DBLP:journals/corr/abs-1711-11577 has the same title and authors.
@inproceedings{DBLP:conf/cvpr/ZhuDYW18,
  author = {Zhu, Xizhou and Dai, Jifeng and Yuan, Lu and Wei, Yichen},
  title = {Towards High Performance Video Object Detection},
  booktitle = {2018 {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2018, Salt Lake City, UT, USA, June 18-22, 2018},
  pages = {7210--7218},
  publisher = {Computer Vision Foundation / {IEEE} Computer Society},
  year = {2018},
  url = {http://openaccess.thecvf.com/content\_cvpr\_2018/html/Zhu\_Towards\_High\_Performance\_CVPR\_2018\_paper.html},
  doi = {10.1109/CVPR.2018.00753},
  timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/cvpr/ZhuDYW18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% ECCV 2018 paper; DBLP:journals/corr/abs-1803-07066 has the same title and authors.
% The LNCS series entry was missing its volume number; 11216 is the LNCS volume for
% ECCV 2018 Part XII (NOTE(review): confirm against the Springer record for the DOI below).
@inproceedings{DBLP:conf/eccv/GuHWWD18,
  author       = {Jiayuan Gu and
                  Han Hu and
                  Liwei Wang and
                  Yichen Wei and
                  Jifeng Dai},
  editor       = {Vittorio Ferrari and
                  Martial Hebert and
                  Cristian Sminchisescu and
                  Yair Weiss},
  title        = {Learning Region Features for Object Detection},
  booktitle    = {Computer Vision - {ECCV} 2018 - 15th European Conference, Munich,
                  Germany, September 8-14, 2018, Proceedings, Part {XII}},
  series       = {Lecture Notes in Computer Science},
  volume       = {11216},
  pages        = {392--406},
  publisher    = {Springer},
  year         = {2018},
  url          = {https://doi.org/10.1007/978-3-030-01258-8\_24},
  doi          = {10.1007/978-3-030-01258-8\_24},
  timestamp    = {Sun, 06 Oct 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eccv/GuHWWD18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% arXiv (CoRR) record; DBLP:conf/eccv/GuHWWD18 has the same title and authors.
@article{DBLP:journals/corr/abs-1803-07066,
  author = {Gu, Jiayuan and Hu, Han and Wang, Liwei and Wei, Yichen and Dai, Jifeng},
  title = {Learning Region Features for Object Detection},
  journal = {CoRR},
  volume = {abs/1803.07066},
  year = {2018},
  url = {http://arxiv.org/abs/1803.07066},
  eprinttype = {arXiv},
  eprint = {1803.07066},
  timestamp = {Mon, 05 Jun 2023 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1803-07066.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record.
@article{DBLP:journals/corr/abs-1804-05830,
  author = {Zhu, Xizhou and Dai, Jifeng and Zhu, Xingchi and Wei, Yichen and Yuan, Lu},
  title = {Towards High Performance Video Object Detection for Mobiles},
  journal = {CoRR},
  volume = {abs/1804.05830},
  year = {2018},
  url = {http://arxiv.org/abs/1804.05830},
  eprinttype = {arXiv},
  eprint = {1804.05830},
  timestamp = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1804-05830.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record.
@article{DBLP:journals/corr/abs-1811-11167,
  author = {Zhang, Zheng and Cheng, Dazhi and Zhu, Xizhou and Lin, Stephen and Dai, Jifeng},
  title = {Integrated Object Detection and Tracking with Tracklet-Conditioned Detection},
  journal = {CoRR},
  volume = {abs/1811.11167},
  year = {2018},
  url = {http://arxiv.org/abs/1811.11167},
  eprinttype = {arXiv},
  eprint = {1811.11167},
  timestamp = {Thu, 19 May 2022 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1811-11167.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/cvpr/ZhuHLD19 carries the same authors and (up to case) the same title.
% {ConvNets} is braced so that sentence-casing styles do not render it as "convnets".
@article{DBLP:journals/corr/abs-1811-11168,
  author       = {Xizhou Zhu and
                  Han Hu and
                  Stephen Lin and
                  Jifeng Dai},
  title        = {Deformable {ConvNets} v2: More Deformable, Better Results},
  journal      = {CoRR},
  volume       = {abs/1811.11168},
  year         = {2018},
  url          = {http://arxiv.org/abs/1811.11168},
  eprinttype   = {arXiv},
  eprint       = {1811.11168},
  timestamp    = {Mon, 05 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1811-11168.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% CVPR 2017 paper; DBLP:journals/corr/ZhuXDYW16 has the same title and authors.
@inproceedings{DBLP:conf/cvpr/ZhuXDYW17,
  author = {Zhu, Xizhou and Xiong, Yuwen and Dai, Jifeng and Yuan, Lu and Wei, Yichen},
  title = {Deep Feature Flow for Video Recognition},
  booktitle = {2017 {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2017, Honolulu, HI, USA, July 21-26, 2017},
  pages = {4141--4150},
  publisher = {{IEEE} Computer Society},
  year = {2017},
  url = {https://doi.ieeecomputersociety.org/10.1109/CVPR.2017.441},
  doi = {10.1109/CVPR.2017.441},
  timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/cvpr/ZhuXDYW17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2017 paper; DBLP:journals/corr/LiQDJW16 has the same authors and (up to case) the same title.
@inproceedings{DBLP:conf/cvpr/LiQDJW17,
  author = {Li, Yi and Qi, Haozhi and Dai, Jifeng and Ji, Xiangyang and Wei, Yichen},
  title = {Fully Convolutional Instance-Aware Semantic Segmentation},
  booktitle = {2017 {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2017, Honolulu, HI, USA, July 21-26, 2017},
  pages = {4438--4446},
  publisher = {{IEEE} Computer Society},
  year = {2017},
  url = {https://doi.org/10.1109/CVPR.2017.472},
  doi = {10.1109/CVPR.2017.472},
  timestamp = {Tue, 21 Apr 2026 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/cvpr/LiQDJW17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% ICCV 2017 paper; DBLP:journals/corr/ZhuWDYW17 has the same title and authors.
@inproceedings{DBLP:conf/iccv/ZhuWDYW17,
  author = {Zhu, Xizhou and Wang, Yujie and Dai, Jifeng and Yuan, Lu and Wei, Yichen},
  title = {Flow-Guided Feature Aggregation for Video Object Detection},
  booktitle = {{IEEE} International Conference on Computer Vision, {ICCV} 2017, Venice, Italy, October 22-29, 2017},
  pages = {408--417},
  publisher = {{IEEE} Computer Society},
  year = {2017},
  url = {https://doi.org/10.1109/ICCV.2017.52},
  doi = {10.1109/ICCV.2017.52},
  timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/iccv/ZhuWDYW17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% ICCV 2017 paper; DBLP:journals/corr/DaiQXLZHW17 has the same title and authors.
@inproceedings{DBLP:conf/iccv/DaiQXLZHW17,
  author = {Dai, Jifeng and Qi, Haozhi and Xiong, Yuwen and Li, Yi and Zhang, Guodong and
    Hu, Han and Wei, Yichen},
  title = {Deformable Convolutional Networks},
  booktitle = {{IEEE} International Conference on Computer Vision, {ICCV} 2017, Venice, Italy, October 22-29, 2017},
  pages = {764--773},
  publisher = {{IEEE} Computer Society},
  year = {2017},
  url = {https://doi.org/10.1109/ICCV.2017.89},
  doi = {10.1109/ICCV.2017.89},
  timestamp = {Tue, 21 Apr 2026 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/iccv/DaiQXLZHW17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/iccv/DaiQXLZHW17 has the same title and authors.
@article{DBLP:journals/corr/DaiQXLZHW17,
  author = {Dai, Jifeng and Qi, Haozhi and Xiong, Yuwen and Li, Yi and Zhang, Guodong and
    Hu, Han and Wei, Yichen},
  title = {Deformable Convolutional Networks},
  journal = {CoRR},
  volume = {abs/1703.06211},
  year = {2017},
  url = {http://arxiv.org/abs/1703.06211},
  eprinttype = {arXiv},
  eprint = {1703.06211},
  timestamp = {Mon, 05 Jun 2023 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/DaiQXLZHW17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/iccv/ZhuWDYW17 has the same title and authors.
@article{DBLP:journals/corr/ZhuWDYW17,
  author = {Zhu, Xizhou and Wang, Yujie and Dai, Jifeng and Yuan, Lu and Wei, Yichen},
  title = {Flow-Guided Feature Aggregation for Video Object Detection},
  journal = {CoRR},
  volume = {abs/1703.10025},
  year = {2017},
  url = {http://arxiv.org/abs/1703.10025},
  eprinttype = {arXiv},
  eprint = {1703.10025},
  timestamp = {Thu, 14 Oct 2021 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/ZhuWDYW17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/cvpr/HuGZDW18 has the same title and authors.
@article{DBLP:journals/corr/abs-1711-11575,
  author = {Hu, Han and Gu, Jiayuan and Zhang, Zheng and Dai, Jifeng and Wei, Yichen},
  title = {Relation Networks for Object Detection},
  journal = {CoRR},
  volume = {abs/1711.11575},
  year = {2017},
  url = {http://arxiv.org/abs/1711.11575},
  eprinttype = {arXiv},
  eprint = {1711.11575},
  timestamp = {Mon, 05 Jun 2023 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1711-11575.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/cvpr/ZhuDYW18 has the same title and authors.
@article{DBLP:journals/corr/abs-1711-11577,
  author = {Zhu, Xizhou and Dai, Jifeng and Yuan, Lu and Wei, Yichen},
  title = {Towards High Performance Video Object Detection},
  journal = {CoRR},
  volume = {abs/1711.11577},
  year = {2017},
  url = {http://arxiv.org/abs/1711.11577},
  eprinttype = {arXiv},
  eprint = {1711.11577},
  timestamp = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1711-11577.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2016 paper; DBLP:journals/corr/DaiHS15 has the same authors and (up to case) the same title.
@inproceedings{DBLP:conf/cvpr/DaiHS16,
  author = {Dai, Jifeng and He, Kaiming and Sun, Jian},
  title = {Instance-Aware Semantic Segmentation via Multi-task Network Cascades},
  booktitle = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
  pages = {3150--3158},
  publisher = {{IEEE} Computer Society},
  year = {2016},
  url = {https://doi.org/10.1109/CVPR.2016.343},
  doi = {10.1109/CVPR.2016.343},
  timestamp = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/cvpr/DaiHS16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2016 paper; DBLP:journals/corr/LinDJHS16 has the same title and authors.
% {ScribbleSup} is braced so that sentence-casing styles do not render it as "Scribblesup".
@inproceedings{DBLP:conf/cvpr/LinDJHS16,
  author       = {Di Lin and
                  Jifeng Dai and
                  Jiaya Jia and
                  Kaiming He and
                  Jian Sun},
  title        = {{ScribbleSup}: Scribble-Supervised Convolutional Networks for Semantic
                  Segmentation},
  booktitle    = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
  pages        = {3159--3167},
  publisher    = {{IEEE} Computer Society},
  year         = {2016},
  url          = {https://doi.org/10.1109/CVPR.2016.344},
  doi          = {10.1109/CVPR.2016.344},
  timestamp    = {Mon, 03 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/LinDJHS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% ECCV 2016 paper; DBLP:journals/corr/DaiHLRS16 has the same authors and (up to case) the same title.
% The LNCS series entry was missing its volume number; 9910 is the LNCS volume for
% ECCV 2016 Part VI (NOTE(review): confirm against the Springer record for the DOI below).
@inproceedings{DBLP:conf/eccv/DaiHLR016,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Yi Li and
                  Shaoqing Ren and
                  Jian Sun},
  editor       = {Bastian Leibe and
                  Jiri Matas and
                  Nicu Sebe and
                  Max Welling},
  title        = {Instance-Sensitive Fully Convolutional Networks},
  booktitle    = {Computer Vision - {ECCV} 2016 - 14th European Conference, Amsterdam,
                  The Netherlands, October 11-14, 2016, Proceedings, Part {VI}},
  series       = {Lecture Notes in Computer Science},
  volume       = {9910},
  pages        = {534--549},
  publisher    = {Springer},
  year         = {2016},
  url          = {https://doi.org/10.1007/978-3-319-46466-4\_32},
  doi          = {10.1007/978-3-319-46466-4\_32},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eccv/DaiHLR016.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% NIPS 2016 paper; DBLP:journals/corr/DaiLHS16 has the same title and authors.
@inproceedings{DBLP:conf/nips/DaiLHS16,
  author = {Dai, Jifeng and Li, Yi and He, Kaiming and Sun, Jian},
  editor = {Lee, Daniel D. and Sugiyama, Masashi and von Luxburg, Ulrike and
    Guyon, Isabelle and Garnett, Roman},
  title = {{R-FCN:} Object Detection via Region-based Fully Convolutional Networks},
  booktitle = {Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems 2016, December 5-10, 2016, Barcelona, Spain},
  pages = {379--387},
  year = {2016},
  url = {https://proceedings.neurips.cc/paper/2016/hash/577ef1154f3240ad5b9b413aa7346a1e-Abstract.html},
  timestamp = {Mon, 16 May 2022 15:41:51 +0200},
  biburl = {https://dblp.org/rec/conf/nips/DaiLHS16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/eccv/DaiHLR016 has the same authors and (up to case) the same title.
@article{DBLP:journals/corr/DaiHLRS16,
  author = {Dai, Jifeng and He, Kaiming and Li, Yi and Ren, Shaoqing and Sun, Jian},
  title = {Instance-sensitive Fully Convolutional Networks},
  journal = {CoRR},
  volume = {abs/1603.08678},
  year = {2016},
  url = {http://arxiv.org/abs/1603.08678},
  eprinttype = {arXiv},
  eprint = {1603.08678},
  timestamp = {Tue, 15 Sep 2020 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/DaiHLRS16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/cvpr/LinDJHS16 has the same title and authors.
% {ScribbleSup} is braced so that sentence-casing styles do not render it as "Scribblesup".
@article{DBLP:journals/corr/LinDJHS16,
  author       = {Di Lin and
                  Jifeng Dai and
                  Jiaya Jia and
                  Kaiming He and
                  Jian Sun},
  title        = {{ScribbleSup}: Scribble-Supervised Convolutional Networks for Semantic
                  Segmentation},
  journal      = {CoRR},
  volume       = {abs/1604.05144},
  year         = {2016},
  url          = {http://arxiv.org/abs/1604.05144},
  eprinttype   = {arXiv},
  eprint       = {1604.05144},
  timestamp    = {Wed, 24 Jul 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/LinDJHS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% arXiv (CoRR) record; DBLP:conf/nips/DaiLHS16 has the same title and authors.
@article{DBLP:journals/corr/DaiLHS16,
  author = {Dai, Jifeng and Li, Yi and He, Kaiming and Sun, Jian},
  title = {{R-FCN:} Object Detection via Region-based Fully Convolutional Networks},
  journal = {CoRR},
  volume = {abs/1605.06409},
  year = {2016},
  url = {http://arxiv.org/abs/1605.06409},
  eprinttype = {arXiv},
  eprint = {1605.06409},
  timestamp = {Tue, 15 Sep 2020 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/DaiLHS16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/cvpr/LiQDJW17 has the same authors and (up to case) the same title.
@article{DBLP:journals/corr/LiQDJW16,
  author = {Li, Yi and Qi, Haozhi and Dai, Jifeng and Ji, Xiangyang and Wei, Yichen},
  title = {Fully Convolutional Instance-aware Semantic Segmentation},
  journal = {CoRR},
  volume = {abs/1611.07709},
  year = {2016},
  url = {http://arxiv.org/abs/1611.07709},
  eprinttype = {arXiv},
  eprint = {1611.07709},
  timestamp = {Tue, 15 Sep 2020 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/LiQDJW16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/cvpr/ZhuXDYW17 has the same title and authors.
@article{DBLP:journals/corr/ZhuXDYW16,
  author = {Zhu, Xizhou and Xiong, Yuwen and Dai, Jifeng and Yuan, Lu and Wei, Yichen},
  title = {Deep Feature Flow for Video Recognition},
  journal = {CoRR},
  volume = {abs/1611.07715},
  year = {2016},
  url = {http://arxiv.org/abs/1611.07715},
  eprinttype = {arXiv},
  eprint = {1611.07715},
  timestamp = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/ZhuXDYW16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2015 paper; DBLP:journals/corr/DaiH014 has the same authors and title.
% Title stored in Title Case (matching the CoRR record) so styles can downcase it if they wish;
% capitals lost in the database cannot be restored by a style.
@inproceedings{DBLP:conf/cvpr/DaiH015,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Jian Sun},
  title        = {Convolutional Feature Masking for Joint Object and Stuff Segmentation},
  booktitle    = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
                  2015, Boston, MA, USA, June 7-12, 2015},
  pages        = {3992--4000},
  publisher    = {{IEEE} Computer Society},
  year         = {2015},
  url          = {https://doi.org/10.1109/CVPR.2015.7299025},
  doi          = {10.1109/CVPR.2015.7299025},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/DaiH015.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% ICCV 2015 paper; DBLP:journals/corr/DaiH015 has the same title and authors.
% {BoxSup} is braced so that sentence-casing styles do not render it as "Boxsup".
@inproceedings{DBLP:conf/iccv/DaiHS15,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Jian Sun},
  title        = {{BoxSup}: Exploiting Bounding Boxes to Supervise Convolutional Networks
                  for Semantic Segmentation},
  booktitle    = {2015 {IEEE} International Conference on Computer Vision, {ICCV} 2015,
                  Santiago, Chile, December 7-13, 2015},
  pages        = {1635--1643},
  publisher    = {{IEEE} Computer Society},
  year         = {2015},
  url          = {https://doi.org/10.1109/ICCV.2015.191},
  doi          = {10.1109/ICCV.2015.191},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/DaiHS15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% ICLR 2015 conference-track paper whose only locator is an arXiv URL.
% eprinttype/eprint are added (identifier taken from the url field) so that eprint-aware
% styles can link it like the other arXiv-backed records in this file.
@inproceedings{DBLP:journals/corr/DaiW14,
  author       = {Jifeng Dai and
                  Ying Nian Wu},
  editor       = {Yoshua Bengio and
                  Yann LeCun},
  title        = {Generative Modeling of Convolutional Neural Networks},
  booktitle    = {3rd International Conference on Learning Representations, {ICLR} 2015,
                  San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
  year         = {2015},
  url          = {http://arxiv.org/abs/1412.6296},
  eprinttype   = {arXiv},
  eprint       = {1412.6296},
  timestamp    = {Thu, 25 Jul 2019 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/DaiW14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% arXiv (CoRR) record; DBLP:conf/iccv/DaiHS15 has the same title and authors.
% {BoxSup} is braced so that sentence-casing styles do not render it as "Boxsup".
@article{DBLP:journals/corr/DaiH015,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Jian Sun},
  title        = {{BoxSup}: Exploiting Bounding Boxes to Supervise Convolutional Networks
                  for Semantic Segmentation},
  journal      = {CoRR},
  volume       = {abs/1503.01640},
  year         = {2015},
  url          = {http://arxiv.org/abs/1503.01640},
  eprinttype   = {arXiv},
  eprint       = {1503.01640},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/DaiH015.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% arXiv (CoRR) record; DBLP:conf/cvpr/DaiHS16 has the same authors and (up to case) the same title.
@article{DBLP:journals/corr/DaiHS15,
  author = {Dai, Jifeng and He, Kaiming and Sun, Jian},
  title = {Instance-aware Semantic Segmentation via Multi-task Network Cascades},
  journal = {CoRR},
  volume = {abs/1512.04412},
  year = {2015},
  url = {http://arxiv.org/abs/1512.04412},
  eprinttype = {arXiv},
  eprint = {1512.04412},
  timestamp = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/DaiHS15.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% CVPR 2014 paper.
@inproceedings{DBLP:conf/cvpr/DaiHHZW14,
  author = {Dai, Jifeng and Hong, Yi and Hu, Wenze and Zhu, Song{-}Chun and Wu, Ying Nian},
  title = {Unsupervised Learning of Dictionaries of Hierarchical Compositional Models},
  booktitle = {2014 {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2014, Columbus, OH, USA, June 23-28, 2014},
  pages = {2505--2512},
  publisher = {{IEEE} Computer Society},
  year = {2014},
  url = {https://doi.org/10.1109/CVPR.2014.321},
  doi = {10.1109/CVPR.2014.321},
  timestamp = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/cvpr/DaiHHZW14.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv (CoRR) record; DBLP:conf/cvpr/DaiH015 has the same authors and (up to case) the same title.
@article{DBLP:journals/corr/DaiH014,
  author = {Dai, Jifeng and He, Kaiming and Sun, Jian},
  title = {Convolutional Feature Masking for Joint Object and Stuff Segmentation},
  journal = {CoRR},
  volume = {abs/1412.1283},
  year = {2014},
  url = {http://arxiv.org/abs/1412.1283},
  eprinttype = {arXiv},
  eprint = {1412.1283},
  timestamp = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/corr/DaiH014.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% ICCV 2013 paper.
@inproceedings{DBLP:conf/iccv/DaiWZZ13,
  author = {Dai, Jifeng and Wu, Ying Nian and Zhou, Jie and Zhu, Song{-}Chun},
  title = {Cosegmentation and Cosketch by Unsupervised Learning},
  booktitle = {{IEEE} International Conference on Computer Vision, {ICCV} 2013, Sydney, Australia, December 1-8, 2013},
  pages = {1305--1312},
  publisher = {{IEEE} Computer Society},
  year = {2013},
  url = {https://doi.org/10.1109/ICCV.2013.165},
  doi = {10.1109/ICCV.2013.165},
  timestamp = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/iccv/DaiWZZ13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% Journal article, IEEE TPAMI 34(8), 2012.
@article{DBLP:journals/pami/DaiFZ12,
  author = {Dai, Jifeng and Feng, Jianjiang and Zhou, Jie},
  title = {Robust and Efficient Ridge-Based Palmprint Matching},
  journal = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume = {34},
  number = {8},
  pages = {1618--1632},
  year = {2012},
  url = {https://doi.org/10.1109/TPAMI.2011.237},
  doi = {10.1109/TPAMI.2011.237},
  timestamp = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/pami/DaiFZ12.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% ICPR 2012 paper. Title stored in Title Case so styles can downcase it if they wish;
% capitals lost in the database cannot be restored by a style.
@inproceedings{DBLP:conf/icpr/DaiFZ12,
  author       = {Jifeng Dai and
                  Jianjiang Feng and
                  Jie Zhou},
  title        = {Mining Sub-Categories for Object Detection},
  booktitle    = {Proceedings of the 21st International Conference on Pattern Recognition,
                  {ICPR} 2012, Tsukuba, Japan, November 11-15, 2012},
  pages        = {3260--3263},
  publisher    = {{IEEE} Computer Society},
  year         = {2012},
  url          = {https://ieeexplore.ieee.org/document/6460860/},
  timestamp    = {Tue, 10 Aug 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icpr/DaiFZ12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

% Journal article, IEEE TPAMI 33(5), 2011.
@article{DBLP:journals/pami/DaiZ11,
  author = {Dai, Jifeng and Zhou, Jie},
  title = {Multifeature-Based High-Resolution Palmprint Recognition},
  journal = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume = {33},
  number = {5},
  pages = {945--957},
  year = {2011},
  url = {https://doi.org/10.1109/TPAMI.2010.164},
  doi = {10.1109/TPAMI.2010.164},
  timestamp = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/pami/DaiZ11.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

manage site settings

To protect your privacy, all features that rely on external API calls from your browser are turned off by default. You need to opt in for them to become active. All settings here will be stored as cookies in your web browser. For more information, see our F.A.Q.