default search action
BibTeX records: Jifeng Dai
@inproceedings{DBLP:conf/aaai/ZhangDHQDH26,
author = {Tianyi Zhang and
Haonan Duan and
Haoran Hao and
Yu Qiao and
Jifeng Dai and
Zhi Hou},
editor = {Sven Koenig and
Chad Jenkins and
Matthew E. Taylor},
title = {Grounding Actions in Camera Space: Observation-Centric Vision-Language-Action
Policy},
booktitle = {Fortieth {AAAI} Conference on Artificial Intelligence, Thirty-Eighth
Conference on Innovative Applications of Artificial Intelligence,
Sixteenth Symposium on Educational Advances in Artificial Intelligence,
{AAAI} 2026, Singapore, January 20-27, 2026},
pages = {18782--18790},
publisher = {{AAAI} Press},
year = {2026},
url = {https://doi.org/10.1609/aaai.v40i22.38947},
doi = {10.1609/AAAI.V40I22.38947},
timestamp = {Fri, 27 Mar 2026 00:00:00 +0100},
biburl = {https://dblp.org/rec/conf/aaai/ZhangDHQDH26.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2602-22808,
author = {Shiqian Su and
Sen Xing and
Xuan Dong and
Muyan Zhong and
Bin Wang and
Xizhou Zhu and
Yuntao Chen and
Wenhai Wang and
Yue Deng and
Pengxiang Zhu and
Ziyuan Liu and
Tiantong Li and
Jiaheng Yu and
Zhe Chen and
Lidong Bing and
Jifeng Dai},
title = {MiroFlow: Towards High-Performance and Robust Open-Source Agent Framework
for General Deep Research Tasks},
journal = {CoRR},
volume = {abs/2602.22808},
year = {2026},
url = {https://doi.org/10.48550/arXiv.2602.22808},
doi = {10.48550/ARXIV.2602.22808},
eprinttype = {arXiv},
eprint = {2602.22808},
timestamp = {Sun, 29 Mar 2026 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2602-22808.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2605-12622,
author = {Pengfei Jing and
Victor Shea{-}Jay Huang and
Hengtong Lu and
Jifeng Dai and
Yan Xie and
Benjin Zhu},
title = {Action Emergence from Streaming Intent},
journal = {CoRR},
volume = {abs/2605.12622},
year = {2026},
url = {https://doi.org/10.48550/arXiv.2605.12622},
doi = {10.48550/ARXIV.2605.12622},
eprinttype = {arXiv},
eprint = {2605.12622},
timestamp = {Tue, 09 Jun 2026 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2605-12622.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2605-12624,
author = {Yuzhou Huang and
Benjin Zhu and
Hengtong Lu and
Victor Shea{-}Jay Huang and
Haiming Zhang and
Wei Chen and
Jifeng Dai and
Yan Xie and
Hongsheng Li},
title = {MindVLA-U1: {VLA} Beats {VA} with Unified Streaming Architecture for
Autonomous Driving},
journal = {CoRR},
volume = {abs/2605.12624},
year = {2026},
url = {https://doi.org/10.48550/arXiv.2605.12624},
doi = {10.48550/ARXIV.2605.12624},
eprinttype = {arXiv},
eprint = {2605.12624},
timestamp = {Tue, 09 Jun 2026 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2605-12624.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2605-12625,
author = {Hengtong Lu and
Victor Shea{-}Jay Huang and
Chengmin Yang and
Pengfei Jing and
Jifeng Dai and
Yan Xie and
Benjin Zhu},
title = {Driving Intents Amplify Planning-Oriented Reinforcement Learning},
journal = {CoRR},
volume = {abs/2605.12625},
year = {2026},
url = {https://doi.org/10.48550/arXiv.2605.12625},
doi = {10.48550/ARXIV.2605.12625},
eprinttype = {arXiv},
eprint = {2605.12625},
timestamp = {Tue, 09 Jun 2026 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2605-12625.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/csur/SunZXLCQXDLGWWCYRFHYLLL25,
author = {Jiankai Sun and
Chuanyang Zheng and
Enze Xie and
Zhengying Liu and
Ruihang Chu and
Jianing Qiu and
Jiaqi Xu and
Mingyu Ding and
Hongyang Li and
Mengzhe Geng and
Yue Wu and
Wenhai Wang and
Junsong Chen and
Zhangyue Yin and
Xiaozhe Ren and
Jie Fu and
Junxian He and
Wu Yuan and
Qi Liu and
Xihui Liu and
Yu Li and
Hao Dong and
Yu Cheng and
Ming Zhang and
Pheng{-}Ann Heng and
Jifeng Dai and
Ping Luo and
Jingdong Wang and
Ji{-}Rong Wen and
Xipeng Qiu and
Yike Guo and
Hui Xiong and
Qun Liu and
Zhenguo Li},
title = {A Survey of Reasoning with Foundation Models: Concepts, Methodologies,
and Outlook},
journal = {{ACM} Comput. Surv.},
volume = {57},
number = {11},
pages = {278:1--278:43},
year = {2025},
url = {https://doi.org/10.1145/3729218},
doi = {10.1145/3729218},
timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/csur/SunZXLCQXDLGWWCYRFHYLLL25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/pami/LiWLXSLQD25,
author = {Zhiqi Li and
Wenhai Wang and
Hongyang Li and
Enze Xie and
Chonghao Sima and
Tong Lu and
Yu Qiao and
Jifeng Dai},
title = {BEVFormer: Learning Bird's-Eye-View Representation From LiDAR-Camera
via Spatiotemporal Transformers},
journal = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
volume = {47},
number = {3},
pages = {2020--2036},
year = {2025},
url = {https://doi.org/10.1109/TPAMI.2024.3515454},
doi = {10.1109/TPAMI.2024.3515454},
timestamp = {Wed, 19 Nov 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/pami/LiWLXSLQD25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/pami/HuSWWXWZLZWQD25,
  author       = {Xiaowei Hu and
                  Min Shi and
                  Weiyun Wang and
                  Sitong Wu and
                  Linjie Xing and
                  Wenhai Wang and
                  Xizhou Zhu and
                  Lewei Lu and
                  Jie Zhou and
                  Xiaogang Wang and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {Demystify Transformers {\&} Convolutions in Modern Image Deep
                  Networks},
  journal      = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume       = {47},
  number       = {4},
  pages        = {2416--2428},
  year         = {2025},
  url          = {https://doi.org/10.1109/TPAMI.2024.3520508},
  doi          = {10.1109/TPAMI.2024.3520508},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/pami/HuSWWXWZLZWQD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/pami/ChenFGZD25,
author = {Linwei Chen and
Ying Fu and
Lin Gu and
Dezhi Zheng and
Jifeng Dai},
title = {Spatial Frequency Modulation for Semantic Segmentation},
journal = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
volume = {47},
number = {11},
pages = {9767--9784},
year = {2025},
url = {https://doi.org/10.1109/TPAMI.2025.3592621},
doi = {10.1109/TPAMI.2025.3592621},
timestamp = {Wed, 15 Oct 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/pami/ChenFGZD25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/pami/WangZYLLTDGLQD25,
author = {Zhaokai Wang and
Xizhou Zhu and
Xue Yang and
Gen Luo and
Hao Li and
Changyao Tian and
Wenhan Dou and
Junqi Ge and
Lewei Lu and
Yu Qiao and
Jifeng Dai},
title = {Parameter-Inverted Image Pyramid Networks for Visual Perception and
Multimodal Understanding},
journal = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
volume = {47},
number = {11},
pages = {10142--10159},
year = {2025},
url = {https://doi.org/10.1109/TPAMI.2025.3593283},
doi = {10.1109/TPAMI.2025.3593283},
timestamp = {Wed, 15 Oct 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/pami/WangZYLLTDGLQD25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/visintelligence/CuiWLXZDLLZD25,
author = {Erfei Cui and
Wenhai Wang and
Zhiqi Li and
Jiangwei Xie and
Haoming Zou and
Hanming Deng and
Gen Luo and
Lewei Lu and
Xizhou Zhu and
Jifeng Dai},
title = {DriveMLM: aligning multi-modal large language models with behavioral
planning states for autonomous driving},
journal = {Vis. Intell.},
volume = {3},
number = {1},
year = {2025},
url = {https://doi.org/10.1007/s44267-025-00095-w},
doi = {10.1007/S44267-025-00095-W},
timestamp = {Sat, 21 Mar 2026 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/visintelligence/CuiWLXZDLLZD25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/DuanCHWYSLHLLDW25,
author = {Yuchen Duan and
Zhe Chen and
Yusong Hu and
Weiyun Wang and
Shenglong Ye and
Botian Shi and
Lewei Lu and
Qibin Hou and
Tong Lu and
Hongsheng Li and
Jifeng Dai and
Wenhai Wang},
title = {Docopilot: Improving Multimodal Models for Document-Level Understanding},
booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2025, Nashville, TN, USA, June 11-15, 2025},
pages = {4026--4037},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2025},
url = {https://openaccess.thecvf.com/content/CVPR2025/html/Duan\_Docopilot\_Improving\_Multimodal\_Models\_for\_Document-Level\_Understanding\_CVPR\_2025\_paper.html},
doi = {10.1109/CVPR52734.2025.00381},
timestamp = {Sun, 04 Jan 2026 00:00:00 +0100},
biburl = {https://dblp.org/rec/conf/cvpr/DuanCHWYSLHLLDW25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/NanLD025,
author = {Zhixiong Nan and
Xianghong Li and
Jifeng Dai and
Tao Xiang},
title = {{MI-DETR:} An Object Detection Model with Multi-time Inquiries Mechanism},
booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2025, Nashville, TN, USA, June 11-15, 2025},
pages = {4703--4712},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2025},
url = {https://openaccess.thecvf.com/content/CVPR2025/html/Nan\_MI-DETR\_An\_Object\_Detection\_Model\_with\_Multi-time\_Inquiries\_Mechanism\_CVPR\_2025\_paper.html},
doi = {10.1109/CVPR52734.2025.00443},
timestamp = {Sat, 06 Sep 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/cvpr/NanLD025.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/TaoSZZ0LWL00D25,
author = {Chenxin Tao and
Shiqian Su and
Xizhou Zhu and
Chenyu Zhang and
Zhe Chen and
Jiawen Liu and
Wenhai Wang and
Lewei Lu and
Gao Huang and
Yu Qiao and
Jifeng Dai},
title = {HoVLE: Unleashing the Power of Monolithic Vision-Language Models with
Holistic Vision-Language Embedding},
booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2025, Nashville, TN, USA, June 11-15, 2025},
pages = {14559--14569},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2025},
url = {https://openaccess.thecvf.com/content/CVPR2025/html/Tao\_HoVLE\_Unleashing\_the\_Power\_of\_Monolithic\_Vision-Language\_Models\_with\_Holistic\_CVPR\_2025\_paper.html},
doi = {10.1109/CVPR52734.2025.01357},
timestamp = {Wed, 20 Aug 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/cvpr/TaoSZZ0LWL00D25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/YangDZ0W00WLD25,
author = {Chenyu Yang and
Xuan Dong and
Xizhou Zhu and
Weijie Su and
Jiahao Wang and
Hao Tian and
Zhe Chen and
Wenhai Wang and
Lewei Lu and
Jifeng Dai},
title = {{PVC:} Progressive Visual Token Compression for Unified Image and
Video Processing in Large Vision-Language Models},
booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2025, Nashville, TN, USA, June 11-15, 2025},
pages = {24939--24949},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2025},
url = {https://openaccess.thecvf.com/content/CVPR2025/html/Yang\_PVC\_Progressive\_Visual\_Token\_Compression\_for\_Unified\_Image\_and\_Video\_CVPR\_2025\_paper.html},
doi = {10.1109/CVPR52734.2025.02322},
timestamp = {Wed, 20 Aug 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/cvpr/YangDZ0W00WLD25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/LuoYDWLD0Z25,
author = {Gen Luo and
Xue Yang and
Wenhan Dou and
Zhaokai Wang and
Jiawen Liu and
Jifeng Dai and
Yu Qiao and
Xizhou Zhu},
title = {Mono-InternVL: Pushing the Boundaries of Monolithic Multimodal Large
Language Models with Endogenous Visual Pre-training},
booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2025, Nashville, TN, USA, June 11-15, 2025},
pages = {24960--24971},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2025},
url = {https://openaccess.thecvf.com/content/CVPR2025/html/Luo\_Mono-InternVL\_Pushing\_the\_Boundaries\_of\_Monolithic\_Multimodal\_Large\_Language\_Models\_CVPR\_2025\_paper.html},
doi = {10.1109/CVPR52734.2025.02324},
timestamp = {Wed, 20 Aug 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/cvpr/LuoYDWLD0Z25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/LiTSZWZDWLLD25,
author = {Hao Li and
Changyao Tian and
Jie Shao and
Xizhou Zhu and
Zhaokai Wang and
Jinguo Zhu and
Wenhan Dou and
Xiao{-}Gang Wang and
Hongsheng Li and
Lewei Lu and
Jifeng Dai},
title = {SynerGen-VL: Towards Synergistic Image Understanding and Generation
with Vision Experts and Token Folding},
booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2025, Nashville, TN, USA, June 11-15, 2025},
pages = {29767--29779},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2025},
url = {https://openaccess.thecvf.com/content/CVPR2025/html/Li\_SynerGen-VL\_Towards\_Synergistic\_Image\_Understanding\_and\_Generation\_with\_Vision\_Experts\_CVPR\_2025\_paper.html},
doi = {10.1109/CVPR52734.2025.02771},
timestamp = {Wed, 20 Aug 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/cvpr/LiTSZWZDWLLD25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/HouZXDPTZZQDC25,
author = {Zhi Hou and
Tianyi Zhang and
Yuwen Xiong and
Haonan Duan and
Hengjun Pu and
Ronglei Tong and
Chengyang Zhao and
Xizhou Zhu and
Yu Qiao and
Jifeng Dai and
Yuntao Chen},
title = {Dita: Scaling Diffusion Transformer for Generalist Vision-Language-Action
Policy},
booktitle = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2025,
Honolulu, HI, USA, October 19-25, 2025},
pages = {7686--7697},
publisher = {{IEEE}},
year = {2025},
url = {https://doi.org/10.1109/ICCV51701.2025.00721},
doi = {10.1109/ICCV51701.2025.00721},
timestamp = {Wed, 13 May 2026 10:42:14 +0200},
biburl = {https://dblp.org/rec/conf/iccv/HouZXDPTZZQDC25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/FangDWLHTZZDLL25,
author = {Rongyao Fang and
Chengqi Duan and
Kun Wang and
Hao Li and
Linjiang Huang and
Hao Tian and
Xingyu Zeng and
Rui Zhao and
Jifeng Dai and
Hongsheng Li and
Xihui Liu},
title = {{PUMA:} Empowering Unified {MLLM} with Multi-Granular Visual Generation},
booktitle = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2025,
Honolulu, HI, USA, October 19-25, 2025},
pages = {15447--15457},
publisher = {{IEEE}},
year = {2025},
url = {https://doi.org/10.1109/ICCV51701.2025.01433},
doi = {10.1109/ICCV51701.2025.01433},
timestamp = {Wed, 13 May 2026 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iccv/FangDWLHTZZDLL25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/GeCLZLDZ25,
author = {Junqi Ge and
Ziyi Chen and
Jintao Lin and
Jinguo Zhu and
Xihui Liu and
Jifeng Dai and
Xizhou Zhu},
title = {{V2PE:} Improving Multimodal Long-Context Capability of Vision-Language
Models with Variable Visual Position Encoding},
booktitle = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2025,
Honolulu, HI, USA, October 19-25, 2025},
pages = {21070--21084},
publisher = {{IEEE}},
year = {2025},
url = {https://doi.org/10.1109/ICCV51701.2025.01958},
doi = {10.1109/ICCV51701.2025.01958},
timestamp = {Wed, 13 May 2026 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iccv/GeCLZLDZ25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/LiaoNMLTDXLZYDC25,
author = {Jiaqi Liao and
Yuwei Niu and
Fanqing Meng and
Hao Li and
Changyao Tian and
Yinuo Du and
Yuwen Xiong and
Dianqi Li and
Xizhou Zhu and
Li Yuan and
Jifeng Dai and
Yu Cheng},
title = {LangBridge: Interpreting Image as a Combination of Language Embeddings},
booktitle = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2025,
Honolulu, HI, USA, October 19-25, 2025},
pages = {23752--23762},
publisher = {{IEEE}},
year = {2025},
url = {https://doi.org/10.1109/ICCV51701.2025.02205},
doi = {10.1109/ICCV51701.2025.02205},
timestamp = {Wed, 13 May 2026 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iccv/LiaoNMLTDXLZYDC25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/DuanWCZLLQ0DW25,
author = {Yuchen Duan and
Weiyun Wang and
Zhe Chen and
Xizhou Zhu and
Lewei Lu and
Tong Lu and
Yu Qiao and
Hongsheng Li and
Jifeng Dai and
Wenhai Wang},
title = {Vision-RWKV: Efficient and Scalable Visual Perception with RWKV-Like
Architectures},
booktitle = {The Thirteenth International Conference on Learning Representations,
{ICLR} 2025, Singapore, April 24-28, 2025},
publisher = {OpenReview.net},
year = {2025},
url = {https://openreview.net/forum?id=nGiGXLnKhl},
timestamp = {Thu, 20 Nov 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/conf/iclr/DuanWCZLLQ0DW25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/MengW0L0YLZD00Z25,
author = {Fanqing Meng and
Jin Wang and
Chuanhao Li and
Quanfeng Lu and
Hao Tian and
Tianshuo Yang and
Jiaqi Liao and
Xizhou Zhu and
Jifeng Dai and
Yu Qiao and
Ping Luo and
Kaipeng Zhang and
Wenqi Shao},
title = {{MMIU:} Multimodal Multi-image Understanding for Evaluating Large
Vision-Language Models},
booktitle = {The Thirteenth International Conference on Learning Representations,
{ICLR} 2025, Singapore, April 24-28, 2025},
publisher = {OpenReview.net},
year = {2025},
url = {https://openreview.net/forum?id=WsgEWL8i0K},
timestamp = {Thu, 10 Jul 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iclr/MengW0L0YLZD00Z25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/SiW0XLDQ0025,
author = {Chongjie Si and
Xuehui Wang and
Xue Yang and
Zhengqin Xu and
Qingyun Li and
Jifeng Dai and
Yu Qiao and
Xiaokang Yang and
Wei Shen},
title = {Maintaining Structural Integrity in Parameter Spaces for Parameter
Efficient Fine-tuning},
booktitle = {The Thirteenth International Conference on Learning Representations,
{ICLR} 2025, Singapore, April 24-28, 2025},
publisher = {OpenReview.net},
year = {2025},
url = {https://openreview.net/forum?id=OALIb8oNfl},
timestamp = {Fri, 16 May 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iclr/SiW0XLDQ0025.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icml/Liu0ZWD25,
  author       = {Shi Liu and
                  Weijie Su and
                  Xizhou Zhu and
                  Wenhai Wang and
                  Jifeng Dai},
  editor       = {Aarti Singh and
                  Maryam Fazel and
                  Daniel Hsu and
                  Simon Lacoste{-}Julien and
                  Felix Berkenkamp and
                  Tegan Maharaj and
                  Kiri Wagstaff and
                  Jerry Zhu},
  title        = {CoMemo: LVLMs Need Image Context with Image Memory},
  booktitle    = {Forty-second International Conference on Machine Learning, {ICML}
                  2025, Vancouver, BC, Canada, July 13-19, 2025},
  series       = {Proceedings of Machine Learning Research},
  volume       = {267},
  publisher    = {{PMLR} / OpenReview.net},
  year         = {2025},
  url          = {https://proceedings.mlr.press/v267/liu25bn.html},
  timestamp    = {Wed, 04 Feb 2026 16:54:16 +0100},
  biburl       = {https://dblp.org/rec/conf/icml/Liu0ZWD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icml/XingZLLLWDW25,
  author       = {Sen Xing and
                  Muyan Zhong and
                  Zeqiang Lai and
                  Liangchen Li and
                  Jiawen Liu and
                  Yaohui Wang and
                  Jifeng Dai and
                  Wenhai Wang},
  editor       = {Aarti Singh and
                  Maryam Fazel and
                  Daniel Hsu and
                  Simon Lacoste{-}Julien and
                  Felix Berkenkamp and
                  Tegan Maharaj and
                  Kiri Wagstaff and
                  Jerry Zhu},
  title        = {MuLan: Adapting Multilingual Diffusion Models for Hundreds of Languages
                  with Negligible Cost},
  booktitle    = {Forty-second International Conference on Machine Learning, {ICML}
                  2025, Vancouver, BC, Canada, July 13-19, 2025},
  series       = {Proceedings of Machine Learning Research},
  volume       = {267},
  publisher    = {{PMLR} / OpenReview.net},
  year         = {2025},
  url          = {https://proceedings.mlr.press/v267/xing25d.html},
  timestamp    = {Wed, 04 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icml/XingZLLLWDW25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2501-07783,
author = {Zhaokai Wang and
Xizhou Zhu and
Xue Yang and
Gen Luo and
Hao Li and
Changyao Tian and
Wenhan Dou and
Junqi Ge and
Lewei Lu and
Yu Qiao and
Jifeng Dai},
title = {Parameter-Inverted Image Pyramid Networks for Visual Perception and
Multimodal Understanding},
journal = {CoRR},
volume = {abs/2501.07783},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2501.07783},
doi = {10.48550/ARXIV.2501.07783},
eprinttype = {arXiv},
eprint = {2501.07783},
timestamp = {Mon, 24 Feb 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2501-07783.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2503-01463,
author = {Zhixiong Nan and
Xianghong Li and
Jifeng Dai and
Tao Xiang},
title = {{MI-DETR:} An Object Detection Model with Multi-time Inquiries Mechanism},
journal = {CoRR},
volume = {abs/2503.01463},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2503.01463},
doi = {10.48550/ARXIV.2503.01463},
eprinttype = {arXiv},
eprint = {2503.01463},
timestamp = {Tue, 08 Apr 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2503-01463.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2503-10291,
author = {Weiyun Wang and
Zhangwei Gao and
Lianjie Chen and
Zhe Chen and
Jinguo Zhu and
Xiangyu Zhao and
Yangzhou Liu and
Yue Cao and
Shenglong Ye and
Xizhou Zhu and
Lewei Lu and
Haodong Duan and
Yu Qiao and
Jifeng Dai and
Wenhai Wang},
title = {VisualPRM: An Effective Process Reward Model for Multimodal Reasoning},
journal = {CoRR},
volume = {abs/2503.10291},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2503.10291},
doi = {10.48550/ARXIV.2503.10291},
eprinttype = {arXiv},
eprint = {2503.10291},
timestamp = {Sun, 13 Apr 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2503-10291.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2503-10639,
author = {Rongyao Fang and
Chengqi Duan and
Kun Wang and
Linjiang Huang and
Hao Li and
Shilin Yan and
Hao Tian and
Xingyu Zeng and
Rui Zhao and
Jifeng Dai and
Xihui Liu and
Hongsheng Li},
title = {GoT: Unleashing Reasoning Capability of Multimodal Large Language
Model for Visual Generation and Editing},
journal = {CoRR},
volume = {abs/2503.10639},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2503.10639},
doi = {10.48550/ARXIV.2503.10639},
eprinttype = {arXiv},
eprint = {2503.10639},
timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2503-10639.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2503-19404,
author = {Jiaqi Liao and
Yuwei Niu and
Fanqing Meng and
Hao Li and
Changyao Tian and
Yinuo Du and
Yuwen Xiong and
Dianqi Li and
Xizhou Zhu and
Li Yuan and
Jifeng Dai and
Yu Cheng},
title = {LangBridge: Interpreting Image as a Combination of Language Embeddings},
journal = {CoRR},
volume = {abs/2503.19404},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2503.19404},
doi = {10.48550/ARXIV.2503.19404},
eprinttype = {arXiv},
eprint = {2503.19404},
timestamp = {Wed, 23 Apr 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2503-19404.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2503-19757,
author = {Zhi Hou and
Tianyi Zhang and
Yuwen Xiong and
Haonan Duan and
Hengjun Pu and
Ronglei Tong and
Chengyang Zhao and
Xizhou Zhu and
Yu Qiao and
Jifeng Dai and
Yuntao Chen},
title = {Dita: Scaling Diffusion Transformer for Generalist Vision-Language-Action
Policy},
journal = {CoRR},
volume = {abs/2503.19757},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2503.19757},
doi = {10.48550/ARXIV.2503.19757},
eprinttype = {arXiv},
eprint = {2503.19757},
timestamp = {Sat, 31 May 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2503-19757.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2504-10479,
author = {Jinguo Zhu and
Weiyun Wang and
Zhe Chen and
Zhaoyang Liu and
Shenglong Ye and
Lixin Gu and
Hao Tian and
Yuchen Duan and
Weijie Su and
Jie Shao and
Zhangwei Gao and
Erfei Cui and
Xuehui Wang and
Yue Cao and
Yangzhou Liu and
Xingguang Wei and
Hongjie Zhang and
Haomin Wang and
Weiye Xu and
Hao Li and
Jiahao Wang and
Nianchen Deng and
Songze Li and
Yinan He and
Tan Jiang and
Jiapeng Luo and
Yi Wang and
Conghui He and
Botian Shi and
Xingcheng Zhang and
Wenqi Shao and
Junjun He and
Yingtong Xiong and
Wenwen Qu and
Peng Sun and
Penglong Jiao and
Han Lv and
Lijun Wu and
Kaipeng Zhang and
Huipeng Deng and
Jiaye Ge and
Kai Chen and
Limin Wang and
Min Dou and
Lewei Lu and
Xizhou Zhu and
Tong Lu and
Dahua Lin and
Yu Qiao and
Jifeng Dai and
Wenhai Wang},
title = {InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source
Multimodal Models},
journal = {CoRR},
volume = {abs/2504.10479},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2504.10479},
doi = {10.48550/ARXIV.2504.10479},
eprinttype = {arXiv},
eprint = {2504.10479},
timestamp = {Sun, 01 Feb 2026 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2504-10479.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2504-15279,
author = {Weiye Xu and
Jiahao Wang and
Weiyun Wang and
Zhe Chen and
Wengang Zhou and
Aijun Yang and
Lewei Lu and
Houqiang Li and
Xiaohua Wang and
Xizhou Zhu and
Wenhai Wang and
Jifeng Dai and
Jinguo Zhu},
title = {VisuLogic: {A} Benchmark for Evaluating Visual Reasoning in Multi-modal
Large Language Models},
journal = {CoRR},
volume = {abs/2504.15279},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2504.15279},
doi = {10.48550/ARXIV.2504.15279},
eprinttype = {arXiv},
eprint = {2504.15279},
timestamp = {Sun, 25 May 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2504-15279.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2505-04623,
author = {Zhenghao Xing and
Xiaowei Hu and
Chi{-}Wing Fu and
Wenhai Wang and
Jifeng Dai and
Pheng{-}Ann Heng},
title = {EchoInk-R1: Exploring Audio-Visual Reasoning in Multimodal LLMs via
Reinforcement Learning},
journal = {CoRR},
volume = {abs/2505.04623},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2505.04623},
doi = {10.48550/ARXIV.2505.04623},
eprinttype = {arXiv},
eprint = {2505.04623},
timestamp = {Sun, 29 Jun 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2505-04623.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2505-17011,
author = {Yan Li and
Changyao Tian and
Renqiu Xia and
Ning Liao and
Weiwei Guo and
Junchi Yan and
Hongsheng Li and
Jifeng Dai and
Hao Li and
Xue Yang},
title = {Learning Adaptive and Temporally Causal Video Tokenization in a 1D
Latent Space},
journal = {CoRR},
volume = {abs/2505.17011},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2505.17011},
doi = {10.48550/ARXIV.2505.17011},
eprinttype = {arXiv},
eprint = {2505.17011},
timestamp = {Sun, 29 Jun 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2505-17011.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2505-23395,
author = {Xingguang Wei and
Haomin Wang and
Shenglong Ye and
Ruifeng Luo and
Yanting Zhang and
Lixin Gu and
Jifeng Dai and
Yu Qiao and
Wenhai Wang and
Hongjie Zhang},
title = {Point or Line? Using Line-based Representation for Panoptic Symbol
Spotting in {CAD} Drawings},
journal = {CoRR},
volume = {abs/2505.23395},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2505.23395},
doi = {10.48550/ARXIV.2505.23395},
eprinttype = {arXiv},
eprint = {2505.23395},
timestamp = {Fri, 04 Jul 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2505-23395.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2505-23762,
author = {Chenyu Yang and
Shiqian Su and
Shi Liu and
Xuan Dong and
Yue Yu and
Weijie Su and
Xuehui Wang and
Zhaoyang Liu and
Jinguo Zhu and
Hao Li and
Wenhai Wang and
Yu Qiao and
Xizhou Zhu and
Jifeng Dai},
title = {ZeroGUI: Automating Online {GUI} Learning at Zero Human Cost},
journal = {CoRR},
volume = {abs/2505.23762},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2505.23762},
doi = {10.48550/ARXIV.2505.23762},
eprinttype = {arXiv},
eprint = {2505.23762},
timestamp = {Wed, 17 Dec 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2505-23762.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2506-00123,
author = {Gen Luo and
Ganlin Yang and
Ziyang Gong and
Guanzhou Chen and
Haonan Duan and
Erfei Cui and
Ronglei Tong and
Zhi Hou and
Tianyi Zhang and
Zhe Chen and
Shenglong Ye and
Lewei Lu and
Jingbo Wang and
Wenhai Wang and
Jifeng Dai and
Yu Qiao and
Rongrong Ji and
Xizhou Zhu},
title = {Visual Embodied Brain: Let Multimodal Large Language Models See, Think,
and Control in Spaces},
journal = {CoRR},
volume = {abs/2506.00123},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2506.00123},
doi = {10.48550/ARXIV.2506.00123},
eprinttype = {arXiv},
eprint = {2506.00123},
timestamp = {Sun, 07 Dec 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2506-00123.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2506-04217,
author = {Junting Chen and
Haotian Liang and
Lingxiao Du and
Weiyun Wang and
Mengkang Hu and
Yao Mu and
Wenhai Wang and
Jifeng Dai and
Ping Luo and
Wenqi Shao and
Lin Shao},
title = {OWMM-Agent: Open World Mobile Manipulation With Multi-modal Agentic
Data Synthesis},
journal = {CoRR},
volume = {abs/2506.04217},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2506.04217},
doi = {10.48550/ARXIV.2506.04217},
eprinttype = {arXiv},
eprint = {2506.04217},
timestamp = {Sun, 06 Jul 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2506-04217.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2506-06279,
author = {Shi Liu and
Weijie Su and
Xizhou Zhu and
Wenhai Wang and
Jifeng Dai},
title = {CoMemo: LVLMs Need Image Context with Image Memory},
journal = {CoRR},
volume = {abs/2506.06279},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2506.06279},
doi = {10.48550/ARXIV.2506.06279},
eprinttype = {arXiv},
eprint = {2506.06279},
timestamp = {Mon, 07 Jul 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2506-06279.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2506-18385,
author = {Nianchen Deng and
Lixin Gu and
Shenglong Ye and
Yinan He and
Zhe Chen and
Songze Li and
Haomin Wang and
Xingguang Wei and
Tianshuo Yang and
Min Dou and
Tong He and
Wenqi Shao and
Kaipeng Zhang and
Yi Wang and
Botian Shi and
Yanting Zhang and
Jifeng Dai and
Yu Qiao and
Hongjie Zhang and
Wenhai Wang},
title = {InternSpatial: {A} Comprehensive Dataset for Spatial Reasoning in
Vision-Language Models},
journal = {CoRR},
volume = {abs/2506.18385},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2506.18385},
doi = {10.48550/ARXIV.2506.18385},
eprinttype = {arXiv},
eprint = {2506.18385},
timestamp = {Sun, 01 Feb 2026 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2506-18385.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2507-11893,
  author     = {Linwei Chen and
                Ying Fu and
                Lin Gu and
                Dezhi Zheng and
                Jifeng Dai},
  title      = {Spatial Frequency Modulation for Semantic Segmentation},
  journal    = {CoRR},
  volume     = {abs/2507.11893},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2507.11893},
  doi        = {10.48550/ARXIV.2507.11893},
  eprinttype = {arXiv},
  eprint     = {2507.11893},
  timestamp  = {Wed, 20 May 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2507-11893.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2507-12566,
  author     = {Gen Luo and
                Wenhan Dou and
                Wenhao Li and
                Zhaokai Wang and
                Xue Yang and
                Changyao Tian and
                Hao Li and
                Weiyun Wang and
                Wenhai Wang and
                Xizhou Zhu and
                Yu Qiao and
                Jifeng Dai},
  title      = {{Mono-InternVL-1.5}: Towards Cheaper and Faster Monolithic Multimodal Large Language Models},
  journal    = {CoRR},
  volume     = {abs/2507.12566},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2507.12566},
  doi        = {10.48550/ARXIV.2507.12566},
  eprinttype = {arXiv},
  eprint     = {2507.12566},
  timestamp  = {Mon, 18 Aug 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2507-12566.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2507-12841,
  author     = {Yiming Ren and
                Zhiqiang Lin and
                Yu Li and
                Gao Meng and
                Weiyun Wang and
                Junjie Wang and
                Zicheng Lin and
                Jifeng Dai and
                Yujiu Yang and
                Wenhai Wang and
                Ruihang Chu},
  title      = {{AnyCap} Project: {A} Unified Framework, Dataset, and Benchmark for Controllable Omni-modal Captioning},
  journal    = {CoRR},
  volume     = {abs/2507.12841},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2507.12841},
  doi        = {10.48550/ARXIV.2507.12841},
  eprinttype = {arXiv},
  eprint     = {2507.12841},
  timestamp  = {Thu, 23 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2507-12841.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2507-14675,
  author     = {Yuchen Duan and
                Zhe Chen and
                Yusong Hu and
                Weiyun Wang and
                Shenglong Ye and
                Botian Shi and
                Lewei Lu and
                Qibin Hou and
                Tong Lu and
                Hongsheng Li and
                Jifeng Dai and
                Wenhai Wang},
  title      = {Docopilot: Improving Multimodal Models for Document-Level Understanding},
  journal    = {CoRR},
  volume     = {abs/2507.14675},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2507.14675},
  doi        = {10.48550/ARXIV.2507.14675},
  eprinttype = {arXiv},
  eprint     = {2507.14675},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2507-14675.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2507-19478,
  author     = {Xuehui Wang and
                Zhenyu Wu and
                JingJing Xie and
                Zichen Ding and
                Bowen Yang and
                Zehao Li and
                Zhaoyang Liu and
                Qingyun Li and
                Xuan Dong and
                Zhe Chen and
                Weiyun Wang and
                Xiangyu Zhao and
                Jixuan Chen and
                Haodong Duan and
                Tianbao Xie and
                Chenyu Yang and
                Shiqian Su and
                Yue Yu and
                Yuan Huang and
                Yiqian Liu and
                Xiao Zhang and
                Yanting Zhang and
                Xiangyu Yue and
                Weijie Su and
                Xizhou Zhu and
                Wei Shen and
                Jifeng Dai and
                Wenhai Wang},
  title      = {{MMBench-GUI}: Hierarchical Multi-Platform Evaluation Framework for {GUI} Agents},
  journal    = {CoRR},
  volume     = {abs/2507.19478},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2507.19478},
  doi        = {10.48550/ARXIV.2507.19478},
  eprinttype = {arXiv},
  eprint     = {2507.19478},
  timestamp  = {Wed, 17 Dec 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2507-19478.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2508-13103,
  author     = {Tianyi Zhang and
                Haonan Duan and
                Haoran Hao and
                Yu Qiao and
                Jifeng Dai and
                Zhi Hou},
  title      = {Grounding Actions in Camera Space: Observation-Centric Vision-Language-Action Policy},
  journal    = {CoRR},
  volume     = {abs/2508.13103},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2508.13103},
  doi        = {10.48550/ARXIV.2508.13103},
  eprinttype = {arXiv},
  eprint     = {2508.13103},
  timestamp  = {Sun, 07 Dec 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2508-13103.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2508-18265,
  author     = {Weiyun Wang and
                Zhangwei Gao and
                Lixin Gu and
                Hengjun Pu and
                Long Cui and
                Xingguang Wei and
                Zhaoyang Liu and
                Linglin Jing and
                Shenglong Ye and
                Jie Shao and
                Zhaokai Wang and
                Zhe Chen and
                Hongjie Zhang and
                Ganlin Yang and
                Haomin Wang and
                Qi Wei and
                Jinhui Yin and
                Wenhao Li and
                Erfei Cui and
                Guanzhou Chen and
                Zichen Ding and
                Changyao Tian and
                Zhenyu Wu and
                JingJing Xie and
                Zehao Li and
                Bowen Yang and
                Yuchen Duan and
                Xuehui Wang and
                Zhi Hou and
                Haoran Hao and
                Tianyi Zhang and
                Songze Li and
                Xiangyu Zhao and
                Haodong Duan and
                Nianchen Deng and
                Bin Fu and
                Yinan He and
                Yi Wang and
                Conghui He and
                Botian Shi and
                Junjun He and
                Yingtong Xiong and
                Han Lv and
                Lijun Wu and
                Wenqi Shao and
                Kaipeng Zhang and
                Huipeng Deng and
                Biqing Qi and
                Jiaye Ge and
                Qipeng Guo and
                Wenwei Zhang and
                Songyang Zhang and
                Maosong Cao and
                Junyao Lin and
                Kexian Tang and
                Jianfei Gao and
                Haian Huang and
                Yuzhe Gu and
                Chengqi Lyu and
                Huanze Tang and
                Rui Wang and
                Haijun Lv and
                Wanli Ouyang and
                Limin Wang and
                Min Dou and
                Xizhou Zhu and
                Tong Lu and
                Dahua Lin and
                Jifeng Dai and
                Weijie Su and
                Bowen Zhou and
                Kai Chen and
                Yu Qiao and
                Wenhai Wang and
                Gen Luo},
  title      = {{InternVL3.5}: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency},
  journal    = {CoRR},
  volume     = {abs/2508.18265},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2508.18265},
  doi        = {10.48550/ARXIV.2508.18265},
  eprinttype = {arXiv},
  eprint     = {2508.18265},
  timestamp  = {Tue, 07 Apr 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2508-18265.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2509-14232,
  author     = {Zhaokai Wang and
                Penghao Yin and
                Xiangyu Zhao and
                Changyao Tian and
                Yu Qiao and
                Wenhai Wang and
                Jifeng Dai and
                Gen Luo},
  title      = {{GenExam}: {A} Multidisciplinary Text-to-Image Exam},
  journal    = {CoRR},
  volume     = {abs/2509.14232},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2509.14232},
  doi        = {10.48550/ARXIV.2509.14232},
  eprinttype = {arXiv},
  eprint     = {2509.14232},
  timestamp  = {Thu, 16 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2509-14232.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2509-24007,
  author     = {Yangzhou Liu and
                Yue Cao and
                Hao Li and
                Gen Luo and
                Zhe Chen and
                Weiyun Wang and
                Xiaobo Liang and
                Biqing Qi and
                Lijun Wu and
                Changyao Tian and
                Yanting Zhang and
                Yuqiang Li and
                Tong Lu and
                Yu Qiao and
                Jifeng Dai and
                Wenhai Wang},
  title      = {Sequential Diffusion Language Models},
  journal    = {CoRR},
  volume     = {abs/2509.24007},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2509.24007},
  doi        = {10.48550/ARXIV.2509.24007},
  eprinttype = {arXiv},
  eprint     = {2509.24007},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2509-24007.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2510-08565,
  author     = {Changyao Tian and
                Hao Li and
                Gen Luo and
                Xizhou Zhu and
                Weijie Su and
                Hanming Deng and
                Jinguo Zhu and
                Jie Shao and
                Ziran Zhu and
                Yunpeng Liu and
                Lewei Lu and
                Wenhai Wang and
                Hongsheng Li and
                Jifeng Dai},
  title      = {{NaViL}: Rethinking Scaling Properties of Native Multimodal Large Language Models under Data Constraints},
  journal    = {CoRR},
  volume     = {abs/2510.08565},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2510.08565},
  doi        = {10.48550/ARXIV.2510.08565},
  eprinttype = {arXiv},
  eprint     = {2510.08565},
  timestamp  = {Tue, 11 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2510-08565.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2510-11027,
  author     = {Ganlin Yang and
                Tianyi Zhang and
                Haoran Hao and
                Weiyun Wang and
                Yibin Liu and
                Dehui Wang and
                Guanzhou Chen and
                Zijian Cai and
                Junting Chen and
                Weijie Su and
                Wengang Zhou and
                Yu Qiao and
                Jifeng Dai and
                Jiangmiao Pang and
                Gen Luo and
                Wenhai Wang and
                Yao Mu and
                Zhi Hou},
  title      = {Vlaser: Vision-Language-Action Model with Synergistic Embodied Reasoning},
  journal    = {CoRR},
  volume     = {abs/2510.11027},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2510.11027},
  doi        = {10.48550/ARXIV.2510.11027},
  eprinttype = {arXiv},
  eprint     = {2510.11027},
  timestamp  = {Sun, 07 Dec 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2510-11027.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2511-11793,
  author     = {{MiroMind Team} and
                Song Bai and
                Lidong Bing and
                Carson Chen and
                Guanzheng Chen and
                Yuntao Chen and
                Zhe Chen and
                Ziyi Chen and
                Jifeng Dai and
                Xuan Dong and
                Wenhan Dou and
                Yue Deng and
                Yunjie Fu and
                Junqi Ge and
                Chenxia Han and
                Tammy Huang and
                Zhenhang Huang and
                Jerry Jiao and
                Shilei Jiang and
                Tianyu Jiao and
                Xiaoqi Jian and
                Lei Lei and
                Ruilin Li and
                Ryan Luo and
                Tiantong Li and
                Xiang Lin and
                Ziyuan Liu and
                Zhiqi Li and
                Jie Ni and
                Qiang Ren and
                Pax Sun and
                Shiqian Su and
                Chenxin Tao and
                Bin Wang and
                Hellen Wang and
                Haonan Wang and
                James Wang and
                Jin Wang and
                Jojo Wang and
                Letian Wang and
                Shizun Wang and
                Weizhi Wang and
                Zixuan Wang and
                Jinfan Xu and
                Sen Xing and
                Chenyu Yang and
                Hai Ye and
                Jiaheng Yu and
                Yue Yu and
                Muyan Zhong and
                Tianchen Zhao and
                Xizhou Zhu and
                Yanpeng Zhou and
                Yifan Zhang and
                Zhi Zhu},
  title      = {{MiroThinker}: Pushing the Performance Boundaries of Open-Source Research Agents via Model, Context, and Interactive Scaling},
  journal    = {CoRR},
  volume     = {abs/2511.11793},
  year       = {2025},
  url        = {https://doi.org/10.48550/arXiv.2511.11793},
  doi        = {10.48550/ARXIV.2511.11793},
  eprinttype = {arXiv},
  eprint     = {2511.11793},
  timestamp  = {Sun, 19 Apr 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2511-11793.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/chinaf/ChenWTYGCTHLMMWDYGHSJXW24,
  author     = {Zhe Chen and
                Weiyun Wang and
                Hao Tian and
                Shenglong Ye and
                Zhangwei Gao and
                Erfei Cui and
                Wenwen Tong and
                Kongzhi Hu and
                Jiapeng Luo and
                Zheng Ma and
                Ji Ma and
                Jiaqi Wang and
                Xiaoyi Dong and
                Hang Yan and
                Hewei Guo and
                Conghui He and
                Botian Shi and
                Zhenjiang Jin and
                Chao Xu and
                Bin Wang and
                Xingjian Wei and
                Wei Li and
                Wenjian Zhang and
                Bo Zhang and
                Pinlong Cai and
                Licheng Wen and
                Xiangchao Yan and
                Min Dou and
                Lewei Lu and
                Xizhou Zhu and
                Tong Lu and
                Dahua Lin and
                Yu Qiao and
                Jifeng Dai and
                Wenhai Wang},
  title      = {How far are we to {GPT-4V}? Closing the gap to commercial multimodal models with open-source suites},
  journal    = {Sci. China Inf. Sci.},
  volume     = {67},
  number     = {12},
  year       = {2024},
  url        = {https://doi.org/10.1007/s11432-024-4231-5},
  doi        = {10.1007/S11432-024-4231-5},
  timestamp  = {Fri, 15 May 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/chinaf/ChenWTYGCTHLMMWDYGHSJXW24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/chinaf/LiuCGWCWTLZLQD24,
  author     = {Yangzhou Liu and
                Yue Cao and
                Zhangwei Gao and
                Weiyun Wang and
                Zhe Chen and
                Wenhai Wang and
                Hao Tian and
                Lewei Lu and
                Xizhou Zhu and
                Tong Lu and
                Yu Qiao and
                Jifeng Dai},
  title      = {{MMInstruct}: a high-quality multi-modal instruction tuning dataset with extensive diversity},
  journal    = {Sci. China Inf. Sci.},
  volume     = {67},
  number     = {12},
  year       = {2024},
  url        = {https://doi.org/10.1007/s11432-024-4187-3},
  doi        = {10.1007/S11432-024-4187-3},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/chinaf/LiuCGWCWTLZLQD24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/cmpb/ChengDDLZLWZH24,
  author     = {Heming Cheng and
                Dongfang Ding and
                Jifeng Dai and
                Gen Li and
                Ke Zhang and
                Jianyun Li and
                Liuchuang Wei and
                Xue Zhang and
                Jie Hou},
  title      = {Effect of a reduced arterial axial pre-stretch ratio during aging on the cardiac output and cerebral blood flow in the healthy elders},
  journal    = {Comput. Methods Programs Biomed.},
  volume     = {257},
  pages      = {108468},
  year       = {2024},
  url        = {https://doi.org/10.1016/j.cmpb.2024.108468},
  doi        = {10.1016/J.CMPB.2024.108468},
  timestamp  = {Mon, 09 Dec 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/cmpb/ChengDDLZLWZH24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/pami/LiSDWLWZLYDTXXCLLGJLSLQ24,
  author     = {Hongyang Li and
                Chonghao Sima and
                Jifeng Dai and
                Wenhai Wang and
                Lewei Lu and
                Huijie Wang and
                Jia Zeng and
                Zhiqi Li and
                Jiazhi Yang and
                Hanming Deng and
                Hao Tian and
                Enze Xie and
                Jiangwei Xie and
                Li Chen and
                Tianyu Li and
                Yang Li and
                Yulu Gao and
                Xiaosong Jia and
                Si Liu and
                Jianping Shi and
                Dahua Lin and
                Yu Qiao},
  title      = {Delving Into the Devils of Bird's-Eye-View Perception: {A} Review, Evaluation and Recipe},
  journal    = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume     = {46},
  number     = {4},
  pages      = {2151--2170},
  year       = {2024},
  url        = {https://doi.org/10.1109/TPAMI.2023.3333838},
  doi        = {10.1109/TPAMI.2023.3333838},
  timestamp  = {Thu, 09 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/pami/LiSDWLWZLYDTXXCLLGJLSLQ24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/pami/FangGZCLDL24,
  author     = {Rongyao Fang and
                Peng Gao and
                Aojun Zhou and
                Yingjie Cai and
                Si Liu and
                Jifeng Dai and
                Hongsheng Li},
  title      = {{FeatAug-DETR}: Enriching One-to-Many Matching for {DETRs} With Feature Augmentation},
  journal    = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume     = {46},
  number     = {9},
  pages      = {6402--6415},
  year       = {2024},
  url        = {https://doi.org/10.1109/TPAMI.2024.3381961},
  doi        = {10.1109/TPAMI.2024.3381961},
  timestamp  = {Mon, 09 Dec 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/pami/FangGZCLDL24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/visintelligence/GaoCCRWZTYHZLLQDW24,
  author     = {Zhangwei Gao and
                Zhe Chen and
                Erfei Cui and
                Yiming Ren and
                Weiyun Wang and
                Jinguo Zhu and
                Hao Tian and
                Shenglong Ye and
                Junjun He and
                Xizhou Zhu and
                Lewei Lu and
                Tong Lu and
                Yu Qiao and
                Jifeng Dai and
                Wenhai Wang},
  title      = {{Mini-InternVL}: a flexible-transfer pocket multi-modal model with 5{\%} parameters and 90{\%} performance},
  journal    = {Vis. Intell.},
  volume     = {2},
  number     = {1},
  pages      = {32},
  year       = {2024},
  url        = {https://doi.org/10.1007/s44267-024-00067-6},
  doi        = {10.1007/S44267-024-00067-6},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/visintelligence/GaoCCRWZTYHZLLQDW24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/XiongLCWZLWL00L24,
  author     = {Yuwen Xiong and
                Zhiqi Li and
                Yuntao Chen and
                Feng Wang and
                Xizhou Zhu and
                Jiapeng Luo and
                Wenhai Wang and
                Tong Lu and
                Hongsheng Li and
                Yu Qiao and
                Lewei Lu and
                Jie Zhou and
                Jifeng Dai},
  title      = {Efficient Deformable {ConvNets}: Rethinking Dynamic and Sparse Operator for Vision Applications},
  booktitle  = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2024, Seattle, WA, USA, June 16-22, 2024},
  pages      = {5652--5661},
  publisher  = {{IEEE}},
  year       = {2024},
  url        = {https://doi.org/10.1109/CVPR52733.2024.00540},
  doi        = {10.1109/CVPR52733.2024.00540},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/cvpr/XiongLCWZLWL00L24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/Li0WZZQWLLD24,
  author     = {Hao Li and
                Xue Yang and
                Zhaokai Wang and
                Xizhou Zhu and
                Jie Zhou and
                Yu Qiao and
                Xiaogang Wang and
                Hongsheng Li and
                Lewei Lu and
                Jifeng Dai},
  title      = {Auto {MC-Reward}: Automated Dense Reward Design with Large Language Models for {Minecraft}},
  booktitle  = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2024, Seattle, WA, USA, June 16-22, 2024},
  pages      = {16426--16435},
  publisher  = {{IEEE}},
  year       = {2024},
  url        = {https://doi.org/10.1109/CVPR52733.2024.01554},
  doi        = {10.1109/CVPR52733.2024.01554},
  timestamp  = {Mon, 03 Mar 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/cvpr/Li0WZZQWLLD24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/Yu0LDD0Y24,
  author     = {Yi Yu and
                Xue Yang and
                Qingyun Li and
                Feipeng Da and
                Jifeng Dai and
                Yu Qiao and
                Junchi Yan},
  title      = {{Point2RBox}: Combine Knowledge from Synthetic Visual Patterns for End-to-End Oriented Object Detection with Single Point Supervision},
  booktitle  = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2024, Seattle, WA, USA, June 16-22, 2024},
  pages      = {16783--16793},
  publisher  = {{IEEE}},
  year       = {2024},
  url        = {https://doi.org/10.1109/CVPR52733.2024.01588},
  doi        = {10.1109/CVPR52733.2024.01588},
  timestamp  = {Wed, 25 Feb 2026 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/cvpr/Yu0LDD0Y24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/ChenWW0CXZZZLLL24,
  author     = {Zhe Chen and
                Jiannan Wu and
                Wenhai Wang and
                Weijie Su and
                Guo Chen and
                Sen Xing and
                Muyan Zhong and
                Qinglong Zhang and
                Xizhou Zhu and
                Lewei Lu and
                Bin Li and
                Ping Luo and
                Tong Lu and
                Yu Qiao and
                Jifeng Dai},
  title      = {Intern {VL:} Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks},
  booktitle  = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2024, Seattle, WA, USA, June 16-22, 2024},
  pages      = {24185--24198},
  publisher  = {{IEEE}},
  year       = {2024},
  url        = {https://doi.org/10.1109/CVPR52733.2024.02283},
  doi        = {10.1109/CVPR52733.2024.02283},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/cvpr/ChenWW0CXZZZLLL24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eccv/LiuLGCLZLCQDW24,
  author     = {Zhaoyang Liu and
                Zeqiang Lai and
                Zhangwei Gao and
                Erfei Cui and
                Ziheng Li and
                Xizhou Zhu and
                Lewei Lu and
                Qifeng Chen and
                Yu Qiao and
                Jifeng Dai and
                Wenhai Wang},
  editor     = {Ales Leonardis and
                Elisa Ricci and
                Stefan Roth and
                Olga Russakovsky and
                Torsten Sattler and
                G{\"{u}}l Varol},
  title      = {{ControlLLM}: Augment Language Models with Tools by Searching on Graphs},
  booktitle  = {Computer Vision - {ECCV} 2024 - 18th European Conference, Milan, Italy, September 29-October 4, 2024, Proceedings, Part {XII}},
  series     = {Lecture Notes in Computer Science},
  pages      = {89--105},
  publisher  = {Springer},
  year       = {2024},
  url        = {https://doi.org/10.1007/978-3-031-73254-6\_6},
  doi        = {10.1007/978-3-031-73254-6\_6},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/eccv/LiuLGCLZLCQDW24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eccv/LiWLLYDQZ24,
  author     = {Gang Li and
                Wenhai Wang and
                Xiang Li and
                Ziheng Li and
                Jian Yang and
                Jifeng Dai and
                Yu Qiao and
                Shanshan Zhang},
  editor     = {Ales Leonardis and
                Elisa Ricci and
                Stefan Roth and
                Olga Russakovsky and
                Torsten Sattler and
                G{\"{u}}l Varol},
  title      = {Distilling Knowledge from Large-Scale Image Models for Object Detection},
  booktitle  = {Computer Vision - {ECCV} 2024 - 18th European Conference, Milan, Italy, September 29-October 4, 2024, Proceedings, Part {LXXXIV}},
  series     = {Lecture Notes in Computer Science},
  pages      = {142--160},
  publisher  = {Springer},
  year       = {2024},
  url        = {https://doi.org/10.1007/978-3-031-72907-2\_9},
  doi        = {10.1007/978-3-031-72907-2\_9},
  timestamp  = {Wed, 25 Feb 2026 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/eccv/LiWLLYDQZ24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eccv/WangRLLYCWLLZQD24,
  author     = {Weiyun Wang and
                Yiming Ren and
                Haowen Luo and
                Tiantong Li and
                Chenxiang Yan and
                Zhe Chen and
                Wenhai Wang and
                Qingyun Li and
                Lewei Lu and
                Xizhou Zhu and
                Yu Qiao and
                Jifeng Dai},
  editor     = {Ales Leonardis and
                Elisa Ricci and
                Stefan Roth and
                Olga Russakovsky and
                Torsten Sattler and
                G{\"{u}}l Varol},
  title      = {The All-Seeing Project {V2:} Towards General Relation Comprehension of the Open World},
  booktitle  = {Computer Vision - {ECCV} 2024 - 18th European Conference, Milan, Italy, September 29-October 4, 2024, Proceedings, Part {XXXIII}},
  series     = {Lecture Notes in Computer Science},
  pages      = {471--490},
  publisher  = {Springer},
  year       = {2024},
  url        = {https://doi.org/10.1007/978-3-031-73414-4\_27},
  doi        = {10.1007/978-3-031-73414-4\_27},
  timestamp  = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/eccv/WangRLLYCWLLZQD24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/TianTD0LL00HZ24,
  author     = {Changyao Tian and
                Chenxin Tao and
                Jifeng Dai and
                Hao Li and
                Ziheng Li and
                Lewei Lu and
                Xiaogang Wang and
                Hongsheng Li and
                Gao Huang and
                Xizhou Zhu},
  title      = {{ADDP:} Learning General Representations for Image Recognition and Generation with Alternating Denoising Diffusion Process},
  booktitle  = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
  publisher  = {OpenReview.net},
  year       = {2024},
  url        = {https://openreview.net/forum?id=cMPm8YFXZe},
  timestamp  = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/iclr/TianTD0LL00HZ24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/Wang0LWHXCLZ0CL24,
  author     = {Weiyun Wang and
                Min Shi and
                Qingyun Li and
                Wenhai Wang and
                Zhenhang Huang and
                Linjie Xing and
                Zhe Chen and
                Hao Li and
                Xizhou Zhu and
                Zhiguo Cao and
                Yushi Chen and
                Tong Lu and
                Jifeng Dai and
                Yu Qiao},
  title      = {The All-Seeing Project: Towards Panoptic Visual Recognition and Understanding of the Open World},
  booktitle  = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
  publisher  = {OpenReview.net},
  year       = {2024},
  url        = {https://openreview.net/forum?id=c2R7ajodcI},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/iclr/Wang0LWHXCLZ0CL24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/YangWCD024,
  author     = {Yang Yang and
                Wenhai Wang and
                Zhe Chen and
                Jifeng Dai and
                Liang Zheng},
  title      = {Bounding Box Stability against Feature Dropout Reflects Detector Generalization across Environments},
  booktitle  = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
  publisher  = {OpenReview.net},
  year       = {2024},
  url        = {https://openreview.net/forum?id=lmM4Ecm4HJ},
  timestamp  = {Mon, 18 Nov 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/iclr/YangWCD024.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icml/0001CZCYGCLHTSY24,
  author     = {Yao Mu and
                Junting Chen and
                Qinglong Zhang and
                Shoufa Chen and
                Qiaojun Yu and
                Chongjian Ge and
                Runjian Chen and
                Zhixuan Liang and
                Mengkang Hu and
                Chaofan Tao and
                Peize Sun and
                Haibao Yu and
                Chao Yang and
                Wenqi Shao and
                Wenhai Wang and
                Jifeng Dai and
                Yu Qiao and
                Mingyu Ding and
                Ping Luo},
  editor     = {Ruslan Salakhutdinov and
                Zico Kolter and
                Katherine A. Heller and
                Adrian Weller and
                Nuria Oliver and
                Jonathan Scarlett and
                Felix Berkenkamp},
  title      = {{RoboCodeX}: Multimodal Code Generation for Robotic Behavior Synthesis},
  booktitle  = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
  series     = {Proceedings of Machine Learning Research},
  pages      = {36434--36454},
  publisher  = {{PMLR} / OpenReview.net},
  year       = {2024},
  url        = {https://proceedings.mlr.press/v235/mu24a.html},
  timestamp  = {Mon, 09 Feb 2026 15:35:36 +0100},
  biburl     = {https://dblp.org/rec/conf/icml/0001CZCYGCLHTSY24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/0004WX0WC00DP24,
  author     = {Jiawei Gao and
                Ziqin Wang and
                Zeqi Xiao and
                Jingbo Wang and
                Tai Wang and
                Jinkun Cao and
                Xiaolin Hu and
                Si Liu and
                Jifeng Dai and
                Jiangmiao Pang},
  editor     = {Amir Globersons and
                Lester Mackey and
                Danielle Belgrave and
                Angela Fan and
                Ulrich Paquet and
                Jakub M. Tomczak and
                Cheng Zhang},
  title      = {{CooHOI}: Learning Cooperative Human-Object Interaction with Manipulated Object Dynamics},
  booktitle  = {Advances in Neural Information Processing Systems 37: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year       = {2024},
  url        = {http://papers.nips.cc/paper\_files/paper/2024/hash/918b9487f8ea4661e8ba5a02b2126658-Abstract-Conference.html},
  timestamp  = {Tue, 26 May 2026 17:12:08 +0200},
  biburl     = {https://dblp.org/rec/conf/nips/0004WX0WC00DP24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/DongZZCWOZDZLYG24,
  author     = {Xiaoyi Dong and
                Pan Zhang and
                Yuhang Zang and
                Yuhang Cao and
                Bin Wang and
                Linke Ouyang and
                Songyang Zhang and
                Haodong Duan and
                Wenwei Zhang and
                Yining Li and
                Hang Yan and
                Yang Gao and
                Zhe Chen and
                Xinyue Zhang and
                Wei Li and
                Jingwen Li and
                Wenhai Wang and
                Kai Chen and
                Conghui He and
                Xingcheng Zhang and
                Jifeng Dai and
                Yu Qiao and
                Dahua Lin and
                Jiaqi Wang},
  editor     = {Amir Globersons and
                Lester Mackey and
                Danielle Belgrave and
                Angela Fan and
                Ulrich Paquet and
                Jakub M. Tomczak and
                Cheng Zhang},
  title      = {{InternLM-XComposer2-4KHD}: {A} Pioneering Large Vision-Language Model Handling Resolutions from 336 Pixels to {4K} {HD}},
  booktitle  = {Advances in Neural Information Processing Systems 37: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year       = {2024},
  url        = {http://papers.nips.cc/paper\_files/paper/2024/hash/4b06cdddb1cde6624c0be1465c7b800f-Abstract-Conference.html},
  timestamp  = {Thu, 26 Mar 2026 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/nips/DongZZCWOZDZLYG24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/NanLXD24,
  author     = {Zhixiong Nan and
                Xianghong Li and
                Tao Xiang and
                Jifeng Dai},
  editor     = {Amir Globersons and
                Lester Mackey and
                Danielle Belgrave and
                Angela Fan and
                Ulrich Paquet and
                Jakub M. Tomczak and
                Cheng Zhang},
  title      = {{DI-MaskDINO}: {A} Joint Object Detection and Instance Segmentation Model},
  booktitle  = {Advances in Neural Information Processing Systems 37: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year       = {2024},
  url        = {http://papers.nips.cc/paper\_files/paper/2024/hash/6f1346bac8b02f76a631400e2799b24b-Abstract-Conference.html},
  timestamp  = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/nips/NanLXD24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/TaoZSLTL0000D24,
  author     = {Chenxin Tao and
                Xizhou Zhu and
                Shiqian Su and
                Lewei Lu and
                Changyao Tian and
                Xuan Luo and
                Gao Huang and
                Hongsheng Li and
                Yu Qiao and
                Jie Zhou and
                Jifeng Dai},
  editor     = {Amir Globersons and
                Lester Mackey and
                Danielle Belgrave and
                Angela Fan and
                Ulrich Paquet and
                Jakub M. Tomczak and
                Cheng Zhang},
  title      = {Learning {1D} Causal Visual Representation with De-focus Attention Networks},
  booktitle  = {Advances in Neural Information Processing Systems 37: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year       = {2024},
  url        = {http://papers.nips.cc/paper\_files/paper/2024/hash/2d9c6cdb4cfe93869c090fea7375044b-Abstract-Conference.html},
  timestamp  = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/nips/TaoZSLTL0000D24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/WangZRDLLH0ZLZL24,
  author     = {Weiyun Wang and
                Shuibo Zhang and
                Yiming Ren and
                Yuchen Duan and
                Tiantong Li and
                Shuo Liu and
                Mengkang Hu and
                Zhe Chen and
                Kaipeng Zhang and
                Lewei Lu and
                Xizhou Zhu and
                Ping Luo and
                Yu Qiao and
                Jifeng Dai and
                Wenqi Shao and
                Wenhai Wang},
  editor     = {Amir Globersons and
                Lester Mackey and
                Danielle Belgrave and
                Angela Fan and
                Ulrich Paquet and
                Jakub M. Tomczak and
                Cheng Zhang},
  title      = {Needle In {A} Multimodal Haystack},
  booktitle  = {Advances in Neural Information Processing Systems 37: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year       = {2024},
  url        = {http://papers.nips.cc/paper\_files/paper/2024/hash/24a8968affe71ffe4067d022b9d16566-Abstract-Datasets\_and\_Benchmarks\_Track.html},
  timestamp  = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/nips/WangZRDLLH0ZLZL24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/WuZXL00WZLL0QD24,
  author     = {Jiannan Wu and
                Muyan Zhong and
                Sen Xing and
                Zeqiang Lai and
                Zhaoyang Liu and
                Zhe Chen and
                Wenhai Wang and
                Xizhou Zhu and
                Lewei Lu and
                Tong Lu and
                Ping Luo and
                Yu Qiao and
                Jifeng Dai},
  editor     = {Amir Globersons and
                Lester Mackey and
                Danielle Belgrave and
                Angela Fan and
                Ulrich Paquet and
                Jakub M. Tomczak and
                Cheng Zhang},
  title      = {{VisionLLM} v2: An End-to-End Generalist Multimodal Large Language Model for Hundreds of Vision-Language Tasks},
  booktitle  = {Advances in Neural Information Processing Systems 37: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year       = {2024},
  url        = {http://papers.nips.cc/paper\_files/paper/2024/hash/81a60d18e010b27b36cd465c6604b915-Abstract-Conference.html},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/nips/WuZXL00WZLL0QD24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/YangZZ0WDW0Z0D24,
  author     = {Chenyu Yang and
                Xizhou Zhu and
                Jinguo Zhu and
                Weijie Su and
                Junjie Wang and
                Xuan Dong and
                Wenhai Wang and
                Lewei Lu and
                Bin Li and
                Jie Zhou and
                Yu Qiao and
                Jifeng Dai},
  editor     = {Amir Globersons and
                Lester Mackey and
                Danielle Belgrave and
                Angela Fan and
                Ulrich Paquet and
                Jakub M. Tomczak and
                Cheng Zhang},
  title      = {Vision Model Pre-training on Interleaved Image-Text Data via Latent Compression Learning},
  booktitle  = {Advances in Neural Information Processing Systems 37: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year       = {2024},
  url        = {http://papers.nips.cc/paper\_files/paper/2024/hash/2a952768bb85041f95ed06a5b60cf4d5-Abstract-Conference.html},
  timestamp  = {Sun, 15 Jun 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/nips/YangZZ0WDW0Z0D24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/Zhu0W0DGL0D24,
  author     = {Xizhou Zhu and
                Xue Yang and
                Zhaokai Wang and
                Hao Li and
                Wenhan Dou and
                Junqi Ge and
                Lewei Lu and
                Yu Qiao and
                Jifeng Dai},
  editor     = {Amir Globersons and
                Lester Mackey and
                Danielle Belgrave and
                Angela Fan and
                Ulrich Paquet and
                Jakub M. Tomczak and
                Cheng Zhang},
  title      = {Parameter-Inverted Image Pyramid Networks},
  booktitle  = {Advances in Neural Information Processing Systems 37: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024},
  year       = {2024},
  url        = {http://papers.nips.cc/paper\_files/paper/2024/hash/ee81a23d6b83ac15fbeb5b7a30934e0b-Abstract-Conference.html},
  timestamp  = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/nips/Zhu0W0DGL0D24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/siggraph/ShiHWBLZZCSQDL24,
  author     = {Xiaoyu Shi and
                Zhaoyang Huang and
                Fu{-}Yun Wang and
                Weikang Bian and
                Dasong Li and
                Yi Zhang and
                Manyuan Zhang and
                Ka Chun Cheung and
                Simon See and
                Hongwei Qin and
                Jifeng Dai and
                Hongsheng Li},
  editor     = {Andres Burbano and
                Denis Zorin and
                Wojciech Jarosz},
  title      = {{Motion-I2V}: Consistent and Controllable Image-to-Video Generation with Explicit Motion Modeling},
  booktitle  = {{ACM} {SIGGRAPH} 2024 Conference Papers, {SIGGRAPH} 2024, Denver, CO, USA, 27 July 2024- 1 August 2024},
  pages      = {111},
  publisher  = {{ACM}},
  year       = {2024},
  url        = {https://doi.org/10.1145/3641519.3657497},
  doi        = {10.1145/3641519.3657497},
  timestamp  = {Mon, 07 Apr 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/siggraph/ShiHWBLZZCSQDL24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2401-06197,
  author     = {Yuwen Xiong and Zhiqi Li and Yuntao Chen and Feng Wang and
                Xizhou Zhu and Jiapeng Luo and Wenhai Wang and Tong Lu and
                Hongsheng Li and Yu Qiao and Lewei Lu and Jie Zhou and
                Jifeng Dai},
  title      = {Efficient Deformable {ConvNets}: Rethinking Dynamic and Sparse Operator
                for Vision Applications},
  journal    = {CoRR},
  volume     = {abs/2401.06197},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2401.06197},
  doi        = {10.48550/ARXIV.2401.06197},
  eprinttype = {arXiv},
  eprint     = {2401.06197},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2401-06197.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2401-10208,
  author     = {Changyao Tian and Xizhou Zhu and Yuwen Xiong and Weiyun Wang and
                Zhe Chen and Wenhai Wang and Yuntao Chen and Lewei Lu and
                Tong Lu and Jie Zhou and Hongsheng Li and Yu Qiao and
                Jifeng Dai},
  title      = {{MM-Interleaved}: Interleaved Image-Text Generative Modeling via Multi-modal
                Feature Synchronizer},
  journal    = {CoRR},
  volume     = {abs/2401.10208},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2401.10208},
  doi        = {10.48550/ARXIV.2401.10208},
  eprinttype = {arXiv},
  eprint     = {2401.10208},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2401-10208.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2401-15977,
  author     = {Xiaoyu Shi and Zhaoyang Huang and Fu{-}Yun Wang and
                Weikang Bian and Dasong Li and Yi Zhang and Manyuan Zhang and
                Ka Chun Cheung and Simon See and Hongwei Qin and
                Jifeng Dai and Hongsheng Li},
  title      = {{Motion-I2V}: Consistent and Controllable Image-to-Video Generation
                with Explicit Motion Modeling},
  journal    = {CoRR},
  volume     = {abs/2401.15977},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2401.15977},
  doi        = {10.48550/ARXIV.2401.15977},
  eprinttype = {arXiv},
  eprint     = {2401.15977},
  timestamp  = {Mon, 07 Apr 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2401-15977.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2402-16117,
  author     = {Yao Mu and Junting Chen and Qinglong Zhang and Shoufa Chen and
                Qiaojun Yu and Chongjian Ge and Runjian Chen and Zhixuan Liang and
                Mengkang Hu and Chaofan Tao and Peize Sun and Haibao Yu and
                Chao Yang and Wenqi Shao and Wenhai Wang and Jifeng Dai and
                Yu Qiao and Mingyu Ding and Ping Luo},
  title      = {{RoboCodeX}: Multimodal Code Generation for Robotic Behavior Synthesis},
  journal    = {CoRR},
  volume     = {abs/2402.16117},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2402.16117},
  doi        = {10.48550/ARXIV.2402.16117},
  eprinttype = {arXiv},
  eprint     = {2402.16117},
  timestamp  = {Fri, 30 Aug 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2402-16117.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2402-19474,
  author     = {Weiyun Wang and Yiming Ren and Haowen Luo and Tiantong Li and
                Chenxiang Yan and Zhe Chen and Wenhai Wang and Qingyun Li and
                Lewei Lu and Xizhou Zhu and Yu Qiao and Jifeng Dai},
  title      = {The {All-Seeing Project V2}: Towards General Relation Comprehension
                of the Open World},
  journal    = {CoRR},
  volume     = {abs/2402.19474},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2402.19474},
  doi        = {10.48550/ARXIV.2402.19474},
  eprinttype = {arXiv},
  eprint     = {2402.19474},
  timestamp  = {Mon, 18 Nov 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2402-19474.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2403-02308,
  author     = {Yuchen Duan and Weiyun Wang and Zhe Chen and Xizhou Zhu and
                Lewei Lu and Tong Lu and Yu Qiao and Hongsheng Li and
                Jifeng Dai and Wenhai Wang},
  title      = {{Vision-RWKV}: Efficient and Scalable Visual Perception with {RWKV}-Like
                Architectures},
  journal    = {CoRR},
  volume     = {abs/2403.02308},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2403.02308},
  doi        = {10.48550/ARXIV.2403.02308},
  eprinttype = {arXiv},
  eprint     = {2403.02308},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2403-02308.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2403-13803,
  author     = {Yang Yang and Wenhai Wang and Zhe Chen and Jifeng Dai and
                Liang Zheng},
  title      = {Bounding Box Stability against Feature Dropout Reflects Detector
                Generalization across Environments},
  journal    = {CoRR},
  volume     = {abs/2403.13803},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2403.13803},
  doi        = {10.48550/ARXIV.2403.13803},
  eprinttype = {arXiv},
  eprint     = {2403.13803},
  timestamp  = {Mon, 18 Nov 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2403-13803.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2404-06512,
  author     = {Xiaoyi Dong and Pan Zhang and Yuhang Zang and Yuhang Cao and
                Bin Wang and Linke Ouyang and Songyang Zhang and Haodong Duan and
                Wenwei Zhang and Yining Li and Hang Yan and Yang Gao and
                Zhe Chen and Xinyue Zhang and Wei Li and Jingwen Li and
                Wenhai Wang and Kai Chen and Conghui He and Xingcheng Zhang and
                Jifeng Dai and Yu Qiao and Dahua Lin and Jiaqi Wang},
  title      = {{InternLM-XComposer2-4KHD}: {A} Pioneering Large Vision-Language Model
                Handling Resolutions from 336 Pixels to {4K} {HD}},
  journal    = {CoRR},
  volume     = {abs/2404.06512},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2404.06512},
  doi        = {10.48550/ARXIV.2404.06512},
  eprinttype = {arXiv},
  eprint     = {2404.06512},
  timestamp  = {Thu, 26 Mar 2026 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2404-06512.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2404-16821,
  author     = {Zhe Chen and Weiyun Wang and Hao Tian and Shenglong Ye and
                Zhangwei Gao and Erfei Cui and Wenwen Tong and Kongzhi Hu and
                Jiapeng Luo and Zheng Ma and Ji Ma and Jiaqi Wang and
                Xiaoyi Dong and Hang Yan and Hewei Guo and Conghui He and
                Botian Shi and Zhenjiang Jin and Chao Xu and Bin Wang and
                Xingjian Wei and Wei Li and Wenjian Zhang and Bo Zhang and
                Pinlong Cai and Licheng Wen and Xiangchao Yan and Min Dou and
                Lewei Lu and Xizhou Zhu and Tong Lu and Dahua Lin and
                Yu Qiao and Jifeng Dai and Wenhai Wang},
  title      = {How Far Are We to {GPT-4V}? {Closing} the Gap to Commercial Multimodal
                Models with Open-Source Suites},
  journal    = {CoRR},
  volume     = {abs/2404.16821},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2404.16821},
  doi        = {10.48550/ARXIV.2404.16821},
  eprinttype = {arXiv},
  eprint     = {2404.16821},
  timestamp  = {Fri, 15 May 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2404-16821.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2405-14739,
  author     = {Chongjie Si and Xuehui Wang and Xue Yang and Zhengqin Xu and
                Qingyun Li and Jifeng Dai and Yu Qiao and Xiaokang Yang and
                Wei Shen},
  title      = {{FLoRA}: Low-Rank Core Space for N-dimension},
  journal    = {CoRR},
  volume     = {abs/2405.14739},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2405.14739},
  doi        = {10.48550/ARXIV.2405.14739},
  eprinttype = {arXiv},
  eprint     = {2405.14739},
  timestamp  = {Thu, 06 Mar 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2405-14739.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2405-19334,
  author     = {Yingqing He and Zhaoyang Liu and Jingye Chen and Zeyue Tian and
                Hongyu Liu and Xiaowei Chi and Runtao Liu and Ruibin Yuan and
                Yazhou Xing and Wenhai Wang and Jifeng Dai and Yong Zhang and
                Wei Xue and Qifeng Liu and Yike Guo and Qifeng Chen},
  title      = {{LLMs} Meet Multimodal Generation and Editing: {A} Survey},
  journal    = {CoRR},
  volume     = {abs/2405.19334},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2405.19334},
  doi        = {10.48550/ARXIV.2405.19334},
  eprinttype = {arXiv},
  eprint     = {2405.19334},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2405-19334.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2406-04330,
  author     = {Xizhou Zhu and Xue Yang and Zhaokai Wang and Hao Li and
                Wenhan Dou and Junqi Ge and Lewei Lu and Yu Qiao and
                Jifeng Dai},
  title      = {Parameter-Inverted Image Pyramid Networks},
  journal    = {CoRR},
  volume     = {abs/2406.04330},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2406.04330},
  doi        = {10.48550/ARXIV.2406.04330},
  eprinttype = {arXiv},
  eprint     = {2406.04330},
  timestamp  = {Tue, 06 Aug 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2406-04330.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2406-04342,
  author     = {Chenxin Tao and Xizhou Zhu and Shiqian Su and Lewei Lu and
                Changyao Tian and Xuan Luo and Gao Huang and Hongsheng Li and
                Yu Qiao and Jie Zhou and Jifeng Dai},
  title      = {Learning {1D} Causal Visual Representation with De-focus Attention Networks},
  journal    = {CoRR},
  volume     = {abs/2406.04342},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2406.04342},
  doi        = {10.48550/ARXIV.2406.04342},
  eprinttype = {arXiv},
  eprint     = {2406.04342},
  timestamp  = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2406-04342.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2406-07230,
  author     = {Weiyun Wang and Shuibo Zhang and Yiming Ren and Yuchen Duan and
                Tiantong Li and Shuo Liu and Mengkang Hu and Zhe Chen and
                Kaipeng Zhang and Lewei Lu and Xizhou Zhu and Ping Luo and
                Yu Qiao and Jifeng Dai and Wenqi Shao and Wenhai Wang},
  title      = {Needle In {A} Multimodal Haystack},
  journal    = {CoRR},
  volume     = {abs/2406.07230},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2406.07230},
  doi        = {10.48550/ARXIV.2406.07230},
  eprinttype = {arXiv},
  eprint     = {2406.07230},
  timestamp  = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2406-07230.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2406-07543,
  author     = {Chenyu Yang and Xizhou Zhu and Jinguo Zhu and Weijie Su and
                Junjie Wang and Xuan Dong and Wenhai Wang and Lewei Lu and
                Bin Li and Jie Zhou and Yu Qiao and Jifeng Dai},
  title      = {Vision Model Pre-training on Interleaved Image-Text Data via
                Latent Compression Learning},
  journal    = {CoRR},
  volume     = {abs/2406.07543},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2406.07543},
  doi        = {10.48550/ARXIV.2406.07543},
  eprinttype = {arXiv},
  eprint     = {2406.07543},
  timestamp  = {Wed, 26 Mar 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2406-07543.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2406-08085,
  author     = {Haoji Zhang and Yiqin Wang and Yansong Tang and Yong Liu and
                Jiashi Feng and Jifeng Dai and Xiaojie Jin},
  title      = {{Flash-VStream}: Memory-Based Real-Time Understanding for Long Video
                Streams},
  journal    = {CoRR},
  volume     = {abs/2406.08085},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2406.08085},
  doi        = {10.48550/ARXIV.2406.08085},
  eprinttype = {arXiv},
  eprint     = {2406.08085},
  timestamp  = {Thu, 07 May 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2406-08085.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2406-08394,
  author     = {Jiannan Wu and Muyan Zhong and Sen Xing and Zeqiang Lai and
                Zhaoyang Liu and Wenhai Wang and Zhe Chen and Xizhou Zhu and
                Lewei Lu and Tong Lu and Ping Luo and Yu Qiao and
                Jifeng Dai},
  title      = {{VisionLLM} v2: An End-to-End Generalist Multimodal Large Language Model
                for Hundreds of Vision-Language Tasks},
  journal    = {CoRR},
  volume     = {abs/2406.08394},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2406.08394},
  doi        = {10.48550/ARXIV.2406.08394},
  eprinttype = {arXiv},
  eprint     = {2406.08394},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2406-08394.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2406-08418,
  author     = {Qingyun Li and Zhe Chen and Weiyun Wang and Wenhai Wang and
                Shenglong Ye and Zhenjiang Jin and Guanzhou Chen and Yinan He and
                Zhangwei Gao and Erfei Cui and Jiashuo Yu and Hao Tian and
                Jiasheng Zhou and Chao Xu and Bin Wang and Xingjian Wei and
                Wei Li and Wenjian Zhang and Bo Zhang and Pinlong Cai and
                Licheng Wen and Xiangchao Yan and Zhenxiang Li and Pei Chu and
                Yi Wang and Min Dou and Changyao Tian and Xizhou Zhu and
                Lewei Lu and Yushi Chen and Junjun He and Zhongying Tu and
                Tong Lu and Yali Wang and Limin Wang and Dahua Lin and
                Yu Qiao and Botian Shi and Conghui He and Jifeng Dai},
  title      = {{OmniCorpus}: {A} Unified Multimodal Corpus of 10 Billion-Level Images
                Interleaved with Text},
  journal    = {CoRR},
  volume     = {abs/2406.08418},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2406.08418},
  doi        = {10.48550/ARXIV.2406.08418},
  eprinttype = {arXiv},
  eprint     = {2406.08418},
  timestamp  = {Thu, 26 Mar 2026 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2406-08418.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2406-14558,
  author     = {Jiawei Gao and Ziqin Wang and Zeqi Xiao and Jingbo Wang and
                Tai Wang and Jinkun Cao and Xiaolin Hu and Si Liu and
                Jifeng Dai and Jiangmiao Pang},
  title      = {{CooHOI}: Learning Cooperative Human-Object Interaction with Manipulated
                Object Dynamics},
  journal    = {CoRR},
  volume     = {abs/2406.14558},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2406.14558},
  doi        = {10.48550/ARXIV.2406.14558},
  eprinttype = {arXiv},
  eprint     = {2406.14558},
  timestamp  = {Tue, 23 Jul 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2406-14558.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2407-00603,
  author     = {Yiqin Wang and Haoji Zhang and Yansong Tang and Yong Liu and
                Jiashi Feng and Jifeng Dai and Xiaojie Jin},
  title      = {Hierarchical Memory for Long Video {QA}},
  journal    = {CoRR},
  volume     = {abs/2407.00603},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2407.00603},
  doi        = {10.48550/ARXIV.2407.00603},
  eprinttype = {arXiv},
  eprint     = {2407.00603},
  timestamp  = {Thu, 07 May 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2407-00603.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2407-03320,
  author     = {Pan Zhang and Xiaoyi Dong and Yuhang Zang and Yuhang Cao and
                Rui Qian and Lin Chen and Qipeng Guo and Haodong Duan and
                Bin Wang and Linke Ouyang and Songyang Zhang and Wenwei Zhang and
                Yining Li and Yang Gao and Peng Sun and Xinyue Zhang and
                Wei Li and Jingwen Li and Wenhai Wang and Hang Yan and
                Conghui He and Xingcheng Zhang and Kai Chen and Jifeng Dai and
                Yu Qiao and Dahua Lin and Jiaqi Wang},
  title      = {{InternLM-XComposer-2.5}: {A} Versatile Large Vision Language Model
                Supporting Long-Contextual Input and Output},
  journal    = {CoRR},
  volume     = {abs/2407.03320},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2407.03320},
  doi        = {10.48550/ARXIV.2407.03320},
  eprinttype = {arXiv},
  eprint     = {2407.03320},
  timestamp  = {Thu, 26 Mar 2026 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2407-03320.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2407-15838,
  author     = {Yangzhou Liu and Yue Cao and Zhangwei Gao and Weiyun Wang and
                Zhe Chen and Wenhai Wang and Hao Tian and Lewei Lu and
                Xizhou Zhu and Tong Lu and Yu Qiao and Jifeng Dai},
  title      = {{MMInstruct}: {A} High-Quality Multi-Modal Instruction Tuning Dataset
                with Extensive Diversity},
  journal    = {CoRR},
  volume     = {abs/2407.15838},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2407.15838},
  doi        = {10.48550/ARXIV.2407.15838},
  eprinttype = {arXiv},
  eprint     = {2407.15838},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2407-15838.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2408-02718,
  author     = {Fanqing Meng and Jin Wang and Chuanhao Li and Quanfeng Lu and
                Hao Tian and Jiaqi Liao and Xizhou Zhu and Jifeng Dai and
                Yu Qiao and Ping Luo and Kaipeng Zhang and Wenqi Shao},
  title      = {{MMIU}: Multimodal Multi-image Understanding for Evaluating Large
                Vision-Language Models},
  journal    = {CoRR},
  volume     = {abs/2408.02718},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2408.02718},
  doi        = {10.48550/ARXIV.2408.02718},
  eprinttype = {arXiv},
  eprint     = {2408.02718},
  timestamp  = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2408-02718.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2410-08202,
  author     = {Gen Luo and Xue Yang and Wenhan Dou and Zhaokai Wang and
                Jifeng Dai and Yu Qiao and Xizhou Zhu},
  title      = {{Mono-InternVL}: Pushing the Boundaries of Monolithic Multimodal Large
                Language Models with Endogenous Visual Pre-training},
  journal    = {CoRR},
  volume     = {abs/2410.08202},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2410.08202},
  doi        = {10.48550/ARXIV.2410.08202},
  eprinttype = {arXiv},
  eprint     = {2410.08202},
  timestamp  = {Mon, 18 Nov 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2410-08202.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2410-10267,
  author     = {He Guo and Yulong Wang and Zixuan Ye and Jifeng Dai and
                Yuwen Xiong},
  title      = {{big.LITTLE} Vision Transformer for Efficient Visual Recognition},
  journal    = {CoRR},
  volume     = {abs/2410.10267},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2410.10267},
  doi        = {10.48550/ARXIV.2410.10267},
  eprinttype = {arXiv},
  eprint     = {2410.10267},
  timestamp  = {Mon, 25 Nov 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2410-10267.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2410-13861,
  author     = {Rongyao Fang and Chengqi Duan and Kun Wang and Hao Li and
                Hao Tian and Xingyu Zeng and Rui Zhao and Jifeng Dai and
                Hongsheng Li and Xihui Liu},
  title      = {{PUMA}: Empowering Unified {MLLM} with Multi-granular Visual Generation},
  journal    = {CoRR},
  volume     = {abs/2410.13861},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2410.13861},
  doi        = {10.48550/ARXIV.2410.13861},
  eprinttype = {arXiv},
  eprint     = {2410.13861},
  timestamp  = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2410-13861.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2410-15959,
  author     = {Zhi Hou and Tianyi Zhang and Yuwen Xiong and Hengjun Pu and
                Chengyang Zhao and Ronglei Tong and Yu Qiao and Jifeng Dai and
                Yuntao Chen},
  title      = {Diffusion Transformer Policy},
  journal    = {CoRR},
  volume     = {abs/2410.15959},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2410.15959},
  doi        = {10.48550/ARXIV.2410.15959},
  eprinttype = {arXiv},
  eprint     = {2410.15959},
  timestamp  = {Sat, 31 May 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2410-15959.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2410-16261,
  author     = {Zhangwei Gao and Zhe Chen and Erfei Cui and Yiming Ren and
                Weiyun Wang and Jinguo Zhu and Hao Tian and Shenglong Ye and
                Junjun He and Xizhou Zhu and Lewei Lu and Tong Lu and
                Yu Qiao and Jifeng Dai and Wenhai Wang},
  title      = {{Mini-InternVL}: {A} Flexible-Transfer Pocket Multimodal Model with
                5{\%} Parameters and 90{\%} Performance},
  journal    = {CoRR},
  volume     = {abs/2410.16261},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2410.16261},
  doi        = {10.48550/ARXIV.2410.16261},
  eprinttype = {arXiv},
  eprint     = {2410.16261},
  timestamp  = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2410-16261.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2410-16707,
  author     = {Zhixiong Nan and Xianghong Li and Tao Xiang and Jifeng Dai},
  title      = {{DI-MaskDINO}: {A} Joint Object Detection and Instance Segmentation
                Model},
  journal    = {CoRR},
  volume     = {abs/2410.16707},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2410.16707},
  doi        = {10.48550/ARXIV.2410.16707},
  eprinttype = {arXiv},
  eprint     = {2410.16707},
  timestamp  = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2410-16707.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2411-10442,
  author     = {Weiyun Wang and Zhe Chen and Wenhai Wang and Yue Cao and
                Yangzhou Liu and Zhangwei Gao and Jinguo Zhu and Xizhou Zhu and
                Lewei Lu and Yu Qiao and Jifeng Dai},
  title      = {Enhancing the Reasoning Ability of Multimodal Large Language
                Models via Mixed Preference Optimization},
  journal    = {CoRR},
  volume     = {abs/2411.10442},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2411.10442},
  doi        = {10.48550/ARXIV.2411.10442},
  eprinttype = {arXiv},
  eprint     = {2411.10442},
  timestamp  = {Wed, 01 Jan 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2411-10442.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2412-01271,
  author     = {Sen Xing and Muyan Zhong and Zeqiang Lai and Liangchen Li and
                Jiawen Liu and Yaohui Wang and Jifeng Dai and Wenhai Wang},
  title      = {{MuLan}: Adapting Multilingual Diffusion Models for Hundreds of Languages
                with Negligible Cost},
  journal    = {CoRR},
  volume     = {abs/2412.01271},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2412.01271},
  doi        = {10.48550/ARXIV.2412.01271},
  eprinttype = {arXiv},
  eprint     = {2412.01271},
  timestamp  = {Sun, 12 Jan 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2412-01271.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2412-01407,
  author     = {Zehuan Wu and Jingcheng Ni and Xiaodong Wang and Yuxin Guo and
                Rui Chen and Lewei Lu and Jifeng Dai and Yuwen Xiong},
  title      = {{HoloDrive}: Holistic {2D-3D} Multi-Modal Street Scene Generation for
                Autonomous Driving},
  journal    = {CoRR},
  volume     = {abs/2412.01407},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2412.01407},
  doi        = {10.48550/ARXIV.2412.01407},
  eprinttype = {arXiv},
  eprint     = {2412.01407},
  timestamp  = {Sun, 12 Jan 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2412-01407.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2412-05271,
  author     = {Zhe Chen and Weiyun Wang and Yue Cao and Yangzhou Liu and
                Zhangwei Gao and Erfei Cui and Jinguo Zhu and Shenglong Ye and
                Hao Tian and Zhaoyang Liu and Lixin Gu and Xuehui Wang and
                Qingyun Li and Yimin Ren and Zixuan Chen and Jiapeng Luo and
                Jiahao Wang and Tan Jiang and Bo Wang and Conghui He and
                Botian Shi and Xingcheng Zhang and Han Lv and Yi Wang and
                Wenqi Shao and Pei Chu and Zhongying Tu and Tong He and
                Zhiyong Wu and Huipeng Deng and Jiaye Ge and Kai Chen and
                Min Dou and Lewei Lu and Xizhou Zhu and Tong Lu and
                Dahua Lin and Yu Qiao and Jifeng Dai and Wenhai Wang},
  title      = {Expanding Performance Boundaries of Open-Source Multimodal
                Models with Model, Data, and Test-Time Scaling},
  journal    = {CoRR},
  volume     = {abs/2412.05271},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2412.05271},
  doi        = {10.48550/ARXIV.2412.05271},
  eprinttype = {arXiv},
  eprint     = {2412.05271},
  timestamp  = {Wed, 03 Jun 2026 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2412-05271.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2412-09604,
  author     = {Hao Li and Changyao Tian and Jie Shao and Xizhou Zhu and
                Zhaokai Wang and Jinguo Zhu and Wenhan Dou and Xiaogang Wang and
                Hongsheng Li and Lewei Lu and Jifeng Dai},
  title      = {{SynerGen-VL}: Towards Synergistic Image Understanding and Generation
                with Vision Experts and Token Folding},
  journal    = {CoRR},
  volume     = {abs/2412.09604},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2412.09604},
  doi        = {10.48550/ARXIV.2412.09604},
  eprinttype = {arXiv},
  eprint     = {2412.09604},
  timestamp  = {Mon, 20 Jan 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2412-09604.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2412-09613,
  author     = {Chenyu Yang and Xuan Dong and Xizhou Zhu and Weijie Su and
                Jiahao Wang and Hao Tian and Zhe Chen and Wenhai Wang and
                Lewei Lu and Jifeng Dai},
  title      = {{PVC}: Progressive Visual Token Compression for Unified Image and
                Video Processing in Large Vision-Language Models},
  journal    = {CoRR},
  volume     = {abs/2412.09613},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2412.09613},
  doi        = {10.48550/ARXIV.2412.09613},
  eprinttype = {arXiv},
  eprint     = {2412.09613},
  timestamp  = {Mon, 20 Jan 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2412-09613.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2412-09616,
  author     = {Junqi Ge and Ziyi Chen and Jintao Lin and Jinguo Zhu and
                Xihui Liu and Jifeng Dai and Xizhou Zhu},
  title      = {{V2PE}: Improving Multimodal Long-Context Capability of Vision-Language
                Models with Variable Visual Position Encoding},
  journal    = {CoRR},
  volume     = {abs/2412.09616},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2412.09616},
  doi        = {10.48550/ARXIV.2412.09616},
  eprinttype = {arXiv},
  eprint     = {2412.09616},
  timestamp  = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2412-09616.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2412-16158,
  author     = {Chenxin Tao and Shiqian Su and Xizhou Zhu and Chenyu Zhang and
                Zhe Chen and Jiawen Liu and Wenhai Wang and Lewei Lu and
                Gao Huang and Yu Qiao and Jifeng Dai},
  title      = {{HoVLE}: Unleashing the Power of Monolithic Vision-Language Models with
                Holistic Vision-Language Embedding},
  journal    = {CoRR},
  volume     = {abs/2412.16158},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2412.16158},
  doi        = {10.48550/ARXIV.2412.16158},
  eprinttype = {arXiv},
  eprint     = {2412.16158},
  timestamp  = {Thu, 23 Jan 2025 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2412-16158.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/ShiHLZCSQDL23,
  author    = {Xiaoyu Shi and Zhaoyang Huang and Dasong Li and Manyuan Zhang and
               Ka Chun Cheung and Simon See and Hongwei Qin and Jifeng Dai and
               Hongsheng Li},
  title     = {{FlowFormer++}: Masked Cost Volume Autoencoding for Pretraining Optical
               Flow Estimation},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
               {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages     = {1599--1610},
  publisher = {{IEEE}},
  year      = {2023},
  url       = {https://doi.org/10.1109/CVPR52729.2023.00160},
  doi       = {10.1109/CVPR52729.2023.00160},
  timestamp = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/cvpr/ShiHLZCSQDL23.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/TaoZ0HLZ00D23,
  author    = {Chenxin Tao and Xizhou Zhu and Weijie Su and Gao Huang and
               Bin Li and Jie Zhou and Yu Qiao and Xiaogang Wang and
               Jifeng Dai},
  title     = {Siamese Image Modeling for Self-Supervised Vision Representation Learning},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
               {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages     = {2132--2141},
  publisher = {{IEEE}},
  year      = {2023},
  url       = {https://doi.org/10.1109/CVPR52729.2023.00212},
  doi       = {10.1109/CVPR52729.2023.00212},
  timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/cvpr/TaoZ0HLZ00D23.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/LiZJZLYWQWWD23,
  author    = {Hao Li and Jinguo Zhu and Xiaohu Jiang and Xizhou Zhu and
               Hongsheng Li and Chun Yuan and Xiaohua Wang and Yu Qiao and
               Xiaogang Wang and Wenhai Wang and Jifeng Dai},
  title     = {{Uni-Perceiver} v2: {A} Generalist Model for Large-Scale Vision and
               Vision-Language Tasks},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
               {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages     = {2691--2700},
  publisher = {{IEEE}},
  year      = {2023},
  url       = {https://doi.org/10.1109/CVPR52729.2023.00264},
  doi       = {10.1109/CVPR52729.2023.00264},
  timestamp = {Thu, 29 Jan 2026 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/cvpr/LiZJZLYWQWWD23.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/WangDCHLZHLLLWQ23,
  author    = {Wenhai Wang and Jifeng Dai and Zhe Chen and Zhenhang Huang and
               Zhiqi Li and Xizhou Zhu and Xiaowei Hu and Tong Lu and
               Lewei Lu and Hongsheng Li and Xiaogang Wang and Yu Qiao},
  title     = {{InternImage}: Exploring Large-Scale Vision Foundation Models with Deformable
               Convolutions},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
               {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages     = {14408--14419},
  publisher = {{IEEE}},
  year      = {2023},
  url       = {https://doi.org/10.1109/CVPR52729.2023.01385},
  doi       = {10.1109/CVPR52729.2023.01385},
  timestamp = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/cvpr/WangDCHLZHLLLWQ23.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/0002ZTLLHQWZD23,
  author    = {Weijie Su and Xizhou Zhu and Chenxin Tao and Lewei Lu and
               Bin Li and Gao Huang and Yu Qiao and Xiaogang Wang and
               Jie Zhou and Jifeng Dai},
  title     = {Towards All-in-One Pre-Training via Maximizing Multi-Modal
               Mutual Information},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
               {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages     = {15888--15899},
  publisher = {{IEEE}},
  year      = {2023},
  url       = {https://doi.org/10.1109/CVPR52729.2023.01525},
  doi       = {10.1109/CVPR52729.2023.01525},
  timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/cvpr/0002ZTLLHQWZD23.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/YangCTTZZHLQLZD23,
  author    = {Chenyu Yang and Yuntao Chen and Hao Tian and Chenxin Tao and
               Xizhou Zhu and Zhaoxiang Zhang and Gao Huang and Hongyang Li and
               Yu Qiao and Lewei Lu and Jie Zhou and Jifeng Dai},
  title     = {{BEVFormer} v2: Adapting Modern Image Backbones to Bird's-Eye-View Recognition
               via Perspective Supervision},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
               {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages     = {17830--17839},
  publisher = {{IEEE}},
  year      = {2023},
  url       = {https://doi.org/10.1109/CVPR52729.2023.01710},
  doi       = {10.1109/CVPR52729.2023.01710},
  timestamp = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/cvpr/YangCTTZZHLQLZD23.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/HuYCLSZCDLWLJLD23,
  author    = {Yihan Hu and Jiazhi Yang and Li Chen and Keyu Li and
               Chonghao Sima and Xizhou Zhu and Siqi Chai and Senyao Du and
               Tianwei Lin and Wenhai Wang and Lewei Lu and Xiaosong Jia and
               Qiang Liu and Jifeng Dai and Yu Qiao and Hongyang Li},
  title     = {Planning-oriented Autonomous Driving},
  booktitle = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
               {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages     = {17853--17862},
  publisher = {{IEEE}},
  year      = {2023},
  url       = {https://doi.org/10.1109/CVPR52729.2023.01712},
  doi       = {10.1109/CVPR52729.2023.01712},
  timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/cvpr/HuYCLSZCDLWLJLD23.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% CVPR 2023 paper; same title and authors as the CoRR record
% DBLP:journals/corr/abs-2303-09757 in this file.
@inproceedings{DBLP:conf/cvpr/XuH00D0H23,
  author       = {Jiaqi Xu and
                  Xiaowei Hu and
                  Lei Zhu and
                  Qi Dou and
                  Jifeng Dai and
                  Yu Qiao and
                  Pheng{-}Ann Heng},
  title        = {Video Dehazing via a Multi-Range Temporal Alignment Network with Physical
                  Prior},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {18053--18062},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.01731},
  doi          = {10.1109/CVPR52729.2023.01731},
  timestamp    = {Mon, 03 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/XuH00D0H23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CVPR 2023 paper, pages 21747--21758.
@inproceedings{DBLP:conf/cvpr/ZhuWFYGDQH23,
  author       = {Yurui Zhu and
                  Tianyu Wang and
                  Xueyang Fu and
                  Xuanyu Yang and
                  Xin Guo and
                  Jifeng Dai and
                  Yu Qiao and
                  Xiaowei Hu},
  title        = {Learning Weather-General and Weather-Specific Features for Image Restoration
                  Under Multiple Adverse Weather Conditions},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2023, Vancouver, BC, Canada, June 17-24, 2023},
  pages        = {21747--21758},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/CVPR52729.2023.02083},
  doi          = {10.1109/CVPR52729.2023.02083},
  timestamp    = {Wed, 17 Sep 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/ZhuWFYGDQH23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICCV 2023 paper; same title and authors as the CoRR record
% DBLP:journals/corr/abs-2303-08340 in this file. The method name in the
% title is brace-protected against sentence-casing styles.
@inproceedings{DBLP:conf/iccv/ShiHBLZCSQDL23,
  author       = {Xiaoyu Shi and
                  Zhaoyang Huang and
                  Weikang Bian and
                  Dasong Li and
                  Manyuan Zhang and
                  Ka Chun Cheung and
                  Simon See and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{VideoFlow:} Exploiting Temporal Cues for Multi-frame Optical Flow Estimation},
  booktitle    = {{IEEE/CVF} International Conference on Computer Vision, {ICCV} 2023,
                  Paris, France, October 1-6, 2023},
  pages        = {12435--12446},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/ICCV51070.2023.01146},
  doi          = {10.1109/ICCV51070.2023.01146},
  timestamp    = {Mon, 03 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/iccv/ShiHBLZCSQDL23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICLR 2023 paper (OpenReview URL, no DOI); same title and authors as the
% CoRR record DBLP:journals/corr/abs-2205-08534 in this file.
@inproceedings{DBLP:conf/iclr/ChenDWHLDQ23,
  author       = {Zhe Chen and
                  Yuchen Duan and
                  Wenhai Wang and
                  Junjun He and
                  Tong Lu and
                  Jifeng Dai and
                  Yu Qiao},
  title        = {Vision Transformer Adapter for Dense Predictions},
  booktitle    = {The Eleventh International Conference on Learning Representations,
                  {ICLR} 2023, Kigali, Rwanda, May 1-5, 2023},
  publisher    = {OpenReview.net},
  year         = {2023},
  url          = {https://openreview.net/forum?id=plKu2GByCNW},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/iclr/ChenDWHLDQ23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2023 paper; same title and authors as the CoRR record
% DBLP:journals/corr/abs-2305-15021 in this file. The model name in the title
% and the venue acronym in the booktitle are brace-protected.
@inproceedings{DBLP:conf/nips/MuZHWDJWDQL23,
  author       = {Yao Mu and
                  Qinglong Zhang and
                  Mengkang Hu and
                  Wenhai Wang and
                  Mingyu Ding and
                  Jun Jin and
                  Bin Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Ping Luo},
  editor       = {Alice Oh and
                  Tristan Naumann and
                  Amir Globerson and
                  Kate Saenko and
                  Moritz Hardt and
                  Sergey Levine},
  title        = {{EmbodiedGPT:} Vision-Language Pre-Training via Embodied Chain of Thought},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, {NeurIPS} 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/4ec43957eda1126ad4887995d05fae3b-Abstract-Conference.html},
  timestamp    = {Fri, 30 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/MuZHWDJWDQL23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2023 (Datasets and Benchmarks) paper; same title as the CoRR record
% DBLP:journals/corr/abs-2307-00716 in this file.
% NOTE(review): the CoRR record lists the first two authors in the opposite
% order -- confirm against the published versions before unifying.
% The dataset name in the title and the venue acronym are brace-protected.
@inproceedings{DBLP:conf/nips/SunPGLDWZZQWDQW23,
  author       = {Keqiang Sun and
                  Junting Pan and
                  Yuying Ge and
                  Hao Li and
                  Haodong Duan and
                  Xiaoshi Wu and
                  Renrui Zhang and
                  Aojun Zhou and
                  Zipeng Qin and
                  Yi Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Limin Wang and
                  Hongsheng Li},
  editor       = {Alice Oh and
                  Tristan Naumann and
                  Amir Globerson and
                  Kate Saenko and
                  Moritz Hardt and
                  Sergey Levine},
  title        = {{JourneyDB:} {A} Benchmark for Generative Image Understanding},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, {NeurIPS} 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/9bc59aff4685e39e1a8175d5303248a1-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp    = {Fri, 27 Dec 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/SunPGLDWZZQWDQW23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2023 paper; same title and authors as the CoRR record
% DBLP:journals/corr/abs-2305-11175 in this file. The model name in the title
% and the venue acronym in the booktitle are brace-protected.
@inproceedings{DBLP:conf/nips/WangCCWZZLLZQD23,
  author       = {Wenhai Wang and
                  Zhe Chen and
                  Xiaokang Chen and
                  Jiannan Wu and
                  Xizhou Zhu and
                  Gang Zeng and
                  Ping Luo and
                  Tong Lu and
                  Jie Zhou and
                  Yu Qiao and
                  Jifeng Dai},
  editor       = {Alice Oh and
                  Tristan Naumann and
                  Amir Globerson and
                  Kate Saenko and
                  Moritz Hardt and
                  Sergey Levine},
  title        = {{VisionLLM:} Large Language Model is also an Open-Ended Decoder for
                  Vision-Centric Tasks},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, {NeurIPS} 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/c1f7b1ed763e9c75e4db74b49b76db5f-Abstract-Conference.html},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/WangCCWZZLLZQD23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2303.01237 (CoRR). The method name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2303-01237,
  author       = {Xiaoyu Shi and
                  Zhaoyang Huang and
                  Dasong Li and
                  Manyuan Zhang and
                  Ka Chun Cheung and
                  Simon See and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{FlowFormer++:} Masked Cost Volume Autoencoding for Pretraining Optical
                  Flow Estimation},
  journal      = {CoRR},
  volume       = {abs/2303.01237},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2303.01237},
  doi          = {10.48550/ARXIV.2303.01237},
  eprinttype   = {arXiv},
  eprint       = {2303.01237},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2303-01237.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2303.01503 (CoRR). The method name and the DETRs acronym in
% the title are brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2303-01503,
  author       = {Rongyao Fang and
                  Peng Gao and
                  Aojun Zhou and
                  Yingjie Cai and
                  Si Liu and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{FeatAug-DETR:} Enriching One-to-Many Matching for {DETRs} with Feature
                  Augmentation},
  journal      = {CoRR},
  volume       = {abs/2303.01503},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2303.01503},
  doi          = {10.48550/ARXIV.2303.01503},
  eprinttype   = {arXiv},
  eprint       = {2303.01503},
  timestamp    = {Fri, 10 Nov 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2303-01503.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2303.08340 (CoRR); same title and authors as the ICCV record
% DBLP:conf/iccv/ShiHBLZCSQDL23 in this file. The method name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2303-08340,
  author       = {Xiaoyu Shi and
                  Zhaoyang Huang and
                  Weikang Bian and
                  Dasong Li and
                  Manyuan Zhang and
                  Ka Chun Cheung and
                  Simon See and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{VideoFlow:} Exploiting Temporal Cues for Multi-frame Optical Flow Estimation},
  journal      = {CoRR},
  volume       = {abs/2303.08340},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2303.08340},
  doi          = {10.48550/ARXIV.2303.08340},
  eprinttype   = {arXiv},
  eprint       = {2303.08340},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2303-08340.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2303.09757 (CoRR); same title and authors as the CVPR record
% DBLP:conf/cvpr/XuH00D0H23 in this file.
@article{DBLP:journals/corr/abs-2303-09757,
  author       = {Jiaqi Xu and
                  Xiaowei Hu and
                  Lei Zhu and
                  Qi Dou and
                  Jifeng Dai and
                  Yu Qiao and
                  Pheng{-}Ann Heng},
  title        = {Video Dehazing via a Multi-Range Temporal Alignment Network with Physical
                  Prior},
  journal      = {CoRR},
  volume       = {abs/2303.09757},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2303.09757},
  doi          = {10.48550/ARXIV.2303.09757},
  eprinttype   = {arXiv},
  eprint       = {2303.09757},
  timestamp    = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2303-09757.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2305.05662 (CoRR). The system name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2305-05662,
  author       = {Zhaoyang Liu and
                  Yinan He and
                  Wenhai Wang and
                  Weiyun Wang and
                  Yi Wang and
                  Shoufa Chen and
                  Qinglong Zhang and
                  Zeqiang Lai and
                  Yang Yang and
                  Qingyun Li and
                  Jiashuo Yu and
                  Kunchang Li and
                  Zhe Chen and
                  Xue Yang and
                  Xizhou Zhu and
                  Yali Wang and
                  Limin Wang and
                  Ping Luo and
                  Jifeng Dai and
                  Yu Qiao},
  title        = {{InternGPT:} Solving Vision-Centric Tasks by Interacting with Chatbots
                  Beyond Language},
  journal      = {CoRR},
  volume       = {abs/2305.05662},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2305.05662},
  doi          = {10.48550/ARXIV.2305.05662},
  eprinttype   = {arXiv},
  eprint       = {2305.05662},
  timestamp    = {Fri, 27 Dec 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2305-05662.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2305.11175 (CoRR); same title and authors as the NeurIPS
% record DBLP:conf/nips/WangCCWZZLLZQD23 in this file. The model name in the
% title is brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2305-11175,
  author       = {Wenhai Wang and
                  Zhe Chen and
                  Xiaokang Chen and
                  Jiannan Wu and
                  Xizhou Zhu and
                  Gang Zeng and
                  Ping Luo and
                  Tong Lu and
                  Jie Zhou and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{VisionLLM:} Large Language Model is also an Open-Ended Decoder for
                  Vision-Centric Tasks},
  journal      = {CoRR},
  volume       = {abs/2305.11175},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2305.11175},
  doi          = {10.48550/ARXIV.2305.11175},
  eprinttype   = {arXiv},
  eprint       = {2305.11175},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2305-11175.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2305.15021 (CoRR); same title and authors as the NeurIPS
% record DBLP:conf/nips/MuZHWDJWDQL23 in this file. The model name in the
% title is brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2305-15021,
  author       = {Yao Mu and
                  Qinglong Zhang and
                  Mengkang Hu and
                  Wenhai Wang and
                  Mingyu Ding and
                  Jun Jin and
                  Bin Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Ping Luo},
  title        = {{EmbodiedGPT:} Vision-Language Pre-Training via Embodied Chain of Thought},
  journal      = {CoRR},
  volume       = {abs/2305.15021},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2305.15021},
  doi          = {10.48550/ARXIV.2305.15021},
  eprinttype   = {arXiv},
  eprint       = {2305.15021},
  timestamp    = {Fri, 30 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2305-15021.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2305.17144 (CoRR). The proper noun Minecraft in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2305-17144,
  author       = {Xizhou Zhu and
                  Yuntao Chen and
                  Hao Tian and
                  Chenxin Tao and
                  Weijie Su and
                  Chenyu Yang and
                  Gao Huang and
                  Bin Li and
                  Lewei Lu and
                  Xiaogang Wang and
                  Yu Qiao and
                  Zhaoxiang Zhang and
                  Jifeng Dai},
  title        = {Ghost in the {Minecraft:} Generally Capable Agents for Open-World Environments
                  via Large Language Models with Text-based Knowledge and Memory},
  journal      = {CoRR},
  volume       = {abs/2305.17144},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2305.17144},
  doi          = {10.48550/ARXIV.2305.17144},
  eprinttype   = {arXiv},
  eprint       = {2305.17144},
  timestamp    = {Sat, 06 Sep 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2305-17144.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2306.01721 (CoRR).
@article{DBLP:journals/corr/abs-2306-01721,
  author       = {Zeqiang Lai and
                  Yuchen Duan and
                  Jifeng Dai and
                  Ziheng Li and
                  Ying Fu and
                  Hongsheng Li and
                  Yu Qiao and
                  Wenhai Wang},
  title        = {Denoising Diffusion Semantic Segmentation with Mask Prior Modeling},
  journal      = {CoRR},
  volume       = {abs/2306.01721},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2306.01721},
  doi          = {10.48550/ARXIV.2306.01721},
  eprinttype   = {arXiv},
  eprint       = {2306.01721},
  timestamp    = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2306-01721.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2306.05423 (CoRR); the acronym in the title is already
% brace-protected.
@article{DBLP:journals/corr/abs-2306-05423,
  author       = {Changyao Tian and
                  Chenxin Tao and
                  Jifeng Dai and
                  Hao Li and
                  Ziheng Li and
                  Lewei Lu and
                  Xiaogang Wang and
                  Hongsheng Li and
                  Gao Huang and
                  Xizhou Zhu},
  title        = {{ADDP:} Learning General Representations for Image Recognition and
                  Generation with Alternating Denoising Diffusion Process},
  journal      = {CoRR},
  volume       = {abs/2306.05423},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2306.05423},
  doi          = {10.48550/ARXIV.2306.05423},
  eprinttype   = {arXiv},
  eprint       = {2306.05423},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2306-05423.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2306.05442 (CoRR). The method name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2306-05442,
  author       = {Zhaoyang Huang and
                  Xiaoyu Shi and
                  Chao Zhang and
                  Qiang Wang and
                  Yijin Li and
                  Hongwei Qin and
                  Jifeng Dai and
                  Xiaogang Wang and
                  Hongsheng Li},
  title        = {{FlowFormer:} {A} Transformer Architecture and Its Masked Cost Volume
                  Autoencoding for Optical Flow},
  journal      = {CoRR},
  volume       = {abs/2306.05442},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2306.05442},
  doi          = {10.48550/ARXIV.2306.05442},
  eprinttype   = {arXiv},
  eprint       = {2306.05442},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2306-05442.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2307.00716 (CoRR); same title as the NeurIPS record
% DBLP:conf/nips/SunPGLDWZZQWDQW23 in this file.
% NOTE(review): the NeurIPS record lists the first two authors in the opposite
% order -- confirm against the sources before unifying; order left unchanged.
% The dataset name in the title is brace-protected against sentence casing.
@article{DBLP:journals/corr/abs-2307-00716,
  author       = {Junting Pan and
                  Keqiang Sun and
                  Yuying Ge and
                  Hao Li and
                  Haodong Duan and
                  Xiaoshi Wu and
                  Renrui Zhang and
                  Aojun Zhou and
                  Zipeng Qin and
                  Yi Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Limin Wang and
                  Hongsheng Li},
  title        = {{JourneyDB:} {A} Benchmark for Generative Image Understanding},
  journal      = {CoRR},
  volume       = {abs/2307.00716},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2307.00716},
  doi          = {10.48550/ARXIV.2307.00716},
  eprinttype   = {arXiv},
  eprint       = {2307.00716},
  timestamp    = {Sat, 14 Jun 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2307-00716.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2308.01907 (CoRR).
@article{DBLP:journals/corr/abs-2308-01907,
  author       = {Weiyun Wang and
                  Min Shi and
                  Qingyun Li and
                  Wenhai Wang and
                  Zhenhang Huang and
                  Linjie Xing and
                  Zhe Chen and
                  Hao Li and
                  Xizhou Zhu and
                  Zhiguo Cao and
                  Yushi Chen and
                  Tong Lu and
                  Jifeng Dai and
                  Yu Qiao},
  title        = {The All-Seeing Project: Towards Panoptic Visual Recognition and Understanding
                  of the Open World},
  journal      = {CoRR},
  volume       = {abs/2308.01907},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2308.01907},
  doi          = {10.48550/ARXIV.2308.01907},
  eprinttype   = {arXiv},
  eprint       = {2308.01907},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2308-01907.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2310.07653 (CoRR). The system name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2310-07653,
  author       = {Zeqiang Lai and
                  Xizhou Zhu and
                  Jifeng Dai and
                  Yu Qiao and
                  Wenhai Wang},
  title        = {{Mini-DALLE3:} Interactive Text to Image by Prompting Large Language
                  Models},
  journal      = {CoRR},
  volume       = {abs/2310.07653},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2310.07653},
  doi          = {10.48550/ARXIV.2310.07653},
  eprinttype   = {arXiv},
  eprint       = {2310.07653},
  timestamp    = {Wed, 24 Jul 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2310-07653.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2310.17796 (CoRR). The system name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2310-17796,
  author       = {Zhaoyang Liu and
                  Zeqiang Lai and
                  Zhangwei Gao and
                  Erfei Cui and
                  Zhiheng Li and
                  Xizhou Zhu and
                  Lewei Lu and
                  Qifeng Chen and
                  Yu Qiao and
                  Jifeng Dai and
                  Wenhai Wang},
  title        = {{ControlLLM:} Augment Language Models with Tools by Searching on Graphs},
  journal      = {CoRR},
  volume       = {abs/2310.17796},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2310.17796},
  doi          = {10.48550/ARXIV.2310.17796},
  eprinttype   = {arXiv},
  eprint       = {2310.17796},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2310-17796.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2311.14758 (CoRR). The method name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2311-14758,
  author       = {Yu Yi and
                  Xue Yang and
                  Qingyun Li and
                  Feipeng Da and
                  Junchi Yan and
                  Jifeng Dai and
                  Yu Qiao},
  title        = {{Point2RBox:} Combine Knowledge from Synthetic Visual Patterns for End-to-end
                  Oriented Object Detection with Single Point Supervision},
  journal      = {CoRR},
  volume       = {abs/2311.14758},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2311.14758},
  doi          = {10.48550/ARXIV.2311.14758},
  eprinttype   = {arXiv},
  eprint       = {2311.14758},
  timestamp    = {Mon, 22 Jul 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2311-14758.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2311.18835 (CoRR). The method name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2311-18835,
  author       = {Rongyao Fang and
                  Shilin Yan and
                  Zhaoyang Huang and
                  Jingqiu Zhou and
                  Hao Tian and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{InstructSeq:} Unifying Vision Tasks with Instruction-conditioned Multi-modal
                  Sequence Generation},
  journal      = {CoRR},
  volume       = {abs/2311.18835},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2311.18835},
  doi          = {10.48550/ARXIV.2311.18835},
  eprinttype   = {arXiv},
  eprint       = {2311.18835},
  timestamp    = {Wed, 20 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2311-18835.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2312.09238 (CoRR). The method name and the proper noun
% Minecraft in the title are brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2312-09238,
  author       = {Hao Li and
                  Xue Yang and
                  Zhaokai Wang and
                  Xizhou Zhu and
                  Jie Zhou and
                  Yu Qiao and
                  Xiaogang Wang and
                  Hongsheng Li and
                  Lewei Lu and
                  Jifeng Dai},
  title        = {Auto {MC-Reward:} Automated Dense Reward Design with Large Language
                  Models for {Minecraft}},
  journal      = {CoRR},
  volume       = {abs/2312.09238},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2312.09238},
  doi          = {10.48550/ARXIV.2312.09238},
  eprinttype   = {arXiv},
  eprint       = {2312.09238},
  timestamp    = {Tue, 06 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2312-09238.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2312.09245 (CoRR). The model name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2312-09245,
  author       = {Wenhai Wang and
                  Jiangwei Xie and
                  Chuanyang Hu and
                  Haoming Zou and
                  Jianan Fan and
                  Wenwen Tong and
                  Yang Wen and
                  Silei Wu and
                  Hanming Deng and
                  Zhiqi Li and
                  Hao Tian and
                  Lewei Lu and
                  Xizhou Zhu and
                  Xiaogang Wang and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{DriveMLM:} Aligning Multi-Modal Large Language Models with Behavioral
                  Planning States for Autonomous Driving},
  journal      = {CoRR},
  volume       = {abs/2312.09245},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2312.09245},
  doi          = {10.48550/ARXIV.2312.09245},
  eprinttype   = {arXiv},
  eprint       = {2312.09245},
  timestamp    = {Tue, 19 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2312-09245.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2312.11562 (CoRR); survey with 34 listed authors.
@article{DBLP:journals/corr/abs-2312-11562,
  author       = {Jiankai Sun and
                  Chuanyang Zheng and
                  Enze Xie and
                  Zhengying Liu and
                  Ruihang Chu and
                  Jianing Qiu and
                  Jiaqi Xu and
                  Mingyu Ding and
                  Hongyang Li and
                  Mengzhe Geng and
                  Yue Wu and
                  Wenhai Wang and
                  Junsong Chen and
                  Zhangyue Yin and
                  Xiaozhe Ren and
                  Jie Fu and
                  Junxian He and
                  Wu Yuan and
                  Qi Liu and
                  Xihui Liu and
                  Yu Li and
                  Hao Dong and
                  Yu Cheng and
                  Ming Zhang and
                  Pheng{-}Ann Heng and
                  Jifeng Dai and
                  Ping Luo and
                  Jingdong Wang and
                  Ji{-}Rong Wen and
                  Xipeng Qiu and
                  Yike Guo and
                  Hui Xiong and
                  Qun Liu and
                  Zhenguo Li},
  title        = {A Survey of Reasoning with Foundation Models},
  journal      = {CoRR},
  volume       = {abs/2312.11562},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2312.11562},
  doi          = {10.48550/ARXIV.2312.11562},
  eprinttype   = {arXiv},
  eprint       = {2312.11562},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2312-11562.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2312.14238 (CoRR). The model name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2312-14238,
  author       = {Zhe Chen and
                  Jiannan Wu and
                  Wenhai Wang and
                  Weijie Su and
                  Guo Chen and
                  Sen Xing and
                  Muyan Zhong and
                  Qinglong Zhang and
                  Xizhou Zhu and
                  Lewei Lu and
                  Bin Li and
                  Ping Luo and
                  Tong Lu and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{InternVL:} Scaling up Vision Foundation Models and Aligning for Generic
                  Visual-Linguistic Tasks},
  journal      = {CoRR},
  volume       = {abs/2312.14238},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2312.14238},
  doi          = {10.48550/ARXIV.2312.14238},
  eprinttype   = {arXiv},
  eprint       = {2312.14238},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2312-14238.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CVPR 2022 paper. The method name in the title is brace-protected against
% sentence-casing styles.
@inproceedings{DBLP:conf/cvpr/LiFDLHZ22,
  author       = {Hao Li and
                  Tianwen Fu and
                  Jifeng Dai and
                  Hongsheng Li and
                  Gao Huang and
                  Xizhou Zhu},
  title        = {{AutoLoss-Zero:} Searching Loss Functions from Scratch for Generic Tasks},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2022, New Orleans, LA, USA, June 18-24, 2022},
  pages        = {999--1008},
  publisher    = {{IEEE}},
  year         = {2022},
  url          = {https://doi.org/10.1109/CVPR52688.2022.00108},
  doi          = {10.1109/CVPR52688.2022.00108},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/LiFDLHZ22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CVPR 2022 paper, pages 14411--14420.
@inproceedings{DBLP:conf/cvpr/TaoWZDSHD22,
  author       = {Chenxin Tao and
                  Honghui Wang and
                  Xizhou Zhu and
                  Jiahua Dong and
                  Shiji Song and
                  Gao Huang and
                  Jifeng Dai},
  title        = {Exploring the Equivalence of Siamese Self-Supervised Learning via
                  {A} Unified Gradient Framework},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2022, New Orleans, LA, USA, June 18-24, 2022},
  pages        = {14411--14420},
  publisher    = {{IEEE}},
  year         = {2022},
  url          = {https://doi.org/10.1109/CVPR52688.2022.01403},
  doi          = {10.1109/CVPR52688.2022.01403},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/TaoWZDSHD22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CVPR 2022 paper. The model name in the title is brace-protected against
% sentence-casing styles.
@inproceedings{DBLP:conf/cvpr/ZhuZLWLWD22,
  author       = {Xizhou Zhu and
                  Jinguo Zhu and
                  Hao Li and
                  Xiaoshi Wu and
                  Hongsheng Li and
                  Xiaohua Wang and
                  Jifeng Dai},
  title        = {{Uni-Perceiver:} Pre-training Unified Architecture for Generic Perception
                  for Zero-shot and Few-shot Tasks},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2022, New Orleans, LA, USA, June 18-24, 2022},
  pages        = {16783--16794},
  publisher    = {{IEEE}},
  year         = {2022},
  url          = {https://doi.org/10.1109/CVPR52688.2022.01630},
  doi          = {10.1109/CVPR52688.2022.01630},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/ZhuZLWLWD22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ECCV 2022 paper (Part IX); the CoRR record DBLP:journals/corr/abs-2203-17270
% in this file is the preprint of the same work.
% The model name in the title is brace-protected, and the LNCS volume number
% that accompanies the series field is supplied.
% NOTE(review): volume 13669 is the LNCS number for ECCV 2022 Part IX --
% confirm against the Springer record.
@inproceedings{DBLP:conf/eccv/LiWLXSLQD22,
  author       = {Zhiqi Li and
                  Wenhai Wang and
                  Hongyang Li and
                  Enze Xie and
                  Chonghao Sima and
                  Tong Lu and
                  Yu Qiao and
                  Jifeng Dai},
  editor       = {Shai Avidan and
                  Gabriel J. Brostow and
                  Moustapha Ciss{\'{e}} and
                  Giovanni Maria Farinella and
                  Tal Hassner},
  title        = {{BEVFormer:} Learning Bird's-Eye-View Representation from Multi-camera
                  Images via Spatiotemporal Transformers},
  booktitle    = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                  Israel, October 23-27, 2022, Proceedings, Part {IX}},
  series       = {Lecture Notes in Computer Science},
  volume       = {13669},
  pages        = {1--18},
  publisher    = {Springer},
  year         = {2022},
  url          = {https://doi.org/10.1007/978-3-031-20077-9\_1},
  doi          = {10.1007/978-3-031-20077-9\_1},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/eccv/LiWLXSLQD22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ECCV 2022 paper (Part XXV). The LNCS volume number that accompanies the
% series field is supplied.
% NOTE(review): volume 13685 is the LNCS number for ECCV 2022 Part XXV --
% confirm against the Springer record.
@inproceedings{DBLP:conf/eccv/TianWZDQ22,
  author       = {Changyao Tian and
                  Wenhai Wang and
                  Xizhou Zhu and
                  Jifeng Dai and
                  Yu Qiao},
  editor       = {Shai Avidan and
                  Gabriel J. Brostow and
                  Moustapha Ciss{\'{e}} and
                  Giovanni Maria Farinella and
                  Tal Hassner},
  title        = {{VL-LTR:} Learning Class-wise Visual-Linguistic Representation for
                  Long-Tailed Visual Recognition},
  booktitle    = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                  Israel, October 23-27, 2022, Proceedings, Part {XXV}},
  series       = {Lecture Notes in Computer Science},
  volume       = {13685},
  pages        = {73--91},
  publisher    = {Springer},
  year         = {2022},
  url          = {https://doi.org/10.1007/978-3-031-19806-9\_5},
  doi          = {10.1007/978-3-031-19806-9\_5},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eccv/TianWZDQ22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ECCV 2022 paper (Part XXXV); same title and authors as the CoRR record
% DBLP:journals/corr/abs-2208-03550 in this file. The LNCS volume number that
% accompanies the series field is supplied.
% NOTE(review): volume 13695 is the LNCS number for ECCV 2022 Part XXXV --
% confirm against the Springer record.
@inproceedings{DBLP:conf/eccv/LinGZGMWDQL22,
  author       = {Ziyi Lin and
                  Shijie Geng and
                  Renrui Zhang and
                  Peng Gao and
                  Gerard de Melo and
                  Xiaogang Wang and
                  Jifeng Dai and
                  Yu Qiao and
                  Hongsheng Li},
  editor       = {Shai Avidan and
                  Gabriel J. Brostow and
                  Moustapha Ciss{\'{e}} and
                  Giovanni Maria Farinella and
                  Tal Hassner},
  title        = {Frozen {CLIP} Models are Efficient Video Learners},
  booktitle    = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                  Israel, October 23-27, 2022, Proceedings, Part {XXXV}},
  series       = {Lecture Notes in Computer Science},
  volume       = {13695},
  pages        = {388--404},
  publisher    = {Springer},
  year         = {2022},
  url          = {https://doi.org/10.1007/978-3-031-19833-5\_23},
  doi          = {10.1007/978-3-031-19833-5\_23},
  timestamp    = {Mon, 14 Apr 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eccv/LinGZGMWDQL22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ECCV 2022 paper (Part XXXV); the CoRR record DBLP:journals/corr/abs-2207-09519
% in this file is the preprint of the same work.
% The method name in the title is brace-protected, and the LNCS volume number
% that accompanies the series field is supplied.
% NOTE(review): volume 13695 is the LNCS number for ECCV 2022 Part XXXV --
% confirm against the Springer record.
@inproceedings{DBLP:conf/eccv/ZhangZFGLDQL22,
  author       = {Renrui Zhang and
                  Wei Zhang and
                  Rongyao Fang and
                  Peng Gao and
                  Kunchang Li and
                  Jifeng Dai and
                  Yu Qiao and
                  Hongsheng Li},
  editor       = {Shai Avidan and
                  Gabriel J. Brostow and
                  Moustapha Ciss{\'{e}} and
                  Giovanni Maria Farinella and
                  Tal Hassner},
  title        = {{Tip-Adapter:} Training-Free Adaption of {CLIP} for Few-Shot Classification},
  booktitle    = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                  Israel, October 23-27, 2022, Proceedings, Part {XXXV}},
  series       = {Lecture Notes in Computer Science},
  volume       = {13695},
  pages        = {493--510},
  publisher    = {Springer},
  year         = {2022},
  url          = {https://doi.org/10.1007/978-3-031-19833-5\_29},
  doi          = {10.1007/978-3-031-19833-5\_29},
  timestamp    = {Thu, 12 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/eccv/ZhangZFGLDQL22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ECCV 2022 paper (Part XVII); same title and authors as the CoRR record
% DBLP:journals/corr/abs-2203-16194 in this file.
% The method name in the title is brace-protected, and the LNCS volume number
% that accompanies the series field is supplied.
% NOTE(review): volume 13677 is the LNCS number for ECCV 2022 Part XVII --
% confirm against the Springer record.
@inproceedings{DBLP:conf/eccv/HuangSZWCQDL22,
  author       = {Zhaoyang Huang and
                  Xiaoyu Shi and
                  Chao Zhang and
                  Qiang Wang and
                  Ka Chun Cheung and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  editor       = {Shai Avidan and
                  Gabriel J. Brostow and
                  Moustapha Ciss{\'{e}} and
                  Giovanni Maria Farinella and
                  Tal Hassner},
  title        = {{FlowFormer:} {A} Transformer Architecture for Optical Flow},
  booktitle    = {Computer Vision - {ECCV} 2022 - 17th European Conference, Tel Aviv,
                  Israel, October 23-27, 2022, Proceedings, Part {XVII}},
  series       = {Lecture Notes in Computer Science},
  volume       = {13677},
  pages        = {668--685},
  publisher    = {Springer},
  year         = {2022},
  url          = {https://doi.org/10.1007/978-3-031-19790-1\_40},
  doi          = {10.1007/978-3-031-19790-1\_40},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/eccv/HuangSZWCQDL22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2022 paper; the CoRR record DBLP:journals/corr/abs-2205-03892 in this
% file has the same authors and subtitle under the name ConvMAE.
% Editor given names are spelled out instead of initials (the NeurIPS 2023
% records in this file already use the full form Alice Oh), and the venue
% acronym in the booktitle is brace-protected.
% NOTE(review): expanded editor names should be confirmed against the
% proceedings front matter.
@inproceedings{DBLP:conf/nips/GaoMLLDQ22,
  author       = {Peng Gao and
                  Teli Ma and
                  Hongsheng Li and
                  Ziyi Lin and
                  Jifeng Dai and
                  Yu Qiao},
  editor       = {Sanmi Koyejo and
                  Shakir Mohamed and
                  Alekh Agarwal and
                  Danielle Belgrave and
                  Kyunghyun Cho and
                  Alice Oh},
  title        = {{MCMAE:} Masked Convolution Meets Masked Autoencoders},
  booktitle    = {Advances in Neural Information Processing Systems 35: Annual Conference
                  on Neural Information Processing Systems 2022, {NeurIPS} 2022, New Orleans,
                  LA, USA, November 28 - December 9, 2022},
  year         = {2022},
  url          = {http://papers.nips.cc/paper\_files/paper/2022/hash/e7938ede51225b490bb69f7b361a9259-Abstract-Conference.html},
  timestamp    = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/GaoMLLDQ22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2022 paper; same title and authors as the CoRR record
% DBLP:journals/corr/abs-2206-04674 in this file.
% The model name and the MoEs acronym in the title and the venue acronym in
% the booktitle are brace-protected; editor given names are spelled out
% instead of initials (the NeurIPS 2023 records in this file already use the
% full form Alice Oh).
% NOTE(review): expanded editor names should be confirmed against the
% proceedings front matter.
@inproceedings{DBLP:conf/nips/ZhuZWWLWD22,
  author       = {Jinguo Zhu and
                  Xizhou Zhu and
                  Wenhai Wang and
                  Xiaohua Wang and
                  Hongsheng Li and
                  Xiaogang Wang and
                  Jifeng Dai},
  editor       = {Sanmi Koyejo and
                  Shakir Mohamed and
                  Alekh Agarwal and
                  Danielle Belgrave and
                  Kyunghyun Cho and
                  Alice Oh},
  title        = {{Uni-Perceiver-MoE:} Learning Sparse Generalist Models with Conditional
                  {MoEs}},
  booktitle    = {Advances in Neural Information Processing Systems 35: Annual Conference
                  on Neural Information Processing Systems 2022, {NeurIPS} 2022, New Orleans,
                  LA, USA, November 28 - December 9, 2022},
  year         = {2022},
  url          = {http://papers.nips.cc/paper\_files/paper/2022/hash/11fc8c98b46d4cbdfe8157267228f7d7-Abstract-Conference.html},
  timestamp    = {Mon, 08 Jan 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/ZhuZWWLWD22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2203.16194 (CoRR); same title and authors as the ECCV record
% DBLP:conf/eccv/HuangSZWCQDL22 in this file. The method name in the title is
% brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2203-16194,
  author       = {Zhaoyang Huang and
                  Xiaoyu Shi and
                  Chao Zhang and
                  Qiang Wang and
                  Ka Chun Cheung and
                  Hongwei Qin and
                  Jifeng Dai and
                  Hongsheng Li},
  title        = {{FlowFormer:} {A} Transformer Architecture for Optical Flow},
  journal      = {CoRR},
  volume       = {abs/2203.16194},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2203.16194},
  doi          = {10.48550/ARXIV.2203.16194},
  eprinttype   = {arXiv},
  eprint       = {2203.16194},
  timestamp    = {Thu, 14 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2203-16194.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2203.17270 (CoRR); preprint of the ECCV record
% DBLP:conf/eccv/LiWLXSLQD22 in this file.
% The seventh author was stored with given and family name swapped; it now
% matches the ECCV record and every other record in this file (Yu Qiao).
% The model name in the title is brace-protected against sentence casing.
@article{DBLP:journals/corr/abs-2203-17270,
  author       = {Zhiqi Li and
                  Wenhai Wang and
                  Hongyang Li and
                  Enze Xie and
                  Chonghao Sima and
                  Tong Lu and
                  Yu Qiao and
                  Jifeng Dai},
  title        = {{BEVFormer:} Learning Bird's-Eye-View Representation from Multi-Camera
                  Images via Spatiotemporal Transformers},
  journal      = {CoRR},
  volume       = {abs/2203.17270},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2203.17270},
  doi          = {10.48550/ARXIV.2203.17270},
  eprinttype   = {arXiv},
  eprint       = {2203.17270},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2203-17270.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2205.03892 (CoRR); the NeurIPS record
% DBLP:conf/nips/GaoMLLDQ22 in this file has the same authors and subtitle
% under the name MCMAE. The model name in the title is brace-protected
% against sentence-casing styles.
@article{DBLP:journals/corr/abs-2205-03892,
  author       = {Peng Gao and
                  Teli Ma and
                  Hongsheng Li and
                  Ziyi Lin and
                  Jifeng Dai and
                  Yu Qiao},
  title        = {{ConvMAE:} Masked Convolution Meets Masked Autoencoders},
  journal      = {CoRR},
  volume       = {abs/2205.03892},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2205.03892},
  doi          = {10.48550/ARXIV.2205.03892},
  eprinttype   = {arXiv},
  eprint       = {2205.03892},
  timestamp    = {Mon, 03 Jun 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2205-03892.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2205.08534 (CoRR); same title and authors as the ICLR record
% DBLP:conf/iclr/ChenDWHLDQ23 in this file.
@article{DBLP:journals/corr/abs-2205-08534,
  author       = {Zhe Chen and
                  Yuchen Duan and
                  Wenhai Wang and
                  Junjun He and
                  Tong Lu and
                  Jifeng Dai and
                  Yu Qiao},
  title        = {Vision Transformer Adapter for Dense Predictions},
  journal      = {CoRR},
  volume       = {abs/2205.08534},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2205.08534},
  doi          = {10.48550/ARXIV.2205.08534},
  eprinttype   = {arXiv},
  eprint       = {2205.08534},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2205-08534.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2206.01204 (CoRR).
@article{DBLP:journals/corr/abs-2206-01204,
  author       = {Chenxin Tao and
                  Xizhou Zhu and
                  Gao Huang and
                  Yu Qiao and
                  Xiaogang Wang and
                  Jifeng Dai},
  title        = {Siamese Image Modeling for Self-Supervised Vision Representation Learning},
  journal      = {CoRR},
  volume       = {abs/2206.01204},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2206.01204},
  doi          = {10.48550/ARXIV.2206.01204},
  eprinttype   = {arXiv},
  eprint       = {2206.01204},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2206-01204.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2206.04674 (CoRR); same title and authors as the NeurIPS
% record DBLP:conf/nips/ZhuZWWLWD22 in this file. The model name and the MoEs
% acronym in the title are brace-protected against sentence-casing styles.
@article{DBLP:journals/corr/abs-2206-04674,
  author       = {Jinguo Zhu and
                  Xizhou Zhu and
                  Wenhai Wang and
                  Xiaohua Wang and
                  Hongsheng Li and
                  Xiaogang Wang and
                  Jifeng Dai},
  title        = {{Uni-Perceiver-MoE:} Learning Sparse Generalist Models with Conditional
                  {MoEs}},
  journal      = {CoRR},
  volume       = {abs/2206.04674},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2206.04674},
  doi          = {10.48550/ARXIV.2206.04674},
  eprinttype   = {arXiv},
  eprint       = {2206.04674},
  timestamp    = {Fri, 03 Nov 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2206-04674.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2207-09519,
author = {Renrui Zhang and
Zhang Wei and
Rongyao Fang and
Peng Gao and
Kunchang Li and
Jifeng Dai and
Yu Qiao and
Hongsheng Li},
title = {{Tip-Adapter}: Training-free Adaption of {CLIP} for Few-shot Classification},
journal = {CoRR},
volume = {abs/2207.09519},
year = {2022},
url = {https://doi.org/10.48550/arXiv.2207.09519},
doi = {10.48550/ARXIV.2207.09519},
eprinttype = {arXiv},
eprint = {2207.09519},
timestamp = {Tue, 16 Jul 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2207-09519.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2208-03550,
author = {Ziyi Lin and
Shijie Geng and
Renrui Zhang and
Peng Gao and
Gerard de Melo and
Xiaogang Wang and
Jifeng Dai and
Yu Qiao and
Hongsheng Li},
title = {Frozen {CLIP} Models are Efficient Video Learners},
journal = {CoRR},
volume = {abs/2208.03550},
year = {2022},
url = {https://doi.org/10.48550/arXiv.2208.03550},
doi = {10.48550/ARXIV.2208.03550},
eprinttype = {arXiv},
eprint = {2208.03550},
timestamp = {Mon, 03 Jun 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2208-03550.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2209-05324,
author = {Hongyang Li and
Chonghao Sima and
Jifeng Dai and
Wenhai Wang and
Lewei Lu and
Huijie Wang and
Enze Xie and
Zhiqi Li and
Hanming Deng and
Hao Tian and
Xizhou Zhu and
Li Chen and
Yulu Gao and
Xiangwei Geng and
Jia Zeng and
Yang Li and
Jiazhi Yang and
Xiaosong Jia and
Bohan Yu and
Yu Qiao and
Dahua Lin and
Si Liu and
Junchi Yan and
Jianping Shi and
Ping Luo},
title = {Delving into the Devils of Bird's-eye-view Perception: {A} Review,
Evaluation and Recipe},
journal = {CoRR},
volume = {abs/2209.05324},
year = {2022},
url = {https://doi.org/10.48550/arXiv.2209.05324},
doi = {10.48550/ARXIV.2209.05324},
eprinttype = {arXiv},
eprint = {2209.05324},
timestamp = {Tue, 14 Jan 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2209-05324.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2211-05778,
author = {Wenhai Wang and
Jifeng Dai and
Zhe Chen and
Zhenhang Huang and
Zhiqi Li and
Xizhou Zhu and
Xiaowei Hu and
Tong Lu and
Lewei Lu and
Hongsheng Li and
Xiaogang Wang and
Yu Qiao},
title = {{InternImage}: Exploring Large-Scale Vision Foundation Models with Deformable
Convolutions},
journal = {CoRR},
volume = {abs/2211.05778},
year = {2022},
url = {https://doi.org/10.48550/arXiv.2211.05778},
doi = {10.48550/ARXIV.2211.05778},
eprinttype = {arXiv},
eprint = {2211.05778},
timestamp = {Wed, 03 Jun 2026 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2211-05778.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2211-05781,
author = {Jifeng Dai and
Min Shi and
Weiyun Wang and
Sitong Wu and
Linjie Xing and
Wenhai Wang and
Xizhou Zhu and
Lewei Lu and
Jie Zhou and
Xiaogang Wang and
Yu Qiao and
Xiaowei Hu},
title = {Demystify Transformers {\&} Convolutions in Modern Image Deep
Networks},
journal = {CoRR},
volume = {abs/2211.05781},
year = {2022},
url = {https://doi.org/10.48550/arXiv.2211.05781},
doi = {10.48550/ARXIV.2211.05781},
eprinttype = {arXiv},
eprint = {2211.05781},
timestamp = {Mon, 29 Jul 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2211-05781.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2211-09807,
author = {Weijie Su and
Xizhou Zhu and
Chenxin Tao and
Lewei Lu and
Bin Li and
Gao Huang and
Yu Qiao and
Xiaogang Wang and
Jie Zhou and
Jifeng Dai},
title = {Towards All-in-one Pre-training via Maximizing Multi-modal Mutual
Information},
journal = {CoRR},
volume = {abs/2211.09807},
year = {2022},
url = {https://doi.org/10.48550/arXiv.2211.09807},
doi = {10.48550/ARXIV.2211.09807},
eprinttype = {arXiv},
eprint = {2211.09807},
timestamp = {Tue, 20 Aug 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2211-09807.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2211-09808,
author = {Hao Li and
Jinguo Zhu and
Xiaohu Jiang and
Xizhou Zhu and
Hongsheng Li and
Chun Yuan and
Xiaohua Wang and
Yu Qiao and
Xiaogang Wang and
Wenhai Wang and
Jifeng Dai},
title = {{Uni-Perceiver} v2: {A} Generalist Model for Large-Scale Vision and
Vision-Language Tasks},
journal = {CoRR},
volume = {abs/2211.09808},
year = {2022},
url = {https://doi.org/10.48550/arXiv.2211.09808},
doi = {10.48550/ARXIV.2211.09808},
eprinttype = {arXiv},
eprint = {2211.09808},
timestamp = {Tue, 24 Mar 2026 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2211-09808.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2211-10439,
author = {Chenyu Yang and
Yuntao Chen and
Hao Tian and
Chenxin Tao and
Xizhou Zhu and
Zhaoxiang Zhang and
Gao Huang and
Hongyang Li and
Yu Qiao and
Lewei Lu and
Jie Zhou and
Jifeng Dai},
title = {{BEVFormer} v2: Adapting Modern Image Backbones to Bird's-Eye-View Recognition
via Perspective Supervision},
journal = {CoRR},
volume = {abs/2211.10439},
year = {2022},
url = {https://doi.org/10.48550/arXiv.2211.10439},
doi = {10.48550/ARXIV.2211.10439},
eprinttype = {arXiv},
eprint = {2211.10439},
timestamp = {Tue, 19 Nov 2024 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2211-10439.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2212-10156,
author = {Yihan Hu and
Jiazhi Yang and
Li Chen and
Keyu Li and
Chonghao Sima and
Xizhou Zhu and
Siqi Chai and
Senyao Du and
Tianwei Lin and
Wenhai Wang and
Lewei Lu and
Xiaosong Jia and
Qiang Liu and
Jifeng Dai and
Yu Qiao and
Hongyang Li},
title = {Goal-oriented Autonomous Driving},
journal = {CoRR},
volume = {abs/2212.10156},
year = {2022},
url = {https://doi.org/10.48550/arXiv.2212.10156},
doi = {10.48550/ARXIV.2212.10156},
eprinttype = {arXiv},
eprint = {2212.10156},
timestamp = {Mon, 14 Apr 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2212-10156.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/TianCDZZ21,
author = {Hao Tian and
Yuntao Chen and
Jifeng Dai and
Zhaoxiang Zhang and
Xizhou Zhu},
title = {Unsupervised Object Detection With {LIDAR} Clues},
booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
2021, virtual, June 19-25, 2021},
pages = {5962--5972},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2021},
url = {https://openaccess.thecvf.com/content/CVPR2021/html/Tian\_Unsupervised\_Object\_Detection\_With\_LIDAR\_Clues\_CVPR\_2021\_paper.html},
doi = {10.1109/CVPR46437.2021.00590},
timestamp = {Sun, 01 Feb 2026 00:00:00 +0100},
biburl = {https://dblp.org/rec/conf/cvpr/TianCDZZ21.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/0007Z0D021,
author = {Peng Gao and
Minghang Zheng and
Xiaogang Wang and
Jifeng Dai and
Hongsheng Li},
title = {Fast Convergence of {DETR} with Spatially Modulated Co-Attention},
booktitle = {2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV}
2021, Montreal, QC, Canada, October 10-17, 2021},
pages = {3601--3610},
publisher = {{IEEE}},
year = {2021},
url = {https://doi.org/10.1109/ICCV48922.2021.00360},
doi = {10.1109/ICCV48922.2021.00360},
timestamp = {Mon, 14 Apr 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iccv/0007Z0D021.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/WangZYDKG21,
author = {Wenguan Wang and
Tianfei Zhou and
Fisher Yu and
Jifeng Dai and
Ender Konukoglu and
Luc {Van Gool}},
title = {Exploring Cross-Image Pixel Contrast for Semantic Segmentation},
booktitle = {2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV}
2021, Montreal, QC, Canada, October 10-17, 2021},
pages = {7283--7293},
publisher = {{IEEE}},
year = {2021},
url = {https://doi.org/10.1109/ICCV48922.2021.00721},
doi = {10.1109/ICCV48922.2021.00721},
timestamp = {Sat, 30 Sep 2023 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iccv/WangZYDKG21.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/LiuDZLDH21,
author = {Zhuoming Liu and
Hao Ding and
Huaping Zhong and
Weijia Li and
Jifeng Dai and
Conghui He},
title = {Influence Selection for Active Learning},
booktitle = {2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV}
2021, Montreal, QC, Canada, October 10-17, 2021},
pages = {9254--9263},
publisher = {{IEEE}},
year = {2021},
url = {https://doi.org/10.1109/ICCV48922.2021.00914},
doi = {10.1109/ICCV48922.2021.00914},
timestamp = {Thu, 07 May 2026 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iccv/LiuDZLDH21.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/0019DHSLS0D021,
author = {Rui Liu and
Hanming Deng and
Yangyi Huang and
Xiaoyu Shi and
Lewei Lu and
Wenxiu Sun and
Xiaogang Wang and
Jifeng Dai and
Hongsheng Li},
title = {{FuseFormer}: Fusing Fine-Grained Information in Transformers for Video
Inpainting},
booktitle = {2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV}
2021, Montreal, QC, Canada, October 10-17, 2021},
pages = {14020--14029},
publisher = {{IEEE}},
year = {2021},
url = {https://doi.org/10.1109/ICCV48922.2021.01378},
doi = {10.1109/ICCV48922.2021.01378},
timestamp = {Mon, 14 Apr 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iccv/0019DHSLS0D021.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/LiTZWHD21,
author = {Hao Li and
Chenxin Tao and
Xizhou Zhu and
Xiaogang Wang and
Gao Huang and
Jifeng Dai},
title = {Auto Seg-Loss: Searching Metric Surrogates for Semantic Segmentation},
booktitle = {9th International Conference on Learning Representations, {ICLR} 2021,
Virtual Event, Austria, May 3-7, 2021},
publisher = {OpenReview.net},
year = {2021},
url = {https://openreview.net/forum?id=MJAqnaC2vO1},
timestamp = {Tue, 20 Aug 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iclr/LiTZWHD21.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/ZhuSLLWD21,
author = {Xizhou Zhu and
Weijie Su and
Lewei Lu and
Bin Li and
Xiaogang Wang and
Jifeng Dai},
title = {Deformable {DETR:} Deformable Transformers for End-to-End Object Detection},
booktitle = {9th International Conference on Learning Representations, {ICLR} 2021,
Virtual Event, Austria, May 3-7, 2021},
publisher = {OpenReview.net},
year = {2021},
url = {https://openreview.net/forum?id=gZ9hCDWe6ke},
timestamp = {Tue, 15 Nov 2022 00:00:00 +0100},
biburl = {https://dblp.org/rec/conf/iclr/ZhuSLLWD21.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/TaoLZHLD21,
author = {Chenxin Tao and
Zizhang Li and
Xizhou Zhu and
Gao Huang and
Yong Liu and
Jifeng Dai},
editor = {Marc'Aurelio Ranzato and
Alina Beygelzimer and
Yann N. Dauphin and
Percy Liang and
Jennifer Wortman Vaughan},
title = {Searching Parameterized {AP} Loss for Object Detection},
booktitle = {Advances in Neural Information Processing Systems 34: Annual Conference
on Neural Information Processing Systems 2021, NeurIPS 2021, December
6-14, 2021, virtual},
pages = {22021--22033},
year = {2021},
url = {https://proceedings.neurips.cc/paper/2021/hash/b9009beb804fa097c04d226a8ba5102e-Abstract.html},
timestamp = {Tue, 11 Mar 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/conf/nips/TaoLZHLD21.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2101-07448,
author = {Peng Gao and
Minghang Zheng and
Xiaogang Wang and
Jifeng Dai and
Hongsheng Li},
title = {Fast Convergence of {DETR} with Spatially Modulated Co-Attention},
journal = {CoRR},
volume = {abs/2101.07448},
year = {2021},
url = {https://arxiv.org/abs/2101.07448},
eprinttype = {arXiv},
eprint = {2101.07448},
timestamp = {Thu, 14 Jul 2022 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2101-07448.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2101-11939,
author = {Wenguan Wang and
Tianfei Zhou and
Fisher Yu and
Jifeng Dai and
Ender Konukoglu and
Luc {Van Gool}},
title = {Exploring Cross-Image Pixel Contrast for Semantic Segmentation},
journal = {CoRR},
volume = {abs/2101.11939},
year = {2021},
url = {https://arxiv.org/abs/2101.11939},
eprinttype = {arXiv},
eprint = {2101.11939},
timestamp = {Mon, 18 Sep 2023 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2101-11939.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2103-14026,
author = {Hao Li and
Tianwen Fu and
Jifeng Dai and
Hongsheng Li and
Gao Huang and
Xizhou Zhu},
title = {{AutoLoss-Zero}: Searching Loss Functions from Scratch for Generic Tasks},
journal = {CoRR},
volume = {abs/2103.14026},
year = {2021},
url = {https://arxiv.org/abs/2103.14026},
eprinttype = {arXiv},
eprint = {2103.14026},
timestamp = {Tue, 20 Aug 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2103-14026.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2104-06637,
author = {Rui Liu and
Hanming Deng and
Yangyi Huang and
Xiaoyu Shi and
Lewei Lu and
Wenxiu Sun and
Xiaogang Wang and
Jifeng Dai and
Hongsheng Li},
title = {Decoupled Spatial-Temporal Transformer for Video Inpainting},
journal = {CoRR},
volume = {abs/2104.06637},
year = {2021},
url = {https://arxiv.org/abs/2104.06637},
eprinttype = {arXiv},
eprint = {2104.06637},
timestamp = {Thu, 14 Nov 2024 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2104-06637.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2106-02242,
author = {Peng Gao and
Shijie Geng and
Yu Qiao and
Xiaogang Wang and
Jifeng Dai and
Hongsheng Li},
title = {Scalable Transformers for Neural Machine Translation},
journal = {CoRR},
volume = {abs/2106.02242},
year = {2021},
url = {https://arxiv.org/abs/2106.02242},
eprinttype = {arXiv},
eprint = {2106.02242},
timestamp = {Mon, 03 Jun 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2106-02242.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2107-01151,
author = {Haiyang Wang and
Wenguan Wang and
Xizhou Zhu and
Jifeng Dai and
Liwei Wang},
title = {Collaborative Visual Navigation},
journal = {CoRR},
volume = {abs/2107.01151},
year = {2021},
url = {https://arxiv.org/abs/2107.01151},
eprinttype = {arXiv},
eprint = {2107.01151},
timestamp = {Tue, 12 Apr 2022 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2107-01151.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2108-02404,
author = {Peng Gao and
Minghang Zheng and
Xiaogang Wang and
Jifeng Dai and
Hongsheng Li},
title = {Fast Convergence of {DETR} with Spatially Modulated Co-Attention},
journal = {CoRR},
volume = {abs/2108.02404},
year = {2021},
url = {https://arxiv.org/abs/2108.02404},
eprinttype = {arXiv},
eprint = {2108.02404},
timestamp = {Thu, 14 Jul 2022 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2108-02404.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2108-09331,
author = {Zhuoming Liu and
Hao Ding and
Huaping Zhong and
Weijia Li and
Jifeng Dai and
Conghui He},
title = {Influence Selection for Active Learning},
journal = {CoRR},
volume = {abs/2108.09331},
year = {2021},
url = {https://arxiv.org/abs/2108.09331},
eprinttype = {arXiv},
eprint = {2108.09331},
timestamp = {Thu, 11 Dec 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2108-09331.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2109-02974,
author = {Rui Liu and
Hanming Deng and
Yangyi Huang and
Xiaoyu Shi and
Lewei Lu and
Wenxiu Sun and
Xiaogang Wang and
Jifeng Dai and
Hongsheng Li},
title = {{FuseFormer}: Fusing Fine-Grained Information in Transformers for Video
Inpainting},
journal = {CoRR},
volume = {abs/2109.02974},
year = {2021},
url = {https://arxiv.org/abs/2109.02974},
eprinttype = {arXiv},
eprint = {2109.02974},
timestamp = {Thu, 14 Nov 2024 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2109-02974.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2111-03930,
author = {Renrui Zhang and
Rongyao Fang and
Wei Zhang and
Peng Gao and
Kunchang Li and
Jifeng Dai and
Yu Qiao and
Hongsheng Li},
title = {{Tip-Adapter}: Training-free {CLIP-Adapter} for Better Vision-Language
Modeling},
journal = {CoRR},
volume = {abs/2111.03930},
year = {2021},
url = {https://arxiv.org/abs/2111.03930},
eprinttype = {arXiv},
eprint = {2111.03930},
timestamp = {Thu, 12 Feb 2026 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2111-03930.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2111-13579,
author = {Changyao Tian and
Wenhai Wang and
Xizhou Zhu and
Xiaogang Wang and
Jifeng Dai and
Yu Qiao},
title = {{VL-LTR:} Learning Class-wise Visual-Linguistic Representation for
Long-Tailed Visual Recognition},
journal = {CoRR},
volume = {abs/2111.13579},
year = {2021},
url = {https://arxiv.org/abs/2111.13579},
eprinttype = {arXiv},
eprint = {2111.13579},
timestamp = {Mon, 03 Jun 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2111-13579.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2112-01522,
author = {Xizhou Zhu and
Jinguo Zhu and
Hao Li and
Xiaoshi Wu and
Xiaogang Wang and
Hongsheng Li and
Xiaohua Wang and
Jifeng Dai},
title = {{Uni-Perceiver}: Pre-training Unified Architecture for Generic Perception
for Zero-shot and Few-shot Tasks},
journal = {CoRR},
volume = {abs/2112.01522},
year = {2021},
url = {https://arxiv.org/abs/2112.01522},
eprinttype = {arXiv},
eprint = {2112.01522},
timestamp = {Tue, 06 Aug 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-01522.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2112-05138,
author = {Chenxin Tao and
Zizhang Li and
Xizhou Zhu and
Gao Huang and
Yong Liu and
Jifeng Dai},
title = {Searching Parameterized {AP} Loss for Object Detection},
journal = {CoRR},
volume = {abs/2112.05138},
year = {2021},
url = {https://arxiv.org/abs/2112.05138},
eprinttype = {arXiv},
eprint = {2112.05138},
timestamp = {Tue, 11 Mar 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-05138.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2112-05141,
author = {Chenxin Tao and
Honghui Wang and
Xizhou Zhu and
Jiahua Dong and
Shiji Song and
Gao Huang and
Jifeng Dai},
title = {Exploring the Equivalence of Siamese Self-Supervised Learning via
{A} Unified Gradient Framework},
journal = {CoRR},
volume = {abs/2112.05141},
year = {2021},
url = {https://arxiv.org/abs/2112.05141},
eprinttype = {arXiv},
eprint = {2112.05141},
timestamp = {Tue, 20 Aug 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-05141.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/YangHCSDH20,
author = {Le Yang and
Yizeng Han and
Xi Chen and
Shiji Song and
Jifeng Dai and
Gao Huang},
title = {Resolution Adaptive Networks for Efficient Inference},
booktitle = {2020 {IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2020, Seattle, WA, USA, June 13-19, 2020},
pages = {2366--2375},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2020},
url = {https://openaccess.thecvf.com/content\_CVPR\_2020/html/Yang\_Resolution\_Adaptive\_Networks\_for\_Efficient\_Inference\_CVPR\_2020\_paper.html},
doi = {10.1109/CVPR42600.2020.00244},
timestamp = {Tue, 20 Aug 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/cvpr/YangHCSDH20.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/WangZDPS020,
author = {Wenguan Wang and
Hailong Zhu and
Jifeng Dai and
Yanwei Pang and
Jianbing Shen and
Ling Shao},
title = {Hierarchical Human Parsing With Typed Part-Relation Reasoning},
booktitle = {2020 {IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2020, Seattle, WA, USA, June 13-19, 2020},
pages = {8926--8936},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2020},
url = {https://openaccess.thecvf.com/content\_CVPR\_2020/html/Wang\_Hierarchical\_Human\_Parsing\_With\_Typed\_Part-Relation\_Reasoning\_CVPR\_2020\_paper.html},
doi = {10.1109/CVPR42600.2020.00895},
timestamp = {Sat, 30 Sep 2023 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/cvpr/WangZDPS020.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eccv/SunWDG20,
author = {Guolei Sun and
Wenguan Wang and
Jifeng Dai and
Luc {Van Gool}},
editor = {Andrea Vedaldi and
Horst Bischof and
Thomas Brox and
Jan{-}Michael Frahm},
title = {Mining Cross-Image Semantics for Weakly Supervised Semantic Segmentation},
booktitle = {Computer Vision - {ECCV} 2020 - 16th European Conference, Glasgow,
UK, August 23-28, 2020, Proceedings, Part {II}},
series = {Lecture Notes in Computer Science},
pages = {347--365},
publisher = {Springer},
year = {2020},
url = {https://doi.org/10.1007/978-3-030-58536-5\_21},
doi = {10.1007/978-3-030-58536-5\_21},
timestamp = {Sat, 30 Sep 2023 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/eccv/SunWDG20.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/GaoZLD20,
author = {Hang Gao and
Xizhou Zhu and
Stephen Lin and
Jifeng Dai},
title = {Deformable Kernels: Adapting Effective Receptive Fields for Object
Deformation},
booktitle = {8th International Conference on Learning Representations, {ICLR} 2020,
Addis Ababa, Ethiopia, April 26-30, 2020},
publisher = {OpenReview.net},
year = {2020},
url = {https://openreview.net/forum?id=SkxSv6VFvS},
timestamp = {Thu, 19 May 2022 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iclr/GaoZLD20.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/SuZCLLWD20,
author = {Weijie Su and
Xizhou Zhu and
Yue Cao and
Bin Li and
Lewei Lu and
Furu Wei and
Jifeng Dai},
title = {{VL-BERT:} Pre-training of Generic Visual-Linguistic Representations},
booktitle = {8th International Conference on Learning Representations, {ICLR} 2020,
Addis Ababa, Ethiopia, April 26-30, 2020},
publisher = {OpenReview.net},
year = {2020},
url = {https://openreview.net/forum?id=SygXPaEYvH},
timestamp = {Tue, 12 Apr 2022 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iclr/SuZCLLWD20.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2003-04845,
author = {Wenguan Wang and
Hailong Zhu and
Jifeng Dai and
Yanwei Pang and
Jianbing Shen and
Ling Shao},
title = {Hierarchical Human Parsing with Typed Part-Relation Reasoning},
journal = {CoRR},
volume = {abs/2003.04845},
year = {2020},
url = {https://arxiv.org/abs/2003.04845},
eprinttype = {arXiv},
eprint = {2003.04845},
timestamp = {Tue, 17 Mar 2020 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2003-04845.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2003-07326,
author = {Le Yang and
Yizeng Han and
Xi Chen and
Shiji Song and
Jifeng Dai and
Gao Huang},
title = {Resolution Adaptive Networks for Efficient Inference},
journal = {CoRR},
volume = {abs/2003.07326},
year = {2020},
url = {https://arxiv.org/abs/2003.07326},
eprinttype = {arXiv},
eprint = {2003.07326},
timestamp = {Tue, 20 Aug 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2003-07326.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2007-01947,
author = {Guolei Sun and
Wenguan Wang and
Jifeng Dai and
Luc {Van Gool}},
title = {Mining Cross-Image Semantics for Weakly Supervised Semantic Segmentation},
journal = {CoRR},
volume = {abs/2007.01947},
year = {2020},
url = {https://arxiv.org/abs/2007.01947},
eprinttype = {arXiv},
eprint = {2007.01947},
timestamp = {Fri, 17 Jul 2020 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2007-01947.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2009-01559,
author = {Jingru Tan and
Gang Zhang and
Hanming Deng and
Changbao Wang and
Lewei Lu and
Quanquan Li and
Jifeng Dai},
title = {1st Place Solution of {LVIS} Challenge 2020: {A} Good Box is not a
Guarantee of a Good Mask},
journal = {CoRR},
volume = {abs/2009.01559},
year = {2020},
url = {https://arxiv.org/abs/2009.01559},
eprinttype = {arXiv},
eprint = {2009.01559},
timestamp = {Wed, 16 Sep 2020 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2009-01559.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2010-04159,
author = {Xizhou Zhu and
Weijie Su and
Lewei Lu and
Bin Li and
Xiaogang Wang and
Jifeng Dai},
title = {Deformable {DETR:} Deformable Transformers for End-to-End Object Detection},
journal = {CoRR},
volume = {abs/2010.04159},
year = {2020},
url = {https://arxiv.org/abs/2010.04159},
eprinttype = {arXiv},
eprint = {2010.04159},
timestamp = {Tue, 15 Nov 2022 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2010-04159.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2010-07930,
author = {Hao Li and
Chenxin Tao and
Xizhou Zhu and
Xiaogang Wang and
Gao Huang and
Jifeng Dai},
title = {Auto Seg-Loss: Searching Metric Surrogates for Semantic Segmentation},
journal = {CoRR},
volume = {abs/2010.07930},
year = {2020},
url = {https://arxiv.org/abs/2010.07930},
eprinttype = {arXiv},
eprint = {2010.07930},
timestamp = {Tue, 20 Aug 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2010-07930.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2011-12953,
author = {Hao Tian and
Yuntao Chen and
Jifeng Dai and
Zhaoxiang Zhang and
Xizhou Zhu},
title = {Unsupervised Object Detection with {LiDAR} Clues},
journal = {CoRR},
volume = {abs/2011.12953},
year = {2020},
url = {https://arxiv.org/abs/2011.12953},
eprinttype = {arXiv},
eprint = {2011.12953},
timestamp = {Wed, 20 Nov 2024 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2011-12953.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/ZhuHLD19,
author = {Xizhou Zhu and
Han Hu and
Stephen Lin and
Jifeng Dai},
title = {Deformable {ConvNets} {V2:} More Deformable, Better Results},
booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
2019, Long Beach, CA, USA, June 16-20, 2019},
pages = {9308--9316},
publisher = {Computer Vision Foundation / {IEEE}},
year = {2019},
url = {http://openaccess.thecvf.com/content\_CVPR\_2019/html/Zhu\_Deformable\_ConvNets\_V2\_More\_Deformable\_Better\_Results\_CVPR\_2019\_paper.html},
doi = {10.1109/CVPR.2019.00953},
timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/cvpr/ZhuHLD19.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/ZhuCZLD19,
author = {Xizhou Zhu and
Dazhi Cheng and
Zheng Zhang and
Stephen Lin and
Jifeng Dai},
title = {An Empirical Study of Spatial Attention Mechanisms in Deep Networks},
booktitle = {2019 {IEEE/CVF} International Conference on Computer Vision, {ICCV}
2019, Seoul, Korea (South), October 27 - November 2, 2019},
pages = {6687--6696},
publisher = {{IEEE}},
year = {2019},
url = {https://doi.org/10.1109/ICCV.2019.00679},
doi = {10.1109/ICCV.2019.00679},
timestamp = {Tue, 14 Oct 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/iccv/ZhuCZLD19.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1904-05873,
author = {Xizhou Zhu and
Dazhi Cheng and
Zheng Zhang and
Stephen Lin and
Jifeng Dai},
title = {An Empirical Study of Spatial Attention Mechanisms in Deep Networks},
journal = {CoRR},
volume = {abs/1904.05873},
year = {2019},
url = {http://arxiv.org/abs/1904.05873},
eprinttype = {arXiv},
eprint = {1904.05873},
timestamp = {Thu, 19 May 2022 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1904-05873.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1906-07155,
author = {Kai Chen and
Jiaqi Wang and
Jiangmiao Pang and
Yuhang Cao and
Yu Xiong and
Xiaoxiao Li and
Shuyang Sun and
Wansen Feng and
Ziwei Liu and
Jiarui Xu and
Zheng Zhang and
Dazhi Cheng and
Chenchen Zhu and
Tianheng Cheng and
Qijie Zhao and
Buyu Li and
Xin Lu and
Rui Zhu and
Yue Wu and
Jifeng Dai and
Jingdong Wang and
Jianping Shi and
Wanli Ouyang and
Chen Change Loy and
Dahua Lin},
title = {{MMDetection}: Open {MMLab} Detection Toolbox and Benchmark},
journal = {CoRR},
volume = {abs/1906.07155},
year = {2019},
url = {http://arxiv.org/abs/1906.07155},
eprinttype = {arXiv},
eprint = {1906.07155},
timestamp = {Fri, 15 Dec 2023 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-1906-07155.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1908-08530,
author = {Weijie Su and
Xizhou Zhu and
Yue Cao and
Bin Li and
Lewei Lu and
Furu Wei and
Jifeng Dai},
title = {{VL-BERT:} Pre-training of Generic Visual-Linguistic Representations},
journal = {CoRR},
volume = {abs/1908.08530},
year = {2019},
url = {http://arxiv.org/abs/1908.08530},
eprinttype = {arXiv},
eprint = {1908.08530},
timestamp = {Tue, 12 Apr 2022 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1908-08530.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1910-02940,
author = {Hang Gao and
Xizhou Zhu and
Steve Lin and
Jifeng Dai},
title = {Deformable Kernels: Adapting Effective Receptive Fields for Object
Deformation},
journal = {CoRR},
volume = {abs/1910.02940},
year = {2019},
url = {http://arxiv.org/abs/1910.02940},
eprinttype = {arXiv},
eprint = {1910.02940},
timestamp = {Fri, 06 Jan 2023 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-1910-02940.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/HuGZDW18,
author = {Han Hu and
Jiayuan Gu and
Zheng Zhang and
Jifeng Dai and
Yichen Wei},
title = {Relation Networks for Object Detection},
booktitle = {2018 {IEEE} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2018, Salt Lake City, UT, USA, June 18-22, 2018},
pages = {3588--3597},
publisher = {Computer Vision Foundation / {IEEE} Computer Society},
year = {2018},
url = {http://openaccess.thecvf.com/content\_cvpr\_2018/html/Hu\_Relation\_Networks\_for\_CVPR\_2018\_paper.html},
doi = {10.1109/CVPR.2018.00378},
timestamp = {Sun, 06 Oct 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/cvpr/HuGZDW18.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/ZhuDYW18,
  author       = {Xizhou Zhu and
                  Jifeng Dai and
                  Lu Yuan and
                  Yichen Wei},
  title        = {Towards High Performance Video Object Detection},
  booktitle    = {2018 {IEEE} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2018, Salt Lake City, UT, USA, June 18-22, 2018},
  pages        = {7210--7218},
  publisher    = {Computer Vision Foundation / {IEEE} Computer Society},
  year         = {2018},
  url          = {http://openaccess.thecvf.com/content\_cvpr\_2018/html/Zhu\_Towards\_High\_Performance\_CVPR\_2018\_paper.html},
  doi          = {10.1109/CVPR.2018.00753},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/ZhuDYW18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eccv/GuHWWD18,
  author       = {Jiayuan Gu and
                  Han Hu and
                  Liwei Wang and
                  Yichen Wei and
                  Jifeng Dai},
  editor       = {Vittorio Ferrari and
                  Martial Hebert and
                  Cristian Sminchisescu and
                  Yair Weiss},
  title        = {Learning Region Features for Object Detection},
  booktitle    = {Computer Vision - {ECCV} 2018 - 15th European Conference, Munich,
                  Germany, September 8-14, 2018, Proceedings, Part {XII}},
  series       = {Lecture Notes in Computer Science},
  pages        = {392--406},
  publisher    = {Springer},
  year         = {2018},
  url          = {https://doi.org/10.1007/978-3-030-01258-8\_24},
  doi          = {10.1007/978-3-030-01258-8\_24},
  timestamp    = {Sun, 06 Oct 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eccv/GuHWWD18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1803-07066,
  author       = {Jiayuan Gu and
                  Han Hu and
                  Liwei Wang and
                  Yichen Wei and
                  Jifeng Dai},
  title        = {Learning Region Features for Object Detection},
  journal      = {CoRR},
  volume       = {abs/1803.07066},
  year         = {2018},
  url          = {http://arxiv.org/abs/1803.07066},
  eprinttype   = {arXiv},
  eprint       = {1803.07066},
  timestamp    = {Mon, 05 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1803-07066.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1804-05830,
  author       = {Xizhou Zhu and
                  Jifeng Dai and
                  Xingchi Zhu and
                  Yichen Wei and
                  Lu Yuan},
  title        = {Towards High Performance Video Object Detection for Mobiles},
  journal      = {CoRR},
  volume       = {abs/1804.05830},
  year         = {2018},
  url          = {http://arxiv.org/abs/1804.05830},
  eprinttype   = {arXiv},
  eprint       = {1804.05830},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1804-05830.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1811-11167,
  author       = {Zheng Zhang and
                  Dazhi Cheng and
                  Xizhou Zhu and
                  Stephen Lin and
                  Jifeng Dai},
  title        = {Integrated Object Detection and Tracking with Tracklet-Conditioned
                  Detection},
  journal      = {CoRR},
  volume       = {abs/1811.11167},
  year         = {2018},
  url          = {http://arxiv.org/abs/1811.11167},
  eprinttype   = {arXiv},
  eprint       = {1811.11167},
  timestamp    = {Thu, 19 May 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1811-11167.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% {ConvNets} is brace-protected so sentence-casing styles do not lowercase it.
@article{DBLP:journals/corr/abs-1811-11168,
  author       = {Xizhou Zhu and
                  Han Hu and
                  Stephen Lin and
                  Jifeng Dai},
  title        = {Deformable {ConvNets} v2: More Deformable, Better Results},
  journal      = {CoRR},
  volume       = {abs/1811.11168},
  year         = {2018},
  url          = {http://arxiv.org/abs/1811.11168},
  eprinttype   = {arXiv},
  eprint       = {1811.11168},
  timestamp    = {Mon, 05 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1811-11168.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/ZhuXDYW17,
  author       = {Xizhou Zhu and
                  Yuwen Xiong and
                  Jifeng Dai and
                  Lu Yuan and
                  Yichen Wei},
  title        = {Deep Feature Flow for Video Recognition},
  booktitle    = {2017 {IEEE} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2017, Honolulu, HI, USA, July 21-26, 2017},
  pages        = {4141--4150},
  publisher    = {{IEEE} Computer Society},
  year         = {2017},
  url          = {https://doi.ieeecomputersociety.org/10.1109/CVPR.2017.441},
  doi          = {10.1109/CVPR.2017.441},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/ZhuXDYW17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/LiQDJW17,
  author       = {Yi Li and
                  Haozhi Qi and
                  Jifeng Dai and
                  Xiangyang Ji and
                  Yichen Wei},
  title        = {Fully Convolutional Instance-Aware Semantic Segmentation},
  booktitle    = {2017 {IEEE} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2017, Honolulu, HI, USA, July 21-26, 2017},
  pages        = {4438--4446},
  publisher    = {{IEEE} Computer Society},
  year         = {2017},
  url          = {https://doi.org/10.1109/CVPR.2017.472},
  doi          = {10.1109/CVPR.2017.472},
  timestamp    = {Tue, 21 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/LiQDJW17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/ZhuWDYW17,
  author       = {Xizhou Zhu and
                  Yujie Wang and
                  Jifeng Dai and
                  Lu Yuan and
                  Yichen Wei},
  title        = {Flow-Guided Feature Aggregation for Video Object Detection},
  booktitle    = {{IEEE} International Conference on Computer Vision, {ICCV} 2017, Venice,
                  Italy, October 22-29, 2017},
  pages        = {408--417},
  publisher    = {{IEEE} Computer Society},
  year         = {2017},
  url          = {https://doi.org/10.1109/ICCV.2017.52},
  doi          = {10.1109/ICCV.2017.52},
  timestamp    = {Tue, 14 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/ZhuWDYW17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/DaiQXLZHW17,
  author       = {Jifeng Dai and
                  Haozhi Qi and
                  Yuwen Xiong and
                  Yi Li and
                  Guodong Zhang and
                  Han Hu and
                  Yichen Wei},
  title        = {Deformable Convolutional Networks},
  booktitle    = {{IEEE} International Conference on Computer Vision, {ICCV} 2017, Venice,
                  Italy, October 22-29, 2017},
  pages        = {764--773},
  publisher    = {{IEEE} Computer Society},
  year         = {2017},
  url          = {https://doi.org/10.1109/ICCV.2017.89},
  doi          = {10.1109/ICCV.2017.89},
  timestamp    = {Tue, 21 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/DaiQXLZHW17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/DaiQXLZHW17,
  author       = {Jifeng Dai and
                  Haozhi Qi and
                  Yuwen Xiong and
                  Yi Li and
                  Guodong Zhang and
                  Han Hu and
                  Yichen Wei},
  title        = {Deformable Convolutional Networks},
  journal      = {CoRR},
  volume       = {abs/1703.06211},
  year         = {2017},
  url          = {http://arxiv.org/abs/1703.06211},
  eprinttype   = {arXiv},
  eprint       = {1703.06211},
  timestamp    = {Mon, 05 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/DaiQXLZHW17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/ZhuWDYW17,
  author       = {Xizhou Zhu and
                  Yujie Wang and
                  Jifeng Dai and
                  Lu Yuan and
                  Yichen Wei},
  title        = {Flow-Guided Feature Aggregation for Video Object Detection},
  journal      = {CoRR},
  volume       = {abs/1703.10025},
  year         = {2017},
  url          = {http://arxiv.org/abs/1703.10025},
  eprinttype   = {arXiv},
  eprint       = {1703.10025},
  timestamp    = {Thu, 14 Oct 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/ZhuWDYW17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1711-11575,
  author       = {Han Hu and
                  Jiayuan Gu and
                  Zheng Zhang and
                  Jifeng Dai and
                  Yichen Wei},
  title        = {Relation Networks for Object Detection},
  journal      = {CoRR},
  volume       = {abs/1711.11575},
  year         = {2017},
  url          = {http://arxiv.org/abs/1711.11575},
  eprinttype   = {arXiv},
  eprint       = {1711.11575},
  timestamp    = {Mon, 05 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1711-11575.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1711-11577,
  author       = {Xizhou Zhu and
                  Jifeng Dai and
                  Lu Yuan and
                  Yichen Wei},
  title        = {Towards High Performance Video Object Detection},
  journal      = {CoRR},
  volume       = {abs/1711.11577},
  year         = {2017},
  url          = {http://arxiv.org/abs/1711.11577},
  eprinttype   = {arXiv},
  eprint       = {1711.11577},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1711-11577.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/DaiHS16,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Jian Sun},
  title        = {Instance-Aware Semantic Segmentation via Multi-task Network Cascades},
  booktitle    = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
  pages        = {3150--3158},
  publisher    = {{IEEE} Computer Society},
  year         = {2016},
  url          = {https://doi.org/10.1109/CVPR.2016.343},
  doi          = {10.1109/CVPR.2016.343},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/DaiHS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% {ScribbleSup:} is brace-protected so sentence-casing styles keep its inner capital.
@inproceedings{DBLP:conf/cvpr/LinDJHS16,
  author       = {Di Lin and
                  Jifeng Dai and
                  Jiaya Jia and
                  Kaiming He and
                  Jian Sun},
  title        = {{ScribbleSup:} Scribble-Supervised Convolutional Networks for Semantic
                  Segmentation},
  booktitle    = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
  pages        = {3159--3167},
  publisher    = {{IEEE} Computer Society},
  year         = {2016},
  url          = {https://doi.org/10.1109/CVPR.2016.344},
  doi          = {10.1109/CVPR.2016.344},
  timestamp    = {Mon, 03 Mar 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/LinDJHS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eccv/DaiHLR016,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Yi Li and
                  Shaoqing Ren and
                  Jian Sun},
  editor       = {Bastian Leibe and
                  Jiri Matas and
                  Nicu Sebe and
                  Max Welling},
  title        = {Instance-Sensitive Fully Convolutional Networks},
  booktitle    = {Computer Vision - {ECCV} 2016 - 14th European Conference, Amsterdam,
                  The Netherlands, October 11-14, 2016, Proceedings, Part {VI}},
  series       = {Lecture Notes in Computer Science},
  pages        = {534--549},
  publisher    = {Springer},
  year         = {2016},
  url          = {https://doi.org/10.1007/978-3-319-46466-4\_32},
  doi          = {10.1007/978-3-319-46466-4\_32},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eccv/DaiHLR016.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/DaiLHS16,
  author       = {Jifeng Dai and
                  Yi Li and
                  Kaiming He and
                  Jian Sun},
  editor       = {Daniel D. Lee and
                  Masashi Sugiyama and
                  Ulrike von Luxburg and
                  Isabelle Guyon and
                  Roman Garnett},
  title        = {{R-FCN:} Object Detection via Region-based Fully Convolutional Networks},
  booktitle    = {Advances in Neural Information Processing Systems 29: Annual Conference
                  on Neural Information Processing Systems 2016, December 5-10, 2016,
                  Barcelona, Spain},
  pages        = {379--387},
  year         = {2016},
  url          = {https://proceedings.neurips.cc/paper/2016/hash/577ef1154f3240ad5b9b413aa7346a1e-Abstract.html},
  timestamp    = {Mon, 16 May 2022 15:41:51 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/DaiLHS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/DaiHLRS16,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Yi Li and
                  Shaoqing Ren and
                  Jian Sun},
  title        = {Instance-sensitive Fully Convolutional Networks},
  journal      = {CoRR},
  volume       = {abs/1603.08678},
  year         = {2016},
  url          = {http://arxiv.org/abs/1603.08678},
  eprinttype   = {arXiv},
  eprint       = {1603.08678},
  timestamp    = {Tue, 15 Sep 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/DaiHLRS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% {ScribbleSup:} is brace-protected so sentence-casing styles keep its inner capital.
@article{DBLP:journals/corr/LinDJHS16,
  author       = {Di Lin and
                  Jifeng Dai and
                  Jiaya Jia and
                  Kaiming He and
                  Jian Sun},
  title        = {{ScribbleSup:} Scribble-Supervised Convolutional Networks for Semantic
                  Segmentation},
  journal      = {CoRR},
  volume       = {abs/1604.05144},
  year         = {2016},
  url          = {http://arxiv.org/abs/1604.05144},
  eprinttype   = {arXiv},
  eprint       = {1604.05144},
  timestamp    = {Wed, 24 Jul 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/LinDJHS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/DaiLHS16,
  author       = {Jifeng Dai and
                  Yi Li and
                  Kaiming He and
                  Jian Sun},
  title        = {{R-FCN:} Object Detection via Region-based Fully Convolutional Networks},
  journal      = {CoRR},
  volume       = {abs/1605.06409},
  year         = {2016},
  url          = {http://arxiv.org/abs/1605.06409},
  eprinttype   = {arXiv},
  eprint       = {1605.06409},
  timestamp    = {Tue, 15 Sep 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/DaiLHS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/LiQDJW16,
  author       = {Yi Li and
                  Haozhi Qi and
                  Jifeng Dai and
                  Xiangyang Ji and
                  Yichen Wei},
  title        = {Fully Convolutional Instance-aware Semantic Segmentation},
  journal      = {CoRR},
  volume       = {abs/1611.07709},
  year         = {2016},
  url          = {http://arxiv.org/abs/1611.07709},
  eprinttype   = {arXiv},
  eprint       = {1611.07709},
  timestamp    = {Tue, 15 Sep 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/LiQDJW16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/ZhuXDYW16,
  author       = {Xizhou Zhu and
                  Yuwen Xiong and
                  Jifeng Dai and
                  Lu Yuan and
                  Yichen Wei},
  title        = {Deep Feature Flow for Video Recognition},
  journal      = {CoRR},
  volume       = {abs/1611.07715},
  year         = {2016},
  url          = {http://arxiv.org/abs/1611.07715},
  eprinttype   = {arXiv},
  eprint       = {1611.07715},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/ZhuXDYW16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Title is stored in Title Case: styles can downcase it but cannot restore capitals.
@inproceedings{DBLP:conf/cvpr/DaiH015,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Jian Sun},
  title        = {Convolutional Feature Masking for Joint Object and Stuff Segmentation},
  booktitle    = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
                  2015, Boston, MA, USA, June 7-12, 2015},
  pages        = {3992--4000},
  publisher    = {{IEEE} Computer Society},
  year         = {2015},
  url          = {https://doi.org/10.1109/CVPR.2015.7299025},
  doi          = {10.1109/CVPR.2015.7299025},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/DaiH015.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% {BoxSup:} is brace-protected so sentence-casing styles keep its inner capital.
@inproceedings{DBLP:conf/iccv/DaiHS15,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Jian Sun},
  title        = {{BoxSup:} Exploiting Bounding Boxes to Supervise Convolutional Networks
                  for Semantic Segmentation},
  booktitle    = {2015 {IEEE} International Conference on Computer Vision, {ICCV} 2015,
                  Santiago, Chile, December 7-13, 2015},
  pages        = {1635--1643},
  publisher    = {{IEEE} Computer Society},
  year         = {2015},
  url          = {https://doi.org/10.1109/ICCV.2015.191},
  doi          = {10.1109/ICCV.2015.191},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/DaiHS15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% eprint/eprinttype carry the arXiv id that appears in the url, matching the
% arXiv-hosted CoRR records in this file.
@inproceedings{DBLP:journals/corr/DaiW14,
  author       = {Jifeng Dai and
                  Ying Nian Wu},
  editor       = {Yoshua Bengio and
                  Yann LeCun},
  title        = {Generative Modeling of Convolutional Neural Networks},
  booktitle    = {3rd International Conference on Learning Representations, {ICLR} 2015,
                  San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
  year         = {2015},
  url          = {http://arxiv.org/abs/1412.6296},
  eprinttype   = {arXiv},
  eprint       = {1412.6296},
  timestamp    = {Thu, 25 Jul 2019 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/DaiW14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% {BoxSup:} is brace-protected so sentence-casing styles keep its inner capital.
@article{DBLP:journals/corr/DaiH015,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Jian Sun},
  title        = {{BoxSup:} Exploiting Bounding Boxes to Supervise Convolutional Networks
                  for Semantic Segmentation},
  journal      = {CoRR},
  volume       = {abs/1503.01640},
  year         = {2015},
  url          = {http://arxiv.org/abs/1503.01640},
  eprinttype   = {arXiv},
  eprint       = {1503.01640},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/DaiH015.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/DaiHS15,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Jian Sun},
  title        = {Instance-aware Semantic Segmentation via Multi-task Network Cascades},
  journal      = {CoRR},
  volume       = {abs/1512.04412},
  year         = {2015},
  url          = {http://arxiv.org/abs/1512.04412},
  eprinttype   = {arXiv},
  eprint       = {1512.04412},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/DaiHS15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/DaiHHZW14,
  author       = {Jifeng Dai and
                  Yi Hong and
                  Wenze Hu and
                  Song{-}Chun Zhu and
                  Ying Nian Wu},
  title        = {Unsupervised Learning of Dictionaries of Hierarchical Compositional
                  Models},
  booktitle    = {2014 {IEEE} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2014, Columbus, OH, USA, June 23-28, 2014},
  pages        = {2505--2512},
  publisher    = {{IEEE} Computer Society},
  year         = {2014},
  url          = {https://doi.org/10.1109/CVPR.2014.321},
  doi          = {10.1109/CVPR.2014.321},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cvpr/DaiHHZW14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/DaiH014,
  author       = {Jifeng Dai and
                  Kaiming He and
                  Jian Sun},
  title        = {Convolutional Feature Masking for Joint Object and Stuff Segmentation},
  journal      = {CoRR},
  volume       = {abs/1412.1283},
  year         = {2014},
  url          = {http://arxiv.org/abs/1412.1283},
  eprinttype   = {arXiv},
  eprint       = {1412.1283},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/DaiH014.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/DaiWZZ13,
  author       = {Jifeng Dai and
                  Ying Nian Wu and
                  Jie Zhou and
                  Song{-}Chun Zhu},
  title        = {Cosegmentation and Cosketch by Unsupervised Learning},
  booktitle    = {{IEEE} International Conference on Computer Vision, {ICCV} 2013, Sydney,
                  Australia, December 1-8, 2013},
  pages        = {1305--1312},
  publisher    = {{IEEE} Computer Society},
  year         = {2013},
  url          = {https://doi.org/10.1109/ICCV.2013.165},
  doi          = {10.1109/ICCV.2013.165},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccv/DaiWZZ13.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/pami/DaiFZ12,
  author       = {Jifeng Dai and
                  Jianjiang Feng and
                  Jie Zhou},
  title        = {Robust and Efficient Ridge-Based Palmprint Matching},
  journal      = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume       = {34},
  number       = {8},
  pages        = {1618--1632},
  year         = {2012},
  url          = {https://doi.org/10.1109/TPAMI.2011.237},
  doi          = {10.1109/TPAMI.2011.237},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/pami/DaiFZ12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Title is stored in Title Case: styles can downcase it but cannot restore capitals.
@inproceedings{DBLP:conf/icpr/DaiFZ12,
  author       = {Jifeng Dai and
                  Jianjiang Feng and
                  Jie Zhou},
  title        = {Mining Sub-categories for Object Detection},
  booktitle    = {Proceedings of the 21st International Conference on Pattern Recognition,
                  {ICPR} 2012, Tsukuba, Japan, November 11-15, 2012},
  pages        = {3260--3263},
  publisher    = {{IEEE} Computer Society},
  year         = {2012},
  url          = {https://ieeexplore.ieee.org/document/6460860/},
  timestamp    = {Tue, 10 Aug 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icpr/DaiFZ12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/pami/DaiZ11,
  author       = {Jifeng Dai and
                  Jie Zhou},
  title        = {Multifeature-Based High-Resolution Palmprint Recognition},
  journal      = {{IEEE} Trans. Pattern Anal. Mach. Intell.},
  volume       = {33},
  number       = {5},
  pages        = {945--957},
  year         = {2011},
  url          = {https://doi.org/10.1109/TPAMI.2010.164},
  doi          = {10.1109/TPAMI.2010.164},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/pami/DaiZ11.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
manage site settings
To protect your privacy, all features that rely on external API calls from your browser are turned off by default. You need to opt-in for them to become active. All settings here will be stored as cookies with your web browser. For more information see our F.A.Q.