% BibTeX records: Andy Zou (exported from the dblp computer science bibliography, https://dblp.org)
% arXiv preprint 2603.15714 (CoRR, 2026).
@article{DBLP:journals/corr/abs-2603-15714,
  author       = {Mateusz Dziemian and
                  Maxwell Lin and
                  Xiaohan Fu and
                  Micha Nowak and
                  Nick Winter and
                  Eliot Krzysztof Jones and
                  Andy Zou and
                  Lama Ahmad and
                  Kamalika Chaudhuri and
                  Sahana Chennabasappa and
                  Xander Davies and
                  Lauren Deason and
                  Benjamin L. Edelman and
                  Tanner Emek and
                  Ivan Evtimov and
                  Jim Gust and
                  Maia Hamin and
                  Kat He and
                  Klaudia Krawiecka and
                  Riccardo Patana and
                  Neil Perry and
                  Troy Peterson and
                  Xiangyu Qi and
                  Javier Rando and
                  Zifan Wang and
                  Zihan Wang and
                  Spencer Whitman and
                  Eric Winsor and
                  Arman Zharmagambetov and
                  Matt Fredrikson and
                  Zico Kolter},
  title        = {How Vulnerable Are {AI} Agents to Indirect Prompt Injections? Insights
                  from a Large-Scale Public Competition},
  journal      = {CoRR},
  volume       = {abs/2603.15714},
  year         = {2026},
  url          = {https://doi.org/10.48550/arXiv.2603.15714},
  doi          = {10.48550/ARXIV.2603.15714},
  eprinttype   = {arXiv},
  eprint       = {2603.15714},
  timestamp    = {Tue, 14 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2603-15714.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICLR 2025 conference paper.
% AgentHarm is brace-protected so that styles which sentence-case titles
% do not turn it into "Agentharm".
@inproceedings{DBLP:conf/iclr/AndriushchenkoS25,
  author       = {Maksym Andriushchenko and
                  Alexandra Souly and
                  Mateusz Dziemian and
                  Derek Duenas and
                  Maxwell Lin and
                  Justin Wang and
                  Dan Hendrycks and
                  Andy Zou and
                  J. Zico Kolter and
                  Matt Fredrikson and
                  Yarin Gal and
                  Xander Davies},
  title        = {{AgentHarm}: {A} Benchmark for Measuring Harmfulness of {LLM} Agents},
  booktitle    = {The Thirteenth International Conference on Learning Representations,
                  {ICLR} 2025, Singapore, April 24-28, 2025},
  publisher    = {OpenReview.net},
  year         = {2025},
  url          = {https://openreview.net/forum?id=AC5n7xHuR1},
  timestamp    = {Thu, 15 May 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iclr/AndriushchenkoS25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICLR 2025 conference paper.
% The acronym LLMs is brace-protected so that styles which sentence-case
% titles do not lowercase it to "llms".
@inproceedings{DBLP:conf/iclr/TamirisaBPZGSLW25,
  author       = {Rishub Tamirisa and
                  Bhrugu Bharathi and
                  Long Phan and
                  Andy Zhou and
                  Alice Gatti and
                  Tarun Suresh and
                  Maxwell Lin and
                  Justin Wang and
                  Rowan Wang and
                  Ron Arel and
                  Andy Zou and
                  Dawn Song and
                  Bo Li and
                  Dan Hendrycks and
                  Mantas Mazeika},
  title        = {Tamper-Resistant Safeguards for Open-Weight {LLMs}},
  booktitle    = {The Thirteenth International Conference on Learning Representations,
                  {ICLR} 2025, Singapore, April 24-28, 2025},
  publisher    = {OpenReview.net},
  year         = {2025},
  url          = {https://openreview.net/forum?id=4FIjRodbW6},
  timestamp    = {Thu, 15 May 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iclr/TamirisaBPZGSLW25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2502.14296 (CoRR, 2025).
@article{DBLP:journals/corr/abs-2502-14296,
  author       = {Yue Huang and
                  Chujie Gao and
                  Siyuan Wu and
                  Haoran Wang and
                  Xiangqi Wang and
                  Yujun Zhou and
                  Yanbo Wang and
                  Jiayi Ye and
                  Jiawen Shi and
                  Qihui Zhang and
                  Yuan Li and
                  Han Bao and
                  Zhaoyi Liu and
                  Tianrui Guan and
                  Dongping Chen and
                  Ruoxi Chen and
                  Kehan Guo and
                  Andy Zou and
                  Bryan Hooi Kuen{-}Yew and
                  Caiming Xiong and
                  Elias Stengel{-}Eskin and
                  Hongyang Zhang and
                  Hongzhi Yin and
                  Huan Zhang and
                  Huaxiu Yao and
                  Jaehong Yoon and
                  Jieyu Zhang and
                  Kai Shu and
                  Kaijie Zhu and
                  Ranjay Krishna and
                  Swabha Swayamdipta and
                  Taiwei Shi and
                  Weijia Shi and
                  Xiang Li and
                  Yiwei Li and
                  Yuexing Hao and
                  Zhihao Jia and
                  Zhize Li and
                  Xiuying Chen and
                  Zhengzhong Tu and
                  Xiyang Hu and
                  Tianyi Zhou and
                  Jieyu Zhao and
                  Lichao Sun and
                  Furong Huang and
                  Or Cohen Sasson and
                  Prasanna Sattigeri and
                  Anka Reuel and
                  Max Lamparth and
                  Yue Zhao and
                  Nouha Dziri and
                  Yu Su and
                  Huan Sun and
                  Heng Ji and
                  Chaowei Xiao and
                  Mohit Bansal and
                  Nitesh V. Chawla and
                  Jian Pei and
                  Jianfeng Gao and
                  Michael Backes and
                  Philip S. Yu and
                  Neil Zhenqiang Gong and
                  Pin{-}Yu Chen and
                  Bo Li and
                  Xiangliang Zhang},
  title        = {On the Trustworthiness of Generative Foundation Models: Guideline,
                  Assessment, and Perspective},
  journal      = {CoRR},
  volume       = {abs/2502.14296},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2502.14296},
  doi          = {10.48550/ARXIV.2502.14296},
  eprinttype   = {arXiv},
  eprint       = {2502.14296},
  timestamp    = {Tue, 24 Mar 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2502-14296.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2504.16980 (CoRR, 2025).
% The given name of the eighth author is spelled Zachary here; the exported
% record carried the misspelling "Zacharcy".
@article{DBLP:journals/corr/abs-2504-16980,
  author       = {Pratyush Maini and
                  Sachin Goyal and
                  Dylan Sam and
                  Alexander Robey and
                  Yash Savani and
                  Yiding Jiang and
                  Andy Zou and
                  Zachary C. Lipton and
                  J. Zico Kolter},
  title        = {Safety Pretraining: Toward the Next Generation of Safe {AI}},
  journal      = {CoRR},
  volume       = {abs/2504.16980},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2504.16980},
  doi          = {10.48550/ARXIV.2504.16980},
  eprinttype   = {arXiv},
  eprint       = {2504.16980},
  timestamp    = {Fri, 23 May 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2504-16980.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2505.01050 (CoRR, 2025).
@article{DBLP:journals/corr/abs-2505-01050,
  author       = {Kai Hu and
                  Weichen Yu and
                  Li Zhang and
                  Alexander Robey and
                  Andy Zou and
                  Chengming Xu and
                  Haoqi Hu and
                  Matt Fredrikson},
  title        = {Transferable Adversarial Attacks on Black-Box Vision-Language Models},
  journal      = {CoRR},
  volume       = {abs/2505.01050},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2505.01050},
  doi          = {10.48550/ARXIV.2505.01050},
  eprinttype   = {arXiv},
  eprint       = {2505.01050},
  timestamp    = {Thu, 26 Jun 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2505-01050.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2506.03350 (CoRR, 2025).
@article{DBLP:journals/corr/abs-2506-03350,
  author       = {Eliot Krzysztof Jones and
                  Alexander Robey and
                  Andy Zou and
                  Zachary Ravichandran and
                  George J. Pappas and
                  Hamed Hassani and
                  Matt Fredrikson and
                  J. Zico Kolter},
  title        = {Adversarial Attacks on Robotic Vision Language Action Models},
  journal      = {CoRR},
  volume       = {abs/2506.03350},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2506.03350},
  doi          = {10.48550/ARXIV.2506.03350},
  eprinttype   = {arXiv},
  eprint       = {2506.03350},
  timestamp    = {Sun, 06 Jul 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2506-03350.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2507.20526 (CoRR, 2025).
@article{DBLP:journals/corr/abs-2507-20526,
  author       = {Andy Zou and
                  Maxwell Lin and
                  Eliot Krzysztof Jones and
                  Micha Nowak and
                  Mateusz Dziemian and
                  Nick Winter and
                  Alexander Grattan and
                  Valent Nathanael and
                  Ayla Croft and
                  Xander Davies and
                  Jai Patel and
                  Robert Kirk and
                  Nate Burnikell and
                  Yarin Gal and
                  Dan Hendrycks and
                  J. Zico Kolter and
                  Matt Fredrikson},
  title        = {Security Challenges in {AI} Agent Deployment: Insights from a Large
                  Scale Public Competition},
  journal      = {CoRR},
  volume       = {abs/2507.20526},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2507.20526},
  doi          = {10.48550/ARXIV.2507.20526},
  eprinttype   = {arXiv},
  eprint       = {2507.20526},
  timestamp    = {Thu, 21 Aug 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2507-20526.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2507.23701 (CoRR, 2025).
% TextQuests and LLMs are brace-protected so that styles which sentence-case
% titles do not render them as "Textquests" and "llms".
@article{DBLP:journals/corr/abs-2507-23701,
  author       = {Long Phan and
                  Mantas Mazeika and
                  Andy Zou and
                  Dan Hendrycks},
  title        = {{TextQuests}: How Good are {LLMs} at Text-Based Video Games?},
  journal      = {CoRR},
  volume       = {abs/2507.23701},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2507.23701},
  doi          = {10.48550/ARXIV.2507.23701},
  eprinttype   = {arXiv},
  eprint       = {2507.23701},
  timestamp    = {Fri, 22 Aug 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2507-23701.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2508.19980 (CoRR, 2025).
@article{DBLP:journals/corr/abs-2508-19980,
  author       = {Dylan Sam and
                  Alexander Robey and
                  Andy Zou and
                  Matt Fredrikson and
                  J. Zico Kolter},
  title        = {Evaluating Language Model Reasoning about Confidential Information},
  journal      = {CoRR},
  volume       = {abs/2508.19980},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2508.19980},
  doi          = {10.48550/ARXIV.2508.19980},
  eprinttype   = {arXiv},
  eprint       = {2508.19980},
  timestamp    = {Mon, 22 Sep 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2508-19980.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2509.17938 (CoRR, 2025).
@article{DBLP:journals/corr/abs-2509-17938,
  author       = {Satyapriya Krishna and
                  Andy Zou and
                  Rahul Gupta and
                  Eliot Krzysztof Jones and
                  Nick Winter and
                  Dan Hendrycks and
                  J. Zico Kolter and
                  Matt Fredrikson and
                  Spyros Matsoukas},
  title        = {{D-REX:} {A} Benchmark for Detecting Deceptive Reasoning in Large
                  Language Models},
  journal      = {CoRR},
  volume       = {abs/2509.17938},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2509.17938},
  doi          = {10.48550/ARXIV.2509.17938},
  eprinttype   = {arXiv},
  eprint       = {2509.17938},
  timestamp    = {Sat, 18 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2509-17938.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2510.18212 (CoRR, 2025).
@article{DBLP:journals/corr/abs-2510-18212,
  author       = {Dan Hendrycks and
                  Dawn Song and
                  Christian Szegedy and
                  Honglak Lee and
                  Yarin Gal and
                  Erik Brynjolfsson and
                  Sharon Li and
                  Andy Zou and
                  Lionel Levine and
                  Bo Han and
                  Jie Fu and
                  Ziwei Liu and
                  Jinwoo Shin and
                  Kimin Lee and
                  Mantas Mazeika and
                  Long Phan and
                  George Ingebretsen and
                  Adam Khoja and
                  Cihang Xie and
                  Olawale Salaudeen and
                  Matthias Hein and
                  Kevin Zhao and
                  Alexander Pan and
                  David Duvenaud and
                  Bo Li and
                  Steve Omohundro and
                  Gabriel Alfour and
                  Max Tegmark and
                  Kevin McGrew and
                  Gary Marcus and
                  Jaan Tallinn and
                  Eric Schmidt and
                  Yoshua Bengio},
  title        = {A Definition of {AGI}},
  journal      = {CoRR},
  volume       = {abs/2510.18212},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2510.18212},
  doi          = {10.48550/ARXIV.2510.18212},
  eprinttype   = {arXiv},
  eprint       = {2510.18212},
  timestamp    = {Mon, 22 Dec 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2510-18212.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2512.09882 (CoRR, 2025).
@article{DBLP:journals/corr/abs-2512-09882,
  author       = {Justin W. Lin and
                  Eliot Krzysztof Jones and
                  Donovan Julian Jasper and
                  Ethan Jun{-}shen Ho and
                  Anna Wu and
                  Arnold Tianyi Yang and
                  Neil Perry and
                  Andy Zou and
                  Matt Fredrikson and
                  J. Zico Kolter and
                  Percy Liang and
                  Dan Boneh and
                  Daniel E. Ho},
  title        = {Comparing {AI} Agents to Cybersecurity Professionals in Real-World
                  Penetration Testing},
  journal      = {CoRR},
  volume       = {abs/2512.09882},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2512.09882},
  doi          = {10.48550/ARXIV.2512.09882},
  eprinttype   = {arXiv},
  eprint       = {2512.09882},
  timestamp    = {Fri, 23 Jan 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2512-09882.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICML 2024 conference paper (Proceedings of Machine Learning Research), pages 28525--28550.
@inproceedings{DBLP:conf/icml/LiPGYBGLDGMHLJL24,
  author       = {Nathaniel Li and
                  Alexander Pan and
                  Anjali Gopal and
                  Summer Yue and
                  Daniel Berrios and
                  Alice Gatti and
                  Justin D. Li and
                  Ann{-}Kathrin Dombrowski and
                  Shashwat Goel and
                  Gabriel Mukobi and
                  Nathan Helm{-}Burger and
                  Rassin Lababidi and
                  Lennart Justen and
                  Andrew B. Liu and
                  Michael Chen and
                  Isabelle Barrass and
                  Oliver Zhang and
                  Xiaoyuan Zhu and
                  Rishub Tamirisa and
                  Bhrugu Bharathi and
                  Ariel Herbert{-}Voss and
                  Cort B. Breuer and
                  Andy Zou and
                  Mantas Mazeika and
                  Zifan Wang and
                  Palash Oswal and
                  Weiran Lin and
                  Adam A. Hunt and
                  Justin Tienken{-}Harder and
                  Kevin Y. Shih and
                  Kemper Talley and
                  John Guan and
                  Ian Steneker and
                  David Campbell and
                  Brad Jokubaitis and
                  Steven Basart and
                  Stephen Fitz and
                  Ponnurangam Kumaraguru and
                  Kallol Krishna Karmakar and
                  Uday Kiran Tupakula and
                  Vijay Varadharajan and
                  Yan Shoshitaishvili and
                  Jimmy Ba and
                  Kevin M. Esvelt and
                  Alexandr Wang and
                  Dan Hendrycks},
  editor       = {Ruslan Salakhutdinov and
                  Zico Kolter and
                  Katherine A. Heller and
                  Adrian Weller and
                  Nuria Oliver and
                  Jonathan Scarlett and
                  Felix Berkenkamp},
  title        = {The {WMDP} Benchmark: Measuring and Reducing Malicious Use with Unlearning},
  booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024,
                  Vienna, Austria, July 21-27, 2024},
  series       = {Proceedings of Machine Learning Research},
  pages        = {28525--28550},
  publisher    = {{PMLR} / OpenReview.net},
  year         = {2024},
  url          = {https://proceedings.mlr.press/v235/li24bc.html},
  timestamp    = {Mon, 09 Feb 2026 15:35:36 +0100},
  biburl       = {https://dblp.org/rec/conf/icml/LiPGYBGLDGMHLJL24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICML 2024 conference paper (Proceedings of Machine Learning Research), pages 35181--35224.
% HarmBench is brace-protected so that styles which sentence-case titles
% do not turn it into "Harmbench".
@inproceedings{DBLP:conf/icml/MazeikaPYZ0MSLB24,
  author       = {Mantas Mazeika and
                  Long Phan and
                  Xuwang Yin and
                  Andy Zou and
                  Zifan Wang and
                  Norman Mu and
                  Elham Sakhaee and
                  Nathaniel Li and
                  Steven Basart and
                  Bo Li and
                  David A. Forsyth and
                  Dan Hendrycks},
  editor       = {Ruslan Salakhutdinov and
                  Zico Kolter and
                  Katherine A. Heller and
                  Adrian Weller and
                  Nuria Oliver and
                  Jonathan Scarlett and
                  Felix Berkenkamp},
  title        = {{HarmBench}: {A} Standardized Evaluation Framework for Automated Red
                  Teaming and Robust Refusal},
  booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024,
                  Vienna, Austria, July 21-27, 2024},
  series       = {Proceedings of Machine Learning Research},
  pages        = {35181--35224},
  publisher    = {{PMLR} / OpenReview.net},
  year         = {2024},
  url          = {https://proceedings.mlr.press/v235/mazeika24a.html},
  timestamp    = {Mon, 09 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icml/MazeikaPYZ0MSLB24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2024 conference paper.
% Two corrections relative to the exported record:
%   - the proceedings volume is 37 (the NeurIPS 2023 proceedings are volume 36),
%     not 38;
%   - the first editor's surname is Globerson, not "Globersons".
@inproceedings{DBLP:conf/nips/ZouPWDLAKFH24,
  author       = {Andy Zou and
                  Long Phan and
                  Justin Wang and
                  Derek Duenas and
                  Maxwell Lin and
                  Maksym Andriushchenko and
                  J. Zico Kolter and
                  Matt Fredrikson and
                  Dan Hendrycks},
  editor       = {Amir Globerson and
                  Lester Mackey and
                  Danielle Belgrave and
                  Angela Fan and
                  Ulrich Paquet and
                  Jakub M. Tomczak and
                  Cheng Zhang},
  title        = {Improving Alignment and Robustness with Circuit Breakers},
  booktitle    = {Advances in Neural Information Processing Systems 37: Annual Conference
                  on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver,
                  BC, Canada, December 10 - 15, 2024},
  year         = {2024},
  url          = {http://papers.nips.cc/paper\_files/paper/2024/hash/97ca7168c2c333df5ea61ece3b3276e1-Abstract-Conference.html},
  timestamp    = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/ZouPWDLAKFH24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2402.04249 (CoRR, 2024).
% HarmBench is brace-protected so that styles which sentence-case titles
% do not turn it into "Harmbench".
@article{DBLP:journals/corr/abs-2402-04249,
  author       = {Mantas Mazeika and
                  Long Phan and
                  Xuwang Yin and
                  Andy Zou and
                  Zifan Wang and
                  Norman Mu and
                  Elham Sakhaee and
                  Nathaniel Li and
                  Steven Basart and
                  Bo Li and
                  David A. Forsyth and
                  Dan Hendrycks},
  title        = {{HarmBench}: {A} Standardized Evaluation Framework for Automated Red
                  Teaming and Robust Refusal},
  journal      = {CoRR},
  volume       = {abs/2402.04249},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2402.04249},
  doi          = {10.48550/ARXIV.2402.04249},
  eprinttype   = {arXiv},
  eprint       = {2402.04249},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2402-04249.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2403.03218 (CoRR, 2024).
% NOTE(review): this record lists an author as Weiran Liu, while the ICML 2024
% record with the same title in this file lists Weiran Lin -- confirm which
% spelling is correct before citing.
@article{DBLP:journals/corr/abs-2403-03218,
  author       = {Nathaniel Li and
                  Alexander Pan and
                  Anjali Gopal and
                  Summer Yue and
                  Daniel Berrios and
                  Alice Gatti and
                  Justin D. Li and
                  Ann{-}Kathrin Dombrowski and
                  Shashwat Goel and
                  Long Phan and
                  Gabriel Mukobi and
                  Nathan Helm{-}Burger and
                  Rassin Lababidi and
                  Lennart Justen and
                  Andrew B. Liu and
                  Michael Chen and
                  Isabelle Barrass and
                  Oliver Zhang and
                  Xiaoyuan Zhu and
                  Rishub Tamirisa and
                  Bhrugu Bharathi and
                  Adam Khoja and
                  Zhenqi Zhao and
                  Ariel Herbert{-}Voss and
                  Cort B. Breuer and
                  Andy Zou and
                  Mantas Mazeika and
                  Zifan Wang and
                  Palash Oswal and
                  Weiran Liu and
                  Adam A. Hunt and
                  Justin Tienken{-}Harder and
                  Kevin Y. Shih and
                  Kemper Talley and
                  John Guan and
                  Russell Kaplan and
                  Ian Steneker and
                  David Campbell and
                  Brad Jokubaitis and
                  Alex Levinson and
                  Jean Wang and
                  William Qian and
                  Kallol Krishna Karmakar and
                  Steven Basart and
                  Stephen Fitz and
                  Mindy Levine and
                  Ponnurangam Kumaraguru and
                  Uday Kiran Tupakula and
                  Vijay Varadharajan and
                  Yan Shoshitaishvili and
                  Jimmy Ba and
                  Kevin M. Esvelt and
                  Alexandr Wang and
                  Dan Hendrycks},
  title        = {The {WMDP} Benchmark: Measuring and Reducing Malicious Use With Unlearning},
  journal      = {CoRR},
  volume       = {abs/2403.03218},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2403.03218},
  doi          = {10.48550/ARXIV.2403.03218},
  eprinttype   = {arXiv},
  eprint       = {2403.03218},
  timestamp    = {Sat, 15 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2403-03218.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2405.14782 (CoRR, 2024).
@article{DBLP:journals/corr/abs-2405-14782,
  author       = {Stella Biderman and
                  Hailey Schoelkopf and
                  Lintang Sutawika and
                  Leo Gao and
                  Jonathan Tow and
                  Baber Abbasi and
                  Alham Fikri Aji and
                  Pawan Sasanka Ammanamanchi and
                  Sidney Black and
                  Jordan Clive and
                  Anthony DiPofi and
                  Julen Etxaniz and
                  Benjamin Fattori and
                  Jessica Zosa Forde and
                  Charles Foster and
                  Jeffrey Hsu and
                  Mimansa Jaiswal and
                  Wilson Y. Lee and
                  Haonan Li and
                  Charles Lovering and
                  Niklas Muennighoff and
                  Ellie Pavlick and
                  Jason Phang and
                  Aviya Skowron and
                  Samson Tan and
                  Xiangru Tang and
                  Kevin A. Wang and
                  Genta Indra Winata and
                  Fran{\c{c}}ois Yvon and
                  Andy Zou},
  title        = {Lessons from the Trenches on Reproducible Evaluation of Language Models},
  journal      = {CoRR},
  volume       = {abs/2405.14782},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2405.14782},
  doi          = {10.48550/ARXIV.2405.14782},
  eprinttype   = {arXiv},
  eprint       = {2405.14782},
  timestamp    = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2405-14782.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2406.04313 (CoRR, 2024).
@article{DBLP:journals/corr/abs-2406-04313,
  author       = {Andy Zou and
                  Long Phan and
                  Justin Wang and
                  Derek Duenas and
                  Maxwell Lin and
                  Maksym Andriushchenko and
                  Rowan Wang and
                  Zico Kolter and
                  Matt Fredrikson and
                  Dan Hendrycks},
  title        = {Improving Alignment and Robustness with Circuit Breakers},
  journal      = {CoRR},
  volume       = {abs/2406.04313},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.04313},
  doi          = {10.48550/ARXIV.2406.04313},
  eprinttype   = {arXiv},
  eprint       = {2406.04313},
  timestamp    = {Sat, 13 Jul 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-04313.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2408.00761 (CoRR, 2024).
% The acronym LLMs is brace-protected so that styles which sentence-case
% titles do not lowercase it to "llms".
@article{DBLP:journals/corr/abs-2408-00761,
  author       = {Rishub Tamirisa and
                  Bhrugu Bharathi and
                  Long Phan and
                  Andy Zhou and
                  Alice Gatti and
                  Tarun Suresh and
                  Maxwell Lin and
                  Justin Wang and
                  Rowan Wang and
                  Ron Arel and
                  Andy Zou and
                  Dawn Song and
                  Bo Li and
                  Dan Hendrycks and
                  Mantas Mazeika},
  title        = {Tamper-Resistant Safeguards for Open-Weight {LLMs}},
  journal      = {CoRR},
  volume       = {abs/2408.00761},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2408.00761},
  doi          = {10.48550/ARXIV.2408.00761},
  eprinttype   = {arXiv},
  eprint       = {2408.00761},
  timestamp    = {Mon, 09 Sep 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2408-00761.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2410.09024 (CoRR, 2024).
% AgentHarm is brace-protected so that styles which sentence-case titles
% do not turn it into "Agentharm".
@article{DBLP:journals/corr/abs-2410-09024,
  author       = {Maksym Andriushchenko and
                  Alexandra Souly and
                  Mateusz Dziemian and
                  Derek Duenas and
                  Maxwell Lin and
                  Justin Wang and
                  Dan Hendrycks and
                  Andy Zou and
                  Zico Kolter and
                  Matt Fredrikson and
                  Eric Winsor and
                  Jerome Wynne and
                  Yarin Gal and
                  Xander Davies},
  title        = {{AgentHarm}: {A} Benchmark for Measuring Harmfulness of {LLM} Agents},
  journal      = {CoRR},
  volume       = {abs/2410.09024},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2410.09024},
  doi          = {10.48550/ARXIV.2410.09024},
  eprinttype   = {arXiv},
  eprint       = {2410.09024},
  timestamp    = {Fri, 22 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2410-09024.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Journal article in Trans. Mach. Learn. Res., volume 2023.
% The author field is a very long alphabetically ordered list (one name per
% line, joined by "and"); accented characters use LaTeX escapes in braces.
@article{DBLP:journals/tmlr/SrivastavaRRSAF23,
author = {Aarohi Srivastava and
Abhinav Rastogi and
Abhishek Rao and
Abu Awal Md Shoeb and
Abubakar Abid and
Adam Fisch and
Adam R. Brown and
Adam Santoro and
Aditya Gupta and
Adri{\`{a}} Garriga{-}Alonso and
Agnieszka Kluska and
Aitor Lewkowycz and
Akshat Agarwal and
Alethea Power and
Alex Ray and
Alex Warstadt and
Alexander W. Kocurek and
Ali Safaya and
Ali Tazarv and
Alice Xiang and
Alicia Parrish and
Allen Nie and
Aman Hussain and
Amanda Askell and
Amanda Dsouza and
Ambrose Slone and
Ameet Rahane and
Anantharaman S. Iyer and
Anders Andreassen and
Andrea Madotto and
Andrea Santilli and
Andreas Stuhlm{\"{u}}ller and
Andrew M. Dai and
Andrew La and
Andrew K. Lampinen and
Andy Zou and
Angela Jiang and
Angelica Chen and
Anh Vuong and
Animesh Gupta and
Anna Gottardi and
Antonio Norelli and
Anu Venkatesh and
Arash Gholamidavoodi and
Arfa Tabassum and
Arul Menezes and
Arun Kirubarajan and
Asher Mullokandov and
Ashish Sabharwal and
Austin Herrick and
Avia Efrat and
Aykut Erdem and
Ayla Karakas and
B. Ryan Roberts and
Bao Sheng Loe and
Barret Zoph and
Bartlomiej Bojanowski and
Batuhan {\"{O}}zyurt and
Behnam Hedayatnia and
Behnam Neyshabur and
Benjamin Inden and
Benno Stein and
Berk Ekmekci and
Bill Yuchen Lin and
Blake Howald and
Bryan Orinion and
Cameron Diao and
Cameron Dour and
Catherine Stinson and
Cedrick Argueta and
C{\`{e}}sar Ferri Ram{\'{\i}}rez and
Chandan Singh and
Charles Rathkopf and
Chenlin Meng and
Chitta Baral and
Chiyu Wu and
Chris Callison{-}Burch and
Chris Waites and
Christian Voigt and
Christopher D. Manning and
Christopher Potts and
Cindy Ramirez and
Clara E. Rivera and
Clemencia Siro and
Colin Raffel and
Courtney Ashcraft and
Cristina Garbacea and
Damien Sileo and
Dan Garrette and
Dan Hendrycks and
Dan Kilman and
Dan Roth and
Daniel Freeman and
Daniel Khashabi and
Daniel Levy and
Daniel Mosegu{\'{\i}} Gonz{\'{a}}lez and
Danielle Perszyk and
Danny Hernandez and
Danqi Chen and
Daphne Ippolito and
Dar Gilboa and
David Dohan and
David Drakard and
David Jurgens and
Debajyoti Datta and
Deep Ganguli and
Denis Emelin and
Denis Kleyko and
Deniz Yuret and
Derek Chen and
Derek Tam and
Dieuwke Hupkes and
Diganta Misra and
Dilyar Buzan and
Dimitri Coelho Mollo and
Diyi Yang and
Dong{-}Ho Lee and
Dylan Schrader and
Ekaterina Shutova and
Ekin Dogus Cubuk and
Elad Segal and
Eleanor Hagerman and
Elizabeth Barnes and
Elizabeth Donoway and
Ellie Pavlick and
Emanuele Rodol{\`{a}} and
Emma Lam and
Eric Chu and
Eric Tang and
Erkut Erdem and
Ernie Chang and
Ethan A. Chi and
Ethan Dyer and
Ethan J. Jerzak and
Ethan Kim and
Eunice Engefu Manyasi and
Evgenii Zheltonozhskii and
Fanyue Xia and
Fatemeh Siar and
Fernando Mart{\'{\i}}nez{-}Plumed and
Francesca Happ{\'{e}} and
Fran{\c{c}}ois Chollet and
Frieda Rong and
Gaurav Mishra and
Genta Indra Winata and
Gerard de Melo and
Germ{\'{a}}n Kruszewski and
Giambattista Parascandolo and
Giorgio Mariani and
Gloria Wang and
Gonzalo Jaimovitch{-}L{\'{o}}pez and
Gregor Betz and
Guy Gur{-}Ari and
Hana Galijasevic and
Hannah Kim and
Hannah Rashkin and
Hannaneh Hajishirzi and
Harsh Mehta and
Hayden Bogar and
Henry Shevlin and
Hinrich Sch{\"{u}}tze and
Hiromu Yakura and
Hongming Zhang and
Hugh Mee Wong and
Ian Ng and
Isaac Noble and
Jaap Jumelet and
Jack Geissinger and
Jackson Kernion and
Jacob Hilton and
Jaehoon Lee and
Jaime Fern{\'{a}}ndez Fisac and
James B. Simon and
James Koppel and
James Zheng and
James Zou and
Jan Kocon and
Jana Thompson and
Janelle Wingfield and
Jared Kaplan and
Jarema Radom and
Jascha Sohl{-}Dickstein and
Jason Phang and
Jason Wei and
Jason Yosinski and
Jekaterina Novikova and
Jelle Bosscher and
Jennifer Marsh and
Jeremy Kim and
Jeroen Taal and
Jesse H. Engel and
Jesujoba Alabi and
Jiacheng Xu and
Jiaming Song and
Jillian Tang and
Joan Waweru and
John Burden and
John Miller and
John U. Balis and
Jonathan Batchelder and
Jonathan Berant and
J{\"{o}}rg Frohberg and
Jos Rozen and
Jos{\'{e}} Hern{\'{a}}ndez{-}Orallo and
Joseph Boudeman and
Joseph Guerr and
Joseph Jones and
Joshua B. Tenenbaum and
Joshua S. Rule and
Joyce Chua and
Kamil Kanclerz and
Karen Livescu and
Karl Krauth and
Karthik Gopalakrishnan and
Katerina Ignatyeva and
Katja Markert and
Kaustubh D. Dhole and
Kevin Gimpel and
Kevin Omondi and
Kory W. Mathewson and
Kristen Chiafullo and
Ksenia Shkaruta and
Kumar Shridhar and
Kyle McDonell and
Kyle Richardson and
Laria Reynolds and
Leo Gao and
Li Zhang and
Liam Dugan and
Lianhui Qin and
Lidia Contreras Ochando and
Louis{-}Philippe Morency and
Luca Moschella and
Lucas Lam and
Lucy Noble and
Ludwig Schmidt and
Luheng He and
Luis Oliveros Col{\'{o}}n and
Luke Metz and
L{\"{u}}tfi Kerem Senel and
Maarten Bosma and
Maarten Sap and
Maartje ter Hoeve and
Maheen Farooqi and
Manaal Faruqui and
Mantas Mazeika and
Marco Baturan and
Marco Marelli and
Marco Maru and
Mar{\'{\i}}a Jos{\'{e}} Ram{\'{\i}}rez{-}Quintana and
Marie Tolkiehn and
Mario Giulianelli and
Martha Lewis and
Martin Potthast and
Matthew L. Leavitt and
Matthias Hagen and
M{\'{a}}ty{\'{a}}s Schubert and
Medina Baitemirova and
Melody Arnaud and
Melvin McElrath and
Michael A. Yee and
Michael Cohen and
Michael Gu and
Michael I. Ivanitskiy and
Michael Starritt and
Michael Strube and
Michal Swedrowski and
Michele Bevilacqua and
Michihiro Yasunaga and
Mihir Kale and
Mike Cain and
Mimee Xu and
Mirac Suzgun and
Mitch Walker and
Mo Tiwari and
Mohit Bansal and
Moin Aminnaseri and
Mor Geva and
Mozhdeh Gheini and
Mukund Varma T. and
Nanyun Peng and
Nathan A. Chi and
Nayeon Lee and
Neta Gur{-}Ari Krakover and
Nicholas Cameron and
Nicholas Roberts and
Nick Doiron and
Nicole Martinez and
Nikita Nangia and
Niklas Deckers and
Niklas Muennighoff and
Nitish Shirish Keskar and
Niveditha Iyer and
Noah Constant and
Noah Fiedel and
Nuan Wen and
Oliver Zhang and
Omar Agha and
Omar Elbaghdadi and
Omer Levy and
Owain Evans and
Pablo Antonio Moreno Casares and
Parth Doshi and
Pascale Fung and
Paul Pu Liang and
Paul Vicol and
Pegah Alipoormolabashi and
Peiyuan Liao and
Percy Liang and
Peter Chang and
Peter Eckersley and
Phu Mon Htut and
Pinyu Hwang and
Piotr Milkowski and
Piyush Patil and
Pouya Pezeshkpour and
Priti Oli and
Qiaozhu Mei and
Qing Lyu and
Qinlang Chen and
Rabin Banjade and
Rachel Etta Rudolph and
Raefer Gabriel and
Rahel Habacker and
Ramon Risco and
Rapha{\"{e}}l Milli{\`{e}}re and
Rhythm Garg and
Richard Barnes and
Rif A. Saurous and
Riku Arakawa and
Robbe Raymaekers and
Robert Frank and
Rohan Sikand and
Roman Novak and
Roman Sitelew and
Ronan LeBras and
Rosanne Liu and
Rowan Jacobs and
Rui Zhang and
Ruslan Salakhutdinov and
Ryan Chi and
Ryan Lee and
Ryan Stovall and
Ryan Teehan and
Rylan Yang and
Sahib Singh and
Saif M. Mohammad and
Sajant Anand and
Sam Dillavou and
Sam Shleifer and
Sam Wiseman and
Samuel Gruetter and
Samuel R. Bowman and
Samuel S. Schoenholz and
Sanghyun Han and
Sanjeev Kwatra and
Sarah A. Rous and
Sarik Ghazarian and
Sayan Ghosh and
Sean Casey and
Sebastian Bischoff and
Sebastian Gehrmann and
Sebastian Schuster and
Sepideh Sadeghi and
Shadi Hamdan and
Sharon Zhou and
Shashank Srivastava and
Sherry Shi and
Shikhar Singh and
Shima Asaadi and
Shixiang Shane Gu and
Shubh Pachchigar and
Shubham Toshniwal and
Shyam Upadhyay and
Shyamolima (Shammie) Debnath and
Siamak Shakeri and
Simon Thormeyer and
Simone Melzi and
Siva Reddy and
Sneha Priscilla Makini and
Soo{-}Hwan Lee and
Spencer Torene and
Sriharsha Hatwar and
Stanislas Dehaene and
Stefan Divic and
Stefano Ermon and
Stella Biderman and
Stephanie Lin and
Stephen Prasad and
Steven T. Piantadosi and
Stuart M. Shieber and
Summer Misherghi and
Svetlana Kiritchenko and
Swaroop Mishra and
Tal Linzen and
Tal Schuster and
Tao Li and
Tao Yu and
Tariq Ali and
Tatsu Hashimoto and
Te{-}Lin Wu and
Th{\'{e}}o Desbordes and
Theodore Rothschild and
Thomas Phan and
Tianle Wang and
Tiberius Nkinyili and
Timo Schick and
Timofei Kornev and
Titus Tunduny and
Tobias Gerstenberg and
Trenton Chang and
Trishala Neeraj and
Tushar Khot and
Tyler Shultz and
Uri Shaham and
Vedant Misra and
Vera Demberg and
Victoria Nyamai and
Vikas Raunak and
Vinay V. Ramasesh and
Vinay Uday Prabhu and
Vishakh Padmakumar and
Vivek Srikumar and
William Fedus and
William Saunders and
William Zhang and
Wout Vossen and
Xiang Ren and
Xiaoyu Tong and
Xinran Zhao and
Xinyi Wu and
Xudong Shen and
Yadollah Yaghoobzadeh and
Yair Lakretz and
Yangqiu Song and
Yasaman Bahri and
Yejin Choi and
Yichi Yang and
Yiding Hao and
Yifu Chen and
Yonatan Belinkov and
Yu Hou and
Yufang Hou and
Yuntao Bai and
Zachary Seid and
Zhuoye Zhao and
Zijian Wang and
Zijie J. Wang and
Zirui Wang and
Ziyi Wu},
title = {Beyond the Imitation Game: Quantifying and extrapolating the capabilities
of language models},
journal = {Trans. Mach. Learn. Res.},
volume = {2023},
year = {2023},
url = {https://openreview.net/forum?id=uyTL5Bvosj},
timestamp = {Thu, 20 Nov 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/tmlr/SrivastavaRRSAF23.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
% ICML 2023 conference paper (Proceedings of Machine Learning Research), pages 26837--26867.
% The benchmark name Machiavelli is brace-protected so that styles which
% sentence-case titles do not lowercase it.
@inproceedings{DBLP:conf/icml/PanCZLBWZEH23,
  author       = {Alexander Pan and
                  Jun Shern Chan and
                  Andy Zou and
                  Nathaniel Li and
                  Steven Basart and
                  Thomas Woodside and
                  Hanlin Zhang and
                  Scott Emmons and
                  Dan Hendrycks},
  editor       = {Andreas Krause and
                  Emma Brunskill and
                  Kyunghyun Cho and
                  Barbara Engelhardt and
                  Sivan Sabato and
                  Jonathan Scarlett},
  title        = {Do the Rewards Justify the Means? Measuring Trade-Offs Between Rewards
                  and Ethical Behavior in the {Machiavelli} Benchmark},
  booktitle    = {International Conference on Machine Learning, {ICML} 2023, 23-29 July
                  2023, Honolulu, Hawaii, {USA}},
  series       = {Proceedings of Machine Learning Research},
  pages        = {26837--26867},
  publisher    = {{PMLR}},
  year         = {2023},
  url          = {https://proceedings.mlr.press/v202/pan23a.html},
  timestamp    = {Tue, 17 Jun 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icml/PanCZLBWZEH23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2023 conference paper.
% ImageNet is brace-protected so that styles which sentence-case titles
% do not lowercase it to "imagenet".
@inproceedings{DBLP:conf/nips/HuZWLF23,
  author       = {Kai Hu and
                  Andy Zou and
                  Zifan Wang and
                  Klas Leino and
                  Matt Fredrikson},
  editor       = {Alice Oh and
                  Tristan Naumann and
                  Amir Globerson and
                  Kate Saenko and
                  Moritz Hardt and
                  Sergey Levine},
  title        = {Unlocking Deterministic Robustness Certification on {ImageNet}},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/863da9d40547f1d1b18859519ce2dee4-Abstract-Conference.html},
  timestamp    = {Mon, 19 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/HuZWLF23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2301.12549 (CoRR, 2023).
% ImageNet is brace-protected so that styles which sentence-case titles
% do not lowercase it to "imagenet".
@article{DBLP:journals/corr/abs-2301-12549,
  author       = {Kai Hu and
                  Andy Zou and
                  Zifan Wang and
                  Klas Leino and
                  Matt Fredrikson},
  title        = {Scaling in Depth: Unlocking Robustness Certification on {ImageNet}},
  journal      = {CoRR},
  volume       = {abs/2301.12549},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2301.12549},
  doi          = {10.48550/ARXIV.2301.12549},
  eprinttype   = {arXiv},
  eprint       = {2301.12549},
  timestamp    = {Sun, 06 Oct 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2301-12549.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2303.06189 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2303-06189,
  author       = {Ram M. Kripa and
                  Andy Zou and
                  Ryan Jia and
                  Kenny Huang},
  title        = {Papaya: Federated Learning, but Fully Decentralized},
  journal      = {CoRR},
  volume       = {abs/2303.06189},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2303.06189},
  doi          = {10.48550/ARXIV.2303.06189},
  eprinttype   = {arXiv},
  eprint       = {2303.06189},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2303-06189.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2304.03279 (CoRR, 2023).
@article{DBLP:journals/corr/abs-2304-03279,
  author       = {Alexander Pan and
                  Jun Shern Chan and
                  Andy Zou and
                  Nathaniel Li and
                  Steven Basart and
                  Thomas Woodside and
                  Jonathan Ng and
                  Hanlin Zhang and
                  Scott Emmons and
                  Dan Hendrycks},
  title        = {Do the Rewards Justify the Means? Measuring Trade-Offs Between Rewards
                  and Ethical Behavior in the {MACHIAVELLI} Benchmark},
  journal      = {CoRR},
  volume       = {abs/2304.03279},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2304.03279},
  doi          = {10.48550/ARXIV.2304.03279},
  eprinttype   = {arXiv},
  eprint       = {2304.03279},
  timestamp    = {Tue, 17 Jun 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2304-03279.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2307.15043 as listed in dblp's CoRR index.
@article{DBLP:journals/corr/abs-2307-15043,
  author       = {Andy Zou and
                  Zifan Wang and
                  J. Zico Kolter and
                  Matt Fredrikson},
  title        = {Universal and Transferable Adversarial Attacks on Aligned Language
                  Models},
  journal      = {CoRR},
  volume       = {abs/2307.15043},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2307.15043},
  doi          = {10.48550/ARXIV.2307.15043},
  eprinttype   = {arXiv},
  eprint       = {2307.15043},
  timestamp    = {Sun, 06 Oct 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2307-15043.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2310.01405 as listed in dblp's CoRR index.
% "A" (after the colon) and "AI" are brace-protected in the title.
@article{DBLP:journals/corr/abs-2310-01405,
  author       = {Andy Zou and
                  Long Phan and
                  Sarah Li Chen and
                  James Campbell and
                  Phillip Guo and
                  Richard Ren and
                  Alexander Pan and
                  Xuwang Yin and
                  Mantas Mazeika and
                  Ann{-}Kathrin Dombrowski and
                  Shashwat Goel and
                  Nathaniel Li and
                  Michael J. Byun and
                  Zifan Wang and
                  Alex Mallen and
                  Steven Basart and
                  Sanmi Koyejo and
                  Dawn Song and
                  Matt Fredrikson and
                  J. Zico Kolter and
                  Dan Hendrycks},
  title        = {Representation Engineering: {A} Top-Down Approach to {AI} Transparency},
  journal      = {CoRR},
  volume       = {abs/2310.01405},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2310.01405},
  doi          = {10.48550/ARXIV.2310.01405},
  eprinttype   = {arXiv},
  eprint       = {2310.01405},
  timestamp    = {Fri, 13 Jun 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2310-01405.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CVPR 2022 conference-paper record exported from dblp.
% "PixMix" is brace-protected: sentence-casing styles lowercase everything
% after the first character of a title, which would otherwise give "Pixmix".
@inproceedings{DBLP:conf/cvpr/HendrycksZMTLSS22,
  author       = {Dan Hendrycks and
                  Andy Zou and
                  Mantas Mazeika and
                  Leonard Tang and
                  Bo Li and
                  Dawn Song and
                  Jacob Steinhardt},
  title        = {{PixMix}: Dreamlike Pictures Comprehensively Improve Safety Measures},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2022, New Orleans, LA, USA, June 18-24, 2022},
  pages        = {16762--16771},
  publisher    = {{IEEE}},
  year         = {2022},
  url          = {https://doi.org/10.1109/CVPR52688.2022.01628},
  doi          = {10.1109/CVPR52688.2022.01628},
  timestamp    = {Sun, 19 Jan 2025 13:39:04 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/HendrycksZMTLSS22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICML 2022 conference-paper record (PMLR) exported from dblp.
@inproceedings{DBLP:conf/icml/HendrycksBMZKMS22,
  author       = {Dan Hendrycks and
                  Steven Basart and
                  Mantas Mazeika and
                  Andy Zou and
                  Joseph Kwon and
                  Mohammadreza Mostajabi and
                  Jacob Steinhardt and
                  Dawn Song},
  editor       = {Kamalika Chaudhuri and
                  Stefanie Jegelka and
                  Le Song and
                  Csaba Szepesv{\'{a}}ri and
                  Gang Niu and
                  Sivan Sabato},
  title        = {Scaling Out-of-Distribution Detection for Real-World Settings},
  booktitle    = {International Conference on Machine Learning, {ICML} 2022, 17-23 July
                  2022, Baltimore, Maryland, {USA}},
  series       = {Proceedings of Machine Learning Research},
  pages        = {8759--8773},
  publisher    = {{PMLR}},
  year         = {2022},
  url          = {https://proceedings.mlr.press/v162/hendrycks22a.html},
  timestamp    = {Tue, 12 Jul 2022 17:36:52 +0200},
  biburl       = {https://dblp.org/rec/conf/icml/HendrycksBMZKMS22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2022 (Datasets and Benchmarks) paper record exported from dblp.
% Editor given names are spelled out in full; the export carried only initials
% for four of the six editors, which prevents styles from formatting all
% editor names uniformly.
@inproceedings{DBLP:conf/nips/MazeikaTZBCSFSH22,
  author       = {Mantas Mazeika and
                  Eric Tang and
                  Andy Zou and
                  Steven Basart and
                  Jun Shern Chan and
                  Dawn Song and
                  David A. Forsyth and
                  Jacob Steinhardt and
                  Dan Hendrycks},
  editor       = {Sanmi Koyejo and
                  Shakir Mohamed and
                  Alekh Agarwal and
                  Danielle Belgrave and
                  Kyunghyun Cho and
                  Alice Oh},
  title        = {How Would The Viewer Feel? Estimating Wellbeing From Video Scenarios},
  booktitle    = {Advances in Neural Information Processing Systems 35: Annual Conference
                  on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans,
                  LA, USA, November 28 - December 9, 2022},
  year         = {2022},
  url          = {http://papers.nips.cc/paper\_files/paper/2022/hash/75ff01252ab45ce278cb060effce4ca1-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp    = {Mon, 08 Jan 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/MazeikaTZBCSFSH22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2022 (Datasets and Benchmarks) paper record exported from dblp.
% Editor given names are spelled out in full; the export carried only initials
% for four of the six editors, which prevents styles from formatting all
% editor names uniformly.
@inproceedings{DBLP:conf/nips/ZouXJKMLSSEH22,
  author       = {Andy Zou and
                  Tristan Xiao and
                  Ryan Jia and
                  Joe Kwon and
                  Mantas Mazeika and
                  Richard Li and
                  Dawn Song and
                  Jacob Steinhardt and
                  Owain Evans and
                  Dan Hendrycks},
  editor       = {Sanmi Koyejo and
                  Shakir Mohamed and
                  Alekh Agarwal and
                  Danielle Belgrave and
                  Kyunghyun Cho and
                  Alice Oh},
  title        = {Forecasting Future World Events With Neural Networks},
  booktitle    = {Advances in Neural Information Processing Systems 35: Annual Conference
                  on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans,
                  LA, USA, November 28 - December 9, 2022},
  year         = {2022},
  url          = {http://papers.nips.cc/paper\_files/paper/2022/hash/aec870a6772336c15dac992c16f2e7c9-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp    = {Mon, 08 Jan 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/ZouXJKMLSSEH22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2206.04615 as listed in dblp's CoRR index.
% The author field lists every contributor explicitly, one name per line.
@article{DBLP:journals/corr/abs-2206-04615,
  author       = {Aarohi Srivastava and
                  Abhinav Rastogi and
                  Abhishek Rao and
                  Abu Awal Md Shoeb and
                  Abubakar Abid and
                  Adam Fisch and
                  Adam R. Brown and
                  Adam Santoro and
                  Aditya Gupta and
                  Adri{\`{a}} Garriga{-}Alonso and
                  Agnieszka Kluska and
                  Aitor Lewkowycz and
                  Akshat Agarwal and
                  Alethea Power and
                  Alex Ray and
                  Alex Warstadt and
                  Alexander W. Kocurek and
                  Ali Safaya and
                  Ali Tazarv and
                  Alice Xiang and
                  Alicia Parrish and
                  Allen Nie and
                  Aman Hussain and
                  Amanda Askell and
                  Amanda Dsouza and
                  Ambrose Slone and
                  Ameet Rahane and
                  Anantharaman S. Iyer and
                  Anders Andreassen and
                  Andrea Madotto and
                  Andrea Santilli and
                  Andreas Stuhlm{\"{u}}ller and
                  Andrew M. Dai and
                  Andrew La and
                  Andrew K. Lampinen and
                  Andy Zou and
                  Angela Jiang and
                  Angelica Chen and
                  Anh Vuong and
                  Animesh Gupta and
                  Anna Gottardi and
                  Antonio Norelli and
                  Anu Venkatesh and
                  Arash Gholamidavoodi and
                  Arfa Tabassum and
                  Arul Menezes and
                  Arun Kirubarajan and
                  Asher Mullokandov and
                  Ashish Sabharwal and
                  Austin Herrick and
                  Avia Efrat and
                  Aykut Erdem and
                  Ayla Karakas and
                  B. Ryan Roberts and
                  Bao Sheng Loe and
                  Barret Zoph and
                  Bartlomiej Bojanowski and
                  Batuhan {\"{O}}zyurt and
                  Behnam Hedayatnia and
                  Behnam Neyshabur and
                  Benjamin Inden and
                  Benno Stein and
                  Berk Ekmekci and
                  Bill Yuchen Lin and
                  Blake Howald and
                  Bryan Orinion and
                  Cameron Diao and
                  Cameron Dour and
                  Catherine Stinson and
                  Cedrick Argueta and
                  C{\`{e}}sar Ferri Ram{\'{\i}}rez and
                  Chandan Singh and
                  Charles Rathkopf and
                  Chenlin Meng and
                  Chitta Baral and
                  Chiyu Wu and
                  Chris Callison{-}Burch and
                  Chris Waites and
                  Christian Voigt and
                  Christopher D. Manning and
                  Christopher Potts and
                  Cindy Ramirez and
                  Clara E. Rivera and
                  Clemencia Siro and
                  Colin Raffel and
                  Courtney Ashcraft and
                  Cristina Garbacea and
                  Damien Sileo and
                  Dan Garrette and
                  Dan Hendrycks and
                  Dan Kilman and
                  Dan Roth and
                  Daniel Freeman and
                  Daniel Khashabi and
                  Daniel Levy and
                  Daniel Mosegu{\'{\i}} Gonz{\'{a}}lez and
                  Danielle Perszyk and
                  Danny Hernandez and
                  Danqi Chen and
                  Daphne Ippolito and
                  Dar Gilboa and
                  David Dohan and
                  David Drakard and
                  David Jurgens and
                  Debajyoti Datta and
                  Deep Ganguli and
                  Denis Emelin and
                  Denis Kleyko and
                  Deniz Yuret and
                  Derek Chen and
                  Derek Tam and
                  Dieuwke Hupkes and
                  Diganta Misra and
                  Dilyar Buzan and
                  Dimitri Coelho Mollo and
                  Diyi Yang and
                  Dong{-}Ho Lee and
                  Dylan Schrader and
                  Ekaterina Shutova and
                  Ekin Dogus Cubuk and
                  Elad Segal and
                  Eleanor Hagerman and
                  Elizabeth Barnes and
                  Elizabeth Donoway and
                  Ellie Pavlick and
                  Emanuele Rodol{\`{a}} and
                  Emma Lam and
                  Eric Chu and
                  Eric Tang and
                  Erkut Erdem and
                  Ernie Chang and
                  Ethan A. Chi and
                  Ethan Dyer and
                  Ethan J. Jerzak and
                  Ethan Kim and
                  Eunice Engefu Manyasi and
                  Evgenii Zheltonozhskii and
                  Fanyue Xia and
                  Fatemeh Siar and
                  Fernando Mart{\'{\i}}nez{-}Plumed and
                  Francesca Happ{\'{e}} and
                  Fran{\c{c}}ois Chollet and
                  Frieda Rong and
                  Gaurav Mishra and
                  Genta Indra Winata and
                  Gerard de Melo and
                  Germ{\'{a}}n Kruszewski and
                  Giambattista Parascandolo and
                  Giorgio Mariani and
                  Gloria Wang and
                  Gonzalo Jaimovitch{-}L{\'{o}}pez and
                  Gregor Betz and
                  Guy Gur{-}Ari and
                  Hana Galijasevic and
                  Hannah Kim and
                  Hannah Rashkin and
                  Hannaneh Hajishirzi and
                  Harsh Mehta and
                  Hayden Bogar and
                  Henry Shevlin and
                  Hinrich Sch{\"{u}}tze and
                  Hiromu Yakura and
                  Hongming Zhang and
                  Hugh Mee Wong and
                  Ian Ng and
                  Isaac Noble and
                  Jaap Jumelet and
                  Jack Geissinger and
                  Jackson Kernion and
                  Jacob Hilton and
                  Jaehoon Lee and
                  Jaime Fern{\'{a}}ndez Fisac and
                  James B. Simon and
                  James Koppel and
                  James Zheng and
                  James Zou and
                  Jan Kocon and
                  Jana Thompson and
                  Janelle Wingfield and
                  Jared Kaplan and
                  Jarema Radom and
                  Jascha Sohl{-}Dickstein and
                  Jason Phang and
                  Jason Wei and
                  Jason Yosinski and
                  Jekaterina Novikova and
                  Jelle Bosscher and
                  Jennifer Marsh and
                  Jeremy Kim and
                  Jeroen Taal and
                  Jesse H. Engel and
                  Jesujoba Alabi and
                  Jiacheng Xu and
                  Jiaming Song and
                  Jillian Tang and
                  Joan Waweru and
                  John Burden and
                  John Miller and
                  John U. Balis and
                  Jonathan Batchelder and
                  Jonathan Berant and
                  J{\"{o}}rg Frohberg and
                  Jos Rozen and
                  Jos{\'{e}} Hern{\'{a}}ndez{-}Orallo and
                  Joseph Boudeman and
                  Joseph Guerr and
                  Joseph Jones and
                  Joshua B. Tenenbaum and
                  Joshua S. Rule and
                  Joyce Chua and
                  Kamil Kanclerz and
                  Karen Livescu and
                  Karl Krauth and
                  Karthik Gopalakrishnan and
                  Katerina Ignatyeva and
                  Katja Markert and
                  Kaustubh D. Dhole and
                  Kevin Gimpel and
                  Kevin Omondi and
                  Kory W. Mathewson and
                  Kristen Chiafullo and
                  Ksenia Shkaruta and
                  Kumar Shridhar and
                  Kyle McDonell and
                  Kyle Richardson and
                  Laria Reynolds and
                  Leo Gao and
                  Li Zhang and
                  Liam Dugan and
                  Lianhui Qin and
                  Lidia Contreras Ochando and
                  Louis{-}Philippe Morency and
                  Luca Moschella and
                  Lucas Lam and
                  Lucy Noble and
                  Ludwig Schmidt and
                  Luheng He and
                  Luis Oliveros Col{\'{o}}n and
                  Luke Metz and
                  L{\"{u}}tfi Kerem Senel and
                  Maarten Bosma and
                  Maarten Sap and
                  Maartje ter Hoeve and
                  Maheen Farooqi and
                  Manaal Faruqui and
                  Mantas Mazeika and
                  Marco Baturan and
                  Marco Marelli and
                  Marco Maru and
                  Mar{\'{\i}}a Jos{\'{e}} Ram{\'{\i}}rez{-}Quintana and
                  Marie Tolkiehn and
                  Mario Giulianelli and
                  Martha Lewis and
                  Martin Potthast and
                  Matthew L. Leavitt and
                  Matthias Hagen and
                  M{\'{a}}ty{\'{a}}s Schubert and
                  Medina Baitemirova and
                  Melody Arnaud and
                  Melvin McElrath and
                  Michael A. Yee and
                  Michael Cohen and
                  Michael Gu and
                  Michael I. Ivanitskiy and
                  Michael Starritt and
                  Michael Strube and
                  Michal Swedrowski and
                  Michele Bevilacqua and
                  Michihiro Yasunaga and
                  Mihir Kale and
                  Mike Cain and
                  Mimee Xu and
                  Mirac Suzgun and
                  Mitch Walker and
                  Mo Tiwari and
                  Mohit Bansal and
                  Moin Aminnaseri and
                  Mor Geva and
                  Mozhdeh Gheini and
                  Mukund Varma T. and
                  Nanyun Peng and
                  Nathan A. Chi and
                  Nayeon Lee and
                  Neta Gur{-}Ari Krakover and
                  Nicholas Cameron and
                  Nicholas Roberts and
                  Nick Doiron and
                  Nicole Martinez and
                  Nikita Nangia and
                  Niklas Deckers and
                  Niklas Muennighoff and
                  Nitish Shirish Keskar and
                  Niveditha Iyer and
                  Noah Constant and
                  Noah Fiedel and
                  Nuan Wen and
                  Oliver Zhang and
                  Omar Agha and
                  Omar Elbaghdadi and
                  Omer Levy and
                  Owain Evans and
                  Pablo Antonio Moreno Casares and
                  Parth Doshi and
                  Pascale Fung and
                  Paul Pu Liang and
                  Paul Vicol and
                  Pegah Alipoormolabashi and
                  Peiyuan Liao and
                  Percy Liang and
                  Peter Chang and
                  Peter Eckersley and
                  Phu Mon Htut and
                  Pinyu Hwang and
                  Piotr Milkowski and
                  Piyush Patil and
                  Pouya Pezeshkpour and
                  Priti Oli and
                  Qiaozhu Mei and
                  Qing Lyu and
                  Qinlang Chen and
                  Rabin Banjade and
                  Rachel Etta Rudolph and
                  Raefer Gabriel and
                  Rahel Habacker and
                  Ramon Risco and
                  Rapha{\"{e}}l Milli{\`{e}}re and
                  Rhythm Garg and
                  Richard Barnes and
                  Rif A. Saurous and
                  Riku Arakawa and
                  Robbe Raymaekers and
                  Robert Frank and
                  Rohan Sikand and
                  Roman Novak and
                  Roman Sitelew and
                  Ronan LeBras and
                  Rosanne Liu and
                  Rowan Jacobs and
                  Rui Zhang and
                  Ruslan Salakhutdinov and
                  Ryan Chi and
                  Ryan Lee and
                  Ryan Stovall and
                  Ryan Teehan and
                  Rylan Yang and
                  Sahib Singh and
                  Saif M. Mohammad and
                  Sajant Anand and
                  Sam Dillavou and
                  Sam Shleifer and
                  Sam Wiseman and
                  Samuel Gruetter and
                  Samuel R. Bowman and
                  Samuel S. Schoenholz and
                  Sanghyun Han and
                  Sanjeev Kwatra and
                  Sarah A. Rous and
                  Sarik Ghazarian and
                  Sayan Ghosh and
                  Sean Casey and
                  Sebastian Bischoff and
                  Sebastian Gehrmann and
                  Sebastian Schuster and
                  Sepideh Sadeghi and
                  Shadi Hamdan and
                  Sharon Zhou and
                  Shashank Srivastava and
                  Sherry Shi and
                  Shikhar Singh and
                  Shima Asaadi and
                  Shixiang Shane Gu and
                  Shubh Pachchigar and
                  Shubham Toshniwal and
                  Shyam Upadhyay and
                  Shyamolima (Shammie) Debnath and
                  Siamak Shakeri and
                  Simon Thormeyer and
                  Simone Melzi and
                  Siva Reddy and
                  Sneha Priscilla Makini and
                  Soo{-}Hwan Lee and
                  Spencer Torene and
                  Sriharsha Hatwar and
                  Stanislas Dehaene and
                  Stefan Divic and
                  Stefano Ermon and
                  Stella Biderman and
                  Stephanie Lin and
                  Stephen Prasad and
                  Steven T. Piantadosi and
                  Stuart M. Shieber and
                  Summer Misherghi and
                  Svetlana Kiritchenko and
                  Swaroop Mishra and
                  Tal Linzen and
                  Tal Schuster and
                  Tao Li and
                  Tao Yu and
                  Tariq Ali and
                  Tatsu Hashimoto and
                  Te{-}Lin Wu and
                  Th{\'{e}}o Desbordes and
                  Theodore Rothschild and
                  Thomas Phan and
                  Tianle Wang and
                  Tiberius Nkinyili and
                  Timo Schick and
                  Timofei Kornev and
                  Titus Tunduny and
                  Tobias Gerstenberg and
                  Trenton Chang and
                  Trishala Neeraj and
                  Tushar Khot and
                  Tyler Shultz and
                  Uri Shaham and
                  Vedant Misra and
                  Vera Demberg and
                  Victoria Nyamai and
                  Vikas Raunak and
                  Vinay V. Ramasesh and
                  Vinay Uday Prabhu and
                  Vishakh Padmakumar and
                  Vivek Srikumar and
                  William Fedus and
                  William Saunders and
                  William Zhang and
                  Wout Vossen and
                  Xiang Ren and
                  Xiaoyu Tong and
                  Xinran Zhao and
                  Xinyi Wu and
                  Xudong Shen and
                  Yadollah Yaghoobzadeh and
                  Yair Lakretz and
                  Yangqiu Song and
                  Yasaman Bahri and
                  Yejin Choi and
                  Yichi Yang and
                  Yiding Hao and
                  Yifu Chen and
                  Yonatan Belinkov and
                  Yu Hou and
                  Yufang Hou and
                  Yuntao Bai and
                  Zachary Seid and
                  Zhuoye Zhao and
                  Zijian Wang and
                  Zijie J. Wang and
                  Zirui Wang and
                  Ziyi Wu},
  title        = {Beyond the Imitation Game: Quantifying and extrapolating the capabilities
                  of language models},
  journal      = {CoRR},
  volume       = {abs/2206.04615},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2206.04615},
  doi          = {10.48550/ARXIV.2206.04615},
  eprinttype   = {arXiv},
  eprint       = {2206.04615},
  timestamp    = {Tue, 21 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2206-04615.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2206.15474 as listed in dblp's CoRR index.
@article{DBLP:journals/corr/abs-2206-15474,
  author       = {Andy Zou and
                  Tristan Xiao and
                  Ryan Jia and
                  Joe Kwon and
                  Mantas Mazeika and
                  Richard Li and
                  Dawn Song and
                  Jacob Steinhardt and
                  Owain Evans and
                  Dan Hendrycks},
  title        = {Forecasting Future World Events with Neural Networks},
  journal      = {CoRR},
  volume       = {abs/2206.15474},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2206.15474},
  doi          = {10.48550/ARXIV.2206.15474},
  eprinttype   = {arXiv},
  eprint       = {2206.15474},
  timestamp    = {Tue, 07 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2206-15474.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2210.10039 as listed in dblp's CoRR index.
@article{DBLP:journals/corr/abs-2210-10039,
  author       = {Mantas Mazeika and
                  Eric Tang and
                  Andy Zou and
                  Steven Basart and
                  Jun Shern Chan and
                  Dawn Song and
                  David A. Forsyth and
                  Jacob Steinhardt and
                  Dan Hendrycks},
  title        = {How Would The Viewer Feel? Estimating Wellbeing From Video Scenarios},
  journal      = {CoRR},
  volume       = {abs/2210.10039},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2210.10039},
  doi          = {10.48550/ARXIV.2210.10039},
  eprinttype   = {arXiv},
  eprint       = {2210.10039},
  timestamp    = {Mon, 24 Oct 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2210-10039.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICLR 2021 conference-paper record (OpenReview) exported from dblp.
@inproceedings{DBLP:conf/iclr/HendrycksBBZMSS21,
  author       = {Dan Hendrycks and
                  Collin Burns and
                  Steven Basart and
                  Andy Zou and
                  Mantas Mazeika and
                  Dawn Song and
                  Jacob Steinhardt},
  title        = {Measuring Massive Multitask Language Understanding},
  booktitle    = {9th International Conference on Learning Representations, {ICLR} 2021,
                  Virtual Event, Austria, May 3-7, 2021},
  publisher    = {OpenReview.net},
  year         = {2021},
  url          = {https://openreview.net/forum?id=d7KBjmI3GmQ},
  timestamp    = {Wed, 23 Jun 2021 17:36:39 +0200},
  biburl       = {https://dblp.org/rec/conf/iclr/HendrycksBBZMSS21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS Datasets and Benchmarks 2021 paper record exported from dblp.
% The proper noun "Jiminy Cricket" is brace-protected so that styles which
% sentence-case titles do not print it as "jiminy cricket".
@inproceedings{DBLP:conf/nips/HendrycksMZPZNS21,
  author       = {Dan Hendrycks and
                  Mantas Mazeika and
                  Andy Zou and
                  Sahil Patel and
                  Christine Zhu and
                  Jesus Navarro and
                  Dawn Song and
                  Bo Li and
                  Jacob Steinhardt},
  editor       = {Joaquin Vanschoren and
                  Sai{-}Kit Yeung},
  title        = {What Would {Jiminy Cricket} Do? Towards Agents That Behave Morally},
  booktitle    = {Proceedings of the Neural Information Processing Systems Track on
                  Datasets and Benchmarks 1, NeurIPS Datasets and Benchmarks 2021, December
                  2021, virtual},
  year         = {2021},
  url          = {https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/39059724f73a9969845dfe4146c5660e-Abstract-round2.html},
  timestamp    = {Thu, 05 May 2022 16:30:03 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/HendrycksMZPZNS21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2022 Competition Track paper record (PMLR v220) exported from dblp.
% The year is corrected from 2021 to 2022 so it agrees with the booktitle
% (event dated November 28 - December 9, 2022) and the PMLR paper id
% "mazeika22a". The citation key keeps its original "21" suffix so that
% existing \cite commands continue to resolve.
@inproceedings{DBLP:conf/nips/MazeikaHLXHZRYWTTTSPBSPLF21,
  author       = {Mantas Mazeika and
                  Dan Hendrycks and
                  Huichen Li and
                  Xiaojun Xu and
                  Sidney Q. Hough and
                  Andy Zou and
                  Arezoo Rajabi and
                  Qi Yao and
                  Zihao Wang and
                  Jian Tian and
                  Yao Tang and
                  Di Tang and
                  Roman Smirnov and
                  Pavel Pleskov and
                  Nikita Benkovich and
                  Dawn Song and
                  Radha Poovendran and
                  Bo Li and
                  David A. Forsyth},
  editor       = {Marco Ciccone and
                  Gustavo Stolovitzky and
                  Jacob Albrecht},
  title        = {The Trojan Detection Challenge},
  booktitle    = {NeurIPS 2022 Competition Track, November 28 - December 9, 2022, Online},
  series       = {Proceedings of Machine Learning Research},
  pages        = {279--291},
  publisher    = {{PMLR}},
  year         = {2022},
  url          = {https://proceedings.mlr.press/v220/mazeika22a.html},
  timestamp    = {Mon, 12 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/MazeikaHLXHZRYWTTTSPBSPLF21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2110.13136 as listed in dblp's CoRR index.
% "Jiminy Cricket" is brace-protected against sentence-casing styles, and a
% doi field is added following the 10.48550/ARXIV.<id> pattern that the other
% CoRR records in this file already carry. The original url is kept.
@article{DBLP:journals/corr/abs-2110-13136,
  author       = {Dan Hendrycks and
                  Mantas Mazeika and
                  Andy Zou and
                  Sahil Patel and
                  Christine Zhu and
                  Jesus Navarro and
                  Dawn Song and
                  Bo Li and
                  Jacob Steinhardt},
  title        = {What Would {Jiminy Cricket} Do? Towards Agents That Behave Morally},
  journal      = {CoRR},
  volume       = {abs/2110.13136},
  year         = {2021},
  url          = {https://arxiv.org/abs/2110.13136},
  doi          = {10.48550/ARXIV.2110.13136},
  eprinttype   = {arXiv},
  eprint       = {2110.13136},
  timestamp    = {Mon, 07 Mar 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2110-13136.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2112.05135 as listed in dblp's CoRR index.
% "PixMix" is brace-protected so sentence-casing styles do not print "Pixmix",
% and a doi field is added following the 10.48550/ARXIV.<id> pattern that the
% other CoRR records in this file already carry. The original url is kept.
@article{DBLP:journals/corr/abs-2112-05135,
  author       = {Dan Hendrycks and
                  Andy Zou and
                  Mantas Mazeika and
                  Leonard Tang and
                  Bo Li and
                  Dawn Song and
                  Jacob Steinhardt},
  title        = {{PixMix}: Dreamlike Pictures Comprehensively Improve Safety Measures},
  journal      = {CoRR},
  volume       = {abs/2112.05135},
  year         = {2021},
  url          = {https://arxiv.org/abs/2112.05135},
  doi          = {10.48550/ARXIV.2112.05135},
  eprinttype   = {arXiv},
  eprint       = {2112.05135},
  timestamp    = {Mon, 07 Mar 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2112-05135.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2009.03300 as listed in dblp's CoRR index.
% A doi field is added following the 10.48550/ARXIV.<id> pattern that the
% other CoRR records in this file already carry. The original url is kept.
@article{DBLP:journals/corr/abs-2009-03300,
  author       = {Dan Hendrycks and
                  Collin Burns and
                  Steven Basart and
                  Andy Zou and
                  Mantas Mazeika and
                  Dawn Song and
                  Jacob Steinhardt},
  title        = {Measuring Massive Multitask Language Understanding},
  journal      = {CoRR},
  volume       = {abs/2009.03300},
  year         = {2020},
  url          = {https://arxiv.org/abs/2009.03300},
  doi          = {10.48550/ARXIV.2009.03300},
  eprinttype   = {arXiv},
  eprint       = {2009.03300},
  timestamp    = {Thu, 17 Sep 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2009-03300.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
manage site settings
To protect your privacy, all features that rely on external API calls from your browser are turned off by default. You need to opt-in for them to become active. All settings here will be stored as cookies with your web browser. For more information see our F.A.Q.