BibTeX records: Andy Zou

download as .bib file

% arXiv preprint (CoRR abs/2603.15714) on indirect prompt injections against AI agents.
% The last author is written "J. Zico Kolter", the spelling most other records
% in this file use for the same person, so that styles and author indexes
% treat the name as a single author.
@article{DBLP:journals/corr/abs-2603-15714,
  author       = {Mateusz Dziemian and
                  Maxwell Lin and
                  Xiaohan Fu and
                  Micha Nowak and
                  Nick Winter and
                  Eliot Krzysztof Jones and
                  Andy Zou and
                  Lama Ahmad and
                  Kamalika Chaudhuri and
                  Sahana Chennabasappa and
                  Xander Davies and
                  Lauren Deason and
                  Benjamin L. Edelman and
                  Tanner Emek and
                  Ivan Evtimov and
                  Jim Gust and
                  Maia Hamin and
                  Kat He and
                  Klaudia Krawiecka and
                  Riccardo Patana and
                  Neil Perry and
                  Troy Peterson and
                  Xiangyu Qi and
                  Javier Rando and
                  Zifan Wang and
                  Zihan Wang and
                  Spencer Whitman and
                  Eric Winsor and
                  Arman Zharmagambetov and
                  Matt Fredrikson and
                  J. Zico Kolter},
  title        = {How Vulnerable Are {AI} Agents to Indirect Prompt Injections? Insights
                  from a Large-Scale Public Competition},
  journal      = {CoRR},
  volume       = {abs/2603.15714},
  year         = {2026},
  url          = {https://doi.org/10.48550/arXiv.2603.15714},
  doi          = {10.48550/ARXIV.2603.15714},
  eprinttype   = {arXiv},
  eprint       = {2603.15714},
  timestamp    = {Tue, 14 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2603-15714.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICLR 2025 conference record for the AgentHarm benchmark paper.
% "AgentHarm" is brace-protected in the title: styles that sentence-case
% titles lowercase everything after the first letter, which would otherwise
% print it as "Agentharm".
@inproceedings{DBLP:conf/iclr/AndriushchenkoS25,
  author       = {Maksym Andriushchenko and
                  Alexandra Souly and
                  Mateusz Dziemian and
                  Derek Duenas and
                  Maxwell Lin and
                  Justin Wang and
                  Dan Hendrycks and
                  Andy Zou and
                  J. Zico Kolter and
                  Matt Fredrikson and
                  Yarin Gal and
                  Xander Davies},
  title        = {{AgentHarm}: {A} Benchmark for Measuring Harmfulness of {LLM} Agents},
  booktitle    = {The Thirteenth International Conference on Learning Representations,
                  {ICLR} 2025, Singapore, April 24-28, 2025},
  publisher    = {OpenReview.net},
  year         = {2025},
  url          = {https://openreview.net/forum?id=AC5n7xHuR1},
  timestamp    = {Thu, 15 May 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iclr/AndriushchenkoS25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICLR 2025 conference record for the tamper-resistant safeguards paper.
% "LLMs" is brace-protected in the title so that sentence-casing styles do
% not lowercase the acronym.
@inproceedings{DBLP:conf/iclr/TamirisaBPZGSLW25,
  author       = {Rishub Tamirisa and
                  Bhrugu Bharathi and
                  Long Phan and
                  Andy Zhou and
                  Alice Gatti and
                  Tarun Suresh and
                  Maxwell Lin and
                  Justin Wang and
                  Rowan Wang and
                  Ron Arel and
                  Andy Zou and
                  Dawn Song and
                  Bo Li and
                  Dan Hendrycks and
                  Mantas Mazeika},
  title        = {Tamper-Resistant Safeguards for Open-Weight {LLMs}},
  booktitle    = {The Thirteenth International Conference on Learning Representations,
                  {ICLR} 2025, Singapore, April 24-28, 2025},
  publisher    = {OpenReview.net},
  year         = {2025},
  url          = {https://openreview.net/forum?id=4FIjRodbW6},
  timestamp    = {Thu, 15 May 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iclr/TamirisaBPZGSLW25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2502.14296) on the trustworthiness of generative
% foundation models; the record lists its authors in "First Last" order.
% NOTE(review): in "First Last" order BibTeX takes the final word as the
% surname, so "Bryan Hooi Kuen-Yew" is read with surname "Kuen-Yew" and
% "Or Cohen Sasson" with surname "Sasson" -- confirm these are the intended
% surnames; if not, rewrite those two names in "Last, First" form.
@article{DBLP:journals/corr/abs-2502-14296,
  author       = {Yue Huang and
                  Chujie Gao and
                  Siyuan Wu and
                  Haoran Wang and
                  Xiangqi Wang and
                  Yujun Zhou and
                  Yanbo Wang and
                  Jiayi Ye and
                  Jiawen Shi and
                  Qihui Zhang and
                  Yuan Li and
                  Han Bao and
                  Zhaoyi Liu and
                  Tianrui Guan and
                  Dongping Chen and
                  Ruoxi Chen and
                  Kehan Guo and
                  Andy Zou and
                  Bryan Hooi Kuen{-}Yew and
                  Caiming Xiong and
                  Elias Stengel{-}Eskin and
                  Hongyang Zhang and
                  Hongzhi Yin and
                  Huan Zhang and
                  Huaxiu Yao and
                  Jaehong Yoon and
                  Jieyu Zhang and
                  Kai Shu and
                  Kaijie Zhu and
                  Ranjay Krishna and
                  Swabha Swayamdipta and
                  Taiwei Shi and
                  Weijia Shi and
                  Xiang Li and
                  Yiwei Li and
                  Yuexing Hao and
                  Zhihao Jia and
                  Zhize Li and
                  Xiuying Chen and
                  Zhengzhong Tu and
                  Xiyang Hu and
                  Tianyi Zhou and
                  Jieyu Zhao and
                  Lichao Sun and
                  Furong Huang and
                  Or Cohen Sasson and
                  Prasanna Sattigeri and
                  Anka Reuel and
                  Max Lamparth and
                  Yue Zhao and
                  Nouha Dziri and
                  Yu Su and
                  Huan Sun and
                  Heng Ji and
                  Chaowei Xiao and
                  Mohit Bansal and
                  Nitesh V. Chawla and
                  Jian Pei and
                  Jianfeng Gao and
                  Michael Backes and
                  Philip S. Yu and
                  Neil Zhenqiang Gong and
                  Pin{-}Yu Chen and
                  Bo Li and
                  Xiangliang Zhang},
  title        = {On the Trustworthiness of Generative Foundation Models: Guideline,
                  Assessment, and Perspective},
  journal      = {CoRR},
  volume       = {abs/2502.14296},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2502.14296},
  doi          = {10.48550/ARXIV.2502.14296},
  eprinttype   = {arXiv},
  eprint       = {2502.14296},
  timestamp    = {Tue, 24 Mar 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2502-14296.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2504.16980) on safety pretraining.
% The eighth author's given name is spelled "Zachary"; the exported record
% carried the misspelling "Zacharcy".
@article{DBLP:journals/corr/abs-2504-16980,
  author       = {Pratyush Maini and
                  Sachin Goyal and
                  Dylan Sam and
                  Alexander Robey and
                  Yash Savani and
                  Yiding Jiang and
                  Andy Zou and
                  Zachary C. Lipton and
                  J. Zico Kolter},
  title        = {Safety Pretraining: Toward the Next Generation of Safe {AI}},
  journal      = {CoRR},
  volume       = {abs/2504.16980},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2504.16980},
  doi          = {10.48550/ARXIV.2504.16980},
  eprinttype   = {arXiv},
  eprint       = {2504.16980},
  timestamp    = {Fri, 23 May 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2504-16980.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2505.01050) on transferable adversarial attacks
% against black-box vision-language models.
% Authors are written in "Last, First" form, which BibTeX parses into the
% same given-name and surname parts as the "First Last" spelling.
@article{DBLP:journals/corr/abs-2505-01050,
  author       = {Hu, Kai and
                  Yu, Weichen and
                  Zhang, Li and
                  Robey, Alexander and
                  Zou, Andy and
                  Xu, Chengming and
                  Hu, Haoqi and
                  Fredrikson, Matt},
  title        = {Transferable Adversarial Attacks on Black-Box Vision-Language Models},
  journal      = {CoRR},
  volume       = {abs/2505.01050},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2505.01050},
  doi          = {10.48550/ARXIV.2505.01050},
  eprinttype   = {arXiv},
  eprint       = {2505.01050},
  timestamp    = {Thu, 26 Jun 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2505-01050.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2506.03350) on adversarial attacks against
% robotic vision-language-action models.
% Authors are written in "Last, First" form, which BibTeX parses into the
% same given-name and surname parts as the "First Last" spelling.
@article{DBLP:journals/corr/abs-2506-03350,
  author       = {Jones, Eliot Krzysztof and
                  Robey, Alexander and
                  Zou, Andy and
                  Ravichandran, Zachary and
                  Pappas, George J. and
                  Hassani, Hamed and
                  Fredrikson, Matt and
                  Kolter, J. Zico},
  title        = {Adversarial Attacks on Robotic Vision Language Action Models},
  journal      = {CoRR},
  volume       = {abs/2506.03350},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2506.03350},
  doi          = {10.48550/ARXIV.2506.03350},
  eprinttype   = {arXiv},
  eprint       = {2506.03350},
  timestamp    = {Sun, 06 Jul 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2506-03350.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2507.20526) on security challenges in AI agent
% deployment, drawn from a public competition.
% Authors are written in "Last, First" form, which BibTeX parses into the
% same given-name and surname parts as the "First Last" spelling.
@article{DBLP:journals/corr/abs-2507-20526,
  author       = {Zou, Andy and
                  Lin, Maxwell and
                  Jones, Eliot Krzysztof and
                  Nowak, Micha and
                  Dziemian, Mateusz and
                  Winter, Nick and
                  Grattan, Alexander and
                  Nathanael, Valent and
                  Croft, Ayla and
                  Davies, Xander and
                  Patel, Jai and
                  Kirk, Robert and
                  Burnikell, Nate and
                  Gal, Yarin and
                  Hendrycks, Dan and
                  Kolter, J. Zico and
                  Fredrikson, Matt},
  title        = {Security Challenges in {AI} Agent Deployment: Insights from a Large
                  Scale Public Competition},
  journal      = {CoRR},
  volume       = {abs/2507.20526},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2507.20526},
  doi          = {10.48550/ARXIV.2507.20526},
  eprinttype   = {arXiv},
  eprint       = {2507.20526},
  timestamp    = {Thu, 21 Aug 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2507-20526.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2507.23701) introducing TextQuests.
% "TextQuests" and "LLMs" are brace-protected in the title so that
% sentence-casing styles do not print them as "Textquests" and "llms".
@article{DBLP:journals/corr/abs-2507-23701,
  author       = {Long Phan and
                  Mantas Mazeika and
                  Andy Zou and
                  Dan Hendrycks},
  title        = {{TextQuests}: How Good are {LLMs} at Text-Based Video Games?},
  journal      = {CoRR},
  volume       = {abs/2507.23701},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2507.23701},
  doi          = {10.48550/ARXIV.2507.23701},
  eprinttype   = {arXiv},
  eprint       = {2507.23701},
  timestamp    = {Fri, 22 Aug 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2507-23701.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2508.19980) on language-model reasoning about
% confidential information.
% Authors are written in "Last, First" form, which BibTeX parses into the
% same given-name and surname parts as the "First Last" spelling.
@article{DBLP:journals/corr/abs-2508-19980,
  author       = {Sam, Dylan and
                  Robey, Alexander and
                  Zou, Andy and
                  Fredrikson, Matt and
                  Kolter, J. Zico},
  title        = {Evaluating Language Model Reasoning about Confidential Information},
  journal      = {CoRR},
  volume       = {abs/2508.19980},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2508.19980},
  doi          = {10.48550/ARXIV.2508.19980},
  eprinttype   = {arXiv},
  eprint       = {2508.19980},
  timestamp    = {Mon, 22 Sep 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2508-19980.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2509.17938) introducing the D-REX benchmark.
% Authors are written in "Last, First" form, which BibTeX parses into the
% same given-name and surname parts as the "First Last" spelling.
@article{DBLP:journals/corr/abs-2509-17938,
  author       = {Krishna, Satyapriya and
                  Zou, Andy and
                  Gupta, Rahul and
                  Jones, Eliot Krzysztof and
                  Winter, Nick and
                  Hendrycks, Dan and
                  Kolter, J. Zico and
                  Fredrikson, Matt and
                  Matsoukas, Spyros},
  title        = {{D-REX:} {A} Benchmark for Detecting Deceptive Reasoning in Large
                  Language Models},
  journal      = {CoRR},
  volume       = {abs/2509.17938},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2509.17938},
  doi          = {10.48550/ARXIV.2509.17938},
  eprinttype   = {arXiv},
  eprint       = {2509.17938},
  timestamp    = {Sat, 18 Oct 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2509-17938.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2510.18212), "A Definition of AGI".
% Authors are written in "Last, First" form, which BibTeX parses into the
% same given-name and surname parts as the "First Last" spelling.
@article{DBLP:journals/corr/abs-2510-18212,
  author       = {Hendrycks, Dan and
                  Song, Dawn and
                  Szegedy, Christian and
                  Lee, Honglak and
                  Gal, Yarin and
                  Brynjolfsson, Erik and
                  Li, Sharon and
                  Zou, Andy and
                  Levine, Lionel and
                  Han, Bo and
                  Fu, Jie and
                  Liu, Ziwei and
                  Shin, Jinwoo and
                  Lee, Kimin and
                  Mazeika, Mantas and
                  Phan, Long and
                  Ingebretsen, George and
                  Khoja, Adam and
                  Xie, Cihang and
                  Salaudeen, Olawale and
                  Hein, Matthias and
                  Zhao, Kevin and
                  Pan, Alexander and
                  Duvenaud, David and
                  Li, Bo and
                  Omohundro, Steve and
                  Alfour, Gabriel and
                  Tegmark, Max and
                  McGrew, Kevin and
                  Marcus, Gary and
                  Tallinn, Jaan and
                  Schmidt, Eric and
                  Bengio, Yoshua},
  title        = {A Definition of {AGI}},
  journal      = {CoRR},
  volume       = {abs/2510.18212},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2510.18212},
  doi          = {10.48550/ARXIV.2510.18212},
  eprinttype   = {arXiv},
  eprint       = {2510.18212},
  timestamp    = {Mon, 22 Dec 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2510-18212.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2512.09882) comparing AI agents with
% cybersecurity professionals in penetration testing.
% Authors are written in "Last, First" form, which BibTeX parses into the
% same given-name and surname parts as the "First Last" spelling.
@article{DBLP:journals/corr/abs-2512-09882,
  author       = {Lin, Justin W. and
                  Jones, Eliot Krzysztof and
                  Jasper, Donovan Julian and
                  Ho, Ethan Jun{-}shen and
                  Wu, Anna and
                  Yang, Arnold Tianyi and
                  Perry, Neil and
                  Zou, Andy and
                  Fredrikson, Matt and
                  Kolter, J. Zico and
                  Liang, Percy and
                  Boneh, Dan and
                  Ho, Daniel E.},
  title        = {Comparing {AI} Agents to Cybersecurity Professionals in Real-World
                  Penetration Testing},
  journal      = {CoRR},
  volume       = {abs/2512.09882},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2512.09882},
  doi          = {10.48550/ARXIV.2512.09882},
  eprinttype   = {arXiv},
  eprint       = {2512.09882},
  timestamp    = {Fri, 23 Jan 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2512-09882.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICML 2024 conference record for the WMDP benchmark paper.
% volume = 235 is the Proceedings of Machine Learning Research volume that
% the record's own url points into (.../v235/...); without it the series
% name is printed with no number.
% The second editor is written "J. Zico Kolter", the spelling most author
% lists in this file use for the same person.
@inproceedings{DBLP:conf/icml/LiPGYBGLDGMHLJL24,
  author       = {Nathaniel Li and
                  Alexander Pan and
                  Anjali Gopal and
                  Summer Yue and
                  Daniel Berrios and
                  Alice Gatti and
                  Justin D. Li and
                  Ann{-}Kathrin Dombrowski and
                  Shashwat Goel and
                  Gabriel Mukobi and
                  Nathan Helm{-}Burger and
                  Rassin Lababidi and
                  Lennart Justen and
                  Andrew B. Liu and
                  Michael Chen and
                  Isabelle Barrass and
                  Oliver Zhang and
                  Xiaoyuan Zhu and
                  Rishub Tamirisa and
                  Bhrugu Bharathi and
                  Ariel Herbert{-}Voss and
                  Cort B. Breuer and
                  Andy Zou and
                  Mantas Mazeika and
                  Zifan Wang and
                  Palash Oswal and
                  Weiran Lin and
                  Adam A. Hunt and
                  Justin Tienken{-}Harder and
                  Kevin Y. Shih and
                  Kemper Talley and
                  John Guan and
                  Ian Steneker and
                  David Campbell and
                  Brad Jokubaitis and
                  Steven Basart and
                  Stephen Fitz and
                  Ponnurangam Kumaraguru and
                  Kallol Krishna Karmakar and
                  Uday Kiran Tupakula and
                  Vijay Varadharajan and
                  Yan Shoshitaishvili and
                  Jimmy Ba and
                  Kevin M. Esvelt and
                  Alexandr Wang and
                  Dan Hendrycks},
  editor       = {Ruslan Salakhutdinov and
                  J. Zico Kolter and
                  Katherine A. Heller and
                  Adrian Weller and
                  Nuria Oliver and
                  Jonathan Scarlett and
                  Felix Berkenkamp},
  title        = {The {WMDP} Benchmark: Measuring and Reducing Malicious Use with Unlearning},
  booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024,
                  Vienna, Austria, July 21-27, 2024},
  series       = {Proceedings of Machine Learning Research},
  volume       = {235},
  pages        = {28525--28550},
  publisher    = {{PMLR} / OpenReview.net},
  year         = {2024},
  url          = {https://proceedings.mlr.press/v235/li24bc.html},
  timestamp    = {Mon, 09 Feb 2026 15:35:36 +0100},
  biburl       = {https://dblp.org/rec/conf/icml/LiPGYBGLDGMHLJL24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICML 2024 conference record for the HarmBench paper.
% "HarmBench" is brace-protected in the title so that sentence-casing styles
% do not print it as "Harmbench".
% volume = 235 is the Proceedings of Machine Learning Research volume that
% the record's own url points into (.../v235/...).
% The second editor is written "J. Zico Kolter", the spelling most author
% lists in this file use for the same person.
@inproceedings{DBLP:conf/icml/MazeikaPYZ0MSLB24,
  author       = {Mantas Mazeika and
                  Long Phan and
                  Xuwang Yin and
                  Andy Zou and
                  Zifan Wang and
                  Norman Mu and
                  Elham Sakhaee and
                  Nathaniel Li and
                  Steven Basart and
                  Bo Li and
                  David A. Forsyth and
                  Dan Hendrycks},
  editor       = {Ruslan Salakhutdinov and
                  J. Zico Kolter and
                  Katherine A. Heller and
                  Adrian Weller and
                  Nuria Oliver and
                  Jonathan Scarlett and
                  Felix Berkenkamp},
  title        = {{HarmBench}: {A} Standardized Evaluation Framework for Automated Red
                  Teaming and Robust Refusal},
  booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024,
                  Vienna, Austria, July 21-27, 2024},
  series       = {Proceedings of Machine Learning Research},
  volume       = {235},
  pages        = {35181--35224},
  publisher    = {{PMLR} / OpenReview.net},
  year         = {2024},
  url          = {https://proceedings.mlr.press/v235/mazeika24a.html},
  timestamp    = {Mon, 09 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icml/MazeikaPYZ0MSLB24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2024 conference record for the circuit-breakers paper.
% The proceedings number in booktitle is 37, the volume the NeurIPS 2024
% proceedings are published under; the exported record said 38.
% The first editor's surname is "Globerson"; the exported record carried the
% misspelling "Globersons".
% The conference date range is written "10-15", without inner spaces, like
% the date ranges in the other conference records of this file.
@inproceedings{DBLP:conf/nips/ZouPWDLAKFH24,
  author       = {Andy Zou and
                  Long Phan and
                  Justin Wang and
                  Derek Duenas and
                  Maxwell Lin and
                  Maksym Andriushchenko and
                  J. Zico Kolter and
                  Matt Fredrikson and
                  Dan Hendrycks},
  editor       = {Amir Globerson and
                  Lester Mackey and
                  Danielle Belgrave and
                  Angela Fan and
                  Ulrich Paquet and
                  Jakub M. Tomczak and
                  Cheng Zhang},
  title        = {Improving Alignment and Robustness with Circuit Breakers},
  booktitle    = {Advances in Neural Information Processing Systems 37: Annual Conference
                  on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver,
                  BC, Canada, December 10-15, 2024},
  year         = {2024},
  url          = {http://papers.nips.cc/paper\_files/paper/2024/hash/97ca7168c2c333df5ea61ece3b3276e1-Abstract-Conference.html},
  timestamp    = {Thu, 13 Feb 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/ZouPWDLAKFH24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2402.04249) of the HarmBench paper.
% "HarmBench" is brace-protected in the title so that sentence-casing styles
% do not print it as "Harmbench".
@article{DBLP:journals/corr/abs-2402-04249,
  author       = {Mantas Mazeika and
                  Long Phan and
                  Xuwang Yin and
                  Andy Zou and
                  Zifan Wang and
                  Norman Mu and
                  Elham Sakhaee and
                  Nathaniel Li and
                  Steven Basart and
                  Bo Li and
                  David A. Forsyth and
                  Dan Hendrycks},
  title        = {{HarmBench}: {A} Standardized Evaluation Framework for Automated Red
                  Teaming and Robust Refusal},
  journal      = {CoRR},
  volume       = {abs/2402.04249},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2402.04249},
  doi          = {10.48550/ARXIV.2402.04249},
  eprinttype   = {arXiv},
  eprint       = {2402.04249},
  timestamp    = {Tue, 20 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2402-04249.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2403.03218) of the WMDP benchmark paper.
% The author between Palash Oswal and Adam A. Hunt is written "Weiran Lin",
% matching the ICML 2024 record of the same paper in this file
% (DBLP:conf/icml/LiPGYBGLDGMHLJL24); the exported record said "Weiran Liu".
@article{DBLP:journals/corr/abs-2403-03218,
  author       = {Nathaniel Li and
                  Alexander Pan and
                  Anjali Gopal and
                  Summer Yue and
                  Daniel Berrios and
                  Alice Gatti and
                  Justin D. Li and
                  Ann{-}Kathrin Dombrowski and
                  Shashwat Goel and
                  Long Phan and
                  Gabriel Mukobi and
                  Nathan Helm{-}Burger and
                  Rassin Lababidi and
                  Lennart Justen and
                  Andrew B. Liu and
                  Michael Chen and
                  Isabelle Barrass and
                  Oliver Zhang and
                  Xiaoyuan Zhu and
                  Rishub Tamirisa and
                  Bhrugu Bharathi and
                  Adam Khoja and
                  Zhenqi Zhao and
                  Ariel Herbert{-}Voss and
                  Cort B. Breuer and
                  Andy Zou and
                  Mantas Mazeika and
                  Zifan Wang and
                  Palash Oswal and
                  Weiran Lin and
                  Adam A. Hunt and
                  Justin Tienken{-}Harder and
                  Kevin Y. Shih and
                  Kemper Talley and
                  John Guan and
                  Russell Kaplan and
                  Ian Steneker and
                  David Campbell and
                  Brad Jokubaitis and
                  Alex Levinson and
                  Jean Wang and
                  William Qian and
                  Kallol Krishna Karmakar and
                  Steven Basart and
                  Stephen Fitz and
                  Mindy Levine and
                  Ponnurangam Kumaraguru and
                  Uday Kiran Tupakula and
                  Vijay Varadharajan and
                  Yan Shoshitaishvili and
                  Jimmy Ba and
                  Kevin M. Esvelt and
                  Alexandr Wang and
                  Dan Hendrycks},
  title        = {The {WMDP} Benchmark: Measuring and Reducing Malicious Use With Unlearning},
  journal      = {CoRR},
  volume       = {abs/2403.03218},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2403.03218},
  doi          = {10.48550/ARXIV.2403.03218},
  eprinttype   = {arXiv},
  eprint       = {2403.03218},
  timestamp    = {Sat, 15 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2403-03218.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2405.14782) on reproducible evaluation of
% language models.
% Authors are written in "Last, First" form, which BibTeX parses into the
% same given-name and surname parts as the "First Last" spelling.
@article{DBLP:journals/corr/abs-2405-14782,
  author       = {Biderman, Stella and
                  Schoelkopf, Hailey and
                  Sutawika, Lintang and
                  Gao, Leo and
                  Tow, Jonathan and
                  Abbasi, Baber and
                  Aji, Alham Fikri and
                  Ammanamanchi, Pawan Sasanka and
                  Black, Sidney and
                  Clive, Jordan and
                  DiPofi, Anthony and
                  Etxaniz, Julen and
                  Fattori, Benjamin and
                  Forde, Jessica Zosa and
                  Foster, Charles and
                  Hsu, Jeffrey and
                  Jaiswal, Mimansa and
                  Lee, Wilson Y. and
                  Li, Haonan and
                  Lovering, Charles and
                  Muennighoff, Niklas and
                  Pavlick, Ellie and
                  Phang, Jason and
                  Skowron, Aviya and
                  Tan, Samson and
                  Tang, Xiangru and
                  Wang, Kevin A. and
                  Winata, Genta Indra and
                  Yvon, Fran{\c{c}}ois and
                  Zou, Andy},
  title        = {Lessons from the Trenches on Reproducible Evaluation of Language Models},
  journal      = {CoRR},
  volume       = {abs/2405.14782},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2405.14782},
  doi          = {10.48550/ARXIV.2405.14782},
  eprinttype   = {arXiv},
  eprint       = {2405.14782},
  timestamp    = {Sun, 01 Feb 2026 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2405-14782.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2406.04313) of the circuit-breakers paper.
% The eighth author is written "J. Zico Kolter", the spelling used by the
% NeurIPS 2024 record of the same paper (DBLP:conf/nips/ZouPWDLAKFH24) and by
% most other records in this file, so the name is treated as one author.
@article{DBLP:journals/corr/abs-2406-04313,
  author       = {Andy Zou and
                  Long Phan and
                  Justin Wang and
                  Derek Duenas and
                  Maxwell Lin and
                  Maksym Andriushchenko and
                  Rowan Wang and
                  J. Zico Kolter and
                  Matt Fredrikson and
                  Dan Hendrycks},
  title        = {Improving Alignment and Robustness with Circuit Breakers},
  journal      = {CoRR},
  volume       = {abs/2406.04313},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.04313},
  doi          = {10.48550/ARXIV.2406.04313},
  eprinttype   = {arXiv},
  eprint       = {2406.04313},
  timestamp    = {Sat, 13 Jul 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-04313.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2408.00761) of the tamper-resistant safeguards paper.
% "LLMs" is brace-protected in the title so that sentence-casing styles do
% not lowercase the acronym.
@article{DBLP:journals/corr/abs-2408-00761,
  author       = {Rishub Tamirisa and
                  Bhrugu Bharathi and
                  Long Phan and
                  Andy Zhou and
                  Alice Gatti and
                  Tarun Suresh and
                  Maxwell Lin and
                  Justin Wang and
                  Rowan Wang and
                  Ron Arel and
                  Andy Zou and
                  Dawn Song and
                  Bo Li and
                  Dan Hendrycks and
                  Mantas Mazeika},
  title        = {Tamper-Resistant Safeguards for Open-Weight {LLMs}},
  journal      = {CoRR},
  volume       = {abs/2408.00761},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2408.00761},
  doi          = {10.48550/ARXIV.2408.00761},
  eprinttype   = {arXiv},
  eprint       = {2408.00761},
  timestamp    = {Mon, 09 Sep 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2408-00761.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint (CoRR abs/2410.09024) of the AgentHarm benchmark paper.
% "AgentHarm" is brace-protected in the title so that sentence-casing styles
% do not print it as "Agentharm".
% The ninth author is written "J. Zico Kolter", the spelling used by the
% ICLR 2025 record of the same paper (DBLP:conf/iclr/AndriushchenkoS25) and
% by most other records in this file.
@article{DBLP:journals/corr/abs-2410-09024,
  author       = {Maksym Andriushchenko and
                  Alexandra Souly and
                  Mateusz Dziemian and
                  Derek Duenas and
                  Maxwell Lin and
                  Justin Wang and
                  Dan Hendrycks and
                  Andy Zou and
                  J. Zico Kolter and
                  Matt Fredrikson and
                  Eric Winsor and
                  Jerome Wynne and
                  Yarin Gal and
                  Xander Davies},
  title        = {{AgentHarm}: {A} Benchmark for Measuring Harmfulness of {LLM} Agents},
  journal      = {CoRR},
  volume       = {abs/2410.09024},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2410.09024},
  doi          = {10.48550/ARXIV.2410.09024},
  eprinttype   = {arXiv},
  eprint       = {2410.09024},
  timestamp    = {Fri, 22 Nov 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2410-09024.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Journal record (Trans. Mach. Learn. Res., 2023) of "Beyond the Imitation Game".
% "Imitation Game" is brace-protected: the title is otherwise sentence-cased, so the
% capitals are deliberate and must survive styles that lowercase titles.
@article{DBLP:journals/tmlr/SrivastavaRRSAF23,
  author       = {Aarohi Srivastava and
                  Abhinav Rastogi and
                  Abhishek Rao and
                  Abu Awal Md Shoeb and
                  Abubakar Abid and
                  Adam Fisch and
                  Adam R. Brown and
                  Adam Santoro and
                  Aditya Gupta and
                  Adri{\`{a}} Garriga{-}Alonso and
                  Agnieszka Kluska and
                  Aitor Lewkowycz and
                  Akshat Agarwal and
                  Alethea Power and
                  Alex Ray and
                  Alex Warstadt and
                  Alexander W. Kocurek and
                  Ali Safaya and
                  Ali Tazarv and
                  Alice Xiang and
                  Alicia Parrish and
                  Allen Nie and
                  Aman Hussain and
                  Amanda Askell and
                  Amanda Dsouza and
                  Ambrose Slone and
                  Ameet Rahane and
                  Anantharaman S. Iyer and
                  Anders Andreassen and
                  Andrea Madotto and
                  Andrea Santilli and
                  Andreas Stuhlm{\"{u}}ller and
                  Andrew M. Dai and
                  Andrew La and
                  Andrew K. Lampinen and
                  Andy Zou and
                  Angela Jiang and
                  Angelica Chen and
                  Anh Vuong and
                  Animesh Gupta and
                  Anna Gottardi and
                  Antonio Norelli and
                  Anu Venkatesh and
                  Arash Gholamidavoodi and
                  Arfa Tabassum and
                  Arul Menezes and
                  Arun Kirubarajan and
                  Asher Mullokandov and
                  Ashish Sabharwal and
                  Austin Herrick and
                  Avia Efrat and
                  Aykut Erdem and
                  Ayla Karakas and
                  B. Ryan Roberts and
                  Bao Sheng Loe and
                  Barret Zoph and
                  Bartlomiej Bojanowski and
                  Batuhan {\"{O}}zyurt and
                  Behnam Hedayatnia and
                  Behnam Neyshabur and
                  Benjamin Inden and
                  Benno Stein and
                  Berk Ekmekci and
                  Bill Yuchen Lin and
                  Blake Howald and
                  Bryan Orinion and
                  Cameron Diao and
                  Cameron Dour and
                  Catherine Stinson and
                  Cedrick Argueta and
                  C{\`{e}}sar Ferri Ram{\'{\i}}rez and
                  Chandan Singh and
                  Charles Rathkopf and
                  Chenlin Meng and
                  Chitta Baral and
                  Chiyu Wu and
                  Chris Callison{-}Burch and
                  Chris Waites and
                  Christian Voigt and
                  Christopher D. Manning and
                  Christopher Potts and
                  Cindy Ramirez and
                  Clara E. Rivera and
                  Clemencia Siro and
                  Colin Raffel and
                  Courtney Ashcraft and
                  Cristina Garbacea and
                  Damien Sileo and
                  Dan Garrette and
                  Dan Hendrycks and
                  Dan Kilman and
                  Dan Roth and
                  Daniel Freeman and
                  Daniel Khashabi and
                  Daniel Levy and
                  Daniel Mosegu{\'{\i}} Gonz{\'{a}}lez and
                  Danielle Perszyk and
                  Danny Hernandez and
                  Danqi Chen and
                  Daphne Ippolito and
                  Dar Gilboa and
                  David Dohan and
                  David Drakard and
                  David Jurgens and
                  Debajyoti Datta and
                  Deep Ganguli and
                  Denis Emelin and
                  Denis Kleyko and
                  Deniz Yuret and
                  Derek Chen and
                  Derek Tam and
                  Dieuwke Hupkes and
                  Diganta Misra and
                  Dilyar Buzan and
                  Dimitri Coelho Mollo and
                  Diyi Yang and
                  Dong{-}Ho Lee and
                  Dylan Schrader and
                  Ekaterina Shutova and
                  Ekin Dogus Cubuk and
                  Elad Segal and
                  Eleanor Hagerman and
                  Elizabeth Barnes and
                  Elizabeth Donoway and
                  Ellie Pavlick and
                  Emanuele Rodol{\`{a}} and
                  Emma Lam and
                  Eric Chu and
                  Eric Tang and
                  Erkut Erdem and
                  Ernie Chang and
                  Ethan A. Chi and
                  Ethan Dyer and
                  Ethan J. Jerzak and
                  Ethan Kim and
                  Eunice Engefu Manyasi and
                  Evgenii Zheltonozhskii and
                  Fanyue Xia and
                  Fatemeh Siar and
                  Fernando Mart{\'{\i}}nez{-}Plumed and
                  Francesca Happ{\'{e}} and
                  Fran{\c{c}}ois Chollet and
                  Frieda Rong and
                  Gaurav Mishra and
                  Genta Indra Winata and
                  Gerard de Melo and
                  Germ{\'{a}}n Kruszewski and
                  Giambattista Parascandolo and
                  Giorgio Mariani and
                  Gloria Wang and
                  Gonzalo Jaimovitch{-}L{\'{o}}pez and
                  Gregor Betz and
                  Guy Gur{-}Ari and
                  Hana Galijasevic and
                  Hannah Kim and
                  Hannah Rashkin and
                  Hannaneh Hajishirzi and
                  Harsh Mehta and
                  Hayden Bogar and
                  Henry Shevlin and
                  Hinrich Sch{\"{u}}tze and
                  Hiromu Yakura and
                  Hongming Zhang and
                  Hugh Mee Wong and
                  Ian Ng and
                  Isaac Noble and
                  Jaap Jumelet and
                  Jack Geissinger and
                  Jackson Kernion and
                  Jacob Hilton and
                  Jaehoon Lee and
                  Jaime Fern{\'{a}}ndez Fisac and
                  James B. Simon and
                  James Koppel and
                  James Zheng and
                  James Zou and
                  Jan Kocon and
                  Jana Thompson and
                  Janelle Wingfield and
                  Jared Kaplan and
                  Jarema Radom and
                  Jascha Sohl{-}Dickstein and
                  Jason Phang and
                  Jason Wei and
                  Jason Yosinski and
                  Jekaterina Novikova and
                  Jelle Bosscher and
                  Jennifer Marsh and
                  Jeremy Kim and
                  Jeroen Taal and
                  Jesse H. Engel and
                  Jesujoba Alabi and
                  Jiacheng Xu and
                  Jiaming Song and
                  Jillian Tang and
                  Joan Waweru and
                  John Burden and
                  John Miller and
                  John U. Balis and
                  Jonathan Batchelder and
                  Jonathan Berant and
                  J{\"{o}}rg Frohberg and
                  Jos Rozen and
                  Jos{\'{e}} Hern{\'{a}}ndez{-}Orallo and
                  Joseph Boudeman and
                  Joseph Guerr and
                  Joseph Jones and
                  Joshua B. Tenenbaum and
                  Joshua S. Rule and
                  Joyce Chua and
                  Kamil Kanclerz and
                  Karen Livescu and
                  Karl Krauth and
                  Karthik Gopalakrishnan and
                  Katerina Ignatyeva and
                  Katja Markert and
                  Kaustubh D. Dhole and
                  Kevin Gimpel and
                  Kevin Omondi and
                  Kory W. Mathewson and
                  Kristen Chiafullo and
                  Ksenia Shkaruta and
                  Kumar Shridhar and
                  Kyle McDonell and
                  Kyle Richardson and
                  Laria Reynolds and
                  Leo Gao and
                  Li Zhang and
                  Liam Dugan and
                  Lianhui Qin and
                  Lidia Contreras Ochando and
                  Louis{-}Philippe Morency and
                  Luca Moschella and
                  Lucas Lam and
                  Lucy Noble and
                  Ludwig Schmidt and
                  Luheng He and
                  Luis Oliveros Col{\'{o}}n and
                  Luke Metz and
                  L{\"{u}}tfi Kerem Senel and
                  Maarten Bosma and
                  Maarten Sap and
                  Maartje ter Hoeve and
                  Maheen Farooqi and
                  Manaal Faruqui and
                  Mantas Mazeika and
                  Marco Baturan and
                  Marco Marelli and
                  Marco Maru and
                  Mar{\'{\i}}a Jos{\'{e}} Ram{\'{\i}}rez{-}Quintana and
                  Marie Tolkiehn and
                  Mario Giulianelli and
                  Martha Lewis and
                  Martin Potthast and
                  Matthew L. Leavitt and
                  Matthias Hagen and
                  M{\'{a}}ty{\'{a}}s Schubert and
                  Medina Baitemirova and
                  Melody Arnaud and
                  Melvin McElrath and
                  Michael A. Yee and
                  Michael Cohen and
                  Michael Gu and
                  Michael I. Ivanitskiy and
                  Michael Starritt and
                  Michael Strube and
                  Michal Swedrowski and
                  Michele Bevilacqua and
                  Michihiro Yasunaga and
                  Mihir Kale and
                  Mike Cain and
                  Mimee Xu and
                  Mirac Suzgun and
                  Mitch Walker and
                  Mo Tiwari and
                  Mohit Bansal and
                  Moin Aminnaseri and
                  Mor Geva and
                  Mozhdeh Gheini and
                  Mukund Varma T. and
                  Nanyun Peng and
                  Nathan A. Chi and
                  Nayeon Lee and
                  Neta Gur{-}Ari Krakover and
                  Nicholas Cameron and
                  Nicholas Roberts and
                  Nick Doiron and
                  Nicole Martinez and
                  Nikita Nangia and
                  Niklas Deckers and
                  Niklas Muennighoff and
                  Nitish Shirish Keskar and
                  Niveditha Iyer and
                  Noah Constant and
                  Noah Fiedel and
                  Nuan Wen and
                  Oliver Zhang and
                  Omar Agha and
                  Omar Elbaghdadi and
                  Omer Levy and
                  Owain Evans and
                  Pablo Antonio Moreno Casares and
                  Parth Doshi and
                  Pascale Fung and
                  Paul Pu Liang and
                  Paul Vicol and
                  Pegah Alipoormolabashi and
                  Peiyuan Liao and
                  Percy Liang and
                  Peter Chang and
                  Peter Eckersley and
                  Phu Mon Htut and
                  Pinyu Hwang and
                  Piotr Milkowski and
                  Piyush Patil and
                  Pouya Pezeshkpour and
                  Priti Oli and
                  Qiaozhu Mei and
                  Qing Lyu and
                  Qinlang Chen and
                  Rabin Banjade and
                  Rachel Etta Rudolph and
                  Raefer Gabriel and
                  Rahel Habacker and
                  Ramon Risco and
                  Rapha{\"{e}}l Milli{\`{e}}re and
                  Rhythm Garg and
                  Richard Barnes and
                  Rif A. Saurous and
                  Riku Arakawa and
                  Robbe Raymaekers and
                  Robert Frank and
                  Rohan Sikand and
                  Roman Novak and
                  Roman Sitelew and
                  Ronan LeBras and
                  Rosanne Liu and
                  Rowan Jacobs and
                  Rui Zhang and
                  Ruslan Salakhutdinov and
                  Ryan Chi and
                  Ryan Lee and
                  Ryan Stovall and
                  Ryan Teehan and
                  Rylan Yang and
                  Sahib Singh and
                  Saif M. Mohammad and
                  Sajant Anand and
                  Sam Dillavou and
                  Sam Shleifer and
                  Sam Wiseman and
                  Samuel Gruetter and
                  Samuel R. Bowman and
                  Samuel S. Schoenholz and
                  Sanghyun Han and
                  Sanjeev Kwatra and
                  Sarah A. Rous and
                  Sarik Ghazarian and
                  Sayan Ghosh and
                  Sean Casey and
                  Sebastian Bischoff and
                  Sebastian Gehrmann and
                  Sebastian Schuster and
                  Sepideh Sadeghi and
                  Shadi Hamdan and
                  Sharon Zhou and
                  Shashank Srivastava and
                  Sherry Shi and
                  Shikhar Singh and
                  Shima Asaadi and
                  Shixiang Shane Gu and
                  Shubh Pachchigar and
                  Shubham Toshniwal and
                  Shyam Upadhyay and
                  Shyamolima (Shammie) Debnath and
                  Siamak Shakeri and
                  Simon Thormeyer and
                  Simone Melzi and
                  Siva Reddy and
                  Sneha Priscilla Makini and
                  Soo{-}Hwan Lee and
                  Spencer Torene and
                  Sriharsha Hatwar and
                  Stanislas Dehaene and
                  Stefan Divic and
                  Stefano Ermon and
                  Stella Biderman and
                  Stephanie Lin and
                  Stephen Prasad and
                  Steven T. Piantadosi and
                  Stuart M. Shieber and
                  Summer Misherghi and
                  Svetlana Kiritchenko and
                  Swaroop Mishra and
                  Tal Linzen and
                  Tal Schuster and
                  Tao Li and
                  Tao Yu and
                  Tariq Ali and
                  Tatsu Hashimoto and
                  Te{-}Lin Wu and
                  Th{\'{e}}o Desbordes and
                  Theodore Rothschild and
                  Thomas Phan and
                  Tianle Wang and
                  Tiberius Nkinyili and
                  Timo Schick and
                  Timofei Kornev and
                  Titus Tunduny and
                  Tobias Gerstenberg and
                  Trenton Chang and
                  Trishala Neeraj and
                  Tushar Khot and
                  Tyler Shultz and
                  Uri Shaham and
                  Vedant Misra and
                  Vera Demberg and
                  Victoria Nyamai and
                  Vikas Raunak and
                  Vinay V. Ramasesh and
                  Vinay Uday Prabhu and
                  Vishakh Padmakumar and
                  Vivek Srikumar and
                  William Fedus and
                  William Saunders and
                  William Zhang and
                  Wout Vossen and
                  Xiang Ren and
                  Xiaoyu Tong and
                  Xinran Zhao and
                  Xinyi Wu and
                  Xudong Shen and
                  Yadollah Yaghoobzadeh and
                  Yair Lakretz and
                  Yangqiu Song and
                  Yasaman Bahri and
                  Yejin Choi and
                  Yichi Yang and
                  Yiding Hao and
                  Yifu Chen and
                  Yonatan Belinkov and
                  Yu Hou and
                  Yufang Hou and
                  Yuntao Bai and
                  Zachary Seid and
                  Zhuoye Zhao and
                  Zijian Wang and
                  Zijie J. Wang and
                  Zirui Wang and
                  Ziyi Wu},
  title        = {Beyond the {Imitation Game}: Quantifying and extrapolating the capabilities
                  of language models},
  journal      = {Trans. Mach. Learn. Res.},
  volume       = {2023},
  year         = {2023},
  url          = {https://openreview.net/forum?id=uyTL5Bvosj},
  timestamp    = {Thu, 20 Nov 2025 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/tmlr/SrivastavaRRSAF23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICML 2023 proceedings record (PMLR) for the Machiavelli benchmark paper.
% "Machiavelli" is brace-protected as a proper noun; the PMLR volume number is
% taken from the /v202/ segment of the url field.
@inproceedings{DBLP:conf/icml/PanCZLBWZEH23,
  author       = {Alexander Pan and
                  Jun Shern Chan and
                  Andy Zou and
                  Nathaniel Li and
                  Steven Basart and
                  Thomas Woodside and
                  Hanlin Zhang and
                  Scott Emmons and
                  Dan Hendrycks},
  editor       = {Andreas Krause and
                  Emma Brunskill and
                  Kyunghyun Cho and
                  Barbara Engelhardt and
                  Sivan Sabato and
                  Jonathan Scarlett},
  title        = {Do the Rewards Justify the Means? Measuring Trade-Offs Between Rewards
                  and Ethical Behavior in the {Machiavelli} Benchmark},
  booktitle    = {International Conference on Machine Learning, {ICML} 2023, 23-29 July
                  2023, Honolulu, Hawaii, {USA}},
  series       = {Proceedings of Machine Learning Research},
  volume       = {202},
  pages        = {26837--26867},
  publisher    = {{PMLR}},
  year         = {2023},
  url          = {https://proceedings.mlr.press/v202/pan23a.html},
  timestamp    = {Tue, 17 Jun 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icml/PanCZLBWZEH23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2023 proceedings record.
% "ImageNet" is brace-protected so sentence-casing styles keep its camelCase.
@inproceedings{DBLP:conf/nips/HuZWLF23,
  author       = {Kai Hu and
                  Andy Zou and
                  Zifan Wang and
                  Klas Leino and
                  Matt Fredrikson},
  editor       = {Alice Oh and
                  Tristan Naumann and
                  Amir Globerson and
                  Kate Saenko and
                  Moritz Hardt and
                  Sergey Levine},
  title        = {Unlocking Deterministic Robustness Certification on {ImageNet}},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/863da9d40547f1d1b18859519ce2dee4-Abstract-Conference.html},
  timestamp    = {Mon, 19 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/HuZWLF23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv (CoRR) preprint record, 2023.
% "ImageNet" is brace-protected so sentence-casing styles keep its camelCase.
@article{DBLP:journals/corr/abs-2301-12549,
  author       = {Kai Hu and
                  Andy Zou and
                  Zifan Wang and
                  Klas Leino and
                  Matt Fredrikson},
  title        = {Scaling in Depth: Unlocking Robustness Certification on {ImageNet}},
  journal      = {CoRR},
  volume       = {abs/2301.12549},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2301.12549},
  doi          = {10.48550/ARXIV.2301.12549},
  eprinttype   = {arXiv},
  eprint       = {2301.12549},
  timestamp    = {Sun, 06 Oct 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2301-12549.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv (CoRR) preprint record, 2023.
% Author names use the explicit "Last, First" form; field values are unchanged.
@article{DBLP:journals/corr/abs-2303-06189,
  author     = {Kripa, Ram M. and Zou, Andy and Jia, Ryan and Huang, Kenny},
  title      = {Papaya: Federated Learning, but Fully Decentralized},
  journal    = {CoRR},
  volume     = {abs/2303.06189},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2303.06189},
  doi        = {10.48550/ARXIV.2303.06189},
  eprinttype = {arXiv},
  eprint     = {2303.06189},
  timestamp  = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2303-06189.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv (CoRR) preprint record, 2023, for the MACHIAVELLI benchmark paper.
% Author names use the explicit "Last, First" form; field values are unchanged.
@article{DBLP:journals/corr/abs-2304-03279,
  author     = {Pan, Alexander and
                Chan, Jun Shern and
                Zou, Andy and
                Li, Nathaniel and
                Basart, Steven and
                Woodside, Thomas and
                Ng, Jonathan and
                Zhang, Hanlin and
                Emmons, Scott and
                Hendrycks, Dan},
  title      = {Do the Rewards Justify the Means? Measuring Trade-Offs Between Rewards and Ethical Behavior in the {MACHIAVELLI} Benchmark},
  journal    = {CoRR},
  volume     = {abs/2304.03279},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2304.03279},
  doi        = {10.48550/ARXIV.2304.03279},
  eprinttype = {arXiv},
  eprint     = {2304.03279},
  timestamp  = {Tue, 17 Jun 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2304-03279.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv (CoRR) preprint record, 2023.
% Author names use the explicit "Last, First" form; field values are unchanged.
@article{DBLP:journals/corr/abs-2307-15043,
  author     = {Zou, Andy and Wang, Zifan and Kolter, J. Zico and Fredrikson, Matt},
  title      = {Universal and Transferable Adversarial Attacks on Aligned Language Models},
  journal    = {CoRR},
  volume     = {abs/2307.15043},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2307.15043},
  doi        = {10.48550/ARXIV.2307.15043},
  eprinttype = {arXiv},
  eprint     = {2307.15043},
  timestamp  = {Sun, 06 Oct 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2307-15043.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv (CoRR) preprint record, 2023.
% Author names use the explicit "Last, First" form; field values are unchanged.
@article{DBLP:journals/corr/abs-2310-01405,
  author     = {Zou, Andy and
                Phan, Long and
                Chen, Sarah Li and
                Campbell, James and
                Guo, Phillip and
                Ren, Richard and
                Pan, Alexander and
                Yin, Xuwang and
                Mazeika, Mantas and
                Dombrowski, Ann{-}Kathrin and
                Goel, Shashwat and
                Li, Nathaniel and
                Byun, Michael J. and
                Wang, Zifan and
                Mallen, Alex and
                Basart, Steven and
                Koyejo, Sanmi and
                Song, Dawn and
                Fredrikson, Matt and
                Kolter, J. Zico and
                Hendrycks, Dan},
  title      = {Representation Engineering: {A} Top-Down Approach to {AI} Transparency},
  journal    = {CoRR},
  volume     = {abs/2310.01405},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2310.01405},
  doi        = {10.48550/ARXIV.2310.01405},
  eprinttype = {arXiv},
  eprint     = {2310.01405},
  timestamp  = {Fri, 13 Jun 2025 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2310-01405.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% CVPR 2022 proceedings record (IEEE).
% "PixMix" is brace-protected so sentence-casing styles keep its camelCase.
@inproceedings{DBLP:conf/cvpr/HendrycksZMTLSS22,
  author       = {Dan Hendrycks and
                  Andy Zou and
                  Mantas Mazeika and
                  Leonard Tang and
                  Bo Li and
                  Dawn Song and
                  Jacob Steinhardt},
  title        = {{PixMix}: Dreamlike Pictures Comprehensively Improve Safety Measures},
  booktitle    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition,
                  {CVPR} 2022, New Orleans, LA, USA, June 18-24, 2022},
  pages        = {16762--16771},
  publisher    = {{IEEE}},
  year         = {2022},
  url          = {https://doi.org/10.1109/CVPR52688.2022.01628},
  doi          = {10.1109/CVPR52688.2022.01628},
  timestamp    = {Sun, 19 Jan 2025 13:39:04 +0100},
  biburl       = {https://dblp.org/rec/conf/cvpr/HendrycksZMTLSS22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICML 2022 proceedings record (PMLR).
% The PMLR volume number is taken from the /v162/ segment of the url field, so
% the series field is no longer left without its volume.
@inproceedings{DBLP:conf/icml/HendrycksBMZKMS22,
  author       = {Dan Hendrycks and
                  Steven Basart and
                  Mantas Mazeika and
                  Andy Zou and
                  Joseph Kwon and
                  Mohammadreza Mostajabi and
                  Jacob Steinhardt and
                  Dawn Song},
  editor       = {Kamalika Chaudhuri and
                  Stefanie Jegelka and
                  Le Song and
                  Csaba Szepesv{\'{a}}ri and
                  Gang Niu and
                  Sivan Sabato},
  title        = {Scaling Out-of-Distribution Detection for Real-World Settings},
  booktitle    = {International Conference on Machine Learning, {ICML} 2022, 17-23 July
                  2022, Baltimore, Maryland, {USA}},
  series       = {Proceedings of Machine Learning Research},
  volume       = {162},
  pages        = {8759--8773},
  publisher    = {{PMLR}},
  year         = {2022},
  url          = {https://proceedings.mlr.press/v162/hendrycks22a.html},
  timestamp    = {Tue, 12 Jul 2022 17:36:52 +0200},
  biburl       = {https://dblp.org/rec/conf/icml/HendrycksBMZKMS22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2022 proceedings record.
% Editor given names are stored in full instead of as initials, so styles can
% abbreviate them consistently with the other editors.
% NOTE(review): "Shakir" and "Alekh" come from the NeurIPS 2022 committee list,
% not from this file -- confirm against the proceedings front matter.
@inproceedings{DBLP:conf/nips/MazeikaTZBCSFSH22,
  author       = {Mantas Mazeika and
                  Eric Tang and
                  Andy Zou and
                  Steven Basart and
                  Jun Shern Chan and
                  Dawn Song and
                  David A. Forsyth and
                  Jacob Steinhardt and
                  Dan Hendrycks},
  editor       = {Sanmi Koyejo and
                  Shakir Mohamed and
                  Alekh Agarwal and
                  Danielle Belgrave and
                  Kyunghyun Cho and
                  Alice Oh},
  title        = {How Would The Viewer Feel? Estimating Wellbeing From Video Scenarios},
  booktitle    = {Advances in Neural Information Processing Systems 35: Annual Conference
                  on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans,
                  LA, USA, November 28 - December 9, 2022},
  year         = {2022},
  url          = {http://papers.nips.cc/paper\_files/paper/2022/hash/75ff01252ab45ce278cb060effce4ca1-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp    = {Mon, 08 Jan 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/MazeikaTZBCSFSH22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2022 proceedings record.
% Editor given names are stored in full instead of as initials, so styles can
% abbreviate them consistently with the other editors.
% NOTE(review): "Shakir" and "Alekh" come from the NeurIPS 2022 committee list,
% not from this file -- confirm against the proceedings front matter.
@inproceedings{DBLP:conf/nips/ZouXJKMLSSEH22,
  author       = {Andy Zou and
                  Tristan Xiao and
                  Ryan Jia and
                  Joe Kwon and
                  Mantas Mazeika and
                  Richard Li and
                  Dawn Song and
                  Jacob Steinhardt and
                  Owain Evans and
                  Dan Hendrycks},
  editor       = {Sanmi Koyejo and
                  Shakir Mohamed and
                  Alekh Agarwal and
                  Danielle Belgrave and
                  Kyunghyun Cho and
                  Alice Oh},
  title        = {Forecasting Future World Events With Neural Networks},
  booktitle    = {Advances in Neural Information Processing Systems 35: Annual Conference
                  on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans,
                  LA, USA, November 28 - December 9, 2022},
  year         = {2022},
  url          = {http://papers.nips.cc/paper\_files/paper/2022/hash/aec870a6772336c15dac992c16f2e7c9-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp    = {Mon, 08 Jan 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/nips/ZouXJKMLSSEH22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CoRR (arXiv 2206.04615) record for "Beyond the Imitation Game: Quantifying
% and extrapolating the capabilities of language models".
% The author field lists 450 contributors, one per line, in "First Last" form
% joined by "and"; keep that layout when editing so the list stays diffable.
@article{DBLP:journals/corr/abs-2206-04615,
  author       = {Aarohi Srivastava and
                  Abhinav Rastogi and
                  Abhishek Rao and
                  Abu Awal Md Shoeb and
                  Abubakar Abid and
                  Adam Fisch and
                  Adam R. Brown and
                  Adam Santoro and
                  Aditya Gupta and
                  Adri{\`{a}} Garriga{-}Alonso and
                  Agnieszka Kluska and
                  Aitor Lewkowycz and
                  Akshat Agarwal and
                  Alethea Power and
                  Alex Ray and
                  Alex Warstadt and
                  Alexander W. Kocurek and
                  Ali Safaya and
                  Ali Tazarv and
                  Alice Xiang and
                  Alicia Parrish and
                  Allen Nie and
                  Aman Hussain and
                  Amanda Askell and
                  Amanda Dsouza and
                  Ambrose Slone and
                  Ameet Rahane and
                  Anantharaman S. Iyer and
                  Anders Andreassen and
                  Andrea Madotto and
                  Andrea Santilli and
                  Andreas Stuhlm{\"{u}}ller and
                  Andrew M. Dai and
                  Andrew La and
                  Andrew K. Lampinen and
                  Andy Zou and
                  Angela Jiang and
                  Angelica Chen and
                  Anh Vuong and
                  Animesh Gupta and
                  Anna Gottardi and
                  Antonio Norelli and
                  Anu Venkatesh and
                  Arash Gholamidavoodi and
                  Arfa Tabassum and
                  Arul Menezes and
                  Arun Kirubarajan and
                  Asher Mullokandov and
                  Ashish Sabharwal and
                  Austin Herrick and
                  Avia Efrat and
                  Aykut Erdem and
                  Ayla Karakas and
                  B. Ryan Roberts and
                  Bao Sheng Loe and
                  Barret Zoph and
                  Bartlomiej Bojanowski and
                  Batuhan {\"{O}}zyurt and
                  Behnam Hedayatnia and
                  Behnam Neyshabur and
                  Benjamin Inden and
                  Benno Stein and
                  Berk Ekmekci and
                  Bill Yuchen Lin and
                  Blake Howald and
                  Bryan Orinion and
                  Cameron Diao and
                  Cameron Dour and
                  Catherine Stinson and
                  Cedrick Argueta and
                  C{\`{e}}sar Ferri Ram{\'{\i}}rez and
                  Chandan Singh and
                  Charles Rathkopf and
                  Chenlin Meng and
                  Chitta Baral and
                  Chiyu Wu and
                  Chris Callison{-}Burch and
                  Chris Waites and
                  Christian Voigt and
                  Christopher D. Manning and
                  Christopher Potts and
                  Cindy Ramirez and
                  Clara E. Rivera and
                  Clemencia Siro and
                  Colin Raffel and
                  Courtney Ashcraft and
                  Cristina Garbacea and
                  Damien Sileo and
                  Dan Garrette and
                  Dan Hendrycks and
                  Dan Kilman and
                  Dan Roth and
                  Daniel Freeman and
                  Daniel Khashabi and
                  Daniel Levy and
                  Daniel Mosegu{\'{\i}} Gonz{\'{a}}lez and
                  Danielle Perszyk and
                  Danny Hernandez and
                  Danqi Chen and
                  Daphne Ippolito and
                  Dar Gilboa and
                  David Dohan and
                  David Drakard and
                  David Jurgens and
                  Debajyoti Datta and
                  Deep Ganguli and
                  Denis Emelin and
                  Denis Kleyko and
                  Deniz Yuret and
                  Derek Chen and
                  Derek Tam and
                  Dieuwke Hupkes and
                  Diganta Misra and
                  Dilyar Buzan and
                  Dimitri Coelho Mollo and
                  Diyi Yang and
                  Dong{-}Ho Lee and
                  Dylan Schrader and
                  Ekaterina Shutova and
                  Ekin Dogus Cubuk and
                  Elad Segal and
                  Eleanor Hagerman and
                  Elizabeth Barnes and
                  Elizabeth Donoway and
                  Ellie Pavlick and
                  Emanuele Rodol{\`{a}} and
                  Emma Lam and
                  Eric Chu and
                  Eric Tang and
                  Erkut Erdem and
                  Ernie Chang and
                  Ethan A. Chi and
                  Ethan Dyer and
                  Ethan J. Jerzak and
                  Ethan Kim and
                  Eunice Engefu Manyasi and
                  Evgenii Zheltonozhskii and
                  Fanyue Xia and
                  Fatemeh Siar and
                  Fernando Mart{\'{\i}}nez{-}Plumed and
                  Francesca Happ{\'{e}} and
                  Fran{\c{c}}ois Chollet and
                  Frieda Rong and
                  Gaurav Mishra and
                  Genta Indra Winata and
                  Gerard de Melo and
                  Germ{\'{a}}n Kruszewski and
                  Giambattista Parascandolo and
                  Giorgio Mariani and
                  Gloria Wang and
                  Gonzalo Jaimovitch{-}L{\'{o}}pez and
                  Gregor Betz and
                  Guy Gur{-}Ari and
                  Hana Galijasevic and
                  Hannah Kim and
                  Hannah Rashkin and
                  Hannaneh Hajishirzi and
                  Harsh Mehta and
                  Hayden Bogar and
                  Henry Shevlin and
                  Hinrich Sch{\"{u}}tze and
                  Hiromu Yakura and
                  Hongming Zhang and
                  Hugh Mee Wong and
                  Ian Ng and
                  Isaac Noble and
                  Jaap Jumelet and
                  Jack Geissinger and
                  Jackson Kernion and
                  Jacob Hilton and
                  Jaehoon Lee and
                  Jaime Fern{\'{a}}ndez Fisac and
                  James B. Simon and
                  James Koppel and
                  James Zheng and
                  James Zou and
                  Jan Kocon and
                  Jana Thompson and
                  Janelle Wingfield and
                  Jared Kaplan and
                  Jarema Radom and
                  Jascha Sohl{-}Dickstein and
                  Jason Phang and
                  Jason Wei and
                  Jason Yosinski and
                  Jekaterina Novikova and
                  Jelle Bosscher and
                  Jennifer Marsh and
                  Jeremy Kim and
                  Jeroen Taal and
                  Jesse H. Engel and
                  Jesujoba Alabi and
                  Jiacheng Xu and
                  Jiaming Song and
                  Jillian Tang and
                  Joan Waweru and
                  John Burden and
                  John Miller and
                  John U. Balis and
                  Jonathan Batchelder and
                  Jonathan Berant and
                  J{\"{o}}rg Frohberg and
                  Jos Rozen and
                  Jos{\'{e}} Hern{\'{a}}ndez{-}Orallo and
                  Joseph Boudeman and
                  Joseph Guerr and
                  Joseph Jones and
                  Joshua B. Tenenbaum and
                  Joshua S. Rule and
                  Joyce Chua and
                  Kamil Kanclerz and
                  Karen Livescu and
                  Karl Krauth and
                  Karthik Gopalakrishnan and
                  Katerina Ignatyeva and
                  Katja Markert and
                  Kaustubh D. Dhole and
                  Kevin Gimpel and
                  Kevin Omondi and
                  Kory W. Mathewson and
                  Kristen Chiafullo and
                  Ksenia Shkaruta and
                  Kumar Shridhar and
                  Kyle McDonell and
                  Kyle Richardson and
                  Laria Reynolds and
                  Leo Gao and
                  Li Zhang and
                  Liam Dugan and
                  Lianhui Qin and
                  Lidia Contreras Ochando and
                  Louis{-}Philippe Morency and
                  Luca Moschella and
                  Lucas Lam and
                  Lucy Noble and
                  Ludwig Schmidt and
                  Luheng He and
                  Luis Oliveros Col{\'{o}}n and
                  Luke Metz and
                  L{\"{u}}tfi Kerem Senel and
                  Maarten Bosma and
                  Maarten Sap and
                  Maartje ter Hoeve and
                  Maheen Farooqi and
                  Manaal Faruqui and
                  Mantas Mazeika and
                  Marco Baturan and
                  Marco Marelli and
                  Marco Maru and
                  Mar{\'{\i}}a Jos{\'{e}} Ram{\'{\i}}rez{-}Quintana and
                  Marie Tolkiehn and
                  Mario Giulianelli and
                  Martha Lewis and
                  Martin Potthast and
                  Matthew L. Leavitt and
                  Matthias Hagen and
                  M{\'{a}}ty{\'{a}}s Schubert and
                  Medina Baitemirova and
                  Melody Arnaud and
                  Melvin McElrath and
                  Michael A. Yee and
                  Michael Cohen and
                  Michael Gu and
                  Michael I. Ivanitskiy and
                  Michael Starritt and
                  Michael Strube and
                  Michal Swedrowski and
                  Michele Bevilacqua and
                  Michihiro Yasunaga and
                  Mihir Kale and
                  Mike Cain and
                  Mimee Xu and
                  Mirac Suzgun and
                  Mitch Walker and
                  Mo Tiwari and
                  Mohit Bansal and
                  Moin Aminnaseri and
                  Mor Geva and
                  Mozhdeh Gheini and
                  Mukund Varma T. and
                  Nanyun Peng and
                  Nathan A. Chi and
                  Nayeon Lee and
                  Neta Gur{-}Ari Krakover and
                  Nicholas Cameron and
                  Nicholas Roberts and
                  Nick Doiron and
                  Nicole Martinez and
                  Nikita Nangia and
                  Niklas Deckers and
                  Niklas Muennighoff and
                  Nitish Shirish Keskar and
                  Niveditha Iyer and
                  Noah Constant and
                  Noah Fiedel and
                  Nuan Wen and
                  Oliver Zhang and
                  Omar Agha and
                  Omar Elbaghdadi and
                  Omer Levy and
                  Owain Evans and
                  Pablo Antonio Moreno Casares and
                  Parth Doshi and
                  Pascale Fung and
                  Paul Pu Liang and
                  Paul Vicol and
                  Pegah Alipoormolabashi and
                  Peiyuan Liao and
                  Percy Liang and
                  Peter Chang and
                  Peter Eckersley and
                  Phu Mon Htut and
                  Pinyu Hwang and
                  Piotr Milkowski and
                  Piyush Patil and
                  Pouya Pezeshkpour and
                  Priti Oli and
                  Qiaozhu Mei and
                  Qing Lyu and
                  Qinlang Chen and
                  Rabin Banjade and
                  Rachel Etta Rudolph and
                  Raefer Gabriel and
                  Rahel Habacker and
                  Ramon Risco and
                  Rapha{\"{e}}l Milli{\`{e}}re and
                  Rhythm Garg and
                  Richard Barnes and
                  Rif A. Saurous and
                  Riku Arakawa and
                  Robbe Raymaekers and
                  Robert Frank and
                  Rohan Sikand and
                  Roman Novak and
                  Roman Sitelew and
                  Ronan LeBras and
                  Rosanne Liu and
                  Rowan Jacobs and
                  Rui Zhang and
                  Ruslan Salakhutdinov and
                  Ryan Chi and
                  Ryan Lee and
                  Ryan Stovall and
                  Ryan Teehan and
                  Rylan Yang and
                  Sahib Singh and
                  Saif M. Mohammad and
                  Sajant Anand and
                  Sam Dillavou and
                  Sam Shleifer and
                  Sam Wiseman and
                  Samuel Gruetter and
                  Samuel R. Bowman and
                  Samuel S. Schoenholz and
                  Sanghyun Han and
                  Sanjeev Kwatra and
                  Sarah A. Rous and
                  Sarik Ghazarian and
                  Sayan Ghosh and
                  Sean Casey and
                  Sebastian Bischoff and
                  Sebastian Gehrmann and
                  Sebastian Schuster and
                  Sepideh Sadeghi and
                  Shadi Hamdan and
                  Sharon Zhou and
                  Shashank Srivastava and
                  Sherry Shi and
                  Shikhar Singh and
                  Shima Asaadi and
                  Shixiang Shane Gu and
                  Shubh Pachchigar and
                  Shubham Toshniwal and
                  Shyam Upadhyay and
                  Shyamolima (Shammie) Debnath and
                  Siamak Shakeri and
                  Simon Thormeyer and
                  Simone Melzi and
                  Siva Reddy and
                  Sneha Priscilla Makini and
                  Soo{-}Hwan Lee and
                  Spencer Torene and
                  Sriharsha Hatwar and
                  Stanislas Dehaene and
                  Stefan Divic and
                  Stefano Ermon and
                  Stella Biderman and
                  Stephanie Lin and
                  Stephen Prasad and
                  Steven T. Piantadosi and
                  Stuart M. Shieber and
                  Summer Misherghi and
                  Svetlana Kiritchenko and
                  Swaroop Mishra and
                  Tal Linzen and
                  Tal Schuster and
                  Tao Li and
                  Tao Yu and
                  Tariq Ali and
                  Tatsu Hashimoto and
                  Te{-}Lin Wu and
                  Th{\'{e}}o Desbordes and
                  Theodore Rothschild and
                  Thomas Phan and
                  Tianle Wang and
                  Tiberius Nkinyili and
                  Timo Schick and
                  Timofei Kornev and
                  Titus Tunduny and
                  Tobias Gerstenberg and
                  Trenton Chang and
                  Trishala Neeraj and
                  Tushar Khot and
                  Tyler Shultz and
                  Uri Shaham and
                  Vedant Misra and
                  Vera Demberg and
                  Victoria Nyamai and
                  Vikas Raunak and
                  Vinay V. Ramasesh and
                  Vinay Uday Prabhu and
                  Vishakh Padmakumar and
                  Vivek Srikumar and
                  William Fedus and
                  William Saunders and
                  William Zhang and
                  Wout Vossen and
                  Xiang Ren and
                  Xiaoyu Tong and
                  Xinran Zhao and
                  Xinyi Wu and
                  Xudong Shen and
                  Yadollah Yaghoobzadeh and
                  Yair Lakretz and
                  Yangqiu Song and
                  Yasaman Bahri and
                  Yejin Choi and
                  Yichi Yang and
                  Yiding Hao and
                  Yifu Chen and
                  Yonatan Belinkov and
                  Yu Hou and
                  Yufang Hou and
                  Yuntao Bai and
                  Zachary Seid and
                  Zhuoye Zhao and
                  Zijian Wang and
                  Zijie J. Wang and
                  Zirui Wang and
                  Ziyi Wu},
  title        = {Beyond the Imitation Game: Quantifying and extrapolating the capabilities
                  of language models},
  journal      = {CoRR},
  volume       = {abs/2206.04615},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2206.04615},
  doi          = {10.48550/ARXIV.2206.04615},
  eprinttype   = {arXiv},
  eprint       = {2206.04615},
  timestamp    = {Tue, 21 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2206-04615.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CoRR (arXiv 2206.15474) preprint record for "Forecasting Future World
% Events with Neural Networks". Author names are in "Last, First" form.
@article{DBLP:journals/corr/abs-2206-15474,
  author       = {Zou, Andy and
                  Xiao, Tristan and
                  Jia, Ryan and
                  Kwon, Joe and
                  Mazeika, Mantas and
                  Li, Richard and
                  Song, Dawn and
                  Steinhardt, Jacob and
                  Evans, Owain and
                  Hendrycks, Dan},
  title        = {Forecasting Future World Events with Neural Networks},
  journal      = {CoRR},
  volume       = {abs/2206.15474},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2206.15474},
  doi          = {10.48550/ARXIV.2206.15474},
  eprinttype   = {arXiv},
  eprint       = {2206.15474},
  timestamp    = {Tue, 07 Apr 2026 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2206-15474.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CoRR (arXiv 2210.10039) preprint record for "How Would The Viewer Feel?
% Estimating Wellbeing From Video Scenarios". Author names are in
% "Last, First" form.
@article{DBLP:journals/corr/abs-2210-10039,
  author       = {Mazeika, Mantas and
                  Tang, Eric and
                  Zou, Andy and
                  Basart, Steven and
                  Chan, Jun Shern and
                  Song, Dawn and
                  Forsyth, David A. and
                  Steinhardt, Jacob and
                  Hendrycks, Dan},
  title        = {How Would The Viewer Feel? Estimating Wellbeing From Video Scenarios},
  journal      = {CoRR},
  volume       = {abs/2210.10039},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2210.10039},
  doi          = {10.48550/ARXIV.2210.10039},
  eprinttype   = {arXiv},
  eprint       = {2210.10039},
  timestamp    = {Mon, 24 Oct 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2210-10039.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICLR 2021 proceedings record for "Measuring Massive Multitask Language
% Understanding". Author names are in "Last, First" form.
@inproceedings{DBLP:conf/iclr/HendrycksBBZMSS21,
  author       = {Hendrycks, Dan and
                  Burns, Collin and
                  Basart, Steven and
                  Zou, Andy and
                  Mazeika, Mantas and
                  Song, Dawn and
                  Steinhardt, Jacob},
  title        = {Measuring Massive Multitask Language Understanding},
  booktitle    = {9th International Conference on Learning Representations, {ICLR} 2021,
                  Virtual Event, Austria, May 3-7, 2021},
  publisher    = {OpenReview.net},
  year         = {2021},
  url          = {https://openreview.net/forum?id=d7KBjmI3GmQ},
  timestamp    = {Wed, 23 Jun 2021 17:36:39 +0200},
  biburl       = {https://dblp.org/rec/conf/iclr/HendrycksBBZMSS21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS Datasets and Benchmarks 2021 record for "What Would Jiminy Cricket
% Do? Towards Agents That Behave Morally".
% The proper noun "Jiminy Cricket" is brace-protected in the title so that
% styles which sentence-case titles do not lowercase it.
@inproceedings{DBLP:conf/nips/HendrycksMZPZNS21,
  author       = {Dan Hendrycks and
                  Mantas Mazeika and
                  Andy Zou and
                  Sahil Patel and
                  Christine Zhu and
                  Jesus Navarro and
                  Dawn Song and
                  Bo Li and
                  Jacob Steinhardt},
  editor       = {Joaquin Vanschoren and
                  Sai{-}Kit Yeung},
  title        = {What Would {Jiminy Cricket} Do? Towards Agents That Behave Morally},
  booktitle    = {Proceedings of the Neural Information Processing Systems Track on
                  Datasets and Benchmarks 1, NeurIPS Datasets and Benchmarks 2021, December
                  2021, virtual},
  year         = {2021},
  url          = {https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/39059724f73a9969845dfe4146c5660e-Abstract-round2.html},
  timestamp    = {Thu, 05 May 2022 16:30:03 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/HendrycksMZPZNS21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2022 Competition Track record (Proceedings of Machine Learning
% Research) for "The Trojan Detection Challenge".
% The citation key keeps its "21" suffix so existing citations still resolve.
% The year and volume follow the booktitle (NeurIPS 2022, November 28 -
% December 9, 2022) and the PMLR URL (v220, mazeika22a).
@inproceedings{DBLP:conf/nips/MazeikaHLXHZRYWTTTSPBSPLF21,
  author       = {Mantas Mazeika and
                  Dan Hendrycks and
                  Huichen Li and
                  Xiaojun Xu and
                  Sidney Q. Hough and
                  Andy Zou and
                  Arezoo Rajabi and
                  Qi Yao and
                  Zihao Wang and
                  Jian Tian and
                  Yao Tang and
                  Di Tang and
                  Roman Smirnov and
                  Pavel Pleskov and
                  Nikita Benkovich and
                  Dawn Song and
                  Radha Poovendran and
                  Bo Li and
                  David A. Forsyth},
  editor       = {Marco Ciccone and
                  Gustavo Stolovitzky and
                  Jacob Albrecht},
  title        = {The Trojan Detection Challenge},
  booktitle    = {NeurIPS 2022 Competition Track, November 28 - December 9, 2022, Online},
  series       = {Proceedings of Machine Learning Research},
  volume       = {220},
  pages        = {279--291},
  publisher    = {{PMLR}},
  year         = {2022},
  url          = {https://proceedings.mlr.press/v220/mazeika22a.html},
  timestamp    = {Mon, 12 Aug 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/MazeikaHLXHZRYWTTTSPBSPLF21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CoRR (arXiv 2110.13136) preprint record for "What Would Jiminy Cricket Do?
% Towards Agents That Behave Morally".
% The proper noun "Jiminy Cricket" is brace-protected in the title so that
% styles which sentence-case titles do not lowercase it.
@article{DBLP:journals/corr/abs-2110-13136,
  author       = {Dan Hendrycks and
                  Mantas Mazeika and
                  Andy Zou and
                  Sahil Patel and
                  Christine Zhu and
                  Jesus Navarro and
                  Dawn Song and
                  Bo Li and
                  Jacob Steinhardt},
  title        = {What Would {Jiminy Cricket} Do? Towards Agents That Behave Morally},
  journal      = {CoRR},
  volume       = {abs/2110.13136},
  year         = {2021},
  url          = {https://arxiv.org/abs/2110.13136},
  eprinttype   = {arXiv},
  eprint       = {2110.13136},
  timestamp    = {Mon, 07 Mar 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2110-13136.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CoRR (arXiv 2112.05135) preprint record for "PixMix: Dreamlike Pictures
% Comprehensively Improve Safety Measures".
% The camel-case name "PixMix" is brace-protected in the title; without the
% braces, styles that sentence-case titles would render it as "Pixmix".
@article{DBLP:journals/corr/abs-2112-05135,
  author       = {Dan Hendrycks and
                  Andy Zou and
                  Mantas Mazeika and
                  Leonard Tang and
                  Bo Li and
                  Dawn Song and
                  Jacob Steinhardt},
  title        = {{PixMix}: Dreamlike Pictures Comprehensively Improve Safety Measures},
  journal      = {CoRR},
  volume       = {abs/2112.05135},
  year         = {2021},
  url          = {https://arxiv.org/abs/2112.05135},
  eprinttype   = {arXiv},
  eprint       = {2112.05135},
  timestamp    = {Mon, 07 Mar 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2112-05135.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CoRR (arXiv 2009.03300) preprint record for "Measuring Massive Multitask
% Language Understanding". Author names are in "Last, First" form.
@article{DBLP:journals/corr/abs-2009-03300,
  author       = {Hendrycks, Dan and
                  Burns, Collin and
                  Basart, Steven and
                  Zou, Andy and
                  Mazeika, Mantas and
                  Song, Dawn and
                  Steinhardt, Jacob},
  title        = {Measuring Massive Multitask Language Understanding},
  journal      = {CoRR},
  volume       = {abs/2009.03300},
  year         = {2020},
  url          = {https://arxiv.org/abs/2009.03300},
  eprinttype   = {arXiv},
  eprint       = {2009.03300},
  timestamp    = {Thu, 17 Sep 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2009-03300.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}