Foundational Challenges in Assuring Alignment and Safety of Large Language Models

Usman Anwar, Abulhair Saparov, Javier Rando, Daniel Paleka, Miles Turpin, Peter Hase, Ekdeep Singh Lubana, Erik Jenner, Stephen Casper, Oliver Sourbut, Benjamin L. Edelman, Zhaowei Zhang, Mario Günther, Anton Korinek, José Hernández-Orallo, Lewis Hammond, Eric J. Bigelow, Alexander Pan, Lauro Langosco, Tomasz Korbak, Heidi Chenyu Zhang, Ruiqi Zhong, Seán Ó hÉigeartaigh, Gabriel Recchia, Giulio Corsi, Alan Chan, Markus Anderljung, Lilian Edwards, Aleksandar Petrov, Christian Schröder de Witt, Sumeet Ramesh Motwani, Yoshua Bengio, Danqi Chen, Philip Torr, Samuel Albanie, Tegan Maharaj, Jakob Nicolaus Foerster, Florian Tramèr, He He, Atoosa Kasirzadeh, Yejin Choi, David Krueger. Foundational Challenges in Assuring Alignment and Safety of Large Language Models. Trans. Mach. Learn. Res., 2024, 2024. [doi]

% Journal article entry (TMLR, 2024).
% - Authors are stored in "Last, First" form so the name parser cannot
%   mis-split them; the compound surname "Schröder de Witt" is braced so the
%   lowercase particle "de" is not taken as a von-part.
% - Author names carry no numeric homonym suffixes (e.g. "0001"); such
%   suffixes would be parsed as the surname.
% - researchr / cites / citedby are site-specific fields; standard styles
%   ignore unknown fields.
@article{AnwarSRPTHLJCSE24,
  author    = {Anwar, Usman
               and Saparov, Abulhair
               and Rando, Javier
               and Paleka, Daniel
               and Turpin, Miles
               and Hase, Peter
               and Lubana, Ekdeep Singh
               and Jenner, Erik
               and Casper, Stephen
               and Sourbut, Oliver
               and Edelman, Benjamin L.
               and Zhang, Zhaowei
               and Günther, Mario
               and Korinek, Anton
               and Hernández-Orallo, José
               and Hammond, Lewis
               and Bigelow, Eric J.
               and Pan, Alexander
               and Langosco, Lauro
               and Korbak, Tomasz
               and Zhang, Heidi Chenyu
               and Zhong, Ruiqi
               and Ó hÉigeartaigh, Seán
               and Recchia, Gabriel
               and Corsi, Giulio
               and Chan, Alan
               and Anderljung, Markus
               and Edwards, Lilian
               and Petrov, Aleksandar
               and {Schröder de Witt}, Christian
               and Motwani, Sumeet Ramesh
               and Bengio, Yoshua
               and Chen, Danqi
               and Torr, Philip
               and Albanie, Samuel
               and Maharaj, Tegan
               and Foerster, Jakob Nicolaus
               and Tramèr, Florian
               and He, He
               and Kasirzadeh, Atoosa
               and Choi, Yejin
               and Krueger, David},
  title     = {Foundational Challenges in Assuring Alignment and Safety of Large Language Models},
  journal   = {Transactions on Machine Learning Research},
  volume    = {2024},
  year      = {2024},
  url       = {https://openreview.net/forum?id=oVTkOs8Pka},
  researchr = {https://researchr.org/publication/AnwarSRPTHLJCSE24},
  cites     = {0},
  citedby   = {0},
}