@comment{
  NAACL 2025 Industry Track paper, cleaned up from a repository export.
  - booktitle carries the full proceedings title; the exported series field
    (labelled Long Papers) contradicted the DOI and was dropped.
  - month follows the conference dates recorded in eventdate; the exported
    1 January date appears to be an exporter placeholder (confirm against
    the publisher record).
  - copyright, eventtitle and eventdate hold what the export had packed into
    note; classic BibTeX styles ignore these fields.
}
@inproceedings{b6f0f825804448d8995caca427541b88,
  author     = {Kour, George and Zwerdling, Naama and Zalmanovici, Marcel and Anaby-Tavor, Ateret and Fandina, Ora Nova and Farchi, Eitan},
  title      = {Exploring Straightforward Methods for Automatic Conversational Red-Teaming},
  booktitle  = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)},
  editor     = {Chen, Weizhu and Yang, Yi and Kachuee, Mohammad and Fu, Xue-Yong},
  pages      = {112--128},
  publisher  = {Association for Computational Linguistics (ACL)},
  address    = {United States},
  year       = {2025},
  month      = apr,
  doi        = {10.18653/v1/2025.naacl-industry.10},
  language   = {English},
  eventtitle = {2025 Annual Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2025},
  eventdate  = {2025-04-29/2025-05-04},
  copyright  = {Publisher Copyright: {\textcopyright} 2025 Association for Computational Linguistics.},
  abstract   = {Large language models (LLMs) are increasingly used in business dialogue systems but they also pose security and ethical risks. Multi-turn conversations, in which context influences the model's behavior, can be exploited to generate undesired responses. In this paper, we investigate the use of off-the-shelf LLMs in conversational red-teaming settings, where an attacker LLM attempts to elicit undesired outputs from a target LLM. Our experiments address critical questions and offer valuable insights regarding the effectiveness of using LLMs as automated red-teamers, shedding light on key strategies and usage approaches that significantly impact their performance. Our findings demonstrate that off-the-shelf models can serve as effective red-teamers, capable of adapting their attack strategies based on prior attempts. Allowing these models to freely steer conversations and conceal their malicious intent further increases attack success. However, their effectiveness decreases as the alignment of the target model improves.},
}