% Journal article in Transactions on Machine Learning Research (2026);
% the url field points at the paper's OpenReview forum page and the
% note field carries the "J2C Certification" label verbatim.
@article{chen2026feedbackdriven,
  author  = {Xuan Chen and Yuzhou Nie and Lu Yan and Mingwei Zheng and Yunshu Mao and Wenbo Guo and Xiangyu Zhang},
  title   = {Feedback-Driven Black-Box Safety Alignment Testing of Large Language Models via Reinforcement Learning},
  journal = {Transactions on Machine Learning Research},
  year    = {2026},
  issn    = {2835-8856},
  url     = {https://openreview.net/forum?id=GWslY31w2b},
  note    = {J2C Certification},
}