% Journal article in Transactions on Machine Learning Research; the url field links to its OpenReview forum page.
% The benchmark name in the title is brace-protected so sentence-casing styles keep its capitalisation.
@article{xue2026pointitout,
  author  = {Xue, Haotian and Ge, Yunhao and Zeng, Yu and Li, Zhaoshuo and Liu, Ming-Yu and Chen, Yongxin and Fan, Jiaojiao},
  title   = {{Point-It-Out}: Benchmarking Embodied Reasoning for Vision Language Models in Multi-Stage Visual Grounding},
  journal = {Transactions on Machine Learning Research},
  issn    = {2835-8856},
  year    = {2026},
  url     = {https://openreview.net/forum?id=9e0hRhFsal},
}