Zhe Hu
Ph.D. Student from PolyU
Natural Language Processing.
Zhe Hu is a Ph.D. student at PolyU and a long-term collaborator of VU Lab.
Publications
- Spatial Intelligence in Vision-Language Models: A Comprehensive Survey. In TechRxiv, 2026.
% Survey preprint hosted on TechRxiv.
% status / pdf / website are non-standard fields; standard styles ignore them.
@article{liu2026spatial,
  author  = {Liu, Disheng and Liang, Tuo and Hu, Zhe and Peng, Jierui and Lu, Yiren and Xu, Yi and Fu, Yun and Yin, Yu},
  title   = {Spatial Intelligence in Vision-Language Models: A Comprehensive Survey},
  journal = {TechRxiv},
  year    = {2026},
  status  = {preprint},
  pdf     = {https://www.techrxiv.org/doi/full/10.36227/techrxiv.176231405.57942913/v2},
  website = {https://dishengll.github.io/Awesome-Spatial-VLMs/},
}
- Viva+: Human-Centered Situational Decision-Making. In Findings of the Association for Computational Linguistics: EMNLP, 2025.
% Findings of EMNLP 2025 paper.
% Typed as a proceedings contribution: the venue is a conference proceedings
% volume, so it is recorded in booktitle rather than journal.
% status / pdf / website are non-standard fields; standard styles ignore them.
@inproceedings{hu2025vivaplus,
  author    = {Hu, Zhe and Ren, Yixiao and Liu, Guang and Li, Jing and Yin, Yu},
  title     = {Viva+: Human-Centered Situational Decision-Making},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP},
  year      = {2025},
  status    = {accepted},
  pdf       = {https://arxiv.org/pdf/2509.23698.pdf},
  website   = {https://derekhu.com/project_page/viva_plus_website/},
}
- When ’YES’ Meets ’BUT’: Can Large Models Comprehend Contradictory Humor Through Comparative Reasoning? In arXiv preprint arXiv:2503.23137, 2025.
% arXiv preprint; the author list is truncated with "and others".
% eprint / archiveprefix carry the arXiv identifier in machine-readable form.
% journal is kept unchanged because the rendered citation line for this entry
% reproduces it verbatim.
@article{liang2025yesbut,
  author        = {Liang, Tuo and Hu, Zhe and Li, Jing and Zhang, Hao and Lu, Yiren and Zhou, Yunlai and Qiao, Yiran and Liu, Disheng and Peng, Jierui and Ma, Jing and others},
  title         = {When 'YES' Meets 'BUT': Can Large Models Comprehend Contradictory Humor Through Comparative Reasoning?},
  journal       = {arXiv preprint arXiv:2503.23137},
  year          = {2025},
  eprint        = {2503.23137},
  archiveprefix = {arXiv},
  status        = {preprint},
  pdf           = {https://arxiv.org/pdf/2503.23137.pdf},
}
- Praxis-vlm: Vision-grounded decision making via text-driven reinforcement learning. In Advances in Neural Information Processing Systems (NeurIPS), 2025.
% NeurIPS 2025 paper.
% Typed as a proceedings contribution: the venue is a conference, so it is
% recorded in booktitle rather than journal.
% Title capitalisation restored ("Praxis-VLM", as spelled in the code URL, and
% title case like the other entries); a style can downcase a title but cannot
% recover capitals that were never stored.
@inproceedings{hu2025praxis,
  author    = {Hu, Zhe and Li, Jing and Pu, Zhongzhu and Chan, Hou Pong and Yin, Yu},
  title     = {Praxis-VLM: Vision-Grounded Decision Making via Text-Driven Reinforcement Learning},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  year      = {2025},
  status    = {accepted},
  pdf       = {https://arxiv.org/pdf/2503.16965},
  code      = {https://github.com/Derekkk/Praxis-VLM},
}
- When Words Outperform Vision: VLMs Can Self-Improve via Text-Only Training for Human-Centered Decision Making. In arXiv e-prints, arXiv:2503.16965, 2025.
% arXiv e-print.
% eprint / archiveprefix carry the arXiv identifier in machine-readable form.
% journal is kept unchanged because the rendered citation line for this entry
% reproduces it verbatim.
% NOTE(review): status is "accepted" although the venue is an arXiv e-print, and
% the arXiv id 2503.16965 is the same one used by the pdf link of hu2025praxis.
% Confirm whether this should be "preprint" or is an earlier version of that work.
@article{hu2025whenwords,
  author        = {Hu, Zhe and Li, Jing and Yin, Yu},
  title         = {When Words Outperform Vision: VLMs Can Self-Improve via Text-Only Training for Human-Centered Decision Making},
  journal       = {arXiv e-prints, arXiv:2503.16965},
  year          = {2025},
  eprint        = {2503.16965},
  archiveprefix = {arXiv},
  status        = {accepted},
  pdf           = {https://arxiv.org/pdf/2503.16965.pdf},
}
- Debate-to-Write: A Persona-Driven Multi-Agent Framework for Diverse Argument Generation. In Proceedings of the 31st International Conference on Computational Linguistics (COLING), 2025.
% COLING 2025 (31st) proceedings paper.
% status / pdf / code are non-standard fields; standard styles ignore them.
@inproceedings{hu2025debatetowrite,
  author    = {Hu, Zhe and Chan, Hou Pong and Li, Jing and Yin, Yu},
  title     = {Debate-to-Write: A Persona-Driven Multi-Agent Framework for Diverse Argument Generation},
  booktitle = {Proceedings of the 31st International Conference on Computational Linguistics (COLING)},
  year      = {2025},
  status    = {accepted},
  pdf       = {https://arxiv.org/pdf/2406.19643},
  code      = {https://github.com/Derekkk/LLM4ArgGen},
}
- VIVA: A Benchmark for Vision-Grounded Decision-Making with Human Values. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2024.
% EMNLP 2024 main-conference paper introducing the VIVA benchmark.
% status / pdf / website / code / data are non-standard fields; standard styles
% ignore them.
@inproceedings{hu2024viva,
  author    = {Hu, Zhe and Ren, Yixiao and Li, Jing and Yin, Yu},
  title     = {VIVA: A Benchmark for Vision-Grounded Decision-Making with Human Values},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year      = {2024},
  status    = {accepted},
  pdf       = {https://aclanthology.org/2024.emnlp-main.137.pdf},
  website   = {https://derekhu.com/project_page/viva_website_emnlp24/},
  code      = {https://github.com/Derekkk/VIVA_EMNLP24},
  data      = {https://huggingface.co/datasets/zhehuderek/VIVA_Benchmark_EMNLP24},
}
- AMERICANO: Argument Generation with Discourse-Driven Decomposition and Multi-Agent Interaction. In Proceedings of the 17th International Natural Language Generation Conference (INLG), 2024.
% INLG 2024 (17th) proceedings paper.
% status / pdf are non-standard fields; standard styles ignore them.
@inproceedings{hu2024americano,
  author    = {Hu, Zhe and Chan, Hou Pong and Yin, Yu},
  title     = {AMERICANO: Argument Generation with Discourse-Driven Decomposition and Multi-Agent Interaction},
  booktitle = {Proceedings of the 17th International Natural Language Generation Conference (INLG)},
  year      = {2024},
  status    = {accepted},
  pdf       = {https://aclanthology.org/2024.inlg-main.8/},
}
- Cracking the Code of Juxtaposition: Can AI Models Understand the Humorous Contradictions. In Advances in Neural Information Processing Systems (NeurIPS), vol. 37, pp. 47166–47188, 2024.
% NeurIPS 2024 paper (volume 37, pages 47166--47188).
% Typed as a proceedings contribution: the venue is a conference, so it is
% recorded in booktitle rather than journal.
% NOTE(review): code is the same URL as the code link of hu2024viva
% (the VIVA_EMNLP24 repository), which looks like a copy-paste. Confirm the
% repository for this paper.
% NOTE(review): the dataset link field is named "dataset" here but "data" in
% hu2024viva. Confirm which name the page generator reads.
@inproceedings{hu2024cracking,
  author    = {Hu, Zhe and Liang, Tuo and Li, Jing and Lu, Yiren and Zhou, Yunlai and Qiao, Yiran and Ma, Jing and Yin, Yu},
  title     = {Cracking the Code of Juxtaposition: Can AI Models Understand the Humorous Contradictions},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  volume    = {37},
  pages     = {47166--47188},
  year      = {2024},
  status    = {accepted},
  pdf       = {https://openreview.net/pdf?id=bCMpdaQCNW},
  website   = {https://vulab-ai.github.io/YESBUT_Homepage/},
  dataset   = {https://huggingface.co/datasets/zhehuderek/YESBUT_Benchmark},
  code      = {https://github.com/Derekkk/VIVA_EMNLP24},
}
- Cautious Next Token Prediction. In Findings of the Association for Computational Linguistics: ACL 2025, Vienna, Austria, pp. 25685–25697, 2025.
% Findings of ACL 2025 paper; the text after the entry is its abstract.
% NOTE(review): the author list contains "Hu, Zhengmian" but no "Hu, Zhe".
% Confirm this entry belongs on this page.
@inproceedings{wang-etal-2025-cautious,
  author    = {Wang, Yizhou and Zhang, Lingzhi and Bai, Yue and Chiu, Mang Tik and Hu, Zhengmian and Zhang, Mingyuan and Dong, Qihua and Yin, Yu and Amirghodsi, Sohrab and Fu, Yun},
  title     = {Cautious Next Token Prediction},
  editor    = {Che, Wanxiang and Nabende, Joyce and Shutova, Ekaterina and Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  publisher = {Association for Computational Linguistics},
  address   = {Vienna, Austria},
  month     = jul,
  year      = {2025},
  pages     = {25685--25697},
  doi       = {10.18653/v1/2025.findings-acl.1318},
  isbn      = {979-8-89176-256-5},
  status    = {accepted},
  pdf       = {https://aclanthology.org/2025.findings-acl.1318/},
}
Next token prediction paradigm has been prevailing for autoregressive models in the era of LLMs. The current default sampling choice for popular LLMs is temperature scaling together with nucleus sampling to balance diversity and coherence. Nevertheless, such approach leads to inferior performance in various NLP tasks when the model is not certain about testing questions. To this end, we propose a brand new training-free decoding strategy, dubbed as Cautious Next Token Prediction (CNTP). In the decoding process, if the model has comparatively high prediction entropy at a certain step, we sample multiple trials starting from the step independently and stop when encountering any punctuation. Then we select the trial with the lowest perplexity score viewed as the most probable and reliable trial path given the model’s capacity. The trial number is negatively correlated with the prediction confidence, i.e., the less confident the model is, the more trials it should sample. This is consistent with human beings’ behaviour: when feeling uncertain or unconfident, one tends to think more creatively, exploring multiple thinking paths, to cautiously select the path one feels most confident about. 
Extensive experiments on both LLMs and MLLMs show that our proposed CNTP approach outperforms existing standard decoding strategies consistently by a clear margin. Moreover, the integration of CNTP with self consistency can further improve over vanilla self consistency. We believe our proposed CNTP has the potential to become one of the default choices for LLM decoding. Code is available at https://github.com/wyzjack/CNTP.