@inproceedings{9e9c78da662340d984f5694032f75d5c,
  title     = {Optimizing Vision-Language Model for Road Crossing Intention Estimation},
  abstract  = {Identifying a pedestrian's intention to cross the road is crucial for autonomous driving, as it alerts the system to stop or slow down. However, determining crossing intention from video is challenging due to the need for extracting complex high-level semantics. This paper introduces ClipCross, a novel classification framework optimized to extract high-level semantic features using the vision-language model CLIP for determining crossing intention. Existing CLIP-based methods perform poorly in this task, as CLIP's image and text encoders fail to capture the nuanced semantic distinctions between crossing and non-crossing intention images. ClipCross addresses this by optimizing a set of CLIP text embeddings to extract high-level semantic features, which a multi-layer perceptron uses to distinguish between crossing and non-crossing intentions. ClipCross achieves state-of-the-art performance on crossing intention estimation benchmark datasets: PIE, PSI, and JAAD.},
  keywords  = {autonomous driving, crossing intention, crossing prediction, scene understanding},
  author    = {Uziel, Roy and Bialer, Oded},
  note      = {Publisher Copyright: {\textcopyright} 2025 IEEE.; 2025 IEEE/CVF Winter Conference on Applications of Computer Vision, WACV 2025 ; Conference date: 28-02-2025 Through 04-03-2025},
  year      = {2025},
  month     = jan,
  day       = {1},
  doi       = {10.1109/WACV61041.2025.00173},
  language  = {English},
  series    = {Proceedings - 2025 IEEE Winter Conference on Applications of Computer Vision, WACV 2025},
  publisher = {Institute of Electrical and Electronics Engineers},
  pages     = {1702--1712},
  booktitle = {Proceedings - 2025 IEEE Winter Conference on Applications of Computer Vision, WACV 2025},
  address   = {United States},
}