@inproceedings{6afb0789578b498f8d5a79891d45cf73,
title = "DETECTION IN COMPLEX SCENES USING RGB AND DEPTH MULTIMODAL FEATURE FUSION",
abstract = "Unlike RGB images, depth images are robust to complex scenes of densely planted orchards. In this paper, we propose a fruit detection method using a multimodal feature fusion module (MMFF) of RGB and depth images. A dual-stream convolutional neural network is adopted in our method for feature extraction to capture multi-scale information of RGB images and depth images based on feature pyramids. The multimodal feature fusion module can filter similar and different features between modalities to suppress the same features and fuse different features. In addition, we use a multi-scale feature fusion method to fuse more information and improve the accuracy of fruit detection. To validate the effectiveness of our method, experimental research is conducted on a self-created pear dataset with multiple modalities. Extensive experiments demonstrate that our proposed approach can achieve state-of-the-art performance at low computation cost.",
keywords = "Depth image, Feature fusion, Multimodality, Object detection, RGB-D",
author = "Shengli Yan and Yuan Rao and Wenhui Hou",
note = "Publisher Copyright: {\textcopyright} 2024 IEEE.; 2024 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2024 ; Conference date: 14-04-2024 Through 19-04-2024",
year = "2024",
month = jan,
day = "1",
doi = "10.1109/ICASSP48485.2024.10448205",
language = "English",
series = "ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings",
publisher = "Institute of Electrical and Electronics Engineers",
pages = "2495--2499",
booktitle = "2024 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2024 - Proceedings",
address = "United States",
}
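
The abstract describes suppressing features shared by the RGB and depth streams while fusing their complementary responses. The sketch below is a hypothetical illustration of that general idea, not the authors' released code: the module name SimpleRGBDFusion, the sigmoid similarity gate, and all layer choices are assumptions made for illustration only.

# Hypothetical sketch (assumed design, not the paper's MMFF): a generic
# RGB-D fusion block that down-weights features both modalities share and
# merges what each modality contributes uniquely.
import torch
import torch.nn as nn


class SimpleRGBDFusion(nn.Module):
    def __init__(self, channels: int):
        super().__init__()
        # Per-modality 1x1 projections before comparison.
        self.proj_rgb = nn.Conv2d(channels, channels, kernel_size=1)
        self.proj_depth = nn.Conv2d(channels, channels, kernel_size=1)
        # 3x3 convolution to merge the complementary responses.
        self.merge = nn.Conv2d(2 * channels, channels, kernel_size=3, padding=1)

    def forward(self, rgb_feat: torch.Tensor, depth_feat: torch.Tensor) -> torch.Tensor:
        rgb = self.proj_rgb(rgb_feat)
        depth = self.proj_depth(depth_feat)
        # Similarity gate: high where both modalities respond alike.
        similarity = torch.sigmoid(rgb * depth)
        # Suppress the shared part, keep each modality's unique contribution.
        rgb_unique = rgb * (1.0 - similarity)
        depth_unique = depth * (1.0 - similarity)
        return self.merge(torch.cat([rgb_unique, depth_unique], dim=1))


if __name__ == "__main__":
    # Toy usage with FPN-like feature maps from two backbone streams.
    fusion = SimpleRGBDFusion(channels=256)
    rgb_map = torch.randn(1, 256, 64, 64)
    depth_map = torch.randn(1, 256, 64, 64)
    print(fusion(rgb_map, depth_map).shape)  # torch.Size([1, 256, 64, 64])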