% Workshop paper published in CEUR Workshop Proceedings, vol. 2841
% (Workshops of the EDBT/ICDT 2021 Joint Conference).
% The exporter's free-text `note` has been split into structured fields
% (issn, eventtitle, eventdate, venue) so it is no longer printed verbatim in
% the bibliography; export-only metadata is kept in fields that standard
% styles do not print (copyright, internal-note).
@inproceedings{8536a95e7d8541f298fea2b4f5b5897d,
  author        = {Weintraub, Grisha and Gudes, Ehud and Dolev, Shlomi},
  title         = {Needle in a haystack queries in cloud data lakes},
  booktitle     = {Proceedings of the Workshops of the EDBT/ICDT 2021 Joint Conference, Nicosia, Cyprus, March 23, 2021},
  editor        = {Costa, Constantinos and Pitoura, Evaggelia},
  series        = {CEUR Workshop Proceedings},
  volume        = {2841},
  publisher     = {CEUR-WS.org},
  year          = {2021},
  issn          = {1613-0073},
  language      = {English},
  eventtitle    = {2021 Workshops of the EDBT/ICDT Joint Conference, EDBT/ICDT-WS 2021},
  eventdate     = {2021-03-23},
  venue         = {Nicosia, Cyprus},
  abstract      = {Cloud data lakes are a modern approach for storing large amounts of data in a convenient and inexpensive way. Query engines (e.g. Hive, Presto, SparkSQL) are used to run SQL queries on data lakes. Their main focus is on analytical queries while random reads are overlooked. In this paper, we present our approach for optimizing needle in a haystack queries in cloud data lakes. The main idea is to maintain an index structure that maps indexed column values to their files. According to our analysis and experimental evaluation, our solution imposes a reasonable storage overhead while providing an order of magnitude performance improvement.},
  copyright     = {{\textcopyright} 2021 Copyright for this paper by its author(s).},
  internal-note = {Scopus rating (2020): CiteScore 0.8, SJR 0.177, SNIP 0.345},
}