From 293d4af6d349d07ecd72c96121033e2ab155d359 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Sun, 6 Jul 2025 15:28:07 -0400 Subject: updates --- chapters/design-space.tex | 5 +--- chapters/related-works.tex | 74 +++++++++++++++++++++++++++++++++++++++++++--- paper.tex | 2 +- references/references.bib | 46 ++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 9 deletions(-) diff --git a/chapters/design-space.tex b/chapters/design-space.tex index c8876de..f85883c 100644 --- a/chapters/design-space.tex +++ b/chapters/design-space.tex @@ -642,10 +642,7 @@ values themselves, and so this is not a significant limitation for our analysis. The resulting distributions are shown in Figure~\ref{fig:design-policy-ins-latency}. These distributions are representing using a ``reversed'' CDF with log scaling on both -axes. This representation has proven very useful for interpreting the -latency distributions that we see in evaluating dynamization, but is -slightly unusual, and so we've included a guide to interpreting these -charts in Appendix~\ref{append:rcdf}. +axes. \begin{figure} \centering diff --git a/chapters/related-works.tex b/chapters/related-works.tex index e828593..8e24750 100644 --- a/chapters/related-works.tex +++ b/chapters/related-works.tex @@ -274,10 +274,76 @@ partial compactions to reduce the size of reconstructions. These factors let SILK maintain the LSM tree structure without having to resort to insertion throttling, as we do in our system. -\section{GiST and GIN} - \section{Automated Index Composition} -\subsection{Periodic Table of Data Structures, etc.} -\subsection{Gene} +At the beginning of this work, we described what we see as the three +major lines of work that are attempting to partially automate index +design. The first of these we discussed was something we called automated +index composition. 
This is an approach to index design which uses a series +of data structure primitives to compose an index over a set of data that +has been optimized for a particular workload. Of these works, \emph{all} +of them are focused explicitly on single dimensional data with range +scans and point lookups, and only Cosine supports inserting new records. + +Two closely related papers in this line are the so-called Data +Alchemist~\cite{ds-alchemy} and the Periodic Table of Data +Structures~\cite{periodic-table}. Both of these works consider +automatically designing indexes for single dimensional range data, +capable of addressing point lookups and range scans. The Periodic Table +of Data Structures proposes a wide design space for data structures +based on creating individual ``nodes'' over the data, each of which +have different design decisions applied to them, which are termed +``first principles''. They consider first principles to be any design +decision that is irreducible to other decisions, such as the +type of partitioning used (range, radix, etc.), whether the data is +stored in a row-based or columnar format, etc. From this model, an +index can be designed by iteratively describing the first principles of +each node. Given these first principles, and the set of nodes, access +algorithms can be automatically devised. The Data Alchemist extends +this model of data structures with learned cost models and machine +learning based systems for automatically composing an optimal data +structure for a given workload. Both of these papers discuss their +core ideas, but don't include testing of a working system. The same +authors have two further works in this area that go more into detail on +cost models~\cite{data-calc} and explore the design continuum in more +detail~\cite{ds-continuum}. + +GENE~\cite{gene} advances the same basic line of research as the +previously mentioned works, but actually includes a functioning, +end-to-end index generation system. 
GENE decomposes data structures +into a few specific primitives based upon search algorithm and data +layout. Specifically, it supports scanning, binary search, interpolation +search, exponential search, hashing with closed addressing, and linear +regression modeling, as search algorithms. The supported data layout +parameters include column vs. row orientation, sorted vs. unsorted +ordering, compression, and function mapping (which obviates the need +to make other layout decisions, and is intended to be used with linear +regression search algorithms). GENE then designs an index based upon +these options automatically for a given workload by applying a genetic +algorithm. + +Cosine~\cite{cosine} is another similar system, which has been designed +for cloud based systems and accounts for cloud SLAs and (monetary) +budgeting concerns. It includes sophisticated cost modeling systems +to allow it to dynamically adapt the structure of the index, shifting +elements of it between an LSM-like structure, a B-tree-like structure, +and an LSH-like (log-structured hash) structure, as well as adjusting +various configuration parameters associated with each of these primitive +components. It is particularly notable for being the only work discussed +in this section that supports updates. + +Finally, fluid data structures~\cite{fluid-ds} represents a more formal +work in this area, based upon immutable data primitives. The core idea of +this work is that the physical representation of the data can be mutated +while maintaining its logical ordering, under certain circumstances. This +allows for regions of the data structure to shift dynamically to optimize +for particular types of search. To accomplish this, the authors define +a formal grammar, which they call a compositional organizational grammar +(COG), as a description of a data structure, and the transformation rules +that can be applied to a data structure based on this language while +ensuring logical equivalence. 
A runtime system, then, can automatically +apply these transformations to the structure. This is the only one of the +works in this section that discusses generalizing its techniques to +non-single dimensional data. +\section{Generalized Index Templates} diff --git a/paper.tex b/paper.tex index 7ff5bd8..42ef40b 100644 --- a/paper.tex +++ b/paper.tex @@ -395,7 +395,7 @@ of Engineering Science and Mechanics % lines that redefine the \thechapter and \thesection: %\renewcommand\thechapter{} %\renewcommand\thesection{\arabic{section}} -\input{appendices/reverse-cdf.tex} +%\input{appendices/reverse-cdf.tex} % \include{Appendix-B/Appendix-B} % \include{Appendix-C/Appendix-C} % \include{Appendix-D/Appendix-D} diff --git a/references/references.bib b/references/references.bib index 297f67b..8a3049f 100644 --- a/references/references.bib +++ b/references/references.bib @@ -2119,3 +2119,49 @@ keywords = {database systems, low-latency transactions, preemptive scheduling, u bibsource = {dblp computer science bibliography, https://dblp.org} } +@inproceedings{ds-continuum, + author = {Stratos Idreos and + Niv Dayan and + Wilson Qin and + Mali Akmanalp and + Sophie Hilgard and + Andrew Ross and + James Lennon and + Varun Jain and + Harshita Gupta and + David Li and + Zichen Zhu}, + title = {Design Continuums and the Path Toward Self-Designing Key-Value Stores + that Know and Learn}, + booktitle = {9th Biennial Conference on Innovative Data Systems Research, {CIDR} + 2019, Asilomar, CA, USA, January 13-16, 2019, Online Proceedings}, + publisher = {www.cidrdb.org}, + year = {2019}, + url = {http://cidrdb.org/cidr2019/papers/p143-idreos-cidr19.pdf}, + timestamp = {Mon, 18 Jul 2022 17:13:00 +0200}, + biburl = {https://dblp.org/rec/conf/cidr/IdreosDQAHRLJGL19.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{data-calc, + author = {Stratos Idreos and + Kostas Zoumpatianos and + Brian Hentschel and + Michael S. 
Kester and + Demi Guo}, + editor = {Gautam Das and + Christopher M. Jermaine and + Philip A. Bernstein}, + title = {The Data Calculator: Data Structure Design and Cost Synthesis from + First Principles and Learned Cost Models}, + booktitle = {Proceedings of the 2018 International Conference on Management of + Data, {SIGMOD} Conference 2018, Houston, TX, USA, June 10-15, 2018}, + pages = {535--550}, + publisher = {{ACM}}, + year = {2018}, + url = {https://doi.org/10.1145/3183713.3199671}, + doi = {10.1145/3183713.3199671}, + timestamp = {Wed, 21 Nov 2018 12:44:08 +0100}, + biburl = {https://dblp.org/rec/conf/sigmod/IdreosZHKG18.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} -- cgit v1.2.3