summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDouglas Rumbaugh <dbr4@psu.edu>2025-05-20 17:05:47 -0400
committerDouglas Rumbaugh <dbr4@psu.edu>2025-05-20 17:05:47 -0400
commitd76af9340632128dc3a8b05011b6cf8d53fb0ccb (patch)
tree5a4d67c291176e618db9a12dd2c00a8894341776
parentbf99837f39a61f6cce88e24431e08347db66270e (diff)
downloaddissertation-d76af9340632128dc3a8b05011b6cf8d53fb0ccb.tar.gz
updates
-rw-r--r--chapters/beyond-dsp.tex223
-rw-r--r--chapters/introduction.tex2
-rw-r--r--references/references.bib128
3 files changed, 319 insertions, 34 deletions
diff --git a/chapters/beyond-dsp.tex b/chapters/beyond-dsp.tex
index fd4537c..b94221f 100644
--- a/chapters/beyond-dsp.tex
+++ b/chapters/beyond-dsp.tex
@@ -1283,6 +1283,7 @@ possible to leverage problem-specific details within this interface to
get better asymptotic performance.
\subsection{Concurrency Control}
+\label{ssec:dyn-concurrency}
\section{Evaluation}
@@ -1548,6 +1549,31 @@ characteristics,
\text{Delete:} \quad &\Theta\left(\log_s n \right)
\end{align*}
+For testing, we considered a dynamized VPTree using $N_B = 1400$, $s =
+8$, the tiering layout policy, and tagged deletes. Because $k$-NN is a
+standard DDSP, we compare with the Bentley-Saxe Method (\textbf{BSM})\footnote{
+ There is one deviation from pure BSM in our implementation. We use
+ the same delete tagging scheme as the rest of our framework, meaning
+ that the hash tables for record lookup are embedded alongside each
+ block, rather than having a single global table. This means that
+ the lookup of the shard containing the record to be deleted runs
+ in $\Theta(\log_2 n)$ time, rather than $\Theta(1)$ time. However,
+ once the block has been identified, our approach allows the record to
+ be deleted in $\Theta(1)$ time, rather than requiring an inefficient
+ point-lookup directly on the VPTree.
+} and a dynamic data structure for the same search problem called an
+M-Tree~\cite{mtree,mtree-impl} (\textbf{MTree}), which is an example of a so-called
+``ball tree'' structure that partitions high-dimensional space using nodes
+representing spheres, which are merged and split to maintain balance in
+a manner not unlike a B+Tree. We also consider a static instance of a
+VPTree built over the same set of records (\textbf{VPTree}). We used
+L2 distance as our metric, which is defined for vectors of $d$
+dimensions as
+\begin{equation*}
+\operatorname{dist}(r, s) = \sqrt{\sum_{i=0}^{d-1} \left(r_i - s_i\right)^2}
+\end{equation*}
+and ran the queries with $k=1000$ relative to a randomly selected point
+in the dataset.
\begin{figure*}
\subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-knn-insert} \label{fig:knn-insert}}
@@ -1559,9 +1585,69 @@ characteristics,
\label{fig:knn-eval}
\end{figure*}
+The results of this benchmarking are reported in
+Figure~\ref{fig:knn-eval}. The VPTree is shown here to \emph{vastly}
+out-perform the dynamic data structure in query performance in
+Figure~\ref{fig:knn-query}. Note that the y-axis of this figure
+is log-scaled. Interestingly, the query performance is not severely
+degraded relative to the static baseline regardless of the dynamization
+scheme used, with \textbf{BSM-VPTree} performing slightly \emph{better}
+than our framework for query performance. The reason for this is
+shown in Figure~\ref{fig:knn-insert}, where our framework outperforms
+the Bentley-Saxe method in insertion performance. These results are
+attributable to our selection of framework configuration parameters,
+which are biased towards better insertion performance. Both dynamized
+structures also outperform the dynamic baseline. Finally, as is becoming
+a trend, Figure~\ref{fig:knn-space} shows that the storage requirements
+of the static data structures, dynamized or not, are significantly less
+than M-Tree. M-Tree, like a B+Tree, requires leaving empty slots in its
+nodes to support insertion, and this results in a large amount of wasted
+space.
+
+As a final note, metric indexing is an area where dynamized static
+structures have been shown to work well already, and our results here
+are in line with the results of Naidan and Hetland, who applied BSM
+directly to metric data structures, including VPTree, in their own work
+and showed similar performance advantages~\cite{naidan14}.
+
+
+
+
\subsection{Range Scan}
+Next, we will consider applying our dynamization framework to learned
+indices for single-dimensional range scans. A learned index is a sorted
+data structure which attempts to index data by directly modeling a
+function mapping a key to its offset within a storage array. The result
+of a lookup against the index is an estimated location, along with a
+strict error bound, within which the record is guaranteed to be located.
+We apply our framework to create dynamized versions of two static learned
+indices: Triespline~\cite{plex} (\textbf{DE-TS}) and PGM~\cite{pgm}
+(\textbf{DE-PGM}), and compare with a standard Bentley-Saxe dynamization of
+Triespline (\textbf{BSM-TS}). Our dynamic baselines are ALEX~\cite{alex},
+which is a dynamic learned index based on a B+Tree-like structure, and
+PGM (\textbf{PGM}), which provides support for a dynamic version based
+on Bentley-Saxe dynamization (which is why we have not included a BSM
+version of PGM in our testing).
+
+For our dynamized versions of Triespline and PGM, we configure the
+framework with $N_B = 12000$, $s=8$ and the tiering layout policy. We
+consider range count queries, which traverse the range and return the
+number of records within it, rather than returning the set of records,
+to overcome differences in the query interfaces in our baselines, some
+of which make extra copies of the records. We consider traversing the
+range and counting to be a fairer comparison. Range counts are true
+invertible search problems, and so we use tombstone-deletes. The query
+process itself performs no preprocessing. Local queries use the index to
+identify the first record in the query range and then traverse the range,
+counting the number of records and tombstones encountered. These counts
+are then combined by adding up the total record count from all shards,
+subtracting the total tombstone count, and returning the final count. No
+repeats are necessary. The buffer query simply scans the unsorted array
+and performs the same counting. We examine range count queries with
+a fixed selectivity of $\sigma = 0.1\%$.
+
\begin{figure*}
\centering
\subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-rq-insert} \label{fig:rq-insert}}
@@ -1573,8 +1659,64 @@ characteristics,
\label{fig:eval-learned-index}
\end{figure*}
+The results of our evaluation are shown in
+Figure~\ref{fig:eval-learned-index}. Figure~\ref{fig:rq-insert} shows
+the insertion performance. DE-TS is the best in all cases, and the pure
+BSM version of Triespline is the worst by a substantial margin. Of
+particular interest in this chart is the inconsistent performance of
+ALEX, which does quite well on the \texttt{books} dataset, and poorly
+on the others. It is worth noting that getting ALEX to run \emph{at
+all} in some cases required a lot of trial and error and tuning, as its
+performance is highly distribution dependent. Our dynamized version of
+PGM consistently out-performed the built-in dynamic support of the same
+structure. One shouldn't read \emph{too} much into this result, as PGM
+itself supports some performance tuning and can be adjusted to balance
+between insertion and query performance. We ran it with the author's
+suggested default values, but in principle it could be possible to tune
+it to match our framework's performance here. The important take-away
+from this test is that our generalized framework can easily trade blows
+with a custom, integrated solution.
+
+The query performance results in Figure~\ref{fig:rq-query} are a bit
+less interesting. All solutions perform similarly, with ALEX again
+showing itself to be fairly distribution-dependent in its performance,
+performing the best out of all of the structures on the \texttt{books}
+dataset by a reasonable margin, but falling in line with the others on the
+remaining datasets. The standout result here is the dynamic PGM, which
+performs horrendously compared to all of the other structures. The same
+caveat from the previous paragraph applies here---PGM can be configured
+for better performance. But it's notable that our framework-dynamized PGM
+is able to beat PGM slightly in insertion performance without seeing the
+same massive degradation in query performance that PGM's native update
+support does in its own update-optimized configuration.\footnote{
+ It's also worth noting that PGM implements tombstone deletes by
+ inserting a record with a matching key to the record to be deleted,
+	and a particular ``tombstone'' value, rather than using a header. This
+ means that it can not support duplicate keys when deletes are used,
+ unlike our approach. It also means that the records are smaller,
+ which should improve query performance, but we're able to beat it even
+ including the header. PGM is the reason we excluded the \texttt{wiki}
+ dataset from SOSD, as it has duplicate key values.
+} Finally, Figure~\ref{fig:rq-space} shows the storage requirements for
+these data structures. All of the dynamic options require significantly
+more space than the static Triespline, but ALEX requires the most by a
+very large margin. This is in keeping with the previous experiments, which
+all included similarly B+Tree-like structures that required significant
+additional storage space compared to static structures as part of their
+update support.
+
\subsection{String Search}
+As a final example of a search problem, we consider exact string matching
+using the fast succinct trie~\cite{zhang18}. While updatable
+tries aren't terribly unusual~\cite{m-bonsai,dynamic-trie}, succinct data
+structures, which attempt to approach an information-theoretic lower-bound
+on their binary representation of the data, are usually static because
+implementing updates while maintaining these compact representations
+is difficult~\cite{dynamic-trie}. There are specialized approaches for
+dynamizing such structures~\cite{dynamize-succinct}, but in this section
+we consider the effectiveness of our generalized framework for them.
+
\begin{figure*}
\centering
\subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 2mm 0 0]{img/fig-bs-fst-insert} \label{fig:fst-insert}}
@@ -1582,17 +1724,75 @@ characteristics,
\subfloat[Index Overhead]{\includegraphics[width=.32\textwidth, trim=5mm 2mm 0 0]{img/fig-bs-fst-space} \label{fig:fst-size}}
%\vspace{-3mm}
\caption{FST Evaluation}
+ \label{fig:fst-eval}
%\vspace{-5mm}
\end{figure*}
+Our shard type is a direct wrapper around an implementation of fast
+succinct trie~\cite{fst-impl}. We store the strings in off-record
+storage, and the record type itself contains a pointer to the string in
+storage. Queries use no pre-processing and the local queries directly
+search for a matching string. We use the framework's early abort feature
+to stop as soon as the first result is found, and combine simply checks
+whether this record is a tombstone or not. If it's a tombstone, then
+the lookup is considered not to have found the search string. Otherwise,
+the record is returned. This results in a dynamized structure with the
+following asymptotic costs,
+
+
\begin{align*}
\text{Insert:} \quad &\Theta\left(\log_s n\right) \\
\text{Query:} \quad &\Theta\left(N_B + \log n \log_s n\right ) \\
\text{Delete:} \quad &\Theta\left(\log_s n \right)
\end{align*}
+We compare our dynamized succinct trie (\textbf{DE-FST}), configured with
+$N_B = 1200$, $s = 8$, the tiering layout policy, and tombstone deletes,
+with a standard Bentley-Saxe dynamization (\textbf{BSM-FST}), as well
+as a single static instance of the structure (\textbf{FST}).
+
+The results are shown in Figure~\ref{fig:fst-eval}. As with range scans,
+the Bentley-Saxe method shows horrible insertion performance relative to
+our framework in Figure~\ref{fig:fst-insert}. Note that the significant
+observed difference in update throughput for the two data sets is
+largely attributable to the relative sizes. The \texttt{usra} set is
+far larger than \texttt{english}. Figure~\ref{fig:fst-query} shows that
+our write-optimized framework configuration is slightly out-performed in
+query latency by the standard Bentley-Saxe dynamization, and that both
+dynamized structures are quite a bit slower than the static structure for
+queries. Finally, the storage costs for the data structures are shown
+in Figure~\ref{fig:fst-size}. For the \texttt{english} data set, the
+extra storage cost from decomposing the structure is quite significant,
+but for the \texttt{ursarc} set the sizes are quite comparable. It is
+not unexpected that dynamization would add storage cost for succinct
+(or any compressed) data structures, because the splitting of the records
+across multiple data structures reduces the ability of the structure to
+compress redundant data.
+
\subsection{Concurrency}
+We also tested the preliminary concurrency support described in
+Section~\ref{ssec:dyn-concurrency}, using IRS as our test case, with our
+dynamization configured with $N_B = 1200$, $s=8$, and the tiering layout
+policy. Note that IRS only supports tagging, as it isn't invertible even
+under the IDSP model, and our current concurrency implementation only
+supports deletes with tombstones, so we eschewed deletes entirely for
+this test.
+
+In this benchmark, we used a single thread to insert records
+into the structure at a constant rate, while we deployed a variable
+number of additional threads that continuously issued sampling queries
+against the structure. We used an AGG B+Tree as our baseline. Note
+that, to accurately maintain the aggregate weight counts as records
+are inserted, it is necessary that each operation obtain a lock on
+the root node of the tree~\cite{zhao22}. This makes this situation
+a good use-case for the automatic concurrency support provided by our
+framework. Figure~\ref{fig:irs-concurrency} shows the results of this
+benchmark for various numbers of concurrent query threads. As can be seen,
+our framework supports a stable update throughput up to 32 query threads,
+whereas the AGG B+Tree suffers from contention for the mutex and sees
+its performance degrade as the number of threads increases.
+
\begin{figure}
\centering
%\vspace{-2mm}
@@ -1604,3 +1804,26 @@ characteristics,
\end{figure}
\section{Conclusion}
+
+In this chapter, we sought to develop a set of tools for generalizing
+some of the results from our study of sampling data structures in
+Chapter~\ref{chap:sampling} to apply to a broader set of data structures.
+This resulted in our development of two new classes of search problem:
+extended decomposable search problems, and iterative deletion decomposable
+search problems. The former class allows for a pre-processing step
+to be used to generate individualized local queries for each block in a
+decomposed structure, and the latter allows for the query process to be
+repeated as necessary, with possible modifications to the local queries
+each time, to build up the result set iteratively. We then implemented a
+C++ framework for automatically dynamizing static data structures for
+search problems falling into either of these classes, which included an
+LSM tree inspired design space and support for concurrency.
+
+We used this framework to produce dynamized structures for a wide
+variety of search problems, and compared the results to existing
+dynamic baselines, as well as the original Bentley-Saxe method, where
+applicable. The results show that our framework is capable of creating
+dynamic structures that are competitive with, or superior to, custom-built
+dynamic structures, and also has clear performance advantages over the
+classical Bentley-Saxe method.
+
diff --git a/chapters/introduction.tex b/chapters/introduction.tex
index 7084867..bdde070 100644
--- a/chapters/introduction.tex
+++ b/chapters/introduction.tex
@@ -67,7 +67,7 @@ first learned index, RMI~\cite{RMI}. This index succeeding in showing
that a learned model can be both faster and smaller than a conventional
range index, but the proposed solution did not support updates. The
first (non-concurrently) updatable learned index, ALEX, took a year
-and a half to appear~\cite{ALEX}. Over the course of the subsequent
+and a half to appear~\cite{alex}. Over the course of the subsequent
three years, several learned indexes were proposed with concurrency
support~\cite{10.1145/3332466.3374547,10.14778/3489496.3489512} but a
recent performance study~\cite{10.14778/3551793.3551848} showed that these
diff --git a/references/references.bib b/references/references.bib
index 7d2b8a0..ca77259 100644
--- a/references/references.bib
+++ b/references/references.bib
@@ -1096,24 +1096,6 @@ author = {Frank Olken and Doron Rotem}
year = {1997},
}
-@inproceedings{ALEX,
- author = {Jialin Ding and
- Umar Farooq Minhas and
- Jia Yu and
- Chi Wang and
- Jaeyoung Do and
- Yinan Li and
- Hantian Zhang and
- Badrish Chandramouli and
- Johannes Gehrke and
- Donald Kossmann and
- David B. Lomet and
- Tim Kraska},
- title = {{ALEX:} An Updatable Adaptive Learned Index},
- booktitle = {Proceedings of the 2020 ACM International Conference on Management of
- Data},
- year = {2020},
-}
@article{pgm,
author = {Paolo Ferragina and
@@ -1210,19 +1192,39 @@ journal = {Proc. VLDB Endow.},
year = {2019},
}
-@article{DBLP:journals/corr/abs-1905-08898,
+
+
+@inproceedings{alex,
author = {Jialin Ding and
Umar Farooq Minhas and
- Hantian Zhang and
- Yinan Li and
+ Jia Yu and
Chi Wang and
+ Jaeyoung Do and
+ Yinan Li and
+ Hantian Zhang and
Badrish Chandramouli and
Johannes Gehrke and
Donald Kossmann and
- David B. Lomet},
+ David B. Lomet and
+ Tim Kraska},
+ editor = {David Maier and
+ Rachel Pottinger and
+ AnHai Doan and
+ Wang{-}Chiew Tan and
+ Abdussalam Alawini and
+ Hung Q. Ngo},
title = {{ALEX:} An Updatable Adaptive Learned Index},
- journal = {CoRR},
- year = {2019},
+ booktitle = {Proceedings of the 2020 International Conference on Management of
+ Data, {SIGMOD} Conference 2020, online conference [Portland, OR, USA],
+ June 14-19, 2020},
+ pages = {969--984},
+ publisher = {{ACM}},
+ year = {2020},
+ url = {https://doi.org/10.1145/3318464.3389711},
+ doi = {10.1145/3318464.3389711},
+ timestamp = {Thu, 15 Sep 2022 14:00:48 +0200},
+ biburl = {https://dblp.org/rec/conf/sigmod/DingMYWDLZCGKLK20.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{10.1145/3332466.3374547,
@@ -1251,15 +1253,6 @@ booktitle = {Proceedings of the 12th International Workshop on Data Management o
series = {DaMoN '16}
}
-@article{DBLP:journals/corr/abs-1910-06169,
- author = {Paolo Ferragina and
- Giorgio Vinciguerra},
- title = {The PGM-index: a multicriteria, compressed and learned approach to
- data indexing},
- journal = {CoRR},
- year = {2019},
-}
-
@article{byods-datalog,
author = {Sahebolamri, Arash and Barrett, Langston and Moore, Scott and Micinski, Kristopher},
title = {Bring Your Own Data Structures to Datalog},
@@ -1516,6 +1509,24 @@ keywords = {analytic model, analysis of algorithms, overflow chaining, performan
howpublished = {\url{https://github.com/google/leveldb}}
}
+@misc{mtree-impl,
+ author = {Eduardo R. D'Avila},
+ title = {M-Tree},
+ year = {2013},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ howpublished = {\url{https://github.com/erdavila/M-Tree}}
+}
+
+
+@misc{fst-impl,
+ author = {Shunsuke Kanda},
+ title = {Fast Succinct Trie},
+ year = {2021},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ howpublished = {\url{https://github.com/kampersanda/fast\_succinct\_trie}}
+}
@inproceedings{monkey,
author = {Niv Dayan and
@@ -1767,3 +1778,54 @@ keywords = {analytic model, analysis of algorithms, overflow chaining, performan
biburl = {https://dblp.org/rec/journals/cacm/Hoare61a.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
+@article{DBLP:journals/jea/KandaKTMF20,
+ author = {Shunsuke Kanda and
+ Dominik K{\"{o}}ppl and
+ Yasuo Tabei and
+ Kazuhiro Morita and
+ Masao Fuketa},
+ title = {Dynamic Path-decomposed Tries},
+ journal = {{ACM} J. Exp. Algorithmics},
+ volume = {25},
+ pages = {1--28},
+ year = {2020},
+ url = {https://doi.org/10.1145/3418033},
+ doi = {10.1145/3418033},
+ timestamp = {Sat, 08 Jan 2022 02:22:56 +0100},
+ biburl = {https://dblp.org/rec/journals/jea/KandaKTMF20.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+@article{m-bonsai,
+ author = {Andreas Poyias and
+ Simon J. Puglisi and
+ Rajeev Raman},
+ title = {m-Bonsai: {A} Practical Compact Dynamic Trie},
+ journal = {Int. J. Found. Comput. Sci.},
+ volume = {29},
+ number = {8},
+ pages = {1257--1278},
+ year = {2018},
+ url = {https://doi.org/10.1142/S0129054118430025},
+ doi = {10.1142/S0129054118430025},
+ timestamp = {Sun, 19 Jan 2025 14:44:51 +0100},
+ biburl = {https://dblp.org/rec/journals/ijfcs/PoyiasPR18.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{dynamic-trie,
+ author = {Shunsuke Kanda and
+ Dominik K{\"{o}}ppl and
+ Yasuo Tabei and
+ Kazuhiro Morita and
+ Masao Fuketa},
+ title = {Dynamic Path-decomposed Tries},
+ journal = {{ACM} J. Exp. Algorithmics},
+ volume = {25},
+ pages = {1--28},
+ year = {2020},
+ url = {https://doi.org/10.1145/3418033},
+ doi = {10.1145/3418033},
+ timestamp = {Sat, 08 Jan 2022 02:22:56 +0100},
+ biburl = {https://dblp.org/rec/journals/jea/KandaKTMF20.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}