-rw-r--r--   .gitignore                                  1
-rw-r--r--   chapters/background.tex                   417
-rw-r--r--   chapters/background.tex.bak               446
-rw-r--r--   chapters/sigmod23/background.tex           12
-rw-r--r--   chapters/sigmod23/exp-baseline.tex         12
-rw-r--r--   chapters/sigmod23/exp-extensions.tex        4
-rw-r--r--   chapters/sigmod23/exp-parameter-space.tex   4
-rw-r--r--   chapters/sigmod23/experiment.tex            6
-rw-r--r--   chapters/sigmod23/extensions.tex            4
-rw-r--r--   chapters/sigmod23/framework.tex            48
-rw-r--r--   chapters/sigmod23/introduction.tex          4
-rw-r--r--   references/references.bib                 105
12 files changed, 913 insertions, 150 deletions
diff --git a/.gitignore b/.gitignore
index c91a7e3..63dc0ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*.bak
*.aux
*.log
*.out
diff --git a/chapters/background.tex b/chapters/background.tex
index 69436c8..8ad92a8 100644
--- a/chapters/background.tex
+++ b/chapters/background.tex
@@ -95,7 +95,7 @@ and later work by Overmars lifted this constraint and considered a more
general class of search problems called \emph{$C(n)$-decomposable search
problems},
-\begin{definition}[$C(n)$-decomposable Search Problem~\cite{overmars83}]
+\begin{definition}[$C(n)$-decomposable Search Problem~\cite{overmars-cn-decomp}]
A search problem $F: (\mathcal{D}, \mathcal{Q}) \to \mathcal{R}$ is $C(n)$-decomposable
if and only if there exists an $O(C(n))$-time computable, associative,
and commutative binary operator $\mergeop$ such that,
@@ -113,12 +113,14 @@ decomposable even in cases with more than two partial results.
As an example, consider range counts,
\begin{definition}[Range Count]
+ \label{def:range-count}
Let $d$ be a set of $n$ points in $\mathbb{R}$. Given an interval,
$ q = [x, y],\quad x,y \in \mathbb{R}$, a range count returns
the cardinality, $|d \cap q|$.
\end{definition}
\begin{theorem}
+\label{ther:decomp-range-count}
Range Count is a decomposable search problem.
\end{theorem}
@@ -130,7 +132,7 @@ Definition~\ref{def:dsp}, gives
\end{align*}
which is true by the distributive property of union and
intersection. Addition is an associative and commutative
-operator that can be calculated in $O(1)$ time. Therefore, range counts
+operator that can be calculated in $\Theta(1)$ time. Therefore, range counts
are DSPs.
\end{proof}
@@ -376,15 +378,18 @@ database indices. We refer to a data structure with update support as
contain header information (like visibility) that is updated in place.
}
-This section discusses \emph{dynamization}, the construction of a dynamic
-data structure based on an existing static one. When certain conditions
-are satisfied by the data structure and its associated search problem,
-this process can be done automatically, and with provable asymptotic
-bounds on amortized insertion performance, as well as worst case query
-performance. We will first discuss the necessary data structure
-requirements, and then examine several classical dynamization techniques.
-The section will conclude with a discussion of delete support within the
-context of these techniques.
+This section discusses \emph{dynamization}, the construction of a
+dynamic data structure based on an existing static one. When certain
+conditions are satisfied by the data structure and its associated
+search problem, this process can be done automatically, and with
+provable asymptotic bounds on amortized insertion performance, as well
+as worst-case query performance. This is in contrast to the manual
+design of dynamic data structures, which involves techniques based on
+partially rebuilding small portions of a single data structure (called
+\emph{local reconstruction})~\cite{overmars83}. This is a very high-cost
+intervention that requires significant effort on the part of the data
+structure designer, whereas conventional dynamization can be performed
+with little-to-no modification of the underlying data structure at all.
It is worth noting that there are a variety of techniques
discussed in the literature for dynamizing structures with specific
@@ -395,6 +400,18 @@ of insert and query operations~\cite{batched-decomposable}. This
section discusses techniques that are more general, and don't require
workload-specific assumptions.
+We will first discuss the necessary data structure requirements, and
+then examine several classical dynamization techniques. The section
+will conclude with a discussion of delete support within the context
+of these techniques. For more detail than is included in this chapter,
+Overmars wrote a book providing a comprehensive survey of techniques for
+creating dynamic data structures, including not only the dynamization
+techniques discussed here, but also local reconstruction based
+techniques and more~\cite{overmars83}.\footnote{
+ Sadly, this book isn't readily available in
+ digital format as of the time of writing.
+}
+
\subsection{Global Reconstruction}
@@ -412,7 +429,7 @@ possible if $\mathcal{I}$ supports the following two operations,
\end{align*}
where $\mathtt{build}$ constructs an instance $\mathscr{I}\in\mathcal{I}$
over the data structure over a set of records $d \subseteq \mathcal{D}$
-in $C(|d|)$ time, and $\mathtt{unbuild}$ returns the set of records $d
+in $B(|d|)$ time, and $\mathtt{unbuild}$ returns the set of records $d
\subseteq \mathcal{D}$ used to construct $\mathscr{I} \in \mathcal{I}$ in
$\Theta(1)$ time,\footnote{
There isn't any practical reason why $\mathtt{unbuild}$ must run
@@ -428,7 +445,7 @@ data structure $\mathscr{I} \in \mathcal{I}$ can be defined by,
\end{align*}
It goes without saying that this operation is sub-optimal, as the
-insertion cost is $\Theta(C(n))$, and $C(n) \in \Omega(n)$ at best for
+insertion cost is $\Theta(B(n))$, and $B(n) \in \Omega(n)$ at best for
most data structures. However, this global reconstruction strategy can
be used as a primitive for more sophisticated techniques that can provide
reasonable performance.
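+
+As an illustrative sketch (ours, not drawn from the literature), the
+insert-via-global-reconstruction operation could be expressed in Python
+as follows, assuming a hypothetical static structure type exposing the
+\texttt{build} and \texttt{unbuild} routines described above,
+\begin{verbatim}
+# Minimal sketch of insertion by global reconstruction. `static_cls`
+# is any static structure exposing build() and unbuild() as above,
+# with unbuild() assumed to return a list of records.
+class GlobalReconstruction:
+    def __init__(self, static_cls, records=()):
+        self.static_cls = static_cls
+        self.inst = static_cls.build(list(records))       # B(n) cost
+
+    def insert(self, r):
+        records = self.inst.unbuild()                     # Theta(1) assumed
+        self.inst = self.static_cls.build(records + [r])  # B(n + 1) cost
+
+    def query(self, q):
+        return self.inst.query(q)
+\end{verbatim}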
@@ -438,7 +455,7 @@ reasonable performance.
The problem with global reconstruction is that each insert must rebuild
the entire data structure, involving all of its records. This results
-in a worst-case insert cost of $\Theta(C(n))$. However, opportunities
+in a worst-case insert cost of $\Theta(B(n))$. However, opportunities
for improving this scheme can present themselves when considering the
\emph{amortized} insertion cost.
@@ -446,11 +463,11 @@ Consider the cost accrued by the dynamized structure under global
reconstruction over the lifetime of the structure. Each insert will result
in all of the existing records being rewritten, so at worst each record
will be involved in $\Theta(n)$ reconstructions, each reconstruction
-having $\Theta(C(n))$ cost. We can amortize this cost over the $n$ records
+having $\Theta(B(n))$ cost. We can amortize this cost over the $n$ records
inserted to get an amortized insertion cost for global reconstruction of,
\begin{equation*}
-I_a(n) = \frac{C(n) \cdot n}{n} = C(n)
+I_a(n) = \frac{B(n) \cdot n}{n} = B(n)
\end{equation*}
This doesn't improve things as is; however, it does present two
@@ -459,9 +476,9 @@ the reconstructions, or the number of times a record is reconstructed,
then we could reduce the amortized insertion cost.
The key insight, first discussed by Bentley and Saxe, is that
-this goal can be accomplished by \emph{decomposing} the data
-structure into multiple, smaller structures, each built from a
-disjoint partition of the data. As long as the search problem
+both of these goals can be accomplished by \emph{decomposing} the
+data structure into multiple, smaller structures, each built from
+a disjoint partition of the data. As long as the search problem
being considered is decomposable, queries can be answered from
this structure with bounded worst-case overhead, and the amortized
insertion cost can be improved~\cite{saxe79}. Significant theoretical
@@ -470,21 +487,34 @@ data structure~\cite{saxe79, overmars81, overmars83} and for leveraging
specific efficiencies of the data structures being considered to improve
these reconstructions~\cite{merge-dsp}.
-There are two general decomposition techniques that emerged from this
-work. The earliest of these is the logarithmic method, often called
-the Bentley-Saxe method in modern literature, and is the most commonly
-discussed technique today. A later technique, the equal block method,
-was also examined. It is generally not as effective as the Bentley-Saxe
-method, but it has some useful properties for explanatory purposes and
-so will be discussed here as well.
-
-\subsection{Equal Block Method~\cite[pp.~96-100]{overmars83}}
+There are two general decomposition techniques that emerged from
+this work. The earliest of these is the logarithmic method, often
+called the Bentley-Saxe method in modern literature, which is the most
+commonly discussed technique today. The Bentley-Saxe method has been
+directly applied in a few instances in the literature, such as to
+metric indexing structures~\cite{naidan14} and spatial structures~\cite{bkdtree},
+and has also been used in a modified form for genetic sequence search
+structures~\cite{almodaresi23} and graphs~\cite{lsmgraph}, to cite a few
+examples.
+
+A later technique, the equal block method, was also developed. It is
+generally not as effective as the Bentley-Saxe method, and as a result we
+have not identified any specific applications of this technique outside
+of the theoretical literature. However, we will discuss it as well in
+the interest of completeness, and because it does lend itself well to
+demonstrating certain properties of decomposition-based dynamization
+techniques.
+
+\subsection{Equal Block Method}
\label{ssec:ebm}
Though chronologically later, the equal block method is theoretically a
bit simpler, and so we will begin our discussion of decomposition-based
-technique for dynamization of decomposable search problems with it. The
-core concept of the equal block method is to decompose the data structure
+techniques for dynamization of decomposable search problems with it. There
+have been several proposed variations of this concept~\cite{maurer79,
+maurer80}, but we will focus on the most developed form as described by
+Overmars and van Leeuwen~\cite{overmars-art-of-dyn, overmars83}. The core
+concept of the equal block method is to decompose the data structure
into several smaller data structures, called blocks, over partitions
of the data. This decomposition is performed such that each block is of
roughly equal size.
@@ -499,7 +529,7 @@ to be governed by a smooth, monotonically increasing function $f(n)$ such
that, at any point, the following two constraints are obeyed.
\begin{align}
f\left(\frac{n}{2}\right) \leq s \leq f(2n) \label{ebm-c1}\\
- \forall_{1 \leq j \leq s} \quad | \mathscr{I}_j | \leq \frac{2n}{i} \label{ebm-c2}
+ \forall_{1 \leq j \leq s} \quad | \mathscr{I}_j | \leq \frac{2n}{s} \label{ebm-c2}
\end{align}
where $|\mathscr{I}_j|$ is the number of records in the block,
$|\text{unbuild}(\mathscr{I}_j)|$.
@@ -528,16 +558,16 @@ where $F(\mathscr{I}, q)$ is a slight abuse of notation, referring to
answering the query over $d$ using the data structure $\mathscr{I}$.
This technique provides better amortized performance bounds than global
-reconstruction, at the possible cost of increased query performance for
+reconstruction, at the possible cost of worse query performance for
sub-linear queries. We'll omit the details of the proof of performance
for brevity and streamline some of the original notation (full details
can be found in~\cite{overmars83}), but this technique ultimately
results in a data structure with the following performance characteristics,
\begin{align*}
-\text{Amortized Insertion Cost:}&\quad \Theta\left(\frac{C(n)}{n} + C\left(\frac{n}{f(n)}\right)\right) \\
+\text{Amortized Insertion Cost:}&\quad \Theta\left(\frac{B(n)}{n} + B\left(\frac{n}{f(n)}\right)\right) \\
\text{Worst-case Query Cost:}& \quad \Theta\left(f(n) \cdot \mathscr{Q}\left(\frac{n}{f(n)}\right)\right) \\
\end{align*}
-where $C(n)$ is the cost of statically building $\mathcal{I}$, and
+where $B(n)$ is the cost of statically building $\mathcal{I}$, and
$\mathscr{Q}(n)$ is the cost of answering $F$ using $\mathcal{I}$.
%TODO: example?
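+
+As a concrete illustration (our own, not taken from the original
+analysis), suppose $f(n) = \sqrt{n}$, so that the structure maintains
+roughly $\sqrt{n}$ blocks of roughly $\sqrt{n}$ records each. For a
+hypothetical data structure with $B(n) \in \Theta(n \log n)$ and
+$\mathscr{Q}(n) \in \Theta(\log n)$, the above bounds become,
+\begin{align*}
+\text{Amortized Insertion Cost:}&\quad
+    \Theta\left(\frac{n\log n}{n} + \sqrt{n}\log\sqrt{n}\right)
+    = \Theta\left(\sqrt{n}\log n\right) \\
+\text{Worst-case Query Cost:}&\quad
+    \Theta\left(\sqrt{n}\cdot\log\sqrt{n}\right)
+    = \Theta\left(\sqrt{n}\log n\right) \\
+\end{align*}
+a substantial improvement over the $\Theta(n \log n)$ cost of inserting
+via global reconstruction, paid for with a polynomial penalty on queries.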
@@ -599,18 +629,35 @@ structure in the same way that incrementing the binary number by $1$ does.
By applying BSM to a data structure, a dynamized structure can be created
with the following performance characteristics,
\begin{align*}
-\text{Amortized Insertion Cost:}&\quad \Theta\left(\left(\frac{C(n)}{n}\cdot \log_2 n\right)\right) \\
-\text{Worst Case Insertion Cost:}&\quad \Theta\left(C(n)\right) \\
+\text{Amortized Insertion Cost:}&\quad \Theta\left(\left(\frac{B(n)}{n}\cdot \log_2 n\right)\right) \\
+\text{Worst Case Insertion Cost:}&\quad \Theta\left(B(n)\right) \\
\text{Worst-case Query Cost:}& \quad \Theta\left(\log_2 n\cdot \mathscr{Q}\left(n\right)\right) \\
\end{align*}
This is a particularly attractive result because, for example, a data
-structure having $C(n) \in \Theta(n)$ will have an amortized insertion
-cost of $\log_2 (n)$, which is quite reasonable. The cost is an extra
-logarithmic multiple attached to the query complexity. It is also worth
-noting that the worst-case insertion cost remains the same as global
-reconstruction, but this case arises only very rarely. If you consider the
-binary decomposition representation, the worst-case behavior is triggered
-each time the existing number overflows, and a new digit must be added.
+structure having $B(n) \in \Theta(n)$ will have an amortized insertion
+cost of $\Theta(\log_2 n)$, which is quite reasonable. The trade-off for this
+is an extra logarithmic multiple attached to the query complexity. It is
+also worth noting that the worst-case insertion cost remains the same
+as global reconstruction, but this case arises only very rarely. If
+you consider the binary decomposition representation, the worst-case
+behavior is triggered each time the existing number overflows, and a
+new digit must be added.
+
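+As an illustrative sketch (ours), the insertion procedure can be written
+directly in terms of this binary counter analogy. Here, in Python,
+\texttt{levels[i]} holds either \texttt{None} or a static structure
+built over exactly $2^i$ records, and \texttt{build}/\texttt{unbuild}
+are the hypothetical routines used earlier,
+\begin{verbatim}
+# Sketch of Bentley-Saxe insertion as a binary counter increment.
+def bsm_insert(levels, build, unbuild, r):
+    carry = [r]                        # records carried up the levels
+    for i in range(len(levels)):
+        if levels[i] is None:
+            levels[i] = build(carry)   # empty digit: place the carry here
+            return
+        carry = unbuild(levels[i]) + carry
+        levels[i] = None               # full digit: zero it, keep carrying
+    levels.append(build(carry))        # overflow: add a new level
+\end{verbatim}
+The rarely-triggered overflow in the final line is exactly the worst case
+described above, in which every existing record participates in a single
+$B(n)$-cost reconstruction.
+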
+As a final note about the query performance of this structure, because
+the overhead due to querying the blocks is logarithmic, under certain
+circumstances this cost can be absorbed, resulting in no effect on the
+asymptotic worst-case query performance. As an example, consider a linear
+scan of the data running in $\Theta(n)$ time. In this case, every record
+must be considered, and so there isn't any performance penalty\footnote{
+ From an asymptotic perspective. There will still be measurable performance
+ effects from caching, etc., even in this case.
+} to breaking the records out into multiple chunks and scanning them
+individually. More formally, for any query running in $\mathscr{Q}(n) \in
+\Omega\left(n^\epsilon\right)$ time where $\epsilon > 0$, the worst-case
+cost of answering a decomposable search problem from a BSM dynamization
+is $\Theta\left(\mathscr{Q}(n)\right)$~\cite{saxe79}.
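+
+To sketch the intuition behind this result (details ours): assume
+$\mathscr{Q}(n) = n^\epsilon$ exactly and recall that the dynamization
+holds at most one block of each size $2^i$ for $i \leq \lceil \log_2 n
+\rceil$. The total cost of querying every block is then bounded by a
+geometric sum that is dominated by its largest term,
+\begin{equation*}
+\sum_{i=0}^{\lceil \log_2 n \rceil} \mathscr{Q}\left(2^i\right)
+    = \sum_{i=0}^{\lceil \log_2 n \rceil} 2^{i\epsilon}
+    = \Theta\left(2^{\epsilon \log_2 n}\right)
+    = \Theta\left(n^\epsilon\right)
+    = \Theta\left(\mathscr{Q}(n)\right)
+\end{equation*}
+so the logarithmic number of blocks contributes no additional asymptotic
+overhead in this regime.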
+
+\subsection{Merge Decomposable Search Problems}
\subsection{Delete Support}
@@ -651,6 +698,290 @@ This presents several problems,
require additional work to fix.
\end{itemize}
+To resolve these difficulties, two very different approaches have been
+proposed for supporting deletes, each of which relies on certain properties
+of the search problem and data structure. These are the use of a ghost
+structure and weak deletes.
+
+\subsubsection{Ghost Structure for Invertible Search Problems}
+
+The first proposed mechanism for supporting deletes was discussed
+alongside the Bentley-Saxe method in Bentley and Saxe's original
+paper. This technique applies to a class of search problems called
+\emph{invertible} (also called \emph{decomposable counting problems}
+in later literature~\cite{overmars83}). Invertible search problems
+are decomposable, and also support an ``inverse'' merge operator, $\Delta$,
+that is able to remove records from the result set. More formally,
+\begin{definition}[Invertible Search Problem~\cite{saxe79}]
+\label{def:invert}
+A decomposable search problem, $F$, is invertible if and only if there
+exists a constant-time computable operator, $\Delta$, such that
+\begin{equation*}
+F(A / B, q) = F(A, q)~\Delta~F(B, q)
+\end{equation*}
+for all $A, B \in \mathcal{PS}(\mathcal{D})$ where $A \cap B = \emptyset$.
+\end{definition}
+
+Given a search problem with this property, it is possible to perform
+deletes by creating a secondary ``ghost'' structure. When a record
+is to be deleted, it is inserted into this structure. Then, when the
+dynamization is queried, this ghost structure is queried as well as the
+main one. The results from the ghost structure can be removed from the
+result set using the inverse merge operator. This simulates the result
+that would have been obtained had the records been physically removed
+from the main structure.
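+
+As a minimal sketch of this arrangement (ours, with all names
+hypothetical), using Python and assuming two dynamized structures
+exposing \texttt{insert} and \texttt{query} routines, with the inverse
+operator $\Delta$ passed in as a function,
+\begin{verbatim}
+# Sketch of delete support via a ghost structure for an invertible
+# search problem (e.g., range count, with delta = subtraction).
+class GhostDynamization:
+    def __init__(self, main, ghost, delta):
+        self.main, self.ghost, self.delta = main, ghost, delta
+
+    def insert(self, r):
+        self.main.insert(r)
+
+    def delete(self, r):
+        self.ghost.insert(r)       # record the delete; main is untouched
+
+    def query(self, q):
+        # remove the ghost structure's contribution from the result
+        return self.delta(self.main.query(q), self.ghost.query(q))
+\end{verbatim}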
+
+Two examples of invertible search problems are set membership
+and range count. Range count was formally defined in
+Definition~\ref{def:range-count}.
+
+\begin{theorem}
+Range count is an invertible search problem.
+\end{theorem}
+
+\begin{proof}
+To prove that range count is an invertible search problem, it must be
+decomposable and have a $\Delta$ operator. That it is a DSP has already
+been proven in Theorem~\ref{ther:decomp-range-count}.
+
+Let $\Delta$ be subtraction $(-)$. Applying this to Definition~\ref{def:invert}
+gives,
+\begin{equation*}
+|(A / B) \cap q | = |(A \cap q) / (B \cap q)| = |(A \cap q)| - |(B \cap q)|
+\end{equation*}
+which is true by the distributive property of set difference and
+intersection. Subtraction is computable in constant time, therefore
+range count is an invertible search problem using subtraction as $\Delta$.
+\end{proof}
+
+The set membership search problem is defined as follows,
+\begin{definition}[Set Membership]
+\label{def:set-membership}
+Consider a set of elements $d \subseteq \mathcal{D}$ from some domain,
+and a single element $r \in \mathcal{D}$. A test of set membership is a
+search problem of the form $F: (\mathcal{PS}(\mathcal{D}), \mathcal{D})
+\to \mathbb{B}$ such that $F(d, r) = r \in d$, which maps to $0$ if $r
+\not\in d$ and $1$ if $r \in d$.
+\end{definition}
+
+\begin{theorem}
+Set membership is an invertible search problem.
+\end{theorem}
+
+\begin{proof}
+To prove that set membership is invertible, it is necessary to establish
+that it is a decomposable search problem, and that a $\Delta$ operator
+exists. We'll begin with the former.
+\begin{lemma}
+ \label{lem:set-memb-dsp}
+ Set membership is a decomposable search problem.
+\end{lemma}
+\begin{proof}
+Let $\mergeop$ be the logical disjunction ($\lor$). This yields,
+\begin{align*}
+F(A \cup B, r) &= F(A, r) \lor F(B, r) \\
+r \in (A \cup B) &= (r \in A) \lor (r \in B)
+\end{align*}
+which is true, following directly from the definition of union. The
+logical disjunction is an associative, commutative operator that can
+be calculated in $\Theta(1)$ time. Therefore, set membership is a
+decomposable search problem.
+\end{proof}
+
+For the inverse merge operator, $\Delta$, it is necessary that $F(A,
+r) ~\Delta~F(B, r)$ be true \emph{if and only if} $r \in A$ and $r \not\in
+B$. Thus, it could be directly implemented as $F(A, r)~\Delta~F(B, r) =
+F(A, r) \land \neg F(B, r)$, which is constant time if
+the operands are already known.
+
+Thus, we have shown that set membership is a decomposable search problem,
+and that a constant time $\Delta$ operator exists. Therefore, it is an
+invertible search problem.
+\end{proof}
+
+For search problems such as these, this technique allows for deletes to be
+supported with the same cost as an insert. Unfortunately, it suffers from
+write amplification because each deleted record is recorded twice: once
+in the main structure, and once in the ghost structure. This means that $n$
+is, in effect, the total number of records and deletes. This can lead
+to some serious problems; for example, if every record in a structure
+of $n$ records is deleted, the net result will be an ``empty'' dynamized
+data structure containing $2n$ physical records within it. To circumvent
+this problem, Bentley and Saxe proposed a mechanism of setting a maximum
+threshold for the size of the ghost structure relative to the main one,
+and performing a complete re-partitioning of the data once this threshold
+is reached, removing all deleted records from the main structure,
+emptying the ghost structure, and rebuilding blocks with the records
+that remain according to the invariants of the technique.
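+
+A possible (hypothetical) form of this trigger, sketched in Python with
+invented helper names (\texttt{record\_count}, \texttt{all\_records},
+\texttt{rebuild\_from}) standing in for whatever bookkeeping the
+dynamization actually provides,
+\begin{verbatim}
+# Sketch: re-partition once deletes exceed a tunable threshold.
+def maybe_repartition(dyn, threshold=0.5):
+    if dyn.ghost.record_count() > threshold * dyn.main.record_count():
+        live = set(dyn.main.all_records()) - set(dyn.ghost.all_records())
+        dyn.rebuild_from(live)   # empties the ghost, rebuilds main blocks
+\end{verbatim}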
+
+\subsubsection{Weak Deletes for Deletion Decomposable Search Problems}
+
+Another approach for supporting deletes was proposed later, by Overmars
+and van Leeuwen, for a class of search problem called \emph{deletion
+decomposable}. These are decomposable search problems for which the
+underlying data structure supports a delete operation. More formally,
+
+\begin{definition}[Deletion Decomposable Search Problem~\cite{merge-dsp}]
+ A decomposable search problem, $F$, and its data structure,
+ $\mathcal{I}$, is deletion decomposable if and only if, for some
+ instance $\mathscr{I} \in \mathcal{I}$, containing $n$ records,
+ there exists a deletion routine $\mathtt{delete}(\mathscr{I},
+ r)$ that removes some $r \in \mathcal{D}$ in time $D(n)$ without
+ increasing the query time, deletion time, or storage requirement,
+ for $\mathscr{I}$.
+\end{definition}
+
+Superficially, this doesn't appear very useful. If the underlying data
+structure already supports deletes, there isn't much reason to use a
+dynamization technique to add deletes to it. However, one point worth
+mentioning is that it is possible, in many cases, to easily \emph{add}
+delete support to a static structure. If it is possible to locate a
+record and somehow mark it as deleted, without removing it from the
+structure, and then efficiently ignore these records while querying,
+then the given structure and its search problem can be said to be
+deletion decomposable. This technique for deleting records is called
+\emph{weak deletes}.
+
+\begin{definition}[Weak Deletes~\cite{overmars81}]
+\label{def:weak-delete}
+A data structure is said to support weak deletes if it provides a
+routine, \texttt{delete}, that guarantees that after $\alpha \cdot n$
+deletions, where $\alpha < 1$, the query cost is bounded by $k_\alpha
+\mathscr{Q}(n)$ for some constant $k_\alpha$ dependent only upon $\alpha$,
+where $\mathscr{Q}(n)$ is the cost of answering the query against a
+structure upon which no weak deletes were performed.\footnote{
+ This paper also provides a similar definition for weak updates,
+ but these aren't of interest to us in this work, and so the above
+ definition was adapted from the original with the weak update
+ constraints removed.
+} The results of the query of a block containing weakly deleted records
+should be the same as the results would be against a block with those
+records removed.
+\end{definition}
+
+As an example of a deletion decomposable search problem, consider the set
+membership problem considered above (Definition~\ref{def:set-membership})
+where $\mathcal{I}$, the data structure used to answer queries of the
+search problem, is a hash map.\footnote{
+ While most hash maps are already dynamic, and so wouldn't need
+ dynamization to be applied, there do exist static ones too. For example,
+ the hash map being considered could be implemented using perfect
+ hashing~\cite{perfect-hashing}, which has many static implementations.
+}
+
+\begin{theorem}
+ The set membership problem, answered using a static hash map, is
+ deletion decomposable.
+\end{theorem}
+
+\begin{proof}
+We've already shown in Lemma~\ref{lem:set-memb-dsp} that set membership
+is a decomposable search problem. For it to be deletion decomposable,
+we must demonstrate that the hash map, $\mathcal{I}$, supports deleting
+records without hurting its query performance, delete performance, or
+storage requirements. Assume that an instance $\mathscr{I} \in
+\mathcal{I}$ having $|\mathscr{I}| = n$ can answer queries in
+$\mathscr{Q}(n) \in \Theta(1)$ time and requires $\Omega(n)$ storage.
+
+Such a structure can support weak deletes. Each record within the
+structure has a single bit attached to it, indicating whether it has
+been deleted or not. These bits will require $\Theta(n)$ storage and
+be initialized to 0 when the structure is constructed. A delete can
+be performed by querying the structure for the record to be deleted in
+$\Theta(1)$ time, and setting the bit to 1 if the record is found. This
+operation has $D(n) \in \Theta(1)$ cost.
+
+\begin{lemma}
+\label{lem:weak-deletes}
+The delete procedure as described above satisfies the requirements of
+Definition~\ref{def:weak-delete} for weak deletes.
+\end{lemma}
+\begin{proof}
+Per Definition~\ref{def:weak-delete}, there must exist some constant
+dependent only on $\alpha$, $k_\alpha$, such that after $\alpha \cdot
+n$ deletes against $\mathscr{I}$ with $\alpha < 1$, the query cost is
+bounded by $k_\alpha \mathscr{Q}(n)$.
+
+In this case, $\mathscr{Q}(n) \in \Theta(1)$, and therefore our final
+query cost must be bounded by $\Theta(k_\alpha)$. When a query is
+executed against $\mathscr{I}$, there are three possible cases,
+\begin{enumerate}
+\item The record being searched for does not exist in $\mathscr{I}$. In
+this case, the query result is 0.
+\item The record being searched for does exist in $\mathscr{I}$ and has
+a delete bit value of 0. In this case, the query result is 1.
+\item The record being searched for does exist in $\mathscr{I}$ and has
+a delete bit value of 1 (i.e., it has been deleted). In this case, the
+query result is 0.
+\end{enumerate}
+In all three cases, the addition of deletes requires only $\Theta(1)$
+extra work at most. Therefore, set membership over a static hash map
+using our proposed deletion mechanism satisfies the requirements for
+weak deletes, with $k_\alpha = 1$.
+\end{proof}
+
+Finally, we note that the cost of one of these weak deletes is $D(n)
+\in \Theta(\mathscr{Q}(n))$, and so by Lemma~\ref{lem:weak-deletes} the
+delete cost is also not asymptotically harmed by the presence of deleted
+records.
+
+Thus, we've shown that set membership using a static hash map is a
+decomposable search problem, the storage cost remains $\Omega(n)$ and the
+query and delete costs are unaffected by the presence of deletes using the
+proposed mechanism. All of the requirements of deletion decomposability
+are satisfied, therefore set membership using a static hash map is a
+deletion decomposable search problem.
+\end{proof}
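+
+A minimal sketch of such a weakly-deletable membership structure (ours;
+a Python dictionary built once stands in for the static hash map),
+\begin{verbatim}
+# Sketch of weak deletes for static set membership: one deletion bit
+# per record, flipped in place, and consulted at query time.
+class StaticMembership:
+    def __init__(self, records):
+        self.deleted = {r: False for r in records}   # built once, B(n)
+
+    def query(self, r):                  # membership test, Theta(1)
+        return r in self.deleted and not self.deleted[r]
+
+    def weak_delete(self, r):            # D(n) = Theta(Q(n)) = Theta(1)
+        if r in self.deleted:
+            self.deleted[r] = True
+\end{verbatim}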
+
+For such problems, deletes can be supported by first identifying the
+block in the dynamization containing the record to be deleted, and
+then calling $\mathtt{delete}$ on it. In order to allow this block to
+be easily located, it is possible to maintain a hash table over all
+of the records, alongside the dynamization, which maps each record
+onto the block containing it. This table must be kept up to date as
+reconstructions occur, but this can be done at no extra asymptotic cost
+for any data structure having $B(n) \in \Omega(n)$, as it requires only
+linear time. This allows for deletes to be performed in $\mathscr{D}(n)
+\in \Theta(D(n))$ time.
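+
+A sketch of this dispatch logic (ours, with hypothetical names) might
+look as follows, where \texttt{block\_of} is the auxiliary hash table
+mapping each record to the block that currently contains it,
+\begin{verbatim}
+# Sketch of delete dispatch within a dynamization supporting weak
+# deletes. block_of is rebuilt alongside the blocks on reconstruction.
+def dynamized_delete(block_of, r):
+    block = block_of.get(r)
+    if block is not None:
+        block.weak_delete(r)    # D(n) cost within the located block
+        del block_of[r]
+\end{verbatim}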
+
+The presence of deleted records within the structure does introduce a
+new problem, however. Over time, the number of records in each block will
+drift away from the requirements imposed by the dynamization technique. It
+will eventually become necessary to re-partition the records to restore
+these invariants, which are necessary for bounding the number of blocks,
+and thereby the query performance. The particular invariant maintenance
+rules depend upon the decomposition scheme used.
+
+\Paragraph{Bentley-Saxe Method.} When creating a BSM dynamization for
+a deletion decomposable search problem, the $i$th block where $i \geq 2$\footnote{
+ Block $i=0$ will only ever have one record, so no special maintenance must be
+ done for it. A delete will simply empty it completely.
+},
+in the absence of deletes, will contain $2^{i-1} + 1$ records. When a
+delete occurs in block $i$, no special action is taken until the number
+of records in that block falls below $2^{i-2}$. Once this threshold is
+reached, a reconstruction can be performed to restore the appropriate
+record counts in each block~\cite{merge-dsp}.
+
+\Paragraph{Equal Block Method.} For the equal block method, there are
+two cases in which a delete may cause a block to fail to obey the method's
+size invariants,
+\begin{enumerate}
+ \item If enough records are deleted, it is possible for the number
+ of blocks to exceed $f(2n)$, violating Invariant~\ref{ebm-c1}.
+ \item The deletion of records may cause the maximum size of each
+ block to shrink, causing some blocks to exceed the maximum capacity
+ of $\nicefrac{2n}{s}$. This is a violation of Invariant~\ref{ebm-c2}.
+\end{enumerate}
+In both cases, it should be noted that $n$ is decreased as records are
+deleted. Should either of these cases emerge as a result of a delete,
+the entire structure must be reconfigured to ensure that its invariants
+are maintained. This reconfiguration follows the same procedure as when
+an insert results in a violation: $s$ is updated to be exactly $f(n)$, all
+existing blocks are unbuilt, and then the records are evenly redistributed
+into the $s$ blocks~\cite{overmars-art-of-dyn}.
+
+
+\subsection{Worst-Case Optimal Techniques}
\section{Limitations of Classical Dynamization Techniques}
diff --git a/chapters/background.tex.bak b/chapters/background.tex.bak
index 78f4a30..c27c790 100644
--- a/chapters/background.tex.bak
+++ b/chapters/background.tex.bak
@@ -14,7 +14,7 @@ indices will be discussed briefly. Indices are the primary use of data
structures within the database context that is of interest to our work.
Following this, existing theoretical results in the area of data structure
dynamization will be discussed, which will serve as the building blocks
-for our techniques in subsquent chapters. The chapter will conclude with
+for our techniques in subsequent chapters. The chapter will conclude with
a discussion of some of the limitations of these existing techniques.
\section{Queries and Search Problems}
@@ -62,7 +62,7 @@ As an example of using these definitions, a \emph{membership test}
or \emph{range scan} would be considered search problems, and a range
scan over the interval $[10, 99]$ would be a query. We've drawn this
distinction because, as we'll see as we enter into the discussion of
-our work in later chapters, it is useful to have seperate, unambiguous
+our work in later chapters, it is useful to have separate, unambiguous
terms for these two concepts.
\subsection{Decomposable Search Problems}
@@ -85,6 +85,7 @@ their work on dynamization, and we will adopt their definition,
\begin{equation*}
F(A \cup B, q) = F(A, q)~ \mergeop ~F(B, q)
\end{equation*}
+ for all $A, B \in \mathcal{PS}(\mathcal{D})$ where $A \cap B = \emptyset$.
\end{definition}
The requirement for $\mergeop$ to be constant-time was used by Bentley and
@@ -94,13 +95,14 @@ and later work by Overmars lifted this constraint and considered a more
general class of search problems called \emph{$C(n)$-decomposable search
problems},
-\begin{definition}[$C(n)$-decomposable Search Problem~\cite{overmars83}]
+\begin{definition}[$C(n)$-decomposable Search Problem~\cite{overmars-cn-decomp}]
A search problem $F: (\mathcal{D}, \mathcal{Q}) \to \mathcal{R}$ is $C(n)$-decomposable
if and only if there exists an $O(C(n))$-time computable, associative,
and commutative binary operator $\mergeop$ such that,
\begin{equation*}
F(A \cup B, q) = F(A, q)~ \mergeop ~F(B, q)
\end{equation*}
+ for all $A, B \in \mathcal{PS}(\mathcal{D})$ where $A \cap B = \emptyset$.
\end{definition}
To demonstrate that a search problem is decomposable, it is necessary to
@@ -111,12 +113,14 @@ decomposable even in cases with more than two partial results.
As an example, consider range counts,
\begin{definition}[Range Count]
+ \label{def:range-count}
Let $d$ be a set of $n$ points in $\mathbb{R}$. Given an interval,
$ q = [x, y],\quad x,y \in \mathbb{R}$, a range count returns
the cardinality, $|d \cap q|$.
\end{definition}
\begin{theorem}
+\label{ther:decomp-range-count}
Range Count is a decomposable search problem.
\end{theorem}
@@ -128,7 +132,7 @@ Definition~\ref{def:dsp}, gives
\end{align*}
which is true by the distributive property of union and
intersection. Addition is an associative and commutative
-operator that can be calculated in $O(1)$ time. Therefore, range counts
+operator that can be calculated in $\Theta(1)$ time. Therefore, range counts
are DSPs.
\end{proof}
@@ -144,7 +148,7 @@ The calculation of the arithmetic mean of a set of numbers is a DSP.
Consider the search problem $A:\mathcal{D} \to (\mathbb{R}, \mathbb{Z})$,
where $\mathcal{D}\subset\mathbb{R}$ and is a multi-set. The output tuple
contains the sum of the values within the input set, and the
-cardinality of the input set. For two disjoint paritions of the data,
+cardinality of the input set. For two disjoint partitions of the data,
$D_1$ and $D_2$, let $A(D_1) = (s_1, c_1)$ and $A(D_2) = (s_2, c_2)$. Let
$A(D_1) \mergeop A(D_2) = (s_1 + s_2, c_1 + c_2)$.
@@ -374,15 +378,18 @@ database indices. We refer to a data structure with update support as
contain header information (like visibility) that is updated in place.
}
-This section discusses \emph{dynamization}, the construction of a dynamic
-data structure based on an existing static one. When certain conditions
-are satisfied by the data structure and its associated search problem,
-this process can be done automatically, and with provable asymptotic
-bounds on amortized insertion performance, as well as worst case query
-performance. We will first discuss the necessary data structure
-requirements, and then examine several classical dynamization techniques.
-The section will conclude with a discussion of delete support within the
-context of these techniques.
+This section discusses \emph{dynamization}, the construction of a
+dynamic data structure based on an existing static one. When certain
+conditions are satisfied by the data structure and its associated
+search problem, this process can be done automatically, and with
+provable asymptotic bounds on amortized insertion performance, as well
+as worst-case query performance. This is in contrast to the manual
+design of dynamic data structures, which involves techniques based on
+partially rebuilding small portions of a single data structure (called
+\emph{local reconstruction})~\cite{overmars83}. This is a very high-cost
+intervention that requires significant effort on the part of the data
+structure designer, whereas conventional dynamization can be performed
+with little-to-no modification of the underlying data structure at all.
It is worth noting that there are a variety of techniques
discussed in the literature for dynamizing structures with specific
@@ -393,6 +400,18 @@ of insert and query operations~\cite{batched-decomposable}. This
section discusses techniques that are more general, and don't require
workload-specific assumptions.
+We will first discuss the necessary data structure requirements, and
+then examine several classical dynamization techniques. The section
+will conclude with a discussion of delete support within the context
+of these techniques. For more detail than is included in this chapter,
+Overmars wrote a book providing a comprehensive survey of techniques for
+creating dynamic data structures, including not only the dynamization
+techniques discussed here, but also local reconstruction based
+techniques and more~\cite{overmars83}.\footnote{
+ Sadly, this book isn't readily available in
+ digital format as of the time of writing.
+}
+
\subsection{Global Reconstruction}
@@ -410,13 +429,13 @@ possible if $\mathcal{I}$ supports the following two operations,
\end{align*}
where $\mathtt{build}$ constructs an instance $\mathscr{I}\in\mathcal{I}$
over the data structure over a set of records $d \subseteq \mathcal{D}$
-in $C(|d|)$ time, and $\mathtt{unbuild}$ returns the set of records $d
+in $B(|d|)$ time, and $\mathtt{unbuild}$ returns the set of records $d
\subseteq \mathcal{D}$ used to construct $\mathscr{I} \in \mathcal{I}$ in
$\Theta(1)$ time,\footnote{
There isn't any practical reason why $\mathtt{unbuild}$ must run
in constant time, but this is the assumption made in \cite{saxe79}
and in subsequent work based on it, and so we will follow the same
- defininition here.
+ definition here.
} such that $\mathscr{I} = \mathtt{build}(\mathtt{unbuild}(\mathscr{I}))$.
Given this structure, an insert of record $r \in \mathcal{D}$ into a
@@ -426,7 +445,7 @@ data structure $\mathscr{I} \in \mathcal{I}$ can be defined by,
\end{align*}
It goes without saying that this operation is sub-optimal, as the
-insertion cost is $\Theta(C(n))$, and $C(n) \in \Omega(n)$ at best for
+insertion cost is $\Theta(B(n))$, and $B(n) \in \Omega(n)$ at best for
most data structures. However, this global reconstruction strategy can
be used as a primitive for more sophisticated techniques that can provide
reasonable performance.
@@ -436,19 +455,19 @@ reasonable performance.
The problem with global reconstruction is that each insert must rebuild
the entire data structure, involving all of its records. This results
-in a worst-case insert cost of $\Theta(C(n))$. However, opportunities
+in a worst-case insert cost of $\Theta(B(n))$. However, opportunities
for improving this scheme can present themselves when considering the
\emph{amortized} insertion cost.
-Consider the cost acrrued by the dynamized structure under global
+Consider the cost accrued by the dynamized structure under global
reconstruction over the lifetime of the structure. Each insert will result
in all of the existing records being rewritten, so at worst each record
will be involved in $\Theta(n)$ reconstructions, each reconstruction
-having $\Theta(C(n))$ cost. We can amortize this cost over the $n$ records
+having $\Theta(B(n))$ cost. We can amortize this cost over the $n$ records
inserted to get an amortized insertion cost for global reconstruction of,
\begin{equation*}
-I_a(n) = \frac{C(n) \cdot n}{n} = C(n)
+I_a(n) = \frac{B(n) \cdot n}{n} = B(n)
\end{equation*}
This doesn't improve things as is; however, it does present two
@@ -457,9 +476,9 @@ the reconstructions, or the number of times a record is reconstructed,
then we could reduce the amortized insertion cost.
The key insight, first discussed by Bentley and Saxe, is that
-this goal can be accomplished by \emph{decomposing} the data
-structure into multiple, smaller structures, each built from a
-disjoint partition of the data. As long as the search problem
+both of these goals can be accomplished by \emph{decomposing} the
+data structure into multiple, smaller structures, each built from
+a disjoint partition of the data. As long as the search problem
being considered is decomposable, queries can be answered from
this structure with bounded worst-case overhead, and the amortized
insertion cost can be improved~\cite{saxe79}. Significant theoretical
@@ -468,21 +487,34 @@ data structure~\cite{saxe79, overmars81, overmars83} and for leveraging
specific efficiencies of the data structures being considered to improve
these reconstructions~\cite{merge-dsp}.
-There are two general decomposition techniques that emerged from this
-work. The earliest of these is the logarithmic method, often called
-the Bentley-Saxe method in modern literature, and is the most commonly
-discussed technique today. A later technique, the equal block method,
-was also examined. It is generally not as effective as the Bentley-Saxe
-method, but it has some useful properties for explainatory purposes and
-so will be discussed here as well.
-
-\subsection{Equal Block Method~\cite[pp.~96-100]{overmars83}}
+There are two general decomposition techniques that emerged from
+this work. The earliest of these is the logarithmic method, often
+called the Bentley-Saxe method in modern literature, which is the most
+commonly discussed technique today. The Bentley-Saxe method has been
+directly applied in a few instances in the literature, such as to
+metric indexing structures~\cite{naidan14} and spatial structures~\cite{bkdtree},
+and has also been used in a modified form for genetic sequence search
+structures~\cite{almodaresi23} and graphs~\cite{lsmgraph}, to cite a few
+examples.
+
+A later technique, the equal block method, was also developed. It is
+generally not as effective as the Bentley-Saxe method, and as a result we
+have not identified any specific applications of this technique outside
+of the theoretical literature. However, we will discuss it as well in
+the interest of completeness, and because it does lend itself well to
+demonstrating certain properties of decomposition-based dynamization
+techniques.
+
+\subsection{Equal Block Method}
\label{ssec:ebm}
Though chronologically later, the equal block method is theoretically a
bit simpler, and so we will begin our discussion of decomposition-based
-technique for dynamization of decomposable search problems with it. The
-core concept of the equal block method is to decompose the data structure
+techniques for dynamization of decomposable search problems with it. There
+have been several proposed variations of this concept~\cite{maurer79,
+maurer80}, but we will focus on the most developed form as described by
+Overmars and van Leeuwen~\cite{overmars-art-of-dyn, overmars83}. The core
+concept of the equal block method is to decompose the data structure
into several smaller data structures, called blocks, over partitions
of the data. This decomposition is performed such that each block is of
roughly equal size.
@@ -497,7 +529,7 @@ to be governed by a smooth, monotonically increasing function $f(n)$ such
that, at any point, the following two constraints are obeyed.
\begin{align}
f\left(\frac{n}{2}\right) \leq s \leq f(2n) \label{ebm-c1}\\
- \forall_{1 \leq j \leq s} \quad | \mathscr{I}_j | \leq \frac{2n}{i} \label{ebm-c2}
+ \forall_{1 \leq j \leq s} \quad | \mathscr{I}_j | \leq \frac{2n}{s} \label{ebm-c2}
\end{align}
where $|\mathscr{I}_j|$ is the number of records in the block,
$|\text{unbuild}(\mathscr{I}_j)|$.
@@ -512,7 +544,7 @@ Following an insert, it is possible that Constraint~\ref{ebm-c1} is violated.\fo
Constraint~\ref{ebm-c2} cannot be violated by inserts, but may be
violated by deletes. We're omitting deletes from the discussion at
this point, but will circle back to them in Section~\ref{sec:deletes}.
-} In this case, the constraints are enforced by "reconfiguring" the
+} In this case, the constraints are enforced by "re-configuring" the
structure. $s$ is updated to be exactly $f(n)$, all of the existing
blocks are unbuilt, and then the records are redistributed evenly into
$s$ blocks.
@@ -526,16 +558,16 @@ where $F(\mathscr{I}, q)$ is a slight abuse of notation, referring to
answering the query over $d$ using the data structure $\mathscr{I}$.
This technique provides better amortized performance bounds than global
-reconstruction, at the possible cost of increased query performance for
+reconstruction, at the possible cost of worse query performance for
sub-linear queries. We'll omit the details of the proof of performance
for brevity and streamline some of the original notation (full details
can be found in~\cite{overmars83}), but this technique ultimately
-results in a data structure with the following performance characterstics,
+results in a data structure with the following performance characteristics,
\begin{align*}
-\text{Amortized Insertion Cost:}&\quad \Theta\left(\frac{C(n)}{n} + C\left(\frac{n}{f(n)}\right)\right) \\
+\text{Amortized Insertion Cost:}&\quad \Theta\left(\frac{B(n)}{n} + B\left(\frac{n}{f(n)}\right)\right) \\
\text{Worst-case Query Cost:}& \quad \Theta\left(f(n) \cdot \mathscr{Q}\left(\frac{n}{f(n)}\right)\right) \\
\end{align*}
-where $C(n)$ is the cost of statically building $\mathcal{I}$, and
+where $B(n)$ is the cost of statically building $\mathcal{I}$, and
$\mathscr{Q}(n)$ is the cost of answering $F$ using $\mathcal{I}$.
%TODO: example?
@@ -597,18 +629,35 @@ structure in the same way that incrementing the binary number by $1$ does.
By applying BSM to a data structure, a dynamized structure can be created
with the following performance characteristics,
\begin{align*}
-\text{Amortized Insertion Cost:}&\quad \Theta\left(\left(\frac{C(n)}{n}\cdot \log_2 n\right)\right) \\
-\text{Worst Case Insertion Cost:}&\quad \Theta\left(C(n)\right) \\
+\text{Amortized Insertion Cost:}&\quad \Theta\left(\left(\frac{B(n)}{n}\cdot \log_2 n\right)\right) \\
+\text{Worst Case Insertion Cost:}&\quad \Theta\left(B(n)\right) \\
\text{Worst-case Query Cost:}& \quad \Theta\left(\log_2 n\cdot \mathscr{Q}\left(n\right)\right) \\
\end{align*}
This is a particularly attractive result because, for example, a data
-structure having $C(n) \in \Theta(n)$ will have an amortized insertion
-cost of $\log_2 (n)$, which is quite reasonable. The cost is an extra
-logarithmic multiple attached to the query complexity. It is also worth
-noting that the worst-case insertion cost remains the same as global
-reconstruction, but this case arises only very rarely. If you consider the
-binary decomposition representation, the worst-case behavior is triggered
-each time the existing number overflows, and a new digit must be added.
+structure having $B(n) \in \Theta(n)$ will have an amortized insertion
+cost of $\Theta(\log_2 n)$, which is quite reasonable. The trade-off for this
+is an extra logarithmic multiple attached to the query complexity. It is
+also worth noting that the worst-case insertion cost remains the same
+as global reconstruction, but this case arises only very rarely. If
+you consider the binary decomposition representation, the worst-case
+behavior is triggered each time the existing number overflows, and a
+new digit must be added.
+
+As a final note about the query performance of this structure, because
+the overhead due to querying the blocks is logarithmic, under certain
+circumstances this cost can be absorbed, resulting in no effect on the
+asymptotic worst-case query performance. As an example, consider a linear
+scan of the data running in $\Theta(n)$ time. In this case, every record
+must be considered, and so there isn't any performance penalty\footnote{
+ From an asymptotic perspective. There will still be measurable performance
+ effects from caching, etc., even in this case.
+} to breaking the records out into multiple chunks and scanning them
+individually. More formally, for any query running in $\mathscr{Q}(n) \in
+\Omega\left(n^\epsilon\right)$ time where $\epsilon > 0$, the worst-case
+cost of answering a decomposable search problem from a BSM dynamization
+is $\Theta\left(\mathscr{Q}(n)\right)$~\cite{saxe79}.
+
+\subsection{Merge Decomposable Search Problems}
\subsection{Delete Support}
@@ -624,7 +673,7 @@ deleted from the structure in $C(n)$ time as follows,
However, supporting deletes within the dynamization schemes discussed
above is more complicated. The core problem is that inserts affect the
dynamized structure in a deterministic way, and as a result certain
-partionining schemes can be leveraged to reason about the
+partitioning schemes can be leveraged to reason about the
performance. But, deletes do not work like this.
\begin{figure}
@@ -649,6 +698,290 @@ This presents several problems,
require additional work to fix.
\end{itemize}
+To resolve these difficulties, two very different approaches have been
+proposed for supporting deletes, each of which relies on certain properties
+of the search problem and data structure. These are the use of a ghost
+structure and weak deletes.
+
+\subsubsection{Ghost Structure for Invertible Search Problems}
+
+The first proposed mechanism for supporting deletes was discussed
+alongside the Bentley-Saxe method in Bentley and Saxe's original
+paper. This technique applies to a class of search problems called
+\emph{invertible} (also called \emph{decomposable counting problems}
+in later literature~\cite{overmars83}). Invertible search problems
+are decomposable, and also support an ``inverse'' merge operator, $\Delta$,
+that is able to remove records from the result set. More formally,
+\begin{definition}[Invertible Search Problem~\cite{saxe79}]
+\label{def:invert}
+A decomposable search problem, $F$, is invertible if and only if there
+exists a constant-time computable operator, $\Delta$, such that
+\begin{equation*}
+F(A / B, q) = F(A, q)~\Delta~F(B, q)
+\end{equation*}
+for all $A, B \in \mathcal{PS}(\mathcal{D})$ where $A \cap B = \emptyset$.
+\end{definition}
+
+Given a search problem with this property, it is possible to perform
+deletes by creating a secondary ``ghost'' structure. When a record
+is to be deleted, it is inserted into this structure. Then, when the
+dynamization is queried, this ghost structure is queried as well as the
+main one. The results from the ghost structure can be removed from the
+result set using the inverse merge operator. This simulates the result
+that would have been obtained had the records been physically removed
+from the main structure.
+
+Two examples of invertible search problems are set membership
+and range count. Range count was formally defined in
+Definition~\ref{def:range-count}.
+
+\begin{theorem}
+Range count is an invertible search problem.
+\end{theorem}
+
+\begin{proof}
+To prove that range count is an invertible search problem, it must be
+decomposable and have a $\Delta$ operator. That it is a DSP has already
+been proven in Theorem~\ref{ther:decomp-range-count}.
+
+Let $\Delta$ be subtraction $(-)$. Applying this to Definition~\ref{def:invert}
+gives,
+\begin{equation*}
+|(A / B) \cap q | = |(A \cap q) / (B \cap q)| = |(A \cap q)| - |(B \cap q)|
+\end{equation*}
+which is true by the distributive property of set difference and
+intersection. Subtraction is computable in constant time, therefore
+range count is an invertible search problem using subtraction as $\Delta$.
+\end{proof}
+
+The set membership search problem is defined as follows,
+\begin{definition}[Set Membership]
+\label{def:set-membership}
+Consider a set of elements $d \subseteq \mathcal{D}$ from some domain,
+and a single element $r \in \mathcal{D}$. A test of set membership is a
+search problem of the form $F: (\mathcal{PS}(\mathcal{D}), \mathcal{D})
+\to \mathbb{B}$ such that $F(d, r) = r \in d$, which maps to $0$ if $r
+\not\in d$ and $1$ if $r \in d$.
+\end{definition}
+
+\begin{theorem}
+Set membership is an invertible search problem.
+\end{theorem}
+
+\begin{proof}
+To prove that set membership is invertible, it is necessary to establish
+that it is a decomposable search problem, and that a $\Delta$ operator
+exists. We'll begin with the former.
+\begin{lemma}
+ \label{lem:set-memb-dsp}
+ Set membership is a decomposable search problem.
+\end{lemma}
+\begin{proof}
+Let $\mergeop$ be the logical disjunction ($\lor$). This yields,
+\begin{align*}
+F(A \cup B, r) &= F(A, r) \lor F(B, r) \\
+r \in (A \cup B) &= (r \in A) \lor (r \in B)
+\end{align*}
+which is true, following directly from the definition of union. The
+logical disjunction is an associative, commutative operator that can
+be calculated in $\Theta(1)$ time. Therefore, set membership is a
+decomposable search problem.
+\end{proof}
+
+For the inverse merge operator, $\Delta$, it is necessary that $F(A,
+r) ~\Delta~F(B, r)$ be true \emph{if and only if} $r \in A$ and $r \not\in
+B$. Thus, it could be directly implemented as $F(A, r)~\Delta~F(B, r) =
+F(A, r) \land \neg F(B, r)$, which is constant time if
+the operands are already known.
+
+Thus, we have shown that set membership is a decomposable search problem,
+and that a constant time $\Delta$ operator exists. Therefore, it is an
+invertible search problem.
+\end{proof}
+
+For search problems such as these, this technique allows for deletes to be
+supported with the same cost as an insert. Unfortunately, it suffers from
+write amplification because each deleted record is recorded twice--one in
+the main structure, and once in the ghost structure. This means that $n$
+is, in effect, the total number of records and deletes. This can lead
+to some serious problems, for example if every record in a structure
+of $n$ records is deleted, the net result will be an "empty" dynamized
+data structure contaning $2n$ physical records within it. To circumvent
+this problem, Bentley and Saxe proposed a mechanism of setting a maximum
+threshold for the size of the ghost structure relative to the main one,
+and performing a complete re-partitioning of the data once this threshold
+is reached, removing all deleted records from the main structure,
+emptying the ghost structure, and rebuilding blocks with the records
+that remain according to the invariants of the technique.
+
+\subsubsection{Weak Deletes for Deletion Decomposable Search Problems}
+
+Another approach for supporting deletes was proposed later, by Overmars
+and van Leeuwen, for a class of search problems called \emph{deletion
+decomposable}. These are decomposable search problems for which the
+underlying data structure supports a delete operation. More formally,
+
+\begin{definition}[Deletion Decomposable Search Problem~\cite{merge-dsp}]
+ A decomposable search problem, $F$, and its data structure,
+ $\mathcal{I}$, is deletion decomposable if and only if, for some
+ instance $\mathscr{I} \in \mathcal{I}$, containing $n$ records,
+ there exists a deletion routine $\mathtt{delete}(\mathscr{I},
+ r)$ that removes some $r \in \mathcal{D}$ in time $D(n)$ without
+ increasing the query time, deletion time, or storage requirement,
+ for $\mathscr{I}$.
+\end{definition}
+
+Superficially, this doesn't appear very useful. If the underlying data
+structure already supports deletes, there isn't much reason to use a
+dynamization technique to add deletes to it. However, one point worth
+mentioning is that it is possible, in many cases, to easily \emph{add}
+delete support to a static structure. If it is possible to locate a
+record and somehow mark it as deleted, without removing it from the
+structure, and then efficiently ignore these records while querying,
+then the given structure and its search problem can be said to be
+deletion decomposable. This technique for deleting records is called
+\emph{weak deletes}.
+
+\begin{definition}[Weak Deletes~\cite{overmars81}]
+\label{def:weak-delete}
+A data structure is said to support weak deletes if it provides a
+routine, \texttt{delete}, that guarantees that after $\alpha \cdot n$
+deletions, where $\alpha < 1$, the query cost is bounded by $k_\alpha
+\mathscr{Q}(n)$ for some constant $k_\alpha$ dependent only upon $\alpha$,
+where $\mathscr{Q}(n)$ is the cost of answering the query against a
+structure upon which no weak deletes were performed.\footnote{
+ This paper also provides a similar definition for weak updates,
+ but these aren't of interest to us in this work, and so the above
+ definition was adapted from the original with the weak update
+ constraints removed.
+} The results of the query of a block containing weakly deleted records
+should be the same as the results would be against a block with those
+records removed.
+\end{definition}
+
+As an example of a deletion decomposable search problem, consider the set
+membership problem considered above (Definition~\ref{def:set-membership})
+where $\mathcal{I}$, the data structure used to answer queries of the
+search problem, is a hash map.\footnote{
+ While most hash maps are already dynamic, and so wouldn't need
+ dynamization to be applied, there do exist static ones too. For example,
+ the hash map being considered could be implemented using perfect
+ hashing~\cite{perfect-hashing}, which has many static implementations.
+}
+
+\begin{theorem}
+ The set membership problem, answered using a static hash map, is
+ deletion decomposable.
+\end{theorem}
+
+\begin{proof}
+We've already shown in Lemma~\ref{lem:set-memb-dsp} that set membership
+is a decomposable search problem. For it to be deletion decomposable,
+we must demonstrate that the hash map, $\mathcal{I}$, supports deleting
+records without hurting its query performance, delete performance, or
+storage requirements. Assume that an instance $\mathscr{I} \in
+\mathcal{I}$ having $|\mathscr{I}| = n$ can answer queries in
+$\mathscr{Q}(n) \in \Theta(1)$ time and requires $\Omega(n)$ storage.
+
+Such a structure can support weak deletes. Each record within the
+structure has a single bit attached to it, indicating whether it has
+been deleted or not. These bits will require $\Theta(n)$ storage and
+be initialized to 0 when the structure is constructed. A delete can
+be performed by querying the structure for the record to be deleted in
+$\Theta(1)$ time, and setting the bit to 1 if the record is found. This
+operation has $D(n) \in \Theta(1)$ cost.
+
+\begin{lemma}
+\label{lem:weak-deletes}
+The delete procedure as described above satisfies the requirements of
+Definition~\ref{def:weak-delete} for weak deletes.
+\end{lemma}
+\begin{proof}
+Per Definition~\ref{def:weak-delete}, there must exist some constant
+$k_\alpha$, dependent only on $\alpha$, such that after $\alpha \cdot
+n$ deletes against $\mathscr{I}$ with $\alpha < 1$, the query cost is
+bounded by $k_\alpha \mathscr{Q}(n)$.
+
+In this case, $\mathscr{Q}(n) \in \Theta(1)$, and therefore our final
+query cost must be bounded by $\Theta(k_\alpha)$. When a query is
+executed against $\mathscr{I}$, there are three possible cases,
+\begin{enumerate}
+\item The record being searched for does not exist in $\mathscr{I}$. In
+this case, the query result is 0.
+\item The record being searched for does exist in $\mathscr{I}$ and has
+a delete bit value of 0. In this case, the query result is 1.
+\item The record being searched for does exist in $\mathscr{I}$ and has
+a delete bit value of 1 (i.e., it has been deleted). In this case, the
+query result is 0.
+\end{enumerate}
+In all three cases, the addition of deletes requires at most a constant
+amount of extra work. Therefore, set membership over a static hash map
+using our proposed deletion mechanism satisfies the requirements for
+weak deletes, with a constant $k_\alpha$ that is independent of $\alpha$.
+\end{proof}
+
+Finally, we note that the cost of one of these weak deletes is $D(n)
+= \mathscr{Q}(n) \in \Theta(1)$. By Lemma~\ref{lem:weak-deletes}, the
+query cost is not asymptotically harmed by the presence of deleted
+records, and so the delete cost is not harmed either.
+
+Thus, we have shown that set membership using a static hash map is a
+decomposable search problem, that the storage cost remains $\Theta(n)$,
+and that the query and delete costs are unaffected by the presence of
+deletes under the proposed mechanism. All of the requirements of deletion
+decomposability are therefore satisfied, and set membership using a
+static hash map is a deletion decomposable search problem.
+\end{proof}
+
+For such problems, deletes can be supported by first identifying the
+block in the dynamization containing the record to be deleted, and
+then calling $\mathtt{delete}$ on it. In order to allow this block to
+be easily located, it is possible to maintain a hash table over all
+of the records, alongside the dynamization, which maps each record
+onto the block containing it. This table must be kept up to date as
+reconstructions occur, but this can be done at no extra asymptotic cost
+for any data structure having $B(n) \in \Omega(n)$, as it requires only
+linear time. This allows for deletes to be performed in $\mathscr{D}(n)
+\in \Theta(D(n))$ time.
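+
+As a rough sketch of this bookkeeping, and under the assumption that each
+block exposes an iterable record set and its own weak-delete routine (the
+names below are illustrative, not taken from any later implementation),
+the auxiliary table might look as follows.
+
+\begin{verbatim}
+// Sketch: map each record to the block that currently holds it.
+#include <functional>
+#include <unordered_map>
+
+template <typename Block, typename Record,
+          typename Hash = std::hash<Record>>
+class DeleteIndex {
+public:
+    // Re-registered after every reconstruction, in linear time.
+    void register_block(Block *blk) {
+        for (const Record &r : blk->records())
+            owner_[r] = blk;
+    }
+
+    // Delete in Theta(D(n)): one expected-constant-time hash lookup,
+    // followed by the block's own weak-delete routine.
+    bool erase(const Record &r) {
+        auto it = owner_.find(r);
+        if (it == owner_.end()) return false;
+        return it->second->delete_record(r);
+    }
+
+private:
+    std::unordered_map<Record, Block *, Hash> owner_;
+};
+\end{verbatim}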
+
+The presence of deleted records within the structure does introduce a
+new problem, however. Over time, the number of records in each block
+will drift away from the sizes required by the dynamization technique. It
+will eventually become necessary to repartition the records to restore
+these invariants, which are needed to bound the number of blocks,
+and thereby the query performance. The particular invariant maintenance
+rules depend upon the decomposition scheme used.
+
+\Paragraph{Bentley-Saxe Method.} When creating a BSM dynamization for
+a deletion decomposable search problem, the $i$th block, where $i \geq 2$,\footnote{
+    Block $i=0$ will only ever have one record, so no special maintenance
+    is needed for it. A delete will simply empty it completely.
+}
+will, in the absence of deletes, contain $2^{i-1} + 1$ records. When a
+delete occurs in block $i$, no special action is taken until the number
+of records in that block falls below $2^{i-2}$. Once this threshold is
+reached, a reconstruction can be performed to restore the appropriate
+record counts in each block~\cite{merge-dsp}.
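+
+A minimal sketch of this maintenance check, assuming that
+\texttt{live\_counts[i]} tracks the number of non-deleted records in
+block $i$ (the function and variable names are hypothetical), is shown
+below; the caller would respond to a \texttt{true} result by performing
+the repartitioning reconstruction described above.
+
+\begin{verbatim}
+// Sketch: detect when a delete has violated the Bentley-Saxe
+// block-size invariant described in the text.
+#include <cstddef>
+#include <vector>
+
+bool needs_repartition(const std::vector<std::size_t> &live_counts) {
+    // Per the text, the maintenance rule applies only to blocks i >= 2.
+    for (std::size_t i = 2; i < live_counts.size(); ++i) {
+        std::size_t threshold = std::size_t{1} << (i - 2);  // 2^(i-2)
+        if (live_counts[i] > 0 && live_counts[i] < threshold)
+            return true;
+    }
+    return false;
+}
+\end{verbatim}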
+
+\Paragraph{Equal Block Method.} For the equal block method, there are
+two cases in which a delete may cause the structure to violate the
+method's size invariants,
+\begin{enumerate}
+ \item If enough records are deleted, it is possible for the number
+ of blocks to exceed $f(2n)$, violating Invariant~\ref{ebm-c1}.
+ \item The deletion of records may cause the maximum size of each
+ block to shrink, causing some blocks to exceed the maximum capacity
+ of $\nicefrac{2n}{s}$. This is a violation of Invariant~\ref{ebm-c2}.
+\end{enumerate}
+In both cases, it should be noted that $n$ is decreased as records are
+deleted. Should either of these cases emerge as a result of a delete,
+the entire structure must be reconfigured to ensure that its invariants
+are maintained. This reconfiguration follows the same procedure as when
+an insert results in a violation: $s$ is updated to be exactly $f(n)$, all
+existing blocks are unbuilt, and then the records are evenly redistributed
+into the $s$ blocks~\cite{overmars-art-of-dyn}.
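+
+The following sketch illustrates this reconfiguration step under some
+simplifying assumptions: \texttt{unbuild} and \texttt{build} stand in
+for the data structure's own routines, $f$ is the user-chosen block-count
+function, and all names are hypothetical rather than drawn from a
+concrete implementation.
+
+\begin{verbatim}
+// Sketch: equal block method reconfiguration after an invariant
+// violation. Records are pulled out of every block and evenly
+// redistributed across exactly s = f(n) new blocks.
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+template <typename Block, typename Record>
+std::vector<Block> reconfigure(std::vector<Block> &blocks,
+                               std::size_t (*f)(std::size_t)) {
+    std::vector<Record> records;
+    for (Block &b : blocks)
+        for (const Record &r : b.unbuild())
+            records.push_back(r);
+
+    std::size_t n = records.size();
+    std::size_t s = std::max<std::size_t>(f(n), 1);
+    std::size_t per_block = (n + s - 1) / s;   // ceil(n / s)
+
+    std::vector<Block> rebuilt;
+    for (std::size_t i = 0; i < n; i += per_block) {
+        std::size_t end = std::min(i + per_block, n);
+        rebuilt.push_back(Block::build(std::vector<Record>(
+            records.begin() + i, records.begin() + end)));
+    }
+    return rebuilt;
+}
+\end{verbatim}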
+
+
+\subsection{Worst-Case Optimal Techniques}
\section{Limitations of Classical Dynamization Techniques}
@@ -811,6 +1144,7 @@ cost, we could greatly reduce the cost of supporting $C(n)$-decomposable
queries.
\subsubsection{Independent Range Sampling}
+\label{ssec:background-irs}
Another problem that is not decomposable is independent sampling. There
are a variety of problems falling under this umbrella, including weighted
@@ -831,15 +1165,7 @@ matching of records in result sets. To work around this, a slight abuse
of definition is in order: assume that the equality conditions within
the DSP definition can be interpreted to mean ``the contents in the two
sets are drawn from the same distribution''. This enables the category
-of DSP to apply to this type of problem. More formally,
-\begin{definition}[Decomposable Sampling Problem]
- A sampling problem $F: (D, Q) \to R$, $F$ is decomposable if and
- only if there exists a constant-time computable, associative, and
- commutative binary operator $\mergeop$ such that,
- \begin{equation*}
- F(A \cup B, q) \sim F(A, q)~ \mergeop ~F(B, q)
- \end{equation*}
-\end{definition}
+of DSP to apply to this type of problem.
Even with this abuse, however, IRS cannot generally be considered
decomposable; it is at best $C(n)$-decomposable. The reason for this is
diff --git a/chapters/sigmod23/background.tex b/chapters/sigmod23/background.tex
index b4ccbf1..af3b80a 100644
--- a/chapters/sigmod23/background.tex
+++ b/chapters/sigmod23/background.tex
@@ -37,12 +37,12 @@ have \emph{statistical independence} and for the distribution of records
in the sample set to match the distribution of source data set. This
requires that the sampling of a record does not affect the probability of
any other record being sampled in the future. Such sample sets are said
-to be drawn i.i.d (idendepently and identically distributed). Throughout
+to be drawn i.i.d (independently and identically distributed). Throughout
this chapter, the term "independent" will be used to describe both
statistical independence, and identical distribution.
Independence of sample sets is important because many useful statistical
-results are derived from assumping that the condition holds. For example,
+results are derived from assuming that the condition holds. For example,
it is a requirement for the application of statistical tools such as
the Central Limit Theorem~\cite{bulmer79}, which is the basis for many
concentration bounds. A failure to maintain independence in sampling
@@ -54,7 +54,7 @@ sampling} (IQS)~\cite{hu14}. In IQS, a sample set is constructed from a
specified number of records in the result set of a database query. In
this context, it isn't enough to ensure that individual records are
sampled independently; the sample sets from repeated queries must also be
-indepedent. This precludes, for example, caching and returning the same
+independent. This precludes, for example, caching and returning the same
sample set to multiple repetitions of the same query. This inter-query
independence provides a variety of useful properties, such as fairness
and representativeness of query results~\cite{tao22}.
@@ -194,7 +194,7 @@ call static sampling indices (SSIs) in this chapter,\footnote{
is based, which was published prior to our realization that a strong
distinction between an index and a data structure would be useful. I
am retaining the term SSI in this chapter for consistency with the
- original paper, but understand that in the termonology established in
+ original paper, but understand that in the terminology established in
Chapter~\ref{chap:background}, SSIs are data structures, not indices.
},
that are capable of answering sampling queries more efficiently than
@@ -216,7 +216,7 @@ per sample. Thus, a WSS query can be answered in $\Theta(k)$ time,
assuming the structure has already been built. Unfortunately, the alias
structure cannot be efficiently updated, as inserting new records would
change the relative weights of \emph{all} the records, and require fully
-repartitioning the structure.
+re-partitioning the structure.
While the alias method only applies to WSS, other sampling problems can
be solved by using the alias method within the context of a larger data
@@ -245,7 +245,7 @@ the alias structure with support for weight updates over a fixed set of
elements~\cite{hagerup93,matias03,allendorf23}. These approaches do not
allow the insertion or removal of new records, however, only in-place
weight updates. While in principle they could be constructed over the
-entire domain of possible records, with the weights of non-existant
+entire domain of possible records, with the weights of non-existent
records set to $0$, this is hardly practical. Thus, these structures are
not suited for the database sampling applications that are of interest to
us in this chapter.
diff --git a/chapters/sigmod23/exp-baseline.tex b/chapters/sigmod23/exp-baseline.tex
index da62766..5585c36 100644
--- a/chapters/sigmod23/exp-baseline.tex
+++ b/chapters/sigmod23/exp-baseline.tex
@@ -5,7 +5,7 @@ Olken's method on an aggregate B+Tree. We also examine the query performance
of a single instance of the SSI in question to establish how much query
performance is lost in the dynamization. Unless otherwise specified,
IRS and WIRS queries are run with a selectivity of $0.1\%$. Additionally,
-the \texttt{OSM} dataset was downsampled to 500 million records, except
+the \texttt{OSM} dataset was down-sampled to 500 million records, except
for scalability tests. The synthetic uniform and zipfian datasets were
generated with 1 billion records. As with the previous section, all
benchmarks began by warming up the structure with $10\%$ of the total
@@ -50,13 +50,13 @@ resulting in better performance.
\end{figure*}
In Figures~\ref{fig:wirs-insert} and \ref{fig:wirs-sample} we examine
-the performed of \texttt{DE-WIRS} compared to \text{AGG B+TreE} and an
+the performance of \texttt{DE-WIRS} compared to \texttt{AGG B+tree} and an
alias-augmented B+Tree. We see the same basic set of patterns in this
case as we did with WSS. \texttt{AGG B+Tree} defeats our dynamized
index on the \texttt{twitter} dataset, but loses on the others, in
terms of insertion performance. We can see that the alias-augmented
B+Tree is much more expensive to build than an alias structure, and
-so its insertion performance advantage is erroded somewhat compared to
+so its insertion performance advantage is eroded somewhat compared to
the dynamic structure. For queries we see that the \texttt{AGG B+Tree}
performs similarly for WIRS sampling as it did for WSS sampling, but the
alias-augmented B+Tree structure is quite a bit slower at WIRS than the
@@ -82,7 +82,7 @@ being introduced by the dynamization.
We next considered IRS queries. Figures~\ref{fig:irs-insert1} and
\ref{fig:irs-sample1} show the results of our testing of single-threaded
\texttt{DE-IRS} running in-memory against the in-memory ISAM Tree and
-\texttt{AGG B+treE}. The ISAM tree structure can be efficiently bulk-loaded,
+\texttt{AGG B+tree}. The ISAM tree structure can be efficiently bulk-loaded,
which results in a much faster construction time than the alias structure
or alias-augmented B+tree. This gives it a significant update performance
advantage, and we see in Figure~\ref{fig:irs-insert1} that \texttt{DE-IRS}
@@ -96,7 +96,7 @@ the performance differences.
We also consider the scalability of inserts, queries, and deletes, of
\texttt{DE-IRS} compared to \texttt{AGG B+tree} across a wide range of
data sizes. Figure~\ref{fig:irs-insert-s} shows that \texttt{DE-IRS}'s
-insertion performance scales similarly with datasize as the baseline, and
+insertion performance scales similarly with data size as the baseline, and
Figure~\ref{fig:irs-sample-s} tells a similar story for query performance.
Figure~\ref{fig:irs-delete-s} compares the delete performance of the
two structures, where \texttt{DE-IRS} is configured to use tagging. As
@@ -110,7 +110,7 @@ the B+tree is superior to \texttt{DE-IRS} because of the cost of the
preliminary processing that our dynamized structure must do to begin
to answer queries. However, as the sample set size increases, this cost
increasingly begins to pay off, with \texttt{DE-IRS} quickly defeating
-the dynamic structure in averge per-sample latency. One other interesting
+the dynamic structure in average per-sample latency. One other interesting
note is the performance of the static ISAM tree, which begins on-par with
the B+Tree, but also sees an improvement as the sample set size increases.
This is because of cache effects. During the initial tree traversal, both
diff --git a/chapters/sigmod23/exp-extensions.tex b/chapters/sigmod23/exp-extensions.tex
index 62f15f4..3d3f5b7 100644
--- a/chapters/sigmod23/exp-extensions.tex
+++ b/chapters/sigmod23/exp-extensions.tex
@@ -49,9 +49,9 @@ as additional insertion threads are added. Both plots show linear scaling
up to 3 or 4 threads, before the throughput levels off. Further, even
with as many as 32 threads, the system is able to maintain a stable
insertion throughput. Note that this implementation of concurrency
-is incredibly rudamentary, and doesn't take advantage of concurrent
+is incredibly rudimentary, and doesn't take advantage of concurrent
merging opportunities, among other things. An implementation with
support for this will be discussed in Chapter~\ref{chap:tail-latency},
-and shown to perform significantly better. Even with this rudamentary
+and shown to perform significantly better. Even with this rudimentary
implementation of concurrency, however, \texttt{DE-IRS} is able to
outperform \texttt{AB-tree} under all conditions tested.
diff --git a/chapters/sigmod23/exp-parameter-space.tex b/chapters/sigmod23/exp-parameter-space.tex
index d53c592..9583312 100644
--- a/chapters/sigmod23/exp-parameter-space.tex
+++ b/chapters/sigmod23/exp-parameter-space.tex
@@ -62,7 +62,7 @@ operations) reducing their effect on the overall throughput.
The influence of scale factor on update performance is shown in
Figure~\ref{fig:insert_sf}. The effect is different depending on the
-layout policy, with larger scale factors benefitting update performance
+layout policy, with larger scale factors benefiting update performance
under tiering, and hurting it under leveling. The effect of the mutable
buffer size on insertion, shown in Figure~\ref{fig:insert_mt}, is a little
less clear, but does show a slight upward trend, with larger buffers
@@ -86,7 +86,7 @@ effect on query performance. Thus, in this context, is would appear
that the scale factor is primarily useful as an insertion performance
tuning tool. The mutable buffer size, in Figure~\ref{fig:sample_mt},
also generally has no clear effect. This is expected, because the buffer
-contains onyl a small number of records relative to the entire dataset,
+contains only a small number of records relative to the entire dataset,
and so has a fairly low probability of being selected for drawing
a sample from. Even when it is selected, rejection sampling is very
inexpensive. The one exception to this trend is when using tombstones,
diff --git a/chapters/sigmod23/experiment.tex b/chapters/sigmod23/experiment.tex
index 4dbb4c2..727284a 100644
--- a/chapters/sigmod23/experiment.tex
+++ b/chapters/sigmod23/experiment.tex
@@ -28,7 +28,7 @@ added to records when testing dynamic baselines. Additionally, weighted
testing attached a 64-bit integer weight to each record. This weight was
not included in the record for non-weighted testing. The weights and
keys were both used directly from the datasets, and values were added
-seperately and unique to each record.
+separately and were unique to each record.
We used the following datasets for testing,
\begin{itemize}
@@ -75,13 +75,13 @@ method on an AGG-BTree.
\item \textbf{DE-WIRS.} An implementation of the dynamized alias-augmented
B+Tree~\cite{afshani17} as discussed in Section~\ref{ssec:wirs-struct} for
-weighted indepedent range sampling. We compare this against a WIRS
+weighted independent range sampling. We compare this against a WIRS
implementation of Olken's method on an AGG-BTree.
\end{itemize}
All of the tested structures, with the exception of the external memory
-DE-IRS implementation and AB-Tree, were wholely contained within system
+DE-IRS implementation and AB-Tree, were wholly contained within system
memory. AB-Tree is a native external structure, so for the in-memory
concurrency evaluation we configured it with enough cache to maintain
the entire structure in memory to simulate an in-memory implementation.\footnote{
diff --git a/chapters/sigmod23/extensions.tex b/chapters/sigmod23/extensions.tex
index 06d55a5..3a3cba3 100644
--- a/chapters/sigmod23/extensions.tex
+++ b/chapters/sigmod23/extensions.tex
@@ -9,7 +9,7 @@ concurrency and external data structures.
\subsection{External Data Structures}
\label{ssec:ext-external}
-Our dynamization techniques can easily accomodate external data structures
+Our dynamization techniques can easily accommodate external data structures
as well as in-memory ones. To demonstrate this, we have implemented
a dynamized version of an external ISAM tree for use in answering IRS
queries. The mutable buffer remains an unsorted array in memory, however
@@ -46,7 +46,7 @@ file or a Spark RDD, and a centralized control node can manage the
mutable buffer. Flushing this buffer would create a new file/RDD, and
reconstructions could likewise be performed by creating new immutable
structures through the merging of existing ones, using the same basic
-scheme as has already been discussed in this chapter. Using thes tools,
+scheme as has already been discussed in this chapter. Using these tools,
SSIs over datasets that exceed the capacity of a single node could be
supported. Such distributed SSIs do exist, such as the RDD-based sampling
structure using in XDB~\cite{li19}.
diff --git a/chapters/sigmod23/framework.tex b/chapters/sigmod23/framework.tex
index 0f3fac8..2f2515b 100644
--- a/chapters/sigmod23/framework.tex
+++ b/chapters/sigmod23/framework.tex
@@ -1,7 +1,7 @@
\section{Dynamization of SSIs}
\label{sec:framework}
-Our goal, then, is to design a solution to indepedent sampling that is
+Our goal, then, is to design a solution to independent sampling that is
able to achieve \emph{both} efficient updates and efficient sampling,
while also maintaining statistical independence both within and between
IQS queries, and to do so in a generalized fashion without needing to
@@ -98,7 +98,7 @@ define the decomposability conditions for a query sampling problem,
\end{enumerate}
\end{definition}
-These two conditions warrant further explaination. The first condition
+These two conditions warrant further explanation. The first condition
is simply a redefinition of the standard decomposability criteria to
consider matching the distribution, rather than the exact records in $R$,
as the correctness condition for the merge process. The second condition
@@ -114,7 +114,7 @@ problems. First, we note that many SSIs have a sampling procedure that
naturally involves two phases. First, some preliminary work is done
to determine metadata concerning the set of records to sample from,
and then $k$ samples are drawn from the structure, taking advantage of
-this metadata. If we represent the time cost of the prelimary work
+this metadata. If we represent the time cost of the preliminary work
with $P(n)$ and the cost of drawing a sample with $S(n)$, then these
structures query cost functions are of the form,
@@ -213,7 +213,7 @@ $k$ records have been sampled.
\end{example}
Assuming a Bentley-Saxe decomposition with $\log n$ blocks and assuming
-a constant number of repetitions, the cost of answering a decomposible
+a constant number of repetitions, the cost of answering a decomposable
sampling query having a pre-processing cost of $P(n)$, a weight-determination
cost of $W(n)$ and a per-sample cost of $S(n)$ will be,
\begin{equation}
@@ -241,7 +241,7 @@ satisfied by either the search problem or data structure. Unfortunately,
neither approach can work as a ``drop-in'' solution in the context of
sampling problems, because of the way that deleted records interact with
the sampling process itself. Sampling problems, as formalized here,
-are neither invertable, nor deletion decomposable. In this section,
+are neither invertible, nor deletion decomposable. In this section,
we'll discuss our mechanisms for supporting deletes, as well as how
these can be handled during sampling while maintaining correctness.
@@ -397,7 +397,7 @@ Section~\ref{sec:sampling-implementation}.
\subsubsection{Bounding Rejection Probability}
-When a sampled record has been rejected, it must be resampled. This
+When a sampled record has been rejected, it must be re-sampled. This
introduces performance overhead resulting from extra memory access and
random number generations, and hurts our ability to provide performance
bounds on our sampling operations. In the worst case, a structure
@@ -413,7 +413,7 @@ we have the opportunity to remove deleted records. This will cause the
record counts associated with each block of the structure to gradually
drift out of alignment with the "perfect" powers of two associated with
the Bentley-Saxe method, however. In the theoretical literature on this
-topic, the solution to this problem is to periodically repartition all of
+topic, the solution to this problem is to periodically re-partition all of
the records to re-align the block sizes~\cite{merge-dsp, saxe79}. This
approach could also be easily applied here, if desired, though we
do not in our implementations, for reasons that will be discussed in
@@ -449,7 +449,7 @@ is not sufficient.
Fortunately, this passive system can be used as the basis for a
system that does provide a bound. This is because it guarantees,
whether tagging or tombstones are used, that any given deleted
-record will \emph{eventually} be cancelled out after a finite number
+record will \emph{eventually} be canceled out after a finite number
of reconstructions. If the number of deleted records gets too high,
some or all of these deleted records can be cleared out by proactively
performing reconstructions. We call these proactive reconstructions
@@ -524,7 +524,7 @@ that is both tunable, and generally more performant, at the cost of some
additional theoretical complexity. There has been some theoretical work
in this area, based upon nesting instances of the equal block method
within the Bentley-Saxe method~\cite{overmars81}, but these methods are
-unwieldy and are targetted at tuning the worst-case at the expense of the
+unwieldy and are targeted at tuning the worst-case at the expense of the
common case. We will take a different approach to adding configurability
to our dynamization system.
@@ -532,7 +532,7 @@ Though it has thus far gone unmentioned, some readers may have
noted the astonishing similarity between decomposition-based
dynamization techniques, and a data structure called the Log-structured
Merge-tree. First proposed by O'Neil in the mid '90s\cite{oneil96},
-the LSM Tree was designed to optmize write throughout for external data
+the LSM Tree was designed to optimize write throughput for external data
structures. It accomplished this task by buffer inserted records in a
small in-memory AVL Tree, and then flushing this buffer to disk when
it filled up. The flush process itself would fully rebuild the on-disk
@@ -543,13 +543,13 @@ layered, external structures, to reduce the cost of reconstruction.
In more recent times, the LSM Tree has seen significant development and
been used as the basis for key-value stores like RocksDB~\cite{dong21}
and LevelDB~\cite{leveldb}. This work has produced an incredibly large
-and well explored parameterization of the reconstruction procedures of
+and well explored parametrization of the reconstruction procedures of
LSM Trees, a good summary of which can be bound in this recent tutorial
paper~\cite{sarkar23}. Examples of this design space exploration include:
different ways to organize each "level" of the tree~\cite{dayan19,
-dostoevsky, autumn}, different growth rates, buffering, sub-partioning
+dostoevsky, autumn}, different growth rates, buffering, sub-partitioning
of structures to allow finer-grained reconstruction~\cite{dayan22}, and
-approaches for allocating resources to auxilliary structures attached to
+approaches for allocating resources to auxiliary structures attached to
the main ones for accelerating certain types of query~\cite{dayan18-1,
zhu21, monkey}.
@@ -561,7 +561,7 @@ following four elements for use in our dynamization technique,
\begin{itemize}
\item A small dynamic buffer into which new records are inserted
\item A variable growth rate, called as \emph{scale factor}
- \item The ability to attach auxilliary structures to each block
+ \item The ability to attach auxiliary structures to each block
\item Two different strategies for reconstructing data structures
\end{itemize}
This design space and its associated trade-offs will be discussed in
@@ -585,28 +585,28 @@ $N_B \cdot 2^i$ records in the $i$th block. We call this unsorted array
the \emph{mutable buffer}.
\Paragraph{Scale Factor.} In the Bentley-Saxe method, each block is
-twice as large as the block the preceeds it There is, however, no reason
+twice as large as the block that precedes it. There is, however, no reason
why this growth rate couldn't be adjusted. In our system, we make the
growth rate a user-specified constant called the \emph{scale factor},
$s$, such that the $i$th level contains $N_B \cdot s^i$ records.
-\Paragraph{Auxilliary Structures.} In Section~\ref{ssec:sampling-deletes},
+\Paragraph{Auxiliary Structures.} In Section~\ref{ssec:sampling-deletes},
we encountered two problems relating to supporting deletes that can be
-resolved through the use of auxilliary structures. First, regardless
+resolved through the use of auxiliary structures. First, regardless
of whether tagging or tombstones are used, the data structure requires
support for an efficient point-lookup operation. Many SSIs are tree-based
and thus support this, but not all data structures do. In such cases,
-the point-lookup operation could be provided by attaching an auxilliary
+the point-lookup operation could be provided by attaching an auxiliary
hash table to the data structure that maps records to their location in
the SSI. We use term \emph{shard} to refer to the combination of a
-block with these optional auxilliary structures.
+block with these optional auxiliary structures.
In addition, the tombstone deletion mechanism requires performing a point
lookup for every record sampled, to validate that it has not been deleted.
This introduces a large amount of overhead into the sampling process,
as this requires searching each block in the structure. One approach
that can be used to help improve the performance of these searches,
-without requiring as much storage as adding auxilliary hash tables to
+without requiring as much storage as adding auxiliary hash tables to
every block, is to include bloom filters~\cite{bloom70}. A bloom filter
is an approximate data structure that answers tests of set membership
with bounded, single-sided error. These are commonly used in LSM Trees
@@ -687,7 +687,7 @@ s^{i+1}$ records. If tiering is used, each level will contain up to
$s$ SSIs, each with up to $N_B \cdot s^i$ records. The scale factor,
$s$, controls the rate at which the capacity of each level grows. The
framework supports deletes using either the tombstone or tagging policy,
-which can be selected by the user acccording to her preference. To support
+which can be selected by the user according to her preference. To support
these delete mechanisms, each record contains an attached header with
bits to indicate its tombstone or delete status.
@@ -735,7 +735,7 @@ compaction is complete, the delete proportions are checked again, and
this process is repeated until all levels satisfy the bound.
Following this procedure, inserts have a worst case cost of $I \in
-\Theta(B_M(n))$, equivalent to Bently-Saxe. The amortized cost can be
+\Theta(B_M(n))$, equivalent to Bentley-Saxe. The amortized cost can be
determined by finding the total cost of reconstructions involving each
record and amortizing it over each insert. The cost of the insert is
composed of three parts,
@@ -773,7 +773,7 @@ cost of a delete is the same as cost of doing a point lookup, as the
"delete" itself is simply setting a bit in the header of the record,
once it has been located. There will be $\Theta(\log_s n)$ total shards
in the structure, each with a look-up cost of $L(n)$ using either the
-SSI's native point-lookup, or an auxilliary hash table, and the lookup
+SSI's native point-lookup, or an auxiliary hash table, and the lookup
must also scan the buffer in $\Theta(N_B)$ time. Thus, the worst-case
cost of a tagged delete is,
\begin{equation*}
@@ -792,7 +792,7 @@ to sample from the unsorted buffer as well. There are two approaches
for sampling from the buffer. The most general approach would be to
temporarily build an SSI over the records within the buffer, and then
treat this is a normal shard for the remainder of the sampling procedure.
-In this case, the sampling algorithm remains indentical to the algorithm
+In this case, the sampling algorithm remains identical to the algorithm
discussed in Section~\ref{ssec:decomposed-structure-sampling}, following
the construction of the temporary shard. This results in a worst-case
sampling cost of,
diff --git a/chapters/sigmod23/introduction.tex b/chapters/sigmod23/introduction.tex
index befdbba..1a33c2e 100644
--- a/chapters/sigmod23/introduction.tex
+++ b/chapters/sigmod23/introduction.tex
@@ -3,7 +3,7 @@
Having discussed the relevant background materials, we will now turn to a
discussion of our first attempt to address the limitations of dynamization
in the context of one particular class of non-decomposable search problem:
-indepedent random sampling. We've already discussed one representative
+independent random sampling. We've already discussed one representative
problem of this class, independent range sampling, and shown how it is
not traditionally decomposable. This specific problem is one of several
very similar types of problem, however, and in this chapter we will also
@@ -21,7 +21,7 @@ problems is limited by the techniques used within databases to implement
them. Existing implementations tend to sacrifice either performance,
by requiring the entire result set of be materialized prior to applying
Bernoulli sampling, or statistical independence. There exists techniques
-for obtaining both sampling performance and indepedence by leveraging
+for obtaining both sampling performance and independence by leveraging
existing B+Tree indices with slight modification~\cite{olken-thesis},
but even this technique has worse sampling performance than could be
achieved using specialized static sampling indices.
diff --git a/references/references.bib b/references/references.bib
index 1bc708b..38244e6 100644
--- a/references/references.bib
+++ b/references/references.bib
@@ -1630,3 +1630,108 @@ keywords = {analytic model, analysis of algorithms, overflow chaining, performan
bibsource = {dblp computer science bibliography, https://dblp.org}
}
+@inproceedings{overmars-cn-decomp,
+  title={Searching in the past II: general transformations},
+ author={Mark H. Overmars},
+ year={1981},
+ url={https://api.semanticscholar.org/CorpusID:56886448}
+}
+
+@inproceedings{overmars-art-of-dyn,
+ author = {Jan van Leeuwen and
+ Mark H. Overmars},
+ editor = {Jozef Gruska and
+ Michal Chytil},
+ title = {The Art of Dynamizing},
+ booktitle = {Mathematical Foundations of Computer Science 1981, Strbske Pleso,
+ Czechoslovakia, August 31 - September 4, 1981, Proceedings},
+ series = {Lecture Notes in Computer Science},
+ volume = {118},
+ pages = {121--131},
+ publisher = {Springer},
+ year = {1981},
+ url = {https://doi.org/10.1007/3-540-10856-4\_78},
+ doi = {10.1007/3-540-10856-4\_78},
+ timestamp = {Tue, 14 May 2019 10:00:37 +0200},
+ biburl = {https://dblp.org/rec/conf/mfcs/LeeuwenO81.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@techreport{maurer80,
+  title={Dynamic systems of static data structures},
+  author={van Leeuwen, J. and Maurer, H. A.},
+  number={Bericht 42},
+  institution={Institut f{\"u}r Informationsverarbeitung, TU Graz, Austria},
+  year={1980}
+}
+
+@book{maurer79,
+ title={Dynamic solutions of decomposable searching problems},
+ author={Maurer, HA and Ottmann, Th},
+ year={1979},
+ publisher={Technische Universit{\"a}t Graz/Forschungszentrum Graz. Institut f{\"u}r~…}
+}
+
+
+@article{lsmgraph,
+ author = {Song Yu and
+ Shufeng Gong and
+ Qian Tao and
+ Sijie Shen and
+ Yanfeng Zhang and
+ Wenyuan Yu and
+ Pengxi Liu and
+ Zhixin Zhang and
+ Hongfu Li and
+ Xiaojian Luo and
+ Ge Yu and
+ Jingren Zhou},
+ title = {LSMGraph: {A} High-Performance Dynamic Graph Storage System with Multi-Level
+ {CSR}},
+ journal = {Proc. {ACM} Manag. Data},
+ volume = {2},
+ number = {6},
+ pages = {243:1--243:28},
+ year = {2024},
+ url = {https://doi.org/10.1145/3698818},
+ doi = {10.1145/3698818},
+ timestamp = {Wed, 19 Mar 2025 21:16:37 +0100},
+ biburl = {https://dblp.org/rec/journals/pacmmod/YuGTSZYLZLLYZ24.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{bkdtree,
+ author = {Octavian Procopiuc and
+ Pankaj K. Agarwal and
+ Lars Arge and
+ Jeffrey Scott Vitter},
+ editor = {Thanasis Hadzilacos and
+ Yannis Manolopoulos and
+ John F. Roddick and
+ Yannis Theodoridis},
+  title     = {Bkd-Tree: {A} Dynamic Scalable kd-Tree},
+ booktitle = {Advances in Spatial and Temporal Databases, 8th International Symposium,
+ {SSTD} 2003, Santorini Island, Greece, July 24-27, 2003, Proceedings},
+ series = {Lecture Notes in Computer Science},
+ volume = {2750},
+ pages = {46--65},
+ publisher = {Springer},
+ year = {2003},
+ url = {https://doi.org/10.1007/978-3-540-45072-6\_4},
+ doi = {10.1007/978-3-540-45072-6\_4},
+ timestamp = {Tue, 21 Mar 2023 21:00:39 +0100},
+ biburl = {https://dblp.org/rec/conf/ssd/ProcopiucAAV03.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+@article{perfect-hashing,
+ title={Perfect hashing},
+ author={Czech, Zbigniew J and Havas, George and Majewski, Bohdan S},
+ journal={Theoretical Computer Science},
+ volume={182},
+ number={1-2},
+ pages={1--143},
+ year={1997},
+ publisher={Elsevier}
+}
+