Diffstat (limited to 'chapters')
-rw-r--r--  chapters/beyond-dsp.tex          347
-rw-r--r--  chapters/dynamization.tex          1
-rw-r--r--  chapters/sigmod23/framework.tex    1
3 files changed, 305 insertions, 44 deletions
diff --git a/chapters/beyond-dsp.tex b/chapters/beyond-dsp.tex
index ee32bfb..e76a638 100644
--- a/chapters/beyond-dsp.tex
+++ b/chapters/beyond-dsp.tex
@@ -355,14 +355,15 @@ We next turn our attention to support for deletes. Efficient delete
support in Bentley-Saxe dynamization is provably impossible~\cite{saxe79},
but, as discussed in Section~\ref{ssec:dyn-deletes} it is possible
to support them in restricted situations, where either the search
-problem is invertible (Definition~\ref{}) or the data structure and
-search problem combined are deletion decomposable (Definition~\ref{}).
-In Chapter~\ref{chap:sampling}, we considered a set of search problems
-which did \emph{not} satisfy any of these properties, and instead built a
-customized solution for deletes that required tight integration with the
-query process in order to function. While such a solution was acceptable
-for the goals of that chapter, it is not sufficient for our goal in this
-chapter of producing a generalized system.
+problem is invertible (Definition~\ref{def:invert}) or the data
+structure and search problem combined are deletion decomposable
+(Definition~\ref{def:background-ddsp}). In Chapter~\ref{chap:sampling},
+we considered a set of search problems which did \emph{not} satisfy
+any of these properties, and instead built a customized solution for
+deletes that required tight integration with the query process in order
+to function. While such a solution was acceptable for the goals of that
+chapter, it is not sufficient for our goal in this chapter of producing
+a generalized system.
Additionally, of the two types of problem that can support deletes, the
invertible case is preferable. This is because the amount of work necessary
@@ -404,14 +405,16 @@ IDSP definition expands eDSP with a fifth operation,
needed and return true. Otherwise, return false.
\end{itemize}
-If this routine returns true, then the query process is repeated
-from $\mathbftt{distribute\_query}$, and if it returns false then the
-result is returned to the user. If the number of repetitions of the
-query is bounded by $R(n)$, then the following provides an upper bound
-on the worst-case query complexity of an IDSP,
+If this routine returns true, it must also modify the local queries as
+necessary to account for the work that remains to be completed (e.g.,
+update the number of records to retrieve). Then, the query process resumes
+from the execution of the local queries. If it returns false, then the
+result is simply returned to the user. If the number of repetitions of
+the query is bounded by $R(n)$, then the following provides an upper
+bound on the worst-case query complexity of an IDSP,
\begin{equation*}
- O\left(\log_2 n \cdot P(n) + R(n) \left(D(n) + \log_2 n \cdot Q_s(n) +
+ O\left(\log_2 n \cdot P(n) + D(n) + R(n) \left(\log_2 n \cdot Q_s(n) +
C_e(n)\right)\right)
\end{equation*}
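+
+To make this process concrete, the following sketch outlines the
+control flow of an IDSP query over a decomposed structure. It is an
+illustration only: the type and function names are placeholders rather
+than a concrete interface (the procedure used by our framework is
+given later in Algorithm~\ref{algo:query-framework}). Preprocessing
+and query distribution occur exactly once, while local query execution
+and combination are repeated until $\mathbftt{repeat}$ returns false,
+matching the cost expression above.
+
+\begin{lstlisting}
+// Sketch of the IDSP query protocol; all names are illustrative.
+std::vector<Record> idsp_query(const std::vector<Block> &blocks,
+                               Parameters q) {
+    // preprocess each block exactly once
+    std::vector<State> states;
+    for (const auto &b : blocks) {
+        states.push_back(local_preproc(b, q));
+    }
+
+    // generate one local query per block
+    std::vector<LocalQuery> local = distribute_query(states, q);
+
+    std::vector<Record> result;
+    bool again = false;
+    do {
+        // execute the local queries and combine their answers
+        std::vector<LocalResult> partials;
+        for (size_t i = 0; i < blocks.size(); i++) {
+            partials.push_back(local_query(blocks[i], local[i]));
+        }
+        auto merged = combine(partials, q);
+        result.insert(result.end(), merged.begin(), merged.end());
+
+        // repeat() may adjust the local queries (e.g., reduce the
+        // number of records still required) and request another pass
+        again = repeat(q, result, local);
+    } while (again);
+
+    return result;
+}
+\end{lstlisting}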
@@ -480,8 +483,8 @@ current traversal state in the meta-information object, and resume a
$k$-NN query on the structure from that state at no cost.
\SetKwFunction{repeat}{repeat}
-
-\begin{algorithm}[th]
+\afterpage{\clearpage}
+\begin{algorithm}[p]
\caption{$k$-NN with Iterative Decomposability}
\label{alg:idsp-knn}
\KwIn{$k$: result size, $p$: query point}
@@ -501,17 +504,15 @@ $k$-NN query on the structure from that state at no cost.
\BlankLine
\Def{\query{$\mathscr{I}_i$, $q_i=(k,p,\mathscr{M}_i)$}}{
$(r_i, \mathscr{M}_i) \gets \mathscr{I}_i.\text{knn\_from}(k, p, \mathscr{M}_i)$ \;
- \BlankLine
- \Comment{The local result includes the records stored in a priority queue and query state}
+ \Comment{The local result stores records in a priority queue}
\Return $(r_i, \mathscr{M}_i)$ \;
}
\BlankLine
\Def{\combine{$r_1, \ldots, r_m, \ldots, r_n$, $q=(k,p)$}}{
- $R \gets \{\}$ \;
+ $R \gets \{\}$ ;
$pq \gets \text{PriorityQueue}()$ ;
$gpq \gets \text{PriorityQueue}()$ \;
- \BlankLine
\Comment{Results $1$ through $m$ are from the primary structure,
and $m+1$ through $n$ are from the ghost structure.}
\For {$i\gets 1 \ldots m$} {
@@ -527,7 +528,6 @@ $k$-NN query on the structure from that state at no cost.
\While{$|R| < k \land \neg pq.\text{empty}()$} {
$(i, d) \gets pq.\text{dequeue}()$ \;
- \BlankLine
$R \gets R \cup r_i.\text{dequeue}()$ \;
\If {$\neg r_i.\text{empty}()$} {
$pq.\text{enqueue}(i, r_i.\text{front}())$ \;
@@ -539,7 +539,6 @@ $k$-NN query on the structure from that state at no cost.
\While{$\neg gpq.\text{empty}()$} {
$(i, d) \gets gpq.\text{dequeue}()$ \;
- \BlankLine
\If {$r_i.\text{front}() \in R$} {
        $R \gets R \setminus \{r_i.\text{front}()\}$ \;
@@ -549,7 +548,6 @@ $k$-NN query on the structure from that state at no cost.
}
}
- \BlankLine
\Return $R$ \;
}
\BlankLine
@@ -686,9 +684,104 @@ and deletes, depending upon the classification of the problem in the
taxonomy of Figure~\ref{fig:taxonomy}. The user provides the data
structure and query implementations as template parameters, and the
framework then provides an interface that allows for queries, inserts,
-and deletes against the new dynamic structure.
+and deletes against the new dynamic structure. Specifically, in addition
+to accessors for various structural information, the framework provides
+the following main operations,
+
+\begin{itemize}
+\item \texttt{int insert(RecordType); } \\
+ This function will insert a record into the dynamized structure,
+ and will return $1$ if the record was successfully inserted, and $0$
+ if it was not. Insertion failure is part of the concurrency control
+ mechanism, and failed inserts should be retried after a short delay.
+ More details of this are in Section~\ref{ssec:dyn-concurrency}.
+
+\item \texttt{int erase(RecordType);} \\
+ This function will delete a record from the dynamized structure,
+ returning $1$ on success and $0$ on failure. The meaning of a
+ failure to delete is dependent upon the delete mechanism in use,
+ and will be discussed in Section~\ref{ssec:dyn-deletes}.
+
+\item \texttt{std::future<QueryResult> query(QueryParameters); } \\
+ This function will execute a query with the specified parameters
+ against the structure and return the result. This interface is
+ asynchronous, and returns a future immediately, which can be used
+ to access the query result once the query has finished executing.
+
+\end{itemize}
+
+The framework can be configured, via a template argument, to run in
+either single-threaded or multi-threaded mode. In multi-threaded mode,
+the above routines can be called concurrently without any additional
+synchronization in user code, and without requiring any special
+modification to the data structure or queries beyond the changes
+necessary to use them in single-threaded mode. A usage sketch of these
+operations is given below.
+
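+In this sketch, the class name \texttt{Dynamization} and the template
+parameters are placeholders only; the precise interface requirements
+for the record, shard, and query types are given in
+Section~\ref{ssec:dyn-interface}.
+
+\begin{lstlisting}
+#include <future>
+#include <thread>
+#include <chrono>
+
+// Hypothetical instantiation: the user supplies the record, shard
+// (data structure), and query implementations as template parameters.
+Dynamization<Rec, Shard, Query> index;
+
+void example(Rec r, Rec stale, QueryParameters params) {
+    // insert() returns 0 on a transient, concurrency-related failure;
+    // such inserts should simply be retried after a short delay
+    while (!index.insert(r)) {
+        std::this_thread::sleep_for(std::chrono::microseconds(10));
+    }
+
+    // erase() returns 1 on success and 0 on failure; the meaning of
+    // a failed delete depends on the delete mechanism in use
+    if (!index.erase(stale)) {
+        /* handle the failed delete */
+    }
+
+    // queries are asynchronous: a future is returned immediately and
+    // can be waited upon for the final result
+    std::future<QueryResult> f = index.query(params);
+    QueryResult result = f.get();
+}
+\end{lstlisting}
+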
+\subsection{Basic Principles}
+
+Before discussing the interfaces that the user must implement to use
+their code with our framework, it is worth first describing the
+high-level structure and operation of the framework itself, as these
+details motivate several of the requirements placed on user code. The
+overall structure and organization of the framework is similar to that
+of Section~\ref{ssec:sampling-framework}.
+
+The framework requires the user to specify types to represent the
+record, query, and data structure (which we call a shard). The
+details of the interface requirements for these types are discussed in
+Section~\ref{ssec:dyn-interface}, and are enforced using C++20's concepts
+mechanism.
+
+\begin{figure}
+ \centering %\vspace{-3mm}
+ \subfloat[\small Leveling]{\includegraphics[width=.5\textwidth]{diag/leveling} \label{fig:dyn-leveling}}
+ %\vspace{-3mm}
+ \subfloat[\small Tiering]{\includegraphics[width=.5\textwidth]{diag/tiering} \label{fig:dyn-tiering}}
+ %\vspace{-3mm}
+ \caption{\small An overview of the general structure of the
+ dynamization framework using (a) leveling and
+    (b) tiering layout policies, with a scale factor of $3$.
+ Each shard is shown as a
+ dotted box, wrapping its associated dataset ($D_i$) and index ($I_i$). }
+ \label{fig:dyn-framework}
+ %\vspace{-3mm}
+\end{figure}
+
+Internally, the framework consists of a sequence of \emph{levels} of
+increasing record capacity, each containing one or more \emph{shards}.
+The layout of these levels is controlled by a template argument, the
+\emph{layout policy}, and an integer called the \emph{scale factor}.
+The scale factor governs how quickly the record capacity of each level
+grows, while the layout policy determines how the records on a level
+are broken into shards, and how records move from level to level
+during reconstructions. The details of layout policies,
+reconstructions, and so on are discussed in a later section.
+
+Logically ``above'' these levels is a small unsorted array, called the
+\emph{mutable buffer}. The mutable buffer is of user-configurable size,
+and all inserts into the structure are first placed into it. When
+the buffer fills, it will be flushed into the structure, requiring
+reconstructions to occur in a manner consistent with the layout policy
+in order to make room. A simple graphical representation of the framework
+and two of its layout policies is shown in Figure~\ref{fig:dyn-framework}.
+
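+As a purely illustrative sketch (the parameter names and their
+placement here are hypothetical, and do not reflect the framework's
+actual template signature), these structural options could be
+specified when the dynamized structure is instantiated,
+
+\begin{lstlisting}
+// Hypothetical configuration: the layout policy, scale factor, and
+// mutable buffer capacity are all user-specified options.
+enum class LayoutPolicy { LEVELING, TIERING };
+
+Dynamization<Rec, Shard, Query,
+             LayoutPolicy::TIERING>        // layout policy
+    index(/* scale factor */ 6,
+          /* buffer capacity (records) */ 12000);
+\end{lstlisting}
+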
+The framework provides two mechanisms for supporting deletes: tagging
+and tombstones. These are identical to the mechanisms discussed in
+Section~\ref{ssec:sampling-deletes}. A tombstone delete inserts a
+record identical to the one to be deleted, but with an indicator bit
+set in its header, while a tagged delete locates the record to be
+deleted within the structure and sets a bit in its header directly.
+Tombstone deletes are used to support invertible search problems, and
+tagged deletes are used for deletion decomposable search problems.
+While the delete procedure itself is handled automatically by the
+framework based upon the specified mechanism, it is the user's
+responsibility to appropriately handle deleted records in their query
+and shard implementations.
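+
+For instance, a user-defined local query over a shard might skip
+tagged records directly while scanning, leaving tombstones to be
+cancelled against their matching records when results are merged. The
+sketch below is illustrative only: the shard method and header
+accessors are placeholders, not the framework's exact API.
+
+\begin{lstlisting}
+// Illustrative delete handling inside a user-defined local query.
+std::vector<WrappedRec> local_query(const Shard &shard,
+                                    const LocalQuery &q) {
+    std::vector<WrappedRec> out;
+    for (const auto &rec : shard.scan(q)) {  // hypothetical scan method
+        if (rec.header.deleted) {            // tagged delete: drop here
+            continue;
+        }
+        // tombstones (and the records they cancel) are resolved
+        // later, when the local results are combined
+        out.push_back(rec);
+    }
+    return out;
+}
+\end{lstlisting}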
+
\subsection{Interfaces}
+\label{ssec:dyn-interface}
In order to enforce interface requirements, our implementation takes
advantage of C++20 concepts. There are three major sets of interfaces
@@ -697,16 +790,16 @@ queries. We'll discuss each of these in this section.
\subsubsection{Record Interface}
-The record interface is the simplest of the three. Records are C++
-structs, and they must implement an equality comparision operator. Beyond
-this, the framework places no additional constraints and makes
-no assumptions about record contents, their ordering properties,
-etc. Though the records must be fixed length (as they are structs),
+The record interface is the simplest of the three. The type used as a
+record only requires an implementation of an equality comparison operator,
+and is assumed to be of fixed length. Beyond this, the framework places
+no additional constraints and makes no assumptions about record contents,
+their ordering properties, etc. Though the records must be fixed length,
variable length data can be supported using off-record storage and
pointers if necessary. Each record is automatically wrapped by the
framework with a header that is used to facilitate deletion support.
-The record concept is shown in Listing~\ref{lst:record}, along with the
-wrapped header type that is used to interact with records within
+The record concept is shown in Listing~\ref{lst:record}, along with
+the wrapped header type that is used to interact with records within
the framework.
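+
+For example, a simple key-value record type satisfying these
+requirements could be defined as follows (illustrative only; the
+actual concept and header wrapper are shown in
+Listing~\ref{lst:record}),
+
+\begin{lstlisting}
+#include <cstdint>
+
+// A minimal fixed-length record: only equality comparison is required.
+struct KVRecord {
+    uint64_t key;
+    uint64_t value;
+
+    bool operator==(const KVRecord &other) const {
+        return key == other.key && value == other.value;
+    }
+};
+\end{lstlisting}
+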
\begin{lstfloat}
@@ -828,17 +921,18 @@ collection type should be used for these results. A range scan, for
example, could specify the result types as a vector of records, map
of records, etc., depending on the use case.
-There are two significant differences between the IDSP interface and
-the query concept implementation. The first is in the way that the query
-result object is passed between the \texttt{combine} and \texttt{repeat}
-function. To avoid copying it, as it can be large for some types of query,
-this object is initialized by the framework and passed by reference into
-these two functions. The second difference is that the \texttt{repeat}
-function is responsible for making any updates to local query objects,
-and that when \texttt{repeat} returns \texttt{true}, the local queries
-will be immediately re-executed. To facilitate this, the \texttt{repeat}
-function takes the local query objects as arguments.
-
+There is one significant difference between the IDSP interface and the
+query concept implementation. For efficiency, \texttt{combine} does
+not return the query result object. Instead, the framework initializes
+this object itself and passes it by reference into \texttt{combine}.
+This is necessary because \texttt{combine} may be called multiple
+times, depending on whether the query must be repeated. Passing the
+result object as an argument, rather than returning it, allows the
+local query results to be discarded entirely after each pass, with any
+newly generated results added to the existing result set on a
+repetition. Without this modification, the user would either need to
+define an additional combination operation for final result types, or
+duplicate effort in the combine step on each repetition.
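+
+Concretely, this means that the combine and repeat steps of a query
+implementation take forms roughly like the following sketch. The
+signatures shown are illustrative only; the exact requirements are
+those enforced by the query concept.
+
+\begin{lstlisting}
+// Illustrative signatures only, not the framework's exact interface.
+// The framework constructs the result object once and passes it by
+// reference, so repeated passes add to the same object.
+static void combine(std::vector<LocalResult> &local_results,
+                    Parameters *params, ResultType &result);
+
+// Returns true if another pass is needed; in that case the local
+// query objects are updated in place before they are re-executed.
+static bool repeat(Parameters *params, ResultType &result,
+                   std::vector<LocalQuery *> &local_queries);
+\end{lstlisting}
+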
\begin{lstfloat}
@@ -887,17 +981,182 @@ framework.}
\end{lstfloat}
-\subsection{Configurability}
+\subsection{Internal Mechanisms}
-\subsection{Concurrency}
+\subsubsection{Inserts and Layout Policy}
+
+\begin{algorithm}[t]
+ \caption{Insertion with Dynamization Framework}
+ \label{algo:insert}
+ \KwIn{$r$: new record to insert}
+ \If{\texttt{buffer is not full}}{
+ $\texttt{buffer.append}(r)$\;
+ \Return
+ }
+ $\texttt{idx} \gets 0$\;
+ \For{$i \gets 0 \cdots \texttt{n\_levels}$}{
+ \If{$\texttt{level}_i \texttt{ can hold records in }\texttt{level}_{i - 1}$}{
+            $\texttt{idx} \gets i$\;
+ \Break\;
+ }
+ }
+ \For{$i \gets \texttt{idx} \cdots 1$}{
+ \If{layout\_policy = \texttt{LEVELING}} {
+ $\texttt{level}_i \gets
+ \texttt{merge\_shards}(\texttt{level}_i, \texttt{level}_{i - 1})$ \;
+ }
+
+ \If{layout\_policy = \texttt{TIERING}} {
+ $\texttt{new\_shard} \gets \texttt{merge\_shards}(\texttt{level}_{i-1})$ \;
+ $\texttt{level}_i \gets \texttt{add\_shard}(\texttt{level}_i, \texttt{new\_shard})$ \;
+ }
+ }
+    $\texttt{level}_0 \gets \texttt{add\_shard}(\texttt{level}_0, \texttt{build\_shard}(\texttt{buffer}))$\;
+    $\texttt{buffer.truncate}()$\;
+    $\texttt{buffer.append}(r)$\;
+ \Return
+\end{algorithm}
+
+
+\Paragraph{Asymptotic Complexity.}
+
+\subsubsection{Delete Policy}
+
+\Paragraph{Asymptotic Complexity.}
+
+\Paragraph{Asymptotic Complexity.}
+
+\subsubsection{Queries}
+
+
+\begin{algorithm}[t]
+ \caption{Query with Dynamization Framework}
+ \label{algo:query-framework}
+    \KwIn{$q$: query parameters, $b$: mutable buffer, $S = \{s_1, \ldots, s_m\}$: static index shards at all levels}
+ \KwOut{$R$: query results}
+
+ $\mathscr{S}_b \gets \texttt{local\_preproc}_{buffer}(b, q);\ \ \mathscr{S} \gets \{\}$ \;
+ \For{$s \in S$}{$\mathscr{S} \gets \mathscr{S}\ \cup (s, \texttt{local\_preproc}(s, q))$\;}
+ $(q_b, q_1, \ldots q_m) \gets \texttt{distribute\_query}(\mathscr{S}_b, \mathscr{S}, q)$ \;
+ $\mathcal{R} \gets \{\}; \ \ \texttt{rpt} \gets \bot$ \;
+ \Do{\texttt{rpt}}{
+ $locR \gets \{\}$ \;
+ $locR \gets locR \cup \texttt{local\_query}_{buffer}(b, q_b)$ \;
+    \For{$i \gets 1 \ldots m$}{$locR \gets locR \cup \texttt{local\_query}(s_i, q_i)$}
+ %\Comment{For \red{name}, use \texttt{tombstone\_lookup} to remove all deleted records. }
+ %\If{\textbf{not} \texttt{SKIP\_DELETE\_FILTER}}{$locR \gets \texttt{filter\_deletes}(locR, S)$}
+ $\mathcal{R} \gets \mathcal{R} \cup \texttt{combine}(locR, q_b, q_1, \ldots, q_m)$\;
+ $(\texttt{rpt}, q_b, q_1, \ldots,
+ q_m) \gets \texttt{repeat}(q, \mathcal{R}, q_b, q_1,\ldots, q_m)$\;
+ }
+ \Return{$\mathcal{R}$}
+
+\end{algorithm}
+
+\Paragraph{Asymptotic Complexity.}
+
+\subsection{Concurrency Control}
\section{Evaluation}
\subsection{Experimental Setup}
\subsection{Design Space Evaluation}
+
+\begin{figure}
+ %\vspace{0pt}
+ \centering
+ \subfloat[Insertion Throughput \\ vs. Buffer Size]{\includegraphics[width=.4\textwidth]{img/fig-ps-mt-insert} \label{fig:ins-buffer-size}}
+ \subfloat[Insertion Throughput \\ vs. Scale Factor]{\includegraphics[width=.4\textwidth]{img/fig-ps-sf-insert} \label{fig:ins-scale-factor}}
+ \\ %\vspace{-2mm}
+ \subfloat[Query Latency vs. Buffer Size]{\includegraphics[width=.4\textwidth]{img/fig-ps-mt-query} \label{fig:q-buffer-size}}
+ \subfloat[Query Latency vs. Scale Factor]{\includegraphics[width=.4\textwidth]{img/fig-ps-sf-query} \label{fig:q-scale-factor}}
+ %\vspace{-2mm}
+ \caption{Design Space Evaluation (Triespline)}
+ %\vspace{-2mm}
+\end{figure}
+
+
\subsection{Independent Range Sampling}
+
+\begin{figure*}
+ %\vspace{0pt}
+ \centering
+ \subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-irs-insert} \label{fig:irs-insert}}
+ \subfloat[Query Latency]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-irs-query} \label{fig:irs-query}}
+ \subfloat[Index Overhead]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-irs-space} \label{fig:irs-space}}
+ %\vspace{-3mm}
+ \caption{IRS Index Evaluation}
+ \label{fig:irs}
+ %\vspace{-6mm}
+\end{figure*}
+
+
+\begin{align*}
+ \text{Insert:} \quad &\Theta\left(\log_s n\right) \\
+ \text{Query:} \quad &\Theta\left(\log_s n \log_f n + \frac{k}{1 - \delta}\right) \\
+ \text{Delete:} \quad &\Theta\left(\log_s n \log_f n\right)
+\end{align*}
+
+
\subsection{k-NN Search}
+
+\begin{figure*}
+ \subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-knn-insert} \label{fig:knn-insert}}
+ \subfloat[Query Latency]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-knn-query} \label{fig:knn-query}}
+ \subfloat[Index Overhead]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-knn-space} \label{fig:knn-space}}
+ %\vspace{-3mm}
+ \caption{k-NN Index Evaluation}
+ %\vspace{-3mm}
+ \label{fig:knn-eval}
+\end{figure*}
+
+
+\begin{align*}
+ \text{Insert:} \quad &\Theta\left(\log_s n\right) \\
+ \text{Query:} \quad &\Theta\left(N_B + \log n \log_s n\right ) \\
+ \text{Delete:} \quad &\Theta\left(\log_s n \right)
+\end{align*}
+
\subsection{Range Scan}
+
+\begin{figure*}
+ \centering
+ \subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-rq-insert} \label{fig:rq-insert}}
+ \subfloat[Query Latency]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-rq-query} \label{fig:rq-query}}
+ \subfloat[Index Overhead]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-rq-space} \label{fig:rq-space}}
+ %\vspace{-3mm}
+ \caption{Learned Index Evaluation}
+ %\vspace{-3mm}
+ \label{fig:eval-learned-index}
+\end{figure*}
+
\subsection{String Search}
+
+\begin{figure*}
+ \centering
+ \subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 2mm 0 0]{img/fig-bs-fst-insert} \label{fig:fst-insert}}
+ \subfloat[Query Latency]{\includegraphics[width=.32\textwidth, trim=5mm 2mm 0 0]{img/fig-bs-fst-query} \label{fig:fst-query}}
+ \subfloat[Index Overhead]{\includegraphics[width=.32\textwidth, trim=5mm 2mm 0 0]{img/fig-bs-fst-space} \label{fig:fst-size}}
+ %\vspace{-3mm}
+ \caption{FST Evaluation}
+ %\vspace{-5mm}
+\end{figure*}
+
+\begin{align*}
+ \text{Insert:} \quad &\Theta\left(\log_s n\right) \\
+ \text{Query:} \quad &\Theta\left(N_B + \log n \log_s n\right ) \\
+ \text{Delete:} \quad &\Theta\left(\log_s n \right)
+\end{align*}
+
\subsection{Concurrency}
+\begin{figure}
+ \centering
+ %\vspace{-2mm}
+ \includegraphics[width=.5\textwidth]{img/fig-bs-irs-concurrency}
+ %\vspace{-2mm}
+ \caption{IRS Thread Scaling}
+ \label{fig:irs-concurrency}
+ %\vspace{-2mm}
+\end{figure}
+
\section{Conclusion}
diff --git a/chapters/dynamization.tex b/chapters/dynamization.tex
index c21bfbc..0ee77d3 100644
--- a/chapters/dynamization.tex
+++ b/chapters/dynamization.tex
@@ -625,6 +625,7 @@ decomposable}. These are decomposable search problems for which the
underlying data structure supports a delete operation. More formally,
\begin{definition}[Deletion Decomposable Search Problem~\cite{merge-dsp}]
+ \label{def:background-ddsp}
A decomposable search problem, $F$, and its data structure,
$\mathcal{I}$, is deletion decomposable if and only if, for some
instance $\mathscr{I} \in \mathcal{I}$, containing $n$ records,
diff --git a/chapters/sigmod23/framework.tex b/chapters/sigmod23/framework.tex
index ad250cc..02a43de 100644
--- a/chapters/sigmod23/framework.tex
+++ b/chapters/sigmod23/framework.tex
@@ -676,6 +676,7 @@ facilities to work around their absence.
\end{figure*}
\subsection{Framework Construction}
+\label{ssec:sampling-framework}
The framework itself is shown in Figure~\ref{fig:sampling-framework},
along with some of its configuration parameters and its insert procedure