From 9b896766ea5feeb3c80866a4dabde75565a43398 Mon Sep 17 00:00:00 2001
From: Douglas Rumbaugh
Date: Fri, 16 May 2025 16:02:33 -0400
Subject: updates

---
 chapters/beyond-dsp.tex | 347 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 303 insertions(+), 44 deletions(-)

diff --git a/chapters/beyond-dsp.tex b/chapters/beyond-dsp.tex
index ee32bfb..e76a638 100644
--- a/chapters/beyond-dsp.tex
+++ b/chapters/beyond-dsp.tex
@@ -355,14 +355,15 @@ We next turn our attention to support for deletes. Efficient delete support in
Bentley-Saxe dynamization is provably impossible~\cite{saxe79}, but, as
discussed in Section~\ref{ssec:dyn-deletes} it is possible to support
them in restricted situations, where either the search
-problem is invertible (Definition~\ref{}) or the data structure and
-search problem combined are deletion decomposable (Definition~\ref{}).
-In Chapter~\ref{chap:sampling}, we considered a set of search problems
-which did \emph{not} satisfy any of these properties, and instead built a
-customized solution for deletes that required tight integration with the
-query process in order to function. While such a solution was acceptable
-for the goals of that chapter, it is not sufficient for our goal in this
-chapter of producing a generalized system.
+problem is invertible (Definition~\ref{def:invert}) or the data
+structure and search problem combined are deletion decomposable
+(Definition~\ref{def:background-ddsp}). In Chapter~\ref{chap:sampling},
+we considered a set of search problems which did \emph{not} satisfy
+any of these properties, and instead built a customized solution for
+deletes that required tight integration with the query process in order
+to function. While such a solution was acceptable for the goals of that
+chapter, it is not sufficient for our goal in this chapter of producing
+a generalized system.

Additionally, of the two types of problem that can support deletes, the
invertible case is preferable. This is because the amount of work necessary
@@ -404,14 +405,16 @@ IDSP definition expands eDSP with a fifth operation,
    needed and return true. Otherwise, return false.
\end{itemize}

-If this routine returns true, then the query process is repeated
-from $\mathbftt{distribute\_query}$, and if it returns false then the
-result is returned to the user. If the number of repetitions of the
-query is bounded by $R(n)$, then the following provides an upper bound
-on the worst-case query complexity of an IDSP,
+If this routine returns true, it must also modify the local queries as
+necessary to account for the work that remains to be completed (e.g.,
+update the number of records to retrieve). Then, the query process resumes
+from the execution of the local queries. If it returns false, then the
+result is simply returned to the user. If the number of repetitions of
+the query is bounded by $R(n)$, then the following provides an upper
+bound on the worst-case query complexity of an IDSP,

\begin{equation*}
-    O\left(\log_2 n \cdot P(n) + R(n) \left(D(n) + \log_2 n \cdot Q_s(n) +
+    O\left(\log_2 n \cdot P(n) + D(n) + R(n) \left(\log_2 n \cdot Q_s(n) +
    C_e(n)\right)\right)
\end{equation*}

@@ -480,8 +483,8 @@ current traversal state in the meta-information object, and resume a
$k$-NN query on the structure from that state at no cost.
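+
+To make the role of \texttt{repeat} concrete, the following sketch shows
+how a $k$-NN query might implement it: if fewer than $k$ valid results
+remain after deleted records are filtered out, each local query is asked
+for the missing records and re-executed from the traversal state saved
+in its meta-information object. The type and member names here are
+illustrative assumptions only, not the framework's actual interface.
+
+\begin{lstlisting}[language=C++]
+#include <cstddef>
+#include <vector>
+
+/* Hypothetical types standing in for the query parameters, saved
+ * traversal state, and combined result set of a k-NN query. */
+struct Point          { double x, y; };
+struct TraversalState { /* opaque per-shard resume information */ };
+using  ResultSet = std::vector<Point>;
+
+struct KnnParams     { std::size_t k; Point p; };
+struct KnnLocalQuery { std::size_t k; Point p; TraversalState state; };
+
+/* Possible repeat() logic: returns true if the local queries must be
+ * re-executed after being adjusted to cover the remaining work. */
+bool knn_repeat(const KnnParams &q, const ResultSet &result,
+                std::vector<KnnLocalQuery> &local_queries) {
+    if (result.size() >= q.k) {
+        return false;  // enough valid neighbors survived delete filtering
+    }
+    /* Ask each local query only for the records still missing; the saved
+     * traversal state lets it resume where it left off at no extra cost. */
+    std::size_t missing = q.k - result.size();
+    for (auto &lq : local_queries) {
+        lq.k = missing;
+    }
+    return true;
+}
+\end{lstlisting}
+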
\SetKwFunction{repeat}{repeat} - -\begin{algorithm}[th] +\afterpage{\clearpage} +\begin{algorithm}[p] \caption{$k$-NN with Iterative Decomposability} \label{alg:idsp-knn} \KwIn{$k$: result size, $p$: query point} @@ -501,17 +504,15 @@ $k$-NN query on the structure from that state at no cost. \BlankLine \Def{\query{$\mathscr{I}_i$, $q_i=(k,p,\mathscr{M}_i)$}}{ $(r_i, \mathscr{M}_i) \gets \mathscr{I}_i.\text{knn\_from}(k, p, \mathscr{M}_i)$ \; - \BlankLine - \Comment{The local result includes the records stored in a priority queue and query state} + \Comment{The local result stores records in a priority queue} \Return $(r_i, \mathscr{M}_i)$ \; } \BlankLine \Def{\combine{$r_1, \ldots, r_m, \ldots, r_n$, $q=(k,p)$}}{ - $R \gets \{\}$ \; + $R \gets \{\}$ ; $pq \gets \text{PriorityQueue}()$ ; $gpq \gets \text{PriorityQueue}()$ \; - \BlankLine \Comment{Results $1$ through $m$ are from the primary structure, and $m+1$ through $n$ are from the ghost structure.} \For {$i\gets 1 \ldots m$} { @@ -527,7 +528,6 @@ $k$-NN query on the structure from that state at no cost. \While{$|R| < k \land \neg pq.\text{empty}()$} { $(i, d) \gets pq.\text{dequeue}()$ \; - \BlankLine $R \gets R \cup r_i.\text{dequeue}()$ \; \If {$\neg r_i.\text{empty}()$} { $pq.\text{enqueue}(i, r_i.\text{front}())$ \; @@ -539,7 +539,6 @@ $k$-NN query on the structure from that state at no cost. \While{$\neg gpq.\text{empty}()$} { $(i, d) \gets gpq.\text{dequeue}()$ \; - \BlankLine \If {$r_i.\text{front}() \in R$} { $R \gets R / \{r_i.\text{front}()\}$ \; @@ -549,7 +548,6 @@ $k$-NN query on the structure from that state at no cost. } } - \BlankLine \Return $R$ \; } \BlankLine @@ -686,9 +684,104 @@ and deletes, depending upon the classification of the problem in the taxonomy of Figure~\ref{fig:taxonomy}. The user provides the data structure and query implementations as template parameters, and the framework then provides an interface that allows for queries, inserts, -and deletes against the new dynamic structure. +and deletes against the new dynamic structure. Specifically, in addition +to accessors for various structural information, the framework provides +the following main operations, + +\begin{itemize} +\item \texttt{int insert(RecordType); } \\ + This function will insert a record into the dynamized structure, + and will return $1$ if the record was successfully inserted, and $0$ + if it was not. Insertion failure is part of the concurrency control + mechanism, and failed inserts should be retried after a short delay. + More details of this are in Section~\ref{ssec:dyn-concurrency}. + +\item \texttt{int erase(RecordType);} \\ + This function will delete a record from the dynamized structure, + returning $1$ on success and $0$ on failure. The meaning of a + failure to delete is dependent upon the delete mechanism in use, + and will be discussed in Section~\ref{ssec:dyn-deletes}. + +\item \texttt{std::future query(QueryParameters); } \\ + This function will execute a query with the specified parameters + against the structure and return the result. This interface is + asynchronous, and returns a future immediately, which can be used + to access the query result once the query has finished executing. + +\end{itemize} + +It can be configured with a template argument to run in single-threaded +mode, or multi-threaded mode. 
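+
+As a usage illustration, the interface described above might be exercised
+as follows. The \texttt{Dynamized} class template and the record, shard,
+query, and parameter type names used here are hypothetical placeholders
+for exposition, not the framework's actual identifiers.
+
+\begin{lstlisting}[language=C++]
+#include <chrono>
+#include <thread>
+
+/* Assume Rec, MyShard, MyQuery, and QueryParameters satisfy the record,
+ * shard, and query interfaces described below, and that Dynamized is
+ * the framework's dynamization class template (name assumed here). */
+Dynamized<Rec, MyShard, MyQuery> index;
+
+void example(const Rec &r, const QueryParameters &params) {
+    /* A failed insert is part of the concurrency control mechanism;
+     * retry after a short delay. */
+    while (!index.insert(r)) {
+        std::this_thread::sleep_for(std::chrono::microseconds(10));
+    }
+
+    /* Queries are asynchronous: a std::future is returned immediately
+     * and can be waited on for the final result. */
+    auto result = index.query(params).get();
+    (void) result;
+
+    /* erase returns 0 on failure; the meaning of a failure depends on
+     * the delete mechanism in use. */
+    index.erase(r);
+}
+\end{lstlisting}
+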
In multi-threaded mode, the above routines
+can be called concurrently without any necessary synchronization in
+user code, and without requiring any special modification to the data
+structure and queries, beyond those changes necessary to use them in
+single-threaded mode.
+
+\subsection{Basic Principles}
+
+Before discussing the interfaces that the user must implement to use
+their code with our framework, it is worth describing the high-level
+structure and operation of the framework itself, as these details
+inform several of the interface requirements placed on the user's
+code. The high-level structure and organization of the framework is
+similar to that of Section~\ref{ssec:sampling-framework}.
+
+The framework requires the user to specify types to represent the
+record, query, and data structure (which we call a shard). The
+details of the interface requirements for these types are discussed in
+Section~\ref{ssec:dyn-interface}, and are enforced using C++20's concepts
+mechanism.
+
+\begin{figure}
+    \centering %\vspace{-3mm}
+    \subfloat[\small Leveling]{\includegraphics[width=.5\textwidth]{diag/leveling} \label{fig:dyn-leveling}}
+    %\vspace{-3mm}
+    \subfloat[\small Tiering]{\includegraphics[width=.5\textwidth]{diag/tiering} \label{fig:dyn-tiering}}
+    %\vspace{-3mm}
+    \caption{\small An overview of the general structure of the
+    dynamization framework using (a) leveling and
+    (b) tiering layout policies, with a scale factor of 3.
+    Each shard is shown as a
+    dotted box, wrapping its associated dataset ($D_i$) and index ($I_i$). }
+    \label{fig:dyn-framework}
+    %\vspace{-3mm}
+\end{figure}
+
+Internally, the framework consists of a sequence of \emph{levels} with
+increasing record capacity, each containing one or more \emph{shards}. The
+layout of these levels is defined by a template argument, the \emph{layout
+policy}, and an integer called the \emph{scale factor}. The scale factor
+governs how quickly the record capacities of the levels grow, and the
+layout policy controls how the records on a level are broken into shards
+and the way in which records move from level to level during
+reconstructions. The details of layout policies, reconstruction, etc.,
+will be discussed in a later section.
+
+Logically ``above'' these levels is a small unsorted array, called the
+\emph{mutable buffer}. The mutable buffer is of user-configurable size,
+and all inserts into the structure are first placed into it. When
+the buffer fills, it will be flushed into the structure, requiring
+reconstructions to occur in a manner consistent with the layout policy
+in order to make room. A simple graphical representation of the framework
+and two of its layout policies is shown in Figure~\ref{fig:dyn-framework}.
+
+The framework provides two mechanisms for supporting deletes: tagging
+and tombstones. These are identical to the mechanisms discussed in
+Section~\ref{ssec:sampling-deletes}. A tombstone delete inserts a new
+record, identical to the one to be deleted but with an indicator bit
+set in its header, into the structure, while a tagged delete performs
+a lookup of the record to be deleted within the structure and sets
+a bit in its header directly. Tombstone deletes are used to support
+invertible search problems, and tagged deletes are used for deletion
+decomposable search problems.
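+
+To make the distinction concrete, the sketch below shows roughly what
+the two mechanisms amount to. The record type, header layout, and the
+\texttt{point\_lookup} routine are illustrative assumptions only; the
+framework's actual wrapped header type appears in the record interface
+listing later in this section.
+
+\begin{lstlisting}[language=C++]
+#include <cstdint>
+
+/* Hypothetical record and header layout, for illustration only. */
+struct Rec { int key; int value; bool operator==(const Rec&) const = default; };
+struct WrappedRecord {
+    Rec           rec;     // user-provided record
+    std::uint32_t header;  // framework-managed header bits
+};
+
+constexpr std::uint32_t TOMBSTONE = 0x1;  // record is a tombstone
+constexpr std::uint32_t DELETED   = 0x2;  // record was tagged as deleted
+
+/* Tombstone delete (invertible problems): insert a copy of the record
+ * with the indicator bit set; queries cancel it against the original. */
+template <typename S>
+void tombstone_delete(S &structure, const Rec &r) {
+    structure.insert(WrappedRecord{r, TOMBSTONE});
+}
+
+/* Tagged delete (deletion decomposable problems): locate the record in
+ * the structure and set the delete bit in its header directly. The
+ * point_lookup operation is an assumed capability of the shard. */
+template <typename S>
+bool tagged_delete(S &structure, const Rec &r) {
+    WrappedRecord *w = structure.point_lookup(r);
+    if (!w) return false;
+    w->header |= DELETED;
+    return true;
+}
+\end{lstlisting}
+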
+While the delete procedure itself is handled automatically by the
+framework based upon the specified mechanism, it is the user's
+responsibility to appropriately handle deleted records in their query
+and shard implementations.
+
\subsection{Interfaces}
+\label{ssec:dyn-interface}

In order to enforce interface requirements, our implementation takes
advantage of C++20 concepts. There are three major sets of interfaces
@@ -697,16 +790,16 @@ queries. We'll discuss each of these in this section.

\subsubsection{Record Interface}

-The record interface is the simplest of the three. Records are C++
-structs, and they must implement an equality comparision operator. Beyond
-this, the framework places no additional constraints and makes
-no assumptions about record contents, their ordering properties,
-etc. Though the records must be fixed length (as they are structs),
+The record interface is the simplest of the three. The type used as a
+record only requires an implementation of an equality comparison operator,
+and is assumed to be of fixed length. Beyond this, the framework places
+no additional constraints and makes no assumptions about record contents,
+their ordering properties, etc. Though the records must be fixed length,
variable length data can be supported using off-record storage and
pointers if necessary. Each record is automatically wrapped by the
framework with a header that is used to facilitate deletion support.
-The record concept is shown in Listing~\ref{lst:record}, along with the
-wrapped header type that is used to interact with records within
+The record concept is shown in Listing~\ref{lst:record}, along with
+the wrapped header type that is used to interact with records within
the framework.

\begin{lstfloat}
@@ -828,17 +921,18 @@ collection type should be used for these results. A range scan, for
example, could specify the result type as a vector of records, a map
of records, etc., depending on the use case.

-There are two significant differences between the IDSP interface and
-the query concept implementation. The first is in the way that the query
-result object is passed between the \texttt{combine} and \texttt{repeat}
-function. To avoid copying it, as it can be large for some types of query,
-this object is initialized by the framework and passed by reference into
-these two functions. The second difference is that the \texttt{repeat}
-function is responsible for making any updates to local query objects,
-and that when \texttt{repeat} returns \texttt{true}, the local queries
-will be immediately re-executed. To facilitate this, the \texttt{repeat}
-function takes the local query objects as arguments.
-
+There is one significant difference between the IDSP interface and the
+query concept implementation. For efficiency purposes, \texttt{combine}
+does not return the query result object. Instead, the framework
+itself initializes the object, and then passes it by reference into
+\texttt{combine}. This is necessary because \texttt{combine} can be called
+multiple times, depending on whether the query must be repeated. Adding
+it as an argument to \texttt{combine}, rather than returning it,
+allows the local query results to be discarded completely and new
+results to be generated and added to the existing result set in the case
+of a repetition. Without this modification, the user would either need
+to define an additional combination operation for final result types,
+or duplicate effort in the combine step on each repetition.
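+
+As an illustration of this convention, a query implementation might
+structure its \texttt{combine} and \texttt{repeat} routines roughly as
+follows. This is only a sketch: the type names and signatures are
+assumptions made for exposition, not the framework's actual interface,
+which is shown in the listings in this section.
+
+\begin{lstlisting}[language=C++]
+#include <cstddef>
+#include <vector>
+
+/* Hypothetical local query/result types for a simple range-count query. */
+struct LocalQuery  { int lower; int upper; };
+struct LocalResult { std::size_t count; };
+struct CountResult { std::size_t total = 0; };
+
+struct RangeCountQuery {
+    /* The framework constructs the final result object and passes it in
+     * by reference; combine adds to it rather than returning a new
+     * object, so repeated invocations accumulate into one result. */
+    static void combine(const std::vector<LocalResult> &local,
+                        CountResult &result) {
+        for (const auto &r : local) {
+            result.total += r.count;
+        }
+    }
+
+    /* repeat inspects the accumulated result and, if more work remains,
+     * adjusts the local queries and returns true so the framework
+     * re-executes them. A simple count never needs to repeat. */
+    static bool repeat(const CountResult & /*result*/,
+                       std::vector<LocalQuery> & /*local_queries*/) {
+        return false;
+    }
+};
+\end{lstlisting}
+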
\begin{lstfloat} @@ -887,17 +981,182 @@ framework.} \end{lstfloat} -\subsection{Configurability} +\subsection{Internal Mechanisms} -\subsection{Concurrency} +\subsubsection{Inserts and Layout Policy} + +\begin{algorithm}[t] + \caption{Insertion with Dynamization Framework} + \label{algo:insert} + \KwIn{$r$: new record to insert} + \If{\texttt{buffer is not full}}{ + $\texttt{buffer.append}(r)$\; + \Return + } + $\texttt{idx} \gets 0$\; + \For{$i \gets 0 \cdots \texttt{n\_levels}$}{ + \If{$\texttt{level}_i \texttt{ can hold records in }\texttt{level}_{i - 1}$}{ + \texttt{idx} = i\; + \Break\; + } + } + \For{$i \gets \texttt{idx} \cdots 1$}{ + \If{layout\_policy = \texttt{LEVELING}} { + $\texttt{level}_i \gets + \texttt{merge\_shards}(\texttt{level}_i, \texttt{level}_{i - 1})$ \; + } + + \If{layout\_policy = \texttt{TIERING}} { + $\texttt{new\_shard} \gets \texttt{merge\_shards}(\texttt{level}_{i-1})$ \; + $\texttt{level}_i \gets \texttt{add\_shard}(\texttt{level}_i, \texttt{new\_shard})$ \; + } + } + $\texttt{level}_0 \gets \texttt{add\_shard}(\texttt{level}_0, \texttt{build\_shard}(\texttt{buffer}))$\; + $\texttt{buffer.append}(r)$\; + \Return +\end{algorithm} + + +\Paragraph{Asymptotic Complexity.} + +\subsubsection{Delete Policy} + +\Paragraph{Asymptotic Complexity.} + +\Paragraph{Asymptotic Complexity.} + +\subsubsection{Queries} + + +\begin{algorithm}[t] + \caption{Query with Dynamization Framework} + \label{algo:query-framework} + \KwIn{$q$: query parameters, $b$: mutable buffer, $S$: static index shards at all levels} + \KwOut{$R$: query results} + + $\mathscr{S}_b \gets \texttt{local\_preproc}_{buffer}(b, q);\ \ \mathscr{S} \gets \{\}$ \; + \For{$s \in S$}{$\mathscr{S} \gets \mathscr{S}\ \cup (s, \texttt{local\_preproc}(s, q))$\;} + $(q_b, q_1, \ldots q_m) \gets \texttt{distribute\_query}(\mathscr{S}_b, \mathscr{S}, q)$ \; + $\mathcal{R} \gets \{\}; \ \ \texttt{rpt} \gets \bot$ \; + \Do{\texttt{rpt}}{ + $locR \gets \{\}$ \; + $locR \gets locR \cup \texttt{local\_query}_{buffer}(b, q_b)$ \; + % the subscript in this one is wonky. Maybe do an array of Qs? + \For{$s \in S$}{$locR \gets locR \cup \texttt{local\_query}(s, q_s)$} + %\Comment{For \red{name}, use \texttt{tombstone\_lookup} to remove all deleted records. } + %\If{\textbf{not} \texttt{SKIP\_DELETE\_FILTER}}{$locR \gets \texttt{filter\_deletes}(locR, S)$} + $\mathcal{R} \gets \mathcal{R} \cup \texttt{combine}(locR, q_b, q_1, \ldots, q_m)$\; + $(\texttt{rpt}, q_b, q_1, \ldots, + q_m) \gets \texttt{repeat}(q, \mathcal{R}, q_b, q_1,\ldots, q_m)$\; + } + \Return{$\mathcal{R}$} + +\end{algorithm} + +\Paragraph{Asymptotic Complexity.} + +\subsection{Concurrency Control} \section{Evaluation} \subsection{Experimental Setup} \subsection{Design Space Evaluation} + +\begin{figure} + %\vspace{0pt} + \centering + \subfloat[Insertion Throughput \\ vs. Buffer Size]{\includegraphics[width=.4\textwidth]{img/fig-ps-mt-insert} \label{fig:ins-buffer-size}} + \subfloat[Insertion Throughput \\ vs. Scale Factor]{\includegraphics[width=.4\textwidth]{img/fig-ps-sf-insert} \label{fig:ins-scale-factor}} + \\ %\vspace{-2mm} + \subfloat[Query Latency vs. Buffer Size]{\includegraphics[width=.4\textwidth]{img/fig-ps-mt-query} \label{fig:q-buffer-size}} + \subfloat[Query Latency vs. 
Scale Factor]{\includegraphics[width=.4\textwidth]{img/fig-ps-sf-query} \label{fig:q-scale-factor}} + %\vspace{-2mm} + \caption{Design Space Evaluation (Triespline)} + %\vspace{-2mm} +\end{figure} + + \subsection{Independent Range Sampling} + +\begin{figure*} + %\vspace{0pt} + \centering + \subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-irs-insert} \label{fig:irs-insert}} + \subfloat[Query Latency]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-irs-query} \label{fig:irs-query}} + \subfloat[Index Overhead]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-irs-space} \label{fig:irs-space}} + %\vspace{-3mm} + \caption{IRS Index Evaluation} + \label{fig:irs} + %\vspace{-6mm} +\end{figure*} + + +\begin{align*} + \text{Insert:} \quad &\Theta\left(\log_s n\right) \\ + \text{Query:} \quad &\Theta\left(\log_s n \log_f n + \frac{k}{1 - \delta}\right) \\ + \text{Delete:} \quad &\Theta\left(\log_s n \log_f n\right) +\end{align*} + + \subsection{k-NN Search} + +\begin{figure*} + \subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-knn-insert} \label{fig:knn-insert}} + \subfloat[Query Latency]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-knn-query} \label{fig:knn-query}} + \subfloat[Index Overhead]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-knn-space} \label{fig:knn-space}} + %\vspace{-3mm} + \caption{k-NN Index Evaluation} + %\vspace{-3mm} + \label{fig:knn-eval} +\end{figure*} + + +\begin{align*} + \text{Insert:} \quad &\Theta\left(\log_s n\right) \\ + \text{Query:} \quad &\Theta\left(N_B + \log n \log_s n\right ) \\ + \text{Delete:} \quad &\Theta\left(\log_s n \right) +\end{align*} + \subsection{Range Scan} + +\begin{figure*} + \centering + \subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-rq-insert} \label{fig:rq-insert}} + \subfloat[Query Latency]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-rq-query} \label{fig:rq-query}} + \subfloat[Index Overhead]{\includegraphics[width=.32\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-rq-space} \label{fig:rq-space}} + %\vspace{-3mm} + \caption{Learned Index Evaluation} + %\vspace{-3mm} + \label{fig:eval-learned-index} +\end{figure*} + \subsection{String Search} + +\begin{figure*} + \centering + \subfloat[Update Throughput]{\includegraphics[width=.32\textwidth, trim=5mm 2mm 0 0]{img/fig-bs-fst-insert} \label{fig:fst-insert}} + \subfloat[Query Latency]{\includegraphics[width=.32\textwidth, trim=5mm 2mm 0 0]{img/fig-bs-fst-query} \label{fig:fst-query}} + \subfloat[Index Overhead]{\includegraphics[width=.32\textwidth, trim=5mm 2mm 0 0]{img/fig-bs-fst-space} \label{fig:fst-size}} + %\vspace{-3mm} + \caption{FST Evaluation} + %\vspace{-5mm} +\end{figure*} + +\begin{align*} + \text{Insert:} \quad &\Theta\left(\log_s n\right) \\ + \text{Query:} \quad &\Theta\left(N_B + \log n \log_s n\right ) \\ + \text{Delete:} \quad &\Theta\left(\log_s n \right) +\end{align*} + \subsection{Concurrency} +\begin{figure} + \centering + %\vspace{-2mm} + \includegraphics[width=.5\textwidth]{img/fig-bs-irs-concurrency} + %\vspace{-2mm} + \caption{IRS Thread Scaling} + \label{fig:irs-concurrency} + %\vspace{-2mm} +\end{figure} + \section{Conclusion} -- cgit v1.2.3