From 0dab92f9b14e75f68dda8c556398ea2d55e27494 Mon Sep 17 00:00:00 2001
From: Douglas Rumbaugh
Date: Wed, 4 Jun 2025 16:01:20 -0400
Subject: updates

---
 chapters/design-space.tex                | 40 +++++++++-----------
 chapters/tail-latency.tex                | 61 ++++++++++++++++++++++++++++++-
 img/design-space/isam-parm-sweep.pdf     | Bin 10116 -> 10807 bytes
 img/design-space/knn-parm-sweep.pdf      | Bin 0 -> 10237 bytes
 img/design-space/vptree-tput.pdf         | Bin 0 -> 15790 bytes
 img/tail-latency/recon-thread-scale.pdf  | Bin 0 -> 9800 bytes
 img/tail-latency/stall-latency-curve.pdf | Bin 10223 -> 10192 bytes
 img/tail-latency/stall-sf-sweep.pdf      | Bin 0 -> 9975 bytes
 8 files changed, 76 insertions(+), 25 deletions(-)
 create mode 100644 img/design-space/knn-parm-sweep.pdf
 create mode 100644 img/design-space/vptree-tput.pdf
 create mode 100644 img/tail-latency/recon-thread-scale.pdf
 create mode 100644 img/tail-latency/stall-sf-sweep.pdf

diff --git a/chapters/design-space.tex b/chapters/design-space.tex
index 952be42..7cfe6a3 100644
--- a/chapters/design-space.tex
+++ b/chapters/design-space.tex
@@ -192,14 +192,13 @@ analysis.
 The worst-case cost of a reconstruction is $B(n)$, and there are
 $\log_s(n)$ total levels, so the total reconstruction costs associated
 with a record can be upper-bounded by $B(n) \cdot
 \frac{W(\log_s(n))}{n}$, and then this cost is amortized over the $n$
-insertions necessary to get the record into the last level, resulting
+insertions necessary to get the record into the last level. We'll also
+condense the multiplicative constants and drop the additive ones to more
+clearly represent the relationship we're looking to show. This results
 in an amortized insertion cost of,
 \begin{equation*}
-\frac{B(n)}{n} \cdot \frac{1}{2}(s-1) \cdot ( (s-1)\log_s n + s)
+\frac{B(n)}{n} \cdot s \log_s n
 \end{equation*}
-Note that, in the case of $s=2$, this expression reduces to the same amortized
-insertion cost as was derived using Binomial Theorem in the original BSM
-paper~\cite{saxe79}.
 \end{proof}
 
 \begin{theorem}
@@ -361,9 +360,10 @@ and sum this over all of the levels.
 \end{equation*}
 To calculate the amortized insertion cost, we multiply this write amplification
 number by the cost of rebuilding the structures, and divide by the total number
-of records,
+of records. We'll condense the constant into a single $s$, as this best
+expresses the relationship we're interested in,
 \begin{equation*}
-I_A(n) \in \Theta\left(\frac{B(n)}{n}\cdot \frac{1}{2} (s+1) \log_s n\right)
+I_A(n) \in \Theta\left(\frac{B(n)}{n}\cdot s \log_s n\right)
 \end{equation*}
 \end{proof}
 
@@ -503,17 +503,10 @@ I(n) \in \Theta\left(B(n)\right)
 \end{theorem}
 \begin{proof}
 The worst-case reconstruction in tiering involves performing a
-reconstruction on each level. Of these, the largest level will
-contain $\Theta(n)$ records, and thus dominates the cost of the
-reconstruction. More formally, the total cost of this reconstruction
-will be,
+reconstruction on each level. More formally, the total cost of this
+reconstruction will be,
 \begin{equation*}
-I(n) = \sum_{i=0}{\log_s n} B(s^i) = B(1) + B(s) + B(s^2) + \ldots B(s^{\log_s n})
-\end{equation*}
-Of these, the final term $B(s^{\log_s n}) = B(n)$ dominates the others,
-resulting in an asymptotic worst-case cost of,
-\begin{equation*}
-I(n) \in \Theta\left(B(n)\right)
+I(n) \in \Theta\left(\sum_{i=0}^{\log_s n} B(s^i)\right)
 \end{equation*}
 \end{proof}
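+As a quick illustration of how this sum behaves (an example we add
+here; the linear build cost is an assumption chosen for illustration,
+not a requirement of the analysis), consider a structure with
+$B(n) \in \Theta(n)$. The sum is then geometric,
+\begin{equation*}
+\sum_{i=0}^{\log_s n} s^i = \frac{s^{\log_s n + 1} - 1}{s - 1} \in \Theta(n)
+\end{equation*}
+and so, for such linearly buildable structures, the worst-case
+reconstruction cost reduces to the familiar $\Theta(B(n))$.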
@@ -600,8 +593,8 @@ reconstructions, one per level.
 & \textbf{Gen. BSM} & \textbf{Leveling} & \textbf{Tiering} \\ \hline
 $\mathscr{Q}(n)$ &$O\left(\log_s n \cdot \mathscr{Q}_S(n)\right)$ & $O\left(\log_s n \cdot \mathscr{Q}_S(n)\right)$ & $O\left(s \log_s n \cdot \mathscr{Q}_S(n)\right)$\\ \hline
 $\mathscr{Q}_B(n)$ & $\Theta(\mathscr{Q}_S(n))$ & $O(\log_s n \cdot \mathscr{Q}_S(n))$ & $O(\log_s n \cdot \mathscr{Q}_S(n))$ \\ \hline
-$I(n)$ & $\Theta(B(n))$ & $\Theta\left(B\left(\frac{s-1}{s} \cdot n\right)\right)$ & $\Theta(B(n))$\\ \hline
-$I_A(n)$ & $\Theta\left(\frac{B(n)}{n} \frac{1}{2}(s-1)\cdot((s-1)\log_s n +s)\right)$ & $\Theta\left(\frac{B(n)}{n} \frac{1}{2}(s-1)\log_s n\right)$& $\Theta\left(\frac{B(n)}{n} \log_s n\right)$ \\ \hline
+$I(n)$ & $\Theta(B(n))$ & $\Theta\left(B\left(\frac{s-1}{s} \cdot n\right)\right)$ & $\Theta\left(\sum_{i=0}^{\log_s n} B(s^i)\right)$ \\ \hline
+$I_A(n)$ & $\Theta\left(\frac{B(n)}{n} s \log_s n\right)$ & $\Theta\left(\frac{B(n)}{n} s \log_s n\right)$ & $\Theta\left(\frac{B(n)}{n} \log_s n\right)$ \\ \hline
 \end{tabular}
 
 \caption{Comparison of cost functions for various layout policies for DSPs}
@@ -699,7 +692,7 @@ due to cache effects most likely, but less so than in the MDSP case.
 \begin{figure}
 \centering
 \subfloat[ISAM Tree]{\includegraphics[width=.5\textwidth]{img/design-space/isam-tput.pdf} \label{fig:design-isam-tput}}
-\subfloat[VPTree]{\includegraphics[width=.5\textwidth]{img/design-space/vptree-insert-dist.pdf} \label{fig:design-vptree-tput}} \\
+\subfloat[VPTree]{\includegraphics[width=.5\textwidth]{img/design-space/vptree-tput.pdf} \label{fig:design-vptree-tput}} \\
 \caption{Insertion Throughput for Layout Policies}
 \label{fig:design-ins-tput}
 \end{figure}
 
@@ -772,9 +765,10 @@ method shows similar trends to leveling.
 
 In general, the Bentley-Saxe method appears to follow a very similar
 trend to that of leveling, albeit with even more dramatic performance
-degradation as the scale factor is increased. Generally it seems to be
-a strictly worse alternative to leveling in all but its best-case query
-cost, and we will omit it from our tests moving forward as a result.
+degradation as the scale factor is increased, though with slightly
+better query performance across the board. Overall, it seems to be a
+strictly worse alternative to leveling in everything but query
+performance, and we will omit it from our tests moving forward.
 
 \subsection{Query Size Effects}
 
diff --git a/chapters/tail-latency.tex b/chapters/tail-latency.tex
index a88fe0c..4e79cff 100644
--- a/chapters/tail-latency.tex
+++ b/chapters/tail-latency.tex
@@ -939,7 +939,7 @@ able to reduce the insertion tail latency, while being able to match the
 general insertion and query performance of a strict tiering policy.
 Recall that, in the insertion stall case, no explicit shard capacity
 limits are enforced by the framework. Reconstructions are triggered
 with each buffer
-flush on all levels exceeding a specified shard count ($s = 4$ in these
+flush on all levels exceeding a specified shard count ($s = 6$ in these
 tests) and the buffer flushes immediately when full with no regard to
 the state of the structure. Thus, limiting the insertion latency is the
 only means the system uses to maintain its shard count at a manageable level.
 
@@ -957,7 +957,7 @@ each time the buffer flushed.
 Note that a stall value of one indicates no stalling at all, and
 values less than one indicate $1 - \delta$ probability of an insert
 being rejected. Thus, a lower stall value means more stalls are
 introduced. The tiering policy is strict tiering with a
-scale factor of $s=4$. It uses the concurrency control scheme described
+scale factor of $s=6$. It uses the concurrency control scheme described
 in Section~\ref{ssec:dyn-concurrency}.
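+As a brief aside (our own illustration, under the assumption that
+rejected inserts are simply retried by the client): in this model the
+number of attempts needed for a successful insert is geometrically
+distributed, with expected value
+\begin{equation*}
+\sum_{k=1}^{\infty} k \, \delta (1 - \delta)^{k-1} = \frac{1}{\delta}
+\end{equation*}
+so the ceiling on insertion throughput scales roughly linearly with
+the stall value $\delta$.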
@@ -1101,6 +1101,63 @@ to provide a superior set of design trade-offs than the strict policies,
 at least in environments where sufficient parallel processing and memory
 are available to leverage parallel reconstructions.
 
+\subsection{Thread Scaling}
+
+\begin{figure}
+\centering
+\subfloat[Insertion Throughput vs. Query Latency]{\includegraphics[width=.5\textwidth]{img/tail-latency/recon-thread-scale.pdf} \label{fig:tl-latency-threads}}
+\subfloat[Insertion Query Interference]{\includegraphics[width=.5\textwidth]{img/tail-latency/knn-stall-shard-dist.pdf} \label{fig:tl-query-scaling}} \\
+
+\caption{Framework Thread Scaling}
+\label{fig:tl-threads}
+
+\end{figure}
+
+In the previous tests, we ran our system configured with 32 available
+threads, which was more than enough to run all reconstructions and
+queries fully in parallel. However, it's important to determine how
+well the system works in more resource-constrained environments. The
+system shares internal threads between reconstructions and queries,
+while flushing occurs on a dedicated thread separate from these. During
+the benchmark, one client thread issued queries continuously and
+another issued inserts. The index accumulated a total of five levels,
+so the maximum amount of parallelism available during the testing was
+four parallel reconstructions, along with the dedicated flushing thread
+and any concurrent queries. In these tests, we used the SOSD
+\texttt{OSM} dataset (200M records) and point-lookup queries without
+early abort against a dynamized ISAM tree.
+
+For our first test, we considered the insertion throughput vs. query
+latency trade-off for various stall amounts and several internal thread
+counts. We inserted 30\% of the dataset first, and then measured the
+throughput of one client thread as it inserted the remainder of the
+data, while another client thread continuously issued queries against
+the structure. The results of this test are shown in
+Figure~\ref{fig:tl-latency-threads}. The first thing to note is that
+changing the number of available internal threads has little effect on
+insertion throughput. This is to be expected, as insertion throughput
+is limited only by the stall amount and by the buffer flushing
+operation. As flushing occurs on a dedicated thread, it is unaffected
+by changes in the internal thread configuration of the system.
+
+Query latency, however, does show a difference at the upper end of
+insertion throughput. Insufficient parallel threads can affect query
+latency in two ways,
+\begin{enumerate}
+	\item As queries and reconstructions share threads, if all threads
+	are occupied by a long-running reconstruction, then queries must
+	wait for the reconstruction to complete before they can execute.
+	\item Reduced capacity for parallel reconstructions means that
+	shards are merged less rapidly, resulting in an overall increase in
+	the shard count, and thus in the number of shards each query must
+	examine.
+\end{enumerate}
+Interestingly, at least in this test, both of these effects are largely
+suppressed with only a moderate reduction in insertion throughput.
+However, insufficient parallelism does result in the higher-throughput
+configurations suffering a significant query latency increase.
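+As a rough sketch of the threading arrangement described above (the
+names and structure here are our own illustrative assumptions, not the
+framework's actual implementation), the internal pool shared by
+reconstructions and queries might look as follows, with buffer flushes
+running on their own dedicated thread outside of the pool:
+\begin{verbatim}
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+// Internal worker pool: reconstruction jobs and query jobs compete
+// for the same fixed set of threads. Buffer flushes are handled by
+// a separate, dedicated std::thread and never enter this pool.
+class WorkerPool {
+public:
+    explicit WorkerPool(size_t nthreads) {
+        for (size_t i = 0; i < nthreads; i++) {
+            m_workers.emplace_back([this] {
+                for (;;) {
+                    std::function<void()> job;
+                    {
+                        std::unique_lock<std::mutex> lk(m_mtx);
+                        m_cv.wait(lk, [this] {
+                            return m_stop || !m_jobs.empty();
+                        });
+                        if (m_stop && m_jobs.empty()) return;
+                        job = std::move(m_jobs.front());
+                        m_jobs.pop();
+                    }
+                    job(); // a reconstruction or a query runs here
+                }
+            });
+        }
+    }
+
+    // Queries and reconstructions are submitted through the same
+    // interface, so a long-running reconstruction can occupy a
+    // worker that a query would otherwise use.
+    void submit(std::function<void()> job) {
+        {
+            std::lock_guard<std::mutex> lk(m_mtx);
+            m_jobs.push(std::move(job));
+        }
+        m_cv.notify_one();
+    }
+
+    ~WorkerPool() {
+        {
+            std::lock_guard<std::mutex> lk(m_mtx);
+            m_stop = true;
+        }
+        m_cv.notify_all();
+        for (auto &w : m_workers) w.join();
+    }
+
+private:
+    std::vector<std::thread> m_workers;
+    std::queue<std::function<void()>> m_jobs;
+    std::mutex m_mtx;
+    std::condition_variable m_cv;
+    bool m_stop = false;
+};
+\end{verbatim}
+This sharing is what produces the query latency effects discussed
+above: with a small pool, a single long-running reconstruction can
+occupy every worker, forcing queries to queue behind it.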
+ + + \section{Conclusion} diff --git a/img/design-space/isam-parm-sweep.pdf b/img/design-space/isam-parm-sweep.pdf index 8639c2b..95f31e6 100644 Binary files a/img/design-space/isam-parm-sweep.pdf and b/img/design-space/isam-parm-sweep.pdf differ diff --git a/img/design-space/knn-parm-sweep.pdf b/img/design-space/knn-parm-sweep.pdf new file mode 100644 index 0000000..c6243f8 Binary files /dev/null and b/img/design-space/knn-parm-sweep.pdf differ diff --git a/img/design-space/vptree-tput.pdf b/img/design-space/vptree-tput.pdf new file mode 100644 index 0000000..adf3937 Binary files /dev/null and b/img/design-space/vptree-tput.pdf differ diff --git a/img/tail-latency/recon-thread-scale.pdf b/img/tail-latency/recon-thread-scale.pdf new file mode 100644 index 0000000..c43fc8c Binary files /dev/null and b/img/tail-latency/recon-thread-scale.pdf differ diff --git a/img/tail-latency/stall-latency-curve.pdf b/img/tail-latency/stall-latency-curve.pdf index 3a94d57..ebffbf6 100644 Binary files a/img/tail-latency/stall-latency-curve.pdf and b/img/tail-latency/stall-latency-curve.pdf differ diff --git a/img/tail-latency/stall-sf-sweep.pdf b/img/tail-latency/stall-sf-sweep.pdf new file mode 100644 index 0000000..326402b Binary files /dev/null and b/img/tail-latency/stall-sf-sweep.pdf differ -- cgit v1.2.3