From 4b3173bbe0ecb9cc5624c2762183f1b90fb134c7 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 25 Jun 2025 16:40:55 -0400 Subject: Plot updates --- chapters/tail-latency.tex | 84 ++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 41 deletions(-) (limited to 'chapters/tail-latency.tex') diff --git a/chapters/tail-latency.tex b/chapters/tail-latency.tex index 5b3dfa5..1d707b4 100644 --- a/chapters/tail-latency.tex +++ b/chapters/tail-latency.tex @@ -229,7 +229,7 @@ compared to B+Trees, as shown in Figure~\ref{fig:tl-floodl0-query}. \subfloat[Insertion Latency Distribution]{\includegraphics[width=.5\textwidth]{img/tail-latency/floodl0-insert.pdf} \label{fig:tl-floodl0-insert}} \subfloat[Query Latency Distribution]{\includegraphics[width=.5\textwidth]{img/tail-latency/floodl0-query.pdf} \label{fig:tl-floodl0-query}} \\ -\caption{Latency Distributions for a "Reconstructionless" Dynamization} +\caption{Latency Distributions for a Reconstructionless Dynamization} \label{fig:tl-floodl0} \end{figure} @@ -1317,7 +1317,7 @@ using the SOSD \texttt{OSM} dataset and point lookup queries. \centering \subfloat[Insertion Throughput vs. Query Latency for Varying Scale Factors]{\includegraphics[width=.5\textwidth]{img/tail-latency/stall-sf-sweep.pdf} \label{fig:tl-sf-curve}} \subfloat[Insertion Tail Latency for Varying Buffer Sizes]{\includegraphics[width=.5\textwidth]{img/tail-latency/buffer-tail-latency.pdf} \label{fig:tl-buffer-tail}} \\ -\caption{"Legacy" Design Space Examination} +\caption{Legacy Design Space Examination} \label{fig:tl-design-space} \end{figure} @@ -1351,7 +1351,7 @@ relatively small. \begin{figure} \centering \subfloat[Insertion Throughput vs. Query Latency]{\includegraphics[width=.5\textwidth]{img/tail-latency/recon-thread-scale.pdf} \label{fig:tl-latency-threads}} -\subfloat[Insertion Query Interference]{\includegraphics[width=.5\textwidth]{img/tail-latency/knn-stall-shard-dist.pdf} \label{fig:tl-query-scaling}} \\ +\subfloat[Maximum Insertion Throughput for a Given Query Latency]{\includegraphics[width=.5\textwidth]{img/tail-latency/constant-query.pdf} \label{fig:tl-query-scaling}} \\ \caption{Framework Thread Scaling} \label{fig:tl-threads} @@ -1372,44 +1372,46 @@ any concurrent queries. In these tests, we used the SOSD \texttt{OSM} dataset (200M records) and point-lookup queries without early abort against a dynamized ISAM tree. -For our first test, we considered the insertion throughput vs. query -latency trade-off for various stall amounts with several internal -thread counts. We inserted 30\% of the dataset first, and then measured -the insertion throughput over the insertion of the rest of the data -on a client thread, while another client thread continuously issued -queries against the structure. The results of this test are shown in -Figure~\ref{fig:tl-latency-threads}. The first note is that the change -in the number of available internal threads has little effect on the -insertion throughput. This is to be expected, as inserts throughput is -limited only by the stall amount, and by the buffer flushing operation. As -flushing occurs on a dedicated thread, it is unaffected by changes in the -internal thread configuration of the system. - -Query latency, however, does show a difference at the upper end of -insertion throughput. 
Insufficient parallel threads can affect the
-query latency in two ways,
-\begin{enumerate}
- \item As queries and reconstructions share threads, if all threads
- are occupied by a long running reconstruction, then queries must wait
- for the reconstruction to complete before they can execute.
- \item Increased capacity for parallel reconstructions allows shards
- to be merged more rapidly, resulting in an overall reduction in the
- shard count.
-\end{enumerate}
-Interestingly, at least in this test, both of these effects are largely
-suppressed with only a moderate reduction in insertion throughput. But,
-insufficient parallelism does result in the higher-throughput
-configurations suffering a significant query latency increase in general.
-
-Of particular note here is the single internal thread test. While for very
-low insertion throughputs, even one thread is enough to keep pace, query
-performance degrades rapidly as the insertion throughput is increased,
-so much so that we had to cut off part of the curve to ensure that the
-other thread configurations were visible in the plot at all. Recall that
-this configuration requires that both queries and reconstructions be
-scheduled on the same shared thread, and so the query latency suffers
-significantly from having to wait behind long-running reconstructions,
-as well as taking longer due to having more shards in the structure.
+We considered the insertion throughput vs. query latency trade-off for
+various stall amounts with several internal thread counts. We first
+inserted 30\% of the dataset, and then measured the insertion throughput
+as the remainder of the data was inserted on a client thread, while
+another client thread continuously issued queries against the structure.
+The results of this test are shown in
+Figure~\ref{fig:tl-latency-threads}. The first thing to note is that the
+number of available internal threads has little effect on the insertion
+throughput, as shown by the tight clustering of the points on the
+curve. This is to be expected, as insertion throughput is limited only
+by the stall amount and the buffer flushing operation. As flushing
+occurs on a dedicated thread, it is unaffected by changes in the
+internal thread configuration of the system.
+
+In terms of query performance, two general effects can be observed.
+First, the previously noted reduction in query performance as insertion
+throughput increases appears in all cases, irrespective of thread
+count. Second, and interestingly, the thread count itself has little
+effect on the curve outside of the single-thread case. This can also be
+seen in Figure~\ref{fig:tl-query-scaling}, which shows an alternative
+view of the same data: the best measured insertion throughput achievable
+under a given query latency bound. In both views, configurations with
+two or more threads sustain significantly higher insertion throughput at
+a given query latency. At very low insertion throughputs, however, this
+advantage vanishes and all thread counts perform roughly equivalently.
+
+A large part of the reason for this deviation in behavior between one
+thread and multiple threads is likely that queries and reconstructions
+share the same pool of background threads in this framework.
+Our testing involved issuing queries continuously on a single thread
+while performing inserts, and so two background threads ensure that a
+reconstruction and a query can run in parallel, whereas a single thread
+forces queries to wait behind long-running reconstructions. Once this
+bottleneck is overcome, reducing the amount of parallel reconstruction
+seems to have only a minor influence on overall performance. This is
+likely because, although in the worst case the system requires
+$\log_s n$ threads to fully parallelize reconstructions, this worst case
+is fairly rare. The vast majority of reconstructions require only a
+fraction of this total parallel capacity.
+
 
 \section{Conclusion}
-- 
cgit v1.2.3
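A rough, back-of-the-envelope check of the $\log_s n$ worst case cited in the new text is sketched below. The scale factor $s = 8$ is an assumed, purely illustrative value and does not come from the patch; $n = 2\times10^{8}$ is the \texttt{OSM} record count used in these experiments.

% Illustrative only: s = 8 is an assumed scale factor; n = 2e8 is the OSM record count.
\[
    \log_s n = \log_{8}\!\left(2\times10^{8}\right)
             = \frac{\ln\left(2\times10^{8}\right)}{\ln 8}
             \approx \frac{19.1}{2.08}
             \approx 9.2
\]

Under these assumptions, even the theoretical worst case calls for only around nine or ten internal threads, and, as the text notes, the vast majority of reconstructions need only a fraction of that capacity.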