From 5e4ad2777acc4c2420514e39fb98b7cf2e200996 Mon Sep 17 00:00:00 2001
From: Douglas Rumbaugh <dbr4@psu.edu>
Date: Sun, 27 Apr 2025 17:36:57 -0400
Subject: Initial commit

---
 chapters/sigmod23/experiment.tex | 48 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 chapters/sigmod23/experiment.tex

(limited to 'chapters/sigmod23/experiment.tex')

diff --git a/chapters/sigmod23/experiment.tex b/chapters/sigmod23/experiment.tex
new file mode 100644
index 0000000..75cf32e
--- /dev/null
+++ b/chapters/sigmod23/experiment.tex
@@ -0,0 +1,48 @@
+\section{Evaluation}
+\label{sec:experiment}
+
+\Paragraph{Experimental Setup.} All experiments were run under Ubuntu 20.04 LTS
+on a dual-socket Intel Xeon Gold 6242R server with 384~GiB of physical memory
+and 40 physical cores. External tests were run using a 4~TB WD Red SA500 SATA
+SSD, rated for 95000 and 82000 IOPS for random reads and writes, respectively.
+
+\Paragraph{Datasets.} Testing utilized a variety of synthetic and real-world
+datasets. For all datasets used, the key was represented as a 64-bit integer,
+the weight as a 64-bit integer, and the value as a 32-bit integer. Each record
+also contained a 32-bit header. The weight was omitted from IRS testing.
+Keys and weights were pulled from the dataset directly, and values were
+generated separately and were unique for each record. The following datasets
+were used:
+\begin{itemize}
+\item \textbf{Synthetic Uniform.} A non-weighted, synthetically generated list 
+                                  of keys drawn from a uniform distribution.
+\item \textbf{Synthetic Zipfian.} A non-weighted, synthetically generated list 
+                                  of keys drawn from a Zipfian distribution with 
+                                  a skew of $0.8$.
+\item \textbf{Twitter~\cite{data-twitter,data-twitter1}.} $41$ million Twitter user IDs, weighted by follower counts.
+\item \textbf{Delicious~\cite{data-delicious}.} $33.7$ million URLs, represented using unique integers, 
+                          weighted by the number of associated tags.
+\item \textbf{OSM~\cite{data-osm}.} $2.6$ billion geospatial coordinates for points
+                    of interest, collected by OpenStreetMap. The latitude, converted
+                    to a 64-bit integer, was used as the key and the number of
+                    semantic tags associated with the point as the weight.
+\end{itemize}
+The synthetic datasets were not used for weighted experiments, as they do not
+have weights. For unweighted experiments, the Twitter and Delicious datasets
+were not used, as they have uninteresting key distributions.
+
+\Paragraph{Compared Methods.} In this section, indexes extended using the
+framework are compared against existing dynamic baselines. Specifically, DE-WSS
+(Section~\ref{ssec:wss-struct}), DE-IRS (Section~\ref{ssec:irs-struct}), and
+DE-WIRS (Section~\ref{ssec:irs-struct}) are examined. In-memory extensions are
+compared against the B+tree with aggregate weight tags on internal nodes (AGG
+B+tree)~\cite{olken95}, and concurrent and external extensions are compared
+against the AB-tree~\cite{zhao22}. Sampling performance is also compared against
+comparable static sampling indexes: the alias structure~\cite{walker74} for WSS,
+the in-memory ISAM tree for IRS, and the alias-augmented B+tree~\cite{afshani17}
+for WIRS. Note that all structures under test, with the exception of the
+external DE-IRS and external AB-tree, were contained entirely within system
+memory. All benchmarking code and data structures were implemented using C++17
+and compiled using gcc 11.3.0 at the \texttt{-O3} optimization level. The
+extension framework itself, excluding the shard implementations and utility
+headers, consisted of a header-only library of about 1200 SLOC.
-- 
cgit v1.2.3