author     Douglas Rumbaugh <dbr4@psu.edu>    2025-04-27 17:36:57 -0400
committer  Douglas Rumbaugh <dbr4@psu.edu>    2025-04-27 17:36:57 -0400
commit     5e4ad2777acc4c2420514e39fb98b7cf2e200996 (patch)
tree       276c075048e85426436db8babf0ca1f37e9fdba2
Initial commit
-rw-r--r--  .gitignore  9
-rw-r--r--  Makefile  32
-rw-r--r--  chapters/abstract.tex  42
-rw-r--r--  chapters/acknowledgments.tex  2
-rw-r--r--  chapters/background.tex  746
-rw-r--r--  chapters/background.tex.bak  574
-rw-r--r--  chapters/beyond-bsm.tex  3
-rw-r--r--  chapters/beyond-dsp.tex  863
-rw-r--r--  chapters/chapter1-old.tex  256
-rw-r--r--  chapters/chapter1.tex.bak  204
-rw-r--r--  chapters/conclusion.tex  43
-rw-r--r--  chapters/dynamic-extension-sampling.tex  22
-rw-r--r--  chapters/future-work.tex  174
-rw-r--r--  chapters/introduction.tex  95
-rw-r--r--  chapters/sigmod23/abstract.tex  29
-rw-r--r--  chapters/sigmod23/background.tex  182
-rw-r--r--  chapters/sigmod23/conclusion.tex  17
-rw-r--r--  chapters/sigmod23/examples.tex  143
-rw-r--r--  chapters/sigmod23/exp-baseline.tex  98
-rw-r--r--  chapters/sigmod23/exp-extensions.tex  40
-rw-r--r--  chapters/sigmod23/exp-parameter-space.tex  105
-rw-r--r--  chapters/sigmod23/experiment.tex  48
-rw-r--r--  chapters/sigmod23/extensions.tex  57
-rw-r--r--  chapters/sigmod23/framework.tex  573
-rw-r--r--  chapters/sigmod23/introduction.tex  20
-rw-r--r--  chapters/sigmod23/relatedwork.tex  33
-rw-r--r--  chapters/vita.tex  0
-rw-r--r--  cls/ACM-Reference-Format.bst  3081
-rw-r--r--  cls/psuthesis.cls  915
-rw-r--r--  cls/userlib.tex  22
-rw-r--r--  img/fig-bs-irs-insert.pdf  bin  0 -> 16265 bytes
-rw-r--r--  img/fig-bs-irs-query.pdf  bin  0 -> 19596 bytes
-rw-r--r--  img/fig-bs-irs-space.pdf  bin  0 -> 18517 bytes
-rw-r--r--  img/fig-bs-knn-insert.pdf  bin  0 -> 18488 bytes
-rw-r--r--  img/fig-bs-knn-query.pdf  bin  0 -> 13582 bytes
-rw-r--r--  img/fig-bs-knn.pdf  bin  0 -> 18967 bytes
-rw-r--r--  img/fig-bs-rq-insert.pdf  bin  0 -> 22322 bytes
-rw-r--r--  img/fig-bs-rq-query.pdf  bin  0 -> 28817 bytes
-rw-r--r--  img/fig-bs-rq-space.pdf  bin  0 -> 21682 bytes
-rw-r--r--  img/fig-bs-wss-insert.pdf  bin  0 -> 17092 bytes
-rw-r--r--  img/fig-bs-wss-sample.pdf  0
-rw-r--r--  img/leveling.pdf  bin  0 -> 5836 bytes
-rw-r--r--  img/sigmod23/alias.pdf  bin  0 -> 7645 bytes
-rw-r--r--  img/sigmod23/delete-tagging.pdf  bin  0 -> 10951 bytes
-rw-r--r--  img/sigmod23/delete-tombstone.pdf  bin  0 -> 12218 bytes
-rw-r--r--  img/sigmod23/merge-leveling.pdf  bin  0 -> 14652 bytes
-rw-r--r--  img/sigmod23/merge-tiering.pdf  bin  0 -> 15395 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-ext-insert.pdf  bin  0 -> 19747 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-ext-sample.pdf  bin  0 -> 23070 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-irs-insert.pdf  bin  0 -> 16614 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-irs-sample.pdf  bin  0 -> 22793 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-irs-samplesize.pdf  bin  0 -> 13642 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-irs-selectivity.pdf  bin  0 -> 13908 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-wirs-insert.pdf  bin  0 -> 16701 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-wirs-sample.pdf  bin  0 -> 23112 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-wss-insert.pdf  bin  0 -> 16402 bytes
-rw-r--r--  img/sigmod23/plot/fig-bs-wss-sample.pdf  bin  0 -> 22831 bytes
-rw-r--r--  img/sigmod23/plot/fig-cc-irs-scale.pdf  bin  0 -> 13858 bytes
-rw-r--r--  img/sigmod23/plot/fig-cc-irs-thread.pdf  bin  0 -> 10177 bytes
-rw-r--r--  img/sigmod23/plot/fig-cl-buffsweep.pdf  bin  0 -> 13629 bytes
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-bloom-memory.pdf  0
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-bloom-sample.pdf  0
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-bloom.pdf  bin  0 -> 12883 bytes
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-mt-insert.pdf  bin  0 -> 14489 bytes
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-mt-sample.pdf  bin  0 -> 14519 bytes
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-samplesize.pdf  bin  0 -> 12681 bytes
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-sf-insert.pdf  bin  0 -> 10446 bytes
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-sf-sample.pdf  bin  0 -> 13544 bytes
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-tp-insert.pdf  bin  0 -> 10867 bytes
-rw-r--r--  img/sigmod23/plot/fig-ps-wss-tp-sample.pdf  bin  0 -> 14148 bytes
-rw-r--r--  img/sigmod23/plot/fig-sc-irs-delete.pdf  bin  0 -> 10862 bytes
-rw-r--r--  img/sigmod23/plot/fig-sc-irs-insert.pdf  bin  0 -> 10572 bytes
-rw-r--r--  img/sigmod23/plot/fig-sc-irs-sample.pdf  bin  0 -> 15052 bytes
-rw-r--r--  img/sigmod23/plot/fig-sc-wss-insert.pdf  bin  0 -> 10796 bytes
-rw-r--r--  img/sigmod23/plot/fig-sc-wss-sample.pdf  bin  0 -> 15245 bytes
-rw-r--r--  img/sigmod23/sampling.pdf  bin  0 -> 14248 bytes
-rw-r--r--  img/tiering.pdf  bin  0 -> 6281 bytes
-rw-r--r--  paper.tex  437
-rw-r--r--  references/references.bib  1419
79 files changed, 10284 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c91a7e3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+*.aux
+*.log
+*.out
+*.blg
+*.bbl
+*.gz
+build/*
+draft/*
+exp-fig-code/*.pdf
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9e91350
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,32 @@
+TEXFLAGS = -output-directory=build
+
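+# have TeX and BibTeX also search ./cls for the thesis class (.cls) and bibliography style (.bst) files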
+export TEXINPUTS:=.:./cls:${TEXINPUTS}
+export BSTINPUTS:=.:./cls:${BSTINPUTS}
+
+.PHONY: all
+all: paper
+
+.PHONY: build
+build:
+ -mkdir build
+ -mkdir draft
+ -mkdir img
+
+.PHONY: plots
+plots: build
+ for f in plot/*; do \
+ gnuplot "$$f"; \
+ done;
+
+.PHONY: paper
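+# pdflatex is run twice more after bibtex so that citations and cross-references resolve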
+paper: build paper.tex
+ pdflatex $(TEXFLAGS) paper.tex
+ bibtex build/paper.aux
+ pdflatex $(TEXFLAGS) paper.tex
+ pdflatex $(TEXFLAGS) paper.tex
+ mv build/paper.pdf draft/dissertation.pdf
+
+clean:
+ rm -rf build
+ rm -rf draft
+
diff --git a/chapters/abstract.tex b/chapters/abstract.tex
new file mode 100644
index 0000000..5ddfd37
--- /dev/null
+++ b/chapters/abstract.tex
@@ -0,0 +1,42 @@
+Modern data systems must cope with a wider variety of data than ever
+before, and as a result we've seen the proliferation of a large number of
+highly specialized data management systems, such as vector and graph
+databases. These systems are built upon specialized data structures for
+a particular query, or class of queries, and as a result have a very
+specific range of efficacy. Beyond this, they are difficult to develop
+because of the requirements that they place upon the data structures at
+their core, including requiring support for concurrent updates. As a
+result, a large number of potentially useful data structures are excluded
+from use in such systems, or at the very least require a large amount of
+development time to be made useful.
+
+This work seeks to address this difficulty by introducing a framework for
+automatic data structure dynamization. Given a static data structure and
+an associated query, satisfying certain requirements, this proposed work
+will enable automatically adding support for concurrent updates, with
+minimal modification to the data structure itself. It is based on a
+body of theoretical work on dynamization, often called the "Bentley-Saxe
+Method", which partitions data into a number of small data structures,
+and periodically rebuilds these as records are inserted or deleted, in
+a manner that maintains asymptotic bounds on worst case query time,
+as well as amortized insertion time. These techniques, as they currently
+exist, are limited in usefulness as they exhibit poor performance in
+practice, and lack support for concurrency. But, they serve as a solid
+theoretical base upon which a novel system can be built to address
+these concerns.
+
+To develop this framework, sampling queries (which are not well served
+by existing dynamic data structures) are first considered. The results
+of this analysis are then generalized to produce a framework for
+single-threaded dynamization that is applicable to a large number
+of possible data structures and query types, and the general framework
+evaluated across a number of data structures and query types. These
+dynamized static structures are shown to equal or exceed the performance
+of existing specialized dynamic structures in both update and query
+performance.
+
+Finally, this general framework is expanded with support for concurrent
+operations (inserts and queries), and the use of scheduling and
+parallelism is studied to provide worst-case insertion guarantees,
+as well as a rich trade-off space between query and insertion performance.
+
diff --git a/chapters/acknowledgments.tex b/chapters/acknowledgments.tex
new file mode 100644
index 0000000..c6e25fd
--- /dev/null
+++ b/chapters/acknowledgments.tex
@@ -0,0 +1,2 @@
+And again here--no header, just text.
+
diff --git a/chapters/background.tex b/chapters/background.tex
new file mode 100644
index 0000000..75e2b59
--- /dev/null
+++ b/chapters/background.tex
@@ -0,0 +1,746 @@
+\chapter{Background}
+\label{chap:background}
+
+This chapter will introduce important background information and
+existing work in the area of data structure dynamization. We will
+first discuss the concept of a search problem, which is central to
+dynamization techniques. While one might imagine that restrictions on
+dynamization would be functions of the data structure to be dynamized,
+in practice the requirements placed on the data structure are quite mild,
+and it is the necessary properties of the search problem that the data
+structure is used to address that provide the central difficulty to
+applying dynamization techniques in a given area. After this, database
+indices will be discussed briefly. Indices are the primary use of data
+structures within the database context that is of interest to our work.
+Following this, existing theoretical results in the area of data structure
+dynamization will be discussed, which will serve as the building blocks
+for our techniques in subsequent chapters. The chapter will conclude with
+a discussion of some of the limitations of these existing techniques.
+
+\section{Queries and Search Problems}
+\label{sec:dsp}
+
+Data access lies at the core of most database systems. We want to ask
+questions of the data, and ideally get the answer efficiently. We
+will refer to the different types of question that can be asked as
+\emph{search problems}. We will be using this term in a similar way as
+the word \emph{query} \footnote{
+ The term query is often abused and used to
+ refer to several related, but slightly different things. In the
+ vernacular, a query can refer to either a) a general type of search
+ problem (as in "range query"), b) a specific instance of a search
+ problem, or c) a program written in a query language.
+}
+is often used within the database systems literature: to refer to a
+general class of questions. For example, we could consider range scans,
+point-lookups, nearest neighbor searches, predicate filtering, random
+sampling, etc., to each be a general search problem. Formally, for the
+purposes of this work, a search problem is defined as follows,
+
+\begin{definition}[Search Problem]
+ Given three multi-sets, $\mathcal{D}$, $\mathcal{R}$, and $\mathcal{Q}$, a search problem is a function
+ $F: (\mathcal{D}, \mathcal{Q}) \to \mathcal{R}$, where $\mathcal{D}$ represents the domain of data to be searched,
+ $\mathcal{Q}$ represents the domain of query parameters, and $\mathcal{R}$ represents the
+answer domain.\footnote{
+ It is important to note that it is not required for $\mathcal{R} \subseteq \mathcal{D}$. As an
+example, a \texttt{COUNT} aggregation might map a set of strings onto
+ an integer. Most common queries do satisfy $\mathcal{R} \subseteq \mathcal{D}$, but this need
+not be a universal constraint.
+}
+\end{definition}
+
+We will use the term \emph{query} to mean a specific instance of a search
+problem,
+
+\begin{definition}[Query]
+ Given three multi-sets, $\mathcal{D}$, $\mathcal{R}$, and $\mathcal{Q}$, a search problem $F$ and
+ a specific set of query parameters $q \in \mathcal{Q}$, a query is a specific
+ instance of the search problem, $F(\mathcal{D}, q)$.
+\end{definition}
+
+As an example of using these definitions, a \emph{membership test}
+or \emph{range scan} would be considered search problems, and a range
+scan over the interval $[10, 99]$ would be a query. We've drawn this
+distinction because, as we'll see as we enter into the discussion of
+our work in later chapters, it is useful to have separate, unambiguous
+terms for these two concepts.
+
+\subsection{Decomposable Search Problems}
+
+Dynamization techniques require the partitioning of one data structure
+into several, smaller ones. As a result, these techniques can only
+be applied in situations where the search problem to be answered can
+be answered from this set of smaller data structures, with the same
+answer as would have been obtained had all of the data been used to
+construct a single, large structure. This requirement is formalized in
+the definition of a class of problems called \emph{decomposable search
+problems (DSP)}. This class was first defined by Bentley and Saxe in
+their work on dynamization, and we will adopt their definition,
+
+\begin{definition}[Decomposable Search Problem~\cite{saxe79}]
+ \label{def:dsp}
+ A search problem $F: (\mathcal{D}, \mathcal{Q}) \to \mathcal{R}$ is decomposable if and
+ only if there exists a constant-time computable, associative, and
+ commutative binary operator $\square$ such that,
+ \begin{equation*}
+ F(A \cup B, q) = F(A, q)~ \square ~F(B, q)
+ \end{equation*}
+\end{definition}
+
+The requirement for $\square$ to be constant-time was used by Bentley and
+Saxe to prove specific performance bounds for answering queries from a
+decomposed data structure. However, it is not strictly \emph{necessary},
+and later work by Overmars lifted this constraint and considered a more
+general class of search problems called \emph{$C(n)$-decomposable search
+problems},
+
+\begin{definition}[$C(n)$-decomposable Search Problem~\cite{overmars83}]
+ A search problem $F: (\mathcal{D}, \mathcal{Q}) \to \mathcal{R}$ is $C(n)$-decomposable
+ if and only if there exists an $O(C(n))$-time computable, associative,
+ and commutative binary operator $\square$ such that,
+ \begin{equation*}
+ F(A \cup B, q) = F(A, q)~ \square ~F(B, q)
+ \end{equation*}
+\end{definition}
+
+To demonstrate that a search problem is decomposable, it is necessary to
+show the existence of the merge operator, $\square$, with the necessary
+properties, and to show that $F(A \cup B, q) = F(A, q)~ \square ~F(B,
+q)$. With these two results, induction demonstrates that the problem is
+decomposable even in cases with more than two partial results.
+
+As an example, consider range counts,
+\begin{definition}[Range Count]
+ Let $d$ be a set of $n$ points in $\mathbb{R}$. Given an interval,
+ $ q = [x, y],\quad x,y \in \mathbb{R}$, a range count returns
+ the cardinality, $|d \cap q|$.
+\end{definition}
+
+\begin{theorem}
+Range Count is a decomposable search problem.
+\end{theorem}
+
+\begin{proof}
+Let $\square$ be addition ($+$). Applying this to
+Definition~\ref{def:dsp}, gives
+\begin{align*}
+ |(A \cup B) \cap q| = |(A \cap q)| + |(B \cap q)|
+\end{align*}
+which is true by the distributive property of union and
+intersection. Addition is an associative and commutative
+operator that can be calculated in $O(1)$ time. Therefore, range counts
+are DSPs.
+\end{proof}
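+
+As a concrete illustration, the following sketch shows how a range count
+could be answered over a partitioned dataset by evaluating the query
+against each partition and merging the partial results with the
+constant-time operator (addition). This is a minimal C++ example; the
+function names are illustrative and not drawn from any particular library.
+
+\begin{verbatim}
+#include <cstddef>
+#include <vector>
+
+// local query: count the records in one partition falling in [x, y]
+static size_t range_count(const std::vector<double> &part,
+                          double x, double y) {
+    size_t cnt = 0;
+    for (double v : part)
+        if (v >= x && v <= y) cnt++;
+    return cnt;
+}
+
+// decomposed query: merge the partial results with addition
+static size_t decomposed_range_count(
+        const std::vector<std::vector<double>> &parts,
+        double x, double y) {
+    size_t result = 0;                     // identity of the merge operator
+    for (const auto &p : parts)
+        result += range_count(p, x, y);    // the merge operator (+)
+    return result;
+}
+\end{verbatim}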
+
+Because the codomain of a DSP is not restricted, more complex output
+structures can be used to allow for problems that are not directly
+decomposable to be converted to DSPs, possibly with some minor
+post-processing. For example, calculating the arithmetic mean of a set
+of numbers can be formulated as a DSP,
+\begin{theorem}
+The calculation of the arithmetic mean of a set of numbers is a DSP.
+\end{theorem}
+\begin{proof}
+ Consider the search problem $A:\mathcal{D} \to (\mathbb{R}, \mathbb{Z})$,
+ where $\mathcal{D}\subset\mathbb{R}$ and is a multi-set. The output tuple
+contains the sum of the values within the input set, and the
+cardinality of the input set. For two disjoint partitions of the data,
+$D_1$ and $D_2$, let $A(D_1) = (s_1, c_1)$ and $A(D_2) = (s_2, c_2)$. Let
+$A(D_1) \square A(D_2) = (s_1 + s_2, c_1 + c_2)$.
+
+Applying Definition~\ref{def:dsp}, gives
+\begin{align*}
+ A(D_1 \cup D_2) &= A(D_1)\square A(D_2) \\
+ (s_1 + s_2, c_1 + c_2) &= (s_1 + s_2, c_1 + c_2) = (s, c)
+\end{align*}
+From this result, the average can be determined in constant time by
+taking $\nicefrac{s}{c}$. Therefore, calculating the average of a set
+of numbers is a DSP.
+\end{proof}
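+
+A short sketch of the same idea in code: each partition contributes a
+(sum, count) pair, the pairs are merged component-wise in constant time,
+and the mean is recovered in a final post-processing step. The names are
+again purely illustrative, and at least one record is assumed to exist.
+
+\begin{verbatim}
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+using SumCount = std::pair<double, size_t>;
+
+static SumCount local_result(const std::vector<double> &part) {
+    SumCount r{0.0, 0};
+    for (double v : part) { r.first += v; r.second++; }
+    return r;
+}
+
+// the merge operator: component-wise addition, O(1)
+static SumCount merge(const SumCount &a, const SumCount &b) {
+    return {a.first + b.first, a.second + b.second};
+}
+
+// post-processing: assumes at least one record across the partitions
+static double mean(const std::vector<std::vector<double>> &parts) {
+    SumCount acc{0.0, 0};
+    for (const auto &p : parts) acc = merge(acc, local_result(p));
+    return acc.first / acc.second;
+}
+\end{verbatim}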
+
+
+
+\section{Database Indexes}
+\label{sec:indexes}
+
+Within a database system, search problems are expressed using
+some high level language (or mapped directly to commands, for
+simpler systems like key-value stores), which is processed by
+the database system to produce a result. Within many database
+systems, the most basic access primitive is a table scan, which
+sequentially examines each record within the data set. There are many
+situations in which the same query could be answered in less time using
+a more sophisticated data access scheme, however, and databases support
+a limited number of such schemes through the use of specialized data
+structures called \emph{indices} (or indexes). Indices can be built over
+a set of attributes in a table and provide faster access for particular
+search problems.
+
+The term \emph{index} is often abused within the database community
+to refer to a range of closely related, but distinct, conceptual
+categories.\footnote{
+The word index can be used to refer to a structure mapping record
+information to the set of records matching that information, as a
+general synonym for ``data structure'', to data structures used
+specifically in query processing, etc.
+}
+This ambiguity is rarely problematic, as the subtle differences between
+these categories are not often significant, and context clarifies the
+intended meaning in situations where they are. However, this work
+explicitly operates at the interface of two of these categories, and so
+it is important to disambiguate between them.
+
+\subsection{The Classical Index}
+
+A database index is a specialized data structure that provides a means
+to efficiently locate records that satisfy specific criteria. This
+enables more efficient query processing for supported search problems. A
+classical index can be modeled as a function, mapping a set of attribute
+values, called a key, $\mathcal{K}$, to a set of record identifiers,
+$\mathcal{R}$. The codomain of an index can be either the set of
+record identifiers, a set containing sets of record identifiers, or
+the set of physical records, depending upon the configuration of the
+index.~\cite{cowbook} For our purposes here, we'll focus on the first of
+these, but the use of other codomains wouldn't have any material effect
+on our discussion.
+
+We will use the following definition of a ``classical'' database index,
+
+\begin{definition}[Classical Index~\cite{cowbook}]
+Consider a set of database records, $\mathcal{D}$. An index over
+these records, $\mathcal{I}_\mathcal{D}$ is a map of the form
+ $\mathcal{I}_\mathcal{D}:(\mathcal{K}, \mathcal{D}) \to \mathcal{R}$, where
+$\mathcal{K}$ is a set of attributes of the records in $\mathcal{D}$,
+called a \emph{key}.
+\end{definition}
+
+In order to facilitate this mapping, indexes are built using data
+structures. The selection of data structure has implications on the
+performance of the index, and the types of search problem it can be
+used to accelerate. Broadly speaking, classical indices can be divided
+into two categories: ordered and unordered. Ordered indices allow for
+the iteration over a set of record identifiers in a particular sorted
+order of keys, and the efficient location of a specific key value in
+that order. These indices can be used to accelerate range scans and
+point-lookups. Unordered indices are specialized for point-lookups on a
+particular key value, and do not support iterating over records in some
+order.~\cite{cowbook, mysql-btree-hash}
+
+There is a very small set of data structures that are usually used for
+creating classical indexes. For ordered indices, the most commonly used
+data structure is the B-tree~\cite{ubiq-btree},\footnote{
+ By \emph{B-tree} here, we are referring not to the B-tree data
+ structure, but to a wide range of related structures derived from
+ the B-tree. Examples include the B$^+$-tree, B$^\epsilon$-tree, etc.
+}
+and the log-structured merge (LSM) tree~\cite{oneil96} is also often
+used within the context of key-value stores~\cite{rocksdb}. Some databases
+implement unordered indices using hash tables~\cite{mysql-btree-hash}.
+
+
+\subsection{The Generalized Index}
+
+The previous section discussed the classical definition of index
+as might be found in a database systems textbook. However, this
+definition is limited by its association specifically with mapping
+key fields to records. For the purposes of this work, a broader
+definition of index will be considered,
+
+\begin{definition}[Generalized Index]
+Consider a set of database records, $\mathcal{D}$, and search
+problem, $\mathcal{Q}$.
+A generalized index, $\mathcal{I}_\mathcal{D}$
+is a map of the form $\mathcal{I}_\mathcal{D}:(\mathcal{Q}, \mathcal{D}) \to
+\mathcal{R}$.
+\end{definition}
+
+A classical index is a special case of a generalized index, with $\mathcal{Q}$
+being a point-lookup or range scan based on a set of record attributes.
+
+There are a number of generalized indexes that appear in some database systems.
+For example, some specialized databases or database extensions have support for
+indexes based on the R-tree\footnote{ Like the B-tree, R-tree here is used as a
+signifier for a general class of related data structures.} for spatial
+databases~\cite{postgis-doc, ubiq-rtree} or hierarchical navigable small world
+graphs for similarity search~\cite{pinecone-db}, among others. These systems
+are typically either an add-on module, or a specialized standalone database
+that has been designed specifically for answering particular types of queries
+(such as spatial queries, similarity search, string matching, etc.).
+
+%\subsection{Indexes in Query Processing}
+
+%A database management system utilizes indexes to accelerate certain
+%types of query. Queries are expressed to the system in some high
+%level language, such as SQL or Datalog. These are generalized
+%languages capable of expressing a wide range of possible queries.
+%The DBMS is then responsible for converting these queries into a
+%set of primitive data access procedures that are supported by the
+%underlying storage engine. There are a variety of techniques for
+%this, including mapping directly to a tree of relational algebra
+%operators and interpreting that tree, query compilation, etc. But,
+%ultimately, this internal query representation is limited by the routines
+%supported by the storage engine.~\cite{cowbook}
+
+%As an example, consider the following SQL query (representing a
+%2-dimensional k-nearest neighbor problem)\footnote{There are more efficient
+%ways of answering this query, but I'm aiming for simplicity here
+%to demonstrate my point},
+%
+%\begin{verbatim}
+%SELECT dist(A.x, A.y, Qx, Qy) as d, A.key FROM A
+% WHERE A.property = filtering_criterion
+% ORDER BY d
+% LIMIT 5;
+%\end{verbatim}
+%
+%This query will be translated into a logical query plan (a sequence
+%of relational algebra operators) by the query planner, which could
+%result in a plan like this,
+%
+%\begin{verbatim}
+%query plan here
+%\end{verbatim}
+%
+%With this logical query plan, the DBMS will next need to determine
+%which supported operations it can use to most efficiently answer
+%this query. For example, the selection operation (A) could be
+%physically manifested as a table scan, or could be answered using
+%an index scan if there is an ordered index over \texttt{A.property}.
+%The query optimizer will make this decision based on its estimate
+%of the selectivity of the predicate. This may result in one of the
+%following physical query plans
+%
+%\begin{verbatim}
+%physical query plan
+%\end{verbatim}
+%
+%In either case, however, the space of possible physical plans is
+%limited by the available access methods: either a sorted scan on
+%an attribute (index) or an unsorted scan (table scan). The database
+%must filter for all elements matching the filtering criterion,
+%calculate the distances between all of these points and the query,
+%and then sort the results to get the final answer. Additionally,
+%note that the sort operation in the plan is a pipeline-breaker. If
+%this plan were to appear as a sub-tree in a larger query plan, the
+%overall plan would need to wait for the full evaluation of this
+%sub-query before it could proceed, as sorting requires the full
+%result set.
+%
+%Imagine a world where a new index was available to the DBMS: a
+%nearest neighbor index. This index would allow the iteration over
+%records in sorted order, relative to some predefined metric and a
+%query point. If such an index existed over \texttt{(A.x, A.y)} using
+%\texttt{dist}, then a third physical plan would be available to the DBMS,
+%
+%\begin{verbatim}
+%\end{verbatim}
+%
+%This plan pulls records in order of their distance to \texttt{Q}
+%directly, using an index, and then filters them, avoiding the
+%pipeline breaking sort operation. While it's not obvious in this
+%case that this new plan is superior (this would depend upon the
+%selectivity of the predicate), it is a third option. It becomes
+%increasingly superior as the selectivity of the predicate grows,
+%and is clearly superior in the case where the predicate has unit
+%selectivity (requiring only the consideration of $5$ records total).
+%
+%This use of query-specific indexing schemes presents a query
+%optimization challenge: how does the database know when a particular
+%specialized index can be used for a given query, and how can
+%specialized indexes broadcast their capabilities to the query optimizer
+%in a general fashion? This work is focused on the problem of enabling
+%the existence of such indexes, rather than facilitating their use;
+%however these are important questions that must be considered in
+%future work for this solution to be viable. There has been work
+%done surrounding the use of arbitrary indexes in queries in the past,
+%such as~\cite{byods-datalog}. This problem is considered out-of-scope
+%for the proposed work, but will be considered in the future.
+
+\section{Classical Dynamization Techniques}
+
+Because data in a database is regularly updated, data structures
+intended to be used as an index must support updates (inserts, in-place
+modification, and deletes). Not all potentially useful data structures
+support updates, and so a general strategy for adding update support
+would increase the number of data structures that could be used as
+database indices. We refer to a data structure with update support as
+\emph{dynamic}, and one without update support as \emph{static}.\footnote{
+
+ The term static is distinct from immutable. Static refers to the
+ layout of records within the data structure, whereas immutable
+ refers to the data stored within those records. This distinction
+ will become relevant when we discuss different techniques for adding
+ delete support to data structures. The data structures used are
+ always static, but not necessarily immutable, because the records may
+ contain header information (like visibility) that is updated in place.
+}
+
+This section discusses \emph{dynamization}, the construction of a dynamic
+data structure based on an existing static one. When certain conditions
+are satisfied by the data structure and its associated search problem,
+this process can be done automatically, and with provable asymptotic
+bounds on amortized insertion performance, as well as worst case query
+performance. We will first discuss the necessary data structure
+requirements, and then examine several classical dynamization techniques.
+The section will conclude with a discussion of delete support within the
+context of these techniques.
+
+\subsection{Global Reconstruction}
+
+The most fundamental dynamization technique is that of \emph{global
+reconstruction}. While not particularly useful on its own, global
+reconstruction serves as the basis for the techniques to follow, and so
+we will begin our discussion of dynamization with it.
+
+Consider a class of data structure, $\mathcal{I}$, capable of answering a
+search problem, $\mathcal{Q}$. Insertion via global reconstruction is
+possible if $\mathcal{I}$ supports the following two operations,
+\begin{align*}
+\mathtt{build} : \mathcal{PS}(\mathcal{D})& \to \mathcal{I} \\
+\mathtt{unbuild} : \mathcal{I}& \to \mathcal{PS}(\mathcal{D})
+\end{align*}
+where $\mathtt{build}$ constructs an instance $\mathscr{i}\in\mathcal{I}$
+of the data structure over a set of records $d \subseteq \mathcal{D}$
+in $C(|d|)$ time, and $\mathtt{unbuild}$ returns the set of records $d
+\subseteq \mathcal{D}$ used to construct $\mathscr{i} \in \mathcal{I}$ in
+$\Theta(1)$ time,\footnote{
+ There isn't any practical reason why $\mathtt{unbuild}$ must run
+ in constant time, but this is the assumption made in \cite{saxe79}
+ and in subsequent work based on it, and so we will follow the same
+    definition here.
+} such that $\mathscr{i} = \mathtt{build}(\mathtt{unbuild}(\mathscr{i}))$.
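+
+As a concrete illustration, insertion via global reconstruction might look
+like the following C++ sketch, assuming a structure type exposing the
+\texttt{build} and \texttt{unbuild} interface described above (the names
+and signatures here are illustrative, not those of any existing library).
+Every insert pays the full reconstruction cost.
+
+\begin{verbatim}
+#include <vector>
+
+template <typename I, typename Record>
+I insert_via_reconstruction(I structure, const Record &rec) {
+    // recover the record set, add the new record, rebuild from scratch
+    std::vector<Record> records = structure.unbuild();
+    records.push_back(rec);
+    return I::build(records);
+}
+\end{verbatim}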
+
+
+
+
+
+
+\subsection{Amortized Global Reconstruction: The Bentley-Saxe Method}
+\label{ssec:bsm}
+
+Another approach to support updates is to amortize the cost of
+global reconstruction over multiple updates. This approach can take
+take three forms,
+\begin{enumerate}
+
+ \item Pairing a dynamic data structure (called a buffer or
+ memtable) with an instance of the structure being extended.
+ Updates are written to the buffer, and when the buffer is
+ full its records are merged with those in the static
+ structure, and the structure is rebuilt. This approach is
+ used by one version of the originally proposed
+ LSM-tree~\cite{oneil96}. Technically this technique proposed
+ in that work for the purposes of converting random writes
+ into sequential ones (all structures involved are dynamic),
+ but it can be used for dynamization as well.
+
+ \item Creating multiple, smaller data structures each
+ containing a partition of the records from the dataset, and
+ reconstructing individual structures to accommodate new
+ inserts in a systematic manner. This technique is the basis
+ of the Bentley-Saxe method~\cite{saxe79}.
+
+ \item Using both of the above techniques at once. This is
+ the approach used by modern incarnations of the
+ LSM-tree~\cite{rocksdb}.
+
+\end{enumerate}
+
+In all three cases, it is necessary for the search problem associated
+with the index to be a DSP, as answering it will require querying
+multiple structures (the buffer and/or one or more instances of the
+data structure) and merging the results together to get a final
+result. This section will focus exclusively on the Bentley-Saxe
+method, as it is the basis for the proposed methodology.
+
+When dividing records across multiple structures, there is a clear
+trade-off between read performance and write performance. Keeping
+the individual structures small reduces the cost of reconstructing,
+and thereby increases update performance. However, this also means
+that more structures will be required to accommodate the same number
+of records, when compared to a scheme that allows the structures
+to be larger. As each structure must be queried independently, this
+will lead to worse query performance. The reverse is also true,
+fewer, larger structures will have better query performance and
+worse update performance, with the extreme limit of this being a
+single structure that is fully rebuilt on each insert.
+
+The key insight of the Bentley-Saxe method~\cite{saxe79} is that a
+good balance can be struck by using a geometrically increasing
+structure size. In Bentley-Saxe, the sub-structures are ``stacked'',
+with the base level having a capacity of a single record, and
+each subsequent level doubling in capacity. When an update is
+performed, the first empty level is located and a reconstruction
+is triggered, merging the structures of all levels below this empty
+one, along with the new record. The merits of this approach are
+that it ensures that ``most'' reconstructions involve the smaller
+data structures towards the bottom of the sequence, while most of
+the records reside in large, infrequently updated, structures towards
+the top. This balances between the read and write implications of
+structure size, while also allowing the number of structures required
+to represent $n$ records to be worst-case bounded by $O(\log n)$.
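+
+The following sketch illustrates this insertion procedure, along with
+answering a DSP over the resulting levels, for a structure \texttt{I}
+assumed to expose the \texttt{build} and \texttt{unbuild} interface from
+the previous section plus a \texttt{query} method. It is a simplified
+illustration under those assumed interfaces (deletes and bookkeeping
+that a real implementation would need are omitted), not a reference
+implementation.
+
+\begin{verbatim}
+#include <cstddef>
+#include <optional>
+#include <vector>
+
+template <typename I, typename Record>
+struct BentleySaxe {
+    // levels[i] is either empty or built over exactly 2^i records
+    std::vector<std::optional<I>> levels;
+
+    void insert(const Record &rec) {
+        std::vector<Record> records{rec};
+        size_t i = 0;
+        // unbuild full levels until the first empty one is found
+        while (i < levels.size() && levels[i].has_value()) {
+            auto recs = levels[i]->unbuild();
+            records.insert(records.end(), recs.begin(), recs.end());
+            levels[i].reset();
+            i++;
+        }
+        if (i == levels.size()) levels.emplace_back();
+        levels[i] = I::build(records);   // |records| = 2^i
+    }
+
+    // answer a DSP by querying each level and merging with `combine`
+    template <typename Query, typename Result, typename Combine>
+    Result query(const Query &q, Combine combine, Result result) {
+        for (auto &lvl : levels)
+            if (lvl.has_value()) result = combine(result, lvl->query(q));
+        return result;
+    }
+};
+\end{verbatim}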
+
+Given a structure and DSP with $P(n)$ construction cost and $Q_s(n)$
+query cost, the Bentley-Saxe Method will produce a dynamic data
+structure with,
+
+\begin{align}
+ \text{Query Cost} \qquad & O\left(Q_s(n) \cdot \log n\right) \\
+ \text{Amortized Insert Cost} \qquad & O\left(\frac{P(n)}{n} \log n\right)
+\end{align}
+
+In the case of a $C(n)$-decomposable problem, the query cost grows to
+\begin{equation}
+ O\left((Q_s(n) + C(n)) \cdot \log n\right)
+\end{equation}
+
+
+While the Bentley-Saxe method manages to maintain good performance in
+terms of \emph{amortized} insertion cost, it has poor worst-case performance. If the
+entire structure is full, it must grow by another level, requiring
+a full reconstruction involving every record within the structure.
+A slight adjustment to the technique, due to Overmars and van
+Leeuwen~\cite{overmars81}, allows for the worst-case insertion cost to be bounded by
+$O\left(\frac{P(n)}{n} \log n\right)$, however it does so by dividing
+each reconstruction into small pieces, one of which is executed
+each time a new update occurs. This has the effect of bounding the
+worst-case performance, but does so by sacrificing the expected
+case performance, and adds a lot of complexity to the method. This
+technique is not used much in practice.\footnote{
+ We've yet to find any example of it used in a journal article
+ or conference paper.
+}
+
+\section{Limitations of the Bentley-Saxe Method}
+\label{sec:bsm-limits}
+
+While fairly general, the Bentley-Saxe method has a number of limitations. Because
+of the way in which it merges query results together, the number of search problems
+to which it can be efficiently applied is limited. Additionally, the method does not
+expose any trade-off space to configure the structure: it is one-size fits all.
+
+\subsection{Limits of Decomposability}
+\label{ssec:decomp-limits}
+Unfortunately, the DSP abstraction used as the basis of the Bentley-Saxe
+method has a few significant limitations that must be overcome
+before it can be used for the purposes of this work. At a high level, these limitations
+are as follows,
+
+\begin{itemize}
+ \item Each local query must be oblivious to the state of every partition,
+ aside from the one it is directly running against. Further,
+ Bentley-Saxe provides no facility for accessing cross-block state
+ or performing multiple query passes against each partition.
+
+ \item The result merge operation must be $O(1)$ to maintain good query
+ performance.
+
+ \item The result merge operation must be commutative and associative,
+ and is called repeatedly to merge pairs of results.
+\end{itemize}
+
+These requirements restrict the types of queries that can be supported by
+the method efficiently. For example, k-nearest neighbor and independent
+range sampling are not decomposable.
+
+\subsubsection{k-Nearest Neighbor}
+\label{sssec-decomp-limits-knn}
+The k-nearest neighbor (KNN) problem is a generalization of the nearest
+neighbor problem, which seeks to return the closest point within the
+dataset to a given query point. More formally, this can be defined as,
+\begin{definition}[Nearest Neighbor]
+
+ Let $D$ be a set of $n>0$ points in $\mathbb{R}^d$ and $f(x, y)$
+ be some function $f: D^2 \to \mathbb{R}^+$ representing the distance
+ between two points within $D$. The nearest neighbor problem, $NN(D,
+ q)$ returns some $d \in D$ having $\min_{d \in D} \{f(d, q)\}$
+ for some query point, $q \in \mathbb{R}^d$.
+
+\end{definition}
+
+In practice, it is common to require $f(x, y)$ be a metric,\footnote
+{
+ Contrary to its vernacular usage as a synonym for ``distance'', a
+ metric is more formally defined as a valid distance function over
+ a metric space. Metric spaces require their distance functions to
+ have the following properties,
+ \begin{itemize}
+ \item The distance between a point and itself is always 0.
+ \item All distances between non-equal points must be positive.
+ \item For all points, $x, y \in D$, it is true that
+ $f(x, y) = f(y, x)$.
+ \item For any three points $x, y, z \in D$ it is true that
+ $f(x, z) \leq f(x, y) + f(y, z)$.
+ \end{itemize}
+
+ These distances also must have the interpretation that $f(x, y) <
+ f(x, z)$ means that $y$ is ``closer'' to $x$ than $z$ is to $x$. This
+ is the opposite of the definition of similarity, and so some minor
+ manipulations are usually required to make similarity measures work
+ in metric-based indexes. \cite{intro-analysis}
+}
+and this will be done in the examples of indexes for addressing
+this problem in this work, but it is not a fundamental aspect of the problem
+formulation. The nearest neighbor problem itself is decomposable, with
+a simple merge function that accepts the result with the smallest value
+of $f(x, q)$ for any two inputs\cite{saxe79}.
+
+The k-nearest neighbor problem generalizes nearest-neighbor to return
+the $k$ nearest elements,
+\begin{definition}[k-Nearest Neighbor]
+
+ Let $D$ be a set of $n \geq k$ points in $\mathbb{R}^d$ and $f(x, y)$
+ be some function $f: D^2 \to \mathbb{R}^+$ representing the distance
+ between two points within $D$. The k-nearest neighbor problem,
+ $KNN(D, q, k)$ seeks to identify a set $R\subset D$ with $|R| = k$
+ such that $\forall d \in D - R, r \in R, f(d, q) \geq f(r, q)$.
+
+\end{definition}
+
+This can be thought of as solving the nearest-neighbor problem $k$ times,
+each time removing the returned result from $D$ prior to solving the
+problem again. Unlike the single nearest-neighbor case (which can be
+thought of as KNN with $k=1$), this problem is \emph{not} decomposable.
+
+\begin{theorem}
+ KNN is not a decomposable search problem.
+\end{theorem}
+
+\begin{proof}
+To prove this, consider the query $KNN(D, q, k)$ against some partitioned
+dataset $D = D_0 \cup D_1 \ldots \cup D_\ell$. If KNN is decomposable,
+then there must exist some constant-time, commutative, and associative
+binary operator $\square$, such that $R = \square_{0 \leq i \leq \ell}
+R_i$ where $R_i$ is the result of evaluating the query $KNN(D_i, q,
+k)$. Consider the evaluation of the merge operator against two arbitrary
+result sets, $R = R_i \square R_j$. It is clear that $|R| = |R_i| =
+|R_j| = k$, and that the contents of $R$ must be the $k$ records from
+$R_i \cup R_j$ that are nearest to $q$. Thus, $\square$ must solve the
+problem $KNN(R_i \cup R_j, q, k)$. However, KNN cannot be solved in $O(1)$
+time. Therefore, KNN is not a decomposable search problem.
+\end{proof}
+
+With that said, it is clear that there isn't any fundamental restriction
+preventing the merging of the result sets;
+it is only the case that an
+arbitrary performance requirement wouldn't be satisfied. It is possible
+to merge the result sets in non-constant time, and so it is the case that
+KNN is $C(n)$-decomposable. Unfortunately, this classification brings with
+it a reduction in query performance as a result of the way result merges are
+performed in Bentley-Saxe.
+
+As a concrete example of these costs, consider using Bentley-Saxe to
+extend the VPTree~\cite{vptree}. The VPTree is a static, metric index capable of
+answering KNN queries in $KNN(D, q, k) \in O(k \log n)$. One possible
+merge algorithm for KNN would be to push all of the elements in the two
+arguments onto a min-heap, and then pop off the first $k$. In this case,
+the cost of the merge operation would be $O(k \log k)$. Were $k$ assumed
+to be constant, the operation could be considered constant-time.
+But given that $k$ is only bounded in size above
+by $n$, this isn't a safe assumption to make in general. Evaluating the
+total query cost for the extended structure, this would yield,
+
+\begin{equation}
+ KNN(D, q, k) \in O\left(k\log n \left(\log n + \log k\right) \right)
+\end{equation}
+
+The reason for this large increase in cost is the repeated application
+of the merge operator. The Bentley-Saxe method requires applying the
+merge operator in a binary fashion to each partial result, multiplying
+its cost by a factor of $\log n$. Thus, the constant-time requirement
+of standard decomposability is necessary to keep the cost of the merge
+operator from appearing within the complexity bound of the entire
+operation in the general case.\footnote {
+ There is a special case, noted by Overmars, where the total cost is
+ $O(Q(n) + C(n))$, without the logarithmic term, when $(Q(n) + C(n))
+ \in \Omega(n^\epsilon)$ for some $\epsilon >0$. This accounts for the
+ case where the cost of the query and merge operation are sufficiently
+ large to consume the logarithmic factor, and so it doesn't represent
+ a special case with better performance.
+}
+If the result merging operation could be revised to remove this
+duplicated cost, the cost of supporting $C(n)$-decomposable queries
+could be greatly reduced.
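+
+A sketch of the heap-based merge operator described above is shown below:
+it combines two partial result sets by distance to the query point and
+keeps the $k$ nearest, at $O(k \log k)$ cost. The \texttt{Point} type and
+distance function are placeholders for illustration only.
+
+\begin{verbatim}
+#include <cmath>
+#include <cstddef>
+#include <queue>
+#include <utility>
+#include <vector>
+
+struct Point { double x, y; };
+
+static double dist(const Point &a, const Point &b) {
+    return std::hypot(a.x - b.x, a.y - b.y);
+}
+
+static std::vector<Point> knn_merge(const std::vector<Point> &r1,
+                                    const std::vector<Point> &r2,
+                                    const Point &q, size_t k) {
+    using Entry = std::pair<double, Point>;
+    auto cmp = [](const Entry &a, const Entry &b) { return a.first > b.first; };
+    std::priority_queue<Entry, std::vector<Entry>, decltype(cmp)> heap(cmp);
+
+    // push both partial results (up to 2k entries), keyed by distance to q
+    for (const auto &p : r1) heap.push({dist(p, q), p});
+    for (const auto &p : r2) heap.push({dist(p, q), p});
+
+    // pop the k nearest
+    std::vector<Point> out;
+    while (!heap.empty() && out.size() < k) {
+        out.push_back(heap.top().second);
+        heap.pop();
+    }
+    return out;
+}
+\end{verbatim}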
+
+\subsubsection{Independent Range Sampling}
+
+Another problem that is not decomposable is independent sampling. There
+are a variety of problems falling under this umbrella, including weighted
+set sampling, simple random sampling, and weighted independent range
+sampling, but this section will focus on independent range sampling.
+
+\begin{definition}[Independent Range Sampling~\cite{tao22}]
+ Let $D$ be a set of $n$ points in $\mathbb{R}$. Given a query
+ interval $q = [x, y]$ and an integer $k$, an independent range
+ sampling query returns $k$ independent samples from $D \cap q$
+ with each point having equal probability of being sampled.
+\end{definition}
+
+This problem immediately encounters a category error when considering
+whether it is decomposable: the result set is randomized, whereas the
+conditions for decomposability are defined in terms of an exact matching
+of records in result sets. To work around this, a slight abuse of definition
+is in order:
+assume that the equality conditions within the DSP definition can
+be interpreted to mean ``the contents in the two sets are drawn from the
+same distribution''. This enables the category of DSP to apply to this type
+of problem. More formally,
+\begin{definition}[Decomposable Sampling Problem]
+    A sampling problem $F: (\mathcal{D}, \mathcal{Q}) \to \mathcal{R}$ is decomposable if and
+ only if there exists a constant-time computable, associative, and
+ commutative binary operator $\square$ such that,
+ \begin{equation*}
+ F(A \cup B, q) \sim F(A, q)~ \square ~F(B, q)
+ \end{equation*}
+\end{definition}
+
+Even with this abuse, however, IRS cannot generally be considered decomposable;
+it is at best $C(n)$-decomposable. The reason for this is that matching the
+distribution requires drawing the appropriate number of samples from each
+partition of the data. Even in the special case that $|D_0| = |D_1| = \ldots =
+|D_\ell|$, the number of samples from each partition that must appear in the
+result set cannot be known in advance due to differences in the selectivity
+of the predicate across the partitions.
+
+\begin{example}[IRS Sampling Difficulties]
+
+ Consider three partitions of data, $D_0 = \{1, 2, 3, 4, 5\}, D_1 =
+ \{1, 1, 1, 1, 3\}, D_2 = \{4, 4, 4, 4, 4\}$ using bag semantics and
+ an IRS query over the interval $[3, 4]$ with $k=12$. Because all three
+ partitions have the same size, it seems sensible to evenly distribute
+ the samples across them ($4$ samples from each partition). Applying
+ the query predicate to the partitions results in the following,
+    $d_0 = \{3, 4\}, d_1 = \{3\}, d_2 = \{4, 4, 4, 4, 4\}$.
+
+ In expectation, then, the first result set will contain $R_0 = \{3,
+ 3, 4, 4\}$ as it has a 50\% chance of sampling a $3$ and the same
+ probability of a $4$. The second and third result sets can only
+    be $\{3, 3, 3, 3\}$ and $\{4, 4, 4, 4\}$ respectively. Merging these
+ together, we'd find that the probability distribution of the sample
+    would be $p(3) = 0.5$ and $p(4) = 0.5$. However, were we to perform
+ the same sampling operation over the full dataset (not partitioned),
+ the distribution would be $p(3) = 0.25$ and $p(4) = 0.75$.
+
+\end{example}
+
+The problem is that the number of samples drawn from each partition needs to be
+weighted based on the number of elements satisfying the query predicate in that
+partition. In the above example, by drawing $4$ samples from $D_1$, more weight
+is given to $3$ than exists within the base dataset. This can be worked around
+by sampling a full $k$ records from each partition, returning both the sample
+and the number of records satisfying the predicate as that partition's query
+result, and then performing another pass of IRS as the merge operator, but this
+is the same approach as was used for KNN above. This leaves IRS firmly in the
+$C(n)$-decomposable camp. If it were possible to pre-calculate the number of
+samples to draw from each partition, then a constant-time merge operation could
+be used.
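+
+The following sketch illustrates that alternative: if each partition can
+report how many of its records fall within the query interval, the number
+of samples to draw from each partition can be assigned up front in
+proportion to those counts, and the per-partition samples simply
+concatenated. This is an illustrative fragment only (it assumes sorted
+partitions and at least one matching record), not part of any existing
+framework.
+
+\begin{verbatim}
+#include <algorithm>
+#include <cstddef>
+#include <random>
+#include <utility>
+#include <vector>
+
+static std::vector<double>
+irs(const std::vector<std::vector<double>> &parts,
+    double x, double y, size_t k, std::mt19937 &rng) {
+    std::vector<std::pair<size_t, size_t>> ranges;  // [lo, hi) per partition
+    std::vector<double> weights;                    // matching record counts
+    for (const auto &p : parts) {
+        size_t lo = std::lower_bound(p.begin(), p.end(), x) - p.begin();
+        size_t hi = std::upper_bound(p.begin(), p.end(), y) - p.begin();
+        ranges.push_back({lo, hi});
+        weights.push_back(static_cast<double>(hi - lo));
+    }
+
+    // assign each of the k samples to a partition, weighted by its count
+    std::discrete_distribution<size_t> pick(weights.begin(), weights.end());
+    std::vector<double> out;
+    for (size_t i = 0; i < k; i++) {
+        size_t p = pick(rng);
+        std::uniform_int_distribution<size_t> idx(ranges[p].first,
+                                                  ranges[p].second - 1);
+        out.push_back(parts[p][idx(rng)]);
+    }
+    return out;
+}
+\end{verbatim}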
+
+\section{Conclusion}
+This chapter discussed the necessary background information pertaining to
+queries and search problems, indexes, and techniques for dynamic extension. It
+described the potential for using custom indexes for accelerating particular
+kinds of queries, as well as the challenges associated with constructing these
+indexes. The remainder of this document will seek to address these challenges
+through modification and extension of the Bentley-Saxe method, describing work
+that has already been completed, as well as the additional work that must be
+done to realize this vision.
diff --git a/chapters/background.tex.bak b/chapters/background.tex.bak
new file mode 100644
index 0000000..d57b370
--- /dev/null
+++ b/chapters/background.tex.bak
@@ -0,0 +1,574 @@
+\chapter{Background}
+
+This chapter will introduce important background information that
+will be used throughout the remainder of the document. We'll first
+define precisely what is meant by a query, and consider some special
+classes of query that will become relevant in our discussion of dynamic
+extension. We'll then consider the difference between a static and a
+dynamic structure, and techniques for converting static structures into
+dynamic ones in a variety of circumstances.
+
+\section{Database Indexes}
+
+The term \emph{index} is often abused within the database community
+to refer to a range of closely related, but distinct, conceptual
+categories\footnote{
+The word index can be used to refer to a structure mapping record
+information to the set of records matching that information, as a
+general synonym for ``data structure'', to data structures used
+specifically in query processing, etc.
+}.
+This ambiguity is rarely problematic, as the subtle differences
+between these categories are not often significant, and context
+clarifies the intended meaning in situations where they are.
+However, this work explicitly operates at the interface of two of
+these categories, and so it is important to disambiguate between
+them. As a result, we will be using the word index to
+refer to a very specific structure.
+
+\subsection{The Traditional Index}
+A database index is a specialized structure which provides a means
+to efficiently locate records that satisfy specific criteria. This
+enables more efficient query processing for supported queries. A
+traditional database index can be modeled as a function, mapping a
+set of attribute values, called a key, $\mathcal{K}$, to a set of
+record identifiers, $\mathcal{R}$. Technically, the codomain of an
+index can be either a record identifier, a set of record identifiers,
+or the physical record itself, depending upon the configuration of
+the index. For the purposes of this work, the focus will be on the
+first of these, but in principle any of the three index types could
+be used with little material difference to the discussion.
+
+Formally speaking, we will use the following definition of a traditional
+database index,
+\begin{definition}[Traditional Index]
+Consider a set of database records, $\mathcal{D}$. An index over
+these records, $\mathcal{I}_\mathcal{D}$ is a map of the form
+$F:(\mathcal{I}_\mathcal{D}, \mathcal{K}) \to \mathcal{R}$, where
+$\mathcal{K}$ is a set of attributes of the records in $\mathcal{D}$,
+called a \emph{key}.
+\end{definition}
+
+In order to facilitate this mapping, indexes are built using data
+structures. The specific data structure used has particular
+implications for the performance of the index, and the situations
+in which the index is effective. Broadly speaking, traditional
+database indexes can be categorized in two ways: ordered indexes
+and unordered indexes. The former of these allows for iteration
+over the set of record identifiers in some sorted order, starting
+at the returned record. The latter allows for point-lookups only.
+
+There is a very small set of data structures that are usually used
+for creating database indexes. The most common range index in RDBMSs
+is the B-tree\footnote{ By \emph{B-tree} here, I am referring not
+to the B-tree data structure, but to a wide range of related structures
+derived from the B-tree. Examples include the B$^+$-tree,
+B$^\epsilon$-tree, etc. } based index, and key-value stores commonly
+use indices built on the LSM-tree. Some databases support unordered
+indexes using hashtables. Beyond these, some specialized databases or
+database extensions have support for indexes based on other structures,
+such as the R-tree\footnote{
+Like the B-tree, R-tree here is used as a signifier for a general class
+of related data structures} for spatial databases or approximate small
+world graph models for similarity search.
+
+\subsection{The Generalized Index}
+
+The previous section discussed the traditional definition of index
+as might be found in a database systems textbook. However, this
+definition is limited by its association specifically with mapping
+key fields to records. For the purposes of this work, I will be
+considering a slightly broader definition of index,
+
+\begin{definition}[Generalized Index]
+Consider a set of database records, $\mathcal{D}$ and a search
+problem, $\mathcal{Q}$. A generalized index, $\mathcal{I}_\mathcal{D}$
+is a map of the form $F:(\mathcal{I}_\mathcal{D}, \mathcal{Q}) \to
+\mathcal{R}$.
+\end{definition}
+
+\emph{Search problems} are the topic of the next section, but in
+brief a search problem represents a general class of query, such
+as range scan, point lookup, k-nearest neighbor, etc. A traditional
+index is a special case of a generalized index, having $\mathcal{Q}$
+being a point-lookup or range query based on a set of record
+attributes.
+
+\subsection{Indices in Query Processing}
+
+A database management system utilizes indices to accelerate certain
+types of query. Queries are expressed to the system in some high
+level language, such as SQL or Datalog. These are generalized
+languages capable of expressing a wide range of possible queries.
+The DBMS is then responsible for converting these queries into a
+set of primitive data access procedures that are supported by the
+underlying storage engine. There are a variety of techniques for
+this, including mapping directly to a tree of relational algebra
+operators and interpreting that tree, query compilation, etc. But,
+ultimately, the expressiveness of this internal query representation
+is limited by the routines supported by the storage engine.
+
+As an example, consider the following SQL query (representing a
+2-dimensional k-nearest neighbor)\footnote{There are more efficient
+ways of answering this query, but I'm aiming for simplicity here
+to demonstrate my point},
+
+\begin{verbatim}
+SELECT dist(A.x, A.y, Qx, Qy) as d, A.key FROM A
+ WHERE A.property = filtering_criterion
+ ORDER BY d
+ LIMIT 5;
+\end{verbatim}
+
+This query will be translated into a logical query plan (a sequence
+of relational algebra operators) by the query planner, which could
+result in a plan like this,
+
+\begin{verbatim}
+query plan here
+\end{verbatim}
+
+With this logical query plan, the DBMS will next need to determine
+which supported operations it can use to most efficiently answer
+this query. For example, the selection operation (A) could be
+physically manifested as a table scan, or could be answered using
+an index scan if there is an ordered index over \texttt{A.property}.
+The query optimizer will make this decision based on its estimate
+of the selectivity of the predicate. This may result in one of the
+following physical query plans
+
+\begin{verbatim}
+physical query plan
+\end{verbatim}
+
+In either case, however, the space of possible physical plans is
+limited by the available access methods: either a sorted scan on
+an attribute (index) or an unsorted scan (table scan). The database
+must filter for all elements matching the filtering criterion,
+calculate the distances between all of these points and the query,
+and then sort the results to get the final answer. Additionally,
+note that the sort operation in the plan is a pipeline-breaker. If
+this plan were to appear as a subtree in a larger query plan, the
+overall plan would need to wait for the full evaluation of this
+sub-query before it could proceed, as sorting requires the full
+result set.
+
+Imagine a world where a new index was available to our DBMS: a
+nearest neighbor index. This index would allow the iteration over
+records in sorted order, relative to some predefined metric and a
+query point. If such an index existed over \texttt{(A.x, A.y)} using
+\texttt{dist}, then a third physical plan would be available to the DBMS,
+
+\begin{verbatim}
+\end{verbatim}
+
+This plan pulls records in order of their distance to \texttt{Q}
+directly, using an index, and then filters them, avoiding the
+pipeline breaking sort operation. While it's not obvious in this
+case that this new plan is superior (this would depend a lot on the
+selectivity of the predicate), it is a third option. It becomes
+increasingly superior as the selectivity of the predicate grows,
+and is clearly superior in the case where the predicate has unit
+selectivity (requiring only the consideration of $5$ records total).
+The construction of this special index will be considered in
+Section~\ref{ssec:knn}.
+
+This use of query-specific indexing schemes also presents a query
+planning challenge: how does the database know when a particular
+specialized index can be used for a given query, and how can
+specialized indexes broadcast their capabilities to the query planner
+in a general fashion? This work is focused on the problem of enabling
+the existence of such indexes, rather than facilitating their use,
+however these are important questions that must be considered in
+future work for this solution to be viable. There has been work
+done surrounding the use of arbitrary indexes in queries in the past,
+such as~\cite{byods-datalog}. This problem is considered out-of-scope
+for the proposed work, but will be considered in the future.
+
+\section{Queries and Search Problems}
+
+In our discussion of generalized indexes, we encountered \emph{search
+problems}. A search problem is a term used within the literature
+on data structures in a manner similar to how the database community
+sometimes uses the term query\footnote{
+Like with the term index, the term query is often abused and used to
+refer to several related, but slightly different things. In the vernacular,
+a query can refer to either a) a general type of search problem (as in "range query"),
+b) a specific instance of a search problem, or c) a program written in a query language.
+}, to refer to a general
+class of questions asked of data. Examples include range queries,
+point-lookups, nearest neighbor queries, predicate filtering, random
+sampling, etc. Formally, for the purposes of this work, we will define
+a search problem as follows,
+\begin{definition}[Search Problem]
+Given three multisets, $D$, $R$, and $Q$, a search problem is a function
+$F: (D, Q) \to R$, where $D$ represents the domain of data to be searched,
+$Q$ represents the domain of query parameters, and $R$ represents the
+answer domain.
+\footnote{
+It is important to note that it is not required for $R \subseteq D$. As an
+example, a \texttt{COUNT} aggregation might map a set of strings onto
+an integer. Most common queries do satisfy $R \subseteq D$, but this need
+not be a universal constraint.
+}
+\end{definition}
+
+And we will use the word \emph{query} to refer to a specific instance
+of a search problem, except when used as part of the generally
+accepted name of a search problem (i.e., range query).
+
+\begin{definition}[Query]
+Given three multisets, $D$, $R$, and $Q$, a search problem $F$ and
+a specific set of query parameters $q \in Q$, a query is a specific
+instance of the search problem, $F(D, q)$.
+\end{definition}
+
+As an example of using these definitions, a \emph{membership test}
+or \emph{range query} would be considered search problems, and a
+range query over the interval $[10, 99]$ would be a query.
+
+\subsection{Decomposable Search Problems}
+
+An important subset of search problems is that of decomposable
+search problems (DSPs). This class was first defined by Saxe and
+Bentley as follows,
+
+\begin{definition}[Decomposable Search Problem~\cite{saxe79}]
+ \label{def:dsp}
+ Given a search problem $F: (D, Q) \to R$, $F$ is decomposable if and
+    only if there exists a constant-time computable, associative, and
+ commutative binary operator $\square$ such that,
+ \begin{equation*}
+ F(A \cup B, q) = F(A, q)~ \square ~F(B, q)
+ \end{equation*}
+\end{definition}
+
+The constant-time requirement was used to prove bounds on the costs of
+evaluating DSPs over data broken across multiple partitions. Further work
+by Overmars lifted this constraint and considered a more general class
+of DSP,
+\begin{definition}[$C(n)$-decomposable Search Problem~\cite{overmars83}]
+ Given a search problem $F: (D, Q) \to R$, $F$ is $C(n)$-decomposable
+ if and only if there exists an $O(C(n))$-time computable, associative,
+ and commutative binary operator $\square$ such that,
+ \begin{equation*}
+ F(A \cup B, q) = F(A, q)~ \square ~F(B, q)
+ \end{equation*}
+\end{definition}
+
+Decomposability is an important property because it allows for
+search problems to be answered over partitioned datasets. The details
+of this will be discussed in Section~\ref{ssec:bentley-saxe} in the
+context of creating dynamic data structures. Many common types of
+search problems appearing in databases are decomposable, such as
+range queries or predicate filtering.
+
+To demonstrate that a search problem is decomposable, it is necessary
+to show the existence of the merge operator, $\square$, and to show
+that $F(A \cup B, q) = F(A, q)~ \square ~F(B, q)$. With these two
+results, simple induction demonstrates that the problem is decomposable
+even in cases with more than two partial results.
+
+As an example, consider range queries,
+\begin{definition}[Range Query]
+Let $D$ be a set of $n$ points in $\mathbb{R}$. Given an interval,
+$ q = [x, y],\quad x,y \in \mathbb{R}$, a range query returns all points in
+$D \cap q$.
+\end{definition}
+
+\begin{theorem}
+Range Queries are a DSP.
+\end{theorem}
+
+\begin{proof}
+Let $\square$ be the set union operator ($\cup$). Applying this to
+Definition~\ref{def:dsp}, we have
+\begin{align*}
+ (A \cup B) \cap q = (A \cap q) \cup (B \cap q)
+\end{align*}
+which is true by the distributivity of set intersection over
+union. Assuming an implementation allowing for an $O(1)$
+set union operation, range queries are DSPs.
+\end{proof}
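+
+As an aside, a minimal code sketch of this decomposition (purely
+illustrative, and not part of any framework discussed in this
+document) might look as follows, with the merge operator implemented
+as a simple concatenation of the partial result sets,
+\begin{verbatim}
+#include <vector>
+#include <algorithm>
+#include <iterator>
+
+// F(D_i, q): a range query over a single partition.
+std::vector<int> range_query(const std::vector<int> &part, int lo, int hi) {
+    std::vector<int> out;
+    std::copy_if(part.begin(), part.end(), std::back_inserter(out),
+                 [=](int x) { return lo <= x && x <= hi; });
+    return out;
+}
+
+// The merge operator: multiset union of two partial results.
+std::vector<int> merge_union(std::vector<int> a, const std::vector<int> &b) {
+    a.insert(a.end(), b.begin(), b.end());
+    return a;
+}
+\end{verbatim}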
+
+Because the codomain of a DSP is not restricted, more complex output
+structures can be used to allow for problems that are not directly
+decomposable to be converted to DSPs, possibly with some minor
+post-processing. For example, the calculation of the mean of a set
+of numbers can be constructed as a DSP using the following technique,
+\begin{theorem}
+The calculation of the average of a set of numbers is a DSP.
+\end{theorem}
+\begin{proof}
+Define the search problem as $A:D \to (\mathbb{R}, \mathbb{Z})$,
+where $D\subset\mathbb{R}$ and is a multiset. The output tuple
+contains the sum of the values within the input set, and the
+cardinality of the input set. Let $A(D_1) = (s_1, c_1)$ and
+$A(D_2) = (s_2, c_2)$, and define $A(D_1)~\square~A(D_2) = (s_1 +
+s_2, c_1 + c_2)$.
+
+Because sums and cardinalities distribute over multiset union,
+$A(D_1 \cup D_2) = (s_1 + s_2, c_1 + c_2) = (s, c)$, and so
+Definition~\ref{def:dsp} is satisfied,
+\begin{align*}
+    A(D_1 \cup D_2) &= A(D_1)~\square~A(D_2)
+\end{align*}
+From this result, the average can be determined in constant time by
+taking $\nicefrac{s}{c}$. Therefore, calculating the average of a set
+of numbers is a DSP.
+\end{proof}
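+
+A minimal code sketch of this construction (again, purely illustrative)
+is shown below; the partial result is a (sum, count) pair, and the
+merge operator adds the pairs component-wise,
+\begin{verbatim}
+#include <vector>
+#include <numeric>
+#include <utility>
+
+// Partial result for the "average" search problem: (sum, count).
+using AvgResult = std::pair<double, long>;
+
+// A(D_i): evaluate the search problem over a single partition.
+AvgResult local_avg(const std::vector<double> &part) {
+    double sum = std::accumulate(part.begin(), part.end(), 0.0);
+    return {sum, static_cast<long>(part.size())};
+}
+
+// The merge operator: constant-time, associative, and commutative.
+AvgResult merge_avg(AvgResult a, AvgResult b) {
+    return {a.first + b.first, a.second + b.second};
+}
+
+// Post-processing: recover the average from the merged result.
+double finalize_avg(AvgResult r) { return r.first / r.second; }
+\end{verbatim}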
+
+\section{Dynamic Extension Techniques}
+
+Because data in a database is regularly updated, data structures
+intended to be used as an index must support updates (inserts,
+in-place modification, and deletes) to their data. In principle,
+any data structure can support updates to its underlying data through
+global reconstruction: adjusting the record set and then rebuilding
+the entire structure. Ignoring this trivial (and highly inefficient)
+approach, a data structure with support for updates is called
+\emph{dynamic}, and one without support for updates is called
+\emph{static}. In this section, we discuss approaches for modifying
+a static data structure to grant it support for updates, a process
+called \emph{dynamic extension} or \emph{dynamization}. A theoretical
+survey of this topic can be found in~\cite{overmars83}, but that
+survey does not cover several techniques that are used in practice.
+As such, much of this section constitutes our own analysis, tying
+together threads from a variety of sources.
+
+\subsection{Local Reconstruction}
+
+One way of viewing an update to a data structure is as a reconstruction
+of all or part of the structure. To keep update costs low, it is
+desirable to minimize the amount of reconstruction that accompanies
+each update, either by structuring the data so that a change disrupts
+as few surrounding records as possible, or by deferring the
+reconstructions and amortizing their costs over as many updates as
+possible.
+
+While minimizing the size of a reconstruction seems the most obvious,
+and best, approach, it is limited in its applicability. The more
+related ``nearby'' records in the structure are, the more records
+will be affected by a change. Records can be related in terms of
+some ordering of their values, which we'll term a \emph{spatial
+ordering}, or in terms of their order of insertion to the structure,
+which we'll term a \emph{temporal ordering}. Note that these terms
+don't imply anything about the nature of the data, and instead
+relate to the principles used by the data structure to arrange them.
+
+Arrays provide the extreme version of both of these ordering
+principles. In an unsorted array, in which records are appended to
+the end of the array, there is no spatial ordering dependence between
+records. This means that any insert or update will require no local
+reconstruction, aside from the record being directly affected.\footnote{
+A delete can also be performed without any structural adjustments
+in a variety of ways. Reorganization of the array as a result of
+deleted records serves an efficiency purpose, but isn't required
+for the correctness of the structure. } However, the order of
+records in the array \emph{does} express a strong temporal dependency:
+the index of a record in the array provides the exact insertion
+order.
+
+A sorted array provides exactly the opposite situation. The order
+of a record in the array reflects an exact spatial ordering of
+records with respect to their sorting function. This means that an
+update or insert will require reordering a large number of records
+(potentially all of them, in the worst case). Because of the stronger
+spatial dependence of records in the structure, an update will
+require a larger-scale reconstruction. Additionally, there is no
+temporal component to the ordering of the records: inserting a set
+of records into a sorted array will produce the same final structure
+irrespective of insertion order.
+
+It's worth noting that the spatial dependency discussed here, as
+it relates to reconstruction costs, is based on the physical layout
+of the records and not their logical ordering. To exemplify
+this, a sorted singly-linked list can maintain the same logical
+order of records as a sorted array, but limits the spatial dependence
+between records to each record's preceding node. This means that an
+insert into this structure will require only a single node update,
+regardless of where in the structure the insert occurs.
+
+The amount of spatial dependence in a structure directly reflects
+a trade-off between read and write performance. In the above example,
+performing a lookup for a given record in a sorted array requires
+asymptotically fewer comparisons in the worst case than in an unsorted
+array, because the spatial dependencies can be exploited for an
+accelerated search (binary vs. linear search). Interestingly, this
+remains the case for lookups against a sorted array vs. a sorted
+linked list. Even though both structures have the same logical order
+of records, the limited spatial dependencies between nodes in a linked
+list force the lookup to perform a scan anyway.
+
+A balanced binary tree sits between these two extremes. Like a
+linked list, individual nodes have very few connections. However,
+the nodes are arranged in such a way that a connection existing
+between two nodes implies further information about the ordering
+of the children of those nodes. In this light, rebalancing of the tree
+can be seen as maintaining a certain degree of spatial dependence
+between the nodes in the tree, ensuring that the subtrees beneath
+each node remain balanced. A very general summary of tree
+rebalancing techniques can be found in~\cite{overmars83}. Using an
+AVL tree~\cite{avl} as a specific example, each insert in the tree
+involves adding the new node and updating its parent (as in a
+simple linked list), followed by some larger-scale local
+reconstruction in the form of tree rotations, to maintain the balance
+factor invariant. This means that insertion requires more reconstruction
+effort than the single pointer update in the linked list case, but
+results in much more efficient searches (which, as it turns out,
+makes insertion more efficient in general too, even with the overhead,
+because finding the insertion point is much faster).
+
+\subsection{Amortized Local Reconstruction}
+
+In addition to controlling update cost by arranging the structure so
+as to reduce the amount of reconstruction necessary to maintain the
+desired level of spatial dependence, update costs can also be reduced
+by amortizing the local reconstruction cost over multiple updates.
+This is often done in one of two ways: leaving gaps or adding
+overflow buckets. These gaps and buckets provide the data structure
+with a reserve of insertion capacity that can absorb new records
+before a reconstruction is triggered.
+
+A classic example of the gap approach is found in the
+B$^+$-tree~\cite{b+tree} commonly used in RDBMS indexes, as well
+as open addressing for hash tables. In a B$^+$-tree, each node has
+a fixed size, which must be at least half-utilized (aside from the
+root node). The empty spaces within these nodes are gaps, which can
+be cheaply filled with new records on insert. Only when a node has
+been filled must a local reconstruction (called a structural
+modification operation for B-trees) occur to redistribute the data
+into multiple nodes and replenish the supply of gaps. This approach
+is particularly well suited to data structures in contexts where
+the natural unit of storage is larger than a record, as in disk-based
+(with 4KiB pages) or cache-optimized (with 64B cachelines) structures.
+This gap-based approach was also used to create ALEX, an updatable
+learned index~\cite{ALEX}.
+
+The gap approach has a number of disadvantages. It results in a
+somewhat sparse structure, thereby wasting storage. For example, a
+B$^+$-tree requires all nodes other than the root to be at least
+half full--meaning in the worst case up to half of the space required
+by the structure could be taken up by gaps. Additionally, this
+scheme results in some inserts being more expensive than others:
+most new records will occupy an available gap, but some will trigger
+more expensive SMOs. In particular, it has been observed with
+B$^+$-trees that this can lead to ``waves of misery''~\cite{wavesofmisery}:
+the gaps in many nodes fill at about the same time, leading to
+periodic clusters of high-cost structural modification operations.
+
+Overflow buckets are seen in ISAM-tree based indexes~\cite{myisam},
+as well as hash tables with closed addressing. In this approach,
+parts of the structure into which records would be inserted (leaf
+nodes of ISAM, directory entries in CA hashing) have a pointer to
+an overflow location, where newly inserted records can be placed.
+This theoretically allows the structure to sustain an unlimited
+number of insertions. However, read performance degrades as more
+overflow capacity is utilized, because fewer of the records in the
+structure remain ordered according to the data structure's layout.
+Thus, a reconstruction is periodically necessary to distribute the
+overflow records into the structure itself.
+
+\subsection{Amortized Global Reconstruction: The Bentley-Saxe Method}
+
+Another approach to supporting updates is to amortize the cost of
+global reconstruction over multiple updates. This approach can take
+three forms,
+\begin{enumerate}
+
+ \item Pairing a dynamic data structure (called a buffer or
+ memtable) with an instance of the structure being extended.
+ Updates are written to the buffer, and when the buffer is
+ full its records are merged with those in the static
+ structure, and the structure is rebuilt. This approach is
+ used by one version of the originally proposed
+            LSM-tree~\cite{oneil93}. Technically, this technique was proposed
+            in that work for the purpose of converting random writes
+ into sequential ones (all structures involved are dynamic),
+ but it can be used for dynamization as well.
+
+ \item Creating multiple, smaller data structures each
+ containing a partition of the records from the dataset, and
+            reconstructing individual structures to accommodate new
+ inserts in a systematic manner. This technique is the basis
+ of the Bentley-Saxe method~\cite{saxe79}.
+
+ \item Using both of the above techniques at once. This is
+ the approach used by modern incarnations of the
+ LSM~tree~\cite{rocksdb}.
+
+\end{enumerate}
+
+In all three cases, it is necessary for the search problem associated
+with the index to be a DSP, as answering it will require querying
+multiple structures (the buffer and/or one or more instances of the
+data structure) and merging the results together to get a final
+result. This section will focus exclusively on the Bentley-Saxe
+method, as it is the basis for our proposed methodology.
+
+When dividing records across multiple structures, there is a clear
+trade-off between read performance and write performance. Keeping
+the individual structures small reduces the cost of reconstructing,
+and thereby increases update performance. However, this also means
+that more structures will be required to accommodate the same number
+of records, when compared to a scheme that allows the structures
+to be larger. As each structure must be queried independently, this
+will lead to worse query performance. The reverse is also true:
+fewer, larger structures will have better query performance and
+worse update performance, with the extreme limit of this being a
+single structure that is fully rebuilt on each insert.
+
+\begin{figure}
+ \caption{Inserting a new record using the Bentley-Saxe method.}
+ \label{fig:bsm-example}
+\end{figure}
+
+The key insight of the Bentley-Saxe method~\cite{saxe79} is that a
+good balance can be struck by using geometrically increasing
+structure sizes. In Bentley-Saxe, the sub-structures are ``stacked'',
+with the bottom level having a capacity of a single record, and
+each subsequent level doubling in capacity. When an update is
+performed, the first empty level is located and a reconstruction
+is triggered, merging the structures of all levels below this empty
+one, along with the new record. An example of this process is shown
+in Figure~\ref{fig:bsm-example}. The merit of this approach is
+that it ensures that ``most'' reconstructions involve the smaller
+data structures towards the bottom of the sequence, while most of
+the records reside in large, infrequently updated, structures towards
+the top. This balances between the read and write implications of
+structure size, while also allowing the number of structures required
+to represent $n$ records to be worst-case bounded by $O(\log n)$.
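+
+To make the mechanism concrete, the following sketch (a simplified
+illustration, not the implementation of the framework discussed
+later in this document) shows Bentley-Saxe insertion for a static
+structure that exposes only a \texttt{build}-from-records operation
+and an accessor for its records; both of these member functions are
+assumptions of the sketch,
+\begin{verbatim}
+#include <vector>
+#include <optional>
+#include <cstddef>
+
+// Static is any structure with (assumed interface, illustration only):
+//   static Static Static::build(const std::vector<Rec>&);
+//   std::vector<Rec> Static::records() const;
+template <typename Rec, typename Static>
+class BentleySaxe {
+    // levels[i] holds either nothing or a structure over 2^i records
+    std::vector<std::optional<Static>> levels;
+public:
+    void insert(const Rec &r) {
+        std::vector<Rec> batch{r};
+        size_t i = 0;
+        // Merge the contents of every full level into the batch until
+        // the first empty level is found.
+        for (; i < levels.size() && levels[i].has_value(); i++) {
+            auto recs = levels[i]->records();
+            batch.insert(batch.end(), recs.begin(), recs.end());
+            levels[i].reset();
+        }
+        if (i == levels.size()) {
+            levels.emplace_back();   // grow by one level
+        }
+        // Rebuild a single structure containing the merged batch.
+        levels[i] = Static::build(batch);
+    }
+};
+\end{verbatim}
+A query against this structure is answered by evaluating the search
+problem against each non-empty level and combining the partial results
+with the merge operator $\square$, which is why decomposability is
+required.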
+
+Given a structure with an $O(P(n))$ construction cost and $O(Q_s(n))$
+query cost, the Bentley-Saxe method will produce a dynamic data
+structure with,
+
+\begin{align}
+ \text{Query Cost} \qquad & O\left(Q_s(n) \cdot \log n\right) \\
+ \text{Amortized Insert Cost} \qquad & O\left(\frac{P(n)}{n} \log n\right)
+\end{align}
+
+However, the method has poor worst-case insertion cost: if the
+entire structure is full, it must grow by another level, requiring
+a full reconstruction involving every record within the structure.
+A slight adjustment to the technique, due to Overmars and van Leeuwen
+\cite{}, allows the worst-case insertion cost to be bounded by
+$O\left(\frac{P(n)}{n} \log n\right)$; however, it does so by dividing
+each reconstruction into small pieces, one of which is executed
+each time a new update occurs. This has the effect of bounding the
+worst-case performance, but sacrifices expected-case performance
+and adds considerable complexity to the method. This
+technique is not used much in practice.\footnote{
+    We have yet to find an example of its use in a journal article
+    or conference paper.
+}
+
+
+
+
+
+\subsection{Limitations of the Bentley-Saxe Method}
+
+
+
+
+
diff --git a/chapters/beyond-bsm.tex b/chapters/beyond-bsm.tex
new file mode 100644
index 0000000..290d9b1
--- /dev/null
+++ b/chapters/beyond-bsm.tex
@@ -0,0 +1,3 @@
+\chapter{Expanding the Design Space}
+\section{The LSM Tree}
+\section{Benefits of Buffering}
diff --git a/chapters/beyond-dsp.tex b/chapters/beyond-dsp.tex
new file mode 100644
index 0000000..77f5fb4
--- /dev/null
+++ b/chapters/beyond-dsp.tex
@@ -0,0 +1,863 @@
+\chapter{Generalizing the Framework}
+\label{chap:framework}
+
+The previous chapter demonstrated
+the possible utility of
+designing indexes based upon the dynamic extension of static data
+structures. However, the presented strategy falls short of a general
+framework, as it is specific to sampling problems. In this chapter,
+the techniques of that work will be discussed in more general terms,
+to arrive at a more broadly applicable solution. A general
+framework is proposed, which places only two requirements on supported data
+structures,
+
+\begin{itemize}
+ \item Extended Decomposability
+ \item Record Identity
+\end{itemize}
+
+In this chapter, first these two properties are defined. Then,
+a general dynamic extension framework is described which can
+be applied to any data structure supporting these properties. Finally,
+an experimental evaluation is presented that demonstrates the viability
+of this framework.
+
+\section{Extended Decomposability}
+
+Chapter~\ref{chap:sampling} demonstrated how non-DSPs can be efficiently
+addressed using Bentley-Saxe, so long as the query interface is
+modified to accommodate their needs. For independent sampling
+problems, this involved a two-pass approach, where some pre-processing
+work was performed against each shard and used to construct a shard
+alias structure. This structure was then used to determine how many
+samples to draw from each shard.
+
+To generalize this approach, a new class of decomposability is proposed,
+called \emph{extended decomposability}. At present, its
+definition is tied tightly to the query interface, rather
+than to a formal mathematical definition. In extended decomposability,
+rather than treating the query procedure as a monolith, the algorithm
+is decomposed into multiple components, which allows
+for communication between shards as part of the query process.
+Additionally, rather than using a binary merge operator, extended
+decomposability uses a variadic function that merges all of the
+result sets in one pass, reducing the cost due to merging by a
+logarithmic factor without introducing any new restrictions.
+
+The basic interface that must be supported by an extended-decomposable
+search problem (eDSP) is,
+\begin{itemize}
+
+ \item $\mathbftt{local\_preproc}(\mathcal{I}_i, \mathcal{Q}) \to
+ \mathscr{S}_i$ \\
+ Pre-processes each partition $\mathcal{D}_i$ using index
+ $\mathcal{I}_i$ to produce preliminary information about the
+ query result on this partition, encoded as an object
+ $\mathscr{S}_i$.
+
+ \item $\mathbftt{distribute\_query}(\mathscr{S}_1, \ldots,
+ \mathscr{S}_m, \mathcal{Q}) \to \mathcal{Q}_1, \ldots,
+ \mathcal{Q}_m$\\
+ Processes the list of preliminary information objects
+ $\mathscr{S}_i$ and emits a list of local queries
+ $\mathcal{Q}_i$ to run independently on each partition.
+
+ \item $\mathbftt{local\_query}(\mathcal{I}_i, \mathcal{Q}_i)
+ \to \mathcal{R}_i$ \\
+ Executes the local query $\mathcal{Q}_i$ over partition
+ $\mathcal{D}_i$ using index $\mathcal{I}_i$ and returns a
+ partial result $\mathcal{R}_i$.
+
+ \item $\mathbftt{merge}(\mathcal{R}_1, \ldots \mathcal{R}_m) \to
+ \mathcal{R}$ \\
+ Merges the partial results to produce the final answer.
+
+\end{itemize}
+
+The pseudocode for the query algorithm using this interface is,
+\begin{algorithm}
+ \DontPrintSemicolon
+ \SetKwProg{Proc}{procedure}{ BEGIN}{END}
+ \SetKwProg{For}{for}{ DO}{DONE}
+
+ \Proc{\mathbftt{QUERY}($D[]$, $\mathscr{Q}$)} {
+ \For{$i \in [0, |D|)$} {
+ $S[i] := \mathbftt{local\_preproc}(D[i], \mathscr{Q})$
+ } \;
+
+ $ Q := \mathbftt{distribute\_query}(S, \mathscr{Q}) $ \; \;
+
+ \For{$i \in [0, |D|)$} {
+ $R[i] := \mathbftt{local\_query}(D[i], Q[i])$
+ } \;
+
+ $OUT := \mathbftt{merge}(R)$ \;
+
+ \Return {$OUT$} \;
+ }
+\end{algorithm}
+
+In this system, each query can report a partial result with
+\mathbftt{local\_preproc}, which can be used by
+\mathbftt{distribute\_query} to adjust the per-partition query
+parameters, allowing for direct communication of state between
+partitions. Queries which do not need this functionality can simply
+return empty $\mathscr{S}_i$ objects from \mathbftt{local\_preproc}.
+
+\subsection{Query Complexity}
+
+Before describing how to use this new interface and definition to
+support more efficient queries than standard decomposability, a
+more general expression for the cost of querying such a structure should
+be derived.
+Recall that Bentley-Saxe, when applied to a $C(n)$-decomposable
+problem, has the following query cost,
+
+\begin{equation}
+ \label{eq3:Bentley-Saxe}
+ O\left(\log n \cdot \left( Q_s(n) + C(n)\right)\right)
+\end{equation}
+where $Q_s(n)$ is the cost of the query against one partition, and
+$C(n)$ is the cost of the merge operator.
+
+Let $Q_s(n)$ represent the cost of \mathbftt{local\_query} and
+$C(n)$ the cost of \mathbftt{merge} in the extended decomposability
+case. Additionally, let $P(n)$ be the cost of $\mathbftt{local\_preproc}$
+and $\mathcal{D}(n)$ be the cost of \mathbftt{distribute\_query}.
+Additionally, recall that $|D| = \log n$ for the Bentley-Saxe method.
+In this case, the cost of a query is
+\begin{equation}
+ O \left( \log n \cdot P(n) + \mathcal{D}(n) +
+ \log n \cdot Q_s(n) + C(n) \right)
+\end{equation}
+
+Superficially, this looks to be strictly worse than the Bentley-Saxe
+case in Equation~\ref{eq3:Bentley-Saxe}. However, the important
+thing to understand is that for $C(n)$-decomposable queries, $P(n)
+\in O(1)$ and $\mathcal{D}(n) \in O(1)$, as these steps are unneeded.
+Thus, for normal decomposable queries, the cost actually reduces
+to,
+\begin{equation}
+ O \left( \log n \cdot Q_s(n) + C(n) \right)
+\end{equation}
+which is actually \emph{better} than Bentley-Saxe. Meanwhile, the
+ability to share state between the local queries can facilitate better
+solutions than would otherwise be possible.
+
+In light of this new approach, consider the two examples of
+non-decomposable search problems from Section~\ref{ssec:decomp-limits}.
+
+\subsection{k-Nearest Neighbor}
+\label{ssec:knn}
+The KNN problem is $C(n)$-decomposable, and Section~\ref{sssec-decomp-limits-knn}
+arrived at a Bentley-Saxe based solution to this problem built on the
+VPTree, with a query cost of
+\begin{equation}
+ O \left( k \log^2 n + k \log n \log k \right)
+\end{equation}
+by running KNN on each partition, and then merging the result sets
+with a heap.
+
+Applying the interface of extended-decomposability to this problem
+allows for some optimizations. Pre-processing is not necessary here,
+but the variadic merge function can be leveraged to get an asymptotically
+better solution. Simply dropping the existing algorithm into this
+interface will result in a merge algorithm with cost,
+\begin{equation}
+ C(n) \in O \left( k \log n \left( \log k + \log\log n\right)\right)
+\end{equation}
+which results in a total query cost that is slightly \emph{worse}
+than the original,
+
+\begin{equation}
+ O \left( k \log^2 n + k \log n \left(\log k + \log\log n\right) \right)
+\end{equation}
+
+The problem is that the number of records considered in a given
+merge has grown from $O(k)$ in the binary merge case to $O(\log n
+\cdot k)$ in the variadic merge. However, because the merge function
+now has access to all of the data at once, the algorithm can be modified
+slightly for better efficiency by only pushing $\log n$ elements
+into the heap at a time. This trick only works if the $R_i$s are
+in sorted order relative to $f(x, q)$; however, this condition is
+satisfied by the result sets returned by KNN against a VPTree. Thus,
+for each $R_i$, the first element in sorted order can be inserted
+into the heap, tagged with a reference to the $R_i$ it was taken
+from. Then, when the heap is popped, the next element from the
+associated $R_i$ is inserted in its place. This keeps the heap's
+size no larger than $O(\log n)$, and limits the algorithm to no
+more than $k$ pop operations and $\log n + k - 1$ pushes.
+
+This algorithm reduces the cost of KNN on this structure to,
+\begin{equation}
+ O(k \log^2 n + \log n)
+\end{equation}
+which is strictly better than the original.
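+
+The merge routine described above is essentially a $k$-way merge over
+sorted partial result lists. A minimal sketch of it is given below;
+for simplicity, each partial result is represented by its distance
+value only, which is an assumption made purely for illustration,
+\begin{verbatim}
+#include <vector>
+#include <queue>
+#include <cstddef>
+
+struct HeapEntry {
+    double dist;    // distance of the candidate to the query point
+    size_t list;    // which partial result list it came from
+    size_t offset;  // position within that list
+};
+
+// Merge sorted partial result lists (each ordered by distance) and
+// return the k smallest distances overall.
+std::vector<double> knn_merge(const std::vector<std::vector<double>> &results,
+                              size_t k) {
+    auto cmp = [](const HeapEntry &a, const HeapEntry &b) {
+        return a.dist > b.dist;  // min-heap on distance
+    };
+    std::priority_queue<HeapEntry, std::vector<HeapEntry>, decltype(cmp)>
+        heap(cmp);
+
+    // Seed the heap with the first element of each list.
+    for (size_t i = 0; i < results.size(); i++)
+        if (!results[i].empty())
+            heap.push({results[i][0], i, 0});
+
+    std::vector<double> out;
+    while (out.size() < k && !heap.empty()) {
+        HeapEntry e = heap.top();
+        heap.pop();
+        out.push_back(e.dist);
+        // Replace the popped entry with the next element from its list.
+        if (e.offset + 1 < results[e.list].size())
+            heap.push({results[e.list][e.offset + 1], e.list, e.offset + 1});
+    }
+    return out;
+}
+\end{verbatim}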
+
+\subsection{Independent Range Sampling}
+
+The eDSP abstraction also provides sufficient features to implement
+IRS, using the same basic approach as was used in the previous
+chapter. Unlike KNN, IRS will take advantage of the extended query
+interface. Recall from Chapter~\ref{chap:sampling} that the approach used
+for answering sampling queries (ignoring the buffer, for now) was,
+
+\begin{enumerate}
+ \item Query each shard to establish the weight that should be assigned to the
+ shard in sample size assignments.
+ \item Build an alias structure over those weights.
+ \item For each sample, reference the alias structure to determine which shard
+ to sample from, and then draw the sample.
+\end{enumerate}
+
+This approach can be mapped easily onto the eDSP interface as follows,
+\begin{itemize}
+ \item[\texttt{local\_preproc}] Determine and return the total weight of candidate records for
+ sampling in the shard.
+ \item[\texttt{distribute\_query}] Using the shard weights, construct an alias structure associating
+ each shard with its total weight. Then, query this alias structure $k$ times. For shard $i$, the
+ local query $\mathscr{Q}_i$ will have its sample size assigned based on how many times $i$ is returned
+ during the alias querying.
+ \item[\texttt{local\_query}] Process the local query using the underlying data structure's normal sampling
+ procedure.
+ \item[\texttt{merge}] Union all of the partial results together.
+\end{itemize}
+
+This division of the query maps closely onto the cost function,
+\begin{equation}
+    O\left(W(n) + P(n) + kS(n)\right)
+\end{equation}
+used in Chapter~\ref{chap:sampling}, where the $W(n) + P(n)$ pre-processing
+cost is associated with the cost of \texttt{local\_preproc} and the
+$kS(n)$ sampling cost is associated with $\texttt{local\_query}$.
+The \texttt{distribute\_query} operation will require $O(\log n)$
+time to construct the shard alias structure, and $O(k)$ time to
+query it. Accounting then for the fact that \texttt{local\_preproc}
+will be called once per shard ($\log n$ times), and that a total of $k$
+records will be sampled at a cost of $S(n)$ each, this results
+in a total query cost of,
+\begin{equation}
+ O\left(\left[W(n) + P(n)\right]\log n + k S(n)\right)
+\end{equation}
+which matches the cost in Equation~\ref{eq:sample-cost}.
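+
+The shard-selection step of \texttt{distribute\_query} can be sketched
+as follows. This is an illustration only: the actual implementation
+uses an alias structure built over the shard weights, which is
+approximated here with \texttt{std::discrete\_distribution}, a
+stand-in that provides the same weighted-selection behavior,
+\begin{verbatim}
+#include <vector>
+#include <random>
+#include <cstddef>
+
+// Given per-shard weights (reported by local_preproc) and a total
+// sample size k, assign a per-shard sample size by k weighted draws.
+std::vector<size_t> distribute_samples(const std::vector<double> &weights,
+                                       size_t k, std::mt19937 &rng) {
+    std::vector<size_t> sample_sizes(weights.size(), 0);
+    // Stand-in for the shard alias structure.
+    std::discrete_distribution<size_t> pick(weights.begin(), weights.end());
+    for (size_t i = 0; i < k; i++)
+        sample_sizes[pick(rng)]++;
+    return sample_sizes;
+}
+\end{verbatim}
+Each shard $i$ then receives a local query whose sample size is
+\texttt{sample\_sizes[i]}, which avoids introducing skew into the
+combined sample.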
+
+\section{Record Identity}
+
+Another key consideration for the framework is support for
+deletes, which are important in the context of database systems.
+The sampling extension framework supported two techniques
+for the deletion of records: tombstone-based deletes and tagging-based
+deletes. In both cases, the solution required that the shard support
+point lookups, either for checking tombstones or for finding the
+record to mark it as deleted. Implicit in this is an important
+property of the underlying data structure which was taken for granted
+in that work, but which will be made explicit here: record identity.
+
+Delete support requires that each record within the index be uniquely
+identifiable, and linkable directly to a location in storage. This
+property is called \emph{record identity}.
+ In the context of database
+indexes, it isn't a particularly contentious requirement. Indexes
+already are designed to provide a mapping directly to a record in
+storage, which (at least in the context of RDBMS) must have a unique
+identifier attached. However, in more general contexts, this
+requirement will place some restrictions on the applicability of
+the framework.
+
+For example, approximate data structures or summaries, such as Bloom
+filters~\cite{bloom70} or count-min sketches~\cite{countmin-sketch},
+do not necessarily store the underlying
+records. In principle, some summaries \emph{could} be supported by
+normal Bentley-Saxe, as there exist mergeable
+summaries~\cite{mergeable-summaries}. But because these data structures
+violate the record identity property, they would not support deletes
+(either in the framework, or Bentley-Saxe). The framework considers
+deletes to be first-class citizens, and this is formalized by
+requiring record identity as a property that supported data structures
+must have.
+
+\section{The General Framework}
+
+Based on these properties, and the work described in
+Chapter~\ref{chap:sampling}, a dynamic extension framework has been devised with
+broad support for data structures. It is implemented in C++20, using templates
+and concepts to define the necessary interfaces. A user of this framework needs
+to provide a definition for their data structure with a prescribed interface
+(called a \texttt{shard}), and a definition for their query following an
+interface based on the above definition of an eDSP. These two classes can then
+be used as template parameters to automatically create a dynamic index, which
+exposes methods for inserting and deleting records, as well as executing
+queries.
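+
+In rough terms, usage of the framework follows the pattern sketched
+below. All of the names in this sketch (\texttt{DynamicExtension},
+\texttt{TrieSplineShard}, \texttt{RangeCountQuery}, and the method
+names) are illustrative placeholders rather than the framework's
+actual identifiers,
+\begin{verbatim}
+#include <cstdint>
+
+struct Record { uint64_t key; uint64_t value; };
+
+// User-provided shard type: wraps the static structure being extended
+// and implements the shard interface described later in this chapter.
+class TrieSplineShard { /* construct, point_lookup, get_data, ... */ };
+
+// User-provided query type, following the eDSP-style query interface.
+class RangeCountQuery { /* preprocessing, local query, merge, ... */ };
+
+// The framework composes the two via template parameters, e.g.:
+//
+//   DynamicExtension<Record, TrieSplineShard, RangeCountQuery> index;
+//   index.insert(Record{42, 7});
+//   index.erase(Record{42, 7});
+//   auto count = index.query(/* query parameters */);
+\end{verbatim}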
+
+\subsection{Framework Design}
+
+\Paragraph{Structure.} The overall design of the general framework
+itself is not substantially different from the sampling framework
+discussed in the Chapter~\ref{chap:sampling}. It consists of a mutable buffer
+and a set of levels containing data structures with geometrically
+increasing capacities. The \emph{mutable buffer} is a small unsorted
+record array of fixed capacity that buffers incoming inserts. As
+the mutable buffer is kept sufficiently small (e.g., small enough to
+fit in the L2 CPU cache), the cost of querying it without any auxiliary
+structures can be kept low, while still allowing better insertion
+performance than Bentley-Saxe, which must rebuild at least one index
+structure on every insertion. The use of an unsorted buffer is necessary to
+ensure that the framework doesn't require an existing dynamic version
+of the index structure being extended, which would defeat the purpose
+of the entire exercise.
+
+The majority of the data within the structure is stored in a sequence
+of \emph{levels} with geometrically increasing record capacity,
+such that the capacity of level $i$ is $s^{i+1}$, where $s$ is a
+configurable parameter called the \emph{scale factor}. Unlike
+Bentley-Saxe, these levels are permitted to be partially full, which
+allows significantly more flexibility in terms of how reconstruction
+is performed. This also opens up the possibility of allowing each
+level to allocate its record capacity across multiple data structures
+(named \emph{shards}) rather than just one. This decision is called
+the \emph{layout policy}, with the use of a single structure being
+called \emph{leveling}, and multiple structures being called
+\emph{tiering}.
+
+\begin{figure}
+\centering
+\subfloat[Leveling]{\includegraphics[width=.5\textwidth]{img/leveling} \label{fig:leveling}}
+\subfloat[Tiering]{\includegraphics[width=.5\textwidth]{img/tiering} \label{fig:tiering}}
+ \caption{\textbf{An overview of the general structure of the
+ dynamic extension framework} using leveling (Figure~\ref{fig:leveling}) and
+tiering (Figure~\ref{fig:tiering}) layout policies. The pictured extension has
+a scale factor of 3, with $L_0$ being at capacity, and $L_1$ being at
+one third capacity. Each shard is shown as a dotted box, wrapping its associated
+dataset ($D_i$), data structure ($I_i$), and auxiliary structures $(A_i)$. }
+\label{fig:framework}
+\end{figure}
+
+\Paragraph{Shards.} The basic building block of the dynamic extension
+is called a shard, defined as $\mathcal{S}_i = (\mathcal{D}_i,
+\mathcal{I}_i, A_i)$, which consists of a partition of the data
+$\mathcal{D}_i$, an instance of the static index structure being
+extended $\mathcal{I}_i$, and an optional auxiliary structure $A_i$.
+To ensure the viability of level reconstruction, the extended data
+structure should at least support a construction method
+$\mathtt{build}(\mathcal{D})$ that can build a new static index
+from a set of records $\mathcal{D}$ from scratch. This set of records
+may come from the mutable buffer, or from a union of underlying
+data of multiple other shards. It is also beneficial for $\mathcal{I}_i$
+to support efficient point-lookups, which search for a record's
+storage location by its identifier (as given by the record identity
+requirement of the framework). The shard can also be customized
+to provide any features necessary to support the index being
+extended. For example, auxiliary data structures like Bloom filters
+or hash tables can be added to improve point-lookup performance,
+or additional, specialized query functions can be provided for use
+by the associated query classes.
+
+From an implementation standpoint, the shard object provides a shim
+between the data structure and the framework itself. At minimum,
+it must support the following interface,
+\begin{itemize}
+ \item $\mathbftt{construct}(B) \to S$ \\
+ Construct a new shard from the contents of the mutable buffer, $B$.
+
+    \item $\mathbftt{construct}(S_0, \ldots, S_n) \to S$ \\
+ Construct a new shard from the records contained within a list of already
+ existing shards.
+
+ \item $\mathbftt{point\_lookup}(r) \to *r$ \\
+ Search for a record, $r$, by identity and return a reference to its
+ location in storage.
+\end{itemize}
+
+\Paragraph{Insertion \& deletion.} The framework supports inserting
+new records and deleting records already in the index. These two
+operations also allow for updates to existing records, by first
+deleting the old version and then inserting a new one. These
+operations are added by the framework automatically, and require
+only a small shim or minor adjustments to the code of the data
+structure being extended within the implementation of the shard
+object.
+
+Insertions are performed by first wrapping the record to be inserted
+with a framework header, and then appending it to the end of the
+mutable buffer. If the mutable buffer is full, it is flushed to
+create a new shard, which is combined into the first level of the
+structure. The level reconstruction process is layout policy
+dependent. In the case of leveling, the underlying data of the
+source shard and the target shard are combined, resulting in a new
+shard that replaces the target shard in the target level. When using
+tiering, the newly created shard is simply placed into the target
+level. If the target level is full, the framework first triggers a merge on the
+target level, which creates another shard one level higher,
+and then places the incoming shard in the now-empty target level.
+Note that each time a new shard is created, the framework must invoke
+$\mathtt{build}$ to construct a new index from scratch for this
+shard.
+
+The framework supports deletes using two approaches: either by
+inserting a special tombstone record or by performing a lookup for
+the record to be deleted and setting a bit in the header. This
+decision is called the \emph{delete policy}, with the former being
+called \emph{tombstone delete} and the latter \emph{tagged delete}.
+The framework will automatically filter deleted records from query
+results before returning them to the user, either by checking for
+the delete tag, or by performing a lookup of each record for an
+associated tombstone. The number of deleted records within the
+framework can be bounded by canceling tombstones and associated
+records when they meet during reconstruction, or by dropping all
+tagged records when a shard is reconstructed. The framework also
+supports aggressive reconstruction (called \emph{compaction}) to
+precisely bound the number of deleted records within the index,
+which can be helpful for certain search problems, as was seen with
+sampling queries in Chapter~\ref{chap:sampling}, but is not
+necessary to bound query cost in most cases.
+
+\Paragraph{Design space.} The framework described in this section
+has a large design space. In fact, much of the design space has
+similar knobs to the well-known LSM Tree~\cite{dayan17}, albeit in
+a different environment: the framework targets in-memory static
+index structures for general extended decomposable queries without
+efficient index merging support, whereas the LSM-tree targets
+external range indexes that can be efficiently merged.
+
+The framework's design trades off among auxiliary memory usage, read performance,
+and write performance. The two most significant decisions are the
+choice of layout and delete policy. A tiering layout policy reduces
+write amplification compared to leveling, requiring each record to
+only be written once per level, but increases the number of shards
+within the structure, which can hurt query performance. As for
+delete policy, the use of tombstones turns deletes into insertions,
+which are typically faster. However, depending upon the nature of
+the query being executed, the delocalization of the presence
+information for a record may result in one extra point lookup for
+each record in the result set of a query, vastly reducing read
+performance. In these cases, tagging may make more sense. This
+results in each delete turning into a slower point-lookup, but
+always allows for constant-time visibility checks of records. The
+other two major parameters, scale factor and buffer size, can be
+used to tune the performance once the policies have been selected.
+Generally speaking, larger scale factors result in fewer shards,
+but can increase write amplification under leveling. Large buffer
+sizes can adversely affect query performance when an unsorted buffer
+is used, while allowing higher update throughput. Because the overall
+design of the framework remains largely unchanged, the design space
+exploration of Section~\ref{ssec:ds-exp} remains relevant here.
+
+\subsection{The Shard Interface}
+
+The shard object serves as a ``shim'' between a data structure and
+the extension framework, providing a set of mandatory functions
+which are used by the framework code to facilitate reconstruction
+and the deletion of records. The data structure being extended can be
+provided by a different library and included as an attribute via
+composition/aggregation, or can be directly implemented within the
+shard class. Additionally, shards can contain any auxiliary
+structures, such as Bloom filters or hash tables, necessary to
+support the required interface.
+
+The required interface for a shard object is as follows,
+\begin{verbatim}
+ new(MutableBuffer) -> Shard
+ new(Shard[]) -> Shard
+ point_lookup(Record, Boolean) -> Record
+ get_data() -> Record
+ get_record_count() -> Int
+ get_tombstone_count() -> Int
+ get_memory_usage() -> Int
+ get_aux_memory_usage() -> Int
+\end{verbatim}
+
+The first two functions are constructors, necessary to build a new Shard
+from either an array of other shards (for a reconstruction), or from
+a mutable buffer (for a buffer flush).\footnote{
+ This is the interface as it currently stands in the existing implementation, but
+ is subject to change. In particular, we are considering changing the shard reconstruction
+ procedure to allow for only one necessary constructor, with a more general interface. As
+ we look to concurrency, being able to construct shards from arbitrary combinations of shards
+ and buffers will become convenient, for example.
+ }
+The \texttt{point\_lookup} operation is necessary for delete support, and is
+used either to locate a record for deletion when tagging is used, or to search
+for a tombstone associated with a record when tombstones are used. The boolean
+communicates to the shard whether the lookup is intended to locate a tombstone
+or a record; it is primarily meant to allow the shard to control whether a
+point lookup checks a filter before searching,
+but could also be used for other purposes. The \texttt{get\_data}
+function exposes a pointer to the beginning of the array of records contained
+within the shard--it imposes no restriction on the order of these records, but
+does require that all records can be accessed sequentially from this pointer,
+and that the order of records does not change. The rest of the functions are
+accessors for various shard metadata. The record and tombstone count numbers
+are used by the framework for reconstruction purposes.\footnote{The record
+count includes tombstones as well, so the true record count on a level is
+$\text{reccnt} - \text{tscnt}$.} The memory usage statistics are, at present,
+only exposed directly to the user and have no effect on the framework's
+behavior. In the future, these may be used for concurrency control and task
+scheduling purposes.
+
+Beyond these, a shard can expose any additional functions that are necessary
+for its associated query classes. For example, a shard intended to be used for
+range queries might expose upper and lower bound functions, or a shard used for
+nearest neighbor search might expose a nearest-neighbor function.
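+
+As an illustration of how little glue code such a shim requires, a
+minimal sorted-array shard might be sketched as follows. The function
+names mirror the listing above, but the exact signatures are
+simplified for presentation and do not reflect the framework's real
+headers or record wrapper,
+\begin{verbatim}
+#include <vector>
+#include <algorithm>
+#include <cstdint>
+#include <cstddef>
+
+struct Record { uint64_t key; uint64_t value; };
+
+class SortedArrayShard {
+    std::vector<Record> data;  // records, kept sorted by key
+    static bool cmp(const Record &a, const Record &b) { return a.key < b.key; }
+public:
+    // new(Shard[]) -> Shard: rebuild from existing shards.
+    explicit SortedArrayShard(const std::vector<const SortedArrayShard*> &shards) {
+        for (auto *s : shards)
+            data.insert(data.end(), s->data.begin(), s->data.end());
+        std::sort(data.begin(), data.end(), cmp);
+    }
+
+    // point_lookup(Record) -> Record*: locate a record by its key.
+    const Record *point_lookup(const Record &r) const {
+        auto it = std::lower_bound(data.begin(), data.end(), r, cmp);
+        return (it != data.end() && it->key == r.key) ? &*it : nullptr;
+    }
+
+    // get_data() and the metadata accessors.
+    const Record *get_data() const { return data.data(); }
+    size_t get_record_count() const { return data.size(); }
+    size_t get_memory_usage() const { return data.size() * sizeof(Record); }
+};
+\end{verbatim}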
+
+\subsection{The Query Interface}
+\label{ssec:fw-query-int}
+
+The required interface for a query in the framework is a bit more
+complicated than the interface defined for an eDSP, because the
+framework needs to query the mutable buffer as well as the shards.
+As a result, there is some slight duplication of functions, with
+specialized query and pre-processing routines for both shards and
+buffers. Specifically, a query must define the following functions,
+\begin{verbatim}
+ get_query_state(QueryParameters, Shard) -> ShardState;
+ get_buffer_query_state(QueryParameters, Buffer) -> BufferState;
+
+ process_query_states(QueryParameters, ShardStateList, BufferStateList) -> LocalQueryList;
+
+ query(LocalQuery, Shard) -> ResultList
+ buffer_query(LocalQuery, Buffer) -> ResultList
+
+ merge(ResultList) -> FinalResult
+
+ delete_query_state(ShardState)
+ delete_buffer_query_state(BufferState)
+
+ bool EARLY_ABORT;
+ bool SKIP_DELETE_FILTER;
+\end{verbatim}
+
+The \texttt{get\_query\_state} and \texttt{get\_buffer\_query\_state} functions
+map to the \texttt{local\_preproc} operation of the eDSP definition for shards
+and buffers respectively. \texttt{process\_query\_states} serves the function
+of \texttt{distribute\_query}. Note that this function takes a list of buffer
+states; although the proposed framework above contains only a single buffer,
+future support for concurrency will require multiple buffers, and so the
+interface is set up with support for this. The \texttt{query} and
+\texttt{buffer\_query} functions execute the local query against the shard or
+buffer and return the intermediate results, which are merged using
+\texttt{merge} into a final result set. The \texttt{EARLY\_ABORT} parameter can
+be set to \texttt{true} to force the framework to immediately return as soon as
+the first result is found, rather than querying the entire structure, and the
+\texttt{SKIP\_DELETE\_FILTER} disables the framework's automatic delete
+filtering, allowing deletes to be manually handled within the \texttt{merge}
+function by the developer. These flags exist to allow for optimizations for
+certain types of query. For example, point-lookups can take advantage of
+\texttt{EARLY\_ABORT} to stop as soon as a match is found, and
+\texttt{SKIP\_DELETE\_FILTER} can be used for more efficient tombstone delete
+handling in range queries, where tombstones for results will always be in the
+\texttt{ResultList}s going into \texttt{merge}.
+
+The framework itself answers queries by simply calling these routines in
+a prescribed order,
+\begin{verbatim}
+query(QueryArguments qa) BEGIN
+ FOR i < BufferCount DO
+ BufferStates[i] = get_buffer_query_state(qa, Buffers[i])
+ DONE
+
+ FOR i < ShardCount DO
+ ShardStates[i] = get_query_state(qa, Shards[i])
+ DONE
+
+ process_query_states(qa, ShardStates, BufferStates)
+
+ FOR i < BufferCount DO
+ temp = buffer_query(BufferStates[i], Buffers[i])
+ IF NOT SKIP_DELETE_FILTER THEN
+ temp = filter_deletes(temp)
+ END
+        Results[i] = temp
+
+ IF EARLY_ABORT AND Results[i].size() > 0 THEN
+ delete_states(ShardStates, BufferStates)
+ return merge(Results)
+ END
+ DONE
+
+ FOR i < ShardCount DO
+ temp = query(ShardStates[i], Shards[i])
+ IF NOT SKIP_DELETE_FILTER THEN
+ temp = filter_deletes(temp)
+ END
+ Results[i + BufferCount] = temp
+        IF EARLY_ABORT AND Results[i + BufferCount].size() > 0 THEN
+ delete_states(ShardStates, BufferStates)
+ return merge(Results)
+ END
+ DONE
+
+ delete_states(ShardStates, BufferStates)
+ return merge(Results)
+END
+\end{verbatim}
+
+\subsubsection{Standardized Queries}
+
+Provided with the framework are several ``standardized'' query classes, including
+point lookup, range query, and IRS. These queries can be freely applied to any
+shard class that implements the necessary optional interfaces. For example, the
+provided IRS and range query classes both require the shard to implement
+\texttt{lower\_bound} and \texttt{upper\_bound} functions that return indices.
+They then use these indices to access the record array exposed via
+\texttt{get\_data}. This is convenient, because it helps to separate the search
+problem from the data structure, and moves towards presenting these two objects
+as orthogonal.
+
+In the next section the framework is evaluated by producing a number of indexes
+for three different search problems. Specifically, the framework is applied to
+a pair of learned indexes, as well as an ISAM-tree. All three of these shards
+provide the bound interface described above, meaning that the same range query
+class can be used for all of them. It also means that the learned indexes
+automatically have support for IRS. And, of course, all of them can be used
+with the provided point-lookup query, which simply uses the required
+\texttt{point\_lookup} function of the shard.
+
+At present, the framework only supports associating a single query class with
+an index. However, this is simply a limitation of the implementation. In the future,
+approaches will be considered for associating arbitrary query classes to allow
+truly multi-purpose indexes to be constructed. This is not to say that every
+data structure will necessarily be efficient at answering every type of query
+that could be answered using its interface--but in a database system, being
+able to repurpose an existing index to accelerate a wide range of query types
+would certainly seem worth considering.
+
+\section{Framework Evaluation}
+
+The framework was evaluated using three different types of search problem:
+range-count, high-dimensional k-nearest neighbor, and independent range
+sampling. In all three cases, an extended static data structure was compared
+with dynamic alternatives for the same search problem to demonstrate the
+framework's competitiveness.
+
+\subsection{Methodology}
+
+All tests were performed using Ubuntu 22.04
+LTS on a dual-socket Intel Xeon Gold 6242R server with 384 GiB of
+installed memory and 40 physical cores. Benchmark code was compiled
+using \texttt{gcc} version 11.3.0 at the \texttt{-O3} optimization level.
+
+
+\subsection{Range Queries}
+
+A first test evaluates the performance of the framework in the context of
+range queries against learned indexes. In Chapter~\ref{chap:intro}, the
+lengthy development cycle of this sort of data structure was discussed,
+and so learned indexes were selected as an evaluation candidate to demonstrate
+how this framework could allow much of that development effort to be largely
+bypassed.
+
+Specifically, the framework is used to produce dynamic learned indexes based on
+TrieSpline~\cite{plex} (DE-TS) and the static version of PGM~\cite{pgm} (DE-PGM). These
+are both single-pass construction static learned indexes, and thus well suited for use
+within this framework compared to more complex structures like RMI~\cite{RMI}, which have
+more expensive construction algorithms. The two framework-extended data structures are
+compared with dynamic learned indexes, namely ALEX~\cite{ALEX} and the dynamic version of
+PGM~\cite{pgm}. PGM provides an interesting comparison, as its native
+dynamic version was implemented using a slightly modified version of the Bentley-Saxe method.
+
+When performing range queries over large data sets, the
+copying of query results can introduce significant overhead. Because the four
+tested structures have different data copy behaviors, a range count query was
+used for testing, rather than a pure range query. This search problem exposes
+the searching performance of the data structures, while controlling for different
+data copy behaviors, and so should provide more directly comparable results.
+
+Range count
+queries were executed with a selectivity of $0.01\%$ against three datasets
+from the SOSD benchmark~\cite{sosd-datasets}: \texttt{book}, \texttt{fb}, and
+\texttt{osm}, which all have 200 million 64-bit keys following a variety of
+distributions, paired with uniquely generated 64-bit values. There
+is a fourth dataset in SOSD, \texttt{wiki}, which was excluded from testing
+because it contained duplicate keys, which are not supported by dynamic
+PGM.\footnote{The dynamic version of PGM supports deletes using tombstones,
+but doesn't wrap records with a header to accomplish this. Instead it reserves
+one possible value to represent a tombstone. Records are deleted by inserting a
+record having the same key, but this different value. This means that duplicate
+keys, even if they have different values, are unsupported as two records with
+the same key will be treated as a delete by the index.~\cite{pgm} }
+
+The shard implementations for DE-PGM and DE-TS required about 300 lines of
+C++ code each, and no modification to the data structures themselves. For both
+data structures, the framework was configured with a buffer of 12,000 records, a scale
+factor of 8, the tombstone delete policy, and tiering. Each shard stored $D_i$
+as a sorted array of records, used an instance of the learned index for
+$\mathcal{I}_i$, and had no auxiliary structures. The local query routine used
+the learned index to locate the first key in the query range and then iterated
+over the sorted array until the end of the range was reached, counting the
+number of records and tombstones encountered. The mutable buffer query performed
+the counting over a full scan. No local preprocessing was needed, and the merge
+operation simply summed the record and tombstone counts, and returned their
+difference.
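+
+A sketch of this local query routine is shown below. It is purely
+illustrative: the record layout is simplified, and the starting
+position is assumed to come from the learned index's approximate
+lower-bound lookup, which is not shown,
+\begin{verbatim}
+#include <cstddef>
+#include <cstdint>
+
+struct WrappedRecord {
+    uint64_t key;
+    uint64_t value;
+    bool     tombstone;   // set if this record is a tombstone
+};
+
+struct RangeCount { size_t records; size_t tombstones; };
+
+// Count records and tombstones with keys in [lo, hi] over a shard's
+// sorted record array; `start` is the position returned by the
+// learned index lookup (possibly slightly before the true start).
+RangeCount local_range_count(const WrappedRecord *data, size_t n,
+                             size_t start, uint64_t lo, uint64_t hi) {
+    RangeCount rc{0, 0};
+    for (size_t i = start; i < n && data[i].key <= hi; i++) {
+        if (data[i].key < lo) continue;  // skip keys before the range
+        if (data[i].tombstone) rc.tombstones++;
+        else                   rc.records++;
+    }
+    return rc;
+}
+
+// The merge operation sums the partial counts and reports the
+// difference between records and tombstones.
+size_t merge_range_count(const RangeCount *parts, size_t m) {
+    size_t recs = 0, ts = 0;
+    for (size_t i = 0; i < m; i++) {
+        recs += parts[i].records;
+        ts   += parts[i].tombstones;
+    }
+    return recs - ts;
+}
+\end{verbatim}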
+
+\begin{figure*}[t]
+ \centering
+ \subfloat[Update Throughput]{\includegraphics[width=.5\textwidth]{img/fig-bs-rq-insert} \label{fig:rq-insert}}
+ \subfloat[Query Latency]{\includegraphics[width=.5\textwidth]{img/fig-bs-rq-query} \label{fig:rq-query}} \\
+ \subfloat[Index Sizes]{\includegraphics[width=.5\textwidth, trim=5mm 5mm 0 0 ]{img/fig-bs-rq-space} \label{fig:idx-space}}
+ \caption{Range Count Evaluation}
+ \label{fig:results1}
+\end{figure*}
+
+Figure~\ref{fig:rq-insert} shows the update throughput of all competitors. ALEX
+performs the worst in all cases, and PGM performs the best, with the extended
+indexes falling in the middle. It is not unexpected that PGM performs better
+than the framework, because the Bentley-Saxe extension in PGM is custom-built,
+and thus has a tighter integration than a general framework would allow.
+However, even with this advantage, DE-PGM still reaches up to 85\% of PGM's
+insertion throughput. Additionally, Figure~\ref{fig:rq-query} shows that PGM
+pays a large cost in query latency for its advantage in insertion, with the
+framework extended indexes significantly outperforming it. Further, DE-TS even
+outperforms ALEX for query latency in some cases. Finally,
+Figure~\ref{fig:idx-space} shows the storage cost of the indexes, without
+counting the space necessary to store the records themselves. The storage cost
+of a learned index is fairly variable, as it is largely a function of the
+distribution of the data, but in all cases, the extended learned
+indexes, which build compact data arrays without gaps, occupy three orders of
+magnitude less storage than ALEX, which requires leaving gaps
+in the data arrays.
+
+\subsection{High-Dimensional k-Nearest Neighbor}
+The next test evaluates the framework for the extension of high-dimensional
+metric indexes for the k-nearest neighbor search problem. An M-tree~\cite{mtree}
+was used as the dynamic baseline,\footnote{
+ Specifically, the M-tree implementation tested can be found at \url{https://github.com/dbrumbaugh/M-Tree}
+ and is a fork of a structure written originally by Eduardo D'Avila, modified to compile under C++20. The
+ tree uses a random selection algorithm for ball splitting.
+} and a VPTree~\cite{vptree} as the static structure. The framework was used to
+extend VPTree to produce the dynamic version, DE-VPTree.
+An M-Tree is a tree that partitions records based on
+high-dimensional spheres and supports updates by splitting and merging these
+partitions.
+A VPTree is a binary tree that is produced by recursively selecting
+a point, called the vantage point, and partitioning records based on their
+distance from that point. This results in a difficult-to-modify structure that
+can be constructed in $O(n \log n)$ time and can answer KNN queries in $O(k
+\log n)$ time.
+
+DE-VPTree used a buffer of 12,000 records, a scale factor of 6, tiering, and
+delete tagging. The query was implemented without a pre-processing step, using
+the standard VPTree algorithm for KNN queries against each shard. All $k$
+records were determined for each shard, and then the merge operation used a
+heap to merge the result sets together and return the $k$ nearest neighbors
+from the $k\log(n)$ intermediate results. This is a type of query that, even
+with the framework's expanded query interface, pays a non-constant merge cost of
+$O(k \log k)$. In effect, the kNN query must be answered twice: once for each
+shard to get the intermediate result sets, and then a second time within the
+merge operation to select the kNN from the result sets.
+
+\begin{figure}
+ \centering
+ \includegraphics[width=.75\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-knn}
+ \caption{KNN Index Evaluation}
+ \label{fig:knn}
+\end{figure}
+Euclidean distance was used as the metric for both structures, and $k=1000$ was
+used for all queries. The reference point for each query was selected randomly
+from points within the dataset. Tests were run using the Spanish Billion Words
+dataset~\cite{sbw}, which consists of 300-dimensional vectors. The results are shown in
+Figure~\ref{fig:knn}. In this case, the static nature of the VPTree allows it
+to dominate the M-Tree in query latency, and the simpler reconstruction
+procedure shows a significant insertion performance improvement as well.
+
+\subsection{Independent Range Sampling}
+Finally, the
+framework was tested using one-dimensional IRS queries. As before, a static
+ISAM-tree was used as the data structure to be extended; however, the sampling
+query was implemented using the query interface from
+Section~\ref{ssec:fw-query-int}. For each shard, the pre-processing step
+identifies the first and last records falling into the range to be sampled
+from, and determines the total weight of that range. Then, in the local query
+generation step, these weights are used to construct an alias structure, which
+is used to assign sample sizes to each shard in proportion to its weight, to
+avoid introducing skew into the results. After this, the query routine generates
+random numbers between the established bounds to sample records, and the merge
+operation appends the individual result sets together. This procedure
+requires only a pair of tree traversals per shard, regardless of how many
+samples are taken.
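+
+The following sketch illustrates this flow. It is an assumption-laden
+illustration rather than the framework's code: \texttt{ShardBounds} and the
+\texttt{record\_at()} accessor are hypothetical, and
+\texttt{std::discrete\_distribution} stands in for the alias structure used to
+apportion the $k$ samples across shards.
+
+\begin{verbatim}
+// Sketch of the per-query IRS flow: weights from pre-processing drive the
+// assignment of sample counts to shards, then samples are drawn uniformly
+// between each shard's bounds and the result sets are appended together.
+// (Shards with no in-range records are assumed to have been filtered out.)
+#include <cstddef>
+#include <random>
+#include <vector>
+
+struct ShardBounds {
+    size_t lo;  // index of the first record inside the query range
+    size_t hi;  // index of the last record inside the query range
+};
+
+template <typename Shard, typename Record>
+std::vector<Record> irs_query(const std::vector<Shard> &shards,
+                              const std::vector<ShardBounds> &bounds,
+                              size_t k, std::mt19937 &rng) {
+    // Per-shard weight: the number of records falling inside the range.
+    std::vector<double> weights;
+    for (const auto &b : bounds)
+        weights.push_back(static_cast<double>(b.hi - b.lo + 1));
+
+    // Assign each of the k samples to a shard in proportion to its weight,
+    // so the combined sample set is not skewed by the decomposition.
+    std::discrete_distribution<size_t> pick(weights.begin(), weights.end());
+    std::vector<size_t> per_shard(shards.size(), 0);
+    for (size_t i = 0; i < k; i++)
+        per_shard[pick(rng)]++;
+
+    // Draw uniform indices between the established bounds and append the
+    // individual result sets ("merge" is a simple concatenation).
+    std::vector<Record> result;
+    for (size_t s = 0; s < shards.size(); s++) {
+        std::uniform_int_distribution<size_t> idx(bounds[s].lo, bounds[s].hi);
+        for (size_t j = 0; j < per_shard[s]; j++)
+            result.push_back(shards[s].record_at(idx(rng)));
+    }
+    return result;
+}
+\end{verbatim}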
+
+\begin{figure}
+ \centering
+ \subfloat[Query Latency]{\includegraphics[width=.5\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-irs-query} \label{fig:irs-query}}
+ \subfloat[Update Throughput]{\includegraphics[width=.5\textwidth, trim=5mm 5mm 0 0]{img/fig-bs-irs-insert} \label{fig:irs-insert}}
+ \caption{IRS Index Evaluation}
+ \label{fig:results2}
+\end{figure}
+
+The extended ISAM structure (DE-IRS) was compared to a B$^+$-Tree
+with aggregate weight tags on internal nodes (AGG B+Tree) for sampling
+and insertion performance, and to a single instance of the static ISAM-tree (ISAM),
+which does not support updates. DE-IRS was configured with a buffer size
+of 12,000 records, a scale factor of 6, tiering, and delete tagging. The IRS
+queries had a selectivity of $0.1\%$ with a sample size of $k=1000$. Testing
+was performed using the same datasets as were used for range queries.
+
+Figure~\ref{fig:irs-query}
+shows the significant latency advantage that the dynamically extended ISAM tree
+enjoys compared to a B$^+$-Tree. DE-IRS is up to 23 times faster than the B$^+$-Tree at
+answering sampling queries, and only about 3 times slower than the fully static
+solution. In this case, the extra query cost caused by needing to query
+multiple structures is more than balanced by the query efficiency of each of
+those structures, relative to tree sampling. Interestingly, the framework also
+results in better update performance compared to the B$^+$-Tree, as shown in
+Figure~\ref{fig:irs-insert}. This is likely because the ISAM shards can be
+efficiently constructed using a combination of sorted-merge operations and
+bulk-loading, and avoid expensive structural modification operations that are
+necessary for maintaining a B$^+$-Tree.
+
+\subsection{Discussion}
+
+
+The results demonstrate not only that the framework's update support is
+competitive with custom-built dynamic data structures, but that the framework
+is even able to, in many cases, retain some of the query performance advantage
+of its extended static data structure. This is particularly evident in the k-nearest
+neighbor and independent range sampling tests, where the static version of the
+structure was directly tested as well. These tests demonstrate one of the advantages
+of static data structures: they are able to maintain much tighter inter-record relationships
+than dynamic ones, because update support typically requires relaxing these relationships
+to make the structure easier to modify. While the framework introduces the overhead of querying
+multiple structures and merging them together, it is clear from the results that this overhead
+is generally less than the overhead incurred by the update support techniques used
+in the dynamic structures. The only case in which the framework did not clearly win on query
+performance was against ALEX, where the resulting query latencies were comparable.
+
+It is also evident that the update support provided by the framework is on par with, if not
+superior to, that provided by the dynamic baselines, at least in terms of throughput. The
+framework will certainly suffer from larger tail latency spikes, which were not measured in
+this round of testing, due to the larger scale of its reconstructions, but the amortization
+of these costs over a large number of inserts allows a respectable level of throughput to be
+maintained. In fact, the only case where the framework loses in insertion throughput
+is against the dynamic PGM. However, an examination of query latency reveals that this
+is likely because the standard configuration of the Bentley-Saxe variant used
+by PGM is heavily tuned for insertion performance: its query latencies
+are far worse than those of any other learned index tested. Even this result, then, should
+not be taken as a ``clear'' defeat of the framework's implementation.
+
+Overall, it is clear from this evaluation that the dynamic extension framework is a
+promising alternative to manual index redesign for accommodating updates. In almost
+all cases, the framework-extended static data structures provided superior insertion
+throughput, and query latencies that either matched or exceeded those of
+the dynamic baselines. Additionally, though it is hard to quantify, the code complexity
+of the framework-extended data structures was much lower: the shard implementations
+required only a small amount of relatively straightforward code to interface with pre-existing
+static data structures, or the necessary data structure implementations themselves were
+simpler.
+
+\section{Conclusion}
+
+In this chapter, a generalized version of the framework originally proposed in
+Chapter~\ref{chap:sampling} was presented. This framework is based on two
+key properties: extended decomposability and record identity. It is capable
+of extending any data structure and search problem supporting these two properties
+with support for inserts and deletes. An evaluation of this framework was performed
+by extending several static data structures and comparing the resulting structures'
+performance against dynamic baselines capable of answering the same type of search
+problem. The extended structures generally performed as well as, if not better than,
+their dynamic baselines in query performance, insert performance, or both. This demonstrates
+the capability of this framework to produce viable indexes in a variety of contexts. However,
+the framework is not yet complete. In the next chapter, the work required to bring this
+framework to completion will be described.
diff --git a/chapters/chapter1-old.tex b/chapters/chapter1-old.tex
new file mode 100644
index 0000000..fca257d
--- /dev/null
+++ b/chapters/chapter1-old.tex
@@ -0,0 +1,256 @@
+\chapter{Introduction}
+
+It probably goes without saying that database systems are heavily
+dependent upon data structures, both for auxiliary use within the system
+itself, and for indexing the data in storage to facilitate faster access.
+As a result of this, the design of novel data structures constitutes a
+significant sub-field within the database community. However, there is a
+stark divide between theoretical work and so-called ``practical'' work in
+this area, with many theoretically oriented data structures not seeing
+much, if any, use in real systems. I would go so far as to assert that
+many of these published data structures have \emph{never} been actually
+used.
+
+This situation exists with reason, of course. Fundamentally, the rules
+of engagement within the theory community differ from those within the
+systems community. Asymptotic analysis, which eschews constant factors,
+dominates theoretical analysis of data structures, whereas the systems
+community cares a great deal about these constants. We'll see within
+this document itself just how significant a divide this is in terms of
+real performance numbers. But, perhaps an even more significant barrier
+to theoretical data structures is that of support for features.
+
+A data structure, technically speaking, only needs to define algorithms
+for constructing and querying it. I'll describe such minimal structures
+as \emph{static data structures} within this document. Many theoretical
+structures that seem potentially useful fall into this category. Examples
+include alias-augmented structures for independent sampling, vantage-point
+trees for multi-dimensional similarity search, ISAM trees for traditional
+one-dimensional indexing, the vast majority of learned indexes, etc.
+
+These structures allow for highly efficient answering of their associated
+types of query, but have either fallen out of use (ISAM Trees) or have
+yet to see widespread adoption in database systems. This is because the
+minimal interface provided by a static data structure is usually not
+sufficient to address the real-world engineering challenges associated
+with database systems. Instead, data structures used by such systems must
+support a variety of additional features: updates to the underlying data,
+concurrent access, fault-tolerance, etc. This lack of feature support
+is a major barrier to the adoption of such structures.
+
+In the current data structure design paradigm, support for such features
+requires extensive redesign of the static data structure, often over a
+lengthy development cycle. Learned indexes provide a good case study for
+this. The first learned index, RMI, was proposed by Kraska \emph{et al.}
+in 2017~\cite{kraska-rmi}. As groundbreaking as this data structure,
+and the idea behind it, was, it lacked support for updates and thus was
+of very limited practical utility. Work then proceeded to develop an
+updatable data structure based on the concepts of RMI, culminating in
+ALEX~\cite{alex}, which first appeared on arXiv roughly a year-and-a-half
+later. The next several years saw the
+development of a wide range of learned indexes, promising support for
+updates and concurrency. However, a recent survey found that all of them
+were still largely inferior to more mature indexing techniques, at least
+on certain workloads.
+
+These adventures in learned index design represent much of the modern
+index design process in microcosm. It is not unreasonable to expect
+that, as the technology matures, learned indexes may one day become
+commonplace. But the amount of development and research effort to get
+there is, clearly, vast.
+
+On the opposite end of the spectrum, theoretical data structure works
+also attempt to extend their structures with update support using a
+variety of techniques. However, the differing rules of engagement often
+result in solutions to this problem that are horribly impractical in
+database systems. As an example, Hu, Qiao, and Tao have proposed a data
+structure for efficient range sampling, and included in their design a
+discussion of efficient support for updates~\cite{irs}. Without getting
+into details, they need to add multiple additional data structures beside
+their sampling structure to facilitate this, including a hash table and
+multiple linked lists. Asymptotically, this approach doesn't affect space
+or time complexity as there is a constant number of extra structures,
+and the cost of maintaining and accessing them are on par with the costs
+associated with their main structure. But it's clear that the space
+and time costs of these extra data structures would have relevance in
+a real system. A similar problem arises in a recent attempt to create a
+dynamic alias structure, which uses multiple auxiliary data structures,
+and further assumes that the key space size is a constant that can be
+neglected~\cite{that-paper}.
+
+Further, update support is only one of many features that a data
+structure must support for use in database systems. Given these challenges
+associated with just update support, one can imagine the amount of work
+required to get a data structure fully ``production ready''!
+
+However, all of these tribulations are, I'm going to argue, not
+fundamental to data structure design, but rather a consequence of the
+modern data structure design paradigm. Rather than this process of manual
+integration of features into the data structure itself, we propose a
+new paradigm: \emph{Framework-driven Data Structure Design}. Under this
+paradigm, the process of designing a data structure is reduced to the
+static case: an algorithm for querying the structure and an algorithm
+for building it from a set of elements. Once these are defined, a high
+level framework can be used to automatically add support for other
+desirable features, such as updates, concurrency, and fault-tolerance,
+in a manner that is mostly transparent to the static structure itself.
+
+This idea is not without precedent. For example, a similar approach
+is used to provide fault-tolerance to indexes within traditional,
+disk-based RDBMS. The RDBMS provides a storage engine which has its own
+fault tolerance systems. Any data structure built on top of this storage
+engine can benefit from its crash recovery, requiring only a small amount
+of effort to integrate the system. As a result, crash recovery/fault
+tolerance is not handled at the level of the data structure in such
+systems. The B+Tree index itself doesn't have the mechanism built into
+it; it relies upon the framework provided by the RDBMS.
+
+Similarly, there is an existing technique which uses a similar process
+to add support for updates to static structures, commonly called the
+Bentley-Saxe method.
+
+\section{Research Objectives}
+The proposed project has four major objectives,
+\begin{enumerate}
+\item Automatic Dynamic Extension
+
+ The first phase of this project has seen the development of a
+ \emph{dynamic extension framework}, which is capable of adding
+ support for inserts and deletes of data to otherwise static data
+ structures, so long as a few basic assumptions about the structure
+ and associated queries are satisfied. This framework is based on
+ the core principles of the Bentley-Saxe method, and is implemented
+ using C++ templates to allow for ease of use.
+
+ As part of the extension of BSM, a large design space has been added,
+ giving the framework a trade-off space between memory usage, insert
+ performance, and query performance. This allows for the performance
+ characteristics of the framework-extended data structure to be tuned
+ for particular use cases, and provides a large degree of flexibility
+ to the technique.
+
+\item Automatic Concurrency Support
+
+ Because the Bentley-Saxe method is based on the reconstruction
+ of otherwise immutable blocks, a basic concurrency implementation
+ is straightforward. While there are hard blocking points when a
+ reconstruction requires the results of an as-of-yet incomplete
+ reconstruction, all other operations can be easily performed
+ concurrently, so long as the destruction of blocks can be deferred
+ until all operations actively using it are complete. This lends itself
+ to a simple epoch-based system, where a particular configuration of
+ blocks constitutes an epoch, and the reconstruction of one or more
+ blocks triggers a shift to a new epoch upon its completion. Each
+ query will see exactly one epoch, and that epoch will remain in
+ existence until all queries using it have terminated.
+
+ With this strategy, the problem of adding support for concurrent
+ operations is largely converted into one of resource management.
+ Retaining old epochs, adding more buffers, and running reconstruction
+ operations all require storage. Further, large reconstructions
+ consume memory bandwidth and CPU resources, which must be shared
+ with active queries. And, at least some reconstructions will actively
+ block others, which will lead to tail latency spikes.
+
+ The objective of this phase of the project is the creation of a
+ scheduling system, built into the framework, that will schedule
+ queries and merges so as to ensure that the system operates within
+ specific tail latency and resource utilization constraints. In
+ particular, it is important to effectively hide the large insertion
+ tail latencies caused by reconstructions, and to limit the storage
+ required to retain old versions of the structure. Alongside
+ scheduling, the use of admission control will be considered for helping
+ to maintain latency guarantees even in adversarial conditions.
+
+\item Automatic Multi-node Support
+
+ It is increasingly the case that the requirements for data management
+ systems exceed the capacity of a single node, requiring horizontal
+ scaling. Unfortunately, the design of data structures that work
+ effectively in a distributed, multi-node environment is non-trivial.
+ However, the same design elements that make it straightforward to
+ implement a framework-driven concurrency system should also lend
+ themselves to adding multi-node support to a data structure. The
+ framework uses immutable blocks of data, which are periodically
+ reconstructed by combining them with other blocks. This system is
+ superficially similar to the RDDs used by Apache Spark, for example.
+
+ What is not so straightforward, however, is the implementation
+ decisions that underlie this framework. It is not obvious that the
+ geometric block sizing technique used by BSM is well suited to this
+ task, and so a comprehensive evaluation of block sizing techniques
+ will be required. Additionally, there are significant challenges
+ to be overcome regarding block placement on nodes, fault-tolerance
+ and recovery, how best to handle buffering, and the effect of block
+ sizing strategies and placement on end-to-end query performance. All
+ of these problems will be studied during this phase of the project.
+
+
+\item Automatic Performance Tuning
+
+ During all phases of the project, various tunable parameters will
+ be introduced that allow for various trade-offs between insertion
+ performance, query performance, and memory usage. These allow for a
+ user to fine-tune the performance characteristics of the framework
+ to suit her use-cases. However, this tunability may introduce an
+ obstacle to adoption for the system, as it is not necessarily trivial
+ to arrive at an effective configuration of the system, given a set of
+ performance requirements. Thus, the final phase of the project will
+ consider systems to automatically tune the framework. As a further
+ benefit, such a system could allow dynamic adjustment to the tunable
+ parameters of the framework during execution, to allow for automatic
+    and transparent evolution in the face of changing workloads.
+
+\end{enumerate}
+
+
+\begin{enumerate}
+ \item Thrust 1. Automatic Concurrency and Scheduling
+
+ The design of the framework lends itself to a straightforward, data
+ structure independent, concurrency implementation, but ensuring good
+ performance of this implementation will require intelligent scheduling.
+ In this thrust, we will study the problem of scheduling operations
+ within the framework to meet certain tail latency guarantees, within a
+ particular set of resource constraints.
+
+ RQ1: How best to parameterize merge and query operations
+    RQ2: Develop a real-time (or nearly real-time) scheduling
+         system to make decisions about when to merge, while
+ ensuring certain tail latency requirements within a
+ set of resource constraints
+
+
+ \item Thrust 2. Temporal and Spatial Data Partitioning
+
+ The framework is based upon a temporal partitioning of data, however
+ there are opportunities to improve the performance of certain
+ operations by introducing a spatial partitioning scheme as well. In
+ this thrust, we will expand the framework to support arbitrary
+    partitioning schemes, and assess the efficacy of spatial partitioning
+ under a variety of contexts.
+
+ RQ1: What effect does spatial partitioning within levels have on
+ the performance of inserts and queries?
+    RQ2: Do trade-offs exist between spatial and temporal partitioning?
+ RQ3: To what degree do results about spatial partitioning generalize
+ across different types of index (particularly multi-dimensional
+ ones).
+
+ \item Thrust 3. Dynamic Performance Tuning
+
+ The framework contains a large number of tunable parameters which allow
+ for trade-offs between memory usage, read performance, and write
+ performance. In this thrust, we will comprehensively evaluate this
+ design space, and develop a system for automatically adjusting these
+ parameters during system operation. This will allow the system to
+ dynamically change its own configuration when the workload changes.
+
+    RQ1: Quantify and model the effects of framework tuning parameters on
+ various performance metrics.
+ RQ2: Evaluate the utility of having a heterogeneous configuration, with
+ different parameter values on different levels.
+ RQ3: Develop a system for dynamically adjusting these values based on
+ current performance data.
+
+\end{enumerate}
diff --git a/chapters/chapter1.tex.bak b/chapters/chapter1.tex.bak
new file mode 100644
index 0000000..c66ba2c
--- /dev/null
+++ b/chapters/chapter1.tex.bak
@@ -0,0 +1,204 @@
+\chapter{Introduction}
+
+It probably goes without saying that database systems are heavily
+dependent upon data structures, both for auxiliary use within the system
+itself, and for indexing the data in storage to facilitate faster access.
+As a result of this, the design of novel data structures constitutes a
+significant subfield within the database community. However, there is a
+stark divide between theoretical work and so-called ``practical'' work in
+this area, with many theoretically oriented data structures not seeing
+much, if any, use in real systems. I would go so far as to assert that
+many of these published data structures have \emph{never} been actually
+used.
+
+This situation exists with reason, of course. Fundamentally, the rules
+of engagement within the theory community differ from those within the
+systems community. Asymptotic analysis, which eschews constant factors,
+dominates theoretical analysis of data structures, whereas the systems
+community cares a great deal about these constants. We'll see within
+this document itself just how significant a divide this is in terms of
+real performance numbers. But, perhaps an even more significant barrier
+to theoretical data structures is that of support for features.
+
+A data structure, technically speaking, only needs to define algorithms
+for constructing and querying it. I'll describe such minimal structures
+as \emph{static data structures} within this document. Many theoretical
+structures that seem potentially useful fall into this category. Examples
+include alias-augmented structures for independent sampling, vantage-point
+trees for multi-dimensional similarity search, ISAM trees for traditional
+one-dimensional indexing, the vast majority of learned indexes, etc.
+
+These structures allow for highly efficient answering of their associated
+types of query, but have either fallen out of use (ISAM Trees) or have
+yet to see widespread adoption in database systems. This is because the
+minimal interface provided by a static data structure is usually not
+sufficient to address the real-world engineering challenges associated
+with database systems. Instead, data structures used by such systems must
+support a variety of additional features: updates to the underlying data,
+concurrent access, fault-tolerance, etc. This lack of feature support
+is a major barrier to the adoption of such structures.
+
+In the current data structure design paradigm, support for such features
+requires extensive redesign of the static data structure, often over a
+lengthy development cycle. Learned indexes provide a good case study for
+this. The first learned index, RMI, was proposed by Kraska \emph{et al.}
+in 2017~\cite{kraska-rmi}. As groundbreaking as this data structure,
+and the idea behind it, was, it lacked support for updates and thus was
+of very limited practical utility. Work then proceeded to develop an
+updatable data structure based on the concepts of RMI, culminating in
+ALEX~\cite{alex}, which first appeared on arXiv roughly a year-and-a-half
+later. The next several years saw the
+development of a wide range of learned indexes, promising support for
+updates and concurrency. However, a recent survey found that all of them
+were still largely inferior to more mature indexing techniques, at least
+on certain workloads.
+
+These adventures in learned index design represent much of the modern
+index design process in microcosm. It is not unreasonable to expect
+that, as the technology matures, learned indexes may one day become
+commonplace. But the amount of development and research effort to get
+there is, clearly, vast.
+
+On the opposite end of the spectrum, theoretical data structure works
+also attempt to extend their structures with update support using a
+variety of techniques. However, the differing rules of engagement often
+result in solutions to this problem that are horribly impractical in
+database systems. As an example, Hu, Qiao, and Tao have proposed a data
+structure for efficient range sampling, and included in their design a
+discussion of efficient support for updates~\cite{irs}. Without getting
+into details, they need to add multiple additional data structures beside
+their sampling structure to facilitate this, including a hash table and
+multiple linked lists. Asymptotically, this approach doesn't affect space
+or time complexity as there is a constant number of extra structures,
+and the cost of maintaining and accessing them are on par with the costs
+associated with their main structure. But it's clear that the space
+and time costs of these extra data structures would have relevance in
+a real system. A similar problem arises in a recent attempt to create a
+dynamic alias structure, which uses multiple auxiliary data structures,
+and further assumes that the key space size is a constant that can be
+neglected~\cite{that-paper}.
+
+Further, update support is only one of many features that a data
+structure must support for use in database systems. Given these challenges
+associated with just update support, one can imagine the amount of work
+required to get a data structure fully ``production ready''!
+
+However, all of these tribulations are, I'm going to argue, not
+fundamental to data structure design, but rather a consequence of the
+modern data structure design paradigm. Rather than this process of manual
+integration of features into the data structure itself, we propose a
+new paradigm: \emph{Framework-driven Data Structure Design}. Under this
+paradigm, the process of designing a data structure is reduced to the
+static case: an algorithm for querying the structure and an algorithm
+for building it from a set of elements. Once these are defined, a high
+level framework can be used to automatically add support for other
+desirable features, such as updates, concurrency, and fault-tolerance,
+in a manner that is mostly transparent to the static structure itself.
+
+This idea is not without precedent. For example, a similar approach
+is used to provide fault-tolerance to indexes within traditional,
+disk-based RDBMS. The RDBMS provides a storage engine which has its own
+fault tolerance systems. Any data structure built on top of this storage
+engine can benefit from its crash recovery, requiring only a small amount
+of effort to integrate the system. As a result, crash recovery/fault
+tolerance is not handled at the level of the data structure in such
+systems. The B+Tree index itself doesn't have the mechanism built into
+it; it relies upon the framework provided by the RDBMS.
+
+Similarly, there is an existing technique which uses a similar process
+to add support for updates to static structures, commonly called the
+Bentley-Saxe method.
+
+\section{Research Objectives}
+The proposed project has four major objectives,
+\begin{enumerate}
+\item Automatic Dynamic Extension
+
+ The first phase of this project has seen the development of a
+ \emph{dynamic extension framework}, which is capable of adding
+ support for inserts and deletes of data to otherwise static data
+ structures, so long as a few basic assumptions about the structure
+    and associated queries are satisfied. This framework is based on
+ the core principles of the Bentley-Saxe method, and is implemented
+ using C++ templates to allow for ease of use.
+
+ As part of the extension of BSM, a large design space has been added,
+ giving the framework a trade-off space between memory usage, insert
+ performance, and query performance. This allows for the performance
+ characteristics of the framework-extended data structure to be tuned
+ for particular use cases, and provides a large degree of flexibility
+ to the technique.
+
+\item Automatic Concurrency Support
+
+ Because the Bentley-Saxe method is based on the reconstruction
+ of otherwise immutable blocks, a basic concurrency implementation
+ is straightforward. While there are hard blocking points when a
+ reconstruction requires the results of an as-of-yet incomplete
+ reconstruction, all other operations can be easily performed
+ concurrently, so long as the destruction of blocks can be deferred
+ until all operations actively using it are complete. This lends itself
+ to a simple epoch-based system, where a particular configuration of
+ blocks constitutes an epoch, and the reconstruction of one or more
+ blocks triggers a shift to a new epoch upon its completion. Each
+ query will see exactly one epoch, and that epoch will remain in
+ existence until all queries using it have terminated.
+
+ With this strategy, the problem of adding support for concurrent
+ operations is largely converted into one of resource management.
+ Retaining old epochs, adding more buffers, and running reconstruction
+ operations all require storage. Further, large reconstructions
+ consume memory bandwidth and CPU resources, which must be shared
+ with active queries. And, at least some reconstructions will actively
+ block others, which will lead to tail latency spikes.
+
+ The objective of this phase of the project is the creation of a
+ scheduling system, built into the framework, that will schedule
+ queries and merges so as to ensure that the system operates within
+ specific tail latency and resource utilization constraints. In
+ particular, it is important to effectively hide the large insertion
+ tail latencies caused by reconstructions, and to limit the storage
+ required to retain old versions of the structure. Alongside
+ scheduling, the use of admission control will be considered for helping
+    to maintain latency guarantees even in adversarial conditions.
+
+\item Automatic Multi-node Support
+
+ It is increasingly the case that the requirements for data management
+ systems exceed the capacity of a single node, requiring horizontal
+ scaling. Unfortunately, the design of data structures that work
+ effectively in a distributed, multi-node environment is non-trivial.
+ However, the same design elements that make it straightforward to
+ implement a framework-driven concurrency system should also lend
+ themselves to adding multi-node support to a data structure. The
+ framework uses immutable blocks of data, which are periodically
+ reconstructed by combining them with other blocks. This system is
+ superficially similar to the RDDs used by Apache Spark, for example.
+
+ What is not so straightforward, however, is the implementation
+    decisions that underlie this framework. It is not obvious that the
+ geometric block sizing technique used by BSM is well suited to this
+ task, and so a comprehensive evaluation of block sizing techniques
+ will be required. Additionally, there are significant challenges
+ to be overcome regarding block placement on nodes, fault-tolerance
+ and recovery, how best to handle buffering, and the effect of block
+ sizing strategies and placement on end-to-end query performance. All
+ of these problems will be studied during this phase of the project.
+
+
+\item Automatic Performance Tuning
+
+ During all phases of the project, various tunable parameters will
+ be introduced that allow for various trade-offs between insertion
+ performance, query performance, and memory usage. These allow for a
+ user to fine-tune the performance characteristics of the framework
+ to suit her use-cases. However, this tunability may introduce an
+    obstacle to adoption for the system, as it is not necessarily trivial
+ to arrive at an effective configuration of the system, given a set of
+ performance requirements. Thus, the final phase of the project will
+ consider systems to automatically tune the framework. As a further
+ benefit, such a system could allow dynamic adjustment to the tunable
+ parameters of the framework during execution, to allow for automatic
+    and transparent evolution in the face of changing workloads.
+
+\end{enumerate}
diff --git a/chapters/conclusion.tex b/chapters/conclusion.tex
new file mode 100644
index 0000000..b4439ec
--- /dev/null
+++ b/chapters/conclusion.tex
@@ -0,0 +1,43 @@
+\chapter{Conclusion}
+\label{chap:conclusion}
+
+Using data structures, a wide range of analytical queries against large data
+sets can be accelerated. Unfortunately, these data structures must be
+concurrently updatable to ensure timely results, as the underlying data is
+frequently subject to change. This requirement for concurrent update support
+excludes many possible data structures from use in these contexts, and the
+creation of a data structure with update support is non-trivial.
+
+The framework proposed by this work would allow for existing data
+structures to be automatically extended with tunable support for
+concurrent updates, with potential for future work to add even more
+features. It is based on an extension of the Bentley-Saxe method,
+which supports updates in static structures by splitting the data
+structure into multiple partitions and systematically reconstructing
+them. The Bentley-Saxe method has been adjusted to utilize a different
+query interface, based on the newly proposed extended decomposability,
+which brings with it more efficient support for many types of search
+problems not well served by the original techniques. It also introduces
+two approaches for handling deletes, buffering of inserts, and a more
+tunable reconstruction strategy, as well as support for concurrency,
+none of which were present in the original method.
+
+Using this framework, many data structures and search problems can be
+used as the basis of an index, requiring only that they support the
+eDSP abstraction and can uniquely identify and locate each record. The
+creation of an index requires only a small amount of shim code between
+the structure and the framework (called a shard).
+
+The current version of the framework supports tunable, single-threaded
+updates, and has been experimentally validated to extend static data
+structures with update support, and maintain performance on par
+with or better than existing dynamic alternatives for a number of
+complex search problems, including k-nearest neighbor and a variety
+of independent sampling problems. Beyond presenting these results,
+this work proposes the extension of this framework with support for
+concurrency with tail-latency mitigations, online and fine-grained
+tuning, and examining more sophisticated data partitioning schemes to
+ease certain challenges associated with large-scale reconstructions.
+The completion of this framework would be a major milestone in a larger
+project to vastly expand the capabilities of database management systems
+through the use of more complex data access primitives.
diff --git a/chapters/dynamic-extension-sampling.tex b/chapters/dynamic-extension-sampling.tex
new file mode 100644
index 0000000..58db672
--- /dev/null
+++ b/chapters/dynamic-extension-sampling.tex
@@ -0,0 +1,22 @@
+\chapter{Dynamic Extension Framework for Sampling Indexes}
+\label{chap:sampling}
+
+\begin{center}
+ \emph{The following chapter is an adaptation of work completed in collaboration with Dr. Dong Xie and published
+ in PACMMOD Volume 1, Issue 4 (December 2023) under the title "Practical Dynamic Extension of Sampling Indexes".
+ }
+ \hrule
+\end{center}
+
+\input{chapters/sigmod23/introduction}
+\input{chapters/sigmod23/background}
+\input{chapters/sigmod23/framework}
+\input{chapters/sigmod23/examples}
+\input{chapters/sigmod23/extensions}
+\input{chapters/sigmod23/experiment}
+\input{chapters/sigmod23/exp-parameter-space}
+\input{chapters/sigmod23/exp-baseline}
+\input{chapters/sigmod23/exp-extensions}
+%\input{chapters/sigmod23/relatedwork}
+\input{chapters/sigmod23/conclusion}
+
diff --git a/chapters/future-work.tex b/chapters/future-work.tex
new file mode 100644
index 0000000..d4ddd52
--- /dev/null
+++ b/chapters/future-work.tex
@@ -0,0 +1,174 @@
+\chapter{Proposed Work}
+\label{chap:proposed}
+
+The previous two chapters described work that has already been completed;
+however, a number of tasks remain to be done as part of this
+project. Update support is only one of the important features that an
+index requires of its data structure. In this chapter, the remaining
+research problems will be discussed briefly, to lay out a set of criteria
+for project completion.
+
+\section{Concurrency Support}
+
+Database management systems are designed to hide the latency of
+IO operations, and one of the techniques they use is being highly
+concurrent. As a result, any data structure used to build a database
+index must also support concurrent updates and queries. The sampling
+extension framework described in Chapter~\ref{chap:sampling} had basic
+concurrency support, but work is ongoing to integrate a superior system
+into the framework of Chapter~\ref{chap:framework}.
+
+Because the framework is based on the Bentley-Saxe method, it has a number
+of desirable properties for making concurrency management simpler. With
+the exception of the buffer, the vast majority of the data resides in
+static data structures. When using tombstones, these static structures
+become fully immutable. This turns concurrency control into a resource
+management problem, and suggests a simple multi-version concurrency
+control scheme. Each version of the structure, defined as being the
+state between two reconstructions, is tagged with an epoch number. A
+query, then, will read only a single epoch, which will be preserved
+in storage until all queries accessing it have terminated. Because the
+mutable buffer is append-only, a consistent view of it can be obtained
+by storing the tail of the log at the start of query execution. Thus,
+a fixed snapshot of the index can be represented as a two-tuple containing
+the epoch number and buffer tail index.
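+
+A minimal sketch of how such a snapshot handle might look is given below. The
+types and the reference-counting scheme are illustrative assumptions, not the
+framework's actual implementation.
+
+\begin{verbatim}
+// Sketch: a query pins a consistent view as an (epoch, buffer tail) pair.
+// The epoch is reference counted, so its shards are preserved in storage
+// until every query holding a snapshot of it has terminated.
+#include <cstddef>
+#include <memory>
+#include <mutex>
+
+struct Epoch {
+    size_t number;  // monotonically increasing epoch identifier
+    // ... the immutable shards making up this version of the structure ...
+};
+
+struct Snapshot {
+    std::shared_ptr<const Epoch> epoch;  // held for the life of the query
+    size_t buffer_tail;                  // buffer records [0, tail) are visible
+};
+
+class ExtensionState {
+public:
+    Snapshot open_snapshot() {
+        std::lock_guard<std::mutex> g(lock_);
+        return Snapshot{current_epoch_, buffer_tail_};
+    }
+
+    // Called at the end of a reconstruction to install the new epoch; the
+    // old epoch is freed automatically once its last snapshot is released.
+    void install_epoch(std::shared_ptr<const Epoch> next, size_t tail) {
+        std::lock_guard<std::mutex> g(lock_);
+        current_epoch_ = std::move(next);
+        buffer_tail_ = tail;
+    }
+
+private:
+    std::mutex lock_;
+    std::shared_ptr<const Epoch> current_epoch_;
+    size_t buffer_tail_ = 0;
+};
+\end{verbatim}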
+
+The major limitation of the Chapter~\ref{chap:sampling} system was
+the handling of buffer expansion. While the mutable buffer itself is
+an unsorted array, and thus supports concurrent inserts using a simple
+fetch-and-add operation, the real hurdle to insert performance is managing
+reconstruction. During a reconstruction, the buffer is full and cannot
+support any new inserts. Because active queries may be using the buffer,
+it cannot be immediately flushed, and so inserts are blocked. Because of
+this, it is necessary to use multiple buffers to sustain insertions. When
+a buffer is filled, a background thread is used to perform the
+reconstruction, and a new buffer is added to continue inserting while that
+reconstruction occurs. In Chapter~\ref{chap:sampling}, the solution used
+was limited by its restriction to only two buffers (and as a result,
+a maximum of two active epochs at any point in time). Any sustained
+insertion workload would quickly fill up the pair of buffers, and then
+be forced to block until one of the buffers could be emptied. This
+emptying of the buffer was contingent on \emph{both} all queries using
+the buffer finishing, \emph{and} on the reconstruction using that buffer
+to finish. As a result, the length of the block on inserts could be long
+(multiple seconds, or even minutes for particularly large reconstructions)
+and indeterminate (a given index could be involved in a very long running
+query, and the buffer would be blocked until the query completed).
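+
+For reference, the insert path of such an unsorted, append-only buffer can be
+sketched as follows; the class below is an illustration of the idea, not the
+implementation used in Chapter~\ref{chap:sampling}.
+
+\begin{verbatim}
+// Sketch: a fetch-and-add claims a slot in the unsorted array; a full buffer
+// reports failure so the caller can switch to a new buffer and schedule a
+// background reconstruction. A production version also needs to make each
+// record visible to readers only after it is fully written (for example,
+// with a separate committed counter).
+#include <algorithm>
+#include <atomic>
+#include <cstddef>
+#include <vector>
+
+template <typename Record>
+class MutableBuffer {
+public:
+    explicit MutableBuffer(size_t capacity) : data_(capacity), tail_(0) {}
+
+    bool append(const Record &rec) {
+        size_t slot = tail_.fetch_add(1, std::memory_order_relaxed);
+        if (slot >= data_.size()) {
+            return false;  // full: caller attaches a fresh buffer
+        }
+        data_[slot] = rec;
+        return true;
+    }
+
+    size_t size() const {
+        return std::min(tail_.load(std::memory_order_acquire), data_.size());
+    }
+
+private:
+    std::vector<Record> data_;
+    std::atomic<size_t> tail_;
+};
+\end{verbatim}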
+
+Thus, a more effective concurrency solution would need to support
+dynamically adding mutable buffers as needed to maintain insertion
+throughput. This would allow for insertion throughput to be maintained
+so long as memory for more buffer space is available.\footnote{For the
+in-memory indexes considered thus far, it isn't clear that running out of
+memory for buffers is a recoverable error in all cases. The system would
+require the same amount of memory for storing records (technically more,
+considering index overhead) in a shard as it does in the buffer. In the
+case of an external storage system, the calculus would be different,
+of course.} It would also ensure that a long-running query could only block
+insertion if there is insufficient memory to create a new buffer or to
+run a reconstruction. However, as the number of buffered records grows,
+there is the potential for query performance to suffer, which leads to
+another important aspect of an effective concurrency control scheme.
+
+\subsection{Tail Latency Control}
+
+The concurrency control scheme discussed thus far allows for maintaining
+insertion throughput by allowing an unbounded portion of the new data
+to remain buffered in an unsorted fashion. Over time, this buffered
+data will be moved into data structures in the background, as the
+system performs merges (which are moved off of the critical path for
+most operations). While this system allows for fast inserts, it has the
+potential to damage query performance. This is because the more buffered
+data there is, the more a query must fall back on its inefficient
+scan-based buffer path, as opposed to using the data structure.
+
+Unfortunately, reconstructions can be incredibly lengthy (recall that
+the worst-case scenario involves rebuilding a static structure over
+all of the records; this is, thankfully, quite rare). This implies that
+it may be necessary in certain circumstances to throttle insertions to
+maintain certain levels of query performance. Additionally, it may be
+worth preemptively performing large reconstructions during periods of
+low utilization, similar to systems like SILK, which is designed to mitigate
+tail latency spikes in LSM-tree based systems~\cite{balmau19}.
+
+Additionally, it is possible that large reconstructions may have a
+negative effect on query performance, due to system resource utilization.
+Reconstructions can use a large amount of memory bandwidth, which must
+be shared by queries. The effects of parallel reconstruction on query
+performance will need to be assessed, and strategies for mitigation of
+this effect, be it a scheduling-based solution, or a resource-throttling
+one, considered if necessary.
+
+
+\section{Fine-Grained Online Performance Tuning}
+
+The framework has a large number of configurable parameters, and
+introducing concurrency control will add even more. The parameter sweeps
+in Section~\ref{ssec:ds-exp} show that there are trade-offs between
+read and write performance across this space. Unfortunately, the current
+framework applies these configuration parameters globally, and does not
+allow them to be changed after the index is constructed. It seems apparent
+that better performance might be obtained by adjusting this approach.
+
+First, there is nothing preventing these parameters from being configured
+on a per-level basis: different layout policies on different
+levels (for example, tiering on higher levels and leveling on lower ones),
+different scale factors, and so on. More index-specific tuning, such as
+controlling the memory budget for auxiliary structures, could also be considered.
+
+This fine-grained tuning will open up an even broader design space,
+which has the benefit of improving the configurability of the system,
+but the disadvantage of making configuration more difficult. Additionally,
+it does nothing to address the problem of workload drift: a configuration
+may be optimal now, but will it remain effective in the future as the
+read/write mix of the workload changes? Both of these challenges can be
+addressed using dynamic tuning.
+
+The theory is that the framework could be augmented with some workload
+and performance statistics tracking. Based on these numbers, during
+reconstruction, the framework could decide to adjust the configuration
+of one or more levels in an online fashion, to lean more towards read
+or write performance, or to dial back memory budgets as the system's
+memory usage increases. Additionally, buffer-related parameters could
+be tweaked in real time as well. If insertion throughput is high, it
+might be worth it to temporarily increase the buffer size, rather than
+spawning multiple smaller buffers.
+
+A system like this would allow for more consistent performance of the
+system in the face of changing workloads, and also increase the ease
+of use of the framework by removing the burden of configuration from
+the user.
+
+
+\section{Alternative Data Partitioning Schemes}
+
+One problem with Bentley-Saxe or LSM-tree derived systems is temporary
+memory usage spikes. When performing a reconstruction, the system needs
+enough storage to store the shards involved in the reconstruction,
+and also the newly constructed shard. This is made worse in the face
+of multi-version concurrency, where multiple older versions of shards
+may be retained in memory at once. It's well known that, in the worst
+case, such a system may temporarily require double its current memory
+usage~\cite{dayan22}.
+
+One approach to addressing this problem in LSM-tree based systems is
+to adjust the compaction granularity~\cite{dayan22}. In the terminology
+associated with this framework, the idea is to further sub-divide each
+shard into smaller chunks, partitioned based on keys. That way, when a
+reconstruction is triggered, rather than reconstructing an entire shard,
+these smaller partitions can be used instead. One of the partitions in
+the source shard can be selected, and then merged with the partitions
+in the next level down having overlapping key ranges. The amount of
+memory required for reconstruction (and also reconstruction time costs)
+can then be controlled by adjusting these partitions.
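+
+In the one-dimensional case the mechanism is simple to state; the sketch below
+shows how the partitions participating in such a finer-grained reconstruction
+might be selected. The \texttt{Partition} type is a hypothetical illustration.
+
+\begin{verbatim}
+// Sketch: one partition of the source shard is merged with the partitions on
+// the next level whose key ranges overlap it; only these participate in the
+// reconstruction, bounding its size and temporary memory usage.
+#include <cstddef>
+#include <vector>
+
+template <typename Key>
+struct Partition {
+    Key min_key;
+    Key max_key;
+    // ... a small index over this partition's records ...
+};
+
+template <typename Key>
+std::vector<size_t> overlapping_partitions(
+        const Partition<Key> &src,
+        const std::vector<Partition<Key>> &next_level) {
+    std::vector<size_t> selected;
+    for (size_t i = 0; i < next_level.size(); i++) {
+        bool disjoint = next_level[i].max_key < src.min_key ||
+                        src.max_key < next_level[i].min_key;
+        if (!disjoint) selected.push_back(i);
+    }
+    return selected;
+}
+\end{verbatim}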
+
+Unfortunately, while this system works incredibly well for LSM-tree
+based systems which store one-dimensional data in sorted arrays, it
+encounters some problems in the context of a general index. It isn't
+clear how to effectively partition multi-dimensional data in the same
+way. Additionally, in the general case, each partition would need to
+contain its own instance of the index, as the framework supports data
+structures that don't themselves support effective partitioning in the
+way that a simple sorted array would. These challenges will need to be
+overcome to devise effective, general schemes for data partitioning to
+address the problems of reconstruction size and memory usage.
diff --git a/chapters/introduction.tex b/chapters/introduction.tex
new file mode 100644
index 0000000..a5d9740
--- /dev/null
+++ b/chapters/introduction.tex
@@ -0,0 +1,95 @@
+\chapter{Introduction}
+\label{chap:intro}
+
+One of the major challenges facing current data systems is the processing
+of complex and varied analytical queries over vast data sets. One commonly
+used technique for accelerating these queries is the application of data
+structures to create indexes, which are the basis for specialized database
+systems and data processing libraries. Unfortunately, the development
+of these indexes is difficult because of the requirements placed on
+them by data processing systems. Data is frequently subject to updates,
+yet a large number of potentially useful data structures are static.
+Further, many large-scale data processing systems are highly concurrent,
+which increases the barrier to entry even further. The process for
+developing data structures that satisfy these requirements is arduous.
+
+To demonstrate this difficulty, consider the recent example of the
+evolution of learned indexes. These are data structures designed to
+efficiently solve a simple problem: single dimensional range queries
+over sorted data. They seek to reduce the size of the structure, as
+well as lookup times, by replacing a traditional data structure with a
+learned model capable of predicting the location of a record in storage
+that matches a key value to within bounded error. This concept was first
+proposed by Kraska et al. in 2017, when they published a paper on the
+first learned index, RMI~\cite{RMI}. This index succeeded in showing
+that a learned model can be both faster and smaller than a conventional
+range index, but the proposed solution did not support updates. The
+first (non-concurrently) updatable learned index, ALEX, took a year
+and a half to appear~\cite{ALEX}. Over the course of the subsequent
+three years, several learned indexes were proposed with concurrency
+support~\cite{10.1145/3332466.3374547,10.14778/3489496.3489512} but a
+recent performance study~\cite{10.14778/3551793.3551848} showed that these
+were still generally inferior to ART-OLC~\cite{10.1145/2933349.2933352},
+a traditional index. This same study did however demonstrate that a new
+design, ALEX+, was able to outperform ART-OLC under certain circumstances,
+but even with this result learned indexes are not generally considered
+production ready, because they suffer from significant performance
+regressions under certain workloads, and are highly sensitive to the
+distribution of keys~\cite{10.14778/3551793.3551848}. Despite the
+demonstrable advantages of the technique and over half a decade of
+development, learned indexes still have not reached a generally usable
+state.\footnote{
+ In Chapter~\ref{chap:framework}, we apply our proposed technique to
+ existing static learned indexes to produce an effective dynamic index.
+}
+
+This work proposes a strategy for addressing this problem by providing a
+framework for automatically introducing support for concurrent updates
+(including both inserts and deletes) to many static data structures. With
+this framework, a wide range of static, or otherwise impractical, data
+structures will be made practically useful in data systems. Based
+on a classical, theoretical framework called the Bentley-Saxe
+Method~\cite{saxe79}, the proposed system will provide a library
+that can automatically extend many data structures with support for
+concurrent updates, as well as a tunable design space to allow for the
+user to make trade-offs between read performance, write performance,
+and storage usage. The framework will address a number of limitations
+present in the original technique, widely increasing its applicability
+and practicality. It will also provide a workload-adaptive, online tuning
+system that can automatically adjust the tuning parameters of the data
+structure in the face of changing workloads.
+
+This framework is based on the splitting of the data structure into
+several smaller pieces, which are periodically reconstructed to support
+updates. A systematic partitioning and reconstruction approach is used
+to provide specific guarantees on amortized insertion performance, and
+worst case query performance. The underlying Bentley-Saxe method is
+extended using a novel query abstraction to broaden its applicability,
+and the partitioning and reconstruction processes are adjusted to improve
+performance and introduce configurability.
+
+Specifically, the proposed work will address the following points,
+\begin{enumerate}
+    \item The proposal of a theoretical framework for analyzing queries
+ and data structures that extends existing theoretical
+ approaches and allows for more data structures to be dynamized.
+ \item The design of a system based upon this theoretical framework
+ for automatically dynamizing static data structures in a performant
+ and configurable manner.
+ \item The extension of this system with support for concurrent operations,
+ and the use of concurrency to provide more effective worst-case
+ performance guarantees.
+\end{enumerate}
+
+The rest of this document is structured as follows. First,
+Chapter~\ref{chap:background} introduces relevant background information,
+including the importance of data structures and indexes in database systems,
+the concept of a search problem, and techniques for designing updatable data
+structures. Next, in Chapter~\ref{chap:sampling}, the application of the
+Bentley-Saxe method to a number of sampling data structures is presented. The
+extension of these structures introduces a number of challenges which must be
+addressed, resulting in significant modification of the underlying technique.
+Then, Chapter~\ref{chap:framework} discusses the generalization of the
+modifications from the sampling framework into a more general framework.
+Chapter~\ref{chap:proposed} discusses the work that remains to be completed as
+part of this project, and Chapter~\ref{chap:conclusion} concludes the work.
diff --git a/chapters/sigmod23/abstract.tex b/chapters/sigmod23/abstract.tex
new file mode 100644
index 0000000..3ff0c08
--- /dev/null
+++ b/chapters/sigmod23/abstract.tex
@@ -0,0 +1,29 @@
+\begin{abstract}
+
+ The execution of analytical queries on massive datasets presents challenges
+ due to long response times and high computational costs. As a result, the
+ analysis of representative samples of data has emerged as an attractive
+ alternative; this avoids the cost of processing queries against the entire
+ dataset, while still producing statistically valid results. Unfortunately,
+ the sampling techniques in common use sacrifice either sample quality or
+ performance, and so are poorly suited for this task. However, it is
+ possible to build high quality sample sets efficiently with the assistance
+ of indexes. This introduces a new challenge: real-world data is subject to
+ continuous update, and so the indexes must be kept up to date. This is
+ difficult, because existing sampling indexes present a dichotomy; efficient
+ sampling indexes are difficult to update, while easily updatable indexes
+ have poor sampling performance. This paper seeks to address this gap by
+ proposing a general and practical framework for extending most sampling
+ indexes with efficient update support, based on splitting indexes into
+ smaller shards, combined with a systematic approach to the periodic
+ reconstruction. The framework's design space is examined, with an eye
+ towards exploring trade-offs between update performance, sampling
+ performance, and memory usage. Three existing static sampling indexes are
+ extended using this framework to support updates, and the generalization of
+ the framework to concurrent operations and larger-than-memory data is
+ discussed. Through a comprehensive suite of benchmarks, the extended
+ indexes are shown to match or exceed the update throughput of
+ state-of-the-art dynamic baselines, while presenting significant
+ improvements in sampling latency.
+
+\end{abstract}
diff --git a/chapters/sigmod23/background.tex b/chapters/sigmod23/background.tex
new file mode 100644
index 0000000..58324bd
--- /dev/null
+++ b/chapters/sigmod23/background.tex
@@ -0,0 +1,182 @@
+\section{Background}
+\label{sec:background}
+
+This section formalizes the sampling problem and describes relevant existing
+solutions. Before discussing these topics, though, a clarification of
+definition is in order. The nomenclature used to describe sampling varies
+slightly throughout the literature. In this chapter, the term \emph{sample} is
+used to indicate a single record selected by a sampling operation, and a
+collection of these samples is called a \emph{sample set}; the number of
+samples within a sample set is the \emph{sample size}. The term \emph{sampling}
+is used to indicate the selection of either a single sample or a sample set;
+the specific usage should be clear from context.
+
+
+\Paragraph{Independent Sampling Problem.} When conducting sampling, it is often
+desirable for the drawn samples to have \emph{statistical independence}. This
+requires that the sampling of a record does not affect the probability of any
+other record being sampled in the future. Independence is a requirement for the
+application of statistical tools such as the Central Limit
+Theorem~\cite{bulmer79}, which is the basis for many concentration bounds.
+A failure to maintain independence in sampling invalidates any guarantees
+provided by these statistical methods.
+
+In each of the problems considered, sampling can be performed either with
+replacement (WR) or without replacement (WoR). It is possible to answer any WoR
+sampling query using a constant number of WR queries, followed by a
+deduplication step~\cite{hu15}, and so this chapter focuses exclusively on WR
+sampling.
+
+A basic version of the independent sampling problem is \emph{weighted set
+sampling} (WSS),\footnote{
+ This nomenclature is adopted from Tao's recent survey of sampling
+ techniques~\cite{tao22}. This problem is also called
+ \emph{weighted random sampling} (WRS) in the literature.
+}
+in which each record is associated with a weight that determines its
+probability of being sampled. More formally, WSS is defined
+as:
+\begin{definition}[Weighted Set Sampling~\cite{walker74}]
+ Let $D$ be a set of data whose members are associated with positive
+ weights $w: D \to \mathbb{R}^+$. Given an integer $k \geq 1$, a weighted
+ set sampling query returns $k$ independent random samples from $D$ with
+ each data point $d \in D$ having a probability of $\frac{w(d)}{\sum_{p\in
+ D}w(p)}$ of being sampled.
+\end{definition}
+Each query returns a sample set of size $k$, rather than a
+single sample. Queries returning sample sets are the common case, because the
+robustness of analysis relies on having a sufficiently large sample
+size~\cite{ben-eliezer20}. The common \emph{simple random sampling} (SRS)
+problem is a special case of WSS, where every element has unit weight.
+
+In the context of databases, it is also common to discuss a more general
+version of the sampling problem, called \emph{independent query sampling}
+(IQS)~\cite{hu14}. An IQS query samples a specified number of records from the
+result set of a database query. In this context, it is insufficient to merely
+ensure individual records are sampled independently; the sample sets returned
+by repeated IQS queries must be independent as well. This provides a variety of
+useful properties, such as fairness and representativeness of query
+results~\cite{tao22}. As a concrete example, consider simple random sampling on
+the result set of a single-dimensional range reporting query. This is
+called independent range sampling (IRS), and is formally defined as:
+
+\begin{definition}[Independent Range Sampling~\cite{tao22}]
+ Let $D$ be a set of $n$ points in $\mathbb{R}$. Given a query
+ interval $q = [x, y]$ and an integer $k$, an independent range sampling
+ query returns $k$ independent samples from $D \cap q$ with each
+ point having equal probability of being sampled.
+\end{definition}
+A generalization of IRS exists, called \emph{Weighted Independent Range
+Sampling} (WIRS)~\cite{afshani17}, which is similar to WSS. Each point in $D$
+is associated with a positive weight $w: D \to \mathbb{R}^+$, and samples are
+drawn from the range query results $D \cap q$ such that each data point has a
+probability of $\nicefrac{w(d)}{\sum_{p \in D \cap q}w(p)}$ of being sampled.
+
+
+\Paragraph{Existing Solutions.} While many sampling techniques exist,
+few are supported in practical database systems. The
+\texttt{TABLESAMPLE} operator provided by SQL in all major DBMS
+implementations~\cite{postgres-doc} either requires a linear scan (e.g.,
+Bernoulli sampling), which results in high sample retrieval costs, or relaxes
+statistical guarantees (e.g., the block sampling~\cite{postgres-doc} used in
+PostgreSQL).
+
+Index-assisted sampling solutions have been studied
+extensively. Olken's method~\cite{olken89} is a classical solution to
+independent sampling problems. This algorithm operates upon traditional search
+trees, such as the B+tree commonly used as a database index. It conducts a
+random walk on the tree from the root to a leaf to select a record uniformly
+at random, resulting in an $O(\log n)$ sampling cost for each returned record.
+Should weighted samples be desired, rejection sampling can be performed: a
+sampled record $r$ is accepted with probability $\nicefrac{w(r)}{w_{max}}$,
+requiring an expected $\nicefrac{w_{max}}{w_{avg}}$ sampling attempts per
+element of the sample set. Olken's method can also be extended to support general IQS by
+rejecting all sampled records failing to satisfy the query predicate. It can be
+accelerated by adding aggregated weight tags to internal
+nodes~\cite{olken-thesis,zhao22}, allowing rejection sampling to be performed
+during the tree-traversal to abort dead-end traversals early.
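+
+As an illustration of this rejection step, a minimal C++ sketch is given
+below. It is not the implementation used in this work; the
+\texttt{sample\_uniform\_record()} helper, the \texttt{weight} field, and the
+\texttt{w\_max} bound are assumed stand-ins for the uniform root-to-leaf walk
+and the weight metadata described above.
+\begin{verbatim}
+#include <random>
+
+// Sketch of Olken-style weighted sampling via rejection.
+// tree.sample_uniform_record() stands in for the O(log n) uniform
+// root-to-leaf random walk; w_max is an upper bound on record weights.
+template <typename Record, typename Tree>
+Record olken_weighted_sample(const Tree &tree, double w_max,
+                             std::mt19937 &rng) {
+    std::uniform_real_distribution<double> coin(0.0, 1.0);
+    while (true) {
+        Record r = tree.sample_uniform_record(rng);
+        // Accept with probability w(r) / w_max; otherwise retry.
+        if (coin(rng) < r.weight / w_max) {
+            return r;
+        }
+    }
+}
+\end{verbatim}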
+
+\begin{figure}
+ \centering
+ \includegraphics[width=.5\textwidth]{img/sigmod23/alias.pdf}
+ \caption{\textbf{A pictorial representation of an alias
+ structure}, built over a set of weighted records. Sampling is performed by
+ first (1) selecting a cell by uniformly generating an integer index on
+ $[0,n)$, and then (2) selecting an item by generating a
+ second uniform float on $[0,1]$ and comparing it to the cell's normalized
+ cutoff values. In this example, the first random number is $0$,
+ corresponding to the first cell, and the second is $.7$. This is larger
+ than $\nicefrac{.15}{.25}$, and so $3$ is selected as the result of the
+ query.
+ This allows $O(1)$ independent weighted set sampling, but adding a new
+    element requires a weight adjustment to every element in the structure, and
+    so is not generally possible without performing a full reconstruction.}
+ \label{fig:alias}
+
+\end{figure}
+
+There also exist static data structures, referred to in this chapter as static
+sampling indexes (SSIs)\footnote{
+The name SSI was established in the published version of this paper before the
+distinction between the terms index and data structure was drawn. The term SSI
+is retained for the remainder of this chapter to maintain consistency with the
+published work, but, in the nomenclature established in the previous chapter,
+an SSI technically refers to a data structure rather than an index.
+ }, that are capable of answering sampling queries in
+near-constant time\footnote{
+ The designation
+``near-constant'' is \emph{not} used in the technical sense of being constant
+to within a polylogarithmic factor (i.e., $\tilde{O}(1)$). It is instead used
+to mean constant per sample, up to an additive polylogarithmic term for the
+query as a whole. For example, drawing $k$ samples from $n$ records using a
+near-constant approach requires $O(\log n + k)$ time, in contrast to a
+tree-traversal approach, which requires $O(k\log n)$ time.
+} relative to the size of the dataset. An example of such a
+structure is used in Walker's alias method \cite{walker74,vose91}, a technique
+for answering WSS queries with $O(1)$ query cost per sample, but requiring
+$O(n)$ time to construct. It distributes the weight of the items across $n$
+equally weighted cells, with each cell split between at most two items, such
+that the total share of the cells assigned to each item is proportional to its weight. A query
+selects one cell uniformly at random, then chooses one of the two items in the
+cell by weight; thus, selecting items with probability proportional to their
+weight in $O(1)$ time. A pictorial representation of this structure is shown in
+Figure~\ref{fig:alias}.
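+
+To make the construction and query procedure concrete, the following is a
+minimal C++ sketch of the alias structure using Vose's construction
+algorithm~\cite{vose91}. It is illustrative only; the names and interface are
+not those of any implementation discussed in this chapter.
+\begin{verbatim}
+#include <cstddef>
+#include <random>
+#include <vector>
+
+// Walker's alias structure: O(n) construction, O(1) per sample.
+struct AliasStructure {
+    std::vector<double> cutoff;      // normalized cutoff per cell
+    std::vector<std::size_t> alias;  // the "other" item in each cell
+
+    explicit AliasStructure(const std::vector<double> &weights) {
+        std::size_t n = weights.size();
+        cutoff.assign(n, 1.0);
+        alias.resize(n);
+        double total = 0;
+        for (double w : weights) total += w;
+
+        // Scale each weight so that the average cell holds weight 1.
+        std::vector<double> scaled(n);
+        std::vector<std::size_t> small, large;
+        for (std::size_t i = 0; i < n; i++) {
+            scaled[i] = weights[i] * n / total;
+            (scaled[i] < 1.0 ? small : large).push_back(i);
+        }
+        // Pair each under-full cell with an over-full item.
+        while (!small.empty() && !large.empty()) {
+            std::size_t s = small.back(); small.pop_back();
+            std::size_t l = large.back(); large.pop_back();
+            cutoff[s] = scaled[s];
+            alias[s] = l;
+            scaled[l] -= (1.0 - scaled[s]);
+            (scaled[l] < 1.0 ? small : large).push_back(l);
+        }
+    }
+
+    // (1) pick a cell uniformly; (2) pick an item within it by cutoff.
+    std::size_t sample(std::mt19937 &rng) const {
+        std::uniform_int_distribution<std::size_t> cell(0, cutoff.size() - 1);
+        std::uniform_real_distribution<double> coin(0.0, 1.0);
+        std::size_t c = cell(rng);
+        return coin(rng) < cutoff[c] ? c : alias[c];
+    }
+};
+\end{verbatim}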
+
+The alias method can also be used as the basis for creating SSIs capable of
+answering general IQS queries using a technique called alias
+augmentation~\cite{tao22}. As a concrete example, previous
+papers~\cite{afshani17,tao22} have proposed solutions for WIRS queries using $O(\log n
++ k)$ time, where the $\log n$ cost is paid only once per query, after which
+elements can be sampled in constant time. This structure is built by breaking
+the data up into disjoint chunks of size $\nicefrac{n}{\log n}$, called
+\emph{fat points}, each with an alias structure. A B+tree is then constructed,
+using the fat points as its leaf nodes. The internal nodes are augmented with
+an alias structure over the total weight of each child. This alias structure
+is used instead of rejection sampling to determine the traversal path to take
+through the tree, and then the alias structure of the fat point is used to
+sample a record. Because rejection sampling is not used during the traversal,
+two traversals suffice to establish the valid range of records for sampling,
+after which samples can be collected without requiring per-sample traversals.
+More examples of alias augmentation applied to different IQS problems can be
+found in a recent survey by Tao~\cite{tao22}.
+
+There do exist specialized sampling indexes~\cite{hu14} with both efficient
+sampling and support for updates, but these are restricted to specific query
+types and are often very complex structures, with poor constant factors
+associated with sampling and update costs, and so are of limited practical
+utility. There has also been work~\cite{hagerup93,matias03,allendorf23} on
+extending the alias structure to support weight updates over a fixed set of
+elements. However, these solutions do not allow insertion or deletion in the
+underlying dataset, and so are not well suited to database sampling
+applications.
+
+\Paragraph{The Dichotomy.} Among these techniques, there exists a
+clear trade-off between efficient sampling and support for updates. Tree-traversal
+based sampling solutions pay a per-sample cost that grows with the dataset size,
+in exchange for update support. The static solutions lack support for updates, but offer
+near-constant time sampling. While some data structures exist with support for
+both, these are restricted to highly specialized query types. Thus in the
+general case there exists a dichotomy: existing sampling indexes can support
+either data updates or efficient sampling, but not both.
diff --git a/chapters/sigmod23/conclusion.tex b/chapters/sigmod23/conclusion.tex
new file mode 100644
index 0000000..de6bffc
--- /dev/null
+++ b/chapters/sigmod23/conclusion.tex
@@ -0,0 +1,17 @@
+\section{Conclusion}
+\label{sec:conclusion}
+
+This chapter discussed the creation of a framework for the dynamic extension of
+static indexes designed for various sampling problems. Specifically, extensions
+were created for the alias structure (WSS), the in-memory ISAM tree (IRS), and
+the alias-augmented B+tree (WIRS). In each case, the SSIs were extended
+successfully with support for updates and deletes, without compromising their
+sampling performance advantage relative to existing dynamic baselines. This was
+accomplished by leveraging ideas borrowed from the Bentley-Saxe method and the
+design space of the LSM tree to divide the static index into multiple shards,
+which could be individually reconstructed in a systematic fashion to
+accommodate new data. This framework provides a large design space for trading
+between update performance, sampling performance, and memory usage, which was
+explored experimentally. The resulting extended indexes were shown to approach
+or match the insertion performance of the B+tree, while simultaneously
+performing significantly faster in sampling operations under most situations.
diff --git a/chapters/sigmod23/examples.tex b/chapters/sigmod23/examples.tex
new file mode 100644
index 0000000..cdbc398
--- /dev/null
+++ b/chapters/sigmod23/examples.tex
@@ -0,0 +1,143 @@
+\section{Framework Instantiations}
+\label{sec:instance}
+In this section, the framework is applied to three sampling problems and their
+associated SSIs. All three sampling problems draw random samples from records
+satisfying a simple predicate, and so result sets for all three can be
+constructed by directly merging the result sets of the queries executed against
+individual shards, which is the primary requirement for the application of the
+framework. The SSIs used for each problem are discussed, including their
+support of the remaining two optional requirements for framework application.
+
+\subsection{Dynamically Extended WSS Structure}
+\label{ssec:wss-struct}
+As a first example of applying this framework for dynamic extension,
+the alias structure for answering WSS queries is considered. This is a
+static structure that can be constructed in $O(n)$ time and supports WSS
+queries in $O(1)$ time. The alias structure will be used as the SSI, with
+the shards containing an alias structure paired with a sorted array of
+records. The use of sorted arrays for storing the records
+allows for more efficient point-lookups, without requiring any additional
+space. The total weight associated with a query for
+a given alias structure is the total weight of all of its records,
+and can be tracked at the shard level and retrieved in constant time.
+
+Using the formulae from Section~\ref{sec:framework}, the worst-case
+costs of insertion, sampling, and deletion are easily derived. The
+initial construction cost from the buffer is $C_c(N_b) \in O(N_b
+\log N_b)$, requiring the sorting of the buffer followed by alias
+construction. After this point, the shards can be reconstructed in
+linear time while maintaining sorted order. Thus, the reconstruction
+cost is $C_r(n) \in O(n)$. As each shard contains a sorted array,
+the point-lookup cost is $L(n) \in O(\log n)$. The total weight can
+be tracked with the shard, requiring $W(n) \in O(1)$ time to access,
+and there is no necessary preprocessing, so $P(n) \in O(1)$. Samples
+can be drawn in $S(n) \in O(1)$ time. Plugging these results into the
+formulae for insertion, sampling, and deletion costs gives,
+
+\begin{align*}
+ \text{Insertion:} \quad &O\left(\log_s n\right) \\
+ \text{Sampling:} \quad &O\left(\log_s n + \frac{k}{1 - \delta}\cdot R(n)\right) \\
+ \text{Tagged Delete:} \quad &O\left(\log_s n \log n\right)
+\end{align*}
+where $R(n) \in O(1)$ for tagging and $R(n) \in O(\log_s n \log n)$ for
+tombstones.
+
+\Paragraph{Bounding Rejection Rate.} In the weighted sampling case,
+the framework's generic record-based compaction trigger mechanism
+is insufficient to bound the rejection rate. This is because the
+probability of a given record being sampling is dependent upon its
+weight, as well as the number of records in the index. If a highly
+weighted record is deleted, it will be preferentially sampled, resulting
+in a larger number of rejections than would be expected based on record
+counts alone. This problem can be rectified using the framework's user-specified
+compaction trigger mechanism.
+In addition to
+tracking record counts, each level also tracks its rejection rate,
+\begin{equation*}
+\rho_i = \frac{\text{rejections}}{\text{sampling attempts}}
+\end{equation*}
+A configurable rejection rate cap, $\rho$, is then defined. If $\rho_i
+> \rho$ on a level, a compaction is triggered. In the case of
+the tombstone delete policy, it is not the level containing the sampled
+record, but rather the level containing its tombstone, that is considered
+the source of the rejection. This is necessary to ensure that the tombstone
+is moved closer to canceling its associated record by the compaction.
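+
+A minimal C++ sketch of this trigger is given below; the names and interface
+are illustrative only and differ from the actual framework implementation.
+\begin{verbatim}
+#include <cstddef>
+#include <functional>
+#include <vector>
+
+// Per-level rejection statistics for the user-specified trigger.
+struct LevelStats {
+    std::size_t rejections = 0;
+    std::size_t attempts = 0;
+    double rejection_rate() const {
+        return attempts == 0 ? 0.0
+                             : double(rejections) / double(attempts);
+    }
+};
+
+// Called after each sampling attempt charged to level i. Under the
+// tombstone policy, rejections are charged to the level holding the
+// tombstone, not the level holding the sampled record.
+void record_attempt(std::vector<LevelStats> &levels, std::size_t i,
+                    bool rejected, double rho,
+                    const std::function<void(std::size_t)> &compact) {
+    levels[i].attempts++;
+    if (rejected) levels[i].rejections++;
+    if (levels[i].rejection_rate() > rho) {
+        compact(i);                // trigger a compaction of level i
+        levels[i] = LevelStats{};  // reset counters afterwards
+    }
+}
+\end{verbatim}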
+
+\subsection{Dynamically Extended IRS Structure}
+\label{ssec:irs-struct}
+Another sampling problem to which the framework can be applied is
+independent range sampling (IRS). The SSI in this example is the in-memory
+ISAM tree. The ISAM tree supports efficient point-lookups
+ directly, and the total weight of an IRS query can be
+easily obtained by counting the number of records within the query range,
+which is determined as part of the preprocessing of the query.
+
+The static nature of shards in the framework allows for an ISAM tree
+to be constructed with adjacent nodes positioned contiguously in memory.
+By selecting a leaf node size that is a multiple of the record size, and
+avoiding placing any headers within leaf nodes, the set of leaf nodes can
+be treated as a sorted array of records with direct indexing, and the
+internal nodes allow for faster searching of this array.
+Because of this layout, per-sample tree-traversals are avoided. The
+start and end of the range from which to sample can be determined using
+a pair of traversals, and then records can be sampled from this range
+using random number generation and array indexing.
+
+Assuming a sorted set of input records, the ISAM tree can be bulk-loaded
+in linear time. The insertion analysis proceeds like the WSS example
+previously discussed. The initial construction cost is $C_c(N_b) \in
+O(N_b \log N_b)$ and reconstruction cost is $C_r(n) \in O(n)$. The ISAM
+tree supports point-lookups in $L(n) \in O(\log_f n)$ time, where $f$
+is the fanout of the tree.
+
+The process for performing range sampling against the ISAM tree involves
+two stages. First, the tree is traversed twice: once to establish the index of
+the first record greater than or equal to the lower bound of the query,
+and again to find the index of the last record less than or equal to the
+upper bound of the query. This process has the effect of providing the
+number of records within the query range, and can be used to determine
+the weight of the shard in the shard alias structure. Its cost is $P(n)
+\in O(\log_f n)$. Once the bounds are established, samples can be drawn
+by randomly generating uniform integers between the upper and lower bound,
+in $S(n) \in O(1)$ time each.
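+
+The following minimal C++ sketch illustrates this two-stage process over a
+shard whose leaf records form a contiguous sorted array. The binary searches
+stand in for the ISAM tree traversals used in the actual shards, and the names
+are illustrative only.
+\begin{verbatim}
+#include <algorithm>
+#include <cstddef>
+#include <random>
+#include <vector>
+
+// IRS over a contiguous sorted array of keys: two searches establish
+// the range (the P(n) step), then each sample is one uniform index
+// into that range (the S(n) step).
+template <typename Key>
+std::vector<Key> irs_sample(const std::vector<Key> &keys,
+                            Key lo, Key hi, std::size_t k,
+                            std::mt19937 &rng) {
+    auto start = std::lower_bound(keys.begin(), keys.end(), lo);
+    auto stop  = std::upper_bound(keys.begin(), keys.end(), hi);
+    std::vector<Key> result;
+    if (start == stop) return result;  // empty query range
+
+    std::uniform_int_distribution<std::size_t> idx(0, (stop - start) - 1);
+    for (std::size_t i = 0; i < k; i++) {
+        result.push_back(*(start + idx(rng)));
+    }
+    return result;
+}
+\end{verbatim}
+Note that the number of records in the range also provides the total weight of
+the shard for the shard alias structure, as described above.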
+
+This results in the extended version of the ISAM tree having the following
+insert, sampling, and delete costs,
+\begin{align*}
+ \text{Insertion:} \quad &O\left(\log_s n\right) \\
+ \text{Sampling:} \quad &O\left(\log_s n \log_f n + \frac{k}{1 - \delta}\cdot R(n)\right) \\
+ \text{Tagged Delete:} \quad &O\left(\log_s n \log_f n\right)
+\end{align*}
+where $R(n) \in O(1)$ for tagging and $R(n) \in O(\log_s n \log_f n)$ for
+tombstones.
+
+
+\subsection{Dynamically Extended WIRS Structure}
+\label{ssec:wirs-struct}
+As a final example of applying this framework, the WIRS problem will be
+considered. Specifically, the alias-augmented B+tree approach, described
+by Tao \cite{tao22}, generalizing work by Afshani and Wei \cite{afshani17},
+and Hu et al. \cite{hu14}, will be extended.
+This structure allows for efficient point-lookups, as
+it is based on the B+tree, and the total weight of a given WIRS query can
+be calculated given the query range using aggregate weight tags within
+the tree.
+
+The alias-augmented B+tree is a static structure of linear space, capable
+of being built initially in $C_c(N_b) \in O(N_b \log N_b)$ time, being
+bulk-loaded from sorted lists of records in $C_r(n) \in O(n)$ time,
+and answering WIRS queries in $O(\log_f n + k)$ time, where the query
+cost consists of preliminary work to identify the sampling range
+and calculate the total weight, with $P(n) \in O(\log_f n)$ cost, and
+constant-time drawing of samples from that range with $S(n) \in O(1)$.
+This results in the following costs,
+\begin{align*}
+ \text{Insertion:} \quad &O\left(\log_s n\right) \\
+ \text{Sampling:} \quad &O\left(\log_s n \log_f n + \frac{k}{1 - \delta} \cdot R(n)\right) \\
+ \text{Tagged Delete:} \quad &O\left(\log_s n \log_f n\right)
+\end{align*}
+where $R(n) \in O(1)$ for tagging and $R(n) \in O(\log_s n \log_f n)$ for
+tombstones. Because this is a weighted sampling structure, the custom
+compaction trigger discussed in Section~\ref{ssec:wss-struct} is applied
+to maintain bounded rejection rates during sampling.
+
diff --git a/chapters/sigmod23/exp-baseline.tex b/chapters/sigmod23/exp-baseline.tex
new file mode 100644
index 0000000..9e7929c
--- /dev/null
+++ b/chapters/sigmod23/exp-baseline.tex
@@ -0,0 +1,98 @@
+\subsection{Comparison to Baselines}
+
+Next, the performance of indexes extended using the framework is compared
+against tree sampling on the aggregate B+tree, as well as problem-specific
+SSIs for WSS, WIRS, and IRS queries. Unless otherwise specified, IRS and WIRS
+queries were executed with a selectivity of $0.1\%$ and 500 million randomly
+selected records from the OSM dataset were used. The uniform and zipfian
+synthetic datasets were 1 billion records in size. All benchmarks warmed up the
+data structure by inserting 10\% of the records, and then measured the
+throughput inserting the remaining records, while deleting 5\% of them over the
+course of the benchmark. Once all records were inserted, the sampling
+performance was measured. The reported update throughputs were calculated using
+both inserts and deletes, following the warmup period.
+
+\begin{figure*}
+ \centering
+ \subfloat[Insertion Throughput vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-bs-wss-insert} \label{fig:wss-insert}}
+ \subfloat[Sampling Latency vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-bs-wss-sample} \label{fig:wss-sample}} \\
+ \subfloat[Insertion Scalability vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-sc-wss-insert} \label{fig:wss-insert-s}}
+ \subfloat[Sampling Scalability vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-sc-wss-sample} \label{fig:wss-sample-s}}
+ \caption{Framework Comparisons to Baselines for WSS}
+\end{figure*}
+
+Starting with WSS, Figure~\ref{fig:wss-insert} shows that the DE-WSS structure
+is competitive with the AGG B+tree in terms of insertion performance, achieving
+about 85\% of the AGG B+tree's insertion throughput on the Twitter dataset, and
+beating it by similar margins on the other datasets. In terms of sampling
+performance in Figure~\ref{fig:wss-sample}, it beats the B+tree handily, and
+compares favorably to the static alias structure. Figures~\ref{fig:wss-insert-s}
+and \ref{fig:wss-sample-s} show the performance scaling of the three structures as
+the dataset size increases. All of the structures exhibit the same type of
+performance degradation with respect to dataset size.
+
+\begin{figure*}
+ \centering
+ \subfloat[Insertion Throughput vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-bs-wirs-insert} \label{fig:wirs-insert}}
+ \subfloat[Sampling Latency vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-bs-wirs-sample} \label{fig:wirs-sample}}
+ \caption{Framework Comparison to Baselines for WIRS}
+\end{figure*}
+
+Figures~\ref{fig:wirs-insert} and \ref{fig:wirs-sample} show the performance of
+the DE-WIRS index, relative to the AGG B+tree and the alias-augmented B+tree. This
+example shows the same pattern of behavior as was seen with DE-WSS, though the
+margin between DE-WIRS and its corresponding SSI is much narrower.
+Additionally, the constant factors associated with the construction cost of the
+alias-augmented B+tree are much larger than those of the alias structure. The loss of
+insertion performance due to this is seen clearly in Figure~\ref{fig:wirs-insert}, where
+the margin of advantage between DE-WIRS and the AGG B+tree in insertion
+throughput shrinks compared to the DE-WSS index, and the AGG B+tree's advantage
+on the Twitter dataset is expanded.
+
+\begin{figure*}
+ \subfloat[Insertion Scalability vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-sc-irs-insert} \label{fig:irs-insert-s}}
+ \subfloat[Sampling Scalability vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-sc-irs-sample} \label{fig:irs-sample-s}} \\
+
+ \subfloat[Insertion Throughput vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-bs-irs-insert} \label{fig:irs-insert1}}
+ \subfloat[Sampling Latency vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-bs-irs-sample} \label{fig:irs-sample1}} \\
+
+ \subfloat[Delete Scalability vs. Baselines]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-sc-irs-delete} \label{fig:irs-delete}}
+ \subfloat[Sampling Latency vs. Sample Size]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-bs-irs-samplesize} \label{fig:irs-samplesize}}
+ \caption{Framework Comparison to Baselines for IRS}
+
+\end{figure*}
+Finally, Figures~\ref{fig:irs-insert1} and \ref{fig:irs-sample1} show a
+comparison of the in-memory DE-IRS index against the in-memory ISAM tree and the AGG
+B+tree for answering IRS queries. The cost of bulk-loading the ISAM tree is less
+than the cost of building the alias structure, or the alias-augmented B+tree, and
+so here DE-IRS defeats the AGG B+tree by wider margins in insertion throughput,
+though the margin narrows significantly in terms of sampling performance
+advantage.
+
+DE-IRS was further tested to evaluate scalability.
+Figure~\ref{fig:irs-insert-s} shows average insertion throughput,
+Figure~\ref{fig:irs-delete} shows average delete latency (under tagging), and
+Figure~\ref{fig:irs-sample-s} shows average sampling latencies for DE-IRS and
+AGG B+tree over a range of data sizes. In all cases, DE-IRS and B+tree show
+similar patterns of performance degradation as the data size grows. Note that
+the delete latencies of DE-IRS are worse than AGG B+tree, because of the B+tree's
+cheaper point-lookups.
+
+Figure~\ref{fig:irs-sample-s}
+also includes one other point of interest: the sampling performance of
+DE-IRS \emph{improves} when the data size grows from one million to ten million
+records. While at first glance the performance increase may appear paradoxical,
+it actually demonstrates an important result concerning the effect of the
+unsorted mutable buffer on index performance. At one million records, the
+buffer constitutes approximately 1\% of the total data size; this results in
+the buffer being sampled from with greater frequency (as it has more total
+weight) than would be the case with larger data. The greater the frequency of
+buffer sampling, the more rejections will occur, and the worse the sampling
+performance will be. This illustrates the importance of keeping the buffer
+small, even when a scan is not used for buffer sampling. Finally,
+Figure~\ref{fig:irs-samplesize} shows the decreasing per-sample cost as the
+number of records requested by a sampling query grows for DE-IRS, compared to
+AGG B+tree. Note that DE-IRS benefits significantly more from batching samples
+than AGG B+tree, and that the improvement is greatest up to $k=100$ samples per
+query.
+
diff --git a/chapters/sigmod23/exp-extensions.tex b/chapters/sigmod23/exp-extensions.tex
new file mode 100644
index 0000000..d929e92
--- /dev/null
+++ b/chapters/sigmod23/exp-extensions.tex
@@ -0,0 +1,40 @@
+\subsection{External and Concurrent Extensions}
+
+\begin{figure*}[h]%
+ \centering
+ \subfloat[External Insertion Throughput]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-bs-ext-insert.pdf} \label{fig:ext-insert}}
+ \subfloat[External Sampling Latency]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-bs-ext-sample.pdf} \label{fig:ext-sample}} \\
+
+ \subfloat[Concurrent Insert Latency vs. Throughput]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-cc-irs-scale} \label{fig:con-latency}}
+ \subfloat[Concurrent Insert Throughput vs. Thread Count]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-cc-irs-thread} \label{fig:con-tput}}
+
+ \caption{External and Concurrent Extensions of DE-IRS}
+ \label{fig:irs-extensions}
+\end{figure*}
+
+Proof of concept implementations of external and concurrent extensions were
+also tested for IRS queries. Figures \ref{fig:ext-sample} and
+\ref{fig:ext-insert} show the performance of the external DE-IRS sampling index
+against AB-tree. DE-IRS was configured with 4 in-memory levels, using at most
+350 MiB of memory in testing, including Bloom filters.
+For DE-IRS, the \texttt{O\_DIRECT} flag was used to disable OS caching, and
+CGroups were used to limit process memory to 1 GiB to simulate a
+memory-constrained environment. The AB-tree implementation tested
+had a cache, which was configured with a memory budget of 64 GiB. This extra
+memory was provided to be fair to the AB-tree: because it uses per-sample
+tree-traversals, it is much more reliant on caching for good performance. DE-IRS was
+tested without a caching layer. The tests were performed with 4 billion (80 GiB)
+and 8 billion (162 GiB) uniform and zipfian
+records, and 2.6 billion (55 GiB) OSM records. DE-IRS outperformed the AB-tree
+by over an order of magnitude in both insertion and sampling performance.
+
+Finally, Figures~\ref{fig:con-latency} and \ref{fig:con-tput} show the
+multi-threaded insertion performance of the in-memory DE-IRS index with
+concurrency support, compared to AB-tree running entirely in memory, using the
+synthetic uniform dataset. Note that in Figure~\ref{fig:con-latency}, some of
+the AB-tree results are cut off, due to having significantly lower throughput
+and higher latency compared with the DE-IRS. Even without concurrent
+merging, the framework shows linear scaling up to 4 threads of insertion,
+before leveling off; throughput remains flat even up to 32 concurrent
+insertion threads. An implementation with support for concurrent merging would
+likely scale further.
diff --git a/chapters/sigmod23/exp-parameter-space.tex b/chapters/sigmod23/exp-parameter-space.tex
new file mode 100644
index 0000000..d2057ac
--- /dev/null
+++ b/chapters/sigmod23/exp-parameter-space.tex
@@ -0,0 +1,105 @@
+\subsection{Framework Design Space Exploration}
+\label{ssec:ds-exp}
+
+The proposed framework brings with it a large design space, described in
+Section~\ref{ssec:design-space}. First, this design space will be examined
+using a standardized benchmark to measure the average insertion throughput and
+sampling latency of DE-WSS at several points within this space. Tests were run
+using a random selection of 500 million records from the OSM dataset, with the
+index warmed up by the insertion of 10\% of the total records prior to
+beginning any measurement. Over the course of the insertion period, 5\% of the
+records were deleted, except for the tests in
+Figures~\ref{fig:insert_delete_prop}, \ref{fig:sample_delete_prop}, and
+\ref{fig:bloom}, in which 25\% of the records were deleted. Reported update
+throughputs were calculated using both inserts and deletes, following the
+warmup period. The standard values
+used for parameters not being varied in a given test were $s = 6$, $N_b =
+12000$, $k=1000$, and $\delta = 0.05$, with buffer rejection sampling.
+
+\begin{figure*}
+ \centering
+ \subfloat[Insertion Throughput vs. Mutable Buffer Capacity]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-ps-wss-mt-insert} \label{fig:insert_mt}}
+ \subfloat[Insertion Throughput vs. Scale Factor]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-ps-wss-sf-insert} \label{fig:insert_sf}} \\
+
+ \subfloat[Insertion Throughput vs.\\Max Delete Proportion]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-ps-wss-tp-insert} \label{fig:insert_delete_prop}}
+ \subfloat[Per 1000 Sampling Latency vs.\\Mutable Buffer Capacity]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-ps-wss-mt-sample} \label{fig:sample_mt}} \\
+
+ \caption{DE-WSS Design Space Exploration I}
+ \label{fig:parameter-sweeps1}
+\end{figure*}
+
+The results of this testing are displayed in
+Figures~\ref{fig:parameter-sweeps1},~\ref{fig:parameter-sweeps2}, and~\ref{fig:parameter-sweeps3}.
+The two largest contributors to differences in performance were the selection
+of layout policy and of delete policy. Figures~\ref{fig:insert_mt} and
+\ref{fig:insert_sf} show that the choice of layout policy plays a larger role
+than delete policy in insertion performance, with tiering outperforming
+leveling in both configurations. The situation is reversed in sampling
+performance, seen in Figure~\ref{fig:sample_mt} and \ref{fig:sample_sf}, where
+the performance difference between layout policies is far less than between
+delete policies.
+
+The values used for the scale factor and buffer size have less influence than
+layout and delete policy. Sampling performance is largely independent of them
+over the ranges of values tested, as shown in Figures~\ref{fig:sample_mt} and
+\ref{fig:sample_sf}. This is unsurprising, as these parameters adjust the
+number of shards, which only contributes to shard alias construction time
+during sampling and is amortized over all samples taken in a query. The
+buffer also contributes rejections, but the cost of a rejection is small and
+the buffer constitutes only a small portion of the total weight, so these are
+negligible. However, under tombstones there is an upward trend in latency with
+buffer size, as delete checks occasionally require a full buffer scan. The
+effect of buffer size on insertion is shown in Figure~\ref{fig:insert_mt}.
+There is only a small improvement in insertion performance as the mutable
+buffer grows. This is because a larger buffer results in fewer reconstructions,
+but these reconstructions individually take longer, and so the net positive
+effect is less than might be expected. Finally, Figure~\ref{fig:insert_sf}
+shows the effect of scale factor on insertion performance. As expected, tiering
+performs better with higher scale factors, whereas the insertion performance of
+leveling trails off as the scale factor is increased, due to write
+amplification.
+
+\begin{figure*}
+ \centering
+ \subfloat[Per 1000 Sampling Latency vs. Scale Factor]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-ps-wss-sf-sample} \label{fig:sample_sf}}
+ \subfloat[Per 1000 Sampling Latency vs. Max Delete Proportion]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-ps-wss-tp-sample}\label{fig:sample_delete_prop}} \\
+ \caption{DE-WSS Design Space Exploration II}
+ \label{fig:parameter-sweeps2}
+\end{figure*}
+
+Figures~\ref{fig:insert_delete_prop} and \ref{fig:sample_delete_prop} show the
+cost of maintaining $\delta$ with a base delete rate of 25\%. The low cost of
+an in-memory sampling rejection results in only a slight upward trend in the
+sampling latency as the number of deleted records increases. While compaction
+is necessary to avoid pathological cases, there does not seem to be a
+significant benefit to aggressive compaction thresholds.
+Figure~\ref{fig:insert_delete_prop} shows the effect of compactions on insert
+performance. There is little effect on performance under tagging, but there is
+a clear negative performance trend associated with aggressive compaction when
+using tombstones. Under tagging, a single compaction is guaranteed to remove
+all deleted records on a level, whereas with tombstones a compaction can
+cascade for multiple levels before the delete bound is satisfied, resulting in
+a larger cost per incident.
+
+\begin{figure*}
+ \centering
+ \subfloat[Sampling Latency vs. Sample Size]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-ps-wss-samplesize} \label{fig:sample_k}}
+ \subfloat[Per 1000 Sampling Latency vs. Bloom Filter Memory]{\includegraphics[width=.5\textwidth]{img/sigmod23/plot/fig-ps-wss-bloom}\label{fig:bloom}} \\
+ \caption{DE-WSS Design Space Exploration III}
+ \label{fig:parameter-sweeps3}
+\end{figure*}
+
+Figure~\ref{fig:bloom} demonstrates the trade-off between memory usage for
+Bloom filters and sampling performance under tombstones. This test was run
+using 25\% incoming deletes with no compaction, to maximize the number of
+tombstones within the index as a worst-case scenario. As expected, allocating
+more memory to Bloom filters, decreasing their false positive rates,
+accelerates sampling. Finally, Figure~\ref{fig:sample_k} shows the relationship
+between average per sample latency and the sample set size. It shows the effect
+of amortizing the initial shard alias setup work across an increasing number of
+samples, with $k=100$ as the point at which latency levels off.
+
+Based upon these results, a set of parameters was established for the extended
+indexes, which is used in the next section for baseline comparisons. This
+standard configuration uses tagging as the delete policy and tiering as the
+layout policy, with $k=1000$, $N_b = 12000$, $\delta = 0.05$, and $s = 6$.
diff --git a/chapters/sigmod23/experiment.tex b/chapters/sigmod23/experiment.tex
new file mode 100644
index 0000000..75cf32e
--- /dev/null
+++ b/chapters/sigmod23/experiment.tex
@@ -0,0 +1,48 @@
+\section{Evaluation}
+\label{sec:experiment}
+
+\Paragraph{Experimental Setup.} All experiments were run under Ubuntu 20.04 LTS
+on a dual-socket Intel Xeon Gold 6242R server with 384 GiB of physical memory
+and 40 physical cores. External tests were run using a 4 TB WD Red SA500 SATA
+SSD, rated for 95000 and 82000 IOPS for random reads and writes respectively.
+
+\Paragraph{Datasets.} Testing utilized a variety of synthetic and real-world
+datasets. For all datasets used, the key was represented as a 64-bit integer,
+the weight as a 64-bit integer, and the value as a 32-bit integer. Each record
+also contained a 32-bit header. The weight was omitted from IRS testing.
+Keys and weights were pulled from the dataset directly, and values were
+generated separately and were unique for each record. The following datasets
+were used,
+\begin{itemize}
+\item \textbf{Synthetic Uniform.} A non-weighted, synthetically generated list
+ of keys drawn from a uniform distribution.
+\item \textbf{Synthetic Zipfian.} A non-weighted, synthetically generated list
+ of keys drawn from a Zipfian distribution with
+ a skew of $0.8$.
+\item \textbf{Twitter~\cite{data-twitter,data-twitter1}.} $41$ million Twitter user ids, weighted by follower counts.
+\item \textbf{Delicious~\cite{data-delicious}.} $33.7$ million URLs, represented using unique integers,
+ weighted by the number of associated tags.
+\item \textbf{OSM~\cite{data-osm}.} $2.6$ billion geospatial coordinates for points
+ of interest, collected by OpenStreetMap. The latitude, converted
+ to a 64-bit integer, was used as the key and the number of
+ its associated semantic tags as the weight.
+\end{itemize}
+The synthetic datasets were not used for weighted experiments, as they do not
+have weights. For unweighted experiments, the Twitter and Delicious datasets
+were not used, as they have uninteresting key distributions.
+
+\Paragraph{Compared Methods.} In this section, indexes extended using the
+framework are compared against existing dynamic baselines. Specifically, DE-WSS
+(Section~\ref{ssec:wss-struct}), DE-IRS (Section~\ref{ssec:irs-struct}), and
+DE-WIRS (Section~\ref{ssec:wirs-struct}) are examined. In-memory extensions are
+compared against the B+tree with aggregate weight tags on internal nodes (AGG
+B+tree) \cite{olken95} and concurrent and external extensions are compared
+against the AB-tree \cite{zhao22}. Sampling performance is also compared against
+comparable static sampling indexes: the alias structure \cite{walker74} for WSS,
+the in-memory ISAM tree for IRS, and the alias-augmented B+tree \cite{afshani17}
+for WIRS. Note that all structures under test, with the exception of the
+external DE-IRS and external AB-tree, were contained entirely within system
+memory. All benchmarking code and data structures were implemented using C++17
+and compiled using gcc 11.3.0 at the \texttt{-O3} optimization level. The
+extension framework itself, excluding the shard implementations and utility
+headers, consisted of a header-only library of about 1200 SLOC.
diff --git a/chapters/sigmod23/extensions.tex b/chapters/sigmod23/extensions.tex
new file mode 100644
index 0000000..6c242e9
--- /dev/null
+++ b/chapters/sigmod23/extensions.tex
@@ -0,0 +1,57 @@
+\captionsetup[subfloat]{justification=centering}
+\section{Extensions}
+\label{sec:discussion}
+In this section, various extensions of the framework are considered.
+Specifically, the applicability of the framework to external or distributed
+data structures is discussed, as well as the use of the framework to add
+automatic support for concurrent updates and sampling to extended SSIs.
+
+\Paragraph{Larger-than-Memory Data.} This framework can be applied to external
+static sampling structures with minimal modification. As a proof-of-concept,
+the IRS structure was extended with support for shards containing external ISAM
+trees. This structure supports storing a configurable number of shards in
+memory, and the rest on disk, making it well suited for operating in
+memory-constrained environments. The on-disk shards contain standard ISAM
+trees, with $8\text{KiB}$ page-aligned nodes. The external version of the
+index only supports tombstone-based deletes, as tagging would require random
+writes. In principle a hybrid approach to deletes is possible, where a delete
+first searches the in-memory data for the record to be deleted, tagging it if
+found. If the record is not found, then a tombstone could be inserted. As the
+data size grows, though, and the preponderance of data is found on disk, this
+approach would largely revert to the standard tombstone approach in practice.
+External settings make the framework even more attractive, in terms of
+performance characteristics, due to the different cost model. In external data
+structures, performance is typically measured in terms of the number of IO
+operations, meaning that much of the overhead introduced by the framework for
+tasks like querying the mutable buffer, building auxiliary structures, extra
+random number generations due to the shard alias structure, and the like,
+become far less significant.
+
+Because the framework maintains immutability of shards, it is also well suited for
+use on top of distributed file-systems or with other distributed data
+abstractions like RDDs in Apache Spark~\cite{rdd}. Each shard can be
+encapsulated within an immutable file in HDFS or an RDD in Spark. A centralized
+control node or driver program can manage the mutable buffer, flushing it into
+a new file or RDD when it is full, merging with existing files or RDDs using
+the same reconstruction scheme already discussed for the framework. This setup
+allows for datasets exceeding the capacity of a single node to be supported. As
+an example, XDB~\cite{li19} features an RDD-based distributed sampling
+structure that could be supported by this framework.
+
+\Paragraph{Concurrency.} The immutability of the majority of the structures
+within the index makes for a straightforward concurrency implementation.
+Concurrency control on the buffer is made trivial by the fact that it is a simple,
+unsorted array. The rest of the structure is never updated (aside from possible
+delete tagging), and so concurrency becomes a simple matter of delaying the
+freeing of memory used by internal structures until all threads accessing
+them have exited, rather than freeing it immediately on merge completion. A very basic
+concurrency implementation can be achieved by using the tombstone delete
+policy, and a reference counting scheme to control the deletion of the shards
+following reconstructions. Multiple insert buffers can be used to improve
+insertion throughput, as this will allow inserts to proceed in parallel with
+merges, ultimately allowing concurrency to scale up to the point of being
+bottlenecked by memory bandwidth and available storage. This proof-of-concept
+implementation is based on a simplified version of an approach proposed by
+Golan-Gueta et al. for concurrent log-structured data stores
+\cite{golan-gueta15}.
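+
+A minimal sketch of this delayed-freeing idea, expressed with reference
+counting, is shown below. It is a simplification for illustration only (the
+names, the versioned shard-list snapshot, and the interface are assumptions,
+not the actual implementation): readers pin the current version of the shard
+list, and memory for replaced shards is reclaimed only once the last reader of
+the old version releases its reference.
+\begin{verbatim}
+#include <memory>
+#include <utility>
+#include <vector>
+
+struct Shard;  // an immutable SSI plus auxiliary structures (opaque here)
+
+// A snapshot of the extension structure: the set of live shards.
+struct Version {
+    std::vector<std::shared_ptr<const Shard>> shards;
+};
+
+class ExtensionState {
+    std::shared_ptr<const Version> current_;
+public:
+    // Sampling threads pin the current version; the shards it references
+    // remain alive for as long as the returned pointer is held.
+    std::shared_ptr<const Version> pin() const {
+        return std::atomic_load(&current_);
+    }
+    // The merge thread installs the post-reconstruction version; old
+    // shards are freed when the last pinned reference is dropped.
+    void install(std::shared_ptr<const Version> next) {
+        std::atomic_store(&current_, std::move(next));
+    }
+};
+\end{verbatim}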
+
diff --git a/chapters/sigmod23/framework.tex b/chapters/sigmod23/framework.tex
new file mode 100644
index 0000000..32a32e1
--- /dev/null
+++ b/chapters/sigmod23/framework.tex
@@ -0,0 +1,573 @@
+\section{Dynamic Sampling Index Framework}
+\label{sec:framework}
+
+This work is an attempt to design a solution to independent sampling
+that achieves \emph{both} efficient updates and near-constant cost per
+sample. As the goal is to tackle the problem in a generalized fashion,
+rather than design problem-specific data structures for use as the basis
+of an index, a framework is created that allows for already
+existing static data structures to be used as the basis for a sampling
+index, by automatically adding support for data updates using a modified
+version of the Bentley-Saxe method.
+
+Unfortunately, Bentley-Saxe as described in Section~\ref{ssec:bsm} cannot be
+directly applied to sampling problems. The concept of decomposability is not
+cleanly applicable to sampling, because the distribution of records in the
+result set, rather than the records themselves, must be matched following the
+result merge. Efficiently controlling the distribution requires each sub-query
+to access information external to the structure against which it is being
+processed, a contingency unaccounted for by Bentley-Saxe. Further, the process
+of reconstruction used in Bentley-Saxe provides poor worst-case complexity
+bounds~\cite{saxe79}, and attempts to modify the procedure to provide better
+worst-case performance are complex and have worse performance in the common
+case~\cite{overmars81}. Despite these limitations, this chapter will argue that
+the core principles of the Bentley-Saxe method can be profitably applied to
+sampling indexes, once a system for controlling result set distributions and a
+more effective reconstruction scheme have been devised. The solution to
+the former will be discussed in Section~\ref{ssec:sample}. For the latter,
+inspiration is drawn from the literature on the LSM tree.
+
+The LSM tree~\cite{oneil96} is a data structure proposed to optimize
+write throughput in disk-based storage engines. It consists of a memory
+table of bounded size, used to buffer recent changes, and a hierarchy
+of external levels containing indexes of exponentially increasing
+size. When the memory table has reached capacity, it is emptied into the
+external levels. Random writes are avoided by treating the data within
+the external levels as immutable; all writes go through the memory
+table. This introduces write amplification but maximizes sequential
+writes, which is important for maintaining high throughput in disk-based
+systems. The LSM tree is associated with a broad and well studied design
+space~\cite{dayan17,dayan18,dayan22,balmau19,dayan18-1} containing
+trade-offs between three key performance metrics: read performance, write
+performance, and auxiliary memory usage. The challenges
+faced in reconstructing predominantly in-memory indexes are quite
+different from those which the LSM tree is intended
+to address, having little to do with disk-based systems and sequential IO
+operations. But, the LSM tree possesses a rich design space for managing
+the periodic reconstruction of data structures in a manner that is both
+more practical and more flexible than that of Bentley-Saxe. By borrowing
+from this design space, this preexisting body of work can be leveraged,
+and many of Bentley-Saxe's limitations addressed.
+
+\captionsetup[subfloat]{justification=centering}
+
+\begin{figure*}
+ \centering
+ \subfloat[Leveling]{\includegraphics[width=.75\textwidth]{img/sigmod23/merge-leveling} \label{fig:leveling}}\\
+ \subfloat[Tiering]{\includegraphics[width=.75\textwidth]{img/sigmod23/merge-tiering} \label{fig:tiering}}
+
+ \caption{\textbf{A graphical overview of the sampling framework and its insert procedure.} A
+ mutable buffer (MB) sits atop two levels (L0, L1) containing shards (pairs
+ of SSIs and auxiliary structures [A]) using the leveling
+ (Figure~\ref{fig:leveling}) and tiering (Figure~\ref{fig:tiering}) layout
+ policies. Records are represented as black/colored squares, and grey
+ squares represent unused capacity. An insertion requiring a multi-level
+ reconstruction is illustrated.} \label{fig:framework}
+
+\end{figure*}
+
+
+\subsection{Framework Overview}
+The goal of this chapter is to build a general framework that extends most SSIs
+with efficient support for updates by splitting the index into small data structures
+to reduce reconstruction costs, and then distributing the sampling process over these
+smaller structures.
+The framework is designed to work efficiently with any SSI, so
+long as it has the following properties,
+\begin{enumerate}
+ \item The underlying full query $Q$ supported by the SSI from whose results
+ samples are drawn satisfies the following property:
+ for any dataset $D = \cup_{i = 1}^{n}D_i$
+ where $D_i \cap D_j = \emptyset$, $Q(D) = \cup_{i = 1}^{n}Q(D_i)$.
+ \item \emph{(Optional)} The SSI supports efficient point-lookups.
+ \item \emph{(Optional)} The SSI is capable of efficiently reporting the total weight of all records
+ returned by the underlying full query.
+\end{enumerate}
+
+The first property applies to the query being sampled from, and is essential
+for the correctness of sample sets reported by extended sampling
+indexes.\footnote{ This condition is stricter than the definition of a
+decomposable search problem in the Bentley-Saxe method, which allows for
+\emph{any} constant-time merge operation, not just union.
+However, this condition is satisfied by many common types of database
+query, such as predicate-based filtering queries.} The latter two properties
+are optional, but reduce deletion and sampling costs respectively. Should the
+SSI fail to support point-lookups, an auxiliary hash table can be attached to
+the data structures.
+Should it fail to support query result weight reporting, rejection
+sampling can be used in place of the more efficient scheme discussed in
+Section~\ref{ssec:sample}. The analysis of this framework will generally
+assume that all three conditions are satisfied.
+
+Given an SSI with these properties, a dynamic extension can be produced as
+shown in Figure~\ref{fig:framework}. The extended index consists of disjoint
+shards containing an instance of the SSI being extended, and optional auxiliary
+data structures. The auxiliary structures allow acceleration of certain
+operations that are required by the framework, but which the SSI being extended
+does not itself support efficiently. Examples of possible auxiliary structures
+include hash tables, Bloom filters~\cite{bloom70}, and range
+filters~\cite{zhang18,siqiang20}. The shards are arranged into levels of
+increasing record capacity, with either one shard, or up to a fixed maximum
+number of shards, per level. The decision to place one or many shards per level
+is called the \emph{layout policy}. The policy names are borrowed from the
+literature on the LSM tree, with the former called \emph{leveling} and the
+latter called \emph{tiering}.
+
+To avoid a reconstruction on every insert, an unsorted array of fixed capacity
+($N_b$), called the \emph{mutable buffer}, is used to buffer updates. Because it is
+unsorted, it is kept small to maintain reasonably efficient sampling
+and point-lookup performance. All updates are performed by appending new
+records to the tail of this buffer.
+If a record currently within the index is
+to be updated to a new value, it must first be deleted, and then a record with
+the new value inserted. This ensures that old versions of records are properly
+filtered from query results.
+
+When the buffer is full, it is flushed to make room for new records. The
+flushing procedure is based on the layout policy in use. When using leveling
+(Figure~\ref{fig:leveling}) a new SSI is constructed using both the records in
+$L_0$ and those in the buffer. This is used to create a new shard, which
+replaces the one previously in $L_0$. When using tiering
+(Figure~\ref{fig:tiering}) a new shard is built using only the records from the
+buffer, and placed into $L_0$ without altering the existing shards. Each level
+has a record capacity of $N_b \cdot s^{i+1}$, controlled by a configurable
+parameter, $s$, called the scale factor. Records are organized in one large
+shard under leveling, or in $s$ shards of $N_b \cdot s^i$ capacity each under
+tiering. When a level reaches its capacity, it must be emptied to make room for
+the records flushed into it. This is accomplished by moving its records down to
+the next level of the index. Under leveling, this requires constructing a new
+shard containing all records from both the source and target levels, and
+placing this shard into the target, leaving the source empty. Under tiering,
+the shards in the source level are combined into a single new shard that is
+placed into the target level. Should the target be full, it is first emptied by
+applying the same procedure. New empty levels
+are dynamically added as necessary to accommodate these reconstructions.
+Note that shard reconstructions are not necessarily performed using
+merging, though merging can be used as an optimization of the reconstruction
+procedure where such an algorithm exists. In general, reconstruction requires
+only pooling the records of the shards being combined and then applying the SSI's
+standard construction algorithm to this set of records.
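+
+The capacity rules described above can be summarized by the following small
+C++ sketch (with illustrative names), which computes the record capacity of
+level $i$ and the number of shards it may hold under each layout policy.
+\begin{verbatim}
+#include <cstddef>
+
+enum class LayoutPolicy { Leveling, Tiering };
+
+// Level i holds at most N_b * s^(i+1) records in total.
+std::size_t level_capacity(std::size_t N_b, std::size_t s, std::size_t i) {
+    std::size_t cap = N_b;
+    for (std::size_t j = 0; j <= i; j++) cap *= s;
+    return cap;
+}
+
+// One shard per level under leveling; up to s shards, each of
+// N_b * s^i records, under tiering.
+std::size_t max_shards(LayoutPolicy p, std::size_t s) {
+    return p == LayoutPolicy::Leveling ? 1 : s;
+}
+\end{verbatim}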
+
+\begin{table}[t]
+\caption{Frequently Used Notation}
+\centering
+
+\begin{tabular}{|p{2.5cm} p{5cm}|}
+ \hline
+ \textbf{Variable} & \textbf{Description} \\ \hline
+ $N_b$ & Capacity of the mutable buffer \\ \hline
+ $s$ & Scale factor \\ \hline
+ $C_c(n)$ & SSI initial construction cost \\ \hline
+ $C_r(n)$ & SSI reconstruction cost \\ \hline
+ $L(n)$ & SSI point-lookup cost \\ \hline
+ $P(n)$ & SSI sampling pre-processing cost \\ \hline
+ $S(n)$ & SSI per-sample sampling cost \\ \hline
+ $W(n)$ & Shard weight determination cost \\ \hline
+ $R(n)$ & Shard rejection check cost \\ \hline
+ $\delta$ & Maximum delete proportion \\ \hline
+ %$\rho$ & Maximum rejection rate \\ \hline
+\end{tabular}
+\label{tab:nomen}
+
+\end{table}
+
+Table~\ref{tab:nomen} lists frequently used notation for the various parameters
+of the framework, which will be used in the coming analysis of the costs and
+trade-offs associated with operations within the framework's design space. The
+remainder of this section will discuss the performance characteristics of
+insertion into this structure (Section~\ref{ssec:insert}), how it can be used
+to correctly answer sampling queries (Section~\ref{ssec:insert}), and efficient
+approaches for supporting deletes (Section~\ref{ssec:delete}). Finally, it will
+close with a detailed discussion of the trade-offs within the framework's
+design space (Section~\ref{ssec:design-space}).
+
+
+\subsection{Insertion}
+\label{ssec:insert}
+The framework supports inserting new records by first appending them to the end
+of the mutable buffer. When it is full, the buffer is flushed into a sequence
+of levels containing shards of increasing capacity, using a procedure
+determined by the layout policy as discussed in Section~\ref{sec:framework}.
+This method allows for the cost of repeated shard reconstruction to be
+effectively amortized.
+
+Let the cost of constructing the SSI from an arbitrary set of $n$ records be
+$C_c(n)$ and the cost of reconstructing the SSI given two or more shards
+containing $n$ records in total be $C_r(n)$. The cost of an insert is composed
+of three parts: appending to the mutable buffer, constructing a new
+shard from the buffered records during a flush, and the total cost of
+reconstructing shards containing the record over the lifetime of the index. The
+cost of appending to the mutable buffer is constant, and the cost of constructing a
+shard from the buffer can be amortized across the records participating in the
+buffer flush, giving $\nicefrac{C_c(N_b)}{N_b}$. These costs are paid exactly once for
+each record. To derive an expression for the cost of repeated reconstruction,
+first note that each record will participate in at most $s$ reconstructions on
+a given level, resulting in a worst-case amortized cost of $O\left(s\cdot
+\nicefrac{C_r(n)}{n}\right)$ paid per level. The index itself will contain at most
+$\log_s n$ levels. Thus, over the lifetime of the index a given record
+will pay $O\left(s\cdot \nicefrac{C_r(n)}{n}\log_s n\right)$ cost in repeated
+reconstruction.
+
+Combining these results, the total amortized insertion cost is
+\begin{equation}
+O\left(\frac{C_c(N_b)}{N_b} + s \cdot \frac{C_r(n)}{n} \log_s n\right)
+\end{equation}
+This can be simplified by noting that $s$ is constant, and that $N_b$ is also a
+constant with $N_b \ll n$. By neglecting these terms, the amortized insertion cost of the
+framework is,
+\begin{equation}
+O\left(\frac{C_r(n)}{n}\log_s n\right)
+\end{equation}
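+For the SSIs considered in Section~\ref{sec:instance}, which can be
+reconstructed from existing shards in $C_r(n) \in O(n)$ time, this expression
+reduces to
+\begin{equation*}
+O\left(\frac{n}{n}\log_s n\right) = O\left(\log_s n\right)
+\end{equation*}
+which matches the insertion costs derived for those structures in that section.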
+
+
+\subsection{Sampling}
+\label{ssec:sample}
+
+\begin{figure}
+ \centering
+ \includegraphics[width=\textwidth]{img/sigmod23/sampling}
+ \caption{\textbf{Overview of the multiple-shard sampling query process} for
+ Example~\ref{ex:sample} with $k=1000$. First, (1) the normalized weights of
+    the shards are determined, then (2) these weights are used to construct an
+ alias structure. Next, (3) the alias structure is queried $k$ times to
+ determine per shard sample sizes, and then (4) sampling is performed.
+ Finally, (5) any rejected samples are retried starting from the alias
+ structure, and the process is repeated until the desired number of samples
+ has been retrieved.}
+ \label{fig:sample}
+
+\end{figure}
+
+For many SSIs, sampling queries are completed in two stages. Some preliminary
+processing is done to identify the range of records from which to sample, and then
+samples are drawn from that range. For example, IRS over a sorted list of
+records can be performed by first identifying the upper and lower bounds of the
+query range in the list, and then sampling records by randomly generating
+indexes within those bounds. The general cost of a sampling query can be
+modeled as $P(n) + k S(n)$, where $P(n)$ is the cost of preprocessing, $k$ is
+the number of samples drawn, and $S(n)$ is the cost of sampling a single
+record.
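+
+As a concrete illustration of this cost model, the following minimal sketch
+(illustrative only, assuming a simple sorted array of keys) implements IRS in
+exactly this two-stage form: the preprocessing $P(n)$ is a pair of binary
+searches for the query bounds, and each of the $k$ samples costs $S(n) \in O(1)$.
+\begin{verbatim}
+import bisect, random
+
+def irs_sample(sorted_keys, low, high, k):
+    # P(n): locate the bounds of [low, high] via binary search
+    lo = bisect.bisect_left(sorted_keys, low)
+    hi = bisect.bisect_right(sorted_keys, high)
+    if lo >= hi:
+        return []
+    # k * S(n): each sample is a single random index draw
+    return [sorted_keys[random.randrange(lo, hi)] for _ in range(k)]
+
+# e.g., irs_sample(list(range(1000)), 250, 500, 5)
+\end{verbatim}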
+
+When sampling from multiple shards, the situation grows more complex. For each
+sample, the shard to select the record from must first be decided. Consider an
+arbitrary sampling query $X(D, k)$ asking for a sample set of size $k$ against
+dataset $D$. The framework splits $D$ across $m$ disjoint shards, such that $D
+= \bigcup_{i=1}^m D_i$ and $D_i \cap D_j = \emptyset$ for all $i \neq j$. The
+framework must ensure that $X(D, k)$ and $\bigcup_{i=1}^m X(D_i, k_i)$ follow
+the same distribution, by selecting appropriate values for the $k_i$s. If care
+is not taken to balance the number of samples drawn from a shard with the total
+weight of the shard under $X$, then bias can be introduced into the sample
+set's distribution. The selection of $k_i$s can be viewed as an instance of WSS,
+and solved using the alias method.
+
+When sampling using the framework, first the weight of each shard under the
+sampling query is determined and a \emph{shard alias structure} built over
+these weights. Then, for each sample, the shard alias is used to
+determine the shard from which to draw the sample. Let $W(n)$ be the cost of
+determining this total weight for a single shard under the query. The initial setup
+cost, prior to drawing any samples, will be $O\left([W(n) + P(n)]\log_s
+n\right)$, as the preliminary sampling work must be performed for each shard,
+its weight determined, and the shard alias structure constructed. In
+many cases, however, the preliminary work will also determine the total weight,
+and so the relevant operation need only be applied once to accomplish both
+tasks.
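+
+The shard-selection step can be sketched as follows. This is an illustrative
+example only: the per-shard weights are assumed to have already been computed
+(the $W(n)$ term), and the alias structure is built with Vose's method, after
+which each of the $k$ draws selects a shard in constant time.
+\begin{verbatim}
+import random
+
+def build_alias(weights):
+    # Vose's method: O(m) construction over the m shard weights
+    m = len(weights)
+    total = float(sum(weights))
+    prob = [w * m / total for w in weights]
+    alias = [0] * m
+    small = [i for i, p in enumerate(prob) if p < 1.0]
+    large = [i for i, p in enumerate(prob) if p >= 1.0]
+    while small and large:
+        s, l = small.pop(), large.pop()
+        alias[s] = l
+        prob[l] -= 1.0 - prob[s]
+        (small if prob[l] < 1.0 else large).append(l)
+    return prob, alias
+
+def draw_shard(prob, alias):
+    # O(1) per draw: pick a column, then keep it or take its alias
+    i = random.randrange(len(prob))
+    return i if random.random() < prob[i] else alias[i]
+
+def assign_samples(weights, k):
+    # counts[i] plays the role of k_i in the text
+    prob, alias = build_alias(weights)
+    counts = [0] * len(weights)
+    for _ in range(k):
+        counts[draw_shard(prob, alias)] += 1
+    return counts
+\end{verbatim}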
+
+To ensure that all records appear in the sample set with the appropriate
+probability, the mutable buffer itself must also be a valid target for
+sampling. There are two general techniques for this, both of which are
+supported by the framework. The query being sampled
+from can be directly executed against the buffer and the result set used to
+build a temporary SSI, which can be sampled from. Alternatively, rejection
+sampling can be used to sample directly from the buffer, without executing the
+query. In this case, the total weight of the buffer is used for its entry in
+the shard alias structure. This can result in the buffer being
+over-represented in the shard selection process, and so any rejections during
+buffer sampling must be retried starting from shard selection. These same
+considerations apply to rejection sampling used against shards, as well.
+
+
+\begin{example}
+ \label{ex:sample}
+ Consider executing a WSS query, with $k=1000$, across three shards
+ containing integer keys with unit weight. $S_1$ contains only the
+ key $-2$, $S_2$ contains all integers on $[1,100]$, and $S_3$
+ contains all integers on $[101, 200]$. These structures are shown
+ in Figure~\ref{fig:sample}. Sampling is performed by first
+ determining the normalized weights for each shard: $w_1 = 0.005$,
+ $w_2 = 0.4975$, $w_3 = 0.4975$, which are then used to construct a
+ shard alias structure. The shard alias structure is then queried
+ $k$ times, resulting in a distribution of $k_i$s that is
+ commensurate with the relative weights of each shard. Finally,
+ each shard is queried in turn to draw the appropriate number
+ of samples.
+\end{example}
+
+
+Assuming that rejection sampling is used on the mutable buffer, the worst-case
+time complexity for drawing $k$ samples from an index containing $n$ elements
+with a sampling cost of $S(n)$ is,
+\begin{equation}
+ \label{eq:sample-cost}
+ O\left(\left[W(n) + P(n)\right]\log_s n + kS(n)\right)
+\end{equation}
+
+%If instead a temporary SSI is constructed, the cost of sampling
+%becomes: $O\left(N_b + C_c(N_b) + (W(n) + P(n))\log_s n + kS(n)\right)$.
+
+\begin{figure}
+ \centering
+ \subfloat[Tombstone Rejection Check]{\includegraphics[width=.75\textwidth]{img/sigmod23/delete-tombstone} \label{fig:delete-tombstone}}\\
+ \subfloat[Tagging Rejection Check]{\includegraphics[width=.75\textwidth]{img/sigmod23/delete-tagging} \label{fig:delete-tag}}
+
+ \caption{\textbf{Overview of the rejection check procedure for deleted records.} First,
+ a record is sampled (1).
+ When using the tombstone delete policy
+ (Figure~\ref{fig:delete-tombstone}), the rejection check starts by (2) querying
+ the bloom filter of the mutable buffer. The filter indicates the record is
+ not present, so (3) the filter on $L_0$ is queried next. This filter
+ returns a false positive, so (4) a point-lookup is executed against $L_0$.
+ The lookup fails to find a tombstone, so the search continues and (5) the
+ filter on $L_1$ is checked, which reports that the tombstone is present.
+ This time, it is not a false positive, and so (6) a lookup against $L_1$
+ (7) locates the tombstone. The record is thus rejected. When using the
+ tagging policy (Figure~\ref{fig:delete-tag}), (1) the record is sampled and
+ (2) checked directly for the delete tag. It is set, so the record is
+ immediately rejected.}
+
+ \label{fig:delete}
+
+\end{figure}
+
+
+\subsection{Deletion}
+\label{ssec:delete}
+
+Because the shards are static, records cannot be arbitrarily removed from them.
+This requires that deletes be supported in some other way, with the ultimate
+goal being the prevention of deleted records' appearance in sampling query
+result sets. This can be realized in two ways: locating the record and marking
+it, or inserting a new record which indicates that an existing record should be
+treated as deleted. The framework supports both of these techniques, the
+selection of which is called the \emph{delete policy}. The former policy is
+called \emph{tagging} and the latter \emph{tombstone}.
+
+Tagging a record is straightforward. Point-lookups are performed against each
+shard in the index, as well as the buffer, for the record to be deleted. When
+it is found, a bit in a header attached to the record is set. When sampling,
+any records selected with this bit set are automatically rejected. Tombstones
+represent a lazy strategy for deleting records. When a record is deleted using
+tombstones, a new record with identical key and value, but with a ``tombstone''
+bit set, is inserted into the index. A record's presence can be checked by
+performing a point-lookup. If a tombstone with the same key and value exists
+above the record in the index, then it should be rejected when sampled.
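+
+The two policies can be summarized with the following sketch. This is
+illustrative only: \texttt{point\_lookup} is a hypothetical stand-in for the
+$L(n)$ operation of the underlying SSI, and the header bits are placeholders.
+\begin{verbatim}
+TOMBSTONE = 0x1
+DELETED   = 0x2
+
+class Record:
+    def __init__(self, key, value, header=0):
+        self.key, self.value, self.header = key, value, header
+
+def delete_by_tagging(buffer, levels, key, value):
+    # Point-lookup in the buffer and each shard, then set a bit in place.
+    for container in [buffer] + levels:
+        rec = container.point_lookup(key, value)   # hypothetical SSI operation
+        if rec is not None:
+            rec.header |= DELETED
+            return True
+    return False
+
+def delete_by_tombstone(extension, key, value):
+    # Simply an insert: a new record with the tombstone bit set.
+    extension.insert(Record(key, value, header=TOMBSTONE))
+\end{verbatim}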
+
+Two important aspects of performance are pertinent when discussing deletes: the
+cost of the delete operation, and the cost of verifying the presence of a
+sampled record. The choice of delete policy represents a trade-off between
+these two costs. Beyond this simple trade-off, the delete policy also has other
+implications that can affect its applicability to certain types of SSI. Most
+notably, tombstones do not require any in-place updating of records, whereas
+tagging does. This means that using tombstones is the only way to ensure total
+immutability of the data within shards, which avoids random writes and eases
+concurrency control. The tombstone delete policy, then, is particularly
+appealing in external and concurrent contexts.
+
+\Paragraph{Deletion Cost.} The cost of a delete under the tombstone policy is
+the same as an ordinary insert. Tagging, by contrast, requires a point-lookup
+of the record to be deleted, and so is more expensive. Assuming a point-lookup
+operation with cost $L(n)$, a tagged delete must search each level in the
+index, as well as the buffer, requiring $O\left(N_b + L(n)\log_s n\right)$
+time.
+
+\Paragraph{Rejection Check Costs.} In addition to the cost of the delete
+itself, the delete policy affects the cost of determining if a given record has
+been deleted. This is called the \emph{rejection check cost}, $R(n)$. When
+using tagging, the information necessary to make the rejection decision is
+local to the sampled record, and so $R(n) \in O(1)$. However, when using tombstones
+it is not; a point-lookup must be performed to search for a given record's
+corresponding tombstone. This look-up must examine the buffer, and each shard
+within the index. This results in a rejection check cost of $R(n) \in O\left(N_b +
+L(n) \log_s n\right)$. The rejection check process for the two delete policies is
+summarized in Figure~\ref{fig:delete}.
+
+Two factors contribute to the tombstone rejection check cost: the size of the
+buffer, and the cost of performing a point-lookup against the shards. The
+latter cost can be controlled using the framework's ability to associate
+auxiliary structures with shards. For SSIs which do not support efficient
+point-lookups, a hash table can be added to map key-value pairs to their
+location within the SSI. This allows for constant-time rejection checks, even
+in situations where the index would not otherwise support them. However, the
+storage cost of this intervention is high, and in situations where the SSI does
+support efficient point-lookups, it is not necessary. Further performance
+improvements can be achieved by noting that the probability of a given record
+having an associated tombstone in any particular shard is relatively small.
+This means that many point-lookups will be executed against shards that do not
+contain the tombstone being searched for. In this case, these unnecessary
+lookups can be partially avoided using Bloom filters~\cite{bloom70} for
+tombstones. By inserting tombstones into these filters during reconstruction,
+point-lookups against some shards which do not contain the tombstone being
+searched for can be bypassed. Filters can be attached to the buffer as well,
+which may be even more significant due to the linear cost of scanning it. As
+the goal is a reduction of rejection check costs, these filters need only be
+populated with tombstones. In a later section, techniques for bounding the
+number of tombstones on a given level are discussed, which will allow for the
+memory usage of these filters to be tightly controlled while still ensuring
+precise bounds on filter error.
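+
+A sketch of the tombstone rejection check, using these per-level filters, is
+shown below. It is illustrative only: the Bloom filter and point-lookup
+interfaces are hypothetical stand-ins, and the buffer is assumed to be a plain
+list of records.
+\begin{verbatim}
+def is_deleted(buffer, levels, rec, rec_level):
+    # Rejection check R(n) under the tombstone policy. A tombstone can only
+    # appear above the record it deletes, so only the buffer and the levels
+    # preceding rec_level need to be searched.
+    for ts in buffer:                                    # O(N_b) scan
+        if ts.is_tombstone and (ts.key, ts.value) == (rec.key, rec.value):
+            return True
+    for level in levels[:rec_level]:
+        if not level.bloom_filter.may_contain(rec.key, rec.value):
+            continue                                     # skip a pointless lookup
+        ts = level.point_lookup(rec.key, rec.value, tombstone=True)   # L(n)
+        if ts is not None:
+            return True
+    return False
+\end{verbatim}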
+
+\Paragraph{Sampling with Deletes.} The addition of deletes to the framework
+alters the analysis of sampling costs. A record that has been deleted cannot
+be present in the sample set, and therefore the presence of each sampled record
+must be verified. If a record has been deleted, it must be rejected. When
+retrying samples rejected due to deletes, the process must restart from shard
+selection, as deleted records may be counted in the weight totals used to
+construct that structure. This increases the cost of sampling to,
+\begin{equation}
+\label{eq:sampling-cost}
+	O\left(\left[W(n) + P(n)\right]\log_s n + \frac{k}{1 - \mathbf{Pr}[\text{rejection}]} \cdot \left[S(n) + R(n)\right]\right)
+\end{equation}
+where $R(n)$ is the cost of checking whether a sampled record has been deleted, and
+$\nicefrac{k}{1 -\mathbf{Pr}[\text{rejection}]}$ is the expected number of sampling
+attempts required to obtain $k$ samples, given a fixed rejection probability;
+each attempt requires both drawing a sample and performing a rejection check.
+The rejection probability itself is a function of the workload, and is
+unbounded.
+
+\Paragraph{Bounding the Rejection Probability.} Rejections during sampling
+constitute wasted memory accesses and random number generations, and so steps
+should be taken to minimize their frequency. The probability of a rejection is
+directly related to the number of deleted records, which is itself a function
+of workload and dataset. This means that, without building counter-measures
+into the framework, tight bounds on sampling performance cannot be provided in
+the presence of deleted records. It is therefore critical that the framework
+support some method for bounding the number of deleted records within the
+index.
+
+While the static nature of shards prevents the direct removal of records at the
+moment they are deleted, it does not prevent the removal of records during
+reconstruction. When using tagging, all tagged records encountered during
+reconstruction can be removed. When using tombstones, however, the removal
+process is non-trivial. In principle, a rejection check could be performed for
+each record encountered during reconstruction, but this would increase
+reconstruction costs and introduce a new problem of tracking tombstones
+associated with records that have been removed. Instead, a lazier approach can
+be used: delaying removal until a tombstone and its associated record
+participate in the same shard reconstruction. This delay allows both the record
+and its tombstone to be removed at the same time, an approach called
+\emph{tombstone cancellation}. In general, this can be implemented using an
+extra linear scan of the input shards before reconstruction to identify
+tombstones and associated records for cancellation, but potential optimizations
+exist for many SSIs, allowing it to be performed during the reconstruction
+itself at no extra cost.
+
+The removal of deleted records passively during reconstruction is not enough to
+bound the number of deleted records within the index. It is not difficult to
+envision pathological scenarios where deletes result in unbounded rejection
+rates, even with this mitigation in place. However, the dropping of deleted
+records does provide a useful property: any specific deleted record will
+eventually be removed from the index after a finite number of reconstructions.
+Using this fact, a bound on the number of deleted records can be enforced. A
+new parameter, $\delta$, is defined, representing the maximum proportion of
+deleted records within the index. Each level, and the buffer, tracks the number
+of deleted records it contains by counting its tagged records or tombstones.
+Following each buffer flush, the proportion of deleted records is checked
+against $\delta$. If any level is found to exceed it, then a proactive
+reconstruction is triggered, pushing its shards down into the next level. The
+process is repeated until all levels respect the bound, allowing the number of
+deleted records to be precisely controlled, which, by extension, bounds the
+rejection rate. This process is called \emph{compaction}.
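+
+The compaction check itself can be sketched as follows (illustrative only;
+per-level counts of tagged records or tombstones are assumed to be maintained
+incrementally, and \texttt{reconstruct} and \texttt{empty\_level} are
+hypothetical helpers corresponding to the reconstruction procedure above).
+\begin{verbatim}
+def enforce_delete_bound(levels, delta):
+    # Run after each buffer flush: any level whose proportion of deleted
+    # records exceeds delta is proactively reconstructed into the next level.
+    i = 0
+    while i < len(levels):
+        level = levels[i]
+        if level.record_count > 0 and \
+           level.deleted_count / level.record_count > delta:
+            if i + 1 == len(levels):
+                levels.append(empty_level())
+            # reconstruction cancels tombstones and drops tagged records
+            levels[i + 1] = reconstruct(levels[i + 1], levels[i])
+            levels[i] = empty_level()
+        i += 1
+\end{verbatim}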
+
+Assuming every record is equally likely to be sampled, this new bound can be
+applied to the analysis of sampling costs. The probability of a record being
+rejected is $\mathbf{Pr}[\text{rejection}] = \delta$. Applying this result to
+Equation~\ref{eq:sampling-cost} yields,
+\begin{equation}
+%\label{eq:sampling-cost-del}
+	O\left(\left[W(n) + P(n)\right]\log_s n + \frac{k}{1 - \delta} \cdot \left[S(n) + R(n)\right]\right)
+\end{equation}
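+As a purely illustrative calculation, if $\delta = 0.05$ then the expected
+number of attempts required to obtain $k$ samples is at most
+$\nicefrac{k}{1-0.05} \approx 1.05k$, so a modest delete bound adds only a
+small constant factor to the per-sample cost.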
+
+Asymptotically, this proactive compaction does not alter the analysis of
+insertion costs. Each record is still written at most $s$ times on each level,
+there are at most $\log_s n$ levels, and the buffer insertion and SSI
+construction costs are unchanged. As a result, the amortized insertion cost
+remains the same.
+
+This compaction strategy is based upon tombstone and record counts, and the
+bounds assume that every record is equally likely to be sampled. For certain
+sampling problems (such as WSS), there are other conditions that must be
+considered to provide a bound on the rejection rate. To account for these
+situations in a general fashion, the framework supports problem-specific
+compaction triggers that can be tailored to the SSI being used. These allow
+compactions to be triggered based on other properties, such as rejection rate
+of a level, weight of deleted records, and the like.
+
+
+\subsection{Trade-offs on Framework Design Space}
+\label{ssec:design-space}
+The framework has several tunable parameters, allowing it to be tailored for
+specific applications. This design space contains trade-offs among three major
+performance characteristics: update cost, sampling cost, and auxiliary memory
+usage. The two most significant decisions when implementing this framework are
+the selection of the layout and delete policies. The asymptotic analysis of the
+previous sections obscures some of the differences between these policies, but
+they do have significant practical performance implications.
+
+\Paragraph{Layout Policy.} The choice of layout policy represents a clear
+trade-off between update and sampling performance. Leveling
+results in fewer shards of larger size, whereas tiering results in a larger
+number of smaller shards. As a result, leveling reduces the costs associated
+with point-lookups and sampling query preprocessing by a constant factor,
+compared to tiering. However, it results in more write amplification: a given
+record may be involved in up to $s$ reconstructions on a single level, as
+opposed to the single reconstruction per level under tiering.
+
+\Paragraph{Delete Policy.} There is a trade-off between delete performance and
+sampling performance that exists in the choice of delete policy. Tagging
+requires a point-lookup when performing a delete, which is more expensive than
+the insert required by tombstones. However, it also allows constant-time
+rejection checks, unlike tombstones which require a point-lookup of each
+sampled record. In situations where deletes are common and write-throughput is
+critical, tombstones may be more useful. Tombstones are also ideal in
+situations where immutability is required, or random writes must be avoided.
+Generally speaking, however, tagging is superior when using SSIs that support
+it, because sampling rejection checks will usually be more common than deletes.
+
+\Paragraph{Mutable Buffer Capacity and Scale Factor.} The mutable buffer
+capacity and scale factor both influence the number of levels within the index,
+and by extension the number of distinct shards. Sampling and point-lookups have
+better performance with fewer shards. Smaller shards are also faster to
+reconstruct, although the same adjustments that reduce shard size also result
+in a larger number of reconstructions, so the trade-off here is less clear.
+
+The scale factor has an interesting interaction with the layout policy: when
+using leveling, the scale factor directly controls the amount of write
+amplification per level. Larger scale factors mean more time is spent
+reconstructing shards on a level, reducing update performance. Tiering does not
+have this problem and should see its update performance benefit directly from a
+larger scale factor, as this reduces the number of reconstructions.
+
+The buffer capacity also influences the number of levels, but is more
+significant in its effects on point-lookup performance: a lookup must perform a
+linear scan of the buffer. Likewise, the unstructured nature of the buffer will
+also contribute negatively to sampling performance, irrespective of which
+buffer sampling technique is used. As a result, although a large buffer will
+reduce the number of shards, it will also hurt sampling and delete (under
+tagging) performance. It is important to minimize the cost of these buffer
+scans, and so it is preferable to keep the buffer small, ideally small enough
+to fit within the CPU's L2 cache. The number of shards within the index is,
+then, better controlled by changing the scale factor, rather than the buffer
+capacity. Using a smaller buffer will result in more compactions and shard
+reconstructions; however, the empirical evaluation in Section~\ref{ssec:ds-exp}
+demonstrates that this is not a serious performance problem when a scale factor
+is chosen appropriately. When the shards are in memory, frequent small
+reconstructions do not have a significant performance penalty compared to less
+frequent, larger ones.
+
+\Paragraph{Auxiliary Structures.} The framework's support for arbitrary
+auxiliary data structures allows for memory to be traded in exchange for
+insertion or sampling performance. The use of Bloom filters for accelerating
+tombstone rejection checks has already been discussed, but many other options
+exist. Bloom filters could also be used to accelerate point-lookups for delete
+tagging, though such filters would require much more memory than tombstone-only
+ones to be effective. An auxiliary hash table could be used for accelerating
+point-lookups, or range filters such as SuRF~\cite{zhang18} or
+Rosetta~\cite{siqiang20} could be added to accelerate the pre-processing step of
+range-based queries such as IRS or WIRS.
diff --git a/chapters/sigmod23/introduction.tex b/chapters/sigmod23/introduction.tex
new file mode 100644
index 0000000..0155c7d
--- /dev/null
+++ b/chapters/sigmod23/introduction.tex
@@ -0,0 +1,20 @@
+\section{Introduction} \label{sec:intro}
+
+As a first attempt at realizing a dynamic extension framework, one of the
+non-decomposable search problems discussed in the previous chapter was
+considered: independent range sampling, along with a number of other
+independent sampling problems. These sorts of queries are important in a
+variety of contexts, including approximate query processing
+(AQP)~\cite{blinkdb,quickr,verdict,cohen23}, interactive data
+exploration~\cite{sps,xie21}, financial audit sampling~\cite{olken-thesis}, and
+feature selection for machine learning~\cite{ml-sampling}. However, they are
+not well served by existing techniques, which tend to sacrifice statistical
+independence for performance, or vice versa. In this chapter, a solution for
+independent sampling is presented that achieves both statistical independence
+and good performance, using a Bentley-Saxe-inspired framework that introduces
+update support to efficient static sampling data structures. This chapter
+seeks to demonstrate the viability of Bentley-Saxe as the basis for adding
+update support to data structures, as well as to show that the limitations of
+the decomposable search problem abstraction can be overcome through
+alternative query processing techniques that preserve good performance.
diff --git a/chapters/sigmod23/relatedwork.tex b/chapters/sigmod23/relatedwork.tex
new file mode 100644
index 0000000..600cd0d
--- /dev/null
+++ b/chapters/sigmod23/relatedwork.tex
@@ -0,0 +1,33 @@
+\section{Related Work}
+\label{sec:related}
+
+The general IQS problem was first proposed by Hu, Qiao, and Tao~\cite{hu14} and
+has since been the subject of extensive research
+\cite{irsra,afshani17,xie21,aumuller20}. These papers involve the use of
+specialized indexes to assist in drawing samples efficiently from the result
+sets of specific types of query, and are largely focused on in-memory settings.
+A recent survey by Tao~\cite{tao22} acknowledged that dynamization remains a major
+challenge for efficient sampling indexes. There do exist specific examples of
+sampling indexes~\cite{hu14} designed to support dynamic updates, but they are
+specialized and impractical, owing to their implementation complexity and the
+high constant factors in their cost functions. A
+static index for spatial independent range sampling~\cite{xie21} has been
+proposed with a dynamic extension similar to the one described in this chapter,
+but the method was not generalized, and its design space was not explored. There are also
+weight-updatable implementations of the alias structure \cite{hagerup93,
+matias03, allendorf23} that function under various assumptions about the weight
+distribution. These are of limited utility in a database context as they do not
+support direct insertion or deletion of entries. Efforts have also been made to
+improve tree-traversal based sampling approaches. Notably, the AB-tree
+\cite{zhao22} extends tree-sampling with support for concurrent updates, which
+has been a historical pain point.
+
+The Bentley-Saxe method was first proposed by Saxe and Bentley~\cite{saxe79}.
+Overmars and van Leeuwen extended this framework to provide better worst-case
+bounds~\cite{overmars81}, but their approach hurts common case performance by
+splitting reconstructions into small pieces and executing these pieces each
+time a record is inserted. Though not commonly used in database systems, the
+method has been applied to address specialized problems, such as the creation
+of dynamic metric indexing structures~\cite{naidan14}, analysis of
+trajectories~\cite{custers19}, and genetic sequence search
+indexes~\cite{almodaresi23}.
diff --git a/chapters/vita.tex b/chapters/vita.tex
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/chapters/vita.tex
diff --git a/cls/ACM-Reference-Format.bst b/cls/ACM-Reference-Format.bst
new file mode 100644
index 0000000..c47cb4c
--- /dev/null
+++ b/cls/ACM-Reference-Format.bst
@@ -0,0 +1,3081 @@
+%%% -*-BibTeX-*-
+%%% ====================================================================
+%%% @BibTeX-style-file{
+%%% author = "Nelson H. F. Beebe, Boris Veytsman and Gerald Murray",
+%%% version = "2.1",
+%%% acmart-version = "1.90",
+%%% date = "Mar 26 2023",
+%%% filename = "ACM-Reference-Format.bst",
+%%% email = "borisv@lk.net, boris@varphi.com",
+%%% codetable = "ISO/ASCII",
+%%% keywords = "ACM Transactions bibliography style; BibTeX",
+%%% license = "public domain",
+%%% supported = "yes",
+%%% abstract = "",
+%%% }
+%%% ====================================================================
+
+%%% Revision history: see source in git
+
+ENTRY
+ { address
+ advisor
+ archiveprefix
+ author
+ booktitle
+ chapter
+ city
+ date
+ edition
+ editor
+ eprint
+ eprinttype
+ eprintclass
+ howpublished
+ institution
+ journal
+ key
+ location
+ month
+ note
+ number
+ organization
+ pages
+ primaryclass
+ publisher
+ school
+ series
+ title
+ type
+ volume
+ year
+ % New keys recognized
+ issue % UTAH: used in, e.g., ACM SIGSAM Bulletin and ACM Communications in Computer Algebra
+ articleno
+ eid
+ day % UTAH: needed for newspapers, weeklies, bi-weeklies
+ doi % UTAH
+ url % UTAH
+ bookpages % UTAH
+ numpages
+ lastaccessed % UTAH: used only for @Misc{...}
+ coden % UTAH
+ isbn % UTAH
+ isbn-13 % UTAH
+ issn % UTAH
+ lccn % UTAH
+ distinctURL % whether to print url if doi is present
+ }
+ {}
+ { label.year extra.label sort.year sort.label basic.label.year}
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+INTEGERS { show-isbn-10-and-13 } % initialized below in begin.bib
+
+INTEGERS { nameptr namesleft numnames }
+
+INTEGERS { multiresult }
+
+INTEGERS { len }
+
+INTEGERS { last.extra.num }
+
+STRINGS { s t t.org u }
+
+STRINGS { last.label next.extra }
+
+STRINGS { p1 p2 p3 page.count }
+
+
+FUNCTION { not }
+{
+ { #0 }
+ { #1 }
+ if$
+}
+
+FUNCTION { and }
+{
+ 'skip$
+ { pop$ #0 }
+ if$
+}
+
+FUNCTION { or }
+{
+ { pop$ #1 }
+ 'skip$
+ if$
+}
+
+
+FUNCTION { dump.stack.1 }
+{
+ duplicate$ "STACK[top] = [" swap$ * "]" * warning$
+}
+
+FUNCTION { dump.stack.2 }
+{
+ duplicate$ "STACK[top ] = [" swap$ * "]" * warning$
+ swap$
+ duplicate$ "STACK[top-1] = [" swap$ * "]" * warning$
+ swap$
+}
+
+FUNCTION { empty.or.unknown }
+{
+ %% Examine the top stack entry, and push 1 if it is empty, or
+ %% consists only of whitespace, or is a string beginning with two
+ %% queries (??), and otherwise, push 0.
+ %%
+ %% This function provides a replacement for empty$, with the
+ %% convenient feature that unknown values marked by two leading
+ %% queries are treated the same as missing values, and thus, do not
+ %% appear in the output .bbl file, and yet, their presence in .bib
+ %% file(s) serves to mark values which are temporarily missing, but
+ %% are expected to be filled in eventually once more data is
+ %% obtained. The TeX User Group and BibNet bibliography archives
+ %% make extensive use of this practice.
+ %%
+ %% An empty string cannot serve the same purpose, because just as in
+ %% statistics data processing, an unknown value is not the same as an
+ %% empty value.
+ %%
+ %% At entry: stack = ... top:[string]
+ %% At exit: stack = ... top:[0 or 1]
+
+ duplicate$ empty$
+ { pop$ #1 }
+ { #1 #2 substring$ "??" = }
+ if$
+}
+
+FUNCTION { empty.or.zero }
+{
+ %% Examine the top entry and push 1 if it is empty, or is zero
+ duplicate$ empty$
+ { pop$ #1 }
+ { "0" = }
+ if$
+}
+
+
+FUNCTION { writeln }
+{
+ %% In BibTeX style files, the sequences
+ %%
+ %% ... "one" "two" output
+ %% ... "one" "two" output.xxx
+ %%
+ %% ship "one" to the output file, possibly following by punctuation,
+ %% leaving the stack with
+ %%
+ %% ... "two"
+ %%
+ %% There is thus a one-string lag in output processing that must be
+ %% carefully handled to avoid duplicating a string in the output
+ %% file. Unless otherwise noted, all output.xxx functions leave
+ %% just one new string on the stack, and that model should be born
+ %% in mind when reading or writing function code.
+ %%
+ %% BibTeX's asynchronous buffering of output from strings from the
+ %% stack is confusing because newline$ bypasses the buffer. It
+ %% would have been so much easier for newline to be a character
+ %% rather than a state of the output-in-progress.
+ %%
+ %% The documentation in btxhak.dvi is WRONG: it says
+ %%
+ %% newline$ Writes onto the bbl file what's accumulated in the
+ %% output buffer. It writes a blank line if and only
+ %% if the output buffer is empty. Since write$ does
+ %% reasonable line breaking, you should use this
+ %% function only when you want a blank line or an
+ %% explicit line break.
+ %%
+ %% write$ Pops the top (string) literal and writes it on the
+ %% output buffer (which will result in stuff being
+ %% written onto the bbl file when the buffer fills
+ %% up).
+ %%
+ %% Examination of the BibTeX source code shows that write$ does
+ %% indeed behave as claimed, but newline$ sends a newline character
+ %% directly to the output file, leaving the stack unchanged. The
+ %% first line "Writes onto ... buffer." is therefore wrong.
+ %%
+ %% The original BibTeX style files almost always use "write$ newline$"
+ %% in that order, so it makes sense to hide that pair in a private
+ %% function like this one, named after a statement in Pascal,
+ %% the programming language embedded in the BibTeX Web program.
+
+ write$ % output top-of-stack string
+ newline$ % immediate write of newline (not via stack)
+}
+
+FUNCTION { init.state.consts }
+{
+ #0 'before.all :=
+ #1 'mid.sentence :=
+ #2 'after.sentence :=
+ #3 'after.block :=
+}
+
+FUNCTION { output.nonnull }
+{ % Stack in: ... R S T Stack out: ... R T File out: S<comma><space>
+ 's :=
+ output.state mid.sentence =
+ {
+ ", " * write$
+ }
+ {
+ output.state after.block =
+ {
+ add.period$ writeln
+ "\newblock " write$
+ }
+ {
+ output.state before.all =
+ {
+ write$
+ }
+ {
+ add.period$ " " * write$
+ }
+ if$
+ }
+ if$
+ mid.sentence 'output.state :=
+ }
+ if$
+ s
+}
+
+FUNCTION { output.nonnull.dot.space }
+{ % Stack in: ... R S T Stack out: ... R T File out: S<dot><space>
+ 's :=
+ output.state mid.sentence = % { "<DEBUG output.nonnull.dot.space>. " * write$ }
+ {
+ ". " * write$
+ }
+ {
+ output.state after.block =
+ {
+ add.period$ writeln "\newblock " write$
+ }
+ {
+ output.state before.all =
+ {
+ write$
+ }
+ {
+ add.period$ " " * write$
+ }
+ if$
+ }
+ if$
+ mid.sentence 'output.state :=
+ }
+ if$
+ s
+}
+
+FUNCTION { output.nonnull.remove }
+{ % Stack in: ... R S T Stack out: ... R T File out: S<space>
+ 's :=
+ output.state mid.sentence =
+ {
+ " " * write$
+ }
+ {
+ output.state after.block =
+ {
+ add.period$ writeln "\newblock " write$
+ }
+ {
+ output.state before.all =
+ {
+ write$
+ }
+ {
+ add.period$ " " * write$
+ }
+ if$
+ }
+ if$
+ mid.sentence 'output.state :=
+ }
+ if$
+ s
+}
+
+FUNCTION { output.nonnull.removenospace }
+{ % Stack in: ... R S T Stack out: ... R T File out: S
+ 's :=
+ output.state mid.sentence =
+ {
+ "" * write$
+ }
+ {
+ output.state after.block =
+ {
+ add.period$ writeln "\newblock " write$
+ }
+ {
+ output.state before.all =
+ {
+ write$
+ }
+ {
+ add.period$ " " * write$
+ }
+ if$
+ }
+ if$
+ mid.sentence 'output.state :=
+ }
+ if$
+ s
+}
+
+FUNCTION { output }
+{ % discard top token if empty, else like output.nonnull
+ duplicate$ empty.or.unknown
+ 'pop$
+ 'output.nonnull
+ if$
+}
+
+FUNCTION { output.dot.space }
+{ % discard top token if empty, else like output.nonnull.dot.space
+ duplicate$ empty.or.unknown
+ 'pop$
+ 'output.nonnull.dot.space
+ if$
+}
+
+FUNCTION { output.removenospace }
+{ % discard top token if empty, else like output.nonnull.removenospace
+ duplicate$ empty.or.unknown
+ 'pop$
+ 'output.nonnull.removenospace
+ if$
+}
+
+FUNCTION { output.check }
+{ % like output, but warn if key name on top-of-stack is not set
+ 't :=
+ duplicate$ empty.or.unknown
+ { pop$ "empty " t * " in " * cite$ * warning$ }
+ 'output.nonnull
+ if$
+}
+
+FUNCTION { bibinfo.output.check }
+{ % like output.check, adding bibinfo field
+ 't :=
+ duplicate$ empty.or.unknown
+ { pop$ "empty " t * " in " * cite$ * warning$ }
+ { "\bibinfo{" t "}{" * * swap$ * "}" *
+ output.nonnull }
+ if$
+}
+
+FUNCTION { output.check.dot.space }
+{ % like output.dot.space, but warn if key name on top-of-stack is not set
+ 't :=
+ duplicate$ empty.or.unknown
+ { pop$ "empty " t * " in " * cite$ * warning$ }
+ 'output.nonnull.dot.space
+ if$
+}
+
+FUNCTION { fin.block }
+{ % functionally, but not logically, identical to fin.entry
+ add.period$
+ writeln
+}
+
+FUNCTION { fin.entry }
+{
+ add.period$
+ writeln
+}
+
+FUNCTION { new.sentence }
+{ % update sentence state, with neither output nor stack change
+ output.state after.block =
+ 'skip$
+ {
+ output.state before.all =
+ 'skip$
+ { after.sentence 'output.state := }
+ if$
+ }
+ if$
+}
+
+FUNCTION { fin.sentence }
+{
+ add.period$
+ write$
+ new.sentence
+ ""
+}
+
+FUNCTION { new.block }
+{
+ output.state before.all =
+ 'skip$
+ { after.block 'output.state := }
+ if$
+}
+
+FUNCTION { output.coden } % UTAH
+{ % output non-empty CODEN as one-line sentence (stack untouched)
+ coden empty.or.unknown
+ { }
+ { "\showCODEN{" coden * "}" * writeln }
+ if$
+}
+
+%
+% Sometimes articleno starts with the word 'Article' or 'Paper.
+% (this is a bug of acmdl, sigh)
+% We strip them. We assume eid or articleno is already on stack
+%
+
+FUNCTION { strip.articleno.or.eid }
+{
+ 't :=
+ t #1 #7 substring$ "Article" =
+ {t #8 t text.length$ substring$ 't :=}
+ { }
+ if$
+ t #1 #7 substring$ "article" =
+ {t #8 t text.length$ substring$ 't :=}
+ { }
+ if$
+ t #1 #5 substring$ "Paper" =
+ {t #6 t text.length$ substring$ 't :=}
+ { }
+ if$
+ t #1 #5 substring$ "paper" =
+ {t #6 t text.length$ substring$ 't :=}
+ { }
+ if$
+ % Strip any left trailing space or ~
+ t #1 #1 substring$ " " =
+ {t #2 t text.length$ substring$ 't :=}
+ { }
+ if$
+ t #1 #1 substring$ "~" =
+ {t #2 t text.length$ substring$ 't :=}
+ { }
+ if$
+ t
+}
+
+
+FUNCTION { format.articleno }
+{
+ articleno empty.or.unknown not eid empty.or.unknown not and
+ { "Both articleno and eid are defined for " cite$ * warning$ }
+ 'skip$
+ if$
+ articleno empty.or.unknown eid empty.or.unknown and
+ { "" }
+ {
+ numpages empty.or.unknown
+ { "articleno or eid field, but no numpages field, in "
+ cite$ * warning$ }
+ { }
+ if$
+ eid empty.or.unknown
+ { "Article \bibinfo{articleno}{" articleno strip.articleno.or.eid * "}" * }
+ { "Article \bibinfo{articleno}{" eid strip.articleno.or.eid * "}" * }
+ if$
+ }
+ if$
+}
+
+FUNCTION { format.year }
+{ % push year string or "[n.\,d.]" onto output stack
+ %% Because year is a mandatory field, we always force SOMETHING
+ %% to be output
+ "\bibinfo{year}{"
+ year empty.or.unknown
+ { "[n.\,d.]" }
+ { year }
+ if$
+ * "}" *
+}
+
+FUNCTION { format.day.month }
+{ % push "day month " or "month " or "" onto output stack
+ day empty.or.unknown
+ {
+ month empty.or.unknown
+ { "" }
+ { "\bibinfo{date}{" month * "} " *}
+ if$
+ }
+ {
+ month empty.or.unknown
+ { "" }
+ { "\bibinfo{date}{" day * " " * month * "} " *}
+ if$
+ }
+ if$
+}
+
+FUNCTION { format.day.month.year } % UTAH
+{ % if month is empty, push "" else push "(MON.)" or "(DD MON.)"
+ % Needed for frequent periodicals: 2008. ... New York Times C-1, C-2, C-17 (23 Oct.)
+ % acm-*.bst addition: prefix parenthesized date string with
+ % ", Article nnn "
+ articleno empty.or.unknown eid empty.or.unknown and
+ { "" }
+ { output.state after.block =
+ {", " format.articleno * }
+ { format.articleno }
+ if$
+ }
+ if$
+ " (" * format.day.month * format.year * ")" *
+}
+
+FUNCTION { output.day.month.year } % UTAH
+{ % if month is empty value, do nothing; else output stack top and
+ % leave with new top string "(MON.)" or "(DD MON.)"
+ % Needed for frequent periodicals: 2008. ... New York Times C-1, C-2, C-17 (23 Oct.)
+ format.day.month.year
+ output.nonnull.remove
+}
+
+FUNCTION { strip.doi } % UTAH
+{ % Strip any Web address prefix to recover the bare DOI, leaving the
+ % result on the output stack, as recommended by CrossRef DOI
+ % documentation.
+ % For example, reduce "http://doi.acm.org/10.1145/1534530.1534545" to
+ % "10.1145/1534530.1534545". A suitable URL is later typeset and
+ % displayed as the LAST item in the reference list entry. Publisher Web
+ % sites wrap this with a suitable link to a real URL to resolve the DOI,
+ % and the master https://doi.org/ address is preferred, since publisher-
+ % specific URLs can disappear in response to economic events. All
+ % journals are encouraged by the DOI authorities to use that typeset
+ % format and link procedures for uniformity across all publications that
+ % include DOIs in reference lists.
+ % The numeric prefix is guaranteed to start with "10.", so we use
+ % that as a test.
+ % 2017-02-04 Added stripping of https:// (Boris)
+ doi #1 #3 substring$ "10." =
+ { doi }
+ {
+ doi 't := % get modifiable copy of DOI
+
+ % Change https:// to http:// to strip both prefixes (BV)
+
+ t #1 #8 substring$ "https://" =
+ { "http://" t #9 t text.length$ #8 - substring$ * 't := }
+ { }
+ if$
+
+ t #1 #7 substring$ "http://" =
+ {
+ t #8 t text.length$ #7 - substring$ 't :=
+
+ "INTERNAL STYLE-FILE ERROR" 's :=
+
+ % search for next "/" and assign its suffix to s
+
+ { t text.length$ }
+ {
+ t #1 #1 substring$ "/" =
+ {
+ % save rest of string as true DOI (should be 10.xxxx/yyyy)
+ t #2 t text.length$ #1 - substring$ 's :=
+ "" 't := % empty string t terminates the loop
+ }
+ {
+ % discard first character and continue loop: t <= substring(t,2,last)
+ t #2 t text.length$ #1 - substring$ 't :=
+ }
+ if$
+ }
+ while$
+
+ % check for valid DOI (should be 10.xxxx/yyyy)
+ s #1 #3 substring$ "10." =
+ { }
+ { "unrecognized DOI substring " s * " in DOI value [" * doi * "]" * warning$ }
+ if$
+
+ s % push the stripped DOI on the output stack
+
+ }
+ {
+ "unrecognized DOI value [" doi * "]" * warning$
+ doi % push the unrecognized original DOI on the output stack
+ }
+ if$
+ }
+ if$
+}
+
+%
+% Change by BV: added standard prefix to URL
+%
+FUNCTION { output.doi } % UTAH
+{ % output non-empty DOI as one-line sentence (stack untouched)
+ doi empty.or.unknown
+ { }
+ {
+ %% Use \urldef here for the same reason it is used in output.url,
+ %% see output.url for further discussion.
+ "\urldef\tempurl%" writeln
+ "\url{https://doi.org/" strip.doi * "}" * writeln
+ "\showDOI{\tempurl}" writeln
+ }
+ if$
+}
+
+FUNCTION { output.isbn } % UTAH
+{ % output non-empty ISBN-10 and/or ISBN-13 as one-line sentences (stack untouched)
+ show-isbn-10-and-13
+ {
+ %% show both 10- and 13-digit ISBNs
+ isbn empty.or.unknown
+ { }
+ {
+ "\showISBNx{" isbn * "}" * writeln
+ }
+ if$
+ isbn-13 empty.or.unknown
+ { }
+ {
+ "\showISBNxiii{" isbn-13 * "}" * writeln
+ }
+ if$
+ }
+ {
+ %% show 10-digit ISBNs only if 13-digit ISBNs not available
+ isbn-13 empty.or.unknown
+ {
+ isbn empty.or.unknown
+ { }
+ {
+ "\showISBNx{" isbn * "}" * writeln
+ }
+ if$
+ }
+ {
+ "\showISBNxiii{" isbn-13 * "}" * writeln
+ }
+ if$
+ }
+ if$
+}
+
+FUNCTION { output.issn } % UTAH
+{ % output non-empty ISSN as one-line sentence (stack untouched)
+ issn empty.or.unknown
+ { }
+ { "\showISSN{" issn * "}" * writeln }
+ if$
+}
+
+FUNCTION { output.issue }
+{ % output non-empty issue number as a one-line sentence (stack untouched)
+ issue empty.or.unknown
+ { }
+ { "Issue " issue * "." * writeln }
+ if$
+}
+
+FUNCTION { output.lccn } % UTAH
+{ % return with stack untouched
+ lccn empty.or.unknown
+ { }
+ { "\showLCCN{" lccn * "}" * writeln }
+ if$
+}
+
+FUNCTION { output.note } % UTAH
+{ % return with stack empty
+ note empty.or.unknown
+ { }
+ { "\shownote{" note * "}" add.period$ * writeln }
+ if$
+}
+
+FUNCTION { output.note.check } % UTAH
+{ % return with stack empty
+ note empty.or.unknown
+ { "empty note in " cite$ * warning$ }
+ { "\shownote{" note * "}" add.period$ * writeln }
+ if$
+}
+
+FUNCTION { output.eprint } %
+{ % return with stack empty
+ eprint empty.or.unknown
+ { }
+ { "\showeprint"
+ archiveprefix empty.or.unknown
+ { eprinttype empty.or.unknown
+ { }
+ { "[" eprinttype "]" * * * }
+ if$
+ }
+ { "[" archiveprefix "l" change.case$ "]" * * * }
+ if$
+ "{" eprint "}" * * *
+ primaryclass empty.or.unknown
+ { eprintclass empty.or.unknown
+ { }
+ { "~[" eprintclass "]" * * * }
+ if$
+ }
+ { "~[" primaryclass "]" * * * }
+ if$
+ writeln
+ }
+ if$
+}
+
+
+%
+% Changes by BV 2011/04/15. Do not output
+% url if doi is defined
+%
+%
+% Changes by BV 2021/11/26. Output url even if doi is defined
+% if distinctURL is not zero.
+%
+FUNCTION { output.url } % UTAH
+{ % return with stack untouched
+ % output URL and associated lastaccessed fields
+ doi empty.or.unknown distinctURL empty.or.zero not or
+ {
+ url empty.or.unknown
+ { }
+ {
+ %% Use \urldef, outside \showURL, so that %nn, #, etc in URLs work
+ %% correctly. Put the actual URL on its own line to reduce the
+ %% likelihood of BibTeX's nasty line wrapping after column 79.
+ %% \url{} can undo this, but if that doesn't work for some reason
+ %% the .bbl file would have to be repaired manually.
+ "\urldef\tempurl%" writeln
+ "\url{" url * "}" * writeln
+
+ "\showURL{%" writeln
+ lastaccessed empty.or.unknown
+ { "" }
+ { "Retrieved " lastaccessed * " from " * }
+ if$
+ "\tempurl}" * writeln
+ }
+ if$
+ }
+ { }
+ if$
+}
+
+FUNCTION { output.year.check }
+{ % warn if year empty, output top string and leave " YEAR<label>" on stack in mid-sentence
+ year empty.or.unknown
+ { "empty year in " cite$ * warning$
+ write$
+ " \bibinfo{year}{[n.\,d.]}"
+ "\natexlab{" extra.label * "}" * *
+ mid.sentence 'output.state :=
+ }
+ { write$
+ " \bibinfo{year}{" year * "}" *
+ "\natexlab{" extra.label * "}" * *
+ mid.sentence 'output.state :=
+ }
+ if$
+}
+
+
+FUNCTION { le }
+{
+ %% test whether first number is less than or equal to second number
+ %% stack in: n1 n2
+ %% stack out: if n1 <= n2 then 1 else 0
+
+ %% "DEBUG: le " cite$ * warning$
+ > { #0 } { #1 } if$
+}
+
+FUNCTION { ge }
+{
+ %% test whether first number is greater than or equal to second number
+ %% stack in: n1 n2
+ %% stack out: if n1 >= n2 then 1 else 0
+
+ %% "DEBUG: ge " cite$ * warning$
+ < { #0 } { #1 } if$
+}
+
+FUNCTION { is.leading.digit }
+{
+ %% test whether first character of string is a digit
+ %% stack in: string
+ %% stack out: if first-char-is-digit then 1 else 0
+
+ #1 #1 substring$ % replace string by string[1:1]
+ duplicate$ % string[1:1] string[1:1]
+ chr.to.int$
+ "0" chr.to.int$ swap$ le % "0" <= string[1:1] --> 0-or-1
+ swap$ % 0-or-1 string[1:1]
+ chr.to.int$
+ "9" chr.to.int$ le % string[1:1} <= "9" --> 0-or-1
+ and
+}
+
+FUNCTION { skip.digits }
+{
+ %% skip over leading digits in string
+ %% stack in: string
+ %% stack out: rest-of-string leading-digits
+
+ %% "DEBUG: enter skip.digits " cite$ * warning$
+
+ %% dump.stack.1
+
+ duplicate$
+ 't :=
+ 't.org :=
+ "" 'u :=
+
+ { t text.length$ }
+ {
+ %% "=================DEBUG: skip.digits t = [" t * "]" * warning$
+ t is.leading.digit
+ { t #2 t text.length$ #1 - substring$ }
+ {
+ t 'u :=
+ ""
+ }
+ if$
+ 't :=
+ }
+ while$
+
+ u % rest of string
+ t.org #1 t.org text.length$ u text.length$ - substring$ % leading digits
+
+ %% "DEBUG: t.org = [" t.org * "]" * warning$
+ %% "DEBUG: u = [" u * "]" * warning$
+
+ %% dump.stack.2
+
+ %% "DEBUG: leave skip.digits " cite$ * warning$
+}
+
+FUNCTION { skip.nondigits }
+{
+ %% skip over leading nondigits in string
+ %% stack in: string
+ %% stack out: rest-of-string
+
+ %% "DEBUG: enter skip.nondigits " cite$ * warning$
+
+ 't :=
+ "" 'u :=
+
+ { t text.length$ }
+ {
+ %% "=================DEBUG: skip.nondigits t = [" t * "]" * warning$
+ t is.leading.digit
+ {
+ t 'u :=
+ ""
+ }
+ { t #2 t text.length$ #1 - substring$ }
+ if$
+ 't :=
+ }
+ while$
+
+ u % rest of string
+
+ %% dump.stack.1
+ %% "DEBUG: leave skip.nondigits " cite$ * warning$
+}
+
+FUNCTION { parse.next.number }
+{
+ %% stack in: string
+ %% stack out: rest-of-string next-numeric-part-of-string
+ %% Example:
+ %% stack in: "123:1--123:59"
+ %% stack out: ":1--123:59" "123"
+
+ 's :=
+ s skip.nondigits 's :=
+ s skip.digits
+}
+
+FUNCTION { reduce.pages.to.page.count }
+{
+ %% Stack in: arbitrary-and-unused
+ %% Stack out: unchanged
+ %%
+ %% For the new-style pagination with article number and numpages or
+ %% pages, we expect to have BibTeX entries containing something like
+ %% articleno = "17",
+ %% pages = "1--23",
+ %% with output "Article 17, 23 pages",
+ %% or
+ %% articleno = "17",
+ %% numpages = "23",
+ %% with output "Article 17, 23 pages",
+ %% or
+ %% articleno = "17",
+ %% pages = "17:1--17:23",
+ %% with output "Article 17, 23 pages",
+ %%
+ %% If articleno is missing or empty, then we should output "1--23",
+ %% "23" (with a warning of a missing articleno), or "17:1--17:23",
+ %% respectively.
+
+ %% "DEBUG: enter reduce.pages.to.page.count " cite$ * warning$
+
+ %% "DEBUG: pages = [" pages * "]" * warning$
+
+ pages
+ parse.next.number 'p1 :=
+ parse.next.number 'p2 :=
+ parse.next.number 'p3 :=
+ parse.next.number 'page.count :=
+
+ duplicate$
+ empty.or.unknown
+ { }
+ {
+ duplicate$ "unexpected trailing garbage [" swap$ *
+ "] after n:p1--n:p2 in pages = [" *
+ pages *
+ "] in " *
+ cite$ *
+ warning$
+ }
+ if$
+
+ pop$
+
+ %% "DEBUG: reduce.pages.to.page.count: "
+ %% " p1 = " p1 * *
+ %% " p2 = " p2 * *
+ %% " p3 = " p3 * *
+ %% " p4 = " page.count * *
+ %% " in " cite$ * * warning$
+
+ p1 p3 = p2 "1" = and numpages empty.or.unknown and
+ { "INFO: reduced pages = [" pages * "] to numpages = [" * page.count * "]" * warning$ }
+ {
+ numpages empty.or.unknown
+ { pages }
+ { numpages }
+ if$
+ 'page.count :=
+ }
+ if$
+
+ p1 "1" = p3 empty.or.unknown and numpages empty.or.unknown and
+ {
+ p2 'page.count :=
+ "INFO: reduced pages = [" pages * "] to numpages = [" * page.count * "]" * warning$
+ }
+ {
+ numpages empty.or.unknown
+ { pages }
+ { numpages }
+ if$
+ 'page.count :=
+ }
+ if$
+
+ %% "DEBUG: leave reduce.pages.to.page.count " cite$ * warning$
+}
+
+FUNCTION { new.block.checkb }
+{ % issue a new.block only if at least one of top two stack strings is not empty
+ empty.or.unknown
+ swap$ empty.or.unknown
+ and
+ 'skip$
+ 'new.block
+ if$
+}
+
+FUNCTION { field.or.null }
+{ % convert empty value to null string, else return value
+ duplicate$ empty.or.unknown
+ { pop$ "" }
+ 'skip$
+ if$
+}
+
+
+
+FUNCTION { emphasize }
+{ % emphasize a non-empty top string on the stack
+ duplicate$ empty.or.unknown
+ { pop$ "" }
+ { "\emph{" swap$ * "}" * }
+ if$
+}
+
+FUNCTION { comma }
+{ % convert empty string to null string, or brace string and add trailing comma
+ duplicate$ empty.or.unknown
+ { pop$ "" }
+ { "{" swap$ * "}," * }
+ if$
+}
+
+FUNCTION { format.names }
+{
+ % Format bibliographical entries with the first author last name first,
+ % and subsequent authors with initials followed by last name.
+ % All names are formatted in this routine.
+
+ 's :=
+ #1 'nameptr := % nameptr = 1;
+ s num.names$ 'numnames := % numnames = num.name$(s);
+ numnames 'namesleft :=
+ { namesleft #0 > }
+ { nameptr #1 =
+ %NO: BAD ORDER: {"{" s nameptr "{ff~}{ll}{, jj}{, vv}" format.name$ * "}" * 't := }
+ %NO: BAD ORDER: {"{" s nameptr "{ff~}{ll}{, jj}{, vv}" format.name$ * "}" * 't := }
+ {"\bibinfo{person}{" s nameptr "{ff }{vv }{ll}{, jj}" format.name$ * "}" * 't := }
+ {"\bibinfo{person}{" s nameptr "{ff }{vv }{ll}{, jj}" format.name$ * "}" * 't := }
+ if$
+ nameptr #1 >
+ {
+ namesleft #1 >
+ { ", " * t * }
+ {
+ numnames #2 >
+ { "," * }
+ 'skip$
+ if$
+ t "\bibinfo{person}{others}" =
+ { " {et~al\mbox{.}}" * } % jrh: avoid spacing problems
+ { " {and} " * t * } % from Chicago Manual of Style
+ if$
+ }
+ if$
+ }
+ 't
+ if$
+ nameptr #1 + 'nameptr := % nameptr += 1;
+ namesleft #1 - 'namesleft := % namesleft =- 1;
+ }
+ while$
+}
+
+FUNCTION { my.full.label }
+{
+ 's :=
+ #1 'nameptr := % nameptr = 1;
+ s num.names$ 'numnames := % numnames = num.name$(s);
+ numnames 'namesleft :=
+ { namesleft #0 > }
+
+ { s nameptr "{vv~}{ll}" format.name$ 't := % get the next name
+ nameptr #1 >
+ {
+ namesleft #1 >
+ { ", " * t * }
+ {
+ numnames #2 >
+ { "," * }
+ 'skip$
+ if$
+ t "others" =
+ { " et~al\mbox{.}" * } % jrh: avoid spacing problems
+ { " and " * t * } % from Chicago Manual of Style
+ if$
+ }
+ if$
+ }
+ 't
+ if$
+ nameptr #1 + 'nameptr := % nameptr += 1;
+ namesleft #1 - 'namesleft := % namesleft =- 1;
+ }
+ while$
+
+}
+
+FUNCTION { format.names.fml }
+{
+ % Format names in "familiar" format, with first initial followed by
+ % last name. Like format.names, ALL names are formatted.
+ % jtb: The names are NOT put in small caps
+
+ 's :=
+ #1 'nameptr := % nameptr = 1;
+ s num.names$ 'numnames := % numnames = num.name$(s);
+ numnames 'namesleft :=
+ { namesleft #0 > }
+
+ {
+ "\bibinfo{person}{" s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ * "}" * 't :=
+
+ nameptr #1 >
+ {
+ namesleft #1 >
+ { ", " * t * }
+ {
+ numnames #2 >
+ { "," * }
+ 'skip$
+ if$
+ t "\bibinfo{person}{others}" =
+ { " {et~al\mbox{.}}" * }
+ { " {and} " * t * }
+ if$
+ }
+ if$
+ }
+ 't
+ if$
+ nameptr #1 + 'nameptr := % nameptr += 1;
+ namesleft #1 - 'namesleft := % namesleft =- 1;
+ }
+ while$
+}
+
+FUNCTION { format.authors }
+{
+ author empty.or.unknown
+ { "" }
+ {
+ "\bibfield{author}{"
+ author format.names add.period$ * "}" *} % jtb: add period if none before
+ if$
+}
+
+FUNCTION { format.key }
+{
+ empty.or.unknown
+ { key field.or.null }
+ { "" }
+ if$
+}
+
+FUNCTION { format.no.key }
+{
+ empty.or.unknown
+ { "" }
+ { "" }
+ if$
+}
+
+FUNCTION { format.editors.fml }
+{
+ % Format editor names for use in the "in" types: inbook, incollection,
+ % inproceedings: first initial, then last names. When editors are the
+ % LABEL for an entry, then format.editor is used which lists editors
+ % by last name first.
+
+ editor empty.or.unknown
+ { "" }
+ {
+ "\bibfield{editor}{"
+ editor format.names.fml
+ * "}" *
+ editor num.names$ #1 >
+ { " (Eds.)" * }
+ { " (Ed.)" * }
+ if$
+ }
+ if$
+}
+
+FUNCTION { format.editors }
+{ % format editor names for use in labels, last names first.
+ editor empty.or.unknown
+ { "" }
+ {
+ "\bibfield{editor}{"
+ editor format.names
+ * "}" *
+ editor num.names$ #1 >
+ { " (Eds.)." * }
+ { " (Ed.)." * }
+ if$
+ }
+ if$
+}
+
+FUNCTION { format.articletitle }
+{
+ title empty.or.unknown
+ { "" }
+ % Use this to preserve lettercase in titles:
+ { "\showarticletitle{" title * "}" * }
+ % Use this for downcase title style:
+ % { \showarticletitle{" title "t" change.case$ * "}" * }
+ if$
+}
+
+FUNCTION { format.title }
+{
+ title empty.or.unknown
+ { "" }
+ % Use this to preserve lettercase in titles:
+ { "\bibinfo{title}{" title * "}" * }
+ % Use this for downcase title style:
+ % { title "t" change.case$ }
+ if$
+}
+
+FUNCTION { n.dashify }
+{
+ 't :=
+ ""
+ { t empty.or.unknown not }
+ {
+ t #1 #1 substring$ "-" =
+ {
+ t #1 #2 substring$ "--" = not
+ { "--" *
+ t #2 global.max$ substring$ 't :=
+ }
+ {
+ { t #1 #1 substring$ "-" = }
+ {
+ "-" *
+ t #2 global.max$ substring$ 't :=
+ }
+ while$
+ }
+ if$
+ }
+ {
+ t #1 #1 substring$ *
+ t #2 global.max$ substring$ 't :=
+ }
+ if$
+ }
+ while$
+}
+
+FUNCTION { format.a.title.with.edition }
+{
+ "\bibinfo{booktitle}{"
+ swap$ emphasize *
+ edition empty.or.unknown
+ 'skip$
+ { " (\bibinfo{edition}{" * edition "l" change.case$ *
+ "} ed.)" * } % jtb: no parens for ed.
+ if$
+ "}" *
+}
+
+FUNCTION { format.btitle }
+{ title format.a.title.with.edition }
+
+FUNCTION { format.emphasize.booktitle }
+{ booktitle format.a.title.with.edition }
+
+
+
+FUNCTION { format.city }
+{
+ % jtb: if the preceding string (the title of the conference) is non-empty,
+ % jtb: append the location, otherwise leave empty (so as to trigger the
+ % jtb: error message in output.check
+
+ duplicate$ empty.or.unknown
+ { }
+ {
+ city empty.or.unknown location empty.or.unknown and
+ {
+ date empty.or.unknown
+ { }
+ { " (" * date * ")" * }
+ if$
+ }
+ {
+ location empty.or.unknown
+ {
+ date empty.or.unknown
+ { " (" * city * ")" * }
+ { " (" * city * ", " * date * ")" * }
+ if$
+ }
+ {
+ date empty.or.unknown
+ { " (" * location * ")" * }
+ { " (" * location * ", " * date * ")" * }
+ if$
+ }
+ if$
+ }
+ if$
+ }
+ if$
+}
+
+FUNCTION { tie.or.space.connect }
+{
+ duplicate$ text.length$ #3 <
+ { "~" }
+ { " " }
+ if$
+ swap$ * *
+}
+
+FUNCTION { either.or.check }
+{
+ empty.or.unknown
+ 'pop$
+ { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+ if$
+}
+
+FUNCTION { format.bvolume }
+{
+ % jtb: If there is a series, this is added and the volume trails after it.
+ % jtb: Otherwise, "Vol" is Capitalized.
+
+ volume empty.or.unknown
+ { "" }
+ {
+ series empty.or.unknown
+ { "Vol.~\bibinfo{volume}{" volume "}" * *}
+ { "\bibinfo{series}{" series "}, " * *
+ "Vol.~\bibinfo{volume}{" volume "}" * * *}
+ if$
+ "volume and number" number either.or.check
+ }
+ if$
+}
+
+FUNCTION { format.bvolume.noseries }
+{
+ volume empty.or.unknown
+ { "" }
+ { "Vol.~\bibinfo{volume}{" volume "}" * *
+ "volume and number" number either.or.check
+ }
+ if$
+}
+
+FUNCTION { format.series }
+{
+ series empty.or.unknown
+ {""}
+ {" \emph{(\bibinfo{series}{" * series "}" *
+ volume empty.or.unknown
+ {
+ number empty.or.unknown
+ {")}" *}
+ {", \bibinfo{number}{" number "})}" * * *}
+ if$
+ }
+ {", Vol.~\bibinfo{volume}{" volume "})}" * * *
+ "volume and number" number either.or.check
+ }
+ if$
+ }
+ if$
+}
+
+FUNCTION { format.number.series }
+{
+ volume empty.or.unknown
+ {
+ number empty.or.unknown
+ {
+ volume empty.or.unknown
+ { "" }
+ {
+ series empty.or.unknown
+ { "" }
+ { " (\bibinfo{series}{" series * "})" * }
+ if$
+ }
+ if$
+ } % { series field.or.null }
+ {
+ output.state mid.sentence =
+ { "Number" } % gnp - changed to mixed case always
+ { "Number" }
+ if$
+ number tie.or.space.connect series empty.or.unknown
+ { "there's a number but no series in " cite$ * warning$ }
+ { " in \bibinfo{series}{" * series * "}" * }
+ if$
+ }
+ if$
+ }
+ {
+ ""
+ }
+ if$
+}
+
+FUNCTION { multi.page.check }
+{
+ 't :=
+ #0 'multiresult :=
+ { multiresult not
+ t empty.or.unknown not
+ and
+ }
+ { t #1 #1 substring$
+ duplicate$ "-" =
+ swap$ duplicate$ "," =
+ swap$ "+" =
+ or or
+ { #1 'multiresult := }
+ { t #2 global.max$ substring$ 't := }
+ if$
+ }
+ while$
+ multiresult
+}
+
+FUNCTION { format.pages }
+{
+ pages empty.or.unknown
+ { "" }
+ { "\bibinfo{pages}{"
+ pages multi.page.check
+ { pages n.dashify } % gnp - removed () % jtb: removed pp.
+ { pages }
+ if$
+ * "}" *
+ }
+ if$
+}
+
+FUNCTION { format.pages.check.without.articleno }
+{ %% format pages field only if articleno is absent
+ %% Stack out: pages-specification
+ numpages missing$ pages missing$ and
+ { "page numbers missing in both pages and numpages fields in " cite$ * warning$ }
+ { }
+ if$
+
+ articleno empty.or.unknown eid empty.or.unknown and
+ {
+ pages missing$
+ {
+ numpages empty.or.unknown
+ {""}
+ { "\bibinfo{numpages}{" numpages * "}~pages" * }
+ if$
+ }
+ { format.pages }
+ if$
+ }
+ { "" }
+ if$
+}
+
+FUNCTION { format.pages.check }
+{
+ pages empty.or.unknown
+ { "page numbers missing in " cite$ * warning$ "" }
+ { pages n.dashify }
+ if$
+}
+
+FUNCTION { format.bookpages }
+{
+ bookpages empty.or.unknown
+ { "" }
+ { bookpages "book pages" tie.or.space.connect }
+ if$
+}
+
+FUNCTION { format.named.pages }
+{
+ pages empty.or.unknown
+ { "" }
+ { format.pages "pages" tie.or.space.connect }
+ if$
+}
+
+%
+% Changed by Boris Veytsman, 2011-03-13
+% Now the word "pages" is printed even if
+% there field pages is not empty.
+%
+
+FUNCTION { format.page.count }
+{
+ page.count empty.or.unknown
+ { "" }
+ { "\bibinfo{numpages}{" page.count * "}~pages" * }
+ if$
+}
+
+FUNCTION { format.articleno.numpages }
+{
+ %% There are seven possible outputs, depending on which fields are set.
+ %%
+ %% These four are handled here:
+ %%
+ %% articleno, numpages, pages -> "Article articleno-value, numpages-value pages"
+ %% articleno, numpages -> "Article articleno-value, numpages-value pages"
+ %% articleno, pages -> "Article articleno-value, reduced-pages-value pages"
+ %% articleno -> "Article articleno-value" and warn about missing numpages
+ %%
+ %% The remaining three have already been handled by
+ %% format.pages.check.without.articleno:
+ %%
+ %% numpages, pages -> "pages-value"
+ %% numpages -> "numpages-value"
+ %% pages -> "pages-value"
+ %%
+ %% We no longer issue a warning when articleno is missing but numpages is present.
+
+ articleno empty.or.unknown eid empty.or.unknown and
+ {
+%% numpages empty.or.unknown
+%% { }
+%% { "numpages field, but no articleno or eid field, in "
+%% cite$ * warning$ }
+%% if$
+ ""
+ }
+ {
+ numpages empty.or.unknown
+ {
+ pages empty.or.unknown
+ {
+ "articleno or eid, but no pages or numpages field in "
+ cite$ * warning$
+ "" 'page.count :=
+ }
+ { reduce.pages.to.page.count }
+ if$
+ }
+ { numpages 'page.count := }
+ if$
+
+ %% The Article number is now handled in format.day.month.year because
+ %% ACM prefers the style "Digital Libraries 12, 3, Article 5 (July 2008)"
+ %% over "Digital Libraries 12, 3 (July 2008), Article 5"
+ %% format.articleno output
+ format.page.count
+ }
+ if$
+}
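+
+% Illustrative sketch (not part of the original style; the entry and field
+% values below are hypothetical): an article entry carrying both an article
+% number and a page count, e.g.
+%
+%   @Article{example-key,
+%     ...
+%     articleno = "5",
+%     numpages  = "17",
+%   }
+%
+% falls into the second case listed above: the article number itself is
+% emitted later by format.day.month.year, and this function contributes only
+% the trailing "17 pages" via format.page.count.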
+
+FUNCTION {calc.format.page.count}
+{
+ numpages empty.or.unknown
+ {
+ pages empty.or.unknown
+ {
+ "" 'page.count :=
+ }
+ { reduce.pages.to.page.count }
+ if$
+ }
+ { numpages 'page.count := }
+ if$
+ format.page.count
+}
+
+
+FUNCTION { journal.canon.abbrev }
+{
+ % Returns a canonical abbreviation for 'journal', or else 'journal'
+ % unchanged.
+ journal "ACM Computing Surveys" = { "Comput. Surveys" } {
+ journal "{ACM} Computing Surveys" = { "Comput. Surveys" } {
+ journal "ACM Transactions on Mathematical Software" = { "ACM Trans. Math. Software" } {
+ journal "{ACM} Transactions on Mathematical Software" = { "ACM Trans. Math. Software" } {
+ journal "ACM SIGNUM Newsletter" = { "ACM SIGNUM Newslett." } {
+ journal "ACM {SIGNUM} Newsletter" = { "ACM SIGNUM Newslett." } {
+ journal "{ACM} SIGNUM Newsletter" = { "ACM SIGNUM Newslett." } {
+ journal "{ACM} {SIGNUM} Newsletter" = { "ACM SIGNUM Newslett." } {
+ journal "American Journal of Sociology" = { "Amer. J. Sociology" } {
+ journal "American Mathematical Monthly" = { "Amer. Math. Monthly" } {
+ journal "American Mathematical Society Translations" = { "Amer. Math. Soc. Transl." } {
+ journal "Applied Mathematics and Computation" = { "Appl. Math. Comput." } {
+ journal "British Journal of Mathematical and Statistical Psychology" = { "Brit. J. Math. Statist. Psych." } {
+ journal "Bulletin of the American Mathematical Society" = { "Bull. Amer. Math. Soc." } {
+ journal "Canadian Mathematical Bulletin" = { "Canad. Math. Bull." } {
+ journal "Communications of the ACM" = { "Commun. ACM" } {
+ journal "Communications of the {ACM}" = { "Commun. ACM" } {
+ journal "Computers and Structures" = { "Comput. \& Structures" } {
+ journal "Contemporary Mathematics" = { "Contemp. Math." } {
+ journal "Crelle's Journal" = { "Crelle's J." } {
+ journal "Giornale di Mathematiche" = { "Giorn. Mat." } {
+ journal "IEEE Transactions on Aerospace and Electronic Systems" = { "IEEE Trans. Aerospace Electron. Systems" } {
+ journal "{IEEE} Transactions on Aerospace and Electronic Systems" = { "IEEE Trans. Aerospace Electron. Systems" } {
+ journal "IEEE Transactions on Automatic Control" = { "IEEE Trans. Automat. Control" } {
+ journal "{IEEE} Transactions on Automatic Control" = { "IEEE Trans. Automat. Control" } {
+ journal "IEEE Transactions on Computers" = { "IEEE Trans. Comput." } {
+ journal "{IEEE} Transactions on Computers" = { "IEEE Trans. Comput." } {
+ journal "IMA Journal of Numerical Analysis" = { "IMA J. Numer. Anal." } {
+ journal "{IMA} Journal of Numerical Analysis" = { "IMA J. Numer. Anal." } {
+ journal "Information Processing Letters" = { "Inform. Process. Lett." } {
+ journal "International Journal for Numerical Methods in Engineering" = { "Internat. J. Numer. Methods Engrg." } {
+ journal "International Journal of Control" = { "Internat. J. Control" } {
+ journal "International Journal of Supercomputing Applications" = { "Internat. J. Supercomputing Applic." } {
+ journal "Journal of Computational Physics" = { "J. Comput. Phys." } {
+ journal "Journal of Computational and Applied Mathematics" = { "J. Comput. Appl. Math." } {
+ journal "Journal of Computer and System Sciences" = { "J. Comput. System Sci." } {
+ journal "Journal of Mathematical Analysis and Applications" = { "J. Math. Anal. Appl." } {
+ journal "Journal of Mathematical Physics" = { "J. Math. Phys." } {
+ journal "Journal of Parallel and Distributed Computing" = { "J. Parallel and Distrib. Comput." } {
+ journal "Journal of Research of the National Bureau of Standards" = { "J. Res. Nat. Bur. Standards" } {
+ journal "Journal of VLSI and Computer Systems" = { "J. VLSI Comput. Syst." } {
+ journal "Journal of {VLSI} and Computer Systems" = { "J. VLSI Comput. Syst." } {
+ journal "Journal of the ACM" = { "J. ACM" } {
+ journal "Journal of the American Statistical Association" = { "J. Amer. Statist. Assoc." } {
+ journal "Journal of the Institute of Mathematics and its Applications" = { "J. Inst. Math. Appl." } {
+ journal "Journal of the Society for Industrial and Applied Mathematics" = { "J. Soc. Indust. Appl. Math." } {
+ journal "Journal of the Society for Industrial and Applied Mathematics, Series B, Numerical Analysis" = { "J. Soc. Indust. Appl. Math. Ser. B Numer. Anal." } {
+ journal "Linear Algebra and its Applications" = { "Linear Algebra Appl." } {
+ journal "Mathematica Scandinavica" = { "Math. Scand." } {
+ journal "Mathematical Tables and Other Aids to Computation" = { "Math. Tables Aids Comput." } {
+ journal "Mathematics of Computation" = { "Math. Comp." } {
+ journal "Mathematische Annalen" = { "Math. Ann." } {
+ journal "Numerische Mathematik" = { "Numer. Math." } {
+ journal "Pacific Journal of Mathematics" = { "Pacific J. Math." } {
+ journal "Parallel Computing" = { "Parallel Comput." } {
+ journal "Philosophical Magazine" = { "Philos. Mag." } {
+ journal "Proceedings of the American Mathematical Society" = { "Proc. Amer. Math. Soc." } {
+ journal "Proceedings of the IEEE" = { "Proc. IEEE" } {
+ journal "Proceedings of the {IEEE}" = { "Proc. IEEE" } {
+ journal "Proceedings of the National Academy of Sciences of the USA" = { "Proc. Nat. Acad. Sci. U. S. A." } {
+ journal "Quarterly Journal of Mathematics, Oxford, Series (2)" = { "Quart. J. Math. Oxford Ser. (2)" } {
+ journal "Quarterly of Applied Mathematics" = { "Quart. Appl. Math." } {
+ journal "Review of the International Statisical Institute" = { "Rev. Inst. Internat. Statist." } {
+ journal "SIAM Journal on Algebraic and Discrete Methods" = { "SIAM J. Algebraic Discrete Methods" } {
+ journal "{SIAM} Journal on Algebraic and Discrete Methods" = { "SIAM J. Algebraic Discrete Methods" } {
+ journal "SIAM Journal on Applied Mathematics" = { "SIAM J. Appl. Math." } {
+ journal "{SIAM} Journal on Applied Mathematics" = { "SIAM J. Appl. Math." } {
+ journal "SIAM Journal on Computing" = { "SIAM J. Comput." } {
+ journal "{SIAM} Journal on Computing" = { "SIAM J. Comput." } {
+ journal "SIAM Journal on Matrix Analysis and Applications" = { "SIAM J. Matrix Anal. Appl." } {
+ journal "{SIAM} Journal on Matrix Analysis and Applications" = { "SIAM J. Matrix Anal. Appl." } {
+ journal "SIAM Journal on Numerical Analysis" = { "SIAM J. Numer. Anal." } {
+ journal "{SIAM} Journal on Numerical Analysis" = { "SIAM J. Numer. Anal." } {
+ journal "SIAM Journal on Scientific and Statistical Computing" = { "SIAM J. Sci. Statist. Comput." } {
+ journal "{SIAM} Journal on Scientific and Statistical Computing" = { "SIAM J. Sci. Statist. Comput." } {
+ journal "SIAM Review" = { "SIAM Rev." } {
+ journal "{SIAM} Review" = { "SIAM Rev." } {
+ journal "Software Practice and Experience" = { "Software Prac. Experience" } {
+ journal "Statistical Science" = { "Statist. Sci." } {
+ journal "The Computer Journal" = { "Comput. J." } {
+ journal "Transactions of the American Mathematical Society" = { "Trans. Amer. Math. Soc." } {
+ journal "USSR Computational Mathematics and Mathematical Physics" = { "U. S. S. R. Comput. Math. and Math. Phys." } {
+ journal "{USSR} Computational Mathematics and Mathematical Physics" = { "U. S. S. R. Comput. Math. and Math. Phys." } {
+ journal "Zeitschrift fur Angewandte Mathematik und Mechanik" = { "Z. Angew. Math. Mech." } {
+ journal "Zeitschrift fur Angewandte Mathematik und Physik" = { "Z. Angew. Math. Phys." } {
+ journal
+ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+}
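+
+% Illustrative sketch (not part of the original style; the journal name and
+% abbreviation below are made up): one more mapping can be added by following
+% the same pattern, e.g. inserting
+%
+%   journal "Journal of Great Results" = { "J. Great Res." } {
+%
+% immediately before the bare `journal' fallback and appending one more
+% "} if$" to the closing lines, so that unmatched names continue to fall
+% through unchanged.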
+
+FUNCTION { format.journal.volume.number.day.month.year }
+{
+ % By Young (and Spencer)
+ % GNP - fixed bugs with missing volume, number, and/or pages
+ %
+ % Format journal, volume, number, pages for article types.
+ %
+ journal empty.or.unknown
+ { "no journal in " cite$ * warning$ "" }
+ { "\bibinfo{journal}{"
+ journal.canon.abbrev emphasize *
+ "}" * }
+ if$
+
+ number empty.or.unknown
+ {
+ volume empty.or.unknown
+ { "no number and no volume in " cite$ * warning$ "" * }
+ { " " * " \bibinfo{volume}{" * volume * "}" * }
+ if$
+ }
+ {
+ volume empty.or.unknown
+ {
+ "unusual to have number, but no volume, for " cite$ * warning$
+ " \bibinfo{number}{" * number * "}" *
+ }
+ { " \bibinfo{volume}{" * volume * "}, \bibinfo{number}{" *
+ number * "}" *}
+ if$
+ }
+ if$
+ after.block 'output.state :=
+
+ % Sometimes proceedings are published in journals
+ % In this case we do not want to put year, day and month here
+
+ type$ "inproceedings" =
+ { }
+ {format.day.month.year * }
+ if$
+}
+
+FUNCTION { format.chapter.pages }
+{
+ chapter empty.or.unknown
+ 'format.pages
+ { type empty.or.unknown
+ { "Chapter" } % gnp - changed to mixed case
+ { type "t" change.case$ }
+ if$
+ chapter tie.or.space.connect
+ pages empty.or.unknown
+ {"page numbers missing in " cite$ * warning$} % gnp - added check
+ { ", " * format.pages * }
+ if$
+ }
+ if$
+}
+
+FUNCTION { format.in.emphasize.booktitle }
+{ % jtb: format for collections or proceedings not appearing in a journal
+ booktitle empty.or.unknown
+ { "" }
+ { "In " format.emphasize.booktitle * }
+ if$
+}
+
+FUNCTION { format.in.booktitle }
+{ % jtb: format for proceedings appearing in a journal
+ booktitle empty.or.unknown
+ { "" }
+ { "In \bibinfo{booktitle}{" booktitle * "}" * }
+ if$
+}
+
+FUNCTION { format.in.ed.booktitle }
+{
+ booktitle empty.or.unknown
+ { "" }
+ { editor empty.or.unknown
+ { "In " format.emphasize.booktitle * }
+ % jtb: swapped editor location
+ { "In " format.emphasize.booktitle * ", " * format.editors.fml * }
+ if$
+ }
+ if$
+}
+
+FUNCTION { format.thesis.type }
+{ % call with default type on stack top
+ type empty.or.unknown
+ 'skip$ % use default type
+ {
+ pop$ % discard default type
+ % NO: it is silly to have to brace protect every degree type!: type "t" change.case$
+ type
+ }
+ if$
+}
+
+FUNCTION { format.tr.number }
+{
+ "\bibinfo{type}{"
+ type empty.or.unknown
+ { "{T}echnical {R}eport" }
+ 'type
+ if$
+ "}" * *
+ number empty.or.unknown
+ { "t" change.case$ }
+ %% LOOKS BAD: { "." * number tie.or.space.connect }
+ %% Prefer "Research report RJ687." to "Research report. RJ687."
+ { number tie.or.space.connect }
+ if$
+}
+
+FUNCTION { format.advisor }
+{
+ advisor empty.or.unknown
+ { "" }
+ { "Advisor(s) " advisor * }
+ if$
+}
+
+FUNCTION { format.article.crossref }
+{ "See"
+ "\citeN{" * crossref * "}" *
+}
+
+FUNCTION { format.crossref.editor }
+{
+ editor #1 "{vv~}{ll}" format.name$
+ editor num.names$ duplicate$
+ #2 >
+ { pop$ " et~al\mbox{.}" * } % jrh: avoid spacing problems
+ { #2 <
+ 'skip$
+ { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+ { " et~al\mbox{.}" * } % jrh: avoid spacing problems
+ { " and " * editor #2 "{vv~}{ll}" format.name$ * }
+ if$
+ }
+ if$
+ }
+ if$
+}
+
+FUNCTION { format.book.crossref }
+{
+ volume empty.or.unknown
+ { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+ "In "
+ }
+ { "Volume" volume tie.or.space.connect % gnp - changed to mixed case
+ " of " *
+ }
+ if$
+ editor empty.or.unknown
+ editor field.or.null author field.or.null =
+ or
+ { key empty.or.unknown
+ { series empty.or.unknown
+ { "need editor, key, or series for " cite$ * " to crossref " *
+ crossref * warning$
+ "" *
+ }
+ { series emphasize * }
+ if$
+ }
+ { key * }
+ if$
+ }
+ { format.crossref.editor * }
+ if$
+ " \citeN{" * crossref * "}" *
+}
+
+FUNCTION { format.incoll.inproc.crossref }
+{ "See"
+ " \citeN{" * crossref * "}" *
+}
+
+FUNCTION { format.lab.names }
+{
+ % format.lab.names:
+ %
+ % determines "short" names for the abbreviated author information.
+ % "Long" labels are created in calc.label, using the routine my.full.label
+ % to format author and editor fields.
+ %
+ % There are 4 cases for labels. (n=3 in the example)
+ % a) one author Foo
+ % b) one to n Foo, Bar and Baz
+ % c) use of "and others" Foo, Bar et al.
+ % d) more than n Foo et al.
+
+ 's :=
+ s num.names$ 'numnames :=
+ numnames #2 > % change number to number of others allowed before
+ % forcing "et al".
+ { s #1 "{vv~}{ll}" format.name$ " et~al\mbox{.}" * } % jrh: \mbox{} added
+ {
+ numnames #1 - 'namesleft :=
+ #2 'nameptr :=
+ s #1 "{vv~}{ll}" format.name$
+ { namesleft #0 > }
+ { nameptr numnames =
+ { s nameptr "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+ { " et~al\mbox{.}" * } % jrh: avoid spacing problems
+ { " and " * s nameptr "{vv~}{ll}" format.name$ * }
+ if$
+ }
+ { ", " * s nameptr "{vv~}{ll}" format.name$ * }
+ if$
+ nameptr #1 + 'nameptr :=
+ namesleft #1 - 'namesleft :=
+ }
+ while$
+ }
+ if$
+}
+
+FUNCTION { author.key.label }
+{
+ author empty.or.unknown
+ { key empty.or.unknown
+ { "no key, author in " cite$ * warning$
+ cite$ #1 #3 substring$ }
+ 'key
+ if$
+ }
+ { author format.lab.names }
+ if$
+}
+
+FUNCTION { editor.key.organization.label }
+{ % added - gnp. Provide label formatting by organization if editor is null.
+ editor empty.or.unknown
+ { organization empty.or.unknown
+ { key empty.or.unknown
+ { "no key, editor or organization in " cite$ * warning$
+ cite$ #1 #3 substring$ }
+ 'key
+ if$
+ }
+ { organization }
+ if$
+ }
+ { editor format.lab.names }
+ if$
+}
+
+FUNCTION { author.editor.key.label }
+{
+ author empty.or.unknown
+ { editor empty.or.unknown
+ { key empty.or.unknown
+ { "no key, author, or editor in " cite$ * warning$
+ cite$ #1 #3 substring$ }
+ 'key
+ if$
+ }
+ { editor format.lab.names }
+ if$
+ }
+ { author format.lab.names }
+ if$
+}
+
+FUNCTION { author.editor.key.organization.label }
+{ % added - gnp. Provide label formatting by organization if author is null.
+ author empty.or.unknown
+ { editor empty.or.unknown
+ { organization empty.or.unknown
+ { key empty.or.unknown
+ { "no key, author, editor or organization in " cite$ * warning$
+ cite$ #1 #3 substring$ }
+ 'key
+ if$
+ }
+ { organization }
+ if$
+ }
+ { editor format.lab.names }
+ if$
+ }
+ { author format.lab.names }
+ if$
+}
+
+% Calculate label and leave it on stack
+FUNCTION { calc.basic.label }
+{
+ type$ "book" =
+ type$ "inbook" =
+ or
+ type$ "article" =
+ or
+ 'author.editor.key.label
+ { type$ "proceedings" =
+ type$ "periodical" =
+ or
+ 'editor.key.organization.label
+ { type$ "manual" =
+ 'author.editor.key.organization.label
+ 'author.key.label
+ if$
+ }
+ if$
+ }
+ if$
+ duplicate$
+ year empty.or.unknown
+ { "{[n.\,d.]}" }
+ { year field.or.null purify$ #-1 #4 substring$}
+ if$
+ *
+ 'basic.label.year :=
+}
+
+FUNCTION { calc.label }
+{
+ % Changed - GNP. See also author.editor.organization.sort, editor.organization.sort
+ % Form label for BibTeX entry. The classification of which fields are used
+ % for which type of entry (book, inbook, etc.) are taken from alpha.bst.
+ % The change here from newapa is to also include organization as a
+ % citation label if author or editor is missing.
+
+ calc.basic.label
+
+ author empty.or.unknown % generate the full label citation information.
+ {
+ editor empty.or.unknown
+ {
+ organization empty.or.unknown
+ {
+ key empty.or.unknown
+ {
+ "no author, editor, organization, or key in " cite$ * warning$
+ "??"
+ }
+ { key }
+ if$
+ }
+ { organization }
+ if$
+ }
+ { editor my.full.label }
+ if$
+ }
+ { author my.full.label }
+ if$
+
+ % leave label on the stack, to be popped when required.
+
+ "}{" * swap$ * "}{" *
+ % year field.or.null purify$ #-1 #4 substring$ *
+ %
+ % save the year for sort processing afterwards (adding a, b, c, etc.)
+ %
+ year empty.or.unknown
+ { "{[n.\,d.]}" }
+ { year field.or.null purify$ #-1 #4 substring$}
+ if$
+ 'label.year :=
+}
+
+
+FUNCTION { output.bibitem }
+{
+ newline$
+ "\bibitem[" write$
+ calc.basic.label write$
+ "(" write$
+ sort.year write$
+ ")" write$
+ "]%" writeln
+ " {" write$
+ cite$ write$
+ "}" writeln
+ ""
+ before.all 'output.state :=
+}
+
+
+FUNCTION { output.issue.doi.coden.isxn.lccn.url.eprint }
+{ % enter and return with stack empty
+ %% We switch now from buffered output to output of complete lines, so
+ %% that the Issue .. URL data have their own lines, and are less likely
+ %% to be line-wrapped by BibTeX's short-sighted algorithm, which wraps
+ %% lines longer than 79 characters, backtracking to what it thinks is
+ %% a break point in the string. Any such wrapping MUST be undone to
+ %% prevent percent-newline from appearing in DOIs and URLs. The
+ %% output data are intentionally wrapped in \showxxx{} macros at
+ %% beginning of line, and that supply their own punctuation (if they
+ %% are not defined to suppress output entirely), to make it easier for
+ %% other software to recover them from .bbl files.
+ %%
+ %% It also makes it possible to later change the macro definitions
+ %% to suppress particular output values, or alter their appearance.
+ %%
+ %% Note that it is possible for theses, technical reports, and
+ %% manuals to have ISBNs, and anything that has an ISBN may also
+ %% have an ISSN. When there are no values for these keys, there
+ %% is no output generated for them here.
+
+ "\newblock" writeln
+ after.block 'output.state :=
+
+ output.issue
+ output.isbn
+ output.coden % CODEN is functionally like ISSN, so output them sequentially
+ output.issn
+ output.lccn
+ output.doi % DOI is ALWAYS last according to CrossRef DOI documentation
+ output.eprint
+ output.url % but ACM wants URL last
+}
+
+FUNCTION { output.issue.doi.coden.isxn.lccn.url.eprint.note }
+{ % enter with stack empty, return with empty string on stack
+ output.issue.doi.coden.isxn.lccn.url.eprint
+ note empty.or.unknown
+ { }
+ {
+ "\newblock" writeln
+ output.note
+ }
+ if$
+ ""
+}
+
+FUNCTION { output.issue.doi.coden.isxn.lccn.url.eprint.note.check }
+{ % enter with stack empty, return with empty string on stack
+ output.issue.doi.coden.isxn.lccn.url.eprint
+ note empty.or.unknown
+ { }
+ {
+ "\newblock" writeln
+ output.note.check
+ }
+ if$
+ ""
+}
+
+FUNCTION { article }
+{
+ output.bibitem
+
+ author empty.or.unknown
+ {
+ editor empty.or.unknown
+ { "neither author and editor supplied for " cite$ * warning$ }
+ { format.editors "editor" output.check }
+ if$
+ }
+ { format.authors "author" output.check }
+ if$
+
+ author format.no.key output % added
+ output.year.check % added
+ new.block
+ format.articletitle "title" output.check
+ new.block
+ howpublished empty.or.unknown
+ { }
+ { "\bibinfo{howpublished}{" howpublished "}" * * output }
+ if$
+
+ crossref missing$
+ { format.journal.volume.number.day.month.year output}
+ {
+ "cross reference in @Article{...} is unusual" warning$
+ format.article.crossref output.nonnull
+ }
+ if$
+
+ format.pages.check.without.articleno output
+ format.articleno.numpages output
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { book }
+{
+ output.bibitem
+ author empty.or.unknown
+ { format.editors "author and editor" output.check }
+ { format.authors output.nonnull
+ crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+ if$
+ }
+ if$
+ output.year.check % added
+ new.block
+ format.btitle "title" output.check
+ crossref missing$
+ { new.sentence % jtb: start a new sentence for series/volume
+ format.bvolume output
+ new.block
+ format.number.series output
+ new.sentence
+ publisher "publisher" bibinfo.output.check
+ address "address" bibinfo.output.check % jtb: require address
+ fin.sentence
+ pages empty.or.unknown
+ { format.bookpages } % use bookpages when pages empty
+ { format.pages.check "pages" tie.or.space.connect }
+ if$
+ output
+ }
+ { new.block
+ format.book.crossref output.nonnull
+ }
+ if$
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { booklet }
+{
+ output.bibitem
+ format.authors output
+ author format.key output % added
+ output.year.check % added
+ new.block
+ format.title "title" output.check
+ new.block
+ howpublished empty.or.unknown
+ { }
+ { "\bibinfo{howpublished}{" howpublished "}" * * output }
+ if$
+ address output
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { inbook }
+{
+ output.bibitem
+ author empty.or.unknown
+ { format.editors
+ "author and editor" output.check
+ }
+ { format.authors output.nonnull
+ crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+ if$
+ }
+ if$
+ output.year.check % added
+ new.block
+ format.btitle "title" output.check
+ crossref missing$
+ { new.sentence % jtb: start a new sentence for series/volume
+ format.bvolume output
+ new.block
+ format.number.series output
+ new.sentence
+ publisher "publisher" bibinfo.output.check
+ address "address" bibinfo.output.check % jtb: require address
+ format.bookpages output
+ format.chapter.pages
+ "chapter and pages" output.check % jtb: moved from before publisher
+ }
+ {
+ format.bookpages output
+ format.chapter.pages "chapter and pages" output.check
+ new.block
+ format.book.crossref output.nonnull
+ }
+ if$
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { incollection }
+{
+ output.bibitem
+ format.authors "author" output.check
+ author format.key output % added
+ output.year.check % added
+ new.block
+ format.articletitle "title" output.check
+ new.block
+ crossref missing$
+ { format.in.ed.booktitle "booktitle" output.check
+ new.sentence % jtb: start a new sentence for series/volume
+ format.bvolume output
+ format.number.series output
+ new.sentence
+ publisher "publisher" bibinfo.output.check
+ address "address" bibinfo.output.check % jtb: require address
+ format.bookpages output
+ format.chapter.pages output % gnp - was special.output.nonnull
+ % left out comma before page numbers
+ % jtb: moved from before publisher
+ }
+ {
+ format.incoll.inproc.crossref output.nonnull
+ format.chapter.pages output
+ }
+ if$
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { inproceedings }
+{
+ output.bibitem
+ format.authors "author" output.check
+ author format.key output % added
+ output.year.check % added
+ new.block
+ format.articletitle "title" output.check
+ howpublished empty.or.unknown
+ { }
+ { "\bibinfo{howpublished}{" howpublished "}" * * output.dot.space }
+ if$
+ crossref missing$
+ {
+ journal missing$ % jtb: proceedings appearing in journals
+ { format.in.emphasize.booktitle format.city "booktitle" output.check.dot.space
+ format.series output.removenospace
+ format.editors.fml output % BV 2011/09/27 Moved dot to comma
+ series empty.or.unknown
+ { format.bvolume.noseries output }
+ {}
+ if$
+ new.sentence
+ organization output
+ publisher "publisher" bibinfo.output.check % jtb: require publisher (?)
+ address "address" bibinfo.output.check % jtb: require address
+ format.bookpages output
+ }
+ {
+ format.in.booktitle format.city "booktitle" output.check
+ format.editors.fml output
+ new.sentence
+ format.journal.volume.number.day.month.year output
+ }
+ if$
+ format.articleno output
+ format.pages.check.without.articleno output
+ }
+ {
+ format.incoll.inproc.crossref output.nonnull
+ format.articleno output
+ format.pages.check.without.articleno output
+ }
+ if$
+ format.articleno.numpages output
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { conference } { inproceedings }
+
+FUNCTION { manual }
+{
+ output.bibitem
+ author empty.or.unknown
+ { editor empty.or.unknown
+ { organization "organization" output.check
+ organization format.key output } % if all else fails, use key
+ { format.editors "author and editor" output.check }
+ if$
+ }
+ { format.authors output.nonnull }
+ if$
+ output.year.check % added
+ new.block
+ format.btitle "title" output.check
+ organization address new.block.checkb
+ % jtb: back to normal style: organization, address
+ organization "organization" output.check
+ address output
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { mastersthesis }
+{
+ output.bibitem
+ format.authors "author" output.check
+ author format.key output % added
+ output.year.check % added
+ new.block
+ format.title emphasize "title" output.check % NB: ACM style requires emphasized thesis title
+ new.block
+ "\bibinfo{thesistype}{Master's\ thesis}" format.thesis.type output
+ new.sentence
+ school "school" bibinfo.output.check
+ address empty.or.unknown
+ { }
+ { "\bibinfo{address}{" address * "}" * output }
+ if$
+ new.block
+ format.advisor output
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { misc }
+{
+ output.bibitem
+ format.authors "author" output.check
+ author format.key output % added
+ output.year.check % added
+ title howpublished new.block.checkb
+ format.title output
+ new.block
+ howpublished empty.or.unknown
+ { }
+ { "\bibinfo{howpublished}{" howpublished "}" * * output }
+ if$
+ "" output.nonnull.dot.space
+ calc.format.page.count output
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { online } { manual }
+
+FUNCTION { game } { manual }
+
+FUNCTION { artifactsoftware } { manual }
+
+FUNCTION { artifactdataset } { manual }
+
+FUNCTION { software } { manual }
+
+FUNCTION { dataset } { manual }
+
+FUNCTION { phdthesis }
+{
+ output.bibitem
+ format.authors "author" output.check
+ author format.key output % added
+ output.year.check % added
+ new.block
+ format.title emphasize "title" output.check % NB: ACM style requires emphasized thesis title
+ new.block
+ "\bibinfo{thesistype}{Ph.\,D. Dissertation}" format.thesis.type output
+ new.sentence
+ school "school" bibinfo.output.check
+ address empty.or.unknown
+ { }
+ { "\bibinfo{address}{" address * "}" * output }
+ if$
+ new.block
+ format.advisor output
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION {format.date}
+{ year empty.or.unknown
+ { month empty.or.unknown
+ {
+ "" % output empty date if year/month both empty
+ day empty.or.unknown
+ { }
+ { "there's a day but no month or year in " cite$ * warning$ }
+ if$
+ }
+ { "there's a month but no year in " cite$ * warning$
+ month
+ day empty.or.unknown
+ { }
+ { " " * day * }
+ if$
+ }
+ if$
+ }
+ { month empty.or.unknown
+ {
+ year % output only year if month empty
+ day empty.or.unknown
+ { }
+ { "there's a day and year but no month in " cite$ * warning$ }
+ if$
+ }
+ {
+ month " " *
+ day empty.or.unknown
+ { }
+ { day * ", " * }
+ if$
+ year *
+ }
+ if$
+ }
+ if$
+}
+
+FUNCTION {new.block.checka}
+{
+ empty.or.unknown
+ 'skip$
+ 'new.block
+ if$
+}
+
+FUNCTION { periodical }
+{
+ output.bibitem
+ editor empty.or.unknown
+ { organization output }
+ { format.editors output.nonnull }
+ if$
+ new.block
+ output.year.check
+ new.sentence
+ format.articletitle "title" output.check
+ format.journal.volume.number.day.month.year output
+ calc.format.page.count output
+ fin.entry
+}
+
+FUNCTION { proceedings }
+{
+ output.bibitem
+ editor empty.or.unknown
+ { organization output
+ organization format.key output } % gnp - changed from author format.key
+ { format.editors output.nonnull }
+ if$
+ % author format.key output % gnp - removed (should be either
+ % editor or organization
+ output.year.check % added (newapa)
+ new.block
+ format.btitle format.city "title" output.check % jtb: added city
+ new.sentence
+ format.bvolume output
+ format.number.series output
+ new.sentence
+ organization output
+ % jtb: normal order: publisher, address
+ publisher empty.or.unknown
+ { }
+ { "\bibinfo{publisher}{" publisher * "}" * output }
+ if$
+ address empty.or.unknown
+ { }
+ { "\bibinfo{address}{" address * "}" * output }
+ if$
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { collection } { proceedings }
+
+FUNCTION { techreport }
+{
+ output.bibitem
+ format.authors "author" output.check
+ author format.key output % added
+ output.year.check % added
+ new.block
+ format.btitle "title" output.check
+ new.block
+% format.tr.number output % jtb: moved month ...
+ format.tr.number output new.sentence % Gerry - need dot 2011/09/28
+ institution "institution" bibinfo.output.check
+ address empty.or.unknown
+ { }
+ { "\bibinfo{address}{" address "}" * * output }
+ if$
+ new.sentence
+ format.named.pages output
+ % ACM omits year at end in transactions style
+ % format.day.month.year output.nonnull.dot.space % jtb: ... to here (no parens)
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note
+ fin.entry
+}
+
+FUNCTION { unpublished }
+{
+ output.bibitem
+ format.authors
+ "author" output.check
+ author format.key output % added
+ output.year.check % added
+ new.block
+ format.title "title" output.check
+ fin.sentence
+ output.day.month.year % UTAH
+ calc.format.page.count output
+ fin.block
+ output.issue.doi.coden.isxn.lccn.url.eprint.note.check
+ fin.entry
+}
+
+FUNCTION { default.type } { misc }
+
+%%% ACM journal-style month definitions: full name if 1--5 letters, else
+%%% abbreviation of 3 or 4 characters and a dot
+
+MACRO {jan} {"Jan."}
+
+MACRO {feb} {"Feb."}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"Aug."}
+
+MACRO {sep} {"Sept."}
+
+MACRO {oct} {"Oct."}
+
+MACRO {nov} {"Nov."}
+
+MACRO {dec} {"Dec."}
+
+%%% ACM journal names
+
+MACRO {cie} {"ACM Computers in Entertainment"}
+MACRO {csur} {"ACM Computing Surveys"}
+MACRO {dgov} {"Digital Government: Research and Practice"}
+MACRO {dtrap} {"Digital Threats: Research and Practice"}
+MACRO {health} {"ACM Transactions on Computing for Healthcare"}
+MACRO {imwut} {"PACM on Interactive, Mobile, Wearable and Ubiquitous Technologies"}
+MACRO {jacm} {"Journal of the ACM"}
+MACRO {jdiq} {"ACM Journal of Data and Information Quality"}
+MACRO {jea} {"ACM Journal of Experimental Algorithmics"}
+MACRO {jeric} {"ACM Journal of Educational Resources in Computing"}
+MACRO {jetc} {"ACM Journal on Emerging Technologies in Computing Systems"}
+MACRO {jocch} {"ACM Journal on Computing and Cultural Heritage"}
+MACRO {pacmcgit} {"Proceedings of the ACM on Computer Graphics and Interactive Techniques"}
+MACRO {pacmhci} {"PACM on Human-Computer Interaction"}
+MACRO {pacmpl} {"PACM on Programming Languages"}
+MACRO {pomacs} {"PACM on Measurement and Analysis of Computing Systems"}
+MACRO {taas} {"ACM Transactions on Autonomous and Adaptive Systems"}
+MACRO {taccess} {"ACM Transactions on Accessible Computing"}
+MACRO {taco} {"ACM Transactions on Architecture and Code Optimization"}
+MACRO {talg} {"ACM Transactions on Algorithms"}
+MACRO {tallip} {"ACM Transactions on Asian and Low-Resource Language Information Processing"}
+MACRO {tap} {"ACM Transactions on Applied Perception"}
+MACRO {tcps} {"ACM Transactions on Cyber-Physical Systems"}
+MACRO {tds} {"ACM/IMS Transactions on Data Science"}
+MACRO {teac} {"ACM Transactions on Economics and Computation"}
+MACRO {tecs} {"ACM Transactions on Embedded Computing Systems"}
+MACRO {telo} {"ACM Transactions on Evolutionary Learning"}
+MACRO {thri} {"ACM Transactions on Human-Robot Interaction"}
+MACRO {tiis} {"ACM Transactions on Interactive Intelligent Systems"}
+MACRO {tiot} {"ACM Transactions on Internet of Things"}
+MACRO {tissec} {"ACM Transactions on Information and System Security"}
+MACRO {tist} {"ACM Transactions on Intelligent Systems and Technology"}
+MACRO {tkdd} {"ACM Transactions on Knowledge Discovery from Data"}
+MACRO {tmis} {"ACM Transactions on Management Information Systems"}
+MACRO {toce} {"ACM Transactions on Computing Education"}
+MACRO {tochi} {"ACM Transactions on Computer-Human Interaction"}
+MACRO {tocl} {"ACM Transactions on Computational Logic"}
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+MACRO {toct} {"ACM Transactions on Computation Theory"}
+MACRO {todaes} {"ACM Transactions on Design Automation of Electronic Systems"}
+MACRO {tods} {"ACM Transactions on Database Systems"}
+MACRO {tog} {"ACM Transactions on Graphics"}
+MACRO {tois} {"ACM Transactions on Information Systems"}
+MACRO {toit} {"ACM Transactions on Internet Technology"}
+MACRO {tomacs} {"ACM Transactions on Modeling and Computer Simulation"}
+MACRO {tomm} {"ACM Transactions on Multimedia Computing, Communications and Applications"}
+MACRO {tompecs} {"ACM Transactions on Modeling and Performance Evaluation of Computing Systems"}
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+MACRO {topc} {"ACM Transactions on Parallel Computing"}
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+MACRO {tops} {"ACM Transactions on Privacy and Security"}
+MACRO {tos} {"ACM Transactions on Storage"}
+MACRO {tosem} {"ACM Transactions on Software Engineering and Methodology"}
+MACRO {tosn} {"ACM Transactions on Sensor Networks"}
+MACRO {tqc} {"ACM Transactions on Quantum Computing"}
+MACRO {trets} {"ACM Transactions on Reconfigurable Technology and Systems"}
+MACRO {tsas} {"ACM Transactions on Spatial Algorithms and Systems"}
+MACRO {tsc} {"ACM Transactions on Social Computing"}
+MACRO {tslp} {"ACM Transactions on Speech and Language Processing"}
+MACRO {tweb} {"ACM Transactions on the Web"}
+
+%%% Some traditional macros
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+
+
+
+READ
+
+FUNCTION { sortify }
+{
+ purify$
+ "l" change.case$
+}
+
+FUNCTION { chop.word }
+{
+ 's :=
+ 'len :=
+ s #1 len substring$ =
+ { s len #1 + global.max$ substring$ }
+ 's
+ if$
+}
+
+FUNCTION { sort.format.names }
+{
+ 's :=
+ #1 'nameptr :=
+ ""
+ s num.names$ 'numnames :=
+ numnames 'namesleft :=
+ { namesleft #0 > }
+ { nameptr #1 >
+ { " " * }
+ 'skip$
+ if$
+ s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't :=
+ nameptr numnames = t "others" = and
+ { " et~al" * }
+ { t sortify * }
+ if$
+ nameptr #1 + 'nameptr :=
+ namesleft #1 - 'namesleft :=
+ }
+ while$
+}
+
+FUNCTION { sort.format.title }
+{
+ 't :=
+ "A " #2
+ "An " #3
+ "The " #4 t chop.word
+ chop.word
+ chop.word
+ sortify
+ #1 global.max$ substring$
+}
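+
+% Illustrative note (not part of the original style): sort.format.title strips
+% a leading "A ", "An ", or "The " before sorting, so a title such as
+% "The Art of Computer Programming" is sorted as "art of computer programming".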
+
+FUNCTION { author.sort }
+{
+ author empty.or.unknown
+ { key empty.or.unknown
+ { "to sort, need author or key in " cite$ * warning$
+ "" }
+ { key sortify }
+ if$
+ }
+ { author sort.format.names }
+ if$
+}
+
+FUNCTION { author.editor.sort }
+{
+ author empty.or.unknown
+ {
+ editor empty.or.unknown
+ {
+ key empty.or.unknown
+ { "to sort, need author, editor, or key in " cite$ * warning$
+ ""
+ }
+ { key sortify }
+ if$
+ }
+ { editor sort.format.names }
+ if$
+ }
+ { author sort.format.names }
+ if$
+}
+
+FUNCTION { editor.organization.sort }
+{
+ % added - GNP. Stack editor or organization for sorting (from alpha.bst).
+ % Unlike alpha.bst, we need entire names, not abbreviations
+
+ editor empty.or.unknown
+ { organization empty.or.unknown
+ { key empty.or.unknown
+ { "to sort, need editor, organization, or key in " cite$ * warning$
+ ""
+ }
+ { key sortify }
+ if$
+ }
+ { organization sortify }
+ if$
+ }
+ { editor sort.format.names }
+ if$
+}
+
+FUNCTION { author.editor.organization.sort }
+{
+ % added - GNP. Stack author or organization for sorting (from alpha.bst).
+ % Unlike alpha.bst, we need entire names, not abbreviations
+
+ author empty.or.unknown
+ {
+ editor empty.or.unknown
+ { organization empty.or.unknown
+ { key empty.or.unknown
+ { "to sort, need author, editor, or key in " cite$ * warning$
+ ""
+ }
+ { key sortify }
+ if$
+ }
+ { organization sortify }
+ if$
+ }
+ { editor sort.format.names }
+ if$
+ }
+ { author sort.format.names }
+ if$
+}
+
+FUNCTION { presort }
+{
+ % Presort creates the bibentry's label via a call to calc.label, and then
+ % sorts the entries based on entry type. Chicago.bst adds support for
+ % including organizations as the sort key; the following is stolen from
+ % alpha.bst.
+
+ calc.label
+ basic.label.year
+ swap$
+ " "
+ swap$
+ * *
+ " "
+ *
+ sortify
+ year field.or.null purify$ #-1 #4 substring$ * % add year
+ " "
+ *
+ type$ "book" =
+ type$ "inbook" =
+ or
+ type$ "article" =
+ or
+ 'author.editor.sort
+ { type$ "proceedings" =
+ type$ "periodical" =
+ or
+ 'editor.organization.sort
+ { type$ "manual" =
+ 'author.editor.organization.sort
+ 'author.sort
+ if$
+ }
+ if$
+ }
+ if$
+ #1 entry.max$ substring$ % added for newapa
+ 'sort.label := % added for newapa
+ sort.label % added for newapa
+ *
+ " "
+ *
+ title field.or.null
+ sort.format.title
+ *
+ #1 entry.max$ substring$
+ 'sort.key$ :=
+}
+
+
+
+ITERATE { presort }
+
+SORT % by label, year, author/editor, title
+
+% From plainnat.bst
+STRINGS { longest.label }
+
+INTEGERS { longest.label.width number.label }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+ #0 int.to.chr$ 'last.label :=
+ "" 'next.extra :=
+ #0 'longest.label.width :=
+ #0 'last.extra.num :=
+ #0 'number.label :=
+}
+
+
+
+FUNCTION { initialize.extra.label.stuff }
+{ #0 int.to.chr$ 'last.label :=
+ "" 'next.extra :=
+ #0 'last.extra.num :=
+}
+
+FUNCTION { forward.pass }
+{
+ % Pass through all entries, comparing current entry to last one.
+ % Need to concatenate year to the stack (done by calc.label) to determine
+ % if two entries are the same (see presort)
+
+ last.label
+ calc.basic.label year field.or.null purify$ #-1 #4 substring$ * % add year
+ #1 entry.max$ substring$ = % are they equal?
+ { last.extra.num #1 + 'last.extra.num :=
+ last.extra.num int.to.chr$ 'extra.label :=
+ }
+ { "a" chr.to.int$ 'last.extra.num :=
+ "" 'extra.label :=
+ calc.basic.label year field.or.null purify$ #-1 #4 substring$ * % add year
+ #1 entry.max$ substring$ 'last.label := % assign to last.label
+ }
+ if$
+ number.label #1 + 'number.label :=
+}
+
+FUNCTION { reverse.pass }
+{
+ next.extra "b" =
+ { "a" 'extra.label := }
+ 'skip$
+ if$
+ label.year extra.label * 'sort.year :=
+ extra.label 'next.extra :=
+}
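+
+% Illustrative note (not part of the original style): when consecutive sorted
+% entries share the same author label and year, forward.pass and reverse.pass
+% append distinguishing letters, so two 2020 papers by the same author appear
+% as 2020a and 2020b in the generated labels.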
+
+EXECUTE {initialize.extra.label.stuff}
+EXECUTE {initialize.longest.label}
+
+
+ITERATE {forward.pass}
+
+REVERSE {reverse.pass}
+
+FUNCTION { bib.sort.order }
+{
+ sort.label
+ " "
+ *
+ year field.or.null sortify
+ *
+ " "
+ *
+ title field.or.null
+ sort.format.title
+ *
+ #1 entry.max$ substring$
+ 'sort.key$ :=
+}
+
+ITERATE { bib.sort.order }
+
+SORT % by sort.label, year, title --- giving final bib. order.
+
+FUNCTION { begin.bib }
+{
+ %% Set to #0 to show 13-digit ISBN in preference to 10-digit ISBN.
+ %% Set to #1 to show both 10-digit and 13-digit ISBNs.
+ #1 'show-isbn-10-and-13 :=
+
+ "%%% -*-BibTeX-*-" writeln
+ "%%% Do NOT edit. File created by BibTeX with style" writeln
+ "%%% ACM-Reference-Format-Journals [18-Jan-2012]." writeln
+ "" writeln
+
+ preamble$ empty.or.unknown
+ 'skip$
+ { preamble$ writeln }
+ if$
+ "\begin{thebibliography}{" number.label int.to.str$ * "}" * writeln
+ "" writeln
+ "%%% ====================================================================" writeln
+ "%%% NOTE TO THE USER: you can override these defaults by providing" writeln
+ "%%% customized versions of any of these macros before the \bibliography" writeln
+ "%%% command. Each of them MUST provide its own final punctuation," writeln
+ "%%% except for \shownote{}, \showDOI{}, and \showURL{}. The latter two" writeln
+ "%%% do not use final punctuation, in order to avoid confusing it with" writeln
+ "%%% the Web address." writeln
+ "%%%" writeln
+ "%%% To suppress output of a particular field, define its macro to expand" writeln
+ "%%% to an empty string, or better, \unskip, like this:" writeln
+ "%%%" writeln
+ "%%% \newcommand{\showDOI}[1]{\unskip} % LaTeX syntax" writeln
+ "%%%" writeln
+ "%%% \def \showDOI #1{\unskip} % plain TeX syntax" writeln
+ "%%%" writeln
+ "%%% ====================================================================" writeln
+ "" writeln
+
+ %% ACM publications do not use CODEN, ISSN, and LCCN data, so their default
+ %% macro wrappers expand to \unskip, discarding their values and unwanted
+ %% space.
+ %%
+ %% For other publications, prior definitions like these may be useful:
+ %%
+ %% Plain TeX:
+ %% \def \showCODEN #1{CODEN #1.}
+ %% \def \showISSN #1{ISSN #1.}
+ %% \def \showLCCN #1{LCCN #1.}
+ %%
+ %% LaTeX:
+ %% \newcommand{\showCODEN}[1]{CODEN #1.}
+ %% \newcommand{\showISSN}[1]{ISSN #1.}
+ %% \newcommand{\showLCCN}[1]{LCCN #1.}
+
+ "\ifx \showCODEN \undefined \def \showCODEN #1{\unskip} \fi" writeln
+ "\ifx \showDOI \undefined \def \showDOI #1{#1}\fi" writeln
+ % ACM styles omit ISBNs, but they can be included by suitable definitions of
+ % \showISBNx and \showISBNxiii before the .bbl file is read
+ "\ifx \showISBNx \undefined \def \showISBNx #1{\unskip} \fi" writeln
+ "\ifx \showISBNxiii \undefined \def \showISBNxiii #1{\unskip} \fi" writeln
+ "\ifx \showISSN \undefined \def \showISSN #1{\unskip} \fi" writeln
+ "\ifx \showLCCN \undefined \def \showLCCN #1{\unskip} \fi" writeln
+ "\ifx \shownote \undefined \def \shownote #1{#1} \fi" writeln % NB: final period supplied by add.period$ above
+ "\ifx \showarticletitle \undefined \def \showarticletitle #1{#1} \fi" writeln
+ "\ifx \showURL \undefined \def \showURL {\relax} \fi" writeln
+ "% The following commands are used for tagged output and should be " writeln
+ "% invisible to TeX" writeln
+ "\providecommand\bibfield[2]{#2}" writeln
+ "\providecommand\bibinfo[2]{#2}" writeln
+ "\providecommand\natexlab[1]{#1}" writeln
+ "\providecommand\showeprint[2][]{arXiv:#2}" writeln
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION { end.bib }
+{
+ newline$
+ "\end{thebibliography}"
+ writeln
+}
+
+EXECUTE {end.bib}
diff --git a/cls/psuthesis.cls b/cls/psuthesis.cls
new file mode 100644
index 0000000..97c81ab
--- /dev/null
+++ b/cls/psuthesis.cls
@@ -0,0 +1,915 @@
+%%% ======================================================================
+%%% @LaTeX-file{
+%%% filename = "psuthesis.cls",
+%%% version = "2.9.2",
+%%% date = "2019/07/10",
+%%% time = "16:00:00 EDT",
+%%% author = "Gary L. Gray",
+%%% copyright = "Gary L. Gray",
+%%% address = "Engineering Science and Mechanics,
+%%% 212 Earth & Engineering Sciences Bldg.,
+%%% Penn State University,
+%%% University Park, PA 16802,
+%%% USA",
+%%% telephone = "814-863-1778",
+%%% email = "gray@psu.edu",
+%%% keywords = "latex, psuthesis, thesis class",
+%%% supported = "yes",
+%%% abstract = "This package provides a style for typesetting
+%%% Penn State theses at the bachelors, masters,
+%%% or Ph.D. level."
+%%% }
+%%% ======================================================================
+% Change History
+% The change history can be found in the accompanying document entitled
+% "psuthesis class change history.md".
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\NeedsTeXFormat{LaTeX2e}[1995/06/01]
+\ProvidesClass{psuthesis}[2019/07/10 v2.9.2 psuthesis class]
+\RequirePackage{ifthen}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+% Declare options for different degree types.
+% Allowable degrees are:
+% Ph.D. using class option <phd>
+% M.S. using class option <ms>
+% M.Eng. using class option <meng>
+% M.A. using class option <ma>
+% B.S. using class option <bs>
+% B.A. using class option <ba>
+% Bachelors degree with Schreyer honors using class option <schreyer>
+%
+% Specifying an option sets the boolean for that option to
+% true and all others to false.
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
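+% Illustrative usage sketch (not part of the original class; the option
+% combinations shown are only examples): a Ph.D. dissertation would load the
+% class as
+%
+%   \documentclass[phd,12pt]{psuthesis}
+%
+% while an honors baccalaureate thesis might use
+%
+%   \documentclass[bs,schreyer]{psuthesis}
+%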
+\newboolean{psu@secondthesissupervisor}
+\newboolean{psu@schreyer}
+\newboolean{psu@esc}
+\newboolean{psu@twoha}
+\newboolean{psu@bs}
+\newboolean{psu@ba}
+\newboolean{psu@ms}
+\newboolean{psu@meng}
+\newboolean{psu@ma}
+\newboolean{psu@phd}
+\newboolean{psu@toc}
+
+\setboolean{psu@secondthesissupervisor}{false}
+\setboolean{psu@schreyer}{false}
+\setboolean{psu@esc}{false}
+\setboolean{psu@twoha}{false}
+\setboolean{psu@bs}{false}
+\setboolean{psu@ba}{false}
+\setboolean{psu@ms}{false}
+\setboolean{psu@meng}{false}
+\setboolean{psu@ma}{false}
+\setboolean{psu@phd}{false}
+
+\DeclareOption{bs}{\setboolean{psu@bs}{true}\setboolean{psu@phd}{false}}
+\DeclareOption{ba}{\setboolean{psu@ba}{true}\setboolean{psu@phd}{false}}
+\DeclareOption{ms}{\setboolean{psu@ms}{true}\setboolean{psu@phd}{false}}
+\DeclareOption{meng}{\setboolean{psu@meng}{true}\setboolean{psu@phd}{false}}
+\DeclareOption{ma}{\setboolean{psu@ma}{true}\setboolean{psu@phd}{false}}
+\DeclareOption{phd}{\setboolean{psu@phd}{true}}
+\DeclareOption{inlinechaptertoc}{\setboolean{psu@toc}{true}}
+\DeclareOption{secondthesissupervisor}{\setboolean{psu@secondthesissupervisor}{true}}
+\DeclareOption{schreyer}{\setboolean{psu@schreyer}{true}}%
+\DeclareOption{twoha}{\setboolean{psu@twoha}{true}}%
+\DeclareOption{esc}{\setboolean{psu@esc}{true}}%
+\setboolean{psu@bs}{false}
+\setboolean{psu@ba}{false}
+\setboolean{psu@phd}{false}
+\setboolean{psu@twoha}{false}
+
+\DeclareOption{draft}{\PassOptionsToClass{\CurrentOption}{book}}
+\DeclareOption{10pt}{\PassOptionsToClass{\CurrentOption}{book}}
+\DeclareOption{11pt}{\PassOptionsToClass{\CurrentOption}{book}}
+\DeclareOption{12pt}{\PassOptionsToClass{\CurrentOption}{book}}
+\DeclareOption*{\PackageWarning{psuthesis}{Unknown option `\CurrentOption'. Ignoring}}
+\ExecuteOptions{phd} % the default option is <phd>
+\ProcessOptions
+\LoadClass[openany,oneside]{book}
+\RequirePackage{calc}
+\RequirePackage{setspace}
+% If you are using the subfigure package, load the tocloft package with
+% the subfigure option and comment out the next line.
+%\RequirePackage{tocloft}[2003/09/26]
+\RequirePackage[subfigure]{tocloft}[2003/09/26]
+\RequirePackage{fancyhdr}
+\RequirePackage[overload]{textcase}
+\RequirePackage[letterpaper, left = 1.4in, right = 0.9in, top = 0.9in, bottom = 0.9in, includefoot]{geometry}
+\RequirePackage{twoopt}
+
+%%%%%%%%%%%%%%%%%%%%%%%%
+% Settings for tocloft %
+%%%%%%%%%%%%%%%%%%%%%%%%
+% Format chapter entries so that the chapter name goes on a line
+% following "Chapter #".
+\renewcommand{\@pnumwidth}{1.75em} % remove TOC margin errors
+\renewcommand{\@tocrmarg}{2.75em}
+\newlength{\mylength}% a "scratch" length
+\newlength{\mylonglength}% another "scratch" length
+\ifthenelse{\boolean{psu@toc}}
+{%
+% Format chapter entries so that the chapter name goes on the same line
+% as "Chapter #".
+\renewcommand{\cftchappresnum}{Chapter }
+\settowidth{\mylength}{\bfseries\cftchappresnum\cftchapaftersnum}% extra space
+\addtolength{\cftchapnumwidth}{\mylength} % add the extra space
+%
+\newcommand{\mylongname}{Appendix }% the longest chapter number header
+\settowidth{\mylonglength}{\bfseries\mylongname\cftchapaftersnum}% extra space
+}
+{%
+\renewcommand{\cftchappresnum}{Chapter }
+\renewcommand{\cftchapaftersnumb}{\\ \mbox{}\hspace{-\mylength}\hspace{-0.1em}}
+\settowidth{\mylength}{\bfseries\cftchappresnum\cftchapaftersnum} % extra space
+\addtolength{\cftchapnumwidth}{\mylength+0.1em} % add the extra space\renewcommand{\cftchapfont}{\bfseries}
+}%
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+% Here I define internal "commands" that will be used to store the
+% thesis title, author name, department, etc.
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+% Store the title of the thesis.
+\newcommand{\psu@title}{Theory of Everything}
+\renewcommand{\title}[1]{\renewcommand{\psu@title}{#1}}
+
+% Store the author's name.
+\newcommand{\psu@author}{Richard Feynman}
+\renewcommand{\author}[1]{\renewcommand{\psu@author}{#1}}
+
+% Store the department name.
+\newcommand{\psu@dept}{ESM}
+\providecommand{\dept}[1]{\renewcommand{\psu@dept}{#1}}
+
+% Store the date the degree will be conferred.
+\newcommand{\psu@degreedate}{May 1900}
+\providecommand{\degreedate}[1]{\renewcommand{\psu@degreedate}{#1}}
+
+% Store the year of the copyright.
+\newcommand{\psu@copyrightyear}{1900}
+\providecommand{\copyrightyear}[1]{\renewcommand{\psu@copyrightyear}{#1}}
+
+% Store the document type.
+\newcommand{\psu@documenttype}{Thesis}
+\providecommand{\documenttype}[1]{\renewcommand{\psu@documenttype}{#1}}
+
+% Store the academic unit to which the document has been submitted.
+\newcommand{\psu@submittedto}{The Graduate School}
+\providecommand{\submittedto}[1]{\renewcommand{\psu@submittedto}{#1}}
+
+% Store the College to which the document has been submitted.
+\newcommand{\psu@collegesubmittedto}{College of Engineering}
+\providecommand{\collegesubmittedto}[1]{\renewcommand{\psu@collegesubmittedto}{#1}}
+
+% Store the the info for the honors degree(s) type(s).
+\newcommand{\psu@bachelorsdegreeinfo}{for a baccalaureate degree(s) \\ in Biology and Physics \\ with honors in Computer Engineering}
+\providecommand{\bachelorsdegreeinfo}[1]{\renewcommand{\psu@bachelorsdegreeinfo}{#1}}
+
+% Store the name of the department head (used with the <esc> option).
+\newcommand{\psu@escdepthead}{Department Q. Head}
+\providecommand{\escdepthead}[1]{\renewcommand{\psu@escdepthead}{#1}}
+
+% Store the title of the department head (used with the <esc> option).
+\newcommand{\psu@escdeptheadtitle}{Department Q. Head}
+\providecommand{\escdeptheadtitle}[1]{\renewcommand{\psu@escdeptheadtitle}{#1}}
+
+% Store the name of the second Thesis Supervisor for a baccalaureate degree.
+\newcommand{\psu@secondthesissupervisor}{Second Q. Jones}
+\providecommand{\secondthesissupervisor}[1]{\renewcommand{\psu@secondthesissupervisor}{#1}}
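+
+% Illustrative usage sketch (not part of the original class; all values are
+% placeholders): these storage commands are intended to be called from the
+% document preamble, e.g.
+%
+%   \title{A Study of Something}
+%   \author{Jane Q. Student}
+%   \dept{Computer Science and Engineering}
+%   \degreedate{May 2025}
+%   \copyrightyear{2025}
+%   \documenttype{Dissertation}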
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+% Store the name of the degree by determining which boolean was
+% set in the class option was specified.
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\ifthenelse{\boolean{psu@bs}}%
+{\newcommand{\psu@degreetype}{Baccalaureate of Science}}%
+{}
+
+\ifthenelse{\boolean{psu@ba}}%
+{\newcommand{\psu@degreetype}{Baccalaureate of Arts}}%
+{}
+
+\ifthenelse{\boolean{psu@ms}}%
+{\newcommand{\psu@degreetype}{Master of Science}}%
+{}
+
+\ifthenelse{\boolean{psu@meng}}%
+{\newcommand{\psu@degreetype}{Master of Engineering}}%
+{}
+
+\ifthenelse{\boolean{psu@ma}}%
+{\newcommand{\psu@degreetype}{Master of Arts}}%
+{}
+
+\ifthenelse{\boolean{psu@phd}}%
+{\newcommand{\psu@degreetype}{Doctor of Philosophy}}%
+{}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+% Store the number of readers in \psu@readers. This quantity is
+% input in the main file using the \numberofreaders command.
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\newcommand{\psu@readers}{4}
+\providecommand{\numberofreaders}[1]{\renewcommand{\psu@readers}{#1}}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\newcommand{\psu@honorsadvisor}{Name of Honors Advisor}%
+\newcommand{\psu@honorsadvisortitle}{Associate Professor of Something}%
+\providecommand{\honorsadvisor}[2]%
+{\renewcommand{\psu@honorsadvisor}{#1}%
+\renewcommand{\psu@honorsadvisortitle}{#2}}
+
+\newcommand{\psu@honorsadvisortwo}{Name of Second Honors Advisor}%
+\newcommand{\psu@honorsadvisortwotitle}{Associate Professor of Something}%
+\providecommand{\honorsadvisortwo}[2]%
+{\renewcommand{\psu@honorsadvisortwo}{#1}%
+\renewcommand{\psu@honorsadvisortwotitle}{#2}}
+
+%\newcommand{\psu@advisor}{John Doe}
+%\newcommand{\psu@advisortitle}{John Doe}
+%\newcommand{\psu@advisoroption}{}%
+%\providecommand{\advisor}[3][]%
+%{\renewcommand{\psu@advisoroption}{#1}%
+%\renewcommand{\psu@advisor}{#2}%
+%\renewcommand{\psu@advisortitle}{#3}}
+
+\newcommand{\psu@advisor}{John Doe}
+\newcommand{\psu@advisortitle}{John Doe}
+\newcommand{\psu@advisoroptionone}{}%
+\newcommand{\psu@advisoroptiontwo}{}%
+\providecommandtwoopt{\advisor}[4][][]%
+{\renewcommand{\psu@advisoroptionone}{#1}%
+\renewcommand{\psu@advisoroptiontwo}{#2}%
+\renewcommand{\psu@advisor}{#3}%
+\renewcommand{\psu@advisortitle}{#4}}
+
+\newcommand{\psu@readerone}{John Doe}
+\newcommand{\psu@readeronetitle}{John Doe}
+\newcommand{\psu@readeroneoption}{}%
+\providecommand{\readerone}[3][]%
+{\renewcommand{\psu@readeroneoption}{#1}%
+\renewcommand{\psu@readerone}{#2}%
+\renewcommand{\psu@readeronetitle}{#3}}
+
+\newcommand{\psu@readertwo}{John Doe}
+\newcommand{\psu@readertwotitle}{John Doe}
+\newcommand{\psu@readertwooption}{}%
+\providecommand{\readertwo}[3][]%
+{\renewcommand{\psu@readertwooption}{#1}%
+\renewcommand{\psu@readertwo}{#2}%
+\renewcommand{\psu@readertwotitle}{#3}}
+
+\newcommand{\psu@readerthree}{John Doe}
+\newcommand{\psu@readerthreetitle}{John Doe}
+\newcommand{\psu@readerthreeoption}{}%
+\providecommand{\readerthree}[3][]%
+{\renewcommand{\psu@readerthreeoption}{#1}%
+\renewcommand{\psu@readerthree}{#2}%
+\renewcommand{\psu@readerthreetitle}{#3}}
+
+\newcommand{\psu@readerfour}{John Doe}
+\newcommand{\psu@readerfourtitle}{John Doe}
+\newcommand{\psu@readerfouroption}{}%
+\providecommand{\readerfour}[3][]%
+{\renewcommand{\psu@readerfouroption}{#1}%
+\renewcommand{\psu@readerfour}{#2}%
+\renewcommand{\psu@readerfourtitle}{#3}}
+
+\newcommand{\psu@readerfive}{John Doe}
+\newcommand{\psu@readerfivetitle}{John Doe}
+\newcommand{\psu@readerfiveoption}{}%
+\providecommand{\readerfive}[3][]%
+{\renewcommand{\psu@readerfiveoption}{#1}%
+\renewcommand{\psu@readerfive}{#2}%
+\renewcommand{\psu@readerfivetitle}{#3}}
+
+\newcommand{\psu@readersix}{John Doe}
+\newcommand{\psu@readersixtitle}{John Doe}
+\newcommand{\psu@readersixoption}{}%
+\providecommand{\readersix}[3][]%
+{\renewcommand{\psu@readersixoption}{#1}%
+\renewcommand{\psu@readersix}{#2}%
+\renewcommand{\psu@readersixtitle}{#3}}
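+
+% Illustrative usage sketch (not part of the original class; the names and
+% titles are placeholders): \advisor accepts up to two optional arguments and
+% each reader command one, e.g.
+%
+%   \advisor[Chair of Committee]{John Q. Advisor}{Professor of Computer Science}
+%   \readerone{Alice B. Reader}{Associate Professor of Mathematics}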
+
+
+\newsavebox{\tempbox}
+\renewcommand{\@makecaption}[2]{%
+\vspace{7pt}\sbox{\tempbox}{\small\textbf{#1.} #2}%
+\ifthenelse{\lengthtest{\wd\tempbox > \linewidth}}%
+{\small\textbf{#1.} #2\par}%
+{\centering \small\textbf{#1.} #2}%
+}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% %
+% The actual layout begins here. %
+% %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+% Here is the permission page.
+%\newcommand{\psupermissionpage}{%
+%\thispagestyle{empty}
+%\begin{singlespace}
+%\noindent
+%I grant The Pennsylvania State University the non-exclusive right to use this work for the University's own purposes and to make single copies of the work available to the public on a not-for-profit basis if copies are not otherwise available. \\[0.6in]
+%\mbox{} \hfill
+%\parbox{3in}{\begin{center} \rule{3in}{0.4pt} \\ \psu@author
+%\end{center}}
+%\end{singlespace}
+%\newpage
+%\addtocounter{page}{-1}
+%}
+
+
+% Here is the title page.
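+% For bachelors degrees (bs/ba options) this produces the honors title
+% page plus the departmental/Schreyer signature pages; for graduate
+% degrees it produces the standard Graduate School title page.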
+\newcommand{\psutitlepage}{%
+\setcounter{page}{1}
+\thispagestyle{empty}%
+%%%
+\ifthenelse{\boolean{psu@bs} \or \boolean{psu@ba}}
+%%%
+{%
+\vspace*{-1in}
+%\enlargethispage{0.5in}
+\ifthenelse{\boolean{psu@schreyer}}
+{
+\begin{center}
+THE PENNSYLVANIA STATE UNIVERSITY \\ SCHREYER HONORS COLLEGE
+\end{center}
+}
+{
+\begin{center}
+THE PENNSYLVANIA STATE UNIVERSITY
+\end{center}
+}
+\vfill
+\begin{center}
+\MakeUppercase{\psu@dept}
+\end{center}
+\vfill
+\begin{center}
+\MakeUppercase{\psu@title}
+\end{center}
+\vfill
+\begin{center}
+\MakeUppercase{\psu@author} \\ \MakeUppercase{\psu@degreedate}
+\end{center}
+\vfill
+\begin{center}
+A thesis \\ submitted in partial fulfillment \\ of the requirements \\
+\psu@bachelorsdegreeinfo
+\end{center}
+\vfill
+\mbox{}
+\begin{center}
+Reviewed and approved$^{*}$ by the following:\\[4mm]
+\psu@advisor \\
+\psu@advisortitle \\
+Thesis Supervisor \\[4mm]
+\ifthenelse{\boolean{psu@twoha}}
+{
+\psu@honorsadvisor \\
+\psu@honorsadvisortitle\\
+Honors Advisor \\[4mm]
+\psu@honorsadvisortwo \\
+\psu@honorsadvisortwotitle\\
+Honors Advisor
+}
+{
+\psu@honorsadvisor \\
+\psu@honorsadvisortitle\\
+Honors Advisor
+} \\[4mm]
+\ifthenelse{\boolean{psu@esc}}
+{
+\psu@escdepthead \\
+Department Head \\
+\psu@escdeptheadtitle
+\enlargethispage{24pt}
+}
+{}
+\vfill
+\ifthenelse{\boolean{psu@schreyer}}
+{
+$^{*}$Signatures are on file in the Schreyer Honors College and Department of Engineering Science and Mechanics.
+}
+{
+$^{*}$Signatures are on file in the Department of Engineering Science and Mechanics.
+}
+\end{center}
+%
+%
+\newpage
+% Now for the bachelors signature pages.
+\ifthenelse{\boolean{psu@schreyer} \and \boolean{psu@esc}}
+{
+{\thispagestyle{empty}\large\bfseries\noindent
+This page is not included in the thesis. The page following this one is submitted to the Department of Engineering Science and Mechanics.
+}
+\newpage
+\thispagestyle{empty}
+\noindent
+We approve the thesis of \psu@author:\\[10mm]
+\mbox{}\hfill Date of Signature \hspace{5mm}\mbox{}\\[24pt]
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@advisor \\
+\psu@advisortitle \\
+Thesis Supervisor \\[42pt]
+%
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@honorsadvisor \\
+\psu@honorsadvisortitle \\
+Honors Advisor \\[42pt]
+\ifthenelse{\boolean{psu@twoha}}
+{
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@honorsadvisortwo \\
+\psu@honorsadvisortwotitle \\
+Honors Advisor \\[42pt]
+}
+{}
+\ifthenelse{\boolean{psu@esc}}{
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@escdepthead \\
+Department Head \\
+\psu@escdeptheadtitle
+}
+{}
+% Now the page that goes to the Schreyer Honors College.
+\newpage
+{\thispagestyle{empty}\large\bfseries\noindent
+This page is not included in the thesis. The page following this one is submitted to the Schreyer Honors College.
+}
+\newpage
+\thispagestyle{empty}
+\noindent
+We approve the thesis of \psu@author:\\[10mm]
+\mbox{}\hfill Date of Signature \hspace{5mm}\mbox{}\\[24pt]
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@advisor \\
+\psu@advisortitle \\
+Thesis Supervisor \\[42pt]
+%
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@honorsadvisor \\
+\psu@honorsadvisortitle \\
+Honors Advisor \\[42pt]
+\ifthenelse{\boolean{psu@twoha}}
+{
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@honorsadvisortwo \\
+\psu@honorsadvisortwotitle \\
+Honors Advisor \\[42pt]
+}
+{}
+\addtocounter{page}{-5}
+}
+{
+\newpage
+\thispagestyle{empty}
+\noindent
+We approve the thesis of \psu@author:\\[10mm]
+\mbox{}\hfill Date of Signature \hspace{5mm}\mbox{}\\[24pt]
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@advisor \\
+\psu@advisortitle \\
+Thesis Supervisor \\[42pt]
+%
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@honorsadvisor \\
+\psu@honorsadvisortitle \\
+Honors Advisor \\[42pt]
+\ifthenelse{\boolean{psu@twoha}}
+{
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@honorsadvisortwo \\
+\psu@honorsadvisortwotitle \\
+Honors Advisor \\[42pt]
+}
+{}
+\ifthenelse{\boolean{psu@esc}}{
+\rule{3.5in}{0.5pt}\hfill\rule{1.5in}{0.5pt}\\[12pt]
+\psu@escdepthead \\
+Department Head \\
+\psu@escdeptheadtitle
+}
+{}
+\addtocounter{page}{-2}
+}
+%\ifthenelse{\boolean{psu@escdepthead}}{%
+%\begin{tabbing}%
+%Approved: \= \rule{2.75in}{0.5pt} \quad Date: \= \rule{1.5in}{0.5pt} \\[-3pt]
+% \> \qquad \psu@advisor \\[-3pt]
+% \> \qquad Thesis Supervisor \\[8mm]
+%%
+%\ifthenelse{\boolean{psu@secondthesissupervisor}}{%
+% \> \rule{2.75in}{0.5pt} \> \rule{1.5in}{0.5pt} \\[-3pt]
+% \> \qquad \psu@secondthesissupervisor \\[-3pt]
+% \> \qquad Thesis Supervisor \\[8mm]
+%}{}%
+%%
+% \> \rule{2.75in}{0.5pt} \> \rule{1.5in}{0.5pt} \\[-3pt]
+% \> \qquad \psu@honorsadvisor \\[-3pt]
+% \> \qquad Honors Advisor \\[8mm]
+% %
+% \> \rule{2.75in}{0.5pt} \> \rule{1.5in}{0.5pt} \\[-3pt]
+% \> \qquad \psu@escdepthead \\[-3pt]
+% \> \qquad Department Head
+%\end{tabbing}%
+%}%
+%{%
+%\begin{tabbing}%
+%Approved: \= \rule{2.75in}{0.5pt} \quad Date: \= \rule{1.5in}{0.5pt} \\[-3pt]
+% \> \qquad \psu@advisor \\[-3pt]
+% \> \qquad Thesis Supervisor \\[8mm]
+%%
+%\ifthenelse{\boolean{psu@secondthesissupervisor}}{%
+% \> \rule{2.75in}{0.5pt} \> \rule{1.5in}{0.5pt} \\[-3pt]
+% \> \qquad \psu@secondthesissupervisor \\[-3pt]
+% \> \qquad Thesis Supervisor \\[8mm]
+%}{}%
+%%
+% \> \rule{2.75in}{0.5pt} \> \rule{1.5in}{0.5pt} \\[-3pt]
+% \> \qquad \psu@honorsadvisor \\[-3pt]
+% \> \qquad Honors Advisor
+%\end{tabbing}%
+%}%
+}%
+%%%
+{%
+\vspace*{-0.25in}
+\begin{center}
+ The Pennsylvania State University\\
+ \psu@submittedto \\
+ \psu@collegesubmittedto
+\end{center}
+\vfill
+\begin{center}
+ \setstretch{2}
+ \bfseries\uppercase\expandafter{\psu@title}
+\end{center}
+\vfill
+\begin{center}
+ A \psu@documenttype\ in\\
+ \psu@dept\\
+ by\\
+ \psu@author\\
+\end{center}
+\vfill
+\begin{center}
+ \copyright\ \psu@copyrightyear\ \psu@author
+\end{center}
+\vfill
+\begin{center}
+ Submitted in Partial Fulfillment\\
+ of the Requirements\\
+ for the Degree of
+\end{center}
+\vfill
+\begin{center}
+ \psu@degreetype
+\end{center}
+\vfill
+\begin{center}
+ \psu@degreedate
+\end{center}
+%\newpage
+%\ifthenelse{\boolean{psu@ms} \or \boolean{psu@meng} \or \boolean{psu@ma}}{\psupermissionpage}{}
+}
+%%%
+\restoregeometry
+\newpage
+}
+
+
+% Here is the committee page.
+
+\newlength{\psu@sigoptionskip}
+\newlength{\psu@sigafteroptionskip}
+\newlength{\psu@intersigspace}
+
+\newcommand{\psucommitteepage}{%
+ \ifthenelse{\psu@readers = 6}{%
+ \setlength{\psu@sigafteroptionskip}{1.0\baselineskip}
+ \setlength{\psu@intersigspace}{1.0\baselineskip}
+ }%
+ {%
+ \setlength{\psu@sigafteroptionskip}{1.5\baselineskip}
+ \setlength{\psu@intersigspace}{1.5\baselineskip}
+ }
+ \vspace*{-0.2in}
+ \noindent {\normalsize The \MakeTextLowercase{\psu@documenttype} of \psu@author\ was reviewed and approved by the following:}\\[3\baselineskip]
+\mbox{}\hfill
+\parbox{\textwidth - 0.5in}{
+ \psu@advisor\\[\psu@sigoptionskip]
+ \psu@advisortitle
+ \ifthenelse{\equal{\psu@advisoroptionone}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@advisoroptionone \\ \psu@advisoroptiontwo \\[\psu@sigafteroptionskip]}
+\ifcase \psu@readers
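+% Each \or branch below lays out the signature block for the declared
+% number of readers (set with \numberofreaders).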
+\or
+ \psu@readerone\\[\psu@sigoptionskip]
+ \psu@readeronetitle
+ \ifthenelse{\equal{\psu@readeroneoption}{}}%
+ {}%
+ {\\[\psu@sigoptionskip] \psu@readeroneoption}
+\or
+ \psu@readerone\\[\psu@sigoptionskip]
+ \psu@readeronetitle
+ \ifthenelse{\equal{\psu@readeroneoption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readeroneoption \\[\psu@sigafteroptionskip]}
+ \psu@readertwo\\[\psu@sigoptionskip]
+ \psu@readertwotitle
+ \ifthenelse{\equal{\psu@readertwooption}{}}%
+ {}%
+ {\\[\psu@sigoptionskip] \psu@readertwooption}
+\or
+ {\psu@readerone}\\[\psu@sigoptionskip]
+ \psu@readeronetitle
+ \ifthenelse{\equal{\psu@readeroneoption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readeroneoption \\[\psu@sigafteroptionskip]}
+ {\psu@readertwo}\\[\psu@sigoptionskip]
+ \psu@readertwotitle
+ \ifthenelse{\equal{\psu@readertwooption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readertwooption \\[\psu@sigafteroptionskip]}
+ {\psu@readerthree}\\[\psu@sigoptionskip]
+ \psu@readerthreetitle
+ \ifthenelse{\equal{\psu@readerthreeoption}{}}%
+ {}%
+ {\\[\psu@sigoptionskip] \psu@readerthreeoption}
+\or
+ \psu@readerone\\[\psu@sigoptionskip]
+ \psu@readeronetitle
+ \ifthenelse{\equal{\psu@readeroneoption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readeroneoption \\[\psu@sigafteroptionskip]}
+ \psu@readertwo\\[\psu@sigoptionskip]
+ \psu@readertwotitle
+ \ifthenelse{\equal{\psu@readertwooption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readertwooption \\[\psu@sigafteroptionskip]}
+ \psu@readerthree\\[\psu@sigoptionskip]
+ \psu@readerthreetitle
+ \ifthenelse{\equal{\psu@readerthreeoption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readerthreeoption \\[\psu@sigafteroptionskip]}
+ \psu@readerfour\\[\psu@sigoptionskip]
+ \psu@readerfourtitle
+ \ifthenelse{\equal{\psu@readerfouroption}{}}%
+ {}%
+ {\\[\psu@sigoptionskip] \psu@readerfouroption}
+\or
+ \psu@readerone\\[\psu@sigoptionskip]
+ \psu@readeronetitle
+ \ifthenelse{\equal{\psu@readeroneoption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readeroneoption \\[\psu@sigafteroptionskip]}
+ \psu@readertwo\\[\psu@sigoptionskip]
+ \psu@readertwotitle
+ \ifthenelse{\equal{\psu@readertwooption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readertwooption \\[\psu@sigafteroptionskip]}
+ \psu@readerthree\\[\psu@sigoptionskip]
+ \psu@readerthreetitle
+ \ifthenelse{\equal{\psu@readerthreeoption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readerthreeoption \\[\psu@sigafteroptionskip]}
+ \psu@readerfour\\[\psu@sigoptionskip]
+ \psu@readerfourtitle
+ \ifthenelse{\equal{\psu@readerfouroption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readerfouroption \\[\psu@sigafteroptionskip]}
+ \psu@readerfive\\[\psu@sigoptionskip]
+ \psu@readerfivetitle
+ \ifthenelse{\equal{\psu@readerfiveoption}{}}%
+ {}%
+ {\\[\psu@sigoptionskip] \psu@readerfiveoption}
+\or
+ \psu@readerone\\[\psu@sigoptionskip]
+ \psu@readeronetitle
+ \ifthenelse{\equal{\psu@readeroneoption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readeroneoption \\[\psu@sigafteroptionskip]}
+ \psu@readertwo\\[\psu@sigoptionskip]
+ \psu@readertwotitle
+ \ifthenelse{\equal{\psu@readertwooption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readertwooption \\[\psu@sigafteroptionskip]}
+ \psu@readerthree\\[\psu@sigoptionskip]
+ \psu@readerthreetitle
+ \ifthenelse{\equal{\psu@readerthreeoption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readerthreeoption \\[\psu@sigafteroptionskip]}
+ \psu@readerfour\\[\psu@sigoptionskip]
+ \psu@readerfourtitle
+ \ifthenelse{\equal{\psu@readerfouroption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readerfouroption \\[\psu@sigafteroptionskip]}
+ \psu@readerfive\\[\psu@sigoptionskip]
+ \psu@readerfivetitle
+ \ifthenelse{\equal{\psu@readerfiveoption}{}}%
+ {\\[\psu@intersigspace]}%
+ {\\[\psu@sigoptionskip]
+ \psu@readerfiveoption \\[\psu@sigafteroptionskip]}
+ \psu@readersix\\[\psu@sigoptionskip]
+ \psu@readersixtitle
+ \ifthenelse{\equal{\psu@readersixoption}{}}%
+ {}%
+ {\\[\psu@sigoptionskip] \psu@readersixoption}
+\fi
+}
+
+\mbox{}\vfill
+
+\noindent
+% \parbox{\textwidth}{$^{*}$Signatures are on file in the Graduate School.}
+
+\newpage
+}
+
+
+%\newcommand{\psusigpage}{%\setlength{\textheight}{8.0in}
+%{\Huge\thispagestyle{empty}
+%\sffamily\bfseries
+%\mbox{}\vfill
+%\noindent
+%The following page is the Signature Page. The Signature Page needs to be given to the Grad School, but it should not be bound with the thesis.
+%\begin{center}
+%\tiny Don't bind this page either!
+%\end{center}
+%\vfill
+%}
+%\newpage
+%\psu@signaturepage}
+
+
+\newcommand{\thesisabstract}[1]{%
+%\pagestyle{plain}
+\chapter*{Abstract}
+ \begin{singlespace}
+ \input{#1}
+ \end{singlespace}
+\newpage
+}
+
+
+\renewcommand{\contentsname}{Table of Contents}
+\setcounter{secnumdepth}{10}
+\setcounter{tocdepth}{4}
+\newcommand{\thesistableofcontents}{%
+\begin{singlespace}
+\if@twocolumn
+\@restonecoltrue\onecolumn
+\else
+\@restonecolfalse
+\fi
+\chapter*{\contentsname
+\@mkboth{%
+\MakeUppercase\contentsname}{\MakeUppercase\contentsname}}%
+\@starttoc{toc}%
+\if@restonecol\twocolumn\fi%
+\end{singlespace}
+\clearpage
+}
+
+%\renewcommand{\listfigurename}{List of Figures}
+%\newcommand{\thesislistoffigures}{%
+%\begin{singlespace}
+%\if@twocolumn
+%\@restonecoltrue\onecolumn
+%\else
+%\@restonecolfalse
+%\fi
+%%\chapter{\listfigurename
+%%\@mkboth{%
+%%\MakeUppercase\listfigurename}{\MakeUppercase\listfigurename}}%
+%\chapter{\listfigurename}
+%\@starttoc{lof}%
+%\if@restonecol\twocolumn\fi
+%\end{singlespace}
+%\clearpage
+%}
+
+\renewcommand{\listtablename}{List of Tables}
+\newcommand{\thesislistoftables}{%
+\begin{singlespace}
+\if@twocolumn
+\@restonecoltrue\onecolumn
+\else
+\@restonecolfalse
+\fi
+\chapter{\listtablename
+\@mkboth{%
+\MakeUppercase\listtablename}{\MakeUppercase\listtablename}}%
+\@starttoc{lot}%
+\if@restonecol\twocolumn\fi
+\end{singlespace}
+\clearpage
+}
+
+
+\newcommand{\thesislistofsymbols}[1]{%
+\chapter{List of Symbols}
+%\addcontentsline{toc}{chapter}{List of Symbols}
+\begin{singlespace}
+ \input{#1}
+\end{singlespace}
+}
+
+
+\newcommand{\thesisacknowledgments}[1]{%
+\chapter{Acknowledgments}
+%\addcontentsline{toc}{chapter}{Acknowledgments}
+\begin{singlespace}
+ \input{#1}
+\end{singlespace}
+}
+
+
+\newcommand{\thesisdedication}[2]{%
+\chapter*{#2}
+\begin{singlespace}
+ \input{#1}
+\end{singlespace}
+}
+
+
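+% \Appendix behaves like \chapter but, when used for the first appendix,
+% patches the table of contents so that appendix chapters are prefixed
+% with "Appendix".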
+\newcommand{\Appendix}{\@dblarg\my@Appendix}
+\def\my@Appendix[#1]#2{%
+\ifthenelse{\value{chapter} = 0}
+{
+\ifthenelse{\boolean{psu@toc}}
+{%
+\addtocontents{toc}{\protect\addtolength{\cftchapnumwidth}{-\mylength}}
+\addtocontents{toc}{\string\renewcommand{\string\cftchappresnum}{Appendix }}
+\addtocontents{toc}{\protect\addtolength{\cftchapnumwidth}{\mylonglength}}
+}%
+{%
+\addtocontents{toc}{\string\renewcommand{\string\cftchappresnum}{Appendix }}
+}}{}%
+\chapter[#1]{#2}
+}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\DeclareRobustCommand{\thesismainmatter}{%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%\setlength{\textheight}{8.5in}
+\mainmatter
+%\pagestyle{empty}
+%\renewcommand{\@oddhead}{\mbox{}\hfill\arabic{page}}
+%\let\ps@plain = \ps@empty
+}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
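+% \vita{<file>} typesets a single-spaced, unnumbered page headed "Vita"
+% followed by the author's name and the contents of <file>; per the
+% template notes it is intended only for doctoral dissertations.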
+\newcommand{\vita}[1]{%
+\begin{singlespace}
+\thispagestyle{empty}
+\begin{center}
+ \textbf{{\large Vita} \\[0.1in] \psu@author}
+\end{center}
+\input{#1}
+\end{singlespace}
+}
+
+
+%%
+%% End of file `psuthesis.cls'.
diff --git a/cls/userlib.tex b/cls/userlib.tex
new file mode 100644
index 0000000..c64d25a
--- /dev/null
+++ b/cls/userlib.tex
@@ -0,0 +1,22 @@
+\newcommand{\nop}[1]{}
+\newtheorem{definition}{Definition}
+\newtheorem{theorem}{Theorem}
+\newtheorem{observation}{Observation}
+\newtheorem{lemma}{Lemma}
+\newtheorem{remark}{Remark}[section]
+\newtheorem{corollary}{Corollary}[section]
+\newtheorem{example}{Example}
+\newtheorem{claim}{Claim}
+
+\DeclareMathOperator{\op}{op}
+\DeclareMathOperator{\args}{arg}
+\DeclareMathOperator{\cost}{cost}
+\DeclareMathOperator{\lleft}{left}
+\DeclareMathOperator{\rright}{right}
+\DeclareMathOperator{\argmax}{argmax}
+\DeclareMathOperator{\argmin}{argmin}
+
+\newcommand{\Paragraph}[1]{~\vspace*{-0.8\baselineskip}\\{\bf #1}}
+\newcommand\mathbftt[1]{\textnormal{\ttfamily\bfseries #1}}
+
+\newcommand\note[1]{\marginpar{\color{red}\tiny #1}}
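+
+% Illustrative usage of the helpers above (a sketch; these examples are
+% not part of the original file):
+%   \note{tighten this bound}              % red margin note
+%   \Paragraph{Setup.} We assume ...       % run-in bold paragraph heading
+%   $x^{*} = \argmin_{x} \cost(x)$         % upright operator names in math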
diff --git a/img/fig-bs-irs-insert.pdf b/img/fig-bs-irs-insert.pdf
new file mode 100644
index 0000000..74516ad
--- /dev/null
+++ b/img/fig-bs-irs-insert.pdf
Binary files differ
diff --git a/img/fig-bs-irs-query.pdf b/img/fig-bs-irs-query.pdf
new file mode 100644
index 0000000..798c615
--- /dev/null
+++ b/img/fig-bs-irs-query.pdf
Binary files differ
diff --git a/img/fig-bs-irs-space.pdf b/img/fig-bs-irs-space.pdf
new file mode 100644
index 0000000..d749d36
--- /dev/null
+++ b/img/fig-bs-irs-space.pdf
Binary files differ
diff --git a/img/fig-bs-knn-insert.pdf b/img/fig-bs-knn-insert.pdf
new file mode 100644
index 0000000..099471f
--- /dev/null
+++ b/img/fig-bs-knn-insert.pdf
Binary files differ
diff --git a/img/fig-bs-knn-query.pdf b/img/fig-bs-knn-query.pdf
new file mode 100644
index 0000000..ef80fd4
--- /dev/null
+++ b/img/fig-bs-knn-query.pdf
Binary files differ
diff --git a/img/fig-bs-knn.pdf b/img/fig-bs-knn.pdf
new file mode 100644
index 0000000..f7ca435
--- /dev/null
+++ b/img/fig-bs-knn.pdf
Binary files differ
diff --git a/img/fig-bs-rq-insert.pdf b/img/fig-bs-rq-insert.pdf
new file mode 100644
index 0000000..ebef47f
--- /dev/null
+++ b/img/fig-bs-rq-insert.pdf
Binary files differ
diff --git a/img/fig-bs-rq-query.pdf b/img/fig-bs-rq-query.pdf
new file mode 100644
index 0000000..4dfaf34
--- /dev/null
+++ b/img/fig-bs-rq-query.pdf
Binary files differ
diff --git a/img/fig-bs-rq-space.pdf b/img/fig-bs-rq-space.pdf
new file mode 100644
index 0000000..f239129
--- /dev/null
+++ b/img/fig-bs-rq-space.pdf
Binary files differ
diff --git a/img/fig-bs-wss-insert.pdf b/img/fig-bs-wss-insert.pdf
new file mode 100644
index 0000000..a8b2d6b
--- /dev/null
+++ b/img/fig-bs-wss-insert.pdf
Binary files differ
diff --git a/img/fig-bs-wss-sample.pdf b/img/fig-bs-wss-sample.pdf
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/img/fig-bs-wss-sample.pdf
diff --git a/img/leveling.pdf b/img/leveling.pdf
new file mode 100644
index 0000000..1918b9b
--- /dev/null
+++ b/img/leveling.pdf
Binary files differ
diff --git a/img/sigmod23/alias.pdf b/img/sigmod23/alias.pdf
new file mode 100644
index 0000000..19c11fa
--- /dev/null
+++ b/img/sigmod23/alias.pdf
Binary files differ
diff --git a/img/sigmod23/delete-tagging.pdf b/img/sigmod23/delete-tagging.pdf
new file mode 100644
index 0000000..c11f289
--- /dev/null
+++ b/img/sigmod23/delete-tagging.pdf
Binary files differ
diff --git a/img/sigmod23/delete-tombstone.pdf b/img/sigmod23/delete-tombstone.pdf
new file mode 100644
index 0000000..fa730a4
--- /dev/null
+++ b/img/sigmod23/delete-tombstone.pdf
Binary files differ
diff --git a/img/sigmod23/merge-leveling.pdf b/img/sigmod23/merge-leveling.pdf
new file mode 100644
index 0000000..554904c
--- /dev/null
+++ b/img/sigmod23/merge-leveling.pdf
Binary files differ
diff --git a/img/sigmod23/merge-tiering.pdf b/img/sigmod23/merge-tiering.pdf
new file mode 100644
index 0000000..1f3a1c1
--- /dev/null
+++ b/img/sigmod23/merge-tiering.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-ext-insert.pdf b/img/sigmod23/plot/fig-bs-ext-insert.pdf
new file mode 100644
index 0000000..3a4d17a
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-ext-insert.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-ext-sample.pdf b/img/sigmod23/plot/fig-bs-ext-sample.pdf
new file mode 100644
index 0000000..6dc0131
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-ext-sample.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-irs-insert.pdf b/img/sigmod23/plot/fig-bs-irs-insert.pdf
new file mode 100644
index 0000000..5004d3b
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-irs-insert.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-irs-sample.pdf b/img/sigmod23/plot/fig-bs-irs-sample.pdf
new file mode 100644
index 0000000..ab862d3
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-irs-sample.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-irs-samplesize.pdf b/img/sigmod23/plot/fig-bs-irs-samplesize.pdf
new file mode 100644
index 0000000..50ec556
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-irs-samplesize.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-irs-selectivity.pdf b/img/sigmod23/plot/fig-bs-irs-selectivity.pdf
new file mode 100644
index 0000000..9888566
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-irs-selectivity.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-wirs-insert.pdf b/img/sigmod23/plot/fig-bs-wirs-insert.pdf
new file mode 100644
index 0000000..4fcfb85
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-wirs-insert.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-wirs-sample.pdf b/img/sigmod23/plot/fig-bs-wirs-sample.pdf
new file mode 100644
index 0000000..306f8fb
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-wirs-sample.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-wss-insert.pdf b/img/sigmod23/plot/fig-bs-wss-insert.pdf
new file mode 100644
index 0000000..2f97e27
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-wss-insert.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-bs-wss-sample.pdf b/img/sigmod23/plot/fig-bs-wss-sample.pdf
new file mode 100644
index 0000000..b76eb0e
--- /dev/null
+++ b/img/sigmod23/plot/fig-bs-wss-sample.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-cc-irs-scale.pdf b/img/sigmod23/plot/fig-cc-irs-scale.pdf
new file mode 100644
index 0000000..d215854
--- /dev/null
+++ b/img/sigmod23/plot/fig-cc-irs-scale.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-cc-irs-thread.pdf b/img/sigmod23/plot/fig-cc-irs-thread.pdf
new file mode 100644
index 0000000..8f1966b
--- /dev/null
+++ b/img/sigmod23/plot/fig-cc-irs-thread.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-cl-buffsweep.pdf b/img/sigmod23/plot/fig-cl-buffsweep.pdf
new file mode 100644
index 0000000..a8dd883
--- /dev/null
+++ b/img/sigmod23/plot/fig-cl-buffsweep.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-ps-wss-bloom-memory.pdf b/img/sigmod23/plot/fig-ps-wss-bloom-memory.pdf
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-bloom-memory.pdf
diff --git a/img/sigmod23/plot/fig-ps-wss-bloom-sample.pdf b/img/sigmod23/plot/fig-ps-wss-bloom-sample.pdf
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-bloom-sample.pdf
diff --git a/img/sigmod23/plot/fig-ps-wss-bloom.pdf b/img/sigmod23/plot/fig-ps-wss-bloom.pdf
new file mode 100644
index 0000000..2e431c9
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-bloom.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-ps-wss-mt-insert.pdf b/img/sigmod23/plot/fig-ps-wss-mt-insert.pdf
new file mode 100644
index 0000000..2c8f086
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-mt-insert.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-ps-wss-mt-sample.pdf b/img/sigmod23/plot/fig-ps-wss-mt-sample.pdf
new file mode 100644
index 0000000..ec6d0d7
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-mt-sample.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-ps-wss-samplesize.pdf b/img/sigmod23/plot/fig-ps-wss-samplesize.pdf
new file mode 100644
index 0000000..429f772
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-samplesize.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-ps-wss-sf-insert.pdf b/img/sigmod23/plot/fig-ps-wss-sf-insert.pdf
new file mode 100644
index 0000000..e4cb0ac
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-sf-insert.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-ps-wss-sf-sample.pdf b/img/sigmod23/plot/fig-ps-wss-sf-sample.pdf
new file mode 100644
index 0000000..13a4397
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-sf-sample.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-ps-wss-tp-insert.pdf b/img/sigmod23/plot/fig-ps-wss-tp-insert.pdf
new file mode 100644
index 0000000..6e215dd
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-tp-insert.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-ps-wss-tp-sample.pdf b/img/sigmod23/plot/fig-ps-wss-tp-sample.pdf
new file mode 100644
index 0000000..f1c70ce
--- /dev/null
+++ b/img/sigmod23/plot/fig-ps-wss-tp-sample.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-sc-irs-delete.pdf b/img/sigmod23/plot/fig-sc-irs-delete.pdf
new file mode 100644
index 0000000..fb69b23
--- /dev/null
+++ b/img/sigmod23/plot/fig-sc-irs-delete.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-sc-irs-insert.pdf b/img/sigmod23/plot/fig-sc-irs-insert.pdf
new file mode 100644
index 0000000..823abb6
--- /dev/null
+++ b/img/sigmod23/plot/fig-sc-irs-insert.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-sc-irs-sample.pdf b/img/sigmod23/plot/fig-sc-irs-sample.pdf
new file mode 100644
index 0000000..a01598f
--- /dev/null
+++ b/img/sigmod23/plot/fig-sc-irs-sample.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-sc-wss-insert.pdf b/img/sigmod23/plot/fig-sc-wss-insert.pdf
new file mode 100644
index 0000000..14c6834
--- /dev/null
+++ b/img/sigmod23/plot/fig-sc-wss-insert.pdf
Binary files differ
diff --git a/img/sigmod23/plot/fig-sc-wss-sample.pdf b/img/sigmod23/plot/fig-sc-wss-sample.pdf
new file mode 100644
index 0000000..62859e8
--- /dev/null
+++ b/img/sigmod23/plot/fig-sc-wss-sample.pdf
Binary files differ
diff --git a/img/sigmod23/sampling.pdf b/img/sigmod23/sampling.pdf
new file mode 100644
index 0000000..d66b149
--- /dev/null
+++ b/img/sigmod23/sampling.pdf
Binary files differ
diff --git a/img/tiering.pdf b/img/tiering.pdf
new file mode 100644
index 0000000..a60458b
--- /dev/null
+++ b/img/tiering.pdf
Binary files differ
diff --git a/paper.tex b/paper.tex
new file mode 100644
index 0000000..192377a
--- /dev/null
+++ b/paper.tex
@@ -0,0 +1,437 @@
+% !TEX TS-program = pdflatex
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+% filename = "DouglasRumbaugh-Dissertation.tex",
+% version = "1.0.0",
+% date = "2025/02/18",
+% authors = "Douglas B. Rumbaugh",
+% copyright = "Douglas B. Rumbaugh",
+% address = "Computer Science and Engineering
+% W343 Westgate Building
+% Penn State University,
+% University Park, PA 16802,
+% USA",
+% telephone = "717-275-4939",
+% email = "drumbaugh@psu.edu"
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Change History:
+% The change history can be found in the accompanying document
+% entitled "YourName-Dissertation Template Change History.md".
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+% This is a template file to help get you started using the
+% psuthesis.cls for theses and dissertations at Penn State
+% University. You will, of course, need to put the
+% psuthesis.cls file someplace that LaTeX will find it.
+%
+% I have set up a directory structure that I find to be clean
+% and convenient. You can readjust it to suit your tastes. In
+% fact, the structure used by our students is even a little
+% more involved and commands are defined to point to the
+% various directories.
+%
+% This document has been set up to be typeset using pdflatex.
+% About the only thing you will need to change if typesetting
+% using latex is the \DeclareGraphicsExtensions command.
+%
+% The psuthesis document class uses the same options as the
+% book class. In addition, it requires that you have the
+% ifthen, calc, setspace, and tocloft packages.
+%
+% The first additional option specifies the degree type. You
+% can choose from:
+% Ph.D. using class option <phd>
+% M.S. using class option <ms>
+% M.Eng. using class option <meng>
+% M.A. using class option <ma>
+% B.S. using class option <bs>
+% B.A. using class option <ba>
+% Honors from the Schreyer Honors College <schreyer>
+%
+% The second additional option inlinechaptertoc determines
+% the formatting of the Chapter entries in the Table of
+% Contents. The default sets them as two-line entries (try it).
+% If you want them as one-line entries, issue the
+% inlinechaptertoc option.
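+% For example (illustrative only; combine with the options that match
+% your degree):
+% \documentclass[phd,inlinechaptertoc,12pt]{psuthesis}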
+%
+% The class option schreyer should be used for theses
+% submitted to the Schreyer Honors College.
+%
+% The class option esc should be used by all Engineering Science
+% students.
+%
+% The class option twoha should be used if you are earning
+% interdisciplinary honors and thus have two honors advisors.
+%
+% The class option ``secondthesissupervisor'' should be used
+% for baccalaureate honors degrees if you have a second
+% Thesis Supervisor.
+%
+% The vita is only included with the phd option and it is
+% placed at the end of the thesis. The permissions page is only
+% included with the ms, meng, and ma options.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Only one of the following lines should be used at a time.
+% Doctoral students.
+\documentclass[phd,12pt]{psuthesis}
+% Masters students
+%\documentclass[ms,12pt]{psuthesis}
+% Bachelors students in the Schreyer Honors College.
+% \documentclass[bs,schreyer,12pt]{psuthesis}
+% Bachelors students in the Schreyer Honors College & Engineering Science.
+%\documentclass[bs,schreyer,esc,twoha,12pt]{psuthesis}
+% Bachelors students in Engineering Science.
+%\documentclass[bs,esc,12pt]{psuthesis}
+
+\usepackage[T1]{fontenc}
+\usepackage[final]{graphicx}
+\usepackage[linesnumbered,ruled,noend]{algorithm2e}
+\usepackage[mathscr]{eucal}
+\usepackage[nosepfour,warning,np,debug,autolanguage]{numprint}
+\usepackage{acro}
+\usepackage{algpseudocode}
+\usepackage{amsmath,amsfonts}
+\usepackage{amssymb}
+\usepackage{mathrsfs}
+\usepackage{amsthm}
+\usepackage{balance}
+\usepackage{booktabs}
+\usepackage{caption}
+\usepackage{cite}
+\usepackage{epsfig,tabularx,subfig,multirow}
+\usepackage{eqlist}
+\usepackage{lipsum}
+\usepackage{listings}
+\usepackage{lmodern}
+\usepackage{mathtools}
+\usepackage{microtype}
+\usepackage{nicefrac}
+\usepackage{setspace}
+\usepackage{textcomp}
+\usepackage{threeparttable}
+\usepackage{tikz}
+\usepackage{titlesec}
+\usepackage{url}
+\usepackage{wrapfig}
+\usepackage{xcolor}
+\usepackage[numbers]{natbib}
+\usepackage{mathtools}
+\usepackage{oul}
+
+\setstretch{1.24}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% SPECIAL SYMBOLS AND NEW COMMANDS %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\input{cls/userlib}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Renewed Float Parameters %
+% (Makes floats fit better on the page) %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\renewcommand{\floatpagefraction}{0.85}
+\renewcommand{\topfraction} {0.85}
+\renewcommand{\bottomfraction} {0.85}
+\renewcommand{\textfraction} {0.15}
+
+% ----------------------------------------------------------- %
+
+%%%%%%%%%%%%%%%%
+% FRONT-MATTER %
+%%%%%%%%%%%%%%%%
+% Title
+\title{Automated Dynamization of Static Data Structures}
+
+% Author and Department
+\author{Douglas B. Rumbaugh}
+\dept{Computer Science and Engineering}
+% the degree will be conferred on this date
+\degreedate{December 2025}
+% year of your copyright
+\copyrightyear{2025}
+
+% This command is used for students submitting a thesis to the
+% Schreyer Honors College and for students in Engineering Science.
+% The argument of this command should contain everything after the word
+% ``requirements'' that appears on the title page. This provides the
+% needed flexibility for all the degree types.
+\bachelorsdegreeinfo{for a baccalaureate degree \\ in Engineering Science \\ with honors in Engineering Science}
+
+% This is the document type. For example, this could also be:
+% Comprehensive Document
+% Thesis Proposal
+% \documenttype{Thesis}
+%\documenttype{Dissertation}
+\documenttype{Dissertation}
+
+
+% This will generally be The Graduate School, though you can
+% put anything in here to suit your needs.
+\submittedto{The Graduate School}
+
+% This is the college to which you are submitting the
+% thesis/dissertation.
+\collegesubmittedto{The College of Engineering}
+
+
+%%%%%%%%%%%%%%%%%%
+% Signatory Page %
+%%%%%%%%%%%%%%%%%%
+% You can have up to 7 committee members, i.e., one advisor
+% and up to 6 readers.
+%
+% Begin by specifying the number of readers.
+\numberofreaders{4}
+
+% For baccalaureate honors degrees, enter the name of your
+% honors advisor below.
+\honorsadvisor{Honors P. Advisor}
+{Associate Professor of Engineering Science and Mechanics}
+\honorsadvisortwo{Honors P. Advisor, Jr.}
+{Professor of Engineering Science and Mechanics}
+
+% For baccalaureate honors degrees, if you have a second
+% Thesis Supervisor, enter his or her name below.
+\secondthesissupervisor{Second T. Supervisor}
+
+% For baccalaureate honors degrees, certain departments
+% (e.g., Engineering Science and Mechanics) require the
+% signature of the department head. In that case, enter the
+% name and title of your department head below.
+\escdepthead{Department Q. Head}
+\escdeptheadtitle{P. B. Breneman Chair and Professor
+of Engineering Science and Mechanics
+}
+
+% Input reader information below. The optional argument, which
+% comes first, goes on the second line before the name.
+\advisor[Dissertation Advisor][Chair of Committee]
+ {Dong Xie}
+ {Assistant Professor of Computer Science and Engineering}
+
+\readerone[]
+ {Ruslan Nikolaev}
+ {Assistant Professor of Computer Science and Engineering}
+
+\readertwo[]
+ {Young Ko}
+ {Assistant Professor of Computer Science and Engineering}
+
+\readerthree[]
+ {Dongwong Lee}
+ {Professor of Information Sciences and Technology}
+
+\readerfour[]{}{}
+
+% Format the Chapter headings using the titlesec package.
+% You can format section headings and the like here too.
+\definecolor{gray75}{gray}{0.75}
+\newcommand{\hsp}{\hspace{15pt}}
+\titleformat{\chapter}[display]{\fontsize{30}{30}\selectfont\bfseries\sffamily}{Chapter \thechapter\hsp\textcolor{gray75}{\raisebox{3pt}{|}}}{0pt}{}{}
+
+\titleformat{\section}[block]{\Large\bfseries\sffamily}{\thesection}{12pt}{}{}
+\titleformat{\subsection}[block]{\large\bfseries\sffamily}{\thesubsection}{12pt}{}{}
+
+
+% Makes use of LaTeX's include facility. Add as many chapters
+% and appendices as you like.
+%\includeonly{%
+%chapters/chapter1,%
+%Chapter-2/Chapter-2,%
+%Chapter-3/Chapter-3,%
+%Chapter-4/Chapter-4,%
+%Chapter-5/Chapter-5,%
+%Chapter-6/Chapter-6,%
+%Chapter-7/Chapter-7,%
+%Chapter-8/Chapter-8,%
+%Chapter-9/Chapter-9,%
+% Appendix-A/Appendix-A,%
+% Appendix-B/Appendix-B,%
+% Appendix-C/Appendix-C,%
+% Appendix-D/Appendix-D,%
+% Appendix-E/Appendix-E%
+%}
+
+%%%%%%%%%%%%%%%%%
+% THE BEGINNING %
+%%%%%%%%%%%%%%%%%
+\begin{document}
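+% Page style: clear all headers, suppress the header/footer rules, and
+% center the page number in the footer.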
+\pagestyle{fancy}
+\fancyhead[L,C,R]{}
+\fancyfoot[L,R]{}
+\fancyfoot[C]{\thepage}
+\renewcommand{\headrulewidth}{0pt}
+\renewcommand{\footrulewidth}{0pt}
+%%%%%%%%%%%%%%%%%%%%%%%%
+% Preliminary Material %
+%%%%%%%%%%%%%%%%%%%%%%%%
+% This command is needed to properly set up the frontmatter.
+\frontmatter
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% IMPORTANT
+%
+% The following commands allow you to include all the
+% frontmatter in your thesis. If you don't need one or more of
+% these items, you can comment it out. Most of these items are
+% actually required by the Grad School -- see the Thesis Guide
+% for details regarding what is and what is not required for
+% your particular degree.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% !!! DO NOT CHANGE THE SEQUENCE OF THESE ITEMS !!!
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+% Generates the title page based on info you have provided
+% above.
+\psutitlepage
+
+% Generates the committee page -- this is bound with your
+% thesis. If this is an baccalaureate honors thesis, then
+% comment out this line.
+\psucommitteepage
+
+% Generates the abstract. The argument should point to the
+% file containing your abstract.
+\thesisabstract{chapters/abstract}
+
+% Generates the Table of Contents
+\thesistableofcontents
+
+% Generates the List of Figures
+\begin{singlespace}
+\renewcommand{\listfigurename}{\sffamily\Huge List of Figures}
+\setlength{\cftparskip}{\baselineskip}
+\addcontentsline{toc}{chapter}{List of Figures}
+%\fancypagestyle{plain}{%
+%\fancyhf{} % clear all header and footer fields
+%\fancyfoot[C]{\thepage}} % except the center
+\listoffigures
+\end{singlespace}
+\clearpage
+
+% Generates the List of Tables
+\begin{singlespace}
+\renewcommand{\listtablename}{\sffamily\Huge List of Tables}
+\setlength{\cftparskip}{\baselineskip}
+\addcontentsline{toc}{chapter}{List of Tables}
+\listoftables
+\end{singlespace}
+\clearpage
+
+% Generates the List of Symbols. The argument should point to
+% the file containing your List of Symbols.
+% \thesislistofsymbols{SupplementaryMaterial/ListOfSymbols}
+% Generally not needed in CSE
+
+% Generates the Acknowledgments. The argument should point to
+% the file containing your Acknowledgments.
+% \thesisacknowledgments{chapters/acknowledgments}
+
+% Generates the Epigraph/Dedication. The first argument should
+% point to the file containing your Epigraph/Dedication and
+% the second argument should be the title of this page.
+%\thesisdedication{SupplementaryMaterial/Dedication}{Dedication}
+
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% This command is needed to get the main part of the %
+% document going. %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\thesismainmatter
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% This is an AMS-LaTeX command to allow breaking %
+% of displayed equations across pages. Note the %
+% closing the "}" just before the bibliography. %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\allowdisplaybreaks{
+%\pagestyle{fancy}
+%\fancyhead{}
+%
+%%%%%%%%%%%%%%%%%%%%%%
+% THE ACTUAL CONTENT %
+%%%%%%%%%%%%%%%%%%%%%%
+% Chapters
+\input{chapters/introduction}
+\input{chapters/background}
+\input{chapters/dynamic-extension-sampling}
+\input{chapters/beyond-dsp}
+\input{chapters/future-work}
+\input{chapters/conclusion}
+%\include{Chapter-2/Chapter-2}
+%\include{Chapter-3/Chapter-3}
+%\include{Chapter-4/Chapter-4}
+%\include{Chapter-5/Chapter-5}
+%\include{Chapter-6/Chapter-6}
+%\include{Chapter-7/Chapter-7}
+%\include{Chapter-8/Chapter-8}
+%\include{Chapter-9/Chapter-9}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Appendices
+%
+% Because of a quirk in LaTeX (see p. 48 of The LaTeX
+% Companion, 2e), you cannot use \include along with
+% \addtocontents if you want things to appear in the proper
+% sequence.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\appendix
+\titleformat{\chapter}[display]{\fontsize{30}{30}\selectfont\bfseries\sffamily}{Appendix \thechapter\textcolor{gray75}{\raisebox{3pt}{|}}}{0pt}{}{}
+% If you have a single appendix, then to prevent LaTeX from
+% calling it ``Appendix A'', you should uncomment the following two
+% lines that redefine the \thechapter and \thesection:
+%\renewcommand\thechapter{}
+%\renewcommand\thesection{\arabic{section}}
+% \include{Appendix-A/Appendix-A}
+% \include{Appendix-B/Appendix-B}
+% \include{Appendix-C/Appendix-C}
+% \include{Appendix-D/Appendix-D}
+% \include{Appendix-E/Appendix-E}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% ESM students need to include a Nontechnical Abstract as the %
+% last appendix. %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% This \include command should point to the file containing
+% that abstract.
+%\include{nontechnical-abstract}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+} % End of the \allowdisplaybreak command %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%
+% BIBLIOGRAPHY %
+%%%%%%%%%%%%%%%%
+% You can use BibTeX or another bibliography facility for your
+% bibliography. LaTeX's standard thebibliography environment is shown,
+% commented out, further below. If you use BibTeX, this section should
+% look something like:
+\begin{singlespace}
+ \nocite{*}
+ \bibliographystyle{ACM-Reference-Format}
+ \addcontentsline{toc}{chapter}{Bibliography}
+ \bibliography{references/references}
+\end{singlespace}
+
+%\begin{singlespace}
+%\begin{thebibliography}{99}
+%\addcontentsline{toc}{chapter}{Bibliography}
+%\frenchspacing
+
+%\bibitem{Wisdom87} J. Wisdom, ``Rotational Dynamics of Irregularly Shaped Natural Satellites,'' \emph{The Astronomical Journal}, Vol.~94, No.~5, 1987 pp. 1350--1360.
+
+%\bibitem{G&H83} J. Guckenheimer and P. Holmes, \emph{Nonlinear Oscillations, Dynamical Systems, and Bifurcations of Vector Fields}, Springer-Verlag, New York, 1983.
+
+%\end{thebibliography}
+%\end{singlespace}
+
+\backmatter
+
+% Vita
+%\vita{chapters/vita}
+
+\end{document}
+
+
diff --git a/references/references.bib b/references/references.bib
new file mode 100644
index 0000000..b29e8ec
--- /dev/null
+++ b/references/references.bib
@@ -0,0 +1,1419 @@
+% Encoding: UTF-8
+
+@article{walker74,
+ author = {A.J. Walker},
+ title = {New fast method for generating discrete random numbers with arbitrary frequency distributions},
+ journal = {Electronics Letters},
+ year = {1974},
+ volume = {10},
+ pages = {127-128(1)},
+ issue = {8}
+}
+
+@article{oneil96,
+ author = {Patrick E. O'Neil and
+ Edward Cheng and
+ Dieter Gawlick and
+ Elizabeth J. O'Neil},
+ title = {The Log-Structured Merge-Tree (LSM-Tree)},
+ journal = {Acta Informatica},
+ volume = {33},
+ number = {4},
+ pages = {351--385},
+ year = {1996},
+ url = {https://doi.org/10.1007/s002360050048},
+ doi = {10.1007/s002360050048},
+ timestamp = {Sun, 21 Jun 2020 17:38:20 +0200},
+ biburl = {https://dblp.org/rec/journals/acta/ONeilCGO96.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{xie21,
+ author = {Dong Xie and
+ Jeff M. Phillips and
+ Michael Matheny and
+ Feifei Li},
+ editor = {Guoliang Li and
+ Zhanhuai Li and
+ Stratos Idreos and
+ Divesh Srivastava},
+ title = {Spatial Independent Range Sampling},
+ booktitle = {{SIGMOD} '21: International Conference on Management of Data, Virtual
+ Event, China, June 20-25, 2021},
+ pages = {2023--2035},
+ publisher = {{ACM}},
+ year = {2021},
+ url = {https://doi.org/10.1145/3448016.3452806},
+ doi = {10.1145/3448016.3452806},
+ timestamp = {Mon, 21 Jun 2021 11:48:44 +0200},
+ biburl = {https://dblp.org/rec/conf/sigmod/0001PM021.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{bloom70,
+ author = {Burton H. Bloom},
+ title = {Space/Time Trade-offs in Hash Coding with Allowable Errors},
+ journal = {Commun. {ACM}},
+ volume = {13},
+ number = {7},
+ pages = {422--426},
+ year = {1970},
+ url = {https://doi.org/10.1145/362686.362692},
+ doi = {10.1145/362686.362692},
+ timestamp = {Wed, 14 Nov 2018 10:22:32 +0100},
+ biburl = {https://dblp.org/rec/journals/cacm/Bloom70.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+@inproceedings{dayan17,
+ author = {Niv Dayan and
+ Manos Athanassoulis and
+ Stratos Idreos},
+ editor = {Semih Salihoglu and
+ Wenchao Zhou and
+ Rada Chirkova and
+ Jun Yang and
+ Dan Suciu},
+ title = {Monkey: Optimal Navigable Key-Value Store},
+ booktitle = {Proceedings of the 2017 {ACM} International Conference on Management
+ of Data, {SIGMOD} Conference 2017, Chicago, IL, USA, May 14-19, 2017},
+ pages = {79--94},
+ publisher = {{ACM}},
+ year = {2017},
+ url = {https://doi.org/10.1145/3035918.3064054},
+ doi = {10.1145/3035918.3064054},
+ timestamp = {Thu, 14 Oct 2021 10:11:38 +0200},
+ biburl = {https://dblp.org/rec/conf/sigmod/DayanAI17.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{dayan18,
+ author = {Niv Dayan and
+ Stratos Idreos},
+ editor = {Gautam Das and
+ Christopher M. Jermaine and
+ Philip A. Bernstein},
+ title = {Dostoevsky: Better Space-Time Trade-Offs for LSM-Tree Based Key-Value
+ Stores via Adaptive Removal of Superfluous Merging},
+ booktitle = {Proceedings of the 2018 International Conference on Management of
+ Data, {SIGMOD} Conference 2018, Houston, TX, USA, June 10-15, 2018},
+ pages = {505--520},
+ publisher = {{ACM}},
+ year = {2018},
+ url = {https://doi.org/10.1145/3183713.3196927},
+ doi = {10.1145/3183713.3196927},
+ timestamp = {Wed, 21 Nov 2018 12:44:08 +0100},
+ biburl = {https://dblp.org/rec/conf/sigmod/DayanI18.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{dayan22,
+ author = {Niv Dayan and
+ Tamar Weiss and
+ Shmuel Dashevsky and
+ Michael Pan and
+ Edward Bortnikov and
+ Moshe Twitto},
+ title = {Spooky: Granulating LSM-Tree Compactions Correctly},
+ journal = {Proc. {VLDB} Endow.},
+ volume = {15},
+ number = {11},
+ pages = {3071--3084},
+ year = {2022},
+ url = {https://www.vldb.org/pvldb/vol15/p3071-dayan.pdf},
+ timestamp = {Mon, 26 Sep 2022 17:09:16 +0200},
+ biburl = {https://dblp.org/rec/journals/pvldb/DayanWDPBT22.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{zhao22,
+ author = {Zhuoyue Zhao and
+ Dong Xie and
+ Feifei Li},
+ title = {AB-tree: Index for Concurrent Random Sampling and Updates},
+ journal = {Proc. {VLDB} Endow.},
+ volume = {15},
+ number = {9},
+ pages = {1835--1847},
+ year = {2022},
+ url = {https://www.vldb.org/pvldb/vol15/p1835-zhao.pdf},
+ timestamp = {Tue, 26 Jul 2022 17:09:52 +0200},
+ biburl = {https://dblp.org/rec/journals/pvldb/ZhaoXL22.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{balmau19,
+ author = {Oana Balmau and
+ Florin Dinu and
+ Willy Zwaenepoel and
+ Karan Gupta and
+ Ravishankar Chandhiramoorthi and
+ Diego Didona},
+ editor = {Dahlia Malkhi and
+ Dan Tsafrir},
+ title = {{SILK:} Preventing Latency Spikes in Log-Structured Merge Key-Value
+ Stores},
+ booktitle = {2019 {USENIX} Annual Technical Conference, {USENIX} {ATC} 2019, Renton,
+ WA, USA, July 10-12, 2019},
+ pages = {753--766},
+ publisher = {{USENIX} Association},
+ year = {2019},
+ url = {https://www.usenix.org/conference/atc19/presentation/balmau},
+ timestamp = {Mon, 01 Feb 2021 17:03:06 +0100},
+ biburl = {https://dblp.org/rec/conf/usenix/BalmauDZGCD19.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{afshani17,
+ author = {Peyman Afshani and
+ Zhewei Wei},
+ editor = {Kirk Pruhs and
+ Christian Sohler},
+ title = {Independent Range Sampling, Revisited},
+ booktitle = {25th Annual European Symposium on Algorithms, {ESA} 2017, September
+ 4-6, 2017, Vienna, Austria},
+ series = {LIPIcs},
+ volume = {87},
+ pages = {3:1--3:14},
+ publisher = {Schloss Dagstuhl - Leibniz-Zentrum f{\"{u}}r Informatik},
+ year = {2017},
+ url = {https://doi.org/10.4230/LIPIcs.ESA.2017.3},
+ doi = {10.4230/LIPIcs.ESA.2017.3},
+ timestamp = {Tue, 11 Feb 2020 15:52:14 +0100},
+ biburl = {https://dblp.org/rec/conf/esa/AfshaniW17.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{tao22,
+ author = {Yufei Tao},
+ editor = {Leonid Libkin and
+ Pablo Barcel{\'{o}}},
+ title = {Algorithmic Techniques for Independent Query Sampling},
+ booktitle = {{PODS} '22: International Conference on Management of Data, Philadelphia,
+ PA, USA, June 12 - 17, 2022},
+ pages = {129--138},
+ publisher = {{ACM}},
+ year = {2022},
+ url = {https://doi.org/10.1145/3517804.3526068},
+ doi = {10.1145/3517804.3526068},
+ timestamp = {Wed, 07 Dec 2022 23:12:46 +0100},
+ biburl = {https://dblp.org/rec/conf/pods/Tao22.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{olken89,
+ author = {Frank Olken and
+ Doron Rotem},
+ editor = {Peter M. G. Apers and
+ Gio Wiederhold},
+ title = {Random Sampling from {B+} Trees},
+ booktitle = {Proceedings of the Fifteenth International Conference on Very Large
+ Data Bases, August 22-25, 1989, Amsterdam, The Netherlands},
+ pages = {269--277},
+ publisher = {Morgan Kaufmann},
+ year = {1989},
+ url = {http://www.vldb.org/conf/1989/P269.PDF},
+ timestamp = {Wed, 29 Mar 2017 16:45:23 +0200},
+ biburl = {https://dblp.org/rec/conf/vldb/OlkenR89.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{allendorf23,
+ author = {Daniel Allendorf},
+ title = {A Simple Data Structure for Maintaining a Discrete Probability Distribution},
+ journal = {CoRR},
+ volume = {abs/2302.05682},
+ year = {2023},
+ url = {https://doi.org/10.48550/arXiv.2302.05682},
+ doi = {10.48550/arXiv.2302.05682},
+ eprinttype = {arXiv},
+ eprint = {2302.05682},
+ timestamp = {Sun, 19 Feb 2023 18:44:53 +0100},
+ biburl = {https://dblp.org/rec/journals/corr/abs-2302-05682.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{hagerup93,
+ author = {Torben Hagerup and
+ Kurt Mehlhorn and
+ J. Ian Munro},
+ editor = {Andrzej Lingas and
+ Rolf G. Karlsson and
+ Svante Carlsson},
+ title = {Maintaining Discrete Probability Distributions Optimally},
+  booktitle    = {Automata, Languages and Programming, 20th International Colloquium,
+ ICALP93, Lund, Sweden, July 5-9, 1993, Proceedings},
+ series = {Lecture Notes in Computer Science},
+ volume = {700},
+ pages = {253--264},
+ publisher = {Springer},
+ year = {1993},
+ url = {https://doi.org/10.1007/3-540-56939-1\_77},
+ doi = {10.1007/3-540-56939-1\_77},
+ timestamp = {Tue, 14 May 2019 10:00:44 +0200},
+ biburl = {https://dblp.org/rec/conf/icalp/HagerupMM93.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{matias03,
+ author = {Yossi Matias and
+ Jeffrey Scott Vitter and
+ Wen{-}Chun Ni},
+ title = {Dynamic Generation of Discrete Random Variates},
+ journal = {Theory Comput. Syst.},
+ volume = {36},
+ number = {4},
+ pages = {329--358},
+ year = {2003},
+ url = {https://doi.org/10.1007/s00224-003-1078-6},
+ doi = {10.1007/s00224-003-1078-6},
+ timestamp = {Tue, 21 Mar 2023 21:14:25 +0100},
+ biburl = {https://dblp.org/rec/journals/mst/MatiasVN03.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{vose91,
+ author = {Michael D. Vose},
+ title = {A Linear Algorithm For Generating Random Numbers With a Given Distribution},
+ journal = {{IEEE} Trans. Software Eng.},
+ volume = {17},
+ number = {9},
+ pages = {972--975},
+ year = {1991},
+ url = {https://doi.org/10.1109/32.92917},
+ doi = {10.1109/32.92917},
+ timestamp = {Wed, 17 May 2017 10:56:35 +0200},
+ biburl = {https://dblp.org/rec/journals/tse/Vose91.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{dong21,
+ author = {Siying Dong and
+ Andrew Kryczka and
+ Yanqin Jin and
+ Michael Stumm},
+ title = {RocksDB: Evolution of Development Priorities in a Key-value Store
+ Serving Large-scale Applications},
+ journal = {{ACM} Trans. Storage},
+ volume = {17},
+ number = {4},
+ pages = {26:1--26:32},
+ year = {2021},
+ url = {https://doi.org/10.1145/3483840},
+ doi = {10.1145/3483840},
+ timestamp = {Sun, 12 Feb 2023 18:49:27 +0100},
+ biburl = {https://dblp.org/rec/journals/tos/DongKJS21.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{dayan18-1,
+ author = {Niv Dayan and
+ Manos Athanassoulis and
+ Stratos Idreos},
+ title = {Optimal Bloom Filters and Adaptive Merging for LSM-Trees},
+ journal = {{ACM} Trans. Database Syst.},
+ volume = {43},
+ number = {4},
+ pages = {16:1--16:48},
+ year = {2018},
+ url = {https://doi.org/10.1145/3276980},
+ doi = {10.1145/3276980},
+ timestamp = {Sat, 05 Sep 2020 17:52:22 +0200},
+ biburl = {https://dblp.org/rec/journals/tods/DayanAI18.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{zhu21,
+ author = {Zichen Zhu and
+ Ju Hyoung Mun and
+ Aneesh Raman and
+ Manos Athanassoulis},
+ editor = {Danica Porobic and
+ Spyros Blanas},
+ title = {Reducing Bloom Filter {CPU} Overhead in LSM-Trees on Modern Storage
+ Devices},
+ booktitle = {Proceedings of the 17th International Workshop on Data Management
+ on New Hardware, DaMoN 2021, 21 June 2021, Virtual Event, China},
+ pages = {1:1--1:10},
+ publisher = {{ACM}},
+ year = {2021},
+ url = {https://doi.org/10.1145/3465998.3466002},
+ doi = {10.1145/3465998.3466002},
+ timestamp = {Thu, 14 Oct 2021 09:48:02 +0200},
+ biburl = {https://dblp.org/rec/conf/damon/ZhuMRA21.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{dayan19,
+ author = {Niv Dayan and
+ Stratos Idreos},
+ editor = {Peter A. Boncz and
+ Stefan Manegold and
+ Anastasia Ailamaki and
+ Amol Deshpande and
+ Tim Kraska},
+ title = {The Log-Structured Merge-Bush {\&} the Wacky Continuum},
+ booktitle = {Proceedings of the 2019 International Conference on Management of
+ Data, {SIGMOD} Conference 2019, Amsterdam, The Netherlands, June 30
+ - July 5, 2019},
+ pages = {449--466},
+ publisher = {{ACM}},
+ year = {2019},
+ url = {https://doi.org/10.1145/3299869.3319903},
+ doi = {10.1145/3299869.3319903},
+ timestamp = {Sat, 22 Jun 2019 17:10:04 +0200},
+ biburl = {https://dblp.org/rec/conf/sigmod/DayanI19.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{hu14,
+ author = {Xiaocheng Hu and
+ Miao Qiao and
+ Yufei Tao},
+ editor = {Richard Hull and
+ Martin Grohe},
+ title = {Independent range sampling},
+ booktitle = {Proceedings of the 33rd {ACM} {SIGMOD-SIGACT-SIGART} Symposium on
+ Principles of Database Systems, PODS'14, Snowbird, UT, USA, June 22-27,
+ 2014},
+ pages = {246--255},
+ publisher = {{ACM}},
+ year = {2014},
+ url = {https://doi.org/10.1145/2594538.2594545},
+ doi = {10.1145/2594538.2594545},
+ timestamp = {Thu, 29 Sep 2022 08:01:46 +0200},
+ biburl = {https://dblp.org/rec/conf/pods/HuQT14.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{saxe79,
+ author = {James B. Saxe and
+ Jon Louis Bentley},
+ title = {Transforming Static Data Structures to Dynamic Structures (Abridged
+ Version)},
+ booktitle = {20th Annual Symposium on Foundations of Computer Science, San Juan,
+ Puerto Rico, 29-31 October 1979},
+ pages = {148--168},
+ publisher = {{IEEE} Computer Society},
+ year = {1979},
+ url = {https://doi.org/10.1109/SFCS.1979.47},
+ doi = {10.1109/SFCS.1979.47},
+ timestamp = {Thu, 23 Mar 2023 23:57:52 +0100},
+ biburl = {https://dblp.org/rec/conf/focs/SaxeB79.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{overmars81,
+ author = {Mark H. Overmars and
+ Jan van Leeuwen},
+ title = {Worst-Case Optimal Insertion and Deletion Methods for Decomposable
+ Searching Problems},
+ journal = {Inf. Process. Lett.},
+ volume = {12},
+ number = {4},
+ pages = {168--173},
+ year = {1981},
+ url = {https://doi.org/10.1016/0020-0190(81)90093-4},
+ doi = {10.1016/0020-0190(81)90093-4},
+ timestamp = {Fri, 26 May 2017 22:54:44 +0200},
+ biburl = {https://dblp.org/rec/journals/ipl/OvermarsL81a.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{naidan14,
+ author = {Bilegsaikhan Naidan and
+ Magnus Lie Hetland},
+ title = {Static-to-dynamic transformation for metric indexing structures (extended
+ version)},
+ journal = {Inf. Syst.},
+ volume = {45},
+ pages = {48--60},
+ year = {2014},
+ url = {https://doi.org/10.1016/j.is.2013.08.002},
+ doi = {10.1016/j.is.2013.08.002},
+ timestamp = {Sat, 20 May 2017 00:24:08 +0200},
+ biburl = {https://dblp.org/rec/journals/is/NaidanH14.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{almodaresi23,
+ author = {Fatemeh Almodaresi and
+ Jamshed Khan and
+ Sergey Madaminov and
+ Michael Ferdman and
+ Rob Johnson and
+ Prashant Pandey and
+ Rob Patro},
+ title = {An incrementally updatable and scalable system for large-scale sequence
+ search using the Bentley-Saxe transformation},
+ journal = {Bioinform.},
+ volume = {38},
+ number = {12},
+ pages = {3155--3163},
+ year = {2022},
+ url = {https://doi.org/10.1093/bioinformatics/btac142},
+ doi = {10.1093/bioinformatics/btac142},
+ timestamp = {Mon, 22 Aug 2022 08:21:09 +0200},
+ biburl = {https://dblp.org/rec/journals/bioinformatics/AlmodaresiKMFJP22.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@book{bulmer79,
+ title = {Principles of Statistics},
+ author = {M.G. Bulmer},
+ year = {1979},
+ publisher = {Dover},
+ address = {New York}
+}
+
+@inproceedings{olken86,
+ author = {Frank Olken and
+ Doron Rotem},
+ editor = {Wesley W. Chu and
+ Georges Gardarin and
+ Setsuo Ohsuga and
+ Yahiko Kambayashi},
+ title = {Simple Random Sampling from Relational Databases},
+ booktitle = {VLDB'86 Twelfth International Conference on Very Large Data Bases,
+ August 25-28, 1986, Kyoto, Japan, Proceedings},
+ pages = {160--169},
+ publisher = {Morgan Kaufmann},
+ year = {1986},
+ url = {http://www.vldb.org/conf/1986/P160.PDF},
+ timestamp = {Wed, 29 Mar 2017 16:45:23 +0200},
+ biburl = {https://dblp.org/rec/conf/vldb/OlkenR86.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@misc {postgres-doc,
+ title = {PostgreSQL Documentation},
+ url = {https://www.postgresql.org/docs/15/sql-select.html},
+ year = {2023}
+}
+
+
+@online {pinecone,
+ title = {Pinecone DB},
+ url = {https://www.pinecone.io/},
+ year = {2023}
+}
+
+@online {neptune,
+ title = {Amazon Neptune},
+ url = {https://aws.amazon.com/neptune/},
+ year = {2023}
+}
+
+@online {teradata,
+ title = {Teradata},
+ url = {https://www.teradata.com/},
+ year = {2023}
+}
+
+@online {pinot,
+ title = {Apache Pinot},
+ url = {https://pinot.apache.org/},
+ year = {2023}
+}
+
+@online {pinecone-db,
+ title = {Pinecone DB: Hierarchical Navigable Small Worlds},
+ url = {https://www.pinecone.io/learn/series/faiss/hnsw/},
+ year = {2023}
+}
+
+@online {postgis-doc,
+ title = {Introduction to PostGIS: Spatial Indexing},
+ url = {https://postgis.net/workshops/postgis-intro/indexing.html},
+ year = {2023}
+}
+
+@online {mysql-btree-hash,
+ title = {MySQL Documentation - Comparison of B-tree and Hash Indexes},
+ url = {https://dev.mysql.com/doc/refman/8.0/en/index-btree-hash.html},
+ year = {2023}
+}
+
+@article{olken95,
+ author = {Frank Olken and
+ Doron Rotem},
+ title = {Random sampling from databases: a survey},
+ journal = {Statistics and Computing},
+ volume = {5},
+ pages = {25--42},
+ year = {1995},
+ url = {https://doi.org/10.1007/BF00140664},
+ doi = {10.1007/BF00140664}
+}
+
+@inproceedings{hu15,
+ author = {Xiaocheng Hu and
+ Miao Qiao and
+ Yufei Tao},
+ editor = {Tova Milo and
+ Diego Calvanese},
+ title = {External Memory Stream Sampling},
+ booktitle = {Proceedings of the 34th {ACM} Symposium on Principles of Database
+ Systems, {PODS} 2015, Melbourne, Victoria, Australia, May 31 - June
+ 4, 2015},
+ pages = {229--239},
+ publisher = {{ACM}},
+ year = {2015},
+ url = {https://doi.org/10.1145/2745754.2745757},
+ doi = {10.1145/2745754.2745757},
+ timestamp = {Thu, 29 Sep 2022 08:01:46 +0200},
+ biburl = {https://dblp.org/rec/conf/pods/HuQT15.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{vitter85,
+ author = {Jeffrey Scott Vitter},
+ title = {Random Sampling with a Reservoir},
+ journal = {{ACM} Trans. Math. Softw.},
+ volume = {11},
+ number = {1},
+ pages = {37--57},
+ year = {1985},
+ url = {https://doi.org/10.1145/3147.3165},
+ doi = {10.1145/3147.3165},
+ timestamp = {Tue, 21 Mar 2023 21:15:05 +0100},
+ biburl = {https://dblp.org/rec/journals/toms/Vitter85.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+@inproceedings{zhang18,
+ author = {Huanchen Zhang and
+ Hyeontaek Lim and
+ Viktor Leis and
+ David G. Andersen and
+ Michael Kaminsky and
+ Kimberly Keeton and
+ Andrew Pavlo},
+ editor = {Gautam Das and
+ Christopher M. Jermaine and
+ Philip A. Bernstein},
+ title = {SuRF: Practical Range Query Filtering with Fast Succinct Tries},
+ booktitle = {Proceedings of the 2018 International Conference on Management of
+ Data, {SIGMOD} Conference 2018, Houston, TX, USA, June 10-15, 2018},
+ pages = {323--336},
+ publisher = {{ACM}},
+ year = {2018},
+ url = {https://doi.org/10.1145/3183713.3196931},
+ doi = {10.1145/3183713.3196931},
+ timestamp = {Sun, 25 Oct 2020 22:52:40 +0100},
+ biburl = {https://dblp.org/rec/conf/sigmod/ZhangLLAKKP18.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+@inproceedings{siqiang20,
+ author = {Siqiang Luo and
+ Subarna Chatterjee and
+ Rafael Ketsetsidis and
+ Niv Dayan and
+ Wilson Qin and
+ Stratos Idreos},
+ editor = {David Maier and
+ Rachel Pottinger and
+ AnHai Doan and
+ Wang{-}Chiew Tan and
+ Abdussalam Alawini and
+ Hung Q. Ngo},
+ title = {Rosetta: {A} Robust Space-Time Optimized Range Filter for Key-Value
+ Stores},
+ booktitle = {Proceedings of the 2020 International Conference on Management of
+ Data, {SIGMOD} Conference 2020, online conference [Portland, OR, USA],
+ June 14-19, 2020},
+ pages = {2071--2086},
+ publisher = {{ACM}},
+ year = {2020},
+ url = {https://doi.org/10.1145/3318464.3389731},
+ doi = {10.1145/3318464.3389731},
+ timestamp = {Wed, 04 May 2022 13:02:28 +0200},
+ biburl = {https://dblp.org/rec/conf/sigmod/LuoCKDQI20.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+@inproceedings{blinkdb,
+ author = {Sameer Agarwal and
+ Barzan Mozafari and
+ Aurojit Panda and
+ Henry Milner and
+ Samuel Madden and
+ Ion Stoica},
+ editor = {Zdenek Hanz{\'{a}}lek and
+ Hermann H{\"{a}}rtig and
+ Miguel Castro and
+ M. Frans Kaashoek},
+ title = {BlinkDB: queries with bounded errors and bounded response times on
+ very large data},
+ booktitle = {Eighth Eurosys Conference 2013, EuroSys '13, Prague, Czech Republic,
+ April 14-17, 2013},
+ pages = {29--42},
+ publisher = {{ACM}},
+ year = {2013},
+ url = {https://doi.org/10.1145/2465351.2465355},
+ doi = {10.1145/2465351.2465355},
+ timestamp = {Wed, 06 Jul 2022 14:43:33 +0200},
+ biburl = {https://dblp.org/rec/conf/eurosys/AgarwalMPMMS13.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{sps,
+ author = {Bolin Ding and
+ Silu Huang and
+ Surajit Chaudhuri and
+ Kaushik Chakrabarti and
+ Chi Wang},
+ editor = {Fatma {\"{O}}zcan and
+ Georgia Koutrika and
+ Sam Madden},
+ title = {Sample + Seek: Approximating Aggregates with Distribution Precision
+ Guarantee},
+ booktitle = {Proceedings of the 2016 International Conference on Management of
+ Data, {SIGMOD} Conference 2016, San Francisco, CA, USA, June 26 -
+ July 01, 2016},
+ pages = {679--694},
+ publisher = {{ACM}},
+ year = {2016},
+ url = {https://doi.org/10.1145/2882903.2915249},
+ doi = {10.1145/2882903.2915249},
+ timestamp = {Wed, 14 Nov 2018 10:56:20 +0100},
+ biburl = {https://dblp.org/rec/conf/sigmod/DingHCC016.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@phdthesis{olken-thesis,
+ author = {Frank Olken},
+ title = {Random Sampling from Databases},
+ school = {University of California at Berkeley},
+ year = {1993},
+ timestamp = {Thu, 03 Jan 2002 12:33:26 +0100},
+ biburl = {https://dblp.org/rec/phd/Olken93.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{ml-sampling,
+ author = {Silu Huang and
+ Chi Wang and
+ Bolin Ding and
+ Surajit Chaudhuri},
+ title = {Efficient Identification of Approximate Best Configuration of Training
+ in Large Datasets},
+ booktitle = {The Thirty-Third {AAAI} Conference on Artificial Intelligence, {AAAI}
+ 2019, The Thirty-First Innovative Applications of Artificial Intelligence
+ Conference, {IAAI} 2019, The Ninth {AAAI} Symposium on Educational
+ Advances in Artificial Intelligence, {EAAI} 2019, Honolulu, Hawaii,
+ USA, January 27 - February 1, 2019},
+ pages = {3862--3869},
+ publisher = {{AAAI} Press},
+ year = {2019},
+ url = {https://doi.org/10.1609/aaai.v33i01.33013862},
+ doi = {10.1609/aaai.v33i01.33013862},
+ timestamp = {Tue, 02 Feb 2021 08:00:44 +0100},
+ biburl = {https://dblp.org/rec/conf/aaai/Huang0DC19.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{quickr,
+ author = {Srikanth Kandula and
+ Anil Shanbhag and
+ Aleksandar Vitorovic and
+ Matthaios Olma and
+ Robert Grandl and
+ Surajit Chaudhuri and
+ Bolin Ding},
+ editor = {Fatma {\"{O}}zcan and
+ Georgia Koutrika and
+ Sam Madden},
+ title = {Quickr: Lazily Approximating Complex AdHoc Queries in BigData Clusters},
+ booktitle = {Proceedings of the 2016 International Conference on Management of
+ Data, {SIGMOD} Conference 2016, San Francisco, CA, USA, June 26 -
+ July 01, 2016},
+ pages = {631--646},
+ publisher = {{ACM}},
+ year = {2016},
+ url = {https://doi.org/10.1145/2882903.2882940},
+ doi = {10.1145/2882903.2882940},
+ timestamp = {Wed, 14 Nov 2018 10:56:20 +0100},
+ biburl = {https://dblp.org/rec/conf/sigmod/KandulaSVOGCD16.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{verdict,
+ author = {Yongjoo Park and
+ Barzan Mozafari and
+ Joseph Sorenson and
+ Junhao Wang},
+ editor = {Gautam Das and
+ Christopher M. Jermaine and
+ Philip A. Bernstein},
+ title = {VerdictDB: Universalizing Approximate Query Processing},
+ booktitle = {Proceedings of the 2018 International Conference on Management of
+ Data, {SIGMOD} Conference 2018, Houston, TX, USA, June 10-15, 2018},
+ pages = {1461--1476},
+ publisher = {{ACM}},
+ year = {2018},
+ url = {https://doi.org/10.1145/3183713.3196905},
+ doi = {10.1145/3183713.3196905},
+ timestamp = {Wed, 21 Nov 2018 12:44:08 +0100},
+ biburl = {https://dblp.org/rec/conf/sigmod/ParkMSW18.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{irsra,
+ author = {Peyman Afshani and
+ Jeff M. Phillips},
+ editor = {Gill Barequet and
+ Yusu Wang},
+ title = {Independent Range Sampling, Revisited Again},
+ booktitle = {35th International Symposium on Computational Geometry, SoCG 2019,
+ June 18-21, 2019, Portland, Oregon, {USA}},
+ series = {LIPIcs},
+ volume = {129},
+ pages = {4:1--4:13},
+ publisher = {Schloss Dagstuhl - Leibniz-Zentrum f{\"{u}}r Informatik},
+ year = {2019},
+ url = {https://doi.org/10.4230/LIPIcs.SoCG.2019.4},
+ doi = {10.4230/LIPIcs.SoCG.2019.4},
+ timestamp = {Mon, 02 Jan 2023 09:02:13 +0100},
+ biburl = {https://dblp.org/rec/conf/compgeom/AfshaniP19.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{data-twitter1,
+ author = {Haewoon Kwak and
+ Changhyun Lee and
+ Hosung Park and
+ Sue B. Moon},
+ editor = {Michael Rappa and
+ Paul Jones and
+ Juliana Freire and
+ Soumen Chakrabarti},
+ title = {What is Twitter, a social network or a news media?},
+ booktitle = {Proceedings of the 19th International Conference on World Wide Web,
+ {WWW} 2010, Raleigh, North Carolina, USA, April 26-30, 2010},
+ pages = {591--600},
+ publisher = {{ACM}},
+ year = {2010},
+ url = {https://doi.org/10.1145/1772690.1772751},
+ doi = {10.1145/1772690.1772751},
+ timestamp = {Sun, 02 Jun 2019 21:15:56 +0200},
+ biburl = {https://dblp.org/rec/conf/www/KwakLPM10.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@online{data-twitter,
+ title = {Twitter Dataset},
+ url = {https://github.com/ANLAB-KAIST/traces/releases/tag/twitter_rv.net},
+ year = {2023}
+}
+
+@online{data-delicious,
+ title = {Delicious Dataset},
+ url = {http://konect.cc/networks/delicious-ti/},
+ year = {2023}
+}
+
+@online{data-osm,
+ title = {Open Street Map Dataset},
+ url = {https://planet.openstreetmap.org/},
+ year = {2023}
+}
+
+@inproceedings{golan-gueta15,
+ author = {Guy Golan{-}Gueta and
+ Edward Bortnikov and
+ Eshcar Hillel and
+ Idit Keidar},
+ editor = {Laurent R{\'{e}}veill{\`{e}}re and
+ Tim Harris and
+ Maurice Herlihy},
+ title = {Scaling concurrent log-structured data stores},
+ booktitle = {Proceedings of the Tenth European Conference on Computer Systems,
+ EuroSys 2015, Bordeaux, France, April 21-24, 2015},
+ pages = {32:1--32:14},
+ publisher = {{ACM}},
+ year = {2015},
+ url = {https://doi.org/10.1145/2741948.2741973},
+ doi = {10.1145/2741948.2741973},
+ timestamp = {Wed, 14 Nov 2018 10:57:04 +0100},
+ biburl = {https://dblp.org/rec/conf/eurosys/Golan-GuetaBHK15.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{custers19,
+ author = {Bram Custers and
+ Mees van de Kerkhof and
+ Wouter Meulemans and
+ Bettina Speckmann and
+ Frank Staals},
+ editor = {Farnoush Banaei Kashani and
+ Goce Trajcevski and
+ Ralf Hartmut G{\"{u}}ting and
+ Lars Kulik and
+ Shawn D. Newsam},
+ title = {Maximum Physically Consistent Trajectories},
+ booktitle = {Proceedings of the 27th {ACM} {SIGSPATIAL} International Conference
+ on Advances in Geographic Information Systems, {SIGSPATIAL} 2019,
+ Chicago, IL, USA, November 5-8, 2019},
+ pages = {79--88},
+ publisher = {{ACM}},
+ year = {2019},
+ url = {https://doi.org/10.1145/3347146.3359363},
+ doi = {10.1145/3347146.3359363},
+ timestamp = {Thu, 14 Nov 2019 10:14:43 +0100},
+ biburl = {https://dblp.org/rec/conf/gis/CustersKMSS19.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{aumuller20,
+ author = {Martin Aum{\"{u}}ller and
+ Rasmus Pagh and
+ Francesco Silvestri},
+ editor = {Dan Suciu and
+ Yufei Tao and
+ Zhewei Wei},
+ title = {Fair Near Neighbor Search: Independent Range Sampling in High Dimensions},
+ booktitle = {Proceedings of the 39th {ACM} {SIGMOD-SIGACT-SIGAI} Symposium on Principles
+ of Database Systems, {PODS} 2020, Portland, OR, USA, June 14-19, 2020},
+ pages = {191--204},
+ publisher = {{ACM}},
+ year = {2020},
+ url = {https://doi.org/10.1145/3375395.3387648},
+ doi = {10.1145/3375395.3387648},
+ timestamp = {Thu, 29 Sep 2022 08:01:46 +0200},
+ biburl = {https://dblp.org/rec/conf/pods/0001P020.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{cohen23,
+ author = {Edith Cohen},
+ editor = {Floris Geerts and
+ Hung Q. Ngo and
+ Stavros Sintos},
+ title = {Sampling Big Ideas in Query Optimization},
+ booktitle = {Proceedings of the 42nd {ACM} {SIGMOD-SIGACT-SIGAI} Symposium on Principles
+ of Database Systems, {PODS} 2023, Seattle, WA, USA, June 18-23, 2023},
+ pages = {361--371},
+ publisher = {{ACM}},
+ year = {2023},
+ url = {https://doi.org/10.1145/3584372.3589935},
+ doi = {10.1145/3584372.3589935},
+ timestamp = {Thu, 15 Jun 2023 21:57:01 +0200},
+ biburl = {https://dblp.org/rec/conf/pods/Cohen23.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+
+@inproceedings{gryz04,
+ author = {Jarek Gryz and
+ Junjie Guo and
+ Linqi Liu and
+ Calisto Zuzarte},
+ editor = {Gerhard Weikum and
+ Arnd Christian K{\"{o}}nig and
+ Stefan De{\ss}loch},
+ title = {Query Sampling in {DB2} Universal Database},
+ booktitle = {Proceedings of the {ACM} {SIGMOD} International Conference on Management
+ of Data, Paris, France, June 13-18, 2004},
+ pages = {839--843},
+ publisher = {{ACM}},
+ year = {2004},
+ url = {https://doi.org/10.1145/1007568.1007664},
+ doi = {10.1145/1007568.1007664},
+ timestamp = {Thu, 11 Mar 2021 15:20:15 +0100},
+ biburl = {https://dblp.org/rec/conf/sigmod/GryzGLZ04.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{ben-eliezer20,
+ author = {Omri Ben{-}Eliezer and
+ Eylon Yogev},
+ editor = {Dan Suciu and
+ Yufei Tao and
+ Zhewei Wei},
+ title = {The Adversarial Robustness of Sampling},
+ booktitle = {Proceedings of the 39th {ACM} {SIGMOD-SIGACT-SIGAI} Symposium on Principles
+ of Database Systems, {PODS} 2020, Portland, OR, USA, June 14-19, 2020},
+ pages = {49--62},
+ publisher = {{ACM}},
+ year = {2020},
+ url = {https://doi.org/10.1145/3375395.3387643},
+ doi = {10.1145/3375395.3387643},
+ timestamp = {Thu, 29 Sep 2022 08:01:46 +0200},
+ biburl = {https://dblp.org/rec/conf/pods/Ben-EliezerY20.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{li19,
+ author = {Feifei Li and
+ Bin Wu and
+ Ke Yi and
+ Zhuoyue Zhao},
+ title = {Wander Join and {XDB:} Online Aggregation via Random Walks},
+ journal = {{ACM} Trans. Database Syst.},
+ volume = {44},
+ number = {1},
+ pages = {2:1--2:41},
+ year = {2019},
+ url = {https://doi.org/10.1145/3284551},
+ doi = {10.1145/3284551},
+ timestamp = {Sun, 02 Oct 2022 15:51:46 +0200},
+ biburl = {https://dblp.org/rec/journals/tods/LiWYZ19.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{rdd,
+ author = {Matei Zaharia and
+ Mosharaf Chowdhury and
+ Tathagata Das and
+ Ankur Dave and
+ Justin Ma and
+ Murphy McCauly and
+ Michael J. Franklin and
+ Scott Shenker and
+ Ion Stoica},
+ editor = {Steven D. Gribble and
+ Dina Katabi},
+ title = {Resilient Distributed Datasets: {A} Fault-Tolerant Abstraction for
+ In-Memory Cluster Computing},
+ booktitle = {Proceedings of the 9th {USENIX} Symposium on Networked Systems Design
+ and Implementation, {NSDI} 2012, San Jose, CA, USA, April 25-27, 2012},
+ pages = {15--28},
+ publisher = {{USENIX} Association},
+ year = {2012},
+ url = {https://www.usenix.org/conference/nsdi12/technical-sessions/presentation/zaharia},
+ timestamp = {Tue, 21 Mar 2023 21:02:49 +0100},
+ biburl = {https://dblp.org/rec/conf/nsdi/ZahariaCDDMMFSS12.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{hdfs,
+ author = {Konstantin Shvachko and
+ Hairong Kuang and
+ Sanjay Radia and
+ Robert Chansler},
+ editor = {Mohammed G. Khatib and
+ Xubin He and
+ Michael Factor},
+ title = {The Hadoop Distributed File System},
+ booktitle = {{IEEE} 26th Symposium on Mass Storage Systems and Technologies, {MSST}
+ 2010, Lake Tahoe, Nevada, USA, May 3-7, 2010},
+ pages = {1--10},
+ publisher = {{IEEE} Computer Society},
+ year = {2010},
+ url = {https://doi.org/10.1109/MSST.2010.5496972},
+ doi = {10.1109/MSST.2010.5496972},
+ timestamp = {Fri, 24 Mar 2023 00:01:51 +0100},
+ biburl = {https://dblp.org/rec/conf/mss/ShvachkoKRC10.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+@book{overmars83,
+ author = {Mark H. Overmars},
+ title = {The Design of Dynamic Data Structures},
+ series = {Lecture Notes in Computer Science},
+ volume = {156},
+ publisher = {Springer},
+ year = {1983},
+ url = {https://doi.org/10.1007/BFb0014927},
+ doi = {10.1007/BFb0014927},
+ isbn = {3-540-12330-X},
+ timestamp = {Tue, 14 May 2019 10:00:35 +0200},
+ biburl = {https://dblp.org/rec/books/sp/Overmars83.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{vptree,
+ author = {Peter N. Yianilos},
+ editor = {Vijaya Ramachandran},
+ title = {Data Structures and Algorithms for Nearest Neighbor Search in General
+ Metric Spaces},
+ booktitle = {Proceedings of the Fourth Annual {ACM/SIGACT-SIAM} Symposium on Discrete
+ Algorithms, 25-27 January 1993, Austin, Texas, {USA}},
+ year = {1993},
+}
+
+@inproceedings{mtree,
+ author = {Paolo Ciaccia and
+ Marco Patella and
+ Pavel Zezula},
+ title = {M-tree: An Efficient Access Method for Similarity Search in Metric
+ Spaces},
+ booktitle = {VLDB'97, Proceedings of 23rd International Conference on Very Large Data Bases},
+ year = {1997},
+}
+
+@inproceedings{ALEX,
+ author = {Jialin Ding and
+ Umar Farooq Minhas and
+ Jia Yu and
+ Chi Wang and
+ Jaeyoung Do and
+ Yinan Li and
+ Hantian Zhang and
+ Badrish Chandramouli and
+ Johannes Gehrke and
+ Donald Kossmann and
+ David B. Lomet and
+ Tim Kraska},
+ title = {{ALEX:} An Updatable Adaptive Learned Index},
+ booktitle = {Proceedings of the 2020 ACM International Conference on Management of
+ Data},
+ year = {2020},
+}
+
+@article{pgm,
+ author = {Paolo Ferragina and
+ Giorgio Vinciguerra},
+ title = {The PGM-index: a fully-dynamic compressed learned index with provable
+ worst-case bounds},
+ journal = {Proc. {VLDB} Endow.},
+ volume = {13},
+ number = {8},
+ year = {2020},
+}
+
+@article{sosd-datasets,
+ author = {Ryan Marcus and
+ Andreas Kipf and
+ Alexander van Renen and
+ Mihail Stoian and
+ Sanchit Misra and
+ Alfons Kemper and
+ Thomas Neumann and
+ Tim Kraska},
+ title = {Benchmarking Learned Indexes},
+ journal = {Proc. {VLDB} Endow.},
+ volume = {14},
+ number = {1},
+ pages = {1--13},
+ year = {2020}
+}
+
+@inproceedings{RMI,
+author = {Kraska, Tim and Beutel, Alex and Chi, Ed H. and Dean, Jeffrey and Polyzotis, Neoklis},
+title = {The Case for Learned Index Structures},
+year = {2018},
+booktitle = {Proceedings of the 2018 International Conference on Management of Data},
+series = {SIGMOD '18}
+}
+
+@article{10.14778/3551793.3551848,
+author = {Wongkham, Chaichon and Lu, Baotong and Liu, Chris and Zhong, Zhicong and Lo, Eric and Wang, Tianzheng},
+title = {Are Updatable Learned Indexes Ready?},
+year = {2022},
+publisher = {VLDB Endowment},
+volume = {15},
+number = {11},
+journal = {Proc. VLDB Endow.},
+}
+
+@article{10.14778/2850583.2850584,
+author = {Wang, Lu and Christensen, Robert and Li, Feifei and Yi, Ke},
+title = {Spatial Online Sampling and Aggregation},
+year = {2015},
+publisher = {VLDB Endowment},
+volume = {9},
+number = {3},
+journal = {Proc. VLDB Endow.},
+}
+
+@article{plex,
+ author = {Mihail Stoian and
+ Andreas Kipf and
+ Ryan Marcus and
+ Tim Kraska},
+ title = {{PLEX:} Towards Practical Learned Indexing},
+ journal = {CoRR},
+ volume = {abs/2108.05117},
+ year = {2021},
+}
+
+@misc{sbw,
+ author = {Cardellino, Cristian},
+ title = {Spanish {B}illion {W}ords {C}orpus and {E}mbeddings},
+ url = {https://crscardellino.github.io/SBWCE/},
+ month = {August},
+ year = {2019}
+}
+
+@article{DBLP:journals/corr/abs-1712-01208,
+ author = {Tim Kraska and
+ Alex Beutel and
+ Ed H. Chi and
+ Jeffrey Dean and
+ Neoklis Polyzotis},
+ title = {The Case for Learned Index Structures},
+ journal = {CoRR},
+ volume = {abs/1712.01208},
+ year = {2017},
+}
+
+@article{DBLP:journals/corr/abs-1903-00507,
+ author = {Giorgio Vinciguerra and
+ Paolo Ferragina and
+ Michele Miccinesi},
+ title = {Superseding traditional indexes by orchestrating learning and geometry},
+ journal = {CoRR},
+ volume = {abs/1903.00507},
+ year = {2019},
+}
+
+@article{DBLP:journals/corr/abs-1905-08898,
+ author = {Jialin Ding and
+ Umar Farooq Minhas and
+ Hantian Zhang and
+ Yinan Li and
+ Chi Wang and
+ Badrish Chandramouli and
+ Johannes Gehrke and
+ Donald Kossmann and
+ David B. Lomet},
+ title = {{ALEX:} An Updatable Adaptive Learned Index},
+ journal = {CoRR},
+ volume = {abs/1905.08898},
+ year = {2019},
+}
+
+@inproceedings{10.1145/3332466.3374547,
+author = {Tang, Chuzhe and Wang, Youyun and Dong, Zhiyuan and Hu, Gansen and Wang, Zhaoguo and Wang, Minjie and Chen, Haibo},
+title = {XIndex: A Scalable Learned Index for Multicore Data Storage},
+year = {2020},
+booktitle = {Proceedings of the 25th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
+series = {PPoPP '20}
+}
+
+@article{10.14778/3489496.3489512,
+author = {Li, Pengfei and Hua, Yu and Jia, Jingnan and Zuo, Pengfei},
+title = {FINEdex: A Fine-Grained Learned Index Scheme for Scalable and Concurrent Memory Systems},
+year = {2021},
+publisher = {VLDB Endowment},
+volume = {15},
+number = {2},
+journal = {Proc. VLDB Endow.},
+}
+
+@inproceedings{10.1145/2933349.2933352,
+author = {Leis, Viktor and Scheibner, Florian and Kemper, Alfons and Neumann, Thomas},
+title = {The ART of Practical Synchronization},
+year = {2016},
+booktitle = {Proceedings of the 12th International Workshop on Data Management on New Hardware},
+series = {DaMoN '16}
+}
+
+@article{DBLP:journals/corr/abs-1910-06169,
+ author = {Paolo Ferragina and
+ Giorgio Vinciguerra},
+ title = {The PGM-index: a multicriteria, compressed and learned approach to
+ data indexing},
+ journal = {CoRR},
+ volume = {abs/1910.06169},
+ year = {2019},
+}
+
+@article{byods-datalog,
+author = {Sahebolamri, Arash and Barrett, Langston and Moore, Scott and Micinski, Kristopher},
+title = {Bring Your Own Data Structures to Datalog},
+year = {2023},
+issue_date = {October 2023},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {7},
+number = {OOPSLA2},
+url = {https://doi.org/10.1145/3622840},
+doi = {10.1145/3622840},
+abstract = {The restricted logic programming language Datalog has become a popular implementation target for deductive-analytic workloads including social-media analytics and program analysis. Modern Datalog engines compile Datalog rules to joins over explicit representations of relations—often B-trees or hash maps. While these modern engines have enabled high scalability in many application domains, they have a crucial weakness: achieving the desired algorithmic complexity may be impossible due to representation-imposed overhead of the engine’s data structures. In this paper, we present the "Bring Your Own Data Structures" (Byods) approach, in the form of a DSL embedded in Rust. Using Byods, an engineer writes logical rules which are implicitly parametric on the concrete data structure representation; our implementation provides an interface to enable "bringing their own" data structures to represent relations, which harmoniously interact with code generated by our compiler (implemented as Rust procedural macros). We formalize the semantics of Byods as an extension of Datalog’s; our formalization captures the key properties demanded of data structures compatible with Byods, including properties required for incrementalized (semi-na\"{\i}ve) evaluation. We detail many applications of the Byods approach, implementing analyses requiring specialized data structures for transitive and equivalence relations to scale, including an optimized version of the Rust borrow checker Polonius; highly-parallel PageRank made possible by lattices; and a large-scale analysis of LLVM utilizing index-sharing to scale. Our results show that Byods offers both improved algorithmic scalability (reduced time and/or space complexity) and runtimes competitive with state-of-the-art parallelizing Datalog solvers.},
+journal = {Proc. ACM Program. Lang.},
+month = {oct},
+articleno = {264},
+numpages = {26},
+keywords = {Program Analysis, Logic Programming, Static Analysis, Datalog}
+}
+
+
+@inproceedings{avl,
+ title={An algorithm for organization of information},
+ author={Adelson-Velskii, Georgii Maksimovich and Landis, Evgenii Mikhailovich},
+ booktitle={Doklady Akademii Nauk},
+ volume={146},
+ number={2},
+ pages={263--266},
+ year={1962},
+ organization={Russian Academy of Sciences}
+}
+
+@book{cowbook,
+ title={Database Management Systems},
+ author={Raghu Ramakrishnan and Johannes Gehrke},
+ edition={3},
+ publisher={McGraw-Hill},
+ year = {2003}
+}
+
+@book{intro-analysis,
+ title={Introduction to Real Analysis},
+ author={Christopher Heil},
+ edition={1},
+ publisher={Springer},
+ year = {2019}
+}
+
+@inproceedings{wavesofmisery,
+ author = {Nikolaus Glombiewski and
+ Bernhard Seeger and
+ Goetz Graefe},
+ editor = {Torsten Grust and
+ Felix Naumann and
+ Alexander B{\"{o}}hm and
+ Wolfgang Lehner and
+ Theo H{\"{a}}rder and
+ Erhard Rahm and
+ Andreas Heuer and
+ Meike Klettke and
+ Holger Meyer},
+ title = {Waves of Misery After Index Creation},
+ booktitle = {Datenbanksysteme f{\"{u}}r Business, Technologie und Web {(BTW}
+ 2019), 18. Fachtagung des GI-Fachbereichs ,,Datenbanken und Informationssysteme"
+ (DBIS), 4.-8. M{\"{a}}rz 2019, Rostock, Germany, Proceedings},
+ series = {{LNI}},
+ volume = {{P-289}},
+ pages = {77--96},
+ publisher = {Gesellschaft f{\"{u}}r Informatik, Bonn},
+ year = {2019},
+ url = {https://doi.org/10.18420/btw2019-06},
+ doi = {10.18420/BTW2019-06},
+ timestamp = {Wed, 13 Jan 2021 11:37:30 +0100},
+ biburl = {https://dblp.org/rec/conf/btw/GlombiewskiSG19.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{wavesofmisery-rtree,
+ author = {Lu Xing and
+ Eric Lee and
+ Tong An and
+ Bo{-}Cheng Chu and
+ Ahmed Mahmood and
+ Ahmed M. Aly and
+ Jianguo Wang and
+ Walid G. Aref},
+ title = {An Experimental Evaluation and Investigation of Waves of Misery in
+ R-trees},
+ journal = {Proc. {VLDB} Endow.},
+ volume = {15},
+ number = {3},
+ pages = {478--490},
+ year = {2021},
+ url = {http://www.vldb.org/pvldb/vol15/p478-aref.pdf},
+ doi = {10.14778/3494124.3494132},
+ timestamp = {Sun, 12 Nov 2023 02:17:29 +0100},
+ biburl = {https://dblp.org/rec/journals/pvldb/XingLACMAWA21.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{ubiq-btree,
+author = {Comer, Douglas},
+title = {The Ubiquitous B-Tree},
+year = {1979},
+issue_date = {June 1979},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {11},
+number = {2},
+issn = {0360-0300},
+url = {https://doi.org/10.1145/356770.356776},
+doi = {10.1145/356770.356776},
+journal = {ACM Comput. Surv.},
+month = {jun},
+pages = {121--137},
+numpages = {17}
+}
+
+@article{rocksdb,
+author = {Dong, Siying and Kryczka, Andrew and Jin, Yanqin and Stumm, Michael},
+title = {RocksDB: Evolution of Development Priorities in a Key-Value Store Serving Large-Scale Applications},
+year = {2021},
+issue_date = {November 2021},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {17},
+number = {4},
+issn = {1553-3077},
+url = {https://doi.org/10.1145/3483840},
+doi = {10.1145/3483840},
+abstract = {This article is an eight-year retrospective on development priorities for RocksDB, a key-value store developed at Facebook that targets large-scale distributed systems and that is optimized for Solid State Drives (SSDs). We describe how the priorities evolved over time as a result of hardware trends and extensive experiences running RocksDB at scale in production at a number of organizations: from optimizing write amplification, to space amplification, to CPU utilization. We describe lessons from running large-scale applications, including that resource allocation needs to be managed across different RocksDB instances, that data formats need to remain backward- and forward-compatible to allow incremental software rollouts, and that appropriate support for database replication and backups are needed. Lessons from failure handling taught us that data corruption errors needed to be detected earlier and that data integrity protection mechanisms are needed at every layer of the system. We describe improvements to the key-value interface. We describe a number of efforts that in retrospect proved to be misguided. Finally, we describe a number of open problems that could benefit from future research.},
+journal = {ACM Trans. Storage},
+month = {oct},
+articleno = {26},
+numpages = {32},
+keywords = {large-scale applications, RocksDB, SSD, compaction, databases, Key-value stores}
+}
+
+
+@techreport{ubiq-rtree,
+ title={R-trees have grown everywhere},
+ author={Manolopoulos, Yannis and Nanopoulos, Alexandros and Papadopoulos, Apostolos N and Theodoridis, Yannis},
+ year={2003},
+ institution={Technical Report, available at http://www.rtreeportal.org}
+}
+
+@article{mergeable-summaries,
+ author = {Pankaj K. Agarwal and
+ Graham Cormode and
+ Zengfeng Huang and
+ Jeff M. Phillips and
+ Zhewei Wei and
+ Ke Yi},
+ title = {Mergeable summaries},
+ journal = {{ACM} Trans. Database Syst.},
+ volume = {38},
+ number = {4},
+ pages = {26},
+ year = {2013},
+ url = {https://doi.org/10.1145/2500128},
+ doi = {10.1145/2500128},
+ timestamp = {Tue, 21 Mar 2023 21:14:49 +0100},
+ biburl = {https://dblp.org/rec/journals/tods/AgarwalCHPWY13.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+
+@inproceedings{countmin-sketch,
+ author = {Graham Cormode and
+ S. Muthukrishnan},
+ editor = {Martin Farach{-}Colton},
+ title = {An Improved Data Stream Summary: The Count-Min Sketch and Its Applications},
+ booktitle = {{LATIN} 2004: Theoretical Informatics, 6th Latin American Symposium,
+ Buenos Aires, Argentina, April 5-8, 2004, Proceedings},
+ series = {Lecture Notes in Computer Science},
+ volume = {2976},
+ pages = {29--38},
+ publisher = {Springer},
+ year = {2004},
+ url = {https://doi.org/10.1007/978-3-540-24698-5\_7},
+ doi = {10.1007/978-3-540-24698-5\_7},
+ timestamp = {Fri, 07 May 2021 12:53:47 +0200},
+ biburl = {https://dblp.org/rec/conf/latin/CormodeM04.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{isam-overflow,
+author = {Larson, Per-\r{A}ke},
+title = {Analysis of Index-Sequential Files with Overflow Chaining},
+year = {1981},
+issue_date = {Dec. 1981},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {6},
+number = {4},
+issn = {0362-5915},
+url = {https://doi.org/10.1145/319628.319665},
+doi = {10.1145/319628.319665},
+abstract = {The gradual performance deterioration caused by deletions from and insertions into an index-sequential file after loading is analyzed. The model developed assumes that overflow records are handled by chaining. Formulas for computing the expected number of overflow records and the expected number of additional accesses caused by the overflow records for both successful and unsuccessful searches are derived.},
+journal = {ACM Trans. Database Syst.},
+month = {dec},
+pages = {671--680},
+numpages = {10},
+keywords = {analytic model, analysis of algorithms, overflow chaining, performance analysis, ISAM, overflow, overflow handling, indexed sequential access method, index sequential files, file organization, file structure}
+}