summaryrefslogtreecommitdiffstats
path: root/include/shard/VPTree.h
diff options
context:
space:
mode:
authorDouglas Rumbaugh <dbr4@psu.edu>2025-09-25 14:42:44 -0400
committerDouglas Rumbaugh <dbr4@psu.edu>2025-09-25 14:42:44 -0400
commitcf5f3bbb0cb58430ed68ad3ebfcefc009e553d71 (patch)
tree4c17bc3169ee195c236cea9c9efda0aef7488e3c /include/shard/VPTree.h
parent826c1fff5accbaa6b415acc176a5acbeb5f691b6 (diff)
downloaddynamic-extension-cf5f3bbb0cb58430ed68ad3ebfcefc009e553d71.tar.gz
Code reformatting
Diffstat (limited to 'include/shard/VPTree.h')
-rw-r--r--include/shard/VPTree.h605
1 files changed, 301 insertions, 304 deletions
diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h
index 7130efe..33ce9b9 100644
--- a/include/shard/VPTree.h
+++ b/include/shard/VPTree.h
@@ -15,379 +15,376 @@
#include <vector>
-#include <unordered_map>
#include "framework/ShardRequirements.h"
#include "psu-ds/PriorityQueue.h"
+#include <unordered_map>
+using psudb::byte;
using psudb::CACHELINE_SIZE;
using psudb::PriorityQueue;
-using psudb::byte;
namespace de {
-template <NDRecordInterface R, size_t LEAFSZ=100, bool HMAP=false>
+template <NDRecordInterface R, size_t LEAFSZ = 100, bool HMAP = false>
class VPTree {
public:
- typedef R RECORD;
+ typedef R RECORD;
private:
- struct vpnode {
- size_t start;
- size_t stop;
- bool leaf;
+ struct vpnode {
+ size_t start;
+ size_t stop;
+ bool leaf;
+
+ double radius;
+ vpnode *inside;
+ vpnode *outside;
+
+ vpnode()
+ : start(0), stop(0), leaf(false), radius(0.0), inside(nullptr),
+ outside(nullptr) {}
+
+ ~vpnode() {
+ delete inside;
+ delete outside;
+ }
+ };
- double radius;
- vpnode *inside;
- vpnode *outside;
+public:
+ VPTree(BufferView<R> buffer)
+ : m_reccnt(0), m_tombstone_cnt(0), m_node_cnt(0), m_root(nullptr) {
+
+ m_alloc_size = psudb::sf_aligned_alloc(
+ CACHELINE_SIZE, buffer.get_record_count() * sizeof(Wrapped<R>),
+ (byte **)&m_data);
+
+ m_ptrs = new vp_ptr[buffer.get_record_count()];
+ m_reccnt = 0;
+
+ // FIXME: will eventually need to figure out tombstones
+ // this one will likely require the multi-pass
+ // approach, as otherwise we'll need to sort the
+ // records repeatedly on each reconstruction.
+ for (size_t i = 0; i < buffer.get_record_count(); i++) {
+ auto rec = buffer.get(i);
+
+ if (rec->is_deleted()) {
+ continue;
+ }
+
+ rec->header &= 3;
+ m_data[m_reccnt] = *rec;
+ m_ptrs[m_reccnt].ptr = &m_data[m_reccnt];
+ m_reccnt++;
+ }
- vpnode() : start(0), stop(0), leaf(false), radius(0.0), inside(nullptr), outside(nullptr) {}
+ if (m_reccnt > 0) {
+ m_root = build_vptree();
+ build_map();
+ }
+ }
- ~vpnode() {
- delete inside;
- delete outside;
- }
- };
+ VPTree(std::vector<const VPTree *> shards)
+ : m_reccnt(0), m_tombstone_cnt(0), m_node_cnt(0), m_root(nullptr) {
+ size_t attemp_reccnt = 0;
+ for (size_t i = 0; i < shards.size(); i++) {
+ attemp_reccnt += shards[i]->get_record_count();
+ }
+ m_alloc_size = psudb::sf_aligned_alloc(
+ CACHELINE_SIZE, attemp_reccnt * sizeof(Wrapped<R>), (byte **)&m_data);
+ m_ptrs = new vp_ptr[attemp_reccnt];
+
+ // FIXME: will eventually need to figure out tombstones
+ // this one will likely require the multi-pass
+ // approach, as otherwise we'll need to sort the
+ // records repeatedly on each reconstruction.
+ for (size_t i = 0; i < shards.size(); i++) {
+ for (size_t j = 0; j < shards[i]->get_record_count(); j++) {
+ if (shards[i]->get_record_at(j)->is_deleted()) {
+ continue;
+ }
-public:
- VPTree(BufferView<R> buffer)
- : m_reccnt(0), m_tombstone_cnt(0), m_node_cnt(0), m_root(nullptr) {
+ m_data[m_reccnt] = *shards[i]->get_record_at(j);
+ m_ptrs[m_reccnt].ptr = &m_data[m_reccnt];
+ m_reccnt++;
+ }
+ }
+ if (m_reccnt > 0) {
+ m_root = build_vptree();
+ build_map();
+ }
+ }
- m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE,
- buffer.get_record_count() *
- sizeof(Wrapped<R>),
- (byte**) &m_data);
+ ~VPTree() {
+ free(m_data);
+ delete m_root;
+ delete[] m_ptrs;
+ }
- m_ptrs = new vp_ptr[buffer.get_record_count()];
- m_reccnt = 0;
+ Wrapped<R> *point_lookup(const R &rec, bool filter = false) {
+ if constexpr (HMAP) {
+ auto idx = m_lookup_map.find(rec);
- // FIXME: will eventually need to figure out tombstones
- // this one will likely require the multi-pass
- // approach, as otherwise we'll need to sort the
- // records repeatedly on each reconstruction.
- for (size_t i=0; i<buffer.get_record_count(); i++) {
- auto rec = buffer.get(i);
+ if (idx == m_lookup_map.end()) {
+ return nullptr;
+ }
- if (rec->is_deleted()) {
- continue;
- }
+ return m_data + idx->second;
+ } else {
+ vpnode *node = m_root;
- rec->header &= 3;
- m_data[m_reccnt] = *rec;
- m_ptrs[m_reccnt].ptr = &m_data[m_reccnt];
- m_reccnt++;
+ while (!node->leaf && m_ptrs[node->start].ptr->rec != rec) {
+ if (rec.calc_distance((m_ptrs[node->start].ptr->rec)) >= node->radius) {
+ node = node->outside;
+ } else {
+ node = node->inside;
}
+ }
- if (m_reccnt > 0) {
- m_root = build_vptree();
- build_map();
+ for (size_t i = node->start; i <= node->stop; i++) {
+ if (m_ptrs[i].ptr->rec == rec) {
+ return m_ptrs[i].ptr;
}
+ }
+
+ return nullptr;
}
+ }
- VPTree(std::vector<const VPTree*> shards)
- : m_reccnt(0), m_tombstone_cnt(0), m_node_cnt(0), m_root(nullptr) {
+ Wrapped<R> *get_data() const { return m_data; }
- size_t attemp_reccnt = 0;
- for (size_t i=0; i<shards.size(); i++) {
- attemp_reccnt += shards[i]->get_record_count();
- }
+ size_t get_record_count() const { return m_reccnt; }
- m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE,
- attemp_reccnt * sizeof(Wrapped<R>),
- (byte **) &m_data);
- m_ptrs = new vp_ptr[attemp_reccnt];
-
- // FIXME: will eventually need to figure out tombstones
- // this one will likely require the multi-pass
- // approach, as otherwise we'll need to sort the
- // records repeatedly on each reconstruction.
- for (size_t i=0; i<shards.size(); i++) {
- for (size_t j=0; j<shards[i]->get_record_count(); j++) {
- if (shards[i]->get_record_at(j)->is_deleted()) {
- continue;
- }
-
- m_data[m_reccnt] = *shards[i]->get_record_at(j);
- m_ptrs[m_reccnt].ptr = &m_data[m_reccnt];
- m_reccnt++;
- }
- }
+ size_t get_tombstone_count() const { return m_tombstone_cnt; }
- if (m_reccnt > 0) {
- m_root = build_vptree();
- build_map();
- }
- }
+ const Wrapped<R> *get_record_at(size_t idx) const {
+ if (idx >= m_reccnt)
+ return nullptr;
+ return m_data + idx;
+ }
- ~VPTree() {
- free(m_data);
- delete m_root;
- delete[] m_ptrs;
- }
+ size_t get_memory_usage() const {
+ return m_node_cnt * sizeof(vpnode) + m_reccnt * sizeof(R *);
+ }
- Wrapped<R> *point_lookup(const R &rec, bool filter=false) {
- if constexpr (HMAP) {
- auto idx = m_lookup_map.find(rec);
+ size_t get_aux_memory_usage() const {
+ // FIXME: need to return the size of the unordered_map
+ return 0;
+ }
- if (idx == m_lookup_map.end()) {
- return nullptr;
- }
+ void search(const R &point, size_t k,
+ PriorityQueue<Wrapped<R>, DistCmpMax<Wrapped<R>>> &pq) {
+ double farthest = std::numeric_limits<double>::max();
- return m_data + idx->second;
- } else {
- vpnode *node = m_root;
-
- while (!node->leaf && m_ptrs[node->start].ptr->rec != rec) {
- if (rec.calc_distance((m_ptrs[node->start].ptr->rec)) >= node->radius) {
- node = node->outside;
- } else {
- node = node->inside;
- }
- }
-
- for (size_t i=node->start; i<=node->stop; i++) {
- if (m_ptrs[i].ptr->rec == rec) {
- return m_ptrs[i].ptr;
- }
- }
-
- return nullptr;
- }
- }
+ internal_search(m_root, point, k, pq, &farthest);
+ }
- Wrapped<R>* get_data() const {
- return m_data;
- }
-
- size_t get_record_count() const {
- return m_reccnt;
+private:
+ struct vp_ptr {
+ Wrapped<R> *ptr;
+ double dist;
+ };
+ Wrapped<R> *m_data;
+ vp_ptr *m_ptrs;
+ std::unordered_map<R, size_t, RecordHash<R>> m_lookup_map;
+ size_t m_reccnt;
+ size_t m_tombstone_cnt;
+ size_t m_node_cnt;
+ size_t m_alloc_size;
+
+ vpnode *m_root;
+
+ vpnode *build_vptree() {
+ if (m_reccnt == 0) {
+ return nullptr;
}
- size_t get_tombstone_count() const {
- return m_tombstone_cnt;
- }
+ size_t lower = 0;
+ size_t upper = m_reccnt - 1;
- const Wrapped<R>* get_record_at(size_t idx) const {
- if (idx >= m_reccnt) return nullptr;
- return m_data + idx;
- }
+ auto rng = gsl_rng_alloc(gsl_rng_mt19937);
+ auto root = build_subtree(lower, upper, rng);
+ gsl_rng_free(rng);
+ return root;
+ }
- size_t get_memory_usage() const {
- return m_node_cnt * sizeof(vpnode) + m_reccnt * sizeof(R*);
+ void build_map() {
+ // Skip constructing the hashmap if disabled in the
+ // template parameters.
+ if constexpr (!HMAP) {
+ return;
}
- size_t get_aux_memory_usage() const {
- // FIXME: need to return the size of the unordered_map
- return 0;
+ for (size_t i = 0; i < m_reccnt; i++) {
+ // FIXME: Will need to account for tombstones here too. Under
+ // tombstones, it is technically possible for two otherwise identical
+ // instances of the same record to exist within the same shard, so
+ // long as one of them is a tombstone. Because the table is currently
+ // using the unwrapped records for the key, it isn't possible for it
+ // to handle this case right now.
+ m_lookup_map.insert({m_data[i].rec, i});
}
-
- void search(const R &point, size_t k, PriorityQueue<Wrapped<R>,
- DistCmpMax<Wrapped<R>>> &pq) {
- double farthest = std::numeric_limits<double>::max();
-
- internal_search(m_root, point, k, pq, &farthest);
+ }
+
+ vpnode *build_subtree(size_t start, size_t stop, gsl_rng *rng) {
+ /*
+ * base-case: sometimes happens (probably because of the +1 and -1
+ * in the first recursive call)
+ */
+ if (start > stop) {
+ return nullptr;
}
-private:
- struct vp_ptr {
- Wrapped<R> *ptr;
- double dist;
- };
- Wrapped<R>* m_data;
- vp_ptr* m_ptrs;
- std::unordered_map<R, size_t, RecordHash<R>> m_lookup_map;
- size_t m_reccnt;
- size_t m_tombstone_cnt;
- size_t m_node_cnt;
- size_t m_alloc_size;
-
- vpnode *m_root;
-
- vpnode *build_vptree() {
- if (m_reccnt == 0) {
- return nullptr;
- }
-
- size_t lower = 0;
- size_t upper = m_reccnt - 1;
+ /* base-case: create a leaf node */
+ if (stop - start <= LEAFSZ) {
+ vpnode *node = new vpnode();
+ node->start = start;
+ node->stop = stop;
+ node->leaf = true;
- auto rng = gsl_rng_alloc(gsl_rng_mt19937);
- auto root = build_subtree(lower, upper, rng);
- gsl_rng_free(rng);
- return root;
+ m_node_cnt++;
+ return node;
}
- void build_map() {
- // Skip constructing the hashmap if disabled in the
- // template parameters.
- if constexpr (!HMAP) {
- return;
- }
-
- for (size_t i=0; i<m_reccnt; i++) {
- // FIXME: Will need to account for tombstones here too. Under
- // tombstones, it is technically possible for two otherwise identical
- // instances of the same record to exist within the same shard, so
- // long as one of them is a tombstone. Because the table is currently
- // using the unwrapped records for the key, it isn't possible for it
- // to handle this case right now.
- m_lookup_map.insert({m_data[i].rec, i});
- }
+ /*
+ * select a random element to be the root of the
+ * subtree
+ */
+ auto i = start + gsl_rng_uniform_int(rng, stop - start + 1);
+ swap(start, i);
+
+ /* for efficiency, we'll pre-calculate the distances between each point and
+ * the root */
+ for (size_t i = start + 1; i <= stop; i++) {
+ m_ptrs[i].dist = m_ptrs[start].ptr->rec.calc_distance(m_ptrs[i].ptr->rec);
}
- vpnode *build_subtree(size_t start, size_t stop, gsl_rng *rng) {
- /*
- * base-case: sometimes happens (probably because of the +1 and -1
- * in the first recursive call)
- */
- if (start > stop) {
- return nullptr;
- }
-
- /* base-case: create a leaf node */
- if (stop - start <= LEAFSZ) {
- vpnode *node = new vpnode();
- node->start = start;
- node->stop = stop;
- node->leaf = true;
-
- m_node_cnt++;
- return node;
- }
-
- /*
- * select a random element to be the root of the
- * subtree
- */
- auto i = start + gsl_rng_uniform_int(rng, stop - start + 1);
- swap(start, i);
-
- /* for efficiency, we'll pre-calculate the distances between each point and the root */
- for (size_t i=start+1; i<=stop; i++) {
- m_ptrs[i].dist = m_ptrs[start].ptr->rec.calc_distance(m_ptrs[i].ptr->rec);
- }
-
- /*
- * partition elements based on their distance from the start,
- * with those elements with distance falling below the median
- * distance going into the left sub-array and those above
- * the median in the right. This is easily done using QuickSelect.
- */
- auto mid = (start + 1 + stop) / 2;
- quickselect(start + 1, stop, mid, m_ptrs[start].ptr, rng);
-
- /* Create a new node based on this partitioning */
- vpnode *node = new vpnode();
- node->start = start;
+ /*
+ * partition elements based on their distance from the start,
+ * with those elements with distance falling below the median
+ * distance going into the left sub-array and those above
+ * the median in the right. This is easily done using QuickSelect.
+ */
+ auto mid = (start + 1 + stop) / 2;
+ quickselect(start + 1, stop, mid, m_ptrs[start].ptr, rng);
- /* store the radius of the circle used for partitioning the node. */
- node->radius = m_ptrs[start].ptr->rec.calc_distance(m_ptrs[mid].ptr->rec);
- m_ptrs[start].dist = node->radius;
+ /* Create a new node based on this partitioning */
+ vpnode *node = new vpnode();
+ node->start = start;
- /* recursively construct the left and right subtrees */
- node->inside = build_subtree(start + 1, mid-1, rng);
- node->outside = build_subtree(mid, stop, rng);
+ /* store the radius of the circle used for partitioning the node. */
+ node->radius = m_ptrs[start].ptr->rec.calc_distance(m_ptrs[mid].ptr->rec);
+ m_ptrs[start].dist = node->radius;
- m_node_cnt++;
+ /* recursively construct the left and right subtrees */
+ node->inside = build_subtree(start + 1, mid - 1, rng);
+ node->outside = build_subtree(mid, stop, rng);
- return node;
- }
+ m_node_cnt++;
- void quickselect(size_t start, size_t stop, size_t k, Wrapped<R> *p, gsl_rng *rng) {
- if (start == stop) return;
+ return node;
+ }
- auto pivot = partition(start, stop, p, rng);
+ void quickselect(size_t start, size_t stop, size_t k, Wrapped<R> *p,
+ gsl_rng *rng) {
+ if (start == stop)
+ return;
- if (k < pivot) {
- quickselect(start, pivot - 1, k, p, rng);
- } else if (k > pivot) {
- quickselect(pivot + 1, stop, k, p, rng);
- }
- }
+ auto pivot = partition(start, stop, p, rng);
- // TODO: The quickselect code can probably be generalized and moved out
- // to psudb-common instead.
- size_t partition(size_t start, size_t stop, Wrapped<R> *p, gsl_rng *rng) {
- auto pivot = start + gsl_rng_uniform_int(rng, stop - start);
- //double pivot_dist = p->rec.calc_distance(m_ptrs[pivot]->rec);
-
- swap(pivot, stop);
-
- size_t j = start;
- for (size_t i=start; i<stop; i++) {
- if (m_ptrs[i].dist < m_ptrs[stop].dist) {
- //assert(distances[i - start] == p->rec.calc_distance(m_ptrs[i]->rec));
- //if (distances[i - start] < distances[stop - start]) {
- //if (p->rec .calc_distance(m_ptrs[i]->rec) < pivot_dist) {
- swap(j, i);
- j++;
- }
- }
-
- swap(j, stop);
- return j;
+ if (k < pivot) {
+ quickselect(start, pivot - 1, k, p, rng);
+ } else if (k > pivot) {
+ quickselect(pivot + 1, stop, k, p, rng);
}
-
- void swap(size_t idx1, size_t idx2) {
- auto tmp = m_ptrs[idx1];
- m_ptrs[idx1] = m_ptrs[idx2];
- m_ptrs[idx2] = tmp;
+ }
+
+ // TODO: The quickselect code can probably be generalized and moved out
+ // to psudb-common instead.
+ size_t partition(size_t start, size_t stop, Wrapped<R> *p, gsl_rng *rng) {
+ auto pivot = start + gsl_rng_uniform_int(rng, stop - start);
+ // double pivot_dist = p->rec.calc_distance(m_ptrs[pivot]->rec);
+
+ swap(pivot, stop);
+
+ size_t j = start;
+ for (size_t i = start; i < stop; i++) {
+ if (m_ptrs[i].dist < m_ptrs[stop].dist) {
+ // assert(distances[i - start] == p->rec.calc_distance(m_ptrs[i]->rec));
+ // if (distances[i - start] < distances[stop - start]) {
+ // if (p->rec .calc_distance(m_ptrs[i]->rec) < pivot_dist) {
+ swap(j, i);
+ j++;
+ }
}
- void internal_search(vpnode *node, const R &point, size_t k, PriorityQueue<Wrapped<R>,
- DistCmpMax<Wrapped<R>>> &pq, double *farthest) {
-
- if (node == nullptr) return;
+ swap(j, stop);
+ return j;
+ }
- if (node->leaf) {
- for (size_t i=node->start; i<=node->stop; i++) {
- double d = point.calc_distance(m_ptrs[i].ptr->rec);
- if (d < *farthest) {
- if (pq.size() == k) {
- pq.pop();
- }
+ void swap(size_t idx1, size_t idx2) {
+ auto tmp = m_ptrs[idx1];
+ m_ptrs[idx1] = m_ptrs[idx2];
+ m_ptrs[idx2] = tmp;
+ }
- pq.push(m_ptrs[i].ptr);
- if (pq.size() == k) {
- *farthest = point.calc_distance(pq.peek().data->rec);
- }
- }
- }
+ void internal_search(vpnode *node, const R &point, size_t k,
+ PriorityQueue<Wrapped<R>, DistCmpMax<Wrapped<R>>> &pq,
+ double *farthest) {
- return;
- }
-
- double d = point.calc_distance(m_ptrs[node->start].ptr->rec);
+ if (node == nullptr)
+ return;
+ if (node->leaf) {
+ for (size_t i = node->start; i <= node->stop; i++) {
+ double d = point.calc_distance(m_ptrs[i].ptr->rec);
if (d < *farthest) {
- if (pq.size() == k) {
- pq.pop();
- }
- pq.push(m_ptrs[node->start].ptr);
- if (pq.size() == k) {
- *farthest = point.calc_distance(pq.peek().data->rec);
- }
+ if (pq.size() == k) {
+ pq.pop();
+ }
+
+ pq.push(m_ptrs[i].ptr);
+ if (pq.size() == k) {
+ *farthest = point.calc_distance(pq.peek().data->rec);
+ }
}
+ }
- if (d < node->radius) {
- if (d - (*farthest) <= node->radius) {
- internal_search(node->inside, point, k, pq, farthest);
- }
+ return;
+ }
- if (d + (*farthest) >= node->radius) {
- internal_search(node->outside, point, k, pq, farthest);
- }
- } else {
- if (d + (*farthest) >= node->radius) {
- internal_search(node->outside, point, k, pq, farthest);
- }
+ double d = point.calc_distance(m_ptrs[node->start].ptr->rec);
- if (d - (*farthest) <= node->radius) {
- internal_search(node->inside, point, k, pq, farthest);
- }
- }
+ if (d < *farthest) {
+ if (pq.size() == k) {
+ pq.pop();
+ }
+ pq.push(m_ptrs[node->start].ptr);
+ if (pq.size() == k) {
+ *farthest = point.calc_distance(pq.peek().data->rec);
+ }
+ }
+
+ if (d < node->radius) {
+ if (d - (*farthest) <= node->radius) {
+ internal_search(node->inside, point, k, pq, farthest);
+ }
+
+ if (d + (*farthest) >= node->radius) {
+ internal_search(node->outside, point, k, pq, farthest);
+ }
+ } else {
+ if (d + (*farthest) >= node->radius) {
+ internal_search(node->outside, point, k, pq, farthest);
+ }
+
+ if (d - (*farthest) <= node->radius) {
+ internal_search(node->inside, point, k, pq, farthest);
+ }
}
- };
-}
+ }
+};
+} // namespace de