From cf5f3bbb0cb58430ed68ad3ebfcefc009e553d71 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 25 Sep 2025 14:42:44 -0400 Subject: Code reformatting --- include/shard/VPTree.h | 605 ++++++++++++++++++++++++------------------------- 1 file changed, 301 insertions(+), 304 deletions(-) (limited to 'include/shard/VPTree.h') diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h index 7130efe..33ce9b9 100644 --- a/include/shard/VPTree.h +++ b/include/shard/VPTree.h @@ -15,379 +15,376 @@ #include -#include #include "framework/ShardRequirements.h" #include "psu-ds/PriorityQueue.h" +#include +using psudb::byte; using psudb::CACHELINE_SIZE; using psudb::PriorityQueue; -using psudb::byte; namespace de { -template +template class VPTree { public: - typedef R RECORD; + typedef R RECORD; private: - struct vpnode { - size_t start; - size_t stop; - bool leaf; + struct vpnode { + size_t start; + size_t stop; + bool leaf; + + double radius; + vpnode *inside; + vpnode *outside; + + vpnode() + : start(0), stop(0), leaf(false), radius(0.0), inside(nullptr), + outside(nullptr) {} + + ~vpnode() { + delete inside; + delete outside; + } + }; - double radius; - vpnode *inside; - vpnode *outside; +public: + VPTree(BufferView buffer) + : m_reccnt(0), m_tombstone_cnt(0), m_node_cnt(0), m_root(nullptr) { + + m_alloc_size = psudb::sf_aligned_alloc( + CACHELINE_SIZE, buffer.get_record_count() * sizeof(Wrapped), + (byte **)&m_data); + + m_ptrs = new vp_ptr[buffer.get_record_count()]; + m_reccnt = 0; + + // FIXME: will eventually need to figure out tombstones + // this one will likely require the multi-pass + // approach, as otherwise we'll need to sort the + // records repeatedly on each reconstruction. + for (size_t i = 0; i < buffer.get_record_count(); i++) { + auto rec = buffer.get(i); + + if (rec->is_deleted()) { + continue; + } + + rec->header &= 3; + m_data[m_reccnt] = *rec; + m_ptrs[m_reccnt].ptr = &m_data[m_reccnt]; + m_reccnt++; + } - vpnode() : start(0), stop(0), leaf(false), radius(0.0), inside(nullptr), outside(nullptr) {} + if (m_reccnt > 0) { + m_root = build_vptree(); + build_map(); + } + } - ~vpnode() { - delete inside; - delete outside; - } - }; + VPTree(std::vector shards) + : m_reccnt(0), m_tombstone_cnt(0), m_node_cnt(0), m_root(nullptr) { + size_t attemp_reccnt = 0; + for (size_t i = 0; i < shards.size(); i++) { + attemp_reccnt += shards[i]->get_record_count(); + } + m_alloc_size = psudb::sf_aligned_alloc( + CACHELINE_SIZE, attemp_reccnt * sizeof(Wrapped), (byte **)&m_data); + m_ptrs = new vp_ptr[attemp_reccnt]; + + // FIXME: will eventually need to figure out tombstones + // this one will likely require the multi-pass + // approach, as otherwise we'll need to sort the + // records repeatedly on each reconstruction. 
+ for (size_t i = 0; i < shards.size(); i++) { + for (size_t j = 0; j < shards[i]->get_record_count(); j++) { + if (shards[i]->get_record_at(j)->is_deleted()) { + continue; + } -public: - VPTree(BufferView buffer) - : m_reccnt(0), m_tombstone_cnt(0), m_node_cnt(0), m_root(nullptr) { + m_data[m_reccnt] = *shards[i]->get_record_at(j); + m_ptrs[m_reccnt].ptr = &m_data[m_reccnt]; + m_reccnt++; + } + } + if (m_reccnt > 0) { + m_root = build_vptree(); + build_map(); + } + } - m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, - buffer.get_record_count() * - sizeof(Wrapped), - (byte**) &m_data); + ~VPTree() { + free(m_data); + delete m_root; + delete[] m_ptrs; + } - m_ptrs = new vp_ptr[buffer.get_record_count()]; - m_reccnt = 0; + Wrapped *point_lookup(const R &rec, bool filter = false) { + if constexpr (HMAP) { + auto idx = m_lookup_map.find(rec); - // FIXME: will eventually need to figure out tombstones - // this one will likely require the multi-pass - // approach, as otherwise we'll need to sort the - // records repeatedly on each reconstruction. - for (size_t i=0; iis_deleted()) { - continue; - } + return m_data + idx->second; + } else { + vpnode *node = m_root; - rec->header &= 3; - m_data[m_reccnt] = *rec; - m_ptrs[m_reccnt].ptr = &m_data[m_reccnt]; - m_reccnt++; + while (!node->leaf && m_ptrs[node->start].ptr->rec != rec) { + if (rec.calc_distance((m_ptrs[node->start].ptr->rec)) >= node->radius) { + node = node->outside; + } else { + node = node->inside; } + } - if (m_reccnt > 0) { - m_root = build_vptree(); - build_map(); + for (size_t i = node->start; i <= node->stop; i++) { + if (m_ptrs[i].ptr->rec == rec) { + return m_ptrs[i].ptr; } + } + + return nullptr; } + } - VPTree(std::vector shards) - : m_reccnt(0), m_tombstone_cnt(0), m_node_cnt(0), m_root(nullptr) { + Wrapped *get_data() const { return m_data; } - size_t attemp_reccnt = 0; - for (size_t i=0; iget_record_count(); - } + size_t get_record_count() const { return m_reccnt; } - m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, - attemp_reccnt * sizeof(Wrapped), - (byte **) &m_data); - m_ptrs = new vp_ptr[attemp_reccnt]; - - // FIXME: will eventually need to figure out tombstones - // this one will likely require the multi-pass - // approach, as otherwise we'll need to sort the - // records repeatedly on each reconstruction. 
- for (size_t i=0; iget_record_count(); j++) { - if (shards[i]->get_record_at(j)->is_deleted()) { - continue; - } - - m_data[m_reccnt] = *shards[i]->get_record_at(j); - m_ptrs[m_reccnt].ptr = &m_data[m_reccnt]; - m_reccnt++; - } - } + size_t get_tombstone_count() const { return m_tombstone_cnt; } - if (m_reccnt > 0) { - m_root = build_vptree(); - build_map(); - } - } + const Wrapped *get_record_at(size_t idx) const { + if (idx >= m_reccnt) + return nullptr; + return m_data + idx; + } - ~VPTree() { - free(m_data); - delete m_root; - delete[] m_ptrs; - } + size_t get_memory_usage() const { + return m_node_cnt * sizeof(vpnode) + m_reccnt * sizeof(R *); + } - Wrapped *point_lookup(const R &rec, bool filter=false) { - if constexpr (HMAP) { - auto idx = m_lookup_map.find(rec); + size_t get_aux_memory_usage() const { + // FIXME: need to return the size of the unordered_map + return 0; + } - if (idx == m_lookup_map.end()) { - return nullptr; - } + void search(const R &point, size_t k, + PriorityQueue, DistCmpMax>> &pq) { + double farthest = std::numeric_limits::max(); - return m_data + idx->second; - } else { - vpnode *node = m_root; - - while (!node->leaf && m_ptrs[node->start].ptr->rec != rec) { - if (rec.calc_distance((m_ptrs[node->start].ptr->rec)) >= node->radius) { - node = node->outside; - } else { - node = node->inside; - } - } - - for (size_t i=node->start; i<=node->stop; i++) { - if (m_ptrs[i].ptr->rec == rec) { - return m_ptrs[i].ptr; - } - } - - return nullptr; - } - } + internal_search(m_root, point, k, pq, &farthest); + } - Wrapped* get_data() const { - return m_data; - } - - size_t get_record_count() const { - return m_reccnt; +private: + struct vp_ptr { + Wrapped *ptr; + double dist; + }; + Wrapped *m_data; + vp_ptr *m_ptrs; + std::unordered_map> m_lookup_map; + size_t m_reccnt; + size_t m_tombstone_cnt; + size_t m_node_cnt; + size_t m_alloc_size; + + vpnode *m_root; + + vpnode *build_vptree() { + if (m_reccnt == 0) { + return nullptr; } - size_t get_tombstone_count() const { - return m_tombstone_cnt; - } + size_t lower = 0; + size_t upper = m_reccnt - 1; - const Wrapped* get_record_at(size_t idx) const { - if (idx >= m_reccnt) return nullptr; - return m_data + idx; - } + auto rng = gsl_rng_alloc(gsl_rng_mt19937); + auto root = build_subtree(lower, upper, rng); + gsl_rng_free(rng); + return root; + } - size_t get_memory_usage() const { - return m_node_cnt * sizeof(vpnode) + m_reccnt * sizeof(R*); + void build_map() { + // Skip constructing the hashmap if disabled in the + // template parameters. + if constexpr (!HMAP) { + return; } - size_t get_aux_memory_usage() const { - // FIXME: need to return the size of the unordered_map - return 0; + for (size_t i = 0; i < m_reccnt; i++) { + // FIXME: Will need to account for tombstones here too. Under + // tombstones, it is technically possible for two otherwise identical + // instances of the same record to exist within the same shard, so + // long as one of them is a tombstone. Because the table is currently + // using the unwrapped records for the key, it isn't possible for it + // to handle this case right now. 
+ m_lookup_map.insert({m_data[i].rec, i}); } - - void search(const R &point, size_t k, PriorityQueue, - DistCmpMax>> &pq) { - double farthest = std::numeric_limits::max(); - - internal_search(m_root, point, k, pq, &farthest); + } + + vpnode *build_subtree(size_t start, size_t stop, gsl_rng *rng) { + /* + * base-case: sometimes happens (probably because of the +1 and -1 + * in the first recursive call) + */ + if (start > stop) { + return nullptr; } -private: - struct vp_ptr { - Wrapped *ptr; - double dist; - }; - Wrapped* m_data; - vp_ptr* m_ptrs; - std::unordered_map> m_lookup_map; - size_t m_reccnt; - size_t m_tombstone_cnt; - size_t m_node_cnt; - size_t m_alloc_size; - - vpnode *m_root; - - vpnode *build_vptree() { - if (m_reccnt == 0) { - return nullptr; - } - - size_t lower = 0; - size_t upper = m_reccnt - 1; + /* base-case: create a leaf node */ + if (stop - start <= LEAFSZ) { + vpnode *node = new vpnode(); + node->start = start; + node->stop = stop; + node->leaf = true; - auto rng = gsl_rng_alloc(gsl_rng_mt19937); - auto root = build_subtree(lower, upper, rng); - gsl_rng_free(rng); - return root; + m_node_cnt++; + return node; } - void build_map() { - // Skip constructing the hashmap if disabled in the - // template parameters. - if constexpr (!HMAP) { - return; - } - - for (size_t i=0; irec.calc_distance(m_ptrs[i].ptr->rec); } - vpnode *build_subtree(size_t start, size_t stop, gsl_rng *rng) { - /* - * base-case: sometimes happens (probably because of the +1 and -1 - * in the first recursive call) - */ - if (start > stop) { - return nullptr; - } - - /* base-case: create a leaf node */ - if (stop - start <= LEAFSZ) { - vpnode *node = new vpnode(); - node->start = start; - node->stop = stop; - node->leaf = true; - - m_node_cnt++; - return node; - } - - /* - * select a random element to be the root of the - * subtree - */ - auto i = start + gsl_rng_uniform_int(rng, stop - start + 1); - swap(start, i); - - /* for efficiency, we'll pre-calculate the distances between each point and the root */ - for (size_t i=start+1; i<=stop; i++) { - m_ptrs[i].dist = m_ptrs[start].ptr->rec.calc_distance(m_ptrs[i].ptr->rec); - } - - /* - * partition elements based on their distance from the start, - * with those elements with distance falling below the median - * distance going into the left sub-array and those above - * the median in the right. This is easily done using QuickSelect. - */ - auto mid = (start + 1 + stop) / 2; - quickselect(start + 1, stop, mid, m_ptrs[start].ptr, rng); - - /* Create a new node based on this partitioning */ - vpnode *node = new vpnode(); - node->start = start; + /* + * partition elements based on their distance from the start, + * with those elements with distance falling below the median + * distance going into the left sub-array and those above + * the median in the right. This is easily done using QuickSelect. + */ + auto mid = (start + 1 + stop) / 2; + quickselect(start + 1, stop, mid, m_ptrs[start].ptr, rng); - /* store the radius of the circle used for partitioning the node. */ - node->radius = m_ptrs[start].ptr->rec.calc_distance(m_ptrs[mid].ptr->rec); - m_ptrs[start].dist = node->radius; + /* Create a new node based on this partitioning */ + vpnode *node = new vpnode(); + node->start = start; - /* recursively construct the left and right subtrees */ - node->inside = build_subtree(start + 1, mid-1, rng); - node->outside = build_subtree(mid, stop, rng); + /* store the radius of the circle used for partitioning the node. 
*/ + node->radius = m_ptrs[start].ptr->rec.calc_distance(m_ptrs[mid].ptr->rec); + m_ptrs[start].dist = node->radius; - m_node_cnt++; + /* recursively construct the left and right subtrees */ + node->inside = build_subtree(start + 1, mid - 1, rng); + node->outside = build_subtree(mid, stop, rng); - return node; - } + m_node_cnt++; - void quickselect(size_t start, size_t stop, size_t k, Wrapped *p, gsl_rng *rng) { - if (start == stop) return; + return node; + } - auto pivot = partition(start, stop, p, rng); + void quickselect(size_t start, size_t stop, size_t k, Wrapped *p, + gsl_rng *rng) { + if (start == stop) + return; - if (k < pivot) { - quickselect(start, pivot - 1, k, p, rng); - } else if (k > pivot) { - quickselect(pivot + 1, stop, k, p, rng); - } - } + auto pivot = partition(start, stop, p, rng); - // TODO: The quickselect code can probably be generalized and moved out - // to psudb-common instead. - size_t partition(size_t start, size_t stop, Wrapped *p, gsl_rng *rng) { - auto pivot = start + gsl_rng_uniform_int(rng, stop - start); - //double pivot_dist = p->rec.calc_distance(m_ptrs[pivot]->rec); - - swap(pivot, stop); - - size_t j = start; - for (size_t i=start; irec.calc_distance(m_ptrs[i]->rec)); - //if (distances[i - start] < distances[stop - start]) { - //if (p->rec .calc_distance(m_ptrs[i]->rec) < pivot_dist) { - swap(j, i); - j++; - } - } - - swap(j, stop); - return j; + if (k < pivot) { + quickselect(start, pivot - 1, k, p, rng); + } else if (k > pivot) { + quickselect(pivot + 1, stop, k, p, rng); } - - void swap(size_t idx1, size_t idx2) { - auto tmp = m_ptrs[idx1]; - m_ptrs[idx1] = m_ptrs[idx2]; - m_ptrs[idx2] = tmp; + } + + // TODO: The quickselect code can probably be generalized and moved out + // to psudb-common instead. + size_t partition(size_t start, size_t stop, Wrapped *p, gsl_rng *rng) { + auto pivot = start + gsl_rng_uniform_int(rng, stop - start); + // double pivot_dist = p->rec.calc_distance(m_ptrs[pivot]->rec); + + swap(pivot, stop); + + size_t j = start; + for (size_t i = start; i < stop; i++) { + if (m_ptrs[i].dist < m_ptrs[stop].dist) { + // assert(distances[i - start] == p->rec.calc_distance(m_ptrs[i]->rec)); + // if (distances[i - start] < distances[stop - start]) { + // if (p->rec .calc_distance(m_ptrs[i]->rec) < pivot_dist) { + swap(j, i); + j++; + } } - void internal_search(vpnode *node, const R &point, size_t k, PriorityQueue, - DistCmpMax>> &pq, double *farthest) { - - if (node == nullptr) return; + swap(j, stop); + return j; + } - if (node->leaf) { - for (size_t i=node->start; i<=node->stop; i++) { - double d = point.calc_distance(m_ptrs[i].ptr->rec); - if (d < *farthest) { - if (pq.size() == k) { - pq.pop(); - } + void swap(size_t idx1, size_t idx2) { + auto tmp = m_ptrs[idx1]; + m_ptrs[idx1] = m_ptrs[idx2]; + m_ptrs[idx2] = tmp; + } - pq.push(m_ptrs[i].ptr); - if (pq.size() == k) { - *farthest = point.calc_distance(pq.peek().data->rec); - } - } - } + void internal_search(vpnode *node, const R &point, size_t k, + PriorityQueue, DistCmpMax>> &pq, + double *farthest) { - return; - } - - double d = point.calc_distance(m_ptrs[node->start].ptr->rec); + if (node == nullptr) + return; + if (node->leaf) { + for (size_t i = node->start; i <= node->stop; i++) { + double d = point.calc_distance(m_ptrs[i].ptr->rec); if (d < *farthest) { - if (pq.size() == k) { - pq.pop(); - } - pq.push(m_ptrs[node->start].ptr); - if (pq.size() == k) { - *farthest = point.calc_distance(pq.peek().data->rec); - } + if (pq.size() == k) { + pq.pop(); + } + + 
pq.push(m_ptrs[i].ptr); + if (pq.size() == k) { + *farthest = point.calc_distance(pq.peek().data->rec); + } } + } - if (d < node->radius) { - if (d - (*farthest) <= node->radius) { - internal_search(node->inside, point, k, pq, farthest); - } + return; + } - if (d + (*farthest) >= node->radius) { - internal_search(node->outside, point, k, pq, farthest); - } - } else { - if (d + (*farthest) >= node->radius) { - internal_search(node->outside, point, k, pq, farthest); - } + double d = point.calc_distance(m_ptrs[node->start].ptr->rec); - if (d - (*farthest) <= node->radius) { - internal_search(node->inside, point, k, pq, farthest); - } - } + if (d < *farthest) { + if (pq.size() == k) { + pq.pop(); + } + pq.push(m_ptrs[node->start].ptr); + if (pq.size() == k) { + *farthest = point.calc_distance(pq.peek().data->rec); + } + } + + if (d < node->radius) { + if (d - (*farthest) <= node->radius) { + internal_search(node->inside, point, k, pq, farthest); + } + + if (d + (*farthest) >= node->radius) { + internal_search(node->outside, point, k, pq, farthest); + } + } else { + if (d + (*farthest) >= node->radius) { + internal_search(node->outside, point, k, pq, farthest); + } + + if (d - (*farthest) <= node->radius) { + internal_search(node->inside, point, k, pq, farthest); + } } - }; -} + } +}; +} // namespace de -- cgit v1.2.3
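
The diff above is formatting-only, but the data structure it touches is easy to lose in the noise. The standalone sketch below is hypothetical code, not part of this repository: it illustrates the two ideas VPTree.h is built around, namely splitting records at the median distance from a randomly chosen vantage point and pruning subtrees during search with the triangle inequality. It substitutes plain 2-D points for wrapped records, std::nth_element for the hand-rolled quickselect, and a single nearest-neighbor lookup for the k-NN priority-queue search.

// Minimal, hypothetical VP-tree sketch; mirrors build_subtree()/internal_search()
// from VPTree.h in spirit only.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <memory>
#include <vector>

struct Point {
  double x, y;
};

static double dist(const Point &a, const Point &b) {
  return std::hypot(a.x - b.x, a.y - b.y);
}

struct Node {
  Point vantage;
  double radius = 0.0;           /* median distance from the vantage point */
  std::unique_ptr<Node> inside;  /* points at distance <= radius */
  std::unique_ptr<Node> outside; /* points at distance >= radius */
};

static std::unique_ptr<Node> build(std::vector<Point> pts) {
  if (pts.empty())
    return nullptr;

  auto node = std::make_unique<Node>();
  node->vantage = pts.back(); /* VPTree.h picks the vantage point at random */
  pts.pop_back();
  if (pts.empty())
    return node;

  /* move the median-distance element into place; this is the role the
   * quickselect() call plays in build_subtree() */
  auto mid = pts.begin() + pts.size() / 2;
  std::nth_element(pts.begin(), mid, pts.end(),
                   [&](const Point &a, const Point &b) {
                     return dist(a, node->vantage) < dist(b, node->vantage);
                   });
  node->radius = dist(*mid, node->vantage);

  node->inside = build(std::vector<Point>(pts.begin(), mid));
  node->outside = build(std::vector<Point>(mid, pts.end()));
  return node;
}

/* 1-NN search with the same pruning rule as internal_search(): a side of the
 * partition is visited only if the query ball of radius *best can overlap it. */
static void nearest(const Node *node, const Point &q, const Point **best_pt,
                    double *best) {
  if (node == nullptr)
    return;

  double d = dist(q, node->vantage);
  if (d < *best) {
    *best = d;
    *best_pt = &node->vantage;
  }

  if (d < node->radius) {
    if (d - *best <= node->radius) nearest(node->inside.get(), q, best_pt, best);
    if (d + *best >= node->radius) nearest(node->outside.get(), q, best_pt, best);
  } else {
    if (d + *best >= node->radius) nearest(node->outside.get(), q, best_pt, best);
    if (d - *best <= node->radius) nearest(node->inside.get(), q, best_pt, best);
  }
}

int main() {
  std::vector<Point> pts = {{0, 0}, {1, 0}, {0, 1}, {2, 2}, {5, 5}, {3, 1}};
  auto root = build(pts);

  Point q{2.6, 1.9};
  const Point *best_pt = nullptr;
  double best = std::numeric_limits<double>::max();
  nearest(root.get(), q, &best_pt, &best);

  std::printf("nearest to (%.1f, %.1f) is (%.1f, %.1f) at distance %.3f\n",
              q.x, q.y, best_pt->x, best_pt->y, best);
  return 0;
}

The interleaved inside/outside ordering in nearest() follows internal_search(): the side of the partition containing the query is visited first so that *best shrinks early, which lets the check on the other side fail and prune that subtree outright.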