From 5ab408321dd45865a88fed71d11efe01dd7715d9 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 24 Jul 2023 18:17:57 -0400 Subject: VPTree: added a level of indirection to avoid repeated point copies --- include/shard/VPTree.h | 75 ++++++++++++++++++++++++++++++++++---------------- tests/vptree_tests.cpp | 43 +++++++++++++++++------------ 2 files changed, 77 insertions(+), 41 deletions(-) diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h index 927108c..bbb3e50 100644 --- a/include/shard/VPTree.h +++ b/include/shard/VPTree.h @@ -86,16 +86,19 @@ private: -template +template class VPTree { private: struct vpnode { - size_t idx; + size_t start; + size_t stop; + bool leaf; + double radius; vpnode *inside; vpnode *outside; - vpnode(size_t idx) : idx(idx), radius(0), inside(nullptr), outside(nullptr) {} + vpnode() : start(0), stop(0), leaf(false), radius(0.0), inside(nullptr), outside(nullptr) {} ~vpnode() { delete inside; @@ -112,6 +115,7 @@ public: size_t alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); assert(alloc_size % CACHELINE_SIZE == 0); m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, alloc_size); + m_ptrs = new Wrapped*[buffer->get_record_count()]; size_t offset = 0; m_reccnt = 0; @@ -128,7 +132,9 @@ public: } rec->header &= 3; - m_data[m_reccnt++] = *rec; + m_data[m_reccnt] = *rec; + m_ptrs[m_reccnt] = &m_data[m_reccnt]; + m_reccnt++; } if (m_reccnt > 0) { @@ -149,6 +155,7 @@ public: size_t alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); assert(alloc_size % CACHELINE_SIZE == 0); m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, alloc_size); + m_ptrs = new Wrapped*[attemp_reccnt]; // FIXME: will eventually need to figure out tombstones // this one will likely require the multi-pass @@ -160,7 +167,9 @@ public: continue; } - m_data[m_reccnt++] = *shards[i]->get_record_at(j); + m_data[m_reccnt] = *shards[i]->get_record_at(j); + m_ptrs[m_reccnt] = &m_data[m_reccnt]; + m_reccnt++; } } @@ -173,6 +182,7 @@ public: ~VPTree() { if (m_data) free(m_data); if (m_root) delete m_root; + if (m_ptrs) delete[] m_ptrs; } Wrapped *point_lookup(const R &rec, bool filter=false) { @@ -243,12 +253,13 @@ private: } // base-case: create a leaf node - if (start == stop) { - vpnode *node = new vpnode(start); - node->idx = start; + if (stop - start <= LEAFSZ) { + vpnode *node = new vpnode(); + node->start = start; + node->stop = stop; + node->leaf = true; m_node_cnt++; - return node; } @@ -262,13 +273,14 @@ private: // distance going into the left sub-array and those above // the median in the right. This is easily done using QuickSelect. auto mid = (start + 1 + stop) / 2; - quickselect(start + 1, stop, mid, m_data[start], rng); + quickselect(start + 1, stop, mid, m_ptrs[start], rng); // Create a new node based on this partitioning - vpnode *node = new vpnode(start); + vpnode *node = new vpnode(); + node->start = start; // store the radius of the circle used for partitioning the node. - node->radius = m_data[start].rec.calc_distance(m_data[mid].rec); + node->radius = m_ptrs[start]->rec.calc_distance(m_ptrs[mid]->rec); // recursively construct the left and right subtrees node->inside = build_subtree(start + 1, mid-1, rng); @@ -280,7 +292,7 @@ private: } - void quickselect(size_t start, size_t stop, size_t k, Wrapped p, gsl_rng *rng) { + void quickselect(size_t start, size_t stop, size_t k, Wrapped *p, gsl_rng *rng) { if (start == stop) return; auto pivot = partition(start, stop, p, rng); @@ -293,15 +305,15 @@ private: } - size_t partition(size_t start, size_t stop, Wrapped p, gsl_rng *rng) { + size_t partition(size_t start, size_t stop, Wrapped *p, gsl_rng *rng) { auto pivot = start + gsl_rng_uniform_int(rng, stop - start); - double pivot_dist = p.rec.calc_distance(m_data[pivot].rec); + double pivot_dist = p->rec.calc_distance(m_ptrs[pivot]->rec); swap(pivot, stop); size_t j = start; for (size_t i=start; irec.calc_distance(m_ptrs[i]->rec) < pivot_dist) { swap(j, i); j++; } @@ -313,30 +325,46 @@ private: void swap(size_t idx1, size_t idx2) { - Wrapped tmp = m_data[idx1]; - m_data[idx1] = m_data[idx2]; - m_data[idx2] = tmp; + auto tmp = m_ptrs[idx1]; + m_ptrs[idx1] = m_ptrs[idx2]; + m_ptrs[idx2] = tmp; } void search(vpnode *node, const R &point, size_t k, PriorityQueue, KNNDistCmpMax>> &pq, double *farthest) { if (node == nullptr) return; - double d = point.calc_distance(m_data[node->idx].rec); + if (node->leaf) { + for (size_t i=node->start; i<=node->stop; i++) { + double d = point.calc_distance(m_ptrs[i]->rec); + if (d < *farthest) { + if (pq.size() == k) { + pq.pop(); + } + + pq.push(m_ptrs[i]); + if (pq.size() == k) { + *farthest = point.calc_distance(pq.peek().data->rec); + } + } + } + + return; + } + + double d = point.calc_distance(m_ptrs[node->start]->rec); if (d < *farthest) { if (pq.size() == k) { auto t = pq.peek().data->rec; pq.pop(); } - pq.push(&m_data[node->idx]); + pq.push(m_ptrs[node->start]); if (pq.size() == k) { *farthest = point.calc_distance(pq.peek().data->rec); } } - if (!node->inside && !node->outside) return; - if (d < node->radius) { if (d - (*farthest) <= node->radius) { search(node->inside, point, k, pq, farthest); @@ -357,6 +385,7 @@ private: } Wrapped* m_data; + Wrapped** m_ptrs; std::unordered_map> m_lookup_map; size_t m_reccnt; size_t m_tombstone_cnt; diff --git a/tests/vptree_tests.cpp b/tests/vptree_tests.cpp index 894e64a..06f147b 100644 --- a/tests/vptree_tests.cpp +++ b/tests/vptree_tests.cpp @@ -137,31 +137,38 @@ START_TEST(t_buffer_query) START_TEST(t_knn_query) { - size_t n = 100; + size_t n = 1000; auto buffer = create_2d_sequential_mbuffer(n); - PRec target; - target.data[0] = 50; - target.data[1] = 50; + auto vptree = VPTree(buffer); KNNQueryParms p; - p.k = 10; - p.point = target; + for (size_t i=0; i<100; i++) { + p.k = rand() % 150; + p.point.data[0] = rand() % (n-p.k); + p.point.data[1] = p.point.data[0]; - auto state = KNNQuery::get_buffer_query_state(buffer, &p); - auto result = KNNQuery::buffer_query(buffer, state, &p); + auto state = KNNQuery::get_query_state(&vptree, &p); + auto results = KNNQuery::query(&vptree, state, &p); + KNNQuery::delete_query_state(state); - KNNQuery::delete_buffer_query_state(state); + ck_assert_int_eq(results.size(), p.k); - auto vptree = VPTree(buffer); - auto state_2 = KNNQuery::get_query_state(&vptree, &p); - auto result_2 = KNNQuery::query(&vptree, state_2, &p); - KNNQuery::delete_query_state(state_2); - - std::sort(result_2.begin(), result_2.end()); - size_t start = 46; - for (size_t i=0; i