From 711769574e647839677739192698e400529efe75 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 8 Feb 2024 16:38:44 -0500 Subject: Updated VPTree to new shard/query interfaces --- include/shard/VPTree.h | 282 +++++++++---------------------------------------- 1 file changed, 50 insertions(+), 232 deletions(-) (limited to 'include/shard/VPTree.h') diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h index 2f5ebbb..ba13a87 100644 --- a/include/shard/VPTree.h +++ b/include/shard/VPTree.h @@ -5,98 +5,27 @@ * * Distributed under the Modified BSD License. * - * A shard shim around the VPTree spatial index. + * A shard shim around a VPTree for high-dimensional metric similarity + * search. * - * FIXME: separate the KNN query class out into a standalone - * file in include/query . + * FIXME: Does not yet support the tombstone delete policy. * */ #pragma once #include -#include -#include -#include -#include -#include -#include -#include +#include #include "framework/ShardRequirements.h" - #include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" -#include "psu-ds/BloomFilter.h" -#include "util/bf_config.h" using psudb::CACHELINE_SIZE; -using psudb::BloomFilter; using psudb::PriorityQueue; using psudb::queue_record; -using psudb::Alias; +using psudb::byte; namespace de { -template -struct KNNQueryParms { - R point; - size_t k; -}; - -template -class KNNQuery; - -template -struct KNNState { - size_t k; - - KNNState() { - k = 0; - } -}; - -template -struct KNNBufferState { - -}; - - -template -class KNNDistCmpMax { -public: - KNNDistCmpMax(R *baseline) : P(baseline) {} - - inline bool operator()(const R *a, const R *b) requires WrappedInterface { - return a->rec.calc_distance(P->rec) > b->rec.calc_distance(P->rec); - } - - inline bool operator()(const R *a, const R *b) requires (!WrappedInterface){ - return a->calc_distance(*P) > b->calc_distance(*P); - } - -private: - R *P; -}; - -template -class KNNDistCmpMin { -public: - KNNDistCmpMin(R *baseline) : P(baseline) {} - - inline bool operator()(const R *a, const R *b) requires WrappedInterface { - return a->rec.calc_distance(P->rec) < b->rec.calc_distance(P->rec); - } - - inline bool operator()(const R *a, const R *b) requires (!WrappedInterface){ - return a->calc_distance(*P) < b->calc_distance(*P); - } - -private: - R *P; -}; - - - template class VPTree { private: @@ -117,16 +46,19 @@ private: } }; -public: - friend class KNNQuery; - VPTree(MutableBuffer* buffer) + +public: + VPTree(BufferView buffer) : m_reccnt(0), m_tombstone_cnt(0), m_root(nullptr), m_node_cnt(0) { - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - m_ptrs = new Wrapped*[buffer->get_record_count()]; + + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + buffer.get_record_count() * + sizeof(Wrapped), + (byte**) &m_data); + + m_ptrs = new Wrapped*[buffer.get_record_count()]; size_t offset = 0; m_reccnt = 0; @@ -135,8 +67,8 @@ public: // this one will likely require the multi-pass // approach, as otherwise we'll need to sort the // records repeatedly on each reconstruction. - for (size_t i=0; iget_record_count(); i++) { - auto rec = buffer->get_data() + i; + for (size_t i=0; iis_deleted()) { continue; @@ -154,25 +86,24 @@ public: } } - VPTree(VPTree** shards, size_t len) + VPTree(std::vector shards) : m_reccnt(0), m_tombstone_cnt(0), m_root(nullptr), m_node_cnt(0) { size_t attemp_reccnt = 0; - - for (size_t i=0; iget_record_count(); } - - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + attemp_reccnt * sizeof(Wrapped), + (byte **) &m_data); m_ptrs = new Wrapped*[attemp_reccnt]; // FIXME: will eventually need to figure out tombstones // this one will likely require the multi-pass // approach, as otherwise we'll need to sort the // records repeatedly on each reconstruction. - for (size_t i=0; iget_record_count(); j++) { if (shards[i]->get_record_at(j)->is_deleted()) { continue; @@ -191,9 +122,9 @@ public: } ~VPTree() { - if (m_data) free(m_data); - if (m_root) delete m_root; - if (m_ptrs) delete[] m_ptrs; + free(m_data); + delete m_root; + delete[] m_ptrs; } Wrapped *point_lookup(const R &rec, bool filter=false) { @@ -248,11 +179,27 @@ public: } size_t get_aux_memory_usage() { + // FIXME: need to return the size of the unordered_map return 0; } + void search(const R &point, size_t k, PriorityQueue, + DistCmpMax>> &pq) { + double farthest = std::numeric_limits::max(); + + internal_search(m_root, point, k, pq, &farthest); + } private: + Wrapped* m_data; + Wrapped** m_ptrs; + std::unordered_map> m_lookup_map; + size_t m_reccnt; + size_t m_tombstone_cnt; + size_t m_node_cnt; + size_t m_alloc_size; + + vpnode *m_root; vpnode *build_vptree() { if (m_reccnt == 0) { @@ -332,7 +279,6 @@ private: return node; } - void quickselect(size_t start, size_t stop, size_t k, Wrapped *p, gsl_rng *rng) { if (start == stop) return; @@ -345,7 +291,6 @@ private: } } - size_t partition(size_t start, size_t stop, Wrapped *p, gsl_rng *rng) { auto pivot = start + gsl_rng_uniform_int(rng, stop - start); double pivot_dist = p->rec.calc_distance(m_ptrs[pivot]->rec); @@ -364,15 +309,15 @@ private: return j; } - void swap(size_t idx1, size_t idx2) { auto tmp = m_ptrs[idx1]; m_ptrs[idx1] = m_ptrs[idx2]; m_ptrs[idx2] = tmp; } + void internal_search(vpnode *node, const R &point, size_t k, PriorityQueue, + DistCmpMax>> &pq, double *farthest) { - void search(vpnode *node, const R &point, size_t k, PriorityQueue, KNNDistCmpMax>> &pq, double *farthest) { if (node == nullptr) return; if (node->leaf) { @@ -408,151 +353,24 @@ private: if (d < node->radius) { if (d - (*farthest) <= node->radius) { - search(node->inside, point, k, pq, farthest); + internal_search(node->inside, point, k, pq, farthest); } if (d + (*farthest) >= node->radius) { - search(node->outside, point, k, pq, farthest); + internal_search(node->outside, point, k, pq, farthest); } } else { if (d + (*farthest) >= node->radius) { - search(node->outside, point, k, pq, farthest); + internal_search(node->outside, point, k, pq, farthest); } if (d - (*farthest) <= node->radius) { - search(node->inside, point, k, pq, farthest); + internal_search(node->inside, point, k, pq, farthest); } } } - Wrapped* m_data; - Wrapped** m_ptrs; - std::unordered_map> m_lookup_map; - size_t m_reccnt; - size_t m_tombstone_cnt; - size_t m_node_cnt; - size_t m_alloc_size; - - vpnode *m_root; -}; - - -template -class KNNQuery { -public: - constexpr static bool EARLY_ABORT=false; - constexpr static bool SKIP_DELETE_FILTER=true; - - static void *get_query_state(VPTree *wss, void *parms) { - return nullptr; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - return nullptr; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { - return; - } - - static std::vector> query(VPTree *wss, void *q_state, void *parms) { - std::vector> results; - KNNQueryParms *p = (KNNQueryParms *) parms; - Wrapped wrec; - wrec.rec = p->point; - wrec.header = 0; - - PriorityQueue, KNNDistCmpMax>> pq(p->k, &wrec); - - double farthest = std::numeric_limits::max(); - - wss->search(wss->m_root, p->point, p->k, pq, &farthest); - - while (pq.size() > 0) { - results.emplace_back(*pq.peek().data); - pq.pop(); - } - - return results; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - KNNQueryParms *p = (KNNQueryParms *) parms; - Wrapped wrec; - wrec.rec = p->point; - wrec.header = 0; - - size_t k = p->k; - - PriorityQueue, KNNDistCmpMax>> pq(k, &wrec); - for (size_t i=0; iget_record_count(); i++) { - // Skip over deleted records (under tagging) - if ((buffer->get_data())[i].is_deleted()) { - continue; - } - - if (pq.size() < k) { - pq.push(buffer->get_data() + i); - } else { - double head_dist = pq.peek().data->rec.calc_distance(wrec.rec); - double cur_dist = (buffer->get_data() + i)->rec.calc_distance(wrec.rec); - - if (cur_dist < head_dist) { - pq.pop(); - pq.push(buffer->get_data() + i); - } - } - } - - std::vector> results; - while (pq.size() > 0) { - results.emplace_back(*(pq.peek().data)); - pq.pop(); - } - - return results; - } - - static std::vector merge(std::vector>> &results, void *parms) { - KNNQueryParms *p = (KNNQueryParms *) parms; - R rec = p->point; - size_t k = p->k; - - PriorityQueue> pq(k, &rec); - for (size_t i=0; icalc_distance(rec); - double cur_dist = results[i][j].rec.calc_distance(rec); - if (cur_dist < head_dist) { - pq.pop(); - pq.push(&results[i][j].rec); - } - } - } - } - - std::vector output; - while (pq.size() > 0) { - output.emplace_back(*pq.peek().data); - pq.pop(); - } - - return output; - } - - static void delete_query_state(void *state) { - auto s = (KNNState *) state; - delete s; - } - - static void delete_buffer_query_state(void *state) { - auto s = (KNNBufferState *) state; - delete s; - } -}; + }; } -- cgit v1.2.3