From 47a386b50d904d3f1b7ce3cfc13c29ea96dd1e43 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 30 Apr 2024 14:18:08 -0400 Subject: Added VPTree BSM benchmark --- CMakeLists.txt | 6 + benchmarks/include/standard_benchmarks.h | 39 +++- benchmarks/include/triespline_bsm.h | 3 +- benchmarks/include/vptree_bsm.h | 317 +++++++++++++++++++++++++++++++ benchmarks/vldb/mtree_bench.cpp | 2 +- benchmarks/vldb/vptree_bench.cpp | 6 +- external/psudb-common | 2 +- 7 files changed, 369 insertions(+), 6 deletions(-) create mode 100644 benchmarks/include/vptree_bsm.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 34b4bed..d88db24 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,6 +169,12 @@ if (vldb_bench) target_link_options(vptree_parmsweep PUBLIC -mcx16) target_compile_options(vptree_parmsweep PUBLIC -fopenmp) + add_executable(vptree_bsm_bench ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/vldb/vptree_bsm_bench.cpp) + target_link_libraries(vptree_bsm_bench PUBLIC gsl pthread atomic) + target_include_directories(vptree_bsm_bench PRIVATE include external external/m-tree/cpp external/PGM-index/include external/PLEX/include benchmarks/include external/psudb-common/cpp/include) + target_link_options(vptree_bsm_bench PUBLIC -mcx16) + target_compile_options(vptree_bsm_bench PUBLIC -fopenmp) + add_executable(ts_bench ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/vldb/ts_bench.cpp) target_link_libraries(ts_bench PUBLIC gsl pthread atomic) target_include_directories(ts_bench PRIVATE include external external/m-tree/cpp external/PGM-index/include external/PLEX/include benchmarks/include external/psudb-common/cpp/include) diff --git a/benchmarks/include/standard_benchmarks.h b/benchmarks/include/standard_benchmarks.h index aaef679..76b5f7c 100644 --- a/benchmarks/include/standard_benchmarks.h +++ b/benchmarks/include/standard_benchmarks.h @@ -34,6 +34,14 @@ static void run_queries(DE *extension, std::vector &queries) { result.emplace_back(itr->data); itr++; } + + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (auto &r : result) { + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", result.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + #endif } else if constexpr (std::is_same_v) { size_t tot = 0; auto ptr = extension->find(queries[i].lower_bound); @@ -44,7 +52,26 @@ static void run_queries(DE *extension, std::vector &queries) { } else { auto res = extension->query(&queries[i]); if constexpr (!BSM) { - auto r = res.get(); + auto result = res.get(); + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (int i=result.size()-1; i>=0; i--) { + auto &r = result[i]; + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", result.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + fflush(stdout); + #endif + } else { + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (int i=res.size()-1; i>=0; i--) { + auto &r = res[i]; + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", res.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + fflush(stdout); + #endif } } } @@ -73,6 +100,16 @@ static void run_static_queries(S *shard, std::vector &queries) { Q::process_query_states(q, states, nullptr); auto res = Q::query(shard, state, q); + + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (int i=res.size()-1; i>=0; i--) { + auto &r = res[i].rec; + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", res.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + fflush(stdout); + #endif } } diff --git a/benchmarks/include/triespline_bsm.h b/benchmarks/include/triespline_bsm.h index 4276074..dfddc43 100644 --- a/benchmarks/include/triespline_bsm.h +++ b/benchmarks/include/triespline_bsm.h @@ -2,6 +2,7 @@ #include #include +#include #include "ts/builder.h" template @@ -52,7 +53,7 @@ public: return std::move(rs); } - std::vector query_merge(std::vector &rsa, std::vector &rsb) { + std::vector query_merge(std::vector &rsa, std::vector &rsb, void *parms) { rsa[0].first += rsb[0].first; return std::move(rsa); } diff --git a/benchmarks/include/vptree_bsm.h b/benchmarks/include/vptree_bsm.h new file mode 100644 index 0000000..2f12fcb --- /dev/null +++ b/benchmarks/include/vptree_bsm.h @@ -0,0 +1,317 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "psu-ds/PriorityQueue.h" +#include "framework/interface/Record.h" + +template +class BSMVPTree { +public: + struct KNNQueryParms { + R point; + size_t k; + }; + +public: + static BSMVPTree *build(std::vector &records) { + return new BSMVPTree(records); + } + + static BSMVPTree *build_presorted(std::vector &records) { + return new BSMVPTree(records); + } + + std::vector unbuild() { + return std::move(m_data); + } + + std::vector query(void *q) { + std::vector rs; + + /* return an empty result set if q is invalid */ + if (q == nullptr) { + return rs; + } + + auto parms = (BSMVPTree::KNNQueryParms*) q; + auto pq = psudb::PriorityQueue>(parms->k, &parms->point); + + if (parms->k >= m_data.size()) { + for (size_t i=0; i::max(); + internal_search(m_root, parms->point, parms->k, pq, &farthest); + } + + size_t i=0; + while (pq.size() > 0) { + rs.push_back(*pq.peek().data); + pq.pop(); + } + return std::move(rs); + } + + std::vector query_merge(std::vector &rsa, std::vector &rsb, void* parms) { + KNNQueryParms *p = (KNNQueryParms *) parms; + R rec = p->point; + size_t k = p->k; + + std::vector output; + + psudb::PriorityQueue> pq(k, &rec); + + for (size_t i=0; icalc_distance(rec); + double cur_dist = rsa[i].calc_distance(rec); + + if (cur_dist < head_dist) { + pq.pop(); + pq.push(&rsa[i]); + } + } + } + + for (size_t i=0; icalc_distance(rec); + double cur_dist = rsb[i].calc_distance(rec); + + if (cur_dist < head_dist) { + pq.pop(); + pq.push(&rsb[i]); + } + } + } + + while (pq.size() > 0) { + output.emplace_back(*pq.peek().data); + pq.pop(); + } + + return std::move(output); + } + + size_t record_count() { + return m_data.size(); + } + + ~BSMVPTree() { + delete m_root; + } + +private: + + struct vp_ptr { + R *ptr; + double dist; + }; + + struct vpnode { + size_t start; + size_t stop; + bool leaf; + + double radius; + vpnode *inside; + vpnode *outside; + + vpnode() : start(0), stop(0), leaf(false), radius(0.0), inside(nullptr), outside(nullptr) {} + + ~vpnode() { + delete inside; + delete outside; + } + }; + + std::vector m_data; + std::vector m_ptrs; + vpnode *m_root; + + size_t m_node_cnt; + + BSMVPTree(std::vector &records) { + m_data = std::move(records); + m_node_cnt = 0; + + for (size_t i=0; i stop) { + return nullptr; + } + + /* base-case: create a leaf node */ + if (stop - start <= LEAFSZ) { + vpnode *node = new vpnode(); + node->start = start; + node->stop = stop; + node->leaf = true; + + m_node_cnt++; + return node; + } + + /* + * select a random element to be the root of the + * subtree + */ + auto i = start + gsl_rng_uniform_int(rng, stop - start + 1); + swap(start, i); + + /* for efficiency, we'll pre-calculate the distances between each point and the root */ + for (size_t i=start+1; i<=stop; i++) { + m_ptrs[i].dist = m_ptrs[start].ptr->calc_distance(*m_ptrs[i].ptr); + } + + /* + * partition elements based on their distance from the start, + * with those elements with distance falling below the median + * distance going into the left sub-array and those above + * the median in the right. This is easily done using QuickSelect. + */ + auto mid = (start + 1 + stop) / 2; + quickselect(start + 1, stop, mid, m_ptrs[start].ptr, rng); + + /* Create a new node based on this partitioning */ + vpnode *node = new vpnode(); + node->start = start; + + /* store the radius of the circle used for partitioning the node. */ + node->radius = m_ptrs[start].ptr->calc_distance(*m_ptrs[mid].ptr); + m_ptrs[start].dist = node->radius; + + /* recursively construct the left and right subtrees */ + node->inside = build_subtree(start + 1, mid-1, rng); + node->outside = build_subtree(mid, stop, rng); + + m_node_cnt++; + + return node; + } + + void quickselect(size_t start, size_t stop, size_t k, R *p, gsl_rng *rng) { + if (start == stop) return; + + auto pivot = partition(start, stop, p, rng); + + if (k < pivot) { + quickselect(start, pivot - 1, k, p, rng); + } else if (k > pivot) { + quickselect(pivot + 1, stop, k, p, rng); + } + } + + size_t partition(size_t start, size_t stop, R *p, gsl_rng *rng) { + auto pivot = start + gsl_rng_uniform_int(rng, stop - start); + + swap(pivot, stop); + + size_t j = start; + for (size_t i=start; i> &pq, double *farthest) { + + if (node == nullptr) return; + + if (node->leaf) { + for (size_t i=node->start; i<=node->stop; i++) { + double d = point.calc_distance(*m_ptrs[i].ptr); + if (d < *farthest) { + if (pq.size() == k) { + pq.pop(); + } + + pq.push(m_ptrs[i].ptr); + if (pq.size() == k) { + *farthest = point.calc_distance(*pq.peek().data); + } + } + } + + return; + } + + double d = point.calc_distance(*m_ptrs[node->start].ptr); + + if (d < *farthest) { + if (pq.size() == k) { + auto t = pq.peek().data; + pq.pop(); + } + pq.push(m_ptrs[node->start].ptr); + if (pq.size() == k) { + *farthest = point.calc_distance(*pq.peek().data); + } + } + + if (d < node->radius) { + if (d - (*farthest) <= node->radius) { + internal_search(node->inside, point, k, pq, farthest); + } + + if (d + (*farthest) >= node->radius) { + internal_search(node->outside, point, k, pq, farthest); + } + } else { + if (d + (*farthest) >= node->radius) { + internal_search(node->outside, point, k, pq, farthest); + } + + if (d - (*farthest) <= node->radius) { + internal_search(node->inside, point, k, pq, farthest); + } + } + } +}; diff --git a/benchmarks/vldb/mtree_bench.cpp b/benchmarks/vldb/mtree_bench.cpp index 35f56be..60425da 100644 --- a/benchmarks/vldb/mtree_bench.cpp +++ b/benchmarks/vldb/mtree_bench.cpp @@ -46,7 +46,7 @@ int main(int argc, char **argv) { } } fprintf(stderr, "[I] Reading Queries\n"); - auto queries = read_knn_queries(q_fname, 10); + auto queries = read_knn_queries(q_fname, 1000); fprintf(stderr, "[I] Warming up structure...\n"); /* warmup structure w/ 10% of records */ diff --git a/benchmarks/vldb/vptree_bench.cpp b/benchmarks/vldb/vptree_bench.cpp index b17a57b..0b98a52 100644 --- a/benchmarks/vldb/vptree_bench.cpp +++ b/benchmarks/vldb/vptree_bench.cpp @@ -38,7 +38,7 @@ int main(int argc, char **argv) { std::string d_fname = std::string(argv[2]); std::string q_fname = std::string(argv[3]); - auto extension = new Ext(100, 1000, 8, 0, 64); + auto extension = new Ext(1400, 1400, 8, 0, 64); gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); fprintf(stderr, "[I] Reading data file...\n"); @@ -53,7 +53,7 @@ int main(int argc, char **argv) { } } fprintf(stderr, "[I] Reading Queries\n"); - auto queries = read_knn_queries(q_fname, 10); + auto queries = read_knn_queries(q_fname, 1000); fprintf(stderr, "[I] Warming up structure...\n"); /* warmup structure w/ 10% of records */ @@ -82,6 +82,7 @@ int main(int argc, char **argv) { auto shard = extension->create_static_structure(); + fprintf(stderr, "Running Static query tests\n\n"); TIMER_START(); run_static_queries(shard, queries); TIMER_STOP(); @@ -96,5 +97,6 @@ int main(int argc, char **argv) { gsl_rng_free(rng); delete extension; fflush(stderr); + fflush(stdout); } diff --git a/external/psudb-common b/external/psudb-common index a34bedf..97cb5f1 160000 --- a/external/psudb-common +++ b/external/psudb-common @@ -1 +1 @@ -Subproject commit a34bedfa068765ca2cbd4c643b648a5e0d7dd2e1 +Subproject commit 97cb5f16c1fc9eb61bcdd132cbd7610b970bc415 -- cgit v1.2.3