diff options
| author | Douglas B. Rumbaugh <dbr4@psu.edu> | 2024-05-14 16:31:05 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-05-14 16:31:05 -0400 |
| commit | 47916da2ba5ed5bee2dda3cbcc58d39e1e931bfc (patch) | |
| tree | ee5613ce182b2c9caa228d3abeb65dc27fef2db3 /benchmarks/include | |
| parent | 4a834497d5f82c817d634925250158d85ca825c2 (diff) | |
| parent | 8643fe194dec05b4e3f3ea31e162ac0b2b00e162 (diff) | |
| download | dynamic-extension-47916da2ba5ed5bee2dda3cbcc58d39e1e931bfc.tar.gz | |
Merge pull request #4 from dbrumbaugh/master
Updates for VLDB revision
Diffstat (limited to 'benchmarks/include')
| -rw-r--r-- | benchmarks/include/benchmark_types.h | 68 | ||||
| -rw-r--r-- | benchmarks/include/btree-util.h | 27 | ||||
| -rw-r--r-- | benchmarks/include/data-proc.h | 258 | ||||
| -rw-r--r-- | benchmarks/include/file_util.h | 294 | ||||
| -rw-r--r-- | benchmarks/include/standard_benchmarks.h | 380 | ||||
| -rw-r--r-- | benchmarks/include/triespline_bsm.h | 156 | ||||
| -rw-r--r-- | benchmarks/include/vptree_bsm.h | 317 |
7 files changed, 1215 insertions, 285 deletions
diff --git a/benchmarks/include/benchmark_types.h b/benchmarks/include/benchmark_types.h new file mode 100644 index 0000000..51fc52d --- /dev/null +++ b/benchmarks/include/benchmark_types.h @@ -0,0 +1,68 @@ +#pragma once + +#include <cstdlib> +#include "psu-ds/BTree.h" +#include "framework/interface/Record.h" +#include "pgm/pgm_index_dynamic.hpp" + +/* BTree definitions*/ +template <typename K, typename V> +struct btree_record { + K key; + V value; + + inline bool operator<(const btree_record& other) const { + return key < other.key || (key == other.key && value < other.value); + } + + inline bool operator==(const btree_record& other) const { + return key == other.key && value == other.value; + } +}; + +template <typename K, typename V> +struct btree_key_extract { + static const K &get(const btree_record<K, V> &v) { + return v.key; + } +}; + +typedef psudb::BTree<int64_t, btree_record<int64_t, int64_t>, btree_key_extract<int64_t, int64_t>> BenchBTree; + + +/*MTree Definitions*/ + +const size_t W2V_SIZE = 300; +typedef de::EuclidPoint<double, W2V_SIZE> Word2VecRec; + +const size_t ANNSize = 128; +typedef de::EuclidPoint<uint64_t, ANNSize> ANNRec; + +struct euclidean_distance { + double operator()(const Word2VecRec &first, const Word2VecRec &second) const { + double dist = 0; + for (size_t i=0; i<W2V_SIZE; i++) { + dist += (first.data[i] - second.data[i]) * (first.data[i] - second.data[i]); + } + + return std::sqrt(dist); + } + + double operator()(const ANNRec &first, const ANNRec &second) const { + double dist = 0; + for (size_t i=0; i<ANNSize; i++) { + dist += ((double) first.data[i] - (double) second.data[i]) * ((double) first.data[i] - (double) second.data[i]); + } + + return std::sqrt(dist); + } +}; + +#ifdef _GNU_SOURCE +#include "mtree.h" +typedef mt::mtree<Word2VecRec, euclidean_distance> MTree; +typedef mt::mtree<ANNRec, euclidean_distance> MTree_alt; +#endif + +typedef pgm::DynamicPGMIndex<uint64_t, uint64_t, pgm::PGMIndex<uint64_t, 64>> PGM; + diff --git a/benchmarks/include/btree-util.h b/benchmarks/include/btree-util.h deleted file mode 100644 index 571c073..0000000 --- a/benchmarks/include/btree-util.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include <cstdlib> -#include "psu-ds/BTree.h" - -struct btree_record { - int64_t key; - int64_t value; - - inline bool operator<(const btree_record& other) const { - return key < other.key || (key == other.key && value < other.value); - } - - inline bool operator==(const btree_record& other) const { - return key == other.key && value == other.value; - } -}; - -struct btree_key_extract { - static const int64_t &get(const btree_record &v) { - return v.key; - } -}; - -typedef psudb::BTree<int64_t, btree_record, btree_key_extract> BenchBTree; - - diff --git a/benchmarks/include/data-proc.h b/benchmarks/include/data-proc.h deleted file mode 100644 index 444cb94..0000000 --- a/benchmarks/include/data-proc.h +++ /dev/null @@ -1,258 +0,0 @@ -#include <cstdlib> -#include <cstdio> -#include <iostream> -#include <fstream> -#include <sstream> -#include <string> -#include <gsl/gsl_rng.h> -#include <cstring> -#include <vector> - -#include "psu-ds/BTree.h" - -#pragma once - -typedef int64_t key_type; -typedef int64_t value_type; -typedef uint64_t weight_type; - -static gsl_rng *g_rng; -static bool g_osm_data; - -struct btree_record { - key_type key; - value_type value; - - inline bool operator<(const btree_record& other) const { - return key < other.key || (key == other.key && value < other.value); - } - - inline bool operator==(const btree_record& other) const { - return key == other.key && value == other.value; - } -}; - -struct btree_key_extract { - static const key_type &get(const btree_record &v) { - return v.key; - } -}; - -typedef psudb::BTree<int64_t, btree_record, btree_key_extract> BenchBTree; - -static key_type g_min_key = UINT64_MAX; -static key_type g_max_key = 0; - -static size_t g_max_record_cnt = 0; -static size_t g_reccnt = 0; - -static constexpr unsigned int DEFAULT_SEED = 0; - -static unsigned int get_random_seed() -{ - unsigned int seed = 0; - std::fstream urandom; - urandom.open("/dev/urandom", std::ios::in|std::ios::binary); - urandom.read((char *) &seed, sizeof(seed)); - urandom.close(); - - return seed; -} - -static key_type osm_to_key(const char *key_field) { - double tmp_key = (atof(key_field) + 180) * 10e6; - return (key_type) tmp_key; -} - -static void init_bench_rng(unsigned int seed, const gsl_rng_type *type) -{ - g_rng = gsl_rng_alloc(type); - gsl_rng_set(g_rng, seed); -} - -static void init_bench_env(size_t max_reccnt, bool random_seed, bool osm_correction=true) -{ - unsigned int seed = (random_seed) ? get_random_seed() : DEFAULT_SEED; - init_bench_rng(seed, gsl_rng_mt19937); - g_osm_data = osm_correction; - g_max_record_cnt = max_reccnt; - g_reccnt = 0; -} - -static void delete_bench_env() -{ - gsl_rng_free(g_rng); -} - - -template <typename QP> -static std::vector<QP> read_lookup_queries(std::string fname, double selectivity) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - size_t start, stop; - double sel; - while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { - if (start < stop && std::abs(sel - selectivity) < 0.1) { - QP q; - q.target_key = start; - queries.push_back(q); - } - } - fclose(qf); - - return queries; -} - -template <typename QP> -static std::vector<QP> read_range_queries(std::string &fname, double selectivity) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - size_t start, stop; - double sel; - while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { - if (start < stop && std::abs(sel - selectivity) < 0.1) { - QP q; - q.lower_bound = start; - q.upper_bound = stop; - queries.push_back(q); - } - } - fclose(qf); - - return queries; -} - -template <typename QP> -static std::vector<QP> read_knn_queries(std::string fname, size_t k) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - char *line = NULL; - size_t len = 0; - - while (getline(&line, &len, qf) > 0) { - char *token; - QP query; - size_t idx = 0; - - token = strtok(line, " "); - do { - query.point.data[idx++] = atof(token); - } while ((token = strtok(NULL, " "))); - - query.k = k; - queries.emplace_back(query); - } - - free(line); - fclose(qf); - - return queries; -} - -/* - * NOTE: The QP type must have lower_bound and upper_bound attributes, which - * this function will initialize. Any other query parameter attributes must - * be manually initialized after the call. - */ -template <typename R> -static bool next_vector_record(std::fstream &file, R &record, bool binary=false) { - std::string line; - if (std::getline(file, line, '\n')) { - std::stringstream line_stream(line); - for (size_t i=0; i<300; i++) { - std::string dimension; - - std::getline(line_stream, dimension, ' '); - record.data[i] = atof(dimension.c_str()); - } - - g_reccnt++; - - return true; - } - - return false; - -} - -template <typename R> -static bool next_record(std::fstream &file, R &record, bool binary=false) -{ - static value_type value = 1; - if (g_reccnt >= g_max_record_cnt) return false; - - if (binary) { - if (file.good()) { - decltype(R::key) key; - - file.read((char*) &key, sizeof(key)); - record.key = key; - record.value = value; - value++; - - if (record.key < g_min_key) g_min_key = record.key; - if (record.key > g_max_key) g_max_key = record.key; - - return true; - } - - return false; - } - - std::string line; - if (std::getline(file, line, '\n')) { - std::stringstream line_stream(line); - std::string key_field; - std::string value_field; - std::string weight_field; - - std::getline(line_stream, value_field, '\t'); - std::getline(line_stream, key_field, '\t'); - std::getline(line_stream, weight_field, '\t'); - - record.key = (g_osm_data) ? osm_to_key(key_field.c_str()) : atol(key_field.c_str()); - record.value = atol(value_field.c_str()); - - if (record.key < g_min_key) g_min_key = record.key; - if (record.key > g_max_key) g_max_key = record.key; - - g_reccnt++; - - return true; - } - - return false; -} - -template <typename R> -static bool build_delete_vec(std::vector<R> &to_delete, std::vector<R> &vec, size_t n) { - vec.clear(); - - size_t cnt = 0; - while (cnt < n) { - if (to_delete.size() == 0) { - return false; - } - - auto i = gsl_rng_uniform_int(g_rng, to_delete.size()); - vec.emplace_back(to_delete[i]); - to_delete.erase(to_delete.begin() + i); - } -td: - return true; -} - -static std::vector<int64_t> read_sosd_file(std::string &fname, size_t n) { - std::fstream file; - file.open(fname, std::ios::in | std::ios::binary); - - std::vector<int64_t> records(n); - for (size_t i=0; i<n; i++) { - file.read((char*) &(records[i]), sizeof(int64_t)); - } - - return records; -} diff --git a/benchmarks/include/file_util.h b/benchmarks/include/file_util.h new file mode 100644 index 0000000..41eb18c --- /dev/null +++ b/benchmarks/include/file_util.h @@ -0,0 +1,294 @@ +#pragma once + +#include <cstdlib> +#include <cstdio> +#include <iostream> +#include <fstream> +#include <sstream> +#include <string> +#include <gsl/gsl_rng.h> +#include <cstring> +#include <vector> +#include <memory> + +#include "psu-util/progress.h" + + +template <typename QP> +static std::vector<QP> read_lookup_queries(std::string fname, double selectivity) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + + if (!qf) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + size_t start, stop; + double sel; + while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { + if (start < stop && std::abs(sel - selectivity) < 0.1) { + QP q; + q.target_key = start; + queries.push_back(q); + } + } + fclose(qf); + + return queries; +} + +template <typename QP> +static std::vector<QP> generate_string_lookup_queries(std::vector<std::unique_ptr<char[]>> &strings, size_t cnt, gsl_rng *rng) { + std::vector<QP> queries; + + for (size_t i=0; i<cnt; i++) { + auto idx = gsl_rng_uniform_int(rng, strings.size()); + QP q; + q.search_key = strings[idx].get(); + queries.push_back(q); + } + + return queries; +} + +template <typename QP> +static std::vector<QP> read_range_queries(std::string &fname, double selectivity) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + + if (!qf) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + size_t start, stop; + double sel; + while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { + if (start < stop && std::abs(sel - selectivity) < 0.00001) { + QP q; + q.lower_bound = start; + q.upper_bound = stop; + + queries.push_back(q); + } + } + fclose(qf); + + return queries; +} + + +template <typename QP> +static std::vector<QP> read_binary_knn_queries(std::string fname, size_t k, size_t n) { + std::vector<QP> queries; + queries.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + + int32_t dim; + int32_t cnt; + + file.read((char*) &(cnt), sizeof(cnt)); + file.read((char*) &(dim), sizeof(dim)); + + if (n > cnt) { + n = cnt; + } + + for (size_t i=0; i<n; i++) { + QP query; + for (size_t j=0; j<dim; j++) { + uint64_t val; + file.read((char*) &(val), sizeof(uint64_t)); + query.point.data[j] = val; + } + query.k = k; + queries.push_back(query); + } + + return queries; +} + + +template <typename QP> +static std::vector<QP> read_knn_queries(std::string fname, size_t k) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + char *line = NULL; + size_t len = 0; + + if (!qf) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + while (getline(&line, &len, qf) > 0) { + char *token; + QP query; + size_t idx = 0; + + token = strtok(line, " "); + do { + query.point.data[idx++] = atof(token); + } while ((token = strtok(NULL, " "))); + + query.k = k; + queries.emplace_back(query); + } + + free(line); + fclose(qf); + + return queries; +} + +template<typename R> +static std::vector<R> read_sosd_file(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<R> records(n); + for (size_t i=0; i<n; i++) { + decltype(R::key) k; + file.read((char*) &(k), sizeof(R::key)); + records[i].key = k; + records[i].value = i; + } + + return records; +} + +template<typename K, typename V> +static std::vector<std::pair<K, V>> read_sosd_file_pair(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<std::pair<K,V>> records(n); + for (size_t i=0; i<n; i++) { + K k; + file.read((char*) &(k), sizeof(K)); + records[i].first = k; + records[i].second = i; + } + + return records; +} + +/* + * This function expects a plaintext file with each vector on its own line. + * There should be D dimensions (or more) for each record, separated by + * whitespace. The function will generate a vector of n records, each + * record built from the first D dimensions of the data in the file. + */ +template <typename R, size_t D> +static std::vector<R> read_vector_file(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<R> records; + records.reserve(n); + + for (size_t i=0; i<n; i++) { + std::string line; + if (!std::getline(file, line, '\n')) break; + + std::stringstream line_stream(line); + R rec; + for (size_t j=0; j<D; j++) { + std::string dim; + if (!std::getline(line_stream, dim, ' ')) break; + + rec.data[j] = atof(dim.c_str()); + } + records.emplace_back(rec); + } + + return records; +} + +template <typename R> +static std::vector<R> read_binary_vector_file(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<R> records; + records.reserve(n); + + int32_t dim; + int32_t cnt; + + file.read((char*) &(cnt), sizeof(cnt)); + file.read((char*) &(dim), sizeof(dim)); + + if (n > cnt) { + n = cnt; + } + + R rec; + for (size_t i=0; i<n; i++) { + for (size_t j=0; j<dim; j++) { + uint64_t val; + file.read((char*) &(val), sizeof(uint64_t)); + rec.data[j] = val; + } + + records.emplace_back(rec); + } + + return records; +} + +static std::vector<std::unique_ptr<char[]>>read_string_file(std::string fname, size_t n=10000000) { + + std::fstream file; + file.open(fname, std::ios::in); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<std::unique_ptr<char[]>> strings; + strings.reserve(n); + + size_t i=0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(std::unique_ptr<char[]>(strdup(line.c_str()))); + i++; + psudb::progress_update((double) i / (double) n, "Reading file:"); + } + + return strings; +} diff --git a/benchmarks/include/standard_benchmarks.h b/benchmarks/include/standard_benchmarks.h new file mode 100644 index 0000000..b805c08 --- /dev/null +++ b/benchmarks/include/standard_benchmarks.h @@ -0,0 +1,380 @@ +/* + * benchmarks/include/bench.h + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include <cstdlib> +#include <fstream> +#include <gsl/gsl_rng.h> + +#include "framework/DynamicExtension.h" +#include "framework/interface/Query.h" +#include "query/irs.h" +#include "psu-util/progress.h" +#include "benchmark_types.h" +#include "psu-util/bentley-saxe.h" + +static size_t g_deleted_records = 0; +static double delete_proportion = 0.05; + +static volatile size_t total = 0; + +template<typename DE, typename QP, typename R> +static void run_queries(DE *extension, DE *ghost, std::vector<QP> &queries) { + for (size_t i=0; i<queries.size(); i++) { + std::vector<R> res = extension->query(&queries[i]); + std::vector<R> negres = ghost->query(&queries[i]); + auto result = res[0].first - negres[0].first; + total = result; + } +} + + +template<typename DE, typename QP, bool BSM=false> +static void run_queries(DE *extension, std::vector<QP> &queries) { + for (size_t i=0; i<queries.size(); i++) { + if constexpr (std::is_same_v<MTree, DE>) { + std::vector<Word2VecRec> result; + auto res = extension->get_nearest_by_limit(queries[i].point, queries[i].k); + + auto itr = res.begin(); + while (itr != res.end()) { + result.emplace_back(itr->data); + itr++; + } + + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (auto &r : result) { + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", result.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + #endif + } else if constexpr (std::is_same_v<MTree_alt, DE>) { + std::vector<ANNRec> result; + auto res = extension->get_nearest_by_limit(queries[i].point, queries[i].k); + + auto itr = res.begin(); + while (itr != res.end()) { + result.emplace_back(itr->data); + itr++; + } + }else if constexpr (std::is_same_v<PGM, DE>) { + size_t tot = 0; + auto ptr = extension->find(queries[i].lower_bound); + while (ptr != extension->end() && ptr->first <= queries[i].upper_bound) { + tot++; + ++ptr; + } + } else { + auto res = extension->query(&queries[i]); + if constexpr (!BSM) { + auto result = res.get(); + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (int i=result.size()-1; i>=0; i--) { + auto &r = result[i]; + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", result.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + fflush(stdout); + #endif + } else { + total = res.size(); + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (int i=res.size()-1; i>=0; i--) { + auto &r = res[i]; + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", res.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + fflush(stdout); + #endif + } + } + } +} + +template <typename R> +static void run_btree_queries(BenchBTree *btree, std::vector<de::irs::Parms<R>> &queries) { + std::vector<int64_t> sample_set; + sample_set.reserve(queries[0].sample_size); + + for (size_t i=0; i<queries.size(); i++) { + btree->range_sample(queries[i].lower_bound, queries[i].upper_bound, queries[i].sample_size, sample_set, queries[i].rng); + } +} + + +template<typename S, typename QP, typename Q> +static void run_static_queries(S *shard, std::vector<QP> &queries) { + for (size_t i=0; i<queries.size(); i++) { + auto q = &queries[i]; + + auto state = Q::get_query_state(shard, q); + + std::vector<void*> shards = {shard}; + std::vector<void*> states = {state}; + + Q::process_query_states(q, states, nullptr); + auto res = Q::query(shard, state, q); + + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (int i=res.size()-1; i>=0; i--) { + auto &r = res[i].rec; + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", res.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + fflush(stdout); + #endif + } +} + + +/* + * Insert records into a standard Bentley-Saxe extension. Deletes are not + * supported. + */ +template<typename DS, typename R, bool MDSP=false> +static void insert_records(psudb::bsm::BentleySaxe<R, DS, MDSP> *extension, + size_t start, size_t stop, std::vector<R> &records) { + + psudb::progress_update(0, "Insert Progress"); + for (size_t i=start; i<stop; i++) { + extension->insert(records[i]); + } + + psudb::progress_update(1, "Insert Progress"); +} + + +template<typename DS, typename R, bool MDSP=false> +static void insert_records(psudb::bsm::BentleySaxe<R, DS, MDSP> *extension, + psudb::bsm::BentleySaxe<R, DS, MDSP> *ghost, + size_t start, size_t stop, std::vector<R> &records, + std::vector<size_t> &to_delete, size_t &delete_idx, + gsl_rng *rng) { + + psudb::progress_update(0, "Insert Progress"); + size_t reccnt = 0; + for (size_t i=start; i<stop; i++) { + + extension->insert(records[i]); + + if (gsl_rng_uniform(rng) <= delete_proportion && to_delete[delete_idx] <= i) { + ghost->insert(records[to_delete[delete_idx]]); + delete_idx++; + g_deleted_records++; + } + + } + +} + + +template<typename DE, typename R> +static void insert_records(DE *structure, size_t start, size_t stop, + std::vector<R> &records, std::vector<size_t> &to_delete, + size_t &delete_idx, bool delete_records, gsl_rng *rng) { + + psudb::progress_update(0, "Insert Progress"); + size_t reccnt = 0; + for (size_t i=start; i<stop; i++) { + + if constexpr (std::is_same_v<BenchBTree, DE>) { + structure->insert(records[i]); + } else if constexpr (std::is_same_v<MTree, DE> || std::is_same_v<MTree_alt, DE>) { + structure->add(records[i]); + } else if constexpr (std::is_same_v<PGM, DE>) { + structure->insert_or_assign(records[i].key, records[i].value); + } else { + while (!structure->insert(records[i])) { + psudb::progress_update((double) i / (double)(stop - start), "Insert Progress"); + usleep(1); + } + } + + if (delete_records && gsl_rng_uniform(rng) <= + delete_proportion && to_delete[delete_idx] <= i) { + + if constexpr (std::is_same_v<BenchBTree, DE>) { + structure->erase_one(records[to_delete[delete_idx]].key); + } else if constexpr (std::is_same_v<MTree, DE> || std::is_same_v<MTree_alt, DE>) { + structure->remove(records[to_delete[delete_idx]]); + } else if constexpr (std::is_same_v<PGM, DE>) { + structure->erase(records[to_delete[delete_idx]].key); + } else { + while (!structure->erase(records[to_delete[delete_idx]])) { + usleep(1); + } + } + delete_idx++; + g_deleted_records++; + } + } + + psudb::progress_update(1, "Insert Progress"); +} + +template <typename DE, de::RecordInterface R, bool PROGRESS=true, size_t BATCH=1000> +static bool insert_tput_bench(DE &de_index, std::fstream &file, size_t insert_cnt, + double delete_prop, gsl_rng *rng, std::vector<R> &to_delete, bool binary=false) { + + size_t delete_cnt = insert_cnt * delete_prop; + + size_t applied_deletes = 0; + size_t applied_inserts = 0; + + std::vector<R> insert_vec; + std::vector<R> delete_vec; + insert_vec.reserve(BATCH); + delete_vec.reserve(BATCH*delete_prop); + + size_t delete_idx = 0; + + bool continue_benchmark = true; + + size_t total_time = 0; + + while (applied_inserts < insert_cnt && continue_benchmark) { + continue_benchmark = build_insert_vec(file, insert_vec, BATCH, delete_prop, to_delete, binary); + if (applied_deletes < delete_cnt) { + build_delete_vec(to_delete, delete_vec, BATCH*delete_prop); + delete_idx = 0; + } + + if (insert_vec.size() == 0) { + break; + } + + if constexpr (PROGRESS) { + psudb::progress_update((double) applied_inserts / (double) insert_cnt, "inserting:"); + } + + auto insert_start = std::chrono::high_resolution_clock::now(); + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (applied_deletes < delete_cnt && delete_idx < delete_vec.size() && gsl_rng_uniform(rng) < delete_prop) { + if constexpr (std::is_same_v<BenchBTree, DE>) { + de_index.erase_one(delete_vec[delete_idx++].key); + #ifdef _GNU_SOURCE + } else if constexpr (std::is_same_v<MTree, DE> || std::is_same_v<MTree_alt, DE>) { + de_index.remove(delete_vec[delete_idx++]); + #endif + } else { + de_index.erase(delete_vec[delete_idx++]); + } + applied_deletes++; + } + + // insert the record; + #ifdef _GNU_SOURCE + if constexpr (std::is_same_v<MTree, DE> || std::is_same_v<MTree_alt, DE>) { + de_index.add(insert_vec[i]); + } else { + de_index.insert(insert_vec[i]); + } + #else + de_index.insert(insert_vec[i]); + #endif + + applied_inserts++; + } + auto insert_stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(insert_stop - insert_start).count(); + } + + if constexpr (PROGRESS) { + psudb::progress_update(1.0, "inserting:"); + } + + size_t throughput = (((double) (applied_inserts + applied_deletes) / (double) total_time) * 1e9); + + fprintf(stdout, "%ld\t", throughput); + + return continue_benchmark; +} + +template <typename DE, de::RecordInterface R, typename QP, bool PROGRESS=true> +static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t trial_cnt=1) { + char progbuf[25]; + if constexpr (PROGRESS) { + sprintf(progbuf, "querying:"); + } + + size_t total_time = 0; + size_t total_results = 0; + + for (size_t i=0; i<trial_cnt; i++) { + if constexpr (PROGRESS) { + psudb::progress_update((double) (i) / (double) trial_cnt, progbuf); + } + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + auto res = de_index.query(&queries[j]); + + total_results += res.size(); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + psudb::progress_update(1.0, progbuf); + + size_t query_latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", query_latency); + fflush(stdout); + + return true; +} + + +template <typename Shard, de::RecordInterface R, typename QP, de::QueryInterface<R, Shard> Q, bool PROGRESS=true> +static bool static_latency_bench(Shard *shard, std::vector<QP> queries, size_t trial_cnt=100) { + char progbuf[25]; + if constexpr (PROGRESS) { + sprintf(progbuf, "querying:"); + } + + size_t total_time = 0; + size_t total_results = 0; + + for (size_t i=0; i<trial_cnt; i++) { + if constexpr (PROGRESS) { + psudb::progress_update((double) (i) / (double) trial_cnt, progbuf); + } + + std::vector<void *> states(1); + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + states[0] = Q::get_query_state(shard, &queries[j]); + Q::process_query_states(&queries[j], states, nullptr); + auto res = Q::query(shard, states[0], &queries[j]); + total_results += res.size(); + Q::delete_query_state(states[0]); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + psudb::progress_update(1.0, progbuf); + + size_t query_latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", query_latency); + fflush(stdout); + + return true; +} diff --git a/benchmarks/include/triespline_bsm.h b/benchmarks/include/triespline_bsm.h new file mode 100644 index 0000000..eaf1079 --- /dev/null +++ b/benchmarks/include/triespline_bsm.h @@ -0,0 +1,156 @@ +#pragma once + +#include <cstdlib> +#include <vector> +#include <algorithm> +#include "ts/builder.h" + +template <typename K, typename V, size_t E=1024> +class BSMTrieSpline { +private: + typedef std::pair<K, V> R; + +public: + struct RangeQueryParameters { + K lower_bound; + K upper_bound; + }; + +public: + static BSMTrieSpline *build(std::vector<R> &records) { + if (records.size() == 0) { + return nullptr; + } + + std::sort(records.begin(), records.end()); + return new BSMTrieSpline(records); + } + + static BSMTrieSpline *build_presorted(std::vector<R> &records) { + if (records.size() == 0) { + return nullptr; + } + + return new BSMTrieSpline(records); + } + + std::vector<R> unbuild() { + return std::move(m_data); + } + + std::vector<R> query(void *q) { + std::vector<R> rs; + + /* return an empty result set if q is invalid */ + if (q == nullptr) { + rs.push_back({0, 0}); + return rs; + } + + auto parms = (BSMTrieSpline::RangeQueryParameters*) q; + + size_t idx = lower_bound(parms->lower_bound); + + size_t cnt = 0; + while (idx < m_data.size() && m_data[idx].first <= parms->upper_bound) { + cnt++; + idx++; + } + + rs.push_back({cnt, 0}); + + return std::move(rs); + } + + std::vector<R> query_merge(std::vector<R> &rsa, std::vector<R> &rsb, void *parms) { + /* initialize rsa on the first merge */ + if (rsa.size() == 0) { + rsa.push_back({0, 0}); + } + rsa[0].first += rsb[0].first; + return std::move(rsa); + } + + size_t record_count() { + return m_data.size(); + } + + ~BSMTrieSpline() = default; + +private: + std::vector<R> m_data; + K m_max_key; + K m_min_key; + ts::TrieSpline<K> m_ts; + + BSMTrieSpline(std::vector<R> &records) { + m_data = std::move(records); + m_min_key = m_data[0].first; + m_max_key = m_data[m_data.size() - 1].first; + + if (m_data.size() < 50) { + return; + } + + auto bldr = ts::Builder<K>(m_min_key, m_max_key, E); + for (size_t i=0; i<m_data.size(); i++) { + bldr.AddKey(m_data[i].first); + } + + if (m_data.size() > 1) { + m_ts = bldr.Finalize(); + } + } + + size_t lower_bound(K key) { + if (m_data.size() == 0) { + return 1; + } else if (m_data.size() < 50) { + for (size_t i=0; i<m_data.size(); i++) { + if (m_data[i].first >= key) { + return i; + } + } + + return m_data.size(); + } + + auto bound = m_ts.GetSearchBound(key); + size_t idx = bound.begin; + + if (idx >= m_data.size()) { + return m_data.size(); + } + + // If the region to search is less than some pre-specified + // amount, perform a linear scan to locate the record. + if (bound.end - bound.begin < 256) { + while (idx < bound.end && m_data[idx].first < key) { + idx++; + } + } else { + // Otherwise, perform a binary search + idx = bound.begin; + size_t max = bound.end; + + while (idx < max) { + size_t mid = (idx + max) / 2; + if (key > m_data[mid].first) { + idx = mid + 1; + } else { + max = mid; + } + } + } + + if (idx == m_data.size()) { + return m_data.size(); + } + + if (m_data[idx].first > key && idx > 0 && m_data[idx-1].first <= key) { + return idx-1; + } + + return idx; + } +}; diff --git a/benchmarks/include/vptree_bsm.h b/benchmarks/include/vptree_bsm.h new file mode 100644 index 0000000..2f12fcb --- /dev/null +++ b/benchmarks/include/vptree_bsm.h @@ -0,0 +1,317 @@ +#pragma once + +#include <cstdlib> +#include <vector> +#include <algorithm> +#include <limits> +#include <gsl/gsl_rng.h> + +#include "psu-ds/PriorityQueue.h" +#include "framework/interface/Record.h" + +template <typename R, size_t LEAFSZ=100> +class BSMVPTree { +public: + struct KNNQueryParms { + R point; + size_t k; + }; + +public: + static BSMVPTree *build(std::vector<R> &records) { + return new BSMVPTree(records); + } + + static BSMVPTree *build_presorted(std::vector<R> &records) { + return new BSMVPTree(records); + } + + std::vector<R> unbuild() { + return std::move(m_data); + } + + std::vector<R> query(void *q) { + std::vector<R> rs; + + /* return an empty result set if q is invalid */ + if (q == nullptr) { + return rs; + } + + auto parms = (BSMVPTree::KNNQueryParms*) q; + auto pq = psudb::PriorityQueue<R, de::DistCmpMax<R>>(parms->k, &parms->point); + + if (parms->k >= m_data.size()) { + for (size_t i=0; i<m_data.size(); i++) { + if (m_ptrs[i].ptr != nullptr) { + pq.push(m_ptrs[i].ptr); + } + } + } else { + double farthest = std::numeric_limits<double>::max(); + internal_search(m_root, parms->point, parms->k, pq, &farthest); + } + + size_t i=0; + while (pq.size() > 0) { + rs.push_back(*pq.peek().data); + pq.pop(); + } + return std::move(rs); + } + + std::vector<R> query_merge(std::vector<R> &rsa, std::vector<R> &rsb, void* parms) { + KNNQueryParms *p = (KNNQueryParms *) parms; + R rec = p->point; + size_t k = p->k; + + std::vector<R> output; + + psudb::PriorityQueue<R, de::DistCmpMax<R>> pq(k, &rec); + + for (size_t i=0; i<rsa.size(); i++) { + if (pq.size() < k) { + pq.push(&rsa[i]); + } else { + double head_dist = pq.peek().data->calc_distance(rec); + double cur_dist = rsa[i].calc_distance(rec); + + if (cur_dist < head_dist) { + pq.pop(); + pq.push(&rsa[i]); + } + } + } + + for (size_t i=0; i<rsb.size(); i++) { + if (pq.size() < k) { + pq.push(&rsb[i]); + } else { + double head_dist = pq.peek().data->calc_distance(rec); + double cur_dist = rsb[i].calc_distance(rec); + + if (cur_dist < head_dist) { + pq.pop(); + pq.push(&rsb[i]); + } + } + } + + while (pq.size() > 0) { + output.emplace_back(*pq.peek().data); + pq.pop(); + } + + return std::move(output); + } + + size_t record_count() { + return m_data.size(); + } + + ~BSMVPTree() { + delete m_root; + } + +private: + + struct vp_ptr { + R *ptr; + double dist; + }; + + struct vpnode { + size_t start; + size_t stop; + bool leaf; + + double radius; + vpnode *inside; + vpnode *outside; + + vpnode() : start(0), stop(0), leaf(false), radius(0.0), inside(nullptr), outside(nullptr) {} + + ~vpnode() { + delete inside; + delete outside; + } + }; + + std::vector<R> m_data; + std::vector<vp_ptr> m_ptrs; + vpnode *m_root; + + size_t m_node_cnt; + + BSMVPTree(std::vector<R> &records) { + m_data = std::move(records); + m_node_cnt = 0; + + for (size_t i=0; i<m_data.size(); i++) { + m_ptrs.push_back({&m_data[i], 0}); + } + + m_root = build_vptree(); + } + + vpnode *build_vptree() { + if (m_data.size() == 0) { + return nullptr; + } + + size_t lower = 0; + size_t upper = m_data.size() - 1; + + auto rng = gsl_rng_alloc(gsl_rng_mt19937); + auto root = build_subtree(lower, upper, rng); + gsl_rng_free(rng); + return root; + } + + vpnode *build_subtree(size_t start, size_t stop, gsl_rng *rng) { + /* + * base-case: sometimes happens (probably because of the +1 and -1 + * in the first recursive call) + */ + if (start > stop) { + return nullptr; + } + + /* base-case: create a leaf node */ + if (stop - start <= LEAFSZ) { + vpnode *node = new vpnode(); + node->start = start; + node->stop = stop; + node->leaf = true; + + m_node_cnt++; + return node; + } + + /* + * select a random element to be the root of the + * subtree + */ + auto i = start + gsl_rng_uniform_int(rng, stop - start + 1); + swap(start, i); + + /* for efficiency, we'll pre-calculate the distances between each point and the root */ + for (size_t i=start+1; i<=stop; i++) { + m_ptrs[i].dist = m_ptrs[start].ptr->calc_distance(*m_ptrs[i].ptr); + } + + /* + * partition elements based on their distance from the start, + * with those elements with distance falling below the median + * distance going into the left sub-array and those above + * the median in the right. This is easily done using QuickSelect. + */ + auto mid = (start + 1 + stop) / 2; + quickselect(start + 1, stop, mid, m_ptrs[start].ptr, rng); + + /* Create a new node based on this partitioning */ + vpnode *node = new vpnode(); + node->start = start; + + /* store the radius of the circle used for partitioning the node. */ + node->radius = m_ptrs[start].ptr->calc_distance(*m_ptrs[mid].ptr); + m_ptrs[start].dist = node->radius; + + /* recursively construct the left and right subtrees */ + node->inside = build_subtree(start + 1, mid-1, rng); + node->outside = build_subtree(mid, stop, rng); + + m_node_cnt++; + + return node; + } + + void quickselect(size_t start, size_t stop, size_t k, R *p, gsl_rng *rng) { + if (start == stop) return; + + auto pivot = partition(start, stop, p, rng); + + if (k < pivot) { + quickselect(start, pivot - 1, k, p, rng); + } else if (k > pivot) { + quickselect(pivot + 1, stop, k, p, rng); + } + } + + size_t partition(size_t start, size_t stop, R *p, gsl_rng *rng) { + auto pivot = start + gsl_rng_uniform_int(rng, stop - start); + + swap(pivot, stop); + + size_t j = start; + for (size_t i=start; i<stop; i++) { + if (m_ptrs[i].dist < m_ptrs[stop].dist) { + swap(j++, i); + } + } + + swap(j, stop); + return j; + } + + void swap(size_t idx1, size_t idx2) { + auto tmp = m_ptrs[idx1]; + m_ptrs[idx1] = m_ptrs[idx2]; + m_ptrs[idx2] = tmp; + } + + void internal_search(vpnode *node, const R &point, size_t k, psudb::PriorityQueue<R, + de::DistCmpMax<R>> &pq, double *farthest) { + + if (node == nullptr) return; + + if (node->leaf) { + for (size_t i=node->start; i<=node->stop; i++) { + double d = point.calc_distance(*m_ptrs[i].ptr); + if (d < *farthest) { + if (pq.size() == k) { + pq.pop(); + } + + pq.push(m_ptrs[i].ptr); + if (pq.size() == k) { + *farthest = point.calc_distance(*pq.peek().data); + } + } + } + + return; + } + + double d = point.calc_distance(*m_ptrs[node->start].ptr); + + if (d < *farthest) { + if (pq.size() == k) { + auto t = pq.peek().data; + pq.pop(); + } + pq.push(m_ptrs[node->start].ptr); + if (pq.size() == k) { + *farthest = point.calc_distance(*pq.peek().data); + } + } + + if (d < node->radius) { + if (d - (*farthest) <= node->radius) { + internal_search(node->inside, point, k, pq, farthest); + } + + if (d + (*farthest) >= node->radius) { + internal_search(node->outside, point, k, pq, farthest); + } + } else { + if (d + (*farthest) >= node->radius) { + internal_search(node->outside, point, k, pq, farthest); + } + + if (d - (*farthest) <= node->radius) { + internal_search(node->inside, point, k, pq, farthest); + } + } + } +}; |