diff options
| author | Douglas Rumbaugh <dbr4@psu.edu> | 2024-02-23 15:40:49 -0500 |
|---|---|---|
| committer | Douglas Rumbaugh <dbr4@psu.edu> | 2024-02-23 15:40:49 -0500 |
| commit | 69f36c9b4f5df19f09156689b333afe29a017eed (patch) | |
| tree | f908ce79c7844696b08cc52c78df3e880fa7c7e7 /benchmarks/include | |
| parent | 0fcf63e9e47bae61b9f7289c157b28ec294c2e24 (diff) | |
| download | dynamic-extension-69f36c9b4f5df19f09156689b333afe29a017eed.tar.gz | |
Benchmark updates
Diffstat (limited to 'benchmarks/include')
| -rw-r--r-- | benchmarks/include/benchmark_types.h | 50 | ||||
| -rw-r--r-- | benchmarks/include/btree-util.h | 27 | ||||
| -rw-r--r-- | benchmarks/include/data-proc.h | 258 | ||||
| -rw-r--r-- | benchmarks/include/file_util.h | 130 | ||||
| -rw-r--r-- | benchmarks/include/standard_benchmarks.h | 212 |
5 files changed, 392 insertions, 285 deletions
diff --git a/benchmarks/include/benchmark_types.h b/benchmarks/include/benchmark_types.h new file mode 100644 index 0000000..85e9565 --- /dev/null +++ b/benchmarks/include/benchmark_types.h @@ -0,0 +1,50 @@ +#pragma once + +#include <cstdlib> +#include "psu-ds/BTree.h" +#include "mtree.h" +#include "framework/interface/Record.h" + +/* TLX BTree definitions*/ +template <typename K, typename V> +struct btree_record { + K key; + V value; + + inline bool operator<(const btree_record& other) const { + return key < other.key || (key == other.key && value < other.value); + } + + inline bool operator==(const btree_record& other) const { + return key == other.key && value == other.value; + } +}; + +template <typename K, typename V> +struct btree_key_extract { + static const K &get(const btree_record<K, V> &v) { + return v.key; + } +}; + +typedef psudb::BTree<int64_t, btree_record<int64_t, int64_t>, btree_key_extract<int64_t, int64_t>> BenchBTree; + + +/*MTree Definitions*/ + +const size_t W2V_SIZE = 300; +typedef de::EuclidPoint<double, W2V_SIZE> Word2VecRec; + +struct euclidean_distance { + double operator()(const Word2VecRec &first, const Word2VecRec &second) const { + double dist = 0; + for (size_t i=0; i<W2V_SIZE; i++) { + dist += (first.data[i] - second.data[i]) * (first.data[i] - second.data[i]); + } + + return std::sqrt(dist); + } +}; + +typedef mt::mtree<Word2VecRec, euclidean_distance> MTree; + diff --git a/benchmarks/include/btree-util.h b/benchmarks/include/btree-util.h deleted file mode 100644 index 571c073..0000000 --- a/benchmarks/include/btree-util.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include <cstdlib> -#include "psu-ds/BTree.h" - -struct btree_record { - int64_t key; - int64_t value; - - inline bool operator<(const btree_record& other) const { - return key < other.key || (key == other.key && value < other.value); - } - - inline bool operator==(const btree_record& other) const { - return key == other.key && value == other.value; - } -}; - -struct btree_key_extract { - static const int64_t &get(const btree_record &v) { - return v.key; - } -}; - -typedef psudb::BTree<int64_t, btree_record, btree_key_extract> BenchBTree; - - diff --git a/benchmarks/include/data-proc.h b/benchmarks/include/data-proc.h deleted file mode 100644 index 444cb94..0000000 --- a/benchmarks/include/data-proc.h +++ /dev/null @@ -1,258 +0,0 @@ -#include <cstdlib> -#include <cstdio> -#include <iostream> -#include <fstream> -#include <sstream> -#include <string> -#include <gsl/gsl_rng.h> -#include <cstring> -#include <vector> - -#include "psu-ds/BTree.h" - -#pragma once - -typedef int64_t key_type; -typedef int64_t value_type; -typedef uint64_t weight_type; - -static gsl_rng *g_rng; -static bool g_osm_data; - -struct btree_record { - key_type key; - value_type value; - - inline bool operator<(const btree_record& other) const { - return key < other.key || (key == other.key && value < other.value); - } - - inline bool operator==(const btree_record& other) const { - return key == other.key && value == other.value; - } -}; - -struct btree_key_extract { - static const key_type &get(const btree_record &v) { - return v.key; - } -}; - -typedef psudb::BTree<int64_t, btree_record, btree_key_extract> BenchBTree; - -static key_type g_min_key = UINT64_MAX; -static key_type g_max_key = 0; - -static size_t g_max_record_cnt = 0; -static size_t g_reccnt = 0; - -static constexpr unsigned int DEFAULT_SEED = 0; - -static unsigned int get_random_seed() -{ - unsigned int seed = 0; - std::fstream urandom; - urandom.open("/dev/urandom", std::ios::in|std::ios::binary); - urandom.read((char *) &seed, sizeof(seed)); - urandom.close(); - - return seed; -} - -static key_type osm_to_key(const char *key_field) { - double tmp_key = (atof(key_field) + 180) * 10e6; - return (key_type) tmp_key; -} - -static void init_bench_rng(unsigned int seed, const gsl_rng_type *type) -{ - g_rng = gsl_rng_alloc(type); - gsl_rng_set(g_rng, seed); -} - -static void init_bench_env(size_t max_reccnt, bool random_seed, bool osm_correction=true) -{ - unsigned int seed = (random_seed) ? get_random_seed() : DEFAULT_SEED; - init_bench_rng(seed, gsl_rng_mt19937); - g_osm_data = osm_correction; - g_max_record_cnt = max_reccnt; - g_reccnt = 0; -} - -static void delete_bench_env() -{ - gsl_rng_free(g_rng); -} - - -template <typename QP> -static std::vector<QP> read_lookup_queries(std::string fname, double selectivity) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - size_t start, stop; - double sel; - while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { - if (start < stop && std::abs(sel - selectivity) < 0.1) { - QP q; - q.target_key = start; - queries.push_back(q); - } - } - fclose(qf); - - return queries; -} - -template <typename QP> -static std::vector<QP> read_range_queries(std::string &fname, double selectivity) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - size_t start, stop; - double sel; - while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { - if (start < stop && std::abs(sel - selectivity) < 0.1) { - QP q; - q.lower_bound = start; - q.upper_bound = stop; - queries.push_back(q); - } - } - fclose(qf); - - return queries; -} - -template <typename QP> -static std::vector<QP> read_knn_queries(std::string fname, size_t k) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - char *line = NULL; - size_t len = 0; - - while (getline(&line, &len, qf) > 0) { - char *token; - QP query; - size_t idx = 0; - - token = strtok(line, " "); - do { - query.point.data[idx++] = atof(token); - } while ((token = strtok(NULL, " "))); - - query.k = k; - queries.emplace_back(query); - } - - free(line); - fclose(qf); - - return queries; -} - -/* - * NOTE: The QP type must have lower_bound and upper_bound attributes, which - * this function will initialize. Any other query parameter attributes must - * be manually initialized after the call. - */ -template <typename R> -static bool next_vector_record(std::fstream &file, R &record, bool binary=false) { - std::string line; - if (std::getline(file, line, '\n')) { - std::stringstream line_stream(line); - for (size_t i=0; i<300; i++) { - std::string dimension; - - std::getline(line_stream, dimension, ' '); - record.data[i] = atof(dimension.c_str()); - } - - g_reccnt++; - - return true; - } - - return false; - -} - -template <typename R> -static bool next_record(std::fstream &file, R &record, bool binary=false) -{ - static value_type value = 1; - if (g_reccnt >= g_max_record_cnt) return false; - - if (binary) { - if (file.good()) { - decltype(R::key) key; - - file.read((char*) &key, sizeof(key)); - record.key = key; - record.value = value; - value++; - - if (record.key < g_min_key) g_min_key = record.key; - if (record.key > g_max_key) g_max_key = record.key; - - return true; - } - - return false; - } - - std::string line; - if (std::getline(file, line, '\n')) { - std::stringstream line_stream(line); - std::string key_field; - std::string value_field; - std::string weight_field; - - std::getline(line_stream, value_field, '\t'); - std::getline(line_stream, key_field, '\t'); - std::getline(line_stream, weight_field, '\t'); - - record.key = (g_osm_data) ? osm_to_key(key_field.c_str()) : atol(key_field.c_str()); - record.value = atol(value_field.c_str()); - - if (record.key < g_min_key) g_min_key = record.key; - if (record.key > g_max_key) g_max_key = record.key; - - g_reccnt++; - - return true; - } - - return false; -} - -template <typename R> -static bool build_delete_vec(std::vector<R> &to_delete, std::vector<R> &vec, size_t n) { - vec.clear(); - - size_t cnt = 0; - while (cnt < n) { - if (to_delete.size() == 0) { - return false; - } - - auto i = gsl_rng_uniform_int(g_rng, to_delete.size()); - vec.emplace_back(to_delete[i]); - to_delete.erase(to_delete.begin() + i); - } -td: - return true; -} - -static std::vector<int64_t> read_sosd_file(std::string &fname, size_t n) { - std::fstream file; - file.open(fname, std::ios::in | std::ios::binary); - - std::vector<int64_t> records(n); - for (size_t i=0; i<n; i++) { - file.read((char*) &(records[i]), sizeof(int64_t)); - } - - return records; -} diff --git a/benchmarks/include/file_util.h b/benchmarks/include/file_util.h new file mode 100644 index 0000000..2a3300a --- /dev/null +++ b/benchmarks/include/file_util.h @@ -0,0 +1,130 @@ +#include <cstdlib> +#include <cstdio> +#include <iostream> +#include <fstream> +#include <sstream> +#include <string> +#include <gsl/gsl_rng.h> +#include <cstring> +#include <vector> + +#include "framework/interface/Record.h" +#include "query/irs.h" + +#pragma once + +template <typename QP> +static std::vector<QP> read_lookup_queries(std::string fname, double selectivity) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + size_t start, stop; + double sel; + while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { + if (start < stop && std::abs(sel - selectivity) < 0.1) { + QP q; + q.target_key = start; + queries.push_back(q); + } + } + fclose(qf); + + return queries; +} + +template <typename QP> +static std::vector<QP> read_range_queries(std::string &fname, double selectivity) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + size_t start, stop; + double sel; + while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { + if (start < stop && std::abs(sel - selectivity) < 0.1) { + QP q; + q.lower_bound = start; + q.upper_bound = stop; + + queries.push_back(q); + } + } + fclose(qf); + + return queries; +} + +template <typename QP> +static std::vector<QP> read_knn_queries(std::string fname, size_t k) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + char *line = NULL; + size_t len = 0; + + while (getline(&line, &len, qf) > 0) { + char *token; + QP query; + size_t idx = 0; + + token = strtok(line, " "); + do { + query.point.data[idx++] = atof(token); + } while ((token = strtok(NULL, " "))); + + query.k = k; + queries.emplace_back(query); + } + + free(line); + fclose(qf); + + return queries; +} + +template<de::KVPInterface R> +static std::vector<R> read_sosd_file(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + std::vector<R> records(n); + for (size_t i=0; i<n; i++) { + uint64_t k; + file.read((char*) &(k), sizeof(uint64_t)); + records[i].key = k; + records[i].value = i; + } + + return records; +} + +/* + * This function expects a plaintext file with each vector on its own line. + * There should be D dimensions (or more) for each record, separated by + * whitespace. The function will generate a vector of n records, each + * record built from the first D dimensions of the data in the file. + */ +template <typename R, size_t D> +static std::vector<R> read_vector_file(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in); + + std::vector<R> records; + records.reserve(n); + + for (size_t i=0; i<n; i++) { + std::string line; + if (!std::getline(file, line, '\n')) break; + + std::stringstream line_stream(line); + R rec; + for (size_t j=0; j<D; j++) { + std::string dim; + if (!std::getline(line_stream, dim, ' ')) break; + + rec.data[j] = atof(dim.c_str()); + } + records.emplace_back(rec); + } + + return records; +} diff --git a/benchmarks/include/standard_benchmarks.h b/benchmarks/include/standard_benchmarks.h new file mode 100644 index 0000000..a42cdd6 --- /dev/null +++ b/benchmarks/include/standard_benchmarks.h @@ -0,0 +1,212 @@ +/* + * benchmarks/include/bench.h + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include <cstdlib> +#include <fstream> +#include <gsl/gsl_rng.h> + +#include "framework/DynamicExtension.h" +#include "framework/interface/Query.h" +#include "psu-util/progress.h" +#include "benchmark_types.h" + +static size_t g_deleted_records = 0; +static double delete_proportion = 0.05; + +template<typename DE, typename QP> +static void run_queries(DE *extension, std::vector<QP> &queries, gsl_rng *rng) { + size_t total; + for (size_t i=0; i<queries.size(); i++) { + auto q = &queries[i]; + + auto res = extension->query(q); + auto r = res.get(); + total += r.size(); + } +} + + +template<typename DE, de::RecordInterface R> +static void insert_records(DE *extension, size_t start, size_t stop, + std::vector<R> &records, std::vector<size_t> &to_delete, + size_t &delete_idx, bool delete_records, gsl_rng *rng) { + + psudb::progress_update(0, "Insert Progress"); + size_t reccnt = 0; + for (size_t i=start; i<stop; i++) { + while (!extension->insert(records[i])) { + psudb::progress_update((double) i / (double)(stop - start), "Insert Progress"); + usleep(1); + } + + if (delete_records && gsl_rng_uniform(rng) <= + delete_proportion && to_delete[delete_idx] <= i) { + + while (!extension->erase(records[to_delete[delete_idx]])) { + usleep(1); + } + + delete_idx++; + g_deleted_records++; + } + } + + psudb::progress_update(1, "Insert Progress"); +} + +template <typename DE, de::RecordInterface R, bool PROGRESS=true, size_t BATCH=1000> +static bool insert_tput_bench(DE &de_index, std::fstream &file, size_t insert_cnt, + double delete_prop, gsl_rng *rng, std::vector<R> &to_delete, bool binary=false) { + + size_t delete_cnt = insert_cnt * delete_prop; + + size_t applied_deletes = 0; + size_t applied_inserts = 0; + + std::vector<R> insert_vec; + std::vector<R> delete_vec; + insert_vec.reserve(BATCH); + delete_vec.reserve(BATCH*delete_prop); + + size_t delete_idx = 0; + + bool continue_benchmark = true; + + size_t total_time = 0; + + while (applied_inserts < insert_cnt && continue_benchmark) { + continue_benchmark = build_insert_vec(file, insert_vec, BATCH, delete_prop, to_delete, binary); + if (applied_deletes < delete_cnt) { + build_delete_vec(to_delete, delete_vec, BATCH*delete_prop); + delete_idx = 0; + } + + if (insert_vec.size() == 0) { + break; + } + + if constexpr (PROGRESS) { + psudb::progress_update((double) applied_inserts / (double) insert_cnt, "inserting:"); + } + + auto insert_start = std::chrono::high_resolution_clock::now(); + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (applied_deletes < delete_cnt && delete_idx < delete_vec.size() && gsl_rng_uniform(rng) < delete_prop) { + if constexpr (std::is_same_v<BenchBTree, DE>) { + de_index.erase_one(delete_vec[delete_idx++].key); + } else if constexpr (std::is_same_v<MTree, DE>) { + de_index.remove(delete_vec[delete_idx++]); + } else { + de_index.erase(delete_vec[delete_idx++]); + } + applied_deletes++; + } + + // insert the record; + if constexpr (std::is_same_v<MTree, DE>) { + de_index.add(insert_vec[i]); + } else { + de_index.insert(insert_vec[i]); + } + applied_inserts++; + } + auto insert_stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(insert_stop - insert_start).count(); + } + + if constexpr (PROGRESS) { + psudb::progress_update(1.0, "inserting:"); + } + + size_t throughput = (((double) (applied_inserts + applied_deletes) / (double) total_time) * 1e9); + + fprintf(stdout, "%ld\t", throughput); + + return continue_benchmark; +} + +template <typename DE, de::RecordInterface R, typename QP, bool PROGRESS=true> +static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t trial_cnt=1) { + char progbuf[25]; + if constexpr (PROGRESS) { + sprintf(progbuf, "querying:"); + } + + size_t total_time = 0; + size_t total_results = 0; + + for (size_t i=0; i<trial_cnt; i++) { + if constexpr (PROGRESS) { + psudb::progress_update((double) (i) / (double) trial_cnt, progbuf); + } + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + auto res = de_index.query(&queries[j]); + + total_results += res.size(); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + psudb::progress_update(1.0, progbuf); + + size_t query_latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", query_latency); + fflush(stdout); + + return true; +} + + +template <typename Shard, de::RecordInterface R, typename QP, de::QueryInterface<R, Shard> Q, bool PROGRESS=true> +static bool static_latency_bench(Shard *shard, std::vector<QP> queries, size_t trial_cnt=100) { + char progbuf[25]; + if constexpr (PROGRESS) { + sprintf(progbuf, "querying:"); + } + + size_t total_time = 0; + size_t total_results = 0; + + for (size_t i=0; i<trial_cnt; i++) { + if constexpr (PROGRESS) { + psudb::progress_update((double) (i) / (double) trial_cnt, progbuf); + } + + std::vector<void *> states(1); + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + states[0] = Q::get_query_state(shard, &queries[j]); + Q::process_query_states(&queries[j], states, nullptr); + auto res = Q::query(shard, states[0], &queries[j]); + total_results += res.size(); + Q::delete_query_state(states[0]); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + psudb::progress_update(1.0, progbuf); + + size_t query_latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", query_latency); + fflush(stdout); + + return true; +} |