diff options
Diffstat (limited to 'benchmarks/old-bench')
| -rw-r--r-- | benchmarks/old-bench/alex_rq_bench.cpp | 205 | ||||
| -rw-r--r-- | benchmarks/old-bench/alias_wss_bench.cpp | 57 | ||||
| -rw-r--r-- | benchmarks/old-bench/btree_irs_bench.cpp | 91 | ||||
| -rw-r--r-- | benchmarks/old-bench/btree_rq_bench.cpp | 90 | ||||
| -rw-r--r-- | benchmarks/old-bench/include/bench.h | 162 | ||||
| -rw-r--r-- | benchmarks/old-bench/include/bench_utility.h | 181 | ||||
| -rw-r--r-- | benchmarks/old-bench/include/standalone_utility.h | 240 | ||||
| -rw-r--r-- | benchmarks/old-bench/isam_irs_bench.cpp | 64 | ||||
| -rw-r--r-- | benchmarks/old-bench/isam_rq_bench.cpp | 59 | ||||
| -rw-r--r-- | benchmarks/old-bench/mtree_knn_bench.cpp | 83 | ||||
| -rw-r--r-- | benchmarks/old-bench/pgm_pl_bench.cpp | 67 | ||||
| -rw-r--r-- | benchmarks/old-bench/pgm_rq_bench.cpp | 67 | ||||
| -rw-r--r-- | benchmarks/old-bench/test.cpp | 7 | ||||
| -rw-r--r-- | benchmarks/old-bench/triespline_rq_bench.cpp | 66 | ||||
| -rw-r--r-- | benchmarks/old-bench/upgm_pl_bench.cpp | 212 | ||||
| -rw-r--r-- | benchmarks/old-bench/upgm_rq_bench.cpp | 217 | ||||
| -rw-r--r-- | benchmarks/old-bench/vptree_knn_bench.cpp | 58 |
17 files changed, 1926 insertions, 0 deletions
diff --git a/benchmarks/old-bench/alex_rq_bench.cpp b/benchmarks/old-bench/alex_rq_bench.cpp new file mode 100644 index 0000000..f75afa6 --- /dev/null +++ b/benchmarks/old-bench/alex_rq_bench.cpp @@ -0,0 +1,205 @@ +#include "alex.h" +#include "include/standalone_utility.h" + +typedef uint64_t key_type; +typedef uint64_t value_type; + +typedef alex::Alex<key_type, value_type> Alex; + +struct record { + key_type key; + value_type value; +}; + +struct query { + key_type lower_bound; + key_type upper_bound; +}; + +template <typename R> +static bool build_insert_vec(std::fstream &file, std::vector<R> &vec, size_t n, + double delete_prop, std::vector<R> &to_delete, bool binary=false) { + vec.clear(); + for (size_t i=0; i<n; i++) { + R rec; + if (!next_record(file, rec, binary)) { + if (i == 0) { + return false; + } + + break; + } + + vec.emplace_back(rec); + + if (gsl_rng_uniform(g_rng) < delete_prop + (delete_prop * .1)) { + to_delete.emplace_back(rec); + } + } + + return true; +} + + +static Alex *warmup(std::fstream &file, size_t count, + double delete_prop, std::vector<record> to_delete, bool progress=true, bool binary=false) { + size_t batch = std::min(.1 * count, 25000.0); + + std::pair<key_type, value_type> *insert_vec = new std::pair<key_type, value_type>[count]; + Alex *alex = new Alex(); + + size_t cnt = 0; + record rec; + while (cnt < count && next_record(file, rec)) { + insert_vec[cnt] = {rec.key, rec.value}; + cnt++; + } + + std::sort(insert_vec, insert_vec + count); + + alex->bulk_load(insert_vec, count); + delete[] insert_vec; + + return alex; +} + + +static void alex_rq_insert(Alex &alex, std::fstream &file, size_t insert_cnt, double delete_prop, std::vector<record> &to_delete, bool binary=false) { + size_t delete_cnt = insert_cnt * delete_prop; + + size_t applied_deletes = 0; + size_t applied_inserts = 0; + + size_t BATCH=1000; + + std::vector<record> insert_vec; + std::vector<record> delete_vec; + insert_vec.reserve(BATCH); + 
delete_vec.reserve(BATCH*delete_prop); + + size_t delete_idx = 0; + + bool continue_benchmark = true; + + size_t total_time = 0; + + while (applied_inserts < insert_cnt && continue_benchmark) { + continue_benchmark = build_insert_vec(file, insert_vec, BATCH, delete_prop, to_delete, binary); + progress_update((double) applied_inserts / (double) insert_cnt, "inserting:"); + if (applied_deletes < delete_cnt) { + build_delete_vec(to_delete, delete_vec, BATCH*delete_prop); + delete_idx = 0; + } + + if (insert_vec.size() == 0) { + break; + } + + auto insert_start = std::chrono::high_resolution_clock::now(); + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (applied_deletes < delete_cnt && delete_idx < delete_vec.size() && gsl_rng_uniform(g_rng) < delete_prop) { + alex.erase_one(delete_vec[delete_idx++].key); + applied_deletes++; + } + + // insert the record; + alex.insert(insert_vec[i].key, insert_vec[i].value); + applied_inserts++; + } + auto insert_stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(insert_stop - insert_start).count(); + } + + progress_update(1.0, "inserting:"); + + size_t throughput = (((double) (applied_inserts + applied_deletes) / (double) total_time) * 1e9); + + fprintf(stdout, "%ld\t", throughput); +} + + + +static void alex_rq_bench(Alex &alex, std::vector<query> queries, size_t trial_cnt=1) +{ + char progbuf[25]; + sprintf(progbuf, "sampling:"); + + size_t batch_size = 100; + size_t batches = trial_cnt / batch_size; + size_t total_time = 0; + + std::vector<record> result_set; + + for (int i=0; i<trial_cnt; i++) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + auto ptr = alex.find(queries[j].lower_bound); + while (ptr != alex.end() && ptr.key() <= queries[j].upper_bound) { + result_set.push_back({ptr.key(), ptr.payload()}); + ptr++; + } + result_set.clear(); + } + auto stop = 
std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + size_t latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", latency); +} + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: alex_rq_bench <filename> <record_count> <delete_proportion> <query_file>\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + + size_t buffer_cap = 12000; + size_t scale_factor = 6; + double max_delete_prop = delete_prop; + bool use_osm = false; + + double insert_batch = 0.8; + + init_bench_env(record_count, true, use_osm); + auto queries = read_range_queries<query>(qfilename, .0001); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<record> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + auto alex = warmup(datafile, warmup_cnt, delete_prop, to_delete, true, true); + + fprintf(stderr, "Size: %ld\n", alex->size()); + size_t insert_cnt = record_count - warmup_cnt; + + alex_rq_insert(*alex, datafile, insert_cnt, delete_prop, to_delete, true); + size_t memory_usage = alex->model_size() + alex->data_size(); + + fprintf(stderr, "Size: %ld\n", alex->size()); + fprintf(stdout, "%ld\t", memory_usage); + + alex_rq_bench(*alex, queries); + fprintf(stdout, "\n"); + + delete_bench_env(); + delete alex; + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/alias_wss_bench.cpp b/benchmarks/old-bench/alias_wss_bench.cpp new file mode 100644 index 0000000..a3a43f2 --- /dev/null +++ b/benchmarks/old-bench/alias_wss_bench.cpp @@ -0,0 +1,57 @@ +/* + * benchmarks/alias_wss_bench.cpp + * + * Copyright (C) 2023 
Douglas Rumbaugh <drumbaugh@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#include "include/bench.h" + +int main(int argc, char **argv) +{ + if (argc < 4) { + fprintf(stderr, "Usage: sampling_tput <filename> <record_count> <delete_proportion> [osm_data]\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + size_t buffer_cap = 12000; + size_t scale_factor = 6; + double delete_prop = atof(argv[3]); + double max_delete_prop = (delete_prop > 0) ? delete_prop : 1; + bool use_osm = (argc == 5) ? atoi(argv[4]) : 0; + + double insert_batch = 0.1; + + init_bench_env(record_count, true, use_osm); + + auto de_wss = ExtendedWSS(buffer_cap, scale_factor, max_delete_prop); + + std::fstream datafile; + datafile.open(filename, std::ios::in); + + std::vector<WRec> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup<ExtendedWSS, WRec>(datafile, de_wss, warmup_cnt, delete_prop, to_delete); + + size_t insert_cnt = record_count - warmup_cnt; + + std::vector<de::wss_query_parms<WRec>> queries(1); + queries[0].rng = g_rng; + queries[0].sample_size = 1000; + + insert_tput_bench<ExtendedWSS, WRec>(de_wss, datafile, insert_cnt, delete_prop, to_delete); + query_latency_bench<ExtendedWSS, WRec, de::wss_query_parms<WRec>>(de_wss, queries, 1000); + fprintf(stdout, "\n"); + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/btree_irs_bench.cpp b/benchmarks/old-bench/btree_irs_bench.cpp new file mode 100644 index 0000000..862fc6b --- /dev/null +++ b/benchmarks/old-bench/btree_irs_bench.cpp @@ -0,0 +1,91 @@ +#include "include/bench.h" +#include "ds/BTree.h" + +static void btree_sample_bench(TreeMap &tree, std::vector<de::irs_query_parms<btree_record>> queries, size_t trial_cnt=10) +{ + char progbuf[25]; + 
sprintf(progbuf, "sampling:"); + + size_t batch_size = 100; + size_t batches = trial_cnt / batch_size; + size_t total_time = 0; + + std::vector<key_type> sample_set; + sample_set.reserve(queries[0].sample_size); + + for (int i=0; i<trial_cnt; i++) { + progress_update((double) (i * batch_size) / (double) trial_cnt, progbuf); + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + tree.range_sample(queries[j].lower_bound, queries[j].upper_bound, queries[j].sample_size, sample_set, g_rng); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + progress_update(1.0, progbuf); + + size_t latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", latency); +} + + + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: btree_irs_bench <filename> <record_count> <delete_proportion> <query_file>\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + + size_t buffer_cap = 12000; + size_t scale_factor = 6; + double max_delete_prop = delete_prop; + bool use_osm = false; + + double insert_batch = 0.1; + + init_bench_env(record_count, true, use_osm); + auto queries = read_range_queries<de::irs_query_parms<btree_record>>(qfilename, .001); + + for (auto &q: queries) { + q.rng = g_rng; + q.sample_size = 1000; + } + + auto btree = TreeMap(); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<btree_record> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup<TreeMap, btree_record>(datafile, btree, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = 
record_count - warmup_cnt; + + insert_tput_bench<TreeMap, btree_record>(btree, datafile, insert_cnt, delete_prop, to_delete, true); + size_t memory_usage = btree.get_stats().inner_nodes * tlx::btree_default_traits<key_type, btree_record>::inner_slots * (sizeof(key_type) + sizeof(void*)); + memory_usage += btree.get_stats().leaves * tlx::btree_default_traits<key_type, btree_record>::leaf_slots * sizeof(btree_record); + fprintf(stdout, "%ld\t", memory_usage); + + btree_sample_bench(btree, queries); + fprintf(stdout, "\n"); + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/btree_rq_bench.cpp b/benchmarks/old-bench/btree_rq_bench.cpp new file mode 100644 index 0000000..d92b45d --- /dev/null +++ b/benchmarks/old-bench/btree_rq_bench.cpp @@ -0,0 +1,90 @@ +#include "include/bench.h" +#include "ds/BTree.h" + +static void btree_rq_bench(TreeMap &tree, std::vector<de::ISAMRangeQueryParms<btree_record>> queries, size_t trial_cnt=1) +{ + char progbuf[25]; + sprintf(progbuf, "sampling:"); + + size_t batch_size = 100; + size_t batches = trial_cnt / batch_size; + size_t total_time = 0; + + std::vector<btree_record> result_set; + + for (int i=0; i<trial_cnt; i++) { + progress_update((double) (i * batch_size) / (double) trial_cnt, progbuf); + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + auto ptr = tree.find(queries[j].lower_bound); + while (ptr != tree.end() && ptr->key <= queries[j].upper_bound) { + result_set.emplace_back(*ptr); + ptr++; + } + result_set.clear(); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + progress_update(1.0, progbuf); + + size_t latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", latency); +} + + + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: 
btree_rq_bench <filename> <record_count> <delete_proportion> <query_file>\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + + size_t buffer_cap = 12000; + size_t scale_factor = 6; + double max_delete_prop = delete_prop; + bool use_osm = false; + + double insert_batch = 0.1; + + init_bench_env(record_count, true, use_osm); + auto queries = read_range_queries<de::ISAMRangeQueryParms<btree_record>>(qfilename, .0001); + + auto btree = TreeMap(); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<btree_record> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup<TreeMap, btree_record>(datafile, btree, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + insert_tput_bench<TreeMap, btree_record>(btree, datafile, insert_cnt, delete_prop, to_delete, true); + size_t memory_usage = btree.get_stats().inner_nodes * tlx::btree_default_traits<key_type, btree_record>::inner_slots * (sizeof(key_type) + sizeof(void*)); + memory_usage += btree.get_stats().leaves * tlx::btree_default_traits<key_type, btree_record>::leaf_slots * sizeof(btree_record); + fprintf(stdout, "%ld\t", memory_usage); + + btree_rq_bench(btree, queries); + fprintf(stdout, "\n"); + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/include/bench.h b/benchmarks/old-bench/include/bench.h new file mode 100644 index 0000000..586ff12 --- /dev/null +++ b/benchmarks/old-bench/include/bench.h @@ -0,0 +1,162 @@ +/* + * benchmarks/include/bench.h + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. 
+ * + */ +#pragma once + +#include "bench_utility.h" + +template <typename DE, de::RecordInterface R, bool PROGRESS=true, size_t BATCH=1000> +static bool insert_tput_bench(DE &de_index, std::fstream &file, size_t insert_cnt, + double delete_prop, std::vector<R> &to_delete, bool binary=false) { + + size_t delete_cnt = insert_cnt * delete_prop; + + size_t applied_deletes = 0; + size_t applied_inserts = 0; + + std::vector<R> insert_vec; + std::vector<R> delete_vec; + insert_vec.reserve(BATCH); + delete_vec.reserve(BATCH*delete_prop); + + size_t delete_idx = 0; + + bool continue_benchmark = true; + + size_t total_time = 0; + + while (applied_inserts < insert_cnt && continue_benchmark) { + continue_benchmark = build_insert_vec(file, insert_vec, BATCH, delete_prop, to_delete, binary); + if (applied_deletes < delete_cnt) { + build_delete_vec(to_delete, delete_vec, BATCH*delete_prop); + delete_idx = 0; + } + + if (insert_vec.size() == 0) { + break; + } + + if constexpr (PROGRESS) { + progress_update((double) applied_inserts / (double) insert_cnt, "inserting:"); + } + + auto insert_start = std::chrono::high_resolution_clock::now(); + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (applied_deletes < delete_cnt && delete_idx < delete_vec.size() && gsl_rng_uniform(g_rng) < delete_prop) { + if constexpr (std::is_same_v<TreeMap, DE>) { + de_index.erase_one(delete_vec[delete_idx++].key); + } else if constexpr (std::is_same_v<MTree, DE>) { + de_index.remove(delete_vec[delete_idx++]); + } else { + de_index.erase(delete_vec[delete_idx++]); + } + applied_deletes++; + } + + // insert the record; + if constexpr (std::is_same_v<MTree, DE>) { + de_index.add(insert_vec[i]); + } else { + de_index.insert(insert_vec[i]); + } + applied_inserts++; + } + auto insert_stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(insert_stop - insert_start).count(); + } + + if constexpr (PROGRESS) { + 
progress_update(1.0, "inserting:"); + } + + size_t throughput = (((double) (applied_inserts + applied_deletes) / (double) total_time) * 1e9); + + fprintf(stdout, "%ld\t", throughput); + reset_de_perf_metrics(); + + return continue_benchmark; +} + +template <typename DE, de::RecordInterface R, typename QP, bool PROGRESS=true> +static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t trial_cnt=1) { + char progbuf[25]; + if constexpr (PROGRESS) { + sprintf(progbuf, "querying:"); + } + + size_t total_time = 0; + size_t total_results = 0; + + for (size_t i=0; i<trial_cnt; i++) { + if constexpr (PROGRESS) { + progress_update((double) (i) / (double) trial_cnt, progbuf); + } + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + auto res = de_index.query(&queries[j]); + + total_results += res.size(); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + progress_update(1.0, progbuf); + + size_t query_latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", query_latency); + fflush(stdout); + + return true; +} + + +template <typename Shard, de::RecordInterface R, typename QP, QueryInterface Q, bool PROGRESS=true> +static bool static_latency_bench(Shard *shard, std::vector<QP> queries, size_t trial_cnt=100) { + char progbuf[25]; + if constexpr (PROGRESS) { + sprintf(progbuf, "querying:"); + } + + size_t total_time = 0; + size_t total_results = 0; + + for (size_t i=0; i<trial_cnt; i++) { + if constexpr (PROGRESS) { + progress_update((double) (i) / (double) trial_cnt, progbuf); + } + + std::vector<void *> states(1); + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + states[0] = Q::get_query_state(shard, &queries[j]); + Q::process_query_states(&queries[j], states, nullptr); + auto res = Q::query(shard, states[0], 
&queries[j]); + total_results += res.size(); + Q::delete_query_state(states[0]); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + progress_update(1.0, progbuf); + + size_t query_latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", query_latency); + fflush(stdout); + + return true; +} diff --git a/benchmarks/old-bench/include/bench_utility.h b/benchmarks/old-bench/include/bench_utility.h new file mode 100644 index 0000000..e33b93d --- /dev/null +++ b/benchmarks/old-bench/include/bench_utility.h @@ -0,0 +1,181 @@ +/* + * benchmarks/include/bench_utility.h + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include "framework/DynamicExtension.h" +#include "shard/WSS.h" +#include "shard/MemISAM.h" +#include "shard/PGM.h" +#include "shard/TrieSpline.h" +#include "shard/WIRS.h" +#include "ds/BTree.h" +#include "shard/VPTree.h" +#include "mtree.h" +#include "standalone_utility.h" + +#include <cstdlib> +#include <cstdio> +#include <chrono> +#include <algorithm> +#include <numeric> +#include <memory> +#include <iostream> +#include <fstream> +#include <sstream> +#include <unordered_set> +#include <set> +#include <string> +#include <random> + +typedef uint64_t key_type; +typedef uint64_t value_type; +typedef uint64_t weight_type; + +typedef de::WeightedRecord<key_type, value_type, weight_type> WRec; +typedef de::Record<key_type, value_type> Rec; + +const size_t W2V_SIZE = 300; +typedef de::EuclidPoint<double, W2V_SIZE> Word2VecRec; + +typedef de::DynamicExtension<WRec, de::WSS<WRec>, de::WSSQuery<WRec>> ExtendedWSS; +typedef de::DynamicExtension<Rec, de::TrieSpline<Rec>, de::TrieSplineRangeQuery<Rec>> ExtendedTSRQ; +typedef de::DynamicExtension<Rec, de::PGM<Rec>, de::PGMRangeQuery<Rec>> ExtendedPGMRQ; +typedef 
de::DynamicExtension<Rec, de::PGM<Rec>, de::PGMPointLookup<Rec>> ExtendedPGM_PL; +typedef de::DynamicExtension<Rec, de::MemISAM<Rec>, de::IRSQuery<Rec>> ExtendedISAM_IRS; +typedef de::DynamicExtension<Rec, de::MemISAM<Rec>, de::ISAMRangeQuery<Rec>> ExtendedISAM_RQ; +typedef de::DynamicExtension<Word2VecRec, de::VPTree<Word2VecRec>, de::KNNQuery<Word2VecRec>> ExtendedVPTree_KNN; + +struct euclidean_distance { + double operator()(const Word2VecRec &first, const Word2VecRec &second) const { + double dist = 0; + for (size_t i=0; i<W2V_SIZE; i++) { + dist += (first.data[i] - second.data[i]) * (first.data[i] - second.data[i]); + } + + return std::sqrt(dist); + } +}; + + +struct cosine_similarity { + double operator()(const Word2VecRec &first, const Word2VecRec &second) const { + double prod = 0; + double asquared = 0; + double bsquared = 0; + + for (size_t i=0; i<W2V_SIZE; i++) { + prod += first.data[i] * second.data[i]; + asquared += first.data[i]*first.data[i]; + bsquared += second.data[i]*second.data[i]; + } + + return prod / std::sqrt(asquared * bsquared); + } +}; + +typedef tlx::BTree<key_type, btree_record, btree_key_extract> TreeMap; +typedef mt::mtree<Word2VecRec, euclidean_distance> MTree; + +template <de::RecordInterface R> +static bool build_insert_vec(std::fstream &file, std::vector<R> &vec, size_t n, + double delete_prop, std::vector<R> &to_delete, bool binary=false) { + vec.clear(); + for (size_t i=0; i<n; i++) { + R rec; + if constexpr (std::is_same_v<R, Word2VecRec>) { + if (!next_vector_record(file, rec)) { + if (i == 0) { + return false; + } + + break; + } + } else { + if (!next_record(file, rec, binary)) { + if (i == 0) { + return false; + } + + break; + } + } + + vec.emplace_back(rec); + + if (gsl_rng_uniform(g_rng) < delete_prop + (delete_prop * .1)) { + to_delete.emplace_back(rec); + } + } + + return true; +} + + +template <typename DE, de::RecordInterface R> +static bool warmup(std::fstream &file, DE &extended_index, size_t count, + double 
delete_prop, std::vector<R> to_delete, bool progress=true, bool binary=false) { + size_t batch = std::min(.1 * count, 25000.0); + + std::vector<R> insert_vec; + std::vector<R> delete_vec; + insert_vec.reserve(batch); + delete_vec.reserve(batch*delete_prop); + + size_t inserted = 0; + size_t delete_idx = 0; + + double last_percent = 0; + while (inserted < count) { + // Build vector of records to insert and potentially delete + auto continue_warmup = build_insert_vec(file, insert_vec, batch, delete_prop, to_delete, binary); + if (inserted > batch) { + build_delete_vec(to_delete, delete_vec, batch*delete_prop); + delete_idx = 0; + } + + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (delete_idx < delete_vec.size() && gsl_rng_uniform(g_rng) < delete_prop) { + if constexpr (std::is_same_v<TreeMap, DE>) { + extended_index.erase_one(delete_vec[delete_idx++].key); + } + else if constexpr (std::is_same_v<MTree, DE>) { + extended_index.remove(delete_vec[delete_idx++]); + } else { + extended_index.erase(delete_vec[delete_idx++]); + } + } + + // insert the record; + if constexpr (std::is_same_v<MTree, DE>) { + extended_index.add(insert_vec[i]); + } else { + extended_index.insert(insert_vec[i]); + } + inserted++; + + if (progress) { + progress_update((double) inserted / (double) count, "warming up:"); + } + } + } + + return true; +} + + +static void reset_de_perf_metrics() { + + /* + * rejection counters are zeroed automatically by the + * sampling function itself. 
+ */ + + RESET_IO_CNT(); +} diff --git a/benchmarks/old-bench/include/standalone_utility.h b/benchmarks/old-bench/include/standalone_utility.h new file mode 100644 index 0000000..727daa5 --- /dev/null +++ b/benchmarks/old-bench/include/standalone_utility.h @@ -0,0 +1,240 @@ +#include <cstdlib> +#include <cstdio> +#include <iostream> +#include <fstream> +#include <sstream> +#include <string> +#include <gsl/gsl_rng.h> +#include <cstring> +#include <vector> + +typedef uint64_t key_type; +typedef uint64_t value_type; +typedef uint64_t weight_type; + +static gsl_rng *g_rng; +static bool g_osm_data; + +struct btree_record { + key_type key; + value_type value; + + inline bool operator<(const btree_record& other) const { + return key < other.key || (key == other.key && value < other.value); + } + + inline bool operator==(const btree_record& other) const { + return key == other.key && value == other.value; + } +}; + +struct btree_key_extract { + static const key_type &get(const btree_record &v) { + return v.key; + } +}; + +static key_type g_min_key = UINT64_MAX; +static key_type g_max_key = 0; + +static size_t g_max_record_cnt = 0; +static size_t g_reccnt = 0; + +static constexpr unsigned int DEFAULT_SEED = 0; + +static unsigned int get_random_seed() +{ + unsigned int seed = 0; + std::fstream urandom; + urandom.open("/dev/urandom", std::ios::in|std::ios::binary); + urandom.read((char *) &seed, sizeof(seed)); + urandom.close(); + + return seed; +} + +static key_type osm_to_key(const char *key_field) { + double tmp_key = (atof(key_field) + 180) * 10e6; + return (key_type) tmp_key; +} + +static void init_bench_rng(unsigned int seed, const gsl_rng_type *type) +{ + g_rng = gsl_rng_alloc(type); + gsl_rng_set(g_rng, seed); +} + +static void init_bench_env(size_t max_reccnt, bool random_seed, bool osm_correction=true) +{ + unsigned int seed = (random_seed) ? 
/*
 * Load range queries from a text file.
 *
 * The file is read as a sequence of whitespace-separated triples
 * "<start> <stop> <selectivity>". A triple becomes a query when
 * start < stop and its selectivity is within 0.1 of `selectivity`.
 *
 * QP must have lower_bound and upper_bound members, which are the only
 * members this function sets; any other query parameters must be
 * initialised by the caller afterwards.
 *
 * Reading stops at end of file or at the first entry that cannot be parsed
 * (previously a malformed entry made the loop spin forever). If the file
 * cannot be opened, a message is written to stderr and an empty vector is
 * returned instead of passing a NULL stream to fscanf.
 */
template <typename QP>
static std::vector<QP> read_range_queries(std::string fname, double selectivity) {
    std::vector<QP> queries;

    FILE *qf = fopen(fname.c_str(), "r");
    if (qf == NULL) {
        fprintf(stderr, "Unable to open query file: %s\n", fname.c_str());
        return queries;
    }

    size_t start, stop;
    double sel;
    // fscanf returns the number of converted fields; anything but 3 means
    // end of input or a malformed entry
    while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) == 3) {
        if (start < stop && std::abs(sel - selectivity) < 0.1) {
            QP q;
            q.lower_bound = start;
            q.upper_bound = stop;
            queries.push_back(q);
        }
    }
    fclose(qf);

    return queries;
}
+ */ +template <typename R> +static bool next_vector_record(std::fstream &file, R &record, bool binary=false) { + std::string line; + if (std::getline(file, line, '\n')) { + std::stringstream line_stream(line); + for (size_t i=0; i<300; i++) { + std::string dimension; + + std::getline(line_stream, dimension, ' '); + record.data[i] = atof(dimension.c_str()); + } + + g_reccnt++; + + return true; + } + + return false; + +} + +template <typename R> +static bool next_record(std::fstream &file, R &record, bool binary=false) +{ + static value_type value = 1; + if (g_reccnt >= g_max_record_cnt) return false; + + if (binary) { + if (file.good()) { + decltype(R::key) key; + + file.read((char*) &key, sizeof(key)); + record.key = key; + record.value = value; + value++; + + if (record.key < g_min_key) g_min_key = record.key; + if (record.key > g_max_key) g_max_key = record.key; + + return true; + } + + return false; + } + + std::string line; + if (std::getline(file, line, '\n')) { + std::stringstream line_stream(line); + std::string key_field; + std::string value_field; + std::string weight_field; + + std::getline(line_stream, value_field, '\t'); + std::getline(line_stream, key_field, '\t'); + std::getline(line_stream, weight_field, '\t'); + + record.key = (g_osm_data) ? 
osm_to_key(key_field.c_str()) : atol(key_field.c_str()); + record.value = atol(value_field.c_str()); + + if (record.key < g_min_key) g_min_key = record.key; + if (record.key > g_max_key) g_max_key = record.key; + + g_reccnt++; + + return true; + } + + return false; +} + +template <typename R> +static bool build_delete_vec(std::vector<R> &to_delete, std::vector<R> &vec, size_t n) { + vec.clear(); + + size_t cnt = 0; + while (cnt < n) { + if (to_delete.size() == 0) { + return false; + } + + auto i = gsl_rng_uniform_int(g_rng, to_delete.size()); + vec.emplace_back(to_delete[i]); + to_delete.erase(to_delete.begin() + i); + } +td: + return true; +} diff --git a/benchmarks/old-bench/isam_irs_bench.cpp b/benchmarks/old-bench/isam_irs_bench.cpp new file mode 100644 index 0000000..96525f0 --- /dev/null +++ b/benchmarks/old-bench/isam_irs_bench.cpp @@ -0,0 +1,64 @@ +#include "include/bench.h" + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: isam_irs_bench <filename> <record_count> <delete_proportion> <query_file>\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + + size_t buffer_cap = 12000; + size_t scale_factor = 6; + double max_delete_prop = delete_prop; + bool use_osm = false; + + double insert_batch = 0.1; + + init_bench_env(record_count, true, use_osm); + auto queries = read_range_queries<de::irs_query_parms<Rec>>(qfilename, .001); + + for (auto &q: queries) { + q.rng = g_rng; + q.sample_size = 1000; + } + + auto de_irs = ExtendedISAM_IRS(buffer_cap, scale_factor, max_delete_prop); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<Rec> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup<ExtendedISAM_IRS, Rec>(datafile, 
de_irs, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + insert_tput_bench<ExtendedISAM_IRS, Rec>(de_irs, datafile, insert_cnt, delete_prop, to_delete, true); + fprintf(stdout, "%ld\t", de_irs.get_memory_usage()); + query_latency_bench<ExtendedISAM_IRS, Rec, de::irs_query_parms<Rec>>(de_irs, queries); + fprintf(stdout, "\n"); + + auto ts = de_irs.create_static_structure(); + + fprintf(stdout, "%ld\t", ts->get_memory_usage()); + static_latency_bench<de::MemISAM<Rec>, Rec, de::irs_query_parms<Rec>, de::IRSQuery<Rec>>( + ts, queries, 1 + ); + fprintf(stdout, "\n"); + + delete ts; + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/isam_rq_bench.cpp b/benchmarks/old-bench/isam_rq_bench.cpp new file mode 100644 index 0000000..bb5626e --- /dev/null +++ b/benchmarks/old-bench/isam_rq_bench.cpp @@ -0,0 +1,59 @@ +#include "include/bench.h" + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: isam_rq_bench <filename> <record_count> <delete_proportion> <query_file>\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + + size_t buffer_cap = 12000; + size_t scale_factor = 6; + double max_delete_prop = delete_prop; + bool use_osm = false; + + double insert_batch = 0.1; + + init_bench_env(record_count, true, use_osm); + auto queries = read_range_queries<de::ISAMRangeQueryParms<Rec>>(qfilename, .0001); + + auto de_isam_rq = ExtendedISAM_RQ(buffer_cap, scale_factor, max_delete_prop); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<Rec> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup<ExtendedISAM_RQ, Rec>(datafile, 
de_isam_rq, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + insert_tput_bench<ExtendedISAM_RQ, Rec>(de_isam_rq, datafile, insert_cnt, delete_prop, to_delete, true); + fprintf(stdout, "%ld\t", de_isam_rq.get_memory_usage()); + query_latency_bench<ExtendedISAM_RQ, Rec, de::ISAMRangeQueryParms<Rec>>(de_isam_rq, queries); + fprintf(stdout, "\n"); + + auto ts = de_isam_rq.create_static_structure(); + + fprintf(stdout, "%ld\t", ts->get_memory_usage()); + static_latency_bench<de::MemISAM<Rec>, Rec, de::ISAMRangeQueryParms<Rec>, de::ISAMRangeQuery<Rec>>( + ts, queries, 1 + ); + fprintf(stdout, "\n"); + + delete ts; + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/mtree_knn_bench.cpp b/benchmarks/old-bench/mtree_knn_bench.cpp new file mode 100644 index 0000000..9d4cc57 --- /dev/null +++ b/benchmarks/old-bench/mtree_knn_bench.cpp @@ -0,0 +1,83 @@ +#include "include/bench.h" +#include "mtree.h" + +static void mtree_knn_bench(MTree &tree, std::vector<de::KNNQueryParms<Word2VecRec>> queries, size_t trial_cnt=1) +{ + char progbuf[25]; + sprintf(progbuf, "sampling:"); + + size_t batch_size = 100; + size_t batches = trial_cnt / batch_size; + size_t total_time = 0; + + std::vector<Word2VecRec> result_set; + + for (int i=0; i<trial_cnt; i++) { + progress_update((double) (i * batch_size) / (double) trial_cnt, progbuf); + + std::vector<Word2VecRec> results; + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + results.clear(); + auto query_output = tree.get_nearest_by_limit(queries[j].point, queries[j].k); + auto itr = query_output.begin(); + while (itr != query_output.end()) { + results.emplace_back(itr->data); + itr++; + } + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + progress_update(1.0, 
progbuf); + + size_t latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", latency); +} + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: mtree_knn_bench <filename> <record_count> <delete_proportion> <query_file> [k]\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + size_t k = (argc == 6) ? atol(argv[5]) : 10; + + init_bench_env(record_count, true); + auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, k); + + auto mtree = MTree(); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<Word2VecRec> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = 0.1 * record_count; + warmup<MTree, Word2VecRec>(datafile, mtree, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + insert_tput_bench<MTree, Word2VecRec>(mtree, datafile, insert_cnt, delete_prop, to_delete, true); + // fprintf(stdout, "%ld\t", mtree.get_memory_usage()); + + mtree_knn_bench(mtree, queries); + fprintf(stdout, "\n"); + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/pgm_pl_bench.cpp b/benchmarks/old-bench/pgm_pl_bench.cpp new file mode 100644 index 0000000..f798861 --- /dev/null +++ b/benchmarks/old-bench/pgm_pl_bench.cpp @@ -0,0 +1,67 @@ +/* + * benchmarks/old-bench/pgm_pl_bench.cpp + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. 
+ * + */ +#include "include/bench.h" + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: pgm_pl_bench <filename> <record_count> <delete_proportion> <query_file> [osm_data]\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + size_t buffer_cap = 1000; + size_t scale_factor = 6; + double delete_prop = atof(argv[3]); + double max_delete_prop = (delete_prop > 0) ? delete_prop : 1; + std::string query_file = std::string(argv[4]); + bool use_osm = (argc == 6) ? atoi(argv[5]) : 0; + + double insert_batch = 0.1; + + init_bench_env(record_count, true, use_osm); + + auto de = ExtendedPGM_PL(buffer_cap, scale_factor, max_delete_prop); + auto queries = read_lookup_queries<de::PGMPointLookupParms<Rec>>(query_file, .0001); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<Rec> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup<ExtendedPGM_PL, Rec>(datafile, de, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + insert_tput_bench<ExtendedPGM_PL, Rec>(de, datafile, insert_cnt, delete_prop, to_delete, true); + fprintf(stdout, "%ld\t", de.get_memory_usage()); + query_latency_bench<ExtendedPGM_PL, Rec, de::PGMPointLookupParms<Rec>>(de, queries, 1); + + fprintf(stdout, "\n"); + + auto ts = de.create_static_structure(); + + fprintf(stdout, "%ld\t", ts->get_memory_usage()); + static_latency_bench<de::PGM<Rec>, Rec, de::PGMPointLookupParms<Rec>, de::PGMPointLookup<Rec>>( + ts, queries, 1 + ); + fprintf(stdout, "\n"); + + delete ts; + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/pgm_rq_bench.cpp b/benchmarks/old-bench/pgm_rq_bench.cpp new file mode 100644 index 0000000..e25d29f --- /dev/null +++ 
b/benchmarks/old-bench/pgm_rq_bench.cpp @@ -0,0 +1,67 @@ +/* + * benchmarks/old-bench/pgm_rq_bench.cpp + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#include "include/bench.h" + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: pgm_rq_bench <filename> <record_count> <delete_proportion> <query_file> [osm_data]\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + size_t buffer_cap = 12000; + size_t scale_factor = 8; + double delete_prop = atof(argv[3]); + double max_delete_prop = (delete_prop > 0) ? delete_prop : 1; + std::string query_file = std::string(argv[4]); + bool use_osm = (argc == 6) ? atoi(argv[5]) : 0; + + double insert_batch = 0.5; + + init_bench_env(record_count, true, use_osm); + + auto de = ExtendedPGMRQ(buffer_cap, scale_factor, max_delete_prop); + auto queries = read_range_queries<de::pgm_range_query_parms<Rec>>(query_file, .0001); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<Rec> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup<ExtendedPGMRQ, Rec>(datafile, de, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + insert_tput_bench<ExtendedPGMRQ, Rec>(de, datafile, insert_cnt, delete_prop, to_delete, true); + fprintf(stdout, "%ld\t", de.get_memory_usage()); + query_latency_bench<ExtendedPGMRQ, Rec, de::pgm_range_query_parms<Rec>>(de, queries, 1); + + fprintf(stdout, "\n"); + + auto ts = de.create_static_structure(); + + fprintf(stdout, "%ld\t", ts->get_memory_usage()); + static_latency_bench<de::PGM<Rec>, Rec, de::pgm_range_query_parms<Rec>, de::PGMRangeQuery<Rec>>( + ts, queries, 1 + ); + fprintf(stdout, "\n"); + + delete ts; + + 
delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/test.cpp b/benchmarks/old-bench/test.cpp new file mode 100644 index 0000000..75bffe3 --- /dev/null +++ b/benchmarks/old-bench/test.cpp @@ -0,0 +1,7 @@ +#include "alex.h" + + +int main(int argc, char **argv) { + alex::Alex<int, int> test; + +} diff --git a/benchmarks/old-bench/triespline_rq_bench.cpp b/benchmarks/old-bench/triespline_rq_bench.cpp new file mode 100644 index 0000000..967c3b0 --- /dev/null +++ b/benchmarks/old-bench/triespline_rq_bench.cpp @@ -0,0 +1,66 @@ +/* + * benchmarks/triespline_rq_bench.cpp + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#include "include/bench.h" + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: triespline_rq_bench <filename> <record_count> <delete_proportion> <query_file> [osm_data]\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + size_t buffer_cap = 12000; + size_t scale_factor = 8; + double delete_prop = atof(argv[3]); + double max_delete_prop = (delete_prop > 0) ? delete_prop : 1; + std::string query_file = std::string(argv[4]); + bool use_osm = (argc == 6) ? 
atoi(argv[5]) : 0; + + double insert_batch = 0.5; + + init_bench_env(record_count, true, use_osm); + + auto de = ExtendedTSRQ(buffer_cap, scale_factor, max_delete_prop); + auto queries = read_range_queries<de::ts_range_query_parms<Rec>>(query_file, .0001); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<Rec> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup<ExtendedTSRQ, Rec>(datafile, de, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + insert_tput_bench<ExtendedTSRQ, Rec>(de, datafile, insert_cnt, delete_prop, to_delete, true); + fprintf(stdout, "%ld\t", de.get_memory_usage()); + query_latency_bench<ExtendedTSRQ, Rec, de::ts_range_query_parms<Rec>>(de, queries, 1); + fprintf(stdout, "\n"); + + auto ts = de.create_static_structure(); + + fprintf(stdout, "%ld\t", ts->get_memory_usage()); + static_latency_bench<de::TrieSpline<Rec>, Rec, de::ts_range_query_parms<Rec>, de::TrieSplineRangeQuery<Rec>>( + ts, queries, 1 + ); + fprintf(stdout, "\n"); + + delete ts; + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/upgm_pl_bench.cpp b/benchmarks/old-bench/upgm_pl_bench.cpp new file mode 100644 index 0000000..e0445b2 --- /dev/null +++ b/benchmarks/old-bench/upgm_pl_bench.cpp @@ -0,0 +1,212 @@ +#include "pgm/pgm_index_dynamic.hpp" +#include "include/standalone_utility.h" + +typedef uint64_t key_type; +typedef uint64_t value_type; + +typedef pgm::DynamicPGMIndex<key_type, value_type, pgm::PGMIndex<key_type, 64>> PGM; + +struct record { + key_type key; + value_type value; +}; + +struct query { + key_type lower_bound; + key_type upper_bound; +}; + +template <typename R> +static bool build_insert_vec(std::fstream &file, std::vector<R> &vec, size_t n, + double delete_prop, std::vector<R> 
&to_delete, bool binary=false) { + vec.clear(); + for (size_t i=0; i<n; i++) { + R rec; + if (!next_record(file, rec, binary)) { + if (i == 0) { + return false; + } + + break; + } + + vec.emplace_back(rec); + + if (gsl_rng_uniform(g_rng) < delete_prop + (delete_prop * .1)) { + to_delete.emplace_back(rec); + } + } + + return true; +} + + +/* + * Warm up pgm by streaming records from file into it in batches, erasing a + * previously sampled record before an insert with probability delete_prop. + * Stops once at least count records have been inserted or the file runs + * out of records; returns true only in the former case. + * + * NOTE(review): to_delete is taken by value, so the deletion candidates + * gathered here never reach the caller's vector -- confirm whether a + * reference was intended. The progress flag is currently unused. + */ +static bool warmup(std::fstream &file, PGM &pgm, size_t count, + double delete_prop, std::vector<record> to_delete, bool progress=true, bool binary=false) { + // a batch size of 0 (count < 10) would read nothing and never terminate + size_t batch = std::max<size_t>(1, static_cast<size_t>(std::min(.1 * count, 25000.0))); + + std::vector<record> insert_vec; + std::vector<record> delete_vec; + insert_vec.reserve(batch); + delete_vec.reserve(batch*delete_prop); + + size_t inserted = 0; + size_t delete_idx = 0; + + while (inserted < count) { + // Build vector of records to insert and potentially delete + auto continue_warmup = build_insert_vec<record>(file, insert_vec, batch, delete_prop, to_delete, binary); + + // the file is exhausted: bail out instead of spinning on an empty batch + if (!continue_warmup || insert_vec.size() == 0) { + break; + } + + if (inserted > batch) { + build_delete_vec(to_delete, delete_vec, batch*delete_prop); + delete_idx = 0; + } + + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (delete_idx < delete_vec.size() && gsl_rng_uniform(g_rng) < delete_prop) { + pgm.erase(delete_vec[delete_idx++].key); + } + + pgm.insert_or_assign(insert_vec[i].key, insert_vec[i].value); + inserted++; + progress_update((double) inserted / (double) count, "warming up:"); + } + } + + return inserted >= count; +} + + +static void pgm_rq_insert(PGM &pgm, std::fstream &file, size_t insert_cnt, double delete_prop, std::vector<record> &to_delete, bool binary=false) { + size_t delete_cnt = insert_cnt * delete_prop; + + size_t applied_deletes = 0; + size_t applied_inserts = 0; + + size_t BATCH=1000; + + std::vector<record> insert_vec; + std::vector<record> delete_vec; + insert_vec.reserve(BATCH); + delete_vec.reserve(BATCH*delete_prop); + + size_t delete_idx = 0; + + bool continue_benchmark = true; + + size_t total_time = 0; + + while 
(applied_inserts < insert_cnt && continue_benchmark) { + continue_benchmark = build_insert_vec(file, insert_vec, BATCH, delete_prop, to_delete, binary); + progress_update((double) applied_inserts / (double) insert_cnt, "inserting:"); + if (applied_deletes < delete_cnt) { + build_delete_vec(to_delete, delete_vec, BATCH*delete_prop); + delete_idx = 0; + } + + if (insert_vec.size() == 0) { + break; + } + + auto insert_start = std::chrono::high_resolution_clock::now(); + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (applied_deletes < delete_cnt && delete_idx < delete_vec.size() && gsl_rng_uniform(g_rng) < delete_prop) { + pgm.erase(delete_vec[delete_idx++].key); + applied_deletes++; + } + + // insert the record; + pgm.insert_or_assign(insert_vec[i].key, insert_vec[i].value); + applied_inserts++; + } + auto insert_stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(insert_stop - insert_start).count(); + } + + progress_update(1.0, "inserting:"); + + size_t throughput = (((double) (applied_inserts + applied_deletes) / (double) total_time) * 1e9); + + fprintf(stdout, "%ld\t", throughput); +} + + + +static void pgm_pl_bench(PGM &pgm, std::vector<query> queries, size_t trial_cnt=1) +{ + char progbuf[25]; + sprintf(progbuf, "sampling:"); + + size_t batch_size = 100; + size_t batches = trial_cnt / batch_size; + size_t total_time = 0; + + std::vector<record> result_set; + + for (int i=0; i<trial_cnt; i++) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + auto ptr = pgm.find(queries[j].lower_bound); + if (ptr != pgm.end() && ptr->first == queries[j].lower_bound) { + result_set.push_back({ptr->first, ptr->second}); + } + result_set.clear(); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + size_t latency = 
total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", latency); +} + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: upgm_pl_bench <filename> <record_count> <delete_proportion> <query_file>\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + + double insert_batch = 0.1; + + init_bench_env(record_count, true); + auto queries = read_range_queries<query>(qfilename, .0001); + + std::vector<std::pair<key_type, value_type>> data; + PGM pgm(data.begin(), data.end()); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<record> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup(datafile, pgm, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + pgm_rq_insert(pgm, datafile, insert_cnt, delete_prop, to_delete, true); + size_t memory_usage = pgm.size_in_bytes(); + fprintf(stdout, "%ld\t", memory_usage); + + pgm_pl_bench(pgm, queries); + fprintf(stdout, "\n"); + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/upgm_rq_bench.cpp b/benchmarks/old-bench/upgm_rq_bench.cpp new file mode 100644 index 0000000..940a9e6 --- /dev/null +++ b/benchmarks/old-bench/upgm_rq_bench.cpp @@ -0,0 +1,217 @@ +#include "pgm/pgm_index_dynamic.hpp" +#include "include/standalone_utility.h" + +typedef uint64_t key_type; +typedef uint64_t value_type; + +typedef pgm::DynamicPGMIndex<key_type, value_type, pgm::PGMIndex<key_type, 64>> PGM; + +struct record { + key_type key; + value_type value; +}; + +struct query { + key_type lower_bound; + key_type upper_bound; +}; + +template <typename R> +static bool 
build_insert_vec(std::fstream &file, std::vector<R> &vec, size_t n, + double delete_prop, std::vector<R> &to_delete, bool binary=false) { + vec.clear(); + for (size_t i=0; i<n; i++) { + R rec; + if (!next_record(file, rec, binary)) { + if (i == 0) { + return false; + } + + break; + } + + vec.emplace_back(rec); + + if (gsl_rng_uniform(g_rng) < delete_prop + (delete_prop * .1)) { + to_delete.emplace_back(rec); + } + } + + return true; +} + + +/* + * Warm up pgm by streaming records from file into it in batches, erasing a + * previously sampled record before an insert with probability delete_prop. + * Stops once at least count records have been inserted or the file runs + * out of records; returns true only in the former case. + * + * NOTE(review): to_delete is taken by value, so the deletion candidates + * gathered here never reach the caller's vector -- confirm whether a + * reference was intended. The progress flag is currently unused. + */ +static bool warmup(std::fstream &file, PGM &pgm, size_t count, + double delete_prop, std::vector<record> to_delete, bool progress=true, bool binary=false) { + // a batch size of 0 (count < 10) would read nothing and never terminate + size_t batch = std::max<size_t>(1, static_cast<size_t>(std::min(.1 * count, 25000.0))); + + std::vector<record> insert_vec; + std::vector<record> delete_vec; + insert_vec.reserve(batch); + delete_vec.reserve(batch*delete_prop); + + size_t inserted = 0; + size_t delete_idx = 0; + + while (inserted < count) { + // Build vector of records to insert and potentially delete + auto continue_warmup = build_insert_vec<record>(file, insert_vec, batch, delete_prop, to_delete, binary); + + // the file is exhausted: bail out instead of spinning on an empty batch + if (!continue_warmup || insert_vec.size() == 0) { + break; + } + + if (inserted > batch) { + build_delete_vec(to_delete, delete_vec, batch*delete_prop); + delete_idx = 0; + } + + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (delete_idx < delete_vec.size() && gsl_rng_uniform(g_rng) < delete_prop) { + pgm.erase(delete_vec[delete_idx++].key); + } + + pgm.insert_or_assign(insert_vec[i].key, insert_vec[i].value); + inserted++; + progress_update((double) inserted / (double) count, "warming up:"); + } + } + + return inserted >= count; +} + + +static void pgm_rq_insert(PGM &pgm, std::fstream &file, size_t insert_cnt, double delete_prop, std::vector<record> &to_delete, bool binary=false) { + size_t delete_cnt = insert_cnt * delete_prop; + + size_t applied_deletes = 0; + size_t applied_inserts = 0; + + size_t BATCH=1000; + + std::vector<record> insert_vec; + std::vector<record> delete_vec; + insert_vec.reserve(BATCH); + delete_vec.reserve(BATCH*delete_prop); + + 
size_t delete_idx = 0; + + bool continue_benchmark = true; + + size_t total_time = 0; + + while (applied_inserts < insert_cnt && continue_benchmark) { + continue_benchmark = build_insert_vec(file, insert_vec, BATCH, delete_prop, to_delete, binary); + progress_update((double) applied_inserts / (double) insert_cnt, "inserting:"); + if (applied_deletes < delete_cnt) { + build_delete_vec(to_delete, delete_vec, BATCH*delete_prop); + delete_idx = 0; + } + + if (insert_vec.size() == 0) { + break; + } + + auto insert_start = std::chrono::high_resolution_clock::now(); + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (applied_deletes < delete_cnt && delete_idx < delete_vec.size() && gsl_rng_uniform(g_rng) < delete_prop) { + pgm.erase(delete_vec[delete_idx++].key); + applied_deletes++; + } + + // insert the record; + pgm.insert_or_assign(insert_vec[i].key, insert_vec[i].value); + applied_inserts++; + } + auto insert_stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(insert_stop - insert_start).count(); + } + + progress_update(1.0, "inserting:"); + + size_t throughput = (((double) (applied_inserts + applied_deletes) / (double) total_time) * 1e9); + + fprintf(stdout, "%ld\t", throughput); +} + + + +/* + * Time range scans over pgm: for every query, walk the index from the + * first key >= lower_bound up to and including upper_bound, counting the + * records seen. Repeats the whole query set trial_cnt times and prints the + * mean latency per query in nanoseconds, followed by a tab, to stdout. + */ +static void pgm_rq_bench(PGM &pgm, std::vector<query> queries, size_t trial_cnt=1) +{ + size_t total_time = 0; + + //std::vector<record> result_set; + size_t tot = 0; + + for (size_t i=0; i<trial_cnt; i++) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + // lower_bound (not find) so the scan still starts correctly when + // the query's lower bound is not itself a key in the index + auto ptr = pgm.lower_bound(queries[j].lower_bound); + tot = 0; + while (ptr != pgm.end() && ptr->first <= queries[j].upper_bound) { + ++tot; + //result_set.push_back({ptr->first, ptr->second}); + ++ptr; + } + assert(tot > 0); + //result_set.clear(); + } + 
auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + // avoid dividing by zero when there are no trials or no queries + size_t query_cnt = trial_cnt * queries.size(); + size_t latency = (query_cnt > 0) ? total_time / query_cnt : 0; + + fprintf(stdout, "%ld\t", latency); +} + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: upgm_rq_bench <filename> <record_count> <delete_proportion> <query_file>\n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + + double insert_batch = 0.5; + + init_bench_env(record_count, true); + auto queries = read_range_queries<query>(qfilename, .0001); + + std::vector<std::pair<key_type, value_type>> data; + PGM pgm(data.begin(), data.end()); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<record> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup(datafile, pgm, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + pgm_rq_insert(pgm, datafile, insert_cnt, delete_prop, to_delete, true); + size_t memory_usage = pgm.size_in_bytes(); + fprintf(stdout, "%ld\t", memory_usage); + + pgm_rq_bench(pgm, queries); + fprintf(stdout, "\n"); + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/old-bench/vptree_knn_bench.cpp b/benchmarks/old-bench/vptree_knn_bench.cpp new file mode 100644 index 0000000..d8247e4 --- /dev/null +++ b/benchmarks/old-bench/vptree_knn_bench.cpp @@ -0,0 +1,58 @@ +#include "include/bench.h" + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: vptree_knn_bench <filename> <record_count> <delete_proportion> <query_file> [k]\n"); + exit(EXIT_FAILURE); + } + + std::string 
filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + size_t k = (argc == 6) ? atol(argv[5]) : 10; + + size_t buffer_cap = 12000; + size_t scale_factor = 6; + double max_delete_prop = delete_prop; + + init_bench_env(record_count, true); + auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, k); + + auto de_vp_knn = ExtendedVPTree_KNN(buffer_cap, scale_factor, max_delete_prop); + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector<Word2VecRec> to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = 0.1 * record_count; + warmup<ExtendedVPTree_KNN, Word2VecRec>(datafile, de_vp_knn, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + insert_tput_bench<ExtendedVPTree_KNN, Word2VecRec>(de_vp_knn, datafile, insert_cnt, delete_prop, to_delete, true); + fprintf(stdout, "%ld\t", de_vp_knn.get_memory_usage()); + + query_latency_bench<ExtendedVPTree_KNN, Word2VecRec, de::KNNQueryParms<Word2VecRec>>(de_vp_knn, queries); + fprintf(stdout, "\n"); + + auto ts = de_vp_knn.create_static_structure(); + + fprintf(stdout, "%ld\t", ts->get_memory_usage()); + static_latency_bench<de::VPTree<Word2VecRec>, Word2VecRec, de::KNNQueryParms<Word2VecRec>, de::KNNQuery<Word2VecRec>>( + ts, queries, 1 + ); + fprintf(stdout, "\n"); + + delete ts; + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} |