From 0265acc2caf2d0dffb640ccd4f46c5d9d8163903 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 25 Jul 2023 13:35:30 -0400 Subject: ALEX Benchmarking: added benchmark for ALEX --- .gitmodules | 3 + CMakeLists.txt | 5 + benchmarks/alex_rq_bench.cpp | 213 ++++++++++++++++++++++++++++ benchmarks/include/bench_utility.h | 236 +------------------------------ benchmarks/include/standalone_utility.h | 242 ++++++++++++++++++++++++++++++++ benchmarks/test.cpp | 7 + external/alex | 1 + 7 files changed, 472 insertions(+), 235 deletions(-) create mode 100644 benchmarks/alex_rq_bench.cpp create mode 100644 benchmarks/include/standalone_utility.h create mode 100644 benchmarks/test.cpp create mode 160000 external/alex diff --git a/.gitmodules b/.gitmodules index 7123aee..6e7ae08 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "external/m-tree"] path = external/m-tree url = git@github.com:dbrumbaugh/M-Tree +[submodule "external/alex"] + path = external/alex + url = https://github.com/microsoft/ALEX diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e1902e..5caea52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,11 @@ if (bench) target_include_directories(btree_rq_bench PRIVATE include external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include) target_compile_options(btree_rq_bench PUBLIC -fopenmp) + add_executable(alex_rq_bench ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/alex_rq_bench.cpp) + target_link_libraries(alex_rq_bench PUBLIC gsl pthread gomp) + target_include_directories(alex_rq_bench PRIVATE include external/alex/src/core external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include) + target_compile_options(alex_rq_bench PUBLIC -fopenmp -march=native) + set_property(TARGET alex_rq_bench PROPERTY CXX_STANDARD 14) add_executable(vptree_knn_bench ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/vptree_knn_bench.cpp) target_link_libraries(vptree_knn_bench PUBLIC gsl pthread gomp) diff --git a/benchmarks/alex_rq_bench.cpp b/benchmarks/alex_rq_bench.cpp new file mode 100644 index 0000000..3b7ebc9 --- /dev/null +++ b/benchmarks/alex_rq_bench.cpp @@ -0,0 +1,213 @@ +#include "alex.h" +#include "include/standalone_utility.h" + +typedef uint64_t key_type; +typedef uint64_t value_type; + +typedef alex::Alex Alex; + +struct record { + key_type key; + value_type value; +}; + +struct query { + key_type lower_bound; + key_type upper_bound; +}; + +template +static bool build_insert_vec(std::fstream &file, std::vector &vec, size_t n, + double delete_prop, std::vector &to_delete, bool binary=false) { + vec.clear(); + for (size_t i=0; i to_delete, bool progress=true, bool binary=false) { + size_t batch = std::min(.1 * count, 25000.0); + + std::vector insert_vec; + std::vector delete_vec; + insert_vec.reserve(batch); + delete_vec.reserve(batch*delete_prop); + + size_t inserted = 0; + size_t delete_idx = 0; + + double last_percent = 0; + while (inserted < count) { + // Build vector of records to insert and potentially delete + auto continue_warmup = build_insert_vec(file, insert_vec, batch, delete_prop, to_delete, binary); + if (inserted > batch) { + build_delete_vec(to_delete, delete_vec, batch*delete_prop); + delete_idx = 0; + } + + for (size_t i=0; i &to_delete, bool binary=false) { + size_t delete_cnt = insert_cnt * delete_prop; + + size_t applied_deletes = 0; + size_t applied_inserts = 0; + + size_t BATCH=1000; + + std::vector insert_vec; + std::vector delete_vec; + insert_vec.reserve(BATCH); + delete_vec.reserve(BATCH*delete_prop); + + size_t delete_idx = 0; + + bool continue_benchmark = true; + + size_t total_time = 0; + + while (applied_inserts < insert_cnt && continue_benchmark) { + continue_benchmark = build_insert_vec(file, insert_vec, BATCH, delete_prop, to_delete, binary); + if (applied_deletes < delete_cnt) { + build_delete_vec(to_delete, delete_vec, BATCH*delete_prop); + delete_idx = 0; + } + + if (insert_vec.size() == 0) { + break; + } + + auto insert_start = std::chrono::high_resolution_clock::now(); + for (size_t i=0; i(insert_stop - insert_start).count(); + } + + size_t throughput = (((double) (applied_inserts + applied_deletes) / (double) total_time) * 1e9); + + fprintf(stdout, "%ld\t", throughput); +} + + + +static void alex_rq_bench(Alex &alex, std::vector queries, size_t trial_cnt=1) +{ + char progbuf[25]; + sprintf(progbuf, "sampling:"); + + size_t batch_size = 100; + size_t batches = trial_cnt / batch_size; + size_t total_time = 0; + + std::vector result_set; + + for (int i=0; i(stop - start).count(); + } + + size_t latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", latency); +} + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage: alex_rq_bench \n"); + exit(EXIT_FAILURE); + } + + std::string filename = std::string(argv[1]); + size_t record_count = atol(argv[2]); + double delete_prop = atof(argv[3]); + std::string qfilename = std::string(argv[4]); + + size_t buffer_cap = 12000; + size_t scale_factor = 6; + double max_delete_prop = delete_prop; + bool use_osm = false; + + double insert_batch = 0.1; + + init_bench_env(record_count, true, use_osm); + auto queries = read_range_queries(qfilename, .0001); + + Alex alex; + + std::fstream datafile; + datafile.open(filename, std::ios::in | std::ios::binary); + + std::vector to_delete; + + // warm up the tree with initial_insertions number of initially inserted + // records + size_t warmup_cnt = insert_batch * record_count; + warmup(datafile, alex, warmup_cnt, delete_prop, to_delete, true, true); + + size_t insert_cnt = record_count - warmup_cnt; + + alex_rq_insert(alex, datafile, insert_cnt, delete_prop, to_delete, true); + size_t memory_usage = alex.model_size(); + fprintf(stdout, "%ld\t", memory_usage); + + alex_rq_bench(alex, queries); + fprintf(stdout, "\n"); + + delete_bench_env(); + fflush(stdout); + fflush(stderr); + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/include/bench_utility.h b/benchmarks/include/bench_utility.h index 28040be..2ab1532 100644 --- a/benchmarks/include/bench_utility.h +++ b/benchmarks/include/bench_utility.h @@ -17,6 +17,7 @@ #include "ds/BTree.h" #include "shard/VPTree.h" #include "mtree.h" +#include "standalone_utility.h" #include #include @@ -49,25 +50,6 @@ typedef de::DynamicExtension, de::IRSQuery> ExtendedI typedef de::DynamicExtension, de::ISAMRangeQuery> ExtendedISAM_RQ; typedef de::DynamicExtension, de::KNNQuery> ExtendedVPTree_KNN; -struct btree_record { - key_type key; - value_type value; - - inline bool operator<(const btree_record& other) const { - return key < other.key || (key == other.key && value < other.value); - } - - inline bool operator==(const btree_record& other) const { - return key == other.key && value == other.value; - } -}; - -struct btree_key_extract { - static const key_type &get(const btree_record &v) { - return v.key; - } -}; - struct euclidean_distance { double operator()(const Word2VecRec &first, const Word2VecRec &second) const { double dist = 0; @@ -99,189 +81,6 @@ struct cosine_similarity { typedef tlx::BTree TreeMap; typedef mt::mtree MTree; -static gsl_rng *g_rng; -static std::set *g_to_delete; -static bool g_osm_data; - -static key_type g_min_key = UINT64_MAX; -static key_type g_max_key = 0; - -static size_t g_max_record_cnt = 0; -static size_t g_reccnt = 0; - -static constexpr unsigned int DEFAULT_SEED = 0; - -static unsigned int get_random_seed() -{ - unsigned int seed = 0; - std::fstream urandom; - urandom.open("/dev/urandom", std::ios::in|std::ios::binary); - urandom.read((char *) &seed, sizeof(seed)); - urandom.close(); - - return seed; -} - -static key_type osm_to_key(const char *key_field) { - double tmp_key = (atof(key_field) + 180) * 10e6; - return (key_type) tmp_key; -} - -static void init_bench_rng(unsigned int seed, const gsl_rng_type *type) -{ - g_rng = gsl_rng_alloc(type); - gsl_rng_set(g_rng, seed); -} - -static void init_bench_env(size_t max_reccnt, bool random_seed, bool osm_correction=true) -{ - unsigned int seed = (random_seed) ? get_random_seed() : DEFAULT_SEED; - init_bench_rng(seed, gsl_rng_mt19937); - g_to_delete = new std::set(); - g_osm_data = osm_correction; - g_max_record_cnt = max_reccnt; - g_reccnt = 0; -} - -static void delete_bench_env() -{ - gsl_rng_free(g_rng); - delete g_to_delete; -} - -/* - * NOTE: The QP type must have lower_bound and upper_bound attributes, which - * this function will initialize. Any other query parameter attributes must - * be manually initialized after the call. - */ -template -static std::vector read_range_queries(std::string fname, double selectivity) { - std::vector queries; - - FILE *qf = fopen(fname.c_str(), "r"); - size_t start, stop; - double sel; - while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { - if (start < stop && std::abs(sel - selectivity) < 0.1) { - QP q; - q.lower_bound = start; - q.upper_bound = stop; - queries.push_back(q); - } - } - fclose(qf); - - return queries; -} - -template -static std::vector read_knn_queries(std::string fname, size_t k) { - std::vector queries; - - FILE *qf = fopen(fname.c_str(), "r"); - char *line = NULL; - size_t len = 0; - - while (getline(&line, &len, qf) > 0) { - char *token; - QP query; - size_t idx = 0; - - token = strtok(line, " "); - do { - query.point.data[idx++] = atof(token); - } while ((token = strtok(NULL, " "))); - - query.k = k; - queries.emplace_back(query); - } - - free(line); - fclose(qf); - - return queries; -} - -static bool next_vector_record(std::fstream &file, Word2VecRec &record, bool binary=false) { - std::string line; - if (std::getline(file, line, '\n')) { - std::stringstream line_stream(line); - for (size_t i=0; i<300; i++) { - std::string dimension; - - std::getline(line_stream, dimension, ' '); - record.data[i] = atof(dimension.c_str()); - } - - g_reccnt++; - - return true; - } - - return false; - -} - - -template -static bool next_record(std::fstream &file, R &record, bool binary=false) -{ - static value_type value = 1; - if (g_reccnt >= g_max_record_cnt) return false; - - if (binary) { - if (file.good()) { - decltype(R::key) key; - - file.read((char*) &key, sizeof(key)); - record.key = key; - record.value = value; - value++; - - if constexpr (de::WeightedRecordInterface) { - decltype(R::weight) weight; - file.read((char*) &weight, sizeof(weight)); - record.weight = weight; - } - - if (record.key < g_min_key) g_min_key = record.key; - if (record.key > g_max_key) g_max_key = record.key; - - return true; - } - - return false; - } - - std::string line; - if (std::getline(file, line, '\n')) { - std::stringstream line_stream(line); - std::string key_field; - std::string value_field; - std::string weight_field; - - std::getline(line_stream, value_field, '\t'); - std::getline(line_stream, key_field, '\t'); - std::getline(line_stream, weight_field, '\t'); - - record.key = (g_osm_data) ? osm_to_key(key_field.c_str()) : atol(key_field.c_str()); - record.value = atol(value_field.c_str()); - - if constexpr (de::WeightedRecordInterface) { - record.weight = atof(weight_field.c_str()); - } - - if (record.key < g_min_key) g_min_key = record.key; - if (record.key > g_max_key) g_max_key = record.key; - - g_reccnt++; - - return true; - } - - return false; -} - template static bool build_insert_vec(std::fstream &file, std::vector &vec, size_t n, double delete_prop, std::vector &to_delete, bool binary=false) { @@ -316,39 +115,6 @@ static bool build_insert_vec(std::fstream &file, std::vector &vec, size_t n, return true; } -template -static bool build_delete_vec(std::vector &to_delete, std::vector &vec, size_t n) { - vec.clear(); - - size_t cnt = 0; - while (cnt < n) { - if (to_delete.size() == 0) { - return false; - } - - auto i = gsl_rng_uniform_int(g_rng, to_delete.size()); - vec.emplace_back(to_delete[i]); - to_delete.erase(to_delete.begin() + i); - } -td: - return true; -} - -/* - * helper routines for displaying progress bars to stderr - */ -static const char *g_prog_bar = "======================================================================"; -static const size_t g_prog_width = 50; - -static void progress_update(double percentage, std::string prompt) { - int val = (int) (percentage * 100); - int lpad = (int) (percentage * g_prog_width); - int rpad = (int) (g_prog_width - lpad); - fprintf(stderr, "\r(%3d%%) %20s [%.*s%*s]", val, prompt.c_str(), lpad, g_prog_bar, rpad, ""); - fflush(stderr); - - if (percentage >= 1) fprintf(stderr, "\n"); -} template static bool warmup(std::fstream &file, DE &extended_index, size_t count, diff --git a/benchmarks/include/standalone_utility.h b/benchmarks/include/standalone_utility.h new file mode 100644 index 0000000..272a332 --- /dev/null +++ b/benchmarks/include/standalone_utility.h @@ -0,0 +1,242 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef uint64_t key_type; +typedef uint64_t value_type; +typedef uint64_t weight_type; + +static gsl_rng *g_rng; +static bool g_osm_data; + +struct btree_record { + key_type key; + value_type value; + + inline bool operator<(const btree_record& other) const { + return key < other.key || (key == other.key && value < other.value); + } + + inline bool operator==(const btree_record& other) const { + return key == other.key && value == other.value; + } +}; + +struct btree_key_extract { + static const key_type &get(const btree_record &v) { + return v.key; + } +}; + +static key_type g_min_key = UINT64_MAX; +static key_type g_max_key = 0; + +static size_t g_max_record_cnt = 0; +static size_t g_reccnt = 0; + +static constexpr unsigned int DEFAULT_SEED = 0; + +static unsigned int get_random_seed() +{ + unsigned int seed = 0; + std::fstream urandom; + urandom.open("/dev/urandom", std::ios::in|std::ios::binary); + urandom.read((char *) &seed, sizeof(seed)); + urandom.close(); + + return seed; +} + +static key_type osm_to_key(const char *key_field) { + double tmp_key = (atof(key_field) + 180) * 10e6; + return (key_type) tmp_key; +} + +static void init_bench_rng(unsigned int seed, const gsl_rng_type *type) +{ + g_rng = gsl_rng_alloc(type); + gsl_rng_set(g_rng, seed); +} + +static void init_bench_env(size_t max_reccnt, bool random_seed, bool osm_correction=true) +{ + unsigned int seed = (random_seed) ? get_random_seed() : DEFAULT_SEED; + init_bench_rng(seed, gsl_rng_mt19937); + g_osm_data = osm_correction; + g_max_record_cnt = max_reccnt; + g_reccnt = 0; +} + +static void delete_bench_env() +{ + gsl_rng_free(g_rng); +} + +template +static std::vector read_range_queries(std::string fname, double selectivity) { + std::vector queries; + + FILE *qf = fopen(fname.c_str(), "r"); + size_t start, stop; + double sel; + while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { + if (start < stop && std::abs(sel - selectivity) < 0.1) { + QP q; + q.lower_bound = start; + q.upper_bound = stop; + queries.push_back(q); + } + } + fclose(qf); + + return queries; +} + +template +static std::vector read_knn_queries(std::string fname, size_t k) { + std::vector queries; + + FILE *qf = fopen(fname.c_str(), "r"); + char *line = NULL; + size_t len = 0; + + while (getline(&line, &len, qf) > 0) { + char *token; + QP query; + size_t idx = 0; + + token = strtok(line, " "); + do { + query.point.data[idx++] = atof(token); + } while ((token = strtok(NULL, " "))); + + query.k = k; + queries.emplace_back(query); + } + + free(line); + fclose(qf); + + return queries; +} + +/* + * NOTE: The QP type must have lower_bound and upper_bound attributes, which + * this function will initialize. Any other query parameter attributes must + * be manually initialized after the call. + */ +template +static bool next_vector_record(std::fstream &file, R &record, bool binary=false) { + std::string line; + if (std::getline(file, line, '\n')) { + std::stringstream line_stream(line); + for (size_t i=0; i<300; i++) { + std::string dimension; + + std::getline(line_stream, dimension, ' '); + record.data[i] = atof(dimension.c_str()); + } + + g_reccnt++; + + return true; + } + + return false; + +} + +template +static bool next_record(std::fstream &file, R &record, bool binary=false) +{ + static value_type value = 1; + if (g_reccnt >= g_max_record_cnt) return false; + + if (binary) { + if (file.good()) { + decltype(R::key) key; + + file.read((char*) &key, sizeof(key)); + record.key = key; + record.value = value; + value++; + + if (record.key < g_min_key) g_min_key = record.key; + if (record.key > g_max_key) g_max_key = record.key; + + return true; + } + + return false; + } + + std::string line; + if (std::getline(file, line, '\n')) { + std::stringstream line_stream(line); + std::string key_field; + std::string value_field; + std::string weight_field; + + std::getline(line_stream, value_field, '\t'); + std::getline(line_stream, key_field, '\t'); + std::getline(line_stream, weight_field, '\t'); + + record.key = (g_osm_data) ? osm_to_key(key_field.c_str()) : atol(key_field.c_str()); + record.value = atol(value_field.c_str()); + + if (record.key < g_min_key) g_min_key = record.key; + if (record.key > g_max_key) g_max_key = record.key; + + g_reccnt++; + + return true; + } + + return false; +} + +template +static bool build_delete_vec(std::vector &to_delete, std::vector &vec, size_t n) { + vec.clear(); + + size_t cnt = 0; + while (cnt < n) { + if (to_delete.size() == 0) { + return false; + } + + auto i = gsl_rng_uniform_int(g_rng, to_delete.size()); + vec.emplace_back(to_delete[i]); + to_delete.erase(to_delete.begin() + i); + } +td: + return true; +} + +/* + * helper routines for displaying progress bars to stderr + */ +static const char *g_prog_bar = "======================================================================"; +static const size_t g_prog_width = 50; + +static void progress_update(double percentage, std::string prompt) { + int val = (int) (percentage * 100); + int lpad = (int) (percentage * g_prog_width); + int rpad = (int) (g_prog_width - lpad); + fprintf(stderr, "\r(%3d%%) %20s [%.*s%*s]", val, prompt.c_str(), lpad, g_prog_bar, rpad, ""); + fflush(stderr); + + if (percentage >= 1) fprintf(stderr, "\n"); +} diff --git a/benchmarks/test.cpp b/benchmarks/test.cpp new file mode 100644 index 0000000..75bffe3 --- /dev/null +++ b/benchmarks/test.cpp @@ -0,0 +1,7 @@ +#include "alex.h" + + +int main(int argc, char **argv) { + alex::Alex test; + +} diff --git a/external/alex b/external/alex new file mode 160000 index 0000000..7f4cc98 --- /dev/null +++ b/external/alex @@ -0,0 +1 @@ +Subproject commit 7f4cc981c85baf0f72299b38ecbf523320de4ded -- cgit v1.2.3