diff options
Diffstat (limited to 'benchmarks')
36 files changed, 3581 insertions, 454 deletions
diff --git a/benchmarks/bigann_sample.cpp b/benchmarks/bigann_sample.cpp new file mode 100644 index 0000000..aa12f91 --- /dev/null +++ b/benchmarks/bigann_sample.cpp @@ -0,0 +1,55 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "file_util.h" +#include "benchmark_types.h" + +#include <gsl/gsl_rng.h> + +typedef ANNRec Rec; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile sampcnt\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + size_t m = atol(argv[3]); + + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + auto data = read_binary_vector_file<Rec>(d_fname, n); + + std::vector<size_t> to_delete(m); + + std::unordered_map<Rec, size_t, de::RecordHash<Rec>> filter; + double ratio = (double) data.size() / (double) m; + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= ratio && filter.find(data[i]) == filter.end()) { + to_delete[j++] = i; + filter.insert({data[i], i}); + } + } + + for (size_t i=0; i<to_delete.size(); i++) { + for (size_t j=0; j<ANNSize; j++ ) { + fprintf(stdout, "%ld ", data[to_delete[i]].data[j]); + } + fprintf(stdout, "\n"); + } + + gsl_rng_free(rng); + fflush(stderr); + fflush(stdout); +} + diff --git a/benchmarks/cedar_trie.cpp b/benchmarks/cedar_trie.cpp new file mode 100644 index 0000000..7499ce7 --- /dev/null +++ b/benchmarks/cedar_trie.cpp @@ -0,0 +1,97 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <fstream> +#include <sstream> +#include <vector> + +#include "cedar.h" + +#include "psu-util/timer.h" +#include "psu-util/progress.h" + +std::vector<std::string> strings; + +typedef cedar::da<int> Trie; + +void insert_thread(int64_t start, int64_t end, Trie * trie) { + for (uint64_t i=start; i<end; i++) { + auto res = trie->update(strings[i].c_str(), strings[i].size(), i+1); + } +} + +void read_data(std::string fname, size_t n=10000000) { + strings.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in); + + size_t i=0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(line); + i++; + psudb::progress_update((double) i / (double) n, "Reading file:"); + } +} + +void usage(char *name) { + fprintf(stderr, "Usage:\n%s datafile record_count\n", name); +} + +int main(int argc, char **argv) { + + if (argc < 3) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + std::string fname = std::string(argv[1]); + size_t n = atol(argv[2]); + + read_data(fname, n); + + if (strings.size() == 0) { + fprintf(stderr, "[E]: No string data read from file. Aborting execution.\n"); + } else { + fprintf(stderr, "Finished reading from file.\n"); + } + + auto trie = new Trie(); + + TIMER_INIT(); + TIMER_START(); + insert_thread(0, strings.size(), trie); + TIMER_STOP(); + + auto total_time = TIMER_RESULT(); + + size_t m = 100; + TIMER_START(); + for (size_t i=0; i<m; i++) { + size_t j = rand() % strings.size(); + + auto res = trie->exactMatchSearch<int>(strings[j].c_str()); + //assert(*(res)+1 == j); + } + TIMER_STOP(); + + auto query_time = TIMER_RESULT(); + + + double i_tput = (double) n / (double) total_time * 1e9; + size_t q_lat = query_time / m; + + fprintf(stdout, "%ld\t\t%lf\t%ld\n", trie->size(), + i_tput, q_lat); + + fprintf(stdout, "%ld\n", trie->total_size()); + + delete trie; + + fflush(stderr); +} + diff --git a/benchmarks/hat_trie.cpp b/benchmarks/hat_trie.cpp new file mode 100644 index 0000000..3b4c7d3 --- /dev/null +++ b/benchmarks/hat_trie.cpp @@ -0,0 +1,98 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <fstream> +#include <sstream> + +#include "htrie_map.h" + +#include "psu-util/timer.h" +#include "psu-util/progress.h" + +std::vector<std::string> strings; + +typedef tsl::htrie_map<char, size_t> Trie; + +void insert_thread(int64_t start, int64_t end, Trie * trie) { + for (uint64_t i=start; i<end; i++) { + auto res = trie->insert(strings[i].c_str(), i+1); + } +} + +void read_data(std::string fname, size_t n=10000000) { + strings.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in); + + size_t i=0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(line); + i++; + psudb::progress_update((double) i / (double) n, "Reading file:"); + } +} + +void usage(char *name) { + fprintf(stderr, "Usage:\n%s datafile record_count\n", name); +} + +int main(int argc, char **argv) { + + if (argc < 3) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + std::string fname = std::string(argv[1]); + size_t n = atol(argv[2]); + + read_data(fname, n); + + if (strings.size() == 0) { + fprintf(stderr, "[E]: No string data read from file. Aborting execution.\n"); + } else { + fprintf(stderr, "Finished reading from file.\n"); + } + + auto trie = new Trie(); + + TIMER_INIT(); + TIMER_START(); + insert_thread(0, strings.size(), trie); + TIMER_STOP(); + + auto total_time = TIMER_RESULT(); + + size_t m = 100; + TIMER_START(); + for (size_t i=0; i<m; i++) { + size_t j = rand() % strings.size(); + + auto res = trie->find(strings[j]); + if (*res != (j+1)) { + fprintf(stderr, "%ld %d %s\n", j, *res, strings[j].c_str()); + } + //assert(*(res)+1 == j); + } + TIMER_STOP(); + + auto query_time = TIMER_RESULT(); + + + double i_tput = (double) n / (double) total_time * 1e9; + size_t q_lat = query_time / m; + + fprintf(stdout, "%ld\t\t%lf\t%ld\n", trie->size(), + i_tput, q_lat); + + + delete trie; + + fflush(stderr); +} + diff --git a/benchmarks/include/benchmark_types.h b/benchmarks/include/benchmark_types.h new file mode 100644 index 0000000..51fc52d --- /dev/null +++ b/benchmarks/include/benchmark_types.h @@ -0,0 +1,68 @@ +#pragma once + +#include <cstdlib> +#include "psu-ds/BTree.h" +#include "framework/interface/Record.h" +#include "pgm/pgm_index_dynamic.hpp" + +/* BTree definitions*/ +template <typename K, typename V> +struct btree_record { + K key; + V value; + + inline bool operator<(const btree_record& other) const { + return key < other.key || (key == other.key && value < other.value); + } + + inline bool operator==(const btree_record& other) const { + return key == other.key && value == other.value; + } +}; + +template <typename K, typename V> +struct btree_key_extract { + static const K &get(const btree_record<K, V> &v) { + return v.key; + } +}; + +typedef psudb::BTree<int64_t, btree_record<int64_t, int64_t>, btree_key_extract<int64_t, int64_t>> BenchBTree; + + +/*MTree Definitions*/ + +const size_t W2V_SIZE = 300; +typedef de::EuclidPoint<double, W2V_SIZE> Word2VecRec; + +const size_t ANNSize = 128; +typedef de::EuclidPoint<uint64_t, ANNSize> ANNRec; + +struct euclidean_distance { + double operator()(const Word2VecRec &first, const Word2VecRec &second) const { + double dist = 0; + for (size_t i=0; i<W2V_SIZE; i++) { + dist += (first.data[i] - second.data[i]) * (first.data[i] - second.data[i]); + } + + return std::sqrt(dist); + } + + double operator()(const ANNRec &first, const ANNRec &second) const { + double dist = 0; + for (size_t i=0; i<ANNSize; i++) { + dist += ((double) first.data[i] - (double) second.data[i]) * ((double) first.data[i] - (double) second.data[i]); + } + + return std::sqrt(dist); + } +}; + +#ifdef _GNU_SOURCE +#include "mtree.h" +typedef mt::mtree<Word2VecRec, euclidean_distance> MTree; +typedef mt::mtree<ANNRec, euclidean_distance> MTree_alt; +#endif + +typedef pgm::DynamicPGMIndex<uint64_t, uint64_t, pgm::PGMIndex<uint64_t, 64>> PGM; + diff --git a/benchmarks/include/btree-util.h b/benchmarks/include/btree-util.h deleted file mode 100644 index 571c073..0000000 --- a/benchmarks/include/btree-util.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include <cstdlib> -#include "psu-ds/BTree.h" - -struct btree_record { - int64_t key; - int64_t value; - - inline bool operator<(const btree_record& other) const { - return key < other.key || (key == other.key && value < other.value); - } - - inline bool operator==(const btree_record& other) const { - return key == other.key && value == other.value; - } -}; - -struct btree_key_extract { - static const int64_t &get(const btree_record &v) { - return v.key; - } -}; - -typedef psudb::BTree<int64_t, btree_record, btree_key_extract> BenchBTree; - - diff --git a/benchmarks/include/data-proc.h b/benchmarks/include/data-proc.h deleted file mode 100644 index 444cb94..0000000 --- a/benchmarks/include/data-proc.h +++ /dev/null @@ -1,258 +0,0 @@ -#include <cstdlib> -#include <cstdio> -#include <iostream> -#include <fstream> -#include <sstream> -#include <string> -#include <gsl/gsl_rng.h> -#include <cstring> -#include <vector> - -#include "psu-ds/BTree.h" - -#pragma once - -typedef int64_t key_type; -typedef int64_t value_type; -typedef uint64_t weight_type; - -static gsl_rng *g_rng; -static bool g_osm_data; - -struct btree_record { - key_type key; - value_type value; - - inline bool operator<(const btree_record& other) const { - return key < other.key || (key == other.key && value < other.value); - } - - inline bool operator==(const btree_record& other) const { - return key == other.key && value == other.value; - } -}; - -struct btree_key_extract { - static const key_type &get(const btree_record &v) { - return v.key; - } -}; - -typedef psudb::BTree<int64_t, btree_record, btree_key_extract> BenchBTree; - -static key_type g_min_key = UINT64_MAX; -static key_type g_max_key = 0; - -static size_t g_max_record_cnt = 0; -static size_t g_reccnt = 0; - -static constexpr unsigned int DEFAULT_SEED = 0; - -static unsigned int get_random_seed() -{ - unsigned int seed = 0; - std::fstream urandom; - urandom.open("/dev/urandom", std::ios::in|std::ios::binary); - urandom.read((char *) &seed, sizeof(seed)); - urandom.close(); - - return seed; -} - -static key_type osm_to_key(const char *key_field) { - double tmp_key = (atof(key_field) + 180) * 10e6; - return (key_type) tmp_key; -} - -static void init_bench_rng(unsigned int seed, const gsl_rng_type *type) -{ - g_rng = gsl_rng_alloc(type); - gsl_rng_set(g_rng, seed); -} - -static void init_bench_env(size_t max_reccnt, bool random_seed, bool osm_correction=true) -{ - unsigned int seed = (random_seed) ? get_random_seed() : DEFAULT_SEED; - init_bench_rng(seed, gsl_rng_mt19937); - g_osm_data = osm_correction; - g_max_record_cnt = max_reccnt; - g_reccnt = 0; -} - -static void delete_bench_env() -{ - gsl_rng_free(g_rng); -} - - -template <typename QP> -static std::vector<QP> read_lookup_queries(std::string fname, double selectivity) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - size_t start, stop; - double sel; - while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { - if (start < stop && std::abs(sel - selectivity) < 0.1) { - QP q; - q.target_key = start; - queries.push_back(q); - } - } - fclose(qf); - - return queries; -} - -template <typename QP> -static std::vector<QP> read_range_queries(std::string &fname, double selectivity) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - size_t start, stop; - double sel; - while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { - if (start < stop && std::abs(sel - selectivity) < 0.1) { - QP q; - q.lower_bound = start; - q.upper_bound = stop; - queries.push_back(q); - } - } - fclose(qf); - - return queries; -} - -template <typename QP> -static std::vector<QP> read_knn_queries(std::string fname, size_t k) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - char *line = NULL; - size_t len = 0; - - while (getline(&line, &len, qf) > 0) { - char *token; - QP query; - size_t idx = 0; - - token = strtok(line, " "); - do { - query.point.data[idx++] = atof(token); - } while ((token = strtok(NULL, " "))); - - query.k = k; - queries.emplace_back(query); - } - - free(line); - fclose(qf); - - return queries; -} - -/* - * NOTE: The QP type must have lower_bound and upper_bound attributes, which - * this function will initialize. Any other query parameter attributes must - * be manually initialized after the call. - */ -template <typename R> -static bool next_vector_record(std::fstream &file, R &record, bool binary=false) { - std::string line; - if (std::getline(file, line, '\n')) { - std::stringstream line_stream(line); - for (size_t i=0; i<300; i++) { - std::string dimension; - - std::getline(line_stream, dimension, ' '); - record.data[i] = atof(dimension.c_str()); - } - - g_reccnt++; - - return true; - } - - return false; - -} - -template <typename R> -static bool next_record(std::fstream &file, R &record, bool binary=false) -{ - static value_type value = 1; - if (g_reccnt >= g_max_record_cnt) return false; - - if (binary) { - if (file.good()) { - decltype(R::key) key; - - file.read((char*) &key, sizeof(key)); - record.key = key; - record.value = value; - value++; - - if (record.key < g_min_key) g_min_key = record.key; - if (record.key > g_max_key) g_max_key = record.key; - - return true; - } - - return false; - } - - std::string line; - if (std::getline(file, line, '\n')) { - std::stringstream line_stream(line); - std::string key_field; - std::string value_field; - std::string weight_field; - - std::getline(line_stream, value_field, '\t'); - std::getline(line_stream, key_field, '\t'); - std::getline(line_stream, weight_field, '\t'); - - record.key = (g_osm_data) ? osm_to_key(key_field.c_str()) : atol(key_field.c_str()); - record.value = atol(value_field.c_str()); - - if (record.key < g_min_key) g_min_key = record.key; - if (record.key > g_max_key) g_max_key = record.key; - - g_reccnt++; - - return true; - } - - return false; -} - -template <typename R> -static bool build_delete_vec(std::vector<R> &to_delete, std::vector<R> &vec, size_t n) { - vec.clear(); - - size_t cnt = 0; - while (cnt < n) { - if (to_delete.size() == 0) { - return false; - } - - auto i = gsl_rng_uniform_int(g_rng, to_delete.size()); - vec.emplace_back(to_delete[i]); - to_delete.erase(to_delete.begin() + i); - } -td: - return true; -} - -static std::vector<int64_t> read_sosd_file(std::string &fname, size_t n) { - std::fstream file; - file.open(fname, std::ios::in | std::ios::binary); - - std::vector<int64_t> records(n); - for (size_t i=0; i<n; i++) { - file.read((char*) &(records[i]), sizeof(int64_t)); - } - - return records; -} diff --git a/benchmarks/include/file_util.h b/benchmarks/include/file_util.h new file mode 100644 index 0000000..41eb18c --- /dev/null +++ b/benchmarks/include/file_util.h @@ -0,0 +1,294 @@ +#pragma once + +#include <cstdlib> +#include <cstdio> +#include <iostream> +#include <fstream> +#include <sstream> +#include <string> +#include <gsl/gsl_rng.h> +#include <cstring> +#include <vector> +#include <memory> + +#include "psu-util/progress.h" + + +template <typename QP> +static std::vector<QP> read_lookup_queries(std::string fname, double selectivity) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + + if (!qf) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + size_t start, stop; + double sel; + while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { + if (start < stop && std::abs(sel - selectivity) < 0.1) { + QP q; + q.target_key = start; + queries.push_back(q); + } + } + fclose(qf); + + return queries; +} + +template <typename QP> +static std::vector<QP> generate_string_lookup_queries(std::vector<std::unique_ptr<char[]>> &strings, size_t cnt, gsl_rng *rng) { + std::vector<QP> queries; + + for (size_t i=0; i<cnt; i++) { + auto idx = gsl_rng_uniform_int(rng, strings.size()); + QP q; + q.search_key = strings[idx].get(); + queries.push_back(q); + } + + return queries; +} + +template <typename QP> +static std::vector<QP> read_range_queries(std::string &fname, double selectivity) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + + if (!qf) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + size_t start, stop; + double sel; + while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { + if (start < stop && std::abs(sel - selectivity) < 0.00001) { + QP q; + q.lower_bound = start; + q.upper_bound = stop; + + queries.push_back(q); + } + } + fclose(qf); + + return queries; +} + + +template <typename QP> +static std::vector<QP> read_binary_knn_queries(std::string fname, size_t k, size_t n) { + std::vector<QP> queries; + queries.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + + int32_t dim; + int32_t cnt; + + file.read((char*) &(cnt), sizeof(cnt)); + file.read((char*) &(dim), sizeof(dim)); + + if (n > cnt) { + n = cnt; + } + + for (size_t i=0; i<n; i++) { + QP query; + for (size_t j=0; j<dim; j++) { + uint64_t val; + file.read((char*) &(val), sizeof(uint64_t)); + query.point.data[j] = val; + } + query.k = k; + queries.push_back(query); + } + + return queries; +} + + +template <typename QP> +static std::vector<QP> read_knn_queries(std::string fname, size_t k) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + char *line = NULL; + size_t len = 0; + + if (!qf) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + while (getline(&line, &len, qf) > 0) { + char *token; + QP query; + size_t idx = 0; + + token = strtok(line, " "); + do { + query.point.data[idx++] = atof(token); + } while ((token = strtok(NULL, " "))); + + query.k = k; + queries.emplace_back(query); + } + + free(line); + fclose(qf); + + return queries; +} + +template<typename R> +static std::vector<R> read_sosd_file(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<R> records(n); + for (size_t i=0; i<n; i++) { + decltype(R::key) k; + file.read((char*) &(k), sizeof(R::key)); + records[i].key = k; + records[i].value = i; + } + + return records; +} + +template<typename K, typename V> +static std::vector<std::pair<K, V>> read_sosd_file_pair(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<std::pair<K,V>> records(n); + for (size_t i=0; i<n; i++) { + K k; + file.read((char*) &(k), sizeof(K)); + records[i].first = k; + records[i].second = i; + } + + return records; +} + +/* + * This function expects a plaintext file with each vector on its own line. + * There should be D dimensions (or more) for each record, separated by + * whitespace. The function will generate a vector of n records, each + * record built from the first D dimensions of the data in the file. + */ +template <typename R, size_t D> +static std::vector<R> read_vector_file(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<R> records; + records.reserve(n); + + for (size_t i=0; i<n; i++) { + std::string line; + if (!std::getline(file, line, '\n')) break; + + std::stringstream line_stream(line); + R rec; + for (size_t j=0; j<D; j++) { + std::string dim; + if (!std::getline(line_stream, dim, ' ')) break; + + rec.data[j] = atof(dim.c_str()); + } + records.emplace_back(rec); + } + + return records; +} + +template <typename R> +static std::vector<R> read_binary_vector_file(std::string &fname, size_t n) { + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<R> records; + records.reserve(n); + + int32_t dim; + int32_t cnt; + + file.read((char*) &(cnt), sizeof(cnt)); + file.read((char*) &(dim), sizeof(dim)); + + if (n > cnt) { + n = cnt; + } + + R rec; + for (size_t i=0; i<n; i++) { + for (size_t j=0; j<dim; j++) { + uint64_t val; + file.read((char*) &(val), sizeof(uint64_t)); + rec.data[j] = val; + } + + records.emplace_back(rec); + } + + return records; +} + +static std::vector<std::unique_ptr<char[]>>read_string_file(std::string fname, size_t n=10000000) { + + std::fstream file; + file.open(fname, std::ios::in); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<std::unique_ptr<char[]>> strings; + strings.reserve(n); + + size_t i=0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(std::unique_ptr<char[]>(strdup(line.c_str()))); + i++; + psudb::progress_update((double) i / (double) n, "Reading file:"); + } + + return strings; +} diff --git a/benchmarks/include/standard_benchmarks.h b/benchmarks/include/standard_benchmarks.h new file mode 100644 index 0000000..b805c08 --- /dev/null +++ b/benchmarks/include/standard_benchmarks.h @@ -0,0 +1,380 @@ +/* + * benchmarks/include/bench.h + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include <cstdlib> +#include <fstream> +#include <gsl/gsl_rng.h> + +#include "framework/DynamicExtension.h" +#include "framework/interface/Query.h" +#include "query/irs.h" +#include "psu-util/progress.h" +#include "benchmark_types.h" +#include "psu-util/bentley-saxe.h" + +static size_t g_deleted_records = 0; +static double delete_proportion = 0.05; + +static volatile size_t total = 0; + +template<typename DE, typename QP, typename R> +static void run_queries(DE *extension, DE *ghost, std::vector<QP> &queries) { + for (size_t i=0; i<queries.size(); i++) { + std::vector<R> res = extension->query(&queries[i]); + std::vector<R> negres = ghost->query(&queries[i]); + auto result = res[0].first - negres[0].first; + total = result; + } +} + + +template<typename DE, typename QP, bool BSM=false> +static void run_queries(DE *extension, std::vector<QP> &queries) { + for (size_t i=0; i<queries.size(); i++) { + if constexpr (std::is_same_v<MTree, DE>) { + std::vector<Word2VecRec> result; + auto res = extension->get_nearest_by_limit(queries[i].point, queries[i].k); + + auto itr = res.begin(); + while (itr != res.end()) { + result.emplace_back(itr->data); + itr++; + } + + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (auto &r : result) { + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", result.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + #endif + } else if constexpr (std::is_same_v<MTree_alt, DE>) { + std::vector<ANNRec> result; + auto res = extension->get_nearest_by_limit(queries[i].point, queries[i].k); + + auto itr = res.begin(); + while (itr != res.end()) { + result.emplace_back(itr->data); + itr++; + } + }else if constexpr (std::is_same_v<PGM, DE>) { + size_t tot = 0; + auto ptr = extension->find(queries[i].lower_bound); + while (ptr != extension->end() && ptr->first <= queries[i].upper_bound) { + tot++; + ++ptr; + } + } else { + auto res = extension->query(&queries[i]); + if constexpr (!BSM) { + auto result = res.get(); + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (int i=result.size()-1; i>=0; i--) { + auto &r = result[i]; + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", result.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + fflush(stdout); + #endif + } else { + total = res.size(); + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (int i=res.size()-1; i>=0; i--) { + auto &r = res[i]; + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", res.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + fflush(stdout); + #endif + } + } + } +} + +template <typename R> +static void run_btree_queries(BenchBTree *btree, std::vector<de::irs::Parms<R>> &queries) { + std::vector<int64_t> sample_set; + sample_set.reserve(queries[0].sample_size); + + for (size_t i=0; i<queries.size(); i++) { + btree->range_sample(queries[i].lower_bound, queries[i].upper_bound, queries[i].sample_size, sample_set, queries[i].rng); + } +} + + +template<typename S, typename QP, typename Q> +static void run_static_queries(S *shard, std::vector<QP> &queries) { + for (size_t i=0; i<queries.size(); i++) { + auto q = &queries[i]; + + auto state = Q::get_query_state(shard, q); + + std::vector<void*> shards = {shard}; + std::vector<void*> states = {state}; + + Q::process_query_states(q, states, nullptr); + auto res = Q::query(shard, state, q); + + #ifdef BENCH_PRINT_RESULTS + fprintf(stdout, "\n\n"); + for (int i=res.size()-1; i>=0; i--) { + auto &r = res[i].rec; + fprintf(stdout, "%ld %lf %lf %lf %lf %lf %lf\n", res.size(), r.data[0], + r.data[1], r.data[2], r.data[3], r.data[4], r.data[5]); + } + fflush(stdout); + #endif + } +} + + +/* + * Insert records into a standard Bentley-Saxe extension. Deletes are not + * supported. + */ +template<typename DS, typename R, bool MDSP=false> +static void insert_records(psudb::bsm::BentleySaxe<R, DS, MDSP> *extension, + size_t start, size_t stop, std::vector<R> &records) { + + psudb::progress_update(0, "Insert Progress"); + for (size_t i=start; i<stop; i++) { + extension->insert(records[i]); + } + + psudb::progress_update(1, "Insert Progress"); +} + + +template<typename DS, typename R, bool MDSP=false> +static void insert_records(psudb::bsm::BentleySaxe<R, DS, MDSP> *extension, + psudb::bsm::BentleySaxe<R, DS, MDSP> *ghost, + size_t start, size_t stop, std::vector<R> &records, + std::vector<size_t> &to_delete, size_t &delete_idx, + gsl_rng *rng) { + + psudb::progress_update(0, "Insert Progress"); + size_t reccnt = 0; + for (size_t i=start; i<stop; i++) { + + extension->insert(records[i]); + + if (gsl_rng_uniform(rng) <= delete_proportion && to_delete[delete_idx] <= i) { + ghost->insert(records[to_delete[delete_idx]]); + delete_idx++; + g_deleted_records++; + } + + } + +} + + +template<typename DE, typename R> +static void insert_records(DE *structure, size_t start, size_t stop, + std::vector<R> &records, std::vector<size_t> &to_delete, + size_t &delete_idx, bool delete_records, gsl_rng *rng) { + + psudb::progress_update(0, "Insert Progress"); + size_t reccnt = 0; + for (size_t i=start; i<stop; i++) { + + if constexpr (std::is_same_v<BenchBTree, DE>) { + structure->insert(records[i]); + } else if constexpr (std::is_same_v<MTree, DE> || std::is_same_v<MTree_alt, DE>) { + structure->add(records[i]); + } else if constexpr (std::is_same_v<PGM, DE>) { + structure->insert_or_assign(records[i].key, records[i].value); + } else { + while (!structure->insert(records[i])) { + psudb::progress_update((double) i / (double)(stop - start), "Insert Progress"); + usleep(1); + } + } + + if (delete_records && gsl_rng_uniform(rng) <= + delete_proportion && to_delete[delete_idx] <= i) { + + if constexpr (std::is_same_v<BenchBTree, DE>) { + structure->erase_one(records[to_delete[delete_idx]].key); + } else if constexpr (std::is_same_v<MTree, DE> || std::is_same_v<MTree_alt, DE>) { + structure->remove(records[to_delete[delete_idx]]); + } else if constexpr (std::is_same_v<PGM, DE>) { + structure->erase(records[to_delete[delete_idx]].key); + } else { + while (!structure->erase(records[to_delete[delete_idx]])) { + usleep(1); + } + } + delete_idx++; + g_deleted_records++; + } + } + + psudb::progress_update(1, "Insert Progress"); +} + +template <typename DE, de::RecordInterface R, bool PROGRESS=true, size_t BATCH=1000> +static bool insert_tput_bench(DE &de_index, std::fstream &file, size_t insert_cnt, + double delete_prop, gsl_rng *rng, std::vector<R> &to_delete, bool binary=false) { + + size_t delete_cnt = insert_cnt * delete_prop; + + size_t applied_deletes = 0; + size_t applied_inserts = 0; + + std::vector<R> insert_vec; + std::vector<R> delete_vec; + insert_vec.reserve(BATCH); + delete_vec.reserve(BATCH*delete_prop); + + size_t delete_idx = 0; + + bool continue_benchmark = true; + + size_t total_time = 0; + + while (applied_inserts < insert_cnt && continue_benchmark) { + continue_benchmark = build_insert_vec(file, insert_vec, BATCH, delete_prop, to_delete, binary); + if (applied_deletes < delete_cnt) { + build_delete_vec(to_delete, delete_vec, BATCH*delete_prop); + delete_idx = 0; + } + + if (insert_vec.size() == 0) { + break; + } + + if constexpr (PROGRESS) { + psudb::progress_update((double) applied_inserts / (double) insert_cnt, "inserting:"); + } + + auto insert_start = std::chrono::high_resolution_clock::now(); + for (size_t i=0; i<insert_vec.size(); i++) { + // process a delete if necessary + if (applied_deletes < delete_cnt && delete_idx < delete_vec.size() && gsl_rng_uniform(rng) < delete_prop) { + if constexpr (std::is_same_v<BenchBTree, DE>) { + de_index.erase_one(delete_vec[delete_idx++].key); + #ifdef _GNU_SOURCE + } else if constexpr (std::is_same_v<MTree, DE> || std::is_same_v<MTree_alt, DE>) { + de_index.remove(delete_vec[delete_idx++]); + #endif + } else { + de_index.erase(delete_vec[delete_idx++]); + } + applied_deletes++; + } + + // insert the record; + #ifdef _GNU_SOURCE + if constexpr (std::is_same_v<MTree, DE> || std::is_same_v<MTree_alt, DE>) { + de_index.add(insert_vec[i]); + } else { + de_index.insert(insert_vec[i]); + } + #else + de_index.insert(insert_vec[i]); + #endif + + applied_inserts++; + } + auto insert_stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(insert_stop - insert_start).count(); + } + + if constexpr (PROGRESS) { + psudb::progress_update(1.0, "inserting:"); + } + + size_t throughput = (((double) (applied_inserts + applied_deletes) / (double) total_time) * 1e9); + + fprintf(stdout, "%ld\t", throughput); + + return continue_benchmark; +} + +template <typename DE, de::RecordInterface R, typename QP, bool PROGRESS=true> +static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t trial_cnt=1) { + char progbuf[25]; + if constexpr (PROGRESS) { + sprintf(progbuf, "querying:"); + } + + size_t total_time = 0; + size_t total_results = 0; + + for (size_t i=0; i<trial_cnt; i++) { + if constexpr (PROGRESS) { + psudb::progress_update((double) (i) / (double) trial_cnt, progbuf); + } + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + auto res = de_index.query(&queries[j]); + + total_results += res.size(); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + psudb::progress_update(1.0, progbuf); + + size_t query_latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", query_latency); + fflush(stdout); + + return true; +} + + +template <typename Shard, de::RecordInterface R, typename QP, de::QueryInterface<R, Shard> Q, bool PROGRESS=true> +static bool static_latency_bench(Shard *shard, std::vector<QP> queries, size_t trial_cnt=100) { + char progbuf[25]; + if constexpr (PROGRESS) { + sprintf(progbuf, "querying:"); + } + + size_t total_time = 0; + size_t total_results = 0; + + for (size_t i=0; i<trial_cnt; i++) { + if constexpr (PROGRESS) { + psudb::progress_update((double) (i) / (double) trial_cnt, progbuf); + } + + std::vector<void *> states(1); + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + states[0] = Q::get_query_state(shard, &queries[j]); + Q::process_query_states(&queries[j], states, nullptr); + auto res = Q::query(shard, states[0], &queries[j]); + total_results += res.size(); + Q::delete_query_state(states[0]); + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + psudb::progress_update(1.0, progbuf); + + size_t query_latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", query_latency); + fflush(stdout); + + return true; +} diff --git a/benchmarks/include/triespline_bsm.h b/benchmarks/include/triespline_bsm.h new file mode 100644 index 0000000..eaf1079 --- /dev/null +++ b/benchmarks/include/triespline_bsm.h @@ -0,0 +1,156 @@ +#pragma once + +#include <cstdlib> +#include <vector> +#include <algorithm> +#include "ts/builder.h" + +template <typename K, typename V, size_t E=1024> +class BSMTrieSpline { +private: + typedef std::pair<K, V> R; + +public: + struct RangeQueryParameters { + K lower_bound; + K upper_bound; + }; + +public: + static BSMTrieSpline *build(std::vector<R> &records) { + if (records.size() == 0) { + return nullptr; + } + + std::sort(records.begin(), records.end()); + return new BSMTrieSpline(records); + } + + static BSMTrieSpline *build_presorted(std::vector<R> &records) { + if (records.size() == 0) { + return nullptr; + } + + return new BSMTrieSpline(records); + } + + std::vector<R> unbuild() { + return std::move(m_data); + } + + std::vector<R> query(void *q) { + std::vector<R> rs; + + /* return an empty result set if q is invalid */ + if (q == nullptr) { + rs.push_back({0, 0}); + return rs; + } + + auto parms = (BSMTrieSpline::RangeQueryParameters*) q; + + size_t idx = lower_bound(parms->lower_bound); + + size_t cnt = 0; + while (idx < m_data.size() && m_data[idx].first <= parms->upper_bound) { + cnt++; + idx++; + } + + rs.push_back({cnt, 0}); + + return std::move(rs); + } + + std::vector<R> query_merge(std::vector<R> &rsa, std::vector<R> &rsb, void *parms) { + /* initialize rsa on the first merge */ + if (rsa.size() == 0) { + rsa.push_back({0, 0}); + } + rsa[0].first += rsb[0].first; + return std::move(rsa); + } + + size_t record_count() { + return m_data.size(); + } + + ~BSMTrieSpline() = default; + +private: + std::vector<R> m_data; + K m_max_key; + K m_min_key; + ts::TrieSpline<K> m_ts; + + BSMTrieSpline(std::vector<R> &records) { + m_data = std::move(records); + m_min_key = m_data[0].first; + m_max_key = m_data[m_data.size() - 1].first; + + if (m_data.size() < 50) { + return; + } + + auto bldr = ts::Builder<K>(m_min_key, m_max_key, E); + for (size_t i=0; i<m_data.size(); i++) { + bldr.AddKey(m_data[i].first); + } + + if (m_data.size() > 1) { + m_ts = bldr.Finalize(); + } + } + + size_t lower_bound(K key) { + if (m_data.size() == 0) { + return 1; + } else if (m_data.size() < 50) { + for (size_t i=0; i<m_data.size(); i++) { + if (m_data[i].first >= key) { + return i; + } + } + + return m_data.size(); + } + + auto bound = m_ts.GetSearchBound(key); + size_t idx = bound.begin; + + if (idx >= m_data.size()) { + return m_data.size(); + } + + // If the region to search is less than some pre-specified + // amount, perform a linear scan to locate the record. + if (bound.end - bound.begin < 256) { + while (idx < bound.end && m_data[idx].first < key) { + idx++; + } + } else { + // Otherwise, perform a binary search + idx = bound.begin; + size_t max = bound.end; + + while (idx < max) { + size_t mid = (idx + max) / 2; + if (key > m_data[mid].first) { + idx = mid + 1; + } else { + max = mid; + } + } + } + + if (idx == m_data.size()) { + return m_data.size(); + } + + if (m_data[idx].first > key && idx > 0 && m_data[idx-1].first <= key) { + return idx-1; + } + + return idx; + } +}; diff --git a/benchmarks/include/vptree_bsm.h b/benchmarks/include/vptree_bsm.h new file mode 100644 index 0000000..2f12fcb --- /dev/null +++ b/benchmarks/include/vptree_bsm.h @@ -0,0 +1,317 @@ +#pragma once + +#include <cstdlib> +#include <vector> +#include <algorithm> +#include <limits> +#include <gsl/gsl_rng.h> + +#include "psu-ds/PriorityQueue.h" +#include "framework/interface/Record.h" + +template <typename R, size_t LEAFSZ=100> +class BSMVPTree { +public: + struct KNNQueryParms { + R point; + size_t k; + }; + +public: + static BSMVPTree *build(std::vector<R> &records) { + return new BSMVPTree(records); + } + + static BSMVPTree *build_presorted(std::vector<R> &records) { + return new BSMVPTree(records); + } + + std::vector<R> unbuild() { + return std::move(m_data); + } + + std::vector<R> query(void *q) { + std::vector<R> rs; + + /* return an empty result set if q is invalid */ + if (q == nullptr) { + return rs; + } + + auto parms = (BSMVPTree::KNNQueryParms*) q; + auto pq = psudb::PriorityQueue<R, de::DistCmpMax<R>>(parms->k, &parms->point); + + if (parms->k >= m_data.size()) { + for (size_t i=0; i<m_data.size(); i++) { + if (m_ptrs[i].ptr != nullptr) { + pq.push(m_ptrs[i].ptr); + } + } + } else { + double farthest = std::numeric_limits<double>::max(); + internal_search(m_root, parms->point, parms->k, pq, &farthest); + } + + size_t i=0; + while (pq.size() > 0) { + rs.push_back(*pq.peek().data); + pq.pop(); + } + return std::move(rs); + } + + std::vector<R> query_merge(std::vector<R> &rsa, std::vector<R> &rsb, void* parms) { + KNNQueryParms *p = (KNNQueryParms *) parms; + R rec = p->point; + size_t k = p->k; + + std::vector<R> output; + + psudb::PriorityQueue<R, de::DistCmpMax<R>> pq(k, &rec); + + for (size_t i=0; i<rsa.size(); i++) { + if (pq.size() < k) { + pq.push(&rsa[i]); + } else { + double head_dist = pq.peek().data->calc_distance(rec); + double cur_dist = rsa[i].calc_distance(rec); + + if (cur_dist < head_dist) { + pq.pop(); + pq.push(&rsa[i]); + } + } + } + + for (size_t i=0; i<rsb.size(); i++) { + if (pq.size() < k) { + pq.push(&rsb[i]); + } else { + double head_dist = pq.peek().data->calc_distance(rec); + double cur_dist = rsb[i].calc_distance(rec); + + if (cur_dist < head_dist) { + pq.pop(); + pq.push(&rsb[i]); + } + } + } + + while (pq.size() > 0) { + output.emplace_back(*pq.peek().data); + pq.pop(); + } + + return std::move(output); + } + + size_t record_count() { + return m_data.size(); + } + + ~BSMVPTree() { + delete m_root; + } + +private: + + struct vp_ptr { + R *ptr; + double dist; + }; + + struct vpnode { + size_t start; + size_t stop; + bool leaf; + + double radius; + vpnode *inside; + vpnode *outside; + + vpnode() : start(0), stop(0), leaf(false), radius(0.0), inside(nullptr), outside(nullptr) {} + + ~vpnode() { + delete inside; + delete outside; + } + }; + + std::vector<R> m_data; + std::vector<vp_ptr> m_ptrs; + vpnode *m_root; + + size_t m_node_cnt; + + BSMVPTree(std::vector<R> &records) { + m_data = std::move(records); + m_node_cnt = 0; + + for (size_t i=0; i<m_data.size(); i++) { + m_ptrs.push_back({&m_data[i], 0}); + } + + m_root = build_vptree(); + } + + vpnode *build_vptree() { + if (m_data.size() == 0) { + return nullptr; + } + + size_t lower = 0; + size_t upper = m_data.size() - 1; + + auto rng = gsl_rng_alloc(gsl_rng_mt19937); + auto root = build_subtree(lower, upper, rng); + gsl_rng_free(rng); + return root; + } + + vpnode *build_subtree(size_t start, size_t stop, gsl_rng *rng) { + /* + * base-case: sometimes happens (probably because of the +1 and -1 + * in the first recursive call) + */ + if (start > stop) { + return nullptr; + } + + /* base-case: create a leaf node */ + if (stop - start <= LEAFSZ) { + vpnode *node = new vpnode(); + node->start = start; + node->stop = stop; + node->leaf = true; + + m_node_cnt++; + return node; + } + + /* + * select a random element to be the root of the + * subtree + */ + auto i = start + gsl_rng_uniform_int(rng, stop - start + 1); + swap(start, i); + + /* for efficiency, we'll pre-calculate the distances between each point and the root */ + for (size_t i=start+1; i<=stop; i++) { + m_ptrs[i].dist = m_ptrs[start].ptr->calc_distance(*m_ptrs[i].ptr); + } + + /* + * partition elements based on their distance from the start, + * with those elements with distance falling below the median + * distance going into the left sub-array and those above + * the median in the right. This is easily done using QuickSelect. + */ + auto mid = (start + 1 + stop) / 2; + quickselect(start + 1, stop, mid, m_ptrs[start].ptr, rng); + + /* Create a new node based on this partitioning */ + vpnode *node = new vpnode(); + node->start = start; + + /* store the radius of the circle used for partitioning the node. */ + node->radius = m_ptrs[start].ptr->calc_distance(*m_ptrs[mid].ptr); + m_ptrs[start].dist = node->radius; + + /* recursively construct the left and right subtrees */ + node->inside = build_subtree(start + 1, mid-1, rng); + node->outside = build_subtree(mid, stop, rng); + + m_node_cnt++; + + return node; + } + + void quickselect(size_t start, size_t stop, size_t k, R *p, gsl_rng *rng) { + if (start == stop) return; + + auto pivot = partition(start, stop, p, rng); + + if (k < pivot) { + quickselect(start, pivot - 1, k, p, rng); + } else if (k > pivot) { + quickselect(pivot + 1, stop, k, p, rng); + } + } + + size_t partition(size_t start, size_t stop, R *p, gsl_rng *rng) { + auto pivot = start + gsl_rng_uniform_int(rng, stop - start); + + swap(pivot, stop); + + size_t j = start; + for (size_t i=start; i<stop; i++) { + if (m_ptrs[i].dist < m_ptrs[stop].dist) { + swap(j++, i); + } + } + + swap(j, stop); + return j; + } + + void swap(size_t idx1, size_t idx2) { + auto tmp = m_ptrs[idx1]; + m_ptrs[idx1] = m_ptrs[idx2]; + m_ptrs[idx2] = tmp; + } + + void internal_search(vpnode *node, const R &point, size_t k, psudb::PriorityQueue<R, + de::DistCmpMax<R>> &pq, double *farthest) { + + if (node == nullptr) return; + + if (node->leaf) { + for (size_t i=node->start; i<=node->stop; i++) { + double d = point.calc_distance(*m_ptrs[i].ptr); + if (d < *farthest) { + if (pq.size() == k) { + pq.pop(); + } + + pq.push(m_ptrs[i].ptr); + if (pq.size() == k) { + *farthest = point.calc_distance(*pq.peek().data); + } + } + } + + return; + } + + double d = point.calc_distance(*m_ptrs[node->start].ptr); + + if (d < *farthest) { + if (pq.size() == k) { + auto t = pq.peek().data; + pq.pop(); + } + pq.push(m_ptrs[node->start].ptr); + if (pq.size() == k) { + *farthest = point.calc_distance(*pq.peek().data); + } + } + + if (d < node->radius) { + if (d - (*farthest) <= node->radius) { + internal_search(node->inside, point, k, pq, farthest); + } + + if (d + (*farthest) >= node->radius) { + internal_search(node->outside, point, k, pq, farthest); + } + } else { + if (d + (*farthest) >= node->radius) { + internal_search(node->outside, point, k, pq, farthest); + } + + if (d - (*farthest) <= node->radius) { + internal_search(node->inside, point, k, pq, farthest); + } + } + } +}; diff --git a/benchmarks/irs_bench.cpp b/benchmarks/irs_bench.cpp deleted file mode 100644 index ddb4220..0000000 --- a/benchmarks/irs_bench.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * - */ - -#define ENABLE_TIMER - -#include <thread> - -#include "framework/DynamicExtension.h" -#include "shard/ISAMTree.h" -#include "query/irs.h" -#include "framework/interface/Record.h" -#include "include/data-proc.h" - -#include <gsl/gsl_rng.h> - -#include "psu-util/timer.h" - - -typedef de::Record<int64_t, int64_t> Rec; -typedef de::ISAMTree<Rec> ISAM; -typedef de::irs::Query<Rec, ISAM> Q; -typedef de::DynamicExtension<Rec, ISAM, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TOMBSTONE, de::SerialScheduler> Ext; -typedef de::irs::Parms<Rec> QP; - -void run_queries(Ext *extension, std::vector<QP> &queries, gsl_rng *rng) { - size_t total; - for (size_t i=0; i<queries.size(); i++) { - auto q = &queries[i]; - q->rng = rng; - q->sample_size = 1000; - - auto res = extension->query(q); - auto r = res.get(); - total += r.size(); - } - - fprintf(stderr, "%ld\n", total); -} - -size_t g_deleted_records = 0; -double delete_proportion = 0.05; - -void insert_records(Ext *extension, size_t start, - size_t stop, - std::vector<int64_t> &records, - std::vector<size_t> &to_delete, - size_t &delete_idx, - bool delete_records, - gsl_rng *rng) { - size_t reccnt = 0; - Rec r; - for (size_t i=start; i<stop; i++) { - r.key = records[i]; - r.value = i; - - while (!extension->insert(r)) { - usleep(1); - } - - if (delete_records && gsl_rng_uniform(rng) <= delete_proportion && to_delete[delete_idx] <= i) { - r.key = records[to_delete[delete_idx]]; - r.value = (int64_t) (to_delete[delete_idx]); - while (!extension->erase(r)) { - usleep(1); - } - delete_idx++; - g_deleted_records++; - } - } -} - -int main(int argc, char **argv) { - - if (argc < 4) { - fprintf(stderr, "insert_query_tput reccnt datafile queryfile\n"); - exit(EXIT_FAILURE); - } - - size_t n = atol(argv[1]); - std::string d_fname = std::string(argv[2]); - std::string q_fname = std::string(argv[3]); - - auto extension = new Ext(12000, 12001, 8, 0, 64); - gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); - - auto data = read_sosd_file(d_fname, n); - std::vector<size_t> to_delete(n * delete_proportion); - size_t j=0; - for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { - if (gsl_rng_uniform(rng) <= delete_proportion) { - to_delete[j++] = i; - } - } - auto queries = read_range_queries<QP>(q_fname, .001); - - /* warmup structure w/ 10% of records */ - size_t warmup = .3 * n; - size_t delete_idx = 0; - insert_records(extension, 0, warmup, data, to_delete, delete_idx, false, rng); - - extension->await_next_epoch(); - - TIMER_INIT(); - - TIMER_START(); - insert_records(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); - TIMER_STOP(); - - auto insert_latency = TIMER_RESULT(); - size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); - - TIMER_START(); - run_queries(extension, queries, rng); - TIMER_STOP(); - - auto query_latency = TIMER_RESULT() / queries.size(); - - fprintf(stdout, "T\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, g_deleted_records); - - gsl_rng_free(rng); - delete extension; - fflush(stderr); -} - diff --git a/benchmarks/old-bench/include/bench_utility.h b/benchmarks/old-bench/include/bench_utility.h index e33b93d..f495f18 100644 --- a/benchmarks/old-bench/include/bench_utility.h +++ b/benchmarks/old-bench/include/bench_utility.h @@ -79,8 +79,6 @@ struct cosine_similarity { } }; -typedef tlx::BTree<key_type, btree_record, btree_key_extract> TreeMap; -typedef mt::mtree<Word2VecRec, euclidean_distance> MTree; template <de::RecordInterface R> static bool build_insert_vec(std::fstream &file, std::vector<R> &vec, size_t n, diff --git a/benchmarks/poplar_trie.cpp b/benchmarks/poplar_trie.cpp new file mode 100644 index 0000000..6c47465 --- /dev/null +++ b/benchmarks/poplar_trie.cpp @@ -0,0 +1,102 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <fstream> +#include <sstream> + +#include "poplar.hpp" + +#include "psu-util/timer.h" +#include "psu-util/progress.h" + +std::vector<std::string> strings; + +typedef poplar::plain_bonsai_map<int> Trie; + +void insert_thread(int64_t start, int64_t end, Trie * trie) { + for (uint64_t i=start; i<end; i++) { + auto res = trie->update(strings[i]); + *res = i+1; + } +} + +void read_data(std::string fname, size_t n=10000000) { + strings.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in); + + size_t i=0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(line); + i++; + psudb::progress_update((double) i / (double) n, "Reading file:"); + } +} + +void usage(char *name) { + fprintf(stderr, "Usage:\n%s datafile record_count\n", name); +} + +int main(int argc, char **argv) { + + if (argc < 3) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + std::string fname = std::string(argv[1]); + size_t n = atol(argv[2]); + + read_data(fname, n); + + if (strings.size() == 0) { + fprintf(stderr, "[E]: No string data read from file. Aborting execution.\n"); + } else { + fprintf(stderr, "Finished reading from file.\n"); + } + + auto trie = new Trie(); + size_t warmup = strings.size()*.1; + insert_thread(0, warmup, trie); + + TIMER_INIT(); + TIMER_START(); + insert_thread(warmup, strings.size(), trie); + TIMER_STOP(); + + auto total_time = TIMER_RESULT(); + + size_t m = 100; + TIMER_START(); + for (size_t i=0; i<m; i++) { + size_t j = rand() % strings.size(); + + auto res = trie->find(strings[j]); + if (*res != (j+1)) { + fprintf(stderr, "%ld %d %s\n", j, *res, strings[j].c_str()); + } + //assert(*(res)+1 == j); + } + TIMER_STOP(); + + auto query_time = TIMER_RESULT(); + + + double i_tput = (double) n / (double) total_time * 1e9; + size_t q_lat = query_time / m; + + fprintf(stdout, "%ld\t\t%lf\t%ld\n", trie->size(), + i_tput, q_lat); + + trie->show_stats(std::cerr, 1); + + delete trie; + + fflush(stderr); +} + diff --git a/benchmarks/string_insertion_tput.cpp b/benchmarks/string_insertion_tput.cpp new file mode 100644 index 0000000..c439cb3 --- /dev/null +++ b/benchmarks/string_insertion_tput.cpp @@ -0,0 +1,114 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <fstream> +#include <sstream> +#include <vector> + +#include "framework/DynamicExtension.h" +#include "shard/FSTrie.h" +#include "query/pointlookup.h" +#include "framework/interface/Record.h" + +#include "psu-util/timer.h" +#include "psu-util/progress.h" + + +typedef de::Record<const char *, uint64_t> Rec; +typedef de::FSTrie<Rec> Trie; +typedef de::pl::Query<Rec, Trie> Q; +typedef de::DynamicExtension<Rec, Trie, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TAGGING, de::SerialScheduler> Ext; + + +void insert_thread(int64_t start, int64_t end, Ext *extension) { + for (uint64_t i=start; i<end; i++) { + Rec r = {strings[i].get(), i, strlen(strings[i].get())}; + while (!extension->insert(r)) { + _mm_pause(); + } + } +} + +std::vector<std::unique_ptr<char[]>>read_strings(std::string fname, size_t n=10000000) { + std::vector<std::unique_ptr<char[]>> strings; + strings.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in); + + size_t i=0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(std::unique_ptr<char[]>(strdup(line.c_str()))); + i++; + psudb::progress_update((double) i / (double) n, "Reading file:"); + } + + return strings; +} + +void usage(char *name) { + fprintf(stderr, "Usage:\n%s datafile record_count\n", name); +} + +int main(int argc, char **argv) { + + if (argc < 3) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + std::string fname = std::string(argv[1]); + size_t n = atol(argv[2]); + + read_data(fname, n); + + if (strings.size() == 0) { + fprintf(stderr, "[E]: No string data read from file. Aborting execution.\n"); + } else { + fprintf(stderr, "Finished reading from file.\n"); + } + + std::vector<size_t> scale_factors = {2, 4, 6, 8, 10, 12}; + std::vector<size_t> buffer_sizes = {1000, 2000, 5000, 10000, 12000, 15000}; + + for (auto &sf : scale_factors) { + for (auto &bf_sz : buffer_sizes) { + auto extension = new Ext(bf_sz, bf_sz, sf); + + TIMER_INIT(); + TIMER_START(); + insert_thread(0, strings.size(), extension); + TIMER_STOP(); + + auto total_time = TIMER_RESULT(); + + size_t m = 100; + TIMER_START(); + for (size_t i=0; i<m; i++) { + size_t j = rand() % strings.size(); + de::pl::Parms<Rec> parms = {strings[j].get()}; + + auto res = extension->query(&parms); + auto ans = res.get(); + } + TIMER_STOP(); + + auto query_time = TIMER_RESULT(); + + double i_tput = (double) n / (double) total_time * 1e9; + size_t q_lat = query_time / m; + + fprintf(stdout, "%ld\t%ld\t%ld\t%lf\t%ld\t%ld\n", extension->get_record_count(), + bf_sz, sf, i_tput, q_lat, extension->get_memory_usage()); + + delete extension; + + } + } + fflush(stderr); +} + diff --git a/benchmarks/vldb/alex_bench.cpp b/benchmarks/vldb/alex_bench.cpp new file mode 100644 index 0000000..ba687f3 --- /dev/null +++ b/benchmarks/vldb/alex_bench.cpp @@ -0,0 +1,144 @@ +#define ENABLE_TIMER + +#include "alex.h" + +#include "file_util.h" +#include "psu-util/progress.h" +#include "psu-util/timer.h" + +typedef uint64_t key_type; +typedef uint64_t value_type; + +typedef alex::Alex<key_type, value_type> Alex; + +struct record { + key_type key; + value_type value; +}; + +struct query { + key_type lower_bound; + key_type upper_bound; +}; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +static size_t g_deleted_records = 0; +static double delete_proportion = 0.05; + +static void insert_records(Alex *structure, size_t start, size_t stop, + std::vector<record> &records, std::vector<size_t> &to_delete, + size_t &delete_idx, bool delete_records, gsl_rng *rng) { + + psudb::progress_update(0, "Insert Progress"); + size_t reccnt = 0; + for (size_t i=start; i<stop; i++) { + structure->insert(records[i].key, records[i].value); + + if (delete_records && gsl_rng_uniform(rng) <= + delete_proportion && to_delete[delete_idx] <= i) { + + structure->erase_one(records[i].key); + delete_idx++; + g_deleted_records++; + } + } + + psudb::progress_update(1, "Insert Progress"); +} + +size_t g_global_cnt = 0; + +static void run_queries(Alex *alex, std::vector<query> &queries) { + for (size_t i=0; i<queries.size(); i++) { + size_t cnt=0; + auto ptr = alex->find(queries[i].lower_bound); + while (ptr != alex->end() && ptr.key() <= queries[i].upper_bound) { + cnt++; + ptr++; + } + + g_global_cnt += cnt; + } +} + +Alex *warmup_alex(std::vector<record> records, size_t cnt) { + if (cnt >= records.size()) { + fprintf(stderr, "[E] Requesting warmup with more records than are available.\n"); + exit(EXIT_FAILURE); + } + + auto alex = new Alex(); + std::pair<key_type, value_type> *insert_vec = new std::pair<key_type, value_type>[cnt]; + + for (size_t i=0; i<cnt; i++) { + insert_vec[i] = {records[i].key, records[i].value}; + } + + std::sort(insert_vec, insert_vec + cnt); + alex->bulk_load(insert_vec, cnt); + delete[] insert_vec; + + return alex; +} + +int main(int argc, char **argv) +{ + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + gsl_rng *rng = gsl_rng_alloc(gsl_rng_mt19937); + + + auto data = read_sosd_file<record>(d_fname, n); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + + auto queries = read_range_queries<query>(q_fname, .0001); + + + size_t warmup = .1 * n; + size_t delete_idx = 0; + + auto alex = warmup_alex(data, warmup); + + TIMER_INIT(); + + TIMER_START(); + insert_records(alex, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries(alex, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = alex->model_size() + alex->data_size() - (alex->size() * sizeof(record)); + + fprintf(stdout, "%ld\t%ld\t%lld\t%ld\n", insert_throughput, query_latency, ext_size, g_global_cnt); + fflush(stdout); + + gsl_rng_free(rng); + fflush(stderr); + + delete alex; + + exit(EXIT_SUCCESS); +} diff --git a/benchmarks/vldb/btree_bench.cpp b/benchmarks/vldb/btree_bench.cpp new file mode 100644 index 0000000..fa72831 --- /dev/null +++ b/benchmarks/vldb/btree_bench.cpp @@ -0,0 +1,90 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "shard/ISAMTree.h" +#include "query/irs.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "benchmark_types.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" +#include "standard_benchmarks.h" +#include "psu-ds/BTree.h" + +typedef btree_record<int64_t, int64_t> Rec; + +typedef de::ISAMTree<Rec> Shard; +typedef de::irs::Query<Rec, Shard> Q; +typedef de::irs::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto btree = BenchBTree(); + + gsl_rng *rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto data = read_sosd_file<Rec>(d_fname, n); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + /* read in the range queries and add sample size and rng for sampling */ + auto queries = read_range_queries<QP>(q_fname, .0001); + for (auto &q : queries) { + q.sample_size = 1000; + q.rng = rng; + } + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<BenchBTree, Rec>(&btree, 0, warmup, data, to_delete, delete_idx, false, rng); + + TIMER_INIT(); + + TIMER_START(); + insert_records<BenchBTree, Rec>(&btree, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_btree_queries<Rec>(&btree, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto btree_size = btree.get_stats().inner_nodes * psudb::btree_default_traits<int64_t, Rec>::inner_slots * (sizeof(int64_t) + sizeof(void*)); + + /* account for memory wasted on gaps in the structure */ + btree_size += btree.get_stats().leaves * psudb::btree_default_traits<int64_t, Rec>::leaf_slots * sizeof(Rec); + btree_size -= btree.size() * sizeof(Rec); + + fprintf(stdout, "%ld\t%ld\t%ld\n", insert_throughput, query_latency, btree_size); + + gsl_rng_free(rng); + fflush(stderr); +} + diff --git a/benchmarks/btree_insert_query_tput.cpp b/benchmarks/vldb/btree_thread_scaling_bench.cpp index f838f80..557e966 100644 --- a/benchmarks/btree_insert_query_tput.cpp +++ b/benchmarks/vldb/btree_thread_scaling_bench.cpp @@ -7,8 +7,8 @@ #include <thread> #include "query/irs.h" -#include "include/data-proc.h" -#include "psu-ds/BTree.h" +#include "benchmark_types.h" +#include "file_util.h" #include <mutex> #include <gsl/gsl_rng.h> @@ -16,7 +16,7 @@ #include "psu-util/timer.h" -typedef de::Record<int64_t, int64_t> Rec; +typedef btree_record<int64_t, int64_t> Rec; typedef de::irs::Parms<Rec> QP; std::atomic<bool> inserts_done = false; @@ -46,11 +46,11 @@ void query_thread(BenchBTree *tree, std::vector<QP> *queries) { gsl_rng_free(rng); } -void insert_thread(BenchBTree *tree, size_t start, std::vector<int64_t> *records) { +void insert_thread(BenchBTree *tree, size_t start, std::vector<Rec> *records) { size_t reccnt = 0; for (size_t i=start; i<records->size(); i++) { - btree_record r; - r.key = (*records)[i]; + btree_record<int64_t, int64_t> r; + r.key = (*records)[i].key; r.value = i; g_btree_lock.lock(); @@ -80,15 +80,15 @@ int main(int argc, char **argv) { auto tree = new BenchBTree(); gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); - auto data = read_sosd_file(d_fname, n); + auto data = read_sosd_file<Rec>(d_fname, n); auto queries = read_range_queries<QP>(q_fname, .001); /* warmup structure w/ 10% of records */ size_t warmup = .1 * n; for (size_t i=0; i<warmup; i++) { - btree_record r; - r.key = data[i]; - r.value = i; + btree_record<int64_t, int64_t> r; + r.key = data[i].key; + r.value = data[i].value; tree->insert(r); } diff --git a/benchmarks/vldb/dynamic_pgm_bench.cpp b/benchmarks/vldb/dynamic_pgm_bench.cpp new file mode 100644 index 0000000..15b130f --- /dev/null +++ b/benchmarks/vldb/dynamic_pgm_bench.cpp @@ -0,0 +1,77 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <thread> + +#include "query/rangecount.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef de::Record<uint64_t, uint64_t> Rec; +typedef de::rc::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + std::vector<std::pair<uint64_t, uint64_t>> tmp_data; + PGM pgm(tmp_data.begin(), tmp_data.end()); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto data = read_sosd_file<Rec>(d_fname, n); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + auto queries = read_range_queries<QP>(q_fname, .0001); + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<PGM, Rec>(&pgm, 0, warmup, data, to_delete, delete_idx, false, rng); + + TIMER_INIT(); + + TIMER_START(); + insert_records<PGM, Rec>(&pgm, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<PGM, QP>(&pgm, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = pgm.index_size_in_bytes(); + + fprintf(stdout, "%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size); + + gsl_rng_free(rng); + fflush(stderr); +} + diff --git a/benchmarks/vldb/fst_bench.cpp b/benchmarks/vldb/fst_bench.cpp new file mode 100644 index 0000000..276a922 --- /dev/null +++ b/benchmarks/vldb/fst_bench.cpp @@ -0,0 +1,100 @@ +/* + * + */ + +#define ENABLE_TIMER +#define TS_TEST + +#include <thread> + +#include "framework/DynamicExtension.h" +#include "shard/FSTrie.h" +#include "query/pointlookup.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef de::Record<const char *, uint64_t> Rec; +typedef de::FSTrie<Rec> Shard; +typedef de::pl::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TOMBSTONE, de::SerialScheduler> Ext; +typedef de::pl::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 3) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + + auto extension = new Ext(12000, 12001, 8, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto strings = read_string_file(d_fname, n); + auto queries = generate_string_lookup_queries<QP>(strings, 1000, rng); + + std::vector<Rec> data; + for (size_t i=0; i<strings.size(); i++) { + data.push_back({strings[i].get(), i, strlen(strings[i].get())}); + } + + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<strings.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto shard = extension->create_static_structure(); + + TIMER_START(); + run_static_queries<Shard, QP, Q>(shard, queries); + TIMER_STOP(); + + auto static_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + auto static_size = shard->get_memory_usage(); //+ shard->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size, static_latency, static_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); +} + diff --git a/benchmarks/vldb/fst_bsm_bench.cpp b/benchmarks/vldb/fst_bsm_bench.cpp new file mode 100644 index 0000000..15a441a --- /dev/null +++ b/benchmarks/vldb/fst_bsm_bench.cpp @@ -0,0 +1,100 @@ +/* + * + */ + +#define ENABLE_TIMER +#define TS_TEST + +#include <thread> + +#include "framework/DynamicExtension.h" +#include "shard/FSTrie.h" +#include "query/pointlookup.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef de::Record<const char *, uint64_t> Rec; +typedef de::FSTrie<Rec> Shard; +typedef de::pl::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::BSM, de::DeletePolicy::TOMBSTONE, de::SerialScheduler> Ext; +typedef de::pl::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 3) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + + auto extension = new Ext(1, 12001, 2, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto strings = read_string_file(d_fname, n); + auto queries = generate_string_lookup_queries<QP>(strings, 1000, rng); + + std::vector<Rec> data; + for (size_t i=0; i<strings.size(); i++) { + data.push_back({strings[i].get(), i, strlen(strings[i].get())}); + } + + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<strings.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto shard = extension->create_static_structure(); + + TIMER_START(); + run_static_queries<Shard, QP, Q>(shard, queries); + TIMER_STOP(); + + auto static_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + auto static_size = shard->get_memory_usage(); //+ shard->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size, static_latency, static_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); +} + diff --git a/benchmarks/vldb/irs_bench.cpp b/benchmarks/vldb/irs_bench.cpp new file mode 100644 index 0000000..e062e80 --- /dev/null +++ b/benchmarks/vldb/irs_bench.cpp @@ -0,0 +1,97 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "framework/DynamicExtension.h" +#include "shard/ISAMTree.h" +#include "query/irs.h" +#include "framework/interface/Record.h" +#include "file_util.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" +#include "standard_benchmarks.h" + + +typedef de::Record<uint64_t, uint64_t> Rec; +typedef de::ISAMTree<Rec> Shard; +typedef de::irs::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TAGGING, de::SerialScheduler> Ext; +typedef de::irs::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto extension = new Ext(12000, 12001, 8, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto data = read_sosd_file<Rec>(d_fname, n); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + /* read in the range queries and add sample size and rng for sampling */ + auto queries = read_range_queries<QP>(q_fname, .0001); + for (auto &q : queries) { + q.sample_size = 1000; + q.rng = rng; + } + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto shard = extension->create_static_structure(); + + TIMER_START(); + run_static_queries<Shard, QP, Q>(shard, queries); + TIMER_STOP(); + + auto static_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + auto static_size = shard->get_memory_usage();// + shard->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size, static_latency, static_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); +} + diff --git a/benchmarks/vldb/mtree_bench.cpp b/benchmarks/vldb/mtree_bench.cpp new file mode 100644 index 0000000..cc2f41f --- /dev/null +++ b/benchmarks/vldb/mtree_bench.cpp @@ -0,0 +1,82 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "query/knn.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef Word2VecRec Rec; +typedef de::knn::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto mtree = new MTree(); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + fprintf(stderr, "[I] Reading data file...\n"); + auto data = read_vector_file<Rec, 300>(d_fname, n); + + fprintf(stderr, "[I] Generating delete vector\n"); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + fprintf(stderr, "[I] Reading Queries\n"); + auto queries = read_knn_queries<QP>(q_fname, 1000); + + fprintf(stderr, "[I] Warming up structure...\n"); + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<MTree, Rec>(mtree, 0, warmup, data, to_delete, delete_idx, false, rng); + + TIMER_INIT(); + + fprintf(stderr, "[I] Running Insertion Benchmark\n"); + TIMER_START(); + insert_records<MTree, Rec>(mtree, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + fprintf(stderr, "[I] Running Query Benchmark\n"); + TIMER_START(); + run_queries<MTree, QP>(mtree, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto size = mtree->size() - sizeof(Rec)*(data.size() - to_delete.size()); + + fprintf(stdout, "%ld\t%ld\t%ld\n", insert_throughput, query_latency, size); + + gsl_rng_free(rng); + delete mtree; + fflush(stderr); +} + diff --git a/benchmarks/vldb/mtree_bench_alt.cpp b/benchmarks/vldb/mtree_bench_alt.cpp new file mode 100644 index 0000000..50c6117 --- /dev/null +++ b/benchmarks/vldb/mtree_bench_alt.cpp @@ -0,0 +1,82 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "query/knn.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef ANNRec Rec; +typedef de::knn::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto mtree = new MTree_alt(); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + fprintf(stderr, "[I] Reading data file...\n"); + auto data = read_binary_vector_file<Rec>(d_fname, n); + + fprintf(stderr, "[I] Generating delete vector\n"); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + fprintf(stderr, "[I] Reading Queries\n"); + auto queries = read_binary_knn_queries<QP>(q_fname, 1000, 100); + + fprintf(stderr, "[I] Warming up structure...\n"); + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<MTree_alt, Rec>(mtree, 0, warmup, data, to_delete, delete_idx, false, rng); + + TIMER_INIT(); + + fprintf(stderr, "[I] Running Insertion Benchmark\n"); + TIMER_START(); + insert_records<MTree_alt, Rec>(mtree, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + fprintf(stderr, "[I] Running Query Benchmark\n"); + TIMER_START(); + run_queries<MTree_alt, QP>(mtree, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto size = mtree->size() - sizeof(Rec)*(data.size() - to_delete.size()); + + fprintf(stdout, "%ld\t%ld\t%ld\n", insert_throughput, query_latency, size); + + gsl_rng_free(rng); + delete mtree; + fflush(stderr); +} + diff --git a/benchmarks/vldb/pgm_bench.cpp b/benchmarks/vldb/pgm_bench.cpp new file mode 100644 index 0000000..cec95df --- /dev/null +++ b/benchmarks/vldb/pgm_bench.cpp @@ -0,0 +1,94 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <thread> + +#include "framework/DynamicExtension.h" +#include "shard/PGM.h" +#include "query/rangecount.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef de::Record<uint64_t, uint64_t> Rec; +typedef de::PGM<Rec> Shard; +typedef de::rc::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TOMBSTONE, de::SerialScheduler> Ext; +typedef de::rc::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto extension = new Ext(12000, 12001, 8, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto data = read_sosd_file<Rec>(d_fname, n); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + auto queries = read_range_queries<QP>(q_fname, .0001); + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto shard = extension->create_static_structure(); + + TIMER_START(); + run_static_queries<Shard, QP, Q>(shard, queries); + TIMER_STOP(); + + auto static_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + auto static_size = shard->get_memory_usage(); // + shard->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size, static_latency, static_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); +} + diff --git a/benchmarks/insert_query_tput.cpp b/benchmarks/vldb/thread_scaling_bench.cpp index ce05264..b679e92 100644 --- a/benchmarks/insert_query_tput.cpp +++ b/benchmarks/vldb/thread_scaling_bench.cpp @@ -10,7 +10,8 @@ #include "shard/ISAMTree.h" #include "query/irs.h" #include "framework/interface/Record.h" -#include "include/data-proc.h" +#include "file_util.h" +#include <ctime> #include <gsl/gsl_rng.h> @@ -25,6 +26,8 @@ typedef de::irs::Parms<Rec> QP; std::atomic<bool> inserts_done = false; +struct timespec delay = {0, 500}; + void query_thread(Ext *extension, std::vector<QP> *queries) { gsl_rng *rng = gsl_rng_alloc(gsl_rng_mt19937); size_t total = 0; @@ -39,7 +42,7 @@ void query_thread(Ext *extension, std::vector<QP> *queries) { auto res = extension->query(&q); auto r = res.get(); total += r.size(); - usleep(1); + nanosleep(&delay, nullptr); } fprintf(stderr, "%ld\n", total); @@ -47,47 +50,39 @@ void query_thread(Ext *extension, std::vector<QP> *queries) { gsl_rng_free(rng); } -void insert_thread(Ext *extension, size_t start, std::vector<int64_t> *records) { - size_t reccnt = 0; - Rec r; - for (size_t i=start; i<records->size(); i++) { - r.key = (*records)[i]; - r.value = i; - - while (!extension->insert(r)) { - usleep(1); +void insert_thread(Ext *extension, size_t start, size_t stop, std::vector<Rec> *records) { + fprintf(stderr, "%ld\t%ld\n", start, stop); + for (size_t i=start; i<stop; i++) { + while (!extension->insert((*records)[i])) { + nanosleep(&delay, nullptr); } } - - inserts_done.store(true); } int main(int argc, char **argv) { - if (argc < 5) { - fprintf(stderr, "insert_query_tput reccnt query_threads datafile queryfile\n"); + if (argc < 6) { + fprintf(stderr, "Usage:\n"); + fprintf(stderr, "%s reccnt insert_threads query_threads datafile queryfile\n", argv[0]); exit(EXIT_FAILURE); } size_t n = atol(argv[1]); - size_t qthread_cnt = atol(argv[2]); - std::string d_fname = std::string(argv[3]); - std::string q_fname = std::string(argv[4]); + size_t ithread_cnt = atol(argv[2]); + size_t qthread_cnt = atol(argv[3]); + std::string d_fname = std::string(argv[4]); + std::string q_fname = std::string(argv[5]); auto extension = new Ext(1000, 12000, 8, 0, 64); gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); - auto data = read_sosd_file(d_fname, n); + auto data = read_sosd_file<Rec>(d_fname, n); auto queries = read_range_queries<QP>(q_fname, .001); /* warmup structure w/ 10% of records */ size_t warmup = .1 * n; - Rec r; for (size_t i=0; i<warmup; i++) { - r.key = data[i]; - r.value = gsl_rng_uniform_int(rng, n); - - while (!extension->insert(r)) { + while (!extension->insert(data[i])) { usleep(1); } } @@ -96,14 +91,26 @@ int main(int argc, char **argv) { TIMER_INIT(); + std::vector<std::thread> ithreads(ithread_cnt); std::vector<std::thread> qthreads(qthread_cnt); TIMER_START(); - std::thread i_thrd(insert_thread, extension, warmup, &data); + size_t start = warmup; + size_t per_thread = (n - warmup) / ithread_cnt; + for (size_t i=0; i<ithread_cnt; i++) { + ithreads[i] = std::thread(insert_thread, extension, start, start + per_thread, &data); + start += per_thread; + } + for (size_t i=0; i<qthread_cnt; i++) { qthreads[i] = std::thread(query_thread, extension, &queries); } - i_thrd.join(); + + for (size_t i=0; i<ithread_cnt; i++) { + ithreads[i].join(); + } + + inserts_done.store(true); TIMER_STOP(); for (size_t i=0; i<qthread_cnt; i++) { diff --git a/benchmarks/vldb/ts_bench.cpp b/benchmarks/vldb/ts_bench.cpp new file mode 100644 index 0000000..81a430a --- /dev/null +++ b/benchmarks/vldb/ts_bench.cpp @@ -0,0 +1,95 @@ +/* + * + */ + +#define ENABLE_TIMER +#define TS_TEST + +#include <thread> + +#include "framework/DynamicExtension.h" +#include "shard/TrieSpline.h" +#include "query/rangecount.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef de::Record<uint64_t, uint64_t> Rec; +typedef de::TrieSpline<Rec> Shard; +typedef de::rc::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TOMBSTONE, de::SerialScheduler> Ext; +typedef de::rc::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto extension = new Ext(8000, 12001, 8, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto data = read_sosd_file<Rec>(d_fname, n); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + auto queries = read_range_queries<QP>(q_fname, .0001); + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto shard = extension->create_static_structure(); + + TIMER_START(); + run_static_queries<Shard, QP, Q>(shard, queries); + TIMER_STOP(); + + auto static_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + auto static_size = shard->get_memory_usage(); //+ shard->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size, static_latency, static_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); +} + diff --git a/benchmarks/vldb/ts_bsm_bench.cpp b/benchmarks/vldb/ts_bsm_bench.cpp new file mode 100644 index 0000000..4511350 --- /dev/null +++ b/benchmarks/vldb/ts_bsm_bench.cpp @@ -0,0 +1,95 @@ +/* + * + */ + +#define ENABLE_TIMER +#define TS_TEST + +#include <thread> + +#include "framework/DynamicExtension.h" +#include "shard/TrieSpline.h" +#include "query/rangecount.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef de::Record<uint64_t, uint64_t> Rec; +typedef de::TrieSpline<Rec> Shard; +typedef de::rc::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::BSM, de::DeletePolicy::TOMBSTONE, de::SerialScheduler> Ext; +typedef de::rc::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto extension = new Ext(1, 12001, 2, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto data = read_sosd_file<Rec>(d_fname, n); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + auto queries = read_range_queries<QP>(q_fname, .0001); + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto shard = extension->create_static_structure(); + + TIMER_START(); + run_static_queries<Shard, QP, Q>(shard, queries); + TIMER_STOP(); + + auto static_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + auto static_size = shard->get_memory_usage(); //+ shard->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size, static_latency, static_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); +} + diff --git a/benchmarks/vldb/ts_mdsp_bench.cpp b/benchmarks/vldb/ts_mdsp_bench.cpp new file mode 100644 index 0000000..cc0cd99 --- /dev/null +++ b/benchmarks/vldb/ts_mdsp_bench.cpp @@ -0,0 +1,70 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <thread> + +#include "triespline_bsm.h" +#include "psu-util/bentley-saxe.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "query/rangecount.h" +#include "psu-util/timer.h" +#include "standard_benchmarks.h" + +typedef std::pair<uint64_t, uint64_t> Rec; +typedef de::Record<uint64_t, uint64_t> FRec; + +typedef BSMTrieSpline<uint64_t, uint64_t> Shard; +typedef de::rc::Parms<FRec> QP; +typedef psudb::bsm::BentleySaxe<Rec, Shard, true> Ext; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto extension = new psudb::bsm::BentleySaxe<Rec, Shard, true>(); + gsl_rng *rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto data = read_sosd_file_pair<uint64_t, uint64_t>(d_fname, n); + auto queries = read_range_queries<QP>(q_fname, .0001); + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + insert_records<Shard, Rec, true>(extension, 0, warmup, data); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Shard, Rec, true>(extension, warmup, data.size(), data); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext, QP, true>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + fprintf(stdout, "%ld\t%ld\n", insert_throughput, query_latency); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); +} + diff --git a/benchmarks/vldb/ts_parmsweep.cpp b/benchmarks/vldb/ts_parmsweep.cpp new file mode 100644 index 0000000..2c9412a --- /dev/null +++ b/benchmarks/vldb/ts_parmsweep.cpp @@ -0,0 +1,124 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "framework/DynamicExtension.h" +#include "shard/TrieSpline.h" +#include "query/rangecount.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef de::Record<uint64_t, uint64_t> Rec; +typedef de::TrieSpline<Rec> Shard; +typedef de::rc::Query<Rec, Shard, true> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TOMBSTONE, de::SerialScheduler> Ext; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::LEVELING, de::DeletePolicy::TOMBSTONE, de::SerialScheduler> Ext2; +typedef de::rc::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto data = read_sosd_file<Rec>(d_fname, n); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + auto queries = read_range_queries<QP>(q_fname, .001); + + const std::vector<de::LayoutPolicy> policies = {de::LayoutPolicy::LEVELING, de::LayoutPolicy::TEIRING}; + const std::vector<size_t> buffer_sizes = {1000, 4000, 8000, 12000, 15000, 20000}; + const std::vector<size_t> scale_factors = {2, 4, 6, 8, 10, 12}; + + for (const auto &bs : buffer_sizes) { + for (const auto &sf : scale_factors) { + auto extension = new Ext(bs, bs, sf, 0, 64); + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + + fprintf(stdout, "TIERING\t%ld\t%ld\t%ld\t%ld\t%ld\n", bs, sf, insert_throughput, query_latency, ext_size); + delete extension; + } + } + + for (const auto &bs : buffer_sizes) { + for (const auto &sf : scale_factors) { + auto extension = new Ext2(bs, bs, sf, 0, 64); + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext2, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext2, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext2, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + + fprintf(stdout, "LEVELING\t%ld\t%ld\t%ld\t%ld\t%ld\n", bs, sf, insert_throughput, query_latency, ext_size); + delete extension; + } + } + + gsl_rng_free(rng); + fflush(stderr); +} + diff --git a/benchmarks/vldb/vptree_bench.cpp b/benchmarks/vldb/vptree_bench.cpp new file mode 100644 index 0000000..0b98a52 --- /dev/null +++ b/benchmarks/vldb/vptree_bench.cpp @@ -0,0 +1,102 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "framework/DynamicExtension.h" +#include "shard/VPTree.h" +#include "query/knn.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef Word2VecRec Rec; + +typedef de::VPTree<Rec, 100, true> Shard; +typedef de::knn::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TAGGING, de::SerialScheduler> Ext; +typedef de::knn::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto extension = new Ext(1400, 1400, 8, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + fprintf(stderr, "[I] Reading data file...\n"); + auto data = read_vector_file<Rec, 300>(d_fname, n); + + fprintf(stderr, "[I] Generating delete vector\n"); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + fprintf(stderr, "[I] Reading Queries\n"); + auto queries = read_knn_queries<QP>(q_fname, 1000); + + fprintf(stderr, "[I] Warming up structure...\n"); + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + fprintf(stderr, "[I] Running Insertion Benchmark\n"); + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + fprintf(stderr, "[I] Running Query Benchmark\n"); + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto shard = extension->create_static_structure(); + + fprintf(stderr, "Running Static query tests\n\n"); + TIMER_START(); + run_static_queries<Shard, QP, Q>(shard, queries); + TIMER_STOP(); + + auto static_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + auto static_size = shard->get_memory_usage(); // + shard->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size, static_latency, static_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); + fflush(stdout); +} + diff --git a/benchmarks/vldb/vptree_bench_alt.cpp b/benchmarks/vldb/vptree_bench_alt.cpp new file mode 100644 index 0000000..b09ee7d --- /dev/null +++ b/benchmarks/vldb/vptree_bench_alt.cpp @@ -0,0 +1,102 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "framework/DynamicExtension.h" +#include "shard/VPTree.h" +#include "query/knn.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef ANNRec Rec; + +typedef de::VPTree<Rec, 100, true> Shard; +typedef de::knn::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TAGGING, de::SerialScheduler> Ext; +typedef de::knn::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto extension = new Ext(1400, 1400, 8, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + fprintf(stderr, "[I] Reading data file...\n"); + auto data = read_binary_vector_file<Rec>(d_fname, n); + + fprintf(stderr, "[I] Generating delete vector\n"); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + fprintf(stderr, "[I] Reading Queries\n"); + auto queries = read_binary_knn_queries<QP>(q_fname, 1000, 100); + + fprintf(stderr, "[I] Warming up structure...\n"); + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + fprintf(stderr, "[I] Running Insertion Benchmark\n"); + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + fprintf(stderr, "[I] Running Query Benchmark\n"); + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto shard = extension->create_static_structure(); + + fprintf(stderr, "Running Static query tests\n\n"); + TIMER_START(); + run_static_queries<Shard, QP, Q>(shard, queries); + TIMER_STOP(); + + auto static_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + auto static_size = shard->get_memory_usage(); // + shard->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size, static_latency, static_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); + fflush(stdout); +} + diff --git a/benchmarks/vldb/vptree_bsm_bench.cpp b/benchmarks/vldb/vptree_bsm_bench.cpp new file mode 100644 index 0000000..4a7fcb6 --- /dev/null +++ b/benchmarks/vldb/vptree_bsm_bench.cpp @@ -0,0 +1,102 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "framework/DynamicExtension.h" +#include "shard/VPTree.h" +#include "query/knn.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef Word2VecRec Rec; + +typedef de::VPTree<Rec, 100, true> Shard; +typedef de::knn::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::BSM, de::DeletePolicy::TAGGING, de::SerialScheduler> Ext; +typedef de::knn::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto extension = new Ext(1, 1400, 2, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + fprintf(stderr, "[I] Reading data file...\n"); + auto data = read_vector_file<Rec, 300>(d_fname, n); + + fprintf(stderr, "[I] Generating delete vector\n"); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + fprintf(stderr, "[I] Reading Queries\n"); + auto queries = read_knn_queries<QP>(q_fname, 1000); + + fprintf(stderr, "[I] Warming up structure...\n"); + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + fprintf(stderr, "[I] Running Insertion Benchmark\n"); + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + fprintf(stderr, "[I] Running Query Benchmark\n"); + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto shard = extension->create_static_structure(); + + fprintf(stderr, "Running Static query tests\n\n"); + TIMER_START(); + run_static_queries<Shard, QP, Q>(shard, queries); + TIMER_STOP(); + + auto static_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + auto static_size = shard->get_memory_usage(); // + shard->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t\t%ld\t%ld\t%ld\n", insert_throughput, query_latency, ext_size, static_latency, static_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); + fflush(stdout); +} + diff --git a/benchmarks/vldb/vptree_bsm_bench_alt.cpp b/benchmarks/vldb/vptree_bsm_bench_alt.cpp new file mode 100644 index 0000000..63baf8b --- /dev/null +++ b/benchmarks/vldb/vptree_bsm_bench_alt.cpp @@ -0,0 +1,92 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "framework/DynamicExtension.h" +#include "shard/VPTree.h" +#include "query/knn.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef ANNRec Rec; + +typedef de::VPTree<Rec, 100, true> Shard; +typedef de::knn::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::BSM, de::DeletePolicy::TAGGING, de::SerialScheduler> Ext; +typedef de::knn::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + auto extension = new Ext(1, 1400, 2, 0, 64); + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + fprintf(stderr, "[I] Reading data file...\n"); + auto data = read_binary_vector_file<Rec>(d_fname, n); + + fprintf(stderr, "[I] Generating delete vector\n"); + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + fprintf(stderr, "[I] Reading Queries\n"); + auto queries = read_binary_knn_queries<QP>(q_fname, 1000, 100); + + fprintf(stderr, "[I] Warming up structure...\n"); + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + fprintf(stderr, "[I] Running Insertion Benchmark\n"); + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + fprintf(stderr, "[I] Running Query Benchmark\n"); + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + + fprintf(stdout, "%ld\t%ld\t\t%ld\n", insert_throughput, query_latency, ext_size); + + gsl_rng_free(rng); + delete extension; + fflush(stderr); + fflush(stdout); +} + diff --git a/benchmarks/vldb/vptree_parmsweep.cpp b/benchmarks/vldb/vptree_parmsweep.cpp new file mode 100644 index 0000000..2cbd521 --- /dev/null +++ b/benchmarks/vldb/vptree_parmsweep.cpp @@ -0,0 +1,129 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "framework/DynamicExtension.h" +#include "shard/VPTree.h" +#include "query/knn.h" +#include "framework/interface/Record.h" +#include "file_util.h" +#include "standard_benchmarks.h" + +#include <gsl/gsl_rng.h> + +#include "psu-util/timer.h" + + +typedef Word2VecRec Rec; + +typedef de::VPTree<Rec, 100, true> Shard; +typedef de::knn::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TAGGING, de::SerialScheduler> Ext; +typedef de::DynamicExtension<Rec, Shard, Q, de::LayoutPolicy::LEVELING, de::DeletePolicy::TAGGING, de::SerialScheduler> Ext2; +typedef de::knn::Parms<Rec> QP; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile queryfile\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + std::string q_fname = std::string(argv[3]); + + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + + auto data = read_vector_file<Rec, 300>(d_fname, n); + + std::vector<size_t> to_delete(n * delete_proportion); + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= delete_proportion) { + to_delete[j++] = i; + } + } + auto queries = read_knn_queries<QP>(q_fname, 10); + + + const std::vector<de::LayoutPolicy> policies = {de::LayoutPolicy::LEVELING, de::LayoutPolicy::TEIRING}; + const std::vector<size_t> buffer_sizes = {100, 400, 800, 1200, 1500, 2000}; + const std::vector<size_t> scale_factors = {2, 4, 6, 8, 10, 12}; + + for (const auto &bs : buffer_sizes) { + for (const auto &sf : scale_factors) { + auto extension = new Ext(bs, bs, sf, 0, 64); + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + + fprintf(stdout, "TIERING\t%ld\t%ld\t%ld\t%ld\t%ld\n", bs, sf, insert_throughput, query_latency, ext_size); + delete extension; + } + } + + for (const auto &bs : buffer_sizes) { + for (const auto &sf : scale_factors) { + auto extension = new Ext2(bs, bs, sf, 0, 64); + + /* warmup structure w/ 10% of records */ + size_t warmup = .1 * n; + size_t delete_idx = 0; + insert_records<Ext2, Rec>(extension, 0, warmup, data, to_delete, delete_idx, false, rng); + + extension->await_next_epoch(); + + TIMER_INIT(); + + TIMER_START(); + insert_records<Ext2, Rec>(extension, warmup, data.size(), data, to_delete, delete_idx, true, rng); + TIMER_STOP(); + + auto insert_latency = TIMER_RESULT(); + size_t insert_throughput = (size_t) ((double) (n - warmup) / (double) insert_latency * 1e9); + + TIMER_START(); + run_queries<Ext2, QP>(extension, queries); + TIMER_STOP(); + + auto query_latency = TIMER_RESULT() / queries.size(); + + auto ext_size = extension->get_memory_usage() + extension->get_aux_memory_usage(); + + fprintf(stdout, "LEVELING\t%ld\t%ld\t%ld\t%ld\t%ld\n", bs, sf, insert_throughput, query_latency, ext_size); + delete extension; + } + } + + gsl_rng_free(rng); + fflush(stderr); +} + diff --git a/benchmarks/watermark_testing.cpp b/benchmarks/watermark_testing.cpp index caba8ff..c56fc63 100644 --- a/benchmarks/watermark_testing.cpp +++ b/benchmarks/watermark_testing.cpp @@ -5,15 +5,18 @@ #define ENABLE_TIMER #include "framework/DynamicExtension.h" -#include "shard/ISAMTree.h" +#include "shard/TrieSpline.h" #include "query/rangequery.h" #include "framework/interface/Record.h" #include "psu-util/timer.h" +#include <algorithm> +#include <random> -typedef de::Record<int64_t, int64_t> Rec; -typedef de::ISAMTree<Rec> ISAM; +typedef uint64_t K; +typedef de::Record<K, K> Rec; +typedef de::TrieSpline<Rec> ISAM; typedef de::rq::Query<Rec, ISAM> Q; typedef de::DynamicExtension<Rec, ISAM, Q> Ext; @@ -23,7 +26,17 @@ int main(int argc, char **argv) { std::vector hwms = {5000l, 10000l, 20000l, 50000l}; std::vector lwms = {.1, .2, .3, .4, .5, .6, .7, .8, .9}; - size_t n = 1000000; + size_t n = 1000000000; + + std::vector<K> keys(n); + for (K i=0; i<n; i++) { + keys[i] = i; + } + + std::random_device rd; + std::mt19937 g(rd()); + + std::shuffle(keys.begin(), keys.end(), g); TIMER_INIT(); @@ -33,8 +46,8 @@ int main(int argc, char **argv) { auto extension = new Ext(lwm, hwm, 8); TIMER_START(); - for (int64_t i=0; i<n; i++) { - Rec r = {i, i}; + for (size_t i=0; i<n; i++) { + Rec r = {keys[i], keys[i]}; while (!extension->insert(r)) { _mm_pause(); } diff --git a/benchmarks/watermark_testing_knn.cpp b/benchmarks/watermark_testing_knn.cpp new file mode 100644 index 0000000..7cea594 --- /dev/null +++ b/benchmarks/watermark_testing_knn.cpp @@ -0,0 +1,61 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "framework/DynamicExtension.h" +#include "shard/VPTree.h" +#include "query/knn.h" +#include "framework/interface/Record.h" + +#include "psu-util/timer.h" + +constexpr size_t D = 100; + +typedef de::EuclidPoint<int64_t, D> Rec; +typedef de::VPTree<Rec> Shard; +typedef de::knn::Query<Rec, Shard> Q; +typedef de::DynamicExtension<Rec, Shard, Q> Ext; + +int main(int argc, char **argv) { + std::vector hwms = {1000l, 2000l, 4000l, 10000l}; + std::vector lwms = {.1, .2, .3, .4, .5, .6, .7, .8, .9}; + + size_t n = 1000000; + + std::vector<Rec> records(n); + for (size_t i=0; i<n; i++) { + Rec r; + for (size_t j=0; j<D; j++) { + r.data[j] = rand() % n; + } + } + + TIMER_INIT(); + + for (auto &hwm : hwms) { + for (size_t i=0; i<lwms.size(); i++) { + size_t lwm = hwm * lwms[i]; + + auto extension = new Ext(lwm, hwm, 8); + TIMER_START(); + for (int64_t i=0; i<n; i++) { + while (!extension->insert(records[i])) { + _mm_pause(); + } + } + TIMER_STOP(); + + auto insert_time = TIMER_RESULT(); + double insert_throughput = (double) n / (double) insert_time * 1e9; + + fprintf(stdout, "%ld\t%ld\t%lf\n", lwm, hwm, insert_throughput); + extension->print_scheduler_statistics(); + + fflush(stdout); + delete extension; + } + } +} + |