diff options
| author | Douglas Rumbaugh <dbr4@psu.edu> | 2025-02-20 15:16:56 -0500 |
|---|---|---|
| committer | Douglas Rumbaugh <dbr4@psu.edu> | 2025-02-20 15:16:56 -0500 |
| commit | 40fe2e7ea56d49a065a4a53b7f8a4a918a5d78b0 (patch) | |
| tree | cd3a501666c2b4618922faf1f2240bb8a23377ff /benchmarks/include/file_util.h | |
| parent | f1316e313de5c5286b279cec6ed320cba3eb506f (diff) | |
| download | dynamic-extension-40fe2e7ea56d49a065a4a53b7f8a4a918a5d78b0.tar.gz | |
Added uniform data generator as file option
Diffstat (limited to 'benchmarks/include/file_util.h')
| -rw-r--r-- | benchmarks/include/file_util.h | 455 |
1 files changed, 240 insertions, 215 deletions
diff --git a/benchmarks/include/file_util.h b/benchmarks/include/file_util.h index 01aaa1a..df8d999 100644 --- a/benchmarks/include/file_util.h +++ b/benchmarks/include/file_util.h @@ -1,198 +1,219 @@ #pragma once -#include <cstdlib> +#include <algorithm> #include <cstdio> -#include <iostream> +#include <cstdlib> +#include <cstring> #include <fstream> +#include <gsl/gsl_rng.h> +#include <iostream> +#include <memory> #include <sstream> #include <string> -#include <gsl/gsl_rng.h> -#include <cstring> #include <vector> -#include <memory> #include "psu-util/progress.h" - template <typename QP> -static std::vector<QP> read_lookup_queries(std::string fname, double selectivity) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - - if (!qf) { - fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); - exit(EXIT_FAILURE); +static std::vector<QP> read_lookup_queries(std::string fname, + double selectivity) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + + if (!qf) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + size_t start, stop; + double sel; + while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { + if (start < stop && std::abs(sel - selectivity) < 0.1) { + QP q; + q.target_key = start; + queries.push_back(q); } + } + fclose(qf); - size_t start, stop; - double sel; - while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { - if (start < stop && std::abs(sel - selectivity) < 0.1) { - QP q; - q.target_key = start; - queries.push_back(q); - } - } - fclose(qf); - - return queries; + return queries; } template <typename QP> -static std::vector<QP> generate_string_lookup_queries(std::vector<char *> &strings, size_t cnt, gsl_rng *rng) { - std::vector<QP> queries; - - for (size_t i=0; i<cnt; i++) { - auto idx = gsl_rng_uniform_int(rng, strings.size()); - QP q; - q.search_key = strings[idx]; - queries.push_back(q); - } - - return queries; +static std::vector<QP> +generate_string_lookup_queries(std::vector<char *> &strings, size_t cnt, + gsl_rng *rng) { + std::vector<QP> queries; + + for (size_t i = 0; i < cnt; i++) { + auto idx = gsl_rng_uniform_int(rng, strings.size()); + QP q; + q.search_key = strings[idx]; + queries.push_back(q); + } + + return queries; } template <typename QP> -static std::vector<QP> read_range_queries(std::string &fname, double selectivity) { - std::vector<QP> queries; - - FILE *qf = fopen(fname.c_str(), "r"); - - if (!qf) { - fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); - exit(EXIT_FAILURE); +static std::vector<QP> read_range_queries(std::string &fname, + double selectivity) { + std::vector<QP> queries; + + FILE *qf = fopen(fname.c_str(), "r"); + + if (!qf) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + size_t start, stop; + double sel; + while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { + if (start < stop && std::abs(sel - selectivity) < 0.00001) { + QP q; + q.lower_bound = start; + q.upper_bound = stop; + + queries.push_back(q); } + } + fclose(qf); - size_t start, stop; - double sel; - while (fscanf(qf, "%zu%zu%lf\n", &start, &stop, &sel) != EOF) { - if (start < stop && std::abs(sel - selectivity) < 0.00001) { - QP q; - q.lower_bound = start; - q.upper_bound = stop; - - queries.push_back(q); - } - } - fclose(qf); - - return queries; + return queries; } - template <typename QP> -static std::vector<QP> read_binary_knn_queries(std::string fname, size_t k, size_t n) { - std::vector<QP> queries; - queries.reserve(n); - - std::fstream file; - file.open(fname, std::ios::in | std::ios::binary); - - if (!file.is_open()) { - fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); - exit(EXIT_FAILURE); +static std::vector<QP> read_binary_knn_queries(std::string fname, size_t k, + size_t n) { + std::vector<QP> queries; + queries.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + uint32_t dim; + uint32_t cnt; + + file.read((char *)&(cnt), sizeof(cnt)); + file.read((char *)&(dim), sizeof(dim)); + + if (n > cnt) { + n = cnt; + } + + for (size_t i = 0; i < n; i++) { + QP query; + for (size_t j = 0; j < dim; j++) { + uint64_t val; + file.read((char *)&(val), sizeof(uint64_t)); + query.point.data[j] = val; } + query.k = k; + queries.push_back(query); + } - - uint32_t dim; - uint32_t cnt; - - file.read((char*) &(cnt), sizeof(cnt)); - file.read((char*) &(dim), sizeof(dim)); - - if (n > cnt) { - n = cnt; - } - - for (size_t i=0; i<n; i++) { - QP query; - for (size_t j=0; j<dim; j++) { - uint64_t val; - file.read((char*) &(val), sizeof(uint64_t)); - query.point.data[j] = val; - } - query.k = k; - queries.push_back(query); - } - - return queries; + return queries; } - template <typename QP> static std::vector<QP> read_knn_queries(std::string fname, size_t k) { - std::vector<QP> queries; + std::vector<QP> queries; - FILE *qf = fopen(fname.c_str(), "r"); - char *line = NULL; - size_t len = 0; + FILE *qf = fopen(fname.c_str(), "r"); + char *line = NULL; + size_t len = 0; - if (!qf) { - fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); - exit(EXIT_FAILURE); - } + if (!qf) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } - while (getline(&line, &len, qf) > 0) { - char *token; - QP query; - size_t idx = 0; + while (getline(&line, &len, qf) > 0) { + char *token; + QP query; + size_t idx = 0; - token = strtok(line, " "); - do { - query.point.data[idx++] = atof(token); - } while ((token = strtok(NULL, " "))); + token = strtok(line, " "); + do { + query.point.data[idx++] = atof(token); + } while ((token = strtok(NULL, " "))); - query.k = k; - queries.emplace_back(query); - } + query.k = k; + queries.emplace_back(query); + } - free(line); - fclose(qf); + free(line); + fclose(qf); - return queries; + return queries; } -template<typename R> -static std::vector<R> read_sosd_file(std::string &fname, size_t n) { - std::fstream file; - file.open(fname, std::ios::in | std::ios::binary); - - if (!file.is_open()) { - fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); - exit(EXIT_FAILURE); - } - +template <typename R> +static std::vector<R> generate_uniform(size_t n) { std::vector<R> records(n); + for (size_t i=0; i<n; i++) { - decltype(R::key) k; - file.read((char*) &(k), sizeof(R::key)); - records[i].key = k; + records[i].key = i; records[i].value = i; } + std::random_shuffle(records.begin(), records.end()); + return records; } -template<typename K, typename V> -static std::vector<std::pair<K, V>> read_sosd_file_pair(std::string &fname, size_t n) { - std::fstream file; - file.open(fname, std::ios::in | std::ios::binary); - - if (!file.is_open()) { - fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); - exit(EXIT_FAILURE); - } - - std::vector<std::pair<K,V>> records(n); - for (size_t i=0; i<n; i++) { - K k; - file.read((char*) &(k), sizeof(K)); - records[i].first = k; - records[i].second = i; - } +template <typename R> +static std::vector<R> read_sosd_file(std::string &fname, size_t n) { + if (fname.starts_with("unif")) { + return generate_uniform<R>(n); + } + + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<R> records(n); + for (size_t i = 0; i < n; i++) { + decltype(R::key) k; + file.read((char *)&(k), sizeof(R::key)); + records[i].key = k; + records[i].value = i; + } + + return records; +} - return records; +template <typename K, typename V> +static std::vector<std::pair<K, V>> read_sosd_file_pair(std::string &fname, + size_t n) { + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); + + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } + + std::vector<std::pair<K, V>> records(n); + for (size_t i = 0; i < n; i++) { + K k; + file.read((char *)&(k), sizeof(K)); + records[i].first = k; + records[i].second = i; + } + + return records; } /* @@ -203,98 +224,102 @@ static std::vector<std::pair<K, V>> read_sosd_file_pair(std::string &fname, size */ template <typename R, size_t D> static std::vector<R> read_vector_file(std::string &fname, size_t n) { - std::fstream file; - file.open(fname, std::ios::in); + std::fstream file; + file.open(fname, std::ios::in); - if (!file.is_open()) { - fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); - exit(EXIT_FAILURE); - } + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } - std::vector<R> records; - records.reserve(n); + std::vector<R> records; + records.reserve(n); - for (size_t i=0; i<n; i++) { - std::string line; - if (!std::getline(file, line, '\n')) break; - - std::stringstream line_stream(line); - R rec; - for (size_t j=0; j<D; j++) { - std::string dim; - if (!std::getline(line_stream, dim, ' ')) break; - - rec.data[j] = atof(dim.c_str()); - } - records.emplace_back(rec); + for (size_t i = 0; i < n; i++) { + std::string line; + if (!std::getline(file, line, '\n')) + break; + + std::stringstream line_stream(line); + R rec; + for (size_t j = 0; j < D; j++) { + std::string dim; + if (!std::getline(line_stream, dim, ' ')) + break; + + rec.data[j] = atof(dim.c_str()); } + records.emplace_back(rec); + } - return records; + return records; } template <typename R> static std::vector<R> read_binary_vector_file(std::string &fname, size_t n) { - std::fstream file; - file.open(fname, std::ios::in | std::ios::binary); + std::fstream file; + file.open(fname, std::ios::in | std::ios::binary); - if (!file.is_open()) { - fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); - exit(EXIT_FAILURE); - } + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } - std::vector<R> records; - records.reserve(n); + std::vector<R> records; + records.reserve(n); - uint32_t dim; - uint32_t cnt; + uint32_t dim; + uint32_t cnt; - file.read((char*) &(cnt), sizeof(cnt)); - file.read((char*) &(dim), sizeof(dim)); + file.read((char *)&(cnt), sizeof(cnt)); + file.read((char *)&(dim), sizeof(dim)); - if (n > cnt) { - n = cnt; - } + if (n > cnt) { + n = cnt; + } - R rec; - for (size_t i=0; i<n; i++) { - for (size_t j=0; j<dim; j++) { - uint64_t val; - file.read((char*) &(val), sizeof(uint64_t)); - rec.data[j] = val; - } - - records.emplace_back(rec); + R rec; + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < dim; j++) { + uint64_t val; + file.read((char *)&(val), sizeof(uint64_t)); + rec.data[j] = val; } - return records; + records.emplace_back(rec); + } + + return records; } -[[maybe_unused]] static std::vector<char *> read_string_file(std::string fname, size_t n=10000000) { +[[maybe_unused]] static std::vector<char *> +read_string_file(std::string fname, size_t n = 10000000) { - std::fstream file; - file.open(fname, std::ios::in); + std::fstream file; + file.open(fname, std::ios::in); - if (!file.is_open()) { - fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); - exit(EXIT_FAILURE); - } + if (!file.is_open()) { + fprintf(stderr, "ERROR: Failed to open file %s\n", fname.c_str()); + exit(EXIT_FAILURE); + } - std::vector<char *> strings; - strings.reserve(n); + std::vector<char *> strings; + strings.reserve(n); - size_t i=0; - std::string line; - while (i < n && std::getline(file, line, '\n')) { - strings.emplace_back(strdup(line.c_str())); - i++; - psudb::progress_update((double) i / (double) n, "Reading file:"); - } + size_t i = 0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(strdup(line.c_str())); + i++; + psudb::progress_update((double)i / (double)n, "Reading file:"); + } - return strings; + return strings; } -[[maybe_unused]] static void destroy_string_file_data(std::vector<char *> &data) { - for (size_t i=0; i<data.size(); i++) { - delete data[i]; - } +[[maybe_unused]] static void +destroy_string_file_data(std::vector<char *> &data) { + for (size_t i = 0; i < data.size(); i++) { + delete data[i]; + } } |