diff options
| author | Douglas Rumbaugh <dbr4@psu.edu> | 2024-04-19 14:40:19 -0400 |
|---|---|---|
| committer | Douglas Rumbaugh <dbr4@psu.edu> | 2024-04-19 14:40:19 -0400 |
| commit | 34fd8ad935e6359d20a5d6c949e67495d0842f8f (patch) | |
| tree | 234994ef2753bb78c5e619070415415b53d37348 | |
| parent | 7c2f43ff039795576bc0014c367b893fbbaceca4 (diff) | |
| download | dynamic-extension-34fd8ad935e6359d20a5d6c949e67495d0842f8f.tar.gz | |
More trie baseline tests
| -rw-r--r-- | CMakeLists.txt | 52 | ||||
| -rw-r--r-- | benchmarks/cedar_trie.cpp | 97 | ||||
| -rw-r--r-- | benchmarks/hat_trie.cpp | 98 | ||||
| -rw-r--r-- | benchmarks/louds_insertion_tput.cpp | 112 | ||||
| m--------- | external/psudb-common | 0 | ||||
| -rw-r--r-- | include/shard/LoudsPatricia.h | 199 | ||||
| -rw-r--r-- | tests/louds_tests.cpp | 55 |
7 files changed, 597 insertions, 16 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c73545..c62085f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) set(namespace "de") project("Practical Dynamic Extension" VERSION 0.1.0) -set(debug true) +set(debug false) set(tests True) set(bench true) @@ -18,7 +18,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/bin") set(CMAKE_CXX_FLAGS=-latomic -mcx16) -add_compile_options(-Iinclude -Iexternal/PLEX/include -Iexternal -mcx16) # -fconcepts-diagnostics-depth=3) +add_compile_options(-Iinclude -Iexternal/PLEX/include -Iexternal -mcx16 -march=native) # -fconcepts-diagnostics-depth=3) if (BSD) add_link_options(-L/usr/local/lib) @@ -122,29 +122,29 @@ if (tests) # OpenBSD doesn't have OpenMP support, so don't build the PGM code on that # platform. - #if (!BSD) - # add_executable(pgm_tests ${CMAKE_CURRENT_SOURCE_DIR}/tests/pgm_tests.cpp) - #target_link_libraries(pgm_tests PUBLIC gsl cblas check subunit pthread gomp atomic) - #target_include_directories(pgm_tests PRIVATE include external/PGM-index/include external/psudb-common/cpp/include) - #target_link_options(pgm_tests PUBLIC -mcx16) - #target_compile_options(pgm_tests PUBLIC -fopenmp) - #endif() + add_executable(pgm_tests ${CMAKE_CURRENT_SOURCE_DIR}/tests/pgm_tests.cpp) + target_link_libraries(pgm_tests PUBLIC gsl check subunit pthread gomp atomic) + target_include_directories(pgm_tests PRIVATE include external/PGM-index/include external/psudb-common/cpp/include) + target_link_options(pgm_tests PUBLIC -mcx16) + target_compile_options(pgm_tests PUBLIC -fopenmp) # Triespline code doesn't build under OpenBSD either due to ambiguous function call; # this is likely a difference between gcc and clang, rather than an OS thing -if (!BSD) add_executable(triespline_debug ${CMAKE_CURRENT_SOURCE_DIR}/tests/triespline_debug.cpp) target_link_libraries(triespline_debug PUBLIC gsl check subunit pthread atomic) target_link_options(triespline_debug PUBLIC -mcx16) target_include_directories(triespline_debug PRIVATE include external/psudb-common/cpp/include external/PLEX/include) -endif() add_executable(fst_tests ${CMAKE_CURRENT_SOURCE_DIR}/tests/fst_tests.cpp) target_link_libraries(fst_tests PUBLIC gsl check subunit pthread atomic) target_link_options(fst_tests PUBLIC -mcx16) - target_include_directories(fst_tests PRIVATE include - external/psudb-common/cpp/include external/PLEX/include external/fast_succinct_trie/include) + target_include_directories(fst_tests PRIVATE include external/psudb-common/cpp/include external/PLEX/include external/fast_succinct_trie/include external/louds-patricia) + + add_executable(louds_tests ${CMAKE_CURRENT_SOURCE_DIR}/tests/louds_tests.cpp) + target_link_libraries(louds_tests PUBLIC gsl check subunit pthread atomic) + target_link_options(louds_tests PUBLIC -mcx16) + target_include_directories(louds_tests PRIVATE include external/psudb-common/cpp/include external/PLEX/include external/fast_succinct_trie/include external/louds-patricia) endif() if (bench) @@ -167,6 +167,12 @@ if (bench) target_link_options(string_insertion_tput PUBLIC -mcx16) + add_executable(louds_insertion_tput ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/louds_insertion_tput.cpp) + target_link_libraries(louds_insertion_tput PUBLIC gsl pthread atomic) + target_include_directories(louds_insertion_tput PRIVATE include external external/fast_succinct_trie/include external/PGM-index/include external/PLEX/include bench/include external/psudb-common/cpp/include external/louds-patricia) + target_link_options(louds_insertion_tput PUBLIC -mcx16) + + add_executable(query_workload_bench ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/query_workload_bench.cpp) target_link_libraries(query_workload_bench PUBLIC gsl pthread atomic) target_include_directories(query_workload_bench PRIVATE include external external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include external/psudb-common/cpp/include) @@ -182,6 +188,17 @@ if (bench) target_include_directories(poplar_trie PRIVATE include external external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include external/psudb-common/cpp/include external/poplar-trie/include) target_link_options(poplar_trie PUBLIC -mcx16) + add_executable(hat_trie ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/hat_trie.cpp) + target_link_libraries(hat_trie PUBLIC gsl pthread atomic) + target_include_directories(hat_trie PRIVATE include external + external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include external/psudb-common/cpp/include external/hat-trie/include/tsl) + target_link_options(hat_trie PUBLIC -mcx16) + + add_executable(cedar_trie ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/cedar_trie.cpp) + target_link_libraries(cedar_trie PUBLIC gsl pthread atomic) + target_include_directories(cedar_trie PRIVATE include external external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include external/psudb-common/cpp/include external/hat-trie/include/tsl) + target_link_options(cedar_trie PUBLIC -mcx16) + #add_executable(btree_insert_query_tput ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/btree_insert_query_tput.cpp) #target_link_libraries(btree_insert_query_tput PUBLIC gsl cblas pthread atomic) #target_include_directories(btree_insert_query_tput PRIVATE include external external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include external/psudb-common/cpp/include) @@ -202,13 +219,16 @@ if (bench) target_include_directories(vptree_bench PRIVATE include external external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include external/psudb-common/cpp/include) target_link_options(vptree_bench PUBLIC -mcx16) - -if(!BSD) add_executable(ts_bench ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/ts_bench.cpp) target_link_libraries(ts_bench PUBLIC gsl pthread atomic) target_include_directories(ts_bench PRIVATE include external external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include external/psudb-common/cpp/include) target_link_options(ts_bench PUBLIC -mcx16) -endif() + + add_executable(pgm_bench ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/pgm_bench.cpp) + target_link_libraries(pgm_bench PUBLIC gsl pthread atomic gomp) + target_include_directories(pgm_bench PRIVATE include external external/m-tree/cpp external/PGM-index/include external/PLEX/include bench/include external/psudb-common/cpp/include) + target_link_options(pgm_bench PUBLIC -mcx16) + target_compile_options(pgm_bench PUBLIC -fopenmp) #add_executable(static_dynamic_comp ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/static_dynamic_comp.cpp) #target_link_libraries(static_dynamic_comp PUBLIC gsl cblas pthread atomic) diff --git a/benchmarks/cedar_trie.cpp b/benchmarks/cedar_trie.cpp new file mode 100644 index 0000000..7499ce7 --- /dev/null +++ b/benchmarks/cedar_trie.cpp @@ -0,0 +1,97 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <fstream> +#include <sstream> +#include <vector> + +#include "cedar.h" + +#include "psu-util/timer.h" +#include "psu-util/progress.h" + +std::vector<std::string> strings; + +typedef cedar::da<int> Trie; + +void insert_thread(int64_t start, int64_t end, Trie * trie) { + for (uint64_t i=start; i<end; i++) { + auto res = trie->update(strings[i].c_str(), strings[i].size(), i+1); + } +} + +void read_data(std::string fname, size_t n=10000000) { + strings.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in); + + size_t i=0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(line); + i++; + psudb::progress_update((double) i / (double) n, "Reading file:"); + } +} + +void usage(char *name) { + fprintf(stderr, "Usage:\n%s datafile record_count\n", name); +} + +int main(int argc, char **argv) { + + if (argc < 3) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + std::string fname = std::string(argv[1]); + size_t n = atol(argv[2]); + + read_data(fname, n); + + if (strings.size() == 0) { + fprintf(stderr, "[E]: No string data read from file. Aborting execution.\n"); + } else { + fprintf(stderr, "Finished reading from file.\n"); + } + + auto trie = new Trie(); + + TIMER_INIT(); + TIMER_START(); + insert_thread(0, strings.size(), trie); + TIMER_STOP(); + + auto total_time = TIMER_RESULT(); + + size_t m = 100; + TIMER_START(); + for (size_t i=0; i<m; i++) { + size_t j = rand() % strings.size(); + + auto res = trie->exactMatchSearch<int>(strings[j].c_str()); + //assert(*(res)+1 == j); + } + TIMER_STOP(); + + auto query_time = TIMER_RESULT(); + + + double i_tput = (double) n / (double) total_time * 1e9; + size_t q_lat = query_time / m; + + fprintf(stdout, "%ld\t\t%lf\t%ld\n", trie->size(), + i_tput, q_lat); + + fprintf(stdout, "%ld\n", trie->total_size()); + + delete trie; + + fflush(stderr); +} + diff --git a/benchmarks/hat_trie.cpp b/benchmarks/hat_trie.cpp new file mode 100644 index 0000000..3b4c7d3 --- /dev/null +++ b/benchmarks/hat_trie.cpp @@ -0,0 +1,98 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <fstream> +#include <sstream> + +#include "htrie_map.h" + +#include "psu-util/timer.h" +#include "psu-util/progress.h" + +std::vector<std::string> strings; + +typedef tsl::htrie_map<char, size_t> Trie; + +void insert_thread(int64_t start, int64_t end, Trie * trie) { + for (uint64_t i=start; i<end; i++) { + auto res = trie->insert(strings[i].c_str(), i+1); + } +} + +void read_data(std::string fname, size_t n=10000000) { + strings.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in); + + size_t i=0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(line); + i++; + psudb::progress_update((double) i / (double) n, "Reading file:"); + } +} + +void usage(char *name) { + fprintf(stderr, "Usage:\n%s datafile record_count\n", name); +} + +int main(int argc, char **argv) { + + if (argc < 3) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + std::string fname = std::string(argv[1]); + size_t n = atol(argv[2]); + + read_data(fname, n); + + if (strings.size() == 0) { + fprintf(stderr, "[E]: No string data read from file. Aborting execution.\n"); + } else { + fprintf(stderr, "Finished reading from file.\n"); + } + + auto trie = new Trie(); + + TIMER_INIT(); + TIMER_START(); + insert_thread(0, strings.size(), trie); + TIMER_STOP(); + + auto total_time = TIMER_RESULT(); + + size_t m = 100; + TIMER_START(); + for (size_t i=0; i<m; i++) { + size_t j = rand() % strings.size(); + + auto res = trie->find(strings[j]); + if (*res != (j+1)) { + fprintf(stderr, "%ld %d %s\n", j, *res, strings[j].c_str()); + } + //assert(*(res)+1 == j); + } + TIMER_STOP(); + + auto query_time = TIMER_RESULT(); + + + double i_tput = (double) n / (double) total_time * 1e9; + size_t q_lat = query_time / m; + + fprintf(stdout, "%ld\t\t%lf\t%ld\n", trie->size(), + i_tput, q_lat); + + + delete trie; + + fflush(stderr); +} + diff --git a/benchmarks/louds_insertion_tput.cpp b/benchmarks/louds_insertion_tput.cpp new file mode 100644 index 0000000..d772f3b --- /dev/null +++ b/benchmarks/louds_insertion_tput.cpp @@ -0,0 +1,112 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include <fstream> +#include <sstream> + +#include "framework/DynamicExtension.h" +#include "shard/LoudsPatricia.h" +#include "query/pointlookup.h" +#include "framework/interface/Record.h" + +#include "psu-util/timer.h" +#include "psu-util/progress.h" + + +typedef de::Record<const char *, uint64_t> Rec; +typedef de::LoudsPatricia<Rec> Trie; +typedef de::pl::Query<Rec, Trie> Q; +typedef de::DynamicExtension<Rec, Trie, Q, de::LayoutPolicy::TEIRING, de::DeletePolicy::TAGGING, de::SerialScheduler> Ext; + +std::vector<std::unique_ptr<char[]>> strings; + +void insert_thread(int64_t start, int64_t end, Ext *extension) { + for (uint64_t i=start; i<end; i++) { + Rec r = {strings[i].get(), i, strlen(strings[i].get())}; + while (!extension->insert(r)) { + _mm_pause(); + } + } +} + +void read_data(std::string fname, size_t n=10000000) { + strings.reserve(n); + + std::fstream file; + file.open(fname, std::ios::in); + + size_t i=0; + std::string line; + while (i < n && std::getline(file, line, '\n')) { + strings.emplace_back(std::unique_ptr<char[]>(strdup(line.c_str()))); + i++; + psudb::progress_update((double) i / (double) n, "Reading file:"); + } +} + +void usage(char *name) { + fprintf(stderr, "Usage:\n%s datafile record_count\n", name); +} + +int main(int argc, char **argv) { + + if (argc < 3) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + std::string fname = std::string(argv[1]); + size_t n = atol(argv[2]); + + read_data(fname, n); + + if (strings.size() == 0) { + fprintf(stderr, "[E]: No string data read from file. Aborting execution.\n"); + } else { + fprintf(stderr, "Finished reading from file.\n"); + } + + std::vector<size_t> scale_factors = {2, 4, 6, 8, 10, 12}; + std::vector<size_t> buffer_sizes = {1000, 2000, 5000, 10000, 12000, 15000}; + + for (auto &sf : scale_factors) { + for (auto &bf_sz : buffer_sizes) { + + auto extension = new Ext(bf_sz, bf_sz, sf); + + TIMER_INIT(); + TIMER_START(); + insert_thread(0, strings.size(), extension); + TIMER_STOP(); + + auto total_time = TIMER_RESULT(); + + size_t m = 100; + TIMER_START(); + for (size_t i=0; i<m; i++) { + size_t j = rand() % strings.size(); + de::pl::Parms<Rec> parms = {strings[j].get()}; + + auto res = extension->query(&parms); + auto ans = res.get(); + } + TIMER_STOP(); + + auto query_time = TIMER_RESULT(); + + double i_tput = (double) n / (double) total_time * 1e9; + size_t q_lat = query_time / m; + + fprintf(stdout, "%ld\t%ld\t%ld\t%lf\t%ld\t%ld\n", extension->get_record_count(), + bf_sz, sf, i_tput, q_lat, extension->get_memory_usage()); + + delete extension; + + fflush(stderr); + } + } +} + diff --git a/external/psudb-common b/external/psudb-common -Subproject 18a3e3383ed77e7a9cfab240f54378c43de203b +Subproject 6f938d68945343881160cae5fdef3a8eccb6b03 diff --git a/include/shard/LoudsPatricia.h b/include/shard/LoudsPatricia.h new file mode 100644 index 0000000..3452839 --- /dev/null +++ b/include/shard/LoudsPatricia.h @@ -0,0 +1,199 @@ +/* + * include/shard/LoudsPatricia.h + * + * Copyright (C) 2024 Douglas B. Rumbaugh <drumbaugh@psu.edu> + * + * Distributed under the Modified BSD License. + * + * A shard shim around the LoudsPatricia learned index. + */ +#pragma once + + +#include <vector> + +#include "framework/ShardRequirements.h" +#include "louds-patricia.hpp" +#include "util/SortedMerge.h" + +using psudb::CACHELINE_SIZE; +using psudb::BloomFilter; +using psudb::PriorityQueue; +using psudb::queue_record; +using psudb::byte; + +namespace de { + +template <KVPInterface R> +class LoudsPatricia { +private: + + typedef decltype(R::key) K; + typedef decltype(R::value) V; + static_assert(std::is_same_v<K, const char*>, "FST requires const char* keys."); + +public: + LoudsPatricia(BufferView<R> buffer) + : m_data(nullptr) + , m_reccnt(0) + , m_alloc_size(0) + { + m_data = new Wrapped<R>[buffer.get_record_count()](); + m_alloc_size = sizeof(Wrapped<R>) * buffer.get_record_count(); + + m_louds = new louds::Patricia(); + + size_t cnt = 0; + std::vector<std::string> keys; + keys.reserve(buffer.get_record_count()); + + /* + * Copy the contents of the buffer view into a temporary buffer, and + * sort them. We still need to iterate over these temporary records to + * apply tombstone/deleted record filtering, as well as any possible + * per-record processing that is required by the shard being built. + */ + auto temp_buffer = new Wrapped<R>[buffer.get_record_count()](); + for (size_t i=0; i<buffer.get_record_count(); i++) { + temp_buffer[i] = *(buffer.get(i)); + } + + auto base = temp_buffer; + auto stop = base + buffer.get_record_count(); + std::sort(base, stop, std::less<Wrapped<R>>()); + + for (size_t i=0; i<buffer.get_record_count(); i++) { + if (temp_buffer[i].is_deleted() || !temp_buffer[i].is_visible() || temp_buffer[i].rec.key == "") { + continue; + } + + m_data[cnt] = temp_buffer[i]; + m_data[cnt].clear_timestamp(); + + m_louds->add(std::string(m_data[cnt].rec.key)); + cnt++; + } + + m_reccnt = cnt; + if (m_reccnt > 0) { + m_louds->build(); + } + + delete[] temp_buffer; + } + + LoudsPatricia(std::vector<LoudsPatricia*> &shards) + : m_data(nullptr) + , m_reccnt(0) + , m_alloc_size(0) + { + size_t attemp_reccnt = 0; + size_t tombstone_count = 0; + auto cursors = build_cursor_vec<R, LoudsPatricia>(shards, &attemp_reccnt, &tombstone_count); + + m_data = new Wrapped<R>[attemp_reccnt](); + m_alloc_size = attemp_reccnt * sizeof(Wrapped<R>); + + m_louds = new louds::Patricia(); + + // FIXME: For smaller cursor arrays, it may be more efficient to skip + // the priority queue and just do a scan. + PriorityQueue<Wrapped<R>> pq(cursors.size()); + for (size_t i=0; i<cursors.size(); i++) { + pq.push(cursors[i].ptr, i); + } + + while (pq.size()) { + auto now = pq.peek(); + auto next = pq.size() > 1 ? pq.peek(1) : queue_record<Wrapped<R>>{nullptr, 0}; + /* + * if the current record is not a tombstone, and the next record is + * a tombstone that matches the current one, then the current one + * has been deleted, and both it and its tombstone can be skipped + * over. + */ + if (!now.data->is_tombstone() && next.data != nullptr && + now.data->rec == next.data->rec && next.data->is_tombstone()) { + + pq.pop(); pq.pop(); + auto& cursor1 = cursors[now.version]; + auto& cursor2 = cursors[next.version]; + if (advance_cursor(cursor1)) pq.push(cursor1.ptr, now.version); + if (advance_cursor(cursor2)) pq.push(cursor2.ptr, next.version); + } else { + auto& cursor = cursors[now.version]; + /* skip over records that have been deleted via tagging */ + if (!cursor.ptr->is_deleted() && cursor.ptr->rec.key != "") { + m_data[m_reccnt] = *cursor.ptr; + m_louds->add(std::string(m_data[m_reccnt].rec.key)); + m_reccnt++; + } + pq.pop(); + + if (advance_cursor(cursor)) pq.push(cursor.ptr, now.version); + } + } + + if (m_reccnt > 0) { + m_louds->build(); + } + } + + ~LoudsPatricia() { + delete[] m_data; + delete m_louds; + } + + Wrapped<R> *point_lookup(const R &rec, bool filter=false) { + + auto idx = m_louds->lookup(std::string(rec.key)); + + if (idx == -1) { + return nullptr; + } + + // FIXME: for convenience, I'm treating this Trie as a unique index + // for now, so no need to scan forward and/or check values. This + // also makes the point lookup query class a lot easier to make. + // Ultimately, though, we can support non-unique indexes with some + // extra work. + return m_data + idx; + } + + Wrapped<R>* get_data() const { + return m_data; + } + + size_t get_record_count() const { + return m_reccnt; + } + + size_t get_tombstone_count() const { + return 0; + } + + const Wrapped<R>* get_record_at(size_t idx) const { + if (idx >= m_reccnt) return nullptr; + return m_data + idx; + } + + + size_t get_memory_usage() { + return m_louds->size(); + } + + size_t get_aux_memory_usage() { + return m_alloc_size; + } + + size_t get_lower_bound(R &rec) {return 0;} + size_t get_upper_bound(R &rec) {return 0;} + +private: + + Wrapped<R>* m_data; + size_t m_reccnt; + size_t m_alloc_size; + louds::Patricia *m_louds; +}; +} diff --git a/tests/louds_tests.cpp b/tests/louds_tests.cpp new file mode 100644 index 0000000..7eed54a --- /dev/null +++ b/tests/louds_tests.cpp @@ -0,0 +1,55 @@ +/* + * tests/isam_tests.cpp + * + * Unit tests for ISAM Tree shard + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * Dong Xie <dongx@psu.edu> + * + * Distributed under the Modified BSD License. + * + */ + +#include "shard/LoudsPatricia.h" +#include "include/testing.h" +#include <check.h> + +using namespace de; + +typedef StringRec R; +typedef LoudsPatricia<R> Shard; + +#include "include/shard_string.h" +#include "include/pointlookup.h" + +Suite *unit_testing() +{ + Suite *unit = suite_create("Fast-succinct Trie Shard Unit Testing"); + + inject_shard_tests(unit); + inject_pointlookup_tests(unit); + + return unit; +} + + +int shard_unit_tests() +{ + int failed = 0; + Suite *unit = unit_testing(); + SRunner *unit_shardner = srunner_create(unit); + + srunner_run_all(unit_shardner, CK_NORMAL); + failed = srunner_ntests_failed(unit_shardner); + srunner_free(unit_shardner); + + return failed; +} + + +int main() +{ + int unit_failed = shard_unit_tests(); + + return (unit_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; +} |