From 405bf4a20b4a22a6bb4b60b730b6a7e901fdccf6 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 19 Mar 2024 11:10:02 -0400 Subject: FST Shard w/ tests Needs some debugging--some methods currently fail within the library itself. The build system doesn't currently build the FST library. To compile, you'll first need to manually build it, and then place the libFST.so file in your LIBRARY_PATH and LD_LIBRARY_PATH. --- include/shard/FSTrie.h | 266 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 include/shard/FSTrie.h (limited to 'include/shard/FSTrie.h') diff --git a/include/shard/FSTrie.h b/include/shard/FSTrie.h new file mode 100644 index 0000000..11a232d --- /dev/null +++ b/include/shard/FSTrie.h @@ -0,0 +1,266 @@ +/* + * include/shard/FSTrie.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * Distributed under the Modified BSD License. + * + * A shard shim around the FSTrie learned index. + * + * TODO: The code in this file is very poorly commented. + */ +#pragma once + + +#include + +#include "framework/ShardRequirements.h" +#include "FST.hpp" +#include "psu-ds/BloomFilter.h" +#include "util/bf_config.h" +#include "util/SortedMerge.h" + +using psudb::CACHELINE_SIZE; +using psudb::BloomFilter; +using psudb::PriorityQueue; +using psudb::queue_record; +using psudb::byte; + +namespace de { + +template +class FSTrie { +private: + typedef decltype(R::key) K; + typedef decltype(R::value) V; + + static_assert(std::is_same_v || std::is_same_v, + "FST requires either string or uint64_t keys"); + +public: + FSTrie(BufferView buffer) + : m_data(nullptr) + , m_reccnt(0) + , m_alloc_size(0) + { + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + buffer.get_record_count() * + sizeof(Wrapped), + (byte**) &m_data); + size_t cnt = 0; + std::vector keys; + keys.reserve(buffer.get_record_count()); + + std::vector values; + values.reserve(buffer.get_record_count()); + + size_t longest_key = 0; + + /* + * Copy the contents of the buffer view into a temporary buffer, and + * sort them. We still need to iterate over these temporary records to + * apply tombstone/deleted record filtering, as well as any possible + * per-record processing that is required by the shard being built. + */ + auto temp_buffer = (Wrapped *) psudb::sf_aligned_calloc(CACHELINE_SIZE, + buffer.get_record_count(), + sizeof(Wrapped)); + buffer.copy_to_buffer((byte *) temp_buffer); + + auto base = temp_buffer; + auto stop = base + buffer.get_record_count(); + std::sort(base, stop, std::less>()); + + for (size_t i=0; i) { + if (m_data[cnt].rec.key.size() > longest_key) { + longest_key = m_data[cnt].rec.key.size(); + } + } + + cnt++; + } + + m_reccnt = cnt; + m_fst = FST(); + if constexpr (std::is_same_v) { + m_fst.load(keys, values, longest_key); + } else { + m_fst.load(keys, values); + } + + free(temp_buffer); + } + + FSTrie(std::vector &shards) + : m_data(nullptr) + , m_reccnt(0) + , m_alloc_size(0) + { + size_t attemp_reccnt = 0; + size_t tombstone_count = 0; + auto cursors = build_cursor_vec(shards, &attemp_reccnt, &tombstone_count); + + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + attemp_reccnt * sizeof(Wrapped), + (byte **) &m_data); + + std::vector keys; + keys.reserve(attemp_reccnt); + + std::vector values; + values.reserve(attemp_reccnt); + + size_t longest_key = 0; + // FIXME: For smaller cursor arrays, it may be more efficient to skip + // the priority queue and just do a scan. + PriorityQueue> pq(cursors.size()); + for (size_t i=0; i 1 ? pq.peek(1) : queue_record>{nullptr, 0}; + /* + * if the current record is not a tombstone, and the next record is + * a tombstone that matches the current one, then the current one + * has been deleted, and both it and its tombstone can be skipped + * over. + */ + if (!now.data->is_tombstone() && next.data != nullptr && + now.data->rec == next.data->rec && next.data->is_tombstone()) { + + pq.pop(); pq.pop(); + auto& cursor1 = cursors[now.version]; + auto& cursor2 = cursors[next.version]; + if (advance_cursor(cursor1)) pq.push(cursor1.ptr, now.version); + if (advance_cursor(cursor2)) pq.push(cursor2.ptr, next.version); + } else { + auto& cursor = cursors[now.version]; + /* skip over records that have been deleted via tagging */ + if (!cursor.ptr->is_deleted()) { + m_data[m_reccnt] = *cursor.ptr; + keys.push_back(m_data[m_reccnt].rec.key); + values.push_back(m_data[m_reccnt].rec.value); + + if constexpr (std::is_same_v) { + if (m_data[m_reccnt].rec.key.size() > longest_key) { + longest_key = m_data[m_reccnt].rec.key.size(); + } + } + + m_reccnt++; + } + pq.pop(); + + if (advance_cursor(cursor)) pq.push(cursor.ptr, now.version); + } + } + + if (m_reccnt > 0) { + m_fst = FST(); + if constexpr (std::is_same_v) { + m_fst.load(keys, values, longest_key); + } else { + m_fst.load(keys, values); + } + } + } + + ~FSTrie() { + free(m_data); + } + + Wrapped *point_lookup(const R &rec, bool filter=false) { + size_t idx; + bool res; + if constexpr (std::is_same_v) { + res = m_fst.lookup(rec.key.c_str(), rec.key.size(), idx); + } else { + res = m_fst.lookup(rec.key, idx); + } + + if (res) { + return m_data + idx; + } + + return nullptr; + } + + Wrapped* get_data() const { + return m_data; + } + + size_t get_record_count() const { + return m_reccnt; + } + + size_t get_tombstone_count() const { + return 0; + } + + const Wrapped* get_record_at(size_t idx) const { + if (idx >= m_reccnt) return nullptr; + return m_data + idx; + } + + + size_t get_memory_usage() { + return m_fst.mem() + m_alloc_size; + } + + size_t get_aux_memory_usage() { + return 0; + } + + size_t get_lower_bound(const K& key) { + auto itr = FSTIter(); + + const K temp_key = key; + + bool res; + if constexpr (std::is_same_v) { + res = m_fst.lowerBound(temp_key.c_str(), key.size(), itr); + } else { + res = m_fst.lowerBound(temp_key, itr); + } + + return itr.value(); + } + + size_t get_upper_bound(const K& key) { + auto itr = FSTIter(); + + const K temp_key = key; + + bool res; + if constexpr (std::is_same_v) { + res = m_fst.lowerBound(temp_key.c_str(), key.size(), itr); + } else { + res = m_fst.lowerBound(temp_key, itr); + } + + size_t idx = itr.value(); + while (idx < m_reccnt && m_data[idx].rec.key <= key) { + idx++; + } + + return idx; + } + +private: + + Wrapped* m_data; + size_t m_reccnt; + size_t m_alloc_size; + FST m_fst; +}; +} -- cgit v1.2.3 From 9fe190f5d500e22b0894095e7c917f9c652e0a64 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 20 Mar 2024 17:30:14 -0400 Subject: Updates/progress towards succinct trie support --- include/shard/FSTrie.h | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'include/shard/FSTrie.h') diff --git a/include/shard/FSTrie.h b/include/shard/FSTrie.h index 11a232d..62aa0b7 100644 --- a/include/shard/FSTrie.h +++ b/include/shard/FSTrie.h @@ -43,10 +43,9 @@ public: , m_reccnt(0) , m_alloc_size(0) { - m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, - buffer.get_record_count() * - sizeof(Wrapped), - (byte**) &m_data); + m_data = new Wrapped[buffer.get_record_count()](); + m_alloc_size = sizeof(Wrapped) * buffer.get_record_count(); + size_t cnt = 0; std::vector keys; keys.reserve(buffer.get_record_count()); @@ -62,10 +61,15 @@ public: * apply tombstone/deleted record filtering, as well as any possible * per-record processing that is required by the shard being built. */ + /* auto temp_buffer = (Wrapped *) psudb::sf_aligned_calloc(CACHELINE_SIZE, buffer.get_record_count(), sizeof(Wrapped)); - buffer.copy_to_buffer((byte *) temp_buffer); + */ + auto temp_buffer = new Wrapped[buffer.get_record_count()](); + for (size_t i=0; i) { @@ -88,6 +94,10 @@ public: cnt++; } + for (size_t i=0; i) { @@ -96,7 +106,7 @@ public: m_fst.load(keys, values); } - free(temp_buffer); + delete[] temp_buffer; } FSTrie(std::vector &shards) @@ -108,9 +118,8 @@ public: size_t tombstone_count = 0; auto cursors = build_cursor_vec(shards, &attemp_reccnt, &tombstone_count); - m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, - attemp_reccnt * sizeof(Wrapped), - (byte **) &m_data); + m_data = new Wrapped[attemp_reccnt](); + m_alloc_size = attemp_reccnt * sizeof(Wrapped); std::vector keys; keys.reserve(attemp_reccnt); @@ -165,6 +174,10 @@ public: } } + for (size_t i=0; i 0) { m_fst = FST(); if constexpr (std::is_same_v) { @@ -176,18 +189,22 @@ public: } ~FSTrie() { - free(m_data); + delete[] m_data; } Wrapped *point_lookup(const R &rec, bool filter=false) { size_t idx; bool res; if constexpr (std::is_same_v) { - res = m_fst.lookup(rec.key.c_str(), rec.key.size(), idx); + res = m_fst.lookup((uint8_t*)rec.key.c_str(), rec.key.size(), idx); } else { res = m_fst.lookup(rec.key, idx); } + if (res && m_data[idx].rec.key != rec.key) { + fprintf(stderr, "ERROR!\n"); + } + if (res) { return m_data + idx; } -- cgit v1.2.3 From 147f0df58e1ff4973bffb7e4628e6b2fdc20eb57 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 22 Mar 2024 14:04:40 -0400 Subject: FSTrie testing and debugging --- include/shard/FSTrie.h | 121 ++++++++----------------------------------------- 1 file changed, 20 insertions(+), 101 deletions(-) (limited to 'include/shard/FSTrie.h') diff --git a/include/shard/FSTrie.h b/include/shard/FSTrie.h index 62aa0b7..aa3c9f4 100644 --- a/include/shard/FSTrie.h +++ b/include/shard/FSTrie.h @@ -1,13 +1,11 @@ /* * include/shard/FSTrie.h * - * Copyright (C) 2023 Douglas B. Rumbaugh + * Copyright (C) 2024 Douglas B. Rumbaugh * * Distributed under the Modified BSD License. * * A shard shim around the FSTrie learned index. - * - * TODO: The code in this file is very poorly commented. */ #pragma once @@ -15,9 +13,7 @@ #include #include "framework/ShardRequirements.h" -#include "FST.hpp" -#include "psu-ds/BloomFilter.h" -#include "util/bf_config.h" +#include "fst.hpp" #include "util/SortedMerge.h" using psudb::CACHELINE_SIZE; @@ -28,14 +24,13 @@ using psudb::byte; namespace de { -template +template class FSTrie { private: + typedef decltype(R::key) K; typedef decltype(R::value) V; - - static_assert(std::is_same_v || std::is_same_v, - "FST requires either string or uint64_t keys"); + static_assert(std::is_same_v, "FST requires std::string keys."); public: FSTrie(BufferView buffer) @@ -50,22 +45,12 @@ public: std::vector keys; keys.reserve(buffer.get_record_count()); - std::vector values; - values.reserve(buffer.get_record_count()); - - size_t longest_key = 0; - /* * Copy the contents of the buffer view into a temporary buffer, and * sort them. We still need to iterate over these temporary records to * apply tombstone/deleted record filtering, as well as any possible * per-record processing that is required by the shard being built. */ - /* - auto temp_buffer = (Wrapped *) psudb::sf_aligned_calloc(CACHELINE_SIZE, - buffer.get_record_count(), - sizeof(Wrapped)); - */ auto temp_buffer = new Wrapped[buffer.get_record_count()](); for (size_t i=0; i>()); for (size_t i=0; i) { - if (m_data[cnt].rec.key.size() > longest_key) { - longest_key = m_data[cnt].rec.key.size(); - } - } - cnt++; } @@ -99,11 +77,8 @@ public: } m_reccnt = cnt; - m_fst = FST(); - if constexpr (std::is_same_v) { - m_fst.load(keys, values, longest_key); - } else { - m_fst.load(keys, values); + if (m_reccnt > 0) { + m_fst = new fst::Trie(keys); } delete[] temp_buffer; @@ -124,10 +99,6 @@ public: std::vector keys; keys.reserve(attemp_reccnt); - std::vector values; - values.reserve(attemp_reccnt); - - size_t longest_key = 0; // FIXME: For smaller cursor arrays, it may be more efficient to skip // the priority queue and just do a scan. PriorityQueue> pq(cursors.size()); @@ -155,16 +126,9 @@ public: } else { auto& cursor = cursors[now.version]; /* skip over records that have been deleted via tagging */ - if (!cursor.ptr->is_deleted()) { + if (!cursor.ptr->is_deleted() && cursor.ptr->rec.key != "") { m_data[m_reccnt] = *cursor.ptr; keys.push_back(m_data[m_reccnt].rec.key); - values.push_back(m_data[m_reccnt].rec.value); - - if constexpr (std::is_same_v) { - if (m_data[m_reccnt].rec.key.size() > longest_key) { - longest_key = m_data[m_reccnt].rec.key.size(); - } - } m_reccnt++; } @@ -179,37 +143,24 @@ public: } if (m_reccnt > 0) { - m_fst = FST(); - if constexpr (std::is_same_v) { - m_fst.load(keys, values, longest_key); - } else { - m_fst.load(keys, values); - } + m_fst = new fst::Trie(keys); } } ~FSTrie() { delete[] m_data; + delete m_fst; } Wrapped *point_lookup(const R &rec, bool filter=false) { - size_t idx; - bool res; - if constexpr (std::is_same_v) { - res = m_fst.lookup((uint8_t*)rec.key.c_str(), rec.key.size(), idx); - } else { - res = m_fst.lookup(rec.key, idx); - } - if (res && m_data[idx].rec.key != rec.key) { - fprintf(stderr, "ERROR!\n"); - } + auto idx = m_fst->exactSearch(rec.key); - if (res) { - return m_data + idx; + if (idx == fst::kNotFound) { + return nullptr; } - return nullptr; + return m_data + idx; } Wrapped* get_data() const { @@ -231,53 +182,21 @@ public: size_t get_memory_usage() { - return m_fst.mem() + m_alloc_size; + return m_fst->getMemoryUsage() + m_alloc_size; } size_t get_aux_memory_usage() { return 0; } - size_t get_lower_bound(const K& key) { - auto itr = FSTIter(); - - const K temp_key = key; - - bool res; - if constexpr (std::is_same_v) { - res = m_fst.lowerBound(temp_key.c_str(), key.size(), itr); - } else { - res = m_fst.lowerBound(temp_key, itr); - } - - return itr.value(); - } - - size_t get_upper_bound(const K& key) { - auto itr = FSTIter(); - - const K temp_key = key; - - bool res; - if constexpr (std::is_same_v) { - res = m_fst.lowerBound(temp_key.c_str(), key.size(), itr); - } else { - res = m_fst.lowerBound(temp_key, itr); - } - - size_t idx = itr.value(); - while (idx < m_reccnt && m_data[idx].rec.key <= key) { - idx++; - } - - return idx; - } + size_t get_lower_bound(R &rec) {return 0;} + size_t get_upper_bound(R &rec) {return 0;} private: Wrapped* m_data; size_t m_reccnt; size_t m_alloc_size; - FST m_fst; + fst::Trie *m_fst; }; } -- cgit v1.2.3 From fb4312a883dd0e382ecbcfe1119479e6f44d32a6 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 22 Mar 2024 15:35:14 -0400 Subject: PointLookup: added a point lookup query for unique indexes, and some tests --- include/shard/FSTrie.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/shard/FSTrie.h') diff --git a/include/shard/FSTrie.h b/include/shard/FSTrie.h index aa3c9f4..50bf982 100644 --- a/include/shard/FSTrie.h +++ b/include/shard/FSTrie.h @@ -160,6 +160,12 @@ public: return nullptr; } + // FIXME: for convenience, I'm treating this Trie as a unique index + // for now, so no need to scan forward and/or check values. This + // also makes the point lookup query class a lot easier to make. + // Ultimately, though, we can support non-unique indexes with some + // extra work. + return m_data + idx; } -- cgit v1.2.3 From 7e7fd9f7339eee2f1ae974c662a447532dfb1b1a Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 26 Mar 2024 16:35:12 -0400 Subject: Updated FSTrie benchmark and some minor fixes --- include/shard/FSTrie.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/shard/FSTrie.h') diff --git a/include/shard/FSTrie.h b/include/shard/FSTrie.h index 50bf982..95f396f 100644 --- a/include/shard/FSTrie.h +++ b/include/shard/FSTrie.h @@ -61,7 +61,7 @@ public: std::sort(base, stop, std::less>()); for (size_t i=0; i Date: Mon, 15 Apr 2024 14:00:27 -0400 Subject: Updated FSTrie to use const char * instead of std::string Note: this requires the caller to manage the memory of the strings --- include/shard/FSTrie.h | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'include/shard/FSTrie.h') diff --git a/include/shard/FSTrie.h b/include/shard/FSTrie.h index 95f396f..be678ff 100644 --- a/include/shard/FSTrie.h +++ b/include/shard/FSTrie.h @@ -30,7 +30,7 @@ private: typedef decltype(R::key) K; typedef decltype(R::value) V; - static_assert(std::is_same_v, "FST requires std::string keys."); + static_assert(std::is_same_v, "FST requires const char* keys."); public: FSTrie(BufferView buffer) @@ -42,7 +42,7 @@ public: m_alloc_size = sizeof(Wrapped) * buffer.get_record_count(); size_t cnt = 0; - std::vector keys; + std::vector keys; keys.reserve(buffer.get_record_count()); /* @@ -68,14 +68,10 @@ public: m_data[cnt] = temp_buffer[i]; m_data[cnt].clear_timestamp(); - keys.push_back(m_data[cnt].rec.key); + keys.push_back(std::string(m_data[cnt].rec.key)); cnt++; } - for (size_t i=0; i 0) { m_fst = new fst::Trie(keys); @@ -96,7 +92,7 @@ public: m_data = new Wrapped[attemp_reccnt](); m_alloc_size = attemp_reccnt * sizeof(Wrapped); - std::vector keys; + std::vector keys; keys.reserve(attemp_reccnt); // FIXME: For smaller cursor arrays, it may be more efficient to skip @@ -128,7 +124,7 @@ public: /* skip over records that have been deleted via tagging */ if (!cursor.ptr->is_deleted() && cursor.ptr->rec.key != "") { m_data[m_reccnt] = *cursor.ptr; - keys.push_back(m_data[m_reccnt].rec.key); + keys.push_back(std::string(m_data[m_reccnt].rec.key)); m_reccnt++; } @@ -138,10 +134,6 @@ public: } } - for (size_t i=0; i 0) { m_fst = new fst::Trie(keys); } -- cgit v1.2.3 From 7c2f43ff039795576bc0014c367b893fbbaceca4 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 19 Apr 2024 14:39:33 -0400 Subject: Benchmark updates --- include/shard/FSTrie.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/shard/FSTrie.h') diff --git a/include/shard/FSTrie.h b/include/shard/FSTrie.h index be678ff..3783b38 100644 --- a/include/shard/FSTrie.h +++ b/include/shard/FSTrie.h @@ -74,7 +74,7 @@ public: m_reccnt = cnt; if (m_reccnt > 0) { - m_fst = new fst::Trie(keys); + m_fst = new fst::Trie(keys, true, 1); } delete[] temp_buffer; @@ -135,7 +135,7 @@ public: } if (m_reccnt > 0) { - m_fst = new fst::Trie(keys); + m_fst = new fst::Trie(keys, true, 1); } } @@ -180,11 +180,11 @@ public: size_t get_memory_usage() { - return m_fst->getMemoryUsage() + m_alloc_size; + return m_fst->getMemoryUsage(); } size_t get_aux_memory_usage() { - return 0; + return m_alloc_size; } size_t get_lower_bound(R &rec) {return 0;} -- cgit v1.2.3