diff options
Diffstat (limited to 'include/shard')
| -rw-r--r-- | include/shard/FSTrie.h | 121 |
1 files changed, 20 insertions, 101 deletions
diff --git a/include/shard/FSTrie.h b/include/shard/FSTrie.h index 62aa0b7..aa3c9f4 100644 --- a/include/shard/FSTrie.h +++ b/include/shard/FSTrie.h @@ -1,13 +1,11 @@ /* * include/shard/FSTrie.h * - * Copyright (C) 2023 Douglas B. Rumbaugh <drumbaugh@psu.edu> + * Copyright (C) 2024 Douglas B. Rumbaugh <drumbaugh@psu.edu> * * Distributed under the Modified BSD License. * * A shard shim around the FSTrie learned index. - * - * TODO: The code in this file is very poorly commented. */ #pragma once @@ -15,9 +13,7 @@ #include <vector> #include "framework/ShardRequirements.h" -#include "FST.hpp" -#include "psu-ds/BloomFilter.h" -#include "util/bf_config.h" +#include "fst.hpp" #include "util/SortedMerge.h" using psudb::CACHELINE_SIZE; @@ -28,14 +24,13 @@ using psudb::byte; namespace de { -template <KVPInterface R, size_t E=1024> +template <KVPInterface R> class FSTrie { private: + typedef decltype(R::key) K; typedef decltype(R::value) V; - - static_assert(std::is_same_v<K, std::string> || std::is_same_v<K, uint64_t>, - "FST requires either string or uint64_t keys"); + static_assert(std::is_same_v<K, std::string>, "FST requires std::string keys."); public: FSTrie(BufferView<R> buffer) @@ -50,22 +45,12 @@ public: std::vector<K> keys; keys.reserve(buffer.get_record_count()); - std::vector<size_t> values; - values.reserve(buffer.get_record_count()); - - size_t longest_key = 0; - /* * Copy the contents of the buffer view into a temporary buffer, and * sort them. We still need to iterate over these temporary records to * apply tombstone/deleted record filtering, as well as any possible * per-record processing that is required by the shard being built. */ - /* - auto temp_buffer = (Wrapped<R> *) psudb::sf_aligned_calloc(CACHELINE_SIZE, - buffer.get_record_count(), - sizeof(Wrapped<R>)); - */ auto temp_buffer = new Wrapped<R>[buffer.get_record_count()](); for (size_t i=0; i<buffer.get_record_count(); i++) { temp_buffer[i] = *(buffer.get(i)); @@ -76,21 +61,14 @@ public: std::sort(base, stop, std::less<Wrapped<R>>()); for (size_t i=0; i<buffer.get_record_count(); i++) { - if (temp_buffer[i].is_deleted()) { + if (temp_buffer[i].is_deleted() || !temp_buffer[i].is_visible()) { continue; } m_data[cnt] = temp_buffer[i]; - m_data[cnt].header = 0; + m_data[cnt].clear_timestamp(); keys.push_back(m_data[cnt].rec.key); - values.push_back(cnt); - if constexpr (std::is_same_v<K, std::string>) { - if (m_data[cnt].rec.key.size() > longest_key) { - longest_key = m_data[cnt].rec.key.size(); - } - } - cnt++; } @@ -99,11 +77,8 @@ public: } m_reccnt = cnt; - m_fst = FST(); - if constexpr (std::is_same_v<K, std::string>) { - m_fst.load(keys, values, longest_key); - } else { - m_fst.load(keys, values); + if (m_reccnt > 0) { + m_fst = new fst::Trie(keys); } delete[] temp_buffer; @@ -124,10 +99,6 @@ public: std::vector<K> keys; keys.reserve(attemp_reccnt); - std::vector<size_t> values; - values.reserve(attemp_reccnt); - - size_t longest_key = 0; // FIXME: For smaller cursor arrays, it may be more efficient to skip // the priority queue and just do a scan. PriorityQueue<Wrapped<R>> pq(cursors.size()); @@ -155,16 +126,9 @@ public: } else { auto& cursor = cursors[now.version]; /* skip over records that have been deleted via tagging */ - if (!cursor.ptr->is_deleted()) { + if (!cursor.ptr->is_deleted() && cursor.ptr->rec.key != "") { m_data[m_reccnt] = *cursor.ptr; keys.push_back(m_data[m_reccnt].rec.key); - values.push_back(m_data[m_reccnt].rec.value); - - if constexpr (std::is_same_v<K, std::string>) { - if (m_data[m_reccnt].rec.key.size() > longest_key) { - longest_key = m_data[m_reccnt].rec.key.size(); - } - } m_reccnt++; } @@ -179,37 +143,24 @@ public: } if (m_reccnt > 0) { - m_fst = FST(); - if constexpr (std::is_same_v<K, std::string>) { - m_fst.load(keys, values, longest_key); - } else { - m_fst.load(keys, values); - } + m_fst = new fst::Trie(keys); } } ~FSTrie() { delete[] m_data; + delete m_fst; } Wrapped<R> *point_lookup(const R &rec, bool filter=false) { - size_t idx; - bool res; - if constexpr (std::is_same_v<K, std::string>) { - res = m_fst.lookup((uint8_t*)rec.key.c_str(), rec.key.size(), idx); - } else { - res = m_fst.lookup(rec.key, idx); - } - if (res && m_data[idx].rec.key != rec.key) { - fprintf(stderr, "ERROR!\n"); - } + auto idx = m_fst->exactSearch(rec.key); - if (res) { - return m_data + idx; + if (idx == fst::kNotFound) { + return nullptr; } - return nullptr; + return m_data + idx; } Wrapped<R>* get_data() const { @@ -231,53 +182,21 @@ public: size_t get_memory_usage() { - return m_fst.mem() + m_alloc_size; + return m_fst->getMemoryUsage() + m_alloc_size; } size_t get_aux_memory_usage() { return 0; } - size_t get_lower_bound(const K& key) { - auto itr = FSTIter(); - - const K temp_key = key; - - bool res; - if constexpr (std::is_same_v<K, std::string>) { - res = m_fst.lowerBound(temp_key.c_str(), key.size(), itr); - } else { - res = m_fst.lowerBound(temp_key, itr); - } - - return itr.value(); - } - - size_t get_upper_bound(const K& key) { - auto itr = FSTIter(); - - const K temp_key = key; - - bool res; - if constexpr (std::is_same_v<K, std::string>) { - res = m_fst.lowerBound(temp_key.c_str(), key.size(), itr); - } else { - res = m_fst.lowerBound(temp_key, itr); - } - - size_t idx = itr.value(); - while (idx < m_reccnt && m_data[idx].rec.key <= key) { - idx++; - } - - return idx; - } + size_t get_lower_bound(R &rec) {return 0;} + size_t get_upper_bound(R &rec) {return 0;} private: Wrapped<R>* m_data; size_t m_reccnt; size_t m_alloc_size; - FST m_fst; + fst::Trie *m_fst; }; } |