From d47eeea719448f649e93b6a9ec7593b4cb2fb40e Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 5 Jun 2023 14:25:19 -0400 Subject: Added TrieSpline and PGM Range queries + tests and bugfixes --- include/shard/PGM.h | 84 ++++++++++++++++++---------------- include/shard/TrieSpline.h | 109 ++++++++++++++++++++++++++++----------------- tests/pgm_tests.cpp | 59 ++++++++++++++++++++++++ tests/testing.h | 21 +++++++++ tests/triespline_tests.cpp | 67 ++++++++++++++++++++++++++-- 5 files changed, 257 insertions(+), 83 deletions(-) diff --git a/include/shard/PGM.h b/include/shard/PGM.h index 9fad8d0..f9e1dad 100644 --- a/include/shard/PGM.h +++ b/include/shard/PGM.h @@ -28,13 +28,13 @@ namespace de { template -struct ts_range_query_parms { +struct pgm_range_query_parms { decltype(R::key) lower_bound; decltype(R::key) upper_bound; }; -template -class PGMLookup; +template +class PGMRangeQuery; template struct PGMState { @@ -46,7 +46,6 @@ template struct PGMBufferState { size_t cutoff; Alias* alias; - decltype(R::weight) max_weight; ~PGMBufferState() { delete alias; @@ -63,8 +62,7 @@ private: public: // FIXME: there has to be a better way to do this - friend class PGMLookup; - friend class PGMLookup; + friend class PGMRangeQuery; PGM(MutableBuffer* buffer) : m_reccnt(0), m_tombstone_cnt(0) { @@ -225,10 +223,6 @@ public: } private: - - // FIXME: depending upon the size of the returned bound, - // it may be better to switch between binary search and - // linear scan. size_t get_lower_bound(const K& key) const { auto bound = m_pgm.search(key); size_t idx = bound.lo; @@ -237,29 +231,29 @@ private: return m_reccnt; } - // if the found location _is_ the key, we're done. - if (m_data[idx].rec.key == key) { - return idx; - } - - // if the found location is larger than the key, we need to - // move backwards towards the beginning of the array - if (m_data[idx].rec.key > key) { - for (ssize_t i=idx; i>=0; i--) { - if (m_data[i].rec.key < key) { - return i+1; - } + // If the region to search is less than some pre-specified + // amount, perform a linear scan to locate the record. + if (bound.hi - bound.lo < 256) { + while (idx < bound.hi && m_data[idx].rec.key < key) { + idx++; } - // otherwise, we move forward towards the end } else { - for (size_t i=idx; i= key) { - return i - 1; + // Otherwise, perform a binary search + idx = bound.lo; + size_t max = bound.hi; + + while (idx < max) { + size_t mid = (idx + max) / 2; + if (key > m_data[mid].rec.key) { + idx = mid + 1; + } else { + max = mid; } } + } - return m_reccnt; + return (m_data[idx].rec.key <= key) ? idx : m_reccnt; } Wrapped* m_data; @@ -277,7 +271,7 @@ class PGMRangeQuery { public: static void *get_query_state(PGM *ts, void *parms) { auto res = new PGMState(); - auto p = (ts_range_query_parms *) parms; + auto p = (pgm_range_query_parms *) parms; res->start_idx = ts->get_lower_bound(p->lower_bound); res->stop_idx = ts->get_record_count(); @@ -287,18 +281,26 @@ public: static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { auto res = new PGMBufferState(); - res.cutoff = buffer->get_record_count(); + res->cutoff = buffer->get_record_count(); return res; } static std::vector> query(PGM *ts, void *q_state, void *parms) { std::vector> records; - auto p = (ts_range_query_parms *) parms; + auto p = (pgm_range_query_parms *) parms; auto s = (PGMState *) q_state; - auto ptr = ts->get_record_at(s->lower_bound); + + // if the returned index is one past the end of the + // records for the PGM, then there are not records + // in the index falling into the specified range. + if (s->start_idx == ts->get_record_count()) { + return records; + } + + auto ptr = ts->get_record_at(s->start_idx); size_t i = 0; - while (ptr->rec.key <= p->upper_bound && i < s->stop_idx) { + while (ptr[i].rec.key <= p->upper_bound && i < s->stop_idx - s->start_idx) { records.emplace_back(ptr[i]); i++; } @@ -307,11 +309,20 @@ public: } static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - auto p = (ts_range_query_parms *) parms; + auto p = (pgm_range_query_parms *) parms; auto s = (PGMBufferState *) state; + std::vector> records; + for (size_t i=0; icutoff; i++) { + auto rec = buffer->get_data() + i; + if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { + records.emplace_back(*rec); + } + } + + return records; } static std::vector merge(std::vector> &results) { @@ -335,11 +346,8 @@ public: auto s = (PGMBufferState *) state; delete s; } - - - //{q.get_buffer_query_state(p, p)}; - //{q.buffer_query(p, p)}; - }; +; + } diff --git a/include/shard/TrieSpline.h b/include/shard/TrieSpline.h index 13753a1..fb0ed70 100644 --- a/include/shard/TrieSpline.h +++ b/include/shard/TrieSpline.h @@ -30,28 +30,24 @@ namespace de { size_t g_max_error = 1024; template -struct ts_lookup_parms { - size_t sample_size; - gsl_rng *rng; +struct ts_range_query_parms { + decltype(R::key) lower_bound; + decltype(R::key) upper_bound; }; -template -class TrieSplineLookup; +template +class TrieSplineRangeQuery; template struct TrieSplineState { - decltype(R::weight) tot_weight; - - TrieSplineState() { - tot_weight = 0; - } + size_t start_idx; + size_t stop_idx; }; template struct TrieSplineBufferState { size_t cutoff; Alias* alias; - decltype(R::weight) max_weight; ~TrieSplineBufferState() { delete alias; @@ -68,8 +64,7 @@ private: public: // FIXME: there has to be a better way to do this - friend class TrieSplineLookup; - friend class TrieSplineLookup; + friend class TrieSplineRangeQuery; TrieSpline(MutableBuffer* buffer) : m_reccnt(0), m_tombstone_cnt(0) { @@ -247,9 +242,6 @@ public: private: - // FIXME: depending upon the size of the returned bound, - // it may be better to switch between binary search and - // linear scan. size_t get_lower_bound(const K& key) const { auto bound = m_ts.GetSearchBound(key); size_t idx = bound.begin; @@ -258,29 +250,29 @@ private: return m_reccnt; } - // if the found location _is_ the key, we're done. - if (m_data[idx].rec.key == key) { - return idx; - } - - // if the found location is larger than the key, we need to - // move backwards towards the beginning of the array - if (m_data[idx].rec.key > key) { - for (ssize_t i=idx; i>=0; i--) { - if (m_data[i].rec.key < key) { - return i+1; - } + // If the region to search is less than some pre-specified + // amount, perform a linear scan to locate the record. + if (bound.end - bound.begin < 256) { + while (idx < bound.end && m_data[idx].rec.key < key) { + idx++; } - // otherwise, we move forward towards the end } else { - for (size_t i=idx; i= key) { - return i - 1; + // Otherwise, perform a binary search + idx = bound.begin; + size_t max = bound.end; + + while (idx < max) { + size_t mid = (idx + max) / 2; + if (key > m_data[mid].rec.key) { + idx = mid + 1; + } else { + max = mid; } } + } - return m_reccnt; + return (m_data[idx].rec.key <= key) ? idx : m_reccnt; } Wrapped* m_data; @@ -293,25 +285,63 @@ private: }; -template -class TrieSplineLookup { +template +class TrieSplineRangeQuery { public: - static void *get_query_state(TrieSpline *wss, void *parms) { + static void *get_query_state(TrieSpline *ts, void *parms) { auto res = new TrieSplineState(); + auto p = (ts_range_query_parms *) parms; + + res->start_idx = ts->get_lower_bound(p->lower_bound); + res->stop_idx = ts->get_record_count(); return res; } static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { + auto res = new TrieSplineBufferState(); + res->cutoff = buffer->get_record_count(); + return res; } - static std::vector> query(TrieSpline *wss, void *q_state, void *parms) { + static std::vector> query(TrieSpline *ts, void *q_state, void *parms) { + std::vector> records; + auto p = (ts_range_query_parms *) parms; + auto s = (TrieSplineState *) q_state; + + // if the returned index is one past the end of the + // records for the TrieSpline, then there are not records + // in the index falling into the specified range. + if (s->start_idx == ts->get_record_count()) { + return records; + } + + auto ptr = ts->get_record_at(s->start_idx); + size_t i = 0; + while (ptr[i].rec.key <= p->upper_bound && i < s->stop_idx - s->start_idx) { + records.emplace_back(ptr[i]); + i++; + } + return records; } static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { + auto p = (ts_range_query_parms *) parms; + auto s = (TrieSplineBufferState *) state; + + std::vector> records; + for (size_t i=0; icutoff; i++) { + auto rec = buffer->get_data() + i; + if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { + records.emplace_back(*rec); + } + } + + + return records; } static std::vector merge(std::vector> &results) { @@ -335,11 +365,6 @@ public: auto s = (TrieSplineBufferState *) state; delete s; } - - - //{q.get_buffer_query_state(p, p)}; - //{q.buffer_query(p, p)}; - }; } diff --git a/tests/pgm_tests.cpp b/tests/pgm_tests.cpp index 33979ae..254de03 100644 --- a/tests/pgm_tests.cpp +++ b/tests/pgm_tests.cpp @@ -139,6 +139,60 @@ START_TEST(t_point_lookup_miss) } +START_TEST(t_range_query) +{ + auto buffer = create_sequential_mbuffer(100, 1000); + auto shard = Shard(buffer); + + pgm_range_query_parms parms; + parms.lower_bound = 300; + parms.upper_bound = 500; + + auto state = PGMRangeQuery::get_query_state(&shard, &parms); + auto result = PGMRangeQuery::query(&shard, state, &parms); + PGMRangeQuery::delete_query_state(state); + + ck_assert_int_eq(result.size(), parms.upper_bound - parms.lower_bound + 1); + for (size_t i=0; i(100, 1000); + + pgm_range_query_parms parms; + parms.lower_bound = 300; + parms.upper_bound = 500; + + auto state = PGMRangeQuery::get_buffer_query_state(buffer, &parms); + auto result = PGMRangeQuery::buffer_query(buffer, state, &parms); + PGMRangeQuery::delete_buffer_query_state(state); + + ck_assert_int_eq(result.size(), parms.upper_bound - parms.lower_bound + 1); + for (size_t i=0; i *create_test_mbuffer(size_t cnt) return buffer; } +template +static de::MutableBuffer *create_sequential_mbuffer(decltype(R::key) start, decltype(R::key) stop) +{ + size_t cnt = stop - start; + auto buffer = new de::MutableBuffer(cnt, true, cnt); + + for (size_t i=start; i) { + rec.weight = 1; + } + + buffer->append(rec); + } + + return buffer; +} + template static de::MutableBuffer *create_test_mbuffer_tombstones(size_t cnt, size_t ts_cnt) { diff --git a/tests/triespline_tests.cpp b/tests/triespline_tests.cpp index 982be79..d88d4b1 100644 --- a/tests/triespline_tests.cpp +++ b/tests/triespline_tests.cpp @@ -1,5 +1,5 @@ /* - * tests/irs_tests.cpp + * tests/triespline_tests.cpp * * Unit tests for TrieSpline (Augmented B+Tree) shard * @@ -45,7 +45,7 @@ START_TEST(t_mbuffer_init) } -START_TEST(t_irs_init) +START_TEST(t_init) { size_t n = 512; auto mbuffer1 = create_test_mbuffer(n); @@ -169,13 +169,67 @@ START_TEST(t_full_cancelation) END_TEST +START_TEST(t_range_query) +{ + auto buffer = create_sequential_mbuffer(100, 1000); + auto shard = Shard(buffer); + + ts_range_query_parms parms; + parms.lower_bound = 300; + parms.upper_bound = 500; + + auto state = TrieSplineRangeQuery::get_query_state(&shard, &parms); + auto result = TrieSplineRangeQuery::query(&shard, state, &parms); + TrieSplineRangeQuery::delete_query_state(state); + + ck_assert_int_eq(result.size(), parms.upper_bound - parms.lower_bound + 1); + for (size_t i=0; i(100, 1000); + + ts_range_query_parms parms; + parms.lower_bound = 300; + parms.upper_bound = 500; + + auto state = TrieSplineRangeQuery::get_buffer_query_state(buffer, &parms); + auto result = TrieSplineRangeQuery::buffer_query(buffer, state, &parms); + TrieSplineRangeQuery::delete_buffer_query_state(state); + + ck_assert_int_eq(result.size(), parms.upper_bound - parms.lower_bound + 1); + for (size_t i=0; i