From b25beb13773072c3b143842b45a7c32a1108f347 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 15 Apr 2024 14:00:27 -0400 Subject: Updated FSTrie to use const char * instead of std::string Note: this requires the caller to manage the memory of the strings --- CMakeLists.txt | 2 +- benchmarks/string_insertion_tput.cpp | 18 ++++---- external/psudb-common | 2 +- include/framework/interface/Record.h | 17 ++++++++ include/shard/FSTrie.h | 18 +++----- tests/include/concurrent_extension.h | 6 +-- tests/include/pointlookup.h | 18 ++++---- tests/include/rangecount.h | 18 +++----- tests/include/rangequery.h | 18 +++----- tests/include/shard_standard.h | 10 ++--- tests/include/shard_string.h | 7 ++- tests/include/testing.h | 82 +++++++++++++++--------------------- 12 files changed, 93 insertions(+), 123 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 314fc8c..3c73545 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) set(namespace "de") project("Practical Dynamic Extension" VERSION 0.1.0) -set(debug false) +set(debug true) set(tests True) set(bench true) diff --git a/benchmarks/string_insertion_tput.cpp b/benchmarks/string_insertion_tput.cpp index 4923b09..f4a519a 100644 --- a/benchmarks/string_insertion_tput.cpp +++ b/benchmarks/string_insertion_tput.cpp @@ -16,16 +16,16 @@ #include "psu-util/progress.h" -typedef de::Record Rec; +typedef de::Record Rec; typedef de::FSTrie Trie; typedef de::pl::Query Q; typedef de::DynamicExtension Ext; -std::vector strings; +std::vector> strings; void insert_thread(int64_t start, int64_t end, Ext *extension) { for (uint64_t i=start; iinsert(r)) { _mm_pause(); } @@ -41,7 +41,7 @@ void read_data(std::string fname, size_t n=10000000) { size_t i=0; std::string line; while (i < n && std::getline(file, line, '\n')) { - strings.emplace_back(line); + strings.emplace_back(std::unique_ptr(strdup(line.c_str()))); i++; psudb::progress_update((double) i / (double) n, "Reading file:"); } @@ -82,14 +82,13 @@ int main(int argc, char **argv) { TIMER_START(); for (size_t i=0; i parms; - parms.search_key = strings[j]; + de::pl::Parms parms = {strings[j].get()}; auto res = extension->query(&parms); auto ans = res.get(); if (ans[0].value != j) { - fprintf(stderr, "ext:\t%ld %ld %s\n", ans[0].value, j, strings[j].c_str()); + fprintf(stderr, "ext:\t%ld %ld %s\n", ans[0].value, j, strings[j].get()); } assert(ans[0].value == j); @@ -103,13 +102,12 @@ int main(int argc, char **argv) { TIMER_START(); for (size_t i=0; i parms; - parms.search_key = strings[j]; + de::pl::Parms parms = {strings[j].get()}; auto res = Q::query(shard, nullptr, &parms); if (res[0].rec.value != j) { - fprintf(stderr, "static:\t%ld %ld %s\n", res[0].rec.value, j, strings[j].c_str()); + fprintf(stderr, "static:\t%ld %ld %s\n", res[0].rec.value, j, strings[j].get()); } } TIMER_STOP(); diff --git a/external/psudb-common b/external/psudb-common index 18a3e33..ef14fe0 160000 --- a/external/psudb-common +++ b/external/psudb-common @@ -1 +1 @@ -Subproject commit 18a3e3383ed77e7a9cfab240f54378c43de203bf +Subproject commit ef14fe0f5c1ec52107b187dbe803f472767bd68c diff --git a/include/framework/interface/Record.h b/include/framework/interface/Record.h index d380f9b..19ccadd 100644 --- a/include/framework/interface/Record.h +++ b/include/framework/interface/Record.h @@ -132,6 +132,23 @@ struct Record { } }; +template +struct Record { + const char* key; + V value; + size_t len; + + inline bool operator<(const Record& other) const { + size_t n = std::min(len, other.len) + 1; + return strncmp(key, other.key, n) < 0; + } + + inline bool operator==(const Record& other) const { + size_t n = std::min(len, other.len) + 1; + return strncmp(key, other.key, n) == 0; + } +}; + template struct WeightedRecord { K key; diff --git a/include/shard/FSTrie.h b/include/shard/FSTrie.h index 95f396f..be678ff 100644 --- a/include/shard/FSTrie.h +++ b/include/shard/FSTrie.h @@ -30,7 +30,7 @@ private: typedef decltype(R::key) K; typedef decltype(R::value) V; - static_assert(std::is_same_v, "FST requires std::string keys."); + static_assert(std::is_same_v, "FST requires const char* keys."); public: FSTrie(BufferView buffer) @@ -42,7 +42,7 @@ public: m_alloc_size = sizeof(Wrapped) * buffer.get_record_count(); size_t cnt = 0; - std::vector keys; + std::vector keys; keys.reserve(buffer.get_record_count()); /* @@ -68,14 +68,10 @@ public: m_data[cnt] = temp_buffer[i]; m_data[cnt].clear_timestamp(); - keys.push_back(m_data[cnt].rec.key); + keys.push_back(std::string(m_data[cnt].rec.key)); cnt++; } - for (size_t i=0; i 0) { m_fst = new fst::Trie(keys); @@ -96,7 +92,7 @@ public: m_data = new Wrapped[attemp_reccnt](); m_alloc_size = attemp_reccnt * sizeof(Wrapped); - std::vector keys; + std::vector keys; keys.reserve(attemp_reccnt); // FIXME: For smaller cursor arrays, it may be more efficient to skip @@ -128,7 +124,7 @@ public: /* skip over records that have been deleted via tagging */ if (!cursor.ptr->is_deleted() && cursor.ptr->rec.key != "") { m_data[m_reccnt] = *cursor.ptr; - keys.push_back(m_data[m_reccnt].rec.key); + keys.push_back(std::string(m_data[m_reccnt].rec.key)); m_reccnt++; } @@ -138,10 +134,6 @@ public: } } - for (size_t i=0; i 0) { m_fst = new fst::Trie(keys); } diff --git a/tests/include/concurrent_extension.h b/tests/include/concurrent_extension.h index 2948292..927a094 100644 --- a/tests/include/concurrent_extension.h +++ b/tests/include/concurrent_extension.h @@ -97,8 +97,7 @@ START_TEST(t_insert_with_mem_merges) R r = {key, val}; for (size_t i=0; i<1000; i++) { ck_assert_int_eq(test_de->insert(r), 1); - r.key++; - r.value++; + r = R{r.key + 1, r.value + 1}; } ck_assert_int_eq(test_de->get_record_count(), 1000); @@ -114,8 +113,7 @@ START_TEST(t_insert_with_mem_merges) size_t cnt = 0; do { if (test_de->insert(r)) { - r.key++; - r.value++; + r = R{r.key + 1, r.value + 1}; cnt++; ck_assert_int_eq(test_de->get_record_count(), cnt + 1000); } else { diff --git a/tests/include/pointlookup.h b/tests/include/pointlookup.h index bf4810b..84e71f2 100644 --- a/tests/include/pointlookup.h +++ b/tests/include/pointlookup.h @@ -40,24 +40,25 @@ START_TEST(t_point_lookup_query) auto buffer = create_test_mbuffer(1000); auto shard = Shard(buffer->get_buffer_view()); - pl::Parms parms; { auto bv = buffer->get_buffer_view(); for (size_t i=0; irec.key; - parms.search_key = key; + pl::Parms parms = {key}; auto state = pl::Query::get_query_state(&shard, &parms); auto result = pl::Query::query(&shard, state, &parms); pl::Query::delete_query_state(state); ck_assert_int_eq(result.size(), 1); - ck_assert_str_eq(result[0].rec.key.c_str(), key.c_str()); + ck_assert_str_eq(result[0].rec.key, key); ck_assert_int_eq(result[0].rec.value, bv.get(i)->rec.value); } /* point lookup miss; result size should be 0 */ - parms.search_key = "computer"; + const char *c = "computer"; + pl::Parms parms = {c}; + auto state = pl::Query::get_query_state(&shard, &parms); auto result = pl::Query::query(&shard, state, &parms); pl::Query::delete_query_state(state); @@ -74,23 +75,24 @@ START_TEST(t_buffer_point_lookup) { auto buffer = create_test_mbuffer(1000); - pl::Parms parms; { auto view = buffer->get_buffer_view(); for (int i=view.get_record_count()-1; i>=0; i--) { - parms.search_key = view.get(i)->rec.key; + pl::Parms parms = {view.get(i)->rec.key}; auto state = pl::Query::get_buffer_query_state(&view, &parms); auto result = pl::Query::buffer_query(state, &parms); pl::Query::delete_buffer_query_state(state); ck_assert_int_eq(result.size(), 1); - ck_assert_str_eq(result[0].rec.key.c_str(), view.get(i)->rec.key.c_str()); + ck_assert_str_eq(result[0].rec.key, view.get(i)->rec.key); ck_assert_int_eq(result[0].rec.value, view.get(i)->rec.value); } /* point lookup miss; result size should be 0 */ - parms.search_key = "computer"; + const char *c = "computer"; + pl::Parms parms = {c}; + auto state = pl::Query::get_buffer_query_state(&view, &parms); auto result = pl::Query::buffer_query(state, &parms); pl::Query::delete_buffer_query_state(state); diff --git a/tests/include/rangecount.h b/tests/include/rangecount.h index fdd66d9..c97d64c 100644 --- a/tests/include/rangecount.h +++ b/tests/include/rangecount.h @@ -40,9 +40,7 @@ START_TEST(t_range_count) auto buffer = create_sequential_mbuffer(100, 1000); auto shard = Shard(buffer->get_buffer_view()); - rc::Parms parms; - parms.lower_bound = 300; - parms.upper_bound = 500; + rc::Parms parms = {300, 500}; auto state = rc::Query::get_query_state(&shard, &parms); auto result = rc::Query::query(&shard, state, &parms); @@ -60,9 +58,7 @@ START_TEST(t_buffer_range_count) { auto buffer = create_sequential_mbuffer(100, 1000); - rc::Parms parms; - parms.lower_bound = 300; - parms.upper_bound = 500; + rc::Parms parms = {300, 500}; { auto view = buffer->get_buffer_view(); @@ -87,9 +83,7 @@ START_TEST(t_range_count_merge) auto shard1 = Shard(buffer1->get_buffer_view()); auto shard2 = Shard(buffer2->get_buffer_view()); - rc::Parms parms; - parms.lower_bound = 150; - parms.upper_bound = 500; + rc::Parms parms = {150, 500}; size_t result_size = parms.upper_bound - parms.lower_bound + 1 - 200; @@ -128,10 +122,8 @@ START_TEST(t_lower_bound) auto merged = Shard(shards); - for (size_t i=100; i<1000; i++) { - R r; - r.key = i; - r.value = i; + for (uint32_t i=100; i<1000; i++) { + R r = R{i, i}; auto idx = merged.get_lower_bound(i); diff --git a/tests/include/rangequery.h b/tests/include/rangequery.h index a8a73f7..a3b761e 100644 --- a/tests/include/rangequery.h +++ b/tests/include/rangequery.h @@ -39,9 +39,7 @@ START_TEST(t_range_query) auto buffer = create_sequential_mbuffer(100, 1000); auto shard = Shard(buffer->get_buffer_view()); - rq::Parms parms; - parms.lower_bound = 300; - parms.upper_bound = 500; + rq::Parms parms = {300, 500}; auto state = rq::Query::get_query_state(&shard, &parms); auto result = rq::Query::query(&shard, state, &parms); @@ -62,9 +60,7 @@ START_TEST(t_buffer_range_query) { auto buffer = create_sequential_mbuffer(100, 1000); - rq::Parms parms; - parms.lower_bound = 300; - parms.upper_bound = 500; + rq::Parms parms = {300, 500}; { auto view = buffer->get_buffer_view(); @@ -92,9 +88,7 @@ START_TEST(t_range_query_merge) auto shard1 = Shard(buffer1->get_buffer_view()); auto shard2 = Shard(buffer2->get_buffer_view()); - rq::Parms parms; - parms.lower_bound = 150; - parms.upper_bound = 500; + rq::Parms parms = {150, 500}; size_t result_size = parms.upper_bound - parms.lower_bound + 1 - 200; @@ -149,10 +143,8 @@ START_TEST(t_lower_bound) auto merged = Shard(shards); - for (size_t i=100; i<1000; i++) { - R r; - r.key = i; - r.value = i; + for (uint32_t i=100; i<1000; i++) { + R r = R{i, i}; auto idx = merged.get_lower_bound(i); diff --git a/tests/include/shard_standard.h b/tests/include/shard_standard.h index 55e4c7b..2809d74 100644 --- a/tests/include/shard_standard.h +++ b/tests/include/shard_standard.h @@ -150,10 +150,8 @@ START_TEST(t_point_lookup) auto view = buffer->get_buffer_view(); for (size_t i=0; irec.key; - r.value = rec->rec.value; + R r = {rec->rec.key, rec->rec.value}; auto result = isam.point_lookup(r); ck_assert_ptr_nonnull(result); @@ -174,10 +172,8 @@ START_TEST(t_point_lookup_miss) auto buffer = create_double_seq_mbuffer(n, false); auto isam = Shard(buffer->get_buffer_view()); - for (size_t i=n + 100; i<2*n; i++) { - R r; - r.key = i; - r.value = i; + for (uint32_t i=n + 100; i<2*n; i++) { + R r = R{i, i}; auto result = isam.point_lookup(r); ck_assert_ptr_null(result); diff --git a/tests/include/shard_string.h b/tests/include/shard_string.h index fa51630..2d7a72a 100644 --- a/tests/include/shard_string.h +++ b/tests/include/shard_string.h @@ -122,7 +122,7 @@ START_TEST(t_point_lookup) auto result = shard.point_lookup(r); ck_assert_ptr_nonnull(result); - ck_assert_str_eq(result->rec.key.c_str(), r.key.c_str()); + ck_assert_str_eq(result->rec.key, r.key); ck_assert_int_eq(result->rec.value, r.value); //fprintf(stderr, "%ld\n", i); } @@ -141,9 +141,8 @@ START_TEST(t_point_lookup_miss) auto shard = Shard(buffer->get_buffer_view()); for (size_t i=n + 100; i<2*n; i++) { - R r; - r.key = std::string("computer"); - r.value = 1234; + const char *c = "computer"; + R r = {c, 1234, 8}; auto result = shard.point_lookup(r); ck_assert_ptr_null(result); diff --git a/tests/include/testing.h b/tests/include/testing.h index 2315daa..d0bff2d 100644 --- a/tests/include/testing.h +++ b/tests/include/testing.h @@ -27,14 +27,17 @@ typedef de::WeightedRecord WRec; typedef de::Record Rec; typedef de::EuclidPoint PRec; -typedef de::Record StringRec; +typedef de::Record StringRec; -std::string kjv_wordlist = "tests/data/kjv-wordlist.txt"; -std::string summa_wordlist = "tests/data/summa-wordlist.txt"; +static std::string kjv_wordlist = "tests/data/kjv-wordlist.txt"; +static std::string summa_wordlist = "tests/data/summa-wordlist.txt"; + +static std::vector> string_data; static std::vector read_string_data(std::string fname, size_t n) { std::vector vec; vec.reserve(n); + string_data.reserve(n); std::fstream file; file.open(fname, std::ios::in); @@ -44,13 +47,17 @@ static std::vector read_string_data(std::string fname, size_t n) { if (!std::getline(file, line, '\n')) break; std::stringstream ls(line); - StringRec r; std::string field; std::getline(ls, field, '\t'); - r.value = atol(field.c_str()); + auto val = atol(field.c_str()); std::getline(ls, field, '\n'); - r.key = std::string(field); + + char *c = strdup(field.c_str()); + + string_data.push_back(std::unique_ptr(c)); + + StringRec r(string_data[string_data.size() -1].get(), val, field.size()); vec.push_back(r); } @@ -62,7 +69,7 @@ static std::vector read_string_data(std::string fname, size_t n) { template std::vector strip_wrapping(std::vector> vec) { std::vector out(vec.size()); - for (size_t i=0; i *create_test_mbuffer(size_t cnt) { auto buffer = new de::MutableBuffer(cnt/2, cnt); - R rec; if constexpr (de::KVPInterface){ - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v){ auto records = read_string_data(kjv_wordlist, cnt); for (size_t i=0; i) { - rec.weight = 1; - } - buffer->append(records[i]); } } else { for (size_t i = 0; i < cnt; i++) { - rec.key = rand(); - rec.value = rand(); - if constexpr (de::WeightedRecordInterface) { - rec.weight = 1; + buffer->append({(uint64_t) rand(), (uint32_t) rand(), 1}); + } else { + buffer->append({(uint64_t) rand(), (uint32_t) rand()}); } - - buffer->append(rec); } } } else if constexpr (de::NDRecordInterface) { for (size_t i=0; iappend({a, b}); + buffer->append({(uint64_t) rand(), (uint64_t) rand()}); } } @@ -155,25 +152,19 @@ static de::MutableBuffer *create_sequential_mbuffer(size_t start, size_t stop size_t cnt = stop - start; auto buffer = new de::MutableBuffer(cnt/2, cnt); - for (size_t i=start; i) { - rec.key = i; - rec.value = i; - } else if constexpr (de::NDRecordInterface) { - rec = {i, i}; - } + for (uint32_t i=start; i) { - rec.weight = 1; + buffer->append({i, i, 1}); + } else { + buffer->append({i, i}); } - - buffer->append(rec); } return buffer; } +/* template static de::MutableBuffer *create_test_mbuffer_tombstones(size_t cnt, size_t ts_cnt) { @@ -183,13 +174,13 @@ static de::MutableBuffer *create_test_mbuffer_tombstones(size_t cnt, size_t t R rec; for (size_t i = 0; i < cnt; i++) { - rec.key = rand(); - rec.value = rand(); - if constexpr (de::WeightedRecordInterface) { - rec.weight = 1; + rec = {rand(), rand(), 1}; + } else { + rec = {rand(), rand(), 1}; } + if (i < ts_cnt) { tombstones.push_back({rec.key, rec.value}); } @@ -204,6 +195,7 @@ static de::MutableBuffer *create_test_mbuffer_tombstones(size_t cnt, size_t t return buffer; } +*/ template requires de::WeightedRecordInterface && de::KVPInterface @@ -234,20 +226,12 @@ static de::MutableBuffer *create_double_seq_mbuffer(size_t cnt, bool ts=false { auto buffer = new de::MutableBuffer(cnt/2, cnt); - for (size_t i = 0; i < cnt / 2; i++) { - R rec; - rec.key = i; - rec.value = i; - - buffer->append(rec, ts); + for (uint32_t i = 0; i < cnt / 2; i++) { + buffer->append({i, i}, ts); } - for (size_t i = 0; i < cnt / 2; i++) { - R rec; - rec.key = i; - rec.value = i + 1; - - buffer->append(rec, ts); + for (uint32_t i = 0; i < cnt / 2; i++) { + buffer->append({i, i+1}, ts); } return buffer; -- cgit v1.2.3