From eb8dbaa770a57557d67c817c2839c64f536a6ce4 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 13 Sep 2023 16:22:03 -0400 Subject: Began re-architecting the project for concurrency support The project is now in a state where it builds, but it probably has a lot of bugs still. --- include/framework/Configuration.h | 54 +++++ include/framework/DynamicExtension.h | 372 ++++++-------------------------- include/framework/ExtensionStructure.h | 374 +++++++++++++++++++++++++++++++++ include/framework/InternalLevel.h | 8 +- include/framework/MutableBuffer.h | 67 +++++- include/framework/RecordInterface.h | 2 +- include/framework/Scheduler.h | 76 +++++++ 7 files changed, 645 insertions(+), 308 deletions(-) create mode 100644 include/framework/Configuration.h create mode 100644 include/framework/ExtensionStructure.h create mode 100644 include/framework/Scheduler.h (limited to 'include') diff --git a/include/framework/Configuration.h b/include/framework/Configuration.h new file mode 100644 index 0000000..eb9b93f --- /dev/null +++ b/include/framework/Configuration.h @@ -0,0 +1,54 @@ +/* + * include/framework/DynamicExtension.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include + +#include "psu-util/timer.h" +#include "psu-ds/Alias.h" + +namespace de { + +thread_local size_t sampling_attempts = 0; +thread_local size_t sampling_rejections = 0; +thread_local size_t deletion_rejections = 0; +thread_local size_t bounds_rejections = 0; +thread_local size_t tombstone_rejections = 0; +thread_local size_t buffer_rejections = 0; + +/* + * thread_local size_t various_sampling_times go here. + */ +thread_local size_t sample_range_time = 0; +thread_local size_t alias_time = 0; +thread_local size_t alias_query_time = 0; +thread_local size_t rejection_check_time = 0; +thread_local size_t buffer_sample_time = 0; +thread_local size_t memlevel_sample_time = 0; +thread_local size_t disklevel_sample_time = 0; +thread_local size_t sampling_bailouts = 0; + + +enum class LayoutPolicy { + LEVELING, + TEIRING +}; + +enum class DeletePolicy { + TOMBSTONE, + TAGGING +}; + +typedef ssize_t level_index; + +} diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 524024b..5e9bcee 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -19,62 +19,36 @@ #include "framework/ShardInterface.h" #include "framework/QueryInterface.h" #include "framework/RecordInterface.h" +#include "framework/ExtensionStructure.h" + +#include "framework/Configuration.h" +#include "framework/Scheduler.h" -#include "shard/WIRS.h" #include "psu-util/timer.h" #include "psu-ds/Alias.h" namespace de { -thread_local size_t sampling_attempts = 0; -thread_local size_t sampling_rejections = 0; -thread_local size_t deletion_rejections = 0; -thread_local size_t bounds_rejections = 0; -thread_local size_t tombstone_rejections = 0; -thread_local size_t buffer_rejections = 0; - -/* - * thread_local size_t various_sampling_times go here. 
- */ -thread_local size_t sample_range_time = 0; -thread_local size_t alias_time = 0; -thread_local size_t alias_query_time = 0; -thread_local size_t rejection_check_time = 0; -thread_local size_t buffer_sample_time = 0; -thread_local size_t memlevel_sample_time = 0; -thread_local size_t disklevel_sample_time = 0; -thread_local size_t sampling_bailouts = 0; - - -enum class LayoutPolicy { - LEVELING, - TEIRING -}; - -enum class DeletePolicy { - TOMBSTONE, - TAGGING -}; - -typedef ssize_t level_index; - template class DynamicExtension { - //typedef typename S Shard; typedef S Shard; typedef MutableBuffer Buffer; - + typedef ExtensionStructure Structure; public: - DynamicExtension(size_t buffer_cap, size_t scale_factor, double max_delete_prop) - : m_scale_factor(scale_factor), m_max_delete_prop(max_delete_prop), - m_buffer(new Buffer(buffer_cap, buffer_cap * max_delete_prop)) + DynamicExtension(size_t buffer_cap, size_t scale_factor, double max_delete_prop, size_t memory_budget=0, + size_t thread_cnt=16) + : m_scale_factor(scale_factor) + , m_max_delete_prop(max_delete_prop) + , m_sched(memory_budget, thread_cnt) { } ~DynamicExtension() { - delete m_buffer; + for (size_t i=0; idelete_record(rec)) { - return 1; - } + if (get_active_version()->tagged_delete(rec)) { + return 1; } - // the buffer will take the longest amount of time, and - // probably has the lowest probability of having the record, - // so we'll check it last. + /* + * the buffer will take the longest amount of time, and + * probably has the lowest probability of having the record, + * so we'll check it last. + */ return buffer->delete_record(rec); } + /* + * If tagging isn't used, then delete using a tombstone + */ return internal_append(rec, true); } std::vector query(void *parms) { auto buffer = get_buffer(); + auto vers = get_active_version(); // Get the buffer query state auto buffer_state = Q::get_buffer_query_state(buffer, parms); @@ -115,7 +89,7 @@ public: std::vector> shards; std::vector states; - for (auto &level : m_levels) { + for (auto &level : vers->get_levels()) { level->get_query_states(shards, states, parms); } @@ -125,7 +99,7 @@ public: // Execute the query for the buffer auto buffer_results = Q::buffer_query(buffer, buffer_state, parms); - query_results[0] = std::move(filter_deletes(buffer_results, {-1, -1}, buffer)); + query_results[0] = std::move(filter_deletes(buffer_results, {-1, -1}, buffer, vers)); if constexpr (Q::EARLY_ABORT) { if (query_results[0].size() > 0) { auto result = Q::merge(query_results, parms); @@ -141,7 +115,7 @@ public: // Execute the query for each shard for (size_t i=0; i 0) { auto result = Q::merge(query_results, parms); @@ -170,75 +144,44 @@ public: size_t get_record_count() { size_t cnt = get_buffer()->get_record_count(); - - for (size_t i=0; iget_record_count(); - } - - return cnt; + return cnt + get_active_version()->get_record_count(); } size_t get_tombstone_cnt() { size_t cnt = get_buffer()->get_tombstone_count(); - - for (size_t i=0; iget_tombstone_count(); - } - - return cnt; + return cnt + get_active_version()->get_tombstone_cnt(); } size_t get_height() { - return m_levels.size(); + return get_active_version()->get_height(); } size_t get_memory_usage() { - size_t cnt = m_buffer->get_memory_usage(); - - for (size_t i=0; iget_memory_usage(); - } + auto vers = get_active_version(); + auto buffer = get_buffer(); - return cnt; + return vers.get_memory_usage() + buffer->get_memory_usage(); } size_t get_aux_memory_usage() { - size_t cnt = m_buffer->get_aux_memory_usage(); - - for 
(size_t i=0; iget_aux_memory_usage(); - } - } - - return cnt; - } - - bool validate_tombstone_proportion() { - long double ts_prop; - for (size_t i=0; iget_tombstone_count() / (long double) calc_level_record_capacity(i); - if (ts_prop > (long double) m_max_delete_prop) { - return false; - } - } - } + auto vers = get_active_version(); + auto buffer = get_buffer(); - return true; + return vers.get_aux_memory_usage() + buffer->get_aux_memory_usage(); } size_t get_buffer_capacity() { - return m_buffer->get_capacity(); + return get_height()->get_capacity(); } Shard *create_static_structure() { + auto vers = get_active_version(); std::vector shards; - if (m_levels.size() > 0) { - for (int i=m_levels.size() - 1; i>= 0; i--) { - if (m_levels[i]) { - shards.emplace_back(m_levels[i]->get_merged_shard()); + if (vers->get_levels().size() > 0) { + for (int i=vers->get_levels().size() - 1; i>= 0; i--) { + if (vers->get_levels()[i]) { + shards.emplace_back(vers->get_levels()[i]->get_merged_shard()); } } } @@ -263,16 +206,32 @@ public: return flattened; } + /* + * Mostly exposed for unit-testing purposes. Verifies that the current + * active version of the ExtensionStructure doesn't violate the maximum + * tombstone proportion invariant. + */ + bool validate_tombstone_proportion() { + return get_active_version()->validate_tombstone_proportion(); + } + private: - Buffer *m_buffer; + Scheduler m_sched; + + std::vector m_buffers; + std::vector m_versions; + + std::atomic m_current_epoch; size_t m_scale_factor; double m_max_delete_prop; - std::vector *> m_levels; - Buffer *get_buffer() { - return m_buffer; + return m_buffers[0]; + } + + Structure *get_active_version() { + return m_versions[0]; } int internal_append(const R &rec, bool ts) { @@ -281,13 +240,14 @@ private: ; if (buffer->is_full()) { - merge_buffer(); + auto vers = get_active_version(); + m_sched.schedule_merge(vers, buffer); } return buffer->append(rec, ts); } - std::vector> filter_deletes(std::vector> &records, ShardID shid, Buffer *buffer) { + std::vector> filter_deletes(std::vector> &records, ShardID shid, Buffer *buffer, Structure *vers) { if constexpr (!Q::SKIP_DELETE_FILTER) { return records; } @@ -322,12 +282,12 @@ private: if (shid != INVALID_SHID) { for (size_t lvl=0; lvl<=shid.level_idx; lvl++) { - if (m_levels[lvl]->check_tombstone(0, rec.rec)) { + if (vers->get_levels()[lvl]->check_tombstone(0, rec.rec)) { continue; } } - if (m_levels[shid.level_idx]->check_tombstone(shid.shard_idx + 1, rec.rec)) { + if (vers->get_levels()[shid.level_idx]->check_tombstone(shid.shard_idx + 1, rec.rec)) { continue; } } @@ -337,198 +297,6 @@ private: return processed_records; } - - /* - * Add a new level to the LSM Tree and return that level's index. Will - * automatically determine whether the level should be on memory or on disk, - * and act appropriately. - */ - inline level_index grow() { - level_index new_idx; - - size_t new_shard_cnt = (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor; - new_idx = m_levels.size(); - if (new_idx > 0) { - assert(m_levels[new_idx - 1]->get_shard(0)->get_tombstone_count() == 0); - } - m_levels.emplace_back(new InternalLevel(new_idx, new_shard_cnt)); - - return new_idx; - } - - - // Merge the memory table down into the tree, completing any required other - // merges to make room for it. 
- inline void merge_buffer() { - auto buffer = get_buffer(); - - if (!can_merge_with(0, buffer->get_record_count())) { - merge_down(0); - } - - merge_buffer_into_l0(buffer); - enforce_delete_maximum(0); - - buffer->truncate(); - return; - } - - /* - * Merge the specified level down into the tree. The level index must be - * non-negative (i.e., this function cannot be used to merge the buffer). This - * routine will recursively perform any necessary merges to make room for the - * specified level. - */ - inline void merge_down(level_index idx) { - level_index merge_base_level = find_mergable_level(idx); - if (merge_base_level == -1) { - merge_base_level = grow(); - } - - for (level_index i=merge_base_level; i>idx; i--) { - merge_levels(i, i-1); - enforce_delete_maximum(i); - } - - return; - } - - /* - * Find the first level below the level indicated by idx that - * is capable of sustaining a merge operation and return its - * level index. If no such level exists, returns -1. Also - * returns -1 if idx==0, and no such level exists, to simplify - * the logic of the first merge. - */ - inline level_index find_mergable_level(level_index idx, Buffer *buffer=nullptr) { - - if (idx == 0 && m_levels.size() == 0) return -1; - - bool level_found = false; - bool disk_level; - level_index merge_level_idx; - - size_t incoming_rec_cnt = get_level_record_count(idx, buffer); - for (level_index i=idx+1; i::merge_levels(m_levels[base_level], m_levels[incoming_level]); - mark_as_unused(tmp); - } else { - m_levels[base_level]->append_merged_shards(m_levels[incoming_level]); - } - - mark_as_unused(m_levels[incoming_level]); - m_levels[incoming_level] = new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor); - } - - - inline void merge_buffer_into_l0(Buffer *buffer) { - assert(m_levels[0]); - if constexpr (L == LayoutPolicy::LEVELING) { - // FIXME: Kludgey implementation due to interface constraints. - auto old_level = m_levels[0]; - auto temp_level = new InternalLevel(0, 1); - temp_level->append_buffer(buffer); - auto new_level = InternalLevel::merge_levels(old_level, temp_level); - - m_levels[0] = new_level; - delete temp_level; - mark_as_unused(old_level); - } else { - m_levels[0]->append_buffer(buffer); - } - } - - /* - * Mark a given memory level as no-longer in use by the tree. For now this - * will just free the level. In future, this will be more complex as the - * level may not be able to immediately be deleted, depending upon who - * else is using it. - */ - inline void mark_as_unused(InternalLevel *level) { - delete level; - } - - /* - * Check the tombstone proportion for the specified level and - * if the limit is exceeded, forcibly merge levels until all - * levels below idx are below the limit. - */ - inline void enforce_delete_maximum(level_index idx) { - long double ts_prop = (long double) m_levels[idx]->get_tombstone_count() / (long double) calc_level_record_capacity(idx); - - if (ts_prop > (long double) m_max_delete_prop) { - merge_down(idx); - } - - return; - } - - /* - * Assume that level "0" should be larger than the buffer. The buffer - * itself is index -1, which should return simply the buffer capacity. - */ - inline size_t calc_level_record_capacity(level_index idx) { - return get_buffer()->get_capacity() * pow(m_scale_factor, idx+1); - } - - /* - * Returns the actual number of records present on a specified level. An - * index value of -1 indicates the memory table. Can optionally pass in - * a pointer to the memory table to use, if desired. 
Otherwise, there are - * no guarantees about which buffer will be accessed if level_index is -1. - */ - inline size_t get_level_record_count(level_index idx, Buffer *buffer=nullptr) { - - assert(idx >= -1); - if (idx == -1) { - return (buffer) ? buffer->get_record_count() : get_buffer()->get_record_count(); - } - - return (m_levels[idx]) ? m_levels[idx]->get_record_count() : 0; - } - - /* - * Determines if the specific level can merge with another record containing - * incoming_rec_cnt number of records. The provided level index should be - * non-negative (i.e., not refer to the buffer) and will be automatically - * translated into the appropriate index into either the disk or memory level - * vector. - */ - inline bool can_merge_with(level_index idx, size_t incoming_rec_cnt) { - if (idx>= m_levels.size() || !m_levels[idx]) { - return false; - } - - if (L == LayoutPolicy::LEVELING) { - return m_levels[idx]->get_record_count() + incoming_rec_cnt <= calc_level_record_capacity(idx); - } else { - return m_levels[idx]->get_shard_count() < m_scale_factor; - } - - // unreachable - assert(true); - } }; - } diff --git a/include/framework/ExtensionStructure.h b/include/framework/ExtensionStructure.h new file mode 100644 index 0000000..1e756db --- /dev/null +++ b/include/framework/ExtensionStructure.h @@ -0,0 +1,374 @@ +/* + * include/framework/ExtensionStructure.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include + +#include "framework/MutableBuffer.h" +#include "framework/InternalLevel.h" +#include "framework/ShardInterface.h" +#include "framework/QueryInterface.h" +#include "framework/RecordInterface.h" + +#include "framework/Configuration.h" + +#include "psu-util/timer.h" +#include "psu-ds/Alias.h" + +namespace de { + +template +class ExtensionStructure { + typedef S Shard; + typedef MutableBuffer Buffer; + +public: + ExtensionStructure(size_t buffer_size, size_t scale_factor, double max_delete_prop) + : m_scale_factor(scale_factor) + , m_max_delete_prop(max_delete_prop) + , m_buffer_size(buffer_size) + {} + + ~ExtensionStructure() = default; + + /* + * Create a shallow copy of this extension structure. The copy will share references to the + * same levels/shards as the original, but will have its own lists. As all of the shards are + * immutable (with the exception of deletes), the copy can be restructured with merges, etc., + * without affecting the original. + * + * NOTE: When using tagged deletes, a delete of a record in the original structure will affect + * the copy, so long as the copy retains a reference to the same shard as the original. This could + * cause synchronization problems under tagging with concurrency. Any deletes in this context will + * need to be forwarded to the appropriate structures manually. + */ + ExtensionStructure *copy() { + auto new_struct = new ExtensionStructure(m_scale_factor, m_max_delete_prop, m_buffer_size); + for (size_t i=0; im_levels.push_back(m_levels[i]); + } + + return new_struct; + } + + /* + * Search for a record matching the argument and mark it deleted by + * setting the delete bit in its wrapped header. Returns 1 if a matching + * record was found and deleted, and 0 if a matching record was not found. + * + * This function will stop after finding the first matching record. It is assumed + * that no duplicate records exist. 
In the case of duplicates, this function will + * still "work", but in the sense of "delete first match". + */ + int tagged_delete(const R &rec) { + for (auto level : m_levels) { + if (level && level->delete_record(rec)) { + return 1; + } + } + + /* + * If the record to be erased wasn't found, return 0. The + * DynamicExtension itself will then search the active + * Buffers. + */ + return 0; + } + + /* + * Merge the memory table down into the tree, completing any required other + * merges to make room for it. + */ + inline bool merge_buffer(Buffer *buffer) { + if (!can_merge_with(0, buffer->get_record_count())) { + merge_down(0); + } + + merge_buffer_into_l0(buffer); + enforce_delete_maximum(0); + + buffer->truncate(); + return true; + } + + /* + * Return the total number of records (including tombstones) within all + * of the levels of the structure. + */ + size_t get_record_count() { + size_t cnt = 0; + + for (size_t i=0; iget_record_count(); + } + + return cnt; + } + + /* + * Return the total number of tombstones contained within all of the + * levels of the structure. + */ + size_t get_tombstone_cnt() { + size_t cnt = 0; + + for (size_t i=0; iget_tombstone_count(); + } + + return cnt; + } + + /* + * Return the number of levels within the structure. Note that not + * all of these levels are necessarily populated. + */ + size_t get_height() { + return m_levels.size(); + } + + /* + * Return the amount of memory (in bytes) used by the shards within the + * structure for storing the primary data structure and raw data. + */ + size_t get_memory_usage() { + size_t cnt = 0; + for (size_t i=0; iget_memory_usage(); + } + + return cnt; + } + + /* + * Return the amount of memory (in bytes) used by the shards within the + * structure for storing auxiliary data structures. This total does not + * include memory used for the main data structure, or raw data. + */ + size_t get_aux_memory_usage() { + size_t cnt = 0; + for (size_t i=0; iget_aux_memory_usage(); + } + } + + return cnt; + } + + /* + * Validate that no level in the structure exceeds its maximum tombstone capacity. This is + * used to trigger preemptive compactions at the end of the merge process. + */ + bool validate_tombstone_proportion() { + long double ts_prop; + for (size_t i=0; iget_tombstone_count() / (long double) calc_level_record_capacity(i); + if (ts_prop > (long double) m_max_delete_prop) { + return false; + } + } + } + + return true; + } + + /* + * Return a reference to the underlying vector of levels within the + * structure. + */ + std::vector>> &get_levels() { + return m_levels; + } + +private: + size_t m_scale_factor; + double m_max_delete_prop; + size_t m_buffer_size; + + std::vector>> m_levels; + + /* + * Add a new level to the LSM Tree and return that level's index. Will + * automatically determine whether the level should be on memory or on disk, + * and act appropriately. + */ + inline level_index grow() { + level_index new_idx; + + size_t new_shard_cnt = (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor; + new_idx = m_levels.size(); + if (new_idx > 0) { + assert(m_levels[new_idx - 1]->get_shard(0)->get_tombstone_count() == 0); + } + m_levels.emplace_back(std::shared_ptr>(new InternalLevel(new_idx, new_shard_cnt))); + + return new_idx; + } + + + /* + * Merge the specified level down into the tree. The level index must be + * non-negative (i.e., this function cannot be used to merge the buffer). This + * routine will recursively perform any necessary merges to make room for the + * specified level. 
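+     *
+     * For example, with a 1000-record buffer and a scale_factor of 2, level 0
+     * can hold 2000 records under LEVELING and level 1 can hold 4000; a merge
+     * into a full level cascades downward until a level with spare capacity
+     * is reached, growing the structure by one level if no such level exists.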
+ */ + inline void merge_down(level_index idx) { + level_index merge_base_level = find_mergable_level(idx); + if (merge_base_level == -1) { + merge_base_level = grow(); + } + + for (level_index i=merge_base_level; i>idx; i--) { + merge_levels(i, i-1); + enforce_delete_maximum(i); + } + + return; + } + + /* + * Find the first level below the level indicated by idx that + * is capable of sustaining a merge operation and return its + * level index. If no such level exists, returns -1. Also + * returns -1 if idx==0, and no such level exists, to simplify + * the logic of the first merge. + */ + inline level_index find_mergable_level(level_index idx, Buffer *buffer=nullptr) { + + if (idx == 0 && m_levels.size() == 0) return -1; + + bool level_found = false; + bool disk_level; + level_index merge_level_idx; + + size_t incoming_rec_cnt = get_level_record_count(idx, buffer); + for (level_index i=idx+1; i::merge_levels(m_levels[base_level].get(), m_levels[incoming_level].get()); + } else { + m_levels[base_level]->append_merged_shards(m_levels[incoming_level].get()); + } + + m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor)); + } + + + inline void merge_buffer_into_l0(Buffer *buffer) { + assert(m_levels[0]); + if constexpr (L == LayoutPolicy::LEVELING) { + // FIXME: Kludgey implementation due to interface constraints. + auto old_level = m_levels[0].get(); + auto temp_level = new InternalLevel(0, 1); + temp_level->append_buffer(buffer); + auto new_level = InternalLevel::merge_levels(old_level, temp_level); + + m_levels[0] = new_level; + delete temp_level; + } else { + m_levels[0]->append_buffer(buffer); + } + } + + /* + * Mark a given memory level as no-longer in use by the tree. For now this + * will just free the level. In future, this will be more complex as the + * level may not be able to immediately be deleted, depending upon who + * else is using it. + */ + inline void mark_as_unused(std::shared_ptr> level) { + level.reset(); + } + + /* + * Check the tombstone proportion for the specified level and + * if the limit is exceeded, forcibly merge levels until all + * levels below idx are below the limit. + */ + inline void enforce_delete_maximum(level_index idx) { + long double ts_prop = (long double) m_levels[idx]->get_tombstone_count() / (long double) calc_level_record_capacity(idx); + + if (ts_prop > (long double) m_max_delete_prop) { + merge_down(idx); + } + + return; + } + + /* + * Assume that level "0" should be larger than the buffer. The buffer + * itself is index -1, which should return simply the buffer capacity. + */ + inline size_t calc_level_record_capacity(level_index idx) { + return m_buffer_size * pow(m_scale_factor, idx+1); + } + + /* + * Returns the actual number of records present on a specified level. An + * index value of -1 indicates the memory table. Can optionally pass in + * a pointer to the memory table to use, if desired. Otherwise, there are + * no guarantees about which buffer will be accessed if level_index is -1. + */ + inline size_t get_level_record_count(level_index idx, Buffer *buffer=nullptr) { + if (buffer) { + return buffer->get_record_count(); + } + + return (m_levels[idx]) ? m_levels[idx]->get_record_count() : 0; + } + + /* + * Determines if the specific level can merge with another record containing + * incoming_rec_cnt number of records. 
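+     * Under LEVELING this checks whether the level has spare record capacity;
+     * under TEIRING it checks whether the level's shard count is still below
+     * the scale factor.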
The provided level index should be
+     * non-negative (i.e., not refer to the buffer) and will be automatically
+     * translated into the appropriate index into either the disk or memory level
+     * vector.
+     */
+    inline bool can_merge_with(level_index idx, size_t incoming_rec_cnt) {
+        if (idx >= m_levels.size() || !m_levels[idx]) {
+            return false;
+        }
+
+        if (L == LayoutPolicy::LEVELING) {
+            return m_levels[idx]->get_record_count() + incoming_rec_cnt <= calc_level_record_capacity(idx);
+        } else {
+            return m_levels[idx]->get_shard_count() < m_scale_factor;
+        }
+
+        // unreachable
+        assert(true);
+    }
+};
+
+}
diff --git a/include/framework/InternalLevel.h b/include/framework/InternalLevel.h
index ec8ffc4..983ec6a 100644
--- a/include/framework/InternalLevel.h
+++ b/include/framework/InternalLevel.h
@@ -19,6 +19,10 @@
 #include "framework/MutableBuffer.h"

 namespace de {
+template
+class InternalLevel;
+
+
 template
 class InternalLevel {

@@ -55,7 +59,7 @@ public:
     // WARNING: for leveling only.
     // assuming the base level is the level new level is merging into. (base_level is larger.)
-    static InternalLevel* merge_levels(InternalLevel* base_level, InternalLevel* new_level) {
+    static std::shared_ptr merge_levels(InternalLevel* base_level, InternalLevel* new_level) {
         assert(base_level->m_level_no > new_level->m_level_no || (base_level->m_level_no == 0 && new_level->m_level_no == 0));
         auto res = new InternalLevel(base_level->m_level_no, 1);
         res->m_shard_cnt = 1;
@@ -64,7 +68,7 @@ public:
         shards[1] = new_level->m_shards[0];

         res->m_shards[0] = new S(shards, 2);
-        return res;
+        return std::shared_ptr(res);
     }

     void append_buffer(Buffer* buffer) {
diff --git a/include/framework/MutableBuffer.h b/include/framework/MutableBuffer.h
index b79fc02..cadecb6 100644
--- a/include/framework/MutableBuffer.h
+++ b/include/framework/MutableBuffer.h
@@ -11,6 +11,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -33,16 +34,22 @@ public:
     MutableBuffer(size_t capacity, size_t max_tombstone_cap)
     : m_cap(capacity), m_tombstone_cap(max_tombstone_cap), m_reccnt(0)
     , m_tombstonecnt(0), m_weight(0), m_max_weight(0) {
-        auto len = capacity * sizeof(Wrapped);
-        size_t aligned_buffersize = len + (CACHELINE_SIZE - (len % CACHELINE_SIZE));
-        m_data = (Wrapped*) std::aligned_alloc(CACHELINE_SIZE, aligned_buffersize);
+        m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped));
+        m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped));
         m_tombstone_filter = nullptr;
         if (max_tombstone_cap > 0) {
             m_tombstone_filter = new psudb::BloomFilter(BF_FPR, max_tombstone_cap, BF_HASH_FUNCS);
         }
+
+        m_refcnt.store(0);
+        m_deferred_truncate.store(false);
+        m_merging.store(false);
     }

     ~MutableBuffer() {
+        assert(m_refcnt.load() == 0);
+        assert(m_merging.load() == false);
+
         if (m_data) free(m_data);
         if (m_tombstone_filter) delete m_tombstone_filter;
     }
@@ -157,6 +164,50 @@ public:
         return m_max_weight;
     }

+    bool start_merge() {
+        if (m_merge_lock.try_lock()) {
+            /* there cannot already be an active merge */
+            if (m_merging.load()) {
+                m_merge_lock.unlock();
+                return false;
+            }
+
+            m_merging.store(true);
+            memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load());
+            return true;
+        }
+
+        /* lock could not be obtained */
+        return false;
+    }
+
+    bool finish_merge() {
+        m_merge_lock.unlock();
+        return true;
+    }
+
+    /*
+     * Concurrency-related operations
+     */
+    bool take_reference() {
+        m_refcnt.fetch_add(1);
+        return true;
+    }
+
+    bool release_reference() {
+        m_refcnt.fetch_add(-1);
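+        /* if another thread takes a reference between the decrement above and
+         * the check below, the deferred truncate simply waits until a later
+         * release drops the count to zero */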
+ + if (m_refcnt.load() == 0 && m_deferred_truncate.load()) { + assert(this->truncate()); + } + + return true; + } + + bool active_merge() { + return m_merging.load(); + } + private: int32_t try_advance_tail() { size_t new_tail = m_reccnt.fetch_add(1); @@ -169,12 +220,22 @@ private: size_t m_tombstone_cap; Wrapped* m_data; + Wrapped* m_merge_data; + psudb::BloomFilter* m_tombstone_filter; alignas(64) std::atomic m_tombstonecnt; alignas(64) std::atomic m_reccnt; alignas(64) std::atomic m_weight; alignas(64) std::atomic m_max_weight; + alignas(64) std::atomic m_merging; + alignas(64) std::atomic m_deferred_truncate; + alignas(64) std::atomic m_refcnt; + + alignas(64) std::mutex m_merge_lock; + alignas(64) std::mutex m_trunc_lock; + alignas(64) std::condition_variable m_trunc_signal; + }; } diff --git a/include/framework/RecordInterface.h b/include/framework/RecordInterface.h index f78918c..1ef1984 100644 --- a/include/framework/RecordInterface.h +++ b/include/framework/RecordInterface.h @@ -207,7 +207,7 @@ struct EuclidPoint{ template struct RecordHash { size_t operator()(R const &rec) const { - return psudb::hash_bytes((char *) &rec, sizeof(R)); + return psudb::hash_bytes((std::byte *) &rec, sizeof(R)); } }; diff --git a/include/framework/Scheduler.h b/include/framework/Scheduler.h new file mode 100644 index 0000000..cd3f430 --- /dev/null +++ b/include/framework/Scheduler.h @@ -0,0 +1,76 @@ +/* + * include/framework/Scheduler.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include + +#include "util/types.h" +#include "framework/ShardInterface.h" +#include "framework/QueryInterface.h" +#include "framework/RecordInterface.h" +#include "framework/MutableBuffer.h" +#include "framework/Configuration.h" +#include "framework/ExtensionStructure.h" + +namespace de { + + +struct MergeTask { + level_index m_source_level; + level_index m_target_level; + size_t m_size; + size_t m_timestamp; + + bool operator<(MergeTask &other) { + return m_timestamp < other.m_timestamp; + } +}; + + +template +class Scheduler { + typedef ExtensionStructure Structure; +public: + /* + * Memory budget stated in bytes, with 0 meaning unlimited. Likewise, 0 threads means + * unlimited. + */ + Scheduler(size_t memory_budget, size_t thread_cnt) + : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) + , m_thread_cnt((thread_cnt) ? 
thread_cnt : UINT64_MAX) + , m_used_memory(0) + , m_used_threads(0) + {} + + bool schedule_merge(Structure *version, MutableBuffer *buffer) { + // FIXME: this is a non-concurrent implementation + return version->merge_buffer(buffer); + } + +private: + size_t get_timestamp() { + auto ts = m_timestamp.fetch_add(1); + return ts; + } + + size_t m_memory_budget; + size_t m_thread_cnt; + + alignas(64) std::atomic m_used_memory; + alignas(64) std::atomic m_used_threads; + alignas(64) std::atomic m_timestamp; + + std::priority_queue m_merge_queue; + std::mutex m_merge_queue_lock; +}; + +} -- cgit v1.2.3 From 7f56949bc847b56da69c9eb3ebe081d6cf9f61c6 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 18 Sep 2023 12:25:01 -0400 Subject: General bugfixes --- include/framework/DynamicExtension.h | 5 ++++- include/framework/MutableBuffer.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 5e9bcee..08e2243 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -40,7 +40,10 @@ public: : m_scale_factor(scale_factor) , m_max_delete_prop(max_delete_prop) , m_sched(memory_budget, thread_cnt) - { } + { + m_buffers.push_back(new Buffer(buffer_cap, max_delete_prop*buffer_cap)); + m_versions.push_back(new Structure(buffer_cap, scale_factor, max_delete_prop)); + } ~DynamicExtension() { for (size_t i=0; i -- cgit v1.2.3 From abc8605a51537fc7b35bb0d9b1da6c724c5c6973 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 18 Sep 2023 13:05:44 -0400 Subject: Moved individual merge task execution into the scheduler This change is made in anticipation of scheduling each task using a specific thread, and required some modification to the interface of ExtensionStructure. Namely, 1. ExtensionStructure now supports a get_merge_tasks() interface, which returns a list of the individual level merges that would need to be performed to complete a buffer flush of specified size. 2. merge_levels and merge_buffer have been promoted to the public interface, to allow their use within the scheduler. 3. merge_buffer has been modified to assume that the structure already can support a direct flush of the buffer into L0, it is now the responsibility of the caller to ensure that the necessary merges have already been completed prior to calling this method. Currently, preemptive tombstone compactions are non-functional, so some unit tests are failing. This will be fixed when the thread scheduling system is set up. --- include/framework/ExtensionStructure.h | 101 ++++++++++++++++++++++++++------- include/framework/InternalLevel.h | 4 ++ include/framework/Scheduler.h | 28 ++++----- 3 files changed, 98 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/framework/ExtensionStructure.h b/include/framework/ExtensionStructure.h index 1e756db..9a5f6b3 100644 --- a/include/framework/ExtensionStructure.h +++ b/include/framework/ExtensionStructure.h @@ -27,6 +27,17 @@ namespace de { +struct MergeTask { + level_index m_source_level; + level_index m_target_level; + size_t m_size; + size_t m_timestamp; + + bool operator<(MergeTask &other) { + return m_timestamp < other.m_timestamp; + } +}; + template class ExtensionStructure { typedef S Shard; @@ -90,9 +101,7 @@ public: * merges to make room for it. 
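 *
 * As of this change, the caller is responsible for completing any cascading
 * merges (see get_merge_tasks()) needed to make room before flushing; the
 * assert below enforces that precondition.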
*/ inline bool merge_buffer(Buffer *buffer) { - if (!can_merge_with(0, buffer->get_record_count())) { - merge_down(0); - } + assert(can_merge_with(0, buffer->get_record_count())); merge_buffer_into_l0(buffer); enforce_delete_maximum(0); @@ -192,6 +201,74 @@ public: return m_levels; } + /* + * + */ + std::vector get_merge_tasks(size_t buffer_reccnt) { + std::vector merges; + + /* + * The buffer -> L0 merge task is not included so if that + * can be done without any other change, just return an + * empty list. + */ + if (can_merge_with(0, buffer_reccnt)) { + return std::move(merges); + } + + level_index merge_base_level = find_mergable_level(0); + if (merge_base_level == -1) { + merge_base_level = grow(); + } + + for (level_index i=merge_base_level; i>0; i--) { + MergeTask task; + task.m_source_level = i - 1; + task.m_target_level = i; + + /* + * The amount of storage required for the merge accounts + * for the cost of storing the new records, along with the + * cost of retaining the old records during the process + * (hence the 2x multiplier). + * + * FIXME: currently does not account for the *actual* size + * of the shards, only the storage for the records + * themselves. + */ + size_t reccnt = m_levels[i-1]->get_record_count(); + if constexpr (L == LayoutPolicy::LEVELING) { + if (can_merge_with(i, reccnt)) { + reccnt += m_levels[i]->get_record_count(); + } + } + task.m_size = 2* reccnt * sizeof(R); + + merges.push_back(task); + } + + return std::move(merges); + } + + /* + * Merge the level specified by incoming level into the level specified + * by base level. The two levels should be sequential--i.e. no levels + * are skipped in the merge process--otherwise the tombstone ordering + * invariant may be violated by the merge operation. + */ + inline void merge_levels(level_index base_level, level_index incoming_level) { + // merging two memory levels + if constexpr (L == LayoutPolicy::LEVELING) { + auto tmp = m_levels[base_level]; + m_levels[base_level] = InternalLevel::merge_levels(m_levels[base_level].get(), m_levels[incoming_level].get()); + } else { + m_levels[base_level]->append_merged_shards(m_levels[incoming_level].get()); + } + + m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor)); + } + + private: size_t m_scale_factor; double m_max_delete_prop; @@ -265,24 +342,6 @@ private: return -1; } - /* - * Merge the level specified by incoming level into the level specified - * by base level. The two levels should be sequential--i.e. no levels - * are skipped in the merge process--otherwise the tombstone ordering - * invariant may be violated by the merge operation. - */ - inline void merge_levels(level_index base_level, level_index incoming_level) { - // merging two memory levels - if constexpr (L == LayoutPolicy::LEVELING) { - auto tmp = m_levels[base_level]; - m_levels[base_level] = InternalLevel::merge_levels(m_levels[base_level].get(), m_levels[incoming_level].get()); - } else { - m_levels[base_level]->append_merged_shards(m_levels[incoming_level].get()); - } - - m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 
1 : m_scale_factor));
-    }
-
     inline void merge_buffer_into_l0(Buffer *buffer) {
         assert(m_levels[0]);
diff --git a/include/framework/InternalLevel.h b/include/framework/InternalLevel.h
index 983ec6a..b9866b8 100644
--- a/include/framework/InternalLevel.h
+++ b/include/framework/InternalLevel.h
@@ -87,6 +87,10 @@ public:
     }

     Shard *get_merged_shard() {
+        if (m_shard_cnt == 0) {
+            return nullptr;
+        }
+
         Shard *shards[m_shard_cnt];
         for (size_t i=0; i
diff --git a/include/framework/Scheduler.h b/include/framework/Scheduler.h
index cd3f430..28ed8a9 100644
--- a/include/framework/Scheduler.h
+++ b/include/framework/Scheduler.h
@@ -22,19 +22,6 @@
 namespace de {

-struct MergeTask {
-    level_index m_source_level;
-    level_index m_target_level;
-    size_t m_size;
-    size_t m_timestamp;
-
-    bool operator<(MergeTask &other) {
-        return m_timestamp < other.m_timestamp;
-    }
-};
-
-
 template
 class Scheduler {
     typedef ExtensionStructure Structure;
@@ -52,7 +39,20 @@ public:
     {}

     bool schedule_merge(Structure *version, MutableBuffer *buffer) {
-        // FIXME: this is a non-concurrent implementation
+        /*
+         * Get list of individual level reconstructions that are necessary
+         * for completing the overall merge
+         */
+        std::vector merges = version->get_merge_tasks(buffer->get_record_count());
+
+        /*
+         * Schedule the merge tasks (FIXME: currently this just
+         * executes them sequentially in a blocking fashion)
+         */
+        for (ssize_t i=merges.size()-1; i>=0; i--) {
+            version->merge_levels(merges[i].m_target_level, merges[i].m_source_level);
+        }
+
         return version->merge_buffer(buffer);
     }
-- 
cgit v1.2.3

From 6e30f576ca9d11d1901f4877315e97f84d15b1e1 Mon Sep 17 00:00:00 2001
From: Douglas Rumbaugh 
Date: Mon, 18 Sep 2023 16:37:30 -0400
Subject: The scheduler now spawns a separate merge thread

Merges are now executed from a separate thread within the scheduler
that wakes up via condition variables when new merge tasks are
scheduled. In addition, tombstone limits are now enforced by the
scheduler, with new merges being scheduled as needed.

There are still a few tests failing; notably, the zero-tombstones-in-
the-last-run invariant does not hold under tiering with tombstones.
Need to look into that yet.
---
 include/framework/DynamicExtension.h   |   5 ++
 include/framework/ExtensionStructure.h | 109 +++++++++++++++-------------
 include/framework/InternalLevel.h      |  39 +++++++++-
 include/framework/Scheduler.h          | 126 +++++++++++++++++++++++++++++++--
 4 files changed, 221 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h
index 08e2243..6965965 100644
--- a/include/framework/DynamicExtension.h
+++ b/include/framework/DynamicExtension.h
@@ -301,5 +301,10 @@ private:
         return processed_records;
     }
 };
+
+template
+static void de_merge_callback(DynamicExtension extension, ExtensionStructure new_version) {
+
+}
 }
diff --git a/include/framework/ExtensionStructure.h b/include/framework/ExtensionStructure.h
index 9a5f6b3..2fb9cf0 100644
--- a/include/framework/ExtensionStructure.h
+++ b/include/framework/ExtensionStructure.h
@@ -33,8 +33,12 @@ struct MergeTask {
     size_t m_size;
     size_t m_timestamp;

-    bool operator<(MergeTask &other) {
-        return m_timestamp < other.m_timestamp;
+    friend bool operator<(const MergeTask &self, const MergeTask &other) {
+        return self.m_timestamp < other.m_timestamp;
+    }
+
+    friend bool operator>(const MergeTask &self, const MergeTask &other) {
+        return self.m_timestamp > other.m_timestamp;
     }
 };

@@ -66,7 +70,7 @@ public:
     ExtensionStructure *copy() {
         auto new_struct = new ExtensionStructure(m_scale_factor, m_max_delete_prop, m_buffer_size);
         for (size_t i=0; i
-            new_struct->m_levels.push_back(m_levels[i]);
+            new_struct->m_levels.push_back(m_levels[i]->clone());
         }

         return new_struct;
     }
@@ -104,9 +108,8 @@ public:
         assert(can_merge_with(0, buffer->get_record_count()));

         merge_buffer_into_l0(buffer);
-        enforce_delete_maximum(0);
-
         buffer->truncate();
+
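+        /* tombstone-limit enforcement has moved into the scheduler, which
+         * re-validates this level once the flush completes */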
return true;
     }

@@ -193,6 +196,11 @@ public:
         return true;
     }
+
+    bool validate_tombstone_proportion(level_index level) {
+        long double ts_prop = (long double) m_levels[level]->get_tombstone_count() / (long double) calc_level_record_capacity(level);
+        return ts_prop <= (long double) m_max_delete_prop;
+    }

     /*
      * Return a reference to the underlying vector of levels within the
      * structure.
@@ -250,6 +258,47 @@ public:
         return m_levels;
     }

+    /*
+     * Build the list of merge tasks required to merge the specified
+     * level down into the structure.
+     */
+    std::vector get_merge_tasks_from_level(size_t source_level) {
+        std::vector merges;
+
+        level_index merge_base_level = find_mergable_level(source_level);
+        if (merge_base_level == -1) {
+            merge_base_level = grow();
+        }
+
+        for (level_index i=merge_base_level; i>source_level; i--) {
+            MergeTask task;
+            task.m_source_level = i - 1;
+            task.m_target_level = i;
+
+            /*
+             * The amount of storage required for the merge accounts
+             * for the cost of storing the new records, along with the
+             * cost of retaining the old records during the process
+             * (hence the 2x multiplier).
+             *
+             * FIXME: currently does not account for the *actual* size
+             * of the shards, only the storage for the records
+             * themselves.
+             */
+            size_t reccnt = m_levels[i-1]->get_record_count();
+            if constexpr (L == LayoutPolicy::LEVELING) {
+                if (can_merge_with(i, reccnt)) {
+                    reccnt += m_levels[i]->get_record_count();
+                }
+            }
+            task.m_size = 2 * reccnt * sizeof(R);
+
+            merges.push_back(task);
+        }
+
+        return std::move(merges);
+    }
+
     /*
      * Merge the level specified by incoming level into the level specified
      * by base level. The two levels should be sequential--i.e. no levels
      * are skipped in the merge process--otherwise the tombstone ordering
      * invariant may be violated by the merge operation.
     inline void merge_levels(level_index base_level, level_index incoming_level) {
@@ -282,44 +331,18 @@ public:
     inline level_index grow() {
-        level_index new_idx;
-
+        level_index new_idx = m_levels.size();
         size_t new_shard_cnt = (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor;
-        new_idx = m_levels.size();
-        if (new_idx > 0) {
-            assert(m_levels[new_idx - 1]->get_shard(0)->get_tombstone_count() == 0);
-        }
-        m_levels.emplace_back(std::shared_ptr>(new InternalLevel(new_idx, new_shard_cnt)));
+        m_levels.emplace_back(std::shared_ptr>(new InternalLevel(new_idx, new_shard_cnt)));

         return new_idx;
     }

-    /*
-     * Merge the specified level down into the tree. The level index must be
-     * non-negative (i.e., this function cannot be used to merge the buffer). This
-     * routine will recursively perform any necessary merges to make room for the
-     * specified level.
-     */
-    inline void merge_down(level_index idx) {
-        level_index merge_base_level = find_mergable_level(idx);
-        if (merge_base_level == -1) {
-            merge_base_level = grow();
-        }
-
-        for (level_index i=merge_base_level; i>idx; i--) {
-            merge_levels(i, i-1);
-            enforce_delete_maximum(i);
-        }
-
-        return;
-    }
-
     /*
      * Find the first level below the level indicated by idx that
      * is capable of sustaining a merge operation and return its
      * level index. If no such level exists, returns -1. Also
      * returns -1 if idx==0, and no such level exists, to simplify
      * the logic of the first merge.
      */
     inline level_index find_mergable_level(level_index idx, Buffer *buffer=nullptr) {
@@ -342,7 +365,6 @@ private:
         return -1;
     }
-
     inline void merge_buffer_into_l0(Buffer *buffer) {
         assert(m_levels[0]);
@@ -369,21 +391,6 @@ private:
         level.reset();
     }

-    /*
-     * Check the tombstone proportion for the specified level and
-     * if the limit is exceeded, forcibly merge levels until all
-     * levels below idx are below the limit.
- */ - inline void enforce_delete_maximum(level_index idx) { - long double ts_prop = (long double) m_levels[idx]->get_tombstone_count() / (long double) calc_level_record_capacity(idx); - - if (ts_prop > (long double) m_max_delete_prop) { - merge_down(idx); - } - - return; - } - /* * Assume that level "0" should be larger than the buffer. The buffer * itself is index -1, which should return simply the buffer capacity. @@ -424,7 +431,7 @@ private: return m_levels[idx]->get_shard_count() < m_scale_factor; } - // unreachable + /* unreachable */ assert(true); } }; diff --git a/include/framework/InternalLevel.h b/include/framework/InternalLevel.h index b9866b8..e67ae45 100644 --- a/include/framework/InternalLevel.h +++ b/include/framework/InternalLevel.h @@ -34,6 +34,7 @@ public: , m_shard_cnt(0) , m_shards(shard_cap, nullptr) , m_owns(shard_cap, true) + , m_pending_shard(nullptr) {} // Create a new memory level sharing the shards and repurposing it as previous level_no + 1 @@ -42,7 +43,9 @@ public: : m_level_no(level->m_level_no + 1) , m_shard_cnt(level->m_shard_cnt) , m_shards(level->m_shards.size(), nullptr) - , m_owns(level->m_owns.size(), true) { + , m_owns(level->m_owns.size(), true) + , m_pending_shard(nullptr) + { assert(m_shard_cnt == 1 && m_shards.size() == 1); for (size_t i=0; im_shards.data(), level->m_shard_cnt); + return; + } + m_shards[m_shard_cnt] = new S(level->m_shards.data(), level->m_shard_cnt); m_owns[m_shard_cnt] = true; ++m_shard_cnt; } + + void finalize() { + if (m_pending_shard) { + for (size_t i=0; i m_shards; + + Shard *m_pending_shard; + std::vector m_owns; InternalLevel *clone() { diff --git a/include/framework/Scheduler.h b/include/framework/Scheduler.h index 28ed8a9..534ce25 100644 --- a/include/framework/Scheduler.h +++ b/include/framework/Scheduler.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "util/types.h" #include "framework/ShardInterface.h" @@ -26,6 +28,7 @@ namespace de { template class Scheduler { typedef ExtensionStructure Structure; + typedef MutableBuffer Buffer; public: /* * Memory budget stated in bytes, with 0 meaning unlimited. Likewise, 0 threads means @@ -36,9 +39,25 @@ public: , m_thread_cnt((thread_cnt) ? 
thread_cnt : UINT64_MAX) , m_used_memory(0) , m_used_threads(0) - {} + , m_shutdown(false) + { + m_sched_thrd = std::thread(&Scheduler::run_scheduler, this); + } + + ~Scheduler() { + m_shutdown = true; + + m_cv.notify_all(); + m_sched_thrd.join(); + } bool schedule_merge(Structure *version, MutableBuffer *buffer) { + /* + * temporary hack + */ + pending_version = version; + pending_buffer = buffer; + /* * Get list of individual level reconstructions that are necessary * for completing the overall merge @@ -50,10 +69,30 @@ public: * executes them sequentially in a blocking fashion) */ for (ssize_t i=merges.size()-1; i>=0; i--) { - version->merge_levels(merges[i].m_target_level, merges[i].m_source_level); + merges[i].m_timestamp = m_timestamp.fetch_add(1); + m_merge_queue_lock.lock(); + m_merge_queue.push(merges[i]); + m_merge_queue_lock.unlock(); } - return version->merge_buffer(buffer); + MergeTask buffer_merge; + buffer_merge.m_source_level = -1; + buffer_merge.m_target_level = 0; + buffer_merge.m_size = buffer->get_record_count() * sizeof(R) * 2; + buffer_merge.m_timestamp = m_timestamp.fetch_add(1); + m_merge_queue_lock.lock(); + m_merge_queue.push(buffer_merge); + m_merge_queue_lock.unlock(); + + m_cv.notify_all(); + do { + std::unique_lock merge_cv_lock(m_merge_cv_lock); + m_merge_cv.wait(merge_cv_lock); + } while (m_merge_queue.size() > 0); + + assert(version->get_levels()[version->get_levels().size() - 1]->get_shard(0)->get_tombstone_count() == 0); + + return true; } private: @@ -62,15 +101,94 @@ private: return ts; } + void schedule_next_task() { + m_merge_queue_lock.lock(); + auto task = m_merge_queue.top(); + m_merge_queue.pop(); + m_merge_queue_lock.unlock(); + + if (task.m_source_level == -1 && task.m_target_level == 0) { + run_buffer_merge(pending_buffer, pending_version); + } else { + run_merge(task, pending_version); + } + + if (m_merge_queue.size() == 0) { + m_merge_cv.notify_all(); + } + } + + void run_merge(MergeTask task, Structure *version) { + version->merge_levels(task.m_target_level, task.m_source_level); + if (!version->validate_tombstone_proportion(task.m_target_level)) { + auto tasks = version->get_merge_tasks(task.m_target_level); + /* + * Schedule the merge tasks (FIXME: currently this just + * executes them sequentially in a blocking fashion) + */ + for (ssize_t i=tasks.size()-1; i>=0; i--) { + tasks[i].m_timestamp = m_timestamp.fetch_add(1); + m_merge_queue_lock.lock(); + m_merge_queue.push(tasks[i]); + m_merge_queue_lock.unlock(); + } + } + } + + + void run_buffer_merge(Buffer *buffer, Structure *version) { + version->merge_buffer(buffer); + if (!version->validate_tombstone_proportion(0)) { + auto tasks = version->get_merge_tasks_from_level(0); + + /* + * Schedule the merge tasks (FIXME: currently this just + * executes them sequentially in a blocking fashion) + */ + for (ssize_t i=tasks.size()-1; i>=0; i--) { + tasks[i].m_timestamp = m_timestamp.fetch_add(1); + m_merge_queue_lock.lock(); + m_merge_queue.push(tasks[i]); + m_merge_queue_lock.unlock(); + } + } + } + + void run_scheduler() { + do { + std::unique_lock cv_lock(m_cv_lock); + m_cv.wait(cv_lock); + + while (m_merge_queue.size() > 0 && m_used_threads < m_thread_cnt) { + schedule_next_task(); + } + cv_lock.unlock(); + } while(!m_shutdown); + } + size_t m_memory_budget; size_t m_thread_cnt; + Buffer *pending_buffer; + Structure *pending_version; + alignas(64) std::atomic m_used_memory; alignas(64) std::atomic m_used_threads; alignas(64) std::atomic m_timestamp; - std::priority_queue m_merge_queue; + 
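+    /* std::greater orders the queue as a min-heap on task timestamps, so
+     * the oldest scheduled merge task is always dispatched first */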
std::priority_queue, std::greater> m_merge_queue; std::mutex m_merge_queue_lock; + + std::mutex m_cv_lock; + std::condition_variable m_cv; + + std::mutex m_merge_cv_lock; + std::condition_variable m_merge_cv; + + std::thread m_sched_thrd; + + bool m_shutdown; + }; } -- cgit v1.2.3 From 754372aeccb74815cbb16f32ceacb04b4c5aaba9 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 20 Sep 2023 14:03:23 -0400 Subject: Bugfixes for tiering Fixed a few issues that manifested during the tiering tests, 1) When a version is copied, it now contains copies of the levels, not just pointers (the levels themselves still hold pointers to the shards, though). 2) Ensure that tasks are scheduled with the correct timestamp, they were originally being scheduled backwards. The get_merge_tasks() method already returns them in the correct order, so reversing them again put it in the wrong order. --- include/framework/ExtensionStructure.h | 1 + include/framework/InternalLevel.h | 8 ++++++-- include/framework/Scheduler.h | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/framework/ExtensionStructure.h b/include/framework/ExtensionStructure.h index 2fb9cf0..892e63b 100644 --- a/include/framework/ExtensionStructure.h +++ b/include/framework/ExtensionStructure.h @@ -312,6 +312,7 @@ public: m_levels[base_level] = InternalLevel::merge_levels(m_levels[base_level].get(), m_levels[incoming_level].get()); } else { m_levels[base_level]->append_merged_shards(m_levels[incoming_level].get()); + m_levels[base_level]->finalize(); } m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor)); diff --git a/include/framework/InternalLevel.h b/include/framework/InternalLevel.h index e67ae45..6cdac4e 100644 --- a/include/framework/InternalLevel.h +++ b/include/framework/InternalLevel.h @@ -106,6 +106,7 @@ public: for (size_t i=0; i m_owns; - InternalLevel *clone() { - auto new_level = new InternalLevel(m_level_no, m_shards.size()); + std::shared_ptr clone() { + auto new_level = std::make_shared(m_level_no, m_shards.size()); for (size_t i=0; im_shards[i] = m_shards[i]; new_level->m_owns[i] = true; m_owns[i] = false; } + + return new_level; } }; diff --git a/include/framework/Scheduler.h b/include/framework/Scheduler.h index 534ce25..6055bef 100644 --- a/include/framework/Scheduler.h +++ b/include/framework/Scheduler.h @@ -68,7 +68,7 @@ public: * Schedule the merge tasks (FIXME: currently this just * executes them sequentially in a blocking fashion) */ - for (ssize_t i=merges.size()-1; i>=0; i--) { + for (ssize_t i=0; imerge_levels(task.m_target_level, task.m_source_level); + if (!version->validate_tombstone_proportion(task.m_target_level)) { auto tasks = version->get_merge_tasks(task.m_target_level); /* @@ -188,7 +189,6 @@ private: std::thread m_sched_thrd; bool m_shutdown; - }; } -- cgit v1.2.3 From 7c03d771475421c1d5a2bbc135242536af1a371c Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 25 Sep 2023 10:49:36 -0400 Subject: Re-structuring Project + scheduling updates This is a big one--probably should have split it apart, but I'm feeling lazy this morning. * Organized the mess of header files in include/framework by splitting them out into their own subdirectories, and renaming a few files to remove redundancies introduced by the directory structure. * Introduced a new framework/ShardRequirements.h header file for simpler shard development. 
This header simply contains
  the necessary includes from framework/* for creating shard files. This
  should help to remove structural dependencies from the framework file
  structure and shards, as well as centralizing the necessary framework
  files to make shard development easier.

* Created a (currently dummy) SchedulerInterface, and made the scheduler
  implementation a template parameter of the dynamic extension for easier
  testing of various scheduling policies. There's still more work to be
  done to fully integrate the scheduler (queries, multiple buffers), but
  some more of the necessary framework code for this has been added as
  well.

* Adjusted the Task interface setup for the scheduler. The task
  structures have been removed from ExtensionStructure and placed in
  their own header file. Additionally, I started experimenting with
  using std::variant, as opposed to inheritance, to implement subtype
  polymorphism on the Merge and Query tasks. The scheduler now has a
  general task queue that contains both, and std::variant, std::visit,
  and std::get are used to manipulate them without virtual functions.

* Removed Alex.h, as it can't build anyway. There's a branch out there
  containing the Alex implementation stripped of the C++20 stuff. So
  there's no need to keep it here.
---
 include/framework/Configuration.h                |  54 ---
 include/framework/DynamicExtension.h             |  24 +-
 include/framework/ExtensionStructure.h           | 441 -----------------------
 include/framework/InternalLevel.h                | 258 -------------
 include/framework/MutableBuffer.h                | 242 -------------
 include/framework/QueryInterface.h               |  35 --
 include/framework/RecordInterface.h              | 214 -----------
 include/framework/Scheduler.h                    | 194 ----------
 include/framework/ShardInterface.h               |  26 --
 include/framework/ShardRequirements.h            |   9 +
 include/framework/interface/Query.h              |  35 ++
 include/framework/interface/Record.h             | 214 +++++++++++
 include/framework/interface/Scheduler.h          |  31 ++
 include/framework/interface/Shard.h              |  26 ++
 include/framework/scheduling/Scheduler.h         | 195 ++++++++++
 include/framework/scheduling/SerialScheduler.h   | 227 ++++++++++++
 include/framework/scheduling/Task.h              |  63 ++++
 include/framework/structure/ExtensionStructure.h | 428 ++++++++++++++++++++++
 include/framework/structure/InternalLevel.h      | 258 +++++++++++++
 include/framework/structure/MutableBuffer.h      | 242 +++++++++++++
 include/framework/util/Configuration.h           |  54 +++
 include/shard/Alex.h                             | 360 ------------------
 include/shard/MemISAM.h                          |   3 +-
 include/shard/PGM.h                              |   6 +-
 include/shard/TrieSpline.h                       |   5 +-
 include/shard/VPTree.h                           |   6 +-
 include/shard/WIRS.h                             |   6 +-
 include/shard/WSS.h                              |   6 +-
 include/util/Cursor.h                            |   2 +-
 29 files changed, 1807 insertions(+), 1857 deletions(-)
 delete mode 100644 include/framework/Configuration.h
 delete mode 100644 include/framework/ExtensionStructure.h
 delete mode 100644 include/framework/InternalLevel.h
 delete mode 100644 include/framework/MutableBuffer.h
 delete mode 100644 include/framework/QueryInterface.h
 delete mode 100644 include/framework/RecordInterface.h
 delete mode 100644 include/framework/Scheduler.h
 delete mode 100644 include/framework/ShardInterface.h
 create mode 100644 include/framework/ShardRequirements.h
 create mode 100644 include/framework/interface/Query.h
 create mode 100644 include/framework/interface/Record.h
 create mode 100644 include/framework/interface/Scheduler.h
 create mode 100644 include/framework/interface/Shard.h
 create mode 100644 include/framework/scheduling/Scheduler.h
 create mode 100644 include/framework/scheduling/SerialScheduler.h
 create mode
100644 include/framework/scheduling/Task.h create mode 100644 include/framework/structure/ExtensionStructure.h create mode 100644 include/framework/structure/InternalLevel.h create mode 100644 include/framework/structure/MutableBuffer.h create mode 100644 include/framework/util/Configuration.h delete mode 100644 include/shard/Alex.h (limited to 'include') diff --git a/include/framework/Configuration.h b/include/framework/Configuration.h deleted file mode 100644 index eb9b93f..0000000 --- a/include/framework/Configuration.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * include/framework/DynamicExtension.h - * - * Copyright (C) 2023 Douglas Rumbaugh - * Dong Xie - * - * All rights reserved. Published under the Modified BSD License. - * - */ -#pragma once - -#include -#include -#include -#include - -#include "psu-util/timer.h" -#include "psu-ds/Alias.h" - -namespace de { - -thread_local size_t sampling_attempts = 0; -thread_local size_t sampling_rejections = 0; -thread_local size_t deletion_rejections = 0; -thread_local size_t bounds_rejections = 0; -thread_local size_t tombstone_rejections = 0; -thread_local size_t buffer_rejections = 0; - -/* - * thread_local size_t various_sampling_times go here. - */ -thread_local size_t sample_range_time = 0; -thread_local size_t alias_time = 0; -thread_local size_t alias_query_time = 0; -thread_local size_t rejection_check_time = 0; -thread_local size_t buffer_sample_time = 0; -thread_local size_t memlevel_sample_time = 0; -thread_local size_t disklevel_sample_time = 0; -thread_local size_t sampling_bailouts = 0; - - -enum class LayoutPolicy { - LEVELING, - TEIRING -}; - -enum class DeletePolicy { - TOMBSTONE, - TAGGING -}; - -typedef ssize_t level_index; - -} diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 6965965..3a460aa 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -14,22 +14,24 @@ #include #include -#include "framework/MutableBuffer.h" -#include "framework/InternalLevel.h" -#include "framework/ShardInterface.h" -#include "framework/QueryInterface.h" -#include "framework/RecordInterface.h" -#include "framework/ExtensionStructure.h" - -#include "framework/Configuration.h" -#include "framework/Scheduler.h" +#include "framework/structure/MutableBuffer.h" +#include "framework/structure/InternalLevel.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" +#include "framework/interface/Record.h" +#include "framework/interface/Query.h" +#include "framework/interface/Scheduler.h" +#include "framework/structure/ExtensionStructure.h" + +#include "framework/util/Configuration.h" +#include "framework/scheduling/SerialScheduler.h" #include "psu-util/timer.h" #include "psu-ds/Alias.h" namespace de { -template +template > class DynamicExtension { typedef S Shard; typedef MutableBuffer Buffer; @@ -219,7 +221,7 @@ public: } private: - Scheduler m_sched; + SCHED m_sched; std::vector m_buffers; std::vector m_versions; diff --git a/include/framework/ExtensionStructure.h b/include/framework/ExtensionStructure.h deleted file mode 100644 index 892e63b..0000000 --- a/include/framework/ExtensionStructure.h +++ /dev/null @@ -1,441 +0,0 @@ -/* - * include/framework/ExtensionStructure.h - * - * Copyright (C) 2023 Douglas Rumbaugh - * Dong Xie - * - * All rights reserved. Published under the Modified BSD License. 
- * - */ -#pragma once - -#include -#include -#include -#include - -#include "framework/MutableBuffer.h" -#include "framework/InternalLevel.h" -#include "framework/ShardInterface.h" -#include "framework/QueryInterface.h" -#include "framework/RecordInterface.h" - -#include "framework/Configuration.h" - -#include "psu-util/timer.h" -#include "psu-ds/Alias.h" - -namespace de { - -struct MergeTask { - level_index m_source_level; - level_index m_target_level; - size_t m_size; - size_t m_timestamp; - - friend bool operator<(const MergeTask &self, const MergeTask &other) { - return self.m_timestamp < other.m_timestamp; - } - - friend bool operator>(const MergeTask &self, const MergeTask &other) { - return self.m_timestamp > other.m_timestamp; - } -}; - -template -class ExtensionStructure { - typedef S Shard; - typedef MutableBuffer Buffer; - -public: - ExtensionStructure(size_t buffer_size, size_t scale_factor, double max_delete_prop) - : m_scale_factor(scale_factor) - , m_max_delete_prop(max_delete_prop) - , m_buffer_size(buffer_size) - {} - - ~ExtensionStructure() = default; - - /* - * Create a shallow copy of this extension structure. The copy will share references to the - * same levels/shards as the original, but will have its own lists. As all of the shards are - * immutable (with the exception of deletes), the copy can be restructured with merges, etc., - * without affecting the original. - * - * NOTE: When using tagged deletes, a delete of a record in the original structure will affect - * the copy, so long as the copy retains a reference to the same shard as the original. This could - * cause synchronization problems under tagging with concurrency. Any deletes in this context will - * need to be forwarded to the appropriate structures manually. - */ - ExtensionStructure *copy() { - auto new_struct = new ExtensionStructure(m_scale_factor, m_max_delete_prop, m_buffer_size); - for (size_t i=0; im_levels.push_back(m_levels[i]->clone()); - } - - return new_struct; - } - - /* - * Search for a record matching the argument and mark it deleted by - * setting the delete bit in its wrapped header. Returns 1 if a matching - * record was found and deleted, and 0 if a matching record was not found. - * - * This function will stop after finding the first matching record. It is assumed - * that no duplicate records exist. In the case of duplicates, this function will - * still "work", but in the sense of "delete first match". - */ - int tagged_delete(const R &rec) { - for (auto level : m_levels) { - if (level && level->delete_record(rec)) { - return 1; - } - } - - /* - * If the record to be erased wasn't found, return 0. The - * DynamicExtension itself will then search the active - * Buffers. - */ - return 0; - } - - /* - * Merge the memory table down into the tree, completing any required other - * merges to make room for it. - */ - inline bool merge_buffer(Buffer *buffer) { - assert(can_merge_with(0, buffer->get_record_count())); - - merge_buffer_into_l0(buffer); - buffer->truncate(); - - return true; - } - - /* - * Return the total number of records (including tombstones) within all - * of the levels of the structure. - */ - size_t get_record_count() { - size_t cnt = 0; - - for (size_t i=0; iget_record_count(); - } - - return cnt; - } - - /* - * Return the total number of tombstones contained within all of the - * levels of the structure. 
- */ - size_t get_tombstone_cnt() { - size_t cnt = 0; - - for (size_t i=0; iget_tombstone_count(); - } - - return cnt; - } - - /* - * Return the number of levels within the structure. Note that not - * all of these levels are necessarily populated. - */ - size_t get_height() { - return m_levels.size(); - } - - /* - * Return the amount of memory (in bytes) used by the shards within the - * structure for storing the primary data structure and raw data. - */ - size_t get_memory_usage() { - size_t cnt = 0; - for (size_t i=0; iget_memory_usage(); - } - - return cnt; - } - - /* - * Return the amount of memory (in bytes) used by the shards within the - * structure for storing auxiliary data structures. This total does not - * include memory used for the main data structure, or raw data. - */ - size_t get_aux_memory_usage() { - size_t cnt = 0; - for (size_t i=0; iget_aux_memory_usage(); - } - } - - return cnt; - } - - /* - * Validate that no level in the structure exceeds its maximum tombstone capacity. This is - * used to trigger preemptive compactions at the end of the merge process. - */ - bool validate_tombstone_proportion() { - long double ts_prop; - for (size_t i=0; iget_tombstone_count() / (long double) calc_level_record_capacity(i); - if (ts_prop > (long double) m_max_delete_prop) { - return false; - } - } - } - - return true; - } - - bool validate_tombstone_proportion(level_index level) { - long double ts_prop = (long double) m_levels[level]->get_tombstone_count() / (long double) calc_level_record_capacity(level); - return ts_prop <= (long double) m_max_delete_prop; - } - - /* - * Return a reference to the underlying vector of levels within the - * structure. - */ - std::vector>> &get_levels() { - return m_levels; - } - - /* - * - */ - std::vector get_merge_tasks(size_t buffer_reccnt) { - std::vector merges; - - /* - * The buffer -> L0 merge task is not included so if that - * can be done without any other change, just return an - * empty list. - */ - if (can_merge_with(0, buffer_reccnt)) { - return std::move(merges); - } - - level_index merge_base_level = find_mergable_level(0); - if (merge_base_level == -1) { - merge_base_level = grow(); - } - - for (level_index i=merge_base_level; i>0; i--) { - MergeTask task; - task.m_source_level = i - 1; - task.m_target_level = i; - - /* - * The amount of storage required for the merge accounts - * for the cost of storing the new records, along with the - * cost of retaining the old records during the process - * (hence the 2x multiplier). - * - * FIXME: currently does not account for the *actual* size - * of the shards, only the storage for the records - * themselves. - */ - size_t reccnt = m_levels[i-1]->get_record_count(); - if constexpr (L == LayoutPolicy::LEVELING) { - if (can_merge_with(i, reccnt)) { - reccnt += m_levels[i]->get_record_count(); - } - } - task.m_size = 2* reccnt * sizeof(R); - - merges.push_back(task); - } - - return std::move(merges); - } - - - /* - * - */ - std::vector get_merge_tasks_from_level(size_t source_level) { - std::vector merges; - - level_index merge_base_level = find_mergable_level(source_level); - if (merge_base_level == -1) { - merge_base_level = grow(); - } - - for (level_index i=merge_base_level; i>source_level; i--) { - MergeTask task; - task.m_source_level = i - 1; - task.m_target_level = i; - - /* - * The amount of storage required for the merge accounts - * for the cost of storing the new records, along with the - * cost of retaining the old records during the process - * (hence the 2x multiplier). 
- * - * FIXME: currently does not account for the *actual* size - * of the shards, only the storage for the records - * themselves. - */ - size_t reccnt = m_levels[i-1]->get_record_count(); - if constexpr (L == LayoutPolicy::LEVELING) { - if (can_merge_with(i, reccnt)) { - reccnt += m_levels[i]->get_record_count(); - } - } - task.m_size = 2* reccnt * sizeof(R); - - merges.push_back(task); - } - - return std::move(merges); - } - - /* - * Merge the level specified by incoming level into the level specified - * by base level. The two levels should be sequential--i.e. no levels - * are skipped in the merge process--otherwise the tombstone ordering - * invariant may be violated by the merge operation. - */ - inline void merge_levels(level_index base_level, level_index incoming_level) { - // merging two memory levels - if constexpr (L == LayoutPolicy::LEVELING) { - auto tmp = m_levels[base_level]; - m_levels[base_level] = InternalLevel::merge_levels(m_levels[base_level].get(), m_levels[incoming_level].get()); - } else { - m_levels[base_level]->append_merged_shards(m_levels[incoming_level].get()); - m_levels[base_level]->finalize(); - } - - m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor)); - } - - -private: - size_t m_scale_factor; - double m_max_delete_prop; - size_t m_buffer_size; - - std::vector>> m_levels; - - /* - * Add a new level to the LSM Tree and return that level's index. Will - * automatically determine whether the level should be on memory or on disk, - * and act appropriately. - */ - inline level_index grow() { - level_index new_idx = m_levels.size(); - size_t new_shard_cnt = (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor; - - m_levels.emplace_back(std::shared_ptr>(new InternalLevel(new_idx, new_shard_cnt))); - return new_idx; - } - - /* - * Find the first level below the level indicated by idx that - * is capable of sustaining a merge operation and return its - * level index. If no such level exists, returns -1. Also - * returns -1 if idx==0, and no such level exists, to skimplify - * the logic of the first merge. - */ - inline level_index find_mergable_level(level_index idx, Buffer *buffer=nullptr) { - - if (idx == 0 && m_levels.size() == 0) return -1; - - bool level_found = false; - bool disk_level; - level_index merge_level_idx; - - size_t incoming_rec_cnt = get_level_record_count(idx, buffer); - for (level_index i=idx+1; i(0, 1); - temp_level->append_buffer(buffer); - auto new_level = InternalLevel::merge_levels(old_level, temp_level); - - m_levels[0] = new_level; - delete temp_level; - } else { - m_levels[0]->append_buffer(buffer); - } - } - - /* - * Mark a given memory level as no-longer in use by the tree. For now this - * will just free the level. In future, this will be more complex as the - * level may not be able to immediately be deleted, depending upon who - * else is using it. - */ - inline void mark_as_unused(std::shared_ptr> level) { - level.reset(); - } - - /* - * Assume that level "0" should be larger than the buffer. The buffer - * itself is index -1, which should return simply the buffer capacity. - */ - inline size_t calc_level_record_capacity(level_index idx) { - return m_buffer_size * pow(m_scale_factor, idx+1); - } - - /* - * Returns the actual number of records present on a specified level. An - * index value of -1 indicates the memory table. Can optionally pass in - * a pointer to the memory table to use, if desired. 
Otherwise, there are - * no guarantees about which buffer will be accessed if level_index is -1. - */ - inline size_t get_level_record_count(level_index idx, Buffer *buffer=nullptr) { - if (buffer) { - return buffer->get_record_count(); - } - - return (m_levels[idx]) ? m_levels[idx]->get_record_count() : 0; - } - - /* - * Determines if the specific level can merge with another record containing - * incoming_rec_cnt number of records. The provided level index should be - * non-negative (i.e., not refer to the buffer) and will be automatically - * translated into the appropriate index into either the disk or memory level - * vector. - */ - inline bool can_merge_with(level_index idx, size_t incoming_rec_cnt) { - if (idx>= m_levels.size() || !m_levels[idx]) { - return false; - } - - if (L == LayoutPolicy::LEVELING) { - return m_levels[idx]->get_record_count() + incoming_rec_cnt <= calc_level_record_capacity(idx); - } else { - return m_levels[idx]->get_shard_count() < m_scale_factor; - } - - /* unreachable */ - assert(true); - } -}; - -} - diff --git a/include/framework/InternalLevel.h b/include/framework/InternalLevel.h deleted file mode 100644 index 6cdac4e..0000000 --- a/include/framework/InternalLevel.h +++ /dev/null @@ -1,258 +0,0 @@ -/* - * include/framework/InternalLevel.h - * - * Copyright (C) 2023 Douglas Rumbaugh - * Dong Xie - * - * All rights reserved. Published under the Modified BSD License. - * - */ -#pragma once - -#include -#include - -#include "util/types.h" -#include "framework/ShardInterface.h" -#include "framework/QueryInterface.h" -#include "framework/RecordInterface.h" -#include "framework/MutableBuffer.h" - -namespace de { -template -class InternalLevel; - - - -template -class InternalLevel { - typedef S Shard; - typedef MutableBuffer Buffer; -public: - InternalLevel(ssize_t level_no, size_t shard_cap) - : m_level_no(level_no) - , m_shard_cnt(0) - , m_shards(shard_cap, nullptr) - , m_owns(shard_cap, true) - , m_pending_shard(nullptr) - {} - - // Create a new memory level sharing the shards and repurposing it as previous level_no + 1 - // WARNING: for leveling only. 
- InternalLevel(InternalLevel* level) - : m_level_no(level->m_level_no + 1) - , m_shard_cnt(level->m_shard_cnt) - , m_shards(level->m_shards.size(), nullptr) - , m_owns(level->m_owns.size(), true) - , m_pending_shard(nullptr) - { - assert(m_shard_cnt == 1 && m_shards.size() == 1); - - for (size_t i=0; im_owns[i] = false; - m_shards[i] = level->m_shards[i]; - } - } - - ~InternalLevel() { - for (size_t i=0; i merge_levels(InternalLevel* base_level, InternalLevel* new_level) { - assert(base_level->m_level_no > new_level->m_level_no || (base_level->m_level_no == 0 && new_level->m_level_no == 0)); - auto res = new InternalLevel(base_level->m_level_no, 1); - res->m_shard_cnt = 1; - Shard* shards[2]; - shards[0] = base_level->m_shards[0]; - shards[1] = new_level->m_shards[0]; - - res->m_shards[0] = new S(shards, 2); - return std::shared_ptr(res); - } - - void append_buffer(Buffer* buffer) { - if (m_shard_cnt == m_shards.size()) { - assert(m_pending_shard == nullptr); - m_pending_shard = new S(buffer); - return; - } - - m_shards[m_shard_cnt] = new S(buffer); - m_owns[m_shard_cnt] = true; - ++m_shard_cnt; - } - - void append_merged_shards(InternalLevel* level) { - if (m_shard_cnt == m_shards.size()) { - m_pending_shard = new S(level->m_shards.data(), level->m_shard_cnt); - return; - } - - m_shards[m_shard_cnt] = new S(level->m_shards.data(), level->m_shard_cnt); - m_owns[m_shard_cnt] = true; - - ++m_shard_cnt; - } - - - void finalize() { - if (m_pending_shard) { - for (size_t i=0; i> &shards, std::vector& shard_states, void *query_parms) { - for (size_t i=0; i= (ssize_t) shard_stop; i--) { - if (m_shards[i]) { - auto res = m_shards[i]->point_lookup(rec, true); - if (res && res->is_tombstone()) { - return true; - } - } - } - return false; - } - - bool delete_record(const R &rec) { - if (m_shard_cnt == 0) return false; - - for (size_t i = 0; i < m_shards.size(); ++i) { - if (m_shards[i]) { - auto res = m_shards[i]->point_lookup(rec); - if (res) { - res->set_delete(); - return true; - } - } - } - - return false; - } - - Shard* get_shard(size_t idx) { - return m_shards[idx]; - } - - size_t get_shard_count() { - return m_shard_cnt; - } - - size_t get_record_count() { - size_t cnt = 0; - for (size_t i=0; iget_record_count(); - } - - return cnt; - } - - size_t get_tombstone_count() { - size_t res = 0; - for (size_t i = 0; i < m_shard_cnt; ++i) { - res += m_shards[i]->get_tombstone_count(); - } - return res; - } - - size_t get_aux_memory_usage() { - size_t cnt = 0; - for (size_t i=0; iget_aux_memory_usage(); - } - - return cnt; - } - - size_t get_memory_usage() { - size_t cnt = 0; - for (size_t i=0; iget_memory_usage(); - } - } - - return cnt; - } - - double get_tombstone_prop() { - size_t tscnt = 0; - size_t reccnt = 0; - for (size_t i=0; iget_tombstone_count(); - reccnt += (*m_shards[i])->get_record_count(); - } - } - - return (double) tscnt / (double) (tscnt + reccnt); - } - -private: - ssize_t m_level_no; - - size_t m_shard_cnt; - size_t m_shard_size_cap; - - std::vector m_shards; - - Shard *m_pending_shard; - - std::vector m_owns; - - std::shared_ptr clone() { - auto new_level = std::make_shared(m_level_no, m_shards.size()); - for (size_t i=0; im_shards[i] = m_shards[i]; - new_level->m_owns[i] = true; - m_owns[i] = false; - } - - return new_level; - } -}; - -} diff --git a/include/framework/MutableBuffer.h b/include/framework/MutableBuffer.h deleted file mode 100644 index 572b656..0000000 --- a/include/framework/MutableBuffer.h +++ /dev/null @@ -1,242 +0,0 @@ -/* - * include/framework/MutableBuffer.h 
- * - * Copyright (C) 2023 Douglas Rumbaugh - * Dong Xie - * - * All rights reserved. Published under the Modified BSD License. - * - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "psu-util/alignment.h" -#include "util/bf_config.h" -#include "psu-ds/BloomFilter.h" -#include "psu-ds/Alias.h" -#include "psu-util/timer.h" -#include "framework/RecordInterface.h" - -using psudb::CACHELINE_SIZE; - -namespace de { - -template -class MutableBuffer { -public: - MutableBuffer(size_t capacity, size_t max_tombstone_cap) - : m_cap(capacity), m_tombstone_cap(max_tombstone_cap), m_reccnt(0) - , m_tombstonecnt(0), m_weight(0), m_max_weight(0) { - m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); - m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); - m_tombstone_filter = nullptr; - if (max_tombstone_cap > 0) { - m_tombstone_filter = new psudb::BloomFilter(BF_FPR, max_tombstone_cap, BF_HASH_FUNCS); - } - - m_refcnt.store(0); - m_deferred_truncate.store(false); - m_merging.store(false); - } - - ~MutableBuffer() { - assert(m_refcnt.load() == 0); - assert(m_merging.load() == false); - - if (m_data) free(m_data); - if (m_tombstone_filter) delete m_tombstone_filter; - if (m_merge_data) free(m_merge_data); - } - - template - int append(const R &rec, bool tombstone=false) { - if (tombstone && m_tombstonecnt + 1 > m_tombstone_cap) return 0; - - int32_t pos = 0; - if ((pos = try_advance_tail()) == -1) return 0; - - Wrapped wrec; - wrec.rec = rec; - wrec.header = 0; - if (tombstone) wrec.set_tombstone(); - - m_data[pos] = wrec; - m_data[pos].header |= (pos << 2); - - if (tombstone) { - m_tombstonecnt.fetch_add(1); - if (m_tombstone_filter) m_tombstone_filter->insert(rec); - } - - if constexpr (WeightedRecordInterface) { - m_weight.fetch_add(rec.weight); - double old = m_max_weight.load(); - while (old < rec.weight) { - m_max_weight.compare_exchange_strong(old, rec.weight); - old = m_max_weight.load(); - } - } else { - m_weight.fetch_add(1); - } - - return 1; - } - - bool truncate() { - m_tombstonecnt.store(0); - m_reccnt.store(0); - m_weight.store(0); - m_max_weight.store(0); - if (m_tombstone_filter) m_tombstone_filter->clear(); - - return true; - } - - size_t get_record_count() { - return m_reccnt; - } - - size_t get_capacity() { - return m_cap; - } - - bool is_full() { - return m_reccnt == m_cap; - } - - size_t get_tombstone_count() { - return m_tombstonecnt.load(); - } - - bool delete_record(const R& rec) { - auto offset = 0; - while (offset < m_reccnt.load()) { - if (m_data[offset].rec == rec) { - m_data[offset].set_delete(); - return true; - } - offset++; - } - - return false; - } - - bool check_tombstone(const R& rec) { - if (m_tombstone_filter && !m_tombstone_filter->lookup(rec)) return false; - - auto offset = 0; - while (offset < m_reccnt.load()) { - if (m_data[offset].rec == rec && m_data[offset].is_tombstone()) { - return true; - } - offset++;; - } - return false; - } - - size_t get_memory_usage() { - return m_cap * sizeof(R); - } - - size_t get_aux_memory_usage() { - return m_tombstone_filter->get_memory_usage(); - } - - size_t get_tombstone_capacity() { - return m_tombstone_cap; - } - - double get_total_weight() { - return m_weight.load(); - } - - Wrapped *get_data() { - return m_data; - } - - double get_max_weight() { - return m_max_weight; - } - - bool start_merge() { - if (m_merge_lock.try_lock()) { - /* there cannot already been an active merge */ - if 
(m_merging.load()) { - m_merge_lock.unlock(); - return false; - } - - m_merging.store(true); - memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load()); - return true; - } - - /* lock could not be obtained */ - return false; - } - - bool finish_merge() { - m_merge_lock.unlock(); - return true; - } - - /* - * Concurrency-related operations - */ - bool take_reference() { - m_refcnt.fetch_add(1); - return true; - } - - bool release_reference() { - m_refcnt.fetch_add(-1); - - if (m_refcnt.load() == 0 && m_deferred_truncate.load()) { - assert(this->truncate()); - } - - return true; - } - - bool active_merge() { - return m_merging.load(); - } - -private: - int32_t try_advance_tail() { - size_t new_tail = m_reccnt.fetch_add(1); - - if (new_tail < m_cap) return new_tail; - else return -1; - } - - size_t m_cap; - size_t m_tombstone_cap; - - Wrapped* m_data; - Wrapped* m_merge_data; - - psudb::BloomFilter* m_tombstone_filter; - - alignas(64) std::atomic m_tombstonecnt; - alignas(64) std::atomic m_reccnt; - alignas(64) std::atomic m_weight; - alignas(64) std::atomic m_max_weight; - alignas(64) std::atomic m_merging; - alignas(64) std::atomic m_deferred_truncate; - alignas(64) std::atomic m_refcnt; - - alignas(64) std::mutex m_merge_lock; - alignas(64) std::mutex m_trunc_lock; - alignas(64) std::condition_variable m_trunc_signal; - -}; - -} diff --git a/include/framework/QueryInterface.h b/include/framework/QueryInterface.h deleted file mode 100644 index 46a1ce1..0000000 --- a/include/framework/QueryInterface.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * include/framework/QueryInterface.h - * - * Copyright (C) 2023 Douglas Rumbaugh - * - * All rights reserved. Published under the Modified BSD License. - * - */ -#pragma once - -#include -#include -#include "util/types.h" - -template -concept QueryInterface = requires(Q q, void *p, std::vector &s) { - -/* - {q.get_query_state(p, p)} -> std::convertible_to; - {q.get_buffer_query_state(p, p)}; - {q.query(p, p)}; - {q.buffer_query(p, p)}; - {q.merge()}; - {q.delete_query_state(p)}; -*/ - {Q::EARLY_ABORT} -> std::convertible_to; - {Q::SKIP_DELETE_FILTER} -> std::convertible_to; - //{Q::get_query_state(p, p)} -> std::convertible_to; - //{Q::get_buffer_query_state(p, p)} -> std::convertible_to; - {Q::process_query_states(p, s, p)}; - - {Q::delete_query_state(std::declval())} -> std::same_as; - {Q::delete_buffer_query_state(p)}; - -}; diff --git a/include/framework/RecordInterface.h b/include/framework/RecordInterface.h deleted file mode 100644 index 1ef1984..0000000 --- a/include/framework/RecordInterface.h +++ /dev/null @@ -1,214 +0,0 @@ -/* - * include/framework/RecordInterface.h - * - * Copyright (C) 2023 Douglas Rumbaugh - * Dong Xie - * - * All rights reserved. Published under the Modified BSD License. 
- * - */ -#pragma once - -#include -#include -#include - -#include "psu-util/hash.h" - -namespace de { - -template -concept RecordInterface = requires(R r, R s) { - { r < s } ->std::convertible_to; - { r == s } ->std::convertible_to; -}; - -template -concept WeightedRecordInterface = requires(R r) { - {r.weight} -> std::convertible_to; -}; - -template -concept NDRecordInterface = RecordInterface && requires(R r, R s) { - {r.calc_distance(s)} -> std::convertible_to; -}; - -template -concept KVPInterface = RecordInterface && requires(R r) { - r.key; - r.value; -}; - -template -concept AlexInterface = KVPInterface && requires(R r) { - {r.key} -> std::convertible_to; - {r.value} -> std::convertible_to; -}; - -template -concept WrappedInterface = RecordInterface && requires(R r, R s, bool b) { - {r.header} -> std::convertible_to; - r.rec; - {r.set_delete()}; - {r.is_deleted()} -> std::convertible_to; - {r.set_tombstone(b)}; - {r.is_tombstone()} -> std::convertible_to; - {r < s} -> std::convertible_to; - {r == s} ->std::convertible_to; -}; - -template -struct Wrapped { - uint32_t header; - R rec; - - inline void set_delete() { - header |= 2; - } - - inline bool is_deleted() const { - return header & 2; - } - - inline void set_tombstone(bool val=true) { - if (val) { - header |= val; - } else { - header &= 0; - } - } - - inline bool is_tombstone() const { - return header & 1; - } - - inline bool operator<(const Wrapped& other) const { - return rec < other.rec || (rec == other.rec && header < other.header); - } - - inline bool operator==(const Wrapped& other) const { - return rec == other.rec; - } - -}; - -template -struct Record { - K key; - V value; - uint32_t header = 0; - - inline bool operator<(const Record& other) const { - return key < other.key || (key == other.key && value < other.value); - } - - inline bool operator==(const Record& other) const { - return key == other.key && value == other.value; - } -}; - -template -struct WeightedRecord { - K key; - V value; - W weight = 1; - - inline bool operator==(const WeightedRecord& other) const { - return key == other.key && value == other.value; - } - - inline bool operator<(const WeightedRecord& other) const { - return key < other.key || (key == other.key && value < other.value); - } -}; - - -template -struct CosinePoint{ - V data[D]; - - inline bool operator==(const CosinePoint& other) const { - for (size_t i=0; i other.data[i]) { - return false; - } - } - - return false; - } - - inline double calc_distance(const CosinePoint& other) const { - - double prod = 0; - double asquared = 0; - double bsquared = 0; - - for (size_t i=0; i -struct EuclidPoint{ - V data[D]; - - inline bool operator==(const EuclidPoint& other) const { - for (size_t i=0; i other.data[i]) { - return false; - } - } - - return false; - } - - inline double calc_distance(const EuclidPoint& other) const { - double dist = 0; - for (size_t i=0; i -struct RecordHash { - size_t operator()(R const &rec) const { - return psudb::hash_bytes((std::byte *) &rec, sizeof(R)); - } -}; - -} diff --git a/include/framework/Scheduler.h b/include/framework/Scheduler.h deleted file mode 100644 index 6055bef..0000000 --- a/include/framework/Scheduler.h +++ /dev/null @@ -1,194 +0,0 @@ -/* - * include/framework/Scheduler.h - * - * Copyright (C) 2023 Douglas Rumbaugh - * Dong Xie - * - * All rights reserved. Published under the Modified BSD License. 
- * - */ -#pragma once - -#include -#include -#include -#include -#include - -#include "util/types.h" -#include "framework/ShardInterface.h" -#include "framework/QueryInterface.h" -#include "framework/RecordInterface.h" -#include "framework/MutableBuffer.h" -#include "framework/Configuration.h" -#include "framework/ExtensionStructure.h" - -namespace de { - -template -class Scheduler { - typedef ExtensionStructure Structure; - typedef MutableBuffer Buffer; -public: - /* - * Memory budget stated in bytes, with 0 meaning unlimited. Likewise, 0 threads means - * unlimited. - */ - Scheduler(size_t memory_budget, size_t thread_cnt) - : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) - , m_thread_cnt((thread_cnt) ? thread_cnt : UINT64_MAX) - , m_used_memory(0) - , m_used_threads(0) - , m_shutdown(false) - { - m_sched_thrd = std::thread(&Scheduler::run_scheduler, this); - } - - ~Scheduler() { - m_shutdown = true; - - m_cv.notify_all(); - m_sched_thrd.join(); - } - - bool schedule_merge(Structure *version, MutableBuffer *buffer) { - /* - * temporary hack - */ - pending_version = version; - pending_buffer = buffer; - - /* - * Get list of individual level reconstructions that are necessary - * for completing the overall merge - */ - std::vector merges = version->get_merge_tasks(buffer->get_record_count()); - - /* - * Schedule the merge tasks (FIXME: currently this just - * executes them sequentially in a blocking fashion) - */ - for (ssize_t i=0; iget_record_count() * sizeof(R) * 2; - buffer_merge.m_timestamp = m_timestamp.fetch_add(1); - m_merge_queue_lock.lock(); - m_merge_queue.push(buffer_merge); - m_merge_queue_lock.unlock(); - - m_cv.notify_all(); - do { - std::unique_lock merge_cv_lock(m_merge_cv_lock); - m_merge_cv.wait(merge_cv_lock); - } while (m_merge_queue.size() > 0); - - assert(version->get_levels()[version->get_levels().size() - 1]->get_shard(0)->get_tombstone_count() == 0); - - return true; - } - -private: - size_t get_timestamp() { - auto ts = m_timestamp.fetch_add(1); - return ts; - } - - void schedule_next_task() { - m_merge_queue_lock.lock(); - auto task = m_merge_queue.top(); - m_merge_queue.pop(); - m_merge_queue_lock.unlock(); - - if (task.m_source_level == -1 && task.m_target_level == 0) { - run_buffer_merge(pending_buffer, pending_version); - } else { - run_merge(task, pending_version); - } - - if (m_merge_queue.size() == 0) { - m_merge_cv.notify_all(); - } - } - - void run_merge(MergeTask task, Structure *version) { - version->merge_levels(task.m_target_level, task.m_source_level); - - if (!version->validate_tombstone_proportion(task.m_target_level)) { - auto tasks = version->get_merge_tasks(task.m_target_level); - /* - * Schedule the merge tasks (FIXME: currently this just - * executes them sequentially in a blocking fashion) - */ - for (ssize_t i=tasks.size()-1; i>=0; i--) { - tasks[i].m_timestamp = m_timestamp.fetch_add(1); - m_merge_queue_lock.lock(); - m_merge_queue.push(tasks[i]); - m_merge_queue_lock.unlock(); - } - } - } - - - void run_buffer_merge(Buffer *buffer, Structure *version) { - version->merge_buffer(buffer); - if (!version->validate_tombstone_proportion(0)) { - auto tasks = version->get_merge_tasks_from_level(0); - - /* - * Schedule the merge tasks (FIXME: currently this just - * executes them sequentially in a blocking fashion) - */ - for (ssize_t i=tasks.size()-1; i>=0; i--) { - tasks[i].m_timestamp = m_timestamp.fetch_add(1); - m_merge_queue_lock.lock(); - m_merge_queue.push(tasks[i]); - m_merge_queue_lock.unlock(); - } - } - } - - void 
run_scheduler() { - do { - std::unique_lock cv_lock(m_cv_lock); - m_cv.wait(cv_lock); - - while (m_merge_queue.size() > 0 && m_used_threads < m_thread_cnt) { - schedule_next_task(); - } - cv_lock.unlock(); - } while(!m_shutdown); - } - - size_t m_memory_budget; - size_t m_thread_cnt; - - Buffer *pending_buffer; - Structure *pending_version; - - alignas(64) std::atomic m_used_memory; - alignas(64) std::atomic m_used_threads; - alignas(64) std::atomic m_timestamp; - - std::priority_queue, std::greater> m_merge_queue; - std::mutex m_merge_queue_lock; - - std::mutex m_cv_lock; - std::condition_variable m_cv; - - std::mutex m_merge_cv_lock; - std::condition_variable m_merge_cv; - - std::thread m_sched_thrd; - - bool m_shutdown; -}; - -} diff --git a/include/framework/ShardInterface.h b/include/framework/ShardInterface.h deleted file mode 100644 index 3aa62df..0000000 --- a/include/framework/ShardInterface.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * include/framework/ShardInterface.h - * - * Copyright (C) 2023 Douglas Rumbaugh - * - * All rights reserved. Published under the Modified BSD License. - * - */ -#pragma once - -#include - -#include "util/types.h" -#include "framework/RecordInterface.h" - -namespace de { - -//template typename S, typename R> -template -concept ShardInterface = requires(S s, void *p, bool b) { - //{s.point_lookup(r, b) } -> std::same_as; - {s.get_record_count()} -> std::convertible_to; - {s.get_memory_usage()} -> std::convertible_to; -}; - -} diff --git a/include/framework/ShardRequirements.h b/include/framework/ShardRequirements.h new file mode 100644 index 0000000..95f7b67 --- /dev/null +++ b/include/framework/ShardRequirements.h @@ -0,0 +1,9 @@ +/* + * + */ +#pragma once + +#include "framework/structure/MutableBuffer.h" +#include "framework/interface/Record.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" diff --git a/include/framework/interface/Query.h b/include/framework/interface/Query.h new file mode 100644 index 0000000..46a1ce1 --- /dev/null +++ b/include/framework/interface/Query.h @@ -0,0 +1,35 @@ +/* + * include/framework/QueryInterface.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include "util/types.h" + +template +concept QueryInterface = requires(Q q, void *p, std::vector &s) { + +/* + {q.get_query_state(p, p)} -> std::convertible_to; + {q.get_buffer_query_state(p, p)}; + {q.query(p, p)}; + {q.buffer_query(p, p)}; + {q.merge()}; + {q.delete_query_state(p)}; +*/ + {Q::EARLY_ABORT} -> std::convertible_to; + {Q::SKIP_DELETE_FILTER} -> std::convertible_to; + //{Q::get_query_state(p, p)} -> std::convertible_to; + //{Q::get_buffer_query_state(p, p)} -> std::convertible_to; + {Q::process_query_states(p, s, p)}; + + {Q::delete_query_state(std::declval())} -> std::same_as; + {Q::delete_buffer_query_state(p)}; + +}; diff --git a/include/framework/interface/Record.h b/include/framework/interface/Record.h new file mode 100644 index 0000000..1ef1984 --- /dev/null +++ b/include/framework/interface/Record.h @@ -0,0 +1,214 @@ +/* + * include/framework/RecordInterface.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. 
+ * + */ +#pragma once + +#include +#include +#include + +#include "psu-util/hash.h" + +namespace de { + +template +concept RecordInterface = requires(R r, R s) { + { r < s } ->std::convertible_to; + { r == s } ->std::convertible_to; +}; + +template +concept WeightedRecordInterface = requires(R r) { + {r.weight} -> std::convertible_to; +}; + +template +concept NDRecordInterface = RecordInterface && requires(R r, R s) { + {r.calc_distance(s)} -> std::convertible_to; +}; + +template +concept KVPInterface = RecordInterface && requires(R r) { + r.key; + r.value; +}; + +template +concept AlexInterface = KVPInterface && requires(R r) { + {r.key} -> std::convertible_to; + {r.value} -> std::convertible_to; +}; + +template +concept WrappedInterface = RecordInterface && requires(R r, R s, bool b) { + {r.header} -> std::convertible_to; + r.rec; + {r.set_delete()}; + {r.is_deleted()} -> std::convertible_to; + {r.set_tombstone(b)}; + {r.is_tombstone()} -> std::convertible_to; + {r < s} -> std::convertible_to; + {r == s} ->std::convertible_to; +}; + +template +struct Wrapped { + uint32_t header; + R rec; + + inline void set_delete() { + header |= 2; + } + + inline bool is_deleted() const { + return header & 2; + } + + inline void set_tombstone(bool val=true) { + if (val) { + header |= val; + } else { + header &= 0; + } + } + + inline bool is_tombstone() const { + return header & 1; + } + + inline bool operator<(const Wrapped& other) const { + return rec < other.rec || (rec == other.rec && header < other.header); + } + + inline bool operator==(const Wrapped& other) const { + return rec == other.rec; + } + +}; + +template +struct Record { + K key; + V value; + uint32_t header = 0; + + inline bool operator<(const Record& other) const { + return key < other.key || (key == other.key && value < other.value); + } + + inline bool operator==(const Record& other) const { + return key == other.key && value == other.value; + } +}; + +template +struct WeightedRecord { + K key; + V value; + W weight = 1; + + inline bool operator==(const WeightedRecord& other) const { + return key == other.key && value == other.value; + } + + inline bool operator<(const WeightedRecord& other) const { + return key < other.key || (key == other.key && value < other.value); + } +}; + + +template +struct CosinePoint{ + V data[D]; + + inline bool operator==(const CosinePoint& other) const { + for (size_t i=0; i other.data[i]) { + return false; + } + } + + return false; + } + + inline double calc_distance(const CosinePoint& other) const { + + double prod = 0; + double asquared = 0; + double bsquared = 0; + + for (size_t i=0; i +struct EuclidPoint{ + V data[D]; + + inline bool operator==(const EuclidPoint& other) const { + for (size_t i=0; i other.data[i]) { + return false; + } + } + + return false; + } + + inline double calc_distance(const EuclidPoint& other) const { + double dist = 0; + for (size_t i=0; i +struct RecordHash { + size_t operator()(R const &rec) const { + return psudb::hash_bytes((std::byte *) &rec, sizeof(R)); + } +}; + +} diff --git a/include/framework/interface/Scheduler.h b/include/framework/interface/Scheduler.h new file mode 100644 index 0000000..1445e90 --- /dev/null +++ b/include/framework/interface/Scheduler.h @@ -0,0 +1,31 @@ +/* + * include/framework/QueryInterface.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. 
+ * + */ +#pragma once + +#include +#include +#include "framework/interface/Record.h" +#include "util/types.h" + +template +concept SchedulerInterface = requires(S s, size_t i, void *vp) { + {S(i, i)}; +// {s.schedule_merge(vp, vp)}; + +/* + {q.get_query_state(p, p)} -> std::convertible_to; + {q.get_buffer_query_state(p, p)}; + {q.query(p, p)}; + {q.buffer_query(p, p)}; + {q.merge()}; + {q.delete_query_state(p)}; +*/ + //{Q::get_query_state(p, p)} -> std::convertible_to; + //{Q::get_buffer_query_state(p, p)} -> std::convertible_to; +}; diff --git a/include/framework/interface/Shard.h b/include/framework/interface/Shard.h new file mode 100644 index 0000000..ea58b2a --- /dev/null +++ b/include/framework/interface/Shard.h @@ -0,0 +1,26 @@ +/* + * include/framework/ShardInterface.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include + +#include "util/types.h" +#include "framework/interface/Record.h" + +namespace de { + +//template typename S, typename R> +template +concept ShardInterface = requires(S s, void *p, bool b) { + //{s.point_lookup(r, b) } -> std::same_as; + {s.get_record_count()} -> std::convertible_to; + {s.get_memory_usage()} -> std::convertible_to; +}; + +} diff --git a/include/framework/scheduling/Scheduler.h b/include/framework/scheduling/Scheduler.h new file mode 100644 index 0000000..992cbf9 --- /dev/null +++ b/include/framework/scheduling/Scheduler.h @@ -0,0 +1,195 @@ +/* + * include/framework/Scheduler.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include +#include + +#include "util/types.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" +#include "framework/interface/Record.h" +#include "framework/structure/MutableBuffer.h" +#include "framework/util/Configuration.h" +#include "framework/structure/ExtensionStructure.h" + +namespace de { + +template +class Scheduler { + typedef ExtensionStructure Structure; + typedef MutableBuffer Buffer; +public: + /* + * Memory budget stated in bytes, with 0 meaning unlimited. Likewise, 0 threads means + * unlimited. + */ + Scheduler(size_t memory_budget, size_t thread_cnt) + : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) + , m_thread_cnt((thread_cnt) ? 
thread_cnt : UINT64_MAX) + , m_used_memory(0) + , m_used_threads(0) + , m_shutdown(false) + { + m_sched_thrd = std::thread(&Scheduler::run_scheduler, this); + } + + ~Scheduler() { + m_shutdown = true; + + m_cv.notify_all(); + m_sched_thrd.join(); + } + + bool schedule_merge(Structure *version, MutableBuffer *buffer) { + /* + * temporary hack + */ + pending_version = version; + pending_buffer = buffer; + + /* + * Get list of individual level reconstructions that are necessary + * for completing the overall merge + */ + std::vector merges = version->get_merge_tasks(buffer->get_record_count()); + + /* + * Schedule the merge tasks (FIXME: currently this just + * executes them sequentially in a blocking fashion) + */ + for (ssize_t i=0; iget_record_count() * sizeof(R) * 2; + buffer_merge.m_timestamp = m_timestamp.fetch_add(1); + m_merge_queue_lock.lock(); + m_merge_queue.push(buffer_merge); + m_merge_queue_lock.unlock(); + + m_cv.notify_all(); + do { + std::unique_lock merge_cv_lock(m_merge_cv_lock); + m_merge_cv.wait(merge_cv_lock); + } while (m_merge_queue.size() > 0); + + assert(version->get_levels()[version->get_levels().size() - 1]->get_shard(0)->get_tombstone_count() == 0); + + return true; + } + +private: + size_t get_timestamp() { + auto ts = m_timestamp.fetch_add(1); + return ts; + } + + void schedule_next_task() { + m_merge_queue_lock.lock(); + auto task = m_merge_queue.top(); + m_merge_queue.pop(); + m_merge_queue_lock.unlock(); + + if (task.m_source_level == -1 && task.m_target_level == 0) { + run_buffer_merge(pending_buffer, pending_version); + } else { + run_merge(task, pending_version); + } + + if (m_merge_queue.size() == 0) { + m_merge_cv.notify_all(); + } + } + + + void run_merge(MergeTask task, Structure *version) { + version->merge_levels(task.m_target_level, task.m_source_level); + + if (!version->validate_tombstone_proportion(task.m_target_level)) { + auto tasks = version->get_merge_tasks(task.m_target_level); + /* + * Schedule the merge tasks (FIXME: currently this just + * executes them sequentially in a blocking fashion) + */ + for (ssize_t i=tasks.size()-1; i>=0; i--) { + tasks[i].m_timestamp = m_timestamp.fetch_add(1); + m_merge_queue_lock.lock(); + m_merge_queue.push(tasks[i]); + m_merge_queue_lock.unlock(); + } + } + } + + + void run_buffer_merge(Buffer *buffer, Structure *version) { + version->merge_buffer(buffer); + if (!version->validate_tombstone_proportion(0)) { + auto tasks = version->get_merge_tasks_from_level(0); + + /* + * Schedule the merge tasks (FIXME: currently this just + * executes them sequentially in a blocking fashion) + */ + for (ssize_t i=tasks.size()-1; i>=0; i--) { + tasks[i].m_timestamp = m_timestamp.fetch_add(1); + m_merge_queue_lock.lock(); + m_merge_queue.push(tasks[i]); + m_merge_queue_lock.unlock(); + } + } + } + + void run_scheduler() { + do { + std::unique_lock cv_lock(m_cv_lock); + m_cv.wait(cv_lock); + + while (m_merge_queue.size() > 0 && m_used_threads.load() < m_thread_cnt) { + schedule_next_task(); + } + cv_lock.unlock(); + } while(!m_shutdown); + } + + size_t m_memory_budget; + size_t m_thread_cnt; + + Buffer *pending_buffer; + Structure *pending_version; + + alignas(64) std::atomic m_used_memory; + alignas(64) std::atomic m_used_threads; + alignas(64) std::atomic m_timestamp; + + std::priority_queue, std::greater> m_merge_queue; + std::mutex m_merge_queue_lock; + + std::mutex m_cv_lock; + std::condition_variable m_cv; + + std::mutex m_merge_cv_lock; + std::condition_variable m_merge_cv; + + std::thread m_sched_thrd; + + bool 
m_shutdown; +}; + +} diff --git a/include/framework/scheduling/SerialScheduler.h b/include/framework/scheduling/SerialScheduler.h new file mode 100644 index 0000000..5e16bdf --- /dev/null +++ b/include/framework/scheduling/SerialScheduler.h @@ -0,0 +1,227 @@ +/* + * include/framework/Scheduler.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include +#include + +#include "util/types.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" +#include "framework/interface/Record.h" +#include "framework/structure/MutableBuffer.h" +#include "framework/util/Configuration.h" +#include "framework/structure/ExtensionStructure.h" +#include "framework/scheduling/Task.h" + +namespace de { + +template +class SerialScheduler { + typedef ExtensionStructure Structure; + typedef MutableBuffer Buffer; +public: + /* + * A simple "scheduler" that runs tasks serially, in a FIFO manner. Incoming concurrent + * requests will wait for their turn, and only one task will be active in the system at + * a time. The scheduler will spin up a second thread for running itself, but all tasks + * will be single-threaded. + * + * Memory budget stated in bytes, with 0 meaning unlimited. Likewise, 0 threads means + * unlimited. + * + * Note that the SerialScheduler object is non-concurrent, and so will ignore the + * thread_cnt argument. It will obey the memory_budget, however a failure due to + * memory constraints will be irrecoverable, as there is no way to free up memory + * or block particular tasks until memory becomes available. + */ + SerialScheduler(size_t memory_budget, size_t thread_cnt) + : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) + , m_thread_cnt((thread_cnt) ? 
thread_cnt : UINT64_MAX) + , m_used_memory(0) + , m_used_threads(0) + , m_shutdown(false) + { + m_sched_thrd = std::thread(&SerialScheduler::run_scheduler, this); + } + + ~SerialScheduler() { + m_shutdown = true; + + m_cv.notify_all(); + m_sched_thrd.join(); + } + + bool schedule_merge(Structure *version, MutableBuffer *buffer) { + pending_version = version; + pending_buffer = buffer; + + /* + * Get list of individual level reconstructions that are necessary + * for completing the overall merge + */ + std::vector merges = version->get_merge_tasks(buffer->get_record_count()); + + /* + * Schedule the merge tasks (FIXME: currently this just + * executes them sequentially in a blocking fashion) + */ + for (ssize_t i=0; iget_record_count() * sizeof(R) * 2; + buffer_merge.m_timestamp = m_timestamp.fetch_add(1); + buffer_merge.m_type = TaskType::MERGE; + m_merge_queue_lock.lock(); + m_merge_queue.emplace(buffer_merge); + m_merge_queue_lock.unlock(); + + m_cv.notify_all(); + do { + std::unique_lock merge_cv_lock(m_merge_cv_lock); + m_merge_cv.wait(merge_cv_lock); + } while (m_merge_queue.size() > 0); + + assert(version->get_levels()[version->get_levels().size() - 1]->get_shard(0)->get_tombstone_count() == 0); + + return true; + } + + bool schedule_query() { + return true; + } + +private: + size_t get_timestamp() { + auto ts = m_timestamp.fetch_add(1); + return ts; + } + + void schedule_merge(MergeTask task) { + if (task.m_source_level == -1 && task.m_target_level == 0) { + run_buffer_merge(pending_buffer, pending_version); + } else { + run_merge(task, pending_version); + } + } + + + void schedule_query(QueryTask task) { + + } + + void schedule_next_task() { + m_merge_queue_lock.lock(); + auto task = m_merge_queue.top(); + m_merge_queue.pop(); + m_merge_queue_lock.unlock(); + + auto type = std::visit(GetTaskType{}, task); + + switch (type) { + case TaskType::MERGE: + schedule_merge(std::get(task)); + break; + case TaskType::QUERY: + schedule_query(std::get(task)); + break; + default: assert(false); + } + + if (m_merge_queue.size() == 0) { + m_merge_cv.notify_all(); + } + } + + + void run_merge(MergeTask task, Structure *version) { + version->merge_levels(task.m_target_level, task.m_source_level); + + if (!version->validate_tombstone_proportion(task.m_target_level)) { + auto tasks = version->get_merge_tasks(task.m_target_level); + /* + * Schedule the merge tasks (FIXME: currently this just + * executes them sequentially in a blocking fashion) + */ + for (ssize_t i=tasks.size()-1; i>=0; i--) { + tasks[i].m_timestamp = m_timestamp.fetch_add(1); + m_merge_queue_lock.lock(); + m_merge_queue.push(tasks[i]); + m_merge_queue_lock.unlock(); + } + } + } + + + void run_buffer_merge(Buffer *buffer, Structure *version) { + version->merge_buffer(buffer); + if (!version->validate_tombstone_proportion(0)) { + auto tasks = version->get_merge_tasks_from_level(0); + + /* + * Schedule the merge tasks (FIXME: currently this just + * executes them sequentially in a blocking fashion) + */ + for (ssize_t i=tasks.size()-1; i>=0; i--) { + tasks[i].m_timestamp = m_timestamp.fetch_add(1); + m_merge_queue_lock.lock(); + m_merge_queue.push(tasks[i]); + m_merge_queue_lock.unlock(); + } + } + } + + void run_scheduler() { + do { + std::unique_lock cv_lock(m_cv_lock); + m_cv.wait(cv_lock); + + while (m_merge_queue.size() > 0 && m_used_threads.load() < m_thread_cnt) { + schedule_next_task(); + } + cv_lock.unlock(); + } while(!m_shutdown); + } + + size_t m_memory_budget; + size_t m_thread_cnt; + + Buffer *pending_buffer; + Structure 
*pending_version; + + alignas(64) std::atomic m_used_memory; + alignas(64) std::atomic m_used_threads; + alignas(64) std::atomic m_timestamp; + + std::priority_queue, std::greater> m_merge_queue; + std::mutex m_merge_queue_lock; + + std::mutex m_cv_lock; + std::condition_variable m_cv; + + std::mutex m_merge_cv_lock; + std::condition_variable m_merge_cv; + + std::thread m_sched_thrd; + + bool m_shutdown; +}; + +} diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h new file mode 100644 index 0000000..9e0655a --- /dev/null +++ b/include/framework/scheduling/Task.h @@ -0,0 +1,63 @@ +/* + * + */ +#pragma once + +#include + +#include "framework/util/Configuration.h" + +namespace de { + +enum class TaskType { + MERGE, + QUERY +}; + +struct MergeTask { + level_index m_source_level; + level_index m_target_level; + size_t m_timestamp; + size_t m_size; + TaskType m_type; + + TaskType get_type() const { + return m_type; + } + + friend bool operator<(const MergeTask &self, const MergeTask &other) { + return self.m_timestamp < other.m_timestamp; + } + + friend bool operator>(const MergeTask &self, const MergeTask &other) { + return self.m_timestamp > other.m_timestamp; + } + +}; + +struct QueryTask { + size_t m_timestamp; + size_t m_size; + TaskType m_type; + + TaskType get_type() const { + return m_type; + } + + friend bool operator<(const QueryTask &self, const QueryTask &other) { + return self.m_timestamp < other.m_timestamp; + } + + friend bool operator>(const QueryTask &self, const QueryTask &other) { + return self.m_timestamp > other.m_timestamp; + } +}; + +struct GetTaskType { + TaskType operator()(const MergeTask &t) { return t.get_type(); } + TaskType operator()(const QueryTask &t) { return t.get_type(); } +}; + +typedef std::variant Task; + +} diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h new file mode 100644 index 0000000..920e1c3 --- /dev/null +++ b/include/framework/structure/ExtensionStructure.h @@ -0,0 +1,428 @@ +/* + * include/framework/ExtensionStructure.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include + +#include "framework/structure/MutableBuffer.h" +#include "framework/structure/InternalLevel.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" +#include "framework/interface/Record.h" + +#include "framework/util/Configuration.h" +#include "framework/scheduling/Task.h" + +#include "psu-util/timer.h" +#include "psu-ds/Alias.h" + +namespace de { + +template +class ExtensionStructure { + typedef S Shard; + typedef MutableBuffer Buffer; + +public: + ExtensionStructure(size_t buffer_size, size_t scale_factor, double max_delete_prop) + : m_scale_factor(scale_factor) + , m_max_delete_prop(max_delete_prop) + , m_buffer_size(buffer_size) + {} + + ~ExtensionStructure() = default; + + /* + * Create a shallow copy of this extension structure. The copy will share references to the + * same levels/shards as the original, but will have its own lists. As all of the shards are + * immutable (with the exception of deletes), the copy can be restructured with merges, etc., + * without affecting the original. + * + * NOTE: When using tagged deletes, a delete of a record in the original structure will affect + * the copy, so long as the copy retains a reference to the same shard as the original. 
This could + * cause synchronization problems under tagging with concurrency. Any deletes in this context will + * need to be forwarded to the appropriate structures manually. + */ + ExtensionStructure *copy() { + auto new_struct = new ExtensionStructure(m_scale_factor, m_max_delete_prop, m_buffer_size); + for (size_t i=0; im_levels.push_back(m_levels[i]->clone()); + } + + return new_struct; + } + + /* + * Search for a record matching the argument and mark it deleted by + * setting the delete bit in its wrapped header. Returns 1 if a matching + * record was found and deleted, and 0 if a matching record was not found. + * + * This function will stop after finding the first matching record. It is assumed + * that no duplicate records exist. In the case of duplicates, this function will + * still "work", but in the sense of "delete first match". + */ + int tagged_delete(const R &rec) { + for (auto level : m_levels) { + if (level && level->delete_record(rec)) { + return 1; + } + } + + /* + * If the record to be erased wasn't found, return 0. The + * DynamicExtension itself will then search the active + * Buffers. + */ + return 0; + } + + /* + * Merge the memory table down into the tree, completing any required other + * merges to make room for it. + */ + inline bool merge_buffer(Buffer *buffer) { + assert(can_merge_with(0, buffer->get_record_count())); + + merge_buffer_into_l0(buffer); + buffer->truncate(); + + return true; + } + + /* + * Return the total number of records (including tombstones) within all + * of the levels of the structure. + */ + size_t get_record_count() { + size_t cnt = 0; + + for (size_t i=0; iget_record_count(); + } + + return cnt; + } + + /* + * Return the total number of tombstones contained within all of the + * levels of the structure. + */ + size_t get_tombstone_cnt() { + size_t cnt = 0; + + for (size_t i=0; iget_tombstone_count(); + } + + return cnt; + } + + /* + * Return the number of levels within the structure. Note that not + * all of these levels are necessarily populated. + */ + size_t get_height() { + return m_levels.size(); + } + + /* + * Return the amount of memory (in bytes) used by the shards within the + * structure for storing the primary data structure and raw data. + */ + size_t get_memory_usage() { + size_t cnt = 0; + for (size_t i=0; iget_memory_usage(); + } + + return cnt; + } + + /* + * Return the amount of memory (in bytes) used by the shards within the + * structure for storing auxiliary data structures. This total does not + * include memory used for the main data structure, or raw data. + */ + size_t get_aux_memory_usage() { + size_t cnt = 0; + for (size_t i=0; iget_aux_memory_usage(); + } + } + + return cnt; + } + + /* + * Validate that no level in the structure exceeds its maximum tombstone capacity. This is + * used to trigger preemptive compactions at the end of the merge process. + */ + bool validate_tombstone_proportion() { + long double ts_prop; + for (size_t i=0; iget_tombstone_count() / (long double) calc_level_record_capacity(i); + if (ts_prop > (long double) m_max_delete_prop) { + return false; + } + } + } + + return true; + } + + bool validate_tombstone_proportion(level_index level) { + long double ts_prop = (long double) m_levels[level]->get_tombstone_count() / (long double) calc_level_record_capacity(level); + return ts_prop <= (long double) m_max_delete_prop; + } + + /* + * Return a reference to the underlying vector of levels within the + * structure. 
+ */ + std::vector>> &get_levels() { + return m_levels; + } + + /* + * + */ + std::vector get_merge_tasks(size_t buffer_reccnt) { + std::vector merges; + + /* + * The buffer -> L0 merge task is not included so if that + * can be done without any other change, just return an + * empty list. + */ + if (can_merge_with(0, buffer_reccnt)) { + return std::move(merges); + } + + level_index merge_base_level = find_mergable_level(0); + if (merge_base_level == -1) { + merge_base_level = grow(); + } + + for (level_index i=merge_base_level; i>0; i--) { + MergeTask task; + task.m_source_level = i - 1; + task.m_target_level = i; + task.m_type = TaskType::MERGE; + + /* + * The amount of storage required for the merge accounts + * for the cost of storing the new records, along with the + * cost of retaining the old records during the process + * (hence the 2x multiplier). + * + * FIXME: currently does not account for the *actual* size + * of the shards, only the storage for the records + * themselves. + */ + size_t reccnt = m_levels[i-1]->get_record_count(); + if constexpr (L == LayoutPolicy::LEVELING) { + if (can_merge_with(i, reccnt)) { + reccnt += m_levels[i]->get_record_count(); + } + } + task.m_size = 2* reccnt * sizeof(R); + + merges.push_back(task); + } + + return std::move(merges); + } + + + /* + * + */ + std::vector get_merge_tasks_from_level(size_t source_level) { + std::vector merges; + + level_index merge_base_level = find_mergable_level(source_level); + if (merge_base_level == -1) { + merge_base_level = grow(); + } + + for (level_index i=merge_base_level; i>source_level; i--) { + MergeTask task; + task.m_source_level = i - 1; + task.m_target_level = i; + + /* + * The amount of storage required for the merge accounts + * for the cost of storing the new records, along with the + * cost of retaining the old records during the process + * (hence the 2x multiplier). + * + * FIXME: currently does not account for the *actual* size + * of the shards, only the storage for the records + * themselves. + */ + size_t reccnt = m_levels[i-1]->get_record_count(); + if constexpr (L == LayoutPolicy::LEVELING) { + if (can_merge_with(i, reccnt)) { + reccnt += m_levels[i]->get_record_count(); + } + } + task.m_size = 2* reccnt * sizeof(R); + + merges.push_back(task); + } + + return std::move(merges); + } + + /* + * Merge the level specified by incoming level into the level specified + * by base level. The two levels should be sequential--i.e. no levels + * are skipped in the merge process--otherwise the tombstone ordering + * invariant may be violated by the merge operation. + */ + inline void merge_levels(level_index base_level, level_index incoming_level) { + // merging two memory levels + if constexpr (L == LayoutPolicy::LEVELING) { + auto tmp = m_levels[base_level]; + m_levels[base_level] = InternalLevel::merge_levels(m_levels[base_level].get(), m_levels[incoming_level].get()); + } else { + m_levels[base_level]->append_merged_shards(m_levels[incoming_level].get()); + m_levels[base_level]->finalize(); + } + + m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor)); + } + + +private: + size_t m_scale_factor; + double m_max_delete_prop; + size_t m_buffer_size; + + std::vector>> m_levels; + + /* + * Add a new level to the LSM Tree and return that level's index. Will + * automatically determine whether the level should be on memory or on disk, + * and act appropriately. 
+
+private:
+    size_t m_scale_factor;
+    double m_max_delete_prop;
+    size_t m_buffer_size;
+
+    std::vector<std::shared_ptr<InternalLevel<R, S, Q>>> m_levels;
+
+    /*
+     * Add a new level to the LSM Tree and return that level's index.
+     */
+    inline level_index grow() {
+        level_index new_idx = m_levels.size();
+        size_t new_shard_cnt = (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor;
+
+        m_levels.emplace_back(std::shared_ptr<InternalLevel<R, S, Q>>(new InternalLevel<R, S, Q>(new_idx, new_shard_cnt)));
+        return new_idx;
+    }
+
+    /*
+     * Find the first level below the level indicated by idx that
+     * is capable of sustaining a merge operation and return its
+     * level index. If no such level exists, returns -1. Also
+     * returns -1 if idx == 0 and no such level exists, to simplify
+     * the logic of the first merge.
+     */
+    inline level_index find_mergable_level(level_index idx, Buffer *buffer=nullptr) {
+
+        if (idx == 0 && m_levels.size() == 0) return -1;
+
+        bool level_found = false;
+        bool disk_level;
+        level_index merge_level_idx;
+
+        size_t incoming_rec_cnt = get_level_record_count(idx, buffer);
+        for (level_index i=idx+1; i<(level_index) m_levels.size(); i++) {
+            if (can_merge_with(i, incoming_rec_cnt)) {
+                return i;
+            }
+
+            incoming_rec_cnt = get_level_record_count(i);
+        }
+
+        return -1;
+    }
+
+    /*
+     * Merge the contents of the buffer into level 0, which must already
+     * have room for them (as determined by can_merge_with).
+     */
+    inline void merge_buffer_into_l0(Buffer *buffer) {
+        assert(m_levels[0]);
+        if constexpr (L == LayoutPolicy::LEVELING) {
+            auto old_level = m_levels[0].get();
+            auto temp_level = new InternalLevel<R, S, Q>(0, 1);
+            temp_level->append_buffer(buffer);
+            auto new_level = InternalLevel<R, S, Q>::merge_levels(old_level, temp_level);
+
+            m_levels[0] = new_level;
+            delete temp_level;
+        } else {
+            m_levels[0]->append_buffer(buffer);
+        }
+    }
+
+    /*
+     * Mark a given memory level as no longer in use by the tree. For now this
+     * will just free the level. In future, this will be more complex as the
+     * level may not be able to be immediately deleted, depending upon who
+     * else is using it.
+     */
+    inline void mark_as_unused(std::shared_ptr<InternalLevel<R, S, Q>> level) {
+        level.reset();
+    }
+
+    /*
+     * Determine the record capacity of the level at the given index,
+     * assuming that level 0 is larger than the buffer by a factor of
+     * scale_factor. The buffer itself is index -1, for which this simply
+     * returns the buffer capacity.
+     */
+    inline size_t calc_level_record_capacity(level_index idx) {
+        return m_buffer_size * pow(m_scale_factor, idx+1);
+    }
+
+    /*
+     * Returns the actual number of records present on a specified level. An
+     * index value of -1 indicates the buffer. Can optionally pass in
+     * a pointer to the buffer to use, if desired. Otherwise, there are
+     * no guarantees about which buffer will be accessed if level_index is -1.
+     */
+    inline size_t get_level_record_count(level_index idx, Buffer *buffer=nullptr) {
+        if (buffer) {
+            return buffer->get_record_count();
+        }
+
+        return (m_levels[idx]) ? m_levels[idx]->get_record_count() : 0;
+    }
+
+    /*
+     * Determines if the specified level can absorb a merge of
+     * incoming_rec_cnt additional records. The provided level index should
+     * be non-negative (i.e., not refer to the buffer).
+     */
+    inline bool can_merge_with(level_index idx, size_t incoming_rec_cnt) {
+        if (idx >= m_levels.size() || !m_levels[idx]) {
+            return false;
+        }
+
+        if (L == LayoutPolicy::LEVELING) {
+            return m_levels[idx]->get_record_count() + incoming_rec_cnt <= calc_level_record_capacity(idx);
+        } else {
+            return m_levels[idx]->get_shard_count() < m_scale_factor;
+        }
+
+        /* unreachable */
+        assert(false);
+    }
+};
+
+}
+
diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h
new file mode 100644
index 0000000..b9230f4
--- /dev/null
+++ b/include/framework/structure/InternalLevel.h
@@ -0,0 +1,258 @@
+/*
+ * include/framework/structure/InternalLevel.h
+ *
+ * Copyright (C) 2023 Douglas Rumbaugh
+ *                    Dong Xie
+ *
+ * All rights reserved. Published under the Modified BSD License.
+ *
+ */
+#pragma once
+
+#include <vector>
+#include <memory>
+
+#include "util/types.h"
+#include "framework/interface/Shard.h"
+#include "framework/interface/Query.h"
+#include "framework/interface/Record.h"
+#include "framework/structure/MutableBuffer.h"
+
+namespace de {
+template <RecordInterface R, ShardInterface S, QueryInterface Q>
+class InternalLevel;
+
+
+
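As a reading aid, the shard-provisioning flow that InternalLevel implements can be written as a standalone sketch. This is an editorial illustration, mirroring the merge_buffer_into_l0() kludge shown above under LEVELING (the function name leveled_flush is hypothetical; the template parameters are as reconstructed here):

    template <RecordInterface R, ShardInterface S, QueryInterface Q>
    std::shared_ptr<InternalLevel<R, S, Q>> leveled_flush(MutableBuffer<R> *buffer,
                                                          InternalLevel<R, S, Q> *l0) {
        /* wrap the buffer in a temporary, single-shard level... */
        auto temp = new InternalLevel<R, S, Q>(0, 1);
        temp->append_buffer(buffer);

        /* ...then merge it with the existing L0 to produce a new L0 */
        auto merged = InternalLevel<R, S, Q>::merge_levels(l0, temp);

        /* temp (and the shard it owns) can be dropped once merged */
        delete temp;
        return merged;
    }

Under TEIRING the temporary level is unnecessary: append_buffer() simply adds one more shard to L0 until its scale_factor shard slots are full.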
+template <RecordInterface R, ShardInterface S, QueryInterface Q>
+class InternalLevel {
+    typedef S Shard;
+    typedef MutableBuffer<R> Buffer;
+public:
+    InternalLevel(ssize_t level_no, size_t shard_cap)
+    : m_level_no(level_no)
+    , m_shard_cnt(0)
+    , m_shards(shard_cap, nullptr)
+    , m_owns(shard_cap, true)
+    , m_pending_shard(nullptr)
+    {}
+
+    // Create a new level sharing the shards of an existing one, repurposed
+    // as level level_no + 1.
+    // WARNING: for leveling only.
+    InternalLevel(InternalLevel* level)
+    : m_level_no(level->m_level_no + 1)
+    , m_shard_cnt(level->m_shard_cnt)
+    , m_shards(level->m_shards.size(), nullptr)
+    , m_owns(level->m_owns.size(), true)
+    , m_pending_shard(nullptr)
+    {
+        assert(m_shard_cnt == 1 && m_shards.size() == 1);
+
+        for (size_t i=0; i<m_shard_cnt; i++) {
+            level->m_owns[i] = false;
+            m_shards[i] = level->m_shards[i];
+        }
+    }
+
+    ~InternalLevel() {
+        for (size_t i=0; i<m_shards.size(); i++) {
+            if (m_owns[i]) delete m_shards[i];
+        }
+
+        delete m_pending_shard;
+    }
+
+    static std::shared_ptr<InternalLevel> merge_levels(InternalLevel* base_level, InternalLevel* new_level) {
+        assert(base_level->m_level_no > new_level->m_level_no || (base_level->m_level_no == 0 && new_level->m_level_no == 0));
+        auto res = new InternalLevel(base_level->m_level_no, 1);
+        res->m_shard_cnt = 1;
+        Shard* shards[2];
+        shards[0] = base_level->m_shards[0];
+        shards[1] = new_level->m_shards[0];
+
+        res->m_shards[0] = new S(shards, 2);
+        return std::shared_ptr<InternalLevel>(res);
+    }
+
+    void append_buffer(Buffer* buffer) {
+        if (m_shard_cnt == m_shards.size()) {
+            assert(m_pending_shard == nullptr);
+            m_pending_shard = new S(buffer);
+            return;
+        }
+
+        m_shards[m_shard_cnt] = new S(buffer);
+        m_owns[m_shard_cnt] = true;
+        ++m_shard_cnt;
+    }
+
+    void append_merged_shards(InternalLevel* level) {
+        if (m_shard_cnt == m_shards.size()) {
+            m_pending_shard = new S(level->m_shards.data(), level->m_shard_cnt);
+            return;
+        }
+
+        m_shards[m_shard_cnt] = new S(level->m_shards.data(), level->m_shard_cnt);
+        m_owns[m_shard_cnt] = true;
+
+        ++m_shard_cnt;
+    }
+
+    /*
+     * If a pending shard exists, drop the level's current shards and
+     * replace them with it.
+     */
+    void finalize() {
+        if (m_pending_shard) {
+            for (size_t i=0; i<m_shards.size(); i++) {
+                if (m_owns[i]) {
+                    delete m_shards[i];
+                }
+                m_shards[i] = nullptr;
+                m_owns[i] = false;
+            }
+
+            m_shards[0] = m_pending_shard;
+            m_owns[0] = true;
+            m_pending_shard = nullptr;
+            m_shard_cnt = 1;
+        }
+    }
+
+    void get_query_states(std::vector<std::pair<ShardID, Shard *>> &shards, std::vector<void *> &shard_states, void *query_parms) {
+        for (size_t i=0; i<m_shard_cnt; i++) {
+            if (m_shards[i]) {
+                auto shard_state = Q::get_query_state(m_shards[i], query_parms);
+                shards.push_back({{m_level_no, (ssize_t) i}, m_shards[i]});
+                shard_states.emplace_back(shard_state);
+            }
+        }
+    }
+
+    bool check_tombstone(size_t shard_stop, const R &rec) {
+        if (m_shard_cnt == 0) return false;
+
+        for (ssize_t i = m_shard_cnt - 1; i >= (ssize_t) shard_stop; i--) {
+            if (m_shards[i]) {
+                auto res = m_shards[i]->point_lookup(rec, true);
+                if (res && res->is_tombstone()) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    bool delete_record(const R &rec) {
+        if (m_shard_cnt == 0) return false;
+
+        for (size_t i = 0; i < m_shards.size(); ++i) {
+            if (m_shards[i]) {
+                auto res = m_shards[i]->point_lookup(rec);
+                if (res) {
+                    res->set_delete();
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+
+    Shard* get_shard(size_t idx) {
+        return m_shards[idx];
+    }
+
+    size_t get_shard_count() {
+        return m_shard_cnt;
+    }
+
+    size_t get_record_count() {
+        size_t cnt = 0;
+        for (size_t i=0; i<m_shard_cnt; i++) {
+            cnt += m_shards[i]->get_record_count();
+        }
+
+        return cnt;
+    }
+
+    size_t get_tombstone_count() {
+        size_t res = 0;
+        for (size_t i = 0; i < m_shard_cnt; ++i) {
+            res += m_shards[i]->get_tombstone_count();
+        }
+        return res;
+    }
+
+    size_t get_aux_memory_usage() {
+        size_t cnt = 0;
+        for (size_t i=0; i<m_shard_cnt; i++) {
+            cnt += m_shards[i]->get_aux_memory_usage();
+        }
+
+        return cnt;
+    }
+
+    size_t get_memory_usage() {
+        size_t cnt = 0;
+        for (size_t i=0; i<m_shard_cnt; i++) {
+            if (m_shards[i]) {
+                cnt += m_shards[i]->get_memory_usage();
+            }
+        }
+
+        return cnt;
+    }
+
+    double get_tombstone_prop() {
+        size_t tscnt = 0;
+        size_t reccnt = 0;
+        for (size_t i=0; i<m_shard_cnt; i++) {
+            if (m_shards[i]) {
+                tscnt += m_shards[i]->get_tombstone_count();
+                reccnt += m_shards[i]->get_record_count();
+ } + } + + return (double) tscnt / (double) (tscnt + reccnt); + } + +private: + ssize_t m_level_no; + + size_t m_shard_cnt; + size_t m_shard_size_cap; + + std::vector m_shards; + + Shard *m_pending_shard; + + std::vector m_owns; + + std::shared_ptr clone() { + auto new_level = std::make_shared(m_level_no, m_shards.size()); + for (size_t i=0; im_shards[i] = m_shards[i]; + new_level->m_owns[i] = true; + m_owns[i] = false; + } + + return new_level; + } +}; + +} diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h new file mode 100644 index 0000000..9f12175 --- /dev/null +++ b/include/framework/structure/MutableBuffer.h @@ -0,0 +1,242 @@ +/* + * include/framework/MutableBuffer.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "psu-util/alignment.h" +#include "util/bf_config.h" +#include "psu-ds/BloomFilter.h" +#include "psu-ds/Alias.h" +#include "psu-util/timer.h" +#include "framework/interface/Record.h" + +using psudb::CACHELINE_SIZE; + +namespace de { + +template +class MutableBuffer { +public: + MutableBuffer(size_t capacity, size_t max_tombstone_cap) + : m_cap(capacity), m_tombstone_cap(max_tombstone_cap), m_reccnt(0) + , m_tombstonecnt(0), m_weight(0), m_max_weight(0) { + m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); + m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); + m_tombstone_filter = nullptr; + if (max_tombstone_cap > 0) { + m_tombstone_filter = new psudb::BloomFilter(BF_FPR, max_tombstone_cap, BF_HASH_FUNCS); + } + + m_refcnt.store(0); + m_deferred_truncate.store(false); + m_merging.store(false); + } + + ~MutableBuffer() { + assert(m_refcnt.load() == 0); + assert(m_merging.load() == false); + + if (m_data) free(m_data); + if (m_tombstone_filter) delete m_tombstone_filter; + if (m_merge_data) free(m_merge_data); + } + + template + int append(const R &rec, bool tombstone=false) { + if (tombstone && m_tombstonecnt + 1 > m_tombstone_cap) return 0; + + int32_t pos = 0; + if ((pos = try_advance_tail()) == -1) return 0; + + Wrapped wrec; + wrec.rec = rec; + wrec.header = 0; + if (tombstone) wrec.set_tombstone(); + + m_data[pos] = wrec; + m_data[pos].header |= (pos << 2); + + if (tombstone) { + m_tombstonecnt.fetch_add(1); + if (m_tombstone_filter) m_tombstone_filter->insert(rec); + } + + if constexpr (WeightedRecordInterface) { + m_weight.fetch_add(rec.weight); + double old = m_max_weight.load(); + while (old < rec.weight) { + m_max_weight.compare_exchange_strong(old, rec.weight); + old = m_max_weight.load(); + } + } else { + m_weight.fetch_add(1); + } + + return 1; + } + + bool truncate() { + m_tombstonecnt.store(0); + m_reccnt.store(0); + m_weight.store(0); + m_max_weight.store(0); + if (m_tombstone_filter) m_tombstone_filter->clear(); + + return true; + } + + size_t get_record_count() { + return m_reccnt; + } + + size_t get_capacity() { + return m_cap; + } + + bool is_full() { + return m_reccnt == m_cap; + } + + size_t get_tombstone_count() { + return m_tombstonecnt.load(); + } + + bool delete_record(const R& rec) { + auto offset = 0; + while (offset < m_reccnt.load()) { + if (m_data[offset].rec == rec) { + m_data[offset].set_delete(); + return true; + } + offset++; + } + + return false; + } + + bool check_tombstone(const R& rec) { + if (m_tombstone_filter && 
!m_tombstone_filter->lookup(rec)) return false; + + auto offset = 0; + while (offset < m_reccnt.load()) { + if (m_data[offset].rec == rec && m_data[offset].is_tombstone()) { + return true; + } + offset++;; + } + return false; + } + + size_t get_memory_usage() { + return m_cap * sizeof(R); + } + + size_t get_aux_memory_usage() { + return m_tombstone_filter->get_memory_usage(); + } + + size_t get_tombstone_capacity() { + return m_tombstone_cap; + } + + double get_total_weight() { + return m_weight.load(); + } + + Wrapped *get_data() { + return m_data; + } + + double get_max_weight() { + return m_max_weight; + } + + bool start_merge() { + if (m_merge_lock.try_lock()) { + /* there cannot already been an active merge */ + if (m_merging.load()) { + m_merge_lock.unlock(); + return false; + } + + m_merging.store(true); + memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load()); + return true; + } + + /* lock could not be obtained */ + return false; + } + + bool finish_merge() { + m_merge_lock.unlock(); + return true; + } + + /* + * Concurrency-related operations + */ + bool take_reference() { + m_refcnt.fetch_add(1); + return true; + } + + bool release_reference() { + m_refcnt.fetch_add(-1); + + if (m_refcnt.load() == 0 && m_deferred_truncate.load()) { + assert(this->truncate()); + } + + return true; + } + + bool active_merge() { + return m_merging.load(); + } + +private: + int32_t try_advance_tail() { + size_t new_tail = m_reccnt.fetch_add(1); + + if (new_tail < m_cap) return new_tail; + else return -1; + } + + size_t m_cap; + size_t m_tombstone_cap; + + Wrapped* m_data; + Wrapped* m_merge_data; + + psudb::BloomFilter* m_tombstone_filter; + + alignas(64) std::atomic m_tombstonecnt; + alignas(64) std::atomic m_reccnt; + alignas(64) std::atomic m_weight; + alignas(64) std::atomic m_max_weight; + alignas(64) std::atomic m_merging; + alignas(64) std::atomic m_deferred_truncate; + alignas(64) std::atomic m_refcnt; + + alignas(64) std::mutex m_merge_lock; + alignas(64) std::mutex m_trunc_lock; + alignas(64) std::condition_variable m_trunc_signal; + +}; + +} diff --git a/include/framework/util/Configuration.h b/include/framework/util/Configuration.h new file mode 100644 index 0000000..eb9b93f --- /dev/null +++ b/include/framework/util/Configuration.h @@ -0,0 +1,54 @@ +/* + * include/framework/DynamicExtension.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include + +#include "psu-util/timer.h" +#include "psu-ds/Alias.h" + +namespace de { + +thread_local size_t sampling_attempts = 0; +thread_local size_t sampling_rejections = 0; +thread_local size_t deletion_rejections = 0; +thread_local size_t bounds_rejections = 0; +thread_local size_t tombstone_rejections = 0; +thread_local size_t buffer_rejections = 0; + +/* + * thread_local size_t various_sampling_times go here. 
+ */ +thread_local size_t sample_range_time = 0; +thread_local size_t alias_time = 0; +thread_local size_t alias_query_time = 0; +thread_local size_t rejection_check_time = 0; +thread_local size_t buffer_sample_time = 0; +thread_local size_t memlevel_sample_time = 0; +thread_local size_t disklevel_sample_time = 0; +thread_local size_t sampling_bailouts = 0; + + +enum class LayoutPolicy { + LEVELING, + TEIRING +}; + +enum class DeletePolicy { + TOMBSTONE, + TAGGING +}; + +typedef ssize_t level_index; + +} diff --git a/include/shard/Alex.h b/include/shard/Alex.h deleted file mode 100644 index 9f794dc..0000000 --- a/include/shard/Alex.h +++ /dev/null @@ -1,360 +0,0 @@ -/* - * include/shard/Alex.h - * - * Copyright (C) 2023 Douglas B. Rumbaugh - * - * All rights reserved. Published under the Modified BSD License. - * - */ -#pragma once - - -#include -#include -#include -#include -#include - -#include "alex.h" -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" -#include "psu-ds/BloomFilter.h" -#include "util/bf_config.h" -#include "framework/MutableBuffer.h" -#include "framework/RecordInterface.h" -#include "framework/ShardInterface.h" -#include "framework/QueryInterface.h" - -using psudb::CACHELINE_SIZE; -using psudb::BloomFilter; -using psudb::PriorityQueue; -using psudb::queue_record; -using psudb::Alias; - -namespace de { - -template -struct alex_range_query_parms { - decltype(R::key) lower_bound; - decltype(R::key) upper_bound; -}; - -template -class AlexRangeQuery; - -template -struct AlexState { - size_t start_idx; - size_t stop_idx; -}; - -template -struct AlexBufferState { - size_t cutoff; - Alias* alias; - - ~AlexBufferState() { - delete alias; - } -}; - - -template -class Alex { -private: - typedef decltype(R::key) K; - typedef decltype(R::value) V; - -public: - - // FIXME: there has to be a better way to do this - friend class AlexRangeQuery; - - Alex(MutableBuffer* buffer) - : m_reccnt(0), m_tombstone_cnt(0) { - - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - std::vector> temp_records; - - m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); - - size_t offset = 0; - m_reccnt = 0; - auto base = buffer->get_data(); - auto stop = base + buffer->get_record_count(); - - std::sort(base, stop, std::less>()); - - K min_key = base->rec.key; - K max_key = (stop - 1)->rec.key; - - while (base < stop) { - if (!(base->is_tombstone()) && (base + 1) < stop) { - if (base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - continue; - } - } else if (base->is_deleted()) { - base += 1; - continue; - } - - // FIXME: this shouldn't be necessary, but the tagged record - // bypass doesn't seem to be working on this code-path, so this - // ensures that tagged records from the buffer are able to be - // dropped, eventually. 
It should only need to be &= 1 - base->header &= 3; - m_data[m_reccnt++] = *base; - temp_records.push_back({base->rec.key, base->rec.value}); - - if (m_bf && base->is_tombstone()) { - m_tombstone_cnt++; - m_bf->insert(base->rec); - } - - base++; - } - - if (m_reccnt > 0) { - m_alex = alex::Alex(); - m_alex.set_expected_insert_frac(0); - m_alex.bulkload(temp_records.data(), temp_records.size()); - } - } - - Alex(Alex** shards, size_t len) - : m_reccnt(0), m_tombstone_cnt(0) { - std::vector>> cursors; - cursors.reserve(len); - - PriorityQueue> pq(len); - - size_t attemp_reccnt = 0; - size_t tombstone_count = 0; - - for (size_t i = 0; i < len; ++i) { - if (shards[i]) { - auto base = shards[i]->get_data(); - cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); - attemp_reccnt += shards[i]->get_record_count(); - tombstone_count += shards[i]->get_tombstone_count(); - pq.push(cursors[i].ptr, i); - - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - } - - m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); - - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - std::vector> temp_records; - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[now.version]; - auto& cursor2 = cursors[next.version]; - if (advance_cur5sor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[now.version]; - if (!cursor.ptr->is_deleted()) { - m_data[m_reccnt++] = *cursor.ptr; - temp_records.pushback({cursor.ptr->rec.key, cursor.ptr->rec.value}); - if (m_bf && cursor.ptr->is_tombstone()) { - ++m_tombstone_cnt; - if (m_bf) m_bf->insert(cursor.ptr->rec); - } - } - pq.pop(); - - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); - } - } - - if (m_reccnt > 0) { - m_alex = alex::Alex(); - m_alex.set_expected_insert_frac(0); - m_alex.bulkload(temp_records.data(), temp_records.size()); - } - } - - ~Alex() { - if (m_data) free(m_data); - if (m_bf) delete m_bf; - - } - - Wrapped *point_lookup(const R &rec, bool filter=false) { - if (filter && !m_bf->lookup(rec)) { - return nullptr; - } - - size_t idx = get_lower_bound(rec.key); - if (idx >= m_reccnt) { - return nullptr; - } - - while (idx < m_reccnt && m_data[idx].rec < rec) ++idx; - - if (m_data[idx].rec == rec) { - return m_data + idx; - } - - return nullptr; - } - - Wrapped* get_data() const { - return m_data; - } - - size_t get_record_count() const { - return m_reccnt; - } - - size_t get_tombstone_count() const { - return m_tombstone_cnt; - } - - const Wrapped* get_record_at(size_t idx) const { - if (idx >= m_reccnt) return nullptr; - return m_data + idx; - } - - - size_t get_memory_usage() { - return m_alex.size_in_bytes() + m_alloc_size; - } - - alex::Alex::Iterator get_lower_bound(const K& key) const { - auto bound = m_alex.find(key); - while (bound != m_alex.end() && bound.key() < key) { - bound++; - } - - return bound; - } - -private: - Wrapped* m_data; - size_t m_reccnt; - size_t m_tombstone_cnt; - size_t m_alloc_size; - K m_max_key; - K m_min_key; - 
alex::Alex m_alex; - BloomFilter *m_bf; -}; - - -template -class AlexRangeQuery { -public: - static void *get_query_state(Alex *ts, void *parms) { - auto res = new AlexState(); - auto p = (alex_range_query_parms *) parms; - - res->start_idx = ts->get_lower_bound(p->lower_bound); - res->stop_idx = ts->get_record_count(); - - return res; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - auto res = new AlexBufferState(); - res->cutoff = buffer->get_record_count(); - - return res; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { - return; - } - - static std::vector> query(Alex *ts, void *q_state, void *parms) { - std::vector> records; - auto p = (alex_range_query_parms *) parms; - auto s = (AlexState *) q_state; - - // if the returned index is one past the end of the - // records for the Alex, then there are not records - // in the index falling into the specified range. - if (s->start_idx == ts->get_record_count()) { - return records; - } - - auto ptr = ts->get_record_at(s->start_idx); - - // roll the pointer forward to the first record that is - // greater than or equal to the lower bound. - while(ptr->rec.key < p->lower_bound) { - ptr++; - } - - while (ptr->rec.key <= p->upper_bound && ptr < ts->m_data + s->stop_idx) { - records.emplace_back(*ptr); - ptr++; - } - - return records; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - auto p = (alex_range_query_parms *) parms; - auto s = (AlexBufferState *) state; - - std::vector> records; - for (size_t i=0; icutoff; i++) { - auto rec = buffer->get_data() + i; - if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { - records.emplace_back(*rec); - } - } - - return records; - } - - static std::vector merge(std::vector> &results, void *parms) { - size_t total = 0; - for (size_t i=0; i(); - } - - std::vector output; - output.reserve(total); - - for (size_t i=0; i *) state; - delete s; - } - - static void delete_buffer_query_state(void *state) { - auto s = (AlexBufferState *) state; - delete s; - } -}; - -; - -} diff --git a/include/shard/MemISAM.h b/include/shard/MemISAM.h index a220792..f9c621e 100644 --- a/include/shard/MemISAM.h +++ b/include/shard/MemISAM.h @@ -14,7 +14,8 @@ #include #include -#include "framework/MutableBuffer.h" +#include "framework/ShardRequirements.h" + #include "util/bf_config.h" #include "psu-ds/PriorityQueue.h" #include "util/Cursor.h" diff --git a/include/shard/PGM.h b/include/shard/PGM.h index 2cd153e..d960e70 100644 --- a/include/shard/PGM.h +++ b/include/shard/PGM.h @@ -15,15 +15,13 @@ #include #include +#include "framework/ShardRequirements.h" + #include "pgm/pgm_index.hpp" #include "psu-ds/PriorityQueue.h" #include "util/Cursor.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" -#include "framework/MutableBuffer.h" -#include "framework/RecordInterface.h" -#include "framework/ShardInterface.h" -#include "framework/QueryInterface.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; diff --git a/include/shard/TrieSpline.h b/include/shard/TrieSpline.h index 69fcfbc..98153c0 100644 --- a/include/shard/TrieSpline.h +++ b/include/shard/TrieSpline.h @@ -15,15 +15,12 @@ #include #include +#include "framework/ShardRequirements.h" #include "ts/builder.h" #include "psu-ds/PriorityQueue.h" #include "util/Cursor.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" -#include "framework/MutableBuffer.h" -#include "framework/RecordInterface.h" -#include 
"framework/ShardInterface.h" -#include "framework/QueryInterface.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h index 8feec84..0e998d9 100644 --- a/include/shard/VPTree.h +++ b/include/shard/VPTree.h @@ -15,14 +15,12 @@ #include #include +#include "framework/ShardRequirements.h" + #include "psu-ds/PriorityQueue.h" #include "util/Cursor.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" -#include "framework/MutableBuffer.h" -#include "framework/RecordInterface.h" -#include "framework/ShardInterface.h" -#include "framework/QueryInterface.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; diff --git a/include/shard/WIRS.h b/include/shard/WIRS.h index 19d3eea..8583cb0 100644 --- a/include/shard/WIRS.h +++ b/include/shard/WIRS.h @@ -16,15 +16,13 @@ #include #include +#include "framework/ShardRequirements.h" + #include "psu-ds/PriorityQueue.h" #include "util/Cursor.h" #include "psu-ds/Alias.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" -#include "framework/MutableBuffer.h" -#include "framework/RecordInterface.h" -#include "framework/ShardInterface.h" -#include "framework/QueryInterface.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; diff --git a/include/shard/WSS.h b/include/shard/WSS.h index c0af573..87b016c 100644 --- a/include/shard/WSS.h +++ b/include/shard/WSS.h @@ -16,15 +16,13 @@ #include #include +#include "framework/ShardRequirements.h" + #include "psu-ds/PriorityQueue.h" #include "util/Cursor.h" #include "psu-ds/Alias.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" -#include "framework/MutableBuffer.h" -#include "framework/RecordInterface.h" -#include "framework/ShardInterface.h" -#include "framework/QueryInterface.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; diff --git a/include/util/Cursor.h b/include/util/Cursor.h index 1b0b8ed..1cf20e1 100644 --- a/include/util/Cursor.h +++ b/include/util/Cursor.h @@ -9,7 +9,7 @@ */ #pragma once -#include "framework/RecordInterface.h" +#include "framework/ShardRequirements.h" #include "psu-ds/BloomFilter.h" #include "psu-ds/PriorityQueue.h" -- cgit v1.2.3 From 1a47cbd7978dcad7ed0b2f2af3f933137eedbfa3 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 20 Oct 2023 15:12:46 -0400 Subject: Checkpointing work I'll probably throw all this out, but I want to stash it just in case. --- include/framework/scheduling/SerialScheduler.h | 65 +++++++++++++------------- include/framework/scheduling/Task.h | 52 +++++++++++++++++++++ 2 files changed, 84 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/framework/scheduling/SerialScheduler.h b/include/framework/scheduling/SerialScheduler.h index 5e16bdf..c43e930 100644 --- a/include/framework/scheduling/SerialScheduler.h +++ b/include/framework/scheduling/SerialScheduler.h @@ -1,7 +1,7 @@ /* * include/framework/Scheduler.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. 
@@ -14,6 +14,7 @@ #include #include #include +#include #include "util/types.h" #include "framework/interface/Shard.h" @@ -24,6 +25,8 @@ #include "framework/structure/ExtensionStructure.h" #include "framework/scheduling/Task.h" +#include "psu-ds/LockedPriorityQueue.h" + namespace de { template @@ -73,31 +76,21 @@ public: std::vector merges = version->get_merge_tasks(buffer->get_record_count()); /* - * Schedule the merge tasks (FIXME: currently this just - * executes them sequentially in a blocking fashion) + * Schedule the merge tasks */ for (ssize_t i=0; iget_record_count() * sizeof(R) * 2; - buffer_merge.m_timestamp = m_timestamp.fetch_add(1); - buffer_merge.m_type = TaskType::MERGE; - m_merge_queue_lock.lock(); - m_merge_queue.emplace(buffer_merge); - m_merge_queue_lock.unlock(); + auto t = MergeTask(-1, 0, buffer->get_record_count() * sizeof(R) * 2, m_timestamp.fetch_add(1)); + m_task_queue.push(t); m_cv.notify_all(); do { std::unique_lock merge_cv_lock(m_merge_cv_lock); m_merge_cv.wait(merge_cv_lock); - } while (m_merge_queue.size() > 0); + } while (m_task_queue.size() > 0); assert(version->get_levels()[version->get_levels().size() - 1]->get_shard(0)->get_tombstone_count() == 0); @@ -128,10 +121,7 @@ private: } void schedule_next_task() { - m_merge_queue_lock.lock(); - auto task = m_merge_queue.top(); - m_merge_queue.pop(); - m_merge_queue_lock.unlock(); + auto task = m_task_queue.pop(); auto type = std::visit(GetTaskType{}, task); @@ -145,7 +135,7 @@ private: default: assert(false); } - if (m_merge_queue.size() == 0) { + if (m_task_queue.size() == 0) { m_merge_cv.notify_all(); } } @@ -157,15 +147,27 @@ private: if (!version->validate_tombstone_proportion(task.m_target_level)) { auto tasks = version->get_merge_tasks(task.m_target_level); /* - * Schedule the merge tasks (FIXME: currently this just - * executes them sequentially in a blocking fashion) + * Schedule the merge tasks */ - for (ssize_t i=tasks.size()-1; i>=0; i--) { + std::promise trigger_prom; + tasks[tasks.size() - 1].make_dependent_on(trigger_prom); + tasks[tasks.size() - 1].m_timestamp = m_timestamp.fetch_add(1); + m_task_queue.push(tasks[tasks.size() - 1]); + + for (ssize_t i=tasks.size()-2; i>=0; i--) { + tasks[i].make_dependent_on(tasks[i+1]); tasks[i].m_timestamp = m_timestamp.fetch_add(1); - m_merge_queue_lock.lock(); - m_merge_queue.push(tasks[i]); - m_merge_queue_lock.unlock(); + m_task_queue.push(tasks[i]); } + + /* + * Block the completion of any task until all have been + * scheduled. Probably not strictly necessary, but due to + * interface constraints with the way promises are used, + * a dummy promise needs to be set up for the first job + * anyway. It's easiest to just release it here. 
+ */ + trigger_prom.set_value(); } } @@ -181,9 +183,7 @@ private: */ for (ssize_t i=tasks.size()-1; i>=0; i--) { tasks[i].m_timestamp = m_timestamp.fetch_add(1); - m_merge_queue_lock.lock(); - m_merge_queue.push(tasks[i]); - m_merge_queue_lock.unlock(); + m_task_queue.push(tasks[i]); } } } @@ -193,7 +193,7 @@ private: std::unique_lock cv_lock(m_cv_lock); m_cv.wait(cv_lock); - while (m_merge_queue.size() > 0 && m_used_threads.load() < m_thread_cnt) { + while (m_task_queue.size() > 0 && m_used_threads.load() < m_thread_cnt) { schedule_next_task(); } cv_lock.unlock(); @@ -210,8 +210,7 @@ private: alignas(64) std::atomic m_used_threads; alignas(64) std::atomic m_timestamp; - std::priority_queue, std::greater> m_merge_queue; - std::mutex m_merge_queue_lock; + psudb::LockedPriorityQueue, std::greater> m_task_queue; std::mutex m_cv_lock; std::condition_variable m_cv; diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index 9e0655a..3c1b158 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include "framework/util/Configuration.h" @@ -14,17 +15,52 @@ enum class TaskType { QUERY }; +struct TaskDependency { + std::promise prom; + std::future fut; +}; + struct MergeTask { level_index m_source_level; level_index m_target_level; size_t m_timestamp; size_t m_size; TaskType m_type; + std::unique_ptr m_dep; + + MergeTask() = default; + + MergeTask(level_index source, level_index target, size_t size, size_t timestamp) + : m_source_level(source) + , m_target_level(target) + , m_timestamp(timestamp) + , m_size(size) + , m_type(TaskType::MERGE) + , m_dep(std::make_unique()){} + + + MergeTask(MergeTask &t) + : m_source_level(t.m_source_level) + , m_target_level(t.m_target_level) + , m_timestamp(t.m_timestamp) + , m_size(t.m_size) + , m_type(TaskType::MERGE) + , m_dep(std::move(t.m_dep)) + {} + TaskType get_type() const { return m_type; } + void make_dependent_on(MergeTask &task) { + m_dep->fut = task.m_dep->prom.get_future(); + } + + void make_dependent_on(TaskDependency *dep) { + m_dep->fut = dep->prom.get_future(); + } + friend bool operator<(const MergeTask &self, const MergeTask &other) { return self.m_timestamp < other.m_timestamp; } @@ -39,11 +75,27 @@ struct QueryTask { size_t m_timestamp; size_t m_size; TaskType m_type; + std::unique_ptr m_dep; + + QueryTask(QueryTask &t) + : m_timestamp(t.m_timestamp) + , m_size(t.m_size) + , m_type(t.m_type) + , m_dep(std::move(t.m_dep)) + {} TaskType get_type() const { return m_type; } + void SetDependency(QueryTask &task) { + m_dep->fut = task.m_dep->prom.get_future(); + } + + void SetDependency(TaskDependency *dep) { + m_dep->fut = dep->prom.get_future(); + } + friend bool operator<(const QueryTask &self, const QueryTask &other) { return self.m_timestamp < other.m_timestamp; } -- cgit v1.2.3 From 7ecfb22c32b7986ed1a2439c1abbeed298e4153a Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 20 Oct 2023 17:00:42 -0400 Subject: Initial pass w/ new scheduler setup currently there's a race condition of some type to sort out. 
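The core of the new design reduces every unit of work to a generic Job (a std::function taking an opaque argument pointer) wrapped in a timestamp-ordered Task, with completion signaled back to the caller through a promise/future pair. A distilled, self-contained model of that pattern (editorial sketch; names are simplified from the Task.h below, and the real scheduler uses psudb::LockedPriorityQueue plus a dedicated scheduler thread rather than a plain queue in main):

    #include <cstddef>
    #include <functional>
    #include <future>
    #include <queue>
    #include <vector>

    typedef std::function<void(void*)> Job;

    struct Task {
        size_t timestamp;
        Job job;
        void *args;

        /* std::greater + priority_queue gives a min-heap: oldest first */
        friend bool operator>(const Task &a, const Task &b) {
            return a.timestamp > b.timestamp;
        }
    };

    struct MergeArgs {
        std::promise<bool> result;
    };

    int main() {
        std::priority_queue<Task, std::vector<Task>, std::greater<Task>> queue;

        MergeArgs args;
        std::future<bool> done = args.result.get_future();

        /* the scheduler dispatches jobs in timestamp (FIFO) order... */
        queue.push(Task{0, [](void *a) {
            /* ... perform the merge ... */
            static_cast<MergeArgs *>(a)->result.set_value(true);
        }, &args});

        while (!queue.empty()) {
            Task t = queue.top();
            queue.pop();
            t.job(t.args);
        }

        /* ...while the caller blocks on the future, as internal_append()
         * does with res.get() */
        return !done.get();
    }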
--- include/framework/DynamicExtension.h | 184 +++++++++++++--------- include/framework/interface/Scheduler.h | 17 +- include/framework/scheduling/SerialScheduler.h | 191 ++++------------------- include/framework/scheduling/Task.h | 112 ++++--------- include/framework/structure/ExtensionStructure.h | 21 ++- include/framework/structure/MutableBuffer.h | 11 ++ include/framework/util/Configuration.h | 1 + 7 files changed, 196 insertions(+), 341 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 3a460aa..fc7922c 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -31,7 +31,8 @@ namespace de { -template > +template class DynamicExtension { typedef S Shard; typedef MutableBuffer Buffer; @@ -83,68 +84,8 @@ public: return internal_append(rec, true); } - std::vector query(void *parms) { - auto buffer = get_buffer(); - auto vers = get_active_version(); - - // Get the buffer query state - auto buffer_state = Q::get_buffer_query_state(buffer, parms); - - // Get the shard query states - std::vector> shards; - std::vector states; - - for (auto &level : vers->get_levels()) { - level->get_query_states(shards, states, parms); - } - - Q::process_query_states(parms, states, buffer_state); - - std::vector>> query_results(shards.size() + 1); - - // Execute the query for the buffer - auto buffer_results = Q::buffer_query(buffer, buffer_state, parms); - query_results[0] = std::move(filter_deletes(buffer_results, {-1, -1}, buffer, vers)); - if constexpr (Q::EARLY_ABORT) { - if (query_results[0].size() > 0) { - auto result = Q::merge(query_results, parms); - for (size_t i=0; i 0) { - auto result = Q::merge(query_results, parms); - for (size_t i=0; i> query(void *parms) { + return schedule_query(get_active_version(), get_buffer(), parms); } size_t get_record_count() { @@ -239,6 +180,112 @@ private: return m_versions[0]; } + static void merge(void *arguments) { + MergeArgs *args = (MergeArgs *) arguments; + + Structure *vers = (Structure *) args->version; + Buffer *buff = (Buffer *) args->buffer; + + for (ssize_t i=args->merges.size() - 1; i>=0; i--) { + vers->merge_levels(args->merges[i].second, args->merges[i].first); + } + + vers->merge_buffer(buff); + + args->result.set_value(true); + delete args; + } + + static void async_query(void *arguments) { + QueryArgs *args = (QueryArgs *) arguments; + + auto buffer = (Buffer *) args->buffer; + auto vers = (Structure *) args->version; + void *parms = args->query_parms; + + // Get the buffer query state + auto buffer_state = Q::get_buffer_query_state(buffer, parms); + + // Get the shard query states + std::vector> shards; + std::vector states; + + for (auto &level : vers->get_levels()) { + level->get_query_states(shards, states, parms); + } + + Q::process_query_states(parms, states, buffer_state); + + std::vector>> query_results(shards.size() + 1); + + // Execute the query for the buffer + auto buffer_results = Q::buffer_query(buffer, buffer_state, parms); + query_results[0] = std::move(filter_deletes(buffer_results, {-1, -1}, buffer, vers)); + if constexpr (Q::EARLY_ABORT) { + if (query_results[0].size() > 0) { + auto result = Q::merge(query_results, parms); + for (size_t i=0; i 0) { + auto result = Q::merge(query_results, parms); + for (size_t i=0; iresult_set.set_value(std::move(result)); + delete args; + } + + std::future schedule_merge(Structure *version, Buffer *buffer) { + MergeArgs *args = new MergeArgs(); + args->merges = 
version->get_merge_tasks(buffer->get_record_count()); + args->buffer = buffer; + args->version = version; + + m_sched.schedule_job(merge, 0, args); + + return args->result.get_future(); + } + + std::future> schedule_query(Structure *version, Buffer *buffer, void *query_parms) { + QueryArgs *args = new QueryArgs(); + args->buffer = buffer; + args->version = version; + args->buffer = query_parms; + + m_sched.schedule_job(async_query, 0, args); + + return args->result_set.get_future(); + } + int internal_append(const R &rec, bool ts) { Buffer *buffer; while (!(buffer = get_buffer())) @@ -246,13 +293,15 @@ private: if (buffer->is_full()) { auto vers = get_active_version(); - m_sched.schedule_merge(vers, buffer); + auto res = schedule_merge(vers, buffer); + res.get(); } + return buffer->append(rec, ts); } - std::vector> filter_deletes(std::vector> &records, ShardID shid, Buffer *buffer, Structure *vers) { + static std::vector> filter_deletes(std::vector> &records, ShardID shid, Buffer *buffer, Structure *vers) { if constexpr (!Q::SKIP_DELETE_FILTER) { return records; } @@ -303,10 +352,5 @@ private: return processed_records; } }; - -template -static void de_merge_callback(DynamicExtension extension, ExtensionStructure new_version) { - -} } diff --git a/include/framework/interface/Scheduler.h b/include/framework/interface/Scheduler.h index 1445e90..e8ffd08 100644 --- a/include/framework/interface/Scheduler.h +++ b/include/framework/interface/Scheduler.h @@ -12,20 +12,11 @@ #include #include "framework/interface/Record.h" #include "util/types.h" +#include "framework/scheduling/Task.h" template -concept SchedulerInterface = requires(S s, size_t i, void *vp) { +concept SchedulerInterface = requires(S s, size_t i, void *vp, de::Job j) { {S(i, i)}; -// {s.schedule_merge(vp, vp)}; - -/* - {q.get_query_state(p, p)} -> std::convertible_to; - {q.get_buffer_query_state(p, p)}; - {q.query(p, p)}; - {q.buffer_query(p, p)}; - {q.merge()}; - {q.delete_query_state(p)}; -*/ - //{Q::get_query_state(p, p)} -> std::convertible_to; - //{Q::get_buffer_query_state(p, p)} -> std::convertible_to; + {s.schedule_job(j, i, vp)} -> std::convertible_to; + {s.shutdown()}; }; diff --git a/include/framework/scheduling/SerialScheduler.h b/include/framework/scheduling/SerialScheduler.h index c43e930..5d6e5c2 100644 --- a/include/framework/scheduling/SerialScheduler.h +++ b/include/framework/scheduling/SerialScheduler.h @@ -29,198 +29,67 @@ namespace de { -template class SerialScheduler { - typedef ExtensionStructure Structure; - typedef MutableBuffer Buffer; public: - /* - * A simple "scheduler" that runs tasks serially, in a FIFO manner. Incoming concurrent - * requests will wait for their turn, and only one task will be active in the system at - * a time. The scheduler will spin up a second thread for running itself, but all tasks - * will be single-threaded. - * - * Memory budget stated in bytes, with 0 meaning unlimited. Likewise, 0 threads means - * unlimited. - * - * Note that the SerialScheduler object is non-concurrent, and so will ignore the - * thread_cnt argument. It will obey the memory_budget, however a failure due to - * memory constraints will be irrecoverable, as there is no way to free up memory - * or block particular tasks until memory becomes available. - */ - SerialScheduler(size_t memory_budget, size_t thread_cnt) - : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) - , m_thread_cnt((thread_cnt) ? 
thread_cnt : UINT64_MAX) - , m_used_memory(0) - , m_used_threads(0) - , m_shutdown(false) - { - m_sched_thrd = std::thread(&SerialScheduler::run_scheduler, this); + SerialScheduler(size_t memory_budget, size_t thread_cnt) + : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) + , m_thrd_cnt((thread_cnt) ? thread_cnt: UINT64_MAX) + , m_used_memory(0) + , m_used_thrds(0) + , m_shutdown(false) + { + m_sched_thrd = std::thread(&SerialScheduler::run, this); } ~SerialScheduler() { - m_shutdown = true; + shutdown(); m_cv.notify_all(); m_sched_thrd.join(); } - bool schedule_merge(Structure *version, MutableBuffer *buffer) { - pending_version = version; - pending_buffer = buffer; - - /* - * Get list of individual level reconstructions that are necessary - * for completing the overall merge - */ - std::vector merges = version->get_merge_tasks(buffer->get_record_count()); - - /* - * Schedule the merge tasks - */ - for (ssize_t i=0; iget_record_count() * sizeof(R) * 2, m_timestamp.fetch_add(1)); - m_task_queue.push(t); - - m_cv.notify_all(); - do { - std::unique_lock merge_cv_lock(m_merge_cv_lock); - m_merge_cv.wait(merge_cv_lock); - } while (m_task_queue.size() > 0); - - assert(version->get_levels()[version->get_levels().size() - 1]->get_shard(0)->get_tombstone_count() == 0); - - return true; + void schedule_job(std::function job, size_t size, void *args) { + size_t ts = m_counter.fetch_add(1); + m_task_queue.push(Task(size, ts, job, args)); } - bool schedule_query() { - return true; + void shutdown() { + m_shutdown = true; } private: - size_t get_timestamp() { - auto ts = m_timestamp.fetch_add(1); - return ts; - } - - void schedule_merge(MergeTask task) { - if (task.m_source_level == -1 && task.m_target_level == 0) { - run_buffer_merge(pending_buffer, pending_version); - } else { - run_merge(task, pending_version); - } - } - - - void schedule_query(QueryTask task) { - - } - - void schedule_next_task() { - auto task = m_task_queue.pop(); - - auto type = std::visit(GetTaskType{}, task); - - switch (type) { - case TaskType::MERGE: - schedule_merge(std::get(task)); - break; - case TaskType::QUERY: - schedule_query(std::get(task)); - break; - default: assert(false); - } - - if (m_task_queue.size() == 0) { - m_merge_cv.notify_all(); - } - } + psudb::LockedPriorityQueue m_task_queue; + size_t m_memory_budget; + size_t m_thrd_cnt; - void run_merge(MergeTask task, Structure *version) { - version->merge_levels(task.m_target_level, task.m_source_level); - - if (!version->validate_tombstone_proportion(task.m_target_level)) { - auto tasks = version->get_merge_tasks(task.m_target_level); - /* - * Schedule the merge tasks - */ - std::promise trigger_prom; - tasks[tasks.size() - 1].make_dependent_on(trigger_prom); - tasks[tasks.size() - 1].m_timestamp = m_timestamp.fetch_add(1); - m_task_queue.push(tasks[tasks.size() - 1]); - - for (ssize_t i=tasks.size()-2; i>=0; i--) { - tasks[i].make_dependent_on(tasks[i+1]); - tasks[i].m_timestamp = m_timestamp.fetch_add(1); - m_task_queue.push(tasks[i]); - } + bool m_shutdown; - /* - * Block the completion of any task until all have been - * scheduled. Probably not strictly necessary, but due to - * interface constraints with the way promises are used, - * a dummy promise needs to be set up for the first job - * anyway. It's easiest to just release it here. 
- */ - trigger_prom.set_value(); - } - } + std::atomic m_counter; + std::mutex m_cv_lock; + std::condition_variable m_cv; + std::thread m_sched_thrd; - void run_buffer_merge(Buffer *buffer, Structure *version) { - version->merge_buffer(buffer); - if (!version->validate_tombstone_proportion(0)) { - auto tasks = version->get_merge_tasks_from_level(0); + std::atomic m_used_thrds; + std::atomic m_used_memory; - /* - * Schedule the merge tasks (FIXME: currently this just - * executes them sequentially in a blocking fashion) - */ - for (ssize_t i=tasks.size()-1; i>=0; i--) { - tasks[i].m_timestamp = m_timestamp.fetch_add(1); - m_task_queue.push(tasks[i]); - } - } + void schedule_next() { + auto t = m_task_queue.pop(); + t(); } - void run_scheduler() { + void run() { do { std::unique_lock cv_lock(m_cv_lock); m_cv.wait(cv_lock); - while (m_task_queue.size() > 0 && m_used_threads.load() < m_thread_cnt) { - schedule_next_task(); + while (m_task_queue.size() > 0 && m_used_thrds.load() < m_thrd_cnt) { + schedule_next(); } cv_lock.unlock(); } while(!m_shutdown); } - - size_t m_memory_budget; - size_t m_thread_cnt; - - Buffer *pending_buffer; - Structure *pending_version; - - alignas(64) std::atomic m_used_memory; - alignas(64) std::atomic m_used_threads; - alignas(64) std::atomic m_timestamp; - - psudb::LockedPriorityQueue, std::greater> m_task_queue; - - std::mutex m_cv_lock; - std::condition_variable m_cv; - - std::mutex m_merge_cv_lock; - std::condition_variable m_merge_cv; - - std::thread m_sched_thrd; - - bool m_shutdown; }; } diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index 3c1b158..518159d 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -5,111 +5,53 @@ #include #include +#include #include "framework/util/Configuration.h" namespace de { -enum class TaskType { - MERGE, - QUERY +struct MergeArgs { + void *version; + void *buffer; + std::vector merges; + std::promise result; }; -struct TaskDependency { - std::promise prom; - std::future fut; +template +struct QueryArgs { + void *version; + void *buffer; + std::promise> result_set; + void *query_parms; }; -struct MergeTask { - level_index m_source_level; - level_index m_target_level; - size_t m_timestamp; - size_t m_size; - TaskType m_type; - std::unique_ptr m_dep; - - MergeTask() = default; - - MergeTask(level_index source, level_index target, size_t size, size_t timestamp) - : m_source_level(source) - , m_target_level(target) - , m_timestamp(timestamp) - , m_size(size) - , m_type(TaskType::MERGE) - , m_dep(std::make_unique()){} - +typedef std::function Job; - MergeTask(MergeTask &t) - : m_source_level(t.m_source_level) - , m_target_level(t.m_target_level) - , m_timestamp(t.m_timestamp) - , m_size(t.m_size) - , m_type(TaskType::MERGE) - , m_dep(std::move(t.m_dep)) +struct Task { + Task(size_t size, size_t ts, Job job, void *args) + : m_job(job) + , m_size(size) + , m_timestamp(ts) + , m_args(args) {} - - TaskType get_type() const { - return m_type; - } - - void make_dependent_on(MergeTask &task) { - m_dep->fut = task.m_dep->prom.get_future(); - } - - void make_dependent_on(TaskDependency *dep) { - m_dep->fut = dep->prom.get_future(); - } - - friend bool operator<(const MergeTask &self, const MergeTask &other) { - return self.m_timestamp < other.m_timestamp; - } - - friend bool operator>(const MergeTask &self, const MergeTask &other) { - return self.m_timestamp > other.m_timestamp; - } - -}; - -struct QueryTask { - size_t m_timestamp; + Job m_job; size_t m_size; 
- TaskType m_type; - std::unique_ptr m_dep; - - QueryTask(QueryTask &t) - : m_timestamp(t.m_timestamp) - , m_size(t.m_size) - , m_type(t.m_type) - , m_dep(std::move(t.m_dep)) - {} - - TaskType get_type() const { - return m_type; - } - - void SetDependency(QueryTask &task) { - m_dep->fut = task.m_dep->prom.get_future(); - } - - void SetDependency(TaskDependency *dep) { - m_dep->fut = dep->prom.get_future(); - } + size_t m_timestamp; + void *m_args; - friend bool operator<(const QueryTask &self, const QueryTask &other) { + friend bool operator<(const Task &self, const Task &other) { return self.m_timestamp < other.m_timestamp; } - friend bool operator>(const QueryTask &self, const QueryTask &other) { + friend bool operator>(const Task &self, const Task &other) { return self.m_timestamp > other.m_timestamp; } -}; -struct GetTaskType { - TaskType operator()(const MergeTask &t) { return t.get_type(); } - TaskType operator()(const QueryTask &t) { return t.get_type(); } + void operator()() { + m_job(m_args); + } }; -typedef std::variant Task; - } diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 920e1c3..8344518 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -93,7 +93,10 @@ public: inline bool merge_buffer(Buffer *buffer) { assert(can_merge_with(0, buffer->get_record_count())); + buffer->start_merge(); merge_buffer_into_l0(buffer); + buffer->finish_merge(); + buffer->truncate(); return true; @@ -216,10 +219,7 @@ public: } for (level_index i=merge_base_level; i>0; i--) { - MergeTask task; - task.m_source_level = i - 1; - task.m_target_level = i; - task.m_type = TaskType::MERGE; + MergeTask task = {i-1, i}; /* * The amount of storage required for the merge accounts @@ -237,7 +237,7 @@ public: reccnt += m_levels[i]->get_record_count(); } } - task.m_size = 2* reccnt * sizeof(R); + //task.m_size = 2* reccnt * sizeof(R); merges.push_back(task); } @@ -249,7 +249,7 @@ public: /* * */ - std::vector get_merge_tasks_from_level(size_t source_level) { + std::vector get_merge_tasks_from_level(level_index source_level) { std::vector merges; level_index merge_base_level = find_mergable_level(source_level); @@ -258,10 +258,7 @@ public: } for (level_index i=merge_base_level; i>source_level; i--) { - MergeTask task; - task.m_source_level = i - 1; - task.m_target_level = i; - + MergeTask task = {i - 1, i}; /* * The amount of storage required for the merge accounts * for the cost of storing the new records, along with the @@ -278,12 +275,12 @@ public: reccnt += m_levels[i]->get_record_count(); } } - task.m_size = 2* reccnt * sizeof(R); +// task.m_size = 2* reccnt * sizeof(R); merges.push_back(task); } - return std::move(merges); + return merges; } /* diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 9f12175..804ca5e 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -90,12 +90,23 @@ public: } bool truncate() { + + while (active_merge() || m_refcnt.load() > 0) + ; + + m_merge_lock.lock(); + + while (m_refcnt > 0) + ; + m_tombstonecnt.store(0); m_reccnt.store(0); m_weight.store(0); m_max_weight.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); + m_merge_lock.unlock(); + return true; } diff --git a/include/framework/util/Configuration.h b/include/framework/util/Configuration.h index eb9b93f..9d8248f 100644 --- a/include/framework/util/Configuration.h 
+++ b/include/framework/util/Configuration.h @@ -50,5 +50,6 @@ enum class DeletePolicy { }; typedef ssize_t level_index; +typedef std::pair MergeTask; } -- cgit v1.2.3 From b72103cb11347f0dd108bd2321f29b0d6ab05106 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 23 Oct 2023 13:18:30 -0400 Subject: Bugfixes --- include/framework/DynamicExtension.h | 6 +++++- include/framework/scheduling/SerialScheduler.h | 1 + include/framework/structure/MutableBuffer.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index fc7922c..26221d8 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -259,6 +259,7 @@ private: } Q::delete_buffer_query_state(buffer_state); + buffer->release_reference(); args->result_set.set_value(std::move(result)); delete args; @@ -276,10 +277,13 @@ private: } std::future> schedule_query(Structure *version, Buffer *buffer, void *query_parms) { + buffer->take_reference(); // FIXME: this is wrong. The buffer and version need to be + // taken atomically, together. + QueryArgs *args = new QueryArgs(); args->buffer = buffer; args->version = version; - args->buffer = query_parms; + args->query_parms = query_parms; m_sched.schedule_job(async_query, 0, args); diff --git a/include/framework/scheduling/SerialScheduler.h b/include/framework/scheduling/SerialScheduler.h index 5d6e5c2..da2bb8e 100644 --- a/include/framework/scheduling/SerialScheduler.h +++ b/include/framework/scheduling/SerialScheduler.h @@ -51,6 +51,7 @@ public: void schedule_job(std::function job, size_t size, void *args) { size_t ts = m_counter.fetch_add(1); m_task_queue.push(Task(size, ts, job, args)); + m_cv.notify_all(); } void shutdown() { diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 804ca5e..4e0b5c2 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -195,6 +195,7 @@ public: bool finish_merge() { m_merge_lock.unlock(); + m_merging.store(false); return true; } -- cgit v1.2.3 From 3afacb7702e6d8fa67749a2a41dc776d315e02a9 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 23 Oct 2023 17:43:22 -0400 Subject: Began moving to an explicit epoch-based system I started moving over to an explicit Epoch based system, which has necessitated a ton of changes throughout the code base. This will ultimately allow for a much cleaner set of abstractions for managing concurrency. 
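The reclamation protocol this commit introduces boils down to a per-epoch active-job count: queries and merges pin the epoch they start on, and the epoch (together with the buffer and structure references it holds) is destroyed only once that count reaches zero and a newer epoch has been installed. A minimal standalone model of the idea (editorial sketch; the real Epoch also carries the buffer and structure pointers, and as the FIXME below notes, the spin could be replaced with a condition variable):

    #include <atomic>
    #include <cstdio>

    struct Epoch {
        std::atomic<size_t> active_jobs{0};

        void start_job() { active_jobs.fetch_add(1); }
        void end_job()   { active_jobs.fetch_sub(1); }
    };

    /* Called only after a newer epoch is active, so no new jobs can
     * begin on this one and active_jobs can only decrease. */
    void retire_epoch(Epoch *epoch) {
        while (epoch->active_jobs.load() > 0)
            ;
        delete epoch;  /* releases the epoch's references */
    }

    int main() {
        auto epoch = new Epoch();

        epoch->start_job();   /* a query pins the epoch... */
        /* ...and reads the epoch's buffers and structure safely... */
        epoch->end_job();     /* ...then unpins it when finished */

        retire_epoch(epoch);
        printf("epoch retired\n");
        return 0;
    }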
--- include/framework/DynamicExtension.h | 279 +++++++++++++++-------- include/framework/ShardRequirements.h | 2 +- include/framework/interface/Query.h | 3 +- include/framework/scheduling/Epoch.h | 128 +++++++++++ include/framework/scheduling/FIFOScheduler.h | 96 ++++++++ include/framework/scheduling/SerialScheduler.h | 96 -------- include/framework/scheduling/Task.h | 10 +- include/framework/structure/BufferView.h | 124 ++++++++++ include/framework/structure/ExtensionStructure.h | 26 +++ include/framework/structure/MutableBuffer.h | 4 + 10 files changed, 573 insertions(+), 195 deletions(-) create mode 100644 include/framework/scheduling/Epoch.h create mode 100644 include/framework/scheduling/FIFOScheduler.h delete mode 100644 include/framework/scheduling/SerialScheduler.h create mode 100644 include/framework/structure/BufferView.h (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 26221d8..6936247 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "framework/structure/MutableBuffer.h" #include "framework/structure/InternalLevel.h" @@ -24,7 +25,8 @@ #include "framework/structure/ExtensionStructure.h" #include "framework/util/Configuration.h" -#include "framework/scheduling/SerialScheduler.h" +#include "framework/scheduling/FIFOScheduler.h" +#include "framework/scheduling/Epoch.h" #include "psu-util/timer.h" #include "psu-ds/Alias.h" @@ -32,20 +34,30 @@ namespace de { template + DeletePolicy D=DeletePolicy::TAGGING, SchedulerInterface SCHED=FIFOScheduler> class DynamicExtension { typedef S Shard; typedef MutableBuffer Buffer; typedef ExtensionStructure Structure; + typedef Epoch Epoch; + typedef BufferView BufView; + public: DynamicExtension(size_t buffer_cap, size_t scale_factor, double max_delete_prop, size_t memory_budget=0, size_t thread_cnt=16) : m_scale_factor(scale_factor) , m_max_delete_prop(max_delete_prop) , m_sched(memory_budget, thread_cnt) + , m_buffer_capacity(buffer_cap) + , m_buffer_delete_capacity(max_delete_prop*buffer_cap) { + auto buf = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); + auto vers = new Structure(m_buffer_capacity, m_scale_factor, m_max_delete_prop); + auto epoch = new Epoch(vers, buf); + m_buffers.push_back(new Buffer(buffer_cap, max_delete_prop*buffer_cap)); m_versions.push_back(new Structure(buffer_cap, scale_factor, max_delete_prop)); + m_epochs.push_back({0, epoch}); } ~DynamicExtension() { @@ -63,10 +75,10 @@ public: } int erase(const R &rec) { - Buffer *buffer = get_buffer(); - if constexpr (D == DeletePolicy::TAGGING) { - if (get_active_version()->tagged_delete(rec)) { + BufView buffers = get_active_epoch()->get_buffer_view(); + + if (get_active_epoch()->get_structure()->tagged_delete(rec)) { return 1; } @@ -75,7 +87,7 @@ public: * probably has the lowest probability of having the record, * so we'll check it last. 
*/ - return buffer->delete_record(rec); + return buffers->delete_record(rec); } /* @@ -85,43 +97,43 @@ public: } std::future> query(void *parms) { - return schedule_query(get_active_version(), get_buffer(), parms); + return schedule_query(get_active_epoch()->get_structure(), get_active_epoch()->get_buffers()[0], parms); } size_t get_record_count() { - size_t cnt = get_buffer()->get_record_count(); - return cnt + get_active_version()->get_record_count(); + size_t cnt = get_active_epoch()->get_buffer_view().get_record_count(); + return cnt + get_active_epoch()->get_structure()->get_record_count(); } size_t get_tombstone_cnt() { - size_t cnt = get_buffer()->get_tombstone_count(); - return cnt + get_active_version()->get_tombstone_cnt(); + size_t cnt = get_active_epoch()->get_buffer_view().get_tombstone_count(); + return cnt + get_active_epoch()->get_structure()->get_tombstone_cnt(); } size_t get_height() { - return get_active_version()->get_height(); + return get_active_epoch()->get_structure()->get_height(); } size_t get_memory_usage() { - auto vers = get_active_version(); - auto buffer = get_buffer(); + auto vers = get_active_epoch()->get_structure()->get_memory_usage(); + auto buffer = get_active_epoch()->get_buffer_view().get_memory_usage(); - return vers.get_memory_usage() + buffer->get_memory_usage(); + return vers + buffer; } size_t get_aux_memory_usage() { - auto vers = get_active_version(); - auto buffer = get_buffer(); + auto vers = get_active_epoch()->get_structure()->get_aux_memory_usage(); + auto buffer = get_active_epoch()->get_buffer_view().get_aux_memory_usage(); - return vers.get_aux_memory_usage() + buffer->get_aux_memory_usage(); + return vers + buffer; } size_t get_buffer_capacity() { - return get_height()->get_capacity(); + return m_buffer_capacity; } Shard *create_static_structure() { - auto vers = get_active_version(); + auto vers = get_active_epoch()->get_structure(); std::vector shards; if (vers->get_levels().size() > 0) { @@ -132,7 +144,9 @@ public: } } - shards.emplace_back(new S(get_buffer())); + // FIXME: should use a buffer view--or perhaps some sort of a + // raw record iterator model. + shards.emplace_back(new S(get_active_epoch()->get_buffers()[0])); Shard *shards_array[shards.size()]; @@ -158,33 +172,121 @@ public: * tombstone proportion invariant. 
*/ bool validate_tombstone_proportion() { - return get_active_version()->validate_tombstone_proportion(); + return get_active_epoch()->get_structure()->validate_tombstone_proportion(); } private: SCHED m_sched; - std::vector m_buffers; - std::vector m_versions; + std::mutex m_struct_lock; + std::set m_buffers; + std::set m_versions; std::atomic m_current_epoch; + std::unordered_map m_epochs; size_t m_scale_factor; double m_max_delete_prop; + size_t m_buffer_capacity; + size_t m_buffer_delete_capacity; - Buffer *get_buffer() { - return m_buffers[0]; + Epoch *get_active_epoch() { + return m_epochs[m_current_epoch.load()]; + } + + void advance_epoch() { + size_t new_epoch_num = m_current_epoch.load() + 1; + Epoch *new_epoch = m_epochs[new_epoch_num]; + Epoch *old_epoch = m_epochs[m_current_epoch.load()]; + + // Update the new Epoch to contain the buffers + // from the old one that it doesn't currently have + size_t old_buffer_cnt = new_epoch->clear_buffers(); + for (size_t i=old_buffer_cnt; iget_buffers().size(); i++) { + new_epoch->add_buffer(old_epoch->get_buffers[i]); + } + m_current_epoch.fetch_add(1); } - Structure *get_active_version() { - return m_versions[0]; + /* + * Creates a new epoch by copying the currently active one. The new epoch's + * structure will be a shallow copy of the old one's. + */ + Epoch *create_new_epoch() { + auto new_epoch = get_active_epoch()->clone(); + std::unique_lock m_struct_lock; + m_versions.insert(new_epoch->get_structure()); + m_epochs.insert({m_current_epoch.load() + 1, new_epoch}); + m_struct_lock.release(); + + return new_epoch; + } + + /* + * Add a new empty buffer to the specified epoch. This is intended to be used + * when a merge is triggered, to allow for inserts to be sustained in the new + * buffer while a new epoch is being created in the background. Returns a + * pointer to the newly created buffer. + */ + Buffer *add_empty_buffer(Epoch *epoch) { + auto new_buffer = Buffer(m_buffer_capacity, m_buffer_delete_capacity); + + std::unique_lock m_struct_lock; + m_buffers.insert(new_buffer); + m_struct_lock.release(); + + epoch->add_buffer(new_buffer); + return new_buffer; + } + + void retire_epoch(Epoch *epoch) { + /* + * Epochs with currently active jobs cannot + * be retired. By the time retire_epoch is called, + * it is assumed that a new epoch is active, meaning + * that the epoch to be retired should no longer + * accumulate new active jobs. Eventually, this + * number will hit zero and the function will + * proceed. + * + * FIXME: this can be replaced with a cv, which + * is probably a superior solution in this case + */ + while (epoch->get_active_job_num() > 0) + ; + + /* + * The epoch's destructor will handle releasing + * all the references it holds + */ + delete epoch; + + /* + * Following the epoch's destruction, any buffers + * or structures with no remaining references can + * be safely freed. 
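+ * (editor's note: beware that the range-for loops below erase the very
+ * element they are iterating over; for std::set this invalidates the
+ * loop iterator, and the it = container.erase(it) idiom is the usual
+ * fix.)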
+ */ + std::unique_lock lock(m_struct_lock); + for (auto buf : m_buffers) { + if (buf->get_reference_count() == 0) { + m_buffers.erase(buf); + delete buf; + } + } + + for (auto vers : m_versions) { + if (vers->get_reference_count() == 0) { + m_versions.erase(vers); + delete vers; + } + } } static void merge(void *arguments) { - MergeArgs *args = (MergeArgs *) arguments; + MergeArgs *args = (MergeArgs *) arguments; - Structure *vers = (Structure *) args->version; - Buffer *buff = (Buffer *) args->buffer; + Structure *vers = args->epoch->get_structure(); + Buffer *buff = (Buffer *) args->epoch->get_buffers()[0]; for (ssize_t i=args->merges.size() - 1; i>=0; i--) { vers->merge_levels(args->merges[i].second, args->merges[i].first); @@ -193,98 +295,94 @@ private: vers->merge_buffer(buff); args->result.set_value(true); + args->epoch->end_job(); delete args; } + static std::vector finalize_query_result(std::vector>> &query_results, void *parms, + std::vector &shard_states, std::vector &buffer_states) { + auto result = Q::merge(query_results, parms); + + for (size_t i=0; i *args = (QueryArgs *) arguments; + QueryArgs *args = (QueryArgs *) arguments; - auto buffer = (Buffer *) args->buffer; - auto vers = (Structure *) args->version; + auto buffers = args->epoch->get_buffer_view(); + auto vers = args->epoch->get_structure(); void *parms = args->query_parms; - // Get the buffer query state - auto buffer_state = Q::get_buffer_query_state(buffer, parms); + // Get the buffer query states + std::vector buffer_states = buffers->get_buffer_query_states(parms); // Get the shard query states std::vector> shards; - std::vector states; - - for (auto &level : vers->get_levels()) { - level->get_query_states(shards, states, parms); - } + std::vector shard_states = vers->get_query_states(shards, parms); - Q::process_query_states(parms, states, buffer_state); + Q::process_query_states(parms, shard_states, buffer_states); - std::vector>> query_results(shards.size() + 1); + std::vector>> query_results(shards.size() + buffer_states.size()); // Execute the query for the buffer - auto buffer_results = Q::buffer_query(buffer, buffer_state, parms); - query_results[0] = std::move(filter_deletes(buffer_results, {-1, -1}, buffer, vers)); - if constexpr (Q::EARLY_ABORT) { - if (query_results[0].size() > 0) { - auto result = Q::merge(query_results, parms); - for (size_t i=0; i>> buffer_results(buffer_states.size()); + for (size_t i=0; iget_buffers[i], buffer_states[i], parms); + query_results[i] = std::move(filter_deletes(buffer_results, {-1, -1}, buffers, vers)); - Q::delete_buffer_query_state(buffer_state); - return result; + if constexpr (Q::EARLY_ABORT) { + if (query_results[i] > 0) { + return finalize_query_result(query_results, parms, buffer_states, shard_states); + } } } - + // Execute the query for each shard for (size_t i=0; i 0) { - auto result = Q::merge(query_results, parms); - for (size_t i=0; i 0) { + return finalize_query_result(query_results, parms, buffer_states, shard_states); } } } - // Merge the results together - auto result = Q::merge(query_results, parms); - - for (size_t i=0; irelease_reference(); - + // Merge the results together and finalize the job + auto result = finalize_query_result(query_results, parms, buffer_states, shard_states); args->result_set.set_value(std::move(result)); + + args->epoch->end_job(); delete args; } - std::future schedule_merge(Structure *version, Buffer *buffer) { - MergeArgs *args = new MergeArgs(); - args->merges = version->get_merge_tasks(buffer->get_record_count()); - 
args->buffer = buffer; - args->version = version; + std::future schedule_merge() { + auto epoch = get_active_epoch(); + epoch->start_job(); + MergeArgs *args = new MergeArgs(); + args->epoch = epoch; + args->merges = epoch->get_structure()->get_merge_tasks(epoch->get_buffers()[0]); m_sched.schedule_job(merge, 0, args); return args->result.get_future(); } - std::future> schedule_query(Structure *version, Buffer *buffer, void *query_parms) { - buffer->take_reference(); // FIXME: this is wrong. The buffer and version need to be - // taken atomically, together. + std::future> schedule_query(void *query_parms) { + auto epoch = get_active_epoch(); + epoch->start_job(); - QueryArgs *args = new QueryArgs(); - args->buffer = buffer; - args->version = version; + QueryArgs *args = new QueryArgs(); + args->epoch = epoch; args->query_parms = query_parms; - m_sched.schedule_job(async_query, 0, args); return args->result_set.get_future(); @@ -292,20 +390,19 @@ private: int internal_append(const R &rec, bool ts) { Buffer *buffer; - while (!(buffer = get_buffer())) + while (!(buffer = get_active_epoch()->get_active_buffer())) ; if (buffer->is_full()) { - auto vers = get_active_version(); + auto vers = get_active_epoch()->get_structure(); auto res = schedule_merge(vers, buffer); res.get(); } - return buffer->append(rec, ts); } - static std::vector> filter_deletes(std::vector> &records, ShardID shid, Buffer *buffer, Structure *vers) { + static std::vector> filter_deletes(std::vector> &records, ShardID shid, BufView *buffers, Structure *vers) { if constexpr (!Q::SKIP_DELETE_FILTER) { return records; } @@ -334,7 +431,7 @@ private: continue; } - if (buffer->check_tombstone(rec.rec)) { + if (buffers->check_tombstone(rec.rec)) { continue; } diff --git a/include/framework/ShardRequirements.h b/include/framework/ShardRequirements.h index 95f7b67..d2d4ff2 100644 --- a/include/framework/ShardRequirements.h +++ b/include/framework/ShardRequirements.h @@ -3,7 +3,7 @@ */ #pragma once -#include "framework/structure/MutableBuffer.h" +#include "framework/structure/BufferView.h" #include "framework/interface/Record.h" #include "framework/interface/Shard.h" #include "framework/interface/Query.h" diff --git a/include/framework/interface/Query.h b/include/framework/interface/Query.h index 46a1ce1..9b1d2d6 100644 --- a/include/framework/interface/Query.h +++ b/include/framework/interface/Query.h @@ -14,7 +14,6 @@ template concept QueryInterface = requires(Q q, void *p, std::vector &s) { - /* {q.get_query_state(p, p)} -> std::convertible_to; {q.get_buffer_query_state(p, p)}; @@ -27,7 +26,7 @@ concept QueryInterface = requires(Q q, void *p, std::vector &s) { {Q::SKIP_DELETE_FILTER} -> std::convertible_to; //{Q::get_query_state(p, p)} -> std::convertible_to; //{Q::get_buffer_query_state(p, p)} -> std::convertible_to; - {Q::process_query_states(p, s, p)}; + {Q::process_query_states(p, s, s)}; {Q::delete_query_state(std::declval())} -> std::same_as; {Q::delete_buffer_query_state(p)}; diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h new file mode 100644 index 0000000..a1f865c --- /dev/null +++ b/include/framework/scheduling/Epoch.h @@ -0,0 +1,128 @@ +/* + * include/framework/scheduling/Epoch.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. 
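+ *
+ * An Epoch ties together one version of the extension structure and
+ * the set of mutable buffers that were live during its tenure, and
+ * counts the jobs (queries/merges) currently operating on them.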
+ * + */ +#pragma once + +#include "framework/structure/MutableBuffer.h" +#include "framework/structure/ExtensionStructure.h" +#include "framework/structure/BufferView.h" + +namespace de { + + +template +class Epoch { +private: + typedef MutableBuffer Buffer; + typedef ExtensionStructure Structure; + typedef BufferView BufView; +public: + Epoch() + : m_buffers() + , m_structure(nullptr) + , m_active_jobs(0) + {} + + Epoch(Structure *structure, Buffer *buff) + : m_buffers() + , m_structure(structure) + , m_active_jobs(0) + { + m_buffers.push_back(buff); + } + + ~Epoch() { + assert(m_active_jobs.load() == 0); + + for (auto buf : m_buffers) { + buf.release_reference(); + } + + if (m_structure) { + m_structure->release_reference(); + } + } + + void add_buffer(Buffer *buf) { + assert(buf); + + buf->take_reference(); + m_buffers.push_back(buf); + } + + void start_job() { + m_active_jobs.fetch_add(1); + } + + void end_job() { + m_active_jobs.fetch_add(-1); + } + + size_t get_active_job_num() { + return m_active_jobs.load(); + } + + Structure *get_structure() { + return m_structure; + } + + std::vector &get_buffers() { + return m_buffers; + } + + BufView get_buffer_view() { + return BufView(m_buffers); + } + + Buffer *get_active_buffer() { + if (m_buffers.size() == 0) return nullptr; + + return m_buffers[m_buffers.size() - 1]; + } + + /* + * Return the number of buffers in this epoch at + * time of call, and then clear the buffer vector, + * releasing all references in the process. + */ + size_t clear_buffers() { + size_t buf_cnt = m_buffers.size(); + for (auto buf : m_buffers) { + if (buf) buf->release_reference(); + } + + m_buffers.clear(); + return buf_cnt; + } + + /* + * Returns a new Epoch object that is a copy of this one. The new object will also contain + * a copy of the m_structure, rather than a reference to the same one. + */ + Epoch *clone() { + auto epoch = new Epoch(); + epoch->m_buffers = m_buffers; + if (m_structure) { + epoch->m_structure = m_structure->copy(); + } + } + +private: + Structure *m_structure; + std::vector m_buffers; + + /* + * The number of currently active jobs + * (queries/merges) operating on this + * epoch. An epoch can only be retired + * when this number is 0. + */ + std::atomic m_active_jobs; +}; +} diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h new file mode 100644 index 0000000..878bb81 --- /dev/null +++ b/include/framework/scheduling/FIFOScheduler.h @@ -0,0 +1,96 @@ +/* + * include/framework/Scheduler.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "util/types.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" +#include "framework/interface/Record.h" +#include "framework/structure/MutableBuffer.h" +#include "framework/util/Configuration.h" +#include "framework/structure/ExtensionStructure.h" +#include "framework/scheduling/Task.h" + +#include "psu-ds/LockedPriorityQueue.h" + +namespace de { + +class FIFOScheduler { +public: + FIFOScheduler(size_t memory_budget, size_t thread_cnt) + : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) + , m_thrd_cnt((thread_cnt) ? 
thread_cnt: UINT64_MAX) + , m_used_memory(0) + , m_used_thrds(0) + , m_shutdown(false) + { + m_sched_thrd = std::thread(&FIFOScheduler::run, this); + } + + ~FIFOScheduler() { + shutdown(); + + m_cv.notify_all(); + m_sched_thrd.join(); + } + + void schedule_job(std::function job, size_t size, void *args) { + size_t ts = m_counter.fetch_add(1); + m_task_queue.push(Task(size, ts, job, args)); + m_cv.notify_all(); + } + + void shutdown() { + m_shutdown = true; + } + +private: + psudb::LockedPriorityQueue m_task_queue; + + size_t m_memory_budget; + size_t m_thrd_cnt; + + bool m_shutdown; + + std::atomic m_counter; + std::mutex m_cv_lock; + std::condition_variable m_cv; + + std::thread m_sched_thrd; + + std::atomic m_used_thrds; + std::atomic m_used_memory; + + void schedule_next() { + auto t = m_task_queue.pop(); + t(); + } + + void run() { + do { + std::unique_lock cv_lock(m_cv_lock); + m_cv.wait(cv_lock); + + while (m_task_queue.size() > 0 && m_used_thrds.load() < m_thrd_cnt) { + schedule_next(); + } + cv_lock.unlock(); + } while(!m_shutdown); + } +}; + +} diff --git a/include/framework/scheduling/SerialScheduler.h b/include/framework/scheduling/SerialScheduler.h deleted file mode 100644 index da2bb8e..0000000 --- a/include/framework/scheduling/SerialScheduler.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * include/framework/Scheduler.h - * - * Copyright (C) 2023 Douglas B. Rumbaugh - * Dong Xie - * - * All rights reserved. Published under the Modified BSD License. - * - */ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "util/types.h" -#include "framework/interface/Shard.h" -#include "framework/interface/Query.h" -#include "framework/interface/Record.h" -#include "framework/structure/MutableBuffer.h" -#include "framework/util/Configuration.h" -#include "framework/structure/ExtensionStructure.h" -#include "framework/scheduling/Task.h" - -#include "psu-ds/LockedPriorityQueue.h" - -namespace de { - -class SerialScheduler { -public: - SerialScheduler(size_t memory_budget, size_t thread_cnt) - : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) - , m_thrd_cnt((thread_cnt) ? 
thread_cnt: UINT64_MAX) - , m_used_memory(0) - , m_used_thrds(0) - , m_shutdown(false) - { - m_sched_thrd = std::thread(&SerialScheduler::run, this); - } - - ~SerialScheduler() { - shutdown(); - - m_cv.notify_all(); - m_sched_thrd.join(); - } - - void schedule_job(std::function job, size_t size, void *args) { - size_t ts = m_counter.fetch_add(1); - m_task_queue.push(Task(size, ts, job, args)); - m_cv.notify_all(); - } - - void shutdown() { - m_shutdown = true; - } - -private: - psudb::LockedPriorityQueue m_task_queue; - - size_t m_memory_budget; - size_t m_thrd_cnt; - - bool m_shutdown; - - std::atomic m_counter; - std::mutex m_cv_lock; - std::condition_variable m_cv; - - std::thread m_sched_thrd; - - std::atomic m_used_thrds; - std::atomic m_used_memory; - - void schedule_next() { - auto t = m_task_queue.pop(); - t(); - } - - void run() { - do { - std::unique_lock cv_lock(m_cv_lock); - m_cv.wait(cv_lock); - - while (m_task_queue.size() > 0 && m_used_thrds.load() < m_thrd_cnt) { - schedule_next(); - } - cv_lock.unlock(); - } while(!m_shutdown); - } -}; - -} diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index 518159d..94c4d0a 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -8,20 +8,20 @@ #include #include "framework/util/Configuration.h" +#include "framework/scheduling/Epoch.h" namespace de { +template struct MergeArgs { - void *version; - void *buffer; + Epoch *epoch; std::vector merges; std::promise result; }; -template +template struct QueryArgs { - void *version; - void *buffer; + Epoch *epoch; std::promise> result_set; void *query_parms; }; diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h new file mode 100644 index 0000000..1efc1ac --- /dev/null +++ b/include/framework/structure/BufferView.h @@ -0,0 +1,124 @@ +/* + * include/framework/structure/BufferView.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. 
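+ *
+ * A BufferView aggregates deletes, tombstone checks, and statistics
+ * across the set of mutable buffers associated with an epoch.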
+ * + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "psu-util/alignment.h" +#include "util/bf_config.h" +#include "psu-ds/BloomFilter.h" +#include "psu-ds/Alias.h" +#include "psu-util/timer.h" +#include "framework/interface/Record.h" +#include "framework/structure/MutableBuffer.h" +#include "framework/interface/Query.h" + +namespace de { + +template +class BufferView { + typedef MutableBuffer Buffer; +public: + BufferView() = default; + + BufferView(std::vector buffers) + : m_buffers(buffers) + , m_cutoff(buffers[buffers->size()-1]->get_record_count()) + {} + + ~BufferView() = default; + + bool delete_record(const R& rec) { + auto res = false; + for (auto buf : m_buffers) { + res = buf->delete_record(rec); + if (res) return true; + } + return false; + } + + bool check_tombstone(const R& rec) { + auto res = false; + for (auto buf : m_buffers) { + res = buf->check_tombstone(rec); + if (res) return true; + } + return false; + } + + size_t get_record_count() { + size_t reccnt = 0; + for (auto buf : m_buffers) { + reccnt += buf->get_record_count(); + } + return reccnt; + } + + size_t get_capacity() { + return m_buffers[0]->get_capacity(); + } + + bool is_full() { + return m_buffers[m_buffers.size() - 1]->is_full(); + } + + size_t get_tombstone_count() { + size_t tscnt = 0; + for (auto buf : m_buffers) { + tscnt += buf->get_tombstone_count(); + } + return tscnt; + } + + size_t get_memory_usage() { + size_t mem = 0; + for (auto buf : m_buffers) { + mem += buf->get_memory_usage(); + } + return mem; + } + + size_t get_aux_memory_usage() { + size_t mem = 0; + for (auto buf : m_buffers) { + mem += buf->get_aux_memory_usage(); + } + return mem; + } + + size_t get_tombstone_capacity() { + return m_buffers[0]->get_tombstone_capacity(); + } + + std::vector get_buffer_states(void *parms) { + std::vector states; + + for (auto buf : m_buffers) { + states.push_back(Q::get_buffer_query_state(buf, parms)); + } + + return states; + } + + std::vector &get_buffers() { + return m_buffers; + } + +private: + std::vector m_buffers; + size_t m_cutoff; +}; + +} diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 8344518..de965ae 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -302,12 +302,38 @@ public: m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 
1 : m_scale_factor)); } + bool take_reference() { + m_refcnt.fetch_add(1); + return true; + } + + bool release_reference() { + assert(m_refcnt.load() > 0); + m_refcnt.fetch_add(-1); + return true; + } + + size_t get_reference_count() { + return m_refcnt.load(); + } + + std::vector get_query_states(std::vector> &shards, void *parms) { + std::vector states; + + for (auto &level : m_levels) { + level->get_query_states(shards, states, parms); + } + + return states; + } private: size_t m_scale_factor; double m_max_delete_prop; size_t m_buffer_size; + std::atomic m_refcnt; + std::vector>> m_levels; /* diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 4e0b5c2..974dc28 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -217,6 +217,10 @@ public: return true; } + size_t get_reference_count() { + return m_refcnt.load(); + } + bool active_merge() { return m_merging.load(); } -- cgit v1.2.3 From 39ae3e0441d8297a09197aba98bd494b5ada12c1 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 30 Oct 2023 14:17:59 -0400 Subject: Concurrency updates + fixes for compile errors --- include/framework/DynamicExtension.h | 161 ++++++++++++++-------------- include/framework/scheduling/Epoch.h | 4 +- include/framework/scheduling/Task.h | 1 + include/framework/structure/BufferView.h | 4 +- include/framework/structure/InternalLevel.h | 21 ++-- include/shard/WIRS.h | 14 +-- 6 files changed, 106 insertions(+), 99 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 6936247..d2a6b7a 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -39,7 +39,7 @@ class DynamicExtension { typedef S Shard; typedef MutableBuffer Buffer; typedef ExtensionStructure Structure; - typedef Epoch Epoch; + typedef Epoch _Epoch; typedef BufferView BufView; public: @@ -53,20 +53,24 @@ public: { auto buf = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); auto vers = new Structure(m_buffer_capacity, m_scale_factor, m_max_delete_prop); - auto epoch = new Epoch(vers, buf); + auto epoch = new _Epoch(vers, buf); - m_buffers.push_back(new Buffer(buffer_cap, max_delete_prop*buffer_cap)); - m_versions.push_back(new Structure(buffer_cap, scale_factor, max_delete_prop)); - m_epochs.push_back({0, epoch}); + m_buffers.insert(new Buffer(buffer_cap, max_delete_prop*buffer_cap)); + m_versions.insert(new Structure(buffer_cap, scale_factor, max_delete_prop)); + m_epochs.insert({0, epoch}); } ~DynamicExtension() { - for (size_t i=0; idelete_record(rec); + return buffers.delete_record(rec); } /* @@ -97,7 +101,7 @@ public: } std::future> query(void *parms) { - return schedule_query(get_active_epoch()->get_structure(), get_active_epoch()->get_buffers()[0], parms); + return schedule_query(parms); } size_t get_record_count() { @@ -183,36 +187,37 @@ private: std::set m_versions; std::atomic m_current_epoch; - std::unordered_map m_epochs; + std::unordered_map m_epochs; size_t m_scale_factor; double m_max_delete_prop; size_t m_buffer_capacity; size_t m_buffer_delete_capacity; - Epoch *get_active_epoch() { + _Epoch *get_active_epoch() { return m_epochs[m_current_epoch.load()]; } void advance_epoch() { size_t new_epoch_num = m_current_epoch.load() + 1; - Epoch *new_epoch = m_epochs[new_epoch_num]; - Epoch *old_epoch = m_epochs[m_current_epoch.load()]; + _Epoch *new_epoch = m_epochs[new_epoch_num]; + _Epoch *old_epoch = 
m_epochs[m_current_epoch.load()]; // Update the new Epoch to contain the buffers // from the old one that it doesn't currently have size_t old_buffer_cnt = new_epoch->clear_buffers(); for (size_t i=old_buffer_cnt; iget_buffers().size(); i++) { - new_epoch->add_buffer(old_epoch->get_buffers[i]); + new_epoch->add_buffer(old_epoch->get_buffers()[i]); } m_current_epoch.fetch_add(1); + retire_epoch(old_epoch); } /* * Creates a new epoch by copying the currently active one. The new epoch's * structure will be a shallow copy of the old one's. */ - Epoch *create_new_epoch() { + _Epoch *create_new_epoch() { auto new_epoch = get_active_epoch()->clone(); std::unique_lock m_struct_lock; m_versions.insert(new_epoch->get_structure()); @@ -228,8 +233,8 @@ private: * buffer while a new epoch is being created in the background. Returns a * pointer to the newly created buffer. */ - Buffer *add_empty_buffer(Epoch *epoch) { - auto new_buffer = Buffer(m_buffer_capacity, m_buffer_delete_capacity); + Buffer *add_empty_buffer(_Epoch *epoch) { + auto new_buffer = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); std::unique_lock m_struct_lock; m_buffers.insert(new_buffer); @@ -239,7 +244,7 @@ private: return new_buffer; } - void retire_epoch(Epoch *epoch) { + void retire_epoch(_Epoch *epoch) { /* * Epochs with currently active jobs cannot * be retired. By the time retire_epoch is called, @@ -294,26 +299,15 @@ private: vers->merge_buffer(buff); - args->result.set_value(true); args->epoch->end_job(); - delete args; - } - - static std::vector finalize_query_result(std::vector>> &query_results, void *parms, - std::vector &shard_states, std::vector &buffer_states) { - auto result = Q::merge(query_results, parms); - - for (size_t i=0; iresult.set_value(true); - return result; + ((DynamicExtension *) args->extension)->advance_epoch(); + + // FIXME: this might break things... not sure + delete args; } - + static void async_query(void *arguments) { QueryArgs *args = (QueryArgs *) arguments; @@ -322,58 +316,56 @@ private: void *parms = args->query_parms; // Get the buffer query states - std::vector buffer_states = buffers->get_buffer_query_states(parms); + std::vector buffer_states = buffers.get_query_states(parms); // Get the shard query states std::vector> shards; - std::vector shard_states = vers->get_query_states(shards, parms); + std::vector states = vers->get_query_states(shards, parms); - Q::process_query_states(parms, shard_states, buffer_states); + Q::process_query_states(parms, states, buffer_states); std::vector>> query_results(shards.size() + buffer_states.size()); + for (size_t i=0; i> local_results = (i < buffer_states.size()) + ? Q::buffer_query(buffers.get_buffers()[i], buffer_states[i], parms) + : Q::query(shards[i - buffer_states.size()].second, + states[i - buffer_states.size()], parms); + ShardID shid = (i < buffer_states.size()) ? 
INVALID_SHID : shards[i - buffer_states.size()].first; + query_results[i] = std::move(filter_deletes(local_results, shid, buffers, vers)); - // Execute the query for the buffer - std::vector>> buffer_results(buffer_states.size()); - for (size_t i=0; iget_buffers[i], buffer_states[i], parms); - query_results[i] = std::move(filter_deletes(buffer_results, {-1, -1}, buffers, vers)); - - if constexpr (Q::EARLY_ABORT) { - if (query_results[i] > 0) { - return finalize_query_result(query_results, parms, buffer_states, shard_states); - } - } - } - - // Execute the query for each shard - for (size_t i=0; i 0) { - return finalize_query_result(query_results, parms, buffer_states, shard_states); - } + if (query_results[i].size() > 0) break; } } - - // Merge the results together and finalize the job - auto result = finalize_query_result(query_results, parms, buffer_states, shard_states); + + auto result = Q::merge(query_results, parms); args->result_set.set_value(std::move(result)); args->epoch->end_job(); + + for (size_t i=0; i schedule_merge() { - auto epoch = get_active_epoch(); + void schedule_merge() { + auto epoch = create_new_epoch(); epoch->start_job(); MergeArgs *args = new MergeArgs(); args->epoch = epoch; - args->merges = epoch->get_structure()->get_merge_tasks(epoch->get_buffers()[0]); + // FIXME: all full buffers can be merged at this point--but that requires + // retooling the shard interface a bit to do efficiently. + args->merges = epoch->get_structure()->get_merge_tasks(epoch->get_buffers()[0]->get_record_count()); + args->extension = this; m_sched.schedule_job(merge, 0, args); - - return args->result.get_future(); } std::future> schedule_query(void *query_parms) { @@ -389,20 +381,29 @@ private: } int internal_append(const R &rec, bool ts) { - Buffer *buffer; - while (!(buffer = get_active_epoch()->get_active_buffer())) - ; - - if (buffer->is_full()) { - auto vers = get_active_epoch()->get_structure(); - auto res = schedule_merge(vers, buffer); - res.get(); - } + Buffer *buffer = nullptr; + do { + auto epoch = get_active_epoch(); + + while (!(buffer = epoch->get_active_buffer())) + ; + + /* if the buffer is full, schedule a merge and add a new empty buffer */ + if (buffer->is_full()) { + // FIXME: possible race here--two identical merges could be scheduled + auto vers = epoch->get_structure(); + schedule_merge(); + buffer = add_empty_buffer(epoch); + } + + } while(!buffer->append(rec, ts)); - return buffer->append(rec, ts); + /* internal append should always succeed, eventually */ + return 1; } - static std::vector> filter_deletes(std::vector> &records, ShardID shid, BufView *buffers, Structure *vers) { + static std::vector> filter_deletes(std::vector> &records, ShardID shid, + BufView &buffers, Structure *vers) { if constexpr (!Q::SKIP_DELETE_FILTER) { return records; } @@ -431,7 +432,7 @@ private: continue; } - if (buffers->check_tombstone(rec.rec)) { + if (buffers.check_tombstone(rec.rec)) { continue; } diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index a1f865c..fe63c86 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -41,7 +41,7 @@ public: assert(m_active_jobs.load() == 0); for (auto buf : m_buffers) { - buf.release_reference(); + buf->release_reference(); } if (m_structure) { @@ -111,6 +111,8 @@ public: if (m_structure) { epoch->m_structure = m_structure->copy(); } + + return epoch; } private: diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index 
94c4d0a..d25c7c0 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -17,6 +17,7 @@ struct MergeArgs { Epoch *epoch; std::vector merges; std::promise result; + void *extension; }; template diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 1efc1ac..14abedc 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -35,7 +35,7 @@ public: BufferView(std::vector buffers) : m_buffers(buffers) - , m_cutoff(buffers[buffers->size()-1]->get_record_count()) + , m_cutoff(buffers[buffers.size()-1]->get_record_count()) {} ~BufferView() = default; @@ -102,7 +102,7 @@ public: return m_buffers[0]->get_tombstone_capacity(); } - std::vector get_buffer_states(void *parms) { + std::vector get_query_states(void *parms) { std::vector states; for (auto buf : m_buffers) { diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index b9230f4..342a2c7 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -231,6 +231,17 @@ public: return (double) tscnt / (double) (tscnt + reccnt); } + std::shared_ptr clone() { + auto new_level = std::make_shared(m_level_no, m_shards.size()); + for (size_t i=0; im_shards[i] = m_shards[i]; + new_level->m_owns[i] = true; + m_owns[i] = false; + } + + return new_level; + } + private: ssize_t m_level_no; @@ -243,16 +254,6 @@ private: std::vector m_owns; - std::shared_ptr clone() { - auto new_level = std::make_shared(m_level_no, m_shards.size()); - for (size_t i=0; im_shards[i] = m_shards[i]; - new_level->m_owns[i] = true; - m_owns[i] = false; - } - - return new_level; - } - }; } diff --git a/include/shard/WIRS.h b/include/shard/WIRS.h index 8583cb0..83573c8 100644 --- a/include/shard/WIRS.h +++ b/include/shard/WIRS.h @@ -448,17 +448,21 @@ public: return state; } - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { + static void process_query_states(void *query_parms, std::vector &shard_states, std::vector &buff_states) { + // FIXME: need to redo for the buffer vector interface auto p = (wirs_query_parms *) query_parms; - auto bs = (WIRSBufferState *) buff_state; std::vector shard_sample_sizes(shard_states.size()+1, 0); size_t buffer_sz = 0; + decltype(R::weight) total_weight = 0; std::vector weights; - weights.push_back(bs->total_weight); + for (auto &s : buff_states) { + auto state = (WIRSBufferState *) s; + total_weight += state->total_weight; + weights.push_back(state->total_weight); + } - decltype(R::weight) total_weight = 0; for (auto &s : shard_states) { auto state = (WIRSState *) s; total_weight += state->total_weight; @@ -480,8 +484,6 @@ public: } } - - bs->sample_size = buffer_sz; for (size_t i=0; i *) shard_states[i]; state->sample_size = shard_sample_sizes[i+1]; -- cgit v1.2.3 From ceffd8caf5e4e827e2cc4d6975507a66d88f77a9 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 30 Oct 2023 14:25:28 -0400 Subject: DynamicExtension: adjusted a few operations to ensure consistency get_memory_usage, get_aux_memory_usage, get_record_count, get_tombstone_count, and create_static_structure have been adjusted to ensure that they pull from a consistent epoch, even if a change-over occurs midway through the function. These functions also now register with the epoch as a job, to ensure that the epoch they are operating on isn't retired midway through the function.
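(Editor's note: the start_job()/end_job() pairing described above has to be written by hand in every accessor; a minimal RAII sketch along the following lines--EpochJobGuard is a hypothetical name, not part of this patch--would make the pairing automatic and safe against early returns and exceptions:

    template <typename EpochType>
    class EpochJobGuard {
    public:
        explicit EpochJobGuard(EpochType *epoch) : m_epoch(epoch) {
            m_epoch->start_job();   /* register before any reads happen */
        }
        ~EpochJobGuard() {
            m_epoch->end_job();     /* always unregisters, on any exit path */
        }
        EpochJobGuard(const EpochJobGuard&) = delete;
        EpochJobGuard &operator=(const EpochJobGuard&) = delete;
    private:
        EpochType *m_epoch;
    };

An accessor would then reduce to constructing a guard on the active epoch and returning its result directly.)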
Probably not a big issue for the accessors, but I could see it being very important for create_static_structure. --- include/framework/DynamicExtension.h | 48 ++++++++++++++++++++++---------- include/framework/structure/BufferView.h | 4 +++ 2 files changed, 37 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index d2a6b7a..eb78d48 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -105,13 +105,20 @@ public: } size_t get_record_count() { - size_t cnt = get_active_epoch()->get_buffer_view().get_record_count(); - return cnt + get_active_epoch()->get_structure()->get_record_count(); + auto epoch = get_active_epoch(); + epoch->start_job(); + auto t = epoch->get_buffer_view().get_record_count() + epoch->get_structure()->get_record_count(); + epoch->end_job(); + + return t; } - size_t get_tombstone_cnt() { - size_t cnt = get_active_epoch()->get_buffer_view().get_tombstone_count(); - return cnt + get_active_epoch()->get_structure()->get_tombstone_cnt(); + size_t get_tombstone_count() { + auto epoch = get_active_epoch(); + epoch->start_job(); + auto t = epoch->get_buffer_view().get_tombstone_count() + epoch->get_structure()->get_tombstone_count(); + epoch->end_job(); + return t; } size_t get_height() { @@ -119,17 +126,21 @@ public: } size_t get_memory_usage() { - auto vers = get_active_epoch()->get_structure()->get_memory_usage(); - auto buffer = get_active_epoch()->get_buffer_view().get_memory_usage(); + auto epoch = get_active_epoch(); + epoch->start_job(); + auto t= epoch->get_buffer_view().get_memory_usage() + epoch->get_structure()->get_memory_usage(); + epoch->end_job(); - return vers + buffer; + return t; } size_t get_aux_memory_usage() { - auto vers = get_active_epoch()->get_structure()->get_aux_memory_usage(); - auto buffer = get_active_epoch()->get_buffer_view().get_aux_memory_usage(); + auto epoch = get_active_epoch(); + epoch->start_job(); + auto t = epoch->get_buffer_view().get_aux_memory_usage() + epoch->get_structure()->get_aux_memory_usage(); + epoch->end_job(); - return vers + buffer; + return t; } size_t get_buffer_capacity() { @@ -137,7 +148,11 @@ public: } Shard *create_static_structure() { - auto vers = get_active_epoch()->get_structure(); + auto epoch = get_active_epoch(); + auto bv = epoch->get_buffer_view(); + epoch->start_job(); + + auto vers = epoch->get_structure(); std::vector shards; if (vers->get_levels().size() > 0) { @@ -148,9 +163,11 @@ public: } } - // FIXME: should use a buffer view--or perhaps some sort of a - // raw record iterator model. - shards.emplace_back(new S(get_active_epoch()->get_buffers()[0])); + // FIXME: With an interface adjustment, this could be done in + // one call, rather than a loop. 
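+ // (editor's note, not part of the original patch: with i declared as
+ // size_t, the condition i>=0 below is always true and i wraps past
+ // zero, so this loop cannot terminate normally; a signed index such
+ // as ssize_t, or a reverse iterator, avoids the wraparound without
+ // changing the traversal order.)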
+ for (size_t i=bv.size() - 1; i>=0; i--) { + shards.emplace_back(new S(bv.get_buffers()[i])); + } Shard *shards_array[shards.size()]; @@ -167,6 +184,7 @@ public: delete shard; } + epoch->end_job(); return flattened; } diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 14abedc..8dff2ef 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -116,6 +116,10 @@ public: return m_buffers; } + size_t size() { + return m_buffers.size(); + } + private: std::vector m_buffers; size_t m_cutoff; -- cgit v1.2.3 From 40b87b74f2bf4e93fdc9dabd6eab9175187fb63c Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 30 Oct 2023 14:47:16 -0400 Subject: FIFOScheduler: correctly protect m_cv with a lock --- include/framework/scheduling/FIFOScheduler.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h index 878bb81..7ccab26 100644 --- a/include/framework/scheduling/FIFOScheduler.h +++ b/include/framework/scheduling/FIFOScheduler.h @@ -44,13 +44,18 @@ public: ~FIFOScheduler() { shutdown(); + std::unique_lock lk(m_cv_lock); m_cv.notify_all(); + lk.release(); + m_sched_thrd.join(); } void schedule_job(std::function job, size_t size, void *args) { size_t ts = m_counter.fetch_add(1); m_task_queue.push(Task(size, ts, job, args)); + + std::unique_lock lk(m_cv_lock); m_cv.notify_all(); } -- cgit v1.2.3 From 32aeedbaf6584eb71126cbe92cb42e93b65d69d3 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 30 Oct 2023 14:47:35 -0400 Subject: Epoch/DynamicExtension: added cv to epoch retirement check Instead of busy waiting on the active job count, a condition variable is now used to wait for all active jobs to finish before freeing an epoch's resources. --- include/framework/DynamicExtension.h | 9 +++++---- include/framework/scheduling/Epoch.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index eb78d48..21d0261 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -79,6 +79,8 @@ public: } int erase(const R &rec) { + // FIXME: delete tagging will require a lot of extra work to get + // operating "correctly" in a concurrent environment. if constexpr (D == DeletePolicy::TAGGING) { BufView buffers = get_active_epoch()->get_buffer_view(); @@ -118,6 +120,7 @@ public: epoch->start_job(); auto t = epoch->get_buffer_view().get_tombstone_count() + epoch->get_structure()->get_tombstone_count(); epoch->end_job(); + return t; } @@ -271,11 +274,9 @@ private: * accumulate new active jobs. Eventually, this * number will hit zero and the function will * proceed. 
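 * (editor's note: the busy-wait below now delegates to
 * Epoch::retirable(), which sleeps on a condition variable; see the
 * Epoch.h hunk that follows. Separately, in the FIFOScheduler hunk
 * above, std::unique_lock::release() relinquishes ownership without
 * unlocking the mutex; unlock() appears to be the intended call.)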
- * - * FIXME: this can be replaced with a cv, which - * is probably a superior solution in this case */ - while (epoch->get_active_job_num() > 0) + + while (!epoch->retirable()) ; /* diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index fe63c86..87463bd 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -62,6 +62,11 @@ public: void end_job() { m_active_jobs.fetch_add(-1); + + if (m_active_jobs.load() == 0) { + std::unique_lock lk(m_cv_lock); + m_active_cv.notify_all(); + } } size_t get_active_job_num() { @@ -115,10 +120,35 @@ public: return epoch; } + /* + * + */ + bool retirable() { + /* if epoch is currently active, then it cannot be retired */ + if (m_active) { + return false; + } + + /* + * if the epoch has active jobs but is not itself active, + * wait for them to finish and return true. If there are + * not active jobs, return true immediately + */ + while (m_active_jobs > 0) { + std::unique_lock lk(m_cv_lock); + m_active_cv.wait(lk); + } + + return true; + } + private: Structure *m_structure; std::vector m_buffers; + std::condition_variable m_active_cv; + std::mutex m_cv_lock; + /* * The number of currently active jobs * (queries/merges) operating on this @@ -126,5 +156,6 @@ private: * when this number is 0. */ std::atomic m_active_jobs; + bool m_active; }; } -- cgit v1.2.3 From 8ce1cb0eef7d5631f0f7788804845ddc8296ac6f Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 30 Oct 2023 14:52:45 -0400 Subject: DynamicExtension: comment cleanup/adjustments --- include/framework/DynamicExtension.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 21d0261..f2bbacc 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -224,8 +224,10 @@ private: _Epoch *new_epoch = m_epochs[new_epoch_num]; _Epoch *old_epoch = m_epochs[m_current_epoch.load()]; - // Update the new Epoch to contain the buffers - // from the old one that it doesn't currently have + /* + * Update the new Epoch to contain the buffers from the old one + * that it doesn't currently have + */ size_t old_buffer_cnt = new_epoch->clear_buffers(); for (size_t i=old_buffer_cnt; iget_buffers().size(); i++) { new_epoch->add_buffer(old_epoch->get_buffers()[i]); @@ -275,7 +277,6 @@ private: * number will hit zero and the function will * proceed. */ - while (!epoch->retirable()) ; @@ -323,7 +324,6 @@ private: ((DynamicExtension *) args->extension)->advance_epoch(); - // FIXME: this might break things... not sure delete args; } @@ -334,10 +334,10 @@ private: auto vers = args->epoch->get_structure(); void *parms = args->query_parms; - // Get the buffer query states + /* Get the buffer query states */ std::vector buffer_states = buffers.get_query_states(parms); - // Get the shard query states + /* Get the shard query states */ std::vector> shards; std::vector states = vers->get_query_states(shards, parms); @@ -370,7 +370,6 @@ private: Q::delete_query_state(states[i]); } - // FIXME: this might break things... not sure delete args; } @@ -430,8 +429,10 @@ private: std::vector> processed_records; processed_records.reserve(records.size()); - // For delete tagging, we just need to check the delete bit on each - // record. + /* + * For delete tagging, we just need to check the delete bit + * on each record. 
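+ * (editor's note: the bit is set by the tagging delete path in
+ * erase(), via tagged_delete() on the structure or delete_record()
+ * on a buffer view.)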
+ */ if constexpr (D == DeletePolicy::TAGGING) { for (auto &rec : records) { if (rec.is_deleted()) { @@ -444,8 +445,10 @@ private: return processed_records; } - // For tombstone deletes, we need to search for the corresponding - // tombstone for each record. + /* + * For tombstone deletes, we need to search for the corresponding + * tombstone for each record. + */ for (auto &rec : records) { if (rec.is_tombstone()) { continue; -- cgit v1.2.3 From d2279e1b96d352a0af1d425dcaaf93e8a26a8d52 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 30 Oct 2023 17:15:05 -0400 Subject: General Comment + Consistency updates --- include/framework/DynamicExtension.h | 46 +++--- include/framework/ShardRequirements.h | 8 + include/framework/interface/Query.h | 34 ++-- include/framework/interface/Record.h | 5 +- include/framework/interface/Scheduler.h | 4 +- include/framework/interface/Shard.h | 20 ++- include/framework/scheduling/Epoch.h | 1 - include/framework/scheduling/FIFOScheduler.h | 3 +- include/framework/scheduling/Scheduler.h | 195 ----------------------- include/framework/scheduling/Task.h | 6 +- include/framework/structure/BufferView.h | 2 +- include/framework/structure/ExtensionStructure.h | 4 +- include/framework/structure/InternalLevel.h | 4 +- include/framework/structure/MutableBuffer.h | 4 +- include/framework/util/Configuration.h | 5 +- include/shard/MemISAM.h | 6 +- include/shard/PGM.h | 4 + include/shard/TrieSpline.h | 4 + include/shard/VPTree.h | 9 +- include/shard/WIRS.h | 6 +- include/shard/WSS.h | 8 +- include/util/Cursor.h | 4 +- include/util/bf_config.h | 20 ++- include/util/types.h | 47 +++--- 24 files changed, 167 insertions(+), 282 deletions(-) delete mode 100644 include/framework/scheduling/Scheduler.h (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index f2bbacc..9129060 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -1,7 +1,7 @@ /* * include/framework/DynamicExtension.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. 
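(Editor's note: the accessor hunks that follow all converge on a single idiom: get_active_epoch_protected() loads the active epoch and registers a job against it in one step, the accessor performs its reads, and end_job() then releases the epoch for retirement.)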
@@ -107,8 +107,7 @@ public: } size_t get_record_count() { - auto epoch = get_active_epoch(); - epoch->start_job(); + auto epoch = get_active_epoch_protected(); auto t = epoch->get_buffer_view().get_record_count() + epoch->get_structure()->get_record_count(); epoch->end_job(); @@ -116,8 +115,7 @@ public: } size_t get_tombstone_count() { - auto epoch = get_active_epoch(); - epoch->start_job(); + auto epoch = get_active_epoch_protected(); auto t = epoch->get_buffer_view().get_tombstone_count() + epoch->get_structure()->get_tombstone_count(); epoch->end_job(); @@ -129,8 +127,7 @@ public: } size_t get_memory_usage() { - auto epoch = get_active_epoch(); - epoch->start_job(); + auto epoch = get_active_epoch_protected(); auto t= epoch->get_buffer_view().get_memory_usage() + epoch->get_structure()->get_memory_usage(); epoch->end_job(); @@ -138,8 +135,7 @@ public: } size_t get_aux_memory_usage() { - auto epoch = get_active_epoch(); - epoch->start_job(); + auto epoch = get_active_epoch_protected(); auto t = epoch->get_buffer_view().get_aux_memory_usage() + epoch->get_structure()->get_aux_memory_usage(); epoch->end_job(); @@ -151,9 +147,8 @@ public: } Shard *create_static_structure() { - auto epoch = get_active_epoch(); + auto epoch = get_active_epoch_protected(); auto bv = epoch->get_buffer_view(); - epoch->start_job(); auto vers = epoch->get_structure(); std::vector shards; @@ -219,6 +214,11 @@ private: return m_epochs[m_current_epoch.load()]; } + _Epoch *get_active_epoch_protected() { + m_epochs[m_current_epoch.load()]->start_job(); + return m_epochs[m_current_epoch.load()]; + } + void advance_epoch() { size_t new_epoch_num = m_current_epoch.load() + 1; _Epoch *new_epoch = m_epochs[new_epoch_num]; @@ -241,6 +241,12 @@ private: * structure will be a shallow copy of the old one's. */ _Epoch *create_new_epoch() { + /* + * This epoch access is _not_ protected under the assumption that + * only one merge will be able to trigger at a time. If that condition + * is violated, it is possible that this code will clone a retired + * epoch. + */ auto new_epoch = get_active_epoch()->clone(); std::unique_lock m_struct_lock; m_versions.insert(new_epoch->get_structure()); @@ -311,6 +317,8 @@ private: MergeArgs *args = (MergeArgs *) arguments; Structure *vers = args->epoch->get_structure(); + // FIXME: with an improved shard interface, multiple full buffers + // could be merged at once here. 
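+ // (editor's note: until then, only the first buffer,
+ // get_buffers()[0], is folded into the structure below.)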
Buffer *buff = (Buffer *) args->epoch->get_buffers()[0]; for (ssize_t i=args->merges.size() - 1; i>=0; i--) { @@ -387,24 +395,23 @@ private: } std::future> schedule_query(void *query_parms) { - auto epoch = get_active_epoch(); - epoch->start_job(); + auto epoch = get_active_epoch_protected(); QueryArgs *args = new QueryArgs(); args->epoch = epoch; args->query_parms = query_parms; + auto result = args->result_set.get_future(); + m_sched.schedule_job(async_query, 0, args); - return args->result_set.get_future(); + return result; } int internal_append(const R &rec, bool ts) { Buffer *buffer = nullptr; do { - auto epoch = get_active_epoch(); - - while (!(buffer = epoch->get_active_buffer())) - ; + auto epoch = get_active_epoch_protected(); + buffer = epoch->get_active_buffer(); /* if the buffer is full, schedule a merge and add a new empty buffer */ if (buffer->is_full()) { @@ -413,7 +420,8 @@ private: schedule_merge(); buffer = add_empty_buffer(epoch); } - + // FIXME: not exactly the best spot for this + epoch->end_job(); } while(!buffer->append(rec, ts)); /* internal append should always succeed, eventually */ diff --git a/include/framework/ShardRequirements.h b/include/framework/ShardRequirements.h index d2d4ff2..55e7199 100644 --- a/include/framework/ShardRequirements.h +++ b/include/framework/ShardRequirements.h @@ -1,4 +1,12 @@ /* + * include/framework/ShardRequirements.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. + * + * A header file containing the necessary includes for Shard + * development. * */ #pragma once diff --git a/include/framework/interface/Query.h b/include/framework/interface/Query.h index 9b1d2d6..21cadcb 100644 --- a/include/framework/interface/Query.h +++ b/include/framework/interface/Query.h @@ -1,7 +1,7 @@ /* - * include/framework/QueryInterface.h + * include/framework/interface/Query.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * * All rights reserved. Published under the Modified BSD License. 
* @@ -10,25 +10,29 @@ #include #include + #include "util/types.h" +// FIXME: The interface is not completely specified yet, as it is pending +// determining a good way to handle additional template arguments +// to get the Shard and Record types into play template concept QueryInterface = requires(Q q, void *p, std::vector &s) { -/* - {q.get_query_state(p, p)} -> std::convertible_to; - {q.get_buffer_query_state(p, p)}; - {q.query(p, p)}; - {q.buffer_query(p, p)}; - {q.merge()}; - {q.delete_query_state(p)}; -*/ - {Q::EARLY_ABORT} -> std::convertible_to; - {Q::SKIP_DELETE_FILTER} -> std::convertible_to; - //{Q::get_query_state(p, p)} -> std::convertible_to; - //{Q::get_buffer_query_state(p, p)} -> std::convertible_to; + + /* + {Q::get_query_state(p, p)} -> std::convertible_to; + {Q::get_buffer_query_state(p, p)} -> std::convertible_to; + */ {Q::process_query_states(p, s, s)}; + /* + {Q::query(s, p, p)} -> std::convertible_to>>; + {Q::buffer_query(p, p)} -> std::convertible_to>>; + {Q::merge(rv, p)} -> std::convertible_to>; + */ {Q::delete_query_state(std::declval())} -> std::same_as; - {Q::delete_buffer_query_state(p)}; + {Q::delete_buffer_query_state(std::declval())} -> std::same_as; + {Q::EARLY_ABORT} -> std::convertible_to; + {Q::SKIP_DELETE_FILTER} -> std::convertible_to; }; diff --git a/include/framework/interface/Record.h b/include/framework/interface/Record.h index 1ef1984..bf495df 100644 --- a/include/framework/interface/Record.h +++ b/include/framework/interface/Record.h @@ -1,11 +1,12 @@ /* - * include/framework/RecordInterface.h + * include/framework/interface/Record.h * * Copyright (C) 2023 Douglas Rumbaugh - * Dong Xie * * All rights reserved. Published under the Modified BSD License. * + * FIXME: the record implementations could probably be broken out into + * different files, leaving only the interface here */ #pragma once diff --git a/include/framework/interface/Scheduler.h b/include/framework/interface/Scheduler.h index e8ffd08..63581d2 100644 --- a/include/framework/interface/Scheduler.h +++ b/include/framework/interface/Scheduler.h @@ -1,7 +1,7 @@ /* - * include/framework/QueryInterface.h + * include/framework/interface/Scheduler.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * * All rights reserved. Published under the Modified BSD License. * diff --git a/include/framework/interface/Shard.h b/include/framework/interface/Shard.h index ea58b2a..d3a6cf8 100644 --- a/include/framework/interface/Shard.h +++ b/include/framework/interface/Shard.h @@ -1,7 +1,7 @@ /* - * include/framework/ShardInterface.h + * include/framework/interface/Shard.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * * All rights reserved. Published under the Modified BSD License. 
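(Editor's note: the QueryInterface revision above and the ShardInterface hunk below both rely on C++20 requires-expressions. A small, self-contained illustration of the pattern, with invented names (ToyShardInterface, is_empty) that are not framework code:

    #include <concepts>
    #include <cstddef>

    /* A toy concept in the same style as ShardInterface: compile-time
     * verification that a type exposes the required members, with
     * result types convertible to the expected ones. */
    template <typename S>
    concept ToyShardInterface = requires(S s) {
        { s.get_record_count() } -> std::convertible_to<std::size_t>;
        { s.get_memory_usage() } -> std::convertible_to<std::size_t>;
    };

    /* Framework-style code then constrains its templates on the concept,
     * so a shard type missing a member fails with a pointed diagnostic
     * at the call site rather than deep inside an instantiation. */
    template <ToyShardInterface S>
    bool is_empty(S &shard) {
        return shard.get_record_count() == 0;
    }
)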
* @@ -15,12 +15,22 @@ namespace de { -//template typename S, typename R> +// FIXME: The interface is not completely specified yet, as it is pending +// determining a good way to handle additional template arguments +// to get the Record type into play template -concept ShardInterface = requires(S s, void *p, bool b) { - //{s.point_lookup(r, b) } -> std::same_as; +concept ShardInterface = requires(S s, S **spp, void *p, bool b, size_t i) { + {S(spp, i)}; + /* + {S(mutable buffer)} + {s.point_lookup(r, b) } -> std::convertible_to + */ + {s.get_data()} -> std::convertible_to; + {s.get_record_count()} -> std::convertible_to; + {s.get_tombstone_count()} -> std::convertible_to; {s.get_memory_usage()} -> std::convertible_to; + {s.get_aux_memory_usage()} -> std::convertible_to; }; } diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 87463bd..03cbb62 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -2,7 +2,6 @@ * include/framework/scheduling/Epoch.h * * Copyright (C) 2023 Douglas B. Rumbaugh - * Dong Xie * * All rights reserved. Published under the Modified BSD License. * diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h index 7ccab26..5425c4f 100644 --- a/include/framework/scheduling/FIFOScheduler.h +++ b/include/framework/scheduling/FIFOScheduler.h @@ -1,8 +1,7 @@ /* - * include/framework/Scheduler.h + * include/framework/scheduling/FIFOScheduler.h * * Copyright (C) 2023 Douglas B. Rumbaugh - * Dong Xie * * All rights reserved. Published under the Modified BSD License. * diff --git a/include/framework/scheduling/Scheduler.h b/include/framework/scheduling/Scheduler.h deleted file mode 100644 index 992cbf9..0000000 --- a/include/framework/scheduling/Scheduler.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * include/framework/Scheduler.h - * - * Copyright (C) 2023 Douglas Rumbaugh - * Dong Xie - * - * All rights reserved. Published under the Modified BSD License. - * - */ -#pragma once - -#include -#include -#include -#include -#include - -#include "util/types.h" -#include "framework/interface/Shard.h" -#include "framework/interface/Query.h" -#include "framework/interface/Record.h" -#include "framework/structure/MutableBuffer.h" -#include "framework/util/Configuration.h" -#include "framework/structure/ExtensionStructure.h" - -namespace de { - -template -class Scheduler { - typedef ExtensionStructure Structure; - typedef MutableBuffer Buffer; -public: - /* - * Memory budget stated in bytes, with 0 meaning unlimited. Likewise, 0 threads means - * unlimited. - */ - Scheduler(size_t memory_budget, size_t thread_cnt) - : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) - , m_thread_cnt((thread_cnt) ? 
thread_cnt : UINT64_MAX) - , m_used_memory(0) - , m_used_threads(0) - , m_shutdown(false) - { - m_sched_thrd = std::thread(&Scheduler::run_scheduler, this); - } - - ~Scheduler() { - m_shutdown = true; - - m_cv.notify_all(); - m_sched_thrd.join(); - } - - bool schedule_merge(Structure *version, MutableBuffer *buffer) { - /* - * temporary hack - */ - pending_version = version; - pending_buffer = buffer; - - /* - * Get list of individual level reconstructions that are necessary - * for completing the overall merge - */ - std::vector merges = version->get_merge_tasks(buffer->get_record_count()); - - /* - * Schedule the merge tasks (FIXME: currently this just - * executes them sequentially in a blocking fashion) - */ - for (ssize_t i=0; iget_record_count() * sizeof(R) * 2; - buffer_merge.m_timestamp = m_timestamp.fetch_add(1); - m_merge_queue_lock.lock(); - m_merge_queue.push(buffer_merge); - m_merge_queue_lock.unlock(); - - m_cv.notify_all(); - do { - std::unique_lock merge_cv_lock(m_merge_cv_lock); - m_merge_cv.wait(merge_cv_lock); - } while (m_merge_queue.size() > 0); - - assert(version->get_levels()[version->get_levels().size() - 1]->get_shard(0)->get_tombstone_count() == 0); - - return true; - } - -private: - size_t get_timestamp() { - auto ts = m_timestamp.fetch_add(1); - return ts; - } - - void schedule_next_task() { - m_merge_queue_lock.lock(); - auto task = m_merge_queue.top(); - m_merge_queue.pop(); - m_merge_queue_lock.unlock(); - - if (task.m_source_level == -1 && task.m_target_level == 0) { - run_buffer_merge(pending_buffer, pending_version); - } else { - run_merge(task, pending_version); - } - - if (m_merge_queue.size() == 0) { - m_merge_cv.notify_all(); - } - } - - - void run_merge(MergeTask task, Structure *version) { - version->merge_levels(task.m_target_level, task.m_source_level); - - if (!version->validate_tombstone_proportion(task.m_target_level)) { - auto tasks = version->get_merge_tasks(task.m_target_level); - /* - * Schedule the merge tasks (FIXME: currently this just - * executes them sequentially in a blocking fashion) - */ - for (ssize_t i=tasks.size()-1; i>=0; i--) { - tasks[i].m_timestamp = m_timestamp.fetch_add(1); - m_merge_queue_lock.lock(); - m_merge_queue.push(tasks[i]); - m_merge_queue_lock.unlock(); - } - } - } - - - void run_buffer_merge(Buffer *buffer, Structure *version) { - version->merge_buffer(buffer); - if (!version->validate_tombstone_proportion(0)) { - auto tasks = version->get_merge_tasks_from_level(0); - - /* - * Schedule the merge tasks (FIXME: currently this just - * executes them sequentially in a blocking fashion) - */ - for (ssize_t i=tasks.size()-1; i>=0; i--) { - tasks[i].m_timestamp = m_timestamp.fetch_add(1); - m_merge_queue_lock.lock(); - m_merge_queue.push(tasks[i]); - m_merge_queue_lock.unlock(); - } - } - } - - void run_scheduler() { - do { - std::unique_lock cv_lock(m_cv_lock); - m_cv.wait(cv_lock); - - while (m_merge_queue.size() > 0 && m_used_threads.load() < m_thread_cnt) { - schedule_next_task(); - } - cv_lock.unlock(); - } while(!m_shutdown); - } - - size_t m_memory_budget; - size_t m_thread_cnt; - - Buffer *pending_buffer; - Structure *pending_version; - - alignas(64) std::atomic m_used_memory; - alignas(64) std::atomic m_used_threads; - alignas(64) std::atomic m_timestamp; - - std::priority_queue, std::greater> m_merge_queue; - std::mutex m_merge_queue_lock; - - std::mutex m_cv_lock; - std::condition_variable m_cv; - - std::mutex m_merge_cv_lock; - std::condition_variable m_merge_cv; - - std::thread m_sched_thrd; - - bool 
m_shutdown; -}; - -} diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index d25c7c0..228665f 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -1,9 +1,13 @@ /* + * include/framework/scheduling/Task.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. * */ #pragma once -#include #include #include diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 8dff2ef..ccd3dac 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -1,7 +1,7 @@ /* * include/framework/structure/BufferView.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * * All rights reserved. Published under the Modified BSD License. * diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index de965ae..1f365ae 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -1,7 +1,7 @@ /* - * include/framework/ExtensionStructure.h + * include/framework/structure/ExtensionStructure.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 342a2c7..7a7b98c 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -1,7 +1,7 @@ /* - * include/framework/InternalLevel.h + * include/framework/structure/InternalLevel.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 974dc28..e0a6962 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -1,7 +1,7 @@ /* - * include/framework/MutableBuffer.h + * include/framework/structure/MutableBuffer.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. diff --git a/include/framework/util/Configuration.h b/include/framework/util/Configuration.h index 9d8248f..ec4ec3a 100644 --- a/include/framework/util/Configuration.h +++ b/include/framework/util/Configuration.h @@ -1,8 +1,7 @@ /* - * include/framework/DynamicExtension.h + * include/framework/util/Configuration.h * - * Copyright (C) 2023 Douglas Rumbaugh - * Dong Xie + * Copyright (C) 2023 Douglas B. Rumbaugh * * All rights reserved. Published under the Modified BSD License. * diff --git a/include/shard/MemISAM.h b/include/shard/MemISAM.h index f9c621e..8ca5cee 100644 --- a/include/shard/MemISAM.h +++ b/include/shard/MemISAM.h @@ -1,7 +1,7 @@ /* * include/shard/MemISAM.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. 
@@ -264,6 +264,10 @@ public: return m_internal_node_cnt * inmem_isam_node_size + m_alloc_size; } + size_t get_aux_memory_usage() { + return 0; + } + private: size_t get_lower_bound(const K& key) const { const InMemISAMNode* now = m_root; diff --git a/include/shard/PGM.h b/include/shard/PGM.h index d960e70..6d76376 100644 --- a/include/shard/PGM.h +++ b/include/shard/PGM.h @@ -235,6 +235,10 @@ public: return m_pgm.size_in_bytes() + m_alloc_size; } + size_t get_aux_memory_usage() { + return 0; + } + size_t get_lower_bound(const K& key) const { auto bound = m_pgm.search(key); size_t idx = bound.lo; diff --git a/include/shard/TrieSpline.h b/include/shard/TrieSpline.h index 98153c0..a784a38 100644 --- a/include/shard/TrieSpline.h +++ b/include/shard/TrieSpline.h @@ -250,6 +250,10 @@ public: return m_ts.GetSize() + m_alloc_size; } + size_t get_aux_memory_usage() { + return 0; + } + private: size_t get_lower_bound(const K& key) const { diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h index 0e998d9..d9a15b1 100644 --- a/include/shard/VPTree.h +++ b/include/shard/VPTree.h @@ -1,9 +1,9 @@ /* * include/shard/VPTree.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * - * All outsides reserved. Published under the Modified BSD License. + * All rights reserved. Published under the Modified BSD License. * */ #pragma once @@ -240,6 +240,11 @@ public: return m_node_cnt * sizeof(vpnode) + m_reccnt * sizeof(R*) + m_alloc_size; } + size_t get_aux_memory_usage() { + return 0; + } + + private: vpnode *build_vptree() { diff --git a/include/shard/WIRS.h b/include/shard/WIRS.h index 83573c8..bf29325 100644 --- a/include/shard/WIRS.h +++ b/include/shard/WIRS.h @@ -2,7 +2,7 @@ * include/shard/WIRS.h * * Copyright (C) 2023 Dong Xie - * Douglas Rumbaugh + * Douglas B. Rumbaugh * * All rights reserved. Published under the Modified BSD License. * @@ -260,6 +260,10 @@ public: return m_alloc_size + m_node_cnt * sizeof(wirs_node>); } + size_t get_aux_memory_usage() { + return 0; + } + private: size_t get_lower_bound(const K& key) const { diff --git a/include/shard/WSS.h b/include/shard/WSS.h index 87b016c..4e3a326 100644 --- a/include/shard/WSS.h +++ b/include/shard/WSS.h @@ -1,8 +1,8 @@ /* * include/shard/WSS.h * - * Copyright (C) 2023 Dong Xie - * Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh + * Dong Xie * * All rights reserved. Published under the Modified BSD License. * @@ -243,6 +243,10 @@ public: return m_alloc_size; } + size_t get_aux_memory_usage() { + return 0; + } + private: size_t get_lower_bound(const K& key) const { diff --git a/include/util/Cursor.h b/include/util/Cursor.h index 1cf20e1..00afaab 100644 --- a/include/util/Cursor.h +++ b/include/util/Cursor.h @@ -1,11 +1,13 @@ /* * include/util/Cursor.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. * + * A simple record cursor type with associated methods for help in + * merging record sets when constructing shards. */ #pragma once diff --git a/include/util/bf_config.h b/include/util/bf_config.h index 2390643..4de465d 100644 --- a/include/util/bf_config.h +++ b/include/util/bf_config.h @@ -1,11 +1,17 @@ /* * include/util/bf_config.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. 
* + * Global parameters for configuring bloom filters used as auxiliary + * structures on shards within the framework. The bloom filters themselves + * can be found in + * + * $PROJECT_ROOT/external/psudb-common/cpp/include/psu-ds/BloomFilter.h + * */ #pragma once @@ -13,13 +19,25 @@ namespace de { +/* global variable for specifying bloom filter FPR */ static double BF_FPR = .01; + +/* global variable for specifying number of BF hash functions (k) */ static size_t BF_HASH_FUNCS = 7; +/* + * Adjust the value of BF_FPR. The argument must be on the interval + * (0, 1), or the behavior of bloom filters is undefined. + */ static void BF_SET_FPR(double fpr) { + BF_FPR = fpr; } +/* + * Adjust the value of BF_HASH_FUNCS. The argument must be on the interval + * (0, INT64_MAX], or the behavior of bloom filters is undefined. + */ static void BF_SET_HASHFUNC(size_t func_cnt) { BF_HASH_FUNCS = func_cnt; } diff --git a/include/util/types.h b/include/util/types.h index 3010e78..b7f9607 100644 --- a/include/util/types.h +++ b/include/util/types.h @@ -1,11 +1,11 @@ /* * include/util/types.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * * All rights reserved. Published under the Modified BSD License. * - * A centralized header file for various datatypes used throughout the + * A centralized header file for various data types used throughout the * code base. There are a few very specific types, such as header formats, * that are defined within the header files that make direct use of them, * but all generally usable, simple types are defined here. @@ -22,33 +22,41 @@ namespace de { using std::byte; -// Represents a page offset within a specific file (physical or virtual) +/* Represents a page offset within a specific file (physical or virtual) */ typedef uint32_t PageNum; -// Byte offset within a page. Also used for lengths of records, etc., -// within the codebase. size_t isn't necessary, as the maximum offset -// is only parm::PAGE_SIZE +/* + * Byte offset within a page. Also used for lengths of records, etc., + * within the codebase. size_t isn't necessary, as the maximum offset + * is only parm::PAGE_SIZE + */ typedef uint16_t PageOffset; -// A unique identifier for a frame within a buffer or cache. +/* A unique identifier for a frame within a buffer or cache */ typedef int32_t FrameId; -// A unique timestamp for use in MVCC concurrency control. Currently stored in -// record headers, but not used by anything. +/* + * A unique timestamp for use in MVCC concurrency control. Currently stored in + * record headers, but not used by anything. + */ typedef uint32_t Timestamp; const Timestamp TIMESTAMP_MIN = 0; const Timestamp TIMESTAMP_MAX = UINT32_MAX; -// Invalid values for various IDs. Used throughout the code base to indicate -// uninitialized values and error conditions. +/* + * Invalid values for various IDs. Used throughout the code base to indicate + * uninitialized values and error conditions. + */ const PageNum INVALID_PNUM = 0; const FrameId INVALID_FRID = -1; -// An ID for a given shard within the index. The level_idx is the index -// in the memory_levels and disk_levels vectors corresponding to the -// shard, and the shard_idx is the index with the level (always 0 in the -// case of leveling). Note that the two vectors of levels are treated -// as a contiguous index space. +/* + * An ID for a given shard within the index. 
The level_idx is the index + * in the memory_levels and disk_levels vectors corresponding to the + * shard, and the shard_idx is the index with the level (always 0 in the + * case of leveling). Note that the two vectors of levels are treated + * as a contiguous index space. + */ struct ShardID { ssize_t level_idx; ssize_t shard_idx; @@ -58,12 +66,7 @@ struct ShardID { } }; +/* A placeholder for an invalid shard--also used to indicate the mutable buffer */ const ShardID INVALID_SHID = {-1, -1}; -struct SampleRange { - ShardID shid; - size_t low; - size_t high; -}; - } -- cgit v1.2.3 From c00900c5bfbc23537bf7084a927e7fd2ef0a5c94 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 31 Oct 2023 11:01:05 -0400 Subject: DynamicExtension: added a way to block on merge completion This is mostly just for testing purposes at the moment, though I'd imagine it may be useful for other reasons too. --- include/framework/DynamicExtension.h | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 9129060..2f0327f 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -146,7 +146,11 @@ public: return m_buffer_capacity; } - Shard *create_static_structure() { + Shard *create_static_structure(bool await_merge_completion=false) { + if (await_merge_completion) { + await_next_epoch(); + } + auto epoch = get_active_epoch_protected(); auto bv = epoch->get_buffer_view(); @@ -186,6 +190,19 @@ public: return flattened; } + /* + * If the current epoch is *not* the newest one, then wait for + * the newest one to become available. Otherwise, returns immediately. + */ + void await_next_epoch() { + while (m_current_epoch.load() != m_newest_epoch.load()) { + std::unique_lock m_epoch_cv_lk; + m_epoch_cv.wait(m_epoch_cv_lk); + } + + return; + } + /* * Mostly exposed for unit-testing purposes. Verifies that the current * active version of the ExtensionStructure doesn't violate the maximum @@ -203,8 +220,12 @@ private: std::set m_versions; std::atomic m_current_epoch; + std::atomic m_newest_epoch; std::unordered_map m_epochs; + std::condition_variable m_epoch_cv; + std::mutex m_epoch_cv_lk; + size_t m_scale_factor; double m_max_delete_prop; size_t m_buffer_capacity; @@ -220,7 +241,7 @@ private: } void advance_epoch() { - size_t new_epoch_num = m_current_epoch.load() + 1; + size_t new_epoch_num = m_newest_epoch.load(); _Epoch *new_epoch = m_epochs[new_epoch_num]; _Epoch *old_epoch = m_epochs[m_current_epoch.load()]; @@ -233,6 +254,12 @@ private: new_epoch->add_buffer(old_epoch->get_buffers()[i]); } m_current_epoch.fetch_add(1); + + /* notify any blocking threads that the new epoch is available */ + m_epoch_cv_lk.lock(); + m_epoch_cv.notify_all(); + m_epoch_cv_lk.unlock(); + retire_epoch(old_epoch); } @@ -250,7 +277,8 @@ private: auto new_epoch = get_active_epoch()->clone(); std::unique_lock m_struct_lock; m_versions.insert(new_epoch->get_structure()); - m_epochs.insert({m_current_epoch.load() + 1, new_epoch}); + m_newest_epoch.fetch_add(1); + m_epochs.insert({m_newest_epoch.load(), new_epoch}); m_struct_lock.release(); return new_epoch; -- cgit v1.2.3 From 28b036025d35853e2ff4ad0bd0c581768f93ece0 Mon Sep 17 00:00:00 2001 From: "Douglas B. 
Rumbaugh" Date: Tue, 31 Oct 2023 11:04:18 -0400 Subject: VPTree Shard: updates to build on my desktop --- include/shard/VPTree.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h index d9a15b1..978372b 100644 --- a/include/shard/VPTree.h +++ b/include/shard/VPTree.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "framework/ShardRequirements.h" -- cgit v1.2.3 From 62792753bb4df2515e5e2d8cc48bca568c5379fd Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 31 Oct 2023 11:48:56 -0400 Subject: Epoch: Creating an epoch now takes references on buffers + versions When an epoch is created using the constructor Epoch(Structure, Buffer), it will call take_reference() on both. This was necessary to ensure that the destructor doesn't fail, as it releases references and fails if the refcnt is 0. It releases the user of the object from the burden of manually taking references in this situation. --- include/framework/scheduling/Epoch.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 03cbb62..6bbf927 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -33,6 +33,8 @@ public: , m_structure(structure) , m_active_jobs(0) { + structure->take_reference(); + buff->take_reference(); m_buffers.push_back(buff); } -- cgit v1.2.3 From 1b8bec5ea882584aba62c92d1ab6ffaf03e7b9b5 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 31 Oct 2023 11:51:46 -0400 Subject: DynamicExtension: fixed some Epoch-related bugs The epochs must be released in the destructor prior to releasing the buffers and structures, as otherwise there are references remaining to these objects and their destructors will fail. Additionally, fixed a bug in the constructor resulting in a memory leak due to allocating an extra starting version and buffer. --- include/framework/DynamicExtension.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 2f0327f..a1f7c2b 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -55,12 +55,16 @@ public: auto vers = new Structure(m_buffer_capacity, m_scale_factor, m_max_delete_prop); auto epoch = new _Epoch(vers, buf); - m_buffers.insert(new Buffer(buffer_cap, max_delete_prop*buffer_cap)); - m_versions.insert(new Structure(buffer_cap, scale_factor, max_delete_prop)); + m_buffers.insert(buf); + m_versions.insert(vers); m_epochs.insert({0, epoch}); } ~DynamicExtension() { + for (auto e : m_epochs) { + delete e.second; + } + for (auto e : m_buffers) { delete e; } @@ -68,10 +72,6 @@ public: for (auto e : m_versions) { delete e; } - - for (auto e : m_epochs) { - delete e.second; - } } int insert(const R &rec) { -- cgit v1.2.3 From 7163b8db0ee5acc099a228090a4bdee379c1c8af Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 31 Oct 2023 11:53:08 -0400 Subject: SerialScheduler: added a single-threaded scheduler Added a new scheduler for ensuring single-threaded operation. Additionally, added a static assert to (at least for now) restrict the use of tagging to this single threaded scheduler. 
--- include/framework/DynamicExtension.h | 2 + include/framework/scheduling/SerialScheduler.h | 67 ++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 include/framework/scheduling/SerialScheduler.h (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index a1f7c2b..3a4a7e1 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -26,6 +26,7 @@ #include "framework/util/Configuration.h" #include "framework/scheduling/FIFOScheduler.h" +#include "framework/scheduling/SerialScheduler.h" #include "framework/scheduling/Epoch.h" #include "psu-util/timer.h" @@ -82,6 +83,7 @@ public: // FIXME: delete tagging will require a lot of extra work to get // operating "correctly" in a concurrent environment. if constexpr (D == DeletePolicy::TAGGING) { + static_assert(std::same_as, "Tagging is only supported in single-threaded operation"); BufView buffers = get_active_epoch()->get_buffer_view(); if (get_active_epoch()->get_structure()->tagged_delete(rec)) { diff --git a/include/framework/scheduling/SerialScheduler.h b/include/framework/scheduling/SerialScheduler.h new file mode 100644 index 0000000..9c767e8 --- /dev/null +++ b/include/framework/scheduling/SerialScheduler.h @@ -0,0 +1,67 @@ +/* + * include/framework/scheduling/SerialScheduler.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. + * + * IMPORTANT: This "scheduler" is a shim implementation for allowing + * strictly serial, single-threaded operation of the framework. It should + * never be used in multi-threaded contexts. A call to the schedule_job + * function will immediately run the job and block on its completion before + * returning. + * + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "util/types.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" +#include "framework/interface/Record.h" +#include "framework/structure/MutableBuffer.h" +#include "framework/util/Configuration.h" +#include "framework/structure/ExtensionStructure.h" +#include "framework/scheduling/Task.h" + +namespace de { + +class SerialScheduler { +public: + SerialScheduler(size_t memory_budget, size_t thread_cnt) + : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) + , m_thrd_cnt((thread_cnt) ? thread_cnt: UINT64_MAX) + , m_used_memory(0) + , m_used_thrds(0) + , m_counter(0) + {} + + ~SerialScheduler() = default; + + void schedule_job(std::function job, size_t size, void *args) { + size_t ts = m_counter++; + auto t = Task(size, ts, job, args); + t(); + } + + void shutdown() { + /* intentionally left blank */ + } + +private: + size_t m_memory_budget; + size_t m_thrd_cnt; + + size_t m_used_thrds; + size_t m_used_memory; + + size_t m_counter; +}; + +} -- cgit v1.2.3 From 786a1cf5ab76f94a1adece48c1de53fb32e4551e Mon Sep 17 00:00:00 2001 From: "Douglas B. 
Rumbaugh" Date: Tue, 31 Oct 2023 11:54:09 -0400 Subject: FIFOScheduler: fixed a few synchronization issues --- include/framework/scheduling/FIFOScheduler.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h index 5425c4f..91a72b3 100644 --- a/include/framework/scheduling/FIFOScheduler.h +++ b/include/framework/scheduling/FIFOScheduler.h @@ -43,23 +43,20 @@ public: ~FIFOScheduler() { shutdown(); - std::unique_lock lk(m_cv_lock); m_cv.notify_all(); - lk.release(); - m_sched_thrd.join(); } void schedule_job(std::function job, size_t size, void *args) { + std::unique_lock lk(m_cv_lock); size_t ts = m_counter.fetch_add(1); m_task_queue.push(Task(size, ts, job, args)); - std::unique_lock lk(m_cv_lock); m_cv.notify_all(); } void shutdown() { - m_shutdown = true; + m_shutdown.store(true); } private: @@ -68,7 +65,7 @@ private: size_t m_memory_budget; size_t m_thrd_cnt; - bool m_shutdown; + std::atomic m_shutdown; std::atomic m_counter; std::mutex m_cv_lock; @@ -80,6 +77,7 @@ private: std::atomic m_used_memory; void schedule_next() { + assert(m_task_queue.size() > 0); auto t = m_task_queue.pop(); t(); } @@ -92,8 +90,7 @@ private: while (m_task_queue.size() > 0 && m_used_thrds.load() < m_thrd_cnt) { schedule_next(); } - cv_lock.unlock(); - } while(!m_shutdown); + } while(!m_shutdown.load()); } }; -- cgit v1.2.3 From df59a313ae18a6968daa2662dc39a8065d92cfcb Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 31 Oct 2023 11:54:24 -0400 Subject: MemISAM: updated to new query interface --- include/shard/MemISAM.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/shard/MemISAM.h b/include/shard/MemISAM.h index 8ca5cee..00fb467 100644 --- a/include/shard/MemISAM.h +++ b/include/shard/MemISAM.h @@ -589,7 +589,7 @@ public: return res; } - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { + static void process_query_states(void *query_parms, std::vector &shard_states, std::vector &buffer_states) { return; } -- cgit v1.2.3 From 230831243a61f1ca1b1dd4319a4c5224b15d2657 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 31 Oct 2023 12:05:58 -0400 Subject: ExtensionStructure: fixed incorrect constructor args in clone() --- include/framework/structure/ExtensionStructure.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 1f365ae..2ced439 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -54,7 +54,7 @@ public: * need to be forwarded to the appropriate structures manually. */ ExtensionStructure *copy() { - auto new_struct = new ExtensionStructure(m_scale_factor, m_max_delete_prop, m_buffer_size); + auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, m_max_delete_prop); for (size_t i=0; im_levels.push_back(m_levels[i]->clone()); } @@ -432,7 +432,7 @@ private: * vector. */ inline bool can_merge_with(level_index idx, size_t incoming_rec_cnt) { - if (idx>= m_levels.size() || !m_levels[idx]) { + if (idx >= m_levels.size() || !m_levels[idx]) { return false; } -- cgit v1.2.3 From ca729108869b4143f1eea31f6dde9195decfec9c Mon Sep 17 00:00:00 2001 From: "Douglas B. 
Rumbaugh" Date: Tue, 31 Oct 2023 12:14:57 -0400 Subject: MutableBuffer: removed most concurrency control stuff The buffer isn't responsible for a lot of CC anymore (just the append operation), so this code was no longer necessary. Also removed the only calls to some of these CC operations within the rest of the framework. --- include/framework/structure/ExtensionStructure.h | 6 +-- include/framework/structure/MutableBuffer.h | 59 ++++-------------------- 2 files changed, 13 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 2ced439..f5657af 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -93,11 +93,11 @@ public: inline bool merge_buffer(Buffer *buffer) { assert(can_merge_with(0, buffer->get_record_count())); + // FIXME: this step makes an extra copy of the buffer, + // which could be avoided by adjusting the shard + // reconstruction process a bit, possibly. buffer->start_merge(); merge_buffer_into_l0(buffer); - buffer->finish_merge(); - - buffer->truncate(); return true; } diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index e0a6962..a70b86b 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -42,13 +42,10 @@ public: } m_refcnt.store(0); - m_deferred_truncate.store(false); - m_merging.store(false); } ~MutableBuffer() { assert(m_refcnt.load() == 0); - assert(m_merging.load() == false); if (m_data) free(m_data); if (m_tombstone_filter) delete m_tombstone_filter; @@ -90,23 +87,12 @@ public: } bool truncate() { - - while (active_merge() || m_refcnt.load() > 0) - ; - - m_merge_lock.lock(); - - while (m_refcnt > 0) - ; - m_tombstonecnt.store(0); m_reccnt.store(0); m_weight.store(0); m_max_weight.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); - m_merge_lock.unlock(); - return true; } @@ -176,26 +162,15 @@ public: return m_max_weight; } + /* + * This operation assumes that no other threads have write access + * to the buffer. This will be the case in normal operation, at + * present, but may change (in which case this approach will need + * to be adjusted). Other threads having read access is perfectly + * acceptable, however. 
+ */ bool start_merge() { - if (m_merge_lock.try_lock()) { - /* there cannot already been an active merge */ - if (m_merging.load()) { - m_merge_lock.unlock(); - return false; - } - - m_merging.store(true); - memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load()); - return true; - } - - /* lock could not be obtained */ - return false; - } - - bool finish_merge() { - m_merge_lock.unlock(); - m_merging.store(false); + memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load()); return true; } @@ -208,12 +183,8 @@ public: } bool release_reference() { + assert(m_refcnt > 0); m_refcnt.fetch_add(-1); - - if (m_refcnt.load() == 0 && m_deferred_truncate.load()) { - assert(this->truncate()); - } - return true; } @@ -221,10 +192,6 @@ public: return m_refcnt.load(); } - bool active_merge() { - return m_merging.load(); - } - private: int32_t try_advance_tail() { size_t new_tail = m_reccnt.fetch_add(1); @@ -245,14 +212,8 @@ private: alignas(64) std::atomic m_reccnt; alignas(64) std::atomic m_weight; alignas(64) std::atomic m_max_weight; - alignas(64) std::atomic m_merging; - alignas(64) std::atomic m_deferred_truncate; - alignas(64) std::atomic m_refcnt; - - alignas(64) std::mutex m_merge_lock; - alignas(64) std::mutex m_trunc_lock; - alignas(64) std::condition_variable m_trunc_signal; + alignas(64) std::atomic m_refcnt; }; } -- cgit v1.2.3 From 68ae6279476e7d37837ac06474fb558e50ce6706 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 31 Oct 2023 12:41:55 -0400 Subject: Fixes for various bugs under SerialScheduler --- include/framework/DynamicExtension.h | 35 ++++++++++++++++-------- include/framework/scheduling/Epoch.h | 31 +++++++++++++++++---- include/framework/structure/ExtensionStructure.h | 5 +++- 3 files changed, 54 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 3a4a7e1..5c1eaab 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -54,7 +54,7 @@ public: { auto buf = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); auto vers = new Structure(m_buffer_capacity, m_scale_factor, m_max_delete_prop); - auto epoch = new _Epoch(vers, buf); + auto epoch = new _Epoch(0, vers, buf); m_buffers.insert(buf); m_versions.insert(vers); @@ -249,13 +249,19 @@ private: /* * Update the new Epoch to contain the buffers from the old one - * that it doesn't currently have + * that it doesn't currently have if using a multi-threaded + * scheduler (otherwise, there is only one buffer that is + * reused, so it shouldn't be removed) */ - size_t old_buffer_cnt = new_epoch->clear_buffers(); - for (size_t i=old_buffer_cnt; iget_buffers().size(); i++) { - new_epoch->add_buffer(old_epoch->get_buffers()[i]); + if constexpr (!std::same_as) { + size_t old_buffer_cnt = new_epoch->clear_buffers(); + for (size_t i=old_buffer_cnt; iget_buffers().size(); i++) { + new_epoch->add_buffer(old_epoch->get_buffers()[i]); + } } + m_current_epoch.fetch_add(1); + old_epoch->set_inactive(); /* notify any blocking threads that the new epoch is available */ m_epoch_cv_lk.lock(); @@ -276,10 +282,10 @@ private: * is violated, it is possible that this code will clone a retired * epoch. 
*/ - auto new_epoch = get_active_epoch()->clone(); + m_newest_epoch.fetch_add(1); + auto new_epoch = get_active_epoch()->clone(m_newest_epoch.load()); std::unique_lock m_struct_lock; m_versions.insert(new_epoch->get_structure()); - m_newest_epoch.fetch_add(1); m_epochs.insert({m_newest_epoch.load(), new_epoch}); m_struct_lock.release(); @@ -316,6 +322,9 @@ private: while (!epoch->retirable()) ; + /* remove epoch from the framework's map */ + m_epochs.erase(epoch->get_epoch_number()); + /* * The epoch's destructor will handle releasing * all the references it holds @@ -440,7 +449,8 @@ private: int internal_append(const R &rec, bool ts) { Buffer *buffer = nullptr; do { - auto epoch = get_active_epoch_protected(); + // FIXME: figure out best way to protect this epoch access + auto epoch = get_active_epoch(); buffer = epoch->get_active_buffer(); /* if the buffer is full, schedule a merge and add a new empty buffer */ @@ -448,10 +458,13 @@ private: // FIXME: possible race here--two identical merges could be scheduled auto vers = epoch->get_structure(); schedule_merge(); - buffer = add_empty_buffer(epoch); + + if constexpr (std::same_as) { + buffer->truncate(); + } else { + buffer = add_empty_buffer(epoch); + } } - // FIXME: not exactly the best spot for this - epoch->end_job(); } while(!buffer->append(rec, ts)); /* internal append should always succeed, eventually */ diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 6bbf927..f4aefe9 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -22,16 +22,20 @@ private: typedef ExtensionStructure Structure; typedef BufferView BufView; public: - Epoch() + Epoch(size_t number=0) : m_buffers() , m_structure(nullptr) , m_active_jobs(0) + , m_active(true) + , m_epoch_number(number) {} - Epoch(Structure *structure, Buffer *buff) + Epoch(size_t number, Structure *structure, Buffer *buff) : m_buffers() , m_structure(structure) , m_active_jobs(0) + , m_active(true) + , m_epoch_number(number) { structure->take_reference(); buff->take_reference(); @@ -62,6 +66,7 @@ public: } void end_job() { + assert(m_active_jobs.load() > 0); m_active_jobs.fetch_add(-1); if (m_active_jobs.load() == 0) { @@ -74,6 +79,10 @@ public: return m_active_jobs.load(); } + size_t get_epoch_number() { + return m_epoch_number; + } + Structure *get_structure() { return m_structure; } @@ -109,18 +118,29 @@ public: /* * Returns a new Epoch object that is a copy of this one. The new object will also contain - * a copy of the m_structure, rather than a reference to the same one. + * a copy of the m_structure, rather than a reference to the same one. The epoch number of + * the new epoch will be set to the provided argument. 
*/ - Epoch *clone() { - auto epoch = new Epoch(); + Epoch *clone(size_t number) { + auto epoch = new Epoch(number); epoch->m_buffers = m_buffers; if (m_structure) { epoch->m_structure = m_structure->copy(); + /* the copy routine returns a structure with 0 references */ + epoch->m_structure->take_reference(); + } + + for (auto b : m_buffers) { + b->take_reference(); } return epoch; } + void set_inactive() { + m_active = false; + } + /* * */ @@ -158,5 +178,6 @@ private: */ std::atomic m_active_jobs; bool m_active; + size_t m_epoch_number; }; } diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index f5657af..80ec7b9 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -46,7 +46,8 @@ public: * Create a shallow copy of this extension structure. The copy will share references to the * same levels/shards as the original, but will have its own lists. As all of the shards are * immutable (with the exception of deletes), the copy can be restructured with merges, etc., - * without affecting the original. + * without affecting the original. The copied structure will be returned with a reference + * count of 0; generally you will want to immediately call take_reference() on it. * * NOTE: When using tagged deletes, a delete of a record in the original structure will affect * the copy, so long as the copy retains a reference to the same shard as the original. This could @@ -59,6 +60,8 @@ public: new_struct->m_levels.push_back(m_levels[i]->clone()); } + new_struct->m_refcnt = 0; + return new_struct; } -- cgit v1.2.3 From 0b723322a611de83872dd83b55d2e10e8886a283 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Thu, 2 Nov 2023 08:01:45 -0400 Subject: started refactoring queries interface --- include/framework/QueryRequirements.h | 17 ++ include/framework/interface/Shard.h | 6 + include/query/irs.h | 216 ++++++++++++++++++++ include/query/rangequery.h | 161 +++++++++++++++ include/shard/MemISAM.h | 361 ---------------------------------- 5 files changed, 400 insertions(+), 361 deletions(-) create mode 100644 include/framework/QueryRequirements.h create mode 100644 include/query/irs.h create mode 100644 include/query/rangequery.h (limited to 'include') diff --git a/include/framework/QueryRequirements.h b/include/framework/QueryRequirements.h new file mode 100644 index 0000000..ff4eaff --- /dev/null +++ b/include/framework/QueryRequirements.h @@ -0,0 +1,17 @@ +/* + * include/framework/QueryRequirements.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. + * + * A header file containing the necessary includes for Shard + * development. 
+ * + */ +#pragma once + +#include "framework/structure/MutableBuffer.h" +#include "framework/interface/Record.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" diff --git a/include/framework/interface/Shard.h b/include/framework/interface/Shard.h index d3a6cf8..40a696b 100644 --- a/include/framework/interface/Shard.h +++ b/include/framework/interface/Shard.h @@ -33,4 +33,10 @@ concept ShardInterface = requires(S s, S **spp, void *p, bool b, size_t i) { {s.get_aux_memory_usage()} -> std::convertible_to; }; +template +concept SortedShardInterface = ShardInterface && requires(S s, R r, R *rp) { + {s.lower_bound(r)} -> std::convertible_to; + {s.upper_bound(r)} -> std::convertible_to; +} + } diff --git a/include/query/irs.h b/include/query/irs.h new file mode 100644 index 0000000..5b09e73 --- /dev/null +++ b/include/query/irs.h @@ -0,0 +1,216 @@ +/* + * include/query/irs.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include "framework/QueryRequirements.h" + +namespace de { namespace irs { + +template +struct Parms { + decltype(R::key) lower_bound; + decltype(R::key) upper_bound; + size_t sample_size; + gsl_rng *rng; +}; + + +template +struct State { + size_t lower_bound; + size_t upper_bound; + size_t sample_size; + size_t total_weight; +}; + +template +struct BufferState { + size_t cutoff; + std::vector> records; + size_t sample_size; +}; + +template +class Query { +public: + constexpr static bool EARLY_ABORT=false; + constexpr static bool SKIP_DELETE_FILTER=false; + + static void *get_query_state(S *shard, void *parms) { + auto res = new State(); + decltype(R::key) lower_key = ((PARMS *) parms)->lower_bound; + decltype(R::key) upper_key = (PARMS *) parms)->upper_bound; + + res->lower_bound = shard->get_lower_bound(lower_key); + res->upper_bound = shard->get_upper_bound(upper_key); + + if (res->lower_bound == shard->get_record_count()) { + res->total_weight = 0; + } else { + res->total_weight = res->upper_bound - res->lower_bound; + } + + res->sample_size = 0; + return res; + } + + static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { + auto res = new BufferState(); + + res->cutoff = buffer->get_record_count(); + res->sample_size = 0; + + if constexpr (Rejection) { + return res; + } + + auto lower_key = ((Parms *) parms)->lower_bound; + auto upper_key = ((Parms *) parms)->upper_bound; + + for (size_t i=0; icutoff; i++) { + if (((buffer->get_data() + i)->rec.key >= lower_key) && ((buffer->get_data() + i)->rec.key <= upper_key)) { + res->records.emplace_back(*(buffer->get_data() + i)); + } + } + + return res; + } + + static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { + auto p = (Parms *) query_parms; + auto bs = (buff_state) ? (BufferState *) buff_state : nullptr; + + std::vector shard_sample_sizes(shard_states.size()+1, 0); + size_t buffer_sz = 0; + + std::vector weights; + if constexpr (Rejection) { + weights.push_back((bs) ? bs->cutoff : 0); + } else { + weights.push_back((bs) ? bs->records.size() : 0); + } + + size_t total_weight = 0; + for (auto &s : shard_states) { + auto state = (State *) s; + total_weight += state->total_weight; + weights.push_back(state->total_weight); + } + + // if no valid records fall within the query range, just + // set all of the sample sizes to 0 and bail out. 
+ if (total_weight == 0) { + for (size_t i=0; i *) shard_states[i]; + state->sample_size = 0; + } + + return; + } + + std::vector normalized_weights; + for (auto w : weights) { + normalized_weights.push_back((double) w / (double) total_weight); + } + + auto shard_alias = Alias(normalized_weights); + for (size_t i=0; isample_size; i++) { + auto idx = shard_alias.get(p->rng); + if (idx == 0) { + buffer_sz++; + } else { + shard_sample_sizes[idx - 1]++; + } + } + + if (bs) { + bs->sample_size = buffer_sz; + } + for (size_t i=0; i *) shard_states[i]; + state->sample_size = shard_sample_sizes[i+1]; + } + } + + static std::vector> query(S *shard, void *q_state, void *parms) { + auto lower_key = ((Parms *) parms)->lower_bound; + auto upper_key = ((Parms *) parms)->upper_bound; + auto rng = ((Parms *) parms)->rng; + + auto state = (State *) q_state; + auto sample_sz = state->sample_size; + + std::vector> result_set; + + if (sample_sz == 0 || state->lower_bound == shard->get_record_count()) { + return result_set; + } + + size_t attempts = 0; + size_t range_length = state->upper_bound - state->lower_bound; + do { + attempts++; + size_t idx = (range_length > 0) ? gsl_rng_uniform_int(rng, range_length) : 0; + result_set.emplace_back(*shard->get_record_at(state->lower_bound + idx)); + } while (attempts < sample_sz); + + return result_set; + } + + static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { + auto st = (BufferState *) state; + auto p = (Parms *) parms; + + std::vector> result; + result.reserve(st->sample_size); + + if constexpr (Rejection) { + for (size_t i=0; isample_size; i++) { + auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); + auto rec = buffer->get_data() + idx; + + if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { + result.emplace_back(*rec); + } + } + + return result; + } + + for (size_t i=0; isample_size; i++) { + auto idx = gsl_rng_uniform_int(p->rng, st->records.size()); + result.emplace_back(st->records[idx]); + } + + return result; + } + + static std::vector merge(std::vector>> &results, void *parms) { + std::vector output; + + for (size_t i=0; i *) state; + delete s; + } + + static void delete_buffer_query_state(void *state) { + auto s = (BufferState *) state; + delete s; + } +}; +}} diff --git a/include/query/rangequery.h b/include/query/rangequery.h new file mode 100644 index 0000000..f9a34d9 --- /dev/null +++ b/include/query/rangequery.h @@ -0,0 +1,161 @@ +/* + * include/query/rangequery.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. 
+ * + */ +#pragma once + +namespace de { namespace rq { + +template +struct Parms { + decltype(R::key) lower_bound; + decltype(R::key) upper_bound; +}; + +template +struct State { + size_t start_idx; + size_t stop_idx; +}; + +template +struct BufferState { + size_t cutoff; +}; + +template +class Query { +public: + constexpr static bool EARLY_ABORT=false; + constexpr static bool SKIP_DELETE_FILTER=true; + + static void *get_query_state(S *shard, void *parms) { + auto res = new State(); + auto p = (Parms *) parms; + + res->start_idx = shard->get_lower_bound(p->lower_bound); + res->stop_idx = shard->get_record_count(); + + return res; + } + + static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { + auto res = new BufferState(); + res->cutoff = buffer->get_record_count(); + + return res; + } + + static void process_query_states(void *query_parms, std::vector &shard_states, std::vector &buffer_states) { + return; + } + + static std::vector> query(S *shard, void *q_state, void *parms) { + std::vector> records; + auto p = (Parms *) parms; + auto s = (State *) q_state; + + // if the returned index is one past the end of the + // records for the PGM, then there are not records + // in the index falling into the specified range. + if (s->start_idx == shard->get_record_count()) { + return records; + } + + auto ptr = shard->get_record_at(s->start_idx); + + // roll the pointer forward to the first record that is + // greater than or equal to the lower bound. + while(ptr->rec.key < p->lower_bound) { + ptr++; + } + + while (ptr->rec.key <= p->upper_bound && ptr < shard->m_data + s->stop_idx) { + records.emplace_back(*ptr); + ptr++; + } + + return records; + } + + static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { + auto p = (Parms *) parms; + auto s = (BufferState *) state; + + std::vector> records; + for (size_t i=0; icutoff; i++) { + auto rec = buffer->get_data() + i; + if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { + records.emplace_back(*rec); + } + } + + return records; + } + + static std::vector merge(std::vector>> &results, void *parms) { + std::vector>> cursors; + cursors.reserve(results.size()); + + PriorityQueue> pq(results.size()); + size_t total = 0; + size_t tmp_n = results.size(); + + + for (size_t i = 0; i < tmp_n; ++i) + if (results[i].size() > 0){ + auto base = results[i].data(); + cursors.emplace_back(Cursor{base, base + results[i].size(), 0, results[i].size()}); + assert(i == cursors.size() - 1); + total += results[i].size(); + pq.push(cursors[i].ptr, tmp_n - i - 1); + } else { + cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); + } + + if (total == 0) { + return std::vector(); + } + + std::vector output; + output.reserve(total); + + while (pq.size()) { + auto now = pq.peek(); + auto next = pq.size() > 1 ? 
pq.peek(1) : queue_record>{nullptr, 0}; + if (!now.data->is_tombstone() && next.data != nullptr && + now.data->rec == next.data->rec && next.data->is_tombstone()) { + + pq.pop(); pq.pop(); + auto& cursor1 = cursors[tmp_n - now.version - 1]; + auto& cursor2 = cursors[tmp_n - next.version - 1]; + if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); + if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); + } else { + auto& cursor = cursors[tmp_n - now.version - 1]; + if (!now.data->is_tombstone()) output.push_back(cursor.ptr->rec); + pq.pop(); + + if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); + } + } + + return output; + } + + static void delete_query_state(void *state) { + auto s = (State *) state; + delete s; + } + + static void delete_buffer_query_state(void *state) { + auto s = (BufferState *) state; + delete s; + } +}; + +}} diff --git a/include/shard/MemISAM.h b/include/shard/MemISAM.h index 00fb467..6962c19 100644 --- a/include/shard/MemISAM.h +++ b/include/shard/MemISAM.h @@ -31,52 +31,6 @@ namespace de { thread_local size_t mrun_cancelations = 0; -template -struct irs_query_parms { - decltype(R::key) lower_bound; - decltype(R::key) upper_bound; - size_t sample_size; - gsl_rng *rng; -}; - -template -class IRSQuery; - -template -struct IRSState { - size_t lower_bound; - size_t upper_bound; - size_t sample_size; - size_t total_weight; -}; - -template -struct IRSBufferState { - size_t cutoff; - std::vector> records; - size_t sample_size; -}; - -template -struct ISAMRangeQueryParms { - decltype(R::key) lower_bound; - decltype(R::key) upper_bound; -}; - -template -class ISAMRangeQuery; - -template -struct ISAMRangeQueryState { - size_t start_idx; - size_t stop_idx; -}; - -template -struct RangeQueryBufferState { - size_t cutoff; -}; - template class MemISAM { private: @@ -384,319 +338,4 @@ private: size_t m_deleted_cnt; size_t m_alloc_size; }; - -template -class IRSQuery { -public: - - constexpr static bool EARLY_ABORT=false; - constexpr static bool SKIP_DELETE_FILTER=false; - - static void *get_query_state(MemISAM *isam, void *parms) { - auto res = new IRSState(); - decltype(R::key) lower_key = ((irs_query_parms *) parms)->lower_bound; - decltype(R::key) upper_key = ((irs_query_parms *) parms)->upper_bound; - - res->lower_bound = isam->get_lower_bound(lower_key); - res->upper_bound = isam->get_upper_bound(upper_key); - - if (res->lower_bound == isam->get_record_count()) { - res->total_weight = 0; - } else { - res->total_weight = res->upper_bound - res->lower_bound; - } - - res->sample_size = 0; - return res; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - auto res = new IRSBufferState(); - - res->cutoff = buffer->get_record_count(); - res->sample_size = 0; - - if constexpr (Rejection) { - return res; - } - - auto lower_key = ((irs_query_parms *) parms)->lower_bound; - auto upper_key = ((irs_query_parms *) parms)->upper_bound; - - for (size_t i=0; icutoff; i++) { - if (((buffer->get_data() + i)->rec.key >= lower_key) && ((buffer->get_data() + i)->rec.key <= upper_key)) { - res->records.emplace_back(*(buffer->get_data() + i)); - } - } - - return res; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { - auto p = (irs_query_parms *) query_parms; - auto bs = (buff_state) ? 
(IRSBufferState *) buff_state : nullptr; - - std::vector shard_sample_sizes(shard_states.size()+1, 0); - size_t buffer_sz = 0; - - std::vector weights; - if constexpr (Rejection) { - weights.push_back((bs) ? bs->cutoff : 0); - } else { - weights.push_back((bs) ? bs->records.size() : 0); - } - - size_t total_weight = 0; - for (auto &s : shard_states) { - auto state = (IRSState *) s; - total_weight += state->total_weight; - weights.push_back(state->total_weight); - } - - // if no valid records fall within the query range, just - // set all of the sample sizes to 0 and bail out. - if (total_weight == 0) { - for (size_t i=0; i *) shard_states[i]; - state->sample_size = 0; - } - - return; - } - - std::vector normalized_weights; - for (auto w : weights) { - normalized_weights.push_back((double) w / (double) total_weight); - } - - auto shard_alias = Alias(normalized_weights); - for (size_t i=0; isample_size; i++) { - auto idx = shard_alias.get(p->rng); - if (idx == 0) { - buffer_sz++; - } else { - shard_sample_sizes[idx - 1]++; - } - } - - if (bs) { - bs->sample_size = buffer_sz; - } - for (size_t i=0; i *) shard_states[i]; - state->sample_size = shard_sample_sizes[i+1]; - } - } - - static std::vector> query(MemISAM *isam, void *q_state, void *parms) { - auto lower_key = ((irs_query_parms *) parms)->lower_bound; - auto upper_key = ((irs_query_parms *) parms)->upper_bound; - auto rng = ((irs_query_parms *) parms)->rng; - - auto state = (IRSState *) q_state; - auto sample_sz = state->sample_size; - - std::vector> result_set; - - if (sample_sz == 0 || state->lower_bound == isam->get_record_count()) { - return result_set; - } - - size_t attempts = 0; - size_t range_length = state->upper_bound - state->lower_bound; - do { - attempts++; - size_t idx = (range_length > 0) ? 
gsl_rng_uniform_int(rng, range_length) : 0; - result_set.emplace_back(*isam->get_record_at(state->lower_bound + idx)); - } while (attempts < sample_sz); - - return result_set; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - auto st = (IRSBufferState *) state; - auto p = (irs_query_parms *) parms; - - std::vector> result; - result.reserve(st->sample_size); - - if constexpr (Rejection) { - for (size_t i=0; isample_size; i++) { - auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); - auto rec = buffer->get_data() + idx; - - if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { - result.emplace_back(*rec); - } - } - - return result; - } - - for (size_t i=0; isample_size; i++) { - auto idx = gsl_rng_uniform_int(p->rng, st->records.size()); - result.emplace_back(st->records[idx]); - } - - return result; - } - - static std::vector merge(std::vector>> &results, void *parms) { - std::vector output; - - for (size_t i=0; i *) state; - delete s; - } - - static void delete_buffer_query_state(void *state) { - auto s = (IRSBufferState *) state; - delete s; - } -}; - - -template -class ISAMRangeQuery { -public: - - constexpr static bool EARLY_ABORT=false; - constexpr static bool SKIP_DELETE_FILTER=true; - - static void *get_query_state(MemISAM *ts, void *parms) { - auto res = new ISAMRangeQueryState(); - auto p = (ISAMRangeQueryParms *) parms; - - res->start_idx = ts->get_lower_bound(p->lower_bound); - res->stop_idx = ts->get_record_count(); - - return res; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - auto res = new RangeQueryBufferState(); - res->cutoff = buffer->get_record_count(); - - return res; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, std::vector &buffer_states) { - return; - } - - static std::vector> query(MemISAM *ts, void *q_state, void *parms) { - std::vector> records; - auto p = (ISAMRangeQueryParms *) parms; - auto s = (ISAMRangeQueryState *) q_state; - - // if the returned index is one past the end of the - // records for the PGM, then there are not records - // in the index falling into the specified range. - if (s->start_idx == ts->get_record_count()) { - return records; - } - - auto ptr = ts->get_record_at(s->start_idx); - - // roll the pointer forward to the first record that is - // greater than or equal to the lower bound. 
- while(ptr->rec.key < p->lower_bound) { - ptr++; - } - - while (ptr->rec.key <= p->upper_bound && ptr < ts->m_data + s->stop_idx) { - records.emplace_back(*ptr); - ptr++; - } - - return records; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - auto p = (ISAMRangeQueryParms *) parms; - auto s = (RangeQueryBufferState *) state; - - std::vector> records; - for (size_t i=0; icutoff; i++) { - auto rec = buffer->get_data() + i; - if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { - records.emplace_back(*rec); - } - } - - return records; - } - - static std::vector merge(std::vector>> &results, void *parms) { - std::vector>> cursors; - cursors.reserve(results.size()); - - PriorityQueue> pq(results.size()); - size_t total = 0; - size_t tmp_n = results.size(); - - - for (size_t i = 0; i < tmp_n; ++i) - if (results[i].size() > 0){ - auto base = results[i].data(); - cursors.emplace_back(Cursor{base, base + results[i].size(), 0, results[i].size()}); - assert(i == cursors.size() - 1); - total += results[i].size(); - pq.push(cursors[i].ptr, tmp_n - i - 1); - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - - if (total == 0) { - return std::vector(); - } - - std::vector output; - output.reserve(total); - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[tmp_n - now.version - 1]; - auto& cursor2 = cursors[tmp_n - next.version - 1]; - if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[tmp_n - now.version - 1]; - if (!now.data->is_tombstone()) output.push_back(cursor.ptr->rec); - pq.pop(); - - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); - } - } - - return output; - } - - static void delete_query_state(void *state) { - auto s = (ISAMRangeQueryState *) state; - delete s; - } - - static void delete_buffer_query_state(void *state) { - auto s = (RangeQueryBufferState *) state; - delete s; - } -}; - - - } -- cgit v1.2.3 From 4e4cf858122ca6c1ae6d5f635e839089769fee38 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 10:01:23 -0500 Subject: Scheduling: Switched over to a thread pool model --- include/framework/scheduling/FIFOScheduler.h | 13 ++++++++++--- include/framework/scheduling/SerialScheduler.h | 2 +- include/framework/scheduling/Task.h | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h index 91a72b3..1521eb6 100644 --- a/include/framework/scheduling/FIFOScheduler.h +++ b/include/framework/scheduling/FIFOScheduler.h @@ -24,20 +24,26 @@ #include "framework/structure/ExtensionStructure.h" #include "framework/scheduling/Task.h" +#include "ctpl/ctpl.h" #include "psu-ds/LockedPriorityQueue.h" namespace de { + class FIFOScheduler { +private: + static const size_t DEFAULT_MAX_THREADS = 8; + public: FIFOScheduler(size_t memory_budget, size_t thread_cnt) : m_memory_budget((memory_budget) ? memory_budget : UINT64_MAX) - , m_thrd_cnt((thread_cnt) ? thread_cnt: UINT64_MAX) + , m_thrd_cnt((thread_cnt) ? 
thread_cnt: DEFAULT_MAX_THREADS) , m_used_memory(0) , m_used_thrds(0) , m_shutdown(false) { m_sched_thrd = std::thread(&FIFOScheduler::run, this); + m_thrd_pool.resize(m_thrd_cnt); } ~FIFOScheduler() { @@ -72,6 +78,7 @@ private: std::condition_variable m_cv; std::thread m_sched_thrd; + ctpl::thread_pool m_thrd_pool; std::atomic m_used_thrds; std::atomic m_used_memory; @@ -79,7 +86,7 @@ private: void schedule_next() { assert(m_task_queue.size() > 0); auto t = m_task_queue.pop(); - t(); + m_thrd_pool.push(t); } void run() { @@ -87,7 +94,7 @@ private: std::unique_lock cv_lock(m_cv_lock); m_cv.wait(cv_lock); - while (m_task_queue.size() > 0 && m_used_thrds.load() < m_thrd_cnt) { + while (m_task_queue.size() > 0 && m_thrd_pool.n_idle() > 0) { schedule_next(); } } while(!m_shutdown.load()); diff --git a/include/framework/scheduling/SerialScheduler.h b/include/framework/scheduling/SerialScheduler.h index 9c767e8..93611d1 100644 --- a/include/framework/scheduling/SerialScheduler.h +++ b/include/framework/scheduling/SerialScheduler.h @@ -47,7 +47,7 @@ public: void schedule_job(std::function job, size_t size, void *args) { size_t ts = m_counter++; auto t = Task(size, ts, job, args); - t(); + t(0); } void shutdown() { diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index 228665f..6dfd7df 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -54,7 +54,7 @@ struct Task { return self.m_timestamp > other.m_timestamp; } - void operator()() { + void operator()(size_t thrd_id) { m_job(m_args); } }; -- cgit v1.2.3 From 97a4d0fcedb75cbfe5a2e0162e54e71cd9eb0708 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 10:01:48 -0500 Subject: DynamicExtension: fixed some use-after-free bugs Reordered some code in internal_append() to avoid use-after-free bugs on the mutable buffer reference used for insertion.
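The hazard here is a thread holding a raw buffer pointer while the epoch that owns it is retired and freed. The reordering below keeps the epoch pinned for the full duration of the append; a stripped-down sketch of that discipline, with hypothetical types standing in for the framework's job-tracking machinery:

    #include <atomic>
    #include <cassert>

    struct Buffer {
        bool append(int rec) { (void) rec; return true; }  /* placeholder */
    };

    struct Epoch {
        std::atomic<int> m_active_jobs{0};
        Buffer m_buffer;

        void start_job() { m_active_jobs.fetch_add(1); }
        void end_job() {
            assert(m_active_jobs.load() > 0);
            m_active_jobs.fetch_add(-1);
        }
        /* retirement (which frees the buffer) must wait on this */
        bool retirable() { return m_active_jobs.load() == 0; }
    };

    /* the append completes strictly between start_job() and end_job();
     * ending the job first would let a concurrent retirement free the
     * buffer while it is still being written to */
    bool guarded_append(Epoch &e, int rec) {
        e.start_job();
        bool ok = e.m_buffer.append(rec);
        e.end_job();
        return ok;
    }

    int main() {
        Epoch e;
        return guarded_append(e, 42) ? 0 : 1;
    }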
--- include/framework/DynamicExtension.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 5c1eaab..49c6905 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -244,8 +244,11 @@ private: void advance_epoch() { size_t new_epoch_num = m_newest_epoch.load(); + size_t old_epoch_num = m_current_epoch.load(); + assert(new_epoch_num != old_epoch_num); + _Epoch *new_epoch = m_epochs[new_epoch_num]; - _Epoch *old_epoch = m_epochs[m_current_epoch.load()]; + _Epoch *old_epoch = m_epochs[old_epoch_num]; /* * Update the new Epoch to contain the buffers from the old one @@ -302,10 +305,10 @@ private: auto new_buffer = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); std::unique_lock m_struct_lock; + epoch->add_buffer(new_buffer); m_buffers.insert(new_buffer); m_struct_lock.release(); - epoch->add_buffer(new_buffer); return new_buffer; } @@ -448,24 +451,29 @@ private: int internal_append(const R &rec, bool ts) { Buffer *buffer = nullptr; + int res; do { // FIXME: figure out best way to protect this epoch access - auto epoch = get_active_epoch(); + auto epoch = get_active_epoch_protected(); buffer = epoch->get_active_buffer(); /* if the buffer is full, schedule a merge and add a new empty buffer */ if (buffer->is_full()) { // FIXME: possible race here--two identical merges could be scheduled auto vers = epoch->get_structure(); - schedule_merge(); if constexpr (std::same_as) { buffer->truncate(); } else { buffer = add_empty_buffer(epoch); } + + schedule_merge(); } - } while(!buffer->append(rec, ts)); + + res = buffer->append(rec, ts); + epoch->end_job(); + } while(!res); /* internal append should always succeed, eventually */ return 1; -- cgit v1.2.3 From 7249af78a3f39bd2852c3f81fe92dc5b647161fb Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 11:33:17 -0500 Subject: MutableBuffer: added explicit tail variable Use an explicit m_tail variable for insertion, rather than using m_reccnt. This ensures that the record count isn't incremented before a newly inserted record has actually been written, and allows the m_tail variable to be decremented on a failed append without the record count momentarily changing.
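The change below is a claim-then-publish scheme: m_tail hands out slots (and backs a claim out on overflow), while m_reccnt advances only once a record is fully written. A condensed sketch of the idea, using simplified types in place of the framework's wrapped records:

    #include <atomic>
    #include <cstdint>

    struct Buffer {
        static constexpr int64_t CAP = 1024;
        int m_data[CAP];
        std::atomic<int64_t> m_tail{0};    /* slots claimed; may briefly overshoot CAP */
        std::atomic<int64_t> m_reccnt{0};  /* records fully written */

        /* claim a slot, backing the claim out if the buffer is full */
        int64_t try_advance_tail() {
            int64_t pos = m_tail.fetch_add(1);
            if (pos < CAP) {
                return pos;
            }
            m_tail.fetch_add(-1);          /* failed claim; m_reccnt never moved */
            return -1;
        }

        bool append(int rec) {
            int64_t pos = try_advance_tail();
            if (pos == -1) {
                return false;              /* buffer full */
            }
            m_data[pos] = rec;             /* write into the claimed slot */
            m_reccnt.fetch_add(1);         /* publish only after the write */
            return true;
        }
    };

    int main() {
        Buffer b;
        return b.append(7) ? 0 : 1;
    }

A failed append restores m_tail and never touches m_reccnt, so the visible record count cannot transiently rise and fall the way it could when m_reccnt itself was used to claim slots.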
--- include/framework/structure/MutableBuffer.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index a70b86b..ba25cc3 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -33,7 +33,7 @@ class MutableBuffer { public: MutableBuffer(size_t capacity, size_t max_tombstone_cap) : m_cap(capacity), m_tombstone_cap(max_tombstone_cap), m_reccnt(0) - , m_tombstonecnt(0), m_weight(0), m_max_weight(0) { + , m_tombstonecnt(0), m_weight(0), m_max_weight(0), m_tail(0) { m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); m_tombstone_filter = nullptr; @@ -83,6 +83,7 @@ public: m_weight.fetch_add(1); } + m_reccnt.fetch_add(1); return 1; } @@ -91,6 +92,7 @@ public: m_reccnt.store(0); m_weight.store(0); m_max_weight.store(0); + m_tail.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); return true; @@ -193,11 +195,15 @@ public: } private: - int32_t try_advance_tail() { - size_t new_tail = m_reccnt.fetch_add(1); + int64_t try_advance_tail() { + int64_t new_tail = m_tail.fetch_add(1); - if (new_tail < m_cap) return new_tail; - else return -1; + if (new_tail < m_cap) { + return new_tail; + } + + m_tail.fetch_add(-1); + return -1; } size_t m_cap; @@ -210,6 +216,7 @@ private: alignas(64) std::atomic m_tombstonecnt; alignas(64) std::atomic m_reccnt; + alignas(64) std::atomic m_tail; alignas(64) std::atomic m_weight; alignas(64) std::atomic m_max_weight; -- cgit v1.2.3 From fe136eda414d3f7897d4610faeda8dbb3b7bb400 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 12:33:18 -0500 Subject: DynamicExtension::create_static_structure: fixed heap overflow --- include/framework/DynamicExtension.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 49c6905..76722c0 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -169,7 +169,7 @@ public: // FIXME: With an interface adjustment, this could be done in // one call, rather than a loop. - for (size_t i=bv.size() - 1; i>=0; i--) { + for (ssize_t i=bv.size() - 1; i>=0; i--) { shards.emplace_back(new S(bv.get_buffers()[i])); } -- cgit v1.2.3 From ca1605a9924e27ccbacb33d04ccdb4326e7abe74 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 12:37:06 -0500 Subject: Epoch: Adjusted add empty buffer behavior Add empty buffer now supports a CAS-like operation, where it will only add a buffer if the currently active one is still the same as when the decision to add a buffer was made. This is to support adding new buffers on insert outside of the merge-lock, so that multiple concurrent threads cannot add multiple new empty buffers. --- include/framework/DynamicExtension.h | 14 +++++++++++--- include/framework/scheduling/Epoch.h | 16 +++++++++++++++- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 76722c0..955dbe5 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -301,14 +301,22 @@ private: * buffer while a new epoch is being created in the background. 
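 * (With the conditional add introduced below, two threads that observe
 * the same full buffer converge on a single replacement buffer; the
 * losing thread's speculative allocation is simply deleted.)
 *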
Returns a * pointer to the newly created buffer. */ - Buffer *add_empty_buffer(_Epoch *epoch) { - auto new_buffer = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); + Buffer *add_empty_buffer(_Epoch *epoch, Buffer *current_buffer=nullptr) { + auto temp_buffer = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); std::unique_lock m_struct_lock; - epoch->add_buffer(new_buffer); + auto new_buffer = epoch->add_buffer(temp_buffer, current_buffer); + /* + * if epoch->add_buffer doesn't add the new buffer, this insert + * won't update the buffer set (duplicate insert) + */ m_buffers.insert(new_buffer); m_struct_lock.release(); + if (new_buffer != temp_buffer) { + delete temp_buffer; + } + return new_buffer; } diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index f4aefe9..58fe6cd 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -54,11 +54,25 @@ public: } } - void add_buffer(Buffer *buf) { + Buffer *add_buffer(Buffer *buf, Buffer *cur_buf=nullptr) { assert(buf); + /* + * if a current buffer is specified, only add the + * new buffer if the active buffer is the current, + * otherwise just return the active buffer (poor man's + * CAS). + */ + if (cur_buf) { + auto active_buf = get_active_buffer(); + if (active_buf != cur_buf) { + return active_buf; + } + } + buf->take_reference(); m_buffers.push_back(buf); + return buf; } void start_job() { -- cgit v1.2.3 From ad117358b8ab9924d216edeca0eafa87b4f86896 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 12:38:58 -0500 Subject: DynamicExtension: mutex bug fix Fixed an incorrectly initialized lock guard --- include/framework/DynamicExtension.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 955dbe5..8ce6a7a 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -198,8 +198,8 @@ public: */ void await_next_epoch() { while (m_current_epoch.load() != m_newest_epoch.load()) { - std::unique_lock m_epoch_cv_lk; - m_epoch_cv.wait(m_epoch_cv_lk); + std::unique_lock lk(m_epoch_cv_lk); + m_epoch_cv.wait(lk); } return; -- cgit v1.2.3 From 254f8aa85ea8962e5c11d8b475a171883c22f168 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 12:39:35 -0500 Subject: DynamicExtension: internal_append fixes Fixed a few bugs with concurrent operation of internal_append, as well as enabled the spawning of multiple empty buffers while merges are currently active. --- include/framework/DynamicExtension.h | 36 ++++++++++++++++++++++++++---------- include/framework/scheduling/Epoch.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 8ce6a7a..60aa07e 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -258,7 +258,11 @@ private: */ if constexpr (!std::same_as) { size_t old_buffer_cnt = new_epoch->clear_buffers(); - for (size_t i=old_buffer_cnt; iget_buffers().size(); i++) { + // FIXME: this is getting nightmarish... The -1 here is to ensure that the + // the empty buffer added when the merge was first triggered is also included. + // Due to the reordering of operations in internal_append, the new buffer exists + // at the time of the clone, and so is already in the new epoch. 
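+ // Concretely: carry forward the buffer spawned when the merge was
+ // triggered, plus any buffers added after it, since the records they
+ // hold are not part of the merge now being installed.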
+ for (size_t i=old_buffer_cnt-1; iget_buffers().size(); i++) { new_epoch->add_buffer(old_epoch->get_buffers()[i]); } } @@ -459,24 +463,36 @@ private: int internal_append(const R &rec, bool ts) { Buffer *buffer = nullptr; - int res; + int res = 0; do { - // FIXME: figure out best way to protect this epoch access auto epoch = get_active_epoch_protected(); buffer = epoch->get_active_buffer(); + assert(buffer); - /* if the buffer is full, schedule a merge and add a new empty buffer */ + /* + * If the buffer is full and there is no current merge, + * schedule a merge and add a new empty buffer. If there + * is a current merge, then just add a new empty buffer + * to the current epoch. + */ if (buffer->is_full()) { - // FIXME: possible race here--two identical merges could be scheduled - auto vers = epoch->get_structure(); - if constexpr (std::same_as) { + /* single threaded: run merge and then empty buffer */ + epoch->end_job(); + schedule_merge(); buffer->truncate(); - } else { + continue; + } else if (epoch->prepare_merge()) { + /* + * add an empty buffer to allow insert proceed and + * schedule a merge on a background thread + */ buffer = add_empty_buffer(epoch); + schedule_merge(); + } else { + /* background merge is ongoing, so just add empty buffer */ + buffer = add_empty_buffer(epoch, buffer); } - - schedule_merge(); } res = buffer->append(rec, ts); diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 58fe6cd..0ebbde9 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -25,6 +25,7 @@ public: Epoch(size_t number=0) : m_buffers() , m_structure(nullptr) + , m_active_merge(false) , m_active_jobs(0) , m_active(true) , m_epoch_number(number) @@ -34,6 +35,7 @@ public: : m_buffers() , m_structure(structure) , m_active_jobs(0) + , m_active_merge(false) , m_active(true) , m_epoch_number(number) { @@ -151,6 +153,31 @@ public: return epoch; } + /* + * Check if a merge can be started from this Epoch. + * At present, without concurrent merging, this simply + * checks if there is currently a scheduled merge based + * on this Epoch. If there is, returns false. If there + * isn't, return true and set a flag indicating that + * there is an active merge. 
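+ * Intended call pattern (illustrative, mirroring internal_append()):
+ *
+ *   if (epoch->prepare_merge()) {
+ *       buffer = add_empty_buffer(epoch);         // we own the merge
+ *       schedule_merge();
+ *   } else {
+ *       buffer = add_empty_buffer(epoch, buffer); // merge already pending
+ *   }
+ *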
+ */ + bool prepare_merge() { + auto old = m_active_merge.load(); + if (old) { + return false; + } + + // FIXME: this needs cleaned up + while (!m_active_merge.compare_exchange_strong(old, true)) { + old = m_active_merge.load(); + if (old) { + return false; + } + } + + return true; + } + void set_inactive() { m_active = false; } @@ -184,6 +211,8 @@ private: std::condition_variable m_active_cv; std::mutex m_cv_lock; + std::atomic m_active_merge; + /* * The number of currently active jobs * (queries/merges) operating on this -- cgit v1.2.3 From 56cc8f63a218bc13e0c8395b479267862de19714 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 14:01:39 -0500 Subject: InternalLevel: switched to std::sharedptr for shard memory management --- include/framework/structure/InternalLevel.h | 81 +++++++++++------------------ 1 file changed, 29 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 7a7b98c..632fe17 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -33,32 +33,10 @@ public: : m_level_no(level_no) , m_shard_cnt(0) , m_shards(shard_cap, nullptr) - , m_owns(shard_cap, true) , m_pending_shard(nullptr) {} - // Create a new memory level sharing the shards and repurposing it as previous level_no + 1 - // WARNING: for leveling only. - InternalLevel(InternalLevel* level) - : m_level_no(level->m_level_no + 1) - , m_shard_cnt(level->m_shard_cnt) - , m_shards(level->m_shards.size(), nullptr) - , m_owns(level->m_owns.size(), true) - , m_pending_shard(nullptr) - { - assert(m_shard_cnt == 1 && m_shards.size() == 1); - - for (size_t i=0; im_owns[i] = false; - m_shards[i] = level->m_shards[i]; - } - } - ~InternalLevel() { - for (size_t i=0; im_level_no, 1); res->m_shard_cnt = 1; Shard* shards[2]; - shards[0] = base_level->m_shards[0]; - shards[1] = new_level->m_shards[0]; + shards[0] = base_level->m_shards[0].get(); + shards[1] = new_level->m_shards[0].get(); - res->m_shards[0] = new S(shards, 2); + res->m_shards[0] = std::make_shared(shards, 2); return std::shared_ptr(res); } @@ -83,19 +61,23 @@ public: return; } - m_shards[m_shard_cnt] = new S(buffer); - m_owns[m_shard_cnt] = true; + m_shards[m_shard_cnt] = std::make_shared(buffer); ++m_shard_cnt; } void append_merged_shards(InternalLevel* level) { + Shard *shards[level->m_shard_cnt]; + for (size_t i=0; im_shard_cnt; i++) { + shards[i] = level->m_shards[i].get(); + } + if (m_shard_cnt == m_shards.size()) { - m_pending_shard = new S(level->m_shards.data(), level->m_shard_cnt); + m_pending_shard = new S(shards, level->m_shard_cnt); return; } - m_shards[m_shard_cnt] = new S(level->m_shards.data(), level->m_shard_cnt); - m_owns[m_shard_cnt] = true; + auto tmp = new S(shards, level->m_shard_cnt); + m_shards[m_shard_cnt] = std::shared_ptr(tmp); ++m_shard_cnt; } @@ -104,15 +86,10 @@ public: void finalize() { if (m_pending_shard) { for (size_t i=0; i(m_pending_shard); m_pending_shard = nullptr; m_shard_cnt = 1; } @@ -126,7 +103,7 @@ public: Shard *shards[m_shard_cnt]; for (size_t i=0; i> &shards, std::vector& shard_states, void *query_parms) { for (size_t i=0; iget_record_count(); + if (m_shards[i]) { + cnt += m_shards[i]->get_record_count(); + } } return cnt; @@ -193,7 +172,9 @@ public: size_t get_tombstone_count() { size_t res = 0; for (size_t i = 0; i < m_shard_cnt; ++i) { - res += m_shards[i]->get_tombstone_count(); + if (m_shards[i]) { + res += 
m_shards[i]->get_tombstone_count(); + } } return res; } @@ -201,7 +182,9 @@ public: size_t get_aux_memory_usage() { size_t cnt = 0; for (size_t i=0; iget_aux_memory_usage(); + if (m_shards[i]){ + cnt += m_shards[i]->get_aux_memory_usage(); + } } return cnt; @@ -224,7 +207,7 @@ public: for (size_t i=0; iget_tombstone_count(); - reccnt += (*m_shards[i])->get_record_count(); + reccnt += m_shards[i]->get_record_count(); } } @@ -235,8 +218,6 @@ public: auto new_level = std::make_shared(m_level_no, m_shards.size()); for (size_t i=0; im_shards[i] = m_shards[i]; - new_level->m_owns[i] = true; - m_owns[i] = false; } return new_level; @@ -248,12 +229,8 @@ private: size_t m_shard_cnt; size_t m_shard_size_cap; - std::vector m_shards; - + std::vector> m_shards; Shard *m_pending_shard; - - std::vector m_owns; - }; } -- cgit v1.2.3 From 9fd6264122f09752b4278c9ff881b4cfe906bbc8 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 14:30:00 -0500 Subject: DynamicExtension: fixed race in get_active_epoch_protected This function wasn't ensuring that that the epoch pinned and the epoch returned were the same epoch in the situation where the epoch was advanced in the middle of the call. This is now resolved, and further the function will return the newer epoch, rather than the older one, in such a situation. --- include/framework/DynamicExtension.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 60aa07e..233bebb 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -238,8 +238,17 @@ private: } _Epoch *get_active_epoch_protected() { - m_epochs[m_current_epoch.load()]->start_job(); - return m_epochs[m_current_epoch.load()]; + ssize_t cur_epoch = -1; + do { + if (cur_epoch != -1) { + m_epochs[cur_epoch]->end_job(); + } + + cur_epoch = m_current_epoch.load(); + m_epochs[cur_epoch]->start_job(); + } while (cur_epoch != m_current_epoch.load()); + + return m_epochs[cur_epoch]; } void advance_epoch() { -- cgit v1.2.3 From e02742b07540dd5a9bcbb44dae14856bf10955ed Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 15:18:53 -0500 Subject: Refactoring progress --- include/framework/interface/Shard.h | 2 +- include/query/irs.h | 6 +- include/query/rangequery.h | 14 +- include/query/wss.h | 204 ++++++++++++++++ include/shard/Alias.h | 251 ++++++++++++++++++++ include/shard/ISAMTree.h | 339 +++++++++++++++++++++++++++ include/shard/MemISAM.h | 341 --------------------------- include/shard/PGM.h | 267 --------------------- include/shard/TrieSpline.h | 184 +-------------- include/shard/WSS.h | 453 ------------------------------------ 10 files changed, 810 insertions(+), 1251 deletions(-) create mode 100644 include/query/wss.h create mode 100644 include/shard/Alias.h create mode 100644 include/shard/ISAMTree.h delete mode 100644 include/shard/MemISAM.h delete mode 100644 include/shard/WSS.h (limited to 'include') diff --git a/include/framework/interface/Shard.h b/include/framework/interface/Shard.h index 40a696b..92cdca0 100644 --- a/include/framework/interface/Shard.h +++ b/include/framework/interface/Shard.h @@ -37,6 +37,6 @@ template concept SortedShardInterface = ShardInterface && requires(S s, R r, R *rp) { {s.lower_bound(r)} -> std::convertible_to; {s.upper_bound(r)} -> std::convertible_to; -} +}; } diff --git a/include/query/irs.h b/include/query/irs.h index 5b09e73..4cb69b0 100644 --- a/include/query/irs.h 
+++ b/include/query/irs.h @@ -44,8 +44,8 @@ public: static void *get_query_state(S *shard, void *parms) { auto res = new State(); - decltype(R::key) lower_key = ((PARMS *) parms)->lower_bound; - decltype(R::key) upper_key = (PARMS *) parms)->upper_bound; + decltype(R::key) lower_key = ((Parms *) parms)->lower_bound; + decltype(R::key) upper_key = ((Parms *) parms)->upper_bound; res->lower_bound = shard->get_lower_bound(lower_key); res->upper_bound = shard->get_upper_bound(upper_key); @@ -119,7 +119,7 @@ public: normalized_weights.push_back((double) w / (double) total_weight); } - auto shard_alias = Alias(normalized_weights); + auto shard_alias = psudb::Alias(normalized_weights); for (size_t i=0; isample_size; i++) { auto idx = shard_alias.get(p->rng); if (idx == 0) { diff --git a/include/query/rangequery.h b/include/query/rangequery.h index f9a34d9..b9ac9db 100644 --- a/include/query/rangequery.h +++ b/include/query/rangequery.h @@ -8,6 +8,12 @@ */ #pragma once +#include "framework/interface/Record.h" +#include "framework/interface/Shard.h" +#include "framework/structure/MutableBuffer.h" +#include "psu-ds/PriorityQueue.h" +#include "util/Cursor.h" + namespace de { namespace rq { template @@ -27,7 +33,7 @@ struct BufferState { size_t cutoff; }; -template +template class Query { public: constexpr static bool EARLY_ABORT=false; @@ -74,7 +80,7 @@ public: ptr++; } - while (ptr->rec.key <= p->upper_bound && ptr < shard->m_data + s->stop_idx) { + while (ptr->rec.key <= p->upper_bound && ptr < shard->get_data() + s->stop_idx) { records.emplace_back(*ptr); ptr++; } @@ -101,7 +107,7 @@ public: std::vector>> cursors; cursors.reserve(results.size()); - PriorityQueue> pq(results.size()); + psudb::PriorityQueue> pq(results.size()); size_t total = 0; size_t tmp_n = results.size(); @@ -126,7 +132,7 @@ public: while (pq.size()) { auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; + auto next = pq.size() > 1 ? pq.peek(1) : psudb::queue_record>{nullptr, 0}; if (!now.data->is_tombstone() && next.data != nullptr && now.data->rec == next.data->rec && next.data->is_tombstone()) { diff --git a/include/query/wss.h b/include/query/wss.h new file mode 100644 index 0000000..b8a5d54 --- /dev/null +++ b/include/query/wss.h @@ -0,0 +1,204 @@ +/* + * include/query/rangequery.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. 
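+ *
+ * (This file implements the weighted set sampling (WSS) query; the
+ * filename in the header comment above appears to have been carried
+ * over from rangequery.h.)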
+ * + */ +#pragma once + +#include "framework/interface/Record.h" +#include "framework/interface/Shard.h" +#include "framework/structure/MutableBuffer.h" + +namespace de { namespace wss { + +template +struct Parms { + size_t sample_size; + gsl_rng *rng; +}; + +template +struct State { + decltype(R::weight) total_weight; + size_t sample_size; + + State() { + total_weight = 0; + } +}; + +template +struct BufferState { + size_t cutoff; + size_t sample_size; + psudb::Alias *alias; + decltype(R::weight) max_weight; + decltype(R::weight) total_weight; + + ~BufferState() { + delete alias; + } +}; + +template +class Query { +public: + constexpr static bool EARLY_ABORT=false; + constexpr static bool SKIP_DELETE_FILTER=false; + + static void *get_query_state(S *shard, void *parms) { + auto res = new State(); + res->total_weight = shard->get_total_weight(); + res->sample_size = 0; + + return res; + } + + static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { + BufferState *state = new BufferState(); + auto parameters = (Parms*) parms; + if constexpr (Rejection) { + state->cutoff = buffer->get_record_count() - 1; + state->max_weight = buffer->get_max_weight(); + state->total_weight = buffer->get_total_weight(); + return state; + } + + std::vector weights; + + state->cutoff = buffer->get_record_count() - 1; + double total_weight = 0.0; + + for (size_t i = 0; i <= state->cutoff; i++) { + auto rec = buffer->get_data() + i; + weights.push_back(rec->rec.weight); + total_weight += rec->rec.weight; + } + + for (size_t i = 0; i < weights.size(); i++) { + weights[i] = weights[i] / total_weight; + } + + state->alias = new psudb::Alias(weights); + state->total_weight = total_weight; + + return state; + } + + static void process_query_states(void *query_parms, std::vector &shard_states, std::vector &buffer_states) { + auto p = (Parms *) query_parms; + auto bs = (BufferState *) buffer_states[0]; + + std::vector shard_sample_sizes(shard_states.size()+1, 0); + size_t buffer_sz = 0; + + std::vector weights; + weights.push_back(bs->total_weight); + + decltype(R::weight) total_weight = 0; + for (auto &s : shard_states) { + auto state = (State *) s; + total_weight += state->total_weight; + weights.push_back(state->total_weight); + } + + std::vector normalized_weights; + for (auto w : weights) { + normalized_weights.push_back((double) w / (double) total_weight); + } + + auto shard_alias = psudb::Alias(normalized_weights); + for (size_t i=0; isample_size; i++) { + auto idx = shard_alias.get(p->rng); + if (idx == 0) { + buffer_sz++; + } else { + shard_sample_sizes[idx - 1]++; + } + } + + + bs->sample_size = buffer_sz; + for (size_t i=0; i *) shard_states[i]; + state->sample_size = shard_sample_sizes[i+1]; + } + } + + static std::vector> query(S *shard, void *q_state, void *parms) { + auto rng = ((Parms *) parms)->rng; + + auto state = (State *) q_state; + auto sample_size = state->sample_size; + + std::vector> result_set; + + if (sample_size == 0) { + return result_set; + } + size_t attempts = 0; + do { + attempts++; + size_t idx = shard->m_alias->get(rng); + result_set.emplace_back(*shard->get_record_at(idx)); + } while (attempts < sample_size); + + return result_set; + } + + static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { + auto st = (BufferState *) state; + auto p = (Parms *) parms; + + std::vector> result; + result.reserve(st->sample_size); + + if constexpr (Rejection) { + for (size_t i=0; isample_size; i++) { + auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); + 
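/*
+ * rejection step: the record drawn uniformly at idx below is
+ * accepted with probability rec.weight / max_weight
+ */ +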
auto rec = buffer->get_data() + idx; + + auto test = gsl_rng_uniform(p->rng) * st->max_weight; + + if (test <= rec->rec.weight) { + result.emplace_back(*rec); + } + } + return result; + } + + for (size_t i=0; isample_size; i++) { + auto idx = st->alias->get(p->rng); + result.emplace_back(*(buffer->get_data() + idx)); + } + + return result; + } + + static std::vector merge(std::vector>> &results, void *parms) { + std::vector output; + + for (size_t i=0; i *) state; + delete s; + } + + static void delete_buffer_query_state(void *state) { + auto s = (BufferState *) state; + delete s; + } +}; + +}} diff --git a/include/shard/Alias.h b/include/shard/Alias.h new file mode 100644 index 0000000..b6b16c5 --- /dev/null +++ b/include/shard/Alias.h @@ -0,0 +1,251 @@ +/* + * include/shard/Alias.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include +#include + +#include "framework/ShardRequirements.h" + +#include "psu-ds/PriorityQueue.h" +#include "util/Cursor.h" +#include "psu-ds/psudb::Alias.h" +#include "psu-ds/BloomFilter.h" +#include "util/bf_config.h" + +using psudb::CACHELINE_SIZE; +using psudb::BloomFilter; +using psudb::PriorityQueue; +using psudb::queue_record; + +namespace de { + +thread_local size_t wss_cancelations = 0; + +template +class Alias { +private: + typedef decltype(R::key) K; + typedef decltype(R::value) V; + typedef decltype(R::weight) W; + +public: + Alias(MutableBuffer* buffer) + : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_alias(nullptr), m_bf(nullptr) { + + m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); + assert(m_alloc_size % CACHELINE_SIZE == 0); + m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + + m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); + + size_t offset = 0; + m_reccnt = 0; + auto base = buffer->get_data(); + auto stop = base + buffer->get_record_count(); + + std::sort(base, stop, std::less>()); + + std::vector weights; + + while (base < stop) { + if (!(base->is_tombstone()) && (base + 1) < stop) { + if (base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { + base += 2; + wss_cancelations++; + continue; + } + } else if (base->is_deleted()) { + base += 1; + continue; + } + + // FIXME: this shouldn't be necessary, but the tagged record + // bypass doesn't seem to be working on this code-path, so this + // ensures that tagged records from the buffer are able to be + // dropped, eventually. 
It should only need to be &= 1 + base->header &= 3; + m_data[m_reccnt++] = *base; + m_total_weight+= base->rec.weight; + weights.push_back(base->rec.weight); + + if (m_bf && base->is_tombstone()) { + m_tombstone_cnt++; + m_bf->insert(base->rec); + } + + base++; + } + + if (m_reccnt > 0) { + build_alias_structure(weights); + } + } + + Alias(Alias** shards, size_t len) + : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_alias(nullptr), m_bf(nullptr) { + std::vector>> cursors; + cursors.reserve(len); + + PriorityQueue> pq(len); + + size_t attemp_reccnt = 0; + size_t tombstone_count = 0; + + for (size_t i = 0; i < len; ++i) { + if (shards[i]) { + auto base = shards[i]->get_data(); + cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); + attemp_reccnt += shards[i]->get_record_count(); + tombstone_count += shards[i]->get_tombstone_count(); + pq.push(cursors[i].ptr, i); + } else { + cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); + } + } + + m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); + + m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); + assert(m_alloc_size % CACHELINE_SIZE == 0); + m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + + std::vector weights; + + while (pq.size()) { + auto now = pq.peek(); + auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; + if (!now.data->is_tombstone() && next.data != nullptr && + now.data->rec == next.data->rec && next.data->is_tombstone()) { + + pq.pop(); pq.pop(); + auto& cursor1 = cursors[now.version]; + auto& cursor2 = cursors[next.version]; + if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); + if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); + } else { + auto& cursor = cursors[now.version]; + if (!cursor.ptr->is_deleted()) { + m_data[m_reccnt++] = *cursor.ptr; + m_total_weight += cursor.ptr->rec.weight; + weights.push_back(cursor.ptr->rec.weight); + if (m_bf && cursor.ptr->is_tombstone()) { + ++m_tombstone_cnt; + if (m_bf) m_bf->insert(cursor.ptr->rec); + } + } + pq.pop(); + + if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); + } + } + + if (m_reccnt > 0) { + build_alias_structure(weights); + } + } + + ~Alias() { + if (m_data) free(m_data); + if (m_alias) delete m_alias; + if (m_bf) delete m_bf; + + } + + Wrapped *point_lookup(const R &rec, bool filter=false) { + if (filter && !m_bf->lookup(rec)) { + return nullptr; + } + + size_t idx = get_lower_bound(rec.key); + if (idx >= m_reccnt) { + return nullptr; + } + + while (idx < m_reccnt && m_data[idx].rec < rec) ++idx; + + if (m_data[idx].rec == rec) { + return m_data + idx; + } + + return nullptr; + } + + Wrapped* get_data() const { + return m_data; + } + + size_t get_record_count() const { + return m_reccnt; + } + + size_t get_tombstone_count() const { + return m_tombstone_cnt; + } + + const Wrapped* get_record_at(size_t idx) const { + if (idx >= m_reccnt) return nullptr; + return m_data + idx; + } + + + size_t get_memory_usage() { + return m_alloc_size; + } + + size_t get_aux_memory_usage() { + return 0; + } + +private: + + size_t get_lower_bound(const K& key) const { + size_t min = 0; + size_t max = m_reccnt - 1; + + const char * record_key; + while (min < max) { + size_t mid = (min + max) / 2; + + if (key > m_data[mid].rec.key) { + min = mid + 1; + } else { + max = mid; + } + } + + return min; + } + + void build_alias_structure(std::vector &weights) { + + // 
normalize the weights vector + std::vector norm_weights(weights.size()); + + for (size_t i=0; i* m_data; + psudb::Alias *m_alias; + W m_total_weight; + size_t m_reccnt; + size_t m_tombstone_cnt; + size_t m_group_size; + size_t m_alloc_size; + BloomFilter *m_bf; +}; diff --git a/include/shard/ISAMTree.h b/include/shard/ISAMTree.h new file mode 100644 index 0000000..a610c09 --- /dev/null +++ b/include/shard/ISAMTree.h @@ -0,0 +1,339 @@ +/* + * include/shard/ISAMTree.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include + +#include "framework/ShardRequirements.h" + +#include "util/bf_config.h" +#include "psu-ds/PriorityQueue.h" +#include "util/Cursor.h" +#include "psu-util/timer.h" + +using psudb::CACHELINE_SIZE; +using psudb::BloomFilter; +using psudb::PriorityQueue; +using psudb::queue_record; +using psudb::Alias; + +namespace de { + +thread_local size_t mrun_cancelations = 0; + +template +class ISAMTree { +private: + +typedef decltype(R::key) K; +typedef decltype(R::value) V; + +constexpr static size_t inmem_isam_node_size = 256; +constexpr static size_t inmem_isam_fanout = inmem_isam_node_size / (sizeof(K) + sizeof(char*)); + +struct InternalNode { + K keys[inmem_isam_fanout]; + char* child[inmem_isam_fanout]; +}; + +constexpr static size_t inmem_isam_leaf_fanout = inmem_isam_node_size / sizeof(R); +constexpr static size_t inmem_isam_node_keyskip = sizeof(K) * inmem_isam_fanout; + +static_assert(sizeof(InternalNode) == inmem_isam_node_size, "node size does not match"); + +public: + ISAMTree(MutableBuffer* buffer) + :m_reccnt(0), m_tombstone_cnt(0), m_isam_nodes(nullptr), m_deleted_cnt(0) { + + m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); + + m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); + assert(m_alloc_size % CACHELINE_SIZE == 0); + m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + + TIMER_INIT(); + + size_t offset = 0; + m_reccnt = 0; + auto base = buffer->get_data(); + auto stop = base + buffer->get_record_count(); + + TIMER_START(); + std::sort(base, stop, std::less>()); + TIMER_STOP(); + auto sort_time = TIMER_RESULT(); + + TIMER_START(); + while (base < stop) { + if (!base->is_tombstone() && (base + 1 < stop) + && base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { + base += 2; + mrun_cancelations++; + continue; + } else if (base->is_deleted()) { + base += 1; + continue; + } + + // FIXME: this shouldn't be necessary, but the tagged record + // bypass doesn't seem to be working on this code-path, so this + // ensures that tagged records from the buffer are able to be + // dropped, eventually. 
It should only need to be &= 1 + base->header &= 3; + m_data[m_reccnt++] = *base; + if (m_bf && base->is_tombstone()) { + ++m_tombstone_cnt; + m_bf->insert(base->rec); + } + + base++; + } + TIMER_STOP(); + auto copy_time = TIMER_RESULT(); + + TIMER_START(); + if (m_reccnt > 0) { + build_internal_levels(); + } + TIMER_STOP(); + auto level_time = TIMER_RESULT(); + } + + ISAMTree(ISAMTree** runs, size_t len) + : m_reccnt(0), m_tombstone_cnt(0), m_deleted_cnt(0), m_isam_nodes(nullptr) { + std::vector>> cursors; + cursors.reserve(len); + + PriorityQueue> pq(len); + + size_t attemp_reccnt = 0; + size_t tombstone_count = 0; + + for (size_t i = 0; i < len; ++i) { + if (runs[i]) { + auto base = runs[i]->get_data(); + cursors.emplace_back(Cursor{base, base + runs[i]->get_record_count(), 0, runs[i]->get_record_count()}); + attemp_reccnt += runs[i]->get_record_count(); + tombstone_count += runs[i]->get_tombstone_count(); + pq.push(cursors[i].ptr, i); + } else { + cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); + } + } + + m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); + + m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); + assert(m_alloc_size % CACHELINE_SIZE == 0); + m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + + size_t offset = 0; + + while (pq.size()) { + auto now = pq.peek(); + auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; + if (!now.data->is_tombstone() && next.data != nullptr && + now.data->rec == next.data->rec && next.data->is_tombstone()) { + + pq.pop(); pq.pop(); + auto& cursor1 = cursors[now.version]; + auto& cursor2 = cursors[next.version]; + if (advance_cursor(cursor1)) pq.push(cursor1.ptr, now.version); + if (advance_cursor(cursor2)) pq.push(cursor2.ptr, next.version); + } else { + auto& cursor = cursors[now.version]; + if (!cursor.ptr->is_deleted()) { + m_data[m_reccnt++] = *cursor.ptr; + if (cursor.ptr->is_tombstone()) { + ++m_tombstone_cnt; + m_bf->insert(cursor.ptr->rec); + } + } + pq.pop(); + + if (advance_cursor(cursor)) pq.push(cursor.ptr, now.version); + } + } + + if (m_reccnt > 0) { + build_internal_levels(); + } + } + + ~ISAMTree() { + if (m_data) free(m_data); + if (m_isam_nodes) free(m_isam_nodes); + if (m_bf) delete m_bf; + } + + Wrapped *point_lookup(const R &rec, bool filter=false) { + if (filter && !m_bf->lookup(rec)) { + return nullptr; + } + + size_t idx = get_lower_bound(rec.key); + if (idx >= m_reccnt) { + return nullptr; + } + + while (idx < m_reccnt && m_data[idx].rec < rec) ++idx; + + if (m_data[idx].rec == rec) { + return m_data + idx; + } + + return nullptr; + } + + Wrapped* get_data() const { + return m_data; + } + + size_t get_record_count() const { + return m_reccnt; + } + + size_t get_tombstone_count() const { + return m_tombstone_cnt; + } + + const Wrapped* get_record_at(size_t idx) const { + return (idx < m_reccnt) ? m_data + idx : nullptr; + } + + size_t get_memory_usage() { + return m_internal_node_cnt * inmem_isam_node_size + m_alloc_size; + } + + size_t get_aux_memory_usage() { + return 0; + } + + size_t get_lower_bound(const K& key) const { + const InternalNode* now = m_root; + while (!is_leaf(reinterpret_cast(now))) { + const InternalNode* next = nullptr; + for (size_t i = 0; i < inmem_isam_fanout - 1; ++i) { + if (now->child[i + 1] == nullptr || key <= now->keys[i]) { + next = reinterpret_cast(now->child[i]); + break; + } + } + + now = next ? 
next : reinterpret_cast(now->child[inmem_isam_fanout - 1]); + } + + const Wrapped* pos = reinterpret_cast*>(now); + while (pos < m_data + m_reccnt && pos->rec.key < key) pos++; + + return pos - m_data; + } + + size_t get_upper_bound(const K& key) const { + const InternalNode* now = m_root; + while (!is_leaf(reinterpret_cast(now))) { + const InternalNode* next = nullptr; + for (size_t i = 0; i < inmem_isam_fanout - 1; ++i) { + if (now->child[i + 1] == nullptr || key < now->keys[i]) { + next = reinterpret_cast(now->child[i]); + break; + } + } + + now = next ? next : reinterpret_cast(now->child[inmem_isam_fanout - 1]); + } + + const Wrapped* pos = reinterpret_cast*>(now); + while (pos < m_data + m_reccnt && pos->rec.key <= key) pos++; + + return pos - m_data; + } + + +private: + void build_internal_levels() { + size_t n_leaf_nodes = m_reccnt / inmem_isam_leaf_fanout + (m_reccnt % inmem_isam_leaf_fanout != 0); + size_t level_node_cnt = n_leaf_nodes; + size_t node_cnt = 0; + do { + level_node_cnt = level_node_cnt / inmem_isam_fanout + (level_node_cnt % inmem_isam_fanout != 0); + node_cnt += level_node_cnt; + } while (level_node_cnt > 1); + + m_alloc_size = (node_cnt * inmem_isam_node_size) + (CACHELINE_SIZE - (node_cnt * inmem_isam_node_size) % CACHELINE_SIZE); + assert(m_alloc_size % CACHELINE_SIZE == 0); + + m_isam_nodes = (InternalNode*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + m_internal_node_cnt = node_cnt; + memset(m_isam_nodes, 0, node_cnt * inmem_isam_node_size); + + InternalNode* current_node = m_isam_nodes; + + const Wrapped* leaf_base = m_data; + const Wrapped* leaf_stop = m_data + m_reccnt; + while (leaf_base < leaf_stop) { + size_t fanout = 0; + for (size_t i = 0; i < inmem_isam_fanout; ++i) { + auto rec_ptr = leaf_base + inmem_isam_leaf_fanout * i; + if (rec_ptr >= leaf_stop) break; + const Wrapped* sep_key = std::min(rec_ptr + inmem_isam_leaf_fanout - 1, leaf_stop - 1); + current_node->keys[i] = sep_key->rec.key; + current_node->child[i] = (char*)rec_ptr; + ++fanout; + } + current_node++; + leaf_base += fanout * inmem_isam_leaf_fanout; + } + + auto level_start = m_isam_nodes; + auto level_stop = current_node; + auto current_level_node_cnt = level_stop - level_start; + while (current_level_node_cnt > 1) { + auto now = level_start; + while (now < level_stop) { + size_t child_cnt = 0; + for (size_t i = 0; i < inmem_isam_fanout; ++i) { + auto node_ptr = now + i; + ++child_cnt; + if (node_ptr >= level_stop) break; + current_node->keys[i] = node_ptr->keys[inmem_isam_fanout - 1]; + current_node->child[i] = (char*)node_ptr; + } + now += child_cnt; + current_node++; + } + level_start = level_stop; + level_stop = current_node; + current_level_node_cnt = level_stop - level_start; + } + + assert(current_level_node_cnt == 1); + m_root = level_start; + } + + bool is_leaf(const char* ptr) const { + return ptr >= (const char*)m_data && ptr < (const char*)(m_data + m_reccnt); + } + + // Members: sorted data, internal ISAM levels, reccnt; + Wrapped* m_data; + psudb::BloomFilter *m_bf; + InternalNode* m_isam_nodes; + InternalNode* m_root; + size_t m_reccnt; + size_t m_tombstone_cnt; + size_t m_internal_node_cnt; + size_t m_deleted_cnt; + size_t m_alloc_size; +}; +} diff --git a/include/shard/MemISAM.h b/include/shard/MemISAM.h deleted file mode 100644 index 6962c19..0000000 --- a/include/shard/MemISAM.h +++ /dev/null @@ -1,341 +0,0 @@ -/* - * include/shard/MemISAM.h - * - * Copyright (C) 2023 Douglas B. Rumbaugh - * Dong Xie - * - * All rights reserved. 
Published under the Modified BSD License. - * - */ -#pragma once - -#include -#include -#include -#include - -#include "framework/ShardRequirements.h" - -#include "util/bf_config.h" -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" -#include "psu-util/timer.h" - -using psudb::CACHELINE_SIZE; -using psudb::BloomFilter; -using psudb::PriorityQueue; -using psudb::queue_record; -using psudb::Alias; - -namespace de { - -thread_local size_t mrun_cancelations = 0; - -template -class MemISAM { -private: - friend class IRSQuery; - friend class IRSQuery; - friend class ISAMRangeQuery; - -typedef decltype(R::key) K; -typedef decltype(R::value) V; - -constexpr static size_t inmem_isam_node_size = 256; -constexpr static size_t inmem_isam_fanout = inmem_isam_node_size / (sizeof(K) + sizeof(char*)); - -struct InMemISAMNode { - K keys[inmem_isam_fanout]; - char* child[inmem_isam_fanout]; -}; - -constexpr static size_t inmem_isam_leaf_fanout = inmem_isam_node_size / sizeof(R); -constexpr static size_t inmem_isam_node_keyskip = sizeof(K) * inmem_isam_fanout; - -static_assert(sizeof(InMemISAMNode) == inmem_isam_node_size, "node size does not match"); - -public: - MemISAM(MutableBuffer* buffer) - :m_reccnt(0), m_tombstone_cnt(0), m_isam_nodes(nullptr), m_deleted_cnt(0) { - - m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); - - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - TIMER_INIT(); - - size_t offset = 0; - m_reccnt = 0; - auto base = buffer->get_data(); - auto stop = base + buffer->get_record_count(); - - TIMER_START(); - std::sort(base, stop, std::less>()); - TIMER_STOP(); - auto sort_time = TIMER_RESULT(); - - TIMER_START(); - while (base < stop) { - if (!base->is_tombstone() && (base + 1 < stop) - && base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - mrun_cancelations++; - continue; - } else if (base->is_deleted()) { - base += 1; - continue; - } - - // FIXME: this shouldn't be necessary, but the tagged record - // bypass doesn't seem to be working on this code-path, so this - // ensures that tagged records from the buffer are able to be - // dropped, eventually. 
It should only need to be &= 1 - base->header &= 3; - m_data[m_reccnt++] = *base; - if (m_bf && base->is_tombstone()) { - ++m_tombstone_cnt; - m_bf->insert(base->rec); - } - - base++; - } - TIMER_STOP(); - auto copy_time = TIMER_RESULT(); - - TIMER_START(); - if (m_reccnt > 0) { - build_internal_levels(); - } - TIMER_STOP(); - auto level_time = TIMER_RESULT(); - } - - MemISAM(MemISAM** runs, size_t len) - : m_reccnt(0), m_tombstone_cnt(0), m_deleted_cnt(0), m_isam_nodes(nullptr) { - std::vector>> cursors; - cursors.reserve(len); - - PriorityQueue> pq(len); - - size_t attemp_reccnt = 0; - size_t tombstone_count = 0; - - for (size_t i = 0; i < len; ++i) { - if (runs[i]) { - auto base = runs[i]->get_data(); - cursors.emplace_back(Cursor{base, base + runs[i]->get_record_count(), 0, runs[i]->get_record_count()}); - attemp_reccnt += runs[i]->get_record_count(); - tombstone_count += runs[i]->get_tombstone_count(); - pq.push(cursors[i].ptr, i); - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - } - - m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); - - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - size_t offset = 0; - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[now.version]; - auto& cursor2 = cursors[next.version]; - if (advance_cursor(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[now.version]; - if (!cursor.ptr->is_deleted()) { - m_data[m_reccnt++] = *cursor.ptr; - if (cursor.ptr->is_tombstone()) { - ++m_tombstone_cnt; - m_bf->insert(cursor.ptr->rec); - } - } - pq.pop(); - - if (advance_cursor(cursor)) pq.push(cursor.ptr, now.version); - } - } - - if (m_reccnt > 0) { - build_internal_levels(); - } - } - - ~MemISAM() { - if (m_data) free(m_data); - if (m_isam_nodes) free(m_isam_nodes); - if (m_bf) delete m_bf; - } - - Wrapped *point_lookup(const R &rec, bool filter=false) { - if (filter && !m_bf->lookup(rec)) { - return nullptr; - } - - size_t idx = get_lower_bound(rec.key); - if (idx >= m_reccnt) { - return nullptr; - } - - while (idx < m_reccnt && m_data[idx].rec < rec) ++idx; - - if (m_data[idx].rec == rec) { - return m_data + idx; - } - - return nullptr; - } - - Wrapped* get_data() const { - return m_data; - } - - size_t get_record_count() const { - return m_reccnt; - } - - size_t get_tombstone_count() const { - return m_tombstone_cnt; - } - - const Wrapped* get_record_at(size_t idx) const { - return (idx < m_reccnt) ? m_data + idx : nullptr; - } - - size_t get_memory_usage() { - return m_internal_node_cnt * inmem_isam_node_size + m_alloc_size; - } - - size_t get_aux_memory_usage() { - return 0; - } - -private: - size_t get_lower_bound(const K& key) const { - const InMemISAMNode* now = m_root; - while (!is_leaf(reinterpret_cast(now))) { - const InMemISAMNode* next = nullptr; - for (size_t i = 0; i < inmem_isam_fanout - 1; ++i) { - if (now->child[i + 1] == nullptr || key <= now->keys[i]) { - next = reinterpret_cast(now->child[i]); - break; - } - } - - now = next ? 
next : reinterpret_cast(now->child[inmem_isam_fanout - 1]); - } - - const Wrapped* pos = reinterpret_cast*>(now); - while (pos < m_data + m_reccnt && pos->rec.key < key) pos++; - - return pos - m_data; - } - - size_t get_upper_bound(const K& key) const { - const InMemISAMNode* now = m_root; - while (!is_leaf(reinterpret_cast(now))) { - const InMemISAMNode* next = nullptr; - for (size_t i = 0; i < inmem_isam_fanout - 1; ++i) { - if (now->child[i + 1] == nullptr || key < now->keys[i]) { - next = reinterpret_cast(now->child[i]); - break; - } - } - - now = next ? next : reinterpret_cast(now->child[inmem_isam_fanout - 1]); - } - - const Wrapped* pos = reinterpret_cast*>(now); - while (pos < m_data + m_reccnt && pos->rec.key <= key) pos++; - - return pos - m_data; - } - - void build_internal_levels() { - size_t n_leaf_nodes = m_reccnt / inmem_isam_leaf_fanout + (m_reccnt % inmem_isam_leaf_fanout != 0); - size_t level_node_cnt = n_leaf_nodes; - size_t node_cnt = 0; - do { - level_node_cnt = level_node_cnt / inmem_isam_fanout + (level_node_cnt % inmem_isam_fanout != 0); - node_cnt += level_node_cnt; - } while (level_node_cnt > 1); - - m_alloc_size = (node_cnt * inmem_isam_node_size) + (CACHELINE_SIZE - (node_cnt * inmem_isam_node_size) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - - m_isam_nodes = (InMemISAMNode*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - m_internal_node_cnt = node_cnt; - memset(m_isam_nodes, 0, node_cnt * inmem_isam_node_size); - - InMemISAMNode* current_node = m_isam_nodes; - - const Wrapped* leaf_base = m_data; - const Wrapped* leaf_stop = m_data + m_reccnt; - while (leaf_base < leaf_stop) { - size_t fanout = 0; - for (size_t i = 0; i < inmem_isam_fanout; ++i) { - auto rec_ptr = leaf_base + inmem_isam_leaf_fanout * i; - if (rec_ptr >= leaf_stop) break; - const Wrapped* sep_key = std::min(rec_ptr + inmem_isam_leaf_fanout - 1, leaf_stop - 1); - current_node->keys[i] = sep_key->rec.key; - current_node->child[i] = (char*)rec_ptr; - ++fanout; - } - current_node++; - leaf_base += fanout * inmem_isam_leaf_fanout; - } - - auto level_start = m_isam_nodes; - auto level_stop = current_node; - auto current_level_node_cnt = level_stop - level_start; - while (current_level_node_cnt > 1) { - auto now = level_start; - while (now < level_stop) { - size_t child_cnt = 0; - for (size_t i = 0; i < inmem_isam_fanout; ++i) { - auto node_ptr = now + i; - ++child_cnt; - if (node_ptr >= level_stop) break; - current_node->keys[i] = node_ptr->keys[inmem_isam_fanout - 1]; - current_node->child[i] = (char*)node_ptr; - } - now += child_cnt; - current_node++; - } - level_start = level_stop; - level_stop = current_node; - current_level_node_cnt = level_stop - level_start; - } - - assert(current_level_node_cnt == 1); - m_root = level_start; - } - - bool is_leaf(const char* ptr) const { - return ptr >= (const char*)m_data && ptr < (const char*)(m_data + m_reccnt); - } - - // Members: sorted data, internal ISAM levels, reccnt; - Wrapped* m_data; - psudb::BloomFilter *m_bf; - InMemISAMNode* m_isam_nodes; - InMemISAMNode* m_root; - size_t m_reccnt; - size_t m_tombstone_cnt; - size_t m_internal_node_cnt; - size_t m_deleted_cnt; - size_t m_alloc_size; -}; -} diff --git a/include/shard/PGM.h b/include/shard/PGM.h index 6d76376..6b66b7d 100644 --- a/include/shard/PGM.h +++ b/include/shard/PGM.h @@ -31,34 +31,6 @@ using psudb::Alias; namespace de { -template -struct pgm_range_query_parms { - decltype(R::key) lower_bound; - decltype(R::key) upper_bound; -}; - -template -struct 
PGMPointLookupParms { - decltype(R::key) target_key; -}; - -template -class PGMRangeQuery; - -template -class PGMPointLookup; - -template -struct PGMState { - size_t start_idx; - size_t stop_idx; -}; - -template -struct PGMBufferState { - size_t cutoff; -}; - template class PGM { private: @@ -67,11 +39,6 @@ private: public: - - // FIXME: there has to be a better way to do this - friend class PGMRangeQuery; - friend class PGMPointLookup; - PGM(MutableBuffer* buffer) : m_reccnt(0), m_tombstone_cnt(0) { @@ -80,8 +47,6 @@ public: m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); std::vector keys; - //m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); - size_t offset = 0; m_reccnt = 0; auto base = buffer->get_data(); @@ -110,13 +75,6 @@ public: base->header &= 3; m_data[m_reccnt++] = *base; keys.emplace_back(base->rec.key); - - /* - if (m_bf && base->is_tombstone()) { - m_tombstone_cnt++; - m_bf->insert(base->rec); - }*/ - base++; } @@ -148,8 +106,6 @@ public: } } - //m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); assert(m_alloc_size % CACHELINE_SIZE == 0); m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); @@ -172,10 +128,6 @@ public: if (!cursor.ptr->is_deleted()) { m_data[m_reccnt++] = *cursor.ptr; keys.emplace_back(cursor.ptr->rec.key); - /*if (m_bf && cursor.ptr->is_tombstone()) { - ++m_tombstone_cnt; - if (m_bf) m_bf->insert(cursor.ptr->rec); - }*/ } pq.pop(); @@ -190,15 +142,9 @@ public: ~PGM() { if (m_data) free(m_data); - //if (m_bf) delete m_bf; - } Wrapped *point_lookup(const R &rec, bool filter=false) { - //if (filter && !m_bf->lookup(rec)) { - // return nullptr; - //} - size_t idx = get_lower_bound(rec.key); if (idx >= m_reccnt) { return nullptr; @@ -284,219 +230,6 @@ private: K m_max_key; K m_min_key; pgm::PGMIndex m_pgm; - //BloomFilter *m_bf; -}; -template -class PGMPointLookup { -public: - constexpr static bool EARLY_ABORT=false; - constexpr static bool SKIP_DELETE_FILTER=false; - - static void *get_query_state(PGM *ts, void *parms) { - return nullptr; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - return nullptr; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { - return; - } - - static std::vector> query(PGM *ts, void *q_state, void *parms) { - std::vector> records; - auto p = (PGMPointLookupParms *) parms; - auto s = (PGMState *) q_state; - - size_t idx = ts->get_lower_bound(p->target_key); - if (ts->get_record_at(idx)->rec.key == p->target_key) { - records.emplace_back(*ts->get_record_at(idx)); - } - - return records; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - auto p = (PGMPointLookupParms *) parms; - auto s = (PGMBufferState *) state; - - std::vector> records; - for (size_t i=0; iget_record_count(); i++) { - auto rec = buffer->get_data() + i; - if (rec->rec.key == p->target_key) { - records.emplace_back(*rec); - return records; - } - } - - return records; - } - - static std::vector merge(std::vector>> &results, void *parms) { - std::vector output; - for (size_t i=0 ;i 0) { - output.emplace_back(results[i][0].rec); - return output; - } - } - - return output; - } - - static void delete_query_state(void *state) { - } - - static void delete_buffer_query_state(void *state) { - } }; - - -template -class PGMRangeQuery { -public: - 
constexpr static bool EARLY_ABORT=false; - constexpr static bool SKIP_DELETE_FILTER=false; - - static void *get_query_state(PGM *ts, void *parms) { - auto res = new PGMState(); - auto p = (pgm_range_query_parms *) parms; - - res->start_idx = ts->get_lower_bound(p->lower_bound); - res->stop_idx = ts->get_record_count(); - - return res; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - auto res = new PGMBufferState(); - res->cutoff = buffer->get_record_count(); - - return res; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { - return; - } - - static std::vector> query(PGM *ts, void *q_state, void *parms) { - size_t tot = 0; - //std::vector> records; - auto p = (pgm_range_query_parms *) parms; - auto s = (PGMState *) q_state; - - // if the returned index is one past the end of the - // records for the PGM, then there are not records - // in the index falling into the specified range. - if (s->start_idx == ts->get_record_count()) { - return {}; - } - - auto ptr = ts->get_record_at(s->start_idx); - - // roll the pointer forward to the first record that is - // greater than or equal to the lower bound. - while(ptr->rec.key < p->lower_bound) { - ptr++; - } - - while (ptr->rec.key <= p->upper_bound && ptr < ts->m_data + s->stop_idx) { - if (ptr->is_tombstone()) --tot; - else if (!ptr->is_deleted()) ++tot; - //records.emplace_back(*ptr); - ptr++; - } - - return {Wrapped{0, {tot, 0}}}; - //return records; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - size_t tot = 0; - auto p = (pgm_range_query_parms *) parms; - auto s = (PGMBufferState *) state; - - //std::vector> records; - for (size_t i=0; icutoff; i++) { - auto rec = buffer->get_data() + i; - if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { - if (rec->is_tombstone()) --tot; - else if (!rec->is_deleted()) ++tot; - //records.emplace_back(*rec); - } - } - - return {Wrapped{0, {tot, 0}}}; - //return records; - } - - static std::vector merge(std::vector>> &results, void *parms) { - /*std::vector>> cursors; - cursors.reserve(results.size()); - - PriorityQueue> pq(results.size()); - size_t total = 0; - size_t tmp_n = results.size(); - - - for (size_t i = 0; i < tmp_n; ++i) - if (results[i].size() > 0){ - auto base = results[i].data(); - cursors.emplace_back(Cursor{base, base + results[i].size(), 0, results[i].size()}); - assert(i == cursors.size() - 1); - total += results[i].size(); - pq.push(cursors[i].ptr, tmp_n - i - 1); - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - - if (total == 0) { - return std::vector(); - } - - std::vector output; - output.reserve(total); - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? 
pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[tmp_n - now.version - 1]; - auto& cursor2 = cursors[tmp_n - next.version - 1]; - if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[tmp_n - now.version - 1]; - if (!now.data->is_tombstone()) output.push_back(cursor.ptr->rec); - pq.pop(); - - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); - } - }*/ - - size_t tot = 0; - for (auto& result: results) - if (result.size() > 0) tot += result[0].rec.key; - - return {{tot, 0}}; - } - - static void delete_query_state(void *state) { - auto s = (PGMState *) state; - delete s; - } - - static void delete_buffer_query_state(void *state) { - auto s = (PGMBufferState *) state; - delete s; - } -}; - -; - } diff --git a/include/shard/TrieSpline.h b/include/shard/TrieSpline.h index a784a38..fdf8edb 100644 --- a/include/shard/TrieSpline.h +++ b/include/shard/TrieSpline.h @@ -30,32 +30,6 @@ using psudb::Alias; namespace de { -template -struct ts_range_query_parms { - decltype(R::key) lower_bound; - decltype(R::key) upper_bound; -}; - -template -class TrieSplineRangeQuery; - -template -struct TrieSplineState { - size_t start_idx; - size_t stop_idx; -}; - -template -struct TrieSplineBufferState { - size_t cutoff; - Alias* alias; - - ~TrieSplineBufferState() { - delete alias; - } - -}; - template class TrieSpline { private: @@ -63,10 +37,6 @@ private: typedef decltype(R::value) V; public: - - // FIXME: there has to be a better way to do this - friend class TrieSplineRangeQuery; - TrieSpline(MutableBuffer* buffer) : m_reccnt(0), m_tombstone_cnt(0) { @@ -254,8 +224,6 @@ public: return 0; } -private: - size_t get_lower_bound(const K& key) const { auto bound = m_ts.GetSearchBound(key); size_t idx = bound.begin; @@ -293,6 +261,8 @@ private: return (m_data[idx].rec.key <= key) ? idx : m_reccnt; } +private: + Wrapped* m_data; size_t m_reccnt; size_t m_tombstone_cnt; @@ -302,154 +272,4 @@ private: ts::TrieSpline m_ts; BloomFilter *m_bf; }; - - -template -class TrieSplineRangeQuery { -public: - constexpr static bool EARLY_ABORT=false; - constexpr static bool SKIP_DELETE_FILTER=true; - - static void *get_query_state(TrieSpline *ts, void *parms) { - auto res = new TrieSplineState(); - auto p = (ts_range_query_parms *) parms; - - res->start_idx = ts->get_lower_bound(p->lower_bound); - res->stop_idx = ts->get_record_count(); - - return res; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - auto res = new TrieSplineBufferState(); - res->cutoff = buffer->get_record_count(); - - return res; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { - return; - } - - static std::vector> query(TrieSpline *ts, void *q_state, void *parms) { - //std::vector> records; - size_t tot = 0; - auto p = (ts_range_query_parms *) parms; - auto s = (TrieSplineState *) q_state; - - // if the returned index is one past the end of the - // records for the TrieSpline, then there are not records - // in the index falling into the specified range. - if (s->start_idx == ts->get_record_count()) { - return {}; - } - - auto ptr = ts->get_record_at(s->start_idx); - - // roll the pointer forward to the first record that is - // greater than or equal to the lower bound. 
- while(ptr->rec.key < p->lower_bound) { - ptr++; - } - - - while (ptr->rec.key <= p->upper_bound && ptr < ts->m_data + s->stop_idx) { - if (ptr->is_tombstone()) --tot; - else if (!ptr->is_deleted()) ++tot; - //records.emplace_back(*ptr); - ptr++; - } - - return {Wrapped{0, {tot, 0}}}; - //return records; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - size_t tot = 0; - auto p = (ts_range_query_parms *) parms; - auto s = (TrieSplineBufferState *) state; - - //std::vector> records; - for (size_t i=0; icutoff; i++) { - auto rec = buffer->get_data() + i; - if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { - if (rec->is_tombstone()) --tot; - else if (!rec->is_deleted()) ++tot; - //records.emplace_back(*rec); - } - - } - - return {Wrapped{0, {tot, 0}}}; - //return records; - } - - static std::vector merge(std::vector>> &results, void *parms) { -/* - std::vector>> cursors; - cursors.reserve(results.size()); - - PriorityQueue> pq(results.size()); - size_t total = 0; - size_t tmp_n = results.size(); - - - for (size_t i = 0; i < tmp_n; ++i) - if (results[i].size() > 0){ - auto base = results[i].data(); - cursors.emplace_back(Cursor{base, base + results[i].size(), 0, results[i].size()}); - assert(i == cursors.size() - 1); - total += results[i].size(); - pq.push(cursors[i].ptr, tmp_n - i - 1); - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - - if (total == 0) { - return std::vector(); - } - - std::vector output; - output.reserve(total); - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[tmp_n - now.version - 1]; - auto& cursor2 = cursors[tmp_n - next.version - 1]; - if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[tmp_n - now.version - 1]; - if (!now.data->is_tombstone()) output.push_back(cursor.ptr->rec); - pq.pop(); - - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); - } - } - - return output;*/ - - size_t tot = 0; - for (auto& result: results) - if (result.size() > 0) tot += result[0].rec.key; - - return {{tot, 0}}; - } - - static void delete_query_state(void *state) { - auto s = (TrieSplineState *) state; - delete s; - } - - static void delete_buffer_query_state(void *state) { - auto s = (TrieSplineBufferState *) state; - delete s; - } -}; - } diff --git a/include/shard/WSS.h b/include/shard/WSS.h deleted file mode 100644 index 4e3a326..0000000 --- a/include/shard/WSS.h +++ /dev/null @@ -1,453 +0,0 @@ -/* - * include/shard/WSS.h - * - * Copyright (C) 2023 Douglas B. Rumbaugh - * Dong Xie - * - * All rights reserved. Published under the Modified BSD License. 
- * - */ -#pragma once - - -#include -#include -#include -#include -#include - -#include "framework/ShardRequirements.h" - -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" -#include "psu-ds/Alias.h" -#include "psu-ds/BloomFilter.h" -#include "util/bf_config.h" - -using psudb::CACHELINE_SIZE; -using psudb::BloomFilter; -using psudb::PriorityQueue; -using psudb::queue_record; -using psudb::Alias; - -namespace de { - -thread_local size_t wss_cancelations = 0; - -template -struct wss_query_parms { - size_t sample_size; - gsl_rng *rng; -}; - -template -class WSSQuery; - -template -struct WSSState { - decltype(R::weight) total_weight; - size_t sample_size; - - WSSState() { - total_weight = 0; - } -}; - -template -struct WSSBufferState { - size_t cutoff; - size_t sample_size; - Alias* alias; - decltype(R::weight) max_weight; - decltype(R::weight) total_weight; - - ~WSSBufferState() { - delete alias; - } - -}; - -template -class WSS { -private: - typedef decltype(R::key) K; - typedef decltype(R::value) V; - typedef decltype(R::weight) W; - -public: - - // FIXME: there has to be a better way to do this - friend class WSSQuery; - friend class WSSQuery; - - WSS(MutableBuffer* buffer) - : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_alias(nullptr), m_bf(nullptr) { - - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); - - size_t offset = 0; - m_reccnt = 0; - auto base = buffer->get_data(); - auto stop = base + buffer->get_record_count(); - - std::sort(base, stop, std::less>()); - - std::vector weights; - - while (base < stop) { - if (!(base->is_tombstone()) && (base + 1) < stop) { - if (base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - wss_cancelations++; - continue; - } - } else if (base->is_deleted()) { - base += 1; - continue; - } - - // FIXME: this shouldn't be necessary, but the tagged record - // bypass doesn't seem to be working on this code-path, so this - // ensures that tagged records from the buffer are able to be - // dropped, eventually. 
It should only need to be &= 1 - base->header &= 3; - m_data[m_reccnt++] = *base; - m_total_weight+= base->rec.weight; - weights.push_back(base->rec.weight); - - if (m_bf && base->is_tombstone()) { - m_tombstone_cnt++; - m_bf->insert(base->rec); - } - - base++; - } - - if (m_reccnt > 0) { - build_alias_structure(weights); - } - } - - WSS(WSS** shards, size_t len) - : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_alias(nullptr), m_bf(nullptr) { - std::vector>> cursors; - cursors.reserve(len); - - PriorityQueue> pq(len); - - size_t attemp_reccnt = 0; - size_t tombstone_count = 0; - - for (size_t i = 0; i < len; ++i) { - if (shards[i]) { - auto base = shards[i]->get_data(); - cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); - attemp_reccnt += shards[i]->get_record_count(); - tombstone_count += shards[i]->get_tombstone_count(); - pq.push(cursors[i].ptr, i); - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - } - - m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); - - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - std::vector weights; - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[now.version]; - auto& cursor2 = cursors[next.version]; - if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[now.version]; - if (!cursor.ptr->is_deleted()) { - m_data[m_reccnt++] = *cursor.ptr; - m_total_weight += cursor.ptr->rec.weight; - weights.push_back(cursor.ptr->rec.weight); - if (m_bf && cursor.ptr->is_tombstone()) { - ++m_tombstone_cnt; - if (m_bf) m_bf->insert(cursor.ptr->rec); - } - } - pq.pop(); - - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); - } - } - - if (m_reccnt > 0) { - build_alias_structure(weights); - } - } - - ~WSS() { - if (m_data) free(m_data); - if (m_alias) delete m_alias; - if (m_bf) delete m_bf; - - } - - Wrapped *point_lookup(const R &rec, bool filter=false) { - if (filter && !m_bf->lookup(rec)) { - return nullptr; - } - - size_t idx = get_lower_bound(rec.key); - if (idx >= m_reccnt) { - return nullptr; - } - - while (idx < m_reccnt && m_data[idx].rec < rec) ++idx; - - if (m_data[idx].rec == rec) { - return m_data + idx; - } - - return nullptr; - } - - Wrapped* get_data() const { - return m_data; - } - - size_t get_record_count() const { - return m_reccnt; - } - - size_t get_tombstone_count() const { - return m_tombstone_cnt; - } - - const Wrapped* get_record_at(size_t idx) const { - if (idx >= m_reccnt) return nullptr; - return m_data + idx; - } - - - size_t get_memory_usage() { - return m_alloc_size; - } - - size_t get_aux_memory_usage() { - return 0; - } - -private: - - size_t get_lower_bound(const K& key) const { - size_t min = 0; - size_t max = m_reccnt - 1; - - const char * record_key; - while (min < max) { - size_t mid = (min + max) / 2; - - if (key > m_data[mid].rec.key) { - min = mid + 1; - } else { - max = mid; - } - } - - return min; - } - - void build_alias_structure(std::vector &weights) { - - // normalize the 
weights vector - std::vector norm_weights(weights.size()); - - for (size_t i=0; i* m_data; - Alias *m_alias; - W m_total_weight; - size_t m_reccnt; - size_t m_tombstone_cnt; - size_t m_group_size; - size_t m_alloc_size; - BloomFilter *m_bf; -}; - - -template -class WSSQuery { -public: - - constexpr static bool EARLY_ABORT=false; - constexpr static bool SKIP_DELETE_FILTER=false; - - static void *get_query_state(WSS *wss, void *parms) { - auto res = new WSSState(); - res->total_weight = wss->m_total_weight; - res->sample_size = 0; - - return res; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - WSSBufferState *state = new WSSBufferState(); - auto parameters = (wss_query_parms*) parms; - if constexpr (Rejection) { - state->cutoff = buffer->get_record_count() - 1; - state->max_weight = buffer->get_max_weight(); - state->total_weight = buffer->get_total_weight(); - return state; - } - - std::vector weights; - - state->cutoff = buffer->get_record_count() - 1; - double total_weight = 0.0; - - for (size_t i = 0; i <= state->cutoff; i++) { - auto rec = buffer->get_data() + i; - weights.push_back(rec->rec.weight); - total_weight += rec->rec.weight; - } - - for (size_t i = 0; i < weights.size(); i++) { - weights[i] = weights[i] / total_weight; - } - - state->alias = new Alias(weights); - state->total_weight = total_weight; - - return state; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { - auto p = (wss_query_parms *) query_parms; - auto bs = (WSSBufferState *) buff_state; - - std::vector shard_sample_sizes(shard_states.size()+1, 0); - size_t buffer_sz = 0; - - std::vector weights; - weights.push_back(bs->total_weight); - - decltype(R::weight) total_weight = 0; - for (auto &s : shard_states) { - auto state = (WSSState *) s; - total_weight += state->total_weight; - weights.push_back(state->total_weight); - } - - std::vector normalized_weights; - for (auto w : weights) { - normalized_weights.push_back((double) w / (double) total_weight); - } - - auto shard_alias = Alias(normalized_weights); - for (size_t i=0; isample_size; i++) { - auto idx = shard_alias.get(p->rng); - if (idx == 0) { - buffer_sz++; - } else { - shard_sample_sizes[idx - 1]++; - } - } - - - bs->sample_size = buffer_sz; - for (size_t i=0; i *) shard_states[i]; - state->sample_size = shard_sample_sizes[i+1]; - } - } - - static std::vector> query(WSS *wss, void *q_state, void *parms) { - auto rng = ((wss_query_parms *) parms)->rng; - - auto state = (WSSState *) q_state; - auto sample_size = state->sample_size; - - std::vector> result_set; - - if (sample_size == 0) { - return result_set; - } - size_t attempts = 0; - do { - attempts++; - size_t idx = wss->m_alias->get(rng); - result_set.emplace_back(*wss->get_record_at(idx)); - } while (attempts < sample_size); - - return result_set; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - auto st = (WSSBufferState *) state; - auto p = (wss_query_parms *) parms; - - std::vector> result; - result.reserve(st->sample_size); - - if constexpr (Rejection) { - for (size_t i=0; isample_size; i++) { - auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); - auto rec = buffer->get_data() + idx; - - auto test = gsl_rng_uniform(p->rng) * st->max_weight; - - if (test <= rec->rec.weight) { - result.emplace_back(*rec); - } - } - return result; - } - - for (size_t i=0; isample_size; i++) { - auto idx = st->alias->get(p->rng); - result.emplace_back(*(buffer->get_data() + idx)); - } 
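- // (Note on the two sampling paths above: the Rejection branch draws a
- // uniform slot and accepts with probability rec.weight / max_weight,
- // so its expected acceptance rate is the ratio of average to maximum
- // weight and skewed buffers waste draws; the alias branch instead pays
- // an O(n) scan at state-setup time so every subsequent draw is O(1).)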
- - return result; - } - - static std::vector merge(std::vector>> &results, void *parms) { - std::vector output; - - for (size_t i=0; i *) state; - delete s; - } - - static void delete_buffer_query_state(void *state) { - auto s = (WSSBufferState *) state; - delete s; - } -}; - -} -- cgit v1.2.3 From a2fe4b1616a1b2318f70e842382818ee44aea9e6 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 7 Nov 2023 12:29:03 -0500 Subject: Alias shard fixes --- include/query/wss.h | 28 ++++++++++++++-------------- include/shard/Alias.h | 13 +++++++++++-- 2 files changed, 25 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/query/wss.h b/include/query/wss.h index b8a5d54..794485c 100644 --- a/include/query/wss.h +++ b/include/query/wss.h @@ -90,15 +90,19 @@ public: static void process_query_states(void *query_parms, std::vector &shard_states, std::vector &buffer_states) { auto p = (Parms *) query_parms; - auto bs = (BufferState *) buffer_states[0]; - std::vector shard_sample_sizes(shard_states.size()+1, 0); + std::vector shard_sample_sizes(shard_states.size()+buffer_states.size(), 0); size_t buffer_sz = 0; std::vector weights; - weights.push_back(bs->total_weight); decltype(R::weight) total_weight = 0; + for (auto &s : buffer_states) { + auto bs = (BufferState *) s; + total_weight += bs->total_weight; + weights.push_back(bs->total_weight); + } + for (auto &s : shard_states) { auto state = (State *) s; total_weight += state->total_weight; @@ -113,19 +117,15 @@ public: auto shard_alias = psudb::Alias(normalized_weights); for (size_t i=0; isample_size; i++) { auto idx = shard_alias.get(p->rng); - if (idx == 0) { - buffer_sz++; + + if (idx < buffer_states.size()) { + auto state = (BufferState *) buffer_states[idx]; + state->sample_size++; } else { - shard_sample_sizes[idx - 1]++; + auto state = (State *) shard_states[idx - buffer_states.size()]; + state->sample_size++; } } - - - bs->sample_size = buffer_sz; - for (size_t i=0; i *) shard_states[i]; - state->sample_size = shard_sample_sizes[i+1]; - } } static std::vector> query(S *shard, void *q_state, void *parms) { @@ -142,7 +142,7 @@ public: size_t attempts = 0; do { attempts++; - size_t idx = shard->m_alias->get(rng); + size_t idx = shard->get_weighted_sample(rng); result_set.emplace_back(*shard->get_record_at(idx)); } while (attempts < sample_size); diff --git a/include/shard/Alias.h b/include/shard/Alias.h index b6b16c5..a4a7d02 100644 --- a/include/shard/Alias.h +++ b/include/shard/Alias.h @@ -19,7 +19,7 @@ #include "psu-ds/PriorityQueue.h" #include "util/Cursor.h" -#include "psu-ds/psudb::Alias.h" +#include "psu-ds/Alias.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" @@ -207,7 +207,13 @@ public: return 0; } -private: + W get_total_weight() { + return m_total_weight; + } + + size_t get_weighted_sample(gsl_rng *rng) const { + return m_alias->get(rng); + } size_t get_lower_bound(const K& key) const { size_t min = 0; @@ -227,6 +233,8 @@ private: return min; } +private: + void build_alias_structure(std::vector &weights) { // normalize the weights vector @@ -249,3 +257,4 @@ private: size_t m_alloc_size; BloomFilter *m_bf; }; +} -- cgit v1.2.3 From 9e1c1b1b930031896851b1ed4a15152508327d73 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 7 Nov 2023 13:35:54 -0500 Subject: Converted WIRS to the new interface --- include/query/wirs.h | 240 +++++++++++++++++++ include/shard/AugBTree.h | 371 +++++++++++++++++++++++++++++ include/shard/WIRS.h | 594 ----------------------------------------------- 3 files 
changed, 611 insertions(+), 594 deletions(-) create mode 100644 include/query/wirs.h create mode 100644 include/shard/AugBTree.h delete mode 100644 include/shard/WIRS.h (limited to 'include') diff --git a/include/query/wirs.h b/include/query/wirs.h new file mode 100644 index 0000000..1113b1d --- /dev/null +++ b/include/query/wirs.h @@ -0,0 +1,240 @@ +/* + * include/query/wirs.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include "framework/interface/Record.h" +#include "framework/interface/Shard.h" +#include "framework/structure/MutableBuffer.h" +#include "psu-ds/Alias.h" + +namespace de { namespace wirs { + +template +struct Parms { + decltype(R::key) lower_bound; + decltype(R::key) upper_bound; + size_t sample_size; + gsl_rng *rng; +}; + +template +struct State { + decltype(R::weight) total_weight; + std::vector nodes; + psudb::Alias* top_level_alias; + size_t sample_size; + + State() { + total_weight = 0; + top_level_alias = nullptr; + } + + ~State() { + if (top_level_alias) delete top_level_alias; + } +}; + +template +struct BufferState { + size_t cutoff; + psudb::Alias* alias; + std::vector> records; + decltype(R::weight) max_weight; + size_t sample_size; + decltype(R::weight) total_weight; + + ~BufferState() { + delete alias; + } +}; + +template +class Query { +public: + constexpr static bool EARLY_ABORT=false; + constexpr static bool SKIP_DELETE_FILTER=false; + + static void *get_query_state(S *shard, void *parms) { + auto res = new State(); + decltype(R::key) lower_key = ((Parms *) parms)->lower_bound; + decltype(R::key) upper_key = ((Parms *) parms)->upper_bound; + + std::vector weights; + res->total_weight = shard->find_covering_nodes(lower_key, upper_key, res->nodes, weights); + + std::vector normalized_weights; + for (auto weight : weights) { + normalized_weights.emplace_back(weight / res->total_weight); + } + + res->top_level_alias = new psudb::Alias(normalized_weights); + res->sample_size = 0; + + return res; + } + + static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { + BufferState *state = new BufferState(); + auto parameters = (Parms*) parms; + + if constexpr (Rejection) { + state->cutoff = buffer->get_record_count() - 1; + state->max_weight = buffer->get_max_weight(); + state->total_weight = buffer->get_total_weight(); + state->sample_size = 0; + return state; + } + + std::vector weights; + + state->cutoff = buffer->get_record_count() - 1; + decltype(R::weight) total_weight = 0; + + for (size_t i = 0; i <= state->cutoff; i++) { + auto rec = buffer->get_data() + i; + + if (rec->rec.key >= parameters->lower_bound && rec->rec.key <= parameters->upper_bound && !rec->is_tombstone() && !rec->is_deleted()) { + weights.push_back(rec->rec.weight); + state->records.push_back(*rec); + total_weight += rec->rec.weight; + } + } + + std::vector normalized_weights; + for (size_t i = 0; i < weights.size(); i++) { + normalized_weights.push_back(weights[i] / total_weight); + } + + state->total_weight = total_weight; + state->alias = new psudb::Alias(normalized_weights); + state->sample_size = 0; + + return state; + } + + static void process_query_states(void *query_parms, std::vector &shard_states, std::vector &buffer_states) { + auto p = (Parms *) query_parms; + + std::vector shard_sample_sizes(shard_states.size()+buffer_states.size(), 0); + size_t buffer_sz = 0; + + std::vector weights; + + decltype(R::weight) total_weight = 0; + for (auto &s : buffer_states) { + auto 
bs = (BufferState *) s; + total_weight += bs->total_weight; + weights.push_back(bs->total_weight); + } + + for (auto &s : shard_states) { + auto state = (State *) s; + total_weight += state->total_weight; + weights.push_back(state->total_weight); + } + + std::vector normalized_weights; + for (auto w : weights) { + normalized_weights.push_back((double) w / (double) total_weight); + } + + auto shard_alias = psudb::Alias(normalized_weights); + for (size_t i=0; isample_size; i++) { + auto idx = shard_alias.get(p->rng); + + if (idx < buffer_states.size()) { + auto state = (BufferState *) buffer_states[idx]; + state->sample_size++; + } else { + auto state = (State *) shard_states[idx - buffer_states.size()]; + state->sample_size++; + } + } + } + + static std::vector> query(S *shard, void *q_state, void *parms) { + auto lower_key = ((Parms *) parms)->lower_bound; + auto upper_key = ((Parms *) parms)->upper_bound; + auto rng = ((Parms *) parms)->rng; + + auto state = (State *) q_state; + auto sample_size = state->sample_size; + + std::vector> result_set; + + if (sample_size == 0) { + return result_set; + } + size_t cnt = 0; + size_t attempts = 0; + + for (size_t i=0; iget_weighted_sample(lower_key, upper_key, + state->nodes[state->top_level_alias->get(rng)], + rng); + if (rec) { + result_set.emplace_back(*rec); + } + } + + return result_set; + } + + static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { + auto st = (BufferState *) state; + auto p = (Parms *) parms; + + std::vector> result; + result.reserve(st->sample_size); + + if constexpr (Rejection) { + for (size_t i=0; isample_size; i++) { + auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); + auto rec = buffer->get_data() + idx; + + auto test = gsl_rng_uniform(p->rng) * st->max_weight; + + if (test <= rec->rec.weight && rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { + result.emplace_back(*rec); + } + } + return result; + } + + for (size_t i=0; isample_size; i++) { + auto idx = st->alias->get(p->rng); + result.emplace_back(st->records[idx]); + } + + return result; + } + + static std::vector merge(std::vector>> &results, void *parms) { + std::vector output; + + for (size_t i=0; i *) state; + delete s; + } + + static void delete_buffer_query_state(void *state) { + auto s = (BufferState *) state; + delete s; + } +}; +}} diff --git a/include/shard/AugBTree.h b/include/shard/AugBTree.h new file mode 100644 index 0000000..e32ec64 --- /dev/null +++ b/include/shard/AugBTree.h @@ -0,0 +1,371 @@ +/* + * include/shard/AugBTree.h + * + * Copyright (C) 2023 Dong Xie + * Douglas B. Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. 
+ * + */ +#pragma once + + +#include +#include +#include +#include +#include + +#include "framework/ShardRequirements.h" + +#include "psu-ds/PriorityQueue.h" +#include "util/Cursor.h" +#include "psu-ds/Alias.h" +#include "psu-ds/BloomFilter.h" +#include "util/bf_config.h" + +using psudb::CACHELINE_SIZE; +using psudb::BloomFilter; +using psudb::PriorityQueue; +using psudb::queue_record; +using psudb::Alias; + +namespace de { + +thread_local size_t wirs_cancelations = 0; + +template +struct AugBTreeNode { + struct AugBTreeNode *left, *right; + decltype(R::key) low, high; + decltype(R::weight) weight; + Alias* alias; +}; + +template +class AugBTree { +private: + typedef decltype(R::key) K; + typedef decltype(R::value) V; + typedef decltype(R::weight) W; + +public: + AugBTree(MutableBuffer* buffer) + : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_root(nullptr) { + m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); + assert(m_alloc_size % CACHELINE_SIZE == 0); + m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + + m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); + + size_t offset = 0; + m_reccnt = 0; + auto base = buffer->get_data(); + auto stop = base + buffer->get_record_count(); + + std::sort(base, stop, std::less>()); + + while (base < stop) { + if (!(base->is_tombstone()) && (base + 1) < stop) { + if (base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { + base += 2; + wirs_cancelations++; + continue; + } + } else if (base->is_deleted()) { + base += 1; + continue; + } + + // FIXME: this shouldn't be necessary, but the tagged record + // bypass doesn't seem to be working on this code-path, so this + // ensures that tagged records from the buffer are able to be + // dropped, eventually. It should only need to be &= 1 + base->header &= 3; + m_data[m_reccnt++] = *base; + m_total_weight+= base->rec.weight; + + if (m_bf && base->is_tombstone()) { + m_tombstone_cnt++; + m_bf->insert(base->rec); + } + + base++; + } + + if (m_reccnt > 0) { + build_wirs_structure(); + } + } + + AugBTree(AugBTree** shards, size_t len) + : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_root(nullptr) { + std::vector>> cursors; + cursors.reserve(len); + + PriorityQueue> pq(len); + + size_t attemp_reccnt = 0; + size_t tombstone_count = 0; + + for (size_t i = 0; i < len; ++i) { + if (shards[i]) { + auto base = shards[i]->get_data(); + cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); + attemp_reccnt += shards[i]->get_record_count(); + tombstone_count += shards[i]->get_tombstone_count(); + pq.push(cursors[i].ptr, i); + } else { + cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); + } + } + + m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); + + m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); + assert(m_alloc_size % CACHELINE_SIZE == 0); + m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + + while (pq.size()) { + auto now = pq.peek(); + auto next = pq.size() > 1 ? 
pq.peek(1) : queue_record>{nullptr, 0}; + if (!now.data->is_tombstone() && next.data != nullptr && + now.data->rec == next.data->rec && next.data->is_tombstone()) { + + pq.pop(); pq.pop(); + auto& cursor1 = cursors[now.version]; + auto& cursor2 = cursors[next.version]; + if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); + if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); + } else { + auto& cursor = cursors[now.version]; + if (!cursor.ptr->is_deleted()) { + m_data[m_reccnt++] = *cursor.ptr; + m_total_weight += cursor.ptr->rec.weight; + if (m_bf && cursor.ptr->is_tombstone()) { + ++m_tombstone_cnt; + if (m_bf) m_bf->insert(cursor.ptr->rec); + } + } + pq.pop(); + + if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); + } + } + + if (m_reccnt > 0) { + build_wirs_structure(); + } + } + + ~AugBTree() { + if (m_data) free(m_data); + for (size_t i=0; i *point_lookup(const R &rec, bool filter=false) { + if (filter && !m_bf->lookup(rec)) { + return nullptr; + } + + size_t idx = get_lower_bound(rec.key); + if (idx >= m_reccnt) { + return nullptr; + } + + while (idx < m_reccnt && m_data[idx].rec < rec) ++idx; + + if (m_data[idx].rec == rec) { + return m_data + idx; + } + + return nullptr; + } + + Wrapped* get_data() const { + return m_data; + } + + size_t get_record_count() const { + return m_reccnt; + } + + size_t get_tombstone_count() const { + return m_tombstone_cnt; + } + + const Wrapped* get_record_at(size_t idx) const { + if (idx >= m_reccnt) return nullptr; + return m_data + idx; + } + + + size_t get_memory_usage() { + return m_alloc_size + m_node_cnt * sizeof(AugBTreeNode>); + } + + size_t get_aux_memory_usage() { + return 0; + } + + size_t get_lower_bound(const K& key) const { + size_t min = 0; + size_t max = m_reccnt - 1; + + const char * record_key; + while (min < max) { + size_t mid = (min + max) / 2; + + if (key > m_data[mid].rec.key) { + min = mid + 1; + } else { + max = mid; + } + } + + return min; + } + + W find_covering_nodes(K lower_key, K upper_key, std::vector &nodes, std::vector &weights) { + W total_weight = 0; + + /* Simulate a stack to unfold recursion. */ + struct AugBTreeNode* st[64] = {0}; + st[0] = m_root; + size_t top = 1; + while(top > 0) { + auto now = st[--top]; + if (covered_by(now, lower_key, upper_key) || + (now->left == nullptr && now->right == nullptr && intersects(now, lower_key, upper_key))) { + nodes.emplace_back(now); + weights.emplace_back(now->weight); + total_weight += now->weight; + } else { + if (now->left && intersects(now->left, lower_key, upper_key)) st[top++] = now->left; + if (now->right && intersects(now->right, lower_key, upper_key)) st[top++] = now->right; + } + } + + + return total_weight; + } + + Wrapped *get_weighted_sample(K lower_key, K upper_key, void *internal_node, gsl_rng *rng) { + /* k -> sampling: three levels. 1. select a node -> select a fat point -> select a record. 
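+     *
+     * A worked micro-example, assuming m_group_size = 3 and records
+     * r0..r8: the fat points are the groups {r0,r1,r2}, {r3,r4,r5},
+     * {r6,r7,r8}, each with an Alias over its members' weights. The
+     * covering node passed in was already chosen by the caller's
+     * top_level_alias (level one); the node's alias picks a fat point,
+     * i.e. a group (level two); and that group's Alias picks a record
+     * within it (level three). One sample is three O(1) alias draws
+     * plus a bounds check, returning nullptr on rejection.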
*/ + + /* first level */ + auto node = (AugBTreeNode*) internal_node; + + /* second level */ + auto fat_point = node->low + node->alias->get(rng); + + /* third level */ + size_t rec_offset = fat_point * m_group_size + m_alias[fat_point]->get(rng); + auto record = m_data + rec_offset; + + /* bounds rejection */ + if (lower_key > record->rec.key || upper_key < record->rec.key) { + return nullptr; + } + + return record; + } + +private: + + bool covered_by(struct AugBTreeNode* node, const K& lower_key, const K& upper_key) { + auto low_index = node->low * m_group_size; + auto high_index = std::min((node->high + 1) * m_group_size - 1, m_reccnt - 1); + return lower_key < m_data[low_index].rec.key && m_data[high_index].rec.key < upper_key; + } + + bool intersects(struct AugBTreeNode* node, const K& lower_key, const K& upper_key) { + auto low_index = node->low * m_group_size; + auto high_index = std::min((node->high + 1) * m_group_size - 1, m_reccnt - 1); + return lower_key < m_data[high_index].rec.key && m_data[low_index].rec.key < upper_key; + } + + void build_wirs_structure() { + m_group_size = std::ceil(std::log(m_reccnt)); + size_t n_groups = std::ceil((double) m_reccnt / (double) m_group_size); + + // Fat point construction + low level alias.... + double sum_weight = 0.0; + std::vector weights; + std::vector group_norm_weight; + size_t i = 0; + size_t group_no = 0; + while (i < m_reccnt) { + double group_weight = 0.0; + group_norm_weight.clear(); + for (size_t k = 0; k < m_group_size && i < m_reccnt; ++k, ++i) { + auto w = m_data[i].rec.weight; + group_norm_weight.emplace_back(w); + group_weight += w; + sum_weight += w; + } + + for (auto& w: group_norm_weight) + if (group_weight) w /= group_weight; + else w = 1.0 / group_norm_weight.size(); + m_alias.emplace_back(new Alias(group_norm_weight)); + + + weights.emplace_back(group_weight); + } + + assert(weights.size() == n_groups); + + m_root = construct_AugBTreeNode(weights, 0, n_groups-1); + } + + struct AugBTreeNode* construct_AugBTreeNode(const std::vector& weights, size_t low, size_t high) { + if (low == high) { + return new AugBTreeNode{nullptr, nullptr, low, high, weights[low], new Alias({1.0})}; + } else if (low > high) return nullptr; + + std::vector node_weights; + W sum = 0; + for (size_t i = low; i < high; ++i) { + node_weights.emplace_back(weights[i]); + sum += weights[i]; + } + + for (auto& w: node_weights) + if (sum) w /= sum; + else w = 1.0 / node_weights.size(); + + m_node_cnt += 1; + size_t mid = (low + high) / 2; + return new AugBTreeNode{construct_AugBTreeNode(weights, low, mid), + construct_AugBTreeNode(weights, mid + 1, high), + low, high, sum, new Alias(node_weights)}; + } + + void free_tree(struct AugBTreeNode* node) { + if (node) { + delete node->alias; + free_tree(node->left); + free_tree(node->right); + delete node; + } + } + + Wrapped* m_data; + std::vector m_alias; + AugBTreeNode* m_root; + W m_total_weight; + size_t m_reccnt; + size_t m_tombstone_cnt; + size_t m_group_size; + size_t m_alloc_size; + size_t m_node_cnt; + BloomFilter *m_bf; +}; +} diff --git a/include/shard/WIRS.h b/include/shard/WIRS.h deleted file mode 100644 index bf29325..0000000 --- a/include/shard/WIRS.h +++ /dev/null @@ -1,594 +0,0 @@ -/* - * include/shard/WIRS.h - * - * Copyright (C) 2023 Dong Xie - * Douglas B. Rumbaugh - * - * All rights reserved. Published under the Modified BSD License. 
- * - */ -#pragma once - - -#include -#include -#include -#include -#include - -#include "framework/ShardRequirements.h" - -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" -#include "psu-ds/Alias.h" -#include "psu-ds/BloomFilter.h" -#include "util/bf_config.h" - -using psudb::CACHELINE_SIZE; -using psudb::BloomFilter; -using psudb::PriorityQueue; -using psudb::queue_record; -using psudb::Alias; - -namespace de { - -thread_local size_t wirs_cancelations = 0; - -template -struct wirs_query_parms { - decltype(R::key) lower_bound; - decltype(R::key) upper_bound; - size_t sample_size; - gsl_rng *rng; -}; - -template -class WIRSQuery; - -template -struct wirs_node { - struct wirs_node *left, *right; - decltype(R::key) low, high; - decltype(R::weight) weight; - Alias* alias; -}; - -template -struct WIRSState { - decltype(R::weight) total_weight; - std::vector*> nodes; - Alias* top_level_alias; - size_t sample_size; - - WIRSState() { - total_weight = 0; - top_level_alias = nullptr; - } - - ~WIRSState() { - if (top_level_alias) delete top_level_alias; - } -}; - -template -struct WIRSBufferState { - size_t cutoff; - Alias* alias; - std::vector> records; - decltype(R::weight) max_weight; - size_t sample_size; - decltype(R::weight) total_weight; - - ~WIRSBufferState() { - delete alias; - } - -}; - -template -class WIRS { -private: - - typedef decltype(R::key) K; - typedef decltype(R::value) V; - typedef decltype(R::weight) W; - -public: - - // FIXME: there has to be a better way to do this - friend class WIRSQuery; - friend class WIRSQuery; - - WIRS(MutableBuffer* buffer) - : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_root(nullptr) { - - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); - - size_t offset = 0; - m_reccnt = 0; - auto base = buffer->get_data(); - auto stop = base + buffer->get_record_count(); - - std::sort(base, stop, std::less>()); - - while (base < stop) { - if (!(base->is_tombstone()) && (base + 1) < stop) { - if (base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - wirs_cancelations++; - continue; - } - } else if (base->is_deleted()) { - base += 1; - continue; - } - - // FIXME: this shouldn't be necessary, but the tagged record - // bypass doesn't seem to be working on this code-path, so this - // ensures that tagged records from the buffer are able to be - // dropped, eventually. 
It should only need to be &= 1 - base->header &= 3; - m_data[m_reccnt++] = *base; - m_total_weight+= base->rec.weight; - - if (m_bf && base->is_tombstone()) { - m_tombstone_cnt++; - m_bf->insert(base->rec); - } - - base++; - } - - if (m_reccnt > 0) { - build_wirs_structure(); - } - } - - WIRS(WIRS** shards, size_t len) - : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_root(nullptr) { - std::vector>> cursors; - cursors.reserve(len); - - PriorityQueue> pq(len); - - size_t attemp_reccnt = 0; - size_t tombstone_count = 0; - - for (size_t i = 0; i < len; ++i) { - if (shards[i]) { - auto base = shards[i]->get_data(); - cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); - attemp_reccnt += shards[i]->get_record_count(); - tombstone_count += shards[i]->get_tombstone_count(); - pq.push(cursors[i].ptr, i); - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - } - - m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); - - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[now.version]; - auto& cursor2 = cursors[next.version]; - if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[now.version]; - if (!cursor.ptr->is_deleted()) { - m_data[m_reccnt++] = *cursor.ptr; - m_total_weight += cursor.ptr->rec.weight; - if (m_bf && cursor.ptr->is_tombstone()) { - ++m_tombstone_cnt; - if (m_bf) m_bf->insert(cursor.ptr->rec); - } - } - pq.pop(); - - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); - } - } - - if (m_reccnt > 0) { - build_wirs_structure(); - } - } - - ~WIRS() { - if (m_data) free(m_data); - for (size_t i=0; i *point_lookup(const R &rec, bool filter=false) { - if (filter && !m_bf->lookup(rec)) { - return nullptr; - } - - size_t idx = get_lower_bound(rec.key); - if (idx >= m_reccnt) { - return nullptr; - } - - while (idx < m_reccnt && m_data[idx].rec < rec) ++idx; - - if (m_data[idx].rec == rec) { - return m_data + idx; - } - - return nullptr; - } - - Wrapped* get_data() const { - return m_data; - } - - size_t get_record_count() const { - return m_reccnt; - } - - size_t get_tombstone_count() const { - return m_tombstone_cnt; - } - - const Wrapped* get_record_at(size_t idx) const { - if (idx >= m_reccnt) return nullptr; - return m_data + idx; - } - - - size_t get_memory_usage() { - return m_alloc_size + m_node_cnt * sizeof(wirs_node>); - } - - size_t get_aux_memory_usage() { - return 0; - } - -private: - - size_t get_lower_bound(const K& key) const { - size_t min = 0; - size_t max = m_reccnt - 1; - - const char * record_key; - while (min < max) { - size_t mid = (min + max) / 2; - - if (key > m_data[mid].rec.key) { - min = mid + 1; - } else { - max = mid; - } - } - - return min; - } - - bool covered_by(struct wirs_node* node, const K& lower_key, const K& upper_key) { - auto low_index = node->low * m_group_size; - auto high_index = std::min((node->high + 1) * m_group_size - 1, m_reccnt - 1); - return lower_key < 
m_data[low_index].rec.key && m_data[high_index].rec.key < upper_key; - } - - bool intersects(struct wirs_node* node, const K& lower_key, const K& upper_key) { - auto low_index = node->low * m_group_size; - auto high_index = std::min((node->high + 1) * m_group_size - 1, m_reccnt - 1); - return lower_key < m_data[high_index].rec.key && m_data[low_index].rec.key < upper_key; - } - - void build_wirs_structure() { - m_group_size = std::ceil(std::log(m_reccnt)); - size_t n_groups = std::ceil((double) m_reccnt / (double) m_group_size); - - // Fat point construction + low level alias.... - double sum_weight = 0.0; - std::vector weights; - std::vector group_norm_weight; - size_t i = 0; - size_t group_no = 0; - while (i < m_reccnt) { - double group_weight = 0.0; - group_norm_weight.clear(); - for (size_t k = 0; k < m_group_size && i < m_reccnt; ++k, ++i) { - auto w = m_data[i].rec.weight; - group_norm_weight.emplace_back(w); - group_weight += w; - sum_weight += w; - } - - for (auto& w: group_norm_weight) - if (group_weight) w /= group_weight; - else w = 1.0 / group_norm_weight.size(); - m_alias.emplace_back(new Alias(group_norm_weight)); - - - weights.emplace_back(group_weight); - } - - assert(weights.size() == n_groups); - - m_root = construct_wirs_node(weights, 0, n_groups-1); - } - - struct wirs_node* construct_wirs_node(const std::vector& weights, size_t low, size_t high) { - if (low == high) { - return new wirs_node{nullptr, nullptr, low, high, weights[low], new Alias({1.0})}; - } else if (low > high) return nullptr; - - std::vector node_weights; - W sum = 0; - for (size_t i = low; i < high; ++i) { - node_weights.emplace_back(weights[i]); - sum += weights[i]; - } - - for (auto& w: node_weights) - if (sum) w /= sum; - else w = 1.0 / node_weights.size(); - - m_node_cnt += 1; - size_t mid = (low + high) / 2; - return new wirs_node{construct_wirs_node(weights, low, mid), - construct_wirs_node(weights, mid + 1, high), - low, high, sum, new Alias(node_weights)}; - } - - void free_tree(struct wirs_node* node) { - if (node) { - delete node->alias; - free_tree(node->left); - free_tree(node->right); - delete node; - } - } - - Wrapped* m_data; - std::vector m_alias; - wirs_node* m_root; - W m_total_weight; - size_t m_reccnt; - size_t m_tombstone_cnt; - size_t m_group_size; - size_t m_alloc_size; - size_t m_node_cnt; - BloomFilter *m_bf; -}; - - -template -class WIRSQuery { -public: - - constexpr static bool EARLY_ABORT=false; - constexpr static bool SKIP_DELETE_FILTER=false; - - static void *get_query_state(WIRS *wirs, void *parms) { - auto res = new WIRSState(); - decltype(R::key) lower_key = ((wirs_query_parms *) parms)->lower_bound; - decltype(R::key) upper_key = ((wirs_query_parms *) parms)->upper_bound; - - // Simulate a stack to unfold recursion. 
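- // (The traversal below is a canonical-range decomposition: a node is
- // kept whole when its key range is covered by [lower_key, upper_key],
- // otherwise the children that intersect the range are pushed, leaving
- // at most a few retained nodes per tree level, whose summed weights
- // seed the top-level alias.)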
- double total_weight = 0.0; - struct wirs_node* st[64] = {0}; - st[0] = wirs->m_root; - size_t top = 1; - while(top > 0) { - auto now = st[--top]; - if (wirs->covered_by(now, lower_key, upper_key) || - (now->left == nullptr && now->right == nullptr && wirs->intersects(now, lower_key, upper_key))) { - res->nodes.emplace_back(now); - total_weight += now->weight; - } else { - if (now->left && wirs->intersects(now->left, lower_key, upper_key)) st[top++] = now->left; - if (now->right && wirs->intersects(now->right, lower_key, upper_key)) st[top++] = now->right; - } - } - - std::vector weights; - for (const auto& node: res->nodes) { - weights.emplace_back(node->weight / total_weight); - } - res->total_weight = total_weight; - res->top_level_alias = new Alias(weights); - res->sample_size = 0; - - return res; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - WIRSBufferState *state = new WIRSBufferState(); - auto parameters = (wirs_query_parms*) parms; - if constexpr (Rejection) { - state->cutoff = buffer->get_record_count() - 1; - state->max_weight = buffer->get_max_weight(); - state->total_weight = buffer->get_total_weight(); - state->sample_size = 0; - return state; - } - - std::vector weights; - - state->cutoff = buffer->get_record_count() - 1; - double total_weight = 0.0; - - for (size_t i = 0; i <= state->cutoff; i++) { - auto rec = buffer->get_data() + i; - - if (rec->rec.key >= parameters->lower_bound && rec->rec.key <= parameters->upper_bound && !rec->is_tombstone() && !rec->is_deleted()) { - weights.push_back(rec->rec.weight); - state->records.push_back(*rec); - total_weight += rec->rec.weight; - } - } - - for (size_t i = 0; i < weights.size(); i++) { - weights[i] = weights[i] / total_weight; - } - - state->total_weight = total_weight; - state->alias = new Alias(weights); - state->sample_size = 0; - - return state; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, std::vector &buff_states) { - // FIXME: need to redo for the buffer vector interface - auto p = (wirs_query_parms *) query_parms; - - std::vector shard_sample_sizes(shard_states.size()+1, 0); - size_t buffer_sz = 0; - - decltype(R::weight) total_weight = 0; - std::vector weights; - for (auto &s : buff_states) { - auto state = (WIRSBufferState *) s; - total_weight += state->total_weight; - weights.push_back(state->total_weight); - } - - for (auto &s : shard_states) { - auto state = (WIRSState *) s; - total_weight += state->total_weight; - weights.push_back(state->total_weight); - } - - std::vector normalized_weights; - for (auto w : weights) { - normalized_weights.push_back((double) w / (double) total_weight); - } - - auto shard_alias = Alias(normalized_weights); - for (size_t i=0; isample_size; i++) { - auto idx = shard_alias.get(p->rng); - if (idx == 0) { - buffer_sz++; - } else { - shard_sample_sizes[idx - 1]++; - } - } - - for (size_t i=0; i *) shard_states[i]; - state->sample_size = shard_sample_sizes[i+1]; - } - } - - - - static std::vector> query(WIRS *wirs, void *q_state, void *parms) { - auto lower_key = ((wirs_query_parms *) parms)->lower_bound; - auto upper_key = ((wirs_query_parms *) parms)->upper_bound; - auto rng = ((wirs_query_parms *) parms)->rng; - - auto state = (WIRSState *) q_state; - auto sample_size = state->sample_size; - - std::vector> result_set; - - if (sample_size == 0) { - return result_set; - } - // k -> sampling: three levels. 1. select a node -> select a fat point -> select a record. 
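- // (Each iteration of the loop below costs three O(1) alias draws; a
- // draw whose key falls outside the requested bounds is skipped, so the
- // loop performs exactly sample_size attempts and may return fewer than
- // sample_size records.)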
- size_t cnt = 0; - size_t attempts = 0; - do { - ++attempts; - // first level.... - auto node = state->nodes[state->top_level_alias->get(rng)]; - // second level... - auto fat_point = node->low + node->alias->get(rng); - // third level... - size_t rec_offset = fat_point * wirs->m_group_size + wirs->m_alias[fat_point]->get(rng); - auto record = wirs->m_data + rec_offset; - - // bounds rejection - if (lower_key > record->rec.key || upper_key < record->rec.key) { - continue; - } - - result_set.emplace_back(*record); - cnt++; - } while (attempts < sample_size); - - return result_set; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { - auto st = (WIRSBufferState *) state; - auto p = (wirs_query_parms *) parms; - - std::vector> result; - result.reserve(st->sample_size); - - if constexpr (Rejection) { - for (size_t i=0; isample_size; i++) { - auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); - auto rec = buffer->get_data() + idx; - - auto test = gsl_rng_uniform(p->rng) * st->max_weight; - - if (test <= rec->rec.weight && rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { - result.emplace_back(*rec); - } - } - return result; - } - - for (size_t i=0; isample_size; i++) { - auto idx = st->alias->get(p->rng); - result.emplace_back(st->records[idx]); - } - - return result; - } - - static std::vector merge(std::vector>> &results, void *parms) { - std::vector output; - - for (size_t i=0; i *) state; - delete s; - } - - static void delete_buffer_query_state(void *state) { - auto s = (WIRSBufferState *) state; - delete s; - } - - - //{q.get_buffer_query_state(p, p)}; - //{q.buffer_query(p, p)}; - -}; - -} -- cgit v1.2.3 From d703f2d74c2dfa6fdb367e9d7e309028005a907d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 12:33:18 -0500 Subject: DynamicExtension::create_static_structure: fixed heap overflow --- include/framework/DynamicExtension.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 5c1eaab..0858fc3 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -169,7 +169,7 @@ public: // FIXME: With an interface adjustment, this could be done in // one call, rather than a loop. - for (size_t i=bv.size() - 1; i>=0; i--) { + for (ssize_t i=bv.size() - 1; i>=0; i--) { shards.emplace_back(new S(bv.get_buffers()[i])); } -- cgit v1.2.3 From 355ddd7b595fce201c305caecea415ab325e170e Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 7 Nov 2023 15:14:38 -0500 Subject: DynamicExtension: revised the way uneeded buffers/structures are released --- include/framework/DynamicExtension.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 233bebb..edbb6f5 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -271,6 +271,7 @@ private: // the empty buffer added when the merge was first triggered is also included. // Due to the reordering of operations in internal_append, the new buffer exists // at the time of the clone, and so is already in the new epoch. + std::unique_lock lk(m_struct_lock); for (size_t i=old_buffer_cnt-1; iget_buffers().size(); i++) { new_epoch->add_buffer(old_epoch->get_buffers()[i]); } @@ -361,17 +362,23 @@ private: * be safely freed. 
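 *
 * Note the iterator discipline below: erasing an element invalidates
 * iterators to it, so each loop advances via itr = container.erase(itr)
 * rather than erasing inside a range-for, which previously left the
 * loop iterating through a dangling iterator.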
*/ std::unique_lock lock(m_struct_lock); - for (auto buf : m_buffers) { - if (buf->get_reference_count() == 0) { - m_buffers.erase(buf); - delete buf; + for (auto itr = m_buffers.begin(); itr != m_buffers.end();) { + if ((*itr)->get_reference_count() == 0) { + auto tmp = *itr; + itr = m_buffers.erase(itr); + delete tmp; + } else { + itr++; } } - for (auto vers : m_versions) { - if (vers->get_reference_count() == 0) { - m_versions.erase(vers); - delete vers; + for (auto itr = m_versions.begin(); itr != m_versions.end();) { + if ((*itr)->get_reference_count() == 0) { + auto tmp = *itr; + itr = m_versions.erase(itr); + delete tmp; + } else { + itr++; } } } -- cgit v1.2.3 From 357cab549c2ed33970562b84ff6f83923742343d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 7 Nov 2023 15:34:24 -0500 Subject: Comment and License updates --- include/framework/DynamicExtension.h | 2 +- include/framework/QueryRequirements.h | 4 ++-- include/framework/ShardRequirements.h | 2 +- include/framework/interface/Query.h | 2 +- include/framework/interface/Record.h | 2 +- include/framework/interface/Scheduler.h | 2 +- include/framework/interface/Shard.h | 2 +- include/framework/scheduling/Epoch.h | 2 +- include/framework/scheduling/FIFOScheduler.h | 2 +- include/framework/scheduling/SerialScheduler.h | 2 +- include/framework/scheduling/Task.h | 2 +- include/framework/structure/BufferView.h | 2 +- include/framework/structure/ExtensionStructure.h | 2 +- include/framework/structure/InternalLevel.h | 2 +- include/framework/structure/MutableBuffer.h | 2 +- include/framework/util/Configuration.h | 2 +- include/query/irs.h | 6 +++++- include/query/rangequery.h | 6 ++++-- include/query/wirs.h | 6 +++++- include/query/wss.h | 9 ++++++--- include/shard/Alias.h | 6 +++++- include/shard/AugBTree.h | 6 +++++- include/shard/ISAMTree.h | 4 +++- include/shard/PGM.h | 6 +++++- include/shard/TrieSpline.h | 4 +++- include/shard/VPTree.h | 7 ++++++- include/util/Cursor.h | 2 +- include/util/bf_config.h | 4 ++-- include/util/types.h | 2 +- 29 files changed, 68 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index edbb6f5..7244856 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -4,7 +4,7 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/QueryRequirements.h b/include/framework/QueryRequirements.h index ff4eaff..4d3e97b 100644 --- a/include/framework/QueryRequirements.h +++ b/include/framework/QueryRequirements.h @@ -3,9 +3,9 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * - * A header file containing the necessary includes for Shard + * A header file containing the necessary includes for Query * development. * */ diff --git a/include/framework/ShardRequirements.h b/include/framework/ShardRequirements.h index 55e7199..d054030 100644 --- a/include/framework/ShardRequirements.h +++ b/include/framework/ShardRequirements.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * * A header file containing the necessary includes for Shard * development. 
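As an aside on the create_static_structure fix a few commits above: with an
unsigned index, the loop "for (size_t i = bv.size() - 1; i >= 0; i--)" cannot
terminate, since i >= 0 is always true for a size_t and decrementing past zero
wraps to SIZE_MAX, producing the heap overflow the commit repairs. A minimal
standalone sketch of the corrected pattern (illustrative only; visit_reverse
is a hypothetical helper, not framework code):

    #include <sys/types.h>   // ssize_t (POSIX)
    #include <vector>

    // Reverse iteration without unsigned wraparound: a signed index makes
    // i >= 0 a meaningful termination test, where a size_t index would
    // wrap to SIZE_MAX after reaching zero and index far out of bounds.
    template <typename T>
    void visit_reverse(const std::vector<T> &v) {
        for (ssize_t i = static_cast<ssize_t>(v.size()) - 1; i >= 0; i--) {
            (void) v[static_cast<size_t>(i)];   // always in bounds
        }
    }
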
diff --git a/include/framework/interface/Query.h b/include/framework/interface/Query.h index 21cadcb..8b92c45 100644 --- a/include/framework/interface/Query.h +++ b/include/framework/interface/Query.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/interface/Record.h b/include/framework/interface/Record.h index bf495df..457078d 100644 --- a/include/framework/interface/Record.h +++ b/include/framework/interface/Record.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * * FIXME: the record implementations could probably be broken out into * different files, leaving only the interface here diff --git a/include/framework/interface/Scheduler.h b/include/framework/interface/Scheduler.h index 63581d2..a8544a7 100644 --- a/include/framework/interface/Scheduler.h +++ b/include/framework/interface/Scheduler.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/interface/Shard.h b/include/framework/interface/Shard.h index 92cdca0..2357795 100644 --- a/include/framework/interface/Shard.h +++ b/include/framework/interface/Shard.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 0ebbde9..9193b06 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h index 1521eb6..ba62f9e 100644 --- a/include/framework/scheduling/FIFOScheduler.h +++ b/include/framework/scheduling/FIFOScheduler.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/scheduling/SerialScheduler.h b/include/framework/scheduling/SerialScheduler.h index 93611d1..10c2af2 100644 --- a/include/framework/scheduling/SerialScheduler.h +++ b/include/framework/scheduling/SerialScheduler.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * * IMPORTANT: This "scheduler" is a shim implementation for allowing * strictly serial, single-threaded operation of the framework. It should diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index 6dfd7df..d211fb5 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. 
* */ #pragma once diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index ccd3dac..651e430 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 80ec7b9..74cede6 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -4,7 +4,7 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 632fe17..00e0c58 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -4,7 +4,7 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index ba25cc3..671824f 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -4,7 +4,7 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/util/Configuration.h b/include/framework/util/Configuration.h index ec4ec3a..866128a 100644 --- a/include/framework/util/Configuration.h +++ b/include/framework/util/Configuration.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/query/irs.h b/include/query/irs.h index 4cb69b0..fa69ea1 100644 --- a/include/query/irs.h +++ b/include/query/irs.h @@ -3,7 +3,11 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. + * + * A query class for independent range sampling. This query requires + * that the shard support get_lower_bound(key), get_upper_bound(key), + * and get_record_at(index). * */ #pragma once diff --git a/include/query/rangequery.h b/include/query/rangequery.h index b9ac9db..16dcd86 100644 --- a/include/query/rangequery.h +++ b/include/query/rangequery.h @@ -1,10 +1,12 @@ /* * include/query/rangequery.h * - * Copyright (C) 2023 Douglas B. Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * + * A query class for single dimensional range queries. This query requires + * that the shard support get_lower_bound(key) and get_record_at(index). */ #pragma once diff --git a/include/query/wirs.h b/include/query/wirs.h index 1113b1d..9b3d2ad 100644 --- a/include/query/wirs.h +++ b/include/query/wirs.h @@ -3,7 +3,11 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. 
+ * Distributed under the Modified BSD License. + * + * A query class for weighted independent range sampling. This + * class is tightly coupled with include/shard/AugBTree.h, and + * so is probably of limited general utility. * */ #pragma once diff --git a/include/query/wss.h b/include/query/wss.h index 794485c..4c8861e 100644 --- a/include/query/wss.h +++ b/include/query/wss.h @@ -1,10 +1,13 @@ /* - * include/query/rangequery.h + * include/query/wss.h * - * Copyright (C) 2023 Douglas B. Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * + * A query class for weighted set sampling. This + * class is tightly coupled with include/shard/Alias.h, + * and so is probably of limited general utility. */ #pragma once diff --git a/include/shard/Alias.h b/include/shard/Alias.h index a4a7d02..a3e8ad8 100644 --- a/include/shard/Alias.h +++ b/include/shard/Alias.h @@ -4,7 +4,11 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. + * + * A shard shim around psudb::Alias, an implementation of + * Walker's alias structure. Designed to be used alongside the WSS + * query in include/query/wss.h * */ #pragma once diff --git a/include/shard/AugBTree.h b/include/shard/AugBTree.h index e32ec64..be664ac 100644 --- a/include/shard/AugBTree.h +++ b/include/shard/AugBTree.h @@ -4,8 +4,12 @@ * Copyright (C) 2023 Dong Xie * Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * + * A shard shim around the alias augmented B-tree. Designed to be + * used alongside the WIRS query in include/query/wirs.h, but + * also supports the necessary methods for other common query + * types. */ #pragma once diff --git a/include/shard/ISAMTree.h b/include/shard/ISAMTree.h index a610c09..e11c899 100644 --- a/include/shard/ISAMTree.h +++ b/include/shard/ISAMTree.h @@ -4,7 +4,9 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. + * + * A shard shim around an in-memory ISAM tree. * */ #pragma once diff --git a/include/shard/PGM.h b/include/shard/PGM.h index 6b66b7d..13db26a 100644 --- a/include/shard/PGM.h +++ b/include/shard/PGM.h @@ -2,8 +2,12 @@ * include/shard/PGM.h * * Copyright (C) 2023 Douglas B. Rumbaugh + * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. + * + * A shard shim around the static version of the PGM learned + * index. * */ #pragma once diff --git a/include/shard/TrieSpline.h b/include/shard/TrieSpline.h index fdf8edb..56ec357 100644 --- a/include/shard/TrieSpline.h +++ b/include/shard/TrieSpline.h @@ -3,7 +3,9 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. + * + * A shard shim around the TrieSpline learned index. * */ #pragma once diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h index 978372b..2f5ebbb 100644 --- a/include/shard/VPTree.h +++ b/include/shard/VPTree.h @@ -3,7 +3,12 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. + * + * A shard shim around the VPTree spatial index.
+ * + * FIXME: separate the KNN query class out into a standalone + * file in include/query . * */ #pragma once diff --git a/include/util/Cursor.h b/include/util/Cursor.h index 00afaab..be7ab32 100644 --- a/include/util/Cursor.h +++ b/include/util/Cursor.h @@ -4,7 +4,7 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * * A simple record cursor type with associated methods for help in * merging record sets when constructing shards. diff --git a/include/util/bf_config.h b/include/util/bf_config.h index 4de465d..fdf2195 100644 --- a/include/util/bf_config.h +++ b/include/util/bf_config.h @@ -4,10 +4,10 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * * Global parameters for configuring bloom filters used as auxiliary - * structures on shards within the framework. The bloom filters themselves + * structures on shards within the framework. The bloom filter class * can be found in * * $PROJECT_ROOT/external/psudb-common/cpp/include/psu-ds/BloomFilter.h diff --git a/include/util/types.h b/include/util/types.h index b7f9607..3908174 100644 --- a/include/util/types.h +++ b/include/util/types.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * * A centralized header file for various data types used throughout the * code base. There are a few very specific types, such as header formats, -- cgit v1.2.3 From 39d22316be1708073e4fe1f708814cc801ecdc69 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 9 Nov 2023 11:08:34 -0500 Subject: Fixed various concurrency bugs 1. The system should now cleanly shut down when the DynamicExtension object is destroyed. Before now, this would lead to use-after-frees and/or deadlocks. 2. Improved synchronization on mutable buffer structure management to fix the issue of the framework losing track of buffers during Epoch changeovers. --- include/framework/DynamicExtension.h | 77 +++++++++++++++++++++++----- include/framework/scheduling/Epoch.h | 21 ++++++-- include/framework/scheduling/FIFOScheduler.h | 7 ++- 3 files changed, 84 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 7244856..a6047ea 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -62,6 +62,17 @@ public: } ~DynamicExtension() { + + /* let any in-flight epoch transition finish */ + await_next_epoch(); + + /* deactivate the active epoch */ + get_active_epoch()->set_inactive(); + + /* shut down the scheduler */ + m_sched.shutdown(); + + /* delete all held resources */ for (auto e : m_epochs) { delete e.second; } @@ -125,7 +136,11 @@ public: } size_t get_height() { - return get_active_epoch()->get_structure()->get_height(); + auto epoch = get_active_epoch_protected(); + auto t = epoch->get_structure()->get_height(); + epoch->end_job(); + + return t; } size_t get_memory_usage() { @@ -211,7 +226,10 @@ public: * tombstone proportion invariant.
*/ bool validate_tombstone_proportion() { - return get_active_epoch()->get_structure()->validate_tombstone_proportion(); + auto epoch = get_active_epoch_protected(); + auto t = epoch->get_structure()->validate_tombstone_proportion(); + epoch->end_job(); + return t; } private: @@ -228,6 +246,8 @@ private: std::condition_variable m_epoch_cv; std::mutex m_epoch_cv_lk; + std::mutex m_epoch_transition_lk; + size_t m_scale_factor; double m_max_delete_prop; size_t m_buffer_capacity; @@ -252,6 +272,8 @@ private: } void advance_epoch() { + m_epoch_transition_lk.lock(); + size_t new_epoch_num = m_newest_epoch.load(); size_t old_epoch_num = m_current_epoch.load(); assert(new_epoch_num != old_epoch_num); @@ -267,18 +289,19 @@ private: */ if constexpr (!std::same_as) { size_t old_buffer_cnt = new_epoch->clear_buffers(); - // FIXME: this is getting nightmarish... The -1 here is to ensure that the - // the empty buffer added when the merge was first triggered is also included. - // Due to the reordering of operations in internal_append, the new buffer exists - // at the time of the clone, and so is already in the new epoch. - std::unique_lock lk(m_struct_lock); - for (size_t i=old_buffer_cnt-1; iget_buffers().size(); i++) { + + /* + * skip the first buffer, as this was the one that got merged, + * and copy all the other buffer references into the new epoch + */ + for (size_t i=1; iget_buffers().size(); i++) { new_epoch->add_buffer(old_epoch->get_buffers()[i]); } } m_current_epoch.fetch_add(1); old_epoch->set_inactive(); + m_epoch_transition_lk.unlock(); /* notify any blocking threads that the new epoch is available */ m_epoch_cv_lk.lock(); @@ -310,16 +333,41 @@ private: } /* - * Add a new empty buffer to the specified epoch. This is intended to be used + * Add a new empty buffer. This is intended to be used * when a merge is triggered, to allow for inserts to be sustained in the new * buffer while a new epoch is being created in the background. Returns a * pointer to the newly created buffer. */ - Buffer *add_empty_buffer(_Epoch *epoch, Buffer *current_buffer=nullptr) { + Buffer *add_empty_buffer() { + /* + * if there's a current Epoch transition ongoing, a buffer installed + * into an older Epoch, but not the new one, may be lost. So fail to + * insert a buffer. + */ + if (!m_epoch_transition_lk.try_lock()) { + return nullptr; + } + + /* + * verify that the currently active buffer is still full; if + * not, there is no reason to add a new one. This code is + * protected by the epoch transition lock, so there is no need to + * take a protected reference to the epoch. + */ + auto active_epoch = get_active_epoch(); + if (!active_epoch->get_active_buffer()->is_full()) { + m_epoch_transition_lk.unlock(); + return nullptr; + } + + /* + * create a new buffer and install it in the active epoch.
+ */ auto temp_buffer = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); std::unique_lock m_struct_lock; - auto new_buffer = epoch->add_buffer(temp_buffer, current_buffer); + auto new_buffer = active_epoch->add_buffer(temp_buffer); + /* * if epoch->add_buffer doesn't add the new buffer, this insert * won't update the buffer set (duplicate insert) @@ -330,6 +378,7 @@ private: if (new_buffer != temp_buffer) { delete temp_buffer; } + m_epoch_transition_lk.unlock(); return new_buffer; } @@ -503,15 +552,15 @@ private: * add an empty buffer to allow insert proceed and * schedule a merge on a background thread */ - buffer = add_empty_buffer(epoch); + buffer = add_empty_buffer(); schedule_merge(); } else { /* background merge is ongoing, so just add empty buffer */ - buffer = add_empty_buffer(epoch, buffer); + buffer = add_empty_buffer(); } } - res = buffer->append(rec, ts); + res = (buffer) ? buffer->append(rec, ts) : 0; epoch->end_job(); } while(!res); diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 9193b06..fc08d57 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -47,9 +47,14 @@ public: ~Epoch() { assert(m_active_jobs.load() == 0); - for (auto buf : m_buffers) { - buf->release_reference(); - } + /* FIXME: this is needed to keep the destructor from + * sometimes locking up here. But there *shouldn't* be + * any threads waiting on this signal at object destruction, + * so something else is going on here that needs to be looked into + */ + //m_active_cv.notify_all(); + + clear_buffers(); if (m_structure) { m_structure->release_reference(); @@ -59,6 +64,7 @@ public: Buffer *add_buffer(Buffer *buf, Buffer *cur_buf=nullptr) { assert(buf); + std::unique_lock m_buffer_lock; /* * if a current buffer is specified, only add the * new buffer if the active buffer is the current, @@ -108,6 +114,7 @@ public: } BufView get_buffer_view() { + std::unique_lock m_buffer_lock; return BufView(m_buffers); } @@ -123,6 +130,7 @@ public: * releasing all references in the process. */ size_t clear_buffers() { + std::unique_lock m_buffer_lock; size_t buf_cnt = m_buffers.size(); for (auto buf : m_buffers) { if (buf) buf->release_reference(); @@ -138,6 +146,7 @@ public: * the new epoch will be set to the provided argument. */ Epoch *clone(size_t number) { + std::unique_lock m_buffer_lock; auto epoch = new Epoch(number); epoch->m_buffers = m_buffers; if (m_structure) { @@ -196,8 +205,8 @@ public: * wait for them to finish and return true.
If there are * not active jobs, return true immediately */ - while (m_active_jobs > 0) { - std::unique_lock lk(m_cv_lock); + std::unique_lock lk(m_cv_lock); + while (m_active_jobs.load() > 0) { m_active_cv.wait(lk); } @@ -211,6 +220,8 @@ private: std::condition_variable m_active_cv; std::mutex m_cv_lock; + std::mutex m_buffer_lock; + std::atomic m_active_merge; /* diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h index ba62f9e..4cdc436 100644 --- a/include/framework/scheduling/FIFOScheduler.h +++ b/include/framework/scheduling/FIFOScheduler.h @@ -47,9 +47,10 @@ public: } ~FIFOScheduler() { - shutdown(); + if (!m_shutdown.load()) { + shutdown(); + } - m_cv.notify_all(); m_sched_thrd.join(); } @@ -63,6 +64,8 @@ public: void shutdown() { m_shutdown.store(true); + m_thrd_pool.stop(true); + m_cv.notify_all(); } private: -- cgit v1.2.3 From 83486744600e8be338c75c2e3d2339452a392a9d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 13 Nov 2023 10:41:13 -0500 Subject: Fixed merge logic bug in tiering In InternalLevel::clone(), the m_shard_cnt variable was not being set appropriately in the clone, resulting in the record counts for a multi-shard level being reported incorrectly. In DynamicExtension::merge(), the merges were being performed in the wrong order, resulting in multi-level merges deleting records. The leveling tests all passed even with this bug for some reason, but it caused tiering tests to fail. It isn't clear _why_ leveling appeared to work, but the bug is now fixed, so that's largely irrelevant I suppose. --- include/framework/DynamicExtension.h | 3 ++- include/framework/structure/InternalLevel.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index a6047ea..9554c8c 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -436,11 +436,12 @@ private: MergeArgs *args = (MergeArgs *) arguments; Structure *vers = args->epoch->get_structure(); + // FIXME: with an improved shard interface, multiple full buffers // could be merged at once here. Buffer *buff = (Buffer *) args->epoch->get_buffers()[0]; - for (ssize_t i=args->merges.size() - 1; i>=0; i--) { + for (ssize_t i=0; imerges.size(); i++) { vers->merge_levels(args->merges[i].second, args->merges[i].first); } diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 00e0c58..d146b73 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -219,6 +219,7 @@ public: for (size_t i=0; im_shards[i] = m_shards[i]; } + new_level->m_shard_cnt = m_shard_cnt; return new_level; } -- cgit v1.2.3 From 90bb0614fc1d8f1a185a778e31aaf9027c01aeb8 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 13 Nov 2023 11:44:09 -0500 Subject: Tombstone Compaction: re-enabled tombstone compaction Currently, proactive buffer tombstone compaction is disabled by forcing the buffer tombstone capacity to match its record capacity. It isn't clear how to best handle proactive buffer compactions in an environment where new buffers are spawned anyway.
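The invariant driving these compactions is the per-level bound on the proportion of tombstones. Conceptually, the check that decides whether compaction work is needed amounts to the following (a rough sketch of the idea only; the actual member layout in ExtensionStructure may differ):

    /*
     * Sketch: the delete invariant holds when every level's tombstone
     * share stays at or below the configured maximum delete proportion.
     */
    bool validate_tombstone_proportion() {
        for (auto &level : m_levels) {
            if (level && level->get_record_count() > 0 &&
                (double) level->get_tombstone_count() /
                    (double) level->get_record_count() > m_max_delete_prop) {
                return false; /* this level needs a compaction */
            }
        }
        return true;
    }

When the check fails, get_compaction_tasks() (added in the diff below) locates the first violating level and emits a chain of merge tasks that push records, and their tombstones, down one level at a time so that cancellation can occur.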
--- include/framework/DynamicExtension.h | 55 +++++++++++++++++++++++- include/framework/scheduling/Task.h | 1 + include/framework/structure/ExtensionStructure.h | 51 ++++++++++++++++++++++ include/framework/structure/MutableBuffer.h | 2 +- 4 files changed, 106 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 9554c8c..9adc320 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -253,6 +253,32 @@ private: size_t m_buffer_capacity; size_t m_buffer_delete_capacity; + void enforce_delete_invariant(_Epoch *epoch) { + auto structure = epoch->get_structure(); + auto compactions = structure->get_compaction_tasks(); + + while (compactions.size() > 0) { + /* otherwise, we need to schedule a merge to compact tombstones */ + MergeArgs *args = new MergeArgs(); + args->epoch = epoch; + // FIXME: all full buffers can be merged at this point--but that requires + // retooling the shard interface a bit to do efficiently. + args->merges = compactions; + args->extension = this; + args->compaction = true; + + auto wait = args->result.get_future(); + + epoch->start_job(); + m_sched.schedule_job(merge, 0, args); + + /* wait for merge completion */ + wait.get(); + + compactions = structure->get_compaction_tasks(); + } + } + _Epoch *get_active_epoch() { return m_epochs[m_current_epoch.load()]; } @@ -272,6 +298,7 @@ private: } void advance_epoch() { + m_epoch_transition_lk.lock(); size_t new_epoch_num = m_newest_epoch.load(); @@ -281,6 +308,15 @@ private: _Epoch *new_epoch = m_epochs[new_epoch_num]; _Epoch *old_epoch = m_epochs[old_epoch_num]; + /* + * Verify the tombstone invariant within the epoch's structure, this + * may require scheduling additional merges. + * + * FIXME: having this inside the lock is going to TANK + * insertion performance. + */ + enforce_delete_invariant(new_epoch); + /* * Update the new Epoch to contain the buffers from the old one * that it doesn't currently have if using a multi-threaded @@ -445,12 +481,26 @@ private: vers->merge_levels(args->merges[i].second, args->merges[i].first); } - vers->merge_buffer(buff); + /* + * if the merge is a compaction, don't push the buffer down, + * as there is no guarantee that the merges will free up + * sufficient space in L0 + */ + if (!args->compaction) { + vers->merge_buffer(buff); + } args->epoch->end_job(); args->result.set_value(true); - ((DynamicExtension *) args->extension)->advance_epoch(); + /* + * Compactions occur on an epoch _before_ it becomes active, + * and as a result the active epoch should _not_ be advanced as + * part of a compaction merge + */ + if (!args->compaction) { + ((DynamicExtension *) args->extension)->advance_epoch(); + } delete args; } @@ -511,6 +561,7 @@ private: // retooling the shard interface a bit to do efficiently. 
args->merges = epoch->get_structure()->get_merge_tasks(epoch->get_buffers()[0]->get_record_count()); args->extension = this; + args->compaction = false; m_sched.schedule_job(merge, 0, args); } diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index d211fb5..c10ed8b 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -21,6 +21,7 @@ struct MergeArgs { Epoch *epoch; std::vector merges; std::promise result; + bool compaction; void *extension; }; diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 74cede6..a174805 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -201,6 +201,57 @@ public: return m_levels; } + std::vector get_compaction_tasks() { + std::vector tasks; + + /* if the tombstone/delete invariant is satisfied, no need for compactions */ + if (validate_tombstone_proportion()) { + return tasks; + } + + /* locate the first level to violate the invariant */ + level_index violation_idx = -1; + for (level_index i=0; i0; i--) { + MergeTask task = {i-1, i}; + + /* + * The amount of storage required for the merge accounts + * for the cost of storing the new records, along with the + * cost of retaining the old records during the process + * (hence the 2x multiplier). + * + * FIXME: currently does not account for the *actual* size + * of the shards, only the storage for the records + * themselves. + */ + size_t reccnt = m_levels[i-1]->get_record_count(); + if constexpr (L == LayoutPolicy::LEVELING) { + if (can_merge_with(i, reccnt)) { + reccnt += m_levels[i]->get_record_count(); + } + } + //task.m_size = 2* reccnt * sizeof(R); + + tasks.push_back(task); + } + + return tasks; + } + /* * */ diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 671824f..8b17091 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -32,7 +32,7 @@ template class MutableBuffer { public: MutableBuffer(size_t capacity, size_t max_tombstone_cap) - : m_cap(capacity), m_tombstone_cap(max_tombstone_cap), m_reccnt(0) + : m_cap(capacity), m_tombstone_cap(capacity), m_reccnt(0) , m_tombstonecnt(0), m_weight(0), m_max_weight(0), m_tail(0) { m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); -- cgit v1.2.3 From fe12926c41eed825da80a36d77b7facd9ba0567a Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 15 Nov 2023 15:18:33 -0500 Subject: Lock protect Epoch during retirement to avoid use-after-free errors --- include/framework/DynamicExtension.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 9adc320..8edcc5f 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "framework/structure/MutableBuffer.h" #include "framework/structure/InternalLevel.h" @@ -247,6 +249,7 @@ private: std::mutex m_epoch_cv_lk; std::mutex m_epoch_transition_lk; + std::shared_mutex m_epoch_retire_lk; size_t m_scale_factor; double m_max_delete_prop; @@ -284,15 +287,10 @@ private: } _Epoch *get_active_epoch_protected() { - ssize_t cur_epoch = 
-1; - do { - if (cur_epoch != -1) { - m_epochs[cur_epoch]->end_job(); - } - - cur_epoch = m_current_epoch.load(); - m_epochs[cur_epoch]->start_job(); - } while (cur_epoch != m_current_epoch.load()); + m_epoch_retire_lk.lock_shared(); + auto cur_epoch = m_current_epoch.load(); + m_epochs[cur_epoch]->start_job(); + m_epoch_retire_lk.unlock_shared(); return m_epochs[cur_epoch]; } @@ -429,8 +427,14 @@ private: * number will hit zero and the function will * proceed. */ - while (!epoch->retirable()) - ; + + do { + m_epoch_retire_lk.lock(); + if (epoch->retirable()) { + break; + } + m_epoch_retire_lk.unlock(); + } while (true); /* remove epoch from the framework's map */ m_epochs.erase(epoch->get_epoch_number()); @@ -440,6 +444,7 @@ private: * all the references it holds */ delete epoch; + m_epoch_retire_lk.unlock(); /* * Following the epoch's destruction, any buffers -- cgit v1.2.3 From 3c127eda69295cb306739bdd3c5ddccff6026a8d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 13 Dec 2023 12:39:54 -0500 Subject: Refactoring: corrected a number of names and added more comments --- include/framework/DynamicExtension.h | 77 ++++++------ include/framework/scheduling/Epoch.h | 2 +- include/framework/scheduling/Task.h | 4 +- include/framework/structure/ExtensionStructure.h | 142 +++++++++++------------ include/framework/structure/InternalLevel.h | 58 ++++++--- include/framework/structure/MutableBuffer.h | 18 ++- include/framework/util/Configuration.h | 2 +- 7 files changed, 163 insertions(+), 140 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 8edcc5f..fe43c52 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -165,8 +165,8 @@ public: return m_buffer_capacity; } - Shard *create_static_structure(bool await_merge_completion=false) { - if (await_merge_completion) { + Shard *create_static_structure(bool await_reconstruction_completion=false) { + if (await_reconstruction_completion) { await_next_epoch(); } @@ -179,7 +179,7 @@ public: if (vers->get_levels().size() > 0) { for (int i=vers->get_levels().size() - 1; i>= 0; i--) { if (vers->get_levels()[i]) { - shards.emplace_back(vers->get_levels()[i]->get_merged_shard()); + shards.emplace_back(vers->get_levels()[i]->get_combined_shard()); } } } @@ -261,10 +261,10 @@ private: auto compactions = structure->get_compaction_tasks(); while (compactions.size() > 0) { - /* otherwise, we need to schedule a merge to compact tombstones */ - MergeArgs *args = new MergeArgs(); + /* otherwise, we need to schedule a compaction */ + ReconstructionArgs *args = new ReconstructionArgs(); args->epoch = epoch; - // FIXME: all full buffers can be merged at this point--but that requires + // FIXME: all full buffers can be flushed at this point--but that requires // retooling the shard interface a bit to do efficiently. args->merges = compactions; args->extension = this; @@ -273,9 +273,9 @@ private: auto wait = args->result.get_future(); epoch->start_job(); - m_sched.schedule_job(merge, 0, args); + m_sched.schedule_job(reconstruction, 0, args); - /* wait for merge completion */ + /* wait for reconstruction completion */ wait.get(); compactions = structure->get_compaction_tasks(); @@ -308,7 +308,7 @@ private: /* * Verify the tombstone invariant within the epoch's structure, this - * may require scheduling additional merges. + * may require scheduling additional reconstructions. 
* * FIXME: having this inside the lock is going to TANK * insertion performance. @@ -325,8 +325,9 @@ private: size_t old_buffer_cnt = new_epoch->clear_buffers(); /* - * skip the first buffer, as this was the one that got merged, - * and copy all the other buffer references into the new epoch + * skip the first buffer, as this was flushed into the epoch's + * structure already, and copy all the other buffer references + * into the new epoch */ for (size_t i=1; iget_buffers().size(); i++) { new_epoch->add_buffer(old_epoch->get_buffers()[i]); @@ -352,7 +353,7 @@ private: _Epoch *create_new_epoch() { /* * This epoch access is _not_ protected under the assumption that - * only one merge will be able to trigger at a time. If that condition + * only one reconstruction will be able to trigger at a time. If that condition * is violated, it is possible that this code will clone a retired * epoch. */ @@ -368,7 +369,7 @@ private: /* * Add a new empty buffer. This is intended to be used - * when a merge is triggered, to allow for inserts to be sustained in the new + * when a reconstruction is triggered, to allow for inserts to be sustained in the new * buffer while a new epoch is being created in the background. Returns a * pointer to the newly created buffer. */ @@ -429,13 +430,12 @@ private: */ do { - m_epoch_retire_lk.lock(); if (epoch->retirable()) { break; } - m_epoch_retire_lk.unlock(); } while (true); + m_epoch_retire_lk.lock(); /* remove epoch from the framework's map */ m_epochs.erase(epoch->get_epoch_number()); @@ -473,26 +473,26 @@ private: } } - static void merge(void *arguments) { - MergeArgs *args = (MergeArgs *) arguments; + static void reconstruction(void *arguments) { + ReconstructionArgs *args = (ReconstructionArgs *) arguments; Structure *vers = args->epoch->get_structure(); // FIXME: with an improved shard interface, multiple full buffers - // could be merged at once here. + // could be flushed at once here. Buffer *buff = (Buffer *) args->epoch->get_buffers()[0]; for (ssize_t i=0; imerges.size(); i++) { - vers->merge_levels(args->merges[i].second, args->merges[i].first); + vers->reconstruction(args->merges[i].second, args->merges[i].first); } /* - * if the merge is a compaction, don't push the buffer down, - * as there is no guarantee that the merges will free up - * sufficient space in L0 + * if performing a compaction, don't push the buffer down, + * as there is no guarantee that any necessary reconstructions + * will free sufficient space in L0 to support a flush */ if (!args->compaction) { - vers->merge_buffer(buff); + vers->flush_buffer(buff); } args->epoch->end_job(); @@ -501,7 +501,7 @@ private: /* * Compactions occur on an epoch _before_ it becomes active, * and as a result the active epoch should _not_ be advanced as - * part of a compaction merge + * part of a compaction */ if (!args->compaction) { ((DynamicExtension *) args->extension)->advance_epoch(); @@ -556,18 +556,19 @@ private: delete args; } - void schedule_merge() { + void schedule_reconstruction() { + //fprintf(stderr, "%ld\t Reconstruction Scheduling", m_current_epoch); auto epoch = create_new_epoch(); epoch->start_job(); - MergeArgs *args = new MergeArgs(); + ReconstructionArgs *args = new ReconstructionArgs(); args->epoch = epoch; - // FIXME: all full buffers can be merged at this point--but that requires + // FIXME: all full buffers can be flushed at this point--but that requires // retooling the shard interface a bit to do efficiently. 
- args->merges = epoch->get_structure()->get_merge_tasks(epoch->get_buffers()[0]->get_record_count()); + args->merges = epoch->get_structure()->get_reconstruction_tasks(epoch->get_buffers()[0]->get_record_count()); args->extension = this; args->compaction = false; - m_sched.schedule_job(merge, 0, args); + m_sched.schedule_job(reconstruction, 0, args); } std::future> schedule_query(void *query_parms) { @@ -592,27 +593,27 @@ private: assert(buffer); /* - * If the buffer is full and there is no current merge, - * schedule a merge and add a new empty buffer. If there - * is a current merge, then just add a new empty buffer + * If the buffer is full and there is no ongoing reconstruction, + * schedule a reconstruction and add a new empty buffer. If there + * is an ongoing reconstruction, then add a new empty buffer * to the current epoch. */ if (buffer->is_full()) { if constexpr (std::same_as) { - /* single threaded: run merge and then empty buffer */ + /* single threaded: run reconstruction and then empty buffer */ epoch->end_job(); - schedule_merge(); + schedule_reconstruction(); buffer->truncate(); continue; - } else if (epoch->prepare_merge()) { + } else if (epoch->prepare_reconstruction()) { /* * add an empty buffer to allow insert proceed and - * schedule a merge on a background thread + * schedule a reconstruction on a background thread */ buffer = add_empty_buffer(); - schedule_merge(); + schedule_reconstruction(); } else { - /* background merge is ongoing, so just add empty buffer */ + /* background reconstruction is ongoing, so just add empty buffer */ buffer = add_empty_buffer(); } } diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index fc08d57..4e1b8a2 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -170,7 +170,7 @@ public: * isn't, return true and set a flag indicating that * there is an active merge. */ - bool prepare_merge() { + bool prepare_reconstruction() { auto old = m_active_merge.load(); if (old) { return false; diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index c10ed8b..16f5e58 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -17,9 +17,9 @@ namespace de { template -struct MergeArgs { +struct ReconstructionArgs { Epoch *epoch; - std::vector merges; + std::vector merges; std::promise result; bool compaction; void *extension; diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index a174805..3cd55ac 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -45,8 +45,8 @@ public: /* * Create a shallow copy of this extension structure. The copy will share references to the * same levels/shards as the original, but will have its own lists. As all of the shards are - * immutable (with the exception of deletes), the copy can be restructured with merges, etc., - * without affecting the original. The copied structure will be returned with a reference + * immutable (with the exception of deletes), the copy can be restructured with reconstructions + * and flushes without affecting the original. The copied structure will be returned with a reference * count of 0; generally you will want to immediately call take_reference() on it. 
* * NOTE: When using tagged deletes, a delete of a record in the original structure will affect @@ -55,7 +55,7 @@ public: * need to be forwarded to the appropriate structures manually. */ ExtensionStructure *copy() { - auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, m_max_delete_prop); + auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, m_max_delete_prop); for (size_t i=0; im_levels.push_back(m_levels[i]->clone()); } @@ -90,17 +90,20 @@ public: } /* - * Merge the memory table down into the tree, completing any required other - * merges to make room for it. + * Flush a buffer into the extension structure, performing any necessary + * reconstructions to free up room in L0. + * + * FIXME: arguably, this should be a method attached to the buffer that + * takes a structure as input. */ - inline bool merge_buffer(Buffer *buffer) { - assert(can_merge_with(0, buffer->get_record_count())); + inline bool flush_buffer(Buffer *buffer) { + assert(can_reconstruct_with(0, buffer->get_record_count())); // FIXME: this step makes an extra copy of the buffer, // which could be avoided by adjusting the shard // reconstruction process a bit, possibly. - buffer->start_merge(); - merge_buffer_into_l0(buffer); + buffer->start_flush(); + flush_buffer_into_l0(buffer); return true; } @@ -123,7 +126,7 @@ public: * Return the total number of tombstones contained within all of the * levels of the structure. */ - size_t get_tombstone_cnt() { + size_t get_tombstone_count() { size_t cnt = 0; for (size_t i=0; i get_compaction_tasks() { - std::vector tasks; + std::vector get_compaction_tasks() { + std::vector tasks; /* if the tombstone/delete invariant is satisfied, no need for compactions */ if (validate_tombstone_proportion()) { @@ -220,16 +223,16 @@ public: assert(violation_idx != -1); - level_index merge_base_level = find_mergable_level(violation_idx); - if (merge_base_level == -1) { - merge_base_level = grow(); + level_index base_level = find_reconstruction_target(violation_idx); + if (base_level == -1) { + base_level = grow(); } - for (level_index i=merge_base_level; i>0; i--) { - MergeTask task = {i-1, i}; + for (level_index i=base_level; i>0; i--) { + ReconstructionTask task = {i-1, i}; /* - * The amount of storage required for the merge accounts + * The amount of storage required for the reconstruction accounts * for the cost of storing the new records, along with the * cost of retaining the old records during the process * (hence the 2x multiplier). @@ -240,7 +243,7 @@ public: */ size_t reccnt = m_levels[i-1]->get_record_count(); if constexpr (L == LayoutPolicy::LEVELING) { - if (can_merge_with(i, reccnt)) { + if (can_reconstruct_with(i, reccnt)) { reccnt += m_levels[i]->get_record_count(); } } @@ -255,28 +258,27 @@ public: /* * */ - std::vector get_merge_tasks(size_t buffer_reccnt) { - std::vector merges; + std::vector get_reconstruction_tasks(size_t buffer_reccnt) { + std::vector reconstructions; /* - * The buffer -> L0 merge task is not included so if that - * can be done without any other change, just return an - * empty list. + * The buffer flush is not included so if that can be done without any + * other change, just return an empty list. 
*/ - if (can_merge_with(0, buffer_reccnt)) { - return std::move(merges); + if (can_reconstruct_with(0, buffer_reccnt)) { + return std::move(reconstructions); } - level_index merge_base_level = find_mergable_level(0); - if (merge_base_level == -1) { - merge_base_level = grow(); + level_index base_level = find_reconstruction_target(0); + if (base_level == -1) { + base_level = grow(); } - for (level_index i=merge_base_level; i>0; i--) { - MergeTask task = {i-1, i}; + for (level_index i=base_level; i>0; i--) { + ReconstructionTask task = {i-1, i}; /* - * The amount of storage required for the merge accounts + * The amount of storage required for the reconstruction accounts * for the cost of storing the new records, along with the * cost of retaining the old records during the process * (hence the 2x multiplier). @@ -287,34 +289,34 @@ public: */ size_t reccnt = m_levels[i-1]->get_record_count(); if constexpr (L == LayoutPolicy::LEVELING) { - if (can_merge_with(i, reccnt)) { + if (can_reconstruct_with(i, reccnt)) { reccnt += m_levels[i]->get_record_count(); } } //task.m_size = 2* reccnt * sizeof(R); - merges.push_back(task); + reconstructions.push_back(task); } - return std::move(merges); + return std::move(reconstructions); } /* * */ - std::vector get_merge_tasks_from_level(level_index source_level) { - std::vector merges; + std::vector get_reconstruction_tasks_from_level(level_index source_level) { + std::vector reconstructions; - level_index merge_base_level = find_mergable_level(source_level); - if (merge_base_level == -1) { - merge_base_level = grow(); + level_index base_level = find_reconstruction_target(source_level); + if (base_level == -1) { + base_level = grow(); } - for (level_index i=merge_base_level; i>source_level; i--) { - MergeTask task = {i - 1, i}; + for (level_index i=base_level; i>source_level; i--) { + ReconstructionTask task = {i - 1, i}; /* - * The amount of storage required for the merge accounts + * The amount of storage required for the reconstruction accounts * for the cost of storing the new records, along with the * cost of retaining the old records during the process * (hence the 2x multiplier). @@ -325,31 +327,30 @@ public: */ size_t reccnt = m_levels[i-1]->get_record_count(); if constexpr (L == LayoutPolicy::LEVELING) { - if (can_merge_with(i, reccnt)) { + if (can_reconstruct_with(i, reccnt)) { reccnt += m_levels[i]->get_record_count(); } } // task.m_size = 2* reccnt * sizeof(R); - merges.push_back(task); + reconstructions.push_back(task); } - return merges; + return reconstructions; } /* - * Merge the level specified by incoming level into the level specified - * by base level. The two levels should be sequential--i.e. no levels - * are skipped in the merge process--otherwise the tombstone ordering - * invariant may be violated by the merge operation. + * Combine incoming_level with base_level and reconstruct the shard, + * placing it in base_level. The two levels should be sequential--i.e. no + * levels are skipped in the reconstruction process--otherwise the + * tombstone ordering invariant may be violated. 
*/ - inline void merge_levels(level_index base_level, level_index incoming_level) { - // merging two memory levels + inline void reconstruction(level_index base_level, level_index incoming_level) { if constexpr (L == LayoutPolicy::LEVELING) { auto tmp = m_levels[base_level]; - m_levels[base_level] = InternalLevel::merge_levels(m_levels[base_level].get(), m_levels[incoming_level].get()); + m_levels[base_level] = InternalLevel::reconstruction(m_levels[base_level].get(), m_levels[incoming_level].get()); } else { - m_levels[base_level]->append_merged_shards(m_levels[incoming_level].get()); + m_levels[base_level]->append_level(m_levels[incoming_level].get()); m_levels[base_level]->finalize(); } @@ -391,9 +392,7 @@ private: std::vector>> m_levels; /* - * Add a new level to the LSM Tree and return that level's index. Will - * automatically determine whether the level should be on memory or on disk, - * and act appropriately. + * Add a new level to the structure and return its index. */ inline level_index grow() { level_index new_idx = m_levels.size(); @@ -405,22 +404,18 @@ private: /* * Find the first level below the level indicated by idx that - * is capable of sustaining a merge operation and return its + * is capable of sustaining a reconstruction and return its * level index. If no such level exists, returns -1. Also - * returns -1 if idx==0, and no such level exists, to skimplify - * the logic of the first merge. + * returns -1 if idx==0, and no such level exists, to simplify + * the logic of the first buffer flush. */ - inline level_index find_mergable_level(level_index idx, Buffer *buffer=nullptr) { + inline level_index find_reconstruction_target(level_index idx, Buffer *buffer=nullptr) { if (idx == 0 && m_levels.size() == 0) return -1; - bool level_found = false; - bool disk_level; - level_index merge_level_idx; - size_t incoming_rec_cnt = get_level_record_count(idx, buffer); for (level_index i=idx+1; i(0, 1); temp_level->append_buffer(buffer); - auto new_level = InternalLevel::merge_levels(old_level, temp_level); + auto new_level = InternalLevel::reconstruction(old_level, temp_level); m_levels[0] = new_level; delete temp_level; @@ -479,13 +474,10 @@ private: } /* - * Determines if the specific level can merge with another record containing - * incoming_rec_cnt number of records. The provided level index should be - * non-negative (i.e., not refer to the buffer) and will be automatically - * translated into the appropriate index into either the disk or memory level - * vector. + * Determines if a level can sustain a reconstruction with incoming_rec_cnt + * additional records without exceeding its capacity. */ - inline bool can_merge_with(level_index idx, size_t incoming_rec_cnt) { + inline bool can_reconstruct_with(level_index idx, size_t incoming_rec_cnt) { if (idx >= m_levels.size() || !m_levels[idx]) { return false; } diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index d146b73..e70ed76 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -40,9 +40,14 @@ public: delete m_pending_shard; } - // WARNING: for leveling only. - // assuming the base level is the level new level is merging into. (base_level is larger.) - static std::shared_ptr merge_levels(InternalLevel* base_level, InternalLevel* new_level) { + /* + * Create a new shard combining the records from base_level and new_level, + * and return a shared_ptr to a new level containing this shard. 
This is used + * for reconstructions under the leveling layout policy. + * + * No changes are made to the levels provided as arguments. + */ + static std::shared_ptr reconstruction(InternalLevel* base_level, InternalLevel* new_level) { assert(base_level->m_level_no > new_level->m_level_no || (base_level->m_level_no == 0 && new_level->m_level_no == 0)); auto res = new InternalLevel(base_level->m_level_no, 1); res->m_shard_cnt = 1; @@ -54,18 +59,15 @@ public: return std::shared_ptr(res); } - void append_buffer(Buffer* buffer) { - if (m_shard_cnt == m_shards.size()) { - assert(m_pending_shard == nullptr); - m_pending_shard = new S(buffer); - return; - } - - m_shards[m_shard_cnt] = std::make_shared(buffer); - ++m_shard_cnt; - } - - void append_merged_shards(InternalLevel* level) { + /* + * Create a new shard combining the records from all of + * the shards in level, and append this new shard into + * this level. This is used for reconstructions under + * the tiering layout policy. + * + * No changes are made to the level provided as an argument. + */ + void append_level(InternalLevel* level) { Shard *shards[level->m_shard_cnt]; for (size_t i=0; im_shard_cnt; i++) { shards[i] = level->m_shards[i].get(); @@ -82,6 +84,22 @@ public: ++m_shard_cnt; } + /* + * Create a new shard using the records in the + * provided buffer, and append this new shard + * into this level. This is used for buffer + * flushes under the tiering layout policy. + */ + void append_buffer(Buffer* buffer) { + if (m_shard_cnt == m_shards.size()) { + assert(m_pending_shard == nullptr); + m_pending_shard = new S(buffer); + return; + } + + m_shards[m_shard_cnt] = std::make_shared(buffer); + ++m_shard_cnt; + } void finalize() { if (m_pending_shard) { @@ -95,7 +113,13 @@ public: } } - Shard *get_merged_shard() { + /* + * Create a new shard containing the combined records + * from all shards on this level and return it. + * + * No changes are made to this level. + */ + Shard *get_combined_shard() { if (m_shard_cnt == 0) { return nullptr; } @@ -109,7 +133,7 @@ public: return new S(shards, m_shard_cnt); } - // Append the sample range in-order..... + /* Append the sample range in-order */ void get_query_states(std::vector> &shards, std::vector& shard_states, void *query_parms) { for (size_t i=0; i + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * Distributed under the Modified BSD License. * + * FIXME: currently, the buffer itself is responsible for managing a + * secondary buffer for storing sorted records used during buffer flushes. It + * probably makes more sense to make the shard being flushed into responsible + * for this instead. This would also facilitate simultaneous flushes of multiple + * buffers more easily. 
+ * */ #pragma once @@ -35,7 +41,7 @@ public: : m_cap(capacity), m_tombstone_cap(capacity), m_reccnt(0) , m_tombstonecnt(0), m_weight(0), m_max_weight(0), m_tail(0) { m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); - m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); + m_sorted_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); m_tombstone_filter = nullptr; if (max_tombstone_cap > 0) { m_tombstone_filter = new psudb::BloomFilter(BF_FPR, max_tombstone_cap, BF_HASH_FUNCS); @@ -49,7 +55,7 @@ public: if (m_data) free(m_data); if (m_tombstone_filter) delete m_tombstone_filter; - if (m_merge_data) free(m_merge_data); + if (m_sorted_data) free(m_sorted_data); } template @@ -171,8 +177,8 @@ public: * to be adjusted). Other threads having read access is perfectly * acceptable, however. */ - bool start_merge() { - memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load()); + bool start_flush() { + memcpy(m_sorted_data, m_data, sizeof(Wrapped) * m_reccnt.load()); return true; } @@ -210,7 +216,7 @@ private: size_t m_tombstone_cap; Wrapped* m_data; - Wrapped* m_merge_data; + Wrapped* m_sorted_data; psudb::BloomFilter* m_tombstone_filter; diff --git a/include/framework/util/Configuration.h b/include/framework/util/Configuration.h index 866128a..8e3d20f 100644 --- a/include/framework/util/Configuration.h +++ b/include/framework/util/Configuration.h @@ -49,6 +49,6 @@ enum class DeletePolicy { }; typedef ssize_t level_index; -typedef std::pair MergeTask; +typedef std::pair ReconstructionTask; } -- cgit v1.2.3 From 24a42e300c96e2815bf20be3f6cce3efee1c4303 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 21 Dec 2023 17:03:39 -0500 Subject: ExtensionStructure: adjusted leveling logic to avoid unneeded copies This also reduces the special-case overhead on shards. As it was, shards would need to handle a special case when constructing from other shards where the first of the two provided shards was a nullptr, which caused a number of subtle issues (or outright crashes in some cases) with existing shard implementations. 
--- include/framework/structure/ExtensionStructure.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 3cd55ac..60016a0 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -347,13 +347,19 @@ public: */ inline void reconstruction(level_index base_level, level_index incoming_level) { if constexpr (L == LayoutPolicy::LEVELING) { - auto tmp = m_levels[base_level]; - m_levels[base_level] = InternalLevel::reconstruction(m_levels[base_level].get(), m_levels[incoming_level].get()); + /* if the base level has a shard, merge the base and incoming together to make a new one */ + if (m_levels[base_level]->get_shard_count() > 0) { + m_levels[base_level] = InternalLevel::reconstruction(m_levels[base_level].get(), m_levels[incoming_level].get()); + /* otherwise, we can just move the incoming to the base */ + } else { + m_levels[base_level] = m_levels[incoming_level]; + } } else { m_levels[base_level]->append_level(m_levels[incoming_level].get()); m_levels[base_level]->finalize(); } + /* place a new, empty level where the incoming level used to be */ m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor)); } @@ -432,10 +438,13 @@ private: auto old_level = m_levels[0].get(); auto temp_level = new InternalLevel(0, 1); temp_level->append_buffer(buffer); - auto new_level = InternalLevel::reconstruction(old_level, temp_level); - m_levels[0] = new_level; - delete temp_level; + if (old_level->get_shard_count() > 0) { + m_levels[0] = InternalLevel::reconstruction(old_level, temp_level); + delete temp_level; + } else { + m_levels[0] = std::shared_ptr>(temp_level); + } } else { m_levels[0]->append_buffer(buffer); } -- cgit v1.2.3 From 46dfe1e0f3bb05016da14b39b2e35babbba4027a Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 21 Dec 2023 17:05:30 -0500 Subject: InternalLevel: appending an empty level is a no-op The existing reconstruction logic will occasionally attempt to append an empty level to another empty level, for some reason. While the underlying cause of this needs to be looked into, this special case should prevent shard constructors being called with a shard count of 0 under tiering, reducing the error handling overhead of shard code. --- include/framework/structure/InternalLevel.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index e70ed76..ee85cb3 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -68,6 +68,13 @@ public: * No changes are made to the level provided as an argument. */ void append_level(InternalLevel* level) { + // FIXME: that this is happening probably means that + // something is going terribly wrong earlier in the + // reconstruction logic. + if (level->get_shard_count() == 0) { + return; + } + Shard *shards[level->m_shard_cnt]; for (size_t i=0; im_shard_cnt; i++) { shards[i] = level->m_shards[i].get(); -- cgit v1.2.3 From 8113b32d124f487856a858c7f68a4e531399f66d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 21 Dec 2023 17:07:29 -0500 Subject: DynamicExtension: comments/reorganization Clarified the reasoning for a few things in comments that just tripped me up during debugging. 
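One of the invariants the clarified comments pin down is the pairing of epoch job accounting around scheduled work: the reconstruction job itself calls end_job() when it completes, so the scheduling side must call start_job() first to keep the epoch pinned in the interim. Schematically (a sketch of the protocol, not a new interface):

    epoch->start_job();                            /* pin the epoch */
    m_sched.schedule_job(reconstruction, 0, args); /* runs asynchronously */
    /* ... on the worker thread, reconstruction() eventually calls
     * args->epoch->end_job(), unpinning the epoch so that it can
     * be retired once no jobs remain active ... */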
--- include/framework/DynamicExtension.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 8edcc5f..c5c4a1a 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -261,23 +261,29 @@ private: auto compactions = structure->get_compaction_tasks(); while (compactions.size() > 0) { - /* otherwise, we need to schedule a compaction */ + + /* schedule a compaction */ ReconstructionArgs *args = new ReconstructionArgs(); args->epoch = epoch; - // FIXME: all full buffers can be flushed at this point--but that requires - // retooling the shard interface a bit to do efficiently. args->merges = compactions; args->extension = this; args->compaction = true; + /* NOTE: args is deleted by the reconstruction job, so shouldn't be freed here */ auto wait = args->result.get_future(); + /* + * the reconstruction process calls end_job(), + * so we must start one before calling it + */ epoch->start_job(); + m_sched.schedule_job(reconstruction, 0, args); - /* wait for reconstruction completion */ + /* wait for compaction completion */ wait.get(); + /* get a new batch of compactions to perform, if needed */ compactions = structure->get_compaction_tasks(); } } @@ -557,17 +563,23 @@ private: } void schedule_reconstruction() { - //fprintf(stderr, "%ld\t Reconstruction Scheduling", m_current_epoch); auto epoch = create_new_epoch(); + /* + * the reconstruction process calls end_job(), + * so we must start one before calling it + */ epoch->start_job(); - ReconstructionArgs *args = new ReconstructionArgs(); - args->epoch = epoch; // FIXME: all full buffers can be flushed at this point--but that requires // retooling the shard interface a bit to do efficiently. + // + ReconstructionArgs *args = new ReconstructionArgs(); + args->epoch = epoch; args->merges = epoch->get_structure()->get_reconstruction_tasks(epoch->get_buffers()[0]->get_record_count()); args->extension = this; args->compaction = false; + /* NOTE: args is deleted by the reconstruction job, so shouldn't be freed here */ m_sched.schedule_job(reconstruction, 0, args); } -- cgit v1.2.3 From 3c2e6b3b456867d7155b158432b891b84e4e1dd6 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 9 Jan 2024 14:47:48 -0500 Subject: Initial update of buffer to new specifications There are a few minor issues that this introduces, however. Global tracking of secondary information, such as weights for WIRS/WSS or the exact number of tombstones, will need to be approached differently under this new design than it has been historically. I've also removed most of the tombstone-capacity-related code. We had decided not to bother enforcing this at the buffer level anyway, and it would greatly increase the complexity of the problem of predicting when the next compaction will be. On the whole this new approach seems like it'll simplify a lot. This commit actually removes significantly more code than it adds. One minor issue: the current implementation will have problems in the circular array indexes once more than 2^64 records have been inserted. This doesn't seem like a realistic problem at the moment.
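The circular-array scheme the message refers to keeps m_head and m_tail as monotonically increasing counters and only reduces them to physical slots on access, which is where the (theoretical) wrap-around after 2^64 insertions comes from. The arithmetic, in sketch form (names follow the new buffer code below):

    /* map a logical offset to a physical slot; the counters themselves
     * are never reset, so they eventually wrap at the 64-bit boundary */
    size_t to_idx(size_t i) { return (m_head + i) % m_cap; }

    /* records currently visible (as in BufferView); correct so long as
     * the counters have not wrapped */
    size_t get_record_count() { return m_tail - m_head; }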
--- include/framework/structure/BufferView.h | 98 ++++++----------------- include/framework/structure/MutableBuffer.h | 116 +++++++++------------------- 2 files changed, 62 insertions(+), 152 deletions(-) (limited to 'include') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 651e430..8a5f50f 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -22,107 +22,59 @@ #include "psu-ds/Alias.h" #include "psu-util/timer.h" #include "framework/interface/Record.h" -#include "framework/structure/MutableBuffer.h" #include "framework/interface/Query.h" namespace de { -template +template class BufferView { - typedef MutableBuffer Buffer; public: BufferView() = default; - BufferView(std::vector buffers) - : m_buffers(buffers) - , m_cutoff(buffers[buffers.size()-1]->get_record_count()) - {} + BufferView(const Wrapped *buffer, size_t head, size_t tail, psudb::BloomFilter *filter) + : m_buffer(buffer), m_head(head), m_tail(tail), m_tombstone_filter(filter) {} ~BufferView() = default; - bool delete_record(const R& rec) { - auto res = false; - for (auto buf : m_buffers) { - res = buf->delete_record(rec); - if (res) return true; - } - return false; - } - bool check_tombstone(const R& rec) { - auto res = false; - for (auto buf : m_buffers) { - res = buf->check_tombstone(rec); - if (res) return true; + if (m_tombstone_filter && !m_tombstone_filter->lookup(rec)) return false; + + for (size_t i=0; iget_record_count(); - } - return reccnt; + return m_tail - m_head; } - size_t get_capacity() { - return m_buffers[0]->get_capacity(); - } - - bool is_full() { - return m_buffers[m_buffers.size() - 1]->is_full(); - } - size_t get_tombstone_count() { - size_t tscnt = 0; - for (auto buf : m_buffers) { - tscnt += buf->get_tombstone_count(); - } - return tscnt; + // FIXME: tombstone count + return 0; } - size_t get_memory_usage() { - size_t mem = 0; - for (auto buf : m_buffers) { - mem += buf->get_memory_usage(); - } - return mem; + Wrapped *get(size_t i) { + assert(i < get_record_count()); + return m_buffer + to_idx(i); } - size_t get_aux_memory_usage() { - size_t mem = 0; - for (auto buf : m_buffers) { - mem += buf->get_aux_memory_usage(); - } - return mem; - } - - size_t get_tombstone_capacity() { - return m_buffers[0]->get_tombstone_capacity(); - } - - std::vector get_query_states(void *parms) { - std::vector states; - - for (auto buf : m_buffers) { - states.push_back(Q::get_buffer_query_state(buf, parms)); - } - - return states; + void copy_to_buffer(byte *buffer) { + memcpy(buffer, m_buffer, get_record_count() * sizeof(Wrapped)); } - std::vector &get_buffers() { - return m_buffers; - } +private: + const Wrapped* m_buffer; + size_t m_head; + size_t m_tail; + psudb::BloomFilter *m_tombstone_filter; - size_t size() { - return m_buffers.size(); + size_t to_idx(size_t i) { + return (m_head + i) % m_buffer->get_capacity(); } - -private: - std::vector m_buffers; - size_t m_cutoff; }; } diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 58b5fb4..7bec219 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -6,12 +6,6 @@ * * Distributed under the Modified BSD License. * - * FIXME: currently, the buffer itself is responsible for managing a - * secondary buffer for storing sorted records used during buffer flushes. 
It - * probably makes more sense to make the shard being flushed into responsible - * for this instead. This would also facilitate simultaneous flushes of multiple - * buffers more easily. - * */ #pragma once @@ -29,6 +23,7 @@ #include "psu-ds/Alias.h" #include "psu-util/timer.h" #include "framework/interface/Record.h" +#include "framework/structure/BufferView.h" using psudb::CACHELINE_SIZE; @@ -36,18 +31,22 @@ namespace de { template class MutableBuffer { + friend class BufferView; public: - MutableBuffer(size_t capacity, size_t max_tombstone_cap) - : m_cap(capacity), m_tombstone_cap(capacity), m_reccnt(0) - , m_tombstonecnt(0), m_weight(0), m_max_weight(0), m_tail(0) { - m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); - m_sorted_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); - m_tombstone_filter = nullptr; - if (max_tombstone_cap > 0) { - m_tombstone_filter = new psudb::BloomFilter(BF_FPR, max_tombstone_cap, BF_HASH_FUNCS); + MutableBuffer(size_t low_watermark, size_t high_watermark, size_t capacity=0) + : m_lwm(low_watermark), m_hwm(high_watermark), m_cap(capacity), m_head(0), m_tail(0) { + /* + * default capacity is twice the high water mark, to account for the worst-case + * memory requirements. + */ + if (m_cap == 0) { + m_cap = m_hwm * 2; } - m_refcnt.store(0); + m_data = (Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped)); + + // FIXME: need to figure out how to detail with tombstones at some point... + m_tombstone_filter = new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS); } ~MutableBuffer() { @@ -55,13 +54,10 @@ public: if (m_data) free(m_data); if (m_tombstone_filter) delete m_tombstone_filter; - if (m_sorted_data) free(m_sorted_data); } template int append(const R &rec, bool tombstone=false) { - if (tombstone && m_tombstonecnt + 1 > m_tombstone_cap) return 0; - int32_t pos = 0; if ((pos = try_advance_tail()) == -1) return 0; @@ -78,26 +74,11 @@ public: if (m_tombstone_filter) m_tombstone_filter->insert(rec); } - if constexpr (WeightedRecordInterface) { - m_weight.fetch_add(rec.weight); - double old = m_max_weight.load(); - while (old < rec.weight) { - m_max_weight.compare_exchange_strong(old, rec.weight); - old = m_max_weight.load(); - } - } else { - m_weight.fetch_add(1); - } - - m_reccnt.fetch_add(1); return 1; } bool truncate() { m_tombstonecnt.store(0); - m_reccnt.store(0); - m_weight.store(0); - m_max_weight.store(0); m_tail.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); @@ -105,7 +86,7 @@ public: } size_t get_record_count() { - return m_reccnt; + return (m_tail - m_head) % m_cap; } size_t get_capacity() { @@ -113,7 +94,7 @@ public: } bool is_full() { - return m_reccnt == m_cap; + return (m_tail % m_cap) >= m_hwm; } size_t get_tombstone_count() { @@ -121,13 +102,11 @@ public: } bool delete_record(const R& rec) { - auto offset = 0; - while (offset < m_reccnt.load()) { - if (m_data[offset].rec == rec) { - m_data[offset].set_delete(); + for (size_t i=0; ilookup(rec)) return false; - auto offset = 0; - while (offset < m_reccnt.load()) { - if (m_data[offset].rec == rec && m_data[offset].is_tombstone()) { + for (size_t i=0; i *get_data() { - return m_data; - } - - double get_max_weight() { - return m_max_weight; - } - - /* - * This operation assumes that no other threads have write access - * to the buffer. This will be the case in normal operation, at - * present, but may change (in which case this approach will need - * to be adjusted). 
Other threads having read access is perfectly - * acceptable, however. - */ - bool start_flush() { - memcpy(m_sorted_data, m_data, sizeof(Wrapped) * m_reccnt.load()); - return true; + // FIXME: tombstone capacity needs figured out again + return m_cap; } /* @@ -202,30 +157,33 @@ public: private: int64_t try_advance_tail() { - int64_t new_tail = m_tail.fetch_add(1); + int64_t new_tail = m_tail.fetch_add(1) % m_cap; - if (new_tail < m_cap) { + if (new_tail < m_hwm) { return new_tail; - } + } m_tail.fetch_add(-1); return -1; } + size_t to_idx(size_t i) { + return (m_head + i) % m_cap; + } + size_t m_cap; - size_t m_tombstone_cap; + + size_t m_lwm; + size_t m_hwm; + + alignas(64) std::atomic m_tail; + alignas(64) std::atomic m_head; Wrapped* m_data; - Wrapped* m_sorted_data; psudb::BloomFilter* m_tombstone_filter; alignas(64) std::atomic m_tombstonecnt; - alignas(64) std::atomic m_reccnt; - alignas(64) std::atomic m_tail; - alignas(64) std::atomic m_weight; - alignas(64) std::atomic m_max_weight; - alignas(64) std::atomic m_refcnt; }; -- cgit v1.2.3 From 53879a0d69f5e578710b7125e9b41e516c2371d4 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 10 Jan 2024 17:39:28 -0500 Subject: MutableBuffer+View: Implementation with unit tests --- include/framework/structure/BufferView.h | 32 ++++--- include/framework/structure/MutableBuffer.h | 133 ++++++++++++++++++++-------- 2 files changed, 116 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 8a5f50f..7e8af45 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -9,32 +9,34 @@ #pragma once #include -#include -#include #include -#include -#include -#include +#include #include "psu-util/alignment.h" -#include "util/bf_config.h" #include "psu-ds/BloomFilter.h" -#include "psu-ds/Alias.h" -#include "psu-util/timer.h" #include "framework/interface/Record.h" -#include "framework/interface/Query.h" namespace de { +typedef std::function ReleaseFunction; + template class BufferView { public: BufferView() = default; - BufferView(const Wrapped *buffer, size_t head, size_t tail, psudb::BloomFilter *filter) - : m_buffer(buffer), m_head(head), m_tail(tail), m_tombstone_filter(filter) {} + BufferView(const Wrapped *buffer, size_t head, size_t tail, psudb::BloomFilter *filter, + void *parent_buffer, ReleaseFunction release) + : m_buffer(buffer) + , m_release(release) + , m_parent_buffer(parent_buffer) + , m_head(head) + , m_tail(tail) + , m_tombstone_filter(filter) {} - ~BufferView() = default; + ~BufferView() { + m_release(m_parent_buffer, m_head); + } bool check_tombstone(const R& rec) { if (m_tombstone_filter && !m_tombstone_filter->lookup(rec)) return false; @@ -62,12 +64,14 @@ public: return m_buffer + to_idx(i); } - void copy_to_buffer(byte *buffer) { - memcpy(buffer, m_buffer, get_record_count() * sizeof(Wrapped)); + void copy_to_buffer(psudb::byte *buffer) { + memcpy(buffer, (std::byte*) (m_buffer + m_head), get_record_count() * sizeof(Wrapped)); } private: const Wrapped* m_buffer; + void *m_parent_buffer; + ReleaseFunction m_release; size_t m_head; size_t m_tail; psudb::BloomFilter *m_tombstone_filter; diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 7bec219..d57ad6e 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -11,17 +11,11 @@ #include #include -#include #include -#include 
-#include -#include #include "psu-util/alignment.h" #include "util/bf_config.h" #include "psu-ds/BloomFilter.h" -#include "psu-ds/Alias.h" -#include "psu-util/timer.h" #include "framework/interface/Record.h" #include "framework/structure/BufferView.h" @@ -34,7 +28,8 @@ class MutableBuffer { friend class BufferView; public: MutableBuffer(size_t low_watermark, size_t high_watermark, size_t capacity=0) - : m_lwm(low_watermark), m_hwm(high_watermark), m_cap(capacity), m_head(0), m_tail(0) { + : m_lwm(low_watermark), m_hwm(high_watermark), m_cap(capacity), m_head(0), m_tail(0), + m_old_head(0), m_head_refcnt(0), m_old_head_refcnt(0) { /* * default capacity is twice the high water mark, to account for the worst-case * memory requirements. @@ -50,10 +45,8 @@ public: } ~MutableBuffer() { - assert(m_refcnt.load() == 0); - - if (m_data) free(m_data); - if (m_tombstone_filter) delete m_tombstone_filter; + free(m_data); + delete m_tombstone_filter; } template @@ -94,7 +87,11 @@ public: } bool is_full() { - return (m_tail % m_cap) >= m_hwm; + return get_record_count() >= m_hwm; + } + + bool is_at_low_watermark() { + return (m_tail % m_cap) > m_lwm; } size_t get_tombstone_count() { @@ -125,66 +122,132 @@ public: } size_t get_memory_usage() { - return m_cap * sizeof(R); + return m_cap * sizeof(Wrapped); } size_t get_aux_memory_usage() { return m_tombstone_filter->get_memory_usage(); } - size_t get_tombstone_capacity() { - // FIXME: tombstone capacity needs figured out again - return m_cap; + BufferView get_buffer_view() { + m_head_refcnt.fetch_add(1); + return BufferView(m_data, m_head, m_tail.load(), m_tombstone_filter, (void*) this, release_head_reference); } /* - * Concurrency-related operations + * Advance the buffer following a reconstruction. Move current + * head and head_refcnt into old_head and old_head_refcnt, then + * assign new_head to old_head. */ - bool take_reference() { - m_refcnt.fetch_add(1); - return true; + void advance_head(size_t new_head) { + assert(new_head > m_head.load()); + assert(new_head <= m_tail.load()); + assert(m_old_head_refcnt == 0); + + /* + * the order here is very important. We first store zero to the + * old_refcnt (should be zero anyway). Then we move the current + * head to old head. At this point, any new buffer views should + * increment the old head refcnt, so no new references to the + * current head will be taken. Then we add the current head + * refcnt to this. This is to ensure that no references get + * dropped. Only after this do we change to the new head + */ + m_old_head_refcnt.store(0); + m_old_head.store(m_head.load()); + m_old_head_refcnt.fetch_add(m_head_refcnt); + + m_head_refcnt.store(0); + m_head.store(new_head); } - bool release_reference() { - assert(m_refcnt > 0); - m_refcnt.fetch_add(-1); - return true; + void set_low_watermark(size_t lwm) { + assert(lwm < m_hwm); + m_lwm = lwm; } - size_t get_reference_count() { - return m_refcnt.load(); + size_t get_low_watermark() { + return m_lwm; + } + + void set_high_watermark(size_t hwm) { + assert(hwm > m_lwm); + assert(hwm < m_cap); + m_hwm = hwm; + } + + size_t get_high_watermark() { + return m_hwm; + } + + size_t get_tail() { + return m_tail.load(); + } + + /* + * Note: this returns the available physical storage capacity, + * *not* now many more records can be inserted before the + * HWM is reached. 
+ */ + size_t get_available_capacity() { + return m_cap - (m_tail.load() - m_old_head.load()); } private: int64_t try_advance_tail() { - int64_t new_tail = m_tail.fetch_add(1) % m_cap; + size_t old_value = m_tail.load(); + + /* if full, fail to advance the tail */ + if (old_value >= m_hwm) { + return -1; + } - if (new_tail < m_hwm) { - return new_tail; + while (!m_tail.compare_exchange_strong(old_value, old_value+1)) { + /* if full, stop trying and fail to advance the tail */ + if (m_tail.load() >= m_hwm) { + return -1; + } } - m_tail.fetch_add(-1); - return -1; + return old_value; } size_t to_idx(size_t i) { return (m_head + i) % m_cap; } - size_t m_cap; + static void release_head_reference(void *buff, size_t head) { + MutableBuffer *buffer = (MutableBuffer *) buff; + + if (head == buffer->m_head.load()) { + buffer->m_head_refcnt.fetch_sub(1); + } else if (head == buffer->m_old_head.load()) { + buffer->m_old_head_refcnt.fetch_sub(1); + /* + * if the old head refcnt drops to 0, free + * the records by setting old_head = head + */ + if (buffer->m_old_head_refcnt.load() == 0) { + buffer->m_old_head.store(buffer->m_head); + } + } + } size_t m_lwm; size_t m_hwm; + size_t m_cap; alignas(64) std::atomic m_tail; + alignas(64) std::atomic m_head; + alignas(64) std::atomic m_head_refcnt; + + alignas(64) std::atomic m_old_head; + alignas(64) std::atomic m_old_head_refcnt; Wrapped* m_data; - psudb::BloomFilter* m_tombstone_filter; - alignas(64) std::atomic m_tombstonecnt; - alignas(64) std::atomic m_refcnt; }; } -- cgit v1.2.3 From eb19677340be6f0befe9da2199e5832af51eea0d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 10 Jan 2024 18:01:30 -0500 Subject: MutableBuffer: multithreaded insert test + bugfixes --- include/framework/structure/MutableBuffer.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index d57ad6e..a065154 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -49,7 +49,6 @@ public: delete m_tombstone_filter; } - template int append(const R &rec, bool tombstone=false) { int32_t pos = 0; if ((pos = try_advance_tail()) == -1) return 0; @@ -91,7 +90,7 @@ public: } bool is_at_low_watermark() { - return (m_tail % m_cap) > m_lwm; + return get_record_count() >= m_lwm; } size_t get_tombstone_count() { @@ -139,10 +138,14 @@ public: * head and head_refcnt into old_head and old_head_refcnt, then * assign new_head to old_head. */ - void advance_head(size_t new_head) { + bool advance_head(size_t new_head) { assert(new_head > m_head.load()); assert(new_head <= m_tail.load()); - assert(m_old_head_refcnt == 0); + + /* refuse to advance head while there is an old with one references */ + if (m_old_head_refcnt > 0) { + return false; + } /* * the order here is very important. 
We first store zero to the @@ -159,6 +162,8 @@ public: m_head_refcnt.store(0); m_head.store(new_head); + + return true; } void set_low_watermark(size_t lwm) { -- cgit v1.2.3 From 5db0f96e9f3d2505b5f751abc133cbf7e13b5129 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 11 Jan 2024 11:31:33 -0500 Subject: Fixed some potential buffer-related concurrency bugs --- include/framework/structure/BufferView.h | 45 ++++++++---- include/framework/structure/MutableBuffer.h | 106 +++++++++++++++++----------- 2 files changed, 94 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 7e8af45..00b6101 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -18,31 +18,43 @@ namespace de { -typedef std::function ReleaseFunction; +typedef std::_Bind ReleaseFunction; template class BufferView { public: BufferView() = default; - BufferView(const Wrapped *buffer, size_t head, size_t tail, psudb::BloomFilter *filter, - void *parent_buffer, ReleaseFunction release) - : m_buffer(buffer) + BufferView(const Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, + ReleaseFunction release) + : m_data(buffer) , m_release(release) - , m_parent_buffer(parent_buffer) , m_head(head) , m_tail(tail) + , m_cap(cap) + , m_approx_ts_cnt(tombstone_cnt) , m_tombstone_filter(filter) {} ~BufferView() { - m_release(m_parent_buffer, m_head); + m_release(); } bool check_tombstone(const R& rec) { if (m_tombstone_filter && !m_tombstone_filter->lookup(rec)) return false; for (size_t i=0; i *get(size_t i) { assert(i < get_record_count()); - return m_buffer + to_idx(i); + return m_data + to_idx(i); } void copy_to_buffer(psudb::byte *buffer) { - memcpy(buffer, (std::byte*) (m_buffer + m_head), get_record_count() * sizeof(Wrapped)); + memcpy(buffer, (std::byte*) (m_data + m_head), get_record_count() * sizeof(Wrapped)); } private: - const Wrapped* m_buffer; - void *m_parent_buffer; + const Wrapped* m_data; ReleaseFunction m_release; size_t m_head; size_t m_tail; + size_t m_cap; + size_t m_approx_ts_cnt; psudb::BloomFilter *m_tombstone_filter; size_t to_idx(size_t i) { - return (m_head + i) % m_buffer->get_capacity(); + return (m_head + i) % m_cap; } }; diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index a065154..3a06f0d 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -6,12 +6,22 @@ * * Distributed under the Modified BSD License. * + * NOTE: Concerning the tombstone count. One possible approach + * would be to track the number of tombstones below and above the + * low water mark--this would be straightforward to do. 
Then, if we + * *require* that the head only advance up to the LWM, we can get a + * correct view on the number of tombstones in the active buffer at + * any point in time, and the BufferView will have a pretty good + * approximation as well (potentially with a few extra if new inserts + * happen between when the tail pointer and tombstone count are fetched) + * */ #pragma once #include #include #include +#include #include "psu-util/alignment.h" #include "util/bf_config.h" @@ -28,20 +38,22 @@ class MutableBuffer { friend class BufferView; public: MutableBuffer(size_t low_watermark, size_t high_watermark, size_t capacity=0) - : m_lwm(low_watermark), m_hwm(high_watermark), m_cap(capacity), m_head(0), m_tail(0), - m_old_head(0), m_head_refcnt(0), m_old_head_refcnt(0) { - /* - * default capacity is twice the high water mark, to account for the worst-case - * memory requirements. - */ - if (m_cap == 0) { - m_cap = m_hwm * 2; - } - - m_data = (Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped)); - - // FIXME: need to figure out how to detail with tombstones at some point... - m_tombstone_filter = new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS); + : m_lwm(low_watermark) + , m_hwm(high_watermark) + , m_cap((capacity == 0) ? 2 * high_watermark : capacity) + , m_tail(0) + , m_head(0) + , m_head_refcnt(0) + , m_old_head(0) + , m_old_head_refcnt(0) + , m_data((Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped))) + , m_tombstone_filter(new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS)) + , m_tscnt(0) + , m_old_tscnt(0) + , m_active_head_advance(false) + { + assert(m_cap > m_hwm); + assert(m_hwm > m_lwm); } ~MutableBuffer() { @@ -62,7 +74,7 @@ public: m_data[pos].header |= (pos << 2); if (tombstone) { - m_tombstonecnt.fetch_add(1); + m_tscnt.fetch_add(1); if (m_tombstone_filter) m_tombstone_filter->insert(rec); } @@ -70,7 +82,7 @@ public: } bool truncate() { - m_tombstonecnt.store(0); + m_tscnt.store(0); m_tail.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); @@ -78,7 +90,7 @@ public: } size_t get_record_count() { - return (m_tail - m_head) % m_cap; + return m_tail - m_head; } size_t get_capacity() { @@ -94,30 +106,15 @@ public: } size_t get_tombstone_count() { - return m_tombstonecnt.load(); + return m_tscnt.load(); } bool delete_record(const R& rec) { - for (size_t i=0; ilookup(rec)) return false; - - for (size_t i=0; i get_buffer_view() { m_head_refcnt.fetch_add(1); - return BufferView(m_data, m_head, m_tail.load(), m_tombstone_filter, (void*) this, release_head_reference); + auto f = std::bind(release_head_reference, (void *) this, m_head.load()); + return BufferView(m_data, m_cap, m_head.load(), m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); } /* @@ -147,6 +145,8 @@ public: return false; } + m_active_head_advance.store(true); + /* * the order here is very important. We first store zero to the * old_refcnt (should be zero anyway). Then we move the current @@ -157,12 +157,14 @@ public: * dropped. 
Only after this do we change to the new head */ m_old_head_refcnt.store(0); + m_old_head.store(m_head.load()); m_old_head_refcnt.fetch_add(m_head_refcnt); m_head_refcnt.store(0); m_head.store(new_head); + m_active_head_advance.store(false); return true; } @@ -212,29 +214,44 @@ private: if (m_tail.load() >= m_hwm) { return -1; } + + _mm_pause(); } return old_value; } - size_t to_idx(size_t i) { - return (m_head + i) % m_cap; + size_t to_idx(size_t i, size_t head) { + return (head + i) % m_cap; } static void release_head_reference(void *buff, size_t head) { MutableBuffer *buffer = (MutableBuffer *) buff; - if (head == buffer->m_head.load()) { - buffer->m_head_refcnt.fetch_sub(1); - } else if (head == buffer->m_old_head.load()) { + /* + * check old head first. During a head transition, the head being + * retired will first be assigned to *both* head and old_head. As + * a result, any refcnt updates during this time should be applied + * to old_head, even if the current head and the head being released + * also match. + */ + if (head == buffer->m_old_head.load()) { buffer->m_old_head_refcnt.fetch_sub(1); /* * if the old head refcnt drops to 0, free * the records by setting old_head = head + * before this, spin while the two heads are equal to + * avoid */ + while (buffer->m_active_head_advance.load()) { + _mm_pause(); + } + if (buffer->m_old_head_refcnt.load() == 0) { buffer->m_old_head.store(buffer->m_head); } + } else if (head == buffer->m_head.load()) { + buffer->m_head_refcnt.fetch_sub(1); } } @@ -252,7 +269,10 @@ private: Wrapped* m_data; psudb::BloomFilter* m_tombstone_filter; - alignas(64) std::atomic m_tombstonecnt; + alignas(64) std::atomic m_tscnt; + size_t m_old_tscnt; + + alignas(64) std::atomic m_active_head_advance; }; } -- cgit v1.2.3 From c596ed468c2279f959b04d83d7f2e9692db84bae Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 11 Jan 2024 15:28:00 -0500 Subject: BufferView: enforce move semantics Because a BufferView's lifetime is so tightly linked to the lifetime of regions of the buffer, it can't be copied without potentially breaking things. --- include/framework/structure/BufferView.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 00b6101..98e41dd 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -25,7 +25,24 @@ class BufferView { public: BufferView() = default; - BufferView(const Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, + /* + * the BufferView's lifetime is tightly linked to buffer versioning, and so + * copying and assignment are disabled. 
+ */ + BufferView(const BufferView&) = delete; + BufferView &operator=(BufferView &) = delete; + + void operator=(BufferView &&src) { + m_data = src.m_data; + m_release = src.m_release; + m_head = src.m_head; + m_tail = src.m_tail; + m_cap = src.m_cap; + m_approx_ts_cnt = src.m_approx_ts_cnt; + m_tombstone_filter = src.filter; + } + + BufferView(Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, ReleaseFunction release) : m_data(buffer) , m_release(release) @@ -85,7 +102,7 @@ public: } private: - const Wrapped* m_data; + Wrapped* m_data; ReleaseFunction m_release; size_t m_head; size_t m_tail; -- cgit v1.2.3 From 5a2c378aad3f1a9923db3191ffaa3fb807d392b2 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 11 Jan 2024 15:29:51 -0500 Subject: Ported ISAMTree over to new buffer setup I may still play with the shard from shards constructor, and queries need some work yet too. --- include/shard/ISAMTree.h | 125 ++++++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/shard/ISAMTree.h b/include/shard/ISAMTree.h index e11c899..6b2f6b5 100644 --- a/include/shard/ISAMTree.h +++ b/include/shard/ISAMTree.h @@ -13,8 +13,6 @@ #include #include -#include -#include #include "framework/ShardRequirements.h" @@ -27,52 +25,54 @@ using psudb::CACHELINE_SIZE; using psudb::BloomFilter; using psudb::PriorityQueue; using psudb::queue_record; -using psudb::Alias; namespace de { -thread_local size_t mrun_cancelations = 0; - -template +template class ISAMTree { private: typedef decltype(R::key) K; typedef decltype(R::value) V; -constexpr static size_t inmem_isam_node_size = 256; -constexpr static size_t inmem_isam_fanout = inmem_isam_node_size / (sizeof(K) + sizeof(char*)); +constexpr static size_t NODE_SZ = 256; +constexpr static size_t INTERNAL_FANOUT = NODE_SZ / (sizeof(K) + sizeof(byte*)); struct InternalNode { - K keys[inmem_isam_fanout]; - char* child[inmem_isam_fanout]; + K keys[INTERNAL_FANOUT]; + byte* child[INTERNAL_FANOUT]; }; -constexpr static size_t inmem_isam_leaf_fanout = inmem_isam_node_size / sizeof(R); -constexpr static size_t inmem_isam_node_keyskip = sizeof(K) * inmem_isam_fanout; - -static_assert(sizeof(InternalNode) == inmem_isam_node_size, "node size does not match"); +static_assert(sizeof(InternalNode) == NODE_SZ, "node size does not match"); -public: - ISAMTree(MutableBuffer* buffer) - :m_reccnt(0), m_tombstone_cnt(0), m_isam_nodes(nullptr), m_deleted_cnt(0) { +constexpr static size_t LEAF_FANOUT = NODE_SZ / sizeof(R); - m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); - - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); +public: + ISAMTree(BufferView buffer) + : m_bf(new BloomFilter(BF_FPR, buffer.get_tombstone_count(), BF_HASH_FUNCS)) + , m_isam_nodes(nullptr) + , m_root(nullptr) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_internal_node_cnt(0) + , m_deleted_cnt(0) + , m_alloc_size(0) + , m_data(nullptr) + { TIMER_INIT(); - size_t offset = 0; - m_reccnt = 0; - auto base = buffer->get_data(); - auto stop = base + buffer->get_record_count(); + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, buffer.get_record_count() * sizeof(Wrapped), (byte**) &m_data); TIMER_START(); + auto temp_buffer = (Wrapped *) 
psudb::sf_aligned_alloc(CACHELINE_SIZE, buffer.get_record_count() * sizeof(Wrapped)); + buffer.copy_to_buffer((byte *) temp_buffer); + + auto base = temp_buffer; + auto stop = base + buffer.get_record_count(); std::sort(base, stop, std::less>()); TIMER_STOP(); + auto sort_time = TIMER_RESULT(); TIMER_START(); @@ -80,7 +80,6 @@ public: if (!base->is_tombstone() && (base + 1 < stop) && base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { base += 2; - mrun_cancelations++; continue; } else if (base->is_deleted()) { base += 1; @@ -109,10 +108,21 @@ public: } TIMER_STOP(); auto level_time = TIMER_RESULT(); + + free(temp_buffer); } ISAMTree(ISAMTree** runs, size_t len) - : m_reccnt(0), m_tombstone_cnt(0), m_deleted_cnt(0), m_isam_nodes(nullptr) { + : m_bf(nullptr) + , m_isam_nodes(nullptr) + , m_root(nullptr) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_internal_node_cnt(0) + , m_deleted_cnt(0) + , m_alloc_size(0) + , m_data(nullptr) + { std::vector>> cursors; cursors.reserve(len); @@ -139,8 +149,6 @@ public: assert(m_alloc_size % CACHELINE_SIZE == 0); m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - size_t offset = 0; - while (pq.size()) { auto now = pq.peek(); auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; @@ -173,9 +181,9 @@ public: } ~ISAMTree() { - if (m_data) free(m_data); - if (m_isam_nodes) free(m_isam_nodes); - if (m_bf) delete m_bf; + free(m_data); + free(m_isam_nodes); + delete m_bf; } Wrapped *point_lookup(const R &rec, bool filter=false) { @@ -214,25 +222,25 @@ public: } size_t get_memory_usage() { - return m_internal_node_cnt * inmem_isam_node_size + m_alloc_size; + return m_alloc_size; } size_t get_aux_memory_usage() { - return 0; + return m_bf->memory_usage(); } size_t get_lower_bound(const K& key) const { const InternalNode* now = m_root; - while (!is_leaf(reinterpret_cast(now))) { + while (!is_leaf(reinterpret_cast(now))) { const InternalNode* next = nullptr; - for (size_t i = 0; i < inmem_isam_fanout - 1; ++i) { + for (size_t i = 0; i < INTERNAL_FANOUT - 1; ++i) { if (now->child[i + 1] == nullptr || key <= now->keys[i]) { next = reinterpret_cast(now->child[i]); break; } } - now = next ? next : reinterpret_cast(now->child[inmem_isam_fanout - 1]); + now = next ? next : reinterpret_cast(now->child[INTERNAL_FANOUT - 1]); } const Wrapped* pos = reinterpret_cast*>(now); @@ -243,16 +251,16 @@ public: size_t get_upper_bound(const K& key) const { const InternalNode* now = m_root; - while (!is_leaf(reinterpret_cast(now))) { + while (!is_leaf(reinterpret_cast(now))) { const InternalNode* next = nullptr; - for (size_t i = 0; i < inmem_isam_fanout - 1; ++i) { + for (size_t i = 0; i < INTERNAL_FANOUT - 1; ++i) { if (now->child[i + 1] == nullptr || key < now->keys[i]) { next = reinterpret_cast(now->child[i]); break; } } - now = next ? next : reinterpret_cast(now->child[inmem_isam_fanout - 1]); + now = next ? 
next : reinterpret_cast(now->child[INTERNAL_FANOUT - 1]); } const Wrapped* pos = reinterpret_cast*>(now); @@ -264,20 +272,17 @@ public: private: void build_internal_levels() { - size_t n_leaf_nodes = m_reccnt / inmem_isam_leaf_fanout + (m_reccnt % inmem_isam_leaf_fanout != 0); + size_t n_leaf_nodes = m_reccnt / LEAF_FANOUT + (m_reccnt % LEAF_FANOUT != 0); + size_t level_node_cnt = n_leaf_nodes; size_t node_cnt = 0; do { - level_node_cnt = level_node_cnt / inmem_isam_fanout + (level_node_cnt % inmem_isam_fanout != 0); + level_node_cnt = level_node_cnt / INTERNAL_FANOUT + (level_node_cnt % INTERNAL_FANOUT != 0); node_cnt += level_node_cnt; } while (level_node_cnt > 1); - m_alloc_size = (node_cnt * inmem_isam_node_size) + (CACHELINE_SIZE - (node_cnt * inmem_isam_node_size) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - - m_isam_nodes = (InternalNode*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + m_alloc_size += psudb::sf_aligned_calloc(CACHELINE_SIZE, node_cnt, NODE_SZ, (byte**) &m_isam_nodes); m_internal_node_cnt = node_cnt; - memset(m_isam_nodes, 0, node_cnt * inmem_isam_node_size); InternalNode* current_node = m_isam_nodes; @@ -285,16 +290,16 @@ private: const Wrapped* leaf_stop = m_data + m_reccnt; while (leaf_base < leaf_stop) { size_t fanout = 0; - for (size_t i = 0; i < inmem_isam_fanout; ++i) { - auto rec_ptr = leaf_base + inmem_isam_leaf_fanout * i; + for (size_t i = 0; i < INTERNAL_FANOUT; ++i) { + auto rec_ptr = leaf_base + LEAF_FANOUT * i; if (rec_ptr >= leaf_stop) break; - const Wrapped* sep_key = std::min(rec_ptr + inmem_isam_leaf_fanout - 1, leaf_stop - 1); + const Wrapped* sep_key = std::min(rec_ptr + LEAF_FANOUT - 1, leaf_stop - 1); current_node->keys[i] = sep_key->rec.key; - current_node->child[i] = (char*)rec_ptr; + current_node->child[i] = (byte*)rec_ptr; ++fanout; } current_node++; - leaf_base += fanout * inmem_isam_leaf_fanout; + leaf_base += fanout * LEAF_FANOUT; } auto level_start = m_isam_nodes; @@ -304,12 +309,12 @@ private: auto now = level_start; while (now < level_stop) { size_t child_cnt = 0; - for (size_t i = 0; i < inmem_isam_fanout; ++i) { + for (size_t i = 0; i < INTERNAL_FANOUT; ++i) { auto node_ptr = now + i; ++child_cnt; if (node_ptr >= level_stop) break; - current_node->keys[i] = node_ptr->keys[inmem_isam_fanout - 1]; - current_node->child[i] = (char*)node_ptr; + current_node->keys[i] = node_ptr->keys[INTERNAL_FANOUT - 1]; + current_node->child[i] = (byte*)node_ptr; } now += child_cnt; current_node++; @@ -323,12 +328,10 @@ private: m_root = level_start; } - bool is_leaf(const char* ptr) const { - return ptr >= (const char*)m_data && ptr < (const char*)(m_data + m_reccnt); + bool is_leaf(const byte* ptr) const { + return ptr >= (const byte*)m_data && ptr < (const byte*)(m_data + m_reccnt); } - // Members: sorted data, internal ISAM levels, reccnt; - Wrapped* m_data; psudb::BloomFilter *m_bf; InternalNode* m_isam_nodes; InternalNode* m_root; @@ -337,5 +340,7 @@ private: size_t m_internal_node_cnt; size_t m_deleted_cnt; size_t m_alloc_size; + + Wrapped* m_data; }; } -- cgit v1.2.3 From 7e503464176adbd0880373325e30a6bfd58616f0 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 11 Jan 2024 16:31:24 -0500 Subject: InternalLevel update and tests Plus some assorted fixes for move semantics stuff in BufferView that accompanied these changes. 
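The move-semantics work in this commit and its neighbors boils down to a standard move-only RAII pattern, which can be sketched independently of the framework. RefGuard below is a hypothetical stand-in for BufferView, not its actual code: the destructor releases a reference exactly once, so copying is deleted outright and a move must leave the source empty.

#include <functional>
#include <utility>

/* hypothetical move-only guard illustrating the BufferView pattern; not the framework's code */
class RefGuard {
public:
    explicit RefGuard(std::function<void()> release)
        : m_release(std::move(release)) {}

    ~RefGuard() {
        if (m_release) m_release(); /* release only if this object still owns the reference */
    }

    /* copying would release the same reference twice */
    RefGuard(const RefGuard&) = delete;
    RefGuard &operator=(const RefGuard&) = delete;

    /* moving transfers ownership and empties the source */
    RefGuard(RefGuard &&other) noexcept
        : m_release(std::exchange(other.m_release, nullptr)) {}

    /* transfer by construction only, mirroring the restriction settled on below */
    RefGuard &operator=(RefGuard&&) = delete;

private:
    std::function<void()> m_release;
};

Deleting move assignment reflects the same trade-off the "last necessary tweak" commit below lands on: an already-constructed destination would first have to release its own reference, and, as that commit's message notes, the stored bind expression cannot be move-assigned anyway, so ownership changes hands through construction alone.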
--- include/framework/structure/BufferView.h | 44 ++++++++++++++++++++++------- include/framework/structure/InternalLevel.h | 11 ++++---- 2 files changed, 39 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 98e41dd..d058714 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -18,12 +18,25 @@ namespace de { -typedef std::_Bind ReleaseFunction; +typedef std::_Bind ReleaseFunction; + +static void noop_func(void *arg1, size_t arg2) { + return; +} + +constexpr auto noop_bind = std::bind(noop_func, (void*) nullptr, 0ul); template class BufferView { public: - BufferView() = default; + BufferView() + : m_data(nullptr) + , m_release(noop_bind) + , m_head(0) + , m_tail(0) + , m_cap(0) + , m_approx_ts_cnt(0) + , m_tombstone_filter(nullptr) {} /* * the BufferView's lifetime is tightly linked to buffer versioning, and so @@ -32,14 +45,25 @@ public: BufferView(const BufferView&) = delete; BufferView &operator=(BufferView &) = delete; - void operator=(BufferView &&src) { - m_data = src.m_data; - m_release = src.m_release; - m_head = src.m_head; - m_tail = src.m_tail; - m_cap = src.m_cap; - m_approx_ts_cnt = src.m_approx_ts_cnt; - m_tombstone_filter = src.filter; + BufferView(BufferView &&other) + : m_data(std::exchange(other.m_data, nullptr)) + , m_release(std::move(other.m_release)) + , m_head(std::exchange(other.m_head, 0)) + , m_tail(std::exchange(other.m_tail, 0)) + , m_cap(std::exchange(other.m_cap, 0)) + , m_approx_ts_cnt(std::exchange(other.m_approx_ts_cnt, 0)) + , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr)) {} + + BufferView &operator=(BufferView &&other) noexcept { + std::swap(m_data, other.m_data); + m_release = std::move(other.m_release); + std::swap(m_head, other.m_head); + std::swap(m_tail, other.m_tail); + std::swap(m_cap, other.m_cap); + std::swap(m_approx_ts_cnt, other.m_approx_ts_cnt); + std::swap(m_tombstone_filter, other.m_tombstone_filter); + + return *this; } BufferView(Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index ee85cb3..b35cadd 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -16,7 +16,7 @@ #include "framework/interface/Shard.h" #include "framework/interface/Query.h" #include "framework/interface/Record.h" -#include "framework/structure/MutableBuffer.h" +#include "framework/structure/BufferView.h" namespace de { template @@ -27,7 +27,7 @@ class InternalLevel; template class InternalLevel { typedef S Shard; - typedef MutableBuffer Buffer; + typedef BufferView BuffView; public: InternalLevel(ssize_t level_no, size_t shard_cap) : m_level_no(level_no) @@ -97,14 +97,14 @@ public: * into this level. This is used for buffer * flushes under the tiering layout policy. 
*/ - void append_buffer(Buffer* buffer) { + void append_buffer(BuffView buffer) { if (m_shard_cnt == m_shards.size()) { assert(m_pending_shard == nullptr); - m_pending_shard = new S(buffer); + m_pending_shard = new S(std::move(buffer)); return; } - m_shards[m_shard_cnt] = std::make_shared(buffer); + m_shards[m_shard_cnt] = std::make_shared(std::move(buffer)); ++m_shard_cnt; } @@ -140,7 +140,6 @@ public: return new S(shards, m_shard_cnt); } - /* Append the sample range in-order */ void get_query_states(std::vector> &shards, std::vector& shard_states, void *query_parms) { for (size_t i=0; i Date: Fri, 12 Jan 2024 11:29:37 -0500 Subject: BufferView.h: Hopefully the last necessary tweak to the move semantics stuff You can't move assign an std::Bind, but you can move construct it. So I had to disable the move assignment operator. This means that when you change the BufferView ownership over to, say, a QueryBufferState object, you need to do it by passing std::move(buffview) into a constructor call only--you cannot assign it. --- include/framework/structure/BufferView.h | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index d058714..47c7b9b 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -20,23 +20,10 @@ namespace de { typedef std::_Bind ReleaseFunction; -static void noop_func(void *arg1, size_t arg2) { - return; -} - -constexpr auto noop_bind = std::bind(noop_func, (void*) nullptr, 0ul); - template class BufferView { public: - BufferView() - : m_data(nullptr) - , m_release(noop_bind) - , m_head(0) - , m_tail(0) - , m_cap(0) - , m_approx_ts_cnt(0) - , m_tombstone_filter(nullptr) {} + BufferView() = default; /* * the BufferView's lifetime is tightly linked to buffer versioning, and so @@ -54,17 +41,8 @@ public: , m_approx_ts_cnt(std::exchange(other.m_approx_ts_cnt, 0)) , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr)) {} - BufferView &operator=(BufferView &&other) noexcept { - std::swap(m_data, other.m_data); - m_release = std::move(other.m_release); - std::swap(m_head, other.m_head); - std::swap(m_tail, other.m_tail); - std::swap(m_cap, other.m_cap); - std::swap(m_approx_ts_cnt, other.m_approx_ts_cnt); - std::swap(m_tombstone_filter, other.m_tombstone_filter); + BufferView &operator=(BufferView &&other) = delete; - return *this; - } BufferView(Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, ReleaseFunction release) -- cgit v1.2.3 From aac0bb661af8fae38d3ce08d6078cb4d9dfcb575 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 12 Jan 2024 14:10:11 -0500 Subject: Initial integration of new buffering scheme into framework It isn't working right now (lotsa test failures), but we're to the debugging phase now. 
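Reduced to a self-contained sketch, the insert path this integration moves to works as follows (hypothetical names, simplified from the internal_append() and watermark changes in the diff below): crossing the low watermark schedules background work once, and appends fail rather than block once the high watermark is reached.

#include <atomic>
#include <cstddef>

/* hypothetical sketch of a watermark-driven append path; not the framework's code */
struct WatermarkBuffer {
    std::atomic<size_t> m_head{0};        /* advanced only by reconstruction */
    std::atomic<size_t> m_tail{0};
    size_t m_lwm;                         /* low watermark: schedule a flush */
    size_t m_hwm;                         /* high watermark: refuse appends */
    std::atomic<bool> m_scheduled{false}; /* suppresses duplicate scheduling */

    WatermarkBuffer(size_t lwm, size_t hwm) : m_lwm(lwm), m_hwm(hwm) {}

    bool append() {
        /* crossing the low watermark triggers one background reconstruction */
        if (!m_scheduled.load() && m_tail.load() - m_head.load() >= m_lwm) {
            m_scheduled.store(true);
            /* a real implementation would schedule the reconstruction job here */
        }

        /* fail, rather than block, once the high watermark is reached */
        size_t t = m_tail.load();
        do {
            if (t - m_head.load() >= m_hwm) return false;
        } while (!m_tail.compare_exchange_strong(t, t + 1));

        return true; /* slot t now belongs to this thread */
    }
};

Under this scheme an insert never waits on a flush: once the high watermark is hit, append() simply returns failure until the background reconstruction advances the head and frees logical space.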
--- include/framework/DynamicExtension.h | 252 ++++++++--------------- include/framework/QueryRequirements.h | 2 +- include/framework/interface/Query.h | 2 +- include/framework/interface/Scheduler.h | 4 - include/framework/scheduling/Epoch.h | 80 ++----- include/framework/structure/BufferView.h | 4 + include/framework/structure/ExtensionStructure.h | 41 ++-- include/query/rangequery.h | 34 +-- 8 files changed, 142 insertions(+), 277 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index c5c4a1a..c97b390 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -10,29 +10,23 @@ #pragma once #include -#include #include #include #include #include #include +#include "framework/interface/Scheduler.h" +#include "framework/scheduling/FIFOScheduler.h" +#include "framework/scheduling/SerialScheduler.h" + #include "framework/structure/MutableBuffer.h" -#include "framework/structure/InternalLevel.h" -#include "framework/interface/Shard.h" -#include "framework/interface/Query.h" #include "framework/interface/Record.h" -#include "framework/interface/Query.h" -#include "framework/interface/Scheduler.h" #include "framework/structure/ExtensionStructure.h" #include "framework/util/Configuration.h" -#include "framework/scheduling/FIFOScheduler.h" -#include "framework/scheduling/SerialScheduler.h" #include "framework/scheduling/Epoch.h" -#include "psu-util/timer.h" -#include "psu-ds/Alias.h" namespace de { @@ -43,22 +37,19 @@ class DynamicExtension { typedef MutableBuffer Buffer; typedef ExtensionStructure Structure; typedef Epoch _Epoch; - typedef BufferView BufView; + typedef BufferView BufView; public: - DynamicExtension(size_t buffer_cap, size_t scale_factor, double max_delete_prop, size_t memory_budget=0, + DynamicExtension(size_t buffer_lwm, size_t buffer_hwm, size_t scale_factor, size_t memory_budget=0, size_t thread_cnt=16) : m_scale_factor(scale_factor) - , m_max_delete_prop(max_delete_prop) + , m_max_delete_prop(1) , m_sched(memory_budget, thread_cnt) - , m_buffer_capacity(buffer_cap) - , m_buffer_delete_capacity(max_delete_prop*buffer_cap) + , m_buffer(new Buffer(buffer_lwm, buffer_hwm)) { - auto buf = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); - auto vers = new Structure(m_buffer_capacity, m_scale_factor, m_max_delete_prop); - auto epoch = new _Epoch(0, vers, buf); + auto vers = new Structure(buffer_hwm, m_scale_factor, m_max_delete_prop); + auto epoch = new _Epoch(0, vers, m_buffer); - m_buffers.insert(buf); m_versions.insert(vers); m_epochs.insert({0, epoch}); } @@ -79,9 +70,7 @@ public: delete e.second; } - for (auto e : m_buffers) { - delete e; - } + delete m_buffer; for (auto e : m_versions) { delete e; @@ -95,10 +84,15 @@ public: int erase(const R &rec) { // FIXME: delete tagging will require a lot of extra work to get // operating "correctly" in a concurrent environment. + + /* + * Get a view on the buffer *first*. This will ensure a stronger + * ordering than simply accessing the buffer directly, but is + * not *strictly* necessary. + */ + auto view = m_buffer->get_buffer_view(); if constexpr (D == DeletePolicy::TAGGING) { static_assert(std::same_as, "Tagging is only supported in single-threaded operation"); - BufView buffers = get_active_epoch()->get_buffer_view(); - if (get_active_epoch()->get_structure()->tagged_delete(rec)) { return 1; } @@ -108,7 +102,7 @@ public: * probably has the lowest probability of having the record, * so we'll check it last. 
*/ - return buffers.delete_record(rec); + return view.delete_record(rec); } /* @@ -123,7 +117,7 @@ public: size_t get_record_count() { auto epoch = get_active_epoch_protected(); - auto t = epoch->get_buffer_view().get_record_count() + epoch->get_structure()->get_record_count(); + auto t = epoch->get_buffer().get_record_count() + epoch->get_structure()->get_record_count(); epoch->end_job(); return t; @@ -131,7 +125,7 @@ public: size_t get_tombstone_count() { auto epoch = get_active_epoch_protected(); - auto t = epoch->get_buffer_view().get_tombstone_count() + epoch->get_structure()->get_tombstone_count(); + auto t = epoch->get_buffer().get_tombstone_count() + epoch->get_structure()->get_tombstone_count(); epoch->end_job(); return t; @@ -147,7 +141,7 @@ public: size_t get_memory_usage() { auto epoch = get_active_epoch_protected(); - auto t= epoch->get_buffer_view().get_memory_usage() + epoch->get_structure()->get_memory_usage(); + auto t= epoch->get_buffer().get_memory_usage() + epoch->get_structure()->get_memory_usage(); epoch->end_job(); return t; @@ -155,14 +149,14 @@ public: size_t get_aux_memory_usage() { auto epoch = get_active_epoch_protected(); - auto t = epoch->get_buffer_view().get_aux_memory_usage() + epoch->get_structure()->get_aux_memory_usage(); + auto t = epoch->get_buffer().get_aux_memory_usage() + epoch->get_structure()->get_aux_memory_usage(); epoch->end_job(); return t; } size_t get_buffer_capacity() { - return m_buffer_capacity; + return m_buffer->get_capacity(); } Shard *create_static_structure(bool await_reconstruction_completion=false) { @@ -171,11 +165,20 @@ public: } auto epoch = get_active_epoch_protected(); - auto bv = epoch->get_buffer_view(); - auto vers = epoch->get_structure(); std::vector shards; + /* + * construct a shard from the buffer view. We'll hold the view + * for as short a time as possible: once the records are exfiltrated + * from the buffer, there's no reason to retain a hold on the view's + * head pointer any longer + */ + { + auto bv = epoch->get_buffer(); + shards.emplace_back(new S(std::move(bv))); + } + if (vers->get_levels().size() > 0) { for (int i=vers->get_levels().size() - 1; i>= 0; i--) { if (vers->get_levels()[i]) { @@ -184,12 +187,6 @@ public: } } - // FIXME: With an interface adjustment, this could be done in - // one call, rather than a loop. - for (ssize_t i=bv.size() - 1; i>=0; i--) { - shards.emplace_back(new S(bv.get_buffers()[i])); - } - Shard *shards_array[shards.size()]; size_t j = 0; @@ -237,10 +234,13 @@ public: private: SCHED m_sched; + Buffer *m_buffer; + std::mutex m_struct_lock; - std::set m_buffers; std::set m_versions; + alignas(64) std::atomic m_reconstruction_scheduled; + std::atomic m_current_epoch; std::atomic m_newest_epoch; std::unordered_map m_epochs; @@ -253,8 +253,6 @@ private: size_t m_scale_factor; double m_max_delete_prop; - size_t m_buffer_capacity; - size_t m_buffer_delete_capacity; void enforce_delete_invariant(_Epoch *epoch) { auto structure = epoch->get_structure(); @@ -321,6 +319,7 @@ private: */ enforce_delete_invariant(new_epoch); + #if 0 /* * Update the new Epoch to contain the buffers from the old one * that it doesn't currently have if using a multi-threaded @@ -339,6 +338,7 @@ private: new_epoch->add_buffer(old_epoch->get_buffers()[i]); } } + #endif m_current_epoch.fetch_add(1); old_epoch->set_inactive(); @@ -373,57 +373,6 @@ private: return new_epoch; } - /* - * Add a new empty buffer. 
This is intended to be used - * when a reconstruction is triggered, to allow for inserts to be sustained in the new - * buffer while a new epoch is being created in the background. Returns a - * pointer to the newly created buffer. - */ - Buffer *add_empty_buffer() { - /* - * if there's a current Epoch transition ongoing, a buffer installed - * into an older Epoch, but not the new one, may be lost. So fail to - * insert a buffer. - */ - if (!m_epoch_transition_lk.try_lock()) { - return nullptr; - } - - /* - * verify that the currently active buffer is still full, if - * not, there is no reason to add a new one. This code is - * protected by the epoch transition lock, so need need to - * take a protected reference to the epoch. - */ - auto active_epoch = get_active_epoch(); - if (!active_epoch->get_active_buffer()->is_full()) { - m_epoch_transition_lk.unlock(); - return nullptr; - } - - /* - * create a new buffer and install it in the active epoch. - */ - auto temp_buffer = new Buffer(m_buffer_capacity, m_buffer_delete_capacity); - - std::unique_lock m_struct_lock; - auto new_buffer = active_epoch->add_buffer(temp_buffer); - - /* - * if epoch->add_buffer doesn't add the new buffer, this insert - * won't update the buffer set (duplicate insert) - */ - m_buffers.insert(new_buffer); - m_struct_lock.release(); - - if (new_buffer != temp_buffer) { - delete temp_buffer; - } - m_epoch_transition_lk.unlock(); - - return new_buffer; - } - void retire_epoch(_Epoch *epoch) { /* * Epochs with currently active jobs cannot @@ -452,21 +401,14 @@ private: delete epoch; m_epoch_retire_lk.unlock(); + /* NOTE: the BufferView mechanism handles freeing unused buffer space */ + /* * Following the epoch's destruction, any buffers * or structures with no remaining references can * be safely freed. */ std::unique_lock lock(m_struct_lock); - for (auto itr = m_buffers.begin(); itr != m_buffers.end();) { - if ((*itr)->get_reference_count() == 0) { - auto tmp = *itr; - itr = m_buffers.erase(itr); - delete tmp; - } else { - itr++; - } - } for (auto itr = m_versions.begin(); itr != m_versions.end();) { if ((*itr)->get_reference_count() == 0) { @@ -484,21 +426,31 @@ private: Structure *vers = args->epoch->get_structure(); - // FIXME: with an improved shard interface, multiple full buffers + // FIXME: with an improved shard interface, multiple full buffer_viewers // could be flushed at once here. - Buffer *buff = (Buffer *) args->epoch->get_buffers()[0]; + auto buffer_view = args->epoch->get_buffer(); + size_t new_head = buffer_view.get_tail(); for (ssize_t i=0; imerges.size(); i++) { vers->reconstruction(args->merges[i].second, args->merges[i].first); } /* - * if performing a compaction, don't push the buffer down, - * as there is no guarantee that any necessary reconstructions + * if performing a compaction, don't flush the buffer, as + * there is no guarantee that any necessary reconstructions * will free sufficient space in L0 to support a flush */ if (!args->compaction) { - vers->flush_buffer(buff); + vers->flush_buffer(std::move(buffer_view)); + + // FIXME: this may currently fail because there isn't any + // query preemption yet. 
At this point, we'd need to either + // 1) wait for all queries on the old_head to finish + // 2) kill all queries on the old_head + // 3) somehow migrate all queries on the old_head to the new + // version + auto res = args->epoch->advance_buffer_head(new_head); + assert(res); } args->epoch->end_job(); @@ -519,27 +471,33 @@ private: static void async_query(void *arguments) { QueryArgs *args = (QueryArgs *) arguments; - auto buffers = args->epoch->get_buffer_view(); + auto buffer = args->epoch->get_buffer(); auto vers = args->epoch->get_structure(); void *parms = args->query_parms; /* Get the buffer query states */ - std::vector buffer_states = buffers.get_query_states(parms); + void *buffer_state = Q::get_buffer_query_state(std::move(buffer), parms); /* Get the shard query states */ std::vector> shards; std::vector states = vers->get_query_states(shards, parms); - Q::process_query_states(parms, states, buffer_states); + Q::process_query_states(parms, states, buffer_state); - std::vector>> query_results(shards.size() + buffer_states.size()); + std::vector>> query_results(shards.size() + 1); for (size_t i=0; i> local_results = (i < buffer_states.size()) - ? Q::buffer_query(buffers.get_buffers()[i], buffer_states[i], parms) - : Q::query(shards[i - buffer_states.size()].second, - states[i - buffer_states.size()], parms); - ShardID shid = (i < buffer_states.size()) ? INVALID_SHID : shards[i - buffer_states.size()].first; - query_results[i] = std::move(filter_deletes(local_results, shid, buffers, vers)); + std::vector> local_results; + ShardID shid; + + if (i == 0) { /* process the buffer first */ + local_results = Q::buffer_query(buffer_state, parms); + shid = INVALID_SHID; + } else { + local_results = Q::query(shards[i - 1].second, states[i - 1], parms); + shid = shards[i - 1].first; + } + + query_results[i] = std::move(filter_deletes(local_results, shid, vers)); if constexpr (Q::EARLY_ABORT) { if (query_results[i].size() > 0) break; @@ -551,10 +509,7 @@ private: args->epoch->end_job(); - for (size_t i=0; i *args = new ReconstructionArgs(); args->epoch = epoch; - args->merges = epoch->get_structure()->get_reconstruction_tasks(epoch->get_buffers()[0]->get_record_count()); + args->merges = epoch->get_structure()->get_reconstruction_tasks(epoch->get_buffer().get_record_count()); args->extension = this; args->compaction = false; /* NOTE: args is deleted by the reconstruction job, so shouldn't be freed here */ @@ -597,49 +552,16 @@ private: } int internal_append(const R &rec, bool ts) { - Buffer *buffer = nullptr; - int res = 0; - do { - auto epoch = get_active_epoch_protected(); - buffer = epoch->get_active_buffer(); - assert(buffer); - - /* - * If the buffer is full and there is no ongoing reconstruction, - * schedule a reconstruction and add a new empty buffer. If there - * is an ongoing reconstruction, then add a new empty buffer - * to the current epoch. - */ - if (buffer->is_full()) { - if constexpr (std::same_as) { - /* single threaded: run reconstruction and then empty buffer */ - epoch->end_job(); - schedule_reconstruction(); - buffer->truncate(); - continue; - } else if (epoch->prepare_reconstruction()) { - /* - * add an empty buffer to allow insert proceed and - * schedule a reconstruction on a background thread - */ - buffer = add_empty_buffer(); - schedule_reconstruction(); - } else { - /* background reconstruction is ongoing, so just add empty buffer */ - buffer = add_empty_buffer(); - } - } - - res = (buffer) ? 
buffer->append(rec, ts) : 0; - epoch->end_job(); - } while(!res); + if (!m_reconstruction_scheduled.load() && m_buffer->is_at_low_watermark()) { + m_reconstruction_scheduled.store(true); + schedule_reconstruction(); + } - /* internal append should always succeed, eventually */ - return 1; + /* this will fail if the HWM is reached and return 0 */ + return m_buffer->append(rec, ts); } - static std::vector> filter_deletes(std::vector> &records, ShardID shid, - BufView &buffers, Structure *vers) { + static std::vector> filter_deletes(std::vector> &records, ShardID shid, Structure *vers) { if constexpr (!Q::SKIP_DELETE_FILTER) { return records; } @@ -672,9 +594,11 @@ private: continue; } - if (buffers.check_tombstone(rec.rec)) { - continue; - } + // FIXME: need to figure out how best to re-enable the buffer tombstone + // check in the correct manner. + //if (buffview.check_tombstone(rec.rec)) { + //continue; + //} if (shid != INVALID_SHID) { for (size_t lvl=0; lvl<=shid.level_idx; lvl++) { diff --git a/include/framework/QueryRequirements.h b/include/framework/QueryRequirements.h index 4d3e97b..dcba67e 100644 --- a/include/framework/QueryRequirements.h +++ b/include/framework/QueryRequirements.h @@ -11,7 +11,7 @@ */ #pragma once -#include "framework/structure/MutableBuffer.h" +#include "framework/structure/BufferView.h" #include "framework/interface/Record.h" #include "framework/interface/Shard.h" #include "framework/interface/Query.h" diff --git a/include/framework/interface/Query.h b/include/framework/interface/Query.h index 8b92c45..ca742c3 100644 --- a/include/framework/interface/Query.h +++ b/include/framework/interface/Query.h @@ -23,7 +23,7 @@ concept QueryInterface = requires(Q q, void *p, std::vector &s) { {Q::get_query_state(p, p)} -> std::convertible_to; {Q::get_buffer_query_state(p, p)} -> std::convertible_to; */ - {Q::process_query_states(p, s, s)}; + {Q::process_query_states(p, s, p)}; /* {Q::query(s, p, p)} -> std::convertible_to>>; {Q::buffer_query(p, p)} -> std::convertible_to>>; diff --git a/include/framework/interface/Scheduler.h b/include/framework/interface/Scheduler.h index a8544a7..94afe6c 100644 --- a/include/framework/interface/Scheduler.h +++ b/include/framework/interface/Scheduler.h @@ -8,10 +8,6 @@ */ #pragma once -#include -#include -#include "framework/interface/Record.h" -#include "util/types.h" #include "framework/scheduling/Task.h" template diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 4e1b8a2..ca85fe2 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -8,6 +8,9 @@ */ #pragma once +#include +#include + #include "framework/structure/MutableBuffer.h" #include "framework/structure/ExtensionStructure.h" #include "framework/structure/BufferView.h" @@ -20,10 +23,10 @@ class Epoch { private: typedef MutableBuffer Buffer; typedef ExtensionStructure Structure; - typedef BufferView BufView; + typedef BufferView BufView; public: Epoch(size_t number=0) - : m_buffers() + : m_buffer(nullptr) , m_structure(nullptr) , m_active_merge(false) , m_active_jobs(0) @@ -31,8 +34,8 @@ public: , m_epoch_number(number) {} - Epoch(size_t number, Structure *structure, Buffer *buff) - : m_buffers() + Epoch(size_t number, Structure *structure, Buffer *buff) + : m_buffer(buff) , m_structure(structure) , m_active_jobs(0) , m_active_merge(false) @@ -40,8 +43,6 @@ public: , m_epoch_number(number) { structure->take_reference(); - buff->take_reference(); - m_buffers.push_back(buff); } ~Epoch() { @@ -54,35 
+55,11 @@ public: */ //m_active_cv.notify_all(); - clear_buffers(); - if (m_structure) { m_structure->release_reference(); } } - Buffer *add_buffer(Buffer *buf, Buffer *cur_buf=nullptr) { - assert(buf); - - std::unique_lock m_buffer_lock; - /* - * if a current buffer is specified, only add the - * new buffer if the active buffer is the current, - * otherwise just return the active buffer (poor man's - * CAS). - */ - if (cur_buf) { - auto active_buf = get_active_buffer(); - if (active_buf != cur_buf) { - return active_buf; - } - } - - buf->take_reference(); - m_buffers.push_back(buf); - return buf; - } - void start_job() { m_active_jobs.fetch_add(1); } @@ -109,36 +86,10 @@ public: return m_structure; } - std::vector &get_buffers() { - return m_buffers; - } - - BufView get_buffer_view() { - std::unique_lock m_buffer_lock; - return BufView(m_buffers); - } - - Buffer *get_active_buffer() { - if (m_buffers.size() == 0) return nullptr; - - return m_buffers[m_buffers.size() - 1]; + BufView get_buffer() { + return m_buffer->get_buffer_view(); } - /* - * Return the number of buffers in this epoch at - * time of call, and then clear the buffer vector, - * releasing all references in the process. - */ - size_t clear_buffers() { - std::unique_lock m_buffer_lock; - size_t buf_cnt = m_buffers.size(); - for (auto buf : m_buffers) { - if (buf) buf->release_reference(); - } - - m_buffers.clear(); - return buf_cnt; - } /* * Returns a new Epoch object that is a copy of this one. The new object will also contain @@ -148,17 +99,14 @@ public: Epoch *clone(size_t number) { std::unique_lock m_buffer_lock; auto epoch = new Epoch(number); - epoch->m_buffers = m_buffers; + epoch->m_buffer = m_buffer; + if (m_structure) { epoch->m_structure = m_structure->copy(); /* the copy routine returns a structure with 0 references */ epoch->m_structure->take_reference(); } - for (auto b : m_buffers) { - b->take_reference(); - } - return epoch; } @@ -213,9 +161,13 @@ public: return true; } + bool advance_buffer_head(size_t head) { + return m_buffer->advance_head(head); + } + private: Structure *m_structure; - std::vector m_buffers; + Buffer *m_buffer; std::condition_variable m_active_cv; std::mutex m_cv_lock; diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 47c7b9b..ba5e693 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -103,6 +103,10 @@ public: memcpy(buffer, (std::byte*) (m_data + m_head), get_record_count() * sizeof(Wrapped)); } + size_t get_tail() { + return m_tail; + } + private: Wrapped* m_data; ReleaseFunction m_release; diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 60016a0..ae566cb 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -10,28 +10,22 @@ #pragma once #include -#include #include #include -#include "framework/structure/MutableBuffer.h" +#include "framework/structure/BufferView.h" #include "framework/structure/InternalLevel.h" -#include "framework/interface/Shard.h" -#include "framework/interface/Query.h" -#include "framework/interface/Record.h" #include "framework/util/Configuration.h" -#include "framework/scheduling/Task.h" #include "psu-util/timer.h" -#include "psu-ds/Alias.h" namespace de { template class ExtensionStructure { typedef S Shard; - typedef MutableBuffer Buffer; + typedef BufferView BuffView; public: ExtensionStructure(size_t buffer_size, size_t 
scale_factor, double max_delete_prop) @@ -96,14 +90,10 @@ public: * FIXME: arguably, this should be a method attached to the buffer that * takes a structure as input. */ - inline bool flush_buffer(Buffer *buffer) { - assert(can_reconstruct_with(0, buffer->get_record_count())); + inline bool flush_buffer(BuffView buffer) { + assert(can_reconstruct_with(0, buffer.get_record_count())); - // FIXME: this step makes an extra copy of the buffer, - // which could be avoided by adjusting the shard - // reconstruction process a bit, possibly. - buffer->start_flush(); - flush_buffer_into_l0(buffer); + flush_buffer_into_l0(std::move(buffer)); return true; } @@ -415,11 +405,11 @@ private: * returns -1 if idx==0, and no such level exists, to simplify * the logic of the first buffer flush. */ - inline level_index find_reconstruction_target(level_index idx, Buffer *buffer=nullptr) { + inline level_index find_reconstruction_target(level_index idx) { if (idx == 0 && m_levels.size() == 0) return -1; - size_t incoming_rec_cnt = get_level_record_count(idx, buffer); + size_t incoming_rec_cnt = get_level_record_count(idx); for (level_index i=idx+1; i(0, 1); - temp_level->append_buffer(buffer); + temp_level->append_buffer(std::move(buffer)); if (old_level->get_shard_count() > 0) { m_levels[0] = InternalLevel::reconstruction(old_level, temp_level); @@ -446,7 +436,7 @@ private: m_levels[0] = std::shared_ptr>(temp_level); } } else { - m_levels[0]->append_buffer(buffer); + m_levels[0]->append_buffer(std::move(buffer)); } } @@ -469,16 +459,9 @@ private: } /* - * Returns the actual number of records present on a specified level. An - * index value of -1 indicates the memory table. Can optionally pass in - * a pointer to the memory table to use, if desired. Otherwise, there are - * no guarantees about which buffer will be accessed if level_index is -1. + * Returns the number of records present on a specified level. */ - inline size_t get_level_record_count(level_index idx, Buffer *buffer=nullptr) { - if (buffer) { - return buffer->get_record_count(); - } - + inline size_t get_level_record_count(level_index idx) { return (m_levels[idx]) ? 
m_levels[idx]->get_record_count() : 0;
    }

diff --git a/include/query/rangequery.h b/include/query/rangequery.h
index 16dcd86..ad5b767 100644
--- a/include/query/rangequery.h
+++ b/include/query/rangequery.h
@@ -12,7 +12,7 @@

 #include "framework/interface/Record.h"
 #include "framework/interface/Shard.h"
-#include "framework/structure/MutableBuffer.h"
+#include "framework/structure/BufferView.h"

 #include "psu-ds/PriorityQueue.h"
 #include "util/Cursor.h"
@@ -32,7 +32,10 @@ struct State {

 template
 struct BufferState {
-    size_t cutoff;
+    BufferView buffer;
+
+    BufferState(BufferView buffer)
+        : buffer(std::move(buffer)) {}
 };

 template
@@ -51,14 +54,13 @@ public:
         return res;
     }

-    static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) {
-        auto res = new BufferState();
-        res->cutoff = buffer->get_record_count();
+    static void* get_buffer_query_state(BufferView buffer, void *parms) {
+        auto res = new BufferState(std::move(buffer));

         return res;
     }

-    static void process_query_states(void *query_parms, std::vector &shard_states, std::vector &buffer_states) {
+    static void process_query_states(void *query_parms, std::vector &shard_states, void* buffer_states) {
         return;
     }

@@ -67,17 +69,21 @@ public:
         auto p = (Parms *) parms;
         auto s = (State *) q_state;

-        // if the returned index is one past the end of the
-        // records for the PGM, then there are not records
-        // in the index falling into the specified range.
+        /*
+         * if the returned index is one past the end of the
+         * records for the PGM, then there are no records
+         * in the index falling into the specified range.
+         */
         if (s->start_idx == shard->get_record_count()) {
             return records;
         }

         auto ptr = shard->get_record_at(s->start_idx);

-        // roll the pointer forward to the first record that is
-        // greater than or equal to the lower bound.
+        /*
+         * roll the pointer forward to the first record that is
+         * greater than or equal to the lower bound.
+         */
         while(ptr->rec.key < p->lower_bound) {
             ptr++;
         }
@@ -90,13 +96,13 @@ public:
         return records;
     }

-    static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) {
+    static std::vector> buffer_query(void *state, void *parms) {
         auto p = (Parms *) parms;
         auto s = (BufferState *) state;

         std::vector> records;
-        for (size_t i=0; icutoff; i++) {
-            auto rec = buffer->get_data() + i;
+        for (size_t i=0; ibuffer.get_record_count(); i++) {
+            auto rec = s->buffer.get(i);
             if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) {
                 records.emplace_back(*rec);
             }
-- 
cgit v1.2.3

From cf178ae74a76b780b655a447531d2114f9f81d98 Mon Sep 17 00:00:00 2001
From: "Douglas B. Rumbaugh"
Date: Mon, 15 Jan 2024 14:01:36 -0500
Subject: Various single-threaded bug fixes

---
 include/framework/DynamicExtension.h        | 12 +++++-------
 include/framework/scheduling/Epoch.h        | 14 ++++++++++++++
 include/framework/structure/BufferView.h    | 18 ++++++++++++++----
 include/framework/structure/MutableBuffer.h | 26 ++++++++++++++++++++++----
 4 files changed, 55 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h
index c97b390..bddc950 100644
--- a/include/framework/DynamicExtension.h
+++ b/include/framework/DynamicExtension.h
@@ -90,8 +90,8 @@ public:
      * ordering than simply accessing the buffer directly, but is
      * not *strictly* necessary.
*/ - auto view = m_buffer->get_buffer_view(); if constexpr (D == DeletePolicy::TAGGING) { + auto view = m_buffer->get_buffer_view(); static_assert(std::same_as, "Tagging is only supported in single-threaded operation"); if (get_active_epoch()->get_structure()->tagged_delete(rec)) { return 1; @@ -426,9 +426,8 @@ private: Structure *vers = args->epoch->get_structure(); - // FIXME: with an improved shard interface, multiple full buffer_viewers // could be flushed at once here. - auto buffer_view = args->epoch->get_buffer(); + auto buffer_view = args->epoch->get_flush_buffer(); size_t new_head = buffer_view.get_tail(); for (ssize_t i=0; imerges.size(); i++) { @@ -464,6 +463,8 @@ private: if (!args->compaction) { ((DynamicExtension *) args->extension)->advance_epoch(); } + + ((DynamicExtension *) args->extension)->m_reconstruction_scheduled = false; delete args; } @@ -525,12 +526,9 @@ private: */ epoch->start_job(); - // FIXME: all full buffers can be flushed at this point--but that requires - // retooling the shard interface a bit to do efficiently. - // ReconstructionArgs *args = new ReconstructionArgs(); args->epoch = epoch; - args->merges = epoch->get_structure()->get_reconstruction_tasks(epoch->get_buffer().get_record_count()); + args->merges = epoch->get_structure()->get_reconstruction_tasks(m_buffer->get_low_watermark()); args->extension = this; args->compaction = false; /* NOTE: args is deleted by the reconstruction job, so shouldn't be freed here */ diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index ca85fe2..b005ff6 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -60,6 +60,16 @@ public: } } + + /* + * Epochs are *not* copyable or movable. Only one can exist, and all users of + * it work with pointers + */ + Epoch(const Epoch&) = delete; + Epoch(Epoch&&) = delete; + Epoch &operator=(const Epoch&) = delete; + Epoch &operator=(Epoch&&) = delete; + void start_job() { m_active_jobs.fetch_add(1); } @@ -90,6 +100,10 @@ public: return m_buffer->get_buffer_view(); } + BufView get_flush_buffer() { + return m_buffer->get_flush_buffer_view(); + } + /* * Returns a new Epoch object that is a copy of this one. 
The new object will also contain

diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h
index ba5e693..c751786 100644
--- a/include/framework/structure/BufferView.h
+++ b/include/framework/structure/BufferView.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include

 #include "psu-util/alignment.h"
 #include "psu-ds/BloomFilter.h"
@@ -39,7 +40,8 @@ public:
         , m_tail(std::exchange(other.m_tail, 0))
         , m_cap(std::exchange(other.m_cap, 0))
         , m_approx_ts_cnt(std::exchange(other.m_approx_ts_cnt, 0))
-        , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr)) {}
+        , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr))
+        , m_active(std::exchange(other.m_active, false)) {}

     BufferView &operator=(BufferView &&other) = delete;

@@ -52,10 +54,13 @@ public:
         , m_tail(tail)
         , m_cap(cap)
         , m_approx_ts_cnt(tombstone_cnt)
-        , m_tombstone_filter(filter) {}
+        , m_tombstone_filter(filter)
+        , m_active(true) {}

     ~BufferView() {
-        m_release();
+        if (m_active) {
+            m_release();
+        }
     }

     bool check_tombstone(const R& rec) {
@@ -100,13 +105,17 @@ public:
     }

     void copy_to_buffer(psudb::byte *buffer) {
-        memcpy(buffer, (std::byte*) (m_data + m_head), get_record_count() * sizeof(Wrapped));
+        memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped));
     }

     size_t get_tail() {
         return m_tail;
     }

+    size_t get_head() {
+        return m_head;
+    }
+
 private:
     Wrapped* m_data;
     ReleaseFunction m_release;
@@ -115,6 +124,7 @@ private:
     size_t m_cap;
     size_t m_approx_ts_cnt;
     psudb::BloomFilter *m_tombstone_filter;
+    bool m_active;

     size_t to_idx(size_t i) {
         return (m_head + i) % m_cap;
diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h
index 3a06f0d..5b655fc 100644
--- a/include/framework/structure/MutableBuffer.h
+++ b/include/framework/structure/MutableBuffer.h
@@ -44,7 +44,7 @@ public:
         , m_tail(0)
         , m_head(0)
         , m_head_refcnt(0)
-        , m_old_head(0)
+        , m_old_head(high_watermark)
         , m_old_head_refcnt(0)
         , m_data((Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped)))
         , m_tombstone_filter(new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS))
@@ -62,14 +62,18 @@ public:
     }

     int append(const R &rec, bool tombstone=false) {
-        int32_t pos = 0;
-        if ((pos = try_advance_tail()) == -1) return 0;
+        int32_t tail = 0;
+        if ((tail = try_advance_tail()) == -1) {
+            return 0;
+        }

         Wrapped wrec;
         wrec.rec = rec;
         wrec.header = 0;
         if (tombstone) wrec.set_tombstone();

+        size_t pos = tail % m_cap;
+
         m_data[pos] = wrec;
         m_data[pos].header |= (pos << 2);

@@ -131,6 +135,13 @@ public:
         return BufferView(m_data, m_cap, m_head.load(), m_tail.load(), m_tscnt.load(), m_tombstone_filter, f);
     }

+    BufferView get_flush_buffer_view() {
+        m_head_refcnt.fetch_add(1);
+        auto f = std::bind(release_head_reference, (void *) this, m_head.load());
+        return BufferView(m_data, m_cap, m_head.load(), m_head.load() + m_lwm, m_tscnt.load(), m_tombstone_filter, f);
+
+    }
+
     /*
      * Advance the buffer following a reconstruction. Move current
      * head and head_refcnt into old_head and old_head_refcnt, then
@@ -142,6 +153,7 @@

         /* refuse to advance the head while the old head still has outstanding references */
         if (m_old_head_refcnt > 0) {
+            fprintf(stderr, "[W]: Refusing to advance head due to remaining reference counts\n");
             return false;
         }

@@ -195,6 +207,10 @@
      * Note: this returns the available physical storage capacity,
      * *not* how many more records can be inserted before the
      * HWM is reached.
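      * (space between old_head and the tail is still physically occupied,
      * since records are not reclaimed until all references to the old head
      * have been released)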
+ * + * FIXME: this logic is incorrect for the buffer prior to the + * first call to advance_head, and will under-report the available + * space. */ size_t get_available_capacity() { return m_cap - (m_tail.load() - m_old_head.load()); @@ -205,7 +221,7 @@ private: size_t old_value = m_tail.load(); /* if full, fail to advance the tail */ - if (old_value >= m_hwm) { + if (old_value - m_head.load() >= m_hwm) { return -1; } @@ -236,6 +252,7 @@ private: * also match. */ if (head == buffer->m_old_head.load()) { + assert(buffer->m_old_head_refcnt > 0); buffer->m_old_head_refcnt.fetch_sub(1); /* * if the old head refcnt drops to 0, free @@ -251,6 +268,7 @@ private: buffer->m_old_head.store(buffer->m_head); } } else if (head == buffer->m_head.load()) { + assert(buffer->m_head_refcnt > 0); buffer->m_head_refcnt.fetch_sub(1); } } -- cgit v1.2.3 From b485685968c7ab626d98cc2a84a122d7ca3c68ce Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Mon, 15 Jan 2024 15:16:20 -0500 Subject: Use 16-byte CAS to control buffer head --- include/framework/structure/MutableBuffer.h | 120 ++++++++++++++-------------- 1 file changed, 59 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 5b655fc..eeb3dc9 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -36,16 +36,20 @@ namespace de { template class MutableBuffer { friend class BufferView; + + struct buffer_head { + size_t head_idx; + size_t refcnt; + }; + public: MutableBuffer(size_t low_watermark, size_t high_watermark, size_t capacity=0) : m_lwm(low_watermark) , m_hwm(high_watermark) , m_cap((capacity == 0) ? 2 * high_watermark : capacity) , m_tail(0) - , m_head(0) - , m_head_refcnt(0) - , m_old_head(high_watermark) - , m_old_head_refcnt(0) + , m_head({0, 0}) + , m_old_head({high_watermark, 0}) , m_data((Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped))) , m_tombstone_filter(new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS)) , m_tscnt(0) @@ -94,7 +98,7 @@ public: } size_t get_record_count() { - return m_tail - m_head; + return m_tail.load() - m_head.load().head_idx; } size_t get_capacity() { @@ -130,16 +134,17 @@ public: } BufferView get_buffer_view() { - m_head_refcnt.fetch_add(1); - auto f = std::bind(release_head_reference, (void *) this, m_head.load()); - return BufferView(m_data, m_cap, m_head.load(), m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); + size_t head = get_head(); + auto f = std::bind(release_head_reference, (void *) this, head); + + return BufferView(m_data, m_cap, head, m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); } BufferView get_flush_buffer_view() { - m_head_refcnt.fetch_add(1); - auto f = std::bind(release_head_reference, (void *) this, m_head.load()); - return BufferView(m_data, m_cap, m_head.load(), m_head.load() + m_lwm, m_tscnt.load(), m_tombstone_filter, f); + size_t head = get_head(); + auto f = std::bind(release_head_reference, (void *) this, head); + return BufferView(m_data, m_cap, head, head + m_lwm, m_tscnt.load(), m_tombstone_filter, f); } /* @@ -148,38 +153,41 @@ public: * assign new_head to old_head. 
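      * (more precisely: the previous head is retained as old_head until its
      * outstanding references are released, and new_head becomes the active
      * head)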
     */
    bool advance_head(size_t new_head) {
-        assert(new_head > m_head.load());
+        assert(new_head > m_head.load().head_idx);
         assert(new_head <= m_tail.load());

         /* refuse to advance the head while the old head still has outstanding references */
-        if (m_old_head_refcnt > 0) {
+        if (m_old_head.load().refcnt > 0) {
             fprintf(stderr, "[W]: Refusing to advance head due to remaining reference counts\n");
             return false;
         }

         m_active_head_advance.store(true);

-        /*
-         * the order here is very important. We first store zero to the
-         * old_refcnt (should be zero anyway). Then we move the current
-         * head to old head. At this point, any new buffer views should
-         * increment the old head refcnt, so no new references to the
-         * current head will be taken. Then we add the current head
-         * refcnt to this. This is to ensure that no references get
-         * dropped. Only after this do we change to the new head
-         */
-        m_old_head_refcnt.store(0);
-
-        m_old_head.store(m_head.load());
-        m_old_head_refcnt.fetch_add(m_head_refcnt);
+        buffer_head new_hd = {new_head, 0};
+        buffer_head cur_hd;

-        m_head_refcnt.store(0);
-        m_head.store(new_head);
+        /* move the current head into the old head */
+        do {
+            buffer_head cur_hd = m_head.load();
+            m_old_head.store(cur_hd);
+        } while(!m_head.compare_exchange_strong(cur_hd, new_hd));

         m_active_head_advance.store(false);
         return true;
     }

+    size_t get_head() {
+        buffer_head cur_hd, new_hd;
+
+        do {
+            cur_hd = m_head.load();
+            new_hd = {cur_hd.head_idx, cur_hd.refcnt + 1};
+        } while(!m_head.compare_exchange_strong(cur_hd, new_hd));
+
+        return new_hd.head_idx;
+    }
+
     void set_low_watermark(size_t lwm) {
         assert(lwm < m_hwm);
         m_lwm = lwm;
@@ -213,7 +221,7 @@
      * space.
      */
     size_t get_available_capacity() {
-        return m_cap - (m_tail.load() - m_old_head.load());
+        return m_cap - (m_tail.load() - m_old_head.load().head_idx);
     }

 private:
@@ -221,7 +229,7 @@ private:
         size_t old_value = m_tail.load();

         /* if full, fail to advance the tail */
-        if (old_value - m_head.load() >= m_hwm) {
+        if (old_value - m_head.load().head_idx >= m_hwm) {
             return -1;
         }

@@ -244,33 +252,26 @@ private:
     static void release_head_reference(void *buff, size_t head) {
         MutableBuffer *buffer = (MutableBuffer *) buff;

-        /*
-         * check old head first. During a head transition, the head being
-         * retired will first be assigned to *both* head and old_head. As
-         * a result, any refcnt updates during this time should be applied
-         * to old_head, even if the current head and the head being released
-         * also match.
- */ - if (head == buffer->m_old_head.load()) { - assert(buffer->m_old_head_refcnt > 0); - buffer->m_old_head_refcnt.fetch_sub(1); - /* - * if the old head refcnt drops to 0, free - * the records by setting old_head = head - * before this, spin while the two heads are equal to - * avoid - */ - while (buffer->m_active_head_advance.load()) { - _mm_pause(); - } - - if (buffer->m_old_head_refcnt.load() == 0) { - buffer->m_old_head.store(buffer->m_head); + buffer_head cur_hd, new_hd; + do { + if (buffer->m_head.load().head_idx == head) { + cur_hd = buffer->m_head; + assert(cur_hd.refcnt > 0); + new_hd = {cur_hd.head_idx, cur_hd.refcnt-1}; + + if (buffer->m_head.compare_exchange_strong(cur_hd, new_hd)) { + break; + } + } else { + cur_hd = buffer->m_old_head; + assert(cur_hd.refcnt > 0); + new_hd = {cur_hd.head_idx, cur_hd.refcnt-1}; + if (buffer->m_old_head.compare_exchange_strong(cur_hd, new_hd)) { + break; + } } - } else if (head == buffer->m_head.load()) { - assert(buffer->m_head_refcnt > 0); - buffer->m_head_refcnt.fetch_sub(1); - } + _mm_pause(); + } while(true); } size_t m_lwm; @@ -279,11 +280,8 @@ private: alignas(64) std::atomic m_tail; - alignas(64) std::atomic m_head; - alignas(64) std::atomic m_head_refcnt; - - alignas(64) std::atomic m_old_head; - alignas(64) std::atomic m_old_head_refcnt; + alignas(64) std::atomic m_head; + alignas(64) std::atomic m_old_head; Wrapped* m_data; psudb::BloomFilter* m_tombstone_filter; -- cgit v1.2.3 From 2117935e85412f3733ee0bcb1830c7fd0b129b29 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Mon, 15 Jan 2024 17:23:57 -0500 Subject: Concurrency testing and bug fixes --- include/framework/DynamicExtension.h | 21 +++++++++++++++------ include/framework/structure/BufferView.h | 10 +++++++++- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index bddc950..cb21ae3 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -28,6 +28,7 @@ #include "framework/scheduling/Epoch.h" + namespace de { template get_buffer(); - shards.emplace_back(new S(std::move(bv))); + if (bv.get_record_count() > 0) { + shards.emplace_back(new S(std::move(bv))); + } } if (vers->get_levels().size() > 0) { for (int i=vers->get_levels().size() - 1; i>= 0; i--) { - if (vers->get_levels()[i]) { + if (vers->get_levels()[i] && vers->get_levels()[i]->get_record_count() > 0) { shards.emplace_back(vers->get_levels()[i]->get_combined_shard()); } } @@ -426,14 +429,20 @@ private: Structure *vers = args->epoch->get_structure(); - // could be flushed at once here. - auto buffer_view = args->epoch->get_flush_buffer(); - size_t new_head = buffer_view.get_tail(); for (ssize_t i=0; imerges.size(); i++) { vers->reconstruction(args->merges[i].second, args->merges[i].first); } + /* + * we'll grab the buffer AFTER doing the internal reconstruction, so we can + * flush as many records as possible in one go. The reconstruction was done so + * as to make room for the full buffer anyway, so there's no real benefit to doing + * this first. 
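+     * (the view's tail is captured immediately below and becomes the
+     * buffer's new head once the flush completes)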
+ */ + auto buffer_view = args->epoch->get_buffer(); + size_t new_head = buffer_view.get_tail(); + /* * if performing a compaction, don't flush the buffer, as * there is no guarantee that any necessary reconstructions @@ -528,7 +537,7 @@ private: ReconstructionArgs *args = new ReconstructionArgs(); args->epoch = epoch; - args->merges = epoch->get_structure()->get_reconstruction_tasks(m_buffer->get_low_watermark()); + args->merges = epoch->get_structure()->get_reconstruction_tasks(m_buffer->get_high_watermark()); args->extension = this; args->compaction = false; /* NOTE: args is deleted by the reconstruction job, so shouldn't be freed here */ diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index c751786..099b7a2 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -105,7 +105,15 @@ public: } void copy_to_buffer(psudb::byte *buffer) { - memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped)); + /* check if the region to be copied circles back to start. If so, do it in two steps */ + if ((m_head % m_cap) + get_record_count() > m_cap) { + size_t split_idx = m_cap - (m_head % m_cap); + + memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), split_idx* sizeof(Wrapped)); + memcpy(buffer + split_idx, (std::byte*) m_data, (get_record_count() - split_idx) * sizeof(Wrapped)); + } else { + memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped)); + } } size_t get_tail() { -- cgit v1.2.3 From 138c793b0a58577713d98c98bb140cf1d9c79bee Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 17 Jan 2024 18:22:00 -0500 Subject: Multiple concurrency bug fixes A poorly organized commit with fixes for a variety of bugs that were causing missing records. The core problems all appear to be fixed, though there is an outstanding problem with tombstones not being completely canceled. A very small number are appearing in the wrong order during the static structure test. --- include/framework/DynamicExtension.h | 91 ++++++++++------------------- include/framework/interface/Shard.h | 5 +- include/framework/scheduling/Epoch.h | 51 ++++++++-------- include/framework/structure/BufferView.h | 2 +- include/framework/structure/InternalLevel.h | 26 ++++----- include/framework/structure/MutableBuffer.h | 58 +++++++++++------- include/query/rangequery.h | 5 +- include/shard/ISAMTree.h | 35 ++++++----- 8 files changed, 130 insertions(+), 143 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index cb21ae3..7590de2 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -49,7 +49,7 @@ public: , m_buffer(new Buffer(buffer_lwm, buffer_hwm)) { auto vers = new Structure(buffer_hwm, m_scale_factor, m_max_delete_prop); - auto epoch = new _Epoch(0, vers, m_buffer); + auto epoch = new _Epoch(0, vers, m_buffer, 0); m_versions.insert(vers); m_epochs.insert({0, epoch}); @@ -169,6 +169,15 @@ public: auto vers = epoch->get_structure(); std::vector shards; + + if (vers->get_levels().size() > 0) { + for (int i=vers->get_levels().size() - 1; i>= 0; i--) { + if (vers->get_levels()[i] && vers->get_levels()[i]->get_record_count() > 0) { + shards.emplace_back(vers->get_levels()[i]->get_combined_shard()); + } + } + } + /* * construct a shard from the buffer view. 
We'll hold the view * for as short a time as possible: once the records are exfiltrated @@ -182,24 +191,7 @@ public: } } - if (vers->get_levels().size() > 0) { - for (int i=vers->get_levels().size() - 1; i>= 0; i--) { - if (vers->get_levels()[i] && vers->get_levels()[i]->get_record_count() > 0) { - shards.emplace_back(vers->get_levels()[i]->get_combined_shard()); - } - } - } - - Shard *shards_array[shards.size()]; - - size_t j = 0; - for (size_t i=0; i) { - size_t old_buffer_cnt = new_epoch->clear_buffers(); - - /* - * skip the first buffer, as this was flushed into the epoch's - * structure already, and copy all the other buffer references - * into the new epoch - */ - for (size_t i=1; iget_buffers().size(); i++) { - new_epoch->add_buffer(old_epoch->get_buffers()[i]); - } - } - #endif + // FIXME: this may currently fail because there isn't any + // query preemption yet. At this point, we'd need to either + // 1) wait for all queries on the old_head to finish + // 2) kill all queries on the old_head + // 3) somehow migrate all queries on the old_head to the new + // version + auto res = new_epoch->advance_buffer_head(buffer_head); + assert(res); m_current_epoch.fetch_add(1); old_epoch->set_inactive(); @@ -425,40 +405,29 @@ private: } static void reconstruction(void *arguments) { - ReconstructionArgs *args = (ReconstructionArgs *) arguments; - + auto args = (ReconstructionArgs *) arguments; Structure *vers = args->epoch->get_structure(); - for (ssize_t i=0; imerges.size(); i++) { vers->reconstruction(args->merges[i].second, args->merges[i].first); } - /* - * we'll grab the buffer AFTER doing the internal reconstruction, so we can - * flush as many records as possible in one go. The reconstruction was done so - * as to make room for the full buffer anyway, so there's no real benefit to doing - * this first. + /* + * we'll grab the buffer AFTER doing the internal reconstruction, so we + * can flush as many records as possible in one go. The reconstruction + * was done so as to make room for the full buffer anyway, so there's + * no real benefit to doing this first. */ auto buffer_view = args->epoch->get_buffer(); size_t new_head = buffer_view.get_tail(); - /* - * if performing a compaction, don't flush the buffer, as - * there is no guarantee that any necessary reconstructions + /* + * if performing a compaction, don't flush the buffer, as + * there is no guarantee that any necessary reconstructions * will free sufficient space in L0 to support a flush */ if (!args->compaction) { vers->flush_buffer(std::move(buffer_view)); - - // FIXME: this may currently fail because there isn't any - // query preemption yet. 
At this point, we'd need to either - // 1) wait for all queries on the old_head to finish - // 2) kill all queries on the old_head - // 3) somehow migrate all queries on the old_head to the new - // version - auto res = args->epoch->advance_buffer_head(new_head); - assert(res); } args->epoch->end_job(); @@ -470,7 +439,7 @@ private: * part of a compaction */ if (!args->compaction) { - ((DynamicExtension *) args->extension)->advance_epoch(); + ((DynamicExtension *) args->extension)->advance_epoch(new_head); } ((DynamicExtension *) args->extension)->m_reconstruction_scheduled = false; diff --git a/include/framework/interface/Shard.h b/include/framework/interface/Shard.h index 2357795..8c4db34 100644 --- a/include/framework/interface/Shard.h +++ b/include/framework/interface/Shard.h @@ -12,6 +12,7 @@ #include "util/types.h" #include "framework/interface/Record.h" +#include namespace de { @@ -19,8 +20,8 @@ namespace de { // determining a good way to handle additional template arguments // to get the Record type into play template -concept ShardInterface = requires(S s, S **spp, void *p, bool b, size_t i) { - {S(spp, i)}; +concept ShardInterface = requires(S s, std::vector spp, void *p, bool b, size_t i) { + {S(spp)}; /* {S(mutable buffer)} {s.point_lookup(r, b) } -> std::convertible_to diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index b005ff6..45ee17d 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -32,15 +32,17 @@ public: , m_active_jobs(0) , m_active(true) , m_epoch_number(number) + , m_buffer_head(0) {} - Epoch(size_t number, Structure *structure, Buffer *buff) + Epoch(size_t number, Structure *structure, Buffer *buff, size_t head) : m_buffer(buff) , m_structure(structure) , m_active_jobs(0) , m_active_merge(false) , m_active(true) , m_epoch_number(number) + , m_buffer_head(head) { structure->take_reference(); } @@ -48,22 +50,21 @@ public: ~Epoch() { assert(m_active_jobs.load() == 0); - /* FIXME: this is needed to keep the destructor from - * sometimes locking up here. But there *shouldn't* be - * any threads waiting on this signal at object destruction, - * so something else is going on here that needs looked into + /* FIXME: this is needed to keep the destructor from sometimes locking + * up here. But there *shouldn't* be any threads waiting on this signal + * at object destruction, so something else is going on here that needs + * looked into */ - //m_active_cv.notify_all(); + // m_active_cv.notify_all(); if (m_structure) { m_structure->release_reference(); } } - - /* - * Epochs are *not* copyable or movable. Only one can exist, and all users of - * it work with pointers + /* + * Epochs are *not* copyable or movable. Only one can exist, and all users + * of it work with pointers */ Epoch(const Epoch&) = delete; Epoch(Epoch&&) = delete; @@ -97,23 +98,20 @@ public: } BufView get_buffer() { - return m_buffer->get_buffer_view(); - } - - BufView get_flush_buffer() { - return m_buffer->get_flush_buffer_view(); + return m_buffer->get_buffer_view(m_buffer_head); } - /* - * Returns a new Epoch object that is a copy of this one. The new object will also contain - * a copy of the m_structure, rather than a reference to the same one. The epoch number of - * the new epoch will be set to the provided argument. + * Returns a new Epoch object that is a copy of this one. The new object + * will also contain a copy of the m_structure, rather than a reference to + * the same one. 
The epoch number of the new epoch will be set to the + * provided argument. */ Epoch *clone(size_t number) { std::unique_lock m_buffer_lock; auto epoch = new Epoch(number); epoch->m_buffer = m_buffer; + epoch->m_buffer_head = m_buffer_head; if (m_structure) { epoch->m_structure = m_structure->copy(); @@ -125,12 +123,10 @@ public: } /* - * Check if a merge can be started from this Epoch. - * At present, without concurrent merging, this simply - * checks if there is currently a scheduled merge based - * on this Epoch. If there is, returns false. If there - * isn't, return true and set a flag indicating that - * there is an active merge. + * Check if a merge can be started from this Epoch. At present, without + * concurrent merging, this simply checks if there is currently a scheduled + * merge based on this Epoch. If there is, returns false. If there isn't, + * return true and set a flag indicating that there is an active merge. */ bool prepare_reconstruction() { auto old = m_active_merge.load(); @@ -176,7 +172,8 @@ public: } bool advance_buffer_head(size_t head) { - return m_buffer->advance_head(head); + m_buffer_head = head; + return m_buffer->advance_head(m_buffer_head); } private: @@ -187,7 +184,6 @@ private: std::mutex m_cv_lock; std::mutex m_buffer_lock; - std::atomic m_active_merge; /* @@ -199,5 +195,6 @@ private: std::atomic m_active_jobs; bool m_active; size_t m_epoch_number; + size_t m_buffer_head; }; } diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 099b7a2..30fffed 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -110,7 +110,7 @@ public: size_t split_idx = m_cap - (m_head % m_cap); memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), split_idx* sizeof(Wrapped)); - memcpy(buffer + split_idx, (std::byte*) m_data, (get_record_count() - split_idx) * sizeof(Wrapped)); + memcpy(buffer + (split_idx * sizeof(Wrapped)), (std::byte*) m_data, (get_record_count() - split_idx) * sizeof(Wrapped)); } else { memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped)); } diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index b35cadd..e9874e0 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -51,11 +51,10 @@ public: assert(base_level->m_level_no > new_level->m_level_no || (base_level->m_level_no == 0 && new_level->m_level_no == 0)); auto res = new InternalLevel(base_level->m_level_no, 1); res->m_shard_cnt = 1; - Shard* shards[2]; - shards[0] = base_level->m_shards[0].get(); - shards[1] = new_level->m_shards[0].get(); + std::vector shards = {base_level->m_shards[0].get(), + new_level->m_shards[0].get()}; - res->m_shards[0] = std::make_shared(shards, 2); + res->m_shards[0] = std::make_shared(shards); return std::shared_ptr(res); } @@ -75,17 +74,17 @@ public: return; } - Shard *shards[level->m_shard_cnt]; - for (size_t i=0; im_shard_cnt; i++) { - shards[i] = level->m_shards[i].get(); + std::vector shards; + for (auto shard : level->m_shards) { + if (shard) shards.emplace_back(shard.get()); } if (m_shard_cnt == m_shards.size()) { - m_pending_shard = new S(shards, level->m_shard_cnt); + m_pending_shard = new S(shards); return; } - auto tmp = new S(shards, level->m_shard_cnt); + auto tmp = new S(shards); m_shards[m_shard_cnt] = std::shared_ptr(tmp); ++m_shard_cnt; @@ -131,13 +130,12 @@ public: return nullptr; } - Shard *shards[m_shard_cnt]; - - 
for (size_t i=0; i shards; + for (auto shard : m_shards) { + if (shard) shards.emplace_back(shard.get()); } - return new S(shards, m_shard_cnt); + return new S(shards); } void get_query_states(std::vector> &shards, std::vector& shard_states, void *query_parms) { diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index eeb3dc9..7edde2f 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -133,18 +133,18 @@ public: return m_tombstone_filter->get_memory_usage(); } - BufferView get_buffer_view() { - size_t head = get_head(); + BufferView get_buffer_view(size_t target_head) { + size_t head = get_head(target_head); auto f = std::bind(release_head_reference, (void *) this, head); return BufferView(m_data, m_cap, head, m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); } - BufferView get_flush_buffer_view() { - size_t head = get_head(); + BufferView get_buffer_view() { + size_t head = get_head(m_head.load().head_idx); auto f = std::bind(release_head_reference, (void *) this, head); - return BufferView(m_data, m_cap, head, head + m_lwm, m_tscnt.load(), m_tombstone_filter, f); + return BufferView(m_data, m_cap, head, m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); } /* @@ -167,23 +167,39 @@ public: buffer_head new_hd = {new_head, 0}; buffer_head cur_hd; - /* move the current head into the old head */ + /* replace current head with new head */ do { - buffer_head cur_hd = m_head.load(); - m_old_head.store(cur_hd); + cur_hd = m_head.load(); } while(!m_head.compare_exchange_strong(cur_hd, new_hd)); + /* move the current head into the old head */ + m_old_head.store(cur_hd); + m_active_head_advance.store(false); return true; } - size_t get_head() { + /* + * FIXME: If target_head does not match *either* the old_head or the + * current_head, this routine will loop infinitely. 
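+     * Callers are expected to pass a head value read from either m_head
+     * or m_old_head (as get_buffer_view() does), so that the loop can
+     * terminate.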
+ */ + size_t get_head(size_t target_head) { buffer_head cur_hd, new_hd; + bool head_acquired = false; do { - cur_hd = m_head.load(); - new_hd = {cur_hd.head_idx, cur_hd.refcnt + 1}; - } while(!m_head.compare_exchange_strong(cur_hd, new_hd)); + if (m_old_head.load().head_idx == target_head) { + cur_hd = m_old_head.load(); + cur_hd.head_idx = target_head; + new_hd = {cur_hd.head_idx, cur_hd.refcnt + 1}; + head_acquired = m_old_head.compare_exchange_strong(cur_hd, new_hd); + } else if (m_head.load().head_idx == target_head){ + cur_hd = m_head.load(); + cur_hd.head_idx = target_head; + new_hd = {cur_hd.head_idx, cur_hd.refcnt + 1}; + head_acquired = m_head.compare_exchange_strong(cur_hd, new_hd); + } + } while(!head_acquired); return new_hd.head_idx; } @@ -254,22 +270,22 @@ private: buffer_head cur_hd, new_hd; do { - if (buffer->m_head.load().head_idx == head) { - cur_hd = buffer->m_head; - assert(cur_hd.refcnt > 0); + if (buffer->m_old_head.load().head_idx == head) { + cur_hd = buffer->m_old_head; + if (cur_hd.refcnt == 0) continue; new_hd = {cur_hd.head_idx, cur_hd.refcnt-1}; - - if (buffer->m_head.compare_exchange_strong(cur_hd, new_hd)) { + if (buffer->m_old_head.compare_exchange_strong(cur_hd, new_hd)) { break; } } else { - cur_hd = buffer->m_old_head; - assert(cur_hd.refcnt > 0); + cur_hd = buffer->m_head; + if (cur_hd.refcnt == 0) continue; new_hd = {cur_hd.head_idx, cur_hd.refcnt-1}; - if (buffer->m_old_head.compare_exchange_strong(cur_hd, new_hd)) { + + if (buffer->m_head.compare_exchange_strong(cur_hd, new_hd)) { break; } - } + } _mm_pause(); } while(true); } diff --git a/include/query/rangequery.h b/include/query/rangequery.h index ad5b767..c44f5d7 100644 --- a/include/query/rangequery.h +++ b/include/query/rangequery.h @@ -84,11 +84,11 @@ public: * roll the pointer forward to the first record that is * greater than or equal to the lower bound. 
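          * (the scan below is additionally bounded by stop_idx, so it cannot
          * run past the end of the shard's data)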
*/ - while(ptr->rec.key < p->lower_bound) { + while(ptr < shard->get_data() + s->stop_idx && ptr->rec.key < p->lower_bound) { ptr++; } - while (ptr->rec.key <= p->upper_bound && ptr < shard->get_data() + s->stop_idx) { + while (ptr < shard->get_data() + s->stop_idx && ptr->rec.key <= p->upper_bound) { records.emplace_back(*ptr); ptr++; } @@ -152,6 +152,7 @@ public: } else { auto& cursor = cursors[tmp_n - now.version - 1]; if (!now.data->is_tombstone()) output.push_back(cursor.ptr->rec); + pq.pop(); if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); diff --git a/include/shard/ISAMTree.h b/include/shard/ISAMTree.h index 6b2f6b5..932e767 100644 --- a/include/shard/ISAMTree.h +++ b/include/shard/ISAMTree.h @@ -62,10 +62,13 @@ public: { TIMER_INIT(); - m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, buffer.get_record_count() * sizeof(Wrapped), (byte**) &m_data); + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + buffer.get_record_count() * + sizeof(Wrapped), + (byte**) &m_data); TIMER_START(); - auto temp_buffer = (Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, buffer.get_record_count() * sizeof(Wrapped)); + auto temp_buffer = (Wrapped *) psudb::sf_aligned_calloc(CACHELINE_SIZE, buffer.get_record_count(), sizeof(Wrapped)); buffer.copy_to_buffer((byte *) temp_buffer); auto base = temp_buffer; @@ -99,6 +102,7 @@ public: base++; } + TIMER_STOP(); auto copy_time = TIMER_RESULT(); @@ -112,7 +116,7 @@ public: free(temp_buffer); } - ISAMTree(ISAMTree** runs, size_t len) + ISAMTree(std::vector &shards) : m_bf(nullptr) , m_isam_nodes(nullptr) , m_root(nullptr) @@ -124,19 +128,19 @@ public: , m_data(nullptr) { std::vector>> cursors; - cursors.reserve(len); + cursors.reserve(shards.size()); - PriorityQueue> pq(len); + PriorityQueue> pq(shards.size()); size_t attemp_reccnt = 0; size_t tombstone_count = 0; - for (size_t i = 0; i < len; ++i) { - if (runs[i]) { - auto base = runs[i]->get_data(); - cursors.emplace_back(Cursor{base, base + runs[i]->get_record_count(), 0, runs[i]->get_record_count()}); - attemp_reccnt += runs[i]->get_record_count(); - tombstone_count += runs[i]->get_tombstone_count(); + for (size_t i = 0; i < shards.size(); ++i) { + if (shards[i]) { + auto base = shards[i]->get_data(); + cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); + attemp_reccnt += shards[i]->get_record_count(); + tombstone_count += shards[i]->get_tombstone_count(); pq.push(cursors[i].ptr, i); } else { cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); @@ -144,10 +148,9 @@ public: } m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); - - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + attemp_reccnt * sizeof(Wrapped), + (byte **) &m_data); while (pq.size()) { auto now = pq.peek(); @@ -165,6 +168,8 @@ public: if (!cursor.ptr->is_deleted()) { m_data[m_reccnt++] = *cursor.ptr; if (cursor.ptr->is_tombstone()) { + //fprintf(stderr, "ISAM: Tombstone from shard %ld next record from shard %ld\n", + //now.version, next.version); ++m_tombstone_cnt; m_bf->insert(cursor.ptr->rec); } -- cgit v1.2.3 From 38693c342558628c75e0ab0d23c32a95a499ed8b Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 19 Jan 2024 15:58:04 -0500 Subject: Initial rough-out of internal statistics tracker 
Need to figure out the best way to do the detailed tracking in a concurrent manner. I was thinking just an event log, with parsing routines for extracting statistics. But that'll be pretty slow. --- include/framework/DynamicExtension.h | 15 +++- include/framework/interface/Scheduler.h | 3 +- include/framework/scheduling/FIFOScheduler.h | 28 ++++---- include/framework/scheduling/SerialScheduler.h | 27 +++----- include/framework/scheduling/Task.h | 23 +++++- include/framework/scheduling/statistics.h | 96 ++++++++++++++++++++++++++ 6 files changed, 157 insertions(+), 35 deletions(-) create mode 100644 include/framework/scheduling/statistics.h (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 7590de2..89ee30f 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -40,6 +40,10 @@ class DynamicExtension { typedef Epoch _Epoch; typedef BufferView BufView; + static constexpr size_t QUERY = 1; + static constexpr size_t RECONSTRUCTION = 2; + + public: DynamicExtension(size_t buffer_lwm, size_t buffer_hwm, size_t scale_factor, size_t memory_budget=0, size_t thread_cnt=16) @@ -226,6 +230,11 @@ public: return t; } + + void print_scheduler_statistics() { + m_sched.print_statistics(); + } + private: SCHED m_sched; @@ -271,7 +280,7 @@ private: */ epoch->start_job(); - m_sched.schedule_job(reconstruction, 0, args); + m_sched.schedule_job(reconstruction, 0, args, RECONSTRUCTION); /* wait for compaction completion */ wait.get(); @@ -511,7 +520,7 @@ private: args->compaction = false; /* NOTE: args is deleted by the reconstruction job, so shouldn't be freed here */ - m_sched.schedule_job(reconstruction, 0, args); + m_sched.schedule_job(reconstruction, 0, args, RECONSTRUCTION); } std::future> schedule_query(void *query_parms) { @@ -522,7 +531,7 @@ private: args->query_parms = query_parms; auto result = args->result_set.get_future(); - m_sched.schedule_job(async_query, 0, args); + m_sched.schedule_job(async_query, 0, args, QUERY); return result; } diff --git a/include/framework/interface/Scheduler.h b/include/framework/interface/Scheduler.h index 94afe6c..451ddd2 100644 --- a/include/framework/interface/Scheduler.h +++ b/include/framework/interface/Scheduler.h @@ -13,6 +13,7 @@ template concept SchedulerInterface = requires(S s, size_t i, void *vp, de::Job j) { {S(i, i)}; - {s.schedule_job(j, i, vp)} -> std::convertible_to; + {s.schedule_job(j, i, vp, i)} -> std::convertible_to; {s.shutdown()}; + {s.print_statistics()}; }; diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h index 4cdc436..513a3a2 100644 --- a/include/framework/scheduling/FIFOScheduler.h +++ b/include/framework/scheduling/FIFOScheduler.h @@ -8,21 +8,11 @@ */ #pragma once -#include -#include -#include #include #include -#include - -#include "util/types.h" -#include "framework/interface/Shard.h" -#include "framework/interface/Query.h" -#include "framework/interface/Record.h" -#include "framework/structure/MutableBuffer.h" -#include "framework/util/Configuration.h" -#include "framework/structure/ExtensionStructure.h" + #include "framework/scheduling/Task.h" +#include "framework/scheduling/statistics.h" #include "ctpl/ctpl.h" #include "psu-ds/LockedPriorityQueue.h" @@ -54,10 +44,12 @@ public: m_sched_thrd.join(); } - void schedule_job(std::function job, size_t size, void *args) { + void schedule_job(std::function job, size_t size, void *args, size_t type=0) { std::unique_lock lk(m_cv_lock); 
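        /* the monotonically increasing counter gives each task its
         * ordering key, so the priority queue dispatches jobs in FIFO
         * order */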
size_t ts = m_counter.fetch_add(1); - m_task_queue.push(Task(size, ts, job, args)); + + m_stats.job_queued(ts, type, size); + m_task_queue.push(Task(size, ts, job, args, type, &m_stats)); m_cv.notify_all(); } @@ -68,6 +60,10 @@ public: m_cv.notify_all(); } + void print_statistics() { + m_stats.print_statistics(); + } + private: psudb::LockedPriorityQueue m_task_queue; @@ -86,9 +82,13 @@ private: std::atomic m_used_thrds; std::atomic m_used_memory; + SchedulerStatistics m_stats; + void schedule_next() { assert(m_task_queue.size() > 0); auto t = m_task_queue.pop(); + m_stats.job_scheduled(t.m_timestamp); + m_thrd_pool.push(t); } diff --git a/include/framework/scheduling/SerialScheduler.h b/include/framework/scheduling/SerialScheduler.h index 10c2af2..ac59301 100644 --- a/include/framework/scheduling/SerialScheduler.h +++ b/include/framework/scheduling/SerialScheduler.h @@ -14,21 +14,8 @@ */ #pragma once -#include -#include -#include -#include -#include -#include - -#include "util/types.h" -#include "framework/interface/Shard.h" -#include "framework/interface/Query.h" -#include "framework/interface/Record.h" -#include "framework/structure/MutableBuffer.h" -#include "framework/util/Configuration.h" -#include "framework/structure/ExtensionStructure.h" #include "framework/scheduling/Task.h" +#include "framework/scheduling/statistics.h" namespace de { @@ -44,9 +31,11 @@ public: ~SerialScheduler() = default; - void schedule_job(std::function job, size_t size, void *args) { + void schedule_job(std::function job, size_t size, void *args, size_t type=0) { size_t ts = m_counter++; - auto t = Task(size, ts, job, args); + m_stats.job_queued(ts, type, size); + m_stats.job_scheduled(ts); + auto t = Task(size, ts, job, args, type, &m_stats); t(0); } @@ -54,6 +43,10 @@ public: /* intentionally left blank */ } + void print_statistics() { + m_stats.print_statistics(); + } + private: size_t m_memory_budget; size_t m_thrd_cnt; @@ -62,6 +55,8 @@ private: size_t m_used_memory; size_t m_counter; + + SchedulerStatistics m_stats; }; } diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index 16f5e58..b14b229 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -10,9 +10,11 @@ #include #include +#include #include "framework/util/Configuration.h" #include "framework/scheduling/Epoch.h" +#include "framework/scheduling/statistics.h" namespace de { @@ -35,17 +37,21 @@ struct QueryArgs { typedef std::function Job; struct Task { - Task(size_t size, size_t ts, Job job, void *args) + Task(size_t size, size_t ts, Job job, void *args, size_t type=0, SchedulerStatistics *stats=nullptr) : m_job(job) , m_size(size) , m_timestamp(ts) , m_args(args) + , m_type(type) + , m_stats(stats) {} Job m_job; size_t m_size; size_t m_timestamp; void *m_args; + size_t m_type; + SchedulerStatistics *m_stats; friend bool operator<(const Task &self, const Task &other) { return self.m_timestamp < other.m_timestamp; @@ -56,7 +62,22 @@ struct Task { } void operator()(size_t thrd_id) { + auto start = std::chrono::high_resolution_clock::now(); + if (m_stats) { + m_stats->job_begin(m_timestamp); + } + m_job(m_args); + + if (m_stats) { + m_stats->job_complete(m_timestamp); + } + auto stop = std::chrono::high_resolution_clock::now(); + + if (m_stats) { + auto time = std::chrono::duration_cast(stop - start).count(); + m_stats->log_time_data(time, m_type); + } } }; diff --git a/include/framework/scheduling/statistics.h b/include/framework/scheduling/statistics.h new file mode 100644 
index 0000000..8466ffc --- /dev/null +++ b/include/framework/scheduling/statistics.h @@ -0,0 +1,96 @@ +/* + * include/framework/scheduling/statistics.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * Distributed under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace de { + +class SchedulerStatistics { +private: + enum class EventType { + QUEUED, + SCHEDULED, + STARTED, + FINISHED + }; + + struct Event { + size_t id; + EventType type; + }; + + struct JobInfo { + size_t id; + size_t size; + size_t type; + }; + + +public: + SchedulerStatistics() = default; + ~SchedulerStatistics() = default; + + void job_queued(size_t id, size_t type, size_t size) { + auto time = std::chrono::high_resolution_clock::now(); + } + + void job_scheduled(size_t id) { + std::unique_lock lk(m_mutex); + + } + + void job_begin(size_t id) { + + } + + void job_complete(size_t id) { + + } + + /* FIXME: This is just a temporary approach */ + void log_time_data(size_t length, size_t type) { + assert(type == 1 || type == 2); + + if (type == 1) { + m_type_1_cnt.fetch_add(1); + m_type_1_total_time.fetch_add(length); + } else { + m_type_2_cnt.fetch_add(1); + m_type_2_total_time.fetch_add(length); + } + } + + void print_statistics() { + fprintf(stdout, "Query Count: %ld\tQuery Avg. Latency: %ld\n", + m_type_1_cnt.load(), + m_type_1_total_time.load() / m_type_1_cnt.load()); + fprintf(stdout, "Reconstruction Count: %ld\tReconstruction Avg. Latency: %ld\n", + m_type_2_cnt.load(), + m_type_2_total_time.load() / m_type_2_cnt.load()); + } + +private: + std::mutex m_mutex; + std::unordered_map m_jobs; + std::vector m_event_log; + + std::atomic m_type_1_cnt; + std::atomic m_type_1_total_time; + + std::atomic m_type_2_cnt; + std::atomic m_type_2_total_time; +}; +} -- cgit v1.2.3 From 4ac2e14d24a1fdd3f9bf777775b16bf6a677f487 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 22 Jan 2024 10:14:05 -0500 Subject: Added RangeCount query --- include/query/rangecount.h | 169 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 include/query/rangecount.h (limited to 'include') diff --git a/include/query/rangecount.h b/include/query/rangecount.h new file mode 100644 index 0000000..7d88b1d --- /dev/null +++ b/include/query/rangecount.h @@ -0,0 +1,169 @@ +/* + * include/query/rangecount.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * Distributed under the Modified BSD License. + * + * A query class for single dimensional range count queries. This query + * requires that the shard support get_lower_bound(key) and + * get_record_at(index). 
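+ *
+ * The result is returned through a single wrapped record: its key field
+ * accumulates the number of live records falling in the range, and its
+ * value field the number of tombstones, which the caller can net against
+ * the record count.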
+ */
+#pragma once
+
+#include "framework/interface/Record.h"
+#include "framework/interface/Shard.h"
+#include "framework/structure/BufferView.h"
+#include "psu-ds/PriorityQueue.h"
+#include "util/Cursor.h"
+
+namespace de { namespace rc {
+
+template
+struct Parms {
+    decltype(R::key) lower_bound;
+    decltype(R::key) upper_bound;
+};
+
+template
+struct State {
+    size_t start_idx;
+    size_t stop_idx;
+};
+
+template
+struct BufferState {
+    BufferView buffer;
+
+    BufferState(BufferView buffer)
+        : buffer(std::move(buffer)) {}
+};
+
+template
+class Query {
+public:
+    constexpr static bool EARLY_ABORT=false;
+    constexpr static bool SKIP_DELETE_FILTER=true;
+
+    static void *get_query_state(S *shard, void *parms) {
+        auto res = new State();
+        auto p = (Parms *) parms;
+
+        res->start_idx = shard->get_lower_bound(p->lower_bound);
+        res->stop_idx = shard->get_record_count();
+
+        return res;
+    }
+
+    static void* get_buffer_query_state(BufferView buffer, void *parms) {
+        auto res = new BufferState(std::move(buffer));
+
+        return res;
+    }
+
+    static void process_query_states(void *query_parms, std::vector &shard_states, void* buffer_states) {
+        return;
+    }
+
+    static std::vector> query(S *shard, void *q_state, void *parms) {
+        std::vector> records;
+        auto p = (Parms *) parms;
+        auto s = (State *) q_state;
+
+        size_t reccnt = 0;
+        size_t tscnt = 0;
+
+        Wrapped res;
+        res.rec.key = 0;   // records
+        res.rec.value = 0; // tombstones
+        records.emplace_back(res);
+
+        /*
+         * if the returned index is one past the end of the
+         * records for the PGM, then there are no records
+         * in the index falling into the specified range.
+         */
+        if (s->start_idx == shard->get_record_count()) {
+            return records;
+        }
+
+        auto ptr = shard->get_record_at(s->start_idx);
+
+        /*
+         * roll the pointer forward to the first record that is
+         * greater than or equal to the lower bound.
+         */
+        while(ptr < shard->get_data() + s->stop_idx && ptr->rec.key < p->lower_bound) {
+            ptr++;
+        }
+
+        while (ptr < shard->get_data() + s->stop_idx && ptr->rec.key <= p->upper_bound) {
+            if (!ptr->is_deleted()) {
+                if (ptr->is_tombstone()) {
+                    records[0].rec.value++;
+                } else {
+                    records[0].rec.key++;
+                }
+            }
+
+            ptr++;
+        }
+
+        return records;
+    }
+
+    static std::vector> buffer_query(void *state, void *parms) {
+        auto p = (Parms *) parms;
+        auto s = (BufferState *) state;
+
+        std::vector> records;
+
+        Wrapped res;
+        res.rec.key = 0;   // records
+        res.rec.value = 0; // tombstones
+        records.emplace_back(res);
+
+        for (size_t i=0; ibuffer.get_record_count(); i++) {
+            auto rec = s->buffer.get(i);
+            if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound
+                && !rec->is_deleted()) {
+                if (rec->is_tombstone()) {
+                    records[0].rec.value++;
+                } else {
+                    records[0].rec.key++;
+                }
+            }
+        }
+
+        return records;
+    }
+
+    static std::vector merge(std::vector>> &results, void *parms) {
+
+        R res;
+        res.key = 0;
+        res.value = 0;
+        std::vector output;
+        output.emplace_back(res);
+
+        for (size_t i=0; i<results.size(); i++) {
+            output[0].key += results[i][0].rec.key;
+            output[0].value += results[i][0].rec.value;
+        }
+
+        return output;
+    }
+
+    static void delete_query_state(void *state) {
+        auto s = (State<R> *) state;
+        delete s;
+    }
+
+    static void delete_buffer_query_state(void *state) {
+        auto s = (BufferState *) state;
+        delete s;
+    }
+};
+
+}}
-- 
cgit v1.2.3

From 4d0d26bfef684566a371ca7c87ba84df52f25ccc Mon Sep 17 00:00:00 2001
From: Douglas Rumbaugh
Date: Mon, 22 Jan 2024 10:42:58 -0500
Subject: FIFOScheduler: added automatic wakeup

Sometimes, when the max thread count is exceeded, it is possible for
the scheduler to lock up.
This is because the scheduler only runs when a new job is put into the
queue, so a job that is blocked by the thread limit can be left sitting
in the queue. If the main program is waiting on this job to finish
before scheduling a new one, then the system deadlocks. To resolve this
and prevent these deadlocks, I added a second background thread that
wakes the scheduler up every 10us.
---
 include/framework/scheduling/FIFOScheduler.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h
index 513a3a2..b77a8a1 100644
--- a/include/framework/scheduling/FIFOScheduler.h
+++ b/include/framework/scheduling/FIFOScheduler.h
@@ -10,7 +10,7 @@

 #include
 #include
-
+#include
 #include "framework/scheduling/Task.h"
 #include "framework/scheduling/statistics.h"

@@ -19,6 +19,8 @@

 namespace de {

+using namespace std::literals::chrono_literals;
+
 class FIFOScheduler {
 private:

@@ -33,6 +35,7 @@ public:
         , m_shutdown(false)
     {
         m_sched_thrd = std::thread(&FIFOScheduler::run, this);
+        m_sched_wakeup_thrd = std::thread(&FIFOScheduler::periodic_wakeup, this);
         m_thrd_pool.resize(m_thrd_cnt);
     }

@@ -77,6 +80,7 @@ private:
     std::condition_variable m_cv;

     std::thread m_sched_thrd;
+    std::thread m_sched_wakeup_thrd;
     ctpl::thread_pool m_thrd_pool;

     std::atomic m_used_thrds;
@@ -84,6 +88,13 @@ private:

     SchedulerStatistics m_stats;

+    void periodic_wakeup() {
+        do {
+            std::this_thread::sleep_for(10us);
+            m_cv.notify_all();
+        } while (!m_shutdown.load());
+    }
+
     void schedule_next() {
         assert(m_task_queue.size() > 0);
         auto t = m_task_queue.pop();
-- 
cgit v1.2.3

From b1e4182825e6c162571b7cc4efaf8bc44055b49c Mon Sep 17 00:00:00 2001
From: Douglas Rumbaugh
Date: Mon, 22 Jan 2024 12:10:54 -0500
Subject: Adjusted recon_benchmark and properly shut down FIFOScheduler

---
 include/framework/scheduling/FIFOScheduler.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h
index b77a8a1..0df4d3c 100644
--- a/include/framework/scheduling/FIFOScheduler.h
+++ b/include/framework/scheduling/FIFOScheduler.h
@@ -45,6 +45,7 @@ public:
         }

         m_sched_thrd.join();
+        m_sched_wakeup_thrd.join();
     }

     void schedule_job(std::function job, size_t size, void *args, size_t type=0) {
-- 
cgit v1.2.3

From f0a55f7996e9ea2c7824fd5ab136b7c1864bbcdd Mon Sep 17 00:00:00 2001
From: Douglas Rumbaugh
Date: Wed, 24 Jan 2024 11:18:25 -0500
Subject: DynamicExtension: Fixed reconstruction trigger data race

Tweak the reconstruction trigger code to ensure that multiple
reconstructions won't be triggered at the same time.
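In outline, the guard is a standard compare-and-swap pattern (a minimal
sketch rather than the verbatim patch; the member names follow this
commit, and the flag is assumed to be reset by the reconstruction job
once it completes):

    std::atomic<bool> m_reconstruction_scheduled{false};

    int internal_append(const R &rec, bool ts) {
        if (m_buffer->is_at_low_watermark()) {
            bool expected = false;

            /*
             * only the thread that flips the flag false -> true schedules
             * the reconstruction; concurrent callers fail the exchange
             * and simply proceed with their append
             */
            if (m_reconstruction_scheduled.compare_exchange_strong(expected, true)) {
                schedule_reconstruction();
            }
        }

        /* this will fail and return 0 if the high watermark has been reached */
        return m_buffer->append(rec, ts);
    }

Unlike the earlier load-then-store sequence, the exchange makes the check
and the update a single atomic step, so two threads can no longer both
observe the flag as unset and schedule duplicate reconstructions.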
--- include/framework/DynamicExtension.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 89ee30f..40f137c 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -451,7 +451,7 @@ private: ((DynamicExtension *) args->extension)->advance_epoch(new_head); } - ((DynamicExtension *) args->extension)->m_reconstruction_scheduled = false; + ((DynamicExtension *) args->extension)->m_reconstruction_scheduled.store(false); delete args; } @@ -537,9 +537,12 @@ private: } int internal_append(const R &rec, bool ts) { - if (!m_reconstruction_scheduled.load() && m_buffer->is_at_low_watermark()) { - m_reconstruction_scheduled.store(true); - schedule_reconstruction(); + if (m_buffer->is_at_low_watermark()) { + auto old = false; + + if (m_reconstruction_scheduled.compare_exchange_strong(old, true)) { + schedule_reconstruction(); + } } /* this will fail if the HWM is reached and return 0 */ -- cgit v1.2.3 From f24fdf2fd310a5f868e15cd9682ca37d740c77af Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 30 Jan 2024 15:31:03 -0500 Subject: Benchmarking updates --- include/framework/scheduling/statistics.h | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/framework/scheduling/statistics.h b/include/framework/scheduling/statistics.h index 8466ffc..50ba196 100644 --- a/include/framework/scheduling/statistics.h +++ b/include/framework/scheduling/statistics.h @@ -67,19 +67,33 @@ public: if (type == 1) { m_type_1_cnt.fetch_add(1); m_type_1_total_time.fetch_add(length); + + if (length > m_type_1_largest_time) { + m_type_1_largest_time.store(length); + } } else { m_type_2_cnt.fetch_add(1); m_type_2_total_time.fetch_add(length); + + if (length > m_type_2_largest_time) { + m_type_2_largest_time.store(length); + } } } void print_statistics() { - fprintf(stdout, "Query Count: %ld\tQuery Avg. Latency: %ld\n", - m_type_1_cnt.load(), - m_type_1_total_time.load() / m_type_1_cnt.load()); - fprintf(stdout, "Reconstruction Count: %ld\tReconstruction Avg. Latency: %ld\n", - m_type_2_cnt.load(), - m_type_2_total_time.load() / m_type_2_cnt.load()); + if (m_type_1_cnt > 0) { + fprintf(stdout, "Query Count: %ld\tQuery Avg. Latency: %ld\tMax Query Latency: %ld\n", + m_type_1_cnt.load(), + m_type_1_total_time.load() / m_type_1_cnt.load(), + m_type_1_largest_time.load()); + } + if (m_type_2_cnt > 0) { + fprintf(stdout, "Reconstruction Count: %ld\tReconstruction Avg. Latency: %ld\tMax Recon. 
Latency:%ld\n", + m_type_2_cnt.load(), + m_type_2_total_time.load() / m_type_2_cnt.load(), + m_type_2_largest_time.load()); + } } private: @@ -92,5 +106,8 @@ private: std::atomic m_type_2_cnt; std::atomic m_type_2_total_time; + + std::atomic m_type_1_largest_time; + std::atomic m_type_2_largest_time; }; } -- cgit v1.2.3 From 51a85013236f4b2bd596caf179d90e67c848963c Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 30 Jan 2024 15:31:34 -0500 Subject: TrieSpline + tests --- include/shard/TrieSpline.h | 151 +++++++++++++++++++++++++++------------------ 1 file changed, 90 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/shard/TrieSpline.h b/include/shard/TrieSpline.h index 56ec357..8142a67 100644 --- a/include/shard/TrieSpline.h +++ b/include/shard/TrieSpline.h @@ -12,10 +12,6 @@ #include -#include -#include -#include -#include #include "framework/ShardRequirements.h" #include "ts/builder.h" @@ -23,62 +19,64 @@ #include "util/Cursor.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" +#include "psu-util/timer.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; using psudb::PriorityQueue; using psudb::queue_record; -using psudb::Alias; namespace de { -template +template class TrieSpline { private: typedef decltype(R::key) K; typedef decltype(R::value) V; public: - TrieSpline(MutableBuffer* buffer) - : m_reccnt(0), m_tombstone_cnt(0) { - - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); - - size_t offset = 0; - m_reccnt = 0; - auto base = buffer->get_data(); - auto stop = base + buffer->get_record_count(); - + TrieSpline(BufferView buffer) + : m_data(nullptr) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_alloc_size(0) + , m_max_key(0) + , m_min_key(0) + , m_bf(new BloomFilter(BF_FPR, buffer.get_tombstone_count(), BF_HASH_FUNCS)) + { + TIMER_INIT(); + + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + buffer.get_record_count() * + sizeof(Wrapped), + (byte**) &m_data); + + TIMER_START(); + auto temp_buffer = (Wrapped *) psudb::sf_aligned_calloc(CACHELINE_SIZE, buffer.get_record_count(), sizeof(Wrapped)); + buffer.copy_to_buffer((byte *) temp_buffer); + + auto base = temp_buffer; + auto stop = base + buffer.get_record_count(); std::sort(base, stop, std::less>()); K min_key = base->rec.key; - K max_key = (stop - 1)->rec.key; + K max_key = (stop-1)->rec.key; + TIMER_STOP(); - auto bldr = ts::Builder(min_key, max_key, E); + auto sort_time = TIMER_RESULT(); + TIMER_START(); + auto bldr = ts::Builder(min_key, max_key, E); while (base < stop) { - if (!(base->is_tombstone()) && (base + 1) < stop) { - if (base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - continue; - } + if (!base->is_tombstone() && (base + 1 < stop) + && base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { + base += 2; + continue; } else if (base->is_deleted()) { base += 1; continue; } - if (m_reccnt == 0) { - m_max_key = m_min_key = base->rec.key; - } else if (base->rec.key > m_max_key) { - m_max_key = base->rec.key; - } else if (base->rec.key < m_min_key) { - m_min_key = base->rec.key; - } - // FIXME: this shouldn't be necessary, but the tagged record // bypass doesn't seem to be working on this code-path, so this // ensures that tagged records from 
the buffer are able to be @@ -86,37 +84,67 @@ public: base->header &= 3; m_data[m_reccnt++] = *base; bldr.AddKey(base->rec.key); - if (m_bf && base->is_tombstone()) { - m_tombstone_cnt++; + ++m_tombstone_cnt; m_bf->insert(base->rec); } - + + /* + * determine the "true" min/max keys based on the scan. This is + * to avoid situations where the min/max in the input array + * are deleted and don't survive into the structure itself. + */ + if (m_reccnt == 0) { + m_max_key = m_min_key = base->rec.key; + } else if (base->rec.key > m_max_key) { + m_max_key = base->rec.key; + } else if (base->rec.key < m_min_key) { + m_min_key = base->rec.key; + } + base++; } + TIMER_STOP(); + auto copy_time = TIMER_RESULT(); + + TIMER_START(); if (m_reccnt > 0) { m_ts = bldr.Finalize(); } + TIMER_STOP(); + auto level_time = TIMER_RESULT(); + + free(temp_buffer); } - TrieSpline(TrieSpline** shards, size_t len) - : m_reccnt(0), m_tombstone_cnt(0) { + TrieSpline(std::vector &shards) + : m_data(nullptr) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_alloc_size(0) + , m_max_key(0) + , m_min_key(0) + , m_bf(nullptr) + { + std::vector>> cursors; - cursors.reserve(len); + cursors.reserve(shards.size()); - PriorityQueue> pq(len); + PriorityQueue> pq(shards.size()); size_t attemp_reccnt = 0; size_t tombstone_count = 0; - // initialize m_max_key and m_min_key using the values from the - // first shard. These will later be updated when building - // the initial priority queue to their true values. + /* + * Initialize m_max_key and m_min_key using the values from the + * first shard. These will later be updated when building + * the initial priority queue to their true values. + */ m_max_key = shards[0]->m_max_key; m_min_key = shards[0]->m_min_key; - for (size_t i = 0; i < len; ++i) { + for (size_t i = 0; i < shards.size(); ++i) { if (shards[i]) { auto base = shards[i]->get_data(); cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); @@ -137,12 +165,11 @@ public: } m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); - auto bldr = ts::Builder(m_min_key, m_max_key, E); - - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + attemp_reccnt * sizeof(Wrapped), + (byte **) &m_data); + auto bldr = ts::Builder(m_min_key, m_max_key, E); while (pq.size()) { auto now = pq.peek(); auto next = pq.size() > 1 ? 
pq.peek(1) : queue_record>{nullptr, 0}; @@ -152,33 +179,32 @@ public: pq.pop(); pq.pop(); auto& cursor1 = cursors[now.version]; auto& cursor2 = cursors[next.version]; - if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); + if (advance_cursor(cursor1)) pq.push(cursor1.ptr, now.version); + if (advance_cursor(cursor2)) pq.push(cursor2.ptr, next.version); } else { auto& cursor = cursors[now.version]; if (!cursor.ptr->is_deleted()) { m_data[m_reccnt++] = *cursor.ptr; bldr.AddKey(cursor.ptr->rec.key); - if (m_bf && cursor.ptr->is_tombstone()) { + if (cursor.ptr->is_tombstone()) { ++m_tombstone_cnt; - if (m_bf) m_bf->insert(cursor.ptr->rec); + m_bf->insert(cursor.ptr->rec); } } pq.pop(); - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); + if (advance_cursor(cursor)) pq.push(cursor.ptr, now.version); } } if (m_reccnt > 0) { m_ts = bldr.Finalize(); } - } + } ~TrieSpline() { - if (m_data) free(m_data); - if (m_bf) delete m_bf; - + free(m_data); + delete m_bf; } Wrapped *point_lookup(const R &rec, bool filter=false) { @@ -253,14 +279,17 @@ public: max = mid; } } + } + if (idx == m_reccnt) { + return m_reccnt; } if (m_data[idx].rec.key > key && idx > 0 && m_data[idx-1].rec.key <= key) { return idx-1; } - return (m_data[idx].rec.key <= key) ? idx : m_reccnt; + return idx; } private: -- cgit v1.2.3 From d166465dcca3550cb8f3263e0f5b5189a69d531a Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 31 Jan 2024 13:29:49 -0500 Subject: Temporary thread affinity for reconstruction --- include/framework/DynamicExtension.h | 33 +++++++++++++++++++++++++++- include/framework/scheduling/FIFOScheduler.h | 1 + 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 40f137c..3203945 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -32,7 +32,7 @@ namespace de { template + DeletePolicy D=DeletePolicy::TAGGING, SchedulerInterface SCHED=SerialScheduler> class DynamicExtension { typedef S Shard; typedef MutableBuffer Buffer; @@ -51,6 +51,8 @@ public: , m_max_delete_prop(1) , m_sched(memory_budget, thread_cnt) , m_buffer(new Buffer(buffer_lwm, buffer_hwm)) + , m_core_cnt(thread_cnt) + , m_next_core(0) { auto vers = new Structure(buffer_hwm, m_scale_factor, m_max_delete_prop); auto epoch = new _Epoch(0, vers, m_buffer, 0); @@ -258,6 +260,9 @@ private: size_t m_scale_factor; double m_max_delete_prop; + std::atomic m_next_core; + size_t m_core_cnt; + void enforce_delete_invariant(_Epoch *epoch) { auto structure = epoch->get_structure(); auto compactions = structure->get_compaction_tasks(); @@ -415,6 +420,8 @@ private: static void reconstruction(void *arguments) { auto args = (ReconstructionArgs *) arguments; + + ((DynamicExtension *) args->extension)->SetThreadAffinity(); Structure *vers = args->epoch->get_structure(); for (ssize_t i=0; imerges.size(); i++) { @@ -605,6 +612,30 @@ private: return processed_records; } + + void SetThreadAffinity() { + int core = m_next_core.fetch_add(1) % m_core_cnt; + cpu_set_t mask; + CPU_ZERO(&mask); + + switch (core % 2) { + case 0: + // 0 |-> 0 + // 2 |-> 2 + // 4 |-> 4 + core = core; + break; + case 1: + // 1 |-> 28 + // 3 |-> 30 + // 5 |-> 32 + core = (core - 1) + m_core_cnt; + break; + } + CPU_SET(core, &mask); + ::sched_setaffinity(0, sizeof(mask), &mask); + } + }; } diff --git a/include/framework/scheduling/FIFOScheduler.h 
b/include/framework/scheduling/FIFOScheduler.h index 0df4d3c..c6baf9b 100644 --- a/include/framework/scheduling/FIFOScheduler.h +++ b/include/framework/scheduling/FIFOScheduler.h @@ -114,6 +114,7 @@ } } while(!m_shutdown.load()); } + }; } -- cgit v1.2.3 From b1f966353695a0e06948df5332acccb84bbbcda0 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 31 Jan 2024 14:26:34 -0500 Subject: Query/Insert intermix benchmarks --- include/framework/DynamicExtension.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 3203945..a10831e 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -328,14 +328,15 @@ private: */ enforce_delete_invariant(new_epoch); - // FIXME: this may currently fail because there isn't any + // FIXME: this may currently block because there isn't any // query preemption yet. At this point, we'd need to either // 1) wait for all queries on the old_head to finish // 2) kill all queries on the old_head // 3) somehow migrate all queries on the old_head to the new // version - auto res = new_epoch->advance_buffer_head(buffer_head); - assert(res); + while (!new_epoch->advance_buffer_head(buffer_head)) { + _mm_pause(); + } m_current_epoch.fetch_add(1); old_epoch->set_inactive(); -- cgit v1.2.3 From 27d36dd9a68e4cf454be2ca7877ece0a34c3e929 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 31 Jan 2024 15:48:21 -0500 Subject: Insert throughput benchmark --- include/framework/structure/MutableBuffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 7edde2f..94a9c41 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -158,7 +158,7 @@ /* refuse to advance head while there is an old head with live references */ if (m_old_head.load().refcnt > 0) { - fprintf(stderr, "[W]: Refusing to advance head due to remaining reference counts"); + //fprintf(stderr, "[W]: Refusing to advance head due to remaining reference counts\n"); return false; } -- cgit v1.2.3 From e4a7d2d5c7464fe97ab7e37c2b0b73c32b5e8b17 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 31 Jan 2024 16:00:12 -0500 Subject: Possibly fixed epoch retirement sync error --- include/framework/DynamicExtension.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index a10831e..abe3839 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -384,11 +384,15 @@ private: do { if (epoch->retirable()) { + m_epoch_retire_lk.lock(); + if (!epoch->retirable()) { + m_epoch_retire_lk.unlock(); + continue; + } break; } } while (true); - m_epoch_retire_lk.lock(); /* remove epoch from the framework's map */ m_epochs.erase(epoch->get_epoch_number()); -- cgit v1.2.3 From 1e226fc415d7674de0ecde51199d89e9042c6a22 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 31 Jan 2024 16:57:41 -0500 Subject: Updated insert query throughput to use IRS queries --- include/query/irs.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/query/irs.h b/include/query/irs.h index fa69ea1..7ef5069 100644 --- a/include/query/irs.h +++ b/include/query/irs.h @@
-13,6 +13,7 @@ #pragma once #include "framework/QueryRequirements.h" +#include "psu-ds/Alias.h" namespace de { namespace irs { @@ -38,6 +39,9 @@ struct BufferState { size_t cutoff; std::vector> records; size_t sample_size; + BufferView buffer; + + BufferState(BufferView buffer) : buffer(std::move(buffer)) {} }; template @@ -64,10 +68,10 @@ public: return res; } - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - auto res = new BufferState(); + static void* get_buffer_query_state(BufferView buffer, void *parms) { + auto res = new BufferState(std::move(buffer)); - res->cutoff = buffer->get_record_count(); + res->cutoff = res->buffer.get_record_count(); res->sample_size = 0; if constexpr (Rejection) { @@ -78,8 +82,8 @@ public: auto upper_key = ((Parms *) parms)->upper_bound; for (size_t i=0; icutoff; i++) { - if (((buffer->get_data() + i)->rec.key >= lower_key) && ((buffer->get_data() + i)->rec.key <= upper_key)) { - res->records.emplace_back(*(buffer->get_data() + i)); + if ((res->buffer.get(i)->rec.key >= lower_key) && (buffer.get(i)->rec.key <= upper_key)) { + res->records.emplace_back(*(res->buffer.get(i))); } } @@ -167,7 +171,7 @@ public: return result_set; } - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { + static std::vector> buffer_query(void *state, void *parms) { auto st = (BufferState *) state; auto p = (Parms *) parms; @@ -177,7 +181,7 @@ public: if constexpr (Rejection) { for (size_t i=0; isample_size; i++) { auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); - auto rec = buffer->get_data() + idx; + auto rec = st->buffer.get(idx); if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { result.emplace_back(*rec); -- cgit v1.2.3 From 8fbcfda7270ef266f29f36b8eadcffaec2123612 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 31 Jan 2024 17:02:20 -0500 Subject: More locking! 
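Before the diff, a note on what the added lock buys: a simplified, self-contained sketch of the pattern. The Epoch stand-in, the plain std::mutex, and the omission of the m_epoch_retire_lk shared lock are assumptions made for illustration; only the lock-around-the-map-lookup shape is taken from the patch below.

    #include <atomic>
    #include <cstddef>
    #include <mutex>
    #include <unordered_map>

    /* minimal stand-in; the real Epoch also carries buffer/structure refs */
    struct Epoch {
        std::atomic<size_t> active_jobs{0};
        void start_job() { active_jobs.fetch_add(1); }
        void end_job() { active_jobs.fetch_sub(1); }
    };

    std::mutex m_struct_lock;
    std::atomic<size_t> m_current_epoch{0};
    std::unordered_map<size_t, Epoch*> m_epochs;

    /* The map lookup is covered by m_struct_lock because an epoch transition
     * on another thread inserts into m_epochs, which can rehash the table
     * and invalidate an unsynchronized concurrent lookup. start_job() runs
     * before the lock is dropped, pinning the epoch against retirement. */
    Epoch *get_active_epoch_protected() {
        std::lock_guard<std::mutex> lk(m_struct_lock);
        Epoch *e = m_epochs[m_current_epoch.load()];
        e->start_job();
        return e;
    }

    int main() {
        m_epochs[0] = new Epoch();
        auto *e = get_active_epoch_protected();
        e->end_job();
        delete e;
    }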
--- include/framework/DynamicExtension.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index abe3839..cc226d2 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -301,8 +301,10 @@ private: _Epoch *get_active_epoch_protected() { m_epoch_retire_lk.lock_shared(); + m_struct_lock.lock(); auto cur_epoch = m_current_epoch.load(); m_epochs[cur_epoch]->start_job(); + m_struct_lock.unlock(); m_epoch_retire_lk.unlock_shared(); return m_epochs[cur_epoch]; -- cgit v1.2.3 From f3b7428cfa7f9364c5a8bc85107db3a7cccd53bc Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 31 Jan 2024 18:41:17 -0500 Subject: Adjusted epoch transition methodology --- include/framework/DynamicExtension.h | 215 +++++++++++++++++++---------------- include/framework/scheduling/Epoch.h | 55 --------- include/framework/scheduling/Task.h | 2 +- 3 files changed, 120 insertions(+), 152 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index cc226d2..0992e14 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -43,6 +43,10 @@ class DynamicExtension { static constexpr size_t QUERY = 1; static constexpr size_t RECONSTRUCTION = 2; + struct epoch_ptr { + _Epoch *epoch; + size_t refcnt; + }; public: DynamicExtension(size_t buffer_lwm, size_t buffer_hwm, size_t scale_factor, size_t memory_budget=0, @@ -53,12 +57,14 @@ public: , m_buffer(new Buffer(buffer_lwm, buffer_hwm)) , m_core_cnt(thread_cnt) , m_next_core(0) + , m_epoch_cnt(0) { auto vers = new Structure(buffer_hwm, m_scale_factor, m_max_delete_prop); - auto epoch = new _Epoch(0, vers, m_buffer, 0); + m_current_epoch.store({new _Epoch(0, vers, m_buffer, 0), 0}); + m_previous_epoch.store({nullptr, 0}); + m_next_epoch.store({nullptr, 0}); m_versions.insert(vers); - m_epochs.insert({0, epoch}); } ~DynamicExtension() { @@ -66,16 +72,13 @@ public: /* let any in-flight epoch transition finish */ await_next_epoch(); - /* deactivate the active epoch */ - get_active_epoch()->set_inactive(); - /* shutdown the scheduler */ m_sched.shutdown(); /* delete all held resources */ - for (auto e : m_epochs) { - delete e.second; - } + delete m_next_epoch.load().epoch; + delete m_current_epoch.load().epoch; + delete m_previous_epoch.load().epoch; delete m_buffer; @@ -123,41 +126,41 @@ public: } size_t get_record_count() { - auto epoch = get_active_epoch_protected(); + auto epoch = get_active_epoch(); auto t = epoch->get_buffer().get_record_count() + epoch->get_structure()->get_record_count(); - epoch->end_job(); + end_job(epoch); return t; } size_t get_tombstone_count() { - auto epoch = get_active_epoch_protected(); + auto epoch = get_active_epoch(); auto t = epoch->get_buffer().get_tombstone_count() + epoch->get_structure()->get_tombstone_count(); - epoch->end_job(); + end_job(epoch); return t; } size_t get_height() { - auto epoch = get_active_epoch_protected(); + auto epoch = get_active_epoch(); auto t = epoch->get_structure()->get_height(); - epoch->end_job(); + end_job(epoch); return t; } size_t get_memory_usage() { - auto epoch = get_active_epoch_protected(); + auto epoch = get_active_epoch(); auto t= epoch->get_buffer().get_memory_usage() + epoch->get_structure()->get_memory_usage(); - epoch->end_job(); + end_job(epoch); return t; } size_t get_aux_memory_usage() { - auto epoch = get_active_epoch_protected(); + auto epoch = 
get_active_epoch(); auto t = epoch->get_buffer().get_aux_memory_usage() + epoch->get_structure()->get_aux_memory_usage(); - epoch->end_job(); + end_job(epoch); return t; } @@ -171,7 +174,7 @@ public: await_next_epoch(); } - auto epoch = get_active_epoch_protected(); + auto epoch = get_active_epoch(); auto vers = epoch->get_structure(); std::vector shards; @@ -203,7 +206,7 @@ public: delete shard; } - epoch->end_job(); + end_job(epoch); return flattened; } @@ -212,12 +215,10 @@ public: * the newest one to become available. Otherwise, returns immediately. */ void await_next_epoch() { - while (m_current_epoch.load() != m_newest_epoch.load()) { + while (m_next_epoch.load().epoch != nullptr) { std::unique_lock lk(m_epoch_cv_lk); m_epoch_cv.wait(lk); } - - return; } /* @@ -226,9 +227,9 @@ public: * tombstone proportion invariant. */ bool validate_tombstone_proportion() { - auto epoch = get_active_epoch_protected(); + auto epoch = get_active_epoch(); auto t = epoch->get_structure()->validate_tombstone_proportion(); - epoch->end_job(); + end_job(epoch); return t; } @@ -247,15 +248,14 @@ private: alignas(64) std::atomic m_reconstruction_scheduled; - std::atomic m_current_epoch; - std::atomic m_newest_epoch; - std::unordered_map m_epochs; + std::atomic m_next_epoch; + std::atomic m_current_epoch; + std::atomic m_previous_epoch; std::condition_variable m_epoch_cv; std::mutex m_epoch_cv_lk; - std::mutex m_epoch_transition_lk; - std::shared_mutex m_epoch_retire_lk; + std::atomic m_epoch_cnt; size_t m_scale_factor; double m_max_delete_prop; @@ -279,12 +279,6 @@ private: auto wait = args->result.get_future(); - /* - * the reconstruction process calls end_job(), - * so we must start one before calling it - */ - epoch->start_job(); - m_sched.schedule_job(reconstruction, 0, args, RECONSTRUCTION); /* wait for compaction completion */ @@ -296,39 +290,38 @@ private: } _Epoch *get_active_epoch() { - return m_epochs[m_current_epoch.load()]; - } + epoch_ptr old, new_ptr; - _Epoch *get_active_epoch_protected() { - m_epoch_retire_lk.lock_shared(); - m_struct_lock.lock(); - auto cur_epoch = m_current_epoch.load(); - m_epochs[cur_epoch]->start_job(); - m_struct_lock.unlock(); - m_epoch_retire_lk.unlock_shared(); + do { + if (m_current_epoch.load().epoch == nullptr) { + old = m_previous_epoch; + new_ptr = {old.epoch, old.refcnt+1}; + if (old.epoch != nullptr && m_previous_epoch.compare_exchange_strong(old, new_ptr)) { + break; + } + } else { + old = m_current_epoch; + new_ptr = {old.epoch, old.refcnt+1}; + if (old.epoch != nullptr && m_current_epoch.compare_exchange_strong(old, new_ptr)) { + break; + } + } + } while (true); - return m_epochs[cur_epoch]; + return new_ptr.epoch; } void advance_epoch(size_t buffer_head) { - m_epoch_transition_lk.lock(); - - size_t new_epoch_num = m_newest_epoch.load(); - size_t old_epoch_num = m_current_epoch.load(); - assert(new_epoch_num != old_epoch_num); + retire_epoch(m_previous_epoch.load().epoch); - _Epoch *new_epoch = m_epochs[new_epoch_num]; - _Epoch *old_epoch = m_epochs[old_epoch_num]; + epoch_ptr tmp = {nullptr, 0}; + epoch_ptr cur; + do { + cur = m_current_epoch; + } while(!m_current_epoch.compare_exchange_strong(cur, tmp)); - /* - * Verify the tombstone invariant within the epoch's structure, this - * may require scheduling additional reconstructions. - * - * FIXME: having this inside the lock is going to TANK - * insertion performance. 
- */ - enforce_delete_invariant(new_epoch); + m_previous_epoch.store(cur); // FIXME: this may currently block because there isn't any // query preemption yet. At this point, we'd need to either @@ -336,20 +329,19 @@ private: // 2) kill all queries on the old_head // 3) somehow migrate all queries on the old_head to the new // version - while (!new_epoch->advance_buffer_head(buffer_head)) { + while (!m_next_epoch.load().epoch->advance_buffer_head(buffer_head)) { _mm_pause(); } - m_current_epoch.fetch_add(1); - old_epoch->set_inactive(); - m_epoch_transition_lk.unlock(); + + m_current_epoch.store(m_next_epoch); + m_next_epoch.store({nullptr, 0}); + /* notify any blocking threads that the new epoch is available */ m_epoch_cv_lk.lock(); m_epoch_cv.notify_all(); m_epoch_cv_lk.unlock(); - - retire_epoch(old_epoch); } /* @@ -363,14 +355,20 @@ private: * is violated, it is possible that this code will clone a retired * epoch. */ - m_newest_epoch.fetch_add(1); - auto new_epoch = get_active_epoch()->clone(m_newest_epoch.load()); + assert(m_next_epoch.load().epoch == nullptr); + auto current_epoch = get_active_epoch(); + + m_epoch_cnt.fetch_add(1); + m_next_epoch.store({current_epoch->clone(m_epoch_cnt.load()), 0}); + + end_job(current_epoch); + std::unique_lock m_struct_lock; - m_versions.insert(new_epoch->get_structure()); - m_epochs.insert({m_newest_epoch.load(), new_epoch}); + m_versions.insert(m_next_epoch.load().epoch->get_structure()); m_struct_lock.release(); - return new_epoch; + + return m_next_epoch.load().epoch; } void retire_epoch(_Epoch *epoch) { @@ -384,28 +382,25 @@ private: * proceed. */ + if (epoch == nullptr) { + return; + } + + epoch_ptr old, new_ptr; + new_ptr = {nullptr, 0}; do { - if (epoch->retirable()) { - m_epoch_retire_lk.lock(); - if (!epoch->retirable()) { - m_epoch_retire_lk.unlock(); - continue; - } + old = m_previous_epoch.load(); + + if (old.epoch == epoch && old.refcnt == 0 && + m_previous_epoch.compare_exchange_strong(old, new_ptr)) { break; } - } while (true); + usleep(1); + } while(true); - /* remove epoch from the framework's map */ - m_epochs.erase(epoch->get_epoch_number()); + //fprintf(stderr, "Epoch %ld retired [%p]\n", epoch->get_epoch_number(), epoch); - /* - * The epoch's destructor will handle releasing - * all the references it holds - */ delete epoch; - m_epoch_retire_lk.unlock(); - - /* NOTE: the BufferView mechanism handles freeing unused buffer space */ /* * Following the epoch's destruction, any buffers @@ -453,7 +448,6 @@ private: vers->flush_buffer(std::move(buffer_view)); } - args->epoch->end_job(); args->result.set_value(true); /* @@ -473,8 +467,17 @@ private: static void async_query(void *arguments) { QueryArgs *args = (QueryArgs *) arguments; - auto buffer = args->epoch->get_buffer(); - auto vers = args->epoch->get_structure(); + auto epoch = ((DynamicExtension *) args->extension)->get_active_epoch(); + + auto ptr1 = ((DynamicExtension *) args->extension)->m_previous_epoch.load().epoch; + auto ptr2 = ((DynamicExtension *) args->extension)->m_current_epoch.load().epoch; + auto ptr3 = ((DynamicExtension *) args->extension)->m_next_epoch.load().epoch; + + //fprintf(stderr, "(%ld, %p)\t%p\t%p\t%p\n", epoch->get_epoch_number(), epoch, ptr1, ptr2, ptr3); + + + auto buffer = epoch->get_buffer(); + auto vers = epoch->get_structure(); void *parms = args->query_parms; /* Get the buffer query states */ @@ -509,7 +512,7 @@ private: auto result = Q::merge(query_results, parms); args->result_set.set_value(std::move(result)); - args->epoch->end_job(); + 
((DynamicExtension *) args->extension)->end_job(epoch); Q::delete_buffer_query_state(buffer_state); for (size_t i=0; istart_job(); ReconstructionArgs *args = new ReconstructionArgs(); args->epoch = epoch; @@ -538,10 +540,8 @@ private: } std::future> schedule_query(void *query_parms) { - auto epoch = get_active_epoch_protected(); - QueryArgs *args = new QueryArgs(); - args->epoch = epoch; + args->extension = this; args->query_parms = query_parms; auto result = args->result_set.get_future(); @@ -643,6 +643,29 @@ private: ::sched_setaffinity(0, sizeof(mask), &mask); } + + void end_job(_Epoch *epoch) { + epoch_ptr old, new_ptr; + + do { + if (m_previous_epoch.load().epoch == epoch) { + old = m_previous_epoch; + assert(old.refcnt > 0); + new_ptr = {old.epoch, old.refcnt - 1}; + if (m_previous_epoch.compare_exchange_strong(old, new_ptr)) { + break; + } + } else { + old = m_current_epoch; + assert(old.refcnt > 0); + new_ptr = {old.epoch, old.refcnt - 1}; + if (m_current_epoch.compare_exchange_strong(old, new_ptr)) { + break; + } + } + } while (true); + } + }; } diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 45ee17d..7b533b6 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -29,8 +29,6 @@ public: : m_buffer(nullptr) , m_structure(nullptr) , m_active_merge(false) - , m_active_jobs(0) - , m_active(true) , m_epoch_number(number) , m_buffer_head(0) {} @@ -38,9 +36,7 @@ public: Epoch(size_t number, Structure *structure, Buffer *buff, size_t head) : m_buffer(buff) , m_structure(structure) - , m_active_jobs(0) , m_active_merge(false) - , m_active(true) , m_epoch_number(number) , m_buffer_head(head) { @@ -48,8 +44,6 @@ public: } ~Epoch() { - assert(m_active_jobs.load() == 0); - /* FIXME: this is needed to keep the destructor from sometimes locking * up here. But there *shouldn't* be any threads waiting on this signal * at object destruction, so something else is going on here that needs @@ -71,24 +65,6 @@ public: Epoch &operator=(const Epoch&) = delete; Epoch &operator=(Epoch&&) = delete; - void start_job() { - m_active_jobs.fetch_add(1); - } - - void end_job() { - assert(m_active_jobs.load() > 0); - m_active_jobs.fetch_add(-1); - - if (m_active_jobs.load() == 0) { - std::unique_lock lk(m_cv_lock); - m_active_cv.notify_all(); - } - } - - size_t get_active_job_num() { - return m_active_jobs.load(); - } - size_t get_epoch_number() { return m_epoch_number; } @@ -145,32 +121,6 @@ public: return true; } - void set_inactive() { - m_active = false; - } - - /* - * - */ - bool retirable() { - /* if epoch is currently active, then it cannot be retired */ - if (m_active) { - return false; - } - - /* - * if the epoch has active jobs but is not itself active, - * wait for them to finish and return true. If there are - * not active jobs, return true immediately - */ - std::unique_lock lk(m_cv_lock); - while (m_active_jobs.load() > 0) { - m_active_cv.wait(lk); - } - - return true; - } - bool advance_buffer_head(size_t head) { m_buffer_head = head; return m_buffer->advance_head(m_buffer_head); @@ -180,9 +130,6 @@ private: Structure *m_structure; Buffer *m_buffer; - std::condition_variable m_active_cv; - std::mutex m_cv_lock; - std::mutex m_buffer_lock; std::atomic m_active_merge; @@ -192,8 +139,6 @@ private: * epoch. An epoch can only be retired * when this number is 0. 
*/ - std::atomic m_active_jobs; - bool m_active; size_t m_epoch_number; size_t m_buffer_head; }; diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index b14b229..6f6b913 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -29,9 +29,9 @@ struct ReconstructionArgs { template struct QueryArgs { - Epoch *epoch; std::promise> result_set; void *query_parms; + void *extension; }; typedef std::function Job; -- cgit v1.2.3 From f7f61d6d5367f2984cbf40c3cd6d85f75cd999af Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 31 Jan 2024 20:34:35 -0500 Subject: temporary hack to get working --- include/framework/DynamicExtension.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 0992e14..a56cc6c 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -388,6 +388,7 @@ private: epoch_ptr old, new_ptr; new_ptr = {nullptr, 0}; + size_t i=0; do { old = m_previous_epoch.load(); @@ -396,10 +397,11 @@ private: break; } usleep(1); + i++; + + if (i > 600) break; } while(true); - //fprintf(stderr, "Epoch %ld retired [%p]\n", epoch->get_epoch_number(), epoch); - delete epoch; /* @@ -473,8 +475,6 @@ private: auto ptr2 = ((DynamicExtension *) args->extension)->m_current_epoch.load().epoch; auto ptr3 = ((DynamicExtension *) args->extension)->m_next_epoch.load().epoch; - //fprintf(stderr, "(%ld, %p)\t%p\t%p\t%p\n", epoch->get_epoch_number(), epoch, ptr1, ptr2, ptr3); - auto buffer = epoch->get_buffer(); auto vers = epoch->get_structure(); @@ -650,14 +650,20 @@ private: do { if (m_previous_epoch.load().epoch == epoch) { old = m_previous_epoch; - assert(old.refcnt > 0); + if (old.refcnt <= 0) { + return; + } + new_ptr = {old.epoch, old.refcnt - 1}; if (m_previous_epoch.compare_exchange_strong(old, new_ptr)) { break; } } else { old = m_current_epoch; - assert(old.refcnt > 0); + if (old.refcnt <= 0) { + return; + } + //assert(old.refcnt > 0); new_ptr = {old.epoch, old.refcnt - 1}; if (m_current_epoch.compare_exchange_strong(old, new_ptr)) { break; -- cgit v1.2.3 From db4806d9dd9757273a14e6c3ea92e5a087239145 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 5 Feb 2024 15:17:25 -0500 Subject: Set up tombstone deletes properly --- include/framework/DynamicExtension.h | 14 ++++++++++---- include/query/irs.h | 16 ++++++++-------- include/query/rangecount.h | 14 +++++++------- include/query/rangequery.h | 14 +++++++------- 4 files changed, 32 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index a56cc6c..3e9d0fb 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -481,7 +481,7 @@ private: void *parms = args->query_parms; /* Get the buffer query states */ - void *buffer_state = Q::get_buffer_query_state(std::move(buffer), parms); + void *buffer_state = Q::get_buffer_query_state(&buffer, parms); /* Get the shard query states */ std::vector> shards; @@ -502,7 +502,7 @@ private: shid = shards[i - 1].first; } - query_results[i] = std::move(filter_deletes(local_results, shid, vers)); + query_results[i] = std::move(filter_deletes(local_results, shid, vers, &buffer)); if constexpr (Q::EARLY_ABORT) { if (query_results[i].size() > 0) break; @@ -563,8 +563,8 @@ private: return m_buffer->append(rec, ts); } - static std::vector> 
filter_deletes(std::vector> &records, ShardID shid, Structure *vers) { - if constexpr (!Q::SKIP_DELETE_FILTER) { + static std::vector> filter_deletes(std::vector> &records, ShardID shid, Structure *vers, BufView *bview) { + if constexpr (Q::SKIP_DELETE_FILTER) { return records; } @@ -602,6 +602,12 @@ private: //continue; //} + for (size_t i=0; iget_record_count(); i++) { + if (bview->get(i)->is_tombstone() && bview->get(i)->rec == rec.rec) { + continue; + } + } + if (shid != INVALID_SHID) { for (size_t lvl=0; lvl<=shid.level_idx; lvl++) { if (vers->get_levels()[lvl]->check_tombstone(0, rec.rec)) { diff --git a/include/query/irs.h b/include/query/irs.h index 7ef5069..7eea14b 100644 --- a/include/query/irs.h +++ b/include/query/irs.h @@ -39,9 +39,9 @@ struct BufferState { size_t cutoff; std::vector> records; size_t sample_size; - BufferView buffer; + BufferView *buffer; - BufferState(BufferView buffer) : buffer(std::move(buffer)) {} + BufferState(BufferView *buffer) : buffer(buffer) {} }; template @@ -68,10 +68,10 @@ public: return res; } - static void* get_buffer_query_state(BufferView buffer, void *parms) { - auto res = new BufferState(std::move(buffer)); + static void* get_buffer_query_state(BufferView *buffer, void *parms) { + auto res = new BufferState(buffer); - res->cutoff = res->buffer.get_record_count(); + res->cutoff = res->buffer->get_record_count(); res->sample_size = 0; if constexpr (Rejection) { @@ -82,8 +82,8 @@ public: auto upper_key = ((Parms *) parms)->upper_bound; for (size_t i=0; icutoff; i++) { - if ((res->buffer.get(i)->rec.key >= lower_key) && (buffer.get(i)->rec.key <= upper_key)) { - res->records.emplace_back(*(res->buffer.get(i))); + if ((res->buffer->get(i)->rec.key >= lower_key) && (buffer->get(i)->rec.key <= upper_key)) { + res->records.emplace_back(*(res->buffer->get(i))); } } @@ -181,7 +181,7 @@ public: if constexpr (Rejection) { for (size_t i=0; isample_size; i++) { auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); - auto rec = st->buffer.get(idx); + auto rec = st->buffer->get(idx); if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { result.emplace_back(*rec); diff --git a/include/query/rangecount.h b/include/query/rangecount.h index 7d88b1d..70d57d8 100644 --- a/include/query/rangecount.h +++ b/include/query/rangecount.h @@ -33,10 +33,10 @@ struct State { template struct BufferState { - BufferView buffer; + BufferView *buffer; - BufferState(BufferView buffer) - : buffer(std::move(buffer)) {} + BufferState(BufferView *buffer) + : buffer(buffer) {} }; template @@ -55,8 +55,8 @@ public: return res; } - static void* get_buffer_query_state(BufferView buffer, void *parms) { - auto res = new BufferState(std::move(buffer)); + static void* get_buffer_query_state(BufferView *buffer, void *parms) { + auto res = new BufferState(buffer); return res; } @@ -123,8 +123,8 @@ public: res.rec.value = 0; // tombstones records.emplace_back(res); - for (size_t i=0; ibuffer.get_record_count(); i++) { - auto rec = s->buffer.get(i); + for (size_t i=0; ibuffer->get_record_count(); i++) { + auto rec = s->buffer->get(i); if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound && !rec->is_deleted()) { if (rec->is_tombstone()) { diff --git a/include/query/rangequery.h b/include/query/rangequery.h index c44f5d7..1a42265 100644 --- a/include/query/rangequery.h +++ b/include/query/rangequery.h @@ -32,10 +32,10 @@ struct State { template struct BufferState { - BufferView buffer; + BufferView *buffer; - BufferState(BufferView buffer) - : 
buffer(std::move(buffer)) {} + BufferState(BufferView *buffer) + : buffer(buffer) {} }; template @@ -54,8 +54,8 @@ public: return res; } - static void* get_buffer_query_state(BufferView buffer, void *parms) { - auto res = new BufferState(std::move(buffer)); + static void* get_buffer_query_state(BufferView *buffer, void *parms) { + auto res = new BufferState(buffer); return res; } @@ -101,8 +101,8 @@ public: auto s = (BufferState *) state; std::vector> records; - for (size_t i=0; ibuffer.get_record_count(); i++) { - auto rec = s->buffer.get(i); + for (size_t i=0; ibuffer->get_record_count(); i++) { + auto rec = s->buffer->get(i); if (rec->rec.key >= p->lower_bound && rec->rec.key <= p->upper_bound) { records.emplace_back(*rec); } -- cgit v1.2.3 From 0ff3cedf5df9c27bccd3053ce6339e317f87ff76 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 5 Feb 2024 15:18:33 -0500 Subject: BufferView: Adjusted BV to avoid repeated modulus operations --- include/framework/structure/BufferView.h | 50 +++++++++++++++++++++++------ include/framework/structure/MutableBuffer.h | 13 +++++--- 2 files changed, 48 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 30fffed..edf6707 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -38,6 +38,8 @@ public: , m_release(std::move(other.m_release)) , m_head(std::exchange(other.m_head, 0)) , m_tail(std::exchange(other.m_tail, 0)) + , m_start(std::exchange(other.m_start, 0)) + , m_stop(std::exchange(other.m_stop, 0)) , m_cap(std::exchange(other.m_cap, 0)) , m_approx_ts_cnt(std::exchange(other.m_approx_ts_cnt, 0)) , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr)) @@ -52,6 +54,8 @@ public: , m_release(release) , m_head(head) , m_tail(tail) + , m_start(m_head % cap) + , m_stop(m_tail % cap) , m_cap(cap) , m_approx_ts_cnt(tombstone_cnt) , m_tombstone_filter(filter) @@ -76,11 +80,29 @@ public: } bool delete_record(const R& rec) { - for (size_t i=0; i *get(size_t i) { assert(i < get_record_count()); + m_total += (m_data + to_idx(i))->rec.key; return m_data + to_idx(i); } void copy_to_buffer(psudb::byte *buffer) { /* check if the region to be copied circles back to start. If so, do it in two steps */ - if ((m_head % m_cap) + get_record_count() > m_cap) { - size_t split_idx = m_cap - (m_head % m_cap); + if (m_start > m_stop) { + size_t split_idx = m_cap - m_start; - memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), split_idx* sizeof(Wrapped)); - memcpy(buffer + (split_idx * sizeof(Wrapped)), (std::byte*) m_data, (get_record_count() - split_idx) * sizeof(Wrapped)); + memcpy(buffer, (std::byte*) (m_data + m_start), split_idx* sizeof(Wrapped)); + memcpy(buffer + (split_idx * sizeof(Wrapped)), (std::byte*) m_data, m_stop * sizeof(Wrapped)); } else { - memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped)); + memcpy(buffer, (std::byte*) (m_data + m_start), get_record_count() * sizeof(Wrapped)); } } @@ -129,13 +152,20 @@ private: ReleaseFunction m_release; size_t m_head; size_t m_tail; + size_t m_start; + size_t m_stop; size_t m_cap; size_t m_approx_ts_cnt; psudb::BloomFilter *m_tombstone_filter; bool m_active; + size_t m_total; + size_t to_idx(size_t i) { - return (m_head + i) % m_cap; + size_t idx = (m_start + i >= m_cap) ? 
i = (m_cap - m_start) + : m_start + i; + assert(idx < m_cap); + return idx; } }; diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 94a9c41..415c95a 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -230,13 +230,16 @@ public: /* * Note: this returns the available physical storage capacity, * *not* now many more records can be inserted before the - * HWM is reached. - * - * FIXME: this logic is incorrect for the buffer prior to the - * first call to advance_head, and will under-report the available - * space. + * HWM is reached. It considers the old_head to be "free" + * when it has no remaining references. This should be true, + * but a buggy framework implementation may violate the + * assumption. */ size_t get_available_capacity() { + if (m_old_head.load().refcnt == 0) { + return m_cap - (m_tail.load() - m_head.load().head_idx); + } + return m_cap - (m_tail.load() - m_old_head.load().head_idx); } -- cgit v1.2.3 From 10b4425e842d10b7fbfa85978969ed4591d6b98e Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 7 Feb 2024 10:56:52 -0500 Subject: Fully implemented Query concept and adjusted queries to use it --- include/framework/DynamicExtension.h | 2 +- include/framework/interface/Query.h | 25 ++++++++++-------------- include/framework/scheduling/Epoch.h | 2 +- include/framework/scheduling/Task.h | 4 ++-- include/framework/structure/ExtensionStructure.h | 2 +- include/framework/structure/InternalLevel.h | 4 ++-- include/query/irs.h | 4 ++-- include/query/rangecount.h | 8 ++------ include/query/rangequery.h | 6 ++---- include/query/wirs.h | 19 +++++++++--------- include/query/wss.h | 20 ++++++++++--------- 11 files changed, 44 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 3e9d0fb..5c021f2 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -31,7 +31,7 @@ namespace de { -template Q, LayoutPolicy L=LayoutPolicy::TEIRING, DeletePolicy D=DeletePolicy::TAGGING, SchedulerInterface SCHED=SerialScheduler> class DynamicExtension { typedef S Shard; diff --git a/include/framework/interface/Query.h b/include/framework/interface/Query.h index ca742c3..8cf9660 100644 --- a/include/framework/interface/Query.h +++ b/include/framework/interface/Query.h @@ -8,31 +8,26 @@ */ #pragma once -#include +#include "framework/QueryRequirements.h" #include -#include "util/types.h" - +namespace de{ // FIXME: The interface is not completely specified yet, as it is pending // determining a good way to handle additional template arguments // to get the Shard and Record types into play -template -concept QueryInterface = requires(Q q, void *p, std::vector &s) { - - /* - {Q::get_query_state(p, p)} -> std::convertible_to; - {Q::get_buffer_query_state(p, p)} -> std::convertible_to; - */ +template +concept QueryInterface = requires(void *p, S *sh, std::vector &s, std::vector>> &rv, BufferView *bv) { + {Q::get_query_state(sh, p)} -> std::convertible_to; + {Q::get_buffer_query_state(bv, p)} -> std::convertible_to; {Q::process_query_states(p, s, p)}; - /* - {Q::query(s, p, p)} -> std::convertible_to>>; + {Q::query(sh, p, p)} -> std::convertible_to>>; {Q::buffer_query(p, p)} -> std::convertible_to>>; {Q::merge(rv, p)} -> std::convertible_to>; - */ - {Q::delete_query_state(std::declval())} -> std::same_as; - {Q::delete_buffer_query_state(std::declval())} -> 
std::same_as; + {Q::delete_query_state(p)} -> std::same_as; + {Q::delete_buffer_query_state(p)} -> std::same_as; {Q::EARLY_ABORT} -> std::convertible_to; {Q::SKIP_DELETE_FILTER} -> std::convertible_to; }; +} diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 7b533b6..48b7742 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -18,7 +18,7 @@ namespace de { -template +template Q, LayoutPolicy L> class Epoch { private: typedef MutableBuffer Buffer; diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index 6f6b913..ba0001d 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -18,7 +18,7 @@ namespace de { -template +template Q, LayoutPolicy L> struct ReconstructionArgs { Epoch *epoch; std::vector merges; @@ -27,7 +27,7 @@ struct ReconstructionArgs { void *extension; }; -template +template Q, LayoutPolicy L> struct QueryArgs { std::promise> result_set; void *query_parms; diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index ae566cb..0b8263e 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -22,7 +22,7 @@ namespace de { -template +template Q, LayoutPolicy L=LayoutPolicy::TEIRING> class ExtensionStructure { typedef S Shard; typedef BufferView BuffView; diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index e9874e0..0fd5275 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -19,12 +19,12 @@ #include "framework/structure/BufferView.h" namespace de { -template +template Q> class InternalLevel; -template +template Q> class InternalLevel { typedef S Shard; typedef BufferView BuffView; diff --git a/include/query/irs.h b/include/query/irs.h index 7eea14b..bef75bf 100644 --- a/include/query/irs.h +++ b/include/query/irs.h @@ -90,9 +90,9 @@ public: return res; } - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { + static void process_query_states(void *query_parms, std::vector &shard_states, void *buffer_state) { auto p = (Parms *) query_parms; - auto bs = (buff_state) ? (BufferState *) buff_state : nullptr; + auto bs = (buffer_state) ? 
(BufferState *) buffer_state : nullptr; std::vector shard_sample_sizes(shard_states.size()+1, 0); size_t buffer_sz = 0; diff --git a/include/query/rangecount.h b/include/query/rangecount.h index 70d57d8..a09ad64 100644 --- a/include/query/rangecount.h +++ b/include/query/rangecount.h @@ -11,11 +11,7 @@ */ #pragma once -#include "framework/interface/Record.h" -#include "framework/interface/Shard.h" -#include "framework/structure/BufferView.h" -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" +#include "framework/QueryRequirements.h" namespace de { namespace rc { @@ -61,7 +57,7 @@ public: return res; } - static void process_query_states(void *query_parms, std::vector &shard_states, void* buffer_states) { + static void process_query_states(void *query_parms, std::vector &shard_states, void* buffer_state) { return; } diff --git a/include/query/rangequery.h b/include/query/rangequery.h index 1a42265..c3985fa 100644 --- a/include/query/rangequery.h +++ b/include/query/rangequery.h @@ -10,9 +10,7 @@ */ #pragma once -#include "framework/interface/Record.h" -#include "framework/interface/Shard.h" -#include "framework/structure/BufferView.h" +#include "framework/QueryRequirements.h" #include "psu-ds/PriorityQueue.h" #include "util/Cursor.h" @@ -60,7 +58,7 @@ public: return res; } - static void process_query_states(void *query_parms, std::vector &shard_states, void* buffer_states) { + static void process_query_states(void *query_parms, std::vector &shard_states, void* buffer_state) { return; } diff --git a/include/query/wirs.h b/include/query/wirs.h index 9b3d2ad..07c5292 100644 --- a/include/query/wirs.h +++ b/include/query/wirs.h @@ -12,9 +12,7 @@ */ #pragma once -#include "framework/interface/Record.h" -#include "framework/interface/Shard.h" -#include "framework/structure/MutableBuffer.h" +#include "framework/QueryRequirements.h" #include "psu-ds/Alias.h" namespace de { namespace wirs { @@ -52,6 +50,7 @@ struct BufferState { decltype(R::weight) max_weight; size_t sample_size; decltype(R::weight) total_weight; + BufferView *buffer; ~BufferState() { delete alias; @@ -83,7 +82,7 @@ public: return res; } - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { + static void* get_buffer_query_state(BufferView *buffer, void *parms) { BufferState *state = new BufferState(); auto parameters = (Parms*) parms; @@ -92,16 +91,17 @@ public: state->max_weight = buffer->get_max_weight(); state->total_weight = buffer->get_total_weight(); state->sample_size = 0; + state->buffer = buffer; return state; } std::vector weights; - state->cutoff = buffer->get_record_count() - 1; + state->buffer = buffer; decltype(R::weight) total_weight = 0; - for (size_t i = 0; i <= state->cutoff; i++) { - auto rec = buffer->get_data() + i; + for (size_t i = 0; i <= buffer->get_record_count(); i++) { + auto rec = buffer->get(i); if (rec->rec.key >= parameters->lower_bound && rec->rec.key <= parameters->upper_bound && !rec->is_tombstone() && !rec->is_deleted()) { weights.push_back(rec->rec.weight); @@ -190,9 +190,10 @@ public: return result_set; } - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { + static std::vector> buffer_query(void *state, void *parms) { auto st = (BufferState *) state; auto p = (Parms *) parms; + auto buffer = st->buffer; std::vector> result; result.reserve(st->sample_size); @@ -200,7 +201,7 @@ public: if constexpr (Rejection) { for (size_t i=0; isample_size; i++) { auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); - auto rec = buffer->get_data() + 
idx; + auto rec = buffer->get(idx); auto test = gsl_rng_uniform(p->rng) * st->max_weight; diff --git a/include/query/wss.h b/include/query/wss.h index 4c8861e..9f192ee 100644 --- a/include/query/wss.h +++ b/include/query/wss.h @@ -11,9 +11,8 @@ */ #pragma once -#include "framework/interface/Record.h" -#include "framework/interface/Shard.h" -#include "framework/structure/MutableBuffer.h" +#include "framework/QueryRequirements.h" +#include "psu-ds/Alias.h" namespace de { namespace wss { @@ -40,6 +39,7 @@ struct BufferState { psudb::Alias *alias; decltype(R::weight) max_weight; decltype(R::weight) total_weight; + BufferView *buffer; ~BufferState() { delete alias; @@ -60,23 +60,24 @@ public: return res; } - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { + static void* get_buffer_query_state(BufferState *buffer, void *parms) { BufferState *state = new BufferState(); auto parameters = (Parms*) parms; if constexpr (Rejection) { state->cutoff = buffer->get_record_count() - 1; state->max_weight = buffer->get_max_weight(); state->total_weight = buffer->get_total_weight(); + state->buffer = buffer; return state; } std::vector weights; - state->cutoff = buffer->get_record_count() - 1; double total_weight = 0.0; + state->buffer = buffer; - for (size_t i = 0; i <= state->cutoff; i++) { - auto rec = buffer->get_data() + i; + for (size_t i = 0; i <= buffer->get_record_count(); i++) { + auto rec = buffer->get_data(i); weights.push_back(rec->rec.weight); total_weight += rec->rec.weight; } @@ -152,9 +153,10 @@ public: return result_set; } - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void *parms) { + static std::vector> buffer_query(void *state, void *parms) { auto st = (BufferState *) state; auto p = (Parms *) parms; + auto buffer = st->buffer; std::vector> result; result.reserve(st->sample_size); @@ -162,7 +164,7 @@ public: if constexpr (Rejection) { for (size_t i=0; isample_size; i++) { auto idx = gsl_rng_uniform_int(p->rng, st->cutoff); - auto rec = buffer->get_data() + idx; + auto rec = buffer->get(idx); auto test = gsl_rng_uniform(p->rng) * st->max_weight; -- cgit v1.2.3 From 2c5d549b3618b9ea72e6eece4cb4f3da5a6811a8 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 7 Feb 2024 13:42:34 -0500 Subject: Fully realized shard concept interface --- include/framework/DynamicExtension.h | 2 +- include/framework/interface/Shard.h | 25 +++++++++--------------- include/framework/scheduling/Epoch.h | 2 +- include/framework/scheduling/Task.h | 4 ++-- include/framework/structure/ExtensionStructure.h | 2 +- include/framework/structure/InternalLevel.h | 4 ++-- include/query/irs.h | 2 +- include/query/rangecount.h | 2 +- include/query/rangequery.h | 2 +- include/query/wirs.h | 2 +- include/query/wss.h | 2 +- include/shard/Alias.h | 15 ++++++-------- include/shard/ISAMTree.h | 8 +++++--- include/shard/TrieSpline.h | 1 + 14 files changed, 33 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 5c021f2..d88a945 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -31,7 +31,7 @@ namespace de { -template Q, LayoutPolicy L=LayoutPolicy::TEIRING, +template S, QueryInterface Q, LayoutPolicy L=LayoutPolicy::TEIRING, DeletePolicy D=DeletePolicy::TAGGING, SchedulerInterface SCHED=SerialScheduler> class DynamicExtension { typedef S Shard; diff --git a/include/framework/interface/Shard.h b/include/framework/interface/Shard.h index 
8c4db34..c4a9180 100644 --- a/include/framework/interface/Shard.h +++ b/include/framework/interface/Shard.h @@ -8,25 +8,17 @@ */ #pragma once -#include - -#include "util/types.h" -#include "framework/interface/Record.h" -#include +#include "framework/ShardRequirements.h" namespace de { -// FIXME: The interface is not completely specified yet, as it is pending -// determining a good way to handle additional template arguments -// to get the Record type into play -template -concept ShardInterface = requires(S s, std::vector spp, void *p, bool b, size_t i) { +template +concept ShardInterface = RecordInterface && requires(S s, std::vector spp, void *p, bool b, size_t i, BufferView bv, R r) { {S(spp)}; - /* - {S(mutable buffer)} - {s.point_lookup(r, b) } -> std::convertible_to - */ - {s.get_data()} -> std::convertible_to; + {S(std::move(bv))}; + + {s.point_lookup(r, b) } -> std::same_as*>; + {s.get_data()} -> std::same_as*>; {s.get_record_count()} -> std::convertible_to; {s.get_tombstone_count()} -> std::convertible_to; @@ -35,9 +27,10 @@ concept ShardInterface = requires(S s, std::vector spp, void *p, bool b, siz }; template -concept SortedShardInterface = ShardInterface && requires(S s, R r, R *rp) { +concept SortedShardInterface = ShardInterface && requires(S s, R r, R *rp, size_t i) { {s.lower_bound(r)} -> std::convertible_to; {s.upper_bound(r)} -> std::convertible_to; + {s.get_record_at(i)} -> std::same_as*>; }; } diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 48b7742..e58bd11 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -18,7 +18,7 @@ namespace de { -template Q, LayoutPolicy L> +template S, QueryInterface Q, LayoutPolicy L> class Epoch { private: typedef MutableBuffer Buffer; diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index ba0001d..008f232 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -18,7 +18,7 @@ namespace de { -template Q, LayoutPolicy L> +template S, QueryInterface Q, LayoutPolicy L> struct ReconstructionArgs { Epoch *epoch; std::vector merges; @@ -27,7 +27,7 @@ struct ReconstructionArgs { void *extension; }; -template Q, LayoutPolicy L> +template S, QueryInterface Q, LayoutPolicy L> struct QueryArgs { std::promise> result_set; void *query_parms; diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 0b8263e..373a1e2 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -22,7 +22,7 @@ namespace de { -template Q, LayoutPolicy L=LayoutPolicy::TEIRING> +template S, QueryInterface Q, LayoutPolicy L=LayoutPolicy::TEIRING> class ExtensionStructure { typedef S Shard; typedef BufferView BuffView; diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 0fd5275..d586869 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -19,12 +19,12 @@ #include "framework/structure/BufferView.h" namespace de { -template Q> +template S, QueryInterface Q> class InternalLevel; -template Q> +template S, QueryInterface Q> class InternalLevel { typedef S Shard; typedef BufferView BuffView; diff --git a/include/query/irs.h b/include/query/irs.h index bef75bf..c14d0cf 100644 --- a/include/query/irs.h +++ b/include/query/irs.h @@ -44,7 +44,7 @@ struct BufferState { BufferState(BufferView 
*buffer) : buffer(buffer) {} }; -template +template S, bool Rejection=true> class Query { public: constexpr static bool EARLY_ABORT=false; diff --git a/include/query/rangecount.h b/include/query/rangecount.h index a09ad64..6c57809 100644 --- a/include/query/rangecount.h +++ b/include/query/rangecount.h @@ -35,7 +35,7 @@ struct BufferState { : buffer(buffer) {} }; -template +template S> class Query { public: constexpr static bool EARLY_ABORT=false; diff --git a/include/query/rangequery.h b/include/query/rangequery.h index c3985fa..24b38ec 100644 --- a/include/query/rangequery.h +++ b/include/query/rangequery.h @@ -36,7 +36,7 @@ struct BufferState { : buffer(buffer) {} }; -template +template S> class Query { public: constexpr static bool EARLY_ABORT=false; diff --git a/include/query/wirs.h b/include/query/wirs.h index 07c5292..4fac7e7 100644 --- a/include/query/wirs.h +++ b/include/query/wirs.h @@ -57,7 +57,7 @@ struct BufferState { } }; -template +template S, bool Rejection=true> class Query { public: constexpr static bool EARLY_ABORT=false; diff --git a/include/query/wss.h b/include/query/wss.h index 9f192ee..ea36cb2 100644 --- a/include/query/wss.h +++ b/include/query/wss.h @@ -46,7 +46,7 @@ struct BufferState { } }; -template +template S, bool Rejection=true> class Query { public: constexpr static bool EARLY_ABORT=false; diff --git a/include/shard/Alias.h b/include/shard/Alias.h index a3e8ad8..a234575 100644 --- a/include/shard/Alias.h +++ b/include/shard/Alias.h @@ -15,9 +15,6 @@ #include #include -#include -#include -#include #include "framework/ShardRequirements.h" @@ -34,7 +31,7 @@ using psudb::queue_record; namespace de { -thread_local size_t wss_cancelations = 0; +static thread_local size_t wss_cancelations = 0; template class Alias { @@ -44,7 +41,7 @@ private: typedef decltype(R::weight) W; public: - Alias(MutableBuffer* buffer) + Alias(BufferView* buffer) : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_alias(nullptr), m_bf(nullptr) { m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); @@ -96,17 +93,17 @@ public: } } - Alias(Alias** shards, size_t len) + Alias(std::vector &shards) : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_alias(nullptr), m_bf(nullptr) { std::vector>> cursors; - cursors.reserve(len); + cursors.reserve(shards.size()); - PriorityQueue> pq(len); + PriorityQueue> pq(shards.size()); size_t attemp_reccnt = 0; size_t tombstone_count = 0; - for (size_t i = 0; i < len; ++i) { + for (size_t i = 0; i < shards.size(); ++i) { if (shards[i]) { auto base = shards[i]->get_data(); cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); diff --git a/include/shard/ISAMTree.h b/include/shard/ISAMTree.h index 932e767..7de9cb1 100644 --- a/include/shard/ISAMTree.h +++ b/include/shard/ISAMTree.h @@ -25,6 +25,7 @@ using psudb::CACHELINE_SIZE; using psudb::BloomFilter; using psudb::PriorityQueue; using psudb::queue_record; +using psudb::byte; namespace de { @@ -222,9 +223,6 @@ public: return m_tombstone_cnt; } - const Wrapped* get_record_at(size_t idx) const { - return (idx < m_reccnt) ? 
m_data + idx : nullptr; - } size_t get_memory_usage() { return m_alloc_size; @@ -234,6 +232,7 @@ public: return m_bf->memory_usage(); } + /* SortedShardInterface methods */ size_t get_lower_bound(const K& key) const { const InternalNode* now = m_root; while (!is_leaf(reinterpret_cast(now))) { @@ -274,6 +273,9 @@ public: return pos - m_data; } + const Wrapped* get_record_at(size_t idx) const { + return (idx < m_reccnt) ? m_data + idx : nullptr; + } private: void build_internal_levels() { diff --git a/include/shard/TrieSpline.h b/include/shard/TrieSpline.h index 8142a67..9473177 100644 --- a/include/shard/TrieSpline.h +++ b/include/shard/TrieSpline.h @@ -25,6 +25,7 @@ using psudb::CACHELINE_SIZE; using psudb::BloomFilter; using psudb::PriorityQueue; using psudb::queue_record; +using psudb::byte; namespace de { -- cgit v1.2.3 From bd74e27b28bd95267ce50d2e4b6f12b51d9b6aae Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 7 Feb 2024 17:23:23 -0500 Subject: Cleaned up shard files (except VPTree) Cleaned up shard implementations, fixed a few bugs, and set up some tests. There's still some work to be done in creating tests for the weighted sampling operations for the alias and aug btree shards. --- include/query/wss.h | 2 +- include/shard/Alias.h | 155 ++++++++++++------------------------- include/shard/AugBTree.h | 150 ++++++++++-------------------------- include/shard/ISAMTree.h | 105 +++---------------------- include/shard/PGM.h | 140 +++++++++++----------------------- include/shard/TrieSpline.h | 149 ++++++------------------------------ include/util/SortedMerge.h | 185 +++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 357 insertions(+), 529 deletions(-) create mode 100644 include/util/SortedMerge.h (limited to 'include') diff --git a/include/query/wss.h b/include/query/wss.h index ea36cb2..8797035 100644 --- a/include/query/wss.h +++ b/include/query/wss.h @@ -60,7 +60,7 @@ public: return res; } - static void* get_buffer_query_state(BufferState *buffer, void *parms) { + static void* get_buffer_query_state(BufferView *buffer, void *parms) { BufferState *state = new BufferState(); auto parameters = (Parms*) parms; if constexpr (Rejection) { diff --git a/include/shard/Alias.h b/include/shard/Alias.h index a234575..f0d1d59 100644 --- a/include/shard/Alias.h +++ b/include/shard/Alias.h @@ -14,20 +14,19 @@ #pragma once #include -#include #include "framework/ShardRequirements.h" -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" #include "psu-ds/Alias.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" +#include "util/SortedMerge.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; using psudb::PriorityQueue; using psudb::queue_record; +using psudb::byte; namespace de { @@ -41,126 +40,73 @@ private: typedef decltype(R::weight) W; public: - Alias(BufferView* buffer) - : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_alias(nullptr), m_bf(nullptr) { - - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); - - size_t offset = 0; - m_reccnt = 0; - auto base = buffer->get_data(); - auto stop = base + buffer->get_record_count(); - - std::sort(base, stop, std::less>()); - - std::vector weights; - - while (base < stop) { - if (!(base->is_tombstone()) && (base + 1) < 
stop) { - if (base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - wss_cancelations++; - continue; - } - } else if (base->is_deleted()) { - base += 1; - continue; - } + Alias(BufferView buffer) + : m_data(nullptr) + , m_alias(nullptr) + , m_total_weight(0) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_alloc_size(0) + , m_bf(new BloomFilter(BF_FPR, buffer.get_tombstone_count(), BF_HASH_FUNCS)) { + + + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + buffer.get_record_count() * + sizeof(Wrapped), + (byte**) &m_data); + + auto res = sorted_array_from_bufferview(std::move(buffer), m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; - // FIXME: this shouldn't be necessary, but the tagged record - // bypass doesn't seem to be working on this code-path, so this - // ensures that tagged records from the buffer are able to be - // dropped, eventually. It should only need to be &= 1 - base->header &= 3; - m_data[m_reccnt++] = *base; - m_total_weight+= base->rec.weight; - weights.push_back(base->rec.weight); - - if (m_bf && base->is_tombstone()) { - m_tombstone_cnt++; - m_bf->insert(base->rec); + if (m_reccnt > 0) { + std::vector weights; + for (size_t i=0; i 0) { build_alias_structure(weights); } } Alias(std::vector &shards) - : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_alias(nullptr), m_bf(nullptr) { - std::vector>> cursors; - cursors.reserve(shards.size()); - - PriorityQueue> pq(shards.size()); + : m_data(nullptr) + , m_alias(nullptr) + , m_total_weight(0) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_alloc_size(0) + , m_bf(nullptr) { size_t attemp_reccnt = 0; size_t tombstone_count = 0; - - for (size_t i = 0; i < shards.size(); ++i) { - if (shards[i]) { - auto base = shards[i]->get_data(); - cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); - attemp_reccnt += shards[i]->get_record_count(); - tombstone_count += shards[i]->get_tombstone_count(); - pq.push(cursors[i].ptr, i); - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - } + auto cursors = build_cursor_vec(shards, &attemp_reccnt, &tombstone_count); m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + attemp_reccnt * sizeof(Wrapped), + (byte **) &m_data); - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - std::vector weights; - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? 
pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[now.version]; - auto& cursor2 = cursors[next.version]; - if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[now.version]; - if (!cursor.ptr->is_deleted()) { - m_data[m_reccnt++] = *cursor.ptr; - m_total_weight += cursor.ptr->rec.weight; - weights.push_back(cursor.ptr->rec.weight); - if (m_bf && cursor.ptr->is_tombstone()) { - ++m_tombstone_cnt; - if (m_bf) m_bf->insert(cursor.ptr->rec); - } - } - pq.pop(); - - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); - } - } + auto res = sorted_array_merge(cursors, m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; if (m_reccnt > 0) { + std::vector weights; + for (size_t i=0; i *point_lookup(const R &rec, bool filter=false) { @@ -173,7 +119,7 @@ public: return nullptr; } - while (idx < m_reccnt && m_data[idx].rec < rec) ++idx; + while (idx < (m_reccnt-1) && m_data[idx].rec < rec) ++idx; if (m_data[idx].rec == rec) { return m_data + idx; @@ -205,7 +151,7 @@ public: } size_t get_aux_memory_usage() { - return 0; + return (m_bf) ? m_bf->memory_usage() : 0; } W get_total_weight() { @@ -254,7 +200,6 @@ private: W m_total_weight; size_t m_reccnt; size_t m_tombstone_cnt; - size_t m_group_size; size_t m_alloc_size; BloomFilter *m_bf; }; diff --git a/include/shard/AugBTree.h b/include/shard/AugBTree.h index be664ac..58bd098 100644 --- a/include/shard/AugBTree.h +++ b/include/shard/AugBTree.h @@ -16,28 +16,21 @@ #include #include -#include -#include -#include #include "framework/ShardRequirements.h" -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" #include "psu-ds/Alias.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" +#include "util/SortedMerge.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; -using psudb::PriorityQueue; -using psudb::queue_record; using psudb::Alias; +using psudb::byte; namespace de { -thread_local size_t wirs_cancelations = 0; - template struct AugBTreeNode { struct AugBTreeNode *left, *right; @@ -54,108 +47,52 @@ private: typedef decltype(R::weight) W; public: - AugBTree(MutableBuffer* buffer) - : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_root(nullptr) { - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - m_bf = new BloomFilter(BF_FPR, buffer->get_tombstone_count(), BF_HASH_FUNCS); - - size_t offset = 0; - m_reccnt = 0; - auto base = buffer->get_data(); - auto stop = base + buffer->get_record_count(); - - std::sort(base, stop, std::less>()); - - while (base < stop) { - if (!(base->is_tombstone()) && (base + 1) < stop) { - if (base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - wirs_cancelations++; - continue; - } - } else if (base->is_deleted()) { - base += 1; - continue; - } - - // FIXME: this shouldn't be necessary, but the tagged record - // bypass doesn't seem to be working on this code-path, so this - // ensures that tagged records from the buffer are able to be - // dropped, eventually. 
It should only need to be &= 1 - base->header &= 3; - m_data[m_reccnt++] = *base; - m_total_weight+= base->rec.weight; - - if (m_bf && base->is_tombstone()) { - m_tombstone_cnt++; - m_bf->insert(base->rec); - } - - base++; - } + AugBTree(BufferView buffer) + : m_data(nullptr) + , m_root(nullptr) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_group_size(0) + , m_alloc_size(0) + , m_node_cnt(0) + , m_bf(new BloomFilter(BF_FPR, buffer.get_tombstone_count(), BF_HASH_FUNCS)) + { + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + buffer.get_record_count() * + sizeof(Wrapped), + (byte**) &m_data); + + auto res = sorted_array_from_bufferview(std::move(buffer), m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; if (m_reccnt > 0) { build_wirs_structure(); } } - AugBTree(AugBTree** shards, size_t len) - : m_reccnt(0), m_tombstone_cnt(0), m_total_weight(0), m_root(nullptr) { - std::vector>> cursors; - cursors.reserve(len); - - PriorityQueue> pq(len); - + AugBTree(std::vector shards) + : m_data(nullptr) + , m_root(nullptr) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_group_size(0) + , m_alloc_size(0) + , m_node_cnt(0) + , m_bf(nullptr) + { size_t attemp_reccnt = 0; size_t tombstone_count = 0; - - for (size_t i = 0; i < len; ++i) { - if (shards[i]) { - auto base = shards[i]->get_data(); - cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); - attemp_reccnt += shards[i]->get_record_count(); - tombstone_count += shards[i]->get_tombstone_count(); - pq.push(cursors[i].ptr, i); - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - } + auto cursors = build_cursor_vec(shards, &attemp_reccnt, &tombstone_count); m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + attemp_reccnt * sizeof(Wrapped), + (byte **) &m_data); - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[now.version]; - auto& cursor2 = cursors[next.version]; - if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[now.version]; - if (!cursor.ptr->is_deleted()) { - m_data[m_reccnt++] = *cursor.ptr; - m_total_weight += cursor.ptr->rec.weight; - if (m_bf && cursor.ptr->is_tombstone()) { - ++m_tombstone_cnt; - if (m_bf) m_bf->insert(cursor.ptr->rec); - } - } - pq.pop(); - - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); - } - } + auto res = sorted_array_merge(cursors, m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; if (m_reccnt > 0) { build_wirs_structure(); @@ -163,13 +100,12 @@ public: } ~AugBTree() { - if (m_data) free(m_data); + free(m_data); for (size_t i=0; i>); } size_t get_aux_memory_usage() { - return 0; + return (m_bf) ? 
m_bf->memory_usage() : 0; } size_t get_lower_bound(const K& key) const { @@ -364,7 +299,6 @@ private: Wrapped* m_data; std::vector m_alias; AugBTreeNode* m_root; - W m_total_weight; size_t m_reccnt; size_t m_tombstone_cnt; size_t m_group_size; diff --git a/include/shard/ISAMTree.h b/include/shard/ISAMTree.h index 7de9cb1..33ba82f 100644 --- a/include/shard/ISAMTree.h +++ b/include/shard/ISAMTree.h @@ -17,9 +17,8 @@ #include "framework/ShardRequirements.h" #include "util/bf_config.h" -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" -#include "psu-util/timer.h" +#include "psu-ds/BloomFilter.h" +#include "util/SortedMerge.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; @@ -61,60 +60,18 @@ public: , m_alloc_size(0) , m_data(nullptr) { - TIMER_INIT(); - m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, buffer.get_record_count() * sizeof(Wrapped), (byte**) &m_data); - TIMER_START(); - auto temp_buffer = (Wrapped *) psudb::sf_aligned_calloc(CACHELINE_SIZE, buffer.get_record_count(), sizeof(Wrapped)); - buffer.copy_to_buffer((byte *) temp_buffer); - - auto base = temp_buffer; - auto stop = base + buffer.get_record_count(); - std::sort(base, stop, std::less>()); - TIMER_STOP(); - - auto sort_time = TIMER_RESULT(); - - TIMER_START(); - while (base < stop) { - if (!base->is_tombstone() && (base + 1 < stop) - && base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - continue; - } else if (base->is_deleted()) { - base += 1; - continue; - } - - // FIXME: this shouldn't be necessary, but the tagged record - // bypass doesn't seem to be working on this code-path, so this - // ensures that tagged records from the buffer are able to be - // dropped, eventually. It should only need to be &= 1 - base->header &= 3; - m_data[m_reccnt++] = *base; - if (m_bf && base->is_tombstone()) { - ++m_tombstone_cnt; - m_bf->insert(base->rec); - } - - base++; - } - - TIMER_STOP(); - auto copy_time = TIMER_RESULT(); + auto res = sorted_array_from_bufferview(std::move(buffer), m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; - TIMER_START(); if (m_reccnt > 0) { build_internal_levels(); } - TIMER_STOP(); - auto level_time = TIMER_RESULT(); - - free(temp_buffer); } ISAMTree(std::vector &shards) @@ -128,58 +85,18 @@ public: , m_alloc_size(0) , m_data(nullptr) { - std::vector>> cursors; - cursors.reserve(shards.size()); - - PriorityQueue> pq(shards.size()); - size_t attemp_reccnt = 0; size_t tombstone_count = 0; - - for (size_t i = 0; i < shards.size(); ++i) { - if (shards[i]) { - auto base = shards[i]->get_data(); - cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); - attemp_reccnt += shards[i]->get_record_count(); - tombstone_count += shards[i]->get_tombstone_count(); - pq.push(cursors[i].ptr, i); - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - } + auto cursors = build_cursor_vec(shards, &attemp_reccnt, &tombstone_count); m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, attemp_reccnt * sizeof(Wrapped), (byte **) &m_data); - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? 
pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[now.version]; - auto& cursor2 = cursors[next.version]; - if (advance_cursor(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[now.version]; - if (!cursor.ptr->is_deleted()) { - m_data[m_reccnt++] = *cursor.ptr; - if (cursor.ptr->is_tombstone()) { - //fprintf(stderr, "ISAM: Tombstone from shard %ld next record from shard %ld\n", - //now.version, next.version); - ++m_tombstone_cnt; - m_bf->insert(cursor.ptr->rec); - } - } - pq.pop(); - - if (advance_cursor(cursor)) pq.push(cursor.ptr, now.version); - } - } + auto res = sorted_array_merge(cursors, m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; if (m_reccnt > 0) { build_internal_levels(); @@ -225,11 +142,11 @@ public: size_t get_memory_usage() { - return m_alloc_size; + return m_alloc_size + m_internal_node_cnt * NODE_SZ; } size_t get_aux_memory_usage() { - return m_bf->memory_usage(); + return (m_bf) ? m_bf->memory_usage() : 0; } /* SortedShardInterface methods */ diff --git a/include/shard/PGM.h b/include/shard/PGM.h index 13db26a..8031870 100644 --- a/include/shard/PGM.h +++ b/include/shard/PGM.h @@ -14,24 +14,19 @@ #include -#include -#include -#include -#include #include "framework/ShardRequirements.h" #include "pgm/pgm_index.hpp" -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" #include "psu-ds/BloomFilter.h" +#include "util/SortedMerge.h" #include "util/bf_config.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; using psudb::PriorityQueue; using psudb::queue_record; -using psudb::Alias; +using psudb::byte; namespace de { @@ -41,111 +36,65 @@ private: typedef decltype(R::key) K; typedef decltype(R::value) V; - public: - PGM(MutableBuffer* buffer) - : m_reccnt(0), m_tombstone_cnt(0) { - - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - std::vector keys; - - size_t offset = 0; - m_reccnt = 0; - auto base = buffer->get_data(); - auto stop = base + buffer->get_record_count(); + PGM(BufferView buffer) + : m_data(nullptr) + , m_bf(new BloomFilter(BF_FPR, buffer.get_tombstone_count(), BF_HASH_FUNCS)) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_alloc_size(0) { + + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + buffer.get_record_count() * + sizeof(Wrapped), + (byte**) &m_data); + auto res = sorted_array_from_bufferview(std::move(buffer), m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; - std::sort(base, stop, std::less>()); - - K min_key = base->rec.key; - K max_key = (stop - 1)->rec.key; - - while (base < stop) { - if (!(base->is_tombstone()) && (base + 1) < stop) { - if (base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - continue; - } - } else if (base->is_deleted()) { - base += 1; - continue; + if (m_reccnt > 0) { + std::vector keys; + for (size_t i=0; iheader &= 3; - m_data[m_reccnt++] = *base; - keys.emplace_back(base->rec.key); - base++; - } - - if (m_reccnt > 0) { m_pgm = pgm::PGMIndex(keys); } } - PGM(PGM** shards, size_t len) - : m_reccnt(0), m_tombstone_cnt(0) { - std::vector>> 
cursors; - cursors.reserve(len); - - PriorityQueue> pq(len); - + PGM(std::vector shards) + : m_data(nullptr) + , m_bf(nullptr) + , m_reccnt(0) + , m_tombstone_cnt(0) + , m_alloc_size(0) { + size_t attemp_reccnt = 0; size_t tombstone_count = 0; - - for (size_t i = 0; i < len; ++i) { - if (shards[i]) { - auto base = shards[i]->get_data(); - cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); - attemp_reccnt += shards[i]->get_record_count(); - tombstone_count += shards[i]->get_tombstone_count(); - pq.push(cursors[i].ptr, i); - - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - } + auto cursors = build_cursor_vec(shards, &attemp_reccnt, &tombstone_count); - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - - std::vector keys; - - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[now.version]; - auto& cursor2 = cursors[next.version]; - if (advance_cursor>(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor>(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[now.version]; - if (!cursor.ptr->is_deleted()) { - m_data[m_reccnt++] = *cursor.ptr; - keys.emplace_back(cursor.ptr->rec.key); - } - pq.pop(); - - if (advance_cursor>(cursor)) pq.push(cursor.ptr, now.version); - } - } + m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + attemp_reccnt * sizeof(Wrapped), + (byte **) &m_data); + + auto res = sorted_array_merge(cursors, m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; if (m_reccnt > 0) { + std::vector keys; + for (size_t i=0; i(keys); } } ~PGM() { - if (m_data) free(m_data); + free(m_data); + delete m_bf; } Wrapped *point_lookup(const R &rec, bool filter=false) { @@ -186,7 +135,7 @@ public: } size_t get_aux_memory_usage() { - return 0; + return (m_bf) ? 
m_bf->memory_usage() : 0; } size_t get_lower_bound(const K& key) const { @@ -228,6 +177,7 @@ public: private: Wrapped* m_data; + BloomFilter *m_bf; size_t m_reccnt; size_t m_tombstone_cnt; size_t m_alloc_size; diff --git a/include/shard/TrieSpline.h b/include/shard/TrieSpline.h index 9473177..f9fb3cb 100644 --- a/include/shard/TrieSpline.h +++ b/include/shard/TrieSpline.h @@ -15,11 +15,9 @@ #include "framework/ShardRequirements.h" #include "ts/builder.h" -#include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" #include "psu-ds/BloomFilter.h" #include "util/bf_config.h" -#include "psu-util/timer.h" +#include "util/SortedMerge.h" using psudb::CACHELINE_SIZE; using psudb::BloomFilter; @@ -45,78 +43,26 @@ public: , m_min_key(0) , m_bf(new BloomFilter(BF_FPR, buffer.get_tombstone_count(), BF_HASH_FUNCS)) { - TIMER_INIT(); - m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, buffer.get_record_count() * sizeof(Wrapped), (byte**) &m_data); - TIMER_START(); - auto temp_buffer = (Wrapped *) psudb::sf_aligned_calloc(CACHELINE_SIZE, buffer.get_record_count(), sizeof(Wrapped)); - buffer.copy_to_buffer((byte *) temp_buffer); - - auto base = temp_buffer; - auto stop = base + buffer.get_record_count(); - std::sort(base, stop, std::less>()); - - K min_key = base->rec.key; - K max_key = (stop-1)->rec.key; - TIMER_STOP(); - - auto sort_time = TIMER_RESULT(); - - TIMER_START(); - auto bldr = ts::Builder(min_key, max_key, E); - while (base < stop) { - if (!base->is_tombstone() && (base + 1 < stop) - && base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { - base += 2; - continue; - } else if (base->is_deleted()) { - base += 1; - continue; - } + auto res = sorted_array_from_bufferview(std::move(buffer), m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; - // FIXME: this shouldn't be necessary, but the tagged record - // bypass doesn't seem to be working on this code-path, so this - // ensures that tagged records from the buffer are able to be - // dropped, eventually. It should only need to be &= 1 - base->header &= 3; - m_data[m_reccnt++] = *base; - bldr.AddKey(base->rec.key); - if (m_bf && base->is_tombstone()) { - ++m_tombstone_cnt; - m_bf->insert(base->rec); - } + if (m_reccnt > 0) { + m_min_key = m_data[0].rec.key; + m_max_key = m_data[m_reccnt-1].rec.key; - /* - * determine the "true" min/max keys based on the scan. This is - * to avoid situations where the min/max in the input array - * are deleted and don't survive into the structure itself. - */ - if (m_reccnt == 0) { - m_max_key = m_min_key = base->rec.key; - } else if (base->rec.key > m_max_key) { - m_max_key = base->rec.key; - } else if (base->rec.key < m_min_key) { - m_min_key = base->rec.key; + auto bldr = ts::Builder(m_min_key, m_max_key, E); + for (size_t i=0; i 0) { m_ts = bldr.Finalize(); } - TIMER_STOP(); - auto level_time = TIMER_RESULT(); - - free(temp_buffer); } TrieSpline(std::vector &shards) @@ -128,77 +74,28 @@ public: , m_min_key(0) , m_bf(nullptr) { - - std::vector>> cursors; - cursors.reserve(shards.size()); - - PriorityQueue> pq(shards.size()); - size_t attemp_reccnt = 0; size_t tombstone_count = 0; - - /* - * Initialize m_max_key and m_min_key using the values from the - * first shard. These will later be updated when building - * the initial priority queue to their true values. 
- */ - m_max_key = shards[0]->m_max_key; - m_min_key = shards[0]->m_min_key; + auto cursors = build_cursor_vec(shards, &attemp_reccnt, &tombstone_count); - for (size_t i = 0; i < shards.size(); ++i) { - if (shards[i]) { - auto base = shards[i]->get_data(); - cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); - attemp_reccnt += shards[i]->get_record_count(); - tombstone_count += shards[i]->get_tombstone_count(); - pq.push(cursors[i].ptr, i); - - if (shards[i]->m_max_key > m_max_key) { - m_max_key = shards[i]->m_max_key; - } - - if (shards[i]->m_min_key < m_min_key) { - m_min_key = shards[i]->m_min_key; - } - } else { - cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); - } - } - m_bf = new BloomFilter(BF_FPR, tombstone_count, BF_HASH_FUNCS); m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, attemp_reccnt * sizeof(Wrapped), (byte **) &m_data); - auto bldr = ts::Builder(m_min_key, m_max_key, E); - while (pq.size()) { - auto now = pq.peek(); - auto next = pq.size() > 1 ? pq.peek(1) : queue_record>{nullptr, 0}; - if (!now.data->is_tombstone() && next.data != nullptr && - now.data->rec == next.data->rec && next.data->is_tombstone()) { - - pq.pop(); pq.pop(); - auto& cursor1 = cursors[now.version]; - auto& cursor2 = cursors[next.version]; - if (advance_cursor(cursor1)) pq.push(cursor1.ptr, now.version); - if (advance_cursor(cursor2)) pq.push(cursor2.ptr, next.version); - } else { - auto& cursor = cursors[now.version]; - if (!cursor.ptr->is_deleted()) { - m_data[m_reccnt++] = *cursor.ptr; - bldr.AddKey(cursor.ptr->rec.key); - if (cursor.ptr->is_tombstone()) { - ++m_tombstone_cnt; - m_bf->insert(cursor.ptr->rec); - } - } - pq.pop(); - - if (advance_cursor(cursor)) pq.push(cursor.ptr, now.version); - } - } + auto res = sorted_array_merge(cursors, m_data, m_bf); + m_reccnt = res.record_count; + m_tombstone_cnt = res.tombstone_count; if (m_reccnt > 0) { + m_min_key = m_data[0].rec.key; + m_max_key = m_data[m_reccnt-1].rec.key; + + auto bldr = ts::Builder(m_min_key, m_max_key, E); + for (size_t i=0; imemory_usage() : 0; } size_t get_lower_bound(const K& key) const { diff --git a/include/util/SortedMerge.h b/include/util/SortedMerge.h new file mode 100644 index 0000000..ed47acb --- /dev/null +++ b/include/util/SortedMerge.h @@ -0,0 +1,185 @@ +/* + * include/util/SortedMerge.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * Dong Xie + * + * Distributed under the Modified BSD License. + * + * A sorted array merge routine for use in Shard construction, as many + * shards will use a sorted array to represent their data. Also encapsulates + * the necessary tombstone-cancellation logic. + * + * FIXME: include generic per-record processing functionality for Shards that + * need it, to avoid needing to reprocess the array in the shard after + * creation. 
+ */ +#pragma once + +#include "util/Cursor.h" +#include "framework/interface/Shard.h" +#include "psu-ds/PriorityQueue.h" + +namespace de { + +using psudb::PriorityQueue; +using psudb::BloomFilter; +using psudb::queue_record; +using psudb::byte; +using psudb::CACHELINE_SIZE; + +struct merge_info { + size_t record_count; + size_t tombstone_count; +}; + + +template S> +static std::vector>> build_cursor_vec(std::vector &shards, size_t *reccnt, size_t *tscnt) { + std::vector>> cursors; + cursors.reserve(shards.size()); + + *reccnt = 0; + *tscnt = 0; + + for (size_t i = 0; i < shards.size(); ++i) { + if (shards[i]) { + auto base = shards[i]->get_data(); + cursors.emplace_back(Cursor{base, base + shards[i]->get_record_count(), 0, shards[i]->get_record_count()}); + *reccnt += shards[i]->get_record_count(); + *tscnt += shards[i]->get_tombstone_count(); + } else { + cursors.emplace_back(Cursor>{nullptr, nullptr, 0, 0}); + } + } + + return cursors; +} + +/* + * + */ +template +static merge_info sorted_array_from_bufferview(BufferView bv, + Wrapped *buffer, + psudb::BloomFilter *bf=nullptr) { + /* + * Copy the contents of the buffer view into a temporary buffer, and + * sort them. We still need to iterate over these temporary records to + * apply tombstone/deleted record filtering, as well as any possible + * per-record processing that is required by the shard being built. + */ + auto temp_buffer = (Wrapped *) psudb::sf_aligned_calloc(CACHELINE_SIZE, + bv.get_record_count(), + sizeof(Wrapped)); + bv.copy_to_buffer((byte *) temp_buffer); + + auto base = temp_buffer; + auto stop = base + bv.get_record_count(); + std::sort(base, stop, std::less>()); + + merge_info info = {0, 0}; + + /* + * Iterate over the temporary buffer to process the records, copying + * them into buffer as needed + */ + while (base < stop) { + if (!base->is_tombstone() && (base + 1 < stop) + && base->rec == (base + 1)->rec && (base + 1)->is_tombstone()) { + base += 2; + continue; + } else if (base->is_deleted()) { + base += 1; + continue; + } + + // fixme: this shouldn't be necessary, but the tagged record + // bypass doesn't seem to be working on this code-path, so this + // ensures that tagged records from the buffer are able to be + // dropped, eventually. it should only need to be &= 1 + base->header &= 3; + buffer[info.record_count++] = *base; + + if (base->is_tombstone()) { + info.tombstone_count++; + if (bf){ + bf->insert(base->rec); + } + } + + base++; + } + + free(temp_buffer); + return info; +} + +/* + * Perform a sorted merge of the records within cursors into the provided + * buffer. Includes tombstone and tagged delete cancellation logic, and + * will insert tombstones into a bloom filter, if one is provided. + * + * The behavior of this function is undefined if the provided buffer does + * not have space to contain all of the records within the input cursors. + */ +template +static merge_info sorted_array_merge(std::vector>> &cursors, + Wrapped *buffer, + psudb::BloomFilter *bf=nullptr) { + + // FIXME: For smaller cursor arrays, it may be more efficient to skip + // the priority queue and just do a scan. + PriorityQueue> pq(cursors.size()); + for (size_t i=0; i 1 ? pq.peek(1) : queue_record>{nullptr, 0}; + /* + * if the current record is not a tombstone, and the next record is + * a tombstone that matches the current one, then the current one + * has been deleted, and both it and its tombstone can be skipped + * over. 
+ */ + if (!now.data->is_tombstone() && next.data != nullptr && + now.data->rec == next.data->rec && next.data->is_tombstone()) { + + pq.pop(); pq.pop(); + auto& cursor1 = cursors[now.version]; + auto& cursor2 = cursors[next.version]; + if (advance_cursor(cursor1)) pq.push(cursor1.ptr, now.version); + if (advance_cursor(cursor2)) pq.push(cursor2.ptr, next.version); + } else { + auto& cursor = cursors[now.version]; + /* skip over records that have been deleted via tagging */ + if (!cursor.ptr->is_deleted()) { + buffer[info.record_count++] = *cursor.ptr; + + /* + * if the record is a tombstone, increment the ts count and + * insert it into the bloom filter if one has been + * provided. + */ + if (cursor.ptr->is_tombstone()) { + info.tombstone_count++; + if (bf) { + bf->insert(cursor.ptr->rec); + } + } + } + pq.pop(); + + if (advance_cursor(cursor)) pq.push(cursor.ptr, now.version); + } + } + + return info; +} + + + +} -- cgit v1.2.3 From ded1f979d101a5df37a65370f6c18803212edb66 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 8 Feb 2024 12:40:13 -0500 Subject: Fixed a slight synchronization bug in Epoch retirement "properly" --- include/framework/DynamicExtension.h | 37 ++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index d88a945..e7dd774 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -293,6 +293,12 @@ private: epoch_ptr old, new_ptr; do { + /* + * during an epoch transition, a nullptr will installed in the + * current_epoch. At this moment, the "new" current epoch will + * soon be installed, but the "current" current epoch has been + * moved back to m_previous_epoch. + */ if (m_current_epoch.load().epoch == nullptr) { old = m_previous_epoch; new_ptr = {old.epoch, old.refcnt+1}; @@ -308,6 +314,8 @@ private: } } while (true); + assert(new_ptr.refcnt > 0); + return new_ptr.epoch; } @@ -388,7 +396,6 @@ private: epoch_ptr old, new_ptr; new_ptr = {nullptr, 0}; - size_t i=0; do { old = m_previous_epoch.load(); @@ -397,9 +404,7 @@ private: break; } usleep(1); - i++; - if (i > 600) break; } while(true); delete epoch; @@ -656,9 +661,15 @@ private: do { if (m_previous_epoch.load().epoch == epoch) { old = m_previous_epoch; - if (old.refcnt <= 0) { - return; - } + /* + * This could happen if we get into the system during a + * transition. In this case, we can just back out and retry + */ + if (old.epoch == nullptr) { + continue; + } + + assert(old.refcnt > 0); new_ptr = {old.epoch, old.refcnt - 1}; if (m_previous_epoch.compare_exchange_strong(old, new_ptr)) { @@ -666,10 +677,16 @@ private: } } else { old = m_current_epoch; - if (old.refcnt <= 0) { - return; - } - //assert(old.refcnt > 0); + /* + * This could happen if we get into the system during a + * transition. In this case, we can just back out and retry + */ + if (old.epoch == nullptr) { + continue; + } + + assert(old.refcnt > 0); + new_ptr = {old.epoch, old.refcnt - 1}; if (m_current_epoch.compare_exchange_strong(old, new_ptr)) { break; -- cgit v1.2.3 From 0c4a80b90e1f25b42e00a8af57131040203d2f89 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 8 Feb 2024 13:10:29 -0500 Subject: Added compiler fence to block reordering I'm reasonably certain that this is a compiler bug... 
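For reference, the barrier added in this patch is the classic GCC/Clang empty-asm compiler fence: it emits no machine instruction, but the optimizer is forbidden from moving memory accesses across it. A minimal, self-contained sketch of the idiom follows; the function and variable names are illustrative stand-ins only, not part of this patch, and the std::atomic_signal_fence alternative is a standard-C++ suggestion rather than what the commit uses:

    #include <atomic>
    #include <cstddef>
    #include <cstdlib>

    int *allocate_and_fill(size_t n) {
        int *buf = (int *) malloc(n * sizeof(int));

        /* compiler-only barrier: emits no instruction, but the optimizer
         * may not reorder memory accesses across this point */
        asm volatile("" ::: "memory");

        /* portable C++11 equivalent for compiler-level ordering:
         * std::atomic_signal_fence(std::memory_order_seq_cst); */

        for (size_t i = 0; i < n; i++) {
            buf[i] = 0; /* these stores cannot be hoisted above the barrier */
        }
        return buf;
    }

Note that neither form is a hardware fence; cross-thread visibility would require std::atomic_thread_fence or an atomic operation.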
--- include/shard/ISAMTree.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/shard/ISAMTree.h b/include/shard/ISAMTree.h index 33ba82f..9458b1f 100644 --- a/include/shard/ISAMTree.h +++ b/include/shard/ISAMTree.h @@ -65,6 +65,12 @@ public: sizeof(Wrapped), (byte**) &m_data); + /* + * without this, gcc seems to hoist the building of the array + * _above_ its allocation under -O3, resulting in memfaults. + */ + asm volatile ("" ::: "memory"); + auto res = sorted_array_from_bufferview(std::move(buffer), m_data, m_bf); m_reccnt = res.record_count; m_tombstone_cnt = res.tombstone_count; -- cgit v1.2.3 From 711769574e647839677739192698e400529efe75 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 8 Feb 2024 16:38:44 -0500 Subject: Updated VPTree to new shard/query interfaces --- include/framework/interface/Record.h | 19 +++ include/framework/structure/BufferView.h | 3 - include/query/knn.h | 159 +++++++++++++++++ include/shard/VPTree.h | 282 ++++++------------------------- 4 files changed, 228 insertions(+), 235 deletions(-) create mode 100644 include/query/knn.h (limited to 'include') diff --git a/include/framework/interface/Record.h b/include/framework/interface/Record.h index 457078d..29df4b6 100644 --- a/include/framework/interface/Record.h +++ b/include/framework/interface/Record.h @@ -212,4 +212,23 @@ struct RecordHash { } }; +template +class DistCmpMax { +public: + DistCmpMax(R *baseline) : P(baseline) {} + + inline bool operator()(const R *a, const R *b) requires WrappedInterface { + return a->rec.calc_distance(P->rec) > b->rec.calc_distance(P->rec); + } + + inline bool operator()(const R *a, const R *b) requires (!WrappedInterface){ + return a->calc_distance(*P) > b->calc_distance(*P); + } + +private: + R *P; +}; + + + } diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index edf6707..4e3de25 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -123,7 +123,6 @@ public: Wrapped *get(size_t i) { assert(i < get_record_count()); - m_total += (m_data + to_idx(i))->rec.key; return m_data + to_idx(i); } @@ -159,8 +158,6 @@ private: psudb::BloomFilter *m_tombstone_filter; bool m_active; - size_t m_total; - size_t to_idx(size_t i) { size_t idx = (m_start + i >= m_cap) ? i = (m_cap - m_start) : m_start + i; diff --git a/include/query/knn.h b/include/query/knn.h new file mode 100644 index 0000000..19dcf5c --- /dev/null +++ b/include/query/knn.h @@ -0,0 +1,159 @@ +/* + * include/query/knn.h + * + * Copyright (C) 2023 Douglas B. Rumbaugh + * + * Distributed under the Modified BSD License. + * + * A query class for k-NN queries, designed for use with the VPTree + * shard. + * + * FIXME: no support for tombstone deletes just yet. This would require a + * query resumption mechanism, most likely. 
+ */ +#pragma once + +#include "framework/QueryRequirements.h" +#include "psu-ds/PriorityQueue.h" + +namespace de { namespace knn { + +using psudb::PriorityQueue; + +template +struct Parms { + R point; + size_t k; +}; + +template +struct State { + size_t k; +}; + +template +struct BufferState { + BufferView *buffer; + + BufferState(BufferView *buffer) + : buffer(buffer) {} +}; + +template S> +class Query { +public: + constexpr static bool EARLY_ABORT=false; + constexpr static bool SKIP_DELETE_FILTER=true; + + static void *get_query_state(S *shard, void *parms) { + return nullptr; + } + + static void* get_buffer_query_state(BufferView *buffer, void *parms) { + return new BufferState(buffer); + } + + static void process_query_states(void *query_parms, std::vector &shard_states, void* buffer_state) { + return; + } + + static std::vector> query(S *shard, void *q_state, void *parms) { + std::vector> results; + Parms *p = (Parms *) parms; + Wrapped wrec; + wrec.rec = p->point; + wrec.header = 0; + + PriorityQueue, DistCmpMax>> pq(p->k, &wrec); + + shard->search(p->point, p->k, pq); + + while (pq.size() > 0) { + results.emplace_back(*pq.peek().data); + pq.pop(); + } + + return results; + } + + static std::vector> buffer_query(void *state, void *parms) { + Parms *p = (Parms *) parms; + BufferState *s = (BufferState *) state; + Wrapped wrec; + wrec.rec = p->point; + wrec.header = 0; + + size_t k = p->k; + + PriorityQueue, DistCmpMax>> pq(k, &wrec); + for (size_t i=0; ibuffer->get_record_count(); i++) { + // Skip over deleted records (under tagging) + if (s->buffer->get(i)->is_deleted()) { + continue; + } + + if (pq.size() < k) { + pq.push(s->buffer->get(i)); + } else { + double head_dist = pq.peek().data->rec.calc_distance(wrec.rec); + double cur_dist = (s->buffer->get(i))->rec.calc_distance(wrec.rec); + + if (cur_dist < head_dist) { + pq.pop(); + pq.push(s->buffer->get(i)); + } + } + } + + std::vector> results; + while (pq.size() > 0) { + results.emplace_back(*(pq.peek().data)); + pq.pop(); + } + + return results; + } + + static std::vector merge(std::vector>> &results, void *parms) { + Parms *p = (Parms *) parms; + R rec = p->point; + size_t k = p->k; + + PriorityQueue> pq(k, &rec); + for (size_t i=0; icalc_distance(rec); + double cur_dist = results[i][j].rec.calc_distance(rec); + + if (cur_dist < head_dist) { + pq.pop(); + pq.push(&results[i][j].rec); + } + } + } + } + + std::vector output; + while (pq.size() > 0) { + output.emplace_back(*pq.peek().data); + pq.pop(); + } + + return output; + } + + static void delete_query_state(void *state) { + auto s = (State *) state; + delete s; + } + + static void delete_buffer_query_state(void *state) { + auto s = (BufferState *) state; + delete s; + } +}; + +}} diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h index 2f5ebbb..ba13a87 100644 --- a/include/shard/VPTree.h +++ b/include/shard/VPTree.h @@ -5,98 +5,27 @@ * * Distributed under the Modified BSD License. * - * A shard shim around the VPTree spatial index. + * A shard shim around a VPTree for high-dimensional metric similarity + * search. * - * FIXME: separate the KNN query class out into a standalone - * file in include/query . + * FIXME: Does not yet support the tombstone delete policy. 
* */ #pragma once #include -#include -#include -#include -#include -#include -#include -#include +#include #include "framework/ShardRequirements.h" - #include "psu-ds/PriorityQueue.h" -#include "util/Cursor.h" -#include "psu-ds/BloomFilter.h" -#include "util/bf_config.h" using psudb::CACHELINE_SIZE; -using psudb::BloomFilter; using psudb::PriorityQueue; using psudb::queue_record; -using psudb::Alias; +using psudb::byte; namespace de { -template -struct KNNQueryParms { - R point; - size_t k; -}; - -template -class KNNQuery; - -template -struct KNNState { - size_t k; - - KNNState() { - k = 0; - } -}; - -template -struct KNNBufferState { - -}; - - -template -class KNNDistCmpMax { -public: - KNNDistCmpMax(R *baseline) : P(baseline) {} - - inline bool operator()(const R *a, const R *b) requires WrappedInterface { - return a->rec.calc_distance(P->rec) > b->rec.calc_distance(P->rec); - } - - inline bool operator()(const R *a, const R *b) requires (!WrappedInterface){ - return a->calc_distance(*P) > b->calc_distance(*P); - } - -private: - R *P; -}; - -template -class KNNDistCmpMin { -public: - KNNDistCmpMin(R *baseline) : P(baseline) {} - - inline bool operator()(const R *a, const R *b) requires WrappedInterface { - return a->rec.calc_distance(P->rec) < b->rec.calc_distance(P->rec); - } - - inline bool operator()(const R *a, const R *b) requires (!WrappedInterface){ - return a->calc_distance(*P) < b->calc_distance(*P); - } - -private: - R *P; -}; - - - template class VPTree { private: @@ -117,16 +46,19 @@ private: } }; -public: - friend class KNNQuery; - VPTree(MutableBuffer* buffer) + +public: + VPTree(BufferView buffer) : m_reccnt(0), m_tombstone_cnt(0), m_root(nullptr), m_node_cnt(0) { - m_alloc_size = (buffer->get_record_count() * sizeof(Wrapped)) + (CACHELINE_SIZE - (buffer->get_record_count() * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); - m_ptrs = new Wrapped*[buffer->get_record_count()]; + + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + buffer.get_record_count() * + sizeof(Wrapped), + (byte**) &m_data); + + m_ptrs = new Wrapped*[buffer.get_record_count()]; size_t offset = 0; m_reccnt = 0; @@ -135,8 +67,8 @@ public: // this one will likely require the multi-pass // approach, as otherwise we'll need to sort the // records repeatedly on each reconstruction. - for (size_t i=0; iget_record_count(); i++) { - auto rec = buffer->get_data() + i; + for (size_t i=0; iis_deleted()) { continue; @@ -154,25 +86,24 @@ public: } } - VPTree(VPTree** shards, size_t len) + VPTree(std::vector shards) : m_reccnt(0), m_tombstone_cnt(0), m_root(nullptr), m_node_cnt(0) { size_t attemp_reccnt = 0; - - for (size_t i=0; iget_record_count(); } - - m_alloc_size = (attemp_reccnt * sizeof(Wrapped)) + (CACHELINE_SIZE - (attemp_reccnt * sizeof(Wrapped)) % CACHELINE_SIZE); - assert(m_alloc_size % CACHELINE_SIZE == 0); - m_data = (Wrapped*)std::aligned_alloc(CACHELINE_SIZE, m_alloc_size); + + m_alloc_size = psudb::sf_aligned_alloc(CACHELINE_SIZE, + attemp_reccnt * sizeof(Wrapped), + (byte **) &m_data); m_ptrs = new Wrapped*[attemp_reccnt]; // FIXME: will eventually need to figure out tombstones // this one will likely require the multi-pass // approach, as otherwise we'll need to sort the // records repeatedly on each reconstruction. 
- for (size_t i=0; iget_record_count(); j++) { if (shards[i]->get_record_at(j)->is_deleted()) { continue; @@ -191,9 +122,9 @@ public: } ~VPTree() { - if (m_data) free(m_data); - if (m_root) delete m_root; - if (m_ptrs) delete[] m_ptrs; + free(m_data); + delete m_root; + delete[] m_ptrs; } Wrapped *point_lookup(const R &rec, bool filter=false) { @@ -248,11 +179,27 @@ public: } size_t get_aux_memory_usage() { + // FIXME: need to return the size of the unordered_map return 0; } + void search(const R &point, size_t k, PriorityQueue, + DistCmpMax>> &pq) { + double farthest = std::numeric_limits::max(); + + internal_search(m_root, point, k, pq, &farthest); + } private: + Wrapped* m_data; + Wrapped** m_ptrs; + std::unordered_map> m_lookup_map; + size_t m_reccnt; + size_t m_tombstone_cnt; + size_t m_node_cnt; + size_t m_alloc_size; + + vpnode *m_root; vpnode *build_vptree() { if (m_reccnt == 0) { @@ -332,7 +279,6 @@ private: return node; } - void quickselect(size_t start, size_t stop, size_t k, Wrapped *p, gsl_rng *rng) { if (start == stop) return; @@ -345,7 +291,6 @@ private: } } - size_t partition(size_t start, size_t stop, Wrapped *p, gsl_rng *rng) { auto pivot = start + gsl_rng_uniform_int(rng, stop - start); double pivot_dist = p->rec.calc_distance(m_ptrs[pivot]->rec); @@ -364,15 +309,15 @@ private: return j; } - void swap(size_t idx1, size_t idx2) { auto tmp = m_ptrs[idx1]; m_ptrs[idx1] = m_ptrs[idx2]; m_ptrs[idx2] = tmp; } + void internal_search(vpnode *node, const R &point, size_t k, PriorityQueue, + DistCmpMax>> &pq, double *farthest) { - void search(vpnode *node, const R &point, size_t k, PriorityQueue, KNNDistCmpMax>> &pq, double *farthest) { if (node == nullptr) return; if (node->leaf) { @@ -408,151 +353,24 @@ private: if (d < node->radius) { if (d - (*farthest) <= node->radius) { - search(node->inside, point, k, pq, farthest); + internal_search(node->inside, point, k, pq, farthest); } if (d + (*farthest) >= node->radius) { - search(node->outside, point, k, pq, farthest); + internal_search(node->outside, point, k, pq, farthest); } } else { if (d + (*farthest) >= node->radius) { - search(node->outside, point, k, pq, farthest); + internal_search(node->outside, point, k, pq, farthest); } if (d - (*farthest) <= node->radius) { - search(node->inside, point, k, pq, farthest); + internal_search(node->inside, point, k, pq, farthest); } } } - Wrapped* m_data; - Wrapped** m_ptrs; - std::unordered_map> m_lookup_map; - size_t m_reccnt; - size_t m_tombstone_cnt; - size_t m_node_cnt; - size_t m_alloc_size; - - vpnode *m_root; -}; - - -template -class KNNQuery { -public: - constexpr static bool EARLY_ABORT=false; - constexpr static bool SKIP_DELETE_FILTER=true; - - static void *get_query_state(VPTree *wss, void *parms) { - return nullptr; - } - - static void* get_buffer_query_state(MutableBuffer *buffer, void *parms) { - return nullptr; - } - - static void process_query_states(void *query_parms, std::vector &shard_states, void *buff_state) { - return; - } - - static std::vector> query(VPTree *wss, void *q_state, void *parms) { - std::vector> results; - KNNQueryParms *p = (KNNQueryParms *) parms; - Wrapped wrec; - wrec.rec = p->point; - wrec.header = 0; - - PriorityQueue, KNNDistCmpMax>> pq(p->k, &wrec); - - double farthest = std::numeric_limits::max(); - - wss->search(wss->m_root, p->point, p->k, pq, &farthest); - - while (pq.size() > 0) { - results.emplace_back(*pq.peek().data); - pq.pop(); - } - - return results; - } - - static std::vector> buffer_query(MutableBuffer *buffer, void *state, void 
*parms) { - KNNQueryParms *p = (KNNQueryParms *) parms; - Wrapped wrec; - wrec.rec = p->point; - wrec.header = 0; - - size_t k = p->k; - - PriorityQueue, KNNDistCmpMax>> pq(k, &wrec); - for (size_t i=0; iget_record_count(); i++) { - // Skip over deleted records (under tagging) - if ((buffer->get_data())[i].is_deleted()) { - continue; - } - - if (pq.size() < k) { - pq.push(buffer->get_data() + i); - } else { - double head_dist = pq.peek().data->rec.calc_distance(wrec.rec); - double cur_dist = (buffer->get_data() + i)->rec.calc_distance(wrec.rec); - - if (cur_dist < head_dist) { - pq.pop(); - pq.push(buffer->get_data() + i); - } - } - } - - std::vector> results; - while (pq.size() > 0) { - results.emplace_back(*(pq.peek().data)); - pq.pop(); - } - - return results; - } - - static std::vector merge(std::vector>> &results, void *parms) { - KNNQueryParms *p = (KNNQueryParms *) parms; - R rec = p->point; - size_t k = p->k; - - PriorityQueue> pq(k, &rec); - for (size_t i=0; icalc_distance(rec); - double cur_dist = results[i][j].rec.calc_distance(rec); - if (cur_dist < head_dist) { - pq.pop(); - pq.push(&results[i][j].rec); - } - } - } - } - - std::vector output; - while (pq.size() > 0) { - output.emplace_back(*pq.peek().data); - pq.pop(); - } - - return output; - } - - static void delete_query_state(void *state) { - auto s = (KNNState *) state; - delete s; - } - - static void delete_buffer_query_state(void *state) { - auto s = (KNNBufferState *) state; - delete s; - } -}; + }; } -- cgit v1.2.3 From 402fc269c0aaa671d84a6d15918735ad4b90e6b2 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 9 Feb 2024 12:30:21 -0500 Subject: Comment updates/fixes --- include/framework/DynamicExtension.h | 73 +++++++++++++++++++++++- include/framework/interface/Query.h | 5 +- include/framework/interface/Record.h | 7 +-- include/framework/scheduling/Epoch.h | 7 --- include/framework/scheduling/FIFOScheduler.h | 9 +++ include/framework/scheduling/Task.h | 5 ++ include/framework/scheduling/statistics.h | 5 ++ include/framework/structure/BufferView.h | 1 + include/framework/structure/ExtensionStructure.h | 64 +++++++++++---------- include/framework/structure/InternalLevel.h | 6 ++ include/framework/util/Configuration.h | 37 ++++++------ include/query/irs.h | 1 - include/query/wirs.h | 1 - include/shard/Alias.h | 1 + include/shard/AugBTree.h | 2 + include/shard/ISAMTree.h | 1 + include/shard/PGM.h | 1 + include/shard/TrieSpline.h | 1 + include/shard/VPTree.h | 39 +++++++------ include/util/Cursor.h | 17 ++++-- include/util/SortedMerge.h | 30 ++++++++-- include/util/bf_config.h | 3 +- include/util/types.h | 10 ++-- 23 files changed, 223 insertions(+), 103 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index e7dd774..473592d 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "framework/interface/Scheduler.h" @@ -87,10 +86,34 @@ public: } } + /* + * Insert the record `rec` into the index. If the buffer is full and + * the framework is blocking on an epoch transition, this call may fail + * and return 0. In this case, retry the call again later. If + * successful, 1 will be returned. The record will be immediately + * visible in the buffer upon the successful return of this function. + */ int insert(const R &rec) { return internal_append(rec, false); } + /* + * Erase the record `rec` from the index. 
It is assumed that `rec` + * currently exists--no special checks are made for correctness here. + * The behavior if this function will differ depending on if tombstone + * or tagged deletes are used. + * + * Tombstone deletes - inserts a tombstone record for `rec`. This *may* + * return 0 and fail if the buffer is full and the framework is + * blocking on an epoch transition. In this case, repeat the call + * later. 1 will be returned when the tombstone is successfully + * inserted. + * + * Tagging deletes - Does a point lookup for the record across the + * entire structure, and sets its delete bit when found. Returns 1 if + * the record is found and marked, and 0 if it was not (i.e., if it + * isn't present in the index). + */ int erase(const R &rec) { // FIXME: delete tagging will require a lot of extra work to get // operating "correctly" in a concurrent environment. @@ -121,10 +144,23 @@ public: return internal_append(rec, true); } + /* + * Execute the query with parameters `parms` and return a future. This + * future can be used to access a vector containing the results of the + * query. + * + * The behavior of this function is undefined if `parms` is not a + * pointer to a valid query parameter object for the query type used as + * a template parameter to construct the framework. + */ std::future> query(void *parms) { return schedule_query(parms); } + /* + * Returns the number of records (included tagged records and + * tombstones) currently within the framework. + */ size_t get_record_count() { auto epoch = get_active_epoch(); auto t = epoch->get_buffer().get_record_count() + epoch->get_structure()->get_record_count(); @@ -133,6 +169,11 @@ public: return t; } + /* + * Returns the number of tombstone records currently within the + * framework. This function can be called when tagged deletes are used, + * but will always return 0 in that case. + */ size_t get_tombstone_count() { auto epoch = get_active_epoch(); auto t = epoch->get_buffer().get_tombstone_count() + epoch->get_structure()->get_tombstone_count(); @@ -141,6 +182,12 @@ public: return t; } + /* + * Get the number of levels within the framework. This count will + * include any empty levels, but will not include the buffer. Note that + * this is *not* the same as the number of shards when tiering is used, + * as each level can contain multiple shards in that case. + */ size_t get_height() { auto epoch = get_active_epoch(); auto t = epoch->get_structure()->get_height(); @@ -149,6 +196,13 @@ public: return t; } + /* + * Get the number of bytes of memory allocated across the framework for + * storing records and associated index information (i.e., internal + * ISAM tree nodes). This includes memory that is allocated but + * currently unused in the buffer, or in shards themselves + * (overallocation due to delete cancellation, etc.). + */ size_t get_memory_usage() { auto epoch = get_active_epoch(); auto t= epoch->get_buffer().get_memory_usage() + epoch->get_structure()->get_memory_usage(); @@ -157,6 +211,11 @@ public: return t; } + /* + * Get the number of bytes of memory allocated across the framework for + * auxiliary structures. This can include bloom filters, aux + * hashtables, etc. + */ size_t get_aux_memory_usage() { auto epoch = get_active_epoch(); auto t = epoch->get_buffer().get_aux_memory_usage() + epoch->get_structure()->get_aux_memory_usage(); @@ -165,10 +224,22 @@ public: return t; } + /* + * Returns the maximum physical capacity of the buffer, measured in + * records. 
+ */ size_t get_buffer_capacity() { return m_buffer->get_capacity(); } + /* + * Create a new single Shard object containing all of the records + * within the framework (buffer and shards). The optional parameter can + * be used to specify whether the Shard should be constructed with the + * currently active state of the framework (false), or if shard + * construction should wait until any ongoing reconstructions have + * finished and use that new version (true). + */ Shard *create_static_structure(bool await_reconstruction_completion=false) { if (await_reconstruction_completion) { await_next_epoch(); diff --git a/include/framework/interface/Query.h b/include/framework/interface/Query.h index 8cf9660..3d487f0 100644 --- a/include/framework/interface/Query.h +++ b/include/framework/interface/Query.h @@ -9,12 +9,9 @@ #pragma once #include "framework/QueryRequirements.h" -#include namespace de{ -// FIXME: The interface is not completely specified yet, as it is pending -// determining a good way to handle additional template arguments -// to get the Shard and Record types into play + template concept QueryInterface = requires(void *p, S *sh, std::vector &s, std::vector>> &rv, BufferView *bv) { {Q::get_query_state(sh, p)} -> std::convertible_to; diff --git a/include/framework/interface/Record.h b/include/framework/interface/Record.h index 29df4b6..5b9f307 100644 --- a/include/framework/interface/Record.h +++ b/include/framework/interface/Record.h @@ -138,7 +138,7 @@ struct CosinePoint{ return true; } - // lexicographic order + /* lexicographic order */ inline bool operator<(const CosinePoint& other) const { for (size_t i=0; irelease_reference(); } diff --git a/include/framework/scheduling/FIFOScheduler.h b/include/framework/scheduling/FIFOScheduler.h index c6baf9b..3ed4f49 100644 --- a/include/framework/scheduling/FIFOScheduler.h +++ b/include/framework/scheduling/FIFOScheduler.h @@ -5,6 +5,15 @@ * * Distributed under the Modified BSD License. * + * This scheduler runs just concurrently, using a standard FIFO queue to + * determine which jobs to run next. If more jobs are scheduled than there + * are available threads, the excess will stall until a thread becomes + * available and then run in the order they were received by the scheduler. + * + * TODO: We need to set up a custom threadpool based on jthreads to support + * thread preemption for a later phase of this project. That will allow us + * to avoid blocking epoch transitions on long-running queries, or to pause + * reconstructions on demand. */ #pragma once diff --git a/include/framework/scheduling/Task.h b/include/framework/scheduling/Task.h index 008f232..d5d4266 100644 --- a/include/framework/scheduling/Task.h +++ b/include/framework/scheduling/Task.h @@ -5,6 +5,11 @@ * * Distributed under the Modified BSD License. * + * An abstraction to represent a job to be scheduled. Currently the + * supported task types are queries and merges. Based on the current plan, + * simple buffer inserts will likely also be made into a task at some + * point. + * */ #pragma once diff --git a/include/framework/scheduling/statistics.h b/include/framework/scheduling/statistics.h index 50ba196..6c479cd 100644 --- a/include/framework/scheduling/statistics.h +++ b/include/framework/scheduling/statistics.h @@ -5,6 +5,11 @@ * * Distributed under the Modified BSD License. * + * This is a stub for a statistics tracker to be used in scheduling. 
It + * currently only tracks simple aggregated statistics, but should be + * updated in the future to track more fine-grained statistics. These will be + * used for making scheduling decisions and predicting the runtime of a + * given job. */ #pragma once diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 4e3de25..9e0872b 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -5,6 +5,7 @@ * * Distributed under the Modified BSD License. * + * TODO: This file is very poorly commented. */ #pragma once diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 373a1e2..4802bc1 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -37,19 +37,23 @@ public: ~ExtensionStructure() = default; /* - * Create a shallow copy of this extension structure. The copy will share references to the - * same levels/shards as the original, but will have its own lists. As all of the shards are - * immutable (with the exception of deletes), the copy can be restructured with reconstructions - * and flushes without affecting the original. The copied structure will be returned with a reference - * count of 0; generally you will want to immediately call take_reference() on it. + * Create a shallow copy of this extension structure. The copy will share + * references to the same levels/shards as the original, but will have its + * own lists. As all of the shards are immutable (with the exception of + * deletes), the copy can be restructured with reconstructions and flushes + * without affecting the original. The copied structure will be returned + * with a reference count of 0; generally you will want to immediately call + * take_reference() on it. * - * NOTE: When using tagged deletes, a delete of a record in the original structure will affect - * the copy, so long as the copy retains a reference to the same shard as the original. This could - * cause synchronization problems under tagging with concurrency. Any deletes in this context will + * NOTE: When using tagged deletes, a delete of a record in the original + * structure will affect the copy, so long as the copy retains a reference + * to the same shard as the original. This could cause synchronization + * problems under tagging with concurrency. Any deletes in this context will * need to be forwarded to the appropriate structures manually. */ ExtensionStructure *copy() { - auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, m_max_delete_prop); + auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, + m_max_delete_prop); for (size_t i=0; im_levels.push_back(m_levels[i]->clone()); } @@ -64,9 +68,9 @@ public: * setting the delete bit in its wrapped header. Returns 1 if a matching * record was found and deleted, and 0 if a matching record was not found. * - * This function will stop after finding the first matching record. It is assumed - * that no duplicate records exist. In the case of duplicates, this function will - * still "work", but in the sense of "delete first match". + * This function will stop after finding the first matching record. It is + * assumed that no duplicate records exist. In the case of duplicates, this + * function will still "work", but in the sense of "delete first match".
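+ *
+ * For example (editorial sketch, not part of this patch), a caller
+ * implementing a tagged erase is expected to fall back to checking the
+ * buffer when this function returns 0:
+ *
+ *     if (structure->tagged_delete(rec)) return 1;
+ *     return buffer->delete_record(rec);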
*/ int tagged_delete(const R &rec) { for (auto level : m_levels) { @@ -77,7 +81,7 @@ public: /* * If the record to be erased wasn't found, return 0. The - * DynamicExtension itself will then search the active + * DynamicExtension itself will then search the active * Buffers. */ return 0; @@ -164,21 +168,23 @@ public: } /* - * Validate that no level in the structure exceeds its maximum tombstone capacity. This is - * used to trigger preemptive compactions at the end of the reconstruction process. + * Validate that no level in the structure exceeds its maximum tombstone + * capacity. This is used to trigger preemptive compactions at the end of + * the reconstruction process. */ bool validate_tombstone_proportion() { - long double ts_prop; - for (size_t i=0; iget_tombstone_count() / (long double) calc_level_record_capacity(i); - if (ts_prop > (long double) m_max_delete_prop) { - return false; - } - } + long double ts_prop; + for (size_t i = 0; i < m_levels.size(); i++) { + if (m_levels[i]) { + ts_prop = (long double)m_levels[i]->get_tombstone_count() / + (long double)calc_level_record_capacity(i); + if (ts_prop > (long double)m_max_delete_prop) { + return false; + } } + } - return true; + return true; } bool validate_tombstone_proportion(level_index level) { @@ -224,14 +230,14 @@ public: /* * The amount of storage required for the reconstruction accounts * for the cost of storing the new records, along with the - * cost of retaining the old records during the process - * (hence the 2x multiplier). + * cost of retaining the old records during the process + * (hence the 2x multiplier). * - * FIXME: currently does not account for the *actual* size - * of the shards, only the storage for the records + * FIXME: currently does not account for the *actual* size + * of the shards, only the storage for the records * themselves. */ - size_t reccnt = m_levels[i-1]->get_record_count(); + size_t reccnt = m_levels[i - 1]->get_record_count(); if constexpr (L == LayoutPolicy::LEVELING) { if (can_reconstruct_with(i, reccnt)) { reccnt += m_levels[i]->get_record_count(); diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index d586869..db38946 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -6,6 +6,12 @@ * * Distributed under the Modified BSD License. * + * The word `Internal` in this class's name refers to memory. The current + * model, inherited from the framework in Practical Dynamic Extension for + * Sampling Indexes, would use a different ExternalLevel for shards stored + * on external storage. This is a distinction that can probably be avoided + * with some more thought being put into interface design. 
+ * */ #pragma once diff --git a/include/framework/util/Configuration.h b/include/framework/util/Configuration.h index 8e3d20f..65ca181 100644 --- a/include/framework/util/Configuration.h +++ b/include/framework/util/Configuration.h @@ -8,34 +8,29 @@ */ #pragma once -#include -#include -#include -#include - -#include "psu-util/timer.h" -#include "psu-ds/Alias.h" +#include +#include namespace de { -thread_local size_t sampling_attempts = 0; -thread_local size_t sampling_rejections = 0; -thread_local size_t deletion_rejections = 0; -thread_local size_t bounds_rejections = 0; -thread_local size_t tombstone_rejections = 0; -thread_local size_t buffer_rejections = 0; +static thread_local size_t sampling_attempts = 0; +static thread_local size_t sampling_rejections = 0; +static thread_local size_t deletion_rejections = 0; +static thread_local size_t bounds_rejections = 0; +static thread_local size_t tombstone_rejections = 0; +static thread_local size_t buffer_rejections = 0; /* * thread_local size_t various_sampling_times go here. */ -thread_local size_t sample_range_time = 0; -thread_local size_t alias_time = 0; -thread_local size_t alias_query_time = 0; -thread_local size_t rejection_check_time = 0; -thread_local size_t buffer_sample_time = 0; -thread_local size_t memlevel_sample_time = 0; -thread_local size_t disklevel_sample_time = 0; -thread_local size_t sampling_bailouts = 0; +static thread_local size_t sample_range_time = 0; +static thread_local size_t alias_time = 0; +static thread_local size_t alias_query_time = 0; +static thread_local size_t rejection_check_time = 0; +static thread_local size_t buffer_sample_time = 0; +static thread_local size_t memlevel_sample_time = 0; +static thread_local size_t disklevel_sample_time = 0; +static thread_local size_t sampling_bailouts = 0; enum class LayoutPolicy { diff --git a/include/query/irs.h b/include/query/irs.h index c14d0cf..e2d9325 100644 --- a/include/query/irs.h +++ b/include/query/irs.h @@ -8,7 +8,6 @@ * A query class for independent range sampling. This query requires * that the shard support get_lower_bound(key), get_upper_bound(key), * and get_record_at(index). - * */ #pragma once diff --git a/include/query/wirs.h b/include/query/wirs.h index 4fac7e7..ae82194 100644 --- a/include/query/wirs.h +++ b/include/query/wirs.h @@ -8,7 +8,6 @@ * A query class for weighted independent range sampling. This * class is tightly coupled with include/shard/AugBTree.h, and * so is probably of limited general utility. - * */ #pragma once diff --git a/include/shard/Alias.h b/include/shard/Alias.h index f0d1d59..9275952 100644 --- a/include/shard/Alias.h +++ b/include/shard/Alias.h @@ -10,6 +10,7 @@ * structure. Designed to be used alongside the WSS * query in include/query/wss.h * + * TODO: The code in this file is very poorly commented. */ #pragma once diff --git a/include/shard/AugBTree.h b/include/shard/AugBTree.h index 58bd098..54931bd 100644 --- a/include/shard/AugBTree.h +++ b/include/shard/AugBTree.h @@ -10,6 +10,8 @@ * used alongside the WIRS query in include/query/wirs.h, but * also supports the necessary methods for other common query * types. + * + * TODO: The code in this file is very poorly commented. */ #pragma once diff --git a/include/shard/ISAMTree.h b/include/shard/ISAMTree.h index 9458b1f..3763271 100644 --- a/include/shard/ISAMTree.h +++ b/include/shard/ISAMTree.h @@ -8,6 +8,7 @@ * * A shard shim around an in-memory ISAM tree. * + * TODO: The code in this file is very poorly commented.
*/ #pragma once diff --git a/include/shard/PGM.h b/include/shard/PGM.h index 8031870..e2752ef 100644 --- a/include/shard/PGM.h +++ b/include/shard/PGM.h @@ -9,6 +9,7 @@ * A shard shim around the static version of the PGM learned * index. * + * TODO: The code in this file is very poorly commented. */ #pragma once diff --git a/include/shard/TrieSpline.h b/include/shard/TrieSpline.h index f9fb3cb..2a432e8 100644 --- a/include/shard/TrieSpline.h +++ b/include/shard/TrieSpline.h @@ -7,6 +7,7 @@ * * A shard shim around the TrieSpline learned index. * + * TODO: The code in this file is very poorly commented. */ #pragma once diff --git a/include/shard/VPTree.h b/include/shard/VPTree.h index ba13a87..b342fe6 100644 --- a/include/shard/VPTree.h +++ b/include/shard/VPTree.h @@ -9,7 +9,7 @@ * search. * * FIXME: Does not yet support the tombstone delete policy. - * + * TODO: The code in this file is very poorly commented. */ #pragma once @@ -234,13 +234,15 @@ private: } vpnode *build_subtree(size_t start, size_t stop, gsl_rng *rng) { - // base-case: sometimes happens (probably because of the +1 and -1 - // in the first recursive call) + /* + * base-case: sometimes happens (probably because of the +1 and -1 + * in the first recursive call) + */ if (start > stop) { return nullptr; } - // base-case: create a leaf node + /* base-case: create a leaf node */ if (stop - start <= LEAFSZ) { vpnode *node = new vpnode(); node->start = start; @@ -251,26 +253,30 @@ private: return node; } - // select a random element to be the root of the - // subtree + /* + * select a random element to be the root of the + * subtree + */ auto i = start + gsl_rng_uniform_int(rng, stop - start + 1); swap(start, i); - // partition elements based on their distance from the start, - // with those elements with distance falling below the median - // distance going into the left sub-array and those above - // the median in the right. This is easily done using QuickSelect. + /* + * partition elements based on their distance from the start, + * with those elements with distance falling below the median + * distance going into the left sub-array and those above + * the median in the right. This is easily done using QuickSelect. + */ auto mid = (start + 1 + stop) / 2; quickselect(start + 1, stop, mid, m_ptrs[start], rng); - // Create a new node based on this partitioning + /* Create a new node based on this partitioning */ vpnode *node = new vpnode(); node->start = start; - // store the radius of the circle used for partitioning the node. + /* store the radius of the circle used for partitioning the node. */ node->radius = m_ptrs[start]->rec.calc_distance(m_ptrs[mid]->rec); - // recursively construct the left and right subtrees + /* recursively construct the left and right subtrees */ node->inside = build_subtree(start + 1, mid-1, rng); node->outside = build_subtree(mid, stop, rng); @@ -279,6 +285,8 @@ private: return node; } + // TODO: The quickselect code can probably be generalized and moved out + // to psudb-common instead. void quickselect(size_t start, size_t stop, size_t k, Wrapped *p, gsl_rng *rng) { if (start == stop) return; @@ -291,6 +299,8 @@ private: } } + // TODO: The quickselect code can probably be generalized and moved out + // to psudb-common instead. 
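+    //
+    // (Editorial note, not part of this patch: partition() below is the
+    // Lomuto-style partition step of that quickselect. It draws a random
+    // pivot, computes the pivot's distance from the reference point *p,
+    // swaps the pivot to the end of the range, sweeps the range moving
+    // records with smaller distances toward the front, and returns the
+    // pivot's final index.)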
size_t partition(size_t start, size_t stop, Wrapped *p, gsl_rng *rng) { auto pivot = start + gsl_rng_uniform_int(rng, stop - start); double pivot_dist = p->rec.calc_distance(m_ptrs[pivot]->rec); @@ -369,8 +379,5 @@ private: } } } - - }; - } diff --git a/include/util/Cursor.h b/include/util/Cursor.h index be7ab32..e8ba53d 100644 --- a/include/util/Cursor.h +++ b/include/util/Cursor.h @@ -7,15 +7,18 @@ * Distributed under the Modified BSD License. * * A simple record cursor type with associated methods for help in - * merging record sets when constructing shards. + * merging record sets when constructing shards. Iterates an array + * of records in order, and provides facilities to make sorted merges + * easier. + * + * TODO: Prior versions of this module included automatic support for + * working with data stored in PagedFiles as well. That should be + * reintroduced at some point. */ #pragma once -#include "framework/ShardRequirements.h" - -#include "psu-ds/BloomFilter.h" -#include "psu-ds/PriorityQueue.h" -#include "psu-util/alignment.h" +#include +#include namespace de { template @@ -64,6 +67,8 @@ template inline static Cursor *get_next(std::vector> &cursors, Cursor *current=nullptr) { const R *min_rec = nullptr; Cursor *result = nullptr; + // FIXME: for large cursor vectors, it may be worth it to use a + // PriorityQueue here instead of scanning. for (size_t i=0; i< cursors.size(); i++) { if (cursors[i] == (Cursor) {0} ) continue; diff --git a/include/util/SortedMerge.h b/include/util/SortedMerge.h index ed47acb..8a1e782 100644 --- a/include/util/SortedMerge.h +++ b/include/util/SortedMerge.h @@ -2,7 +2,6 @@ * include/util/SortedMerge.h * * Copyright (C) 2023 Douglas B. Rumbaugh - * Dong Xie * * Distributed under the Modified BSD License. * @@ -28,12 +27,26 @@ using psudb::queue_record; using psudb::byte; using psudb::CACHELINE_SIZE; +/* + * A simple struct to return record_count and tombstone_count information + * back to the caller. Could've been a std::pair, but I like the more + * explicit names. + */ struct merge_info { size_t record_count; size_t tombstone_count; }; - +/* + * Build a vector of cursors corresponding to the records contained within + * a vector of shards. The cursor at index i in the output will correspond + * to the shard at index i in the input. + * + * The values of reccnt and tscnt will be updated with the total number of + * records and tombstones, respectively, contained within the shards. Note + * that these counts include deleted records that may be removed during shard + * construction, and so constitute upper bounds only. + */ template S> static std::vector>> build_cursor_vec(std::vector &shards, size_t *reccnt, size_t *tscnt) { std::vector>> cursors; @@ -57,7 +70,14 @@ static std::vector>> build_cursor_vec(std::vector &shards, } /* - * + * Build a sorted array of records based on the contents of a BufferView. + * This routine does not alter the buffer view, but rather copies the + * records out and then sorts them. The provided buffer must be large + * enough to store the records from the BufferView, or the behavior of the + * function is undefined. + * + * It allocates a temporary buffer for the sorting, and execution of the + * program will be aborted if the allocation fails.
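+ *
+ * A hypothetical call site (editorial sketch, not part of this patch;
+ * the template arguments and the allocation scheme are assumed):
+ *
+ *     auto *out = new Wrapped<R>[bv.get_record_count()];
+ *     merge_info info = sorted_array_from_bufferview<R>(std::move(bv), out);
+ *     // out[0 .. info.record_count) now holds the sorted records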
 */ template static merge_info sorted_array_from_bufferview(BufferView bv, @@ -94,10 +114,10 @@ static merge_info sorted_array_from_bufferview(BufferView bv, continue; } - // fixme: this shouldn't be necessary, but the tagged record + // FIXME: this shouldn't be necessary, but the tagged record // bypass doesn't seem to be working on this code-path, so this // ensures that tagged records from the buffer are able to be - // dropped, eventually. it should only need to be &= 1 + // dropped, eventually. It should only need to be &= 1 base->header &= 3; buffer[info.record_count++] = *base; diff --git a/include/util/bf_config.h b/include/util/bf_config.h index fdf2195..9f29ed7 100644 --- a/include/util/bf_config.h +++ b/include/util/bf_config.h @@ -15,7 +15,7 @@ */ #pragma once -#include "psu-util/alignment.h" +#include namespace de { @@ -30,7 +30,6 @@ static size_t BF_HASH_FUNCS = 7; * (0, 1), or the behavior of bloom filters is undefined. */ static void BF_SET_FPR(double fpr) { - BF_FPR = fpr; } diff --git a/include/util/types.h b/include/util/types.h index 3908174..a13bd95 100644 --- a/include/util/types.h +++ b/include/util/types.h @@ -10,18 +10,18 @@ * that are defined within the header files that make direct use of them, * but all generally usable, simple types are defined here. * + * Many of these types were used in the Practical Dynamic Extension for + * Sampling Indexes work, particularly for external storage and buffer + * pool systems. They aren't used now, but we're leaving them here for + * future use, when this functionality is added to this system as well. */ #pragma once -#include #include -#include -#include +#include namespace de { -using std::byte; - /* Represents a page offset within a specific file (physical or virtual) */ typedef uint32_t PageNum; -- cgit v1.2.3 From aa1b40e9249afc03bf1a2f35de4cbf67c7f9b47e Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 9 Feb 2024 12:42:55 -0500 Subject: Framework: Fixed a bug where tagged deletes didn't release the epoch --- include/framework/DynamicExtension.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 473592d..238fc7f 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -124,12 +124,18 @@ public: * not *strictly* necessary. */ if constexpr (D == DeletePolicy::TAGGING) { - auto view = m_buffer->get_buffer_view(); static_assert(std::same_as<SCHED, SerialScheduler>, "Tagging is only supported in single-threaded operation"); + + auto view = m_buffer->get_buffer_view(); + + auto epoch = get_active_epoch(); + if (epoch->get_structure()->tagged_delete(rec)) { + end_job(epoch); return 1; } + end_job(epoch); + /* * the buffer will take the longest amount of time, and * probably has the lowest probability of having the record, @@ -470,6 +476,15 @@ private: do { old = m_previous_epoch.load(); + /* + * If running in single-threaded mode, the failure to retire + * an Epoch will result in the thread of execution blocking + * indefinitely.
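+ *
+ * (Editorial note, not part of this patch: the compare-and-swap loop
+ * below can only retire `epoch` once its reference count has dropped
+ * to zero. Under the serial scheduler no other thread can still hold
+ * a reference at this point, so a nonzero count would be a leaked
+ * reference; the assert below documents that invariant rather than
+ * spinning on it forever.)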
 */ + if constexpr (std::same_as<SCHED, SerialScheduler>) { + if (old.epoch == epoch) assert(old.refcnt == 0); + } + if (old.epoch == epoch && old.refcnt == 0 && m_previous_epoch.compare_exchange_strong(old, new_ptr)) { break; -- cgit v1.2.3 From 3ddafd3b9ac089252814af87cb7d9fe534cf59a4 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 9 Feb 2024 13:09:05 -0500 Subject: Removed centralized version structure --- include/framework/DynamicExtension.h | 38 +++--------------------------------- include/framework/scheduling/Epoch.h | 5 +++++ 2 files changed, 8 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/framework/DynamicExtension.h b/include/framework/DynamicExtension.h index 238fc7f..7ea5370 100644 --- a/include/framework/DynamicExtension.h +++ b/include/framework/DynamicExtension.h @@ -12,8 +12,6 @@ #include #include #include -#include -#include #include "framework/interface/Scheduler.h" #include "framework/scheduling/FIFOScheduler.h" @@ -26,12 +24,10 @@ #include "framework/util/Configuration.h" #include "framework/scheduling/Epoch.h" - - namespace de { template S, QueryInterface Q, LayoutPolicy L=LayoutPolicy::TEIRING, - DeletePolicy D=DeletePolicy::TAGGING, SchedulerInterface SCHED=SerialScheduler> + DeletePolicy D=DeletePolicy::TAGGING, SchedulerInterface SCHED=FIFOScheduler> class DynamicExtension { typedef S Shard; typedef MutableBuffer Buffer; @@ -62,8 +58,6 @@ public: m_current_epoch.store({new _Epoch(0, vers, m_buffer, 0), 0}); m_previous_epoch.store({nullptr, 0}); m_next_epoch.store({nullptr, 0}); - - m_versions.insert(vers); } ~DynamicExtension() { @@ -80,10 +74,6 @@ public: delete m_previous_epoch.load().epoch; delete m_buffer; - - for (auto e : m_versions) { - delete e; - } } /* @@ -320,8 +310,8 @@ private: Buffer *m_buffer; - std::mutex m_struct_lock; - std::set m_versions; + //std::mutex m_struct_lock; + //std::set m_versions; alignas(64) std::atomic m_reconstruction_scheduled; @@ -448,11 +438,6 @@ private: end_job(current_epoch); - std::unique_lock m_struct_lock; - m_versions.insert(m_next_epoch.load().epoch->get_structure()); - m_struct_lock.release(); - - return m_next_epoch.load().epoch; } @@ -494,23 +479,6 @@ private: } while(true); delete epoch; - - /* - * Following the epoch's destruction, any buffers - * or structures with no remaining references can - * be safely freed. - */ - std::unique_lock lock(m_struct_lock); - - for (auto itr = m_versions.begin(); itr != m_versions.end();) { - if ((*itr)->get_reference_count() == 0) { - auto tmp = *itr; - itr = m_versions.erase(itr); - delete tmp; - } else { - itr++; - } - } } static void reconstruction(void *arguments) { diff --git a/include/framework/scheduling/Epoch.h b/include/framework/scheduling/Epoch.h index 3ffa145..9377fb0 100644 --- a/include/framework/scheduling/Epoch.h +++ b/include/framework/scheduling/Epoch.h @@ -47,6 +47,11 @@ public: if (m_structure) { m_structure->release_reference(); } + + if (m_structure && m_structure->get_reference_count() == 0) { + delete m_structure; + } + } /* -- cgit v1.2.3
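Editorial addendum (not part of the patch series): the read path that these
commits converge on follows a single epoch-pinning pattern. The sketch below
mirrors calls shown in the diffs above, but the surrounding types and exact
signatures are assumed.

    /* Pin the active epoch, read through it, then unpin it. */
    size_t get_record_count() {
        auto epoch = get_active_epoch();  /* takes a reference, so the
                                           * epoch cannot retire while
                                           * the read is in progress */
        size_t t = epoch->get_buffer().get_record_count()
                 + epoch->get_structure()->get_record_count();
        end_job(epoch);                   /* releases the reference */
        return t;
    }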