From 7c03d771475421c1d5a2bbc135242536af1a371c Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 25 Sep 2023 10:49:36 -0400 Subject: Re-structuring Project + scheduling updates This is a big one--probably should have split it apart, but I'm feeling lazy this morning. * Organized the mess of header files in include/framework by splitting them out into their own subdirectories, and renaming a few files to remove redundancies introduced by the directory structure. * Introduced a new framework/ShardRequirements.h header file for simpler shard development. This header simply contains the necessary includes from framework/* for creating shard files. This should help to remove structural dependencies between the framework file structure and shards, as well as centralize the necessary framework files to make shard development easier. * Created a (currently dummy) SchedulerInterface, and made the scheduler implementation a template parameter of the dynamic extension for easier testing of various scheduling policies. There's still more work to be done to fully integrate the scheduler (queries, multiple buffers), but some more of the necessary framework code for this has been added as well. * Adjusted the Task interface setup for the scheduler. The task structures have been removed from ExtensionStructure and placed in their own header file. Additionally, I started experimenting with using std::variant, as opposed to inheritance, to implement subtype polymorphism on the Merge and Query tasks. The scheduler now has a general task queue that contains both, and std::variant, std::visit, and std::get are used to manipulate them without virtual functions. * Removed Alex.h, as it can't build anyway. There's a branch out there containing the Alex implementation stripped of the C++20 stuff. So there's no need to keep it here. --- include/framework/structure/ExtensionStructure.h | 428 +++++++++++++++++++++++ include/framework/structure/InternalLevel.h | 258 ++++++++++++++ include/framework/structure/MutableBuffer.h | 242 +++++++++++++ 3 files changed, 928 insertions(+) create mode 100644 include/framework/structure/ExtensionStructure.h create mode 100644 include/framework/structure/InternalLevel.h create mode 100644 include/framework/structure/MutableBuffer.h (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h new file mode 100644 index 0000000..920e1c3 --- /dev/null +++ b/include/framework/structure/ExtensionStructure.h @@ -0,0 +1,428 @@ +/* + * include/framework/ExtensionStructure.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. 
+ * + */ +#pragma once + +#include +#include +#include +#include + +#include "framework/structure/MutableBuffer.h" +#include "framework/structure/InternalLevel.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" +#include "framework/interface/Record.h" + +#include "framework/util/Configuration.h" +#include "framework/scheduling/Task.h" + +#include "psu-util/timer.h" +#include "psu-ds/Alias.h" + +namespace de { + +template +class ExtensionStructure { + typedef S Shard; + typedef MutableBuffer Buffer; + +public: + ExtensionStructure(size_t buffer_size, size_t scale_factor, double max_delete_prop) + : m_scale_factor(scale_factor) + , m_max_delete_prop(max_delete_prop) + , m_buffer_size(buffer_size) + {} + + ~ExtensionStructure() = default; + + /* + * Create a shallow copy of this extension structure. The copy will share references to the + * same levels/shards as the original, but will have its own lists. As all of the shards are + * immutable (with the exception of deletes), the copy can be restructured with merges, etc., + * without affecting the original. + * + * NOTE: When using tagged deletes, a delete of a record in the original structure will affect + * the copy, so long as the copy retains a reference to the same shard as the original. This could + * cause synchronization problems under tagging with concurrency. Any deletes in this context will + * need to be forwarded to the appropriate structures manually. + */ + ExtensionStructure *copy() { + auto new_struct = new ExtensionStructure(m_scale_factor, m_max_delete_prop, m_buffer_size); + for (size_t i=0; im_levels.push_back(m_levels[i]->clone()); + } + + return new_struct; + } + + /* + * Search for a record matching the argument and mark it deleted by + * setting the delete bit in its wrapped header. Returns 1 if a matching + * record was found and deleted, and 0 if a matching record was not found. + * + * This function will stop after finding the first matching record. It is assumed + * that no duplicate records exist. In the case of duplicates, this function will + * still "work", but in the sense of "delete first match". + */ + int tagged_delete(const R &rec) { + for (auto level : m_levels) { + if (level && level->delete_record(rec)) { + return 1; + } + } + + /* + * If the record to be erased wasn't found, return 0. The + * DynamicExtension itself will then search the active + * Buffers. + */ + return 0; + } + + /* + * Merge the memory table down into the tree, completing any required other + * merges to make room for it. + */ + inline bool merge_buffer(Buffer *buffer) { + assert(can_merge_with(0, buffer->get_record_count())); + + merge_buffer_into_l0(buffer); + buffer->truncate(); + + return true; + } + + /* + * Return the total number of records (including tombstones) within all + * of the levels of the structure. + */ + size_t get_record_count() { + size_t cnt = 0; + + for (size_t i=0; iget_record_count(); + } + + return cnt; + } + + /* + * Return the total number of tombstones contained within all of the + * levels of the structure. + */ + size_t get_tombstone_cnt() { + size_t cnt = 0; + + for (size_t i=0; iget_tombstone_count(); + } + + return cnt; + } + + /* + * Return the number of levels within the structure. Note that not + * all of these levels are necessarily populated. + */ + size_t get_height() { + return m_levels.size(); + } + + /* + * Return the amount of memory (in bytes) used by the shards within the + * structure for storing the primary data structure and raw data. 
+ */ + size_t get_memory_usage() { + size_t cnt = 0; + for (size_t i=0; iget_memory_usage(); + } + + return cnt; + } + + /* + * Return the amount of memory (in bytes) used by the shards within the + * structure for storing auxiliary data structures. This total does not + * include memory used for the main data structure, or raw data. + */ + size_t get_aux_memory_usage() { + size_t cnt = 0; + for (size_t i=0; iget_aux_memory_usage(); + } + } + + return cnt; + } + + /* + * Validate that no level in the structure exceeds its maximum tombstone capacity. This is + * used to trigger preemptive compactions at the end of the merge process. + */ + bool validate_tombstone_proportion() { + long double ts_prop; + for (size_t i=0; iget_tombstone_count() / (long double) calc_level_record_capacity(i); + if (ts_prop > (long double) m_max_delete_prop) { + return false; + } + } + } + + return true; + } + + bool validate_tombstone_proportion(level_index level) { + long double ts_prop = (long double) m_levels[level]->get_tombstone_count() / (long double) calc_level_record_capacity(level); + return ts_prop <= (long double) m_max_delete_prop; + } + + /* + * Return a reference to the underlying vector of levels within the + * structure. + */ + std::vector>> &get_levels() { + return m_levels; + } + + /* + * + */ + std::vector get_merge_tasks(size_t buffer_reccnt) { + std::vector merges; + + /* + * The buffer -> L0 merge task is not included so if that + * can be done without any other change, just return an + * empty list. + */ + if (can_merge_with(0, buffer_reccnt)) { + return std::move(merges); + } + + level_index merge_base_level = find_mergable_level(0); + if (merge_base_level == -1) { + merge_base_level = grow(); + } + + for (level_index i=merge_base_level; i>0; i--) { + MergeTask task; + task.m_source_level = i - 1; + task.m_target_level = i; + task.m_type = TaskType::MERGE; + + /* + * The amount of storage required for the merge accounts + * for the cost of storing the new records, along with the + * cost of retaining the old records during the process + * (hence the 2x multiplier). + * + * FIXME: currently does not account for the *actual* size + * of the shards, only the storage for the records + * themselves. + */ + size_t reccnt = m_levels[i-1]->get_record_count(); + if constexpr (L == LayoutPolicy::LEVELING) { + if (can_merge_with(i, reccnt)) { + reccnt += m_levels[i]->get_record_count(); + } + } + task.m_size = 2* reccnt * sizeof(R); + + merges.push_back(task); + } + + return std::move(merges); + } + + + /* + * + */ + std::vector get_merge_tasks_from_level(size_t source_level) { + std::vector merges; + + level_index merge_base_level = find_mergable_level(source_level); + if (merge_base_level == -1) { + merge_base_level = grow(); + } + + for (level_index i=merge_base_level; i>source_level; i--) { + MergeTask task; + task.m_source_level = i - 1; + task.m_target_level = i; + + /* + * The amount of storage required for the merge accounts + * for the cost of storing the new records, along with the + * cost of retaining the old records during the process + * (hence the 2x multiplier). + * + * FIXME: currently does not account for the *actual* size + * of the shards, only the storage for the records + * themselves. 
 */ + size_t reccnt = m_levels[i-1]->get_record_count(); + if constexpr (L == LayoutPolicy::LEVELING) { + if (can_merge_with(i, reccnt)) { + reccnt += m_levels[i]->get_record_count(); + } + } + task.m_size = 2* reccnt * sizeof(R); + + merges.push_back(task); + } + + return std::move(merges); + } + + /* + * Merge the level specified by incoming level into the level specified + * by base level. The two levels should be sequential--i.e. no levels + * are skipped in the merge process--otherwise the tombstone ordering + * invariant may be violated by the merge operation. + */ + inline void merge_levels(level_index base_level, level_index incoming_level) { + // merging two memory levels + if constexpr (L == LayoutPolicy::LEVELING) { + auto tmp = m_levels[base_level]; + m_levels[base_level] = InternalLevel::merge_levels(m_levels[base_level].get(), m_levels[incoming_level].get()); + } else { + m_levels[base_level]->append_merged_shards(m_levels[incoming_level].get()); + m_levels[base_level]->finalize(); + } + + m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor)); + } + + +private: + size_t m_scale_factor; + double m_max_delete_prop; + size_t m_buffer_size; + + std::vector>> m_levels; + + /* + * Add a new level to the LSM Tree and return that level's index. Will + * automatically determine whether the level should be on memory or on disk, + * and act appropriately. + */ + inline level_index grow() { + level_index new_idx = m_levels.size(); + size_t new_shard_cnt = (L == LayoutPolicy::LEVELING) ? 1 : m_scale_factor; + + m_levels.emplace_back(std::shared_ptr>(new InternalLevel(new_idx, new_shard_cnt))); + return new_idx; + } + + /* + * Find the first level below the level indicated by idx that + * is capable of sustaining a merge operation and return its + * level index. If no such level exists, returns -1. Also + * returns -1 if idx==0 and no such level exists, to simplify + * the logic of the first merge. + */ + inline level_index find_mergable_level(level_index idx, Buffer *buffer=nullptr) { + + if (idx == 0 && m_levels.size() == 0) return -1; + + bool level_found = false; + bool disk_level; + level_index merge_level_idx; + + size_t incoming_rec_cnt = get_level_record_count(idx, buffer); + for (level_index i=idx+1; i(0, 1); + temp_level->append_buffer(buffer); + auto new_level = InternalLevel::merge_levels(old_level, temp_level); + + m_levels[0] = new_level; + delete temp_level; + } else { + m_levels[0]->append_buffer(buffer); + } + } + + /* + * Mark a given memory level as no longer in use by the tree. For now this + * will just free the level. In the future, this will be more complex as the + * level may not be able to immediately be deleted, depending upon who + * else is using it. + */ + inline void mark_as_unused(std::shared_ptr> level) { + level.reset(); + } + + /* + * Assume that level "0" should be larger than the buffer. The buffer + * itself is index -1, which should return simply the buffer capacity. + */ + inline size_t calc_level_record_capacity(level_index idx) { + return m_buffer_size * pow(m_scale_factor, idx+1); + } + + /* + * Returns the actual number of records present on a specified level. An + * index value of -1 indicates the memory table. Can optionally pass in + * a pointer to the memory table to use, if desired. Otherwise, there are + * no guarantees about which buffer will be accessed if level_index is -1. 
 */ + inline size_t get_level_record_count(level_index idx, Buffer *buffer=nullptr) { + if (buffer) { + return buffer->get_record_count(); + } + + return (m_levels[idx]) ? m_levels[idx]->get_record_count() : 0; + } + + /* + * Determines if the specified level can merge with an incoming shard + * containing incoming_rec_cnt records. The provided level index should be + * non-negative (i.e., not refer to the buffer) and will be automatically + * translated into the appropriate index into either the disk or memory level + * vector. + */ + inline bool can_merge_with(level_index idx, size_t incoming_rec_cnt) { + if (idx>= m_levels.size() || !m_levels[idx]) { + return false; + } + + if (L == LayoutPolicy::LEVELING) { + return m_levels[idx]->get_record_count() + incoming_rec_cnt <= calc_level_record_capacity(idx); + } else { + return m_levels[idx]->get_shard_count() < m_scale_factor; + } + + /* unreachable */ + assert(true); + } +}; + +} + diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h new file mode 100644 index 0000000..b9230f4 --- /dev/null +++ b/include/framework/structure/InternalLevel.h @@ -0,0 +1,258 @@ +/* + * include/framework/InternalLevel.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include + +#include "util/types.h" +#include "framework/interface/Shard.h" +#include "framework/interface/Query.h" +#include "framework/interface/Record.h" +#include "framework/structure/MutableBuffer.h" + +namespace de { +template +class InternalLevel; + + + +template +class InternalLevel { + typedef S Shard; + typedef MutableBuffer Buffer; +public: + InternalLevel(ssize_t level_no, size_t shard_cap) + : m_level_no(level_no) + , m_shard_cnt(0) + , m_shards(shard_cap, nullptr) + , m_owns(shard_cap, true) + , m_pending_shard(nullptr) + {} + + // Create a new memory level sharing the shards and repurposing it as previous level_no + 1 + // WARNING: for leveling only. 
+ InternalLevel(InternalLevel* level) + : m_level_no(level->m_level_no + 1) + , m_shard_cnt(level->m_shard_cnt) + , m_shards(level->m_shards.size(), nullptr) + , m_owns(level->m_owns.size(), true) + , m_pending_shard(nullptr) + { + assert(m_shard_cnt == 1 && m_shards.size() == 1); + + for (size_t i=0; im_owns[i] = false; + m_shards[i] = level->m_shards[i]; + } + } + + ~InternalLevel() { + for (size_t i=0; i merge_levels(InternalLevel* base_level, InternalLevel* new_level) { + assert(base_level->m_level_no > new_level->m_level_no || (base_level->m_level_no == 0 && new_level->m_level_no == 0)); + auto res = new InternalLevel(base_level->m_level_no, 1); + res->m_shard_cnt = 1; + Shard* shards[2]; + shards[0] = base_level->m_shards[0]; + shards[1] = new_level->m_shards[0]; + + res->m_shards[0] = new S(shards, 2); + return std::shared_ptr(res); + } + + void append_buffer(Buffer* buffer) { + if (m_shard_cnt == m_shards.size()) { + assert(m_pending_shard == nullptr); + m_pending_shard = new S(buffer); + return; + } + + m_shards[m_shard_cnt] = new S(buffer); + m_owns[m_shard_cnt] = true; + ++m_shard_cnt; + } + + void append_merged_shards(InternalLevel* level) { + if (m_shard_cnt == m_shards.size()) { + m_pending_shard = new S(level->m_shards.data(), level->m_shard_cnt); + return; + } + + m_shards[m_shard_cnt] = new S(level->m_shards.data(), level->m_shard_cnt); + m_owns[m_shard_cnt] = true; + + ++m_shard_cnt; + } + + + void finalize() { + if (m_pending_shard) { + for (size_t i=0; i> &shards, std::vector& shard_states, void *query_parms) { + for (size_t i=0; i= (ssize_t) shard_stop; i--) { + if (m_shards[i]) { + auto res = m_shards[i]->point_lookup(rec, true); + if (res && res->is_tombstone()) { + return true; + } + } + } + return false; + } + + bool delete_record(const R &rec) { + if (m_shard_cnt == 0) return false; + + for (size_t i = 0; i < m_shards.size(); ++i) { + if (m_shards[i]) { + auto res = m_shards[i]->point_lookup(rec); + if (res) { + res->set_delete(); + return true; + } + } + } + + return false; + } + + Shard* get_shard(size_t idx) { + return m_shards[idx]; + } + + size_t get_shard_count() { + return m_shard_cnt; + } + + size_t get_record_count() { + size_t cnt = 0; + for (size_t i=0; iget_record_count(); + } + + return cnt; + } + + size_t get_tombstone_count() { + size_t res = 0; + for (size_t i = 0; i < m_shard_cnt; ++i) { + res += m_shards[i]->get_tombstone_count(); + } + return res; + } + + size_t get_aux_memory_usage() { + size_t cnt = 0; + for (size_t i=0; iget_aux_memory_usage(); + } + + return cnt; + } + + size_t get_memory_usage() { + size_t cnt = 0; + for (size_t i=0; iget_memory_usage(); + } + } + + return cnt; + } + + double get_tombstone_prop() { + size_t tscnt = 0; + size_t reccnt = 0; + for (size_t i=0; iget_tombstone_count(); + reccnt += (*m_shards[i])->get_record_count(); + } + } + + return (double) tscnt / (double) (tscnt + reccnt); + } + +private: + ssize_t m_level_no; + + size_t m_shard_cnt; + size_t m_shard_size_cap; + + std::vector m_shards; + + Shard *m_pending_shard; + + std::vector m_owns; + + std::shared_ptr clone() { + auto new_level = std::make_shared(m_level_no, m_shards.size()); + for (size_t i=0; im_shards[i] = m_shards[i]; + new_level->m_owns[i] = true; + m_owns[i] = false; + } + + return new_level; + } +}; + +} diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h new file mode 100644 index 0000000..9f12175 --- /dev/null +++ b/include/framework/structure/MutableBuffer.h @@ -0,0 +1,242 @@ +/* + * 
include/framework/MutableBuffer.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * Dong Xie + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "psu-util/alignment.h" +#include "util/bf_config.h" +#include "psu-ds/BloomFilter.h" +#include "psu-ds/Alias.h" +#include "psu-util/timer.h" +#include "framework/interface/Record.h" + +using psudb::CACHELINE_SIZE; + +namespace de { + +template +class MutableBuffer { +public: + MutableBuffer(size_t capacity, size_t max_tombstone_cap) + : m_cap(capacity), m_tombstone_cap(max_tombstone_cap), m_reccnt(0) + , m_tombstonecnt(0), m_weight(0), m_max_weight(0) { + m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); + m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); + m_tombstone_filter = nullptr; + if (max_tombstone_cap > 0) { + m_tombstone_filter = new psudb::BloomFilter(BF_FPR, max_tombstone_cap, BF_HASH_FUNCS); + } + + m_refcnt.store(0); + m_deferred_truncate.store(false); + m_merging.store(false); + } + + ~MutableBuffer() { + assert(m_refcnt.load() == 0); + assert(m_merging.load() == false); + + if (m_data) free(m_data); + if (m_tombstone_filter) delete m_tombstone_filter; + if (m_merge_data) free(m_merge_data); + } + + template + int append(const R &rec, bool tombstone=false) { + if (tombstone && m_tombstonecnt + 1 > m_tombstone_cap) return 0; + + int32_t pos = 0; + if ((pos = try_advance_tail()) == -1) return 0; + + Wrapped wrec; + wrec.rec = rec; + wrec.header = 0; + if (tombstone) wrec.set_tombstone(); + + m_data[pos] = wrec; + m_data[pos].header |= (pos << 2); + + if (tombstone) { + m_tombstonecnt.fetch_add(1); + if (m_tombstone_filter) m_tombstone_filter->insert(rec); + } + + if constexpr (WeightedRecordInterface) { + m_weight.fetch_add(rec.weight); + double old = m_max_weight.load(); + while (old < rec.weight) { + m_max_weight.compare_exchange_strong(old, rec.weight); + old = m_max_weight.load(); + } + } else { + m_weight.fetch_add(1); + } + + return 1; + } + + bool truncate() { + m_tombstonecnt.store(0); + m_reccnt.store(0); + m_weight.store(0); + m_max_weight.store(0); + if (m_tombstone_filter) m_tombstone_filter->clear(); + + return true; + } + + size_t get_record_count() { + return m_reccnt; + } + + size_t get_capacity() { + return m_cap; + } + + bool is_full() { + return m_reccnt == m_cap; + } + + size_t get_tombstone_count() { + return m_tombstonecnt.load(); + } + + bool delete_record(const R& rec) { + auto offset = 0; + while (offset < m_reccnt.load()) { + if (m_data[offset].rec == rec) { + m_data[offset].set_delete(); + return true; + } + offset++; + } + + return false; + } + + bool check_tombstone(const R& rec) { + if (m_tombstone_filter && !m_tombstone_filter->lookup(rec)) return false; + + auto offset = 0; + while (offset < m_reccnt.load()) { + if (m_data[offset].rec == rec && m_data[offset].is_tombstone()) { + return true; + } + offset++; + } + return false; + } + + size_t get_memory_usage() { + return m_cap * sizeof(R); + } + + size_t get_aux_memory_usage() { + return m_tombstone_filter->get_memory_usage(); + } + + size_t get_tombstone_capacity() { + return m_tombstone_cap; + } + + double get_total_weight() { + return m_weight.load(); + } + + Wrapped *get_data() { + return m_data; + } + + double get_max_weight() { + return m_max_weight; + } + + bool start_merge() { + if (m_merge_lock.try_lock()) { + /* there cannot already be an 
active merge */ + if (m_merging.load()) { + m_merge_lock.unlock(); + return false; + } + + m_merging.store(true); + memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load()); + return true; + } + + /* lock could not be obtained */ + return false; + } + + bool finish_merge() { + m_merge_lock.unlock(); + return true; + } + + /* + * Concurrency-related operations + */ + bool take_reference() { + m_refcnt.fetch_add(1); + return true; + } + + bool release_reference() { + m_refcnt.fetch_add(-1); + + if (m_refcnt.load() == 0 && m_deferred_truncate.load()) { + assert(this->truncate()); + } + + return true; + } + + bool active_merge() { + return m_merging.load(); + } + +private: + int32_t try_advance_tail() { + size_t new_tail = m_reccnt.fetch_add(1); + + if (new_tail < m_cap) return new_tail; + else return -1; + } + + size_t m_cap; + size_t m_tombstone_cap; + + Wrapped* m_data; + Wrapped* m_merge_data; + + psudb::BloomFilter* m_tombstone_filter; + + alignas(64) std::atomic m_tombstonecnt; + alignas(64) std::atomic m_reccnt; + alignas(64) std::atomic m_weight; + alignas(64) std::atomic m_max_weight; + alignas(64) std::atomic m_merging; + alignas(64) std::atomic m_deferred_truncate; + alignas(64) std::atomic m_refcnt; + + alignas(64) std::mutex m_merge_lock; + alignas(64) std::mutex m_trunc_lock; + alignas(64) std::condition_variable m_trunc_signal; + +}; + +} -- cgit v1.2.3 From 7ecfb22c32b7986ed1a2439c1abbeed298e4153a Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 20 Oct 2023 17:00:42 -0400 Subject: Initial pass w/ new scheduler setup currently there's a race condition of some type to sort out. --- include/framework/structure/ExtensionStructure.h | 21 +++++++++------------ include/framework/structure/MutableBuffer.h | 11 +++++++++++ 2 files changed, 20 insertions(+), 12 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 920e1c3..8344518 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -93,7 +93,10 @@ public: inline bool merge_buffer(Buffer *buffer) { assert(can_merge_with(0, buffer->get_record_count())); + buffer->start_merge(); merge_buffer_into_l0(buffer); + buffer->finish_merge(); + buffer->truncate(); return true; @@ -216,10 +219,7 @@ public: } for (level_index i=merge_base_level; i>0; i--) { - MergeTask task; - task.m_source_level = i - 1; - task.m_target_level = i; - task.m_type = TaskType::MERGE; + MergeTask task = {i-1, i}; /* * The amount of storage required for the merge accounts @@ -237,7 +237,7 @@ public: reccnt += m_levels[i]->get_record_count(); } } - task.m_size = 2* reccnt * sizeof(R); + //task.m_size = 2* reccnt * sizeof(R); merges.push_back(task); } @@ -249,7 +249,7 @@ public: /* * */ - std::vector get_merge_tasks_from_level(size_t source_level) { + std::vector get_merge_tasks_from_level(level_index source_level) { std::vector merges; level_index merge_base_level = find_mergable_level(source_level); @@ -258,10 +258,7 @@ public: } for (level_index i=merge_base_level; i>source_level; i--) { - MergeTask task; - task.m_source_level = i - 1; - task.m_target_level = i; - + MergeTask task = {i - 1, i}; /* * The amount of storage required for the merge accounts * for the cost of storing the new records, along with the @@ -278,12 +275,12 @@ public: reccnt += m_levels[i]->get_record_count(); } } - task.m_size = 2* reccnt * sizeof(R); +// task.m_size = 2* reccnt * sizeof(R); 
merges.push_back(task); } - return std::move(merges); + return merges; } /* diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 9f12175..804ca5e 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -90,12 +90,23 @@ public: } bool truncate() { + + while (active_merge() || m_refcnt.load() > 0) + ; + + m_merge_lock.lock(); + + while (m_refcnt > 0) + ; + m_tombstonecnt.store(0); m_reccnt.store(0); m_weight.store(0); m_max_weight.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); + m_merge_lock.unlock(); + return true; } -- cgit v1.2.3 From b72103cb11347f0dd108bd2321f29b0d6ab05106 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 23 Oct 2023 13:18:30 -0400 Subject: Bugfixes --- include/framework/structure/MutableBuffer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/framework/structure') diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 804ca5e..4e0b5c2 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -195,6 +195,7 @@ public: bool finish_merge() { m_merge_lock.unlock(); + m_merging.store(false); return true; } -- cgit v1.2.3 From 3afacb7702e6d8fa67749a2a41dc776d315e02a9 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 23 Oct 2023 17:43:22 -0400 Subject: Began moving to an explicit epoch-based system I started moving over to an explicit Epoch based system, which has necessitated a ton of changes throughout the code base. This will ultimately allow for a much cleaner set of abstractions for managing concurrency. --- include/framework/structure/BufferView.h | 124 +++++++++++++++++++++++ include/framework/structure/ExtensionStructure.h | 26 +++++ include/framework/structure/MutableBuffer.h | 4 + 3 files changed, 154 insertions(+) create mode 100644 include/framework/structure/BufferView.h (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h new file mode 100644 index 0000000..1efc1ac --- /dev/null +++ b/include/framework/structure/BufferView.h @@ -0,0 +1,124 @@ +/* + * include/framework/structure/BufferView.h + * + * Copyright (C) 2023 Douglas Rumbaugh + * + * All rights reserved. Published under the Modified BSD License. 
+ * + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "psu-util/alignment.h" +#include "util/bf_config.h" +#include "psu-ds/BloomFilter.h" +#include "psu-ds/Alias.h" +#include "psu-util/timer.h" +#include "framework/interface/Record.h" +#include "framework/structure/MutableBuffer.h" +#include "framework/interface/Query.h" + +namespace de { + +template +class BufferView { + typedef MutableBuffer Buffer; +public: + BufferView() = default; + + BufferView(std::vector buffers) + : m_buffers(buffers) + , m_cutoff(buffers[buffers->size()-1]->get_record_count()) + {} + + ~BufferView() = default; + + bool delete_record(const R& rec) { + auto res = false; + for (auto buf : m_buffers) { + res = buf->delete_record(rec); + if (res) return true; + } + return false; + } + + bool check_tombstone(const R& rec) { + auto res = false; + for (auto buf : m_buffers) { + res = buf->check_tombstone(rec); + if (res) return true; + } + return false; + } + + size_t get_record_count() { + size_t reccnt = 0; + for (auto buf : m_buffers) { + reccnt += buf->get_record_count(); + } + return reccnt; + } + + size_t get_capacity() { + return m_buffers[0]->get_capacity(); + } + + bool is_full() { + return m_buffers[m_buffers.size() - 1]->is_full(); + } + + size_t get_tombstone_count() { + size_t tscnt = 0; + for (auto buf : m_buffers) { + tscnt += buf->get_tombstone_count(); + } + return tscnt; + } + + size_t get_memory_usage() { + size_t mem = 0; + for (auto buf : m_buffers) { + mem += buf->get_memory_usage(); + } + return mem; + } + + size_t get_aux_memory_usage() { + size_t mem = 0; + for (auto buf : m_buffers) { + mem += buf->get_aux_memory_usage(); + } + return mem; + } + + size_t get_tombstone_capacity() { + return m_buffers[0]->get_tombstone_capacity(); + } + + std::vector get_buffer_states(void *parms) { + std::vector states; + + for (auto buf : m_buffers) { + states.push_back(Q::get_buffer_query_state(buf, parms)); + } + + return states; + } + + std::vector &get_buffers() { + return m_buffers; + } + +private: + std::vector m_buffers; + size_t m_cutoff; +}; + +} diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 8344518..de965ae 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -302,12 +302,38 @@ public: m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 
1 : m_scale_factor)); } + bool take_reference() { + m_refcnt.fetch_add(1); + return true; + } + + bool release_reference() { + assert(m_refcnt.load() > 0); + m_refcnt.fetch_add(-1); + return true; + } + + size_t get_reference_count() { + return m_refcnt.load(); + } + + std::vector get_query_states(std::vector> &shards, void *parms) { + std::vector states; + + for (auto &level : m_levels) { + level->get_query_states(shards, states, parms); + } + + return states; + } private: size_t m_scale_factor; double m_max_delete_prop; size_t m_buffer_size; + std::atomic m_refcnt; + std::vector>> m_levels; /* diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 4e0b5c2..974dc28 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -217,6 +217,10 @@ public: return true; } + size_t get_reference_count() { + return m_refcnt.load(); + } + bool active_merge() { return m_merging.load(); } -- cgit v1.2.3 From 39ae3e0441d8297a09197aba98bd494b5ada12c1 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 30 Oct 2023 14:17:59 -0400 Subject: Concurrency updates + fixes for compile errors --- include/framework/structure/BufferView.h | 4 ++-- include/framework/structure/InternalLevel.h | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 1efc1ac..14abedc 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -35,7 +35,7 @@ public: BufferView(std::vector buffers) : m_buffers(buffers) - , m_cutoff(buffers[buffers->size()-1]->get_record_count()) + , m_cutoff(buffers[buffers.size()-1]->get_record_count()) {} ~BufferView() = default; @@ -102,7 +102,7 @@ public: return m_buffers[0]->get_tombstone_capacity(); } - std::vector get_buffer_states(void *parms) { + std::vector get_query_states(void *parms) { std::vector states; for (auto buf : m_buffers) { diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index b9230f4..342a2c7 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -231,6 +231,17 @@ public: return (double) tscnt / (double) (tscnt + reccnt); } + std::shared_ptr clone() { + auto new_level = std::make_shared(m_level_no, m_shards.size()); + for (size_t i=0; im_shards[i] = m_shards[i]; + new_level->m_owns[i] = true; + m_owns[i] = false; + } + + return new_level; + } + private: ssize_t m_level_no; @@ -243,16 +254,6 @@ private: std::vector m_owns; - std::shared_ptr clone() { - auto new_level = std::make_shared(m_level_no, m_shards.size()); - for (size_t i=0; im_shards[i] = m_shards[i]; - new_level->m_owns[i] = true; - m_owns[i] = false; - } - - return new_level; - } }; } -- cgit v1.2.3 From ceffd8caf5e4e827e2cc4d6975507a66d88f77a9 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 30 Oct 2023 14:25:28 -0400 Subject: DynamicExtension: adjusted a few operations to ensure consistency get_memory_usage, get_aux_memory_usage, get_record_count, get_tombstone_count, and create_static_structure have been adjusted to ensure that they pull from a consistent epoch, even if a change-over occurs midway through the function. These functions also now register with the epoch as a job, to ensure that the epoch they are operating on isn't retired midway through the function. 
Probably not a big issue for the accessors, but I could see it being very important for create_static_structure. --- include/framework/structure/BufferView.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 14abedc..8dff2ef 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -116,6 +116,10 @@ public: return m_buffers; } + size_t size() { + return m_buffers.size(); + } + private: std::vector m_buffers; size_t m_cutoff; -- cgit v1.2.3 From d2279e1b96d352a0af1d425dcaaf93e8a26a8d52 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 30 Oct 2023 17:15:05 -0400 Subject: General Comment + Consistency updates --- include/framework/structure/BufferView.h | 2 +- include/framework/structure/ExtensionStructure.h | 4 ++-- include/framework/structure/InternalLevel.h | 4 ++-- include/framework/structure/MutableBuffer.h | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 8dff2ef..ccd3dac 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -1,7 +1,7 @@ /* * include/framework/structure/BufferView.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * * All rights reserved. Published under the Modified BSD License. * diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index de965ae..1f365ae 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -1,7 +1,7 @@ /* - * include/framework/ExtensionStructure.h + * include/framework/structure/ExtensionStructure.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 342a2c7..7a7b98c 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -1,7 +1,7 @@ /* - * include/framework/InternalLevel.h + * include/framework/structure/InternalLevel.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 974dc28..e0a6962 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -1,7 +1,7 @@ /* - * include/framework/MutableBuffer.h + * include/framework/structure/MutableBuffer.h * - * Copyright (C) 2023 Douglas Rumbaugh + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * All rights reserved. Published under the Modified BSD License. -- cgit v1.2.3 From 230831243a61f1ca1b1dd4319a4c5224b15d2657 Mon Sep 17 00:00:00 2001 From: "Douglas B. 
Rumbaugh" Date: Tue, 31 Oct 2023 12:05:58 -0400 Subject: ExtensionStructure: fixed incorrect constructor args in clone() --- include/framework/structure/ExtensionStructure.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 1f365ae..2ced439 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -54,7 +54,7 @@ public: * need to be forwarded to the appropriate structures manually. */ ExtensionStructure *copy() { - auto new_struct = new ExtensionStructure(m_scale_factor, m_max_delete_prop, m_buffer_size); + auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, m_max_delete_prop); for (size_t i=0; im_levels.push_back(m_levels[i]->clone()); } @@ -432,7 +432,7 @@ private: * vector. */ inline bool can_merge_with(level_index idx, size_t incoming_rec_cnt) { - if (idx>= m_levels.size() || !m_levels[idx]) { + if (idx >= m_levels.size() || !m_levels[idx]) { return false; } -- cgit v1.2.3 From ca729108869b4143f1eea31f6dde9195decfec9c Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 31 Oct 2023 12:14:57 -0400 Subject: MutableBuffer: removed most concurrency control stuff The buffer isn't responsible for a lot of CC anymore (just the append operation), so this code was no longer necessary. Also removed the only calls to some of these CC operations within the rest of the framework. --- include/framework/structure/ExtensionStructure.h | 6 +-- include/framework/structure/MutableBuffer.h | 59 ++++-------------------- 2 files changed, 13 insertions(+), 52 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 2ced439..f5657af 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -93,11 +93,11 @@ public: inline bool merge_buffer(Buffer *buffer) { assert(can_merge_with(0, buffer->get_record_count())); + // FIXME: this step makes an extra copy of the buffer, + // which could be avoided by adjusting the shard + // reconstruction process a bit, possibly. buffer->start_merge(); merge_buffer_into_l0(buffer); - buffer->finish_merge(); - - buffer->truncate(); return true; } diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index e0a6962..a70b86b 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -42,13 +42,10 @@ public: } m_refcnt.store(0); - m_deferred_truncate.store(false); - m_merging.store(false); } ~MutableBuffer() { assert(m_refcnt.load() == 0); - assert(m_merging.load() == false); if (m_data) free(m_data); if (m_tombstone_filter) delete m_tombstone_filter; @@ -90,23 +87,12 @@ public: } bool truncate() { - - while (active_merge() || m_refcnt.load() > 0) - ; - - m_merge_lock.lock(); - - while (m_refcnt > 0) - ; - m_tombstonecnt.store(0); m_reccnt.store(0); m_weight.store(0); m_max_weight.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); - m_merge_lock.unlock(); - return true; } @@ -176,26 +162,15 @@ public: return m_max_weight; } + /* + * This operation assumes that no other threads have write access + * to the buffer. 
This will be the case in normal operation, at + * present, but may change (in which case this approach will need + * to be adjusted). Other threads having read access is perfectly + * acceptable, however. + */ bool start_merge() { - if (m_merge_lock.try_lock()) { - /* there cannot already be an active merge */ - if (m_merging.load()) { - m_merge_lock.unlock(); - return false; - } - - m_merging.store(true); - memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load()); - return true; - } - - /* lock could not be obtained */ - return false; - } - - bool finish_merge() { - m_merge_lock.unlock(); - m_merging.store(false); + memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load()); return true; } @@ -208,12 +183,8 @@ public: } bool release_reference() { + assert(m_refcnt > 0); m_refcnt.fetch_add(-1); - - if (m_refcnt.load() == 0 && m_deferred_truncate.load()) { - assert(this->truncate()); - } - return true; } @@ -221,10 +192,6 @@ public: return m_refcnt.load(); } - bool active_merge() { - return m_merging.load(); - } - private: int32_t try_advance_tail() { size_t new_tail = m_reccnt.fetch_add(1); @@ -245,14 +212,8 @@ private: alignas(64) std::atomic m_tombstonecnt; alignas(64) std::atomic m_reccnt; alignas(64) std::atomic m_weight; alignas(64) std::atomic m_max_weight; - alignas(64) std::atomic m_merging; - alignas(64) std::atomic m_deferred_truncate; - alignas(64) std::atomic m_refcnt; - - alignas(64) std::mutex m_merge_lock; - alignas(64) std::mutex m_trunc_lock; - alignas(64) std::condition_variable m_trunc_signal; + alignas(64) std::atomic m_refcnt; }; } -- cgit v1.2.3 From 68ae6279476e7d37837ac06474fb558e50ce6706 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 31 Oct 2023 12:41:55 -0400 Subject: Fixes for various bugs under SerialScheduler --- include/framework/structure/ExtensionStructure.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index f5657af..80ec7b9 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -46,7 +46,8 @@ public: * Create a shallow copy of this extension structure. The copy will share references to the * same levels/shards as the original, but will have its own lists. As all of the shards are * immutable (with the exception of deletes), the copy can be restructured with merges, etc., - * without affecting the original. + * without affecting the original. The copied structure will be returned with a reference + * count of 0; generally you will want to immediately call take_reference() on it. * * NOTE: When using tagged deletes, a delete of a record in the original structure will affect * the copy, so long as the copy retains a reference to the same shard as the original. This could @@ -59,6 +60,8 @@ public: new_struct->m_levels.push_back(m_levels[i]->clone()); } + new_struct->m_refcnt = 0; + return new_struct; } -- cgit v1.2.3 From 7249af78a3f39bd2852c3f81fe92dc5b647161fb Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 11:33:17 -0500 Subject: MutableBuffer: added explicit tail variable Use an explicit m_tail variable for insertion, rather than using m_reccnt. This ensures that the record count doesn't increase when no new record is actually inserted, and allows for the m_tail variable to be decremented on failure without causing the record count to momentarily change. 
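The idea, as a rough standalone sketch (simplified types and names; not the actual framework code): the tail counter claims a slot and is rolled back if the buffer is full, while the record count only advances once a record has actually been written. A failed append therefore never perturbs the visible record count; any transient overshoot is absorbed entirely by the tail variable. Compare with the actual diff below.

    #include <atomic>
    #include <cstdint>

    struct AppendBuffer {
        static const int64_t CAP = 1024;
        int data[CAP];
        std::atomic<int64_t> tail{0};    // claims slots; may transiently overshoot CAP
        std::atomic<int64_t> reccnt{0};  // counts only fully written records

        bool append(int rec) {
            int64_t pos = tail.fetch_add(1);   // claim a slot
            if (pos >= CAP) {
                tail.fetch_add(-1);            // buffer full: roll back the claim; reccnt never moved
                return false;
            }
            data[pos] = rec;                   // write into the claimed slot
            reccnt.fetch_add(1);               // publish the record only after the write completes
            return true;
        }
    };
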
--- include/framework/structure/MutableBuffer.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index a70b86b..ba25cc3 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -33,7 +33,7 @@ class MutableBuffer { public: MutableBuffer(size_t capacity, size_t max_tombstone_cap) : m_cap(capacity), m_tombstone_cap(max_tombstone_cap), m_reccnt(0) - , m_tombstonecnt(0), m_weight(0), m_max_weight(0) { + , m_tombstonecnt(0), m_weight(0), m_max_weight(0), m_tail(0) { m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); m_tombstone_filter = nullptr; @@ -83,6 +83,7 @@ public: m_weight.fetch_add(1); } + m_reccnt.fetch_add(1); return 1; } @@ -91,6 +92,7 @@ public: m_reccnt.store(0); m_weight.store(0); m_max_weight.store(0); + m_tail.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); return true; @@ -193,11 +195,15 @@ public: } private: - int32_t try_advance_tail() { - size_t new_tail = m_reccnt.fetch_add(1); + int64_t try_advance_tail() { + int64_t new_tail = m_tail.fetch_add(1); - if (new_tail < m_cap) return new_tail; - else return -1; + if (new_tail < m_cap) { + return new_tail; + } + + m_tail.fetch_add(-1); + return -1; } size_t m_cap; @@ -210,6 +216,7 @@ private: alignas(64) std::atomic m_tombstonecnt; alignas(64) std::atomic m_reccnt; + alignas(64) std::atomic m_tail; alignas(64) std::atomic m_weight; alignas(64) std::atomic m_max_weight; -- cgit v1.2.3 From 56cc8f63a218bc13e0c8395b479267862de19714 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 6 Nov 2023 14:01:39 -0500 Subject: InternalLevel: switched to std::shared_ptr for shard memory management --- include/framework/structure/InternalLevel.h | 81 +++++++++++------------- 1 file changed, 29 insertions(+), 52 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 7a7b98c..632fe17 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -33,32 +33,10 @@ public: : m_level_no(level_no) , m_shard_cnt(0) , m_shards(shard_cap, nullptr) - , m_owns(shard_cap, true) , m_pending_shard(nullptr) {} - // Create a new memory level sharing the shards and repurposing it as previous level_no + 1 - // WARNING: for leveling only. 
- InternalLevel(InternalLevel* level) - : m_level_no(level->m_level_no + 1) - , m_shard_cnt(level->m_shard_cnt) - , m_shards(level->m_shards.size(), nullptr) - , m_owns(level->m_owns.size(), true) - , m_pending_shard(nullptr) - { - assert(m_shard_cnt == 1 && m_shards.size() == 1); - - for (size_t i=0; im_owns[i] = false; - m_shards[i] = level->m_shards[i]; - } - } - ~InternalLevel() { - for (size_t i=0; im_level_no, 1); res->m_shard_cnt = 1; Shard* shards[2]; - shards[0] = base_level->m_shards[0]; - shards[1] = new_level->m_shards[0]; + shards[0] = base_level->m_shards[0].get(); + shards[1] = new_level->m_shards[0].get(); - res->m_shards[0] = new S(shards, 2); + res->m_shards[0] = std::make_shared(shards, 2); return std::shared_ptr(res); } @@ -83,19 +61,23 @@ public: return; } - m_shards[m_shard_cnt] = new S(buffer); - m_owns[m_shard_cnt] = true; + m_shards[m_shard_cnt] = std::make_shared(buffer); ++m_shard_cnt; } void append_merged_shards(InternalLevel* level) { + Shard *shards[level->m_shard_cnt]; + for (size_t i=0; im_shard_cnt; i++) { + shards[i] = level->m_shards[i].get(); + } + if (m_shard_cnt == m_shards.size()) { - m_pending_shard = new S(level->m_shards.data(), level->m_shard_cnt); + m_pending_shard = new S(shards, level->m_shard_cnt); return; } - m_shards[m_shard_cnt] = new S(level->m_shards.data(), level->m_shard_cnt); - m_owns[m_shard_cnt] = true; + auto tmp = new S(shards, level->m_shard_cnt); + m_shards[m_shard_cnt] = std::shared_ptr(tmp); ++m_shard_cnt; } @@ -104,15 +86,10 @@ public: void finalize() { if (m_pending_shard) { for (size_t i=0; i(m_pending_shard); m_pending_shard = nullptr; m_shard_cnt = 1; } @@ -126,7 +103,7 @@ public: Shard *shards[m_shard_cnt]; for (size_t i=0; i> &shards, std::vector& shard_states, void *query_parms) { for (size_t i=0; iget_record_count(); + if (m_shards[i]) { + cnt += m_shards[i]->get_record_count(); + } } return cnt; @@ -193,7 +172,9 @@ public: size_t get_tombstone_count() { size_t res = 0; for (size_t i = 0; i < m_shard_cnt; ++i) { - res += m_shards[i]->get_tombstone_count(); + if (m_shards[i]) { + res += m_shards[i]->get_tombstone_count(); + } } return res; } @@ -201,7 +182,9 @@ public: size_t get_aux_memory_usage() { size_t cnt = 0; for (size_t i=0; iget_aux_memory_usage(); + if (m_shards[i]){ + cnt += m_shards[i]->get_aux_memory_usage(); + } } return cnt; @@ -224,7 +207,7 @@ public: for (size_t i=0; iget_tombstone_count(); - reccnt += (*m_shards[i])->get_record_count(); + reccnt += m_shards[i]->get_record_count(); } } @@ -235,8 +218,6 @@ public: auto new_level = std::make_shared(m_level_no, m_shards.size()); for (size_t i=0; im_shards[i] = m_shards[i]; - new_level->m_owns[i] = true; - m_owns[i] = false; } return new_level; @@ -248,12 +229,8 @@ private: size_t m_shard_cnt; size_t m_shard_size_cap; - std::vector m_shards; - + std::vector> m_shards; Shard *m_pending_shard; - - std::vector m_owns; - }; } -- cgit v1.2.3 From 357cab549c2ed33970562b84ff6f83923742343d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Tue, 7 Nov 2023 15:34:24 -0500 Subject: Comment and License updates --- include/framework/structure/BufferView.h | 2 +- include/framework/structure/ExtensionStructure.h | 2 +- include/framework/structure/InternalLevel.h | 2 +- include/framework/structure/MutableBuffer.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index ccd3dac..651e430 100644 --- 
a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -3,7 +3,7 @@ * * Copyright (C) 2023 Douglas B. Rumbaugh * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 80ec7b9..74cede6 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -4,7 +4,7 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 632fe17..00e0c58 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -4,7 +4,7 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index ba25cc3..671824f 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -4,7 +4,7 @@ * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * - * All rights reserved. Published under the Modified BSD License. + * Distributed under the Modified BSD License. * */ #pragma once -- cgit v1.2.3 From 83486744600e8be338c75c2e3d2339452a392a9d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 13 Nov 2023 10:41:13 -0500 Subject: Fixed merge logic bug in tiering In InternalLevel::clone(), the m_shard_cnt variable was not being set appropriately in the clone, resulting in the record counts for a multi-shard level being reported incorrectly. In DynamicExtension::merge(), the merges were being performed in the wrong order, resulting in multi-level merges deleting records. The leveling tests all passed even with this bug for some reason, but it caused tiering tests to fail. It isn't clear _why_ leveling appeared to work, but the bug is now fixed, so that's largely irrelevant I suppose. --- include/framework/structure/InternalLevel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/framework/structure') diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 00e0c58..d146b73 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -219,6 +219,7 @@ public: for (size_t i=0; im_shards[i] = m_shards[i]; } + new_level->m_shard_cnt = m_shard_cnt; return new_level; } -- cgit v1.2.3 From 90bb0614fc1d8f1a185a778e31aaf9027c01aeb8 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 13 Nov 2023 11:44:09 -0500 Subject: Tombstone Compaction: re-enabled tombstone compaction Currently, proactive buffer tombstone compaction is disabled by forcing the buffer tombstone capacity to match its record capacity. It isn't clear how to best handle proactive buffer compactions in an environment where new buffers are spawned anyway. 
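To make the invariant involved here concrete, a minimal sketch of the trigger (simplified names and bookkeeping; not the framework code itself): each level's tombstone count is measured against the configured maximum delete proportion, and the first level that violates the bound becomes the starting point for a preemptive compaction. This is the shape of the get_compaction_tasks() logic in the diff below.

    #include <cstddef>
    #include <vector>

    struct LevelStats {
        size_t tombstones;   // tombstone records currently on the level
        size_t capacity;     // record capacity of the level
    };

    // Returns the index of the first level whose tombstone proportion exceeds
    // max_delete_prop, or -1 if the delete invariant holds on every level.
    long find_compaction_candidate(const std::vector<LevelStats> &levels,
                                   double max_delete_prop) {
        for (size_t i = 0; i < levels.size(); i++) {
            double prop = (double) levels[i].tombstones / (double) levels[i].capacity;
            if (prop > max_delete_prop) {
                return (long) i;
            }
        }
        return -1;
    }
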
--- include/framework/structure/ExtensionStructure.h | 51 ++++++++++++++++++++++++ include/framework/structure/MutableBuffer.h | 2 +- 2 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 74cede6..a174805 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -201,6 +201,57 @@ public: return m_levels; } + std::vector get_compaction_tasks() { + std::vector tasks; + + /* if the tombstone/delete invariant is satisfied, no need for compactions */ + if (validate_tombstone_proportion()) { + return tasks; + } + + /* locate the first level to violate the invariant */ + level_index violation_idx = -1; + for (level_index i=0; i0; i--) { + MergeTask task = {i-1, i}; + + /* + * The amount of storage required for the merge accounts + * for the cost of storing the new records, along with the + * cost of retaining the old records during the process + * (hence the 2x multiplier). + * + * FIXME: currently does not account for the *actual* size + * of the shards, only the storage for the records + * themselves. + */ + size_t reccnt = m_levels[i-1]->get_record_count(); + if constexpr (L == LayoutPolicy::LEVELING) { + if (can_merge_with(i, reccnt)) { + reccnt += m_levels[i]->get_record_count(); + } + } + //task.m_size = 2* reccnt * sizeof(R); + + tasks.push_back(task); + } + + return tasks; + } + /* * */ diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 671824f..8b17091 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -32,7 +32,7 @@ template class MutableBuffer { public: MutableBuffer(size_t capacity, size_t max_tombstone_cap) - : m_cap(capacity), m_tombstone_cap(max_tombstone_cap), m_reccnt(0) + : m_cap(capacity), m_tombstone_cap(capacity), m_reccnt(0) , m_tombstonecnt(0), m_weight(0), m_max_weight(0), m_tail(0) { m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); -- cgit v1.2.3 From 3c127eda69295cb306739bdd3c5ddccff6026a8d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 13 Dec 2023 12:39:54 -0500 Subject: Refactoring: corrected a number of names and added more comments --- include/framework/structure/ExtensionStructure.h | 142 +++++++++++------------ include/framework/structure/InternalLevel.h | 58 ++++++--- include/framework/structure/MutableBuffer.h | 18 ++- 3 files changed, 120 insertions(+), 98 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index a174805..3cd55ac 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -45,8 +45,8 @@ public: /* * Create a shallow copy of this extension structure. The copy will share references to the * same levels/shards as the original, but will have its own lists. As all of the shards are - * immutable (with the exception of deletes), the copy can be restructured with merges, etc., - * without affecting the original. The copied structure will be returned with a reference + * immutable (with the exception of deletes), the copy can be restructured with reconstructions + * and flushes without affecting the original. 
The copied structure will be returned with a reference * count of 0; generally you will want to immediately call take_reference() on it. * * NOTE: When using tagged deletes, a delete of a record in the original structure will affect @@ -55,7 +55,7 @@ public: * need to be forwarded to the appropriate structures manually. */ ExtensionStructure *copy() { - auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, m_max_delete_prop); + auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, m_max_delete_prop); for (size_t i=0; im_levels.push_back(m_levels[i]->clone()); } @@ -90,17 +90,20 @@ public: } /* - * Merge the memory table down into the tree, completing any required other - * merges to make room for it. + * Flush a buffer into the extension structure, performing any necessary + * reconstructions to free up room in L0. + * + * FIXME: arguably, this should be a method attached to the buffer that + * takes a structure as input. */ - inline bool merge_buffer(Buffer *buffer) { - assert(can_merge_with(0, buffer->get_record_count())); + inline bool flush_buffer(Buffer *buffer) { + assert(can_reconstruct_with(0, buffer->get_record_count())); // FIXME: this step makes an extra copy of the buffer, // which could be avoided by adjusting the shard // reconstruction process a bit, possibly. - buffer->start_merge(); - merge_buffer_into_l0(buffer); + buffer->start_flush(); + flush_buffer_into_l0(buffer); return true; } @@ -123,7 +126,7 @@ public: * Return the total number of tombstones contained within all of the * levels of the structure. */ - size_t get_tombstone_cnt() { + size_t get_tombstone_count() { size_t cnt = 0; for (size_t i=0; i get_compaction_tasks() { - std::vector tasks; + std::vector get_compaction_tasks() { + std::vector tasks; /* if the tombstone/delete invariant is satisfied, no need for compactions */ if (validate_tombstone_proportion()) { @@ -220,16 +223,16 @@ public: assert(violation_idx != -1); - level_index merge_base_level = find_mergable_level(violation_idx); - if (merge_base_level == -1) { - merge_base_level = grow(); + level_index base_level = find_reconstruction_target(violation_idx); + if (base_level == -1) { + base_level = grow(); } - for (level_index i=merge_base_level; i>0; i--) { - MergeTask task = {i-1, i}; + for (level_index i=base_level; i>0; i--) { + ReconstructionTask task = {i-1, i}; /* - * The amount of storage required for the merge accounts + * The amount of storage required for the reconstruction accounts * for the cost of storing the new records, along with the * cost of retaining the old records during the process * (hence the 2x multiplier). @@ -240,7 +243,7 @@ public: */ size_t reccnt = m_levels[i-1]->get_record_count(); if constexpr (L == LayoutPolicy::LEVELING) { - if (can_merge_with(i, reccnt)) { + if (can_reconstruct_with(i, reccnt)) { reccnt += m_levels[i]->get_record_count(); } } @@ -255,28 +258,27 @@ public: /* * */ - std::vector get_merge_tasks(size_t buffer_reccnt) { - std::vector merges; + std::vector get_reconstruction_tasks(size_t buffer_reccnt) { + std::vector reconstructions; /* - * The buffer -> L0 merge task is not included so if that - * can be done without any other change, just return an - * empty list. + * The buffer flush is not included so if that can be done without any + * other change, just return an empty list. 
*/ - if (can_merge_with(0, buffer_reccnt)) { - return std::move(merges); + if (can_reconstruct_with(0, buffer_reccnt)) { + return std::move(reconstructions); } - level_index merge_base_level = find_mergable_level(0); - if (merge_base_level == -1) { - merge_base_level = grow(); + level_index base_level = find_reconstruction_target(0); + if (base_level == -1) { + base_level = grow(); } - for (level_index i=merge_base_level; i>0; i--) { - MergeTask task = {i-1, i}; + for (level_index i=base_level; i>0; i--) { + ReconstructionTask task = {i-1, i}; /* - * The amount of storage required for the merge accounts + * The amount of storage required for the reconstruction accounts * for the cost of storing the new records, along with the * cost of retaining the old records during the process * (hence the 2x multiplier). @@ -287,34 +289,34 @@ public: */ size_t reccnt = m_levels[i-1]->get_record_count(); if constexpr (L == LayoutPolicy::LEVELING) { - if (can_merge_with(i, reccnt)) { + if (can_reconstruct_with(i, reccnt)) { reccnt += m_levels[i]->get_record_count(); } } //task.m_size = 2* reccnt * sizeof(R); - merges.push_back(task); + reconstructions.push_back(task); } - return std::move(merges); + return std::move(reconstructions); } /* * */ - std::vector get_merge_tasks_from_level(level_index source_level) { - std::vector merges; + std::vector get_reconstruction_tasks_from_level(level_index source_level) { + std::vector reconstructions; - level_index merge_base_level = find_mergable_level(source_level); - if (merge_base_level == -1) { - merge_base_level = grow(); + level_index base_level = find_reconstruction_target(source_level); + if (base_level == -1) { + base_level = grow(); } - for (level_index i=merge_base_level; i>source_level; i--) { - MergeTask task = {i - 1, i}; + for (level_index i=base_level; i>source_level; i--) { + ReconstructionTask task = {i - 1, i}; /* - * The amount of storage required for the merge accounts + * The amount of storage required for the reconstruction accounts * for the cost of storing the new records, along with the * cost of retaining the old records during the process * (hence the 2x multiplier). @@ -325,31 +327,30 @@ public: */ size_t reccnt = m_levels[i-1]->get_record_count(); if constexpr (L == LayoutPolicy::LEVELING) { - if (can_merge_with(i, reccnt)) { + if (can_reconstruct_with(i, reccnt)) { reccnt += m_levels[i]->get_record_count(); } } // task.m_size = 2* reccnt * sizeof(R); - merges.push_back(task); + reconstructions.push_back(task); } - return merges; + return reconstructions; } /* - * Merge the level specified by incoming level into the level specified - * by base level. The two levels should be sequential--i.e. no levels - * are skipped in the merge process--otherwise the tombstone ordering - * invariant may be violated by the merge operation. + * Combine incoming_level with base_level and reconstruct the shard, + * placing it in base_level. The two levels should be sequential--i.e. no + * levels are skipped in the reconstruction process--otherwise the + * tombstone ordering invariant may be violated. 
*/ - inline void merge_levels(level_index base_level, level_index incoming_level) { - // merging two memory levels + inline void reconstruction(level_index base_level, level_index incoming_level) { if constexpr (L == LayoutPolicy::LEVELING) { auto tmp = m_levels[base_level]; - m_levels[base_level] = InternalLevel::merge_levels(m_levels[base_level].get(), m_levels[incoming_level].get()); + m_levels[base_level] = InternalLevel::reconstruction(m_levels[base_level].get(), m_levels[incoming_level].get()); } else { - m_levels[base_level]->append_merged_shards(m_levels[incoming_level].get()); + m_levels[base_level]->append_level(m_levels[incoming_level].get()); m_levels[base_level]->finalize(); } @@ -391,9 +392,7 @@ private: std::vector>> m_levels; /* - * Add a new level to the LSM Tree and return that level's index. Will - * automatically determine whether the level should be on memory or on disk, - * and act appropriately. + * Add a new level to the structure and return its index. */ inline level_index grow() { level_index new_idx = m_levels.size(); @@ -405,22 +404,18 @@ private: /* * Find the first level below the level indicated by idx that - * is capable of sustaining a merge operation and return its + * is capable of sustaining a reconstruction and return its * level index. If no such level exists, returns -1. Also - * returns -1 if idx==0, and no such level exists, to skimplify - * the logic of the first merge. + * returns -1 if idx==0, and no such level exists, to simplify + * the logic of the first buffer flush. */ - inline level_index find_mergable_level(level_index idx, Buffer *buffer=nullptr) { + inline level_index find_reconstruction_target(level_index idx, Buffer *buffer=nullptr) { if (idx == 0 && m_levels.size() == 0) return -1; - bool level_found = false; - bool disk_level; - level_index merge_level_idx; - size_t incoming_rec_cnt = get_level_record_count(idx, buffer); for (level_index i=idx+1; i(0, 1); temp_level->append_buffer(buffer); - auto new_level = InternalLevel::merge_levels(old_level, temp_level); + auto new_level = InternalLevel::reconstruction(old_level, temp_level); m_levels[0] = new_level; delete temp_level; @@ -479,13 +474,10 @@ private: } /* - * Determines if the specific level can merge with another record containing - * incoming_rec_cnt number of records. The provided level index should be - * non-negative (i.e., not refer to the buffer) and will be automatically - * translated into the appropriate index into either the disk or memory level - * vector. + * Determines if a level can sustain a reconstruction with incoming_rec_cnt + * additional records without exceeding its capacity. */ - inline bool can_merge_with(level_index idx, size_t incoming_rec_cnt) { + inline bool can_reconstruct_with(level_index idx, size_t incoming_rec_cnt) { if (idx >= m_levels.size() || !m_levels[idx]) { return false; } diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index d146b73..e70ed76 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -40,9 +40,14 @@ public: delete m_pending_shard; } - // WARNING: for leveling only. - // assuming the base level is the level new level is merging into. (base_level is larger.) - static std::shared_ptr merge_levels(InternalLevel* base_level, InternalLevel* new_level) { + /* + * Create a new shard combining the records from base_level and new_level, + * and return a shared_ptr to a new level containing this shard. 
This is used + * for reconstructions under the leveling layout policy. + * + * No changes are made to the levels provided as arguments. + */ + static std::shared_ptr reconstruction(InternalLevel* base_level, InternalLevel* new_level) { assert(base_level->m_level_no > new_level->m_level_no || (base_level->m_level_no == 0 && new_level->m_level_no == 0)); auto res = new InternalLevel(base_level->m_level_no, 1); res->m_shard_cnt = 1; @@ -54,18 +59,15 @@ public: return std::shared_ptr(res); } - void append_buffer(Buffer* buffer) { - if (m_shard_cnt == m_shards.size()) { - assert(m_pending_shard == nullptr); - m_pending_shard = new S(buffer); - return; - } - - m_shards[m_shard_cnt] = std::make_shared(buffer); - ++m_shard_cnt; - } - - void append_merged_shards(InternalLevel* level) { + /* + * Create a new shard combining the records from all of + * the shards in level, and append this new shard into + * this level. This is used for reconstructions under + * the tiering layout policy. + * + * No changes are made to the level provided as an argument. + */ + void append_level(InternalLevel* level) { Shard *shards[level->m_shard_cnt]; for (size_t i=0; im_shard_cnt; i++) { shards[i] = level->m_shards[i].get(); @@ -82,6 +84,22 @@ public: ++m_shard_cnt; } + /* + * Create a new shard using the records in the + * provided buffer, and append this new shard + * into this level. This is used for buffer + * flushes under the tiering layout policy. + */ + void append_buffer(Buffer* buffer) { + if (m_shard_cnt == m_shards.size()) { + assert(m_pending_shard == nullptr); + m_pending_shard = new S(buffer); + return; + } + + m_shards[m_shard_cnt] = std::make_shared(buffer); + ++m_shard_cnt; + } void finalize() { if (m_pending_shard) { @@ -95,7 +113,13 @@ public: } } - Shard *get_merged_shard() { + /* + * Create a new shard containing the combined records + * from all shards on this level and return it. + * + * No changes are made to this level. + */ + Shard *get_combined_shard() { if (m_shard_cnt == 0) { return nullptr; } @@ -109,7 +133,7 @@ public: return new S(shards, m_shard_cnt); } - // Append the sample range in-order..... + /* Append the sample range in-order */ void get_query_states(std::vector> &shards, std::vector& shard_states, void *query_parms) { for (size_t i=0; i + * Copyright (C) 2023 Douglas B. Rumbaugh * Dong Xie * * Distributed under the Modified BSD License. * + * FIXME: currently, the buffer itself is responsible for managing a + * secondary buffer for storing sorted records used during buffer flushes. It + * probably makes more sense to make the shard being flushed into responsible + * for this instead. This would also facilitate simultaneous flushes of multiple + * buffers more easily. 
+ * */ #pragma once @@ -35,7 +41,7 @@ public: : m_cap(capacity), m_tombstone_cap(capacity), m_reccnt(0) , m_tombstonecnt(0), m_weight(0), m_max_weight(0), m_tail(0) { m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); - m_merge_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); + m_sorted_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); m_tombstone_filter = nullptr; if (max_tombstone_cap > 0) { m_tombstone_filter = new psudb::BloomFilter(BF_FPR, max_tombstone_cap, BF_HASH_FUNCS); @@ -49,7 +55,7 @@ public: if (m_data) free(m_data); if (m_tombstone_filter) delete m_tombstone_filter; - if (m_merge_data) free(m_merge_data); + if (m_sorted_data) free(m_sorted_data); } template @@ -171,8 +177,8 @@ public: * to be adjusted). Other threads having read access is perfectly * acceptable, however. */ - bool start_merge() { - memcpy(m_merge_data, m_data, sizeof(Wrapped) * m_reccnt.load()); + bool start_flush() { + memcpy(m_sorted_data, m_data, sizeof(Wrapped) * m_reccnt.load()); return true; } @@ -210,7 +216,7 @@ private: size_t m_tombstone_cap; Wrapped* m_data; - Wrapped* m_merge_data; + Wrapped* m_sorted_data; psudb::BloomFilter* m_tombstone_filter; -- cgit v1.2.3 From 24a42e300c96e2815bf20be3f6cce3efee1c4303 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 21 Dec 2023 17:03:39 -0500 Subject: ExtensionStructure: adjusted leveling logic to avoid unneeded copies This also reduces the special-case overhead on shards. As it was, shards would need to handle a special case when constructing from other shards where the first of the two provided shards was a nullptr, which caused a number of subtle issues (or outright crashes in some cases) with existing shard implementations. --- include/framework/structure/ExtensionStructure.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 3cd55ac..60016a0 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -347,13 +347,19 @@ public: */ inline void reconstruction(level_index base_level, level_index incoming_level) { if constexpr (L == LayoutPolicy::LEVELING) { - auto tmp = m_levels[base_level]; - m_levels[base_level] = InternalLevel::reconstruction(m_levels[base_level].get(), m_levels[incoming_level].get()); + /* if the base level has a shard, merge the base and incoming together to make a new one */ + if (m_levels[base_level]->get_shard_count() > 0) { + m_levels[base_level] = InternalLevel::reconstruction(m_levels[base_level].get(), m_levels[incoming_level].get()); + /* otherwise, we can just move the incoming to the base */ + } else { + m_levels[base_level] = m_levels[incoming_level]; + } } else { m_levels[base_level]->append_level(m_levels[incoming_level].get()); m_levels[base_level]->finalize(); } + /* place a new, empty level where the incoming level used to be */ m_levels[incoming_level] = std::shared_ptr>(new InternalLevel(incoming_level, (L == LayoutPolicy::LEVELING) ? 
1 : m_scale_factor)); } @@ -432,10 +438,13 @@ private: auto old_level = m_levels[0].get(); auto temp_level = new InternalLevel(0, 1); temp_level->append_buffer(buffer); - auto new_level = InternalLevel::reconstruction(old_level, temp_level); - m_levels[0] = new_level; - delete temp_level; + if (old_level->get_shard_count() > 0) { + m_levels[0] = InternalLevel::reconstruction(old_level, temp_level); + delete temp_level; + } else { + m_levels[0] = std::shared_ptr>(temp_level); + } } else { m_levels[0]->append_buffer(buffer); } -- cgit v1.2.3 From 46dfe1e0f3bb05016da14b39b2e35babbba4027a Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 21 Dec 2023 17:05:30 -0500 Subject: InternalLevel: appending an empty level is a no-op The existing reconstruction logic will occasionally attempt to append an empty level to another empty level, for some reason. While the underlying cause of this needs to be looked into, this special case should prevent shard constructors from being called with a shard count of 0 under tiering, reducing the error handling overhead of shard code. --- include/framework/structure/InternalLevel.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/framework/structure') diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index e70ed76..ee85cb3 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -68,6 +68,13 @@ public: * No changes are made to the level provided as an argument. */ void append_level(InternalLevel* level) { + // FIXME: that this is happening probably means that + // something is going terribly wrong earlier in the + // reconstruction logic. + if (level->get_shard_count() == 0) { + return; + } + Shard *shards[level->m_shard_cnt]; for (size_t i=0; im_shard_cnt; i++) { shards[i] = level->m_shards[i].get(); -- cgit v1.2.3 From 3c2e6b3b456867d7155b158432b891b84e4e1dd6 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Tue, 9 Jan 2024 14:47:48 -0500 Subject: Initial update of buffer to new specifications There are a few minor issues that this introduces, however. Global tracking of a lot of secondary information, such as weights for WIRS/WSS, or the exact number of tombstones, will need to be approached differently than it has been historically with this new approach. I've also removed most of the tombstone capacity related code. We had decided not to bother enforcing this at the buffer level anyway, and it would greatly increase the complexity of predicting when the next compaction will occur. On the whole this new approach seems like it'll simplify a lot. This commit actually removes significantly more code than it adds. One minor issue: the current implementation will have problems with the circular array indexing once more than 2^64 records have been inserted. This doesn't seem like a realistic problem at the moment.
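To make the wraparound caveat above concrete, here is a minimal single-threaded sketch of the indexing scheme this commit adopts. The names and structure are hypothetical simplifications, not the actual MutableBuffer interface: head and tail are monotonically increasing logical counters, and the physical slot is always the counter modulo the capacity, so the slot mapping only stays correct while the counters have not wrapped past 2^64 (since 2^64 is generally not a multiple of the capacity).

/*
 * Illustrative sketch only: logical head/tail counters that grow without
 * bound, mapped onto physical slots by modulus.
 */
#include <cstddef>
#include <vector>

template <typename R>
class RingSketch {
public:
    explicit RingSketch(size_t capacity)
        : m_cap(capacity), m_head(0), m_tail(0), m_data(capacity) {}

    /* reject the append if the buffer is logically full */
    bool append(const R &rec) {
        if (m_tail - m_head >= m_cap) {
            return false;
        }
        /* logical index -> physical slot via modulus; this mapping
         * breaks if the counter wraps around 2^64 */
        m_data[m_tail % m_cap] = rec;
        m_tail++;  /* counters are never reset, only advanced */
        return true;
    }

    /* logically free the oldest record by advancing the head */
    void pop() {
        if (m_tail > m_head) {
            m_head++;
        }
    }

    size_t record_count() const {
        return m_tail - m_head;
    }

private:
    size_t m_cap;
    size_t m_head;
    size_t m_tail;
    std::vector<R> m_data;
};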
--- include/framework/structure/BufferView.h | 98 ++++++----------------- include/framework/structure/MutableBuffer.h | 116 +++++++++------------------- 2 files changed, 62 insertions(+), 152 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 651e430..8a5f50f 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -22,107 +22,59 @@ #include "psu-ds/Alias.h" #include "psu-util/timer.h" #include "framework/interface/Record.h" -#include "framework/structure/MutableBuffer.h" #include "framework/interface/Query.h" namespace de { -template +template class BufferView { - typedef MutableBuffer Buffer; public: BufferView() = default; - BufferView(std::vector buffers) - : m_buffers(buffers) - , m_cutoff(buffers[buffers.size()-1]->get_record_count()) - {} + BufferView(const Wrapped *buffer, size_t head, size_t tail, psudb::BloomFilter *filter) + : m_buffer(buffer), m_head(head), m_tail(tail), m_tombstone_filter(filter) {} ~BufferView() = default; - bool delete_record(const R& rec) { - auto res = false; - for (auto buf : m_buffers) { - res = buf->delete_record(rec); - if (res) return true; - } - return false; - } - bool check_tombstone(const R& rec) { - auto res = false; - for (auto buf : m_buffers) { - res = buf->check_tombstone(rec); - if (res) return true; + if (m_tombstone_filter && !m_tombstone_filter->lookup(rec)) return false; + + for (size_t i=0; iget_record_count(); - } - return reccnt; + return m_tail - m_head; } - size_t get_capacity() { - return m_buffers[0]->get_capacity(); - } - - bool is_full() { - return m_buffers[m_buffers.size() - 1]->is_full(); - } - size_t get_tombstone_count() { - size_t tscnt = 0; - for (auto buf : m_buffers) { - tscnt += buf->get_tombstone_count(); - } - return tscnt; + // FIXME: tombstone count + return 0; } - size_t get_memory_usage() { - size_t mem = 0; - for (auto buf : m_buffers) { - mem += buf->get_memory_usage(); - } - return mem; + Wrapped *get(size_t i) { + assert(i < get_record_count()); + return m_buffer + to_idx(i); } - size_t get_aux_memory_usage() { - size_t mem = 0; - for (auto buf : m_buffers) { - mem += buf->get_aux_memory_usage(); - } - return mem; - } - - size_t get_tombstone_capacity() { - return m_buffers[0]->get_tombstone_capacity(); - } - - std::vector get_query_states(void *parms) { - std::vector states; - - for (auto buf : m_buffers) { - states.push_back(Q::get_buffer_query_state(buf, parms)); - } - - return states; + void copy_to_buffer(byte *buffer) { + memcpy(buffer, m_buffer, get_record_count() * sizeof(Wrapped)); } - std::vector &get_buffers() { - return m_buffers; - } +private: + const Wrapped* m_buffer; + size_t m_head; + size_t m_tail; + psudb::BloomFilter *m_tombstone_filter; - size_t size() { - return m_buffers.size(); + size_t to_idx(size_t i) { + return (m_head + i) % m_buffer->get_capacity(); } - -private: - std::vector m_buffers; - size_t m_cutoff; }; } diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 58b5fb4..7bec219 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -6,12 +6,6 @@ * * Distributed under the Modified BSD License. * - * FIXME: currently, the buffer itself is responsible for managing a - * secondary buffer for storing sorted records used during buffer flushes. 
It - * probably makes more sense to make the shard being flushed into responsible - * for this instead. This would also facilitate simultaneous flushes of multiple - * buffers more easily. - * */ #pragma once @@ -29,6 +23,7 @@ #include "psu-ds/Alias.h" #include "psu-util/timer.h" #include "framework/interface/Record.h" +#include "framework/structure/BufferView.h" using psudb::CACHELINE_SIZE; @@ -36,18 +31,22 @@ namespace de { template class MutableBuffer { + friend class BufferView; public: - MutableBuffer(size_t capacity, size_t max_tombstone_cap) - : m_cap(capacity), m_tombstone_cap(capacity), m_reccnt(0) - , m_tombstonecnt(0), m_weight(0), m_max_weight(0), m_tail(0) { - m_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); - m_sorted_data = (Wrapped*) psudb::sf_aligned_alloc(CACHELINE_SIZE, capacity*sizeof(Wrapped)); - m_tombstone_filter = nullptr; - if (max_tombstone_cap > 0) { - m_tombstone_filter = new psudb::BloomFilter(BF_FPR, max_tombstone_cap, BF_HASH_FUNCS); + MutableBuffer(size_t low_watermark, size_t high_watermark, size_t capacity=0) + : m_lwm(low_watermark), m_hwm(high_watermark), m_cap(capacity), m_head(0), m_tail(0) { + /* + * default capacity is twice the high water mark, to account for the worst-case + * memory requirements. + */ + if (m_cap == 0) { + m_cap = m_hwm * 2; } - m_refcnt.store(0); + m_data = (Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped)); + + // FIXME: need to figure out how to detail with tombstones at some point... + m_tombstone_filter = new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS); } ~MutableBuffer() { @@ -55,13 +54,10 @@ public: if (m_data) free(m_data); if (m_tombstone_filter) delete m_tombstone_filter; - if (m_sorted_data) free(m_sorted_data); } template int append(const R &rec, bool tombstone=false) { - if (tombstone && m_tombstonecnt + 1 > m_tombstone_cap) return 0; - int32_t pos = 0; if ((pos = try_advance_tail()) == -1) return 0; @@ -78,26 +74,11 @@ public: if (m_tombstone_filter) m_tombstone_filter->insert(rec); } - if constexpr (WeightedRecordInterface) { - m_weight.fetch_add(rec.weight); - double old = m_max_weight.load(); - while (old < rec.weight) { - m_max_weight.compare_exchange_strong(old, rec.weight); - old = m_max_weight.load(); - } - } else { - m_weight.fetch_add(1); - } - - m_reccnt.fetch_add(1); return 1; } bool truncate() { m_tombstonecnt.store(0); - m_reccnt.store(0); - m_weight.store(0); - m_max_weight.store(0); m_tail.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); @@ -105,7 +86,7 @@ public: } size_t get_record_count() { - return m_reccnt; + return (m_tail - m_head) % m_cap; } size_t get_capacity() { @@ -113,7 +94,7 @@ public: } bool is_full() { - return m_reccnt == m_cap; + return (m_tail % m_cap) >= m_hwm; } size_t get_tombstone_count() { @@ -121,13 +102,11 @@ public: } bool delete_record(const R& rec) { - auto offset = 0; - while (offset < m_reccnt.load()) { - if (m_data[offset].rec == rec) { - m_data[offset].set_delete(); + for (size_t i=0; ilookup(rec)) return false; - auto offset = 0; - while (offset < m_reccnt.load()) { - if (m_data[offset].rec == rec && m_data[offset].is_tombstone()) { + for (size_t i=0; i *get_data() { - return m_data; - } - - double get_max_weight() { - return m_max_weight; - } - - /* - * This operation assumes that no other threads have write access - * to the buffer. This will be the case in normal operation, at - * present, but may change (in which case this approach will need - * to be adjusted). 
Other threads having read access is perfectly - * acceptable, however. - */ - bool start_flush() { - memcpy(m_sorted_data, m_data, sizeof(Wrapped) * m_reccnt.load()); - return true; + // FIXME: tombstone capacity needs figured out again + return m_cap; } /* @@ -202,30 +157,33 @@ public: private: int64_t try_advance_tail() { - int64_t new_tail = m_tail.fetch_add(1); + int64_t new_tail = m_tail.fetch_add(1) % m_cap; - if (new_tail < m_cap) { + if (new_tail < m_hwm) { return new_tail; - } + } m_tail.fetch_add(-1); return -1; } + size_t to_idx(size_t i) { + return (m_head + i) % m_cap; + } + size_t m_cap; - size_t m_tombstone_cap; + + size_t m_lwm; + size_t m_hwm; + + alignas(64) std::atomic m_tail; + alignas(64) std::atomic m_head; Wrapped* m_data; - Wrapped* m_sorted_data; psudb::BloomFilter* m_tombstone_filter; alignas(64) std::atomic m_tombstonecnt; - alignas(64) std::atomic m_reccnt; - alignas(64) std::atomic m_tail; - alignas(64) std::atomic m_weight; - alignas(64) std::atomic m_max_weight; - alignas(64) std::atomic m_refcnt; }; -- cgit v1.2.3 From 53879a0d69f5e578710b7125e9b41e516c2371d4 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 10 Jan 2024 17:39:28 -0500 Subject: MutableBuffer+View: Implementation with unit tests --- include/framework/structure/BufferView.h | 32 ++++--- include/framework/structure/MutableBuffer.h | 133 ++++++++++++++++++++-------- 2 files changed, 116 insertions(+), 49 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 8a5f50f..7e8af45 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -9,32 +9,34 @@ #pragma once #include -#include -#include #include -#include -#include -#include +#include #include "psu-util/alignment.h" -#include "util/bf_config.h" #include "psu-ds/BloomFilter.h" -#include "psu-ds/Alias.h" -#include "psu-util/timer.h" #include "framework/interface/Record.h" -#include "framework/interface/Query.h" namespace de { +typedef std::function ReleaseFunction; + template class BufferView { public: BufferView() = default; - BufferView(const Wrapped *buffer, size_t head, size_t tail, psudb::BloomFilter *filter) - : m_buffer(buffer), m_head(head), m_tail(tail), m_tombstone_filter(filter) {} + BufferView(const Wrapped *buffer, size_t head, size_t tail, psudb::BloomFilter *filter, + void *parent_buffer, ReleaseFunction release) + : m_buffer(buffer) + , m_release(release) + , m_parent_buffer(parent_buffer) + , m_head(head) + , m_tail(tail) + , m_tombstone_filter(filter) {} - ~BufferView() = default; + ~BufferView() { + m_release(m_parent_buffer, m_head); + } bool check_tombstone(const R& rec) { if (m_tombstone_filter && !m_tombstone_filter->lookup(rec)) return false; @@ -62,12 +64,14 @@ public: return m_buffer + to_idx(i); } - void copy_to_buffer(byte *buffer) { - memcpy(buffer, m_buffer, get_record_count() * sizeof(Wrapped)); + void copy_to_buffer(psudb::byte *buffer) { + memcpy(buffer, (std::byte*) (m_buffer + m_head), get_record_count() * sizeof(Wrapped)); } private: const Wrapped* m_buffer; + void *m_parent_buffer; + ReleaseFunction m_release; size_t m_head; size_t m_tail; psudb::BloomFilter *m_tombstone_filter; diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 7bec219..d57ad6e 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -11,17 +11,11 @@ #include #include -#include 
#include -#include -#include -#include #include "psu-util/alignment.h" #include "util/bf_config.h" #include "psu-ds/BloomFilter.h" -#include "psu-ds/Alias.h" -#include "psu-util/timer.h" #include "framework/interface/Record.h" #include "framework/structure/BufferView.h" @@ -34,7 +28,8 @@ class MutableBuffer { friend class BufferView; public: MutableBuffer(size_t low_watermark, size_t high_watermark, size_t capacity=0) - : m_lwm(low_watermark), m_hwm(high_watermark), m_cap(capacity), m_head(0), m_tail(0) { + : m_lwm(low_watermark), m_hwm(high_watermark), m_cap(capacity), m_head(0), m_tail(0), + m_old_head(0), m_head_refcnt(0), m_old_head_refcnt(0) { /* * default capacity is twice the high water mark, to account for the worst-case * memory requirements. @@ -50,10 +45,8 @@ public: } ~MutableBuffer() { - assert(m_refcnt.load() == 0); - - if (m_data) free(m_data); - if (m_tombstone_filter) delete m_tombstone_filter; + free(m_data); + delete m_tombstone_filter; } template @@ -94,7 +87,11 @@ public: } bool is_full() { - return (m_tail % m_cap) >= m_hwm; + return get_record_count() >= m_hwm; + } + + bool is_at_low_watermark() { + return (m_tail % m_cap) > m_lwm; } size_t get_tombstone_count() { @@ -125,66 +122,132 @@ public: } size_t get_memory_usage() { - return m_cap * sizeof(R); + return m_cap * sizeof(Wrapped); } size_t get_aux_memory_usage() { return m_tombstone_filter->get_memory_usage(); } - size_t get_tombstone_capacity() { - // FIXME: tombstone capacity needs figured out again - return m_cap; + BufferView get_buffer_view() { + m_head_refcnt.fetch_add(1); + return BufferView(m_data, m_head, m_tail.load(), m_tombstone_filter, (void*) this, release_head_reference); } /* - * Concurrency-related operations + * Advance the buffer following a reconstruction. Move current + * head and head_refcnt into old_head and old_head_refcnt, then + * assign new_head to old_head. */ - bool take_reference() { - m_refcnt.fetch_add(1); - return true; + void advance_head(size_t new_head) { + assert(new_head > m_head.load()); + assert(new_head <= m_tail.load()); + assert(m_old_head_refcnt == 0); + + /* + * the order here is very important. We first store zero to the + * old_refcnt (should be zero anyway). Then we move the current + * head to old head. At this point, any new buffer views should + * increment the old head refcnt, so no new references to the + * current head will be taken. Then we add the current head + * refcnt to this. This is to ensure that no references get + * dropped. Only after this do we change to the new head + */ + m_old_head_refcnt.store(0); + m_old_head.store(m_head.load()); + m_old_head_refcnt.fetch_add(m_head_refcnt); + + m_head_refcnt.store(0); + m_head.store(new_head); } - bool release_reference() { - assert(m_refcnt > 0); - m_refcnt.fetch_add(-1); - return true; + void set_low_watermark(size_t lwm) { + assert(lwm < m_hwm); + m_lwm = lwm; } - size_t get_reference_count() { - return m_refcnt.load(); + size_t get_low_watermark() { + return m_lwm; + } + + void set_high_watermark(size_t hwm) { + assert(hwm > m_lwm); + assert(hwm < m_cap); + m_hwm = hwm; + } + + size_t get_high_watermark() { + return m_hwm; + } + + size_t get_tail() { + return m_tail.load(); + } + + /* + * Note: this returns the available physical storage capacity, + * *not* now many more records can be inserted before the + * HWM is reached. 
+ */ + size_t get_available_capacity() { + return m_cap - (m_tail.load() - m_old_head.load()); } private: int64_t try_advance_tail() { - int64_t new_tail = m_tail.fetch_add(1) % m_cap; + size_t old_value = m_tail.load(); + + /* if full, fail to advance the tail */ + if (old_value >= m_hwm) { + return -1; + } - if (new_tail < m_hwm) { - return new_tail; + while (!m_tail.compare_exchange_strong(old_value, old_value+1)) { + /* if full, stop trying and fail to advance the tail */ + if (m_tail.load() >= m_hwm) { + return -1; + } } - m_tail.fetch_add(-1); - return -1; + return old_value; } size_t to_idx(size_t i) { return (m_head + i) % m_cap; } - size_t m_cap; + static void release_head_reference(void *buff, size_t head) { + MutableBuffer *buffer = (MutableBuffer *) buff; + + if (head == buffer->m_head.load()) { + buffer->m_head_refcnt.fetch_sub(1); + } else if (head == buffer->m_old_head.load()) { + buffer->m_old_head_refcnt.fetch_sub(1); + /* + * if the old head refcnt drops to 0, free + * the records by setting old_head = head + */ + if (buffer->m_old_head_refcnt.load() == 0) { + buffer->m_old_head.store(buffer->m_head); + } + } + } size_t m_lwm; size_t m_hwm; + size_t m_cap; alignas(64) std::atomic m_tail; + alignas(64) std::atomic m_head; + alignas(64) std::atomic m_head_refcnt; + + alignas(64) std::atomic m_old_head; + alignas(64) std::atomic m_old_head_refcnt; Wrapped* m_data; - psudb::BloomFilter* m_tombstone_filter; - alignas(64) std::atomic m_tombstonecnt; - alignas(64) std::atomic m_refcnt; }; } -- cgit v1.2.3 From eb19677340be6f0befe9da2199e5832af51eea0d Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 10 Jan 2024 18:01:30 -0500 Subject: MutableBuffer: multithreaded insert test + bugfixes --- include/framework/structure/MutableBuffer.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index d57ad6e..a065154 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -49,7 +49,6 @@ public: delete m_tombstone_filter; } - template int append(const R &rec, bool tombstone=false) { int32_t pos = 0; if ((pos = try_advance_tail()) == -1) return 0; @@ -91,7 +90,7 @@ public: } bool is_at_low_watermark() { - return (m_tail % m_cap) > m_lwm; + return get_record_count() >= m_lwm; } size_t get_tombstone_count() { @@ -139,10 +138,14 @@ public: * head and head_refcnt into old_head and old_head_refcnt, then * assign new_head to old_head. */ - void advance_head(size_t new_head) { + bool advance_head(size_t new_head) { assert(new_head > m_head.load()); assert(new_head <= m_tail.load()); - assert(m_old_head_refcnt == 0); + + /* refuse to advance head while there is an old with one references */ + if (m_old_head_refcnt > 0) { + return false; + } /* * the order here is very important. 
We first store zero to the @@ -159,6 +162,8 @@ public: m_head_refcnt.store(0); m_head.store(new_head); + + return true; } void set_low_watermark(size_t lwm) { -- cgit v1.2.3 From 5db0f96e9f3d2505b5f751abc133cbf7e13b5129 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 11 Jan 2024 11:31:33 -0500 Subject: Fixed some potential buffer-related concurrency bugs --- include/framework/structure/BufferView.h | 45 ++++++++---- include/framework/structure/MutableBuffer.h | 106 +++++++++++++++++----------- 2 files changed, 94 insertions(+), 57 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 7e8af45..00b6101 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -18,31 +18,43 @@ namespace de { -typedef std::function ReleaseFunction; +typedef std::_Bind ReleaseFunction; template class BufferView { public: BufferView() = default; - BufferView(const Wrapped *buffer, size_t head, size_t tail, psudb::BloomFilter *filter, - void *parent_buffer, ReleaseFunction release) - : m_buffer(buffer) + BufferView(const Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, + ReleaseFunction release) + : m_data(buffer) , m_release(release) - , m_parent_buffer(parent_buffer) , m_head(head) , m_tail(tail) + , m_cap(cap) + , m_approx_ts_cnt(tombstone_cnt) , m_tombstone_filter(filter) {} ~BufferView() { - m_release(m_parent_buffer, m_head); + m_release(); } bool check_tombstone(const R& rec) { if (m_tombstone_filter && !m_tombstone_filter->lookup(rec)) return false; for (size_t i=0; i *get(size_t i) { assert(i < get_record_count()); - return m_buffer + to_idx(i); + return m_data + to_idx(i); } void copy_to_buffer(psudb::byte *buffer) { - memcpy(buffer, (std::byte*) (m_buffer + m_head), get_record_count() * sizeof(Wrapped)); + memcpy(buffer, (std::byte*) (m_data + m_head), get_record_count() * sizeof(Wrapped)); } private: - const Wrapped* m_buffer; - void *m_parent_buffer; + const Wrapped* m_data; ReleaseFunction m_release; size_t m_head; size_t m_tail; + size_t m_cap; + size_t m_approx_ts_cnt; psudb::BloomFilter *m_tombstone_filter; size_t to_idx(size_t i) { - return (m_head + i) % m_buffer->get_capacity(); + return (m_head + i) % m_cap; } }; diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index a065154..3a06f0d 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -6,12 +6,22 @@ * * Distributed under the Modified BSD License. * + * NOTE: Concerning the tombstone count. One possible approach + * would be to track the number of tombstones below and above the + * low water mark--this would be straightforward to do. 
Then, if we + * *require* that the head only advance up to the LWM, we can get a + * correct view on the number of tombstones in the active buffer at + * any point in time, and the BufferView will have a pretty good + * approximation as well (potentially with a few extra if new inserts + * happen between when the tail pointer and tombstone count are fetched) + * */ #pragma once #include #include #include +#include #include "psu-util/alignment.h" #include "util/bf_config.h" @@ -28,20 +38,22 @@ class MutableBuffer { friend class BufferView; public: MutableBuffer(size_t low_watermark, size_t high_watermark, size_t capacity=0) - : m_lwm(low_watermark), m_hwm(high_watermark), m_cap(capacity), m_head(0), m_tail(0), - m_old_head(0), m_head_refcnt(0), m_old_head_refcnt(0) { - /* - * default capacity is twice the high water mark, to account for the worst-case - * memory requirements. - */ - if (m_cap == 0) { - m_cap = m_hwm * 2; - } - - m_data = (Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped)); - - // FIXME: need to figure out how to detail with tombstones at some point... - m_tombstone_filter = new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS); + : m_lwm(low_watermark) + , m_hwm(high_watermark) + , m_cap((capacity == 0) ? 2 * high_watermark : capacity) + , m_tail(0) + , m_head(0) + , m_head_refcnt(0) + , m_old_head(0) + , m_old_head_refcnt(0) + , m_data((Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped))) + , m_tombstone_filter(new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS)) + , m_tscnt(0) + , m_old_tscnt(0) + , m_active_head_advance(false) + { + assert(m_cap > m_hwm); + assert(m_hwm > m_lwm); } ~MutableBuffer() { @@ -62,7 +74,7 @@ public: m_data[pos].header |= (pos << 2); if (tombstone) { - m_tombstonecnt.fetch_add(1); + m_tscnt.fetch_add(1); if (m_tombstone_filter) m_tombstone_filter->insert(rec); } @@ -70,7 +82,7 @@ public: } bool truncate() { - m_tombstonecnt.store(0); + m_tscnt.store(0); m_tail.store(0); if (m_tombstone_filter) m_tombstone_filter->clear(); @@ -78,7 +90,7 @@ public: } size_t get_record_count() { - return (m_tail - m_head) % m_cap; + return m_tail - m_head; } size_t get_capacity() { @@ -94,30 +106,15 @@ public: } size_t get_tombstone_count() { - return m_tombstonecnt.load(); + return m_tscnt.load(); } bool delete_record(const R& rec) { - for (size_t i=0; ilookup(rec)) return false; - - for (size_t i=0; i get_buffer_view() { m_head_refcnt.fetch_add(1); - return BufferView(m_data, m_head, m_tail.load(), m_tombstone_filter, (void*) this, release_head_reference); + auto f = std::bind(release_head_reference, (void *) this, m_head.load()); + return BufferView(m_data, m_cap, m_head.load(), m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); } /* @@ -147,6 +145,8 @@ public: return false; } + m_active_head_advance.store(true); + /* * the order here is very important. We first store zero to the * old_refcnt (should be zero anyway). Then we move the current @@ -157,12 +157,14 @@ public: * dropped. 
Only after this do we change to the new head */ m_old_head_refcnt.store(0); + m_old_head.store(m_head.load()); m_old_head_refcnt.fetch_add(m_head_refcnt); m_head_refcnt.store(0); m_head.store(new_head); + m_active_head_advance.store(false); return true; } @@ -212,29 +214,44 @@ private: if (m_tail.load() >= m_hwm) { return -1; } + + _mm_pause(); } return old_value; } - size_t to_idx(size_t i) { - return (m_head + i) % m_cap; + size_t to_idx(size_t i, size_t head) { + return (head + i) % m_cap; } static void release_head_reference(void *buff, size_t head) { MutableBuffer *buffer = (MutableBuffer *) buff; - if (head == buffer->m_head.load()) { - buffer->m_head_refcnt.fetch_sub(1); - } else if (head == buffer->m_old_head.load()) { + /* + * check old head first. During a head transition, the head being + * retired will first be assigned to *both* head and old_head. As + * a result, any refcnt updates during this time should be applied + * to old_head, even if the current head and the head being released + * also match. + */ + if (head == buffer->m_old_head.load()) { buffer->m_old_head_refcnt.fetch_sub(1); /* * if the old head refcnt drops to 0, free * the records by setting old_head = head + * before this, spin while the two heads are equal to + * avoid */ + while (buffer->m_active_head_advance.load()) { + _mm_pause(); + } + if (buffer->m_old_head_refcnt.load() == 0) { buffer->m_old_head.store(buffer->m_head); } + } else if (head == buffer->m_head.load()) { + buffer->m_head_refcnt.fetch_sub(1); } } @@ -252,7 +269,10 @@ private: Wrapped* m_data; psudb::BloomFilter* m_tombstone_filter; - alignas(64) std::atomic m_tombstonecnt; + alignas(64) std::atomic m_tscnt; + size_t m_old_tscnt; + + alignas(64) std::atomic m_active_head_advance; }; } -- cgit v1.2.3 From c596ed468c2279f959b04d83d7f2e9692db84bae Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 11 Jan 2024 15:28:00 -0500 Subject: BufferView: enforce move semantics Because a BufferView's lifetime is so tightly linked to the lifetime of regions of the buffer, it can't be copied without potentially breaking things. --- include/framework/structure/BufferView.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 00b6101..98e41dd 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -25,7 +25,24 @@ class BufferView { public: BufferView() = default; - BufferView(const Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, + /* + * the BufferView's lifetime is tightly linked to buffer versioning, and so + * copying and assignment are disabled. 
+ */ + BufferView(const BufferView&) = delete; + BufferView &operator=(BufferView &) = delete; + + void operator=(BufferView &&src) { + m_data = src.m_data; + m_release = src.m_release; + m_head = src.m_head; + m_tail = src.m_tail; + m_cap = src.m_cap; + m_approx_ts_cnt = src.m_approx_ts_cnt; + m_tombstone_filter = src.filter; + } + + BufferView(Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, ReleaseFunction release) : m_data(buffer) , m_release(release) @@ -85,7 +102,7 @@ public: } private: - const Wrapped* m_data; + Wrapped* m_data; ReleaseFunction m_release; size_t m_head; size_t m_tail; -- cgit v1.2.3 From 7e503464176adbd0880373325e30a6bfd58616f0 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 11 Jan 2024 16:31:24 -0500 Subject: InternalLevel update and tests Plus some assorted fixes for move semantics stuff in BufferView that accompanied these changes. --- include/framework/structure/BufferView.h | 44 ++++++++++++++++++++++------- include/framework/structure/InternalLevel.h | 11 ++++---- 2 files changed, 39 insertions(+), 16 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 98e41dd..d058714 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -18,12 +18,25 @@ namespace de { -typedef std::_Bind ReleaseFunction; +typedef std::_Bind ReleaseFunction; + +static void noop_func(void *arg1, size_t arg2) { + return; +} + +constexpr auto noop_bind = std::bind(noop_func, (void*) nullptr, 0ul); template class BufferView { public: - BufferView() = default; + BufferView() + : m_data(nullptr) + , m_release(noop_bind) + , m_head(0) + , m_tail(0) + , m_cap(0) + , m_approx_ts_cnt(0) + , m_tombstone_filter(nullptr) {} /* * the BufferView's lifetime is tightly linked to buffer versioning, and so @@ -32,14 +45,25 @@ public: BufferView(const BufferView&) = delete; BufferView &operator=(BufferView &) = delete; - void operator=(BufferView &&src) { - m_data = src.m_data; - m_release = src.m_release; - m_head = src.m_head; - m_tail = src.m_tail; - m_cap = src.m_cap; - m_approx_ts_cnt = src.m_approx_ts_cnt; - m_tombstone_filter = src.filter; + BufferView(BufferView &&other) + : m_data(std::exchange(other.m_data, nullptr)) + , m_release(std::move(other.m_release)) + , m_head(std::exchange(other.m_head, 0)) + , m_tail(std::exchange(other.m_tail, 0)) + , m_cap(std::exchange(other.m_cap, 0)) + , m_approx_ts_cnt(std::exchange(other.m_approx_ts_cnt, 0)) + , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr)) {} + + BufferView &operator=(BufferView &&other) noexcept { + std::swap(m_data, other.m_data); + m_release = std::move(other.m_release); + std::swap(m_head, other.m_head); + std::swap(m_tail, other.m_tail); + std::swap(m_cap, other.m_cap); + std::swap(m_approx_ts_cnt, other.m_approx_ts_cnt); + std::swap(m_tombstone_filter, other.m_tombstone_filter); + + return *this; } BufferView(Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index ee85cb3..b35cadd 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -16,7 +16,7 @@ #include "framework/interface/Shard.h" #include "framework/interface/Query.h" #include "framework/interface/Record.h" -#include 
"framework/structure/MutableBuffer.h" +#include "framework/structure/BufferView.h" namespace de { template @@ -27,7 +27,7 @@ class InternalLevel; template class InternalLevel { typedef S Shard; - typedef MutableBuffer Buffer; + typedef BufferView BuffView; public: InternalLevel(ssize_t level_no, size_t shard_cap) : m_level_no(level_no) @@ -97,14 +97,14 @@ public: * into this level. This is used for buffer * flushes under the tiering layout policy. */ - void append_buffer(Buffer* buffer) { + void append_buffer(BuffView buffer) { if (m_shard_cnt == m_shards.size()) { assert(m_pending_shard == nullptr); - m_pending_shard = new S(buffer); + m_pending_shard = new S(std::move(buffer)); return; } - m_shards[m_shard_cnt] = std::make_shared(buffer); + m_shards[m_shard_cnt] = std::make_shared(std::move(buffer)); ++m_shard_cnt; } @@ -140,7 +140,6 @@ public: return new S(shards, m_shard_cnt); } - /* Append the sample range in-order */ void get_query_states(std::vector> &shards, std::vector& shard_states, void *query_parms) { for (size_t i=0; i Date: Fri, 12 Jan 2024 11:29:37 -0500 Subject: BufferView.h: Hopefully the last necessary tweak to the move semantics stuff You can't move assign an std::Bind, but you can move construct it. So I had to disable the move assignment operator. This means that when you change the BufferView ownership over to, say, a QueryBufferState object, you need to do it by passing std::move(buffview) into a constructor call only--you cannot assign it. --- include/framework/structure/BufferView.h | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index d058714..47c7b9b 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -20,23 +20,10 @@ namespace de { typedef std::_Bind ReleaseFunction; -static void noop_func(void *arg1, size_t arg2) { - return; -} - -constexpr auto noop_bind = std::bind(noop_func, (void*) nullptr, 0ul); - template class BufferView { public: - BufferView() - : m_data(nullptr) - , m_release(noop_bind) - , m_head(0) - , m_tail(0) - , m_cap(0) - , m_approx_ts_cnt(0) - , m_tombstone_filter(nullptr) {} + BufferView() = default; /* * the BufferView's lifetime is tightly linked to buffer versioning, and so @@ -54,17 +41,8 @@ public: , m_approx_ts_cnt(std::exchange(other.m_approx_ts_cnt, 0)) , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr)) {} - BufferView &operator=(BufferView &&other) noexcept { - std::swap(m_data, other.m_data); - m_release = std::move(other.m_release); - std::swap(m_head, other.m_head); - std::swap(m_tail, other.m_tail); - std::swap(m_cap, other.m_cap); - std::swap(m_approx_ts_cnt, other.m_approx_ts_cnt); - std::swap(m_tombstone_filter, other.m_tombstone_filter); + BufferView &operator=(BufferView &&other) = delete; - return *this; - } BufferView(Wrapped *buffer, size_t cap, size_t head, size_t tail, size_t tombstone_cnt, psudb::BloomFilter *filter, ReleaseFunction release) -- cgit v1.2.3 From aac0bb661af8fae38d3ce08d6078cb4d9dfcb575 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 12 Jan 2024 14:10:11 -0500 Subject: Initial integration of new buffering scheme into framework It isn't working right now (lotsa test failures), but we're to the debugging phase now. 
--- include/framework/structure/BufferView.h | 4 +++ include/framework/structure/ExtensionStructure.h | 41 +++++++----------------- 2 files changed, 16 insertions(+), 29 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 47c7b9b..ba5e693 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -103,6 +103,10 @@ public: memcpy(buffer, (std::byte*) (m_data + m_head), get_record_count() * sizeof(Wrapped)); } + size_t get_tail() { + return m_tail; + } + private: Wrapped* m_data; ReleaseFunction m_release; diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 60016a0..ae566cb 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -10,28 +10,22 @@ #pragma once #include -#include #include #include -#include "framework/structure/MutableBuffer.h" +#include "framework/structure/BufferView.h" #include "framework/structure/InternalLevel.h" -#include "framework/interface/Shard.h" -#include "framework/interface/Query.h" -#include "framework/interface/Record.h" #include "framework/util/Configuration.h" -#include "framework/scheduling/Task.h" #include "psu-util/timer.h" -#include "psu-ds/Alias.h" namespace de { template class ExtensionStructure { typedef S Shard; - typedef MutableBuffer Buffer; + typedef BufferView BuffView; public: ExtensionStructure(size_t buffer_size, size_t scale_factor, double max_delete_prop) @@ -96,14 +90,10 @@ public: * FIXME: arguably, this should be a method attached to the buffer that * takes a structure as input. */ - inline bool flush_buffer(Buffer *buffer) { - assert(can_reconstruct_with(0, buffer->get_record_count())); + inline bool flush_buffer(BuffView buffer) { + assert(can_reconstruct_with(0, buffer.get_record_count())); - // FIXME: this step makes an extra copy of the buffer, - // which could be avoided by adjusting the shard - // reconstruction process a bit, possibly. - buffer->start_flush(); - flush_buffer_into_l0(buffer); + flush_buffer_into_l0(std::move(buffer)); return true; } @@ -415,11 +405,11 @@ private: * returns -1 if idx==0, and no such level exists, to simplify * the logic of the first buffer flush. */ - inline level_index find_reconstruction_target(level_index idx, Buffer *buffer=nullptr) { + inline level_index find_reconstruction_target(level_index idx) { if (idx == 0 && m_levels.size() == 0) return -1; - size_t incoming_rec_cnt = get_level_record_count(idx, buffer); + size_t incoming_rec_cnt = get_level_record_count(idx); for (level_index i=idx+1; i(0, 1); - temp_level->append_buffer(buffer); + temp_level->append_buffer(std::move(buffer)); if (old_level->get_shard_count() > 0) { m_levels[0] = InternalLevel::reconstruction(old_level, temp_level); @@ -446,7 +436,7 @@ private: m_levels[0] = std::shared_ptr>(temp_level); } } else { - m_levels[0]->append_buffer(buffer); + m_levels[0]->append_buffer(std::move(buffer)); } } @@ -469,16 +459,9 @@ private: } /* - * Returns the actual number of records present on a specified level. An - * index value of -1 indicates the memory table. Can optionally pass in - * a pointer to the memory table to use, if desired. Otherwise, there are - * no guarantees about which buffer will be accessed if level_index is -1. + * Returns the number of records present on a specified level. 
*/ - inline size_t get_level_record_count(level_index idx, Buffer *buffer=nullptr) { - if (buffer) { - return buffer->get_record_count(); - } - + inline size_t get_level_record_count(level_index idx) { return (m_levels[idx]) ? m_levels[idx]->get_record_count() : 0; } -- cgit v1.2.3 From cf178ae74a76b780b655a447531d2114f9f81d98 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Mon, 15 Jan 2024 14:01:36 -0500 Subject: Various single-threaded bug fixes --- include/framework/structure/BufferView.h | 18 ++++++++++++++---- include/framework/structure/MutableBuffer.h | 26 ++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index ba5e693..c751786 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -11,6 +11,7 @@ #include #include #include +#include #include "psu-util/alignment.h" #include "psu-ds/BloomFilter.h" @@ -39,7 +40,8 @@ public: , m_tail(std::exchange(other.m_tail, 0)) , m_cap(std::exchange(other.m_cap, 0)) , m_approx_ts_cnt(std::exchange(other.m_approx_ts_cnt, 0)) - , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr)) {} + , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr)) + , m_active(std::exchange(other.m_active, false)) {} BufferView &operator=(BufferView &&other) = delete; @@ -52,10 +54,13 @@ public: , m_tail(tail) , m_cap(cap) , m_approx_ts_cnt(tombstone_cnt) - , m_tombstone_filter(filter) {} + , m_tombstone_filter(filter) + , m_active(true) {} ~BufferView() { - m_release(); + if (m_active) { + m_release(); + } } bool check_tombstone(const R& rec) { @@ -100,13 +105,17 @@ public: } void copy_to_buffer(psudb::byte *buffer) { - memcpy(buffer, (std::byte*) (m_data + m_head), get_record_count() * sizeof(Wrapped)); + memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped)); } size_t get_tail() { return m_tail; } + size_t get_head() { + return m_head; + } + private: Wrapped* m_data; ReleaseFunction m_release; @@ -115,6 +124,7 @@ private: size_t m_cap; size_t m_approx_ts_cnt; psudb::BloomFilter *m_tombstone_filter; + bool m_active; size_t to_idx(size_t i) { return (m_head + i) % m_cap; diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 3a06f0d..5b655fc 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -44,7 +44,7 @@ public: , m_tail(0) , m_head(0) , m_head_refcnt(0) - , m_old_head(0) + , m_old_head(high_watermark) , m_old_head_refcnt(0) , m_data((Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped))) , m_tombstone_filter(new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS)) @@ -62,14 +62,18 @@ public: } int append(const R &rec, bool tombstone=false) { - int32_t pos = 0; - if ((pos = try_advance_tail()) == -1) return 0; + int32_t tail = 0; + if ((tail = try_advance_tail()) == -1) { + return 0; + } Wrapped wrec; wrec.rec = rec; wrec.header = 0; if (tombstone) wrec.set_tombstone(); + size_t pos = tail % m_cap; + m_data[pos] = wrec; m_data[pos].header |= (pos << 2); @@ -131,6 +135,13 @@ public: return BufferView(m_data, m_cap, m_head.load(), m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); } + BufferView get_flush_buffer_view() { + m_head_refcnt.fetch_add(1); + auto f = std::bind(release_head_reference, (void *) this, m_head.load()); + return 
BufferView(m_data, m_cap, m_head.load(), m_head.load() + m_lwm, m_tscnt.load(), m_tombstone_filter, f); + + } + /* * Advance the buffer following a reconstruction. Move current * head and head_refcnt into old_head and old_head_refcnt, then @@ -142,6 +153,7 @@ public: /* refuse to advance head while there is an old with one references */ if (m_old_head_refcnt > 0) { + fprintf(stderr, "[W]: Refusing to advance head due to remaining reference counts"); return false; } @@ -195,6 +207,10 @@ public: * Note: this returns the available physical storage capacity, * *not* how many more records can be inserted before the * HWM is reached. + * + * FIXME: this logic is incorrect for the buffer prior to the + * first call to advance_head, and will under-report the available + * space. */ size_t get_available_capacity() { return m_cap - (m_tail.load() - m_old_head.load()); } @@ -205,7 +221,7 @@ private: size_t old_value = m_tail.load(); /* if full, fail to advance the tail */ - if (old_value >= m_hwm) { + if (old_value - m_head.load() >= m_hwm) { return -1; } @@ -236,6 +252,7 @@ * also match. */ if (head == buffer->m_old_head.load()) { + assert(buffer->m_old_head_refcnt > 0); buffer->m_old_head_refcnt.fetch_sub(1); /* * if the old head refcnt drops to 0, free @@ -251,6 +268,7 @@ buffer->m_old_head.store(buffer->m_head); } } else if (head == buffer->m_head.load()) { + assert(buffer->m_head_refcnt > 0); buffer->m_head_refcnt.fetch_sub(1); } } -- cgit v1.2.3 From b485685968c7ab626d98cc2a84a122d7ca3c68ce Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Mon, 15 Jan 2024 15:16:20 -0500 Subject: Use 16-byte CAS to control buffer head --- include/framework/structure/MutableBuffer.h | 120 ++++++++++++++-------------- 1 file changed, 59 insertions(+), 61 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 5b655fc..eeb3dc9 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -36,16 +36,20 @@ namespace de { template class MutableBuffer { friend class BufferView; + + struct buffer_head { + size_t head_idx; + size_t refcnt; + }; + public: MutableBuffer(size_t low_watermark, size_t high_watermark, size_t capacity=0) : m_lwm(low_watermark) , m_hwm(high_watermark) , m_cap((capacity == 0) ? 
2 * high_watermark : capacity) , m_tail(0) - , m_head(0) - , m_head_refcnt(0) - , m_old_head(high_watermark) - , m_old_head_refcnt(0) + , m_head({0, 0}) + , m_old_head({high_watermark, 0}) , m_data((Wrapped *) psudb::sf_aligned_alloc(CACHELINE_SIZE, m_cap * sizeof(Wrapped))) , m_tombstone_filter(new psudb::BloomFilter(BF_FPR, m_hwm, BF_HASH_FUNCS)) , m_tscnt(0) @@ -94,7 +98,7 @@ public: } size_t get_record_count() { - return m_tail - m_head; + return m_tail.load() - m_head.load().head_idx; } size_t get_capacity() { @@ -130,16 +134,17 @@ public: } BufferView get_buffer_view() { - m_head_refcnt.fetch_add(1); - auto f = std::bind(release_head_reference, (void *) this, m_head.load()); - return BufferView(m_data, m_cap, m_head.load(), m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); + size_t head = get_head(); + auto f = std::bind(release_head_reference, (void *) this, head); + + return BufferView(m_data, m_cap, head, m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); } BufferView get_flush_buffer_view() { - m_head_refcnt.fetch_add(1); - auto f = std::bind(release_head_reference, (void *) this, m_head.load()); - return BufferView(m_data, m_cap, m_head.load(), m_head.load() + m_lwm, m_tscnt.load(), m_tombstone_filter, f); + size_t head = get_head(); + auto f = std::bind(release_head_reference, (void *) this, head); + return BufferView(m_data, m_cap, head, head + m_lwm, m_tscnt.load(), m_tombstone_filter, f); } /* @@ -148,38 +153,41 @@ public: * assign new_head to old_head. */ bool advance_head(size_t new_head) { - assert(new_head > m_head.load()); + assert(new_head > m_head.load().head_idx); assert(new_head <= m_tail.load()); /* refuse to advance head while there is an old with one references */ - if (m_old_head_refcnt > 0) { + if (m_old_head.load().refcnt > 0) { fprintf(stderr, "[W]: Refusing to advance head due to remaining reference counts"); return false; } m_active_head_advance.store(true); - /* - * the order here is very important. We first store zero to the - * old_refcnt (should be zero anyway). Then we move the current - * head to old head. At this point, any new buffer views should - * increment the old head refcnt, so no new references to the - * current head will be taken. Then we add the current head - * refcnt to this. This is to ensure that no references get - * dropped. Only after this do we change to the new head - */ - m_old_head_refcnt.store(0); - - m_old_head.store(m_head.load()); - m_old_head_refcnt.fetch_add(m_head_refcnt); + buffer_head new_hd = {new_head, 0}; + buffer_head cur_hd; - m_head_refcnt.store(0); - m_head.store(new_head); + /* move the current head into the old head */ + do { + buffer_head cur_hd = m_head.load(); + m_old_head.store(cur_hd); + } while(!m_head.compare_exchange_strong(cur_hd, new_hd)); m_active_head_advance.store(false); return true; } + size_t get_head() { + buffer_head cur_hd, new_hd; + + do { + cur_hd = m_head.load(); + new_hd = {cur_hd.head_idx, cur_hd.refcnt + 1}; + } while(!m_head.compare_exchange_strong(cur_hd, new_hd)); + + return new_hd.head_idx; + } + void set_low_watermark(size_t lwm) { assert(lwm < m_hwm); m_lwm = lwm; @@ -213,7 +221,7 @@ public: * space. 
*/ size_t get_available_capacity() { - return m_cap - (m_tail.load() - m_old_head.load()); + return m_cap - (m_tail.load() - m_old_head.load().head_idx); } private: @@ -221,7 +229,7 @@ private: size_t old_value = m_tail.load(); /* if full, fail to advance the tail */ - if (old_value - m_head.load() >= m_hwm) { + if (old_value - m_head.load().head_idx >= m_hwm) { return -1; } @@ -244,33 +252,26 @@ private: static void release_head_reference(void *buff, size_t head) { MutableBuffer *buffer = (MutableBuffer *) buff; - /* - * check old head first. During a head transition, the head being - * retired will first be assigned to *both* head and old_head. As - * a result, any refcnt updates during this time should be applied - * to old_head, even if the current head and the head being released - * also match. - */ - if (head == buffer->m_old_head.load()) { - assert(buffer->m_old_head_refcnt > 0); - buffer->m_old_head_refcnt.fetch_sub(1); - /* - * if the old head refcnt drops to 0, free - * the records by setting old_head = head - * before this, spin while the two heads are equal to - * avoid - */ - while (buffer->m_active_head_advance.load()) { - _mm_pause(); - } - - if (buffer->m_old_head_refcnt.load() == 0) { - buffer->m_old_head.store(buffer->m_head); + buffer_head cur_hd, new_hd; + do { + if (buffer->m_head.load().head_idx == head) { + cur_hd = buffer->m_head; + assert(cur_hd.refcnt > 0); + new_hd = {cur_hd.head_idx, cur_hd.refcnt-1}; + + if (buffer->m_head.compare_exchange_strong(cur_hd, new_hd)) { + break; + } + } else { + cur_hd = buffer->m_old_head; + assert(cur_hd.refcnt > 0); + new_hd = {cur_hd.head_idx, cur_hd.refcnt-1}; + if (buffer->m_old_head.compare_exchange_strong(cur_hd, new_hd)) { + break; + } } - } else if (head == buffer->m_head.load()) { - assert(buffer->m_head_refcnt > 0); - buffer->m_head_refcnt.fetch_sub(1); - } + _mm_pause(); + } while(true); } size_t m_lwm; @@ -279,11 +280,8 @@ private: alignas(64) std::atomic m_tail; - alignas(64) std::atomic m_head; - alignas(64) std::atomic m_head_refcnt; - - alignas(64) std::atomic m_old_head; - alignas(64) std::atomic m_old_head_refcnt; + alignas(64) std::atomic m_head; + alignas(64) std::atomic m_old_head; Wrapped* m_data; psudb::BloomFilter* m_tombstone_filter; -- cgit v1.2.3 From 2117935e85412f3733ee0bcb1830c7fd0b129b29 Mon Sep 17 00:00:00 2001 From: "Douglas B. Rumbaugh" Date: Mon, 15 Jan 2024 17:23:57 -0500 Subject: Concurrency testing and bug fixes --- include/framework/structure/BufferView.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index c751786..099b7a2 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -105,7 +105,15 @@ public: } void copy_to_buffer(psudb::byte *buffer) { - memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped)); + /* check if the region to be copied circles back to start. 
If so, do it in two steps */ + if ((m_head % m_cap) + get_record_count() > m_cap) { + size_t split_idx = m_cap - (m_head % m_cap); + + memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), split_idx* sizeof(Wrapped)); + memcpy(buffer + split_idx, (std::byte*) m_data, (get_record_count() - split_idx) * sizeof(Wrapped)); + } else { + memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped)); + } } size_t get_tail() { -- cgit v1.2.3 From 138c793b0a58577713d98c98bb140cf1d9c79bee Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 17 Jan 2024 18:22:00 -0500 Subject: Multiple concurrency bug fixes A poorly organized commit with fixes for a variety of bugs that were causing missing records. The core problems all appear to be fixed, though there is an outstanding problem with tombstones not being completely canceled. A very small number are appearing in the wrong order during the static structure test. --- include/framework/structure/BufferView.h | 2 +- include/framework/structure/InternalLevel.h | 26 ++++++------- include/framework/structure/MutableBuffer.h | 58 ++++++++++++++++++----------- 3 files changed, 50 insertions(+), 36 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 099b7a2..30fffed 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -110,7 +110,7 @@ public: size_t split_idx = m_cap - (m_head % m_cap); memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), split_idx* sizeof(Wrapped)); - memcpy(buffer + split_idx, (std::byte*) m_data, (get_record_count() - split_idx) * sizeof(Wrapped)); + memcpy(buffer + (split_idx * sizeof(Wrapped)), (std::byte*) m_data, (get_record_count() - split_idx) * sizeof(Wrapped)); } else { memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped)); } diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index b35cadd..e9874e0 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -51,11 +51,10 @@ public: assert(base_level->m_level_no > new_level->m_level_no || (base_level->m_level_no == 0 && new_level->m_level_no == 0)); auto res = new InternalLevel(base_level->m_level_no, 1); res->m_shard_cnt = 1; - Shard* shards[2]; - shards[0] = base_level->m_shards[0].get(); - shards[1] = new_level->m_shards[0].get(); + std::vector shards = {base_level->m_shards[0].get(), + new_level->m_shards[0].get()}; - res->m_shards[0] = std::make_shared(shards, 2); + res->m_shards[0] = std::make_shared(shards); return std::shared_ptr(res); } @@ -75,17 +74,17 @@ public: return; } - Shard *shards[level->m_shard_cnt]; - for (size_t i=0; im_shard_cnt; i++) { - shards[i] = level->m_shards[i].get(); + std::vector shards; + for (auto shard : level->m_shards) { + if (shard) shards.emplace_back(shard.get()); } if (m_shard_cnt == m_shards.size()) { - m_pending_shard = new S(shards, level->m_shard_cnt); + m_pending_shard = new S(shards); return; } - auto tmp = new S(shards, level->m_shard_cnt); + auto tmp = new S(shards); m_shards[m_shard_cnt] = std::shared_ptr(tmp); ++m_shard_cnt; @@ -131,13 +130,12 @@ public: return nullptr; } - Shard *shards[m_shard_cnt]; - - for (size_t i=0; i shards; + for (auto shard : m_shards) { + if (shard) shards.emplace_back(shard.get()); } - return new S(shards, m_shard_cnt); + return new S(shards); } void 
get_query_states(std::vector> &shards, std::vector& shard_states, void *query_parms) { diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index eeb3dc9..7edde2f 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -133,18 +133,18 @@ public: return m_tombstone_filter->get_memory_usage(); } - BufferView get_buffer_view() { - size_t head = get_head(); + BufferView get_buffer_view(size_t target_head) { + size_t head = get_head(target_head); auto f = std::bind(release_head_reference, (void *) this, head); return BufferView(m_data, m_cap, head, m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); } - BufferView get_flush_buffer_view() { - size_t head = get_head(); + BufferView get_buffer_view() { + size_t head = get_head(m_head.load().head_idx); auto f = std::bind(release_head_reference, (void *) this, head); - return BufferView(m_data, m_cap, head, head + m_lwm, m_tscnt.load(), m_tombstone_filter, f); + return BufferView(m_data, m_cap, head, m_tail.load(), m_tscnt.load(), m_tombstone_filter, f); } /* @@ -167,23 +167,39 @@ public: buffer_head new_hd = {new_head, 0}; buffer_head cur_hd; - /* move the current head into the old head */ + /* replace current head with new head */ do { - buffer_head cur_hd = m_head.load(); - m_old_head.store(cur_hd); + cur_hd = m_head.load(); } while(!m_head.compare_exchange_strong(cur_hd, new_hd)); + /* move the current head into the old head */ + m_old_head.store(cur_hd); + m_active_head_advance.store(false); return true; } - size_t get_head() { + /* + * FIXME: If target_head does not match *either* the old_head or the + * current_head, this routine will loop infinitely. + */ + size_t get_head(size_t target_head) { buffer_head cur_hd, new_hd; + bool head_acquired = false; do { - cur_hd = m_head.load(); - new_hd = {cur_hd.head_idx, cur_hd.refcnt + 1}; - } while(!m_head.compare_exchange_strong(cur_hd, new_hd)); + if (m_old_head.load().head_idx == target_head) { + cur_hd = m_old_head.load(); + cur_hd.head_idx = target_head; + new_hd = {cur_hd.head_idx, cur_hd.refcnt + 1}; + head_acquired = m_old_head.compare_exchange_strong(cur_hd, new_hd); + } else if (m_head.load().head_idx == target_head){ + cur_hd = m_head.load(); + cur_hd.head_idx = target_head; + new_hd = {cur_hd.head_idx, cur_hd.refcnt + 1}; + head_acquired = m_head.compare_exchange_strong(cur_hd, new_hd); + } + } while(!head_acquired); return new_hd.head_idx; } @@ -254,22 +270,22 @@ private: buffer_head cur_hd, new_hd; do { - if (buffer->m_head.load().head_idx == head) { - cur_hd = buffer->m_head; - assert(cur_hd.refcnt > 0); + if (buffer->m_old_head.load().head_idx == head) { + cur_hd = buffer->m_old_head; + if (cur_hd.refcnt == 0) continue; new_hd = {cur_hd.head_idx, cur_hd.refcnt-1}; - - if (buffer->m_head.compare_exchange_strong(cur_hd, new_hd)) { + if (buffer->m_old_head.compare_exchange_strong(cur_hd, new_hd)) { break; } } else { - cur_hd = buffer->m_old_head; - assert(cur_hd.refcnt > 0); + cur_hd = buffer->m_head; + if (cur_hd.refcnt == 0) continue; new_hd = {cur_hd.head_idx, cur_hd.refcnt-1}; - if (buffer->m_old_head.compare_exchange_strong(cur_hd, new_hd)) { + + if (buffer->m_head.compare_exchange_strong(cur_hd, new_hd)) { break; } - } + } _mm_pause(); } while(true); } -- cgit v1.2.3 From 27d36dd9a68e4cf454be2ca7877ece0a34c3e929 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 31 Jan 2024 15:48:21 -0500 Subject: Insert throughput benchmark --- 
include/framework/structure/MutableBuffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 7edde2f..94a9c41 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -158,7 +158,7 @@ public: /* refuse to advance head while there is an old with one references */ if (m_old_head.load().refcnt > 0) { - fprintf(stderr, "[W]: Refusing to advance head due to remaining reference counts"); + //fprintf(stderr, "[W]: Refusing to advance head due to remaining reference counts\n"); return false; } -- cgit v1.2.3 From 0ff3cedf5df9c27bccd3053ce6339e317f87ff76 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Mon, 5 Feb 2024 15:18:33 -0500 Subject: BufferView: Adjusted BV to avoid repeated modulus operations --- include/framework/structure/BufferView.h | 50 +++++++++++++++++++++++------ include/framework/structure/MutableBuffer.h | 13 +++++--- 2 files changed, 48 insertions(+), 15 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 30fffed..edf6707 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -38,6 +38,8 @@ public: , m_release(std::move(other.m_release)) , m_head(std::exchange(other.m_head, 0)) , m_tail(std::exchange(other.m_tail, 0)) + , m_start(std::exchange(other.m_start, 0)) + , m_stop(std::exchange(other.m_stop, 0)) , m_cap(std::exchange(other.m_cap, 0)) , m_approx_ts_cnt(std::exchange(other.m_approx_ts_cnt, 0)) , m_tombstone_filter(std::exchange(other.m_tombstone_filter, nullptr)) @@ -52,6 +54,8 @@ public: , m_release(release) , m_head(head) , m_tail(tail) + , m_start(m_head % cap) + , m_stop(m_tail % cap) , m_cap(cap) , m_approx_ts_cnt(tombstone_cnt) , m_tombstone_filter(filter) @@ -76,11 +80,29 @@ public: } bool delete_record(const R& rec) { - for (size_t i=0; i *get(size_t i) { assert(i < get_record_count()); + m_total += (m_data + to_idx(i))->rec.key; return m_data + to_idx(i); } void copy_to_buffer(psudb::byte *buffer) { /* check if the region to be copied circles back to start. If so, do it in two steps */ - if ((m_head % m_cap) + get_record_count() > m_cap) { - size_t split_idx = m_cap - (m_head % m_cap); + if (m_start > m_stop) { + size_t split_idx = m_cap - m_start; - memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), split_idx* sizeof(Wrapped)); - memcpy(buffer + (split_idx * sizeof(Wrapped)), (std::byte*) m_data, (get_record_count() - split_idx) * sizeof(Wrapped)); + memcpy(buffer, (std::byte*) (m_data + m_start), split_idx* sizeof(Wrapped)); + memcpy(buffer + (split_idx * sizeof(Wrapped)), (std::byte*) m_data, m_stop * sizeof(Wrapped)); } else { - memcpy(buffer, (std::byte*) (m_data + (m_head % m_cap)), get_record_count() * sizeof(Wrapped)); + memcpy(buffer, (std::byte*) (m_data + m_start), get_record_count() * sizeof(Wrapped)); } } @@ -129,13 +152,20 @@ private: ReleaseFunction m_release; size_t m_head; size_t m_tail; + size_t m_start; + size_t m_stop; size_t m_cap; size_t m_approx_ts_cnt; psudb::BloomFilter *m_tombstone_filter; bool m_active; + size_t m_total; + size_t to_idx(size_t i) { - return (m_head + i) % m_cap; + size_t idx = (m_start + i >= m_cap) ? 
i - (m_cap - m_start) + : m_start + i; + assert(idx < m_cap); + return idx; } }; diff --git a/include/framework/structure/MutableBuffer.h b/include/framework/structure/MutableBuffer.h index 94a9c41..415c95a 100644 --- a/include/framework/structure/MutableBuffer.h +++ b/include/framework/structure/MutableBuffer.h @@ -230,13 +230,16 @@ public: /* * Note: this returns the available physical storage capacity, * *not* how many more records can be inserted before the - * HWM is reached. - * - * FIXME: this logic is incorrect for the buffer prior to the - * first call to advance_head, and will under-report the available - * space. + * HWM is reached. It considers the old_head to be "free" + * when it has no remaining references. This should be true, + * but a buggy framework implementation may violate the + * assumption. */ size_t get_available_capacity() { + if (m_old_head.load().refcnt == 0) { + return m_cap - (m_tail.load() - m_head.load().head_idx); + } + return m_cap - (m_tail.load() - m_old_head.load().head_idx); } -- cgit v1.2.3 From 10b4425e842d10b7fbfa85978969ed4591d6b98e Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 7 Feb 2024 10:56:52 -0500 Subject: Fully implemented Query concept and adjusted queries to use it --- include/framework/structure/ExtensionStructure.h | 2 +- include/framework/structure/InternalLevel.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index ae566cb..0b8263e 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -22,7 +22,7 @@ namespace de { -template +template Q, LayoutPolicy L=LayoutPolicy::TEIRING> class ExtensionStructure { typedef S Shard; typedef BufferView BuffView; diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index e9874e0..0fd5275 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -19,12 +19,12 @@ #include "framework/structure/BufferView.h" namespace de { -template +template Q> class InternalLevel; -template +template Q> class InternalLevel { typedef S Shard; typedef BufferView BuffView; -- cgit v1.2.3 From 2c5d549b3618b9ea72e6eece4cb4f3da5a6811a8 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Wed, 7 Feb 2024 13:42:34 -0500 Subject: Fully realized shard concept interface --- include/framework/structure/ExtensionStructure.h | 2 +- include/framework/structure/InternalLevel.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 0b8263e..373a1e2 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -22,7 +22,7 @@ namespace de { -template Q, LayoutPolicy L=LayoutPolicy::TEIRING> +template S, QueryInterface Q, LayoutPolicy L=LayoutPolicy::TEIRING> class ExtensionStructure { typedef S Shard; typedef BufferView BuffView; diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index 0fd5275..d586869 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -19,12 +19,12 @@ #include "framework/structure/BufferView.h" namespace de { -template Q> +template S, QueryInterface Q> 
class InternalLevel; -template Q> +template S, QueryInterface Q> class InternalLevel { typedef S Shard; typedef BufferView BuffView; -- cgit v1.2.3 From 711769574e647839677739192698e400529efe75 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Thu, 8 Feb 2024 16:38:44 -0500 Subject: Updated VPTree to new shard/query interfaces --- include/framework/structure/BufferView.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index edf6707..4e3de25 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -123,7 +123,6 @@ public: Wrapped *get(size_t i) { assert(i < get_record_count()); - m_total += (m_data + to_idx(i))->rec.key; return m_data + to_idx(i); } @@ -159,8 +158,6 @@ private: psudb::BloomFilter *m_tombstone_filter; bool m_active; - size_t m_total; - size_t to_idx(size_t i) { size_t idx = (m_start + i >= m_cap) ? i - (m_cap - m_start) : m_start + i; -- cgit v1.2.3 From 402fc269c0aaa671d84a6d15918735ad4b90e6b2 Mon Sep 17 00:00:00 2001 From: Douglas Rumbaugh Date: Fri, 9 Feb 2024 12:30:21 -0500 Subject: Comment updates/fixes --- include/framework/structure/BufferView.h | 1 + include/framework/structure/ExtensionStructure.h | 64 +++++++++++++----------- include/framework/structure/InternalLevel.h | 6 +++ 3 files changed, 42 insertions(+), 29 deletions(-) (limited to 'include/framework/structure') diff --git a/include/framework/structure/BufferView.h b/include/framework/structure/BufferView.h index 4e3de25..9e0872b 100644 --- a/include/framework/structure/BufferView.h +++ b/include/framework/structure/BufferView.h @@ -5,6 +5,7 @@ * * Distributed under the Modified BSD License. * + * TODO: This file is very poorly commented. */ #pragma once diff --git a/include/framework/structure/ExtensionStructure.h b/include/framework/structure/ExtensionStructure.h index 373a1e2..4802bc1 100644 --- a/include/framework/structure/ExtensionStructure.h +++ b/include/framework/structure/ExtensionStructure.h @@ -37,19 +37,23 @@ public: ~ExtensionStructure() = default; /* - * Create a shallow copy of this extension structure. The copy will share references to the - * same levels/shards as the original, but will have its own lists. As all of the shards are - * immutable (with the exception of deletes), the copy can be restructured with reconstructions - * and flushes without affecting the original. The copied structure will be returned with a reference - * count of 0; generally you will want to immediately call take_reference() on it. + * Create a shallow copy of this extension structure. The copy will share + * references to the same levels/shards as the original, but will have its + * own lists. As all of the shards are immutable (with the exception of + * deletes), the copy can be restructured with reconstructions and flushes + * without affecting the original. The copied structure will be returned + * with a reference count of 0; generally you will want to immediately call + * take_reference() on it. + * - * NOTE: When using tagged deletes, a delete of a record in the original structure will affect - * the copy, so long as the copy retains a reference to the same shard as the original. This could - * cause synchronization problems under tagging with concurrency. 
Any deletes in this context will + * NOTE: When using tagged deletes, a delete of a record in the original + * structure will affect the copy, so long as the copy retains a reference + * to the same shard as the original. This could cause synchronization + * problems under tagging with concurrency. Any deletes in this context will * need to be forwarded to the appropriate structures manually. */ ExtensionStructure *copy() { - auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, m_max_delete_prop); + auto new_struct = new ExtensionStructure(m_buffer_size, m_scale_factor, + m_max_delete_prop); for (size_t i=0; im_levels.push_back(m_levels[i]->clone()); } @@ -64,9 +68,9 @@ public: * setting the delete bit in its wrapped header. Returns 1 if a matching * record was found and deleted, and 0 if a matching record was not found. * - * This function will stop after finding the first matching record. It is assumed - * that no duplicate records exist. In the case of duplicates, this function will - * still "work", but in the sense of "delete first match". + * This function will stop after finding the first matching record. It is + * assumed that no duplicate records exist. In the case of duplicates, this + * function will still "work", but in the sense of "delete first match". */ int tagged_delete(const R &rec) { for (auto level : m_levels) { @@ -77,7 +81,7 @@ public: /* * If the record to be erased wasn't found, return 0. The - * DynamicExtension itself will then search the active + * DynamicExtension itself will then search the active * Buffers. */ return 0; @@ -164,21 +168,23 @@ public: } /* - * Validate that no level in the structure exceeds its maximum tombstone capacity. This is - * used to trigger preemptive compactions at the end of the reconstruction process. + * Validate that no level in the structure exceeds its maximum tombstone + * capacity. This is used to trigger preemptive compactions at the end of + * the reconstruction process. */ bool validate_tombstone_proportion() { - long double ts_prop; - for (size_t i=0; iget_tombstone_count() / (long double) calc_level_record_capacity(i); - if (ts_prop > (long double) m_max_delete_prop) { - return false; - } - } + long double ts_prop; + for (size_t i = 0; i < m_levels.size(); i++) { + if (m_levels[i]) { + ts_prop = (long double)m_levels[i]->get_tombstone_count() / + (long double)calc_level_record_capacity(i); + if (ts_prop > (long double)m_max_delete_prop) { + return false; + } } + } - return true; + return true; } bool validate_tombstone_proportion(level_index level) { @@ -224,14 +230,14 @@ public: /* * The amount of storage required for the reconstruction accounts * for the cost of storing the new records, along with the - * cost of retaining the old records during the process - * (hence the 2x multiplier). + * cost of retaining the old records during the process + * (hence the 2x multiplier). * - * FIXME: currently does not account for the *actual* size - * of the shards, only the storage for the records + * FIXME: currently does not account for the *actual* size + * of the shards, only the storage for the records * themselves. 
*/ - size_t reccnt = m_levels[i-1]->get_record_count(); + size_t reccnt = m_levels[i - 1]->get_record_count(); if constexpr (L == LayoutPolicy::LEVELING) { if (can_reconstruct_with(i, reccnt)) { reccnt += m_levels[i]->get_record_count(); diff --git a/include/framework/structure/InternalLevel.h b/include/framework/structure/InternalLevel.h index d586869..db38946 100644 --- a/include/framework/structure/InternalLevel.h +++ b/include/framework/structure/InternalLevel.h @@ -6,6 +6,12 @@ * * Distributed under the Modified BSD License. * + * The word `Internal` in this class's name refers to memory. The current + * model, inherited from the framework in Practical Dynamic Extension for + * Sampling Indexes, would use a different ExternalLevel for shards stored + * on external storage. This is a distinction that can probably be avoided + * with some more thought being put into interface design. + * */ #pragma once -- cgit v1.2.3
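
A few of the mechanisms developed across this patch series are summarized below as self-contained sketches. First, the head-management scheme that the "Use 16-byte CAS to control buffer head" commit converges on. The code is an illustrative reconstruction, not the framework's implementation: the buffer_head layout comes from the patch, but the HeadManager class is hypothetical and models a single head slot, where the real MutableBuffer keeps a head/old_head pair and routes each release to whichever slot still records the released index. It assumes buffer_head packs into 16 bytes so that std::atomic<buffer_head> can map onto a double-width compare-and-swap where the platform provides one (e.g. cmpxchg16b on x86-64); whether the atomic is actually lock-free depends on the platform and compiler flags.

    #include <atomic>
    #include <cassert>
    #include <cstddef>

    /* 16-byte head-index/reference-count pair, as in the patch. */
    struct buffer_head {
        size_t head_idx;
        size_t refcnt;
    };

    /* Hypothetical single-slot model of the buffer's head management. */
    class HeadManager {
    public:
        /* Take a reference: atomically bump refcnt and return the head
         * index it was taken against. compare_exchange_weak reloads
         * `cur` on failure, so each retry works on fresh values. */
        size_t acquire() {
            buffer_head cur = m_head.load();
            buffer_head next;
            do {
                next = {cur.head_idx, cur.refcnt + 1};
            } while (!m_head.compare_exchange_weak(cur, next));
            return next.head_idx;
        }

        /* Drop a reference previously taken against `head`. */
        void release(size_t head) {
            buffer_head cur = m_head.load();
            buffer_head next;
            do {
                assert(cur.head_idx == head && cur.refcnt > 0);
                next = {cur.head_idx, cur.refcnt - 1};
            } while (!m_head.compare_exchange_weak(cur, next));
        }

    private:
        /* Two size_t fields, no padding; lock-freedom is platform-dependent. */
        std::atomic<buffer_head> m_head{{0, 0}};
    };

Fusing the index and the count into a single atomically updated value is what closes the race that the earlier separate m_head/m_head_refcnt counters permitted: a reference can no longer be taken against a head that is concurrently being retired, because both fields move under one CAS.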
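Second, the BufferView arithmetic that the later commits settle on: cached m_start/m_stop values, a branch in place of a per-access modulus in to_idx, and a two-memcpy copy-out when the live region wraps. The toy model below uses hypothetical names and a plain int payload in place of the framework's Wrapped records; like the patched code, it assumes a view never spans the entire physical capacity, since a completely full wrap would make the start/stop comparison ambiguous.

    #include <cassert>
    #include <cstddef>
    #include <cstring>

    /* Toy fixed-capacity ring view over records in logical range [head, tail). */
    struct RingView {
        const int *data;
        size_t head, tail, cap;

        size_t start() const { return head % cap; }  /* first live slot */
        size_t stop() const { return tail % cap; }   /* one past the last live slot */
        size_t count() const { return tail - head; }

        /* Logical-to-physical mapping with one branch; note the
         * subtraction (not an assignment) in the wrapped case. */
        size_t to_idx(size_t i) const {
            size_t idx = (start() + i >= cap) ? i - (cap - start())
                                              : start() + i;
            assert(idx < cap);
            return idx;
        }

        /* Copy the live region out, splitting into two memcpys on wrap. */
        void copy_out(int *out) const {
            if (start() > stop()) {
                size_t split = cap - start();
                std::memcpy(out, data + start(), split * sizeof(int));
                std::memcpy(out + split, data, (count() - split) * sizeof(int));
            } else {
                std::memcpy(out, data + start(), count() * sizeof(int));
            }
        }
    };

Because out is a typed pointer here, out + split advances by whole records; the framework's destination is a raw byte pointer, which is why the "Multiple concurrency bug fixes" commit had to scale the second memcpy's destination offset by sizeof(Wrapped).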
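Third, the m_active flag threaded through BufferView's move constructor and destructor in the single-threaded bug-fix commit is an instance of a general move-ownership guard: after a move, exactly one object remains responsible for firing the release callback. A stripped-down rendering of just that mechanism, with hypothetical names:

    #include <cstdio>
    #include <functional>
    #include <utility>

    using ReleaseFunction = std::function<void()>;

    class ViewGuard {
    public:
        explicit ViewGuard(ReleaseFunction release)
            : m_release(std::move(release)), m_active(true) {}

        /* Moving transfers responsibility for the callback; std::exchange
         * deactivates the moved-from guard so the release fires exactly once. */
        ViewGuard(ViewGuard &&other) noexcept
            : m_release(std::move(other.m_release))
            , m_active(std::exchange(other.m_active, false)) {}

        ViewGuard &operator=(ViewGuard &&) = delete;

        ~ViewGuard() {
            if (m_active) {
                m_release();  /* e.g., drop a buffer-head reference count */
            }
        }

    private:
        ReleaseFunction m_release;
        bool m_active;
    };

    int main() {
        ViewGuard a([] { std::puts("released once"); });
        ViewGuard b = std::move(a);  /* `a` is deactivated; only `b` releases */
        return 0;
    }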