4 files changed, 348 insertions, 0 deletions
diff --git a/include/ds/Alias.h b/include/ds/Alias.h
new file mode 100644
index 0000000..855dc75
--- /dev/null
+++ b/include/ds/Alias.h
@@ -0,0 +1,72 @@
+/*
+ * include/ds/Alias.h
+ *
+ * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> 
+ *                    Dong Xie <dongx@psu.edu>
+ *
+ * All rights reserved. Published under the Modified BSD License.
+ *
+ */
+#pragma once
+
+#include <gsl/gsl_rng.h>
+#include <vector>
+
+namespace de {
+
+/*
+ * An implementation of Walker's Alias Structure for Weighted Set Sampling. Requires
+ * that the input weight vector is already normalized.
+ */
+class Alias {
+public:
+    Alias(const std::vector<double>& weights)
+    : m_alias(weights.size()), m_cutoff(weights.size()) {
+        size_t n = weights.size();
+        auto overfull = std::vector<size_t>();
+        auto underfull = std::vector<size_t>();
+        overfull.reserve(n);
+        underfull.reserve(n);
+
+        // initialize the probability_table with n*p(i) as well as the overfull and
+        // underfull lists.
+        for (size_t i = 0; i < n; i++) {
+            m_cutoff[i] = (double) n * weights[i];
+            if (m_cutoff[i] > 1) {
+                overfull.emplace_back(i);
+            } else if (m_cutoff[i] < 1) {
+                underfull.emplace_back(i);
+            } else {
+                m_alias[i] = i;
+            }
+        }
+
+        while (overfull.size() > 0 && underfull.size() > 0) {
+            auto i = overfull.back(); overfull.pop_back();
+            auto j = underfull.back(); underfull.pop_back();
+
+            m_alias[j] = i;
+            m_cutoff[i] = m_cutoff[i] + m_cutoff[j] - 1.0;
+
+            if (m_cutoff[i] > 1.0) {
+                overfull.emplace_back(i);
+            } else if (m_cutoff[i] < 1.0) {
+                underfull.emplace_back(i);
+            }
+        }
+    }
+
+    size_t get(const gsl_rng* rng) {
+        double coin1 = gsl_rng_uniform(rng);
+        double coin2 = gsl_rng_uniform(rng);
+
+        size_t k = ((double) m_alias.size()) * coin1;
+        return coin2 < m_cutoff[k] ? k : m_alias[k];
+    }
+
+private:
+    std::vector<size_t> m_alias;
+    std::vector<double> m_cutoff;
+};
+
+}
diff --git a/include/ds/BitArray.h b/include/ds/BitArray.h
new file mode 100644
index 0000000..1d33d5b
--- /dev/null
+++ b/include/ds/BitArray.h
@@ -0,0 +1,67 @@
+/*
+ * include/ds/BloomFilter.h
+ *
+ * Copyright (C) 2023 Dong Xie <dongx@psu.edu>
+ *
+ * All rights reserved. Published under the Modified BSD License.
+ *
+ */
+#pragma once
+
+#include <cstdlib>
+#include <memory>
+#include <cstring>
+
+#include "util/base.h"
+
+namespace lsm {
+
+class BitArray {
+public:
+    BitArray(size_t bits): m_bits(bits), m_data(nullptr) {
+        if (m_bits > 0) {
+            size_t n_bytes = (m_bits >> 3) << 3;
+            m_data = (char*) std::aligned_alloc(CACHELINE_SIZE, CACHELINEALIGN(n_bytes));
+            memset(m_data, 0, n_bytes);
+        }
+    }
+
+    ~BitArray() {
+        if (m_data) free(m_data);
+    }
+
+    bool is_set(size_t bit) {
+        if (bit >= m_bits) return false;
+        return m_data[bit >> 3] & (1 << (bit & 7));
+    }
+
+    int set(size_t bit) {
+        if (bit >= m_bits) return 0;
+        m_data[bit >> 3] |= ((char) 1 << (bit & 7));
+        return 1;
+    }
+
+    int unset(size_t bit) {
+        if (bit >= m_bits) return 0;
+        m_data[bit >> 3] &= ~((char) 1 << (bit & 7));
+        return 1;
+    }
+
+    void clear() {
+        memset(m_data, 0, (m_bits >> 3) << 3);
+    }
+
+    size_t mem_size() {
+        return m_bits >> 3;
+    }
+
+    size_t size() {
+        return m_bits;
+    }
+    
+private:
+    size_t m_bits;
+    char* m_data;
+};
+
+}
diff --git a/include/ds/BloomFilter.h b/include/ds/BloomFilter.h
new file mode 100644
index 0000000..dbaab2f
--- /dev/null
+++ b/include/ds/BloomFilter.h
@@ -0,0 +1,74 @@
+/*
+ * include/ds/BloomFilter.h
+ *
+ * Copyright (C) 2023 Dong Xie <dongx@psu.edu>
+ *
+ * All rights reserved. Published under the Modified BSD License.
+ *
+ */
+#pragma once
+
+#include <cmath>
+#include <gsl/gsl_rng.h>
+
+#include "ds/BitArray.h"
+#include "util/hash.h"
+#include "util/base.h"
+#include "util/record.h"
+
+namespace lsm {
+
+class BloomFilter {
+public:
+    BloomFilter(size_t n_bits, size_t k, const gsl_rng* rng)
+    : m_n_bits(n_bits), m_n_salts(k), m_bitarray(n_bits) {
+        salt = (uint16_t*) aligned_alloc(CACHELINE_SIZE, CACHELINEALIGN(k * sizeof(uint16_t)));
+        for (size_t i = 0;  i < k; ++i) {
+            salt[i] = (uint16_t) gsl_rng_uniform_int(rng, 1 << 16);
+        }
+        
+    }
+
+    BloomFilter(double max_fpr, size_t n, size_t k, const gsl_rng* rng)
+    : BloomFilter((size_t)(-(double) (k * n) / std::log(1.0 - std::pow(max_fpr, 1.0 / k))), k, rng) {}
+
+    ~BloomFilter() {
+        if (salt) free(salt);
+    }
+
+    int insert(const key_t& key, size_t sz = sizeof(key_t)) {
+        if (m_bitarray.size() == 0) return 0;
+
+        for (size_t i = 0; i < m_n_salts; ++i) {
+            m_bitarray.set(hash_bytes_with_salt((const char*)&key, sz, salt[i]) % m_n_bits);
+        }
+
+        return 1;
+    }
+
+    bool lookup(const key_t& key, size_t sz = sizeof(key_t)) {
+        if (m_bitarray.size() == 0) return false;
+        for (size_t i = 0; i < m_n_salts; ++i) {
+            if (!m_bitarray.is_set(hash_bytes_with_salt((const char*)&key, sz, salt[i]) % m_n_bits))
+                return false;
+        }
+
+        return true;
+    }
+
+    void clear() {
+        m_bitarray.clear();
+    }
+
+    size_t get_memory_utilization() {
+        return this->m_bitarray.mem_size();
+    }
+private: 
+    size_t m_n_salts;
+    size_t m_n_bits;
+    uint16_t* salt;
+
+    BitArray m_bitarray;
+};
+
+}
diff --git a/include/ds/PriorityQueue.h b/include/ds/PriorityQueue.h
new file mode 100644
index 0000000..49f3e41
--- /dev/null
+++ b/include/ds/PriorityQueue.h
@@ -0,0 +1,135 @@
+/*
+ * include/ds/PriorityQueue.h
+ *
+ * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> 
+ *                    Dong Xie <dongx@psu.edu>
+ *
+ * All rights reserved. Published under the Modified BSD License.
+ *
+ */
+#pragma once
+
+#include <vector>
+#include <cassert>
+
+#include "util/record.h"
+
+namespace lsm {
+
+struct queue_record {
+    const char *data;
+    size_t version;
+};
+
+
+class PriorityQueue {
+public:
+    PriorityQueue(size_t size) : data(size), tail(0) {}
+    ~PriorityQueue() = default;
+
+    size_t size() const {
+        return tail;
+    }
+
+    void pop() {
+        assert(this->tail != 0);
+
+        // If there is only one element, just decrement the
+        // tail.
+        if (this->size() == 1) {
+            this->tail--;
+            return;
+        }
+
+        swap(0, --this->tail);
+
+        ssize_t idx = 0;
+        ssize_t child_idx;
+
+        while ((child_idx = min_child(idx)) != -1 && heap_cmp(child_idx, idx)) {
+            swap(idx, child_idx);
+            idx = child_idx;
+        }
+    }
+
+    void push(const char* record, size_t version=0) {
+        assert(tail != this->data.size());
+
+        size_t new_idx = this->tail++;
+        this->data[new_idx] = {record, version};
+
+        while (new_idx != 0 && !heap_cmp(parent(new_idx), new_idx)) {
+            swap(parent(new_idx), new_idx);
+            new_idx = parent(new_idx);
+        }
+    }
+
+
+    queue_record peek(size_t depth=0) {
+        ssize_t idx = 0;
+        size_t cur_depth = 0;
+
+        while (cur_depth != depth) {
+            idx = min_child(idx);
+            assert(idx != -1);
+            cur_depth++;
+        }
+
+        return this->data[idx];
+    }
+
+private:
+    std::vector<queue_record> data;
+    size_t tail;
+
+    /*
+     * Swap the elements at position a and position
+     * b within the heap
+     */
+    inline void swap(size_t a, size_t b) {
+        if (a == b) return;
+
+        queue_record temp = this->data[a];
+        this->data[a] = this->data[b];
+        this->data[b] = temp;
+    }
+
+    inline size_t left_child(size_t idx) {
+        return 2 * idx + 1;
+    }
+
+    inline size_t right_child(size_t idx) {
+        return 2 * idx + 2;
+    }
+
+    inline size_t parent(size_t idx) {
+        return (idx - 1) / 2;
+    }
+
+    inline ssize_t min_child(size_t idx) {
+        ssize_t left = left_child(idx) < tail ? left_child(idx) : -1;
+        ssize_t right = right_child(idx) < tail ? right_child(idx) : -1;
+
+        if (left == -1 && right == -1) {
+            return -1;
+        } else if (left == -1) {
+            return right;
+        } else  if (right == -1) {
+            return left;
+        }
+
+        return (heap_cmp(left, right)) ? left : right;
+    }
+
+    inline bool heap_cmp(size_t a, size_t b) {
+        auto cmp = record_cmp(this->data[a].data, this->data[b].data);
+        if (cmp == 0) {
+            if (this->data[a].version != this->data[b].version)
+                return this->data[a].version < this->data[b].version;
+            else return is_tombstone(this->data[a].data) && is_tombstone(this->data[b].data);
+        }
+        return cmp == -1;
+    }
+
+};
+}