summaryrefslogtreecommitdiffstats
path: root/include/ds
diff options
context:
space:
mode:
Diffstat (limited to 'include/ds')
-rw-r--r--include/ds/Alias.h72
-rw-r--r--include/ds/BitArray.h67
-rw-r--r--include/ds/BloomFilter.h74
-rw-r--r--include/ds/PriorityQueue.h135
4 files changed, 348 insertions, 0 deletions
diff --git a/include/ds/Alias.h b/include/ds/Alias.h
new file mode 100644
index 0000000..855dc75
--- /dev/null
+++ b/include/ds/Alias.h
@@ -0,0 +1,72 @@
+/*
+ * include/ds/Alias.h
+ *
+ * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu>
+ * Dong Xie <dongx@psu.edu>
+ *
+ * All rights reserved. Published under the Modified BSD License.
+ *
+ */
+#pragma once
+
+#include <gsl/gsl_rng.h>
+#include <vector>
+
+namespace de {
+
+/*
+ * An implementation of Walker's Alias Structure for Weighted Set Sampling. Requires
+ * that the input weight vector is already normalized.
+ */
+class Alias {
+public:
+ Alias(const std::vector<double>& weights)
+ : m_alias(weights.size()), m_cutoff(weights.size()) {
+ size_t n = weights.size();
+ auto overfull = std::vector<size_t>();
+ auto underfull = std::vector<size_t>();
+ overfull.reserve(n);
+ underfull.reserve(n);
+
+ // initialize the probability_table with n*p(i) as well as the overfull and
+ // underfull lists.
+ for (size_t i = 0; i < n; i++) {
+ m_cutoff[i] = (double) n * weights[i];
+ if (m_cutoff[i] > 1) {
+ overfull.emplace_back(i);
+ } else if (m_cutoff[i] < 1) {
+ underfull.emplace_back(i);
+ } else {
+ m_alias[i] = i;
+ }
+ }
+
+ while (overfull.size() > 0 && underfull.size() > 0) {
+ auto i = overfull.back(); overfull.pop_back();
+ auto j = underfull.back(); underfull.pop_back();
+
+ m_alias[j] = i;
+ m_cutoff[i] = m_cutoff[i] + m_cutoff[j] - 1.0;
+
+ if (m_cutoff[i] > 1.0) {
+ overfull.emplace_back(i);
+ } else if (m_cutoff[i] < 1.0) {
+ underfull.emplace_back(i);
+ }
+ }
+ }
+
+ size_t get(const gsl_rng* rng) {
+ double coin1 = gsl_rng_uniform(rng);
+ double coin2 = gsl_rng_uniform(rng);
+
+ size_t k = ((double) m_alias.size()) * coin1;
+ return coin2 < m_cutoff[k] ? k : m_alias[k];
+ }
+
+private:
+ std::vector<size_t> m_alias;
+ std::vector<double> m_cutoff;
+};
+
+}
diff --git a/include/ds/BitArray.h b/include/ds/BitArray.h
new file mode 100644
index 0000000..1d33d5b
--- /dev/null
+++ b/include/ds/BitArray.h
@@ -0,0 +1,67 @@
+/*
+ * include/ds/BloomFilter.h
+ *
+ * Copyright (C) 2023 Dong Xie <dongx@psu.edu>
+ *
+ * All rights reserved. Published under the Modified BSD License.
+ *
+ */
+#pragma once
+
+#include <cstdlib>
+#include <memory>
+#include <cstring>
+
+#include "util/base.h"
+
+namespace lsm {
+
+class BitArray {
+public:
+ BitArray(size_t bits): m_bits(bits), m_data(nullptr) {
+ if (m_bits > 0) {
+ size_t n_bytes = (m_bits >> 3) << 3;
+ m_data = (char*) std::aligned_alloc(CACHELINE_SIZE, CACHELINEALIGN(n_bytes));
+ memset(m_data, 0, n_bytes);
+ }
+ }
+
+ ~BitArray() {
+ if (m_data) free(m_data);
+ }
+
+ bool is_set(size_t bit) {
+ if (bit >= m_bits) return false;
+ return m_data[bit >> 3] & (1 << (bit & 7));
+ }
+
+ int set(size_t bit) {
+ if (bit >= m_bits) return 0;
+ m_data[bit >> 3] |= ((char) 1 << (bit & 7));
+ return 1;
+ }
+
+ int unset(size_t bit) {
+ if (bit >= m_bits) return 0;
+ m_data[bit >> 3] &= ~((char) 1 << (bit & 7));
+ return 1;
+ }
+
+ void clear() {
+ memset(m_data, 0, (m_bits >> 3) << 3);
+ }
+
+ size_t mem_size() {
+ return m_bits >> 3;
+ }
+
+ size_t size() {
+ return m_bits;
+ }
+
+private:
+ size_t m_bits;
+ char* m_data;
+};
+
+}
diff --git a/include/ds/BloomFilter.h b/include/ds/BloomFilter.h
new file mode 100644
index 0000000..dbaab2f
--- /dev/null
+++ b/include/ds/BloomFilter.h
@@ -0,0 +1,74 @@
+/*
+ * include/ds/BloomFilter.h
+ *
+ * Copyright (C) 2023 Dong Xie <dongx@psu.edu>
+ *
+ * All rights reserved. Published under the Modified BSD License.
+ *
+ */
+#pragma once
+
+#include <cmath>
+#include <gsl/gsl_rng.h>
+
+#include "ds/BitArray.h"
+#include "util/hash.h"
+#include "util/base.h"
+#include "util/record.h"
+
+namespace lsm {
+
+class BloomFilter {
+public:
+ BloomFilter(size_t n_bits, size_t k, const gsl_rng* rng)
+ : m_n_bits(n_bits), m_n_salts(k), m_bitarray(n_bits) {
+ salt = (uint16_t*) aligned_alloc(CACHELINE_SIZE, CACHELINEALIGN(k * sizeof(uint16_t)));
+ for (size_t i = 0; i < k; ++i) {
+ salt[i] = (uint16_t) gsl_rng_uniform_int(rng, 1 << 16);
+ }
+
+ }
+
+ BloomFilter(double max_fpr, size_t n, size_t k, const gsl_rng* rng)
+ : BloomFilter((size_t)(-(double) (k * n) / std::log(1.0 - std::pow(max_fpr, 1.0 / k))), k, rng) {}
+
+ ~BloomFilter() {
+ if (salt) free(salt);
+ }
+
+ int insert(const key_t& key, size_t sz = sizeof(key_t)) {
+ if (m_bitarray.size() == 0) return 0;
+
+ for (size_t i = 0; i < m_n_salts; ++i) {
+ m_bitarray.set(hash_bytes_with_salt((const char*)&key, sz, salt[i]) % m_n_bits);
+ }
+
+ return 1;
+ }
+
+ bool lookup(const key_t& key, size_t sz = sizeof(key_t)) {
+ if (m_bitarray.size() == 0) return false;
+ for (size_t i = 0; i < m_n_salts; ++i) {
+ if (!m_bitarray.is_set(hash_bytes_with_salt((const char*)&key, sz, salt[i]) % m_n_bits))
+ return false;
+ }
+
+ return true;
+ }
+
+ void clear() {
+ m_bitarray.clear();
+ }
+
+ size_t get_memory_utilization() {
+ return this->m_bitarray.mem_size();
+ }
+private:
+ size_t m_n_salts;
+ size_t m_n_bits;
+ uint16_t* salt;
+
+ BitArray m_bitarray;
+};
+
+}
diff --git a/include/ds/PriorityQueue.h b/include/ds/PriorityQueue.h
new file mode 100644
index 0000000..49f3e41
--- /dev/null
+++ b/include/ds/PriorityQueue.h
@@ -0,0 +1,135 @@
+/*
+ * include/ds/PriorityQueue.h
+ *
+ * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu>
+ * Dong Xie <dongx@psu.edu>
+ *
+ * All rights reserved. Published under the Modified BSD License.
+ *
+ */
+#pragma once
+
+#include <vector>
+#include <cassert>
+
+#include "util/record.h"
+
+namespace lsm {
+
+struct queue_record {
+ const char *data;
+ size_t version;
+};
+
+
+class PriorityQueue {
+public:
+ PriorityQueue(size_t size) : data(size), tail(0) {}
+ ~PriorityQueue() = default;
+
+ size_t size() const {
+ return tail;
+ }
+
+ void pop() {
+ assert(this->tail != 0);
+
+ // If there is only one element, just decrement the
+ // tail.
+ if (this->size() == 1) {
+ this->tail--;
+ return;
+ }
+
+ swap(0, --this->tail);
+
+ ssize_t idx = 0;
+ ssize_t child_idx;
+
+ while ((child_idx = min_child(idx)) != -1 && heap_cmp(child_idx, idx)) {
+ swap(idx, child_idx);
+ idx = child_idx;
+ }
+ }
+
+ void push(const char* record, size_t version=0) {
+ assert(tail != this->data.size());
+
+ size_t new_idx = this->tail++;
+ this->data[new_idx] = {record, version};
+
+ while (new_idx != 0 && !heap_cmp(parent(new_idx), new_idx)) {
+ swap(parent(new_idx), new_idx);
+ new_idx = parent(new_idx);
+ }
+ }
+
+
+ queue_record peek(size_t depth=0) {
+ ssize_t idx = 0;
+ size_t cur_depth = 0;
+
+ while (cur_depth != depth) {
+ idx = min_child(idx);
+ assert(idx != -1);
+ cur_depth++;
+ }
+
+ return this->data[idx];
+ }
+
+private:
+ std::vector<queue_record> data;
+ size_t tail;
+
+ /*
+ * Swap the elements at position a and position
+ * b within the heap
+ */
+ inline void swap(size_t a, size_t b) {
+ if (a == b) return;
+
+ queue_record temp = this->data[a];
+ this->data[a] = this->data[b];
+ this->data[b] = temp;
+ }
+
+ inline size_t left_child(size_t idx) {
+ return 2 * idx + 1;
+ }
+
+ inline size_t right_child(size_t idx) {
+ return 2 * idx + 2;
+ }
+
+ inline size_t parent(size_t idx) {
+ return (idx - 1) / 2;
+ }
+
+ inline ssize_t min_child(size_t idx) {
+ ssize_t left = left_child(idx) < tail ? left_child(idx) : -1;
+ ssize_t right = right_child(idx) < tail ? right_child(idx) : -1;
+
+ if (left == -1 && right == -1) {
+ return -1;
+ } else if (left == -1) {
+ return right;
+ } else if (right == -1) {
+ return left;
+ }
+
+ return (heap_cmp(left, right)) ? left : right;
+ }
+
+ inline bool heap_cmp(size_t a, size_t b) {
+ auto cmp = record_cmp(this->data[a].data, this->data[b].data);
+ if (cmp == 0) {
+ if (this->data[a].version != this->data[b].version)
+ return this->data[a].version < this->data[b].version;
+ else return is_tombstone(this->data[a].data) && is_tombstone(this->data[b].data);
+ }
+ return cmp == -1;
+ }
+
+};
+}