diff options
| author | Douglas Rumbaugh <dbr4@psu.edu> | 2023-05-08 12:59:46 -0400 |
|---|---|---|
| committer | Douglas Rumbaugh <dbr4@psu.edu> | 2023-05-08 12:59:46 -0400 |
| commit | fdf5ef27fec1368a33801a98bbc5ed3556476979 (patch) | |
| tree | 7ecd9477fed6495d05a44343e19bfb6480785c7a /include/ds | |
| parent | 2380dd4d73cd6a179567e06e1aa4871ad44ccce3 (diff) | |
| download | dynamic-extension-fdf5ef27fec1368a33801a98bbc5ed3556476979.tar.gz | |
Began porting source files over from other repository
Diffstat (limited to 'include/ds')
| -rw-r--r-- | include/ds/Alias.h | 72 | ||||
| -rw-r--r-- | include/ds/BitArray.h | 67 | ||||
| -rw-r--r-- | include/ds/BloomFilter.h | 74 | ||||
| -rw-r--r-- | include/ds/PriorityQueue.h | 135 |
4 files changed, 348 insertions, 0 deletions
diff --git a/include/ds/Alias.h b/include/ds/Alias.h new file mode 100644 index 0000000..855dc75 --- /dev/null +++ b/include/ds/Alias.h @@ -0,0 +1,72 @@ +/* + * include/ds/Alias.h + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * Dong Xie <dongx@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include <gsl/gsl_rng.h> +#include <vector> + +namespace de { + +/* + * An implementation of Walker's Alias Structure for Weighted Set Sampling. Requires + * that the input weight vector is already normalized. + */ +class Alias { +public: + Alias(const std::vector<double>& weights) + : m_alias(weights.size()), m_cutoff(weights.size()) { + size_t n = weights.size(); + auto overfull = std::vector<size_t>(); + auto underfull = std::vector<size_t>(); + overfull.reserve(n); + underfull.reserve(n); + + // initialize the probability_table with n*p(i) as well as the overfull and + // underfull lists. + for (size_t i = 0; i < n; i++) { + m_cutoff[i] = (double) n * weights[i]; + if (m_cutoff[i] > 1) { + overfull.emplace_back(i); + } else if (m_cutoff[i] < 1) { + underfull.emplace_back(i); + } else { + m_alias[i] = i; + } + } + + while (overfull.size() > 0 && underfull.size() > 0) { + auto i = overfull.back(); overfull.pop_back(); + auto j = underfull.back(); underfull.pop_back(); + + m_alias[j] = i; + m_cutoff[i] = m_cutoff[i] + m_cutoff[j] - 1.0; + + if (m_cutoff[i] > 1.0) { + overfull.emplace_back(i); + } else if (m_cutoff[i] < 1.0) { + underfull.emplace_back(i); + } + } + } + + size_t get(const gsl_rng* rng) { + double coin1 = gsl_rng_uniform(rng); + double coin2 = gsl_rng_uniform(rng); + + size_t k = ((double) m_alias.size()) * coin1; + return coin2 < m_cutoff[k] ? k : m_alias[k]; + } + +private: + std::vector<size_t> m_alias; + std::vector<double> m_cutoff; +}; + +} diff --git a/include/ds/BitArray.h b/include/ds/BitArray.h new file mode 100644 index 0000000..1d33d5b --- /dev/null +++ b/include/ds/BitArray.h @@ -0,0 +1,67 @@ +/* + * include/ds/BloomFilter.h + * + * Copyright (C) 2023 Dong Xie <dongx@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include <cstdlib> +#include <memory> +#include <cstring> + +#include "util/base.h" + +namespace lsm { + +class BitArray { +public: + BitArray(size_t bits): m_bits(bits), m_data(nullptr) { + if (m_bits > 0) { + size_t n_bytes = (m_bits >> 3) << 3; + m_data = (char*) std::aligned_alloc(CACHELINE_SIZE, CACHELINEALIGN(n_bytes)); + memset(m_data, 0, n_bytes); + } + } + + ~BitArray() { + if (m_data) free(m_data); + } + + bool is_set(size_t bit) { + if (bit >= m_bits) return false; + return m_data[bit >> 3] & (1 << (bit & 7)); + } + + int set(size_t bit) { + if (bit >= m_bits) return 0; + m_data[bit >> 3] |= ((char) 1 << (bit & 7)); + return 1; + } + + int unset(size_t bit) { + if (bit >= m_bits) return 0; + m_data[bit >> 3] &= ~((char) 1 << (bit & 7)); + return 1; + } + + void clear() { + memset(m_data, 0, (m_bits >> 3) << 3); + } + + size_t mem_size() { + return m_bits >> 3; + } + + size_t size() { + return m_bits; + } + +private: + size_t m_bits; + char* m_data; +}; + +} diff --git a/include/ds/BloomFilter.h b/include/ds/BloomFilter.h new file mode 100644 index 0000000..dbaab2f --- /dev/null +++ b/include/ds/BloomFilter.h @@ -0,0 +1,74 @@ +/* + * include/ds/BloomFilter.h + * + * Copyright (C) 2023 Dong Xie <dongx@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include <cmath> +#include <gsl/gsl_rng.h> + +#include "ds/BitArray.h" +#include "util/hash.h" +#include "util/base.h" +#include "util/record.h" + +namespace lsm { + +class BloomFilter { +public: + BloomFilter(size_t n_bits, size_t k, const gsl_rng* rng) + : m_n_bits(n_bits), m_n_salts(k), m_bitarray(n_bits) { + salt = (uint16_t*) aligned_alloc(CACHELINE_SIZE, CACHELINEALIGN(k * sizeof(uint16_t))); + for (size_t i = 0; i < k; ++i) { + salt[i] = (uint16_t) gsl_rng_uniform_int(rng, 1 << 16); + } + + } + + BloomFilter(double max_fpr, size_t n, size_t k, const gsl_rng* rng) + : BloomFilter((size_t)(-(double) (k * n) / std::log(1.0 - std::pow(max_fpr, 1.0 / k))), k, rng) {} + + ~BloomFilter() { + if (salt) free(salt); + } + + int insert(const key_t& key, size_t sz = sizeof(key_t)) { + if (m_bitarray.size() == 0) return 0; + + for (size_t i = 0; i < m_n_salts; ++i) { + m_bitarray.set(hash_bytes_with_salt((const char*)&key, sz, salt[i]) % m_n_bits); + } + + return 1; + } + + bool lookup(const key_t& key, size_t sz = sizeof(key_t)) { + if (m_bitarray.size() == 0) return false; + for (size_t i = 0; i < m_n_salts; ++i) { + if (!m_bitarray.is_set(hash_bytes_with_salt((const char*)&key, sz, salt[i]) % m_n_bits)) + return false; + } + + return true; + } + + void clear() { + m_bitarray.clear(); + } + + size_t get_memory_utilization() { + return this->m_bitarray.mem_size(); + } +private: + size_t m_n_salts; + size_t m_n_bits; + uint16_t* salt; + + BitArray m_bitarray; +}; + +} diff --git a/include/ds/PriorityQueue.h b/include/ds/PriorityQueue.h new file mode 100644 index 0000000..49f3e41 --- /dev/null +++ b/include/ds/PriorityQueue.h @@ -0,0 +1,135 @@ +/* + * include/ds/PriorityQueue.h + * + * Copyright (C) 2023 Douglas Rumbaugh <drumbaugh@psu.edu> + * Dong Xie <dongx@psu.edu> + * + * All rights reserved. Published under the Modified BSD License. + * + */ +#pragma once + +#include <vector> +#include <cassert> + +#include "util/record.h" + +namespace lsm { + +struct queue_record { + const char *data; + size_t version; +}; + + +class PriorityQueue { +public: + PriorityQueue(size_t size) : data(size), tail(0) {} + ~PriorityQueue() = default; + + size_t size() const { + return tail; + } + + void pop() { + assert(this->tail != 0); + + // If there is only one element, just decrement the + // tail. + if (this->size() == 1) { + this->tail--; + return; + } + + swap(0, --this->tail); + + ssize_t idx = 0; + ssize_t child_idx; + + while ((child_idx = min_child(idx)) != -1 && heap_cmp(child_idx, idx)) { + swap(idx, child_idx); + idx = child_idx; + } + } + + void push(const char* record, size_t version=0) { + assert(tail != this->data.size()); + + size_t new_idx = this->tail++; + this->data[new_idx] = {record, version}; + + while (new_idx != 0 && !heap_cmp(parent(new_idx), new_idx)) { + swap(parent(new_idx), new_idx); + new_idx = parent(new_idx); + } + } + + + queue_record peek(size_t depth=0) { + ssize_t idx = 0; + size_t cur_depth = 0; + + while (cur_depth != depth) { + idx = min_child(idx); + assert(idx != -1); + cur_depth++; + } + + return this->data[idx]; + } + +private: + std::vector<queue_record> data; + size_t tail; + + /* + * Swap the elements at position a and position + * b within the heap + */ + inline void swap(size_t a, size_t b) { + if (a == b) return; + + queue_record temp = this->data[a]; + this->data[a] = this->data[b]; + this->data[b] = temp; + } + + inline size_t left_child(size_t idx) { + return 2 * idx + 1; + } + + inline size_t right_child(size_t idx) { + return 2 * idx + 2; + } + + inline size_t parent(size_t idx) { + return (idx - 1) / 2; + } + + inline ssize_t min_child(size_t idx) { + ssize_t left = left_child(idx) < tail ? left_child(idx) : -1; + ssize_t right = right_child(idx) < tail ? right_child(idx) : -1; + + if (left == -1 && right == -1) { + return -1; + } else if (left == -1) { + return right; + } else if (right == -1) { + return left; + } + + return (heap_cmp(left, right)) ? left : right; + } + + inline bool heap_cmp(size_t a, size_t b) { + auto cmp = record_cmp(this->data[a].data, this->data[b].data); + if (cmp == 0) { + if (this->data[a].version != this->data[b].version) + return this->data[a].version < this->data[b].version; + else return is_tombstone(this->data[a].data) && is_tombstone(this->data[b].data); + } + return cmp == -1; + } + +}; +} |