summaryrefslogtreecommitdiffstats
path: root/benchmarks
diff options
context:
space:
mode:
authorDouglas Rumbaugh <dbr4@psu.edu>2023-07-25 11:17:36 -0400
committerDouglas Rumbaugh <dbr4@psu.edu>2023-07-25 11:17:36 -0400
commit37434f5baf632e839dc14b3c7d8745287cb9368a (patch)
tree4b9a77c25b734872a1b815cc7c0bad6258784601 /benchmarks
parent9e869d32344d5bd8ee703a0733d80d48d458217c (diff)
downloaddynamic-extension-37434f5baf632e839dc14b3c7d8745287cb9368a.tar.gz
Benchmarks: mtree and vptree benchmark updates
Note: cosine similarity doesn't seem to work for the VPTree. Upon further research, I don't think that it is actually a metric; at the very least, I can't find anyone claiming that it is, and I've found several people claiming that it isn't. In testing with the Word2Vec data, Euclidean distance works, insofar as the M-Tree and VPTree return the same KNN results for the test queries, whereas cosine similarity does not.
Diffstat (limited to 'benchmarks')
-rw-r--r--benchmarks/include/bench.h3
-rw-r--r--benchmarks/include/bench_utility.h15
-rw-r--r--benchmarks/mtree_knn_bench.cpp46
-rw-r--r--benchmarks/vptree_knn_bench.cpp2
4 files changed, 58 insertions, 8 deletions
diff --git a/benchmarks/include/bench.h b/benchmarks/include/bench.h
index 12d0a7e..586ff12 100644
--- a/benchmarks/include/bench.h
+++ b/benchmarks/include/bench.h
@@ -85,7 +85,7 @@ static bool insert_tput_bench(DE &de_index, std::fstream &file, size_t insert_cn
}
template <typename DE, de::RecordInterface R, typename QP, bool PROGRESS=true>
-static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t trial_cnt=100) {
+static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t trial_cnt=1) {
char progbuf[25];
if constexpr (PROGRESS) {
sprintf(progbuf, "querying:");
@@ -102,6 +102,7 @@ static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t tr
auto start = std::chrono::high_resolution_clock::now();
for (size_t j=0; j<queries.size(); j++) {
auto res = de_index.query(&queries[j]);
+
total_results += res.size();
}
auto stop = std::chrono::high_resolution_clock::now();
diff --git a/benchmarks/include/bench_utility.h b/benchmarks/include/bench_utility.h
index 6610ab4..28040be 100644
--- a/benchmarks/include/bench_utility.h
+++ b/benchmarks/include/bench_utility.h
@@ -40,7 +40,7 @@ typedef de::WeightedRecord<key_type, value_type, weight_type> WRec;
typedef de::Record<key_type, value_type> Rec;
const size_t W2V_SIZE = 300;
-typedef de::CosinePoint<double, W2V_SIZE> Word2VecRec;
+typedef de::EuclidPoint<double, W2V_SIZE> Word2VecRec;
typedef de::DynamicExtension<WRec, de::WSS<WRec>, de::WSSQuery<WRec>> ExtendedWSS;
typedef de::DynamicExtension<Rec, de::TrieSpline<Rec>, de::TrieSplineRangeQuery<Rec>> ExtendedTSRQ;
@@ -68,6 +68,17 @@ struct btree_key_extract {
}
};
+struct euclidean_distance { // Euclidean (L2) distance functor over the data arrays of two Word2VecRec
+    double operator()(const Word2VecRec &first, const Word2VecRec &second) const {
+        double sum_sq = 0;
+        for (size_t d=0; d<W2V_SIZE; d++) {
+            const auto delta = first.data[d] - second.data[d];
+            sum_sq += delta * delta;
+        }
+        return std::sqrt(sum_sq);
+    }
+};
+
struct cosine_similarity {
double operator()(const Word2VecRec &first, const Word2VecRec &second) const {
@@ -86,7 +97,7 @@ struct cosine_similarity {
};
typedef tlx::BTree<key_type, btree_record, btree_key_extract> TreeMap;
-typedef mt::mtree<Word2VecRec, cosine_similarity> MTree;
+typedef mt::mtree<Word2VecRec, euclidean_distance> MTree;
static gsl_rng *g_rng;
static std::set<WRec> *g_to_delete;
diff --git a/benchmarks/mtree_knn_bench.cpp b/benchmarks/mtree_knn_bench.cpp
index 3c1792a..7ae4e83 100644
--- a/benchmarks/mtree_knn_bench.cpp
+++ b/benchmarks/mtree_knn_bench.cpp
@@ -1,6 +1,44 @@
#include "include/bench.h"
#include "mtree.h"
+// Runs every query in `queries` against `tree`, `trial_cnt` times over, and
+// prints the mean latency of a single query in nanoseconds to stdout,
+// followed by a tab. Prints 0 when there are no trials or no queries.
+static void mtree_knn_bench(MTree &tree, std::vector<de::KNNQueryParms<Word2VecRec>> queries, size_t trial_cnt=1)
+{
+    char progbuf[25];
+    snprintf(progbuf, sizeof(progbuf), "querying:");
+
+    size_t total_time = 0;
+
+    for (size_t i=0; i<trial_cnt; i++) {
+        // Fraction of trials completed so far; stays within [0, 1).
+        progress_update((double) i / (double) trial_cnt, progbuf);
+
+        std::vector<Word2VecRec> results;
+
+        auto start = std::chrono::high_resolution_clock::now();
+        for (size_t j=0; j<queries.size(); j++) {
+            // Copy each hit out so that draining the query is part of the timed work.
+            results.clear();
+            auto query_output = tree.get_nearest_by_limit(queries[j].point, queries[j].k);
+            for (auto itr = query_output.begin(); itr != query_output.end(); ++itr) {
+                results.emplace_back(itr->data);
+            }
+        }
+        auto stop = std::chrono::high_resolution_clock::now();
+
+        total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count();
+    }
+
+    progress_update(1.0, progbuf);
+
+    // Guard the average: zero trials or an empty query set would divide by zero.
+    size_t query_cnt = trial_cnt * queries.size();
+    size_t latency = (query_cnt > 0) ? total_time / query_cnt : 0;
+
+    fprintf(stdout, "%zu\t", latency);
+}
+
int main(int argc, char **argv)
{
if (argc < 5) {
@@ -20,7 +58,7 @@ int main(int argc, char **argv)
double insert_batch = 0.1;
init_bench_env(record_count, true);
- auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, 50);
+ auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, 10);
auto mtree = MTree();
@@ -37,10 +75,10 @@ int main(int argc, char **argv)
size_t insert_cnt = record_count - warmup_cnt;
insert_tput_bench<MTree, Word2VecRec>(mtree, datafile, insert_cnt, delete_prop, to_delete, true);
- //fprintf(stdout, "%ld\t", mtree.get_memory_usage());
+ // fprintf(stdout, "%ld\t", mtree.get_memory_usage());
-// query_latency_bench<MTree, Word2VecRec, de::KNNQueryParms<Word2VecRec>>(mtree, queries);
- // fprintf(stdout, "\n");
+ mtree_knn_bench(mtree, queries);
+ fprintf(stdout, "\n");
delete_bench_env();
fflush(stdout);
diff --git a/benchmarks/vptree_knn_bench.cpp b/benchmarks/vptree_knn_bench.cpp
index a5c45f4..0021c4a 100644
--- a/benchmarks/vptree_knn_bench.cpp
+++ b/benchmarks/vptree_knn_bench.cpp
@@ -19,7 +19,7 @@ int main(int argc, char **argv)
double insert_batch = 0.1;
init_bench_env(record_count, true);
- auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, 50);
+ auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, 10);
auto de_vp_knn = ExtendedVPTree_KNN(buffer_cap, scale_factor, max_delete_prop);