diff options
| author | Douglas Rumbaugh <dbr4@psu.edu> | 2023-07-25 11:17:36 -0400 |
|---|---|---|
| committer | Douglas Rumbaugh <dbr4@psu.edu> | 2023-07-25 11:17:36 -0400 |
| commit | 37434f5baf632e839dc14b3c7d8745287cb9368a (patch) | |
| tree | 4b9a77c25b734872a1b815cc7c0bad6258784601 /benchmarks | |
| parent | 9e869d32344d5bd8ee703a0733d80d48d458217c (diff) | |
| download | dynamic-extension-37434f5baf632e839dc14b3c7d8745287cb9368a.tar.gz | |
Benchmarks: mtree and vptree benchmark updates
Note: cosine similarity doesn't seem to work for VPTree. Upon further
research, I don't think that it is actually a metric. At the very least, I
can't find anyone claiming it is, and I've found several people claiming
it isn't. On testing with the Word2Vec data, Euclidean distance works,
insofar as the M-Tree and VPTree return the same KNN results for test
queries, whereas cosine similarity does not.
Diffstat (limited to 'benchmarks')
| -rw-r--r-- | benchmarks/include/bench.h | 3 | ||||
| -rw-r--r-- | benchmarks/include/bench_utility.h | 15 | ||||
| -rw-r--r-- | benchmarks/mtree_knn_bench.cpp | 46 | ||||
| -rw-r--r-- | benchmarks/vptree_knn_bench.cpp | 2 |
4 files changed, 58 insertions, 8 deletions
diff --git a/benchmarks/include/bench.h b/benchmarks/include/bench.h index 12d0a7e..586ff12 100644 --- a/benchmarks/include/bench.h +++ b/benchmarks/include/bench.h @@ -85,7 +85,7 @@ static bool insert_tput_bench(DE &de_index, std::fstream &file, size_t insert_cn } template <typename DE, de::RecordInterface R, typename QP, bool PROGRESS=true> -static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t trial_cnt=100) { +static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t trial_cnt=1) { char progbuf[25]; if constexpr (PROGRESS) { sprintf(progbuf, "querying:"); @@ -102,6 +102,7 @@ static bool query_latency_bench(DE &de_index, std::vector<QP> queries, size_t tr auto start = std::chrono::high_resolution_clock::now(); for (size_t j=0; j<queries.size(); j++) { auto res = de_index.query(&queries[j]); + total_results += res.size(); } auto stop = std::chrono::high_resolution_clock::now(); diff --git a/benchmarks/include/bench_utility.h b/benchmarks/include/bench_utility.h index 6610ab4..28040be 100644 --- a/benchmarks/include/bench_utility.h +++ b/benchmarks/include/bench_utility.h @@ -40,7 +40,7 @@ typedef de::WeightedRecord<key_type, value_type, weight_type> WRec; typedef de::Record<key_type, value_type> Rec; const size_t W2V_SIZE = 300; -typedef de::CosinePoint<double, W2V_SIZE> Word2VecRec; +typedef de::EuclidPoint<double, W2V_SIZE> Word2VecRec; typedef de::DynamicExtension<WRec, de::WSS<WRec>, de::WSSQuery<WRec>> ExtendedWSS; typedef de::DynamicExtension<Rec, de::TrieSpline<Rec>, de::TrieSplineRangeQuery<Rec>> ExtendedTSRQ; @@ -68,6 +68,17 @@ struct btree_key_extract { } }; +struct euclidean_distance { + double operator()(const Word2VecRec &first, const Word2VecRec &second) const { + double dist = 0; + for (size_t i=0; i<W2V_SIZE; i++) { + dist += (first.data[i] - second.data[i]) * (first.data[i] - second.data[i]); + } + + return std::sqrt(dist); + } +}; + struct cosine_similarity { double operator()(const Word2VecRec &first, 
const Word2VecRec &second) const { @@ -86,7 +97,7 @@ struct cosine_similarity { }; typedef tlx::BTree<key_type, btree_record, btree_key_extract> TreeMap; -typedef mt::mtree<Word2VecRec, cosine_similarity> MTree; +typedef mt::mtree<Word2VecRec, euclidean_distance> MTree; static gsl_rng *g_rng; static std::set<WRec> *g_to_delete; diff --git a/benchmarks/mtree_knn_bench.cpp b/benchmarks/mtree_knn_bench.cpp index 3c1792a..7ae4e83 100644 --- a/benchmarks/mtree_knn_bench.cpp +++ b/benchmarks/mtree_knn_bench.cpp @@ -1,6 +1,44 @@ #include "include/bench.h" #include "mtree.h" +static void mtree_knn_bench(MTree &tree, std::vector<de::KNNQueryParms<Word2VecRec>> queries, size_t trial_cnt=1) +{ + char progbuf[25]; + sprintf(progbuf, "sampling:"); + + size_t batch_size = 100; + size_t batches = trial_cnt / batch_size; + size_t total_time = 0; + + std::vector<Word2VecRec> result_set; + + for (int i=0; i<trial_cnt; i++) { + progress_update((double) (i * batch_size) / (double) trial_cnt, progbuf); + + std::vector<Word2VecRec> results; + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t j=0; j<queries.size(); j++) { + results.clear(); + auto query_output = tree.get_nearest_by_limit(queries[j].point, queries[j].k); + auto itr = query_output.begin(); + while (itr != query_output.end()) { + results.emplace_back(itr->data); + itr++; + } + } + auto stop = std::chrono::high_resolution_clock::now(); + + total_time += std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count(); + } + + progress_update(1.0, progbuf); + + size_t latency = total_time / (trial_cnt * queries.size()); + + fprintf(stdout, "%ld\t", latency); +} + int main(int argc, char **argv) { if (argc < 5) { @@ -20,7 +58,7 @@ int main(int argc, char **argv) double insert_batch = 0.1; init_bench_env(record_count, true); - auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, 50); + auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, 10); auto mtree 
= MTree(); @@ -37,10 +75,10 @@ int main(int argc, char **argv) size_t insert_cnt = record_count - warmup_cnt; insert_tput_bench<MTree, Word2VecRec>(mtree, datafile, insert_cnt, delete_prop, to_delete, true); - //fprintf(stdout, "%ld\t", mtree.get_memory_usage()); + // fprintf(stdout, "%ld\t", mtree.get_memory_usage()); -// query_latency_bench<MTree, Word2VecRec, de::KNNQueryParms<Word2VecRec>>(mtree, queries); - // fprintf(stdout, "\n"); + mtree_knn_bench(mtree, queries); + fprintf(stdout, "\n"); delete_bench_env(); fflush(stdout); diff --git a/benchmarks/vptree_knn_bench.cpp b/benchmarks/vptree_knn_bench.cpp index a5c45f4..0021c4a 100644 --- a/benchmarks/vptree_knn_bench.cpp +++ b/benchmarks/vptree_knn_bench.cpp @@ -19,7 +19,7 @@ int main(int argc, char **argv) double insert_batch = 0.1; init_bench_env(record_count, true); - auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, 50); + auto queries = read_knn_queries<de::KNNQueryParms<Word2VecRec>>(qfilename, 10); auto de_vp_knn = ExtendedVPTree_KNN(buffer_cap, scale_factor, max_delete_prop); |