diff options
| author | Douglas Rumbaugh <dbr4@psu.edu> | 2024-05-11 12:45:25 -0400 |
|---|---|---|
| committer | Douglas Rumbaugh <dbr4@psu.edu> | 2024-05-11 12:45:25 -0400 |
| commit | c611e8e56ebe72e09127fff4fb14a08dc3fcb698 (patch) | |
| tree | 7fb65b3a82eb21f5788153fb3693553f05286228 /benchmarks/bigann_sample.cpp | |
| parent | ab0ab297959fcca370e80670e17f90a780607a80 (diff) | |
| download | dynamic-extension-c611e8e56ebe72e09127fff4fb14a08dc3fcb698.tar.gz | |
Added program to sample the binary knn files
Diffstat (limited to 'benchmarks/bigann_sample.cpp')
| -rw-r--r-- | benchmarks/bigann_sample.cpp | 55 |
1 files changed, 55 insertions, 0 deletions
diff --git a/benchmarks/bigann_sample.cpp b/benchmarks/bigann_sample.cpp new file mode 100644 index 0000000..aa12f91 --- /dev/null +++ b/benchmarks/bigann_sample.cpp @@ -0,0 +1,55 @@ +/* + * + */ + +#define ENABLE_TIMER + +#include "file_util.h" +#include "benchmark_types.h" + +#include <gsl/gsl_rng.h> + +typedef ANNRec Rec; + +void usage(char *progname) { + fprintf(stderr, "%s reccnt datafile sampcnt\n", progname); +} + +int main(int argc, char **argv) { + + if (argc < 4) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + size_t n = atol(argv[1]); + std::string d_fname = std::string(argv[2]); + size_t m = atol(argv[3]); + + gsl_rng * rng = gsl_rng_alloc(gsl_rng_mt19937); + auto data = read_binary_vector_file<Rec>(d_fname, n); + + std::vector<size_t> to_delete(m); + + std::unordered_map<Rec, size_t, de::RecordHash<Rec>> filter; + double ratio = (double) data.size() / (double) m; + size_t j=0; + for (size_t i=0; i<data.size() && j<to_delete.size(); i++) { + if (gsl_rng_uniform(rng) <= ratio && filter.find(data[i]) == filter.end()) { + to_delete[j++] = i; + filter.insert({data[i], i}); + } + } + + for (size_t i=0; i<to_delete.size(); i++) { + for (size_t j=0; j<ANNSize; j++ ) { + fprintf(stdout, "%ld ", data[to_delete[i]].data[j]); + } + fprintf(stdout, "\n"); + } + + gsl_rng_free(rng); + fflush(stderr); + fflush(stdout); +} + |