diff options
| author | Douglas B. Rumbaugh <doug@douglasrumbaugh.com> | 2025-08-29 18:24:43 -0400 |
|---|---|---|
| committer | Douglas B. Rumbaugh <doug@douglasrumbaugh.com> | 2025-08-29 18:24:43 -0400 |
| commit | beb3e03072c706554acdfdd38dc7fb920ff2bb41 (patch) | |
| tree | 0d0f199aa2b57fc9efccb6e5f769d5a0e814e0bb | |
| parent | 90e3df1407b6b8c608b362081d5d9b4802259aff (diff) | |
| download | math-utils-beb3e03072c706554acdfdd38dc7fb920ff2bb41.tar.gz | |
Code cleanup+documentation
| -rw-r--r-- | Makefile | 4 | ||||
| -rw-r--r-- | doc/cdf.1 | 106 | ||||
| -rw-r--r-- | include/cdf.h | 1 | ||||
| -rw-r--r-- | src/cdf.c | 188 | ||||
| -rw-r--r-- | src/cumsum.c | 63 |
5 files changed, 207 insertions, 155 deletions
@@ -7,10 +7,10 @@ build: -mkdir build bin/cdf: build src/cdf.c include/cdf.h - gcc -std=c23 -Iinclude src/cdf.c -o bin/cdf + clang -std=c2x -Iinclude src/cdf.c -o bin/cdf bin/cumsum: src/cumsum.c include/cumsum.h - gcc -std=c23 -Iinclude src/cumsum.c -o bin/cumsum + clang -std=c2x -Iinclude src/cumsum.c -o bin/cumsum .PHONY: clean clean: diff --git a/doc/cdf.1 b/doc/cdf.1 new file mode 100644 index 0000000..06f3a49 --- /dev/null +++ b/doc/cdf.1 @@ -0,0 +1,106 @@ +.\" Copyright (c) 2025 +.\" Manual page for cdf(1) +.\" +.Dd $Mdocdate$ +.Dt CDF 1 +.Os +.Sh NAME +.Nm cdf +.Nd calculate cumulative distribution function from count data +.Sh SYNOPSIS +.Nm cdf +.Op Fl f | u +.Op Fl r +.Op Fl h +.Op Ar file +.Sh DESCRIPTION +The +.Nm +utility computes cumulative distribution functions from input data consisting +of count-value pairs. It reads data from +.Ar file +or standard input if no file is specified, calculates relative frequencies, +and outputs probability-value pairs suitable for statistical analysis and +plotting. +.Pp +Input data must consist of whitespace-separated pairs where the first field +is the count (frequency) and the second field is the data value. Output +consists of cumulative probability values followed by the corresponding data +values, separated by tabs. +.Pp +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl f +Read input values as floats. +.It Fl u +Read input values as unsigned integers. +.It Fl r +Generate the complementary cumulative distribution function (CCDF), +which is P(X > x) = 1 - F(x). +.It Fl h +Display usage information and exit. +.El +.Pp +If neither +.Fl f +nor +.Fl u +is specified, the input values will be read as signed integers by +default. +.Sh INPUT FORMAT +Each input line must contain exactly two whitespace-separated fields: +.Bd -literal -offset indent +count value +.Ed +.Pp +Where +.Em count +is a positive integer representing the frequency of occurrence, and +.Em value +is the data point. 
+.Pp +Example input: +.Bd -literal -offset indent +15 1.25 +23 2.30 +8 3.75 +.Ed +.Pp +This format was selected to be compatible with the output of the uniq -c command. +.Sh EXIT STATUS +.Ex -std +.Sh EXAMPLES +Calculate CDF from integer data in a file: +.Bd -literal -offset indent +$ cdf data.txt +0.300000000000000 10 +0.650000000000000 15 +1.000000000000000 20 +.Ed +.Pp +Generate complementary CDF in a pipeline: +.Bd -literal -offset indent +$ awk '{print $2, $1}' measurements.dat | cdf -r -f +1.000000000000000 1.234000 +0.750000000000000 2.567000 +0.400000000000000 4.890000 +.Ed +.Pp +Use standard tools for pre-processing: +.Bd -literal -offset indent +$ sort -n data.txt | uniq -c | cdf > dist.cdf +.Ed +.Sh SEE ALSO +.Xr sort 1 , +.Xr uniq 1 , +.Xr gnuplot 1 +.Sh AUTHORS +.An Douglas B. Rumbaugh +.Mt "dbrumbaugh@harrisburgu.edu" +.Sh BUGS +The program must materialize the full file in order to calculate +the frequency table. It currently does this in memory, and so very +large datasets may lead to crashes due to memory allocation failures +when RAM is limited. 
+ + diff --git a/include/cdf.h b/include/cdf.h index 418ffeb..956e630 100644 --- a/include/cdf.h +++ b/include/cdf.h @@ -7,7 +7,6 @@ #include <stdlib.h> #include <unistd.h> #include <stdio.h> -#include <ctype.h> #include <getopt.h> #include <string.h> #include <stdint.h> @@ -1,8 +1,7 @@ /* - * + * */ - #include "cdf.h" /* @@ -14,35 +13,36 @@ static bool ARG_FP_INPUT = false; static bool ARG_UINT_INPUT = false; static bool ARG_HELP = false; -static int parse_options(int argc, char*const* argv) { +static int parse_options(int argc, char *const *argv) { int arg_index = 0; int arg; bool error = false; while ((arg = getopt(argc, argv, "frhu")) != -1) { switch (arg) { - case 'f': - ARG_FP_INPUT = true; - break; - case 'r': - ARG_REVERSE_CDF = true; - break; - case 'u': - ARG_UINT_INPUT = true; - case 'h': - ARG_HELP = true; - break; - case '?': - if (isprint(optopt)) { - fprintf(stderr, "Unknown option `-%c`.\n", optopt); - } else { - fprintf(stderr, "Unknown option character `\\x%x`.\n", optopt); - } - error = true; - break; - default: - error = true; - break; + case 'f': + ARG_FP_INPUT = true; + break; + case 'r': + ARG_REVERSE_CDF = true; + break; + case 'u': + ARG_UINT_INPUT = true; + break; + case 'h': + ARG_HELP = true; + break; + case '?': + if (isprint(optopt)) { + fprintf(stderr, "Unknown option `-%c`.\n", optopt); + } else { + fprintf(stderr, "Unknown option character `\\x%x`.\n", optopt); + } + error = true; + break; + default: + error = true; + break; } } @@ -58,13 +58,11 @@ static int parse_options(int argc, char*const* argv) { return arg_index; } -static void help() { - fprintf(stderr, "Usage:\ncdf [-f|-u] [-r] [filename]\n"); -} +static void help() { fprintf(stderr, "Usage:\ncdf [-f|-u] [-r] [filename]\n"); } static DistRecord *expand_array(DistRecord *records, size_t *capacity) { (*capacity) *= 2; - DistRecord *new = realloc(records, (*capacity*sizeof(DistRecord))); + DistRecord *new = realloc(records, (*capacity * sizeof(DistRecord))); if (!new) { 
fprintf(stderr, "ERROR: Memory allocation failed\n"); return nullptr; @@ -73,42 +71,26 @@ static DistRecord *expand_array(DistRecord *records, size_t *capacity) { return new; } -static int read_data_int(DistRecord **records, size_t capacity, FILE *file) { - size_t reccnt = 0; - while (fscanf(file, "%ld %ld\n", &(*records + reccnt)->count, - &(*records + reccnt)->data.i) != EOF) { - - reccnt++; - if (reccnt == capacity) { - if (!(*records = expand_array(*records, &capacity))) { - return -1; - } - } +static int parse_line(DistRecord *record, char *line) { + if (ARG_FP_INPUT) { + return sscanf(line, "%lld %lf", &record->count, &record->data.d); + } else if (ARG_UINT_INPUT) { + return sscanf(line, "%lld %llu", &record->count, &record->data.u); + } else { + return sscanf(line, "%lld %lld", &record->count, &record->data.i); } - - return reccnt; } -static int read_data_fp(DistRecord **records, size_t capacity, FILE *file) { +static ssize_t read_data(DistRecord **records, size_t capacity, FILE *file) { size_t reccnt = 0; - while (fscanf(file, "%ld %lf\n", &(*records + reccnt)->count, - &(*records + reccnt)->data.d) != EOF) { + char *line = nullptr; + size_t line_len = 0; - reccnt++; - if (reccnt == capacity) { - if (!(*records = expand_array(*records, &capacity))) { - return -1; - } + while (getline(&line, &line_len, file) != -1) { + if (parse_line(*records + reccnt, line) != 2) { + fprintf(stderr, "[W] Skipping invalid input line: %s\n", line); + continue; } - } - - return reccnt; -} - -static int read_data_uint(DistRecord **records, size_t capacity, FILE *file) { - size_t reccnt = 0; - while (fscanf(file, "%ld %ld\n", &(*records + reccnt)->count, - &(*records + reccnt)->data.u) != EOF) { reccnt++; if (reccnt == capacity) { @@ -121,50 +103,27 @@ static int read_data_uint(DistRecord **records, size_t capacity, FILE *file) { return reccnt; } -static int print_data_fp(DistRecord *records, long double *freqs, size_t cnt) { - if (ARG_REVERSE_CDF) { - long double total_freq = 
1.0; - for (size_t i=0; i<cnt; i++) { - fprintf(stdout, "%.15Lf\t%lf\n", total_freq, records[i].data.d); - total_freq -= freqs[i]; - } - +static void print_record(long double freq, DistRecord *record) { + if (ARG_FP_INPUT) { + fprintf(stdout, "%.15Lf\t%lf\n", freq, record->data.d); + } else if (ARG_UINT_INPUT) { + fprintf(stdout, "%.15Lf\t%llu\n", freq, record->data.u); } else { - for (size_t i=0; i<cnt; i++) { - fprintf(stdout, "%.15Lf\t%lf\n", freqs[i], records[i].data.d); - } + fprintf(stdout, "%.15Lf\t%lld\n", freq, record->data.i); } - - return 1; } -static int print_data_int(DistRecord *records, long double *freqs, size_t cnt) { +static int print_data(DistRecord *records, long double *freqs, size_t cnt) { if (ARG_REVERSE_CDF) { long double total_freq = 1.0; - for (size_t i=0; i<cnt; i++) { - fprintf(stdout, "%.15Lf\t%ld\n", total_freq, records[i].data.i); + for (size_t i = 0; i < cnt; i++) { + print_record(total_freq, records + i); total_freq -= freqs[i]; } } else { - for (size_t i=0; i<cnt; i++) { - fprintf(stdout, "%.15Lf\t%ld\n", freqs[i], records[i].data.i); - } - } - - return 1; -} - -static int print_data_uint(DistRecord *records, long double *freqs, size_t cnt) { - if (ARG_REVERSE_CDF) { - long double total_freq = 1.0; - for (size_t i=0; i<cnt; i++) { - fprintf(stdout, "%.15Lf\t%ld\n", total_freq, records[i].data.u); - total_freq -= freqs[i]; + for (size_t i = 0; i < cnt; i++) { + print_record(freqs[i], records + i); } - } else { - for (size_t i=0; i<cnt; i++) { - fprintf(stdout, "%.15Lf\t%ld\n", freqs[i], records[i].data.u); - } } return 1; @@ -172,22 +131,17 @@ static int print_data_uint(DistRecord *records, long double *freqs, size_t cnt) static int process_data(FILE *file) { int rc = 1; - + size_t reccap = 100; - DistRecord *records = malloc(reccap*sizeof(DistRecord)); + DistRecord *records = malloc(reccap * sizeof(DistRecord)); - ssize_t cnt; - /* FIXME: this could probably use a type-based macro to collapse the - if statements into a single macro 
call - */ - if (ARG_FP_INPUT) { - cnt = read_data_fp(&records, reccap, file); - } else if (ARG_UINT_INPUT) { - cnt = read_data_uint(&records, reccap, file); - } else { - cnt = read_data_int(&records, reccap, file); + if (!records) { + rc = 0; + goto process_data_end; } + ssize_t cnt = read_data(&records, reccap, file); + /* propogate the error */ if (cnt == -1) { rc = 0; @@ -196,7 +150,7 @@ static int process_data(FILE *file) { /* calculate total sum of counts */ uint64_t total_count = 0; - for (size_t i=0; i<cnt; i++) { + for (size_t i = 0; i < cnt; i++) { total_count += records[i].count; } @@ -207,18 +161,12 @@ static int process_data(FILE *file) { goto free_freqs; } - for (size_t i=0; i<cnt; i++) { - freqs[i] = (long double) (records[i].count) / (long double) (total_count); + for (size_t i = 0; i < cnt; i++) { + freqs[i] = (long double)(records[i].count) / (long double)(total_count); } - if (ARG_FP_INPUT) { - rc = print_data_fp(records, freqs, cnt); - } else if (ARG_UINT_INPUT) { - rc = print_data_uint(records, freqs, cnt); - } else { - rc = print_data_int(records, freqs, cnt); - } - + rc = print_data(records, freqs, cnt); + free_freqs: free(freqs); @@ -229,7 +177,6 @@ process_data_end: return rc; } - int main(int argc, char **argv) { int rc = EXIT_SUCCESS; int file_index = 0; @@ -249,7 +196,8 @@ int main(int argc, char **argv) { FILE *input_file; if (file_index < argc && strcmp(argv[file_index], "-") != 0) { if (!(input_file = fopen(argv[file_index], "r"))) { - fprintf(stderr, "Error: Unable to open input file %s\n", argv[file_index]); + fprintf(stderr, "Error: Unable to open input file %s\n", + argv[file_index]); rc = EXIT_FAILURE; goto program_exit; } @@ -262,7 +210,9 @@ int main(int argc, char **argv) { } close_file: - fclose(input_file); + if (input_file != stdin) { + fclose(input_file); + } program_exit: exit(rc); diff --git a/src/cumsum.c b/src/cumsum.c index 5648661..268a548 100644 --- a/src/cumsum.c +++ b/src/cumsum.c @@ -1,8 +1,7 @@ /* - * + * */ - 
#include "cumsum.h" /* @@ -13,32 +12,32 @@ static bool ARG_FP_INPUT = false; static bool ARG_UINT_INPUT = false; static bool ARG_HELP = false; -static int parse_options(int argc, char*const* argv) { +static int parse_options(int argc, char *const *argv) { int arg_index = 0; int arg; bool error = false; while ((arg = getopt(argc, argv, "frhu")) != -1) { switch (arg) { - case 'f': - ARG_FP_INPUT = true; - break; - case 'u': - ARG_UINT_INPUT = true; - case 'h': - ARG_HELP = true; - break; - case '?': - if (isprint(optopt)) { - fprintf(stderr, "Unknown option `-%c`.\n", optopt); - } else { - fprintf(stderr, "Unknown option character `\\x%x`.\n", optopt); - } - error = true; - break; - default: - error = true; - break; + case 'f': + ARG_FP_INPUT = true; + break; + case 'u': + ARG_UINT_INPUT = true; + case 'h': + ARG_HELP = true; + break; + case '?': + if (isprint(optopt)) { + fprintf(stderr, "Unknown option `-%c`.\n", optopt); + } else { + fprintf(stderr, "Unknown option character `\\x%x`.\n", optopt); + } + error = true; + break; + default: + error = true; + break; } } @@ -54,17 +53,15 @@ static int parse_options(int argc, char*const* argv) { return arg_index; } -static void help() { - fprintf(stderr, "Usage:\ncumsum [-f|-u] [filename]\n"); -} +static void help() { fprintf(stderr, "Usage:\ncumsum [-f|-u] [filename]\n"); } void print_sum(Number sum) { if (ARG_FP_INPUT) { fprintf(stdout, "%lf\n", sum.d); } else if (ARG_UINT_INPUT) { - fprintf(stdout, "%ld\n", sum.u); + fprintf(stdout, "%lld\n", sum.u); } else { - fprintf(stdout, "%ld\n", sum.i); + fprintf(stdout, "%lld\n", sum.i); } } @@ -79,7 +76,7 @@ static int read_data_fp(FILE *file, Number *num) { static int read_data_int(FILE *file, Number *num) { int64_t val; - while (fscanf(file, "%ld ", &val) != EOF) { + while (fscanf(file, "%lld ", &val) != EOF) { num->i += val; } @@ -88,7 +85,7 @@ static int read_data_int(FILE *file, Number *num) { static int read_data_uint(FILE *file, Number *num) { uint64_t val; - while 
(fscanf(file, "%ld ", &val) != EOF) { + while (fscanf(file, "%lld ", &val) != EOF) { num->u += val; } @@ -97,10 +94,10 @@ static int read_data_uint(FILE *file, Number *num) { static int process_data(FILE *file) { int rc = 1; - + Number sum = {}; - /* FIXME: this could probably use a type-based macro to collapse the + /* FIXME: this could probably use a type-based macro to collapse the if statements into a single macro call */ if (ARG_FP_INPUT) { @@ -119,7 +116,6 @@ process_data_end: return rc; } - int main(int argc, char **argv) { int rc = EXIT_SUCCESS; int file_index = 0; @@ -139,7 +135,8 @@ int main(int argc, char **argv) { FILE *input_file; if (file_index < argc && strcmp(argv[file_index], "-") != 0) { if (!(input_file = fopen(argv[file_index], "r"))) { - fprintf(stderr, "Error: Unable to open input file %s\n", argv[file_index]); + fprintf(stderr, "Error: Unable to open input file %s\n", + argv[file_index]); rc = EXIT_FAILURE; goto program_exit; } |