summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Makefile4
-rw-r--r--doc/cdf.1106
-rw-r--r--include/cdf.h1
-rw-r--r--src/cdf.c188
-rw-r--r--src/cumsum.c63
5 files changed, 207 insertions, 155 deletions
diff --git a/Makefile b/Makefile
index 080b44f..bcfa271 100644
--- a/Makefile
+++ b/Makefile
@@ -7,10 +7,10 @@ build:
-mkdir build
bin/cdf: build src/cdf.c include/cdf.h
- gcc -std=c23 -Iinclude src/cdf.c -o bin/cdf
+ clang -std=c2x -Iinclude src/cdf.c -o bin/cdf
bin/cumsum: src/cumsum.c include/cumsum.h
- gcc -std=c23 -Iinclude src/cumsum.c -o bin/cumsum
+ clang -std=c2x -Iinclude src/cumsum.c -o bin/cumsum
.PHONY: clean
clean:
diff --git a/doc/cdf.1 b/doc/cdf.1
new file mode 100644
index 0000000..06f3a49
--- /dev/null
+++ b/doc/cdf.1
@@ -0,0 +1,106 @@
+.\" Copyright (c) 2025
+.\" Manual page for cdf(1)
+.\"
+.Dd $Mdocdate$
+.Dt CDF 1
+.Os
+.Sh NAME
+.Nm cdf
+.Nd calculate cumulative distribution function from count data
+.Sh SYNOPSIS
+.Nm cdf
+.Op Fl f | u
+.Op Fl r
+.Op Fl h
+.Op Ar file
+.Sh DESCRIPTION
+The
+.Nm
+utility computes cumulative distribution functions from input data consisting
+of value-count pairs. It reads data from
+.Ar file
+or standard input if no file is specified, calculates relative frequencies,
+and outputs probability-value pairs suitable for statistical analysis and
+plotting.
+.Pp
+Input data must consist of whitespace-separated pairs where the first field
+is the count (frequency) and the second field is the data value. Output
+consists of cumulative probability values followed by the corresponding data
+values, separated by tabs.
+.Pp
+.Sh OPTIONS
+.Bl -tag -width Ds
+.It Fl f
+Read input values as floats.
+.It Fl u
+Read input values as unsigned integers.
+.It Fl r
+Generate the complementary cumulative distribution function (CCDF),
+which is P(X >= x) = 1 - P(X < x).
+.It Fl h
+Display usage information and exit.
+.El
+.Pp
+If
+.Fl f
+or
+.Fl u
+is not specified, the input values will be read as signed integers by
+default.
+.Sh INPUT FORMAT
+Each input line must contain exactly two whitespace-separated fields:
+.Bd -literal -offset indent
+count value
+.Ed
+.Pp
+Where
+.Em count
+is a positive integer representing the frequency of occurrence, and
+.Em value
+is the data point.
+.Pp
+Example input:
+.Bd -literal -offset indent
+15 1.25
+23 2.30
+8 3.75
+.Ed
+.Pp
+This format was selected to be compatible with the output of the uniq -c command.
+.Sh EXIT STATUS
+.Ex -std
+.Sh EXAMPLES
+Calculate CDF from integer data in a file:
+.Bd -literal -offset indent
+$ cdf data.txt
+0.300000000000000 10
+0.650000000000000 15
+1.000000000000000 20
+.Ed
+.Pp
+Generate complementary CDF in a pipeline:
+.Bd -literal -offset indent
+$ awk '{print $2, $1}' measurements.dat | cdf -r -f
+1.000000000000000 1.234000
+0.750000000000000 2.567000
+0.400000000000000 4.890000
+.Ed
+.Pp
+Use standard tools for pre-processing:
+.Bd -literal -offset indent
+$ sort -n data.txt | uniq -c | cdf > dist.cdf
+.Ed
+.Sh SEE ALSO
+.Xr sort 1 ,
+.Xr uniq 1 ,
+.Xr gnuplot 1
+.Sh AUTHORS
+.An Douglas B. Rumbaugh
+.Mt "dbrumbaugh@harrisburgu.edu"
+.Sh BUGS
+The program must materialize the full file in order to calculate
+the frequency table. It currently does this in memory, and so very
+large datasets may lead to crashes due to memory allocation failures
+when RAM is limited.
+
+
diff --git a/include/cdf.h b/include/cdf.h
index 418ffeb..956e630 100644
--- a/include/cdf.h
+++ b/include/cdf.h
@@ -7,7 +7,6 @@
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
-#include <ctype.h>
#include <getopt.h>
#include <string.h>
#include <stdint.h>
diff --git a/src/cdf.c b/src/cdf.c
index 45b8fb7..3bf39fe 100644
--- a/src/cdf.c
+++ b/src/cdf.c
@@ -1,8 +1,7 @@
/*
- *
+ *
*/
-
#include "cdf.h"
/*
@@ -14,35 +13,36 @@ static bool ARG_FP_INPUT = false;
static bool ARG_UINT_INPUT = false;
static bool ARG_HELP = false;
-static int parse_options(int argc, char*const* argv) {
+static int parse_options(int argc, char *const *argv) {
int arg_index = 0;
int arg;
bool error = false;
while ((arg = getopt(argc, argv, "frhu")) != -1) {
switch (arg) {
- case 'f':
- ARG_FP_INPUT = true;
- break;
- case 'r':
- ARG_REVERSE_CDF = true;
- break;
- case 'u':
- ARG_UINT_INPUT = true;
- case 'h':
- ARG_HELP = true;
- break;
- case '?':
- if (isprint(optopt)) {
- fprintf(stderr, "Unknown option `-%c`.\n", optopt);
- } else {
- fprintf(stderr, "Unknown option character `\\x%x`.\n", optopt);
- }
- error = true;
- break;
- default:
- error = true;
- break;
+ case 'f':
+ ARG_FP_INPUT = true;
+ break;
+ case 'r':
+ ARG_REVERSE_CDF = true;
+ break;
+ case 'u':
+ ARG_UINT_INPUT = true;
+ break;
+ case 'h':
+ ARG_HELP = true;
+ break;
+ case '?':
+ if (isprint(optopt)) {
+ fprintf(stderr, "Unknown option `-%c`.\n", optopt);
+ } else {
+ fprintf(stderr, "Unknown option character `\\x%x`.\n", optopt);
+ }
+ error = true;
+ break;
+ default:
+ error = true;
+ break;
}
}
@@ -58,13 +58,11 @@ static int parse_options(int argc, char*const* argv) {
return arg_index;
}
-static void help() {
- fprintf(stderr, "Usage:\ncdf [-f|-u] [-r] [filename]\n");
-}
+static void help() { fprintf(stderr, "Usage:\ncdf [-f|-u] [-r] [filename]\n"); }
static DistRecord *expand_array(DistRecord *records, size_t *capacity) {
(*capacity) *= 2;
- DistRecord *new = realloc(records, (*capacity*sizeof(DistRecord)));
+ DistRecord *new = realloc(records, (*capacity * sizeof(DistRecord)));
if (!new) {
fprintf(stderr, "ERROR: Memory allocation failed\n");
return nullptr;
@@ -73,42 +71,26 @@ static DistRecord *expand_array(DistRecord *records, size_t *capacity) {
return new;
}
-static int read_data_int(DistRecord **records, size_t capacity, FILE *file) {
- size_t reccnt = 0;
- while (fscanf(file, "%ld %ld\n", &(*records + reccnt)->count,
- &(*records + reccnt)->data.i) != EOF) {
-
- reccnt++;
- if (reccnt == capacity) {
- if (!(*records = expand_array(*records, &capacity))) {
- return -1;
- }
- }
+static int parse_line(DistRecord *record, char *line) {
+ if (ARG_FP_INPUT) {
+ return sscanf(line, "%lld %lf", &record->count, &record->data.d);
+ } else if (ARG_UINT_INPUT) {
+ return sscanf(line, "%lld %llu", &record->count, &record->data.u);
+ } else {
+ return sscanf(line, "%lld %lld", &record->count, &record->data.i);
}
-
- return reccnt;
}
-static int read_data_fp(DistRecord **records, size_t capacity, FILE *file) {
+static ssize_t read_data(DistRecord **records, size_t capacity, FILE *file) {
size_t reccnt = 0;
- while (fscanf(file, "%ld %lf\n", &(*records + reccnt)->count,
- &(*records + reccnt)->data.d) != EOF) {
+ char *line = nullptr;
+ size_t line_len = 0;
- reccnt++;
- if (reccnt == capacity) {
- if (!(*records = expand_array(*records, &capacity))) {
- return -1;
- }
+ while (getline(&line, &line_len, file) != -1) {
+ if (parse_line(*records + reccnt, line) != 2) {
+ fprintf(stderr, "[W] Skipping invalid input line: %s\n", line);
+ continue;
}
- }
-
- return reccnt;
-}
-
-static int read_data_uint(DistRecord **records, size_t capacity, FILE *file) {
- size_t reccnt = 0;
- while (fscanf(file, "%ld %ld\n", &(*records + reccnt)->count,
- &(*records + reccnt)->data.u) != EOF) {
reccnt++;
if (reccnt == capacity) {
@@ -121,50 +103,27 @@ static int read_data_uint(DistRecord **records, size_t capacity, FILE *file) {
return reccnt;
}
-static int print_data_fp(DistRecord *records, long double *freqs, size_t cnt) {
- if (ARG_REVERSE_CDF) {
- long double total_freq = 1.0;
- for (size_t i=0; i<cnt; i++) {
- fprintf(stdout, "%.15Lf\t%lf\n", total_freq, records[i].data.d);
- total_freq -= freqs[i];
- }
-
+static void print_record(long double freq, DistRecord *record) {
+ if (ARG_FP_INPUT) {
+ fprintf(stdout, "%.15Lf\t%lf\n", freq, record->data.d);
+ } else if (ARG_UINT_INPUT) {
+ fprintf(stdout, "%.15Lf\t%llu\n", freq, record->data.u);
} else {
- for (size_t i=0; i<cnt; i++) {
- fprintf(stdout, "%.15Lf\t%lf\n", freqs[i], records[i].data.d);
- }
+ fprintf(stdout, "%.15Lf\t%lld\n", freq, record->data.i);
}
-
- return 1;
}
-static int print_data_int(DistRecord *records, long double *freqs, size_t cnt) {
+static int print_data(DistRecord *records, long double *freqs, size_t cnt) {
if (ARG_REVERSE_CDF) {
long double total_freq = 1.0;
- for (size_t i=0; i<cnt; i++) {
- fprintf(stdout, "%.15Lf\t%ld\n", total_freq, records[i].data.i);
+ for (size_t i = 0; i < cnt; i++) {
+ print_record(total_freq, records + i);
total_freq -= freqs[i];
}
} else {
- for (size_t i=0; i<cnt; i++) {
- fprintf(stdout, "%.15Lf\t%ld\n", freqs[i], records[i].data.i);
- }
- }
-
- return 1;
-}
-
-static int print_data_uint(DistRecord *records, long double *freqs, size_t cnt) {
- if (ARG_REVERSE_CDF) {
- long double total_freq = 1.0;
- for (size_t i=0; i<cnt; i++) {
- fprintf(stdout, "%.15Lf\t%ld\n", total_freq, records[i].data.u);
- total_freq -= freqs[i];
+ for (size_t i = 0; i < cnt; i++) {
+ print_record(freqs[i], records + i);
}
- } else {
- for (size_t i=0; i<cnt; i++) {
- fprintf(stdout, "%.15Lf\t%ld\n", freqs[i], records[i].data.u);
- }
}
return 1;
@@ -172,22 +131,17 @@ static int print_data_uint(DistRecord *records, long double *freqs, size_t cnt)
static int process_data(FILE *file) {
int rc = 1;
-
+
size_t reccap = 100;
- DistRecord *records = malloc(reccap*sizeof(DistRecord));
+ DistRecord *records = malloc(reccap * sizeof(DistRecord));
- ssize_t cnt;
- /* FIXME: this could probably use a type-based macro to collapse the
- if statements into a single macro call
- */
- if (ARG_FP_INPUT) {
- cnt = read_data_fp(&records, reccap, file);
- } else if (ARG_UINT_INPUT) {
- cnt = read_data_uint(&records, reccap, file);
- } else {
- cnt = read_data_int(&records, reccap, file);
+ if (!records) {
+ rc = 0;
+ goto process_data_end;
}
+ ssize_t cnt = read_data(&records, reccap, file);
+
/* propogate the error */
if (cnt == -1) {
rc = 0;
@@ -196,7 +150,7 @@ static int process_data(FILE *file) {
/* calculate total sum of counts */
uint64_t total_count = 0;
- for (size_t i=0; i<cnt; i++) {
+ for (size_t i = 0; i < cnt; i++) {
total_count += records[i].count;
}
@@ -207,18 +161,12 @@ static int process_data(FILE *file) {
goto free_freqs;
}
- for (size_t i=0; i<cnt; i++) {
- freqs[i] = (long double) (records[i].count) / (long double) (total_count);
+ for (size_t i = 0; i < cnt; i++) {
+ freqs[i] = (long double)(records[i].count) / (long double)(total_count);
}
- if (ARG_FP_INPUT) {
- rc = print_data_fp(records, freqs, cnt);
- } else if (ARG_UINT_INPUT) {
- rc = print_data_uint(records, freqs, cnt);
- } else {
- rc = print_data_int(records, freqs, cnt);
- }
-
+ rc = print_data(records, freqs, cnt);
+
free_freqs:
free(freqs);
@@ -229,7 +177,6 @@ process_data_end:
return rc;
}
-
int main(int argc, char **argv) {
int rc = EXIT_SUCCESS;
int file_index = 0;
@@ -249,7 +196,8 @@ int main(int argc, char **argv) {
FILE *input_file;
if (file_index < argc && strcmp(argv[file_index], "-") != 0) {
if (!(input_file = fopen(argv[file_index], "r"))) {
- fprintf(stderr, "Error: Unable to open input file %s\n", argv[file_index]);
+ fprintf(stderr, "Error: Unable to open input file %s\n",
+ argv[file_index]);
rc = EXIT_FAILURE;
goto program_exit;
}
@@ -262,7 +210,9 @@ int main(int argc, char **argv) {
}
close_file:
- fclose(input_file);
+ if (input_file != stdin) {
+ fclose(input_file);
+ }
program_exit:
exit(rc);
diff --git a/src/cumsum.c b/src/cumsum.c
index 5648661..268a548 100644
--- a/src/cumsum.c
+++ b/src/cumsum.c
@@ -1,8 +1,7 @@
/*
- *
+ *
*/
-
#include "cumsum.h"
/*
@@ -13,32 +12,32 @@ static bool ARG_FP_INPUT = false;
static bool ARG_UINT_INPUT = false;
static bool ARG_HELP = false;
-static int parse_options(int argc, char*const* argv) {
+static int parse_options(int argc, char *const *argv) {
int arg_index = 0;
int arg;
bool error = false;
while ((arg = getopt(argc, argv, "frhu")) != -1) {
switch (arg) {
- case 'f':
- ARG_FP_INPUT = true;
- break;
- case 'u':
- ARG_UINT_INPUT = true;
- case 'h':
- ARG_HELP = true;
- break;
- case '?':
- if (isprint(optopt)) {
- fprintf(stderr, "Unknown option `-%c`.\n", optopt);
- } else {
- fprintf(stderr, "Unknown option character `\\x%x`.\n", optopt);
- }
- error = true;
- break;
- default:
- error = true;
- break;
+ case 'f':
+ ARG_FP_INPUT = true;
+ break;
+ case 'u':
+ ARG_UINT_INPUT = true;
+ case 'h':
+ ARG_HELP = true;
+ break;
+ case '?':
+ if (isprint(optopt)) {
+ fprintf(stderr, "Unknown option `-%c`.\n", optopt);
+ } else {
+ fprintf(stderr, "Unknown option character `\\x%x`.\n", optopt);
+ }
+ error = true;
+ break;
+ default:
+ error = true;
+ break;
}
}
@@ -54,17 +53,15 @@ static int parse_options(int argc, char*const* argv) {
return arg_index;
}
-static void help() {
- fprintf(stderr, "Usage:\ncumsum [-f|-u] [filename]\n");
-}
+static void help() { fprintf(stderr, "Usage:\ncumsum [-f|-u] [filename]\n"); }
void print_sum(Number sum) {
if (ARG_FP_INPUT) {
fprintf(stdout, "%lf\n", sum.d);
} else if (ARG_UINT_INPUT) {
- fprintf(stdout, "%ld\n", sum.u);
+ fprintf(stdout, "%lld\n", sum.u);
} else {
- fprintf(stdout, "%ld\n", sum.i);
+ fprintf(stdout, "%lld\n", sum.i);
}
}
@@ -79,7 +76,7 @@ static int read_data_fp(FILE *file, Number *num) {
static int read_data_int(FILE *file, Number *num) {
int64_t val;
- while (fscanf(file, "%ld ", &val) != EOF) {
+ while (fscanf(file, "%lld ", &val) != EOF) {
num->i += val;
}
@@ -88,7 +85,7 @@ static int read_data_int(FILE *file, Number *num) {
static int read_data_uint(FILE *file, Number *num) {
uint64_t val;
- while (fscanf(file, "%ld ", &val) != EOF) {
+ while (fscanf(file, "%lld ", &val) != EOF) {
num->u += val;
}
@@ -97,10 +94,10 @@ static int read_data_uint(FILE *file, Number *num) {
static int process_data(FILE *file) {
int rc = 1;
-
+
Number sum = {};
- /* FIXME: this could probably use a type-based macro to collapse the
+ /* FIXME: this could probably use a type-based macro to collapse the
if statements into a single macro call
*/
if (ARG_FP_INPUT) {
@@ -119,7 +116,6 @@ process_data_end:
return rc;
}
-
int main(int argc, char **argv) {
int rc = EXIT_SUCCESS;
int file_index = 0;
@@ -139,7 +135,8 @@ int main(int argc, char **argv) {
FILE *input_file;
if (file_index < argc && strcmp(argv[file_index], "-") != 0) {
if (!(input_file = fopen(argv[file_index], "r"))) {
- fprintf(stderr, "Error: Unable to open input file %s\n", argv[file_index]);
+ fprintf(stderr, "Error: Unable to open input file %s\n",
+ argv[file_index]);
rc = EXIT_FAILURE;
goto program_exit;
}