diff options
| author | Douglas B. Rumbaugh <doug@douglasrumbaugh.com> | 2026-06-06 13:44:00 -0400 |
|---|---|---|
| committer | Douglas B. Rumbaugh <doug@douglasrumbaugh.com> | 2026-06-06 13:44:00 -0400 |
| commit | c102ab995f9a86a77e40b9a952b2b23c0bd7de74 (patch) | |
| tree | d51b9a8f1a55f7f6e6e5afb89d524b9baa350f45 /tests/fuzz.sh | |
| parent | b56c273d8198ae6cee69bbc9fe5a6a61da4074e4 (diff) | |
| download | bibutils-c102ab995f9a86a77e40b9a952b2b23c0bd7de74.tar.gz | |
Fuzzing with associated fixes
Diffstat (limited to 'tests/fuzz.sh')
| -rwxr-xr-x | tests/fuzz.sh | 229 |
1 files changed, 229 insertions, 0 deletions
#!/bin/sh
# fuzz.sh - throw bogus input at the tools and watch for misbehavior
#
# usage: tests/fuzz.sh [iterations] [seed-offset]
#        (default 100 per generator; the offset lets parallel runs
#        explore different deterministic mutation seeds)
#
# Four generators feed every entry-consuming tool:
#   random  - raw bytes from /dev/urandom
#   mutated - a valid bibtex file with random structural damage
#             (deleted/duplicated/inserted braces, quotes, @, #, =)
#   soup    - random streams of bibtex syntax tokens
#   format  - malformed aux files for bib-extract and refer records
#             for bib-convert
#
# A case fails if a tool hangs (5s timeout), dies with an awk runtime
# error, exits above 2, or breaks the canonicalization fixed-point
# property (canon(canon(x)) must equal canon(x)).
#
# The mutated and soup inputs are additionally fired at bib-add
# against a known database, which must afterwards still parse and
# still contain every original entry (the survival invariant).
#
# Exit status: 0 when every case passed, 1 when at least one case
# failed (the failing inputs are then left behind in the temporary
# directory), 2 on a usage error or a missing prerequisite.

ROOT=$(cd "$(dirname "$0")/.." && pwd)
PATH=$ROOT:$PATH

# byte semantics: random bytes are rarely valid UTF-8, and gawk's
# locale warnings about that are not parser failures
LC_ALL=C
export LC_ALL

N=${1:-100}
OFF=${2:-0}

# Both arguments end up in numeric tests and arithmetic expansions, so
# reject anything that is not a plain integer before starting.
case $N in
  '' | *[!0-9]*)
    printf 'usage: %s [iterations] [seed-offset]\n' "$0" >&2
    exit 2
    ;;
esac
case ${OFF#-} in
  '' | *[!0-9]*)
    printf 'usage: %s [iterations] [seed-offset]\n' "$0" >&2
    exit 2
    ;;
esac

# Without timeout(1) every probe would exit 127 and be misreported as
# a crash of the tool under test.
if ! command -v timeout > /dev/null 2>&1; then
  printf '%s: timeout(1) is required\n' "$0" >&2
  exit 2
fi

tmpd=$(mktemp -d) || exit 1

# Clean up on every exit path. The signal traps only turn the signal
# into an exit (which then runs the EXIT trap); a trap action that did
# not exit would let the loops carry on against a deleted directory.
trap 'rm -rf "$tmpd"' EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

fails=0
cases=0

# canonicalizing filter (used for the fixed-point property)
# Reads bibtex on stdin, writes the canonical form to stdout.
# The 5s limit is applied here, around awk itself: timeout(1) can only
# run external programs, so it cannot wrap this shell function from
# the outside. Exits 124 when the limit is hit.
canon() {
  timeout 5 awk -f "$ROOT/lib/bib-parse.awk" -f "$ROOT/lib/bib-canon.awk" \
    -f "$ROOT/lib/bib-select.awk" -v keys= -v invert=1
}

# run one tool on one input file; report any sign of misbehavior
# The command gets <input> on stdin. A timeout (124), an exit status
# above 2, or an awk fatal/error/segfault message on stderr counts as
# a failure; the offending input is copied into $tmpd for inspection.
probe() { # probe <case-name> <input> <cmd...>
  name=$1
  input=$2
  shift 2
  cases=$((cases + 1))
  timeout 5 "$@" < "$input" > "$tmpd/out" 2> "$tmpd/err"
  rc=$?
  if [ "$rc" -eq 124 ]; then
    fails=$((fails + 1))
    printf 'HANG %s: %s\n' "$name" "$*"
    cp "$input" "$tmpd/hang.$fails"
  elif [ "$rc" -gt 2 ]; then
    fails=$((fails + 1))
    printf 'CRASH %s: %s (exit %d)\n' "$name" "$*" "$rc"
    cp "$input" "$tmpd/crash.$fails"
  elif grep -Eq 'awk:.*(fatal|error)|[Ss]egmentation' "$tmpd/err"; then
    fails=$((fails + 1))
    printf 'AWKERR %s: %s: %s\n' "$name" "$*" "$(head -1 "$tmpd/err")"
    cp "$input" "$tmpd/awkerr.$fails"
  fi
}

# the canonicalization of any input must be a fixed point
# Canonicalizes <input>, canonicalizes that result again, and fails the
# case when the two outputs differ or when either pass times out.
probe_fixedpoint() { # probe_fixedpoint <case-name> <input>
  cases=$((cases + 1))
  canon < "$2" > "$tmpd/c1" 2> /dev/null
  rc1=$?
  canon < "$tmpd/c1" > "$tmpd/c2" 2> /dev/null
  rc2=$?
  if [ "$rc1" -eq 124 ] || [ "$rc2" -eq 124 ]; then
    fails=$((fails + 1))
    printf 'HANG %s: canon\n' "$1"
    cp "$2" "$tmpd/hang.$fails"
  elif ! cmp -s "$tmpd/c1" "$tmpd/c2"; then
    fails=$((fails + 1))
    printf 'NOTFIX %s: canon not idempotent\n' "$1"
    cp "$2" "$tmpd/notfix.$fails"
  fi
}

# the valid bibtex file that the mutated generator damages
seed_bib() {
  cat <<'EOF'
@string{cj = {The Computer Journal}}
@article{knuth1984literate,
  author = {Donald E. Knuth},
  title = {Literate {P}rogramming},
  journal = cj,
  year = 1984,
  pages = "97--111",
  note = "vol. " # 27,
}
@inproceedings{lamport1978time,
  author = {Leslie Lamport},
  title = {Time, Clocks, and the Ordering of Events},
  booktitle = {Communications of the ACM},
  year = {1978},
}
EOF
}

# damage a file at a random spot: delete, duplicate, or insert a
# structural character (awk does the randomness; seeded per case)
# The loop bound is re-drawn on every iteration; that is part of the
# deterministic sequence a given seed produces, so leave it alone.
mutate() { # mutate <seed> < in > out
  awk -v seed="$1" '
    BEGIN { srand(seed) }
    { buf = buf $0 "\n" }
    END {
      n = length(buf)
      chars = "{}\"@#=,()\\%"
      for (m = 0; m < 1 + int(rand() * 8); m++) {
        pos = 1 + int(rand() * n)
        op = int(rand() * 3)
        c = substr(chars, 1 + int(rand() * length(chars)), 1)
        if (op == 0) # delete a character
          buf = substr(buf, 1, pos - 1) substr(buf, pos + 1)
        else if (op == 1) # insert a structural character
          buf = substr(buf, 1, pos - 1) c substr(buf, pos)
        else # duplicate a slice
          buf = substr(buf, 1, pos) substr(buf, pos, 1 + int(rand() * 20)) substr(buf, pos)
        n = length(buf)
      }
      printf "%s", buf
    }'
}

# a stream of plausible bibtex syntax fragments in random order
# Emits between 200 and 999 tokens drawn uniformly from the table T.
soup() { # soup <seed> > out
  awk -v seed="$1" '
    BEGIN {
      srand(seed)
      n = 0
      T[++n] = "@"; T[++n] = "{"; T[++n] = "}"; T[++n] = "\""
      T[++n] = "#"; T[++n] = "="; T[++n] = ","; T[++n] = "("
      T[++n] = ")"; T[++n] = "%"; T[++n] = "\\"; T[++n] = " "
      T[++n] = "\n"; T[++n] = "word"; T[++n] = "1984"
      T[++n] = "@article{k,"; T[++n] = "t = {v}"; T[++n] = "@string"
      T[++n] = "@comment"; T[++n] = " and "; T[++n] = "--"
      len = 200 + int(rand() * 800)
      for (i = 0; i < len; i++)
        printf "%s", T[1 + int(rand() * n)]
    }'
}

# feed one input to every tool that consumes entries on stdin, then
# check the fixed-point property on the same input
run_entry_tools() { # run_entry_tools <case-name> <input>
  probe "$1" "$2" bib-key
  probe "$1" "$2" bib-ls -l
  probe "$1" "$2" bib-check
  probe "$1" "$2" bib-convert -r
  probe "$1" "$2" bib-add "$tmpd/scratch.bib"
  # start the next case from a clean slate
  rm -f "$tmpd/scratch.bib" "$tmpd/scratch.bib.bak"
  probe_fixedpoint "$1" "$2"
}

# fire input at bib-add (with and without -f) against a known database;
# afterwards the database must still parse and still contain every
# original entry
# The parse is time-limited as well, so a database that sends the
# parser into a loop is reported as WRECK instead of stalling the run.
probe_survival() { # probe_survival <case-name> <input>
  cases=$((cases + 1))
  cat > "$tmpd/inv.bib" <<'EOF'
@string{js = {Journal of Survival}}
@article{orig1990one, author = {A. Original}, title = {One}, year = 1990}
@article{orig1991two, author = {B. Original}, title = {Two}, journal = js, year = 1991}
@misc{orig1992three, title = {Three}, note = "v. " # 3}
EOF
  timeout 5 bib-add "$tmpd/inv.bib" < "$2" > /dev/null 2>&1
  timeout 5 bib-add -f "$tmpd/inv.bib" < "$2" > /dev/null 2>&1
  if ! timeout 5 awk -f "$ROOT/lib/bib-parse.awk" -f "$ROOT/lib/bib-lskeys.awk" \
    "$tmpd/inv.bib" > "$tmpd/invkeys" 2> /dev/null \
    || ! grep -q '^orig1990one$' "$tmpd/invkeys" \
    || ! grep -q '^orig1991two$' "$tmpd/invkeys" \
    || ! grep -q '^orig1992three$' "$tmpd/invkeys"; then
    fails=$((fails + 1))
    printf 'WRECK %s: database lost entries or no longer parses\n' "$1"
    cp "$2" "$tmpd/wreck.$fails"
  fi
  rm -f "$tmpd/inv.bib.bak"
}

echo "=== random bytes (x$N) ==="
i=0
while [ "$i" -lt "$N" ]; do
  i=$((i + 1))
  head -c 512 /dev/urandom > "$tmpd/in"
  run_entry_tools "random/$i" "$tmpd/in"
done

echo "=== mutated bibtex (x$N) ==="
seed_bib > "$tmpd/seed"
i=0
while [ "$i" -lt "$N" ]; do
  i=$((i + 1))
  mutate "$((i + OFF))" < "$tmpd/seed" > "$tmpd/in"
  run_entry_tools "mutated/$i" "$tmpd/in"
  probe_survival "mutated/$i" "$tmpd/in"
done

echo "=== syntax soup (x$N) ==="
i=0
while [ "$i" -lt "$N" ]; do
  i=$((i + 1))
  soup "$((i + OFF))" > "$tmpd/in"
  run_entry_tools "soup/$i" "$tmpd/in"
  probe_survival "soup/$i" "$tmpd/in"
done

echo "=== malformed aux and refer (x$N) ==="
printf '@article{k, author={A}, title={T}, year=1}\n' > "$tmpd/db.bib"
i=0
while [ "$i" -lt "$N" ]; do
  i=$((i + 1))
  printf '\\citation{k}\n\\citation{a,b,c}\n\\abx@aux@cite{0}{k}\n%%A Some One\n%%T Title\n' \
    | mutate "$((i + OFF))" > "$tmpd/in"
  cases=$((cases + 1))
  # bib-extract takes its input as file arguments, so it is run here
  # rather than through probe. The status is read straight after the
  # command: inside "if ! cmd; then" $? is the status of the negation
  # (always 0), which would hide every crash.
  timeout 5 bib-extract "$tmpd/in" "$tmpd/db.bib" > /dev/null 2> "$tmpd/err"
  rc=$?
  if [ "$rc" -eq 124 ]; then
    fails=$((fails + 1))
    printf 'HANG aux/%d: bib-extract\n' "$i"
    cp "$tmpd/in" "$tmpd/hang.$fails"
  elif [ "$rc" -gt 2 ]; then
    fails=$((fails + 1))
    printf 'CRASH aux/%d: bib-extract (exit %d)\n' "$i" "$rc"
    cp "$tmpd/in" "$tmpd/crash.$fails"
  fi
  probe "ref/$i" "$tmpd/in" bib-convert -b
done

printf '\n%d cases, %d failures' "$cases" "$fails"
if [ "$fails" -gt 0 ]; then
  printf ' (failing inputs preserved in %s)\n' "$tmpd"
  # keep the evidence: drop the cleanup trap before exiting
  trap - EXIT
  exit 1
fi
printf '\n'