Fuzzing with associated fixes

author: Douglas B. Rumbaugh <doug@douglasrumbaugh.com> 2026-06-06 13:44:00 -0400
committer: Douglas B. Rumbaugh <doug@douglasrumbaugh.com> 2026-06-06 13:44:00 -0400
commit: c102ab995f9a86a77e40b9a952b2b23c0bd7de74 (patch)
tree: d51b9a8f1a55f7f6e6e5afb89d524b9baa350f45 /tests/fuzz.sh
parent: b56c273d8198ae6cee69bbc9fe5a6a61da4074e4 (diff)
download: bibutils-c102ab995f9a86a77e40b9a952b2b23c0bd7de74.tar.gz
1 files changed, 229 insertions, 0 deletions
diff --git a/tests/fuzz.sh b/tests/fuzz.sh
new file mode 100755
index 0000000..4145d42
--- /dev/null
+++ b/tests/fuzz.sh
@@ -0,0 +1,229 @@
+#!/bin/sh
+# fuzz.sh - throw bogus input at the tools and watch for misbehavior
+#
+# usage: tests/fuzz.sh [iterations] [seed-offset]
+#        (default 100 per generator; the offset lets parallel runs
+#         explore different deterministic mutation seeds)
+#
+# Four generators; the first three feed every entry-consuming tool:
+#   random  - raw bytes from /dev/urandom
+#   mutated - a valid bibtex file with random structural damage
+#             (deleted/duplicated/inserted braces, quotes, @, #, =)
+#   soup    - random streams of bibtex syntax tokens
+#   format  - malformed aux files for bib-extract and refer records
+#             for bib-convert
+#
+# A case fails if a tool hangs (5s timeout), dies with an awk runtime
+# error, exits above 2, or breaks the canonicalization fixed-point
+# property (canon(canon(x)) must equal canon(x)).
+#
+# The mutated and soup inputs are additionally fired at bib-add
+# against a known database, which must afterwards still parse and
+# still contain every original entry (the survival invariant).
+
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+PATH=$ROOT:$PATH
+
+# byte semantics: random bytes are rarely valid UTF-8, and gawk's
+# locale warnings about that are not parser failures
+LC_ALL=C
+export LC_ALL
+N=${1:-100}   # cases per generator
+OFF=${2:-0}   # added to the loop counter to form each awk seed
+tmpd=$(mktemp -d) || exit 1
+# signals must exit (firing the EXIT cleanup), not delete $tmpd and carry on
+trap 'rm -rf "$tmpd"' EXIT; trap 'exit 130' INT; trap 'exit 143' TERM
+
+fails=0; cases=0
+
+# canonicalizing filter, time-limited here: timeout(1) cannot exec a shell function
+canon() {
+  timeout 5 awk -f "$ROOT/lib/bib-parse.awk" -f "$ROOT/lib/bib-canon.awk" \
+      -f "$ROOT/lib/bib-select.awk" -v keys= -v invert=1
+}
+
+# run one tool on one input file; report any sign of misbehavior
+probe() { # probe <case-name> <input> <cmd...>
+  name=$1
+  input=$2
+  shift 2
+  cases=$((cases + 1))
+  timeout 5 "$@" < "$input" > "$tmpd/out" 2> "$tmpd/err"
+  rc=$?
+  if [ "$rc" -eq 124 ]; then # 124 is timeout reporting that the limit was hit
+    fails=$((fails + 1))
+    printf 'HANG  %s: %s\n' "$name" "$*"
+    cp "$input" "$tmpd/hang.$fails" # keep the input; the caller overwrites it
+  elif [ "$rc" -gt 2 ]; then
+    fails=$((fails + 1))
+    printf 'CRASH %s: %s (exit %d)\n' "$name" "$*" "$rc"
+    cp "$input" "$tmpd/crash.$fails"
+  elif grep -Eq 'awk:.*(fatal|error)|[Ss]egmentation' "$tmpd/err"; then
+    fails=$((fails + 1))
+    printf 'AWKERR %s: %s: %s\n' "$name" "$*" "$(head -1 "$tmpd/err")"
+    cp "$input" "$tmpd/awkerr.$fails"
+  fi
+}
+
+# the canonicalization of any input must be a fixed point (canon applies the 5s limit)
+probe_fixedpoint() { # probe_fixedpoint <case-name> <input>
+  cases=$((cases + 1))
+  canon < "$2" > "$tmpd/c1" 2> /dev/null
+  canon < "$tmpd/c1" > "$tmpd/c2" 2> /dev/null
+  if ! cmp -s "$tmpd/c1" "$tmpd/c2"; then
+    fails=$((fails + 1))
+    printf 'NOTFIX %s: canon not idempotent\n' "$1"
+    cp "$2" "$tmpd/notfix.$fails"
+  fi
+}
+
+seed_bib() { # seed_bib > out: the valid bibtex file that mutate damages
+  cat <<'EOF'
+@string{cj = {The Computer Journal}}
+@article{knuth1984literate,
+  author = {Donald E. Knuth},
+  title = {Literate {P}rogramming},
+  journal = cj,
+  year = 1984,
+  pages = "97--111",
+  note = "vol. " # 27,
+}
+@inproceedings{lamport1978time,
+  author = {Leslie Lamport},
+  title = {Time, Clocks, and the Ordering of Events},
+  booktitle = {Communications of the ACM},
+  year = {1978},
+}
+EOF
+}
+
+# apply one to eight random edits to a file: delete a character, insert a
+# structural character, or duplicate a slice (awk rand, seeded per case)
+mutate() { # mutate <seed> < in > out
+  awk -v seed="$1" '
+    BEGIN { srand(seed) }
+    { buf = buf $0 "\n" }
+    END {
+      n = length(buf)
+      chars = "{}\"@#=,()\\%"
+      for (m = 0; m < 1 + int(rand() * 8); m++) {
+        pos = 1 + int(rand() * n)
+        op = int(rand() * 3)
+        c = substr(chars, 1 + int(rand() * length(chars)), 1)
+        if (op == 0)        # delete a character
+          buf = substr(buf, 1, pos - 1) substr(buf, pos + 1)
+        else if (op == 1)   # insert a structural character
+          buf = substr(buf, 1, pos - 1) c substr(buf, pos)
+        else                # duplicate a slice
+          buf = substr(buf, 1, pos) substr(buf, pos, 1 + int(rand() * 20)) substr(buf, pos)
+        n = length(buf)
+      }
+      printf "%s", buf
+    }'
+}
+
+# 200 to 999 tokens drawn at random from a table of 21 bibtex syntax fragments
+soup() { # soup <seed> > out
+  awk -v seed="$1" '
+    BEGIN {
+      srand(seed)
+      n = 0
+      T[++n] = "@";  T[++n] = "{";    T[++n] = "}";  T[++n] = "\""
+      T[++n] = "#";  T[++n] = "=";    T[++n] = ",";  T[++n] = "("
+      T[++n] = ")";  T[++n] = "%";    T[++n] = "\\"; T[++n] = " "
+      T[++n] = "\n"; T[++n] = "word"; T[++n] = "1984"
+      T[++n] = "@article{k,"; T[++n] = "t = {v}";   T[++n] = "@string"
+      T[++n] = "@comment";    T[++n] = " and ";     T[++n] = "--"
+      len = 200 + int(rand() * 800)
+      for (i = 0; i < len; i++)
+        printf "%s", T[1 + int(rand() * n)]
+    }'
+}
+
+run_entry_tools() { # run_entry_tools <case-name> <input>
+  probe "$1" "$2" bib-key
+  probe "$1" "$2" bib-ls -l
+  probe "$1" "$2" bib-check
+  probe "$1" "$2" bib-convert -r
+  probe "$1" "$2" bib-add "$tmpd/scratch.bib" # scratch.bib does not exist beforehand
+  rm -f "$tmpd/scratch.bib" "$tmpd/scratch.bib.bak" # .bak is presumably left by bib-add
+  probe_fixedpoint "$1" "$2" # counts as one more case
+}
+
+# fire input at bib-add (with and without -f) against a known database;
+# afterwards the database must still parse and still contain every
+# original entry
+probe_survival() { # probe_survival <case-name> <input>
+  cases=$((cases + 1)) # both bib-add runs together count as one case
+  cat > "$tmpd/inv.bib" <<'EOF'
+@string{js = {Journal of Survival}}
+@article{orig1990one, author = {A. Original}, title = {One}, year = 1990}
+@article{orig1991two, author = {B. Original}, title = {Two}, journal = js, year = 1991}
+@misc{orig1992three, title = {Three}, note = "v. " # 3}
+EOF
+  timeout 5 bib-add "$tmpd/inv.bib" < "$2" > /dev/null 2>&1 # status ignored
+  timeout 5 bib-add -f "$tmpd/inv.bib" < "$2" > /dev/null 2>&1 # only the database is judged
+  if ! awk -f "$ROOT/lib/bib-parse.awk" -f "$ROOT/lib/bib-lskeys.awk" \
+       "$tmpd/inv.bib" > "$tmpd/invkeys" 2> /dev/null \
+     || ! grep -q '^orig1990one$' "$tmpd/invkeys" \
+     || ! grep -q '^orig1991two$' "$tmpd/invkeys" \
+     || ! grep -q '^orig1992three$' "$tmpd/invkeys"; then
+    fails=$((fails + 1))
+    printf 'WRECK %s: database lost entries or no longer parses\n' "$1"
+    cp "$2" "$tmpd/wreck.$fails"
+  fi
+  rm -f "$tmpd/inv.bib.bak" # presumably a backup written by bib-add
+}
+
+printf '=== random bytes (x%s) ===\n' "$N"
+i=1
+while [ "$i" -le "$N" ]; do
+  head -c 512 /dev/urandom > "$tmpd/in" # unseeded, so OFF plays no part here
+  run_entry_tools "random/$i" "$tmpd/in"
+  i=$((i + 1))
+done
+
+printf '=== mutated bibtex (x%s) ===\n' "$N"
+seed_bib > "$tmpd/seed"
+i=1
+while [ "$i" -le "$N" ]; do
+  mutate "$((i + OFF))" < "$tmpd/seed" > "$tmpd/in"
+  run_entry_tools "mutated/$i" "$tmpd/in"
+  probe_survival "mutated/$i" "$tmpd/in"
+  i=$((i + 1))
+done
+
+printf '=== syntax soup (x%s) ===\n' "$N"
+i=1
+while [ "$i" -le "$N" ]; do
+  soup "$((i + OFF))" > "$tmpd/in"
+  run_entry_tools "soup/$i" "$tmpd/in"
+  probe_survival "soup/$i" "$tmpd/in"
+  i=$((i + 1))
+done
+
+echo "=== malformed aux and refer (x$N) ==="
+printf '@article{k, author={A}, title={T}, year=1}\n' > "$tmpd/db.bib"
+i=0
+while [ "$i" -lt "$N" ]; do
+  i=$((i + 1))
+  printf '\\citation{k}\n\\citation{a,b,c}\n\\abx@aux@cite{0}{k}\n%%A Some One\n%%T Title\n' \
+    | mutate "$((i + OFF))" > "$tmpd/in"
+  cases=$((cases + 1))
+  timeout 5 bib-extract "$tmpd/in" "$tmpd/db.bib" > /dev/null 2> "$tmpd/err"
+  rc=$? # read at once: behind "if ! cmd; then" it is the negation's status, always 0
+  if [ "$rc" -gt 2 ]; then # also catches 124, a timeout
+    fails=$((fails + 1))
+    printf 'CRASH aux/%d: bib-extract (exit %d)\n' "$i" "$rc"
+    cp "$tmpd/in" "$tmpd/crash.$fails" # keep the input, as probe does
+  fi
+  probe "ref/$i" "$tmpd/in" bib-convert -b
+done
+
+# final tally; a clean run just ends the line and lets the EXIT trap clean up
+printf '\n%d cases, %d failures' "$cases" "$fails"
+[ "$fails" -gt 0 ] || { printf '\n'; exit; }
+# failures: cancel the cleanup trap so the saved inputs survive the exit
+printf ' (failing inputs preserved in %s)\n' "$tmpd"
+trap - EXIT
+exit 1
author	Douglas B. Rumbaugh <doug@douglasrumbaugh.com>	2026-06-06 13:44:00 -0400
committer	Douglas B. Rumbaugh <doug@douglasrumbaugh.com>	2026-06-06 13:44:00 -0400
commit	c102ab995f9a86a77e40b9a952b2b23c0bd7de74 (patch)
tree	d51b9a8f1a55f7f6e6e5afb89d524b9baa350f45 /tests/fuzz.sh
parent	b56c273d8198ae6cee69bbc9fe5a6a61da4074e4 (diff)
download	bibutils-c102ab995f9a86a77e40b9a952b2b23c0bd7de74.tar.gz