aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Makefile9
-rw-r--r--README.md8
-rwxr-xr-xbib-add121
-rwxr-xr-xbib-extract13
-rw-r--r--lib/bib-key.awk4
-rw-r--r--lib/bib-parse.awk5
-rw-r--r--lib/bib-select.awk30
-rw-r--r--lib/bib-strip.awk49
-rw-r--r--man/bib-add.133
-rwxr-xr-xtests/fuzz.sh229
-rwxr-xr-xtests/run-tests.sh57
11 files changed, 506 insertions, 52 deletions
diff --git a/Makefile b/Makefile
index a7df72c..1c48121 100644
--- a/Makefile
+++ b/Makefile
@@ -6,8 +6,8 @@ MANDIR = $(PREFIX)/share/man/man1
SCRIPTS = bib-util bib-add bib-check bib-convert bib-extract bib-fetch \
bib-gen bib-key bib-ls
LIBS = lib/bib-parse.awk lib/bib-canon.awk lib/bib-select.awk \
- lib/bib-lskeys.awk lib/bib-key.awk lib/bib-ls.awk \
- lib/bib-check.awk lib/bib2ref.awk lib/ref2bib.awk
+ lib/bib-strip.awk lib/bib-lskeys.awk lib/bib-key.awk \
+ lib/bib-ls.awk lib/bib-check.awk lib/bib2ref.awk lib/ref2bib.awk
MANPAGES = man/bib-util.1 man/bib-add.1 man/bib-check.1 man/bib-convert.1 \
man/bib-extract.1 man/bib-fetch.1 man/bib-gen.1 man/bib-key.1 \
man/bib-ls.1
@@ -19,6 +19,9 @@ test:
tests/run-tests.sh
tests/integration.sh
+fuzz:
+ tests/fuzz.sh
+
install:
-mkdir -p $(BINDIR) $(LIBDIR) $(MANDIR)
cp $(SCRIPTS) $(BINDIR)
@@ -31,4 +34,4 @@ uninstall:
bib-extract.1 bib-fetch.1 bib-gen.1 bib-key.1 bib-ls.1
rm -rf $(LIBDIR)
-.PHONY: all test install uninstall
+.PHONY: all test fuzz install uninstall
diff --git a/README.md b/README.md
index 11cda7f..434a98a 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,10 @@ smaller provided scripts for convenience's sake.
## bib-add
A script for inserting a new entry into a bibtex database file. It will
accept a fully formatted entry on standard input and add it to a database
-file presented as an argument.
+file presented as an argument. The new version is built in a temporary
+file and verified before the database is touched; only then is it
+written over the original, with the previous contents saved in
+db.bib.bak. Replacing an entry (-f) preserves every other byte of the file.
## bib-gen
A script which generates a bibtex entry based on input. By default it will
@@ -72,4 +75,5 @@ POSIX shell and awk only, with two exceptions: bib-fetch requires curl,
plus pdftotext (poppler) for DOI extraction from pdfs.
# Tests
- make test
+ make test # unit + integration suites
+ make fuzz # robustness fuzzing against bogus input
diff --git a/bib-add b/bib-add
index 02a079e..8e3846c 100755
--- a/bib-add
+++ b/bib-add
@@ -3,12 +3,23 @@
#
# usage: bib-add [-f] db.bib < entry
# -f replace existing entries with the same key
+#
+# The database is not touched until its replacement is known to be good:
+# the complete new version is built in a temporary file, verified by
+# re-parsing, and only then written over the original, with the previous
+# contents saved in db.bib.bak. Replacement with -f splices entries out by
+# their exact source spans, so the rest of the file is preserved byte-for-byte.
usage() {
printf 'usage: bib-add [-f] db.bib < entry\n' >&2
exit 2
}
+die() {
+ printf 'bib-add: %s\n' "$1" >&2
+ exit 1
+}
+
if [ -n "$BIBUTILS_LIB" ]; then
LIB=$BIBUTILS_LIB
elif [ -d "$(dirname "$0")/lib" ]; then
@@ -17,6 +28,10 @@ else
LIB=/usr/local/share/bibutils
fi
+lskeys() {
+ awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$@"
+}
+
force=0
while getopts f opt; do
case $opt in
@@ -27,41 +42,95 @@ done
shift $((OPTIND - 1))
[ $# -eq 1 ] || usage
db=$1
+[ -e "$db" ] && [ ! -f "$db" ] && die "$db is not a regular file"
-tmp=$(mktemp) && tmpkeys=$(mktemp) && tmpdb=$(mktemp) || exit 1
-trap 'rm -f "$tmp" "$tmpkeys" "$tmpdb"' EXIT INT TERM
+# serialize writers: set -C (noclobber) makes creating db.lock with
+# our pid inside a single atomic step, so whoever creates it owns the
+# database until they remove it; a lock whose owner has died is reaped
+lock=$db.lock
+tries=0
+while ! (set -C; echo $$ > "$lock") 2> /dev/null; do
+ owner=$(cat "$lock" 2> /dev/null)
+ if [ -n "$owner" ] && ! kill -0 "$owner" 2> /dev/null; then
+ # reap, but only if it is still that dead process's lock
+ printf 'bib-add: reaping stale lock from dead pid %s\n' "$owner" >&2
+ [ "$(cat "$lock" 2> /dev/null)" = "$owner" ] && rm -f "$lock"
+ continue
+ fi
+ tries=$((tries + 1))
+ [ "$tries" -ge 30 ] && die "$db is locked by pid ${owner:-unknown} (remove $lock if wrong)"
+ sleep 1
+done
-# canonicalize the incoming entries
+# release only a lock that is still ours
+unlock() {
+ [ "$(cat "$lock" 2> /dev/null)" = "$$" ] && rm -f "$lock"
+}
+
+tmp=$(mktemp) && tmpkeys=$(mktemp) || { rm -f "$tmp"; unlock; exit 1; }
+trap 'rm -f "$tmp" "$tmp.old" "$tmp.dups" "$tmpkeys" "$new"; unlock' EXIT; trap 'exit 130' INT; trap 'exit 143' TERM # signals exit; EXIT cleans up
+
+# canonicalize and validate the incoming entries
awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" -f "$LIB/bib-select.awk" \
-v keys= -v invert=1 > "$tmp"
+[ -s "$tmp" ] || die "no entries on stdin"
-if [ ! -s "$tmp" ]; then
- printf 'bib-add: no entries on stdin\n' >&2
- exit 1
+lskeys "$tmp" > "$tmpkeys"
+grep -q '^$' "$tmpkeys" && die "refusing to add an entry with an empty key"
+indups=$(sort "$tmpkeys" | uniq -d)
+[ -n "$indups" ] && die "duplicate keys within input: $indups"
+
+# check the incoming keys against the database
+dups=
+oldcount=0
+if [ -s "$db" ]; then
+ lskeys "$db" > "$tmp.old" || die "cannot parse $db"
+ oldcount=$(wc -l < "$tmp.old")
+ dups=$(grep -Fxf "$tmpkeys" "$tmp.old")
+ rm -f "$tmp.old"
+ if [ -n "$dups" ] && [ "$force" -ne 1 ]; then
+ printf 'bib-add: duplicate keys in %s (use -f to replace):\n' "$db" >&2
+ printf '%s\n' "$dups" >&2
+ exit 1
+ fi
fi
-awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$tmp" > "$tmpkeys"
+# build the complete new database next to the original (same
+# filesystem, so the final move cannot be interrupted halfway)
+new=$(mktemp "$db.XXXXXX") || exit 1
-if [ -f "$db" ]; then
- dups=$(awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$db" \
- | grep -Fxf "$tmpkeys")
+if [ -s "$db" ]; then
if [ -n "$dups" ]; then
- if [ "$force" -eq 1 ]; then
- # rewrite the database without the entries being replaced
- keys=$(printf '%s\n' "$dups" | paste -sd, -)
- awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
- -f "$LIB/bib-select.awk" -v keys="$keys" -v invert=1 \
- "$db" > "$tmpdb" || exit 1
- cp "$tmpdb" "$db"
- else
- printf 'bib-add: duplicate keys in %s:\n' "$db" >&2
- printf '%s\n' "$dups" >&2
- exit 1
- fi
+ # splice out the entries being replaced; all other bytes survive
+ printf '%s\n' "$dups" > "$tmp.dups"
+ awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-strip.awk" \
+ -v keyfile="$tmp.dups" "$db" > "$new" || die "failed to rewrite $db"
+ rm -f "$tmp.dups"
+ else
+ cat "$db" > "$new" || die "failed to copy $db"
fi
+ # ensure exactly one blank line before the appended entries
+ [ -n "$(tail -c 1 "$new")" ] && echo >> "$new"
+ echo >> "$new"
fi
+cat "$tmp" >> "$new"
+
+# verify the result before touching the original: every old key minus
+# the replaced ones, plus every new key, must parse back out
+ndups=$(printf '%s' "$dups" | grep -c '^' || true)
+nnew=$(wc -l < "$tmpkeys")
+expect=$((oldcount - ndups + nnew))
+actual=$(lskeys "$new" | wc -l)
+[ "$actual" -eq "$expect" ] || \
+ die "verification failed ($actual entries, expected $expect); $db left untouched"
-{
- [ -s "$db" ] && echo ""
- cat "$tmp"
-} >> "$db"
+if [ -s "$db" ]; then
+ # back up first, then write through the original name so that its
+ # permissions, ownership and any symlink are preserved
+ cp "$db" "$db.bak" || die "cannot write backup $db.bak; $db left untouched"
+ cat "$new" > "$db" || die "write to $db failed; original is in $db.bak"
+ rm -f "$new"
+else
+ mv "$new" "$db" || die "cannot write $db"
+ chmod 644 "$db" 2> /dev/null
+fi
diff --git a/bib-extract b/bib-extract
index 297588a..ac0363d 100755
--- a/bib-extract
+++ b/bib-extract
@@ -23,7 +23,10 @@ aux=$1
shift
[ -r "$aux" ] || { printf 'bib-extract: cannot read %s\n' "$aux" >&2; exit 1; }
-keys=$(awk '
+keyfile=$(mktemp) || exit 1
+trap 'rm -f "$keyfile"' EXIT; trap 'exit 130' INT; trap 'exit 143' TERM # a signal must end the run, not fall through to exit 0
+
+awk '
# classic bibtex: \citation{key,key,...}
{
line = $0
@@ -46,10 +49,10 @@ keys=$(awk '
print s
line = substr(line, RSTART + RLENGTH)
}
- }' "$aux" | sort -u | paste -sd, -)
+ }' "$aux" | sort -u > "$keyfile"
-[ -n "$keys" ] || exit 0
+[ -s "$keyfile" ] || exit 0
# a key of "*" (from \nocite{*}) selects the whole database
-exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
- -f "$LIB/bib-select.awk" -v keys="$keys" -v invert=0 "$@"
+awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+ -f "$LIB/bib-select.awk" -v keyfile="$keyfile" -v invert=0 "$@"
diff --git a/lib/bib-key.awk b/lib/bib-key.awk
index 4223155..3f4117f 100644
--- a/lib/bib-key.awk
+++ b/lib/bib-key.awk
@@ -38,7 +38,7 @@ function bib_mkkey( a, y, t, surname, word, n, parts, i, w) {
surname = (n > 0) ? parts[n] : ""
}
gsub(/[^A-Za-z0-9]/, "", surname)
- surname = tolower(surname)
+ surname = tolower(substr(surname, 1, 30))
if (surname == "")
surname = "anon"
@@ -62,5 +62,5 @@ function bib_mkkey( a, y, t, surname, word, n, parts, i, w) {
break
}
- return surname y word
+ return surname y substr(word, 1, 30)
}
diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk
index e5bf9fa..e83cb07 100644
--- a/lib/bib-parse.awk
+++ b/lib/bib-parse.awk
@@ -4,7 +4,8 @@
# bib_entry(type, key) - called once per regular entry. The fields are
# available in BIB_N, BIB_NAME[], BIB_VAL[] and
# BIB_KIND[]; the raw source text of the entry
-# is in BIB_RAW.
+# is in BIB_RAW, and its position in the input
+# buffer bib_buf is BIB_START..BIB_END-1.
# bib_pass(raw) - called for @string and @preamble blocks with
# their raw source text.
#
@@ -211,6 +212,8 @@ function bib_entry_at(s, i, at, type, opener, closer, key, name, c) {
BIB_KIND[BIB_N] = BIB_VKIND
}
BIB_RAW = bib_trim(substr(s, at, i - at))
+ BIB_START = at
+ BIB_END = i
bib_entry(type, key)
return i
}
diff --git a/lib/bib-select.awk b/lib/bib-select.awk
index 9aa5a37..3ebd16f 100644
--- a/lib/bib-select.awk
+++ b/lib/bib-select.awk
@@ -1,21 +1,31 @@
# bib-select.awk - emit entries selected by key, canonically
#
# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
-# keys - comma-separated list of entry keys; a key of "*" selects
-# every entry (as produced by \nocite{*})
-# invert - 0: emit entries whose key is in the list
-# 1: emit entries whose key is NOT in the list
+# keys - comma-separated list of entry keys; a key of "*" selects
+# every entry (as produced by \nocite{*})
+# keyfile - file with one key per line, for key lists too large to
+# pass on the command line; merged with keys
+# invert - 0: emit entries whose key is in the list
+# 1: emit entries whose key is NOT in the list
#
-# With keys="" and invert=1 this acts as a canonicalizing filter for
+# With no keys and invert=1 this acts as a canonicalizing filter for
# everything. @string and @preamble blocks always pass through.
+function bib_sel_add(k) {
+ if (k == "*")
+ BIB_SEL_ALL = 1
+ else
+ BIB_SEL[k] = 1
+}
+
BEGIN {
bib_sel_n = split(keys, bib_sel_k, ",")
- for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++) {
- if (bib_sel_k[bib_sel_i] == "*")
- BIB_SEL_ALL = 1
- else
- BIB_SEL[bib_sel_k[bib_sel_i]] = 1
+ for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++)
+ bib_sel_add(bib_sel_k[bib_sel_i])
+ if (keyfile != "") {
+ while ((getline bib_sel_line < keyfile) > 0)
+ bib_sel_add(bib_sel_line)
+ close(keyfile)
}
}
diff --git a/lib/bib-strip.awk b/lib/bib-strip.awk
new file mode 100644
index 0000000..cecca3e
--- /dev/null
+++ b/lib/bib-strip.awk
@@ -0,0 +1,49 @@
+# bib-strip.awk - remove entries by key, preserving all other bytes
+#
+# Requires bib-parse.awk. Variables (set with -v):
+# keys - comma-separated list of entry keys to remove
+# keyfile - file with one key per line, for key lists too large to
+# pass on the command line; merged with keys
+#
+# Unlike bib-select.awk, which re-emits entries canonically, this
+# splices the matched entries' source spans out of the input and
+# leaves everything else - comments, formatting, @string blocks -
+# byte-for-byte intact. Used by bib-add -f so that replacing one
+# entry never rewrites the rest of the database.
+#
+# This END block runs after bib-parse.awk's (END blocks execute in
+# the order their files are given to awk), so the spans recorded by
+# the hooks below are complete by the time output happens.
+
+BEGIN {
+ bib_strip_n = split(keys, bib_strip_k, ",")
+ for (bib_strip_i = 1; bib_strip_i <= bib_strip_n; bib_strip_i++)
+ BIB_DROP[bib_strip_k[bib_strip_i]] = 1
+ if (keyfile != "") {
+ while ((getline bib_strip_line < keyfile) > 0)
+ BIB_DROP[bib_strip_line] = 1
+ close(keyfile)
+ }
+}
+
+function bib_pass(raw) { }
+
+function bib_entry(type, key) {
+ if (key in BIB_DROP) {
+ BIB_NSPAN++
+ BIB_SPAN_S[BIB_NSPAN] = BIB_START
+ BIB_SPAN_E[BIB_NSPAN] = BIB_END
+ }
+}
+
+END {
+ bib_strip_len = length(bib_buf); bib_strip_pos = 1
+ for (bib_strip_i = 1; bib_strip_i <= BIB_NSPAN; bib_strip_i++) {
+ printf "%s", substr(bib_buf, bib_strip_pos, BIB_SPAN_S[bib_strip_i] - bib_strip_pos)
+ bib_strip_pos = BIB_SPAN_E[bib_strip_i]
+ # skip the run of blanks and newlines right after the span just dropped
+ while (bib_strip_pos <= bib_strip_len && substr(bib_buf, bib_strip_pos, 1) ~ /[ \t\r\n]/)
+ bib_strip_pos++
+ }
+ printf "%s", substr(bib_buf, bib_strip_pos)
+}
diff --git a/man/bib-add.1 b/man/bib-add.1
index 5c7a674..32582c5 100644
--- a/man/bib-add.1
+++ b/man/bib-add.1
@@ -21,14 +21,41 @@ If an incoming entry's key already exists in the database, the entry is
rejected and the duplicate keys are reported on standard error, unless
.B \-f
is given.
+Input with an empty key, or with the same key appearing twice, is
+always rejected.
+.SH SAFETY
+The database is not touched until its replacement is known to be good.
+The complete new version is built in a temporary file alongside the
+original, verified by re-parsing it and checking that exactly the
+expected entries are present, and only then written over the database
+\(em with the previous contents first saved in
+.IB db.bib .bak\fR.
+If anything fails before that final write, the original file is left untouched; if the write itself fails, the previous contents remain in the backup.
+.PP
+Concurrent invocations are serialized through a lock file,
+.IB db.bib .lock\fR,
+created atomically with the owner's pid inside.
+A waiter retries for 30 seconds before giving up with an error;
+a lock whose owning process has died is reaped automatically.
.SH OPTIONS
.TP
.B \-f
Replace existing entries that share a key with an incoming entry.
-The database is rewritten canonically in the process.
+The replaced entries are spliced out by their exact source spans, so
+comments and the formatting of every other entry are preserved
+byte-for-byte.
.SH EXIT STATUS
-0 on success, 1 if no entries were read or a duplicate key was
-rejected, 2 on usage error.
+0 on success, 1 if the input was rejected or the database could not
+be safely rewritten, 2 on usage error.
+.SH FILES
+.TP
+.IB db.bib .bak
+The previous contents of the database, written before each
+modification.
+.TP
+.IB db.bib .lock
+Write lock held for the duration of an invocation; contains the
+owner's pid.
.SH ENVIRONMENT
.TP
.B BIBUTILS_LIB
diff --git a/tests/fuzz.sh b/tests/fuzz.sh
new file mode 100755
index 0000000..4145d42
--- /dev/null
+++ b/tests/fuzz.sh
@@ -0,0 +1,229 @@
+#!/bin/sh
+# fuzz.sh - throw bogus input at the tools and watch for misbehavior
+#
+# usage: tests/fuzz.sh [iterations] [seed-offset]
+# (default 100 per generator; the offset lets parallel runs
+# explore different deterministic mutation seeds)
+#
+# Four generators feed every entry-consuming tool:
+# random - raw bytes from /dev/urandom
+# mutated - a valid bibtex file with random structural damage
+# (deleted/duplicated/inserted braces, quotes, @, #, =)
+# soup - random streams of bibtex syntax tokens
+# format - malformed aux files for bib-extract and refer records
+# for bib-convert
+#
+# A case fails if a tool hangs (5s timeout), dies with an awk runtime
+# error, exits above 2, or breaks the canonicalization fixed-point
+# property (canon(canon(x)) must equal canon(x)).
+#
+# The mutated and soup inputs are additionally fired at bib-add
+# against a known database, which must afterwards still parse and
+# still contain every original entry (the survival invariant).
+
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+PATH=$ROOT:$PATH
+
+# byte semantics: random bytes are rarely valid UTF-8, and gawk's
+# locale warnings about that are not parser failures
+LC_ALL=C
+export LC_ALL
+N=${1:-100}
+OFF=${2:-0}
+tmpd=$(mktemp -d) || exit 1
+trap 'rm -rf "$tmpd"' EXIT; trap 'exit 130' INT; trap 'exit 143' TERM # a signal must stop the run, not just wipe tmpd
+
+fails=0
+cases=0
+
+# canonicalizing filter (used for the fixed-point property)
+canon() {
+ awk -f "$ROOT/lib/bib-parse.awk" -f "$ROOT/lib/bib-canon.awk" \
+ -f "$ROOT/lib/bib-select.awk" -v keys= -v invert=1
+}
+
+# run one tool on one input file; report any sign of misbehavior
+probe() { # probe <case-name> <input> <cmd...>
+ name=$1
+ input=$2
+ shift 2
+ cases=$((cases + 1))
+ timeout 5 "$@" < "$input" > "$tmpd/out" 2> "$tmpd/err"
+ rc=$?
+ if [ "$rc" -eq 124 ]; then
+ fails=$((fails + 1))
+ printf 'HANG %s: %s\n' "$name" "$*"
+ cp "$input" "$tmpd/hang.$fails"
+ elif [ "$rc" -gt 2 ]; then
+ fails=$((fails + 1))
+ printf 'CRASH %s: %s (exit %d)\n' "$name" "$*" "$rc"
+ cp "$input" "$tmpd/crash.$fails"
+ elif grep -Eq 'awk:.*(fatal|error)|[Ss]egmentation' "$tmpd/err"; then
+ fails=$((fails + 1))
+ printf 'AWKERR %s: %s: %s\n' "$name" "$*" "$(head -1 "$tmpd/err")"
+ cp "$input" "$tmpd/awkerr.$fails"
+ fi
+}
+
+# the canonicalization of any input must be a fixed point
+probe_fixedpoint() { # probe_fixedpoint <case-name> <input>
+ cases=$((cases + 1))
+ timeout 5 canon < "$2" > "$tmpd/c1" 2> /dev/null
+ timeout 5 canon < "$tmpd/c1" > "$tmpd/c2" 2> /dev/null
+ if ! cmp -s "$tmpd/c1" "$tmpd/c2"; then
+ fails=$((fails + 1))
+ printf 'NOTFIX %s: canon not idempotent\n' "$1"
+ cp "$2" "$tmpd/notfix.$fails"
+ fi
+}
+
+seed_bib() {
+ cat <<'EOF'
+@string{cj = {The Computer Journal}}
+@article{knuth1984literate,
+ author = {Donald E. Knuth},
+ title = {Literate {P}rogramming},
+ journal = cj,
+ year = 1984,
+ pages = "97--111",
+ note = "vol. " # 27,
+}
+@inproceedings{lamport1978time,
+ author = {Leslie Lamport},
+ title = {Time, Clocks, and the Ordering of Events},
+ booktitle = {Communications of the ACM},
+ year = {1978},
+}
+EOF
+}
+
+# damage a file at random spots: delete a character, insert a structural
+# character, or duplicate a slice (awk does the randomness; seeded per case)
+mutate() { # mutate <seed> < in > out
+ awk -v seed="$1" '
+ BEGIN { srand(seed) }
+ { buf = buf $0 "\n" }
+ END {
+ n = length(buf); rounds = 1 + int(rand() * 8) # roll the count once, not per iteration
+ chars = "{}\"@#=,()\\%"
+ for (m = 0; m < rounds; m++) {
+ pos = 1 + int(rand() * n)
+ op = int(rand() * 3)
+ c = substr(chars, 1 + int(rand() * length(chars)), 1)
+ if (op == 0) # delete a character
+ buf = substr(buf, 1, pos - 1) substr(buf, pos + 1)
+ else if (op == 1) # insert a structural character
+ buf = substr(buf, 1, pos - 1) c substr(buf, pos)
+ else # duplicate a slice: prefix, copy of the slice, then the text from pos on
+ buf = substr(buf, 1, pos - 1) substr(buf, pos, 1 + int(rand() * 20)) substr(buf, pos)
+ n = length(buf)
+ }
+ printf "%s", buf
+ }'
+}
+
+# a stream of plausible bibtex syntax fragments in random order
+soup() { # soup <seed> > out
+ awk -v seed="$1" '
+ BEGIN {
+ srand(seed)
+ n = 0
+ T[++n] = "@"; T[++n] = "{"; T[++n] = "}"; T[++n] = "\""
+ T[++n] = "#"; T[++n] = "="; T[++n] = ","; T[++n] = "("
+ T[++n] = ")"; T[++n] = "%"; T[++n] = "\\"; T[++n] = " "
+ T[++n] = "\n"; T[++n] = "word"; T[++n] = "1984"
+ T[++n] = "@article{k,"; T[++n] = "t = {v}"; T[++n] = "@string"
+ T[++n] = "@comment"; T[++n] = " and "; T[++n] = "--"
+ len = 200 + int(rand() * 800)
+ for (i = 0; i < len; i++)
+ printf "%s", T[1 + int(rand() * n)]
+ }'
+}
+
+run_entry_tools() { # run_entry_tools <case-name> <input>
+ probe "$1" "$2" bib-key
+ probe "$1" "$2" bib-ls -l
+ probe "$1" "$2" bib-check
+ probe "$1" "$2" bib-convert -r
+ probe "$1" "$2" bib-add "$tmpd/scratch.bib"
+ rm -f "$tmpd/scratch.bib" "$tmpd/scratch.bib.bak"
+ probe_fixedpoint "$1" "$2"
+}
+
+# fire input at bib-add (with and without -f) against a known database;
+# afterwards the database must still parse and still contain every
+# original entry
+probe_survival() { # probe_survival <case-name> <input>
+ cases=$((cases + 1))
+ cat > "$tmpd/inv.bib" <<'EOF'
+@string{js = {Journal of Survival}}
+@article{orig1990one, author = {A. Original}, title = {One}, year = 1990}
+@article{orig1991two, author = {B. Original}, title = {Two}, journal = js, year = 1991}
+@misc{orig1992three, title = {Three}, note = "v. " # 3}
+EOF
+ timeout 5 bib-add "$tmpd/inv.bib" < "$2" > /dev/null 2>&1
+ timeout 5 bib-add -f "$tmpd/inv.bib" < "$2" > /dev/null 2>&1
+ if ! awk -f "$ROOT/lib/bib-parse.awk" -f "$ROOT/lib/bib-lskeys.awk" \
+ "$tmpd/inv.bib" > "$tmpd/invkeys" 2> /dev/null \
+ || ! grep -q '^orig1990one$' "$tmpd/invkeys" \
+ || ! grep -q '^orig1991two$' "$tmpd/invkeys" \
+ || ! grep -q '^orig1992three$' "$tmpd/invkeys"; then
+ fails=$((fails + 1))
+ printf 'WRECK %s: database lost entries or no longer parses\n' "$1"
+ cp "$2" "$tmpd/wreck.$fails"
+ fi
+ rm -f "$tmpd/inv.bib.bak"
+}
+
+echo "=== random bytes (x$N) ==="
+i=0
+while [ "$i" -lt "$N" ]; do
+ i=$((i + 1))
+ head -c 512 /dev/urandom > "$tmpd/in"
+ run_entry_tools "random/$i" "$tmpd/in"
+done
+
+echo "=== mutated bibtex (x$N) ==="
+seed_bib > "$tmpd/seed"
+i=0
+while [ "$i" -lt "$N" ]; do
+ i=$((i + 1))
+ mutate "$((i + OFF))" < "$tmpd/seed" > "$tmpd/in"
+ run_entry_tools "mutated/$i" "$tmpd/in"
+ probe_survival "mutated/$i" "$tmpd/in"
+done
+
+echo "=== syntax soup (x$N) ==="
+i=0
+while [ "$i" -lt "$N" ]; do
+ i=$((i + 1))
+ soup "$((i + OFF))" > "$tmpd/in"
+ run_entry_tools "soup/$i" "$tmpd/in"
+ probe_survival "soup/$i" "$tmpd/in"
+done
+
+echo "=== malformed aux and refer (x$N) ==="
+printf '@article{k, author={A}, title={T}, year=1}\n' > "$tmpd/db.bib"
+i=0
+while [ "$i" -lt "$N" ]; do
+ i=$((i + 1))
+ printf '\\citation{k}\n\\citation{a,b,c}\n\\abx@aux@cite{0}{k}\n%%A Some One\n%%T Title\n' \
+ | mutate "$((i + OFF))" > "$tmpd/in"
+ cases=$((cases + 1))
+ # take the status straight from the command: after `if ! cmd`, $? is the negation's (always 0 here)
+ timeout 5 bib-extract "$tmpd/in" "$tmpd/db.bib" > /dev/null 2> "$tmpd/err"
+ rc=$?
+ if [ "$rc" -gt 2 ]; then
+ fails=$((fails + 1))
+ printf 'CRASH aux/%d: bib-extract (exit %d)\n' "$i" "$rc"
+ fi
+ probe "ref/$i" "$tmpd/in" bib-convert -b
+done
+
+printf '\n%d cases, %d failures' "$cases" "$fails"
+if [ "$fails" -gt 0 ]; then
+ printf ' (failing inputs preserved in %s)\n' "$tmpd"
+ trap - EXIT
+ exit 1
+fi
+printf '\n'
diff --git a/tests/run-tests.sh b/tests/run-tests.sh
index 8a9f49a..653f838 100755
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -74,6 +74,63 @@ check "bib-add -f replaces entry" grep -q ' year = 1985,' "$db"
n=$(grep -c '^@article{junk-key,' "$db")
[ "$n" = 1 ] && ok "bib-add -f leaves one copy" || not_ok "bib-add -f leaves one copy"
+# ---- bib-add hardening --------------------------------------------------
+check "bib-add writes a backup on modify" \
+ test -s "$db.bak"
+
+# replacement must not disturb other bytes (comments, formatting)
+cat > "$tmpd/pres.bib" <<'EOF'
+% Encoding: UTF-8
+% hand-maintained; do not reformat
+
+@ARTICLE{ keep , AUTHOR = "Stays Verbatim", YEAR = 1111 }
+
+@article{swap2000old, author = {Old One}, title = {Swap}, year = 2000}
+EOF
+printf '@article{swap2000old, author = {New One}, title = {Swap}, year = 2000}\n' \
+ | bib-add -f "$tmpd/pres.bib"
+check "bib-add -f preserves comments" grep -q '^% Encoding: UTF-8$' "$tmpd/pres.bib"
+check "bib-add -f preserves untouched entries verbatim" \
+ grep -q 'AUTHOR = "Stays Verbatim"' "$tmpd/pres.bib"
+check "bib-add -f swapped the entry" grep -q '{New One}' "$tmpd/pres.bib"
+n=$(grep -c 'swap2000old' "$tmpd/pres.bib")
+[ "$n" = 1 ] && ok "bib-add -f removed the old version" \
+ || not_ok "bib-add -f removed the old version"
+
+# bogus input must never modify the database
+cp "$db" "$tmpd/before"
+printf '@article{, author = {No Key}, year = 1}\n' | bib-add "$db" 2> /dev/null \
+ && not_ok "bib-add rejects empty keys" || ok "bib-add rejects empty keys"
+printf '@misc{same2, title={A}}\n@misc{same2, title={B}}\n' \
+ | bib-add "$db" 2> /dev/null \
+ && not_ok "bib-add rejects dup keys within input" \
+ || ok "bib-add rejects dup keys within input"
+check "database untouched after rejected input" cmp -s "$db" "$tmpd/before"
+
+# concurrent writers serialize; no entries lost, lock released
+i=0
+while [ "$i" -lt 10 ]; do
+ i=$((i + 1))
+ printf '@misc{lock%d, title = {L %d}}\n' "$i" "$i" \
+ | bib-add "$tmpd/lock.bib" 2> /dev/null &
+done
+wait
+n=$(bib-ls "$tmpd/lock.bib" | wc -l)
+[ "$n" -eq 10 ] && ok "concurrent bib-add loses no entries" \
+ || not_ok "concurrent bib-add loses no entries (got $n)"
+[ -e "$tmpd/lock.bib.lock" ] && not_ok "lock released after use" \
+ || ok "lock released after use"
+
+# a stale lock from a dead process is reaped
+echo 999999 > "$tmpd/lock.bib.lock"
+printf '@misc{lock11, title = {L 11}}\n' | bib-add "$tmpd/lock.bib" 2> /dev/null
+check "stale lock reaped" grep -q 'lock11' "$tmpd/lock.bib"
+
+mkdir "$tmpd/adir"
+printf '@misc{k, title={T}}\n' | bib-add "$tmpd/adir" 2> /dev/null \
+ && not_ok "bib-add refuses non-regular files" \
+ || ok "bib-add refuses non-regular files"
+
# ---- bib-extract -------------------------------------------------------
cat > "$tmpd/all.bib" <<'EOF'
@article{alpha2020one, author = {A. Alpha}, title = {One}, year = 2020}