aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--LICENSE29
-rw-r--r--Makefile27
-rw-r--r--README.md45
-rwxr-xr-xbib-add67
-rwxr-xr-xbib-check18
-rwxr-xr-xbib-convert56
-rwxr-xr-xbib-extract60
-rwxr-xr-xbib-fetch82
-rwxr-xr-xbib-gen95
-rwxr-xr-xbib-key15
-rwxr-xr-xbib-ls30
-rwxr-xr-xbib-util28
-rw-r--r--lib/bib-canon.awk28
-rw-r--r--lib/bib-check.awk69
-rw-r--r--lib/bib-key.awk69
-rw-r--r--lib/bib-ls.awk25
-rw-r--r--lib/bib-lskeys.awk9
-rw-r--r--lib/bib-parse.awk216
-rw-r--r--lib/bib-select.awk29
-rw-r--r--lib/bib2ref.awk52
-rw-r--r--lib/ref2bib.awk107
-rwxr-xr-xtests/integration.sh123
-rwxr-xr-xtests/run-tests.sh187
24 files changed, 1465 insertions, 3 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d38c149
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.swp
+*~
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..53b781e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2026, Douglas Brumbaugh
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..079df9f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,27 @@
+# Installation layout. Any of these can be overridden on the command
+# line, e.g.
+#     make PREFIX=/opt/bibutils install
+#     make DESTDIR=/tmp/stage install     (staged install for packaging)
+# DESTDIR is deliberately not assigned here: it is empty unless given.
+PREFIX = /usr/local
+BINDIR = $(PREFIX)/bin
+LIBDIR = $(PREFIX)/share/bibutils
+
+# Executables installed into BINDIR.
+SCRIPTS = bib-util bib-add bib-check bib-convert bib-extract bib-fetch \
+          bib-gen bib-key bib-ls
+# Shared awk library installed into LIBDIR.
+LIBS = lib/bib-parse.awk lib/bib-canon.awk lib/bib-select.awk \
+       lib/bib-lskeys.awk lib/bib-key.awk lib/bib-ls.awk \
+       lib/bib-check.awk lib/bib2ref.awk lib/ref2bib.awk
+
+all:
+	@echo "nothing to build; run 'make test' or 'make install'"
+
+test:
+	tests/run-tests.sh
+	tests/integration.sh
+
+# mkdir -p already succeeds when the directories exist, so a failure here
+# is a real one (e.g. permissions) and must stop the install.
+install:
+	mkdir -p "$(DESTDIR)$(BINDIR)" "$(DESTDIR)$(LIBDIR)"
+	cp $(SCRIPTS) "$(DESTDIR)$(BINDIR)"
+	cp $(LIBS) "$(DESTDIR)$(LIBDIR)"
+
+# cd and rm share one recipe line: each recipe line runs in its own shell.
+uninstall:
+	cd "$(DESTDIR)$(BINDIR)" && rm -f $(SCRIPTS)
+	rm -rf "$(DESTDIR)$(LIBDIR)"
+
+.PHONY: all test install uninstall
diff --git a/README.md b/README.md
index d692649..53ccd48 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,19 @@ formatted bibtex entry will be emitted on stdout.
## bib-extract
A script which filters a bibtex database provided on stdin or as an argument
-and emits only those entries contained within a specified aux file.
+and emits only those entries contained within a specified aux file. Both
+classic bibtex and biblatex/biber aux files are understood, and \nocite{*}
+selects the whole database. (roff citation sources are planned but not yet
+supported.)
+
+## bib-ls
+List the entries in a database, one key per line, or with -l as
+tab-separated key, type, author, year and title.
+
+## bib-check
+Lint a database: reports missing required fields, duplicate keys,
+duplicate titles (likely duplicated entries) and empty field values.
+Exits nonzero if any problem was found.
## bib-key
A script which accepts a bibtex entry on stdin, and emits it on stdout with
@@ -29,5 +41,32 @@ an automatically generated bibtex key.
## bib-fetch
A script which accepts a pdf file as an input argument and will attempt to
-fetch a corresponding bibtex entry from crossref.org based on its DOI, if
-one is available.
+fetch a corresponding bibtex entry based on its DOI (via crossref.org) or,
+failing that, its arXiv id (via arxiv.org). An identifier can also be given
+directly with -d (DOI) or -a (arXiv id).
+
+## bib-convert
+Convert between bibtex and refer database formats. The direction is detected
+automatically from the input, or can be forced with -b (to bibtex) or -r
+(to refer).
+
+# Canonical form
+Entries that pass through these tools are canonicalized: lowercase entry
+types and field names, 2-space indentation, brace-delimited values with
+internal whitespace collapsed, bare numbers left bare, and macro
+references/concatenations preserved verbatim. @string and @preamble blocks
+pass through untouched.
+
+# Installation
+ make install # PREFIX=/usr/local by default
+
+The scripts look for the shared awk library in $BIBUTILS_LIB, then in
+lib/ next to the script, then in /usr/local/share/bibutils. If installing
+with a non-default PREFIX, set BIBUTILS_LIB accordingly.
+
+# Dependencies
+POSIX shell, awk and the standard utilities (plus mktemp) only, with two
+exceptions: bib-fetch requires curl, and pdftotext (poppler) when the DOI
+or arXiv id has to be extracted from a pdf.
+
+# Tests
+ make test
diff --git a/bib-add b/bib-add
new file mode 100755
index 0000000..28ebf82
--- /dev/null
+++ b/bib-add
@@ -0,0 +1,67 @@
+#!/bin/sh
+# bib-add - insert bibtex entries from stdin into a database file
+#
+# usage: bib-add [-f] db.bib < entry
+# -f replace existing entries with the same key
+
+usage() {
+ printf 'usage: bib-add [-f] db.bib < entry\n' >&2
+ exit 2
+}
+
+if [ -n "$BIBUTILS_LIB" ]; then
+ LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+ LIB=$(dirname "$0")/lib
+else
+ LIB=/usr/local/share/bibutils
+fi
+
+force=0
+while getopts f opt; do
+ case $opt in
+ f) force=1 ;;
+ *) usage ;;
+ esac
+done
+shift $((OPTIND - 1))
+[ $# -eq 1 ] || usage
+db=$1
+
+tmp=$(mktemp) && tmpkeys=$(mktemp) && tmpdb=$(mktemp) || exit 1
+trap 'rm -f "$tmp" "$tmpkeys" "$tmpdb"' EXIT INT TERM
+
+# canonicalize the incoming entries
+awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" -f "$LIB/bib-select.awk" \
+ -v keys= -v invert=1 > "$tmp"
+
+if [ ! -s "$tmp" ]; then
+ printf 'bib-add: no entries on stdin\n' >&2
+ exit 1
+fi
+
+awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$tmp" > "$tmpkeys"
+
+if [ -f "$db" ]; then
+ dups=$(awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$db" \
+ | grep -Fxf "$tmpkeys") || dups=
+ if [ -n "$dups" ]; then
+ if [ "$force" -eq 1 ]; then
+ # rewrite the database without the entries being replaced
+ keys=$(printf '%s\n' "$dups" | paste -sd, -)
+ awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+ -f "$LIB/bib-select.awk" -v keys="$keys" -v invert=1 \
+ "$db" > "$tmpdb" || exit 1
+ cp "$tmpdb" "$db"
+ else
+ printf 'bib-add: duplicate keys in %s:\n' "$db" >&2
+ printf '%s\n' "$dups" >&2
+ exit 1
+ fi
+ fi
+fi
+
+{
+ [ -s "$db" ] && echo ""
+ cat "$tmp"
+} >> "$db"
diff --git a/bib-check b/bib-check
new file mode 100755
index 0000000..062e157
--- /dev/null
+++ b/bib-check
@@ -0,0 +1,18 @@
+#!/bin/sh
+# bib-check - lint a bibtex database
+#
+# usage: bib-check [file ...] (stdin if no file given)
+#
+# Problems reported: missing required fields, duplicate keys, duplicate
+# titles and empty field values. The exit status is nonzero if any
+# problem was found.
+
+# awk library location: $BIBUTILS_LIB wins, then lib/ beside this
+# script, then the default install directory
+LIB=${BIBUTILS_LIB:-}
+if [ -z "$LIB" ]; then
+    here=$(dirname "$0")
+    if [ -d "$here/lib" ]; then
+        LIB=$here/lib
+    else
+        LIB=/usr/local/share/bibutils
+    fi
+fi
+
+# prepend the awk program files to the caller's file arguments
+set -- -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+    -f "$LIB/bib-check.awk" "$@"
+exec awk "$@"
diff --git a/bib-convert b/bib-convert
new file mode 100755
index 0000000..ef4c0b0
--- /dev/null
+++ b/bib-convert
@@ -0,0 +1,56 @@
+#!/bin/sh
+# bib-convert - convert between bibtex and refer database formats
+#
+# usage: bib-convert [-b | -r] [file] (stdin if no file given)
+#   -b  force refer -> bibtex
+#   -r  force bibtex -> refer
+#
+# Without a flag the direction is detected from the input: text whose
+# first non-blank character is @ is taken as bibtex, % as refer.
+# Converted bibtex entries are passed through bib-key so that they get
+# generated keys.
+#
+# exit status: 2 on usage error, 1 if the input cannot be read or its
+# format cannot be detected, otherwise that of the conversion.
+
+usage() {
+    printf 'usage: bib-convert [-b | -r] [file]\n' >&2
+    exit 2
+}
+
+# shared awk library: $BIBUTILS_LIB, then lib/ beside this script, then
+# the default install location
+if [ -n "$BIBUTILS_LIB" ]; then
+    LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+    LIB=$(dirname "$0")/lib
+else
+    LIB=/usr/local/share/bibutils
+fi
+
+# prefer the bib-key beside this script, else rely on PATH
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+mode=auto
+while getopts br opt; do
+    case $opt in
+        b) mode=tobib ;;
+        r) mode=toref ;;
+        *) usage ;;
+    esac
+done
+shift $((OPTIND - 1))
+[ $# -le 1 ] || usage
+
+# The input is spooled to a file because it may be stdin and has to be
+# read twice (once to detect the format, once to convert).
+tmp=$(mktemp) || exit 1
+# INT and TERM must exit explicitly, otherwise the script would carry on
+# after the handler; the EXIT trap then does the cleanup.
+trap 'rm -f "$tmp"' EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+cat "$@" > "$tmp" || exit 1
+
+if [ "$mode" = auto ]; then
+    # first character of the first non-blank line
+    first=$(awk 'NF { sub(/^[ \t]+/, ""); print substr($0, 1, 1); exit }' "$tmp")
+    case $first in
+        @) mode=toref ;;
+        %) mode=tobib ;;
+        *) printf 'bib-convert: cannot detect input format\n' >&2; exit 1 ;;
+    esac
+fi
+
+# No exec below: replacing the shell process would skip the EXIT trap and
+# leave the temporary file behind.
+if [ "$mode" = toref ]; then
+    awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+        -f "$LIB/bib2ref.awk" "$tmp"
+else
+    awk -f "$LIB/ref2bib.awk" "$tmp" | "$bibkey"
+fi
diff --git a/bib-extract b/bib-extract
new file mode 100755
index 0000000..52aa85b
--- /dev/null
+++ b/bib-extract
@@ -0,0 +1,60 @@
+#!/bin/sh
+# bib-extract - emit only the database entries cited in an aux file
+#
+# usage: bib-extract file.aux [db.bib] (db on stdin if omitted)
+#
+# roff/refer citation sources are planned but not yet supported.
+#
+# exit status: 2 on usage error, 1 if the aux file cannot be read, 0
+# (with no output) if the aux file cites nothing; otherwise that of awk.
+
+usage() {
+ printf 'usage: bib-extract file.aux [db.bib]\n' >&2
+ exit 2
+}
+
+# shared awk library: $BIBUTILS_LIB, then lib/ beside this script, then
+# the default install location
+if [ -n "$BIBUTILS_LIB" ]; then
+ LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+ LIB=$(dirname "$0")/lib
+else
+ LIB=/usr/local/share/bibutils
+fi
+
+[ $# -ge 1 ] && [ $# -le 2 ] || usage
+aux=$1
+shift
+[ -r "$aux" ] || { printf 'bib-extract: cannot read %s\n' "$aux" >&2; exit 1; }
+
+# Collect every cited key from the aux file as a sorted, de-duplicated,
+# comma-separated list. Both rules below run on every input line.
+keys=$(awk '
+ # classic bibtex: \citation{key,key,...}
+ {
+ line = $0
+ while (match(line, /\\citation\{[^}]*\}/)) {
+ # RSTART + 10 skips the 10 characters of \citation{ and
+ # RLENGTH - 11 additionally drops the closing brace
+ n = split(substr(line, RSTART + 10, RLENGTH - 11), a, ",")
+ for (i = 1; i <= n; i++)
+ if (a[i] != "")
+ print a[i]
+ # continue scanning after this match
+ line = substr(line, RSTART + RLENGTH)
+ }
+ }
+ # biblatex/biber: \abx@aux@cite{segment}{key} (older: one argument)
+ {
+ line = $0
+ while (match(line, /\\abx@aux@cite(\{[0-9]*\})?\{[^}]*\}/)) {
+ s = substr(line, RSTART, RLENGTH)
+ # keep only the contents of the last brace group: strip the
+ # final closing brace, then everything up to the last opening one
+ sub(/\}$/, "", s)
+ sub(/^.*\{/, "", s)
+ if (s != "")
+ print s
+ line = substr(line, RSTART + RLENGTH)
+ }
+ }' "$aux" | sort -u | paste -sd, -)
+
+# nothing cited: emit nothing
+[ -n "$keys" ] || exit 0
+
+# \nocite{*} cites everything: emit the whole database
+# (an empty key list with invert=1 selects every entry)
+case ",$keys," in
+ *,\*,*) keys= invert=1 ;;
+ *) invert=0 ;;
+esac
+
+# remaining arguments are the optional database file; awk reads stdin
+# when there is none
+exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+ -f "$LIB/bib-select.awk" -v keys="$keys" -v invert="$invert" "$@"
diff --git a/bib-fetch b/bib-fetch
new file mode 100755
index 0000000..62f7993
--- /dev/null
+++ b/bib-fetch
@@ -0,0 +1,82 @@
+#!/bin/sh
+# bib-fetch - fetch a bibtex entry for a pdf from its DOI or arXiv id
+#
+# usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]
+#
+# Unless given with -d or -a, an identifier is extracted from the first
+# pages of the pdf (requires pdftotext): a DOI if one is found, falling
+# back to an arXiv id. DOIs are resolved through doi.org content
+# negotiation (crossref et al.), arXiv ids through arxiv.org. The entry
+# is emitted canonically on stdout with a generated key.
+#
+# exit status: 2 on usage error, 1 if a required tool is missing, no
+# identifier is found or the fetch fails; otherwise that of bib-key.
+
+usage() {
+    printf 'usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]\n' >&2
+    exit 2
+}
+
+doi=
+arxiv=
+while getopts d:a: opt; do
+    case $opt in
+        d) doi=$OPTARG ;;
+        a) arxiv=$OPTARG ;;
+        *) usage ;;
+    esac
+done
+shift $((OPTIND - 1))
+# -d and -a are mutually exclusive
+[ -n "$doi" ] && [ -n "$arxiv" ] && usage
+
+# prefer the bib-key beside this script, else rely on PATH
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+command -v curl > /dev/null 2>&1 || {
+    printf 'bib-fetch: curl is required\n' >&2
+    exit 1
+}
+
+if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+    [ $# -eq 1 ] || usage
+    pdf=$1
+    [ -r "$pdf" ] || { printf 'bib-fetch: cannot read %s\n' "$pdf" >&2; exit 1; }
+    command -v pdftotext > /dev/null 2>&1 || {
+        printf 'bib-fetch: pdftotext is required to extract an identifier\n' >&2
+        exit 1
+    }
+    # scan the text of the first two pages
+    ids=$(pdftotext -l 2 "$pdf" - 2> /dev/null | awk '
+        # remember the first DOI-looking string and the first arXiv
+        # stamp found anywhere in the text
+        doi == "" {
+            if (match($0, /10\.[0-9][0-9][0-9][0-9][0-9]*\/[^ \t"<>]+/)) {
+                doi = substr($0, RSTART, RLENGTH)
+                # drop sentence punctuation glued to the end
+                sub(/[.,;)\]]+$/, "", doi)
+            }
+        }
+        arxiv == "" {
+            # modern ids (arXiv:2104.01234v2) and old style (arXiv:cs/0101001)
+            if (match($0, /[aA][rR][xX][iI][vV]:[0-9][0-9][0-9][0-9]\.[0-9][0-9][0-9][0-9][0-9]?(v[0-9]+)?/) ||
+                match($0, /[aA][rR][xX][iI][vV]:[a-z-]+(\.[A-Z][A-Z])?\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9](v[0-9]+)?/))
+                # skip the 6 characters of the "arXiv:" prefix
+                arxiv = substr($0, RSTART + 6, RLENGTH - 6)
+        }
+        END { printf "%s\t%s\n", doi, arxiv }')
+    # awk hands back "doi<TAB>arxiv", either part possibly empty. Split on
+    # an explicit tab rather than a literal one typed into the source,
+    # which is invisible and easily turned into a space.
+    tab=$(printf '\t')
+    doi=${ids%%"$tab"*}
+    arxiv=${ids#*"$tab"}
+    if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+        printf 'bib-fetch: no DOI or arXiv id found in %s\n' "$pdf" >&2
+        exit 1
+    fi
+fi
+
+# the DOI takes precedence when both identifiers are known
+if [ -n "$doi" ]; then
+    entry=$(curl -sSfL -H 'Accept: application/x-bibtex' \
+        "https://doi.org/$doi") || {
+        printf 'bib-fetch: failed to fetch entry for doi %s\n' "$doi" >&2
+        exit 1
+    }
+else
+    # accept ids given with -a in the form arXiv:NNNN.NNNNN
+    arxiv=${arxiv#arXiv:}
+    entry=$(curl -sSfL "https://arxiv.org/bibtex/$arxiv") || {
+        printf 'bib-fetch: failed to fetch entry for arXiv id %s\n' "$arxiv" >&2
+        exit 1
+    }
+fi
+
+# canonicalize and rekey
+printf '%s\n' "$entry" | "$bibkey"
diff --git a/bib-gen b/bib-gen
new file mode 100755
index 0000000..0fdd63a
--- /dev/null
+++ b/bib-gen
@@ -0,0 +1,95 @@
+#!/bin/sh
+# bib-gen - generate a bibtex entry
+#
+# usage: bib-gen [-t type] [field=value ...]
+#        bib-gen [-t type] -F field,field,... (tab-separated stdin)
+#
+# With field=value arguments, one entry is built from them. With -F,
+# one entry is built per tab-separated line of stdin, columns matching
+# the listed fields. Otherwise the user is prompted interactively.
+# Entries are emitted on stdout with generated keys (via bib-key).
+#
+# exit status: 2 on usage error, 1 if no fields were given; otherwise
+# that of bib-key.
+
+usage() {
+    printf 'usage: bib-gen [-t type] [field=value ...]\n' >&2
+    printf '       bib-gen [-t type] -F field,field,... < data\n' >&2
+    exit 2
+}
+
+type=article
+fmt=
+while getopts t:F: opt; do
+    case $opt in
+        t) type=$OPTARG ;;
+        F) fmt=$OPTARG ;;
+        *) usage ;;
+    esac
+done
+shift $((OPTIND - 1))
+
+# prefer the bib-key beside this script, else rely on PATH
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+# fields prompted for in interactive mode, per entry type
+fields_for() {
+    case $1 in
+        article) echo "author title journal year volume number pages month doi" ;;
+        book) echo "author title publisher year volume series address edition" ;;
+        inproceedings|conference)
+            echo "author title booktitle year editor pages publisher doi" ;;
+        incollection) echo "author title booktitle publisher year editor pages chapter" ;;
+        techreport) echo "author title institution year number address month" ;;
+        phdthesis|mastersthesis)
+            echo "author title school year address month" ;;
+        *) echo "author title year howpublished note url" ;;
+    esac
+}
+
+if [ -n "$fmt" ]; then
+    # batch mode: tab-separated values on stdin, one entry per non-empty
+    # line; columns beyond the listed fields are ignored, as are empty cells
+    awk -F '\t' -v fmt="$fmt" -v type="$type" '
+        BEGIN { nf = split(fmt, F, ",") }
+        NF {
+            printf "@%s{FIXME,\n", type
+            for (i = 1; i <= nf && i <= NF; i++)
+                if ($i != "")
+                    printf " %s = {%s},\n", F[i], $i
+            print "}"
+        }' | "$bibkey"
+    exit $?
+fi
+
+# The name/value pairs are staged in a file as "name<TAB>value" lines.
+# The tab comes from printf so that writer and reader below agree on the
+# separator regardless of how this source file is edited or displayed.
+tab=$(printf '\t')
+
+tmp=$(mktemp) || exit 1
+# INT and TERM must exit explicitly, otherwise the script would carry on
+# after the handler; the EXIT trap then does the cleanup.
+trap 'rm -f "$tmp"' EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+if [ $# -gt 0 ]; then
+    # argument mode: field=value pairs
+    for arg in "$@"; do
+        case $arg in
+            =*) usage ;;    # empty field name would emit an invalid entry
+            *=*) printf '%s\t%s\n' "${arg%%=*}" "${arg#*=}" >> "$tmp" ;;
+            *) usage ;;
+        esac
+    done
+else
+    # interactive mode: prompts go to stderr so stdout stays clean
+    printf 'entry type [%s]: ' "$type" >&2
+    read -r ans || exit 1
+    [ -n "$ans" ] && type=$ans
+    for f in $(fields_for "$type"); do
+        printf '%s: ' "$f" >&2
+        read -r ans || break
+        # fields answered with an empty line are left out
+        [ -n "$ans" ] && printf '%s\t%s\n' "$f" "$ans" >> "$tmp"
+    done
+fi
+
+if [ ! -s "$tmp" ]; then
+    printf 'bib-gen: no fields given\n' >&2
+    exit 1
+fi
+
+# Build the entry with a placeholder key and let bib-key replace it.
+# Splitting on tab only keeps spaces inside the value intact.
+{
+    printf '@%s{FIXME,\n' "$type"
+    while IFS=$tab read -r name value; do
+        printf ' %s = {%s},\n' "$name" "$value"
+    done < "$tmp"
+    printf '}\n'
+} | "$bibkey"
diff --git a/bib-key b/bib-key
new file mode 100755
index 0000000..ff3c363
--- /dev/null
+++ b/bib-key
@@ -0,0 +1,15 @@
+#!/bin/sh
+# bib-key - re-emit bibtex entries with automatically generated keys
+#
+# usage: bib-key [file ...] (stdin if no file given)
+
+# awk library location: $BIBUTILS_LIB wins, then lib/ beside this
+# script, then the default install directory
+LIB=${BIBUTILS_LIB:-}
+if [ -z "$LIB" ]; then
+    here=$(dirname "$0")
+    if [ -d "$here/lib" ]; then
+        LIB=$here/lib
+    else
+        LIB=/usr/local/share/bibutils
+    fi
+fi
+
+# prepend the awk program files to the caller's file arguments
+set -- -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+    -f "$LIB/bib-key.awk" "$@"
+exec awk "$@"
diff --git a/bib-ls b/bib-ls
new file mode 100755
index 0000000..0ed7236
--- /dev/null
+++ b/bib-ls
@@ -0,0 +1,30 @@
+#!/bin/sh
+# bib-ls - print the entries of a bibtex database
+#
+# usage: bib-ls [-l] [file ...] (stdin if no file given)
+#   -l  long format: key, type, author, year, title (tab-separated)
+
+usage() {
+    printf 'usage: bib-ls [-l] [file ...]\n' >&2
+    exit 2
+}
+
+# awk library location: $BIBUTILS_LIB wins, then lib/ beside this
+# script, then the default install directory
+LIB=${BIBUTILS_LIB:-}
+if [ -z "$LIB" ]; then
+    here=$(dirname "$0")
+    if [ -d "$here/lib" ]; then
+        LIB=$here/lib
+    else
+        LIB=/usr/local/share/bibutils
+    fi
+fi
+
+long=0
+while getopts l opt; do
+    if [ "$opt" = l ]; then
+        long=1
+    else
+        usage
+    fi
+done
+shift $((OPTIND - 1))
+
+# prepend the awk program files and the format switch to the file list
+set -- -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+    -f "$LIB/bib-ls.awk" -v long="$long" "$@"
+exec awk "$@"
diff --git a/bib-util b/bib-util
new file mode 100755
index 0000000..e807b03
--- /dev/null
+++ b/bib-util
@@ -0,0 +1,28 @@
+#!/bin/sh
+# bib-util - wrapper dispatching to the individual bibutils scripts
+#
+# usage: bib-util command [args ...]
+
+usage() {
+ printf 'usage: bib-util command [args ...]\n' >&2
+ printf 'commands: add check convert extract fetch gen key ls\n' >&2
+ exit 2
+}
+
+[ $# -ge 1 ] || usage
+cmd=$1
+shift
+
+dir=$(dirname "$0")
+case $cmd in
+ add|check|convert|extract|fetch|gen|key|ls)
+ exec "$dir/bib-$cmd" "$@"
+ ;;
+ help|-h|--help)
+ usage
+ ;;
+ *)
+ printf 'bib-util: unknown command: %s\n' "$cmd" >&2
+ usage
+ ;;
+esac
diff --git a/lib/bib-canon.awk b/lib/bib-canon.awk
new file mode 100644
index 0000000..d11e9cb
--- /dev/null
+++ b/lib/bib-canon.awk
@@ -0,0 +1,28 @@
+# bib-canon.awk - canonical output helpers for bibutils
+#
+# Requires bib-parse.awk. Provides bib_emit() to print the current
+# entry in canonical form, and bib_get() to look up a field value.
+
+# Print the current entry (BIB_N fields in BIB_NAME[], BIB_VAL[] and
+# BIB_KIND[]) in canonical form under the given type and key:
+#   - one field per line, indented by 2 spaces, each followed by a comma
+#   - kind "s" values are re-wrapped in braces with runs of whitespace
+#     collapsed to one space and the ends trimmed
+#   - any other kind (bare numbers, raw macro/concatenation text) is
+#     printed as stored
+# type and key are printed as passed in.
+function bib_emit(type, key,    j, v) {
+    printf "@%s{%s,\n", type, key
+    for (j = 1; j <= BIB_N; j++) {
+        v = BIB_VAL[j]
+        if (BIB_KIND[j] == "s") {
+            gsub(/[ \t\r\n]+/, " ", v)
+            v = bib_trim(v)
+            # two spaces: the documented canonical indentation
+            printf "  %s = {%s},\n", BIB_NAME[j], v
+        } else
+            printf "  %s = %s,\n", BIB_NAME[j], v
+    }
+    print "}"
+}
+
+# Look up field `name` (already lowercase) in the current entry and
+# return its stored value; the first match wins. Returns "" when the
+# entry has no such field.
+function bib_get(name,    idx) {
+    idx = 1
+    while (idx <= BIB_N) {
+        if (BIB_NAME[idx] == name)
+            return BIB_VAL[idx]
+        idx++
+    }
+    return ""
+}
diff --git a/lib/bib-check.awk b/lib/bib-check.awk
new file mode 100644
index 0000000..4411a55
--- /dev/null
+++ b/lib/bib-check.awk
@@ -0,0 +1,69 @@
+# bib-check.awk - lint a bibtex database
+#
+# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
+# line on stdout:
+# - missing fields required by the entry type
+# - duplicate keys
+# - entries whose titles normalize to the same string (likely dups)
+# - empty field values
+# Exits 1 if any problem was found.
+
+# Required fields per entry type: space-separated names, where "a|b"
+# means at least one of a or b. Types with identical requirements share
+# one assignment.
+BEGIN {
+    REQ["article"] = "author title journal year"
+    REQ["book"] = REQ["inbook"] = "author|editor title publisher year"
+    REQ["booklet"] = REQ["manual"] = "title"
+    REQ["incollection"] = "author title booktitle publisher year"
+    REQ["inproceedings"] = REQ["conference"] = "author title booktitle year"
+    REQ["mastersthesis"] = REQ["phdthesis"] = "author title school year"
+    REQ["proceedings"] = "title year"
+    REQ["techreport"] = "author title institution year"
+    REQ["unpublished"] = "author title note"
+}
+
+# @string and @preamble blocks are not linted
+function bib_pass(raw) {
+}
+
+# Report one problem for entry `key` on stdout and remember that the run
+# found something, so the final exit status becomes nonzero.
+function problem(key, msg) {
+    BIB_BAD = 1
+    print key ": " msg
+}
+
+# Lint one entry. Checks run in a fixed order: duplicate key, required
+# fields for the entry type, empty values, then duplicate title.
+function bib_entry(type, key,    nreq, want, r, opts, nopt, o, have, f, norm) {
+    if (key in BIB_KEYS_SEEN)
+        problem(key, "duplicate key")
+    BIB_KEYS_SEEN[key] = 1
+
+    # required fields; each requirement is a "|"-separated list of
+    # alternatives, one of which has to be present and non-empty
+    nreq = (type in REQ) ? split(REQ[type], want, " ") : 0
+    for (r = 1; r <= nreq; r++) {
+        nopt = split(want[r], opts, "|")
+        have = 0
+        for (o = 1; o <= nopt && !have; o++)
+            have = (bib_get(opts[o]) != "")
+        if (!have)
+            problem(key, "missing required field: " want[r])
+    }
+
+    # fields that are present but blank
+    for (f = 1; f <= BIB_N; f++) {
+        if (bib_trim(BIB_VAL[f]) != "")
+            continue
+        problem(key, "empty field: " BIB_NAME[f])
+    }
+
+    # titles are compared as lowercase letters and digits only; the first
+    # entry seen with a given title is the one later entries are
+    # reported against
+    norm = tolower(bib_get("title"))
+    gsub(/[^a-z0-9]/, "", norm)
+    if (norm == "")
+        return
+    if (norm in BIB_TITLES_SEEN)
+        problem(key, "title duplicates " BIB_TITLES_SEEN[norm])
+    else
+        BIB_TITLES_SEEN[norm] = key
+}
+
+# exit status reflects whether any problem was reported
+END { exit BIB_BAD }
diff --git a/lib/bib-key.awk b/lib/bib-key.awk
new file mode 100644
index 0000000..41534ba
--- /dev/null
+++ b/lib/bib-key.awk
@@ -0,0 +1,69 @@
+# bib-key.awk - rekey every entry with a generated citation key
+#
+# Requires bib-parse.awk and bib-canon.awk. Keys have the form
+# <surname><year><word>, e.g. knuth1984literate.
+
+# Pass @string and @preamble blocks through unchanged, keeping one blank
+# line between consecutive output blocks.
+function bib_pass(raw) {
+    if (bib_out_n != 0)
+        print ""
+    bib_out_n++
+    print raw
+}
+
+# Emit the current entry under a generated key; the key it came in with
+# is ignored. The first entry producing a given key keeps it bare;
+# later ones get the suffix b, c, ... z, then z1, z2, ... A candidate is
+# skipped if it is already in use, and every key handed out is recorded,
+# so the keys emitted within one run are always distinct.
+function bib_entry(type, key,    k, n, base) {
+    if (bib_out_n++)
+        print ""
+    k = bib_mkkey()
+    if (k in BIB_KEYS_SEEN) {
+        base = k
+        # BIB_KEYS_SEEN[base] holds the highest suffix number tried so
+        # far for this base (1 = the bare key itself)
+        n = BIB_KEYS_SEEN[base]
+        do {
+            n++
+            k = base ((n <= 26) ? substr("abcdefghijklmnopqrstuvwxyz", n, 1) \
+                                : "z" (n - 26))
+        } while (k in BIB_KEYS_SEEN)
+        BIB_KEYS_SEEN[base] = n
+    }
+    BIB_KEYS_SEEN[k] = 1
+    bib_emit(type, k)
+}
+
+# Build a citation key of the form <surname><year><word> for the current
+# entry:
+#   surname - of the first author (or first editor if there is no
+#             author), lowercased, letters and digits only; "anon" if
+#             none can be found
+#   year    - the first four-digit run in the year field, "" if none
+#   word    - the first word of the title that is not a common
+#             article/preposition, lowercased; "" if none
+# Takes no arguments; the names in the parameter list are locals.
+function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
+    a = bib_get("author")
+    if (a == "")
+        a = bib_get("editor")
+    y = bib_get("year")
+    t = bib_get("title")
+
+    # Field values are stored as written, line breaks included. Collapse
+    # whitespace first so that an "and" next to a line break still
+    # separates authors and a name split across lines is still split into
+    # words correctly.
+    gsub(/[ \t\r\n]+/, " ", a)
+
+    # surname of the first author
+    if (match(a, / [Aa][Nn][Dd] /))
+        a = substr(a, 1, RSTART - 1)
+    gsub(/[{}]/, "", a)
+    a = bib_trim(a)
+    if (index(a, ",") > 0)
+        # "Surname, Given" form
+        surname = substr(a, 1, index(a, ",") - 1)
+    else {
+        # "Given Surname" form: take the last word
+        n = split(a, parts, /[ \t]+/)
+        surname = (n > 0) ? parts[n] : ""
+    }
+    gsub(/[^A-Za-z0-9]/, "", surname)
+    surname = tolower(surname)
+    if (surname == "")
+        surname = "anon"
+
+    # four-digit year
+    if (match(y, /[0-9][0-9][0-9][0-9]/))
+        y = substr(y, RSTART, 4)
+    else
+        y = ""
+
+    # first significant word of the title
+    gsub(/[{}]/, "", t)
+    word = ""
+    n = split(tolower(t), parts, /[^a-z0-9]+/)
+    for (i = 1; i <= n; i++) {
+        w = parts[i]
+        if (w == "" || w == "a" || w == "an" || w == "the" || w == "on" ||
+            w == "of" || w == "in" || w == "for" || w == "and" || w == "to" ||
+            w == "with" || w == "from" || w == "by" || w == "at" || w == "is")
+            continue
+        word = w
+        break
+    }
+
+    return surname y word
+}
diff --git a/lib/bib-ls.awk b/lib/bib-ls.awk
new file mode 100644
index 0000000..909b654
--- /dev/null
+++ b/lib/bib-ls.awk
@@ -0,0 +1,25 @@
+# bib-ls.awk - list database entries
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+# long - 0: print one key per line
+# 1: print key, type, author, year and title, tab-separated
+
+# @string and @preamble blocks are not listed
+function bib_pass(raw) {
+}
+
+# Print one line for the current entry: the bare key, or in long format
+# key, type, author, year and title separated by tabs. The author falls
+# back to the editor; only the first name of a list is kept, followed by
+# " et al.". Braces are stripped and whitespace collapsed in the author
+# and title columns.
+function bib_entry(type, key,    who, what) {
+    if (long + 0 != 0) {
+        who = bib_get("author")
+        if (who == "")
+            who = bib_get("editor")
+        gsub(/[{}]/, "", who)
+        gsub(/[ \t\r\n]+/, " ", who)
+        if (match(who, / [Aa][Nn][Dd] /))
+            who = substr(who, 1, RSTART - 1) " et al."
+
+        what = bib_get("title")
+        gsub(/[{}]/, "", what)
+        gsub(/[ \t\r\n]+/, " ", what)
+
+        printf "%s\t%s\t%s\t%s\t%s\n", key, type, who, bib_get("year"), what
+    } else
+        print key
+}
diff --git a/lib/bib-lskeys.awk b/lib/bib-lskeys.awk
new file mode 100644
index 0000000..1932ced
--- /dev/null
+++ b/lib/bib-lskeys.awk
@@ -0,0 +1,9 @@
+# bib-lskeys.awk - print the key of every entry, one per line
+#
+# Requires bib-parse.awk.
+
+# @string and @preamble blocks have no key to list
+function bib_pass(raw) {
+}
+
+# one key per output line
+function bib_entry(type, key) {
+    printf "%s\n", key
+}
diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk
new file mode 100644
index 0000000..e5bf9fa
--- /dev/null
+++ b/lib/bib-parse.awk
@@ -0,0 +1,216 @@
+# bib-parse.awk - shared bibtex parsing library for bibutils
+#
+# Consumers must define two hook functions:
+# bib_entry(type, key) - called once per regular entry. The fields are
+# available in BIB_N, BIB_NAME[], BIB_VAL[] and
+# BIB_KIND[]; the raw source text of the entry
+# is in BIB_RAW.
+# bib_pass(raw) - called for @string and @preamble blocks with
+# their raw source text.
+#
+# BIB_KIND[j] is "s" for ordinary string values (content stored without
+# delimiters; re-wrap in braces on output), "n" for bare numbers, and
+# "r" for raw values (macros, # concatenation) which should be emitted
+# verbatim.
+
+# Slurp the input: every record is appended to bib_buf with a newline
+# restored, so that constructs spanning several lines can be parsed from
+# one string.
+{ bib_buf = bib_buf $0 "\n" }
+
+# Parse the accumulated text once all input has been read.
+END { bib_main(bib_buf) }
+
+# Walk the whole text `s`: every "@" is handed to bib_entry_at(), which
+# returns where scanning resumes; all other characters are skipped.
+function bib_main(s,    pos, end) {
+    end = length(s)
+    pos = 1
+    while (pos <= end) {
+        if (substr(s, pos, 1) != "@") {
+            pos++
+            continue
+        }
+        pos = bib_entry_at(s, pos)
+    }
+}
+
+# Return the index of the first character at or after i that is not a
+# blank, tab, carriage return or newline (length(s) + 1 if none is left).
+function bib_ws(s, i,    end) {
+    end = length(s)
+    for (; i <= end; i++)
+        if (substr(s, i, 1) !~ /[ \t\r\n]/)
+            break
+    return i
+}
+
+# Return t without leading and trailing whitespace (blank, tab, CR, LF).
+function bib_trim(t) {
+    gsub(/^[ \t\r\n]+|[ \t\r\n]+$/, "", t)
+    return t
+}
+
+# Balanced {...} group whose opening brace is at i. The text between the
+# outermost braces is left in BIB_PIECE; the return value is the index
+# just past the closing brace. The scan itself is the one
+# bib_skip_group() performs for "{" and "}".
+function bib_braced(s, i,    open_at) {
+    open_at = i
+    i = bib_skip_group(s, open_at, "{", "}")
+    BIB_PIECE = substr(s, open_at + 1, i - open_at - 2)
+    return i
+}
+
+# Double-quoted group whose opening quote is at i. A quote inside braces
+# does not end the group. The text between the quotes is left in
+# BIB_PIECE; the return value is the index just past the closing quote.
+function bib_quoted(s, i,    level, open_at, ch, end) {
+    open_at = i
+    end = length(s)
+    level = 0
+    for (i = open_at + 1; i <= end; i++) {
+        ch = substr(s, i, 1)
+        if (ch == "\"" && level == 0) {
+            i++     # step past the closing quote
+            break
+        }
+        if (ch == "{")
+            level++
+        else if (ch == "}")
+            level--
+    }
+    BIB_PIECE = substr(s, open_at + 1, i - open_at - 2)
+    return i
+}
+
+# Skip a balanced group delimited by the characters op and cl; i must be
+# at the opening op. Returns the index just past the matching cl, or
+# length(s) + 1 if the group is never closed.
+function bib_skip_group(s, i, op, cl,    level, ch, end) {
+    end = length(s)
+    level = 0
+    while (i <= end) {
+        ch = substr(s, i++, 1)
+        if (ch == op) {
+            level++
+            continue
+        }
+        if (ch == cl && --level == 0)
+            break
+    }
+    return i
+}
+
+# field value at i, handling # concatenation; sets BIB_VALUE and
+# BIB_VKIND, returns the index just past the value
+#
+# A value is one or more pieces joined by "#". A piece is a braced
+# group, a quoted group, or a bare token (number or macro name).
+# BIB_VKIND is "s" for a single braced/quoted piece, "n" for a single
+# all-digit token, and "r" for any other bare token or for any
+# concatenation. Whitespace following the value is skipped as well, so
+# the returned index is at the next non-blank character.
+function bib_value(s, i, start, c, piece, pieces, kind) {
+ start = i
+ pieces = 0
+ kind = ""
+ BIB_VALUE = ""
+ while (1) {
+ c = substr(s, i, 1)
+ if (c == "{") {
+ i = bib_braced(s, i)
+ BIB_VALUE = BIB_VALUE BIB_PIECE
+ if (kind == "")
+ kind = "s"
+ } else if (c == "\"") {
+ i = bib_quoted(s, i)
+ BIB_VALUE = BIB_VALUE BIB_PIECE
+ if (kind == "")
+ kind = "s"
+ } else {
+ # bare token: runs up to a separator, "#", a closer or whitespace
+ # (and may be empty if one of those comes first)
+ piece = ""
+ while (i <= length(s) && substr(s, i, 1) !~ /[,#}) \t\r\n]/) {
+ piece = piece substr(s, i, 1)
+ i++
+ }
+ BIB_VALUE = BIB_VALUE piece
+ kind = (piece ~ /^[0-9]+$/) ? "n" : "r"
+ }
+ pieces++
+ i = bib_ws(s, i)
+ # a "#" means another piece follows
+ if (substr(s, i, 1) == "#")
+ i = bib_ws(s, i + 1)
+ else
+ break
+ }
+ # any concatenation is kept as raw source text
+ if (pieces > 1)
+ kind = "r"
+ # raw values are the trimmed source text from start up to i,
+ # delimiters and "#" included
+ if (kind == "r")
+ BIB_VALUE = bib_trim(substr(s, start, i - start))
+ BIB_VKIND = kind
+ return i
+}
+
+# parse the construct whose "@" is at i; returns the index past it
+#
+# Depending on the (lowercased) type name after the "@":
+# - no "{" or "(" follows: not an entry, nothing is called
+# - comment: the group is skipped, nothing is called
+# - string, preamble: bib_pass() is called with the trimmed source text
+# - anything else: the key and fields are parsed into BIB_N,
+# BIB_NAME[], BIB_VAL[] and BIB_KIND[], the trimmed source text is put
+# in BIB_RAW, and bib_entry(type, key) is called
+function bib_entry_at(s, i, at, type, opener, closer, key, name, c) {
+ at = i
+ i++
+ # entry type: the run of letters after the "@"
+ type = ""
+ while (i <= length(s) && substr(s, i, 1) ~ /[A-Za-z]/) {
+ type = type substr(s, i, 1)
+ i++
+ }
+ type = tolower(type)
+ i = bib_ws(s, i)
+ c = substr(s, i, 1)
+ # the body may be delimited by {...} or (...)
+ if (c == "{") {
+ opener = "{"
+ closer = "}"
+ } else if (c == "(") {
+ opener = "("
+ closer = ")"
+ } else
+ return i # stray @, not an entry
+
+ if (type == "comment")
+ return bib_skip_group(s, i, opener, closer)
+ if (type == "string" || type == "preamble") {
+ i = bib_skip_group(s, i, opener, closer)
+ bib_pass(bib_trim(substr(s, at, i - at)))
+ return i
+ }
+
+ i++ # consume opener
+ i = bib_ws(s, i)
+ # citation key: everything up to a comma, whitespace or a closer
+ key = ""
+ while (i <= length(s) && substr(s, i, 1) !~ /[, \t\r\n})]/) {
+ key = key substr(s, i, 1)
+ i++
+ }
+ i = bib_ws(s, i)
+ if (substr(s, i, 1) == ",")
+ i++
+
+ # fields: name = value pairs separated by commas, up to the closer
+ # (or the end of the text if the entry is never closed)
+ BIB_N = 0
+ while (1) {
+ i = bib_ws(s, i)
+ c = substr(s, i, 1)
+ if (c == "" || c == closer) {
+ if (c == closer)
+ i++
+ break
+ }
+ if (c == ",") {
+ i++
+ continue
+ }
+ name = ""
+ while (i <= length(s) && substr(s, i, 1) !~ /[=, \t\r\n})]/) {
+ name = name substr(s, i, 1)
+ i++
+ }
+ i = bib_ws(s, i)
+ if (substr(s, i, 1) != "=") { # malformed; skip a char and resync
+ i++
+ continue
+ }
+ i = bib_ws(s, i + 1)
+ i = bib_value(s, i)
+ # record the field; names are stored lowercase
+ BIB_N++
+ BIB_NAME[BIB_N] = tolower(name)
+ BIB_VAL[BIB_N] = BIB_VALUE
+ BIB_KIND[BIB_N] = BIB_VKIND
+ }
+ BIB_RAW = bib_trim(substr(s, at, i - at))
+ bib_entry(type, key)
+ return i
+}
diff --git a/lib/bib-select.awk b/lib/bib-select.awk
new file mode 100644
index 0000000..1900390
--- /dev/null
+++ b/lib/bib-select.awk
@@ -0,0 +1,29 @@
+# bib-select.awk - emit entries selected by key, canonically
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+# keys - comma-separated list of entry keys
+# invert - 0: emit entries whose key is in the list
+# 1: emit entries whose key is NOT in the list
+#
+# With keys="" and invert=1 this acts as a canonicalizing filter for
+# everything. @string and @preamble blocks always pass through.
+
+# Turn the comma-separated `keys` variable into the lookup set BIB_SEL.
+BEGIN {
+    bib_sel_n = split(keys, bib_sel_k, ",")
+    bib_sel_i = 1
+    while (bib_sel_i <= bib_sel_n) {
+        BIB_SEL[bib_sel_k[bib_sel_i]] = 1
+        bib_sel_i++
+    }
+}
+
+# @string and @preamble blocks are always emitted, unchanged, with one
+# blank line between consecutive output blocks.
+function bib_pass(raw) {
+    if (bib_out_n != 0)
+        print ""
+    bib_out_n++
+    print raw
+}
+
+# Emit the entry canonically when its membership in the key list differs
+# from `invert`: listed keys with invert=0, unlisted keys with invert=1.
+function bib_entry(type, key,    listed) {
+    listed = (key in BIB_SEL) ? 1 : 0
+    if (listed == invert + 0)
+        return
+    if (bib_out_n++)
+        print ""
+    bib_emit(type, key)
+}
diff --git a/lib/bib2ref.awk b/lib/bib2ref.awk
new file mode 100644
index 0000000..4d9e595
--- /dev/null
+++ b/lib/bib2ref.awk
@@ -0,0 +1,52 @@
+# bib2ref.awk - convert bibtex entries to refer records
+#
+# Requires bib-parse.awk and bib-canon.awk.
+
+# @string and @preamble blocks have no refer equivalent
+function bib_pass(raw) {
+}
+
+# Print one refer line "%<tag> <value>" unless v is empty. Braces are
+# removed, whitespace runs collapsed and the ends trimmed.
+function r_field(tag, v) {
+    if (v == "")
+        return
+    gsub(/[{}]/, "", v)
+    gsub(/[ \t\r\n]+/, " ", v)
+    v = bib_trim(v)
+    printf "%%%s %s\n", tag, v
+}
+
+# Print one "%<tag> <name>" line for every name in the bibtex name list
+# v (names separated by "and"); blank names are skipped.
+function r_names(tag, v,    count, who, k, one) {
+    gsub(/[{}]/, "", v)
+    gsub(/[ \t\r\n]+/, " ", v)
+    count = split(v, who, / +[Aa][Nn][Dd] +/)
+    for (k = 1; k <= count; k++) {
+        one = bib_trim(who[k])
+        if (one == "")
+            continue
+        printf "%%%s %s\n", tag, one
+    }
+}
+
+# Write the current entry as one refer record, records separated by a
+# blank line. Field mapping: author %A, editor %E, title %T, journal %J,
+# booktitle %B, month and year %D, volume %V, number %N, pages %P,
+# publisher (else institution, else school) %I, address %C, keywords %K,
+# abstract %X, note %O.
+function bib_entry(type, key,    date, month, pages, issuer) {
+    if (bib_out_n++)
+        print ""
+
+    r_names("A", bib_get("author"))
+    r_names("E", bib_get("editor"))
+    r_field("T", bib_get("title"))
+    r_field("J", bib_get("journal"))
+    r_field("B", bib_get("booktitle"))
+
+    # date is "month year", or whichever of the two is present
+    date = bib_get("year")
+    month = bib_get("month")
+    if (month != "")
+        date = (date == "") ? month : month " " date
+    r_field("D", date)
+
+    r_field("V", bib_get("volume"))
+    r_field("N", bib_get("number"))
+
+    # bibtex page ranges use "--", refer a single dash
+    pages = bib_get("pages")
+    gsub(/--/, "-", pages)
+    r_field("P", pages)
+
+    # first non-empty of publisher, institution, school
+    issuer = bib_get("publisher")
+    if (issuer == "")
+        issuer = bib_get("institution")
+    if (issuer == "")
+        issuer = bib_get("school")
+    r_field("I", issuer)
+
+    r_field("C", bib_get("address"))
+    r_field("K", bib_get("keywords"))
+    r_field("X", bib_get("abstract"))
+    r_field("O", bib_get("note"))
+}
diff --git a/lib/ref2bib.awk b/lib/ref2bib.awk
new file mode 100644
index 0000000..422fdd7
--- /dev/null
+++ b/lib/ref2bib.awk
@@ -0,0 +1,107 @@
+# ref2bib.awk - convert refer records to bibtex entries
+#
+# Standalone (does not use bib-parse.awk). Records are separated by
+# blank lines. Output keys are FIXME; pipe through bib-key.
+
+BEGIN {
+	# each line of a record is one field ...
+	FS = "\n"
+	# ... and records are paragraphs separated by blank lines
+	RS = ""
+}
+
+# r_trim(t) - return t without leading/trailing spaces, tabs and CRs.
+function r_trim(t) {
+	gsub(/^[ \t\r]+|[ \t\r]+$/, "", t)
+	return t
+}
+
+# r_emit(name, v) - print the bibtex field line "name = {v}," unless v is
+# empty.
+function r_emit(name, v) {
+	if (v == "")
+		return
+	printf " %s = {%s},\n", name, v
+}
+
+# Main rule: turn one refer record (a blank-line-delimited paragraph) into
+# one bibtex entry with the placeholder key FIXME.  Records that have no
+# author, no editor and no title are skipped.
+{
+	split("", val)
+	split("", A)
+	split("", E)
+	na = 0
+	ne = 0
+	lasttag = ""
+	for (i = 1; i <= NF; i++) {
+		line = $i
+		if (substr(line, 1, 1) == "%") {
+			tag = substr(line, 2, 1)
+			v = r_trim(substr(line, 3))
+			if (tag == "A")
+				A[++na] = v
+			else if (tag == "E")
+				E[++ne] = v
+			else
+				val[tag] = v
+			lasttag = tag
+		} else if (lasttag == "A")
+			# continuation line: append to the field it follows
+			A[na] = A[na] " " r_trim(line)
+		else if (lasttag == "E")
+			E[ne] = E[ne] " " r_trim(line)
+		else if (lasttag != "")
+			val[lasttag] = val[lasttag] " " r_trim(line)
+	}
+	if (na == 0 && ne == 0 && !("T" in val))
+		next
+
+	# guess an entry type from the fields present
+	# (membership tests only: reading val[x] here would create val[x])
+	if ("J" in val)
+		type = "article"
+	else if ("B" in val)
+		type = (val["B"] ~ /[Pp]roceedings|[Cc]onference|[Ss]ymposium|[Ww]orkshop/) \
+			? "inproceedings" : "incollection"
+	else if ("R" in val)
+		type = "techreport"
+	else if ("I" in val)
+		type = "book"
+	else
+		type = "misc"
+
+	# Decide where %R (report number) goes, working from the values.
+	# Testing `"N" in val` after val["N"] has been read is useless: the
+	# read itself creates the element, so the test is always true and %R
+	# could never be used as the number.
+	number = val["N"]
+	note = val["O"]
+	if (val["R"] != "") {
+		if (number == "")
+			number = val["R"]
+		else if (note == "")
+			note = val["R"]
+		else
+			# keep a single note field; a second one would be a
+			# duplicate field in the bibtex entry
+			note = val["R"] "; " note
+	}
+
+	if (out_n++)
+		print ""
+	printf "@%s{FIXME,\n", type
+
+	authors = ""
+	for (i = 1; i <= na; i++)
+		authors = (i == 1) ? A[i] : authors " and " A[i]
+	r_emit("author", authors)
+	editors = ""
+	for (i = 1; i <= ne; i++)
+		editors = (i == 1) ? E[i] : editors " and " E[i]
+	r_emit("editor", editors)
+
+	r_emit("title", val["T"])
+	r_emit("journal", val["J"])
+	r_emit("booktitle", val["B"])
+
+	# %D: the first four-digit run is the year, the rest is the month
+	d = val["D"]
+	if (match(d, /[0-9][0-9][0-9][0-9]/)) {
+		r_emit("year", substr(d, RSTART, 4))
+		m = r_trim(substr(d, 1, RSTART - 1) substr(d, RSTART + 4))
+		if (m != "")
+			r_emit("month", m)
+	} else
+		r_emit("year", d)
+
+	r_emit("volume", val["V"])
+	r_emit("number", number)
+	# bibtex page ranges use a double dash
+	p = val["P"]
+	gsub(/-+/, "--", p)
+	r_emit("pages", p)
+	r_emit((type == "techreport") ? "institution" : "publisher", val["I"])
+	r_emit("address", val["C"])
+	r_emit("keywords", val["K"])
+	r_emit("abstract", val["X"])
+	r_emit("note", note)
+	print "}"
+}
diff --git a/tests/integration.sh b/tests/integration.sh
new file mode 100755
index 0000000..ea847e4
--- /dev/null
+++ b/tests/integration.sh
@@ -0,0 +1,123 @@
+#!/bin/sh
+# integration.sh - end-to-end test against a real LaTeX document
+#
+# Requires pdflatex and bibtex; skipped otherwise. Set BIBTEST_NET=1 to
+# also exercise bib-fetch against doi.org (needs network access).
+
+# Run the tools from the checkout that contains this script.
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+PATH=$ROOT:$PATH
+LSKEYS="awk -f $ROOT/lib/bib-parse.awk -f $ROOT/lib/bib-lskeys.awk"
+
+# Both TeX tools are required; without them the run is skipped, not failed.
+if ! command -v pdflatex > /dev/null 2>&1 ||
+	! command -v bibtex > /dev/null 2>&1
+then
+	printf 'integration: pdflatex/bibtex not found, skipping\n' >&2
+	exit 0
+fi
+
+# Work in a scratch directory that is removed on exit.  INT and TERM are
+# turned into an exit so that they actually stop the run; a handler that
+# only removed the directory would let the script continue in a deleted
+# working directory.  The EXIT trap then performs the cleanup.
+tmpd=$(mktemp -d) || exit 1
+trap 'rm -rf "$tmpd"' EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+cd "$tmpd" || exit 1
+
+pass=0
+fail=0
+ok() { pass=$((pass + 1)); printf 'ok - %s\n' "$1"; }
+not_ok() { fail=$((fail + 1)); printf 'FAIL - %s\n' "$1"; }
+
+# ---- build a database with bib-gen | bib-add ---------------------------
+bib-gen -t article author='Donald E. Knuth' title='Literate Programming' \
+ journal='The Computer Journal' year=1984 volume=27 number=2 \
+ pages='97--111' | bib-add master.bib
+bib-gen -t article author='Alan M. Turing' \
+ title='Computing Machinery and Intelligence' journal='Mind' year=1950 \
+ volume=59 pages='433--460' | bib-add master.bib
+printf 'Claude E. Shannon\tA Mathematical Theory of Communication\tBell System Technical Journal\t1948
+Edsger W. Dijkstra\tGo To Statement Considered Harmful\tCommunications of the ACM\t1968
+' | bib-gen -F author,title,journal,year | bib-add master.bib
+
+n=$($LSKEYS master.bib | wc -l)
+[ "$n" -eq 4 ] && ok "database built with 4 entries" \
+ || not_ok "database built with 4 entries (got $n)"
+
+# ---- compile a document citing a subset --------------------------------
+cat > paper.tex <<'EOF'
+\documentclass{article}
+\begin{document}
+Machines may think~\cite{turing1950computing}; programs are
+literature~\cite{knuth1984literate}.
+
+DOI: 10.1093/comjnl/27.2.97
+\bibliographystyle{plain}
+\bibliography{master}
+\end{document}
+EOF
+pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1
+
+# if/else rather than `grep && ok || not_ok`, which would also report a
+# failure whenever ok itself returned non-zero
+if grep -q 'citation{turing1950computing}' paper.aux; then
+	ok "pdflatex produced citations in aux"
+else
+	not_ok "pdflatex produced citations in aux"
+fi
+
+# ---- extract the cited subset and build against it ---------------------
+bib-extract paper.aux master.bib > paper.bib
+n=$($LSKEYS paper.bib | wc -l)
+if [ "$n" -eq 2 ]; then
+	ok "bib-extract kept the 2 cited entries"
+else
+	not_ok "bib-extract kept the 2 cited entries (got $n)"
+fi
+
+# point the aux file at the extracted database before running bibtex
+sed 's/\\bibdata{master}/\\bibdata{paper}/' paper.aux > tmp.aux \
+	&& mv tmp.aux paper.aux
+bibtex paper > bibtex.log 2>&1
+# -E: alternation (\|) is not part of POSIX basic regular expressions
+if grep -Eqi 'error|warning' bibtex.log; then
+	not_ok "bibtex accepts canonical output cleanly"
+else
+	ok "bibtex accepts canonical output cleanly"
+fi
+
+# two more passes so the citations get resolved
+pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1
+pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1
+if grep -qi 'undefined' paper.log; then
+	not_ok "document resolves all citations"
+else
+	ok "document resolves all citations"
+fi
+if [ -s paper.pdf ]; then
+	ok "pdf produced"
+else
+	not_ok "pdf produced"
+fi
+
+# ---- convert roundtrip --------------------------------------------------
+bib-convert master.bib | bib-convert > roundtrip.bib
+keys_before=$($LSKEYS master.bib | sort)
+keys_after=$($LSKEYS roundtrip.bib | sort)
+if [ "$keys_before" != "$keys_after" ]; then
+	not_ok "bibtex -> refer -> bibtex preserves all keys"
+else
+	ok "bibtex -> refer -> bibtex preserves all keys"
+fi
+
+# ---- bib-fetch against the built pdf (network) --------------------------
+# Checks use if/else rather than `cmd && ok || not_ok`, which would also
+# report a failure whenever ok itself returned non-zero.
+if [ "$BIBTEST_NET" = 1 ]; then
+	if bib-fetch paper.pdf > fetched.bib 2> /dev/null; then
+		if grep -q '^@article{knuth1984literate,' fetched.bib; then
+			ok "bib-fetch resolves DOI from built pdf"
+		else
+			not_ok "bib-fetch resolves DOI from built pdf"
+		fi
+		# the entry is already in master.bib, so adding it must fail
+		if bib-fetch paper.pdf 2> /dev/null | bib-add master.bib 2> /dev/null; then
+			not_ok "fetched entry detected as duplicate"
+		else
+			ok "fetched entry detected as duplicate"
+		fi
+	else
+		not_ok "bib-fetch resolves DOI from built pdf"
+	fi
+	if bib-fetch -a 1706.03762 2> /dev/null \
+		| grep -q '^@misc{vaswani[0-9]*attention,'
+	then
+		ok "bib-fetch resolves arXiv id"
+	else
+		not_ok "bib-fetch resolves arXiv id"
+	fi
+	cat > arx.tex <<'EOF'
+\documentclass{article}
+\begin{document}
+A preprint without any DOI.
+
+arXiv:1706.03762v7 [cs.CL] 2 Aug 2023
+\end{document}
+EOF
+	pdflatex -interaction=batchmode arx.tex > /dev/null 2>&1
+	if bib-fetch arx.pdf 2> /dev/null | grep -q 'eprint = {1706.03762}'; then
+		ok "bib-fetch extracts arXiv id from pdf"
+	else
+		not_ok "bib-fetch extracts arXiv id from pdf"
+	fi
+else
+	printf 'skip - bib-fetch network tests (set BIBTEST_NET=1 to enable)\n'
+fi
+
+printf '\n%d passed, %d failed\n' "$pass" "$fail"
+[ "$fail" -eq 0 ]
diff --git a/tests/run-tests.sh b/tests/run-tests.sh
new file mode 100755
index 0000000..70721db
--- /dev/null
+++ b/tests/run-tests.sh
@@ -0,0 +1,187 @@
+#!/bin/sh
+# run-tests.sh - test suite for bibutils
+
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+PATH=$ROOT:$PATH
+tmpd=$(mktemp -d) || exit 1
+trap 'rm -rf "$tmpd"' EXIT INT TERM
+
+pass=0
+fail=0
+
+# ok DESC - count and report a passing check
+ok() {
+	pass=$((pass + 1))
+	printf '%s\n' "ok - $1"
+}
+
+# not_ok DESC - count and report a failing check
+not_ok() {
+	fail=$((fail + 1))
+	printf '%s\n' "FAIL - $1"
+}
+
+# check description command... (passes if the command succeeds)
+check() {
+ desc=$1
+ shift
+ if "$@" > /dev/null 2>&1; then
+ ok "$desc"
+ else
+ not_ok "$desc"
+ fi
+}
+
+entry='@ARTICLE{ junk-key ,
+ AUTHOR = "Donald E. Knuth",
+ Title={Literate Programming},
+ JOURNAL = {The Computer Journal},
+ Year = 1984, volume={27},
+ pages = {97--111}
+}'
+
+# ---- bib-key ----------------------------------------------------------
+out=$(printf '%s\n' "$entry" | bib-key)
+check "bib-key generates surname-year-word key" \
+ sh -c "printf '%s' '$out' | grep -q '^@article{knuth1984literate,'"
+
+# key collisions get letter suffixes
+out=$(printf '@inproceedings{a, author={J. Smith}, title={Fast Trees}, year=2020}
+@article{b, author={J. Smith}, title={Fast Trees Extended}, year=2020}
+@article{c, author={J. Smith}, title={Fast Tree Methods}, year=2020}\n' | bib-key)
+check "bib-key disambiguates colliding keys" \
+ sh -c "printf '%s' '$out' | grep -q '{smith2020fast,' &&
+ printf '%s' '$out' | grep -q '{smith2020fastb,' &&
+ printf '%s' '$out' | grep -q '{smith2020fastc,'"
+
+# ---- canonicalization via bib-add -------------------------------------
+db=$tmpd/refs.bib
+printf '%s\n' "$entry" | bib-add "$db"
+check "bib-add creates database" test -s "$db"
+check "bib-add lowercases field names" grep -q ' author = {Donald E. Knuth},' "$db"
+check "bib-add collapses whitespace in values" \
+ grep -q ' title = {Literate Programming},' "$db"
+check "bib-add keeps bare numbers bare" grep -q ' year = 1984,' "$db"
+
+# duplicate detection: adding the same entry a second time must fail
+if ! printf '%s\n' "$entry" | bib-add "$db" 2> /dev/null; then
+	ok "bib-add rejects duplicate key"
+else
+	not_ok "bib-add rejects duplicate key"
+fi
+
+# forced replacement
+printf '%s\n' "$entry" | sed 's/1984/1985/' | bib-add -f "$db"
+check "bib-add -f replaces entry" grep -q ' year = 1985,' "$db"
+# if/else rather than `test && ok || not_ok`, which would also report a
+# failure whenever ok itself returned non-zero
+n=$(grep -c '^@article{junk-key,' "$db")
+if [ "$n" = 1 ]; then
+	ok "bib-add -f leaves one copy"
+else
+	not_ok "bib-add -f leaves one copy"
+fi
+
+# ---- bib-extract -------------------------------------------------------
+cat > "$tmpd/all.bib" <<'EOF'
+@article{alpha2020one, author = {A. Alpha}, title = {One}, year = 2020}
+@article{beta2021two, author = {B. Beta}, title = {Two}, year = 2021}
+@article{gamma2022three, author = {C. Gamma}, title = {Three}, year = 2022}
+EOF
+cat > "$tmpd/doc.aux" <<'EOF'
+\relax
+\citation{alpha2020one}
+\citation{gamma2022three,alpha2020one}
+\bibstyle{plain}
+EOF
+# Output and pattern are passed to `sh -c` as arguments, not spliced into
+# the script text, so quotes in the output cannot break the check.
+out=$(bib-extract "$tmpd/doc.aux" "$tmpd/all.bib")
+check "bib-extract keeps cited entries" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" alpha2020one
+check "bib-extract keeps all cited entries" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" gamma2022three
+if printf '%s' "$out" | grep -q beta2021two; then
+	not_ok "bib-extract drops uncited entries"
+else
+	ok "bib-extract drops uncited entries"
+fi
+
+# ---- bib-convert -------------------------------------------------------
+# Output and pattern are passed to `sh -c` as arguments, not spliced into
+# the script text, so quotes in the output cannot break the check.
+out=$(printf '%s\n' "$entry" | bib-convert)
+check "bib-convert emits refer author" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" \
+	'^%A Donald E. Knuth$'
+check "bib-convert emits refer pages with single dash" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" \
+	'^%P 97-111$'
+
+cat > "$tmpd/rec.ref" <<'EOF'
+%A Alan M. Turing
+%T Computing Machinery and Intelligence
+%J Mind
+%D 1950
+%V 59
+%P 433-460
+EOF
+out=$(bib-convert "$tmpd/rec.ref")
+check "bib-convert refer->bibtex type guess" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" \
+	'^@article{turing1950computing,'
+check "bib-convert refer->bibtex pages" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" \
+	' pages = {433--460},'
+
+# ---- bib-gen -----------------------------------------------------------
+# Output and pattern are passed to `sh -c` as arguments, not spliced into
+# the script text, so quotes in the output cannot break the check.
+out=$(bib-gen -t book author='Xavier Yu' title='Some Title' year=2001 publisher='Pub')
+check "bib-gen argument mode" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" \
+	'^@book{yu2001some,'
+
+out=$(printf 'A. Author\tNeat Paper\tGood Journal\t1999\n' \
+	| bib-gen -F author,title,journal,year)
+check "bib-gen batch mode" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" \
+	'^@article{author1999neat,'
+
+# ---- bib-ls ------------------------------------------------------------
+out=$(bib-ls "$tmpd/all.bib")
+check "bib-ls lists keys" \
+ sh -c "[ \"\$(printf '%s\n' '$out' | wc -l)\" = 3 ]"
+out=$(bib-ls -l "$tmpd/all.bib")
+check "bib-ls -l shows details" \
+ sh -c "printf '%s' '$out' | grep -q 'beta2021two article B. Beta 2021 Two'"
+
+# ---- bib-check ---------------------------------------------------------
+cat > "$tmpd/bad.bib" <<'EOF'
+@article{good2020fine, author = {A. Good}, title = {Fine}, journal = {J}, year = 2020}
+@article{noj2020sad, author = {B. Sad}, title = {No Journal Here}, year = 2020}
+@misc{noj2020sad, title = {Dup Key}}
+@book{dup2021title, author = {C. Dup}, title = {FINE!}, publisher = {P}, year = 2021}
+EOF
+# $? below is the exit status of bib-check (the assignment passes it on)
+out=$(bib-check "$tmpd/bad.bib")
+if [ $? -ne 0 ]; then ok "bib-check exits nonzero on problems"; else not_ok "bib-check exits nonzero on problems"; fi
+# Output and pattern are passed to `sh -c` as arguments, not spliced into
+# the script text, so quotes in a diagnostic cannot break the check.
+check "bib-check finds missing field" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" \
+	'noj2020sad: missing required field: journal'
+check "bib-check finds duplicate key" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" \
+	'noj2020sad: duplicate key'
+check "bib-check finds duplicate title" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" \
+	'dup2021title: title duplicates good2020fine'
+cat > "$tmpd/clean.bib" <<'EOF'
+@article{a2020x, author = {A. A}, title = {X}, journal = {J}, year = 2020}
+@misc{b2021y, title = {Y}}
+EOF
+check "bib-check passes a clean db" bib-check "$tmpd/clean.bib"
+
+# ---- biblatex aux ------------------------------------------------------
+cat > "$tmpd/bl.aux" <<'EOF'
+\abx@aux@refcontext{nty/global//global/global}
+\abx@aux@cite{0}{beta2021two}
+EOF
+# Output and pattern are passed to `sh -c` as arguments, not spliced into
+# the script text, so quotes in the output cannot break the check.
+out=$(bib-extract "$tmpd/bl.aux" "$tmpd/all.bib")
+check "bib-extract reads biblatex aux" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" beta2021two
+
+# ---- bib-util ----------------------------------------------------------
+# Output and pattern are passed to `sh -c` as arguments, not spliced into
+# the script text, so quotes in the output cannot break the check.
+out=$(printf '%s\n' "$entry" | bib-util key)
+check "bib-util dispatches" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" knuth1984literate
+
+# ---- @string passthrough -----------------------------------------------
+cat > "$tmpd/str.bib" <<'EOF'
+@string{cj = {The Computer Journal}}
+@article{knuth1984literate, author = {D. Knuth}, journal = cj, year = 1984}
+EOF
+printf '\\citation{knuth1984literate}\n' > "$tmpd/s.aux"
+out=$(bib-extract "$tmpd/s.aux" "$tmpd/str.bib")
+# Output and pattern are passed to `sh -c` as arguments, not spliced into
+# the script text, so quotes in the output cannot break the check.
+check "bib-extract passes @string through" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" '@string{cj'
+check "macro field stays raw" \
+	sh -c 'printf "%s" "$1" | grep -q -e "$2"' sh "$out" ' journal = cj,'
+
+printf '\n%d passed, %d failed\n' "$pass" "$fail"
+[ "$fail" -eq 0 ]