Initial implementation (only a few years later!)

This is pure Claude. I'd written out the plan for this suite of scripts eons ago, but never found the time to actually do it. I remembered it this morning, pointed Claude at the README, and had something that appears to work in minutes. Caveat emptor: the design is mine, but the code is purely LLM-generated at this point.
author: Douglas B. Rumbaugh <doug@douglasrumbaugh.com> 2026-06-06 12:02:41 -0400
committer: Douglas B. Rumbaugh <doug@douglasrumbaugh.com> 2026-06-06 12:02:41 -0400
commit: eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2 (patch)
tree: 626d64c3574cfbc7cc38eae6d142ef22b21cf59b
parent: 8351a1da3f56cde9939b934bc5533a95aff1c95e (diff)
download: bibutils-eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2.tar.gz
24 files changed, 1465 insertions, 3 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d38c149
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.swp
+*~
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..53b781e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2026, Douglas B. Rumbaugh
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..079df9f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,27 @@
+PREFIX = /usr/local
+BINDIR = $(PREFIX)/bin
+LIBDIR = $(PREFIX)/share/bibutils
+# DESTDIR is empty unless given (make DESTDIR=/tmp/pkg install); it stages the install under another root
+SCRIPTS = bib-util bib-add bib-check bib-convert bib-extract bib-fetch \
+          bib-gen bib-key bib-ls
+LIBS = lib/bib-parse.awk lib/bib-canon.awk lib/bib-select.awk \
+       lib/bib-lskeys.awk lib/bib-key.awk lib/bib-ls.awk \
+       lib/bib-check.awk lib/bib2ref.awk lib/ref2bib.awk
+
+all:
+	@echo "nothing to build; run 'make test' or 'make install'"
+
+test:
+	tests/run-tests.sh
+	tests/integration.sh
+
+install:
+	mkdir -p $(DESTDIR)$(BINDIR) $(DESTDIR)$(LIBDIR)
+	cp $(SCRIPTS) $(DESTDIR)$(BINDIR)
+	cp $(LIBS) $(DESTDIR)$(LIBDIR)
+
+uninstall:
+	cd $(DESTDIR)$(BINDIR) && rm -f $(SCRIPTS)
+	rm -rf $(DESTDIR)$(LIBDIR)
+
+.PHONY: all test install uninstall
diff --git a/README.md b/README.md
index d692649..53ccd48 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,19 @@ formatted bibtex entry will be emitted on stdout.
 
 ## bib-extract 
 A script which filters a bibtex database provided on stdin or as an argument
-and emits only those entries contained within a specified aux file.
+and emits only those entries contained within a specified aux file. Both
+classic bibtex and biblatex/biber aux files are understood, and \nocite{*}
+selects the whole database. (roff citation sources are planned but not yet
+supported.)
+
+## bib-ls
+List the entries in a database, one key per line, or with -l as
+tab-separated key, type, author, year and title.
+
+## bib-check
+Lint a database: reports missing required fields, duplicate keys,
+duplicate titles (likely duplicated entries) and empty field values.
+Exits nonzero if any problem was found.
 
 ## bib-key
 A script which accepts a bibtex entry on stdin, and emits it on stdout with
@@ -29,5 +41,32 @@ an automatically generated bibtex key.
 
 ## bib-fetch
 A script which accepts a pdf file as an input argument and will attempt to
-fetch a corresponding bibtex entry from crossref.org based on its DOI, if
-one is available.
+fetch a corresponding bibtex entry based on its DOI (via crossref.org) or,
+failing that, its arXiv id (via arxiv.org). An identifier can also be given
+directly with -d (DOI) or -a (arXiv id).
+
+## bib-convert
+Convert between bibtex and refer database formats. The direction is detected
+automatically from the input, or can be forced with -b (to bibtex) or -r
+(to refer).
+
+# Canonical form
+Entries that pass through these tools are canonicalized: lowercase entry
+types and field names, 2-space indentation, brace-delimited values with
+internal whitespace collapsed, bare numbers left bare, and macro
+references/concatenations preserved verbatim. @string and @preamble blocks
+pass through untouched.
+
+# Installation
+    make install            # PREFIX=/usr/local by default
+
+The scripts look for the shared awk library in $BIBUTILS_LIB, then in
+lib/ next to the script, then in /usr/local/share/bibutils. If installing
+with a non-default PREFIX, set BIBUTILS_LIB accordingly.
+
+# Dependencies
+POSIX shell, awk and core utilities (plus mktemp), with two exceptions:
+bib-fetch requires curl, and pdftotext (poppler) to read ids from pdfs.
+
+# Tests
+    make test
diff --git a/bib-add b/bib-add
new file mode 100755
index 0000000..28ebf82
--- /dev/null
+++ b/bib-add
@@ -0,0 +1,67 @@
+#!/bin/sh
+# bib-add - insert bibtex entries from stdin into a database file
+#
+# usage: bib-add [-f] db.bib < entry
+#   -f  replace existing entries with the same key
+
+usage() {
+  printf 'usage: bib-add [-f] db.bib < entry\n' >&2
+  exit 2
+}
+
+if [ -n "$BIBUTILS_LIB" ]; then
+  LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+  LIB=$(dirname "$0")/lib
+else
+  LIB=/usr/local/share/bibutils
+fi
+
+force=0
+while getopts f opt; do
+  case $opt in
+    f) force=1 ;;
+    *) usage ;;
+  esac
+done
+shift $((OPTIND - 1))
+[ $# -eq 1 ] || usage
+db=$1
+
+tmp= tmpkeys= tmpdb=  # empty first, so cleanup can be armed before mktemp; signals exit, which runs it
+trap 'rm -f "$tmp" "$tmpkeys" "$tmpdb"' EXIT; trap 'exit 1' HUP INT TERM
+tmp=$(mktemp) && tmpkeys=$(mktemp) && tmpdb=$(mktemp) || exit 1
+# canonicalize the incoming entries
+awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" -f "$LIB/bib-select.awk" \
+    -v keys= -v invert=1 > "$tmp"
+
+if [ ! -s "$tmp" ]; then
+  printf 'bib-add: no entries on stdin\n' >&2
+  exit 1
+fi
+
+awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$tmp" > "$tmpkeys"
+
+if [ -f "$db" ]; then
+  dups=$(awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$db" \
+      | grep -Fxf "$tmpkeys") || dups=
+  if [ -n "$dups" ]; then
+    if [ "$force" -eq 1 ]; then
+      # rewrite the database without the entries being replaced
+      keys=$(printf '%s\n' "$dups" | paste -sd, -)
+      awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+          -f "$LIB/bib-select.awk" -v keys="$keys" -v invert=1 \
+          "$db" > "$tmpdb" || exit 1
+      cp "$tmpdb" "$db"
+    else
+      printf 'bib-add: duplicate keys in %s:\n' "$db" >&2
+      printf '%s\n' "$dups" >&2
+      exit 1
+    fi
+  fi
+fi
+
+{
+  [ -s "$db" ] && echo ""
+  cat "$tmp"
+} >> "$db"
diff --git a/bib-check b/bib-check
new file mode 100755
index 0000000..062e157
--- /dev/null
+++ b/bib-check
@@ -0,0 +1,18 @@
+#!/bin/sh
+# bib-check - lint a bibtex database
+#
+# usage: bib-check [file ...]   (stdin if no file given)
+#
+# Reports missing required fields, duplicate keys, duplicate titles and
+# empty field values. Exits nonzero if any problem was found.
+
+if [ -n "$BIBUTILS_LIB" ]; then
+  LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+  LIB=$(dirname "$0")/lib
+else
+  LIB=/usr/local/share/bibutils
+fi
+
+exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+         -f "$LIB/bib-check.awk" "$@"
diff --git a/bib-convert b/bib-convert
new file mode 100755
index 0000000..ef4c0b0
--- /dev/null
+++ b/bib-convert
@@ -0,0 +1,56 @@
+#!/bin/sh
+# bib-convert - convert between bibtex and refer database formats
+#
+# usage: bib-convert [-b | -r] [file]   (stdin if no file given)
+#   -b  force refer -> bibtex
+#   -r  force bibtex -> refer
+#
+# Without a flag the direction is detected from the input: text whose
+# first record starts with @ is taken as bibtex, with % as refer.
+
+usage() {
+  printf 'usage: bib-convert [-b | -r] [file]\n' >&2
+  exit 2
+}
+
+if [ -n "$BIBUTILS_LIB" ]; then
+  LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+  LIB=$(dirname "$0")/lib
+else
+  LIB=/usr/local/share/bibutils
+fi
+
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+mode=auto
+while getopts br opt; do
+  case $opt in
+    b) mode=tobib ;;
+    r) mode=toref ;;
+    *) usage ;;
+  esac
+done
+shift $((OPTIND - 1))
+[ $# -le 1 ] || usage
+
+tmp=$(mktemp) || exit 1
+trap 'rm -f "$tmp"' EXIT; trap 'exit 1' HUP INT TERM  # a signal must stop the script, not only clean up
+cat "$@" > "$tmp" || exit 1                           # cat has already reported the unreadable input
+
+if [ "$mode" = auto ]; then
+  first=$(awk 'NF { sub(/^[ \t]+/, ""); print substr($0, 1, 1); exit }' "$tmp")
+  case $first in
+    @) mode=toref ;;
+    %) mode=tobib ;;
+    *) printf 'bib-convert: cannot detect input format\n' >&2; exit 1 ;;
+  esac
+fi
+
+if [ "$mode" = toref ]; then
+  # no exec here: replacing the shell would skip the EXIT trap and leak $tmp
+  awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" -f "$LIB/bib2ref.awk" "$tmp"
+else
+  awk -f "$LIB/ref2bib.awk" "$tmp" | "$bibkey"
+fi
diff --git a/bib-extract b/bib-extract
new file mode 100755
index 0000000..52aa85b
--- /dev/null
+++ b/bib-extract
@@ -0,0 +1,60 @@
+#!/bin/sh
+# bib-extract - emit only the database entries cited in an aux file
+#
+# usage: bib-extract file.aux [db.bib]   (db on stdin if omitted)
+#
+# roff/refer citation sources are planned but not yet supported.
+
+usage() {
+  printf 'usage: bib-extract file.aux [db.bib]\n' >&2
+  exit 2
+}
+
+if [ -n "$BIBUTILS_LIB" ]; then
+  LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+  LIB=$(dirname "$0")/lib
+else
+  LIB=/usr/local/share/bibutils
+fi
+
+[ $# -ge 1 ] && [ $# -le 2 ] || usage
+aux=$1
+shift
+[ -r "$aux" ] || { printf 'bib-extract: cannot read %s\n' "$aux" >&2; exit 1; }
+
+keys=$(awk '
+  # classic bibtex: \citation{key,key,...}
+  {
+    line = $0
+    while (match(line, /\\citation\{[^}]*\}/)) {
+      n = split(substr(line, RSTART + 10, RLENGTH - 11), a, ",")
+      for (i = 1; i <= n; i++)
+        if (a[i] != "")
+          print a[i]
+      line = substr(line, RSTART + RLENGTH)
+    }
+  }
+  # biblatex/biber: \abx@aux@cite{segment}{key} (older: one argument)
+  {
+    line = $0
+    while (match(line, /\\abx@aux@cite(\{[0-9]*\})?\{[^}]*\}/)) {
+      s = substr(line, RSTART, RLENGTH)
+      sub(/\}$/, "", s)
+      sub(/^.*\{/, "", s)
+      if (s != "")
+        print s
+      line = substr(line, RSTART + RLENGTH)
+    }
+  }' "$aux" | sort -u | paste -sd, -)
+
+[ -n "$keys" ] || exit 0
+
+# \nocite{*} cites everything: emit the whole database
+case ",$keys," in
+  *,\*,*) keys= invert=1 ;;
+  *) invert=0 ;;
+esac
+
+exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+         -f "$LIB/bib-select.awk" -v keys="$keys" -v invert="$invert" "$@"
diff --git a/bib-fetch b/bib-fetch
new file mode 100755
index 0000000..62f7993
--- /dev/null
+++ b/bib-fetch
@@ -0,0 +1,82 @@
+#!/bin/sh
+# bib-fetch - fetch a bibtex entry for a pdf from its DOI or arXiv id
+#
+# usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]
+#
+# Unless given with -d or -a, an identifier is extracted from the first
+# pages of the pdf (requires pdftotext): a DOI if one is found, falling
+# back to an arXiv id. DOIs are resolved through doi.org content
+# negotiation (crossref et al.), arXiv ids through arxiv.org. The entry
+# is emitted canonically on stdout with a generated key.
+
+usage() {
+  printf 'usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]\n' >&2
+  exit 2
+}
+
+doi=
+arxiv=
+while getopts d:a: opt; do
+  case $opt in
+    d) doi=$OPTARG ;;
+    a) arxiv=$OPTARG ;;
+    *) usage ;;
+  esac
+done
+shift $((OPTIND - 1))
+[ -n "$doi" ] && [ -n "$arxiv" ] && usage
+
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+command -v curl > /dev/null 2>&1 || {
+  printf 'bib-fetch: curl is required\n' >&2
+  exit 1
+}
+
+if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+  [ $# -eq 1 ] || usage
+  pdf=$1
+  [ -r "$pdf" ] || { printf 'bib-fetch: cannot read %s\n' "$pdf" >&2; exit 1; }
+  command -v pdftotext > /dev/null 2>&1 || {
+    printf 'bib-fetch: pdftotext is required to extract an identifier\n' >&2
+    exit 1
+  }
+  ids=$(pdftotext -l 2 "$pdf" - 2> /dev/null | awk '
+    # first DOI-shaped string and first arXiv stamp found on any line; the shell code below prefers the DOI
+    doi == "" {
+      if (match($0, /10\.[0-9][0-9][0-9][0-9][0-9]*\/[^ \t"<>]+/)) {
+        doi = substr($0, RSTART, RLENGTH)
+        sub(/[.,;)\]]+$/, "", doi)
+      }
+    }
+    arxiv == "" {
+      # modern ids (arXiv:2104.01234v2) and old style (arXiv:cs/0101001)
+      if (match($0, /[aA][rR][xX][iI][vV]:[0-9][0-9][0-9][0-9]\.[0-9][0-9][0-9][0-9][0-9]?(v[0-9]+)?/) ||
+          match($0, /[aA][rR][xX][iI][vV]:[a-z-]+(\.[A-Z][A-Z])?\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9](v[0-9]+)?/))
+        arxiv = substr($0, RSTART + 6, RLENGTH - 6)
+    }
+    END { printf "%s\t%s\n", doi, arxiv }')
+  doi=${ids%%	*}
+  arxiv=${ids#*	}
+  if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+    printf 'bib-fetch: no DOI or arXiv id found in %s\n' "$pdf" >&2
+    exit 1
+  fi
+fi
+
+if [ -n "$doi" ]; then
+  entry=$(curl -sSfL -H 'Accept: application/x-bibtex' \
+      "https://doi.org/$doi") || {
+    printf 'bib-fetch: failed to fetch entry for doi %s\n' "$doi" >&2
+    exit 1
+  }
+else
+  arxiv=${arxiv#arXiv:}
+  entry=$(curl -sSfL "https://arxiv.org/bibtex/$arxiv") || {
+    printf 'bib-fetch: failed to fetch entry for arXiv id %s\n' "$arxiv" >&2
+    exit 1
+  }
+fi
+
+printf '%s\n' "$entry" | "$bibkey"
diff --git a/bib-gen b/bib-gen
new file mode 100755
index 0000000..0fdd63a
--- /dev/null
+++ b/bib-gen
@@ -0,0 +1,95 @@
+#!/bin/sh
+# bib-gen - generate a bibtex entry
+#
+# usage: bib-gen [-t type] [field=value ...]
+#        bib-gen [-t type] -F field,field,...   (tab-separated stdin)
+#
+# With field=value arguments, one entry is built from them. With -F,
+# one entry is built per tab-separated line of stdin, columns matching
+# the listed fields. Otherwise the user is prompted interactively.
+# Entries are emitted on stdout with generated keys.
+
+usage() {
+  printf 'usage: bib-gen [-t type] [field=value ...]\n' >&2
+  printf '       bib-gen [-t type] -F field,field,... < data\n' >&2
+  exit 2
+}
+
+type=article
+fmt=
+while getopts t:F: opt; do
+  case $opt in
+    t) type=$OPTARG ;;
+    F) fmt=$OPTARG ;;
+    *) usage ;;
+  esac
+done
+shift $((OPTIND - 1))
+
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+# fields prompted for in interactive mode, per entry type
+fields_for() {
+  case $1 in
+    article)       echo "author title journal year volume number pages month doi" ;;
+    book)          echo "author title publisher year volume series address edition" ;;
+    inproceedings|conference)
+                   echo "author title booktitle year editor pages publisher doi" ;;
+    incollection)  echo "author title booktitle publisher year editor pages chapter" ;;
+    techreport)    echo "author title institution year number address month" ;;
+    phdthesis|mastersthesis)
+                   echo "author title school year address month" ;;
+    *)             echo "author title year howpublished note url" ;;
+  esac
+}
+
+if [ -n "$fmt" ]; then
+  # batch mode: tab-separated values on stdin
+  awk -F '\t' -v fmt="$fmt" -v type="$type" '
+    BEGIN { nf = split(fmt, F, ",") }
+    NF {
+      printf "@%s{FIXME,\n", type
+      for (i = 1; i <= nf && i <= NF; i++)
+        if ($i != "")
+          printf "  %s = {%s},\n", F[i], $i
+      print "}"
+    }' | "$bibkey"
+  exit $?
+fi
+
+tmp=$(mktemp) || exit 1
+trap 'rm -f "$tmp"' EXIT; trap 'exit 1' HUP INT TERM  # a signal aborts instead of resuming without $tmp
+
+if [ $# -gt 0 ]; then
+  # argument mode: field=value pairs
+  for arg in "$@"; do
+    case $arg in
+      *=*) printf '%s\t%s\n' "${arg%%=*}" "${arg#*=}" >> "$tmp" ;;
+      *) usage ;;
+    esac
+  done
+else
+  # interactive mode
+  printf 'entry type [%s]: ' "$type" >&2
+  read -r ans || exit 1
+  [ -n "$ans" ] && type=$ans
+  for f in $(fields_for "$type"); do
+    printf '%s: ' "$f" >&2
+    read -r ans || break
+    [ -n "$ans" ] && printf '%s\t%s\n' "$f" "$ans" >> "$tmp"
+  done
+fi
+
+if [ ! -s "$tmp" ]; then
+  printf 'bib-gen: no fields given\n' >&2
+  exit 1
+fi
+
+{
+  printf '@%s{FIXME,\n' "$type"
+  while IFS='	' read -r name value; do
+    printf '  %s = {%s},\n' "$name" "$value"
+  done < "$tmp"
+  printf '}\n'
+} | "$bibkey"
diff --git a/bib-key b/bib-key
new file mode 100755
index 0000000..ff3c363
--- /dev/null
+++ b/bib-key
@@ -0,0 +1,15 @@
+#!/bin/sh
+# bib-key - read bibtex entries and emit them with generated keys
+#
+# usage: bib-key [file ...]   (stdin if no file given)
+
+if [ -n "$BIBUTILS_LIB" ]; then
+  LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+  LIB=$(dirname "$0")/lib
+else
+  LIB=/usr/local/share/bibutils
+fi
+
+exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+         -f "$LIB/bib-key.awk" "$@"
diff --git a/bib-ls b/bib-ls
new file mode 100755
index 0000000..0ed7236
--- /dev/null
+++ b/bib-ls
@@ -0,0 +1,30 @@
+#!/bin/sh
+# bib-ls - list the entries in a bibtex database
+#
+# usage: bib-ls [-l] [file ...]   (stdin if no file given)
+#   -l  long format: key, type, author, year, title (tab-separated)
+
+usage() {
+  printf 'usage: bib-ls [-l] [file ...]\n' >&2
+  exit 2
+}
+
+if [ -n "$BIBUTILS_LIB" ]; then
+  LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+  LIB=$(dirname "$0")/lib
+else
+  LIB=/usr/local/share/bibutils
+fi
+
+long=0
+while getopts l opt; do
+  case $opt in
+    l) long=1 ;;
+    *) usage ;;
+  esac
+done
+shift $((OPTIND - 1))
+
+exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+         -f "$LIB/bib-ls.awk" -v long="$long" "$@"
diff --git a/bib-util b/bib-util
new file mode 100755
index 0000000..e807b03
--- /dev/null
+++ b/bib-util
@@ -0,0 +1,28 @@
+#!/bin/sh
+# bib-util - wrapper dispatching to the individual bibutils scripts
+#
+# usage: bib-util command [args ...]
+
+usage() {
+  printf 'usage: bib-util command [args ...]\n' >&2
+  printf 'commands: add check convert extract fetch gen key ls\n' >&2
+  exit 2
+}
+
+[ $# -ge 1 ] || usage
+cmd=$1
+shift
+
+dir=$(dirname "$0")
+case $cmd in
+  add|check|convert|extract|fetch|gen|key|ls)
+    exec "$dir/bib-$cmd" "$@"
+    ;;
+  help|-h|--help)
+    usage
+    ;;
+  *)
+    printf 'bib-util: unknown command: %s\n' "$cmd" >&2
+    usage
+    ;;
+esac
diff --git a/lib/bib-canon.awk b/lib/bib-canon.awk
new file mode 100644
index 0000000..d11e9cb
--- /dev/null
+++ b/lib/bib-canon.awk
@@ -0,0 +1,28 @@
+# bib-canon.awk - canonical output helpers for bibutils
+#
+# Requires bib-parse.awk. Provides bib_emit() to print the current
+# entry in canonical form, and bib_get() to look up a field value.
+
+# print the current entry canonically: lowercase type and field names,
+# 2-space indent, brace-delimited values with whitespace collapsed
+function bib_emit(type, key,    j, v) {
+  printf "@%s{%s,\n", type, key
+  for (j = 1; j <= BIB_N; j++) {
+    v = BIB_VAL[j]
+    if (BIB_KIND[j] == "s") {
+      gsub(/[ \t\r\n]+/, " ", v)
+      v = bib_trim(v)
+      printf "  %s = {%s},\n", BIB_NAME[j], v
+    } else
+      printf "  %s = %s,\n", BIB_NAME[j], v
+  }
+  print "}"
+}
+
+# value of field `name` (lowercase) in the current entry, "" if absent
+function bib_get(name,    j) {
+  for (j = 1; j <= BIB_N; j++)
+    if (BIB_NAME[j] == name)
+      return BIB_VAL[j]
+  return ""
+}
diff --git a/lib/bib-check.awk b/lib/bib-check.awk
new file mode 100644
index 0000000..4411a55
--- /dev/null
+++ b/lib/bib-check.awk
@@ -0,0 +1,69 @@
+# bib-check.awk - lint a bibtex database
+#
+# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
+# line on stdout:
+#   - missing fields required by the entry type
+#   - duplicate keys
+#   - entries whose titles normalize to the same string (likely dups)
+#   - empty field values
+# Exits 1 if any problem was found.
+
+BEGIN {
+  REQ["article"] = "author title journal year"
+  REQ["book"] = "author|editor title publisher year"
+  REQ["booklet"] = "title"
+  REQ["inbook"] = "author|editor title publisher year"
+  REQ["incollection"] = "author title booktitle publisher year"
+  REQ["inproceedings"] = "author title booktitle year"
+  REQ["conference"] = "author title booktitle year"
+  REQ["manual"] = "title"
+  REQ["mastersthesis"] = "author title school year"
+  REQ["phdthesis"] = "author title school year"
+  REQ["proceedings"] = "title year"
+  REQ["techreport"] = "author title institution year"
+  REQ["unpublished"] = "author title note"
+}
+
+function bib_pass(raw) { }
+
+function problem(key, msg) {
+  printf "%s: %s\n", key, msg
+  BIB_BAD = 1
+}
+
+function bib_entry(type, key,    n, req, i, alts, na, j, found, t, k) {
+  if (key in BIB_KEYS_SEEN)
+    problem(key, "duplicate key")
+  BIB_KEYS_SEEN[key] = 1
+
+  # required fields ("a|b" means at least one of a, b)
+  if (type in REQ) {
+    n = split(REQ[type], req, " ")
+    for (i = 1; i <= n; i++) {
+      na = split(req[i], alts, "|")
+      found = 0
+      for (j = 1; j <= na; j++)
+        if (bib_get(alts[j]) != "")
+          found = 1
+      if (!found)
+        problem(key, "missing required field: " req[i])
+    }
+  }
+
+  # empty values
+  for (i = 1; i <= BIB_N; i++)
+    if (bib_trim(BIB_VAL[i]) == "")
+      problem(key, "empty field: " BIB_NAME[i])
+
+  # likely duplicate entries: same normalized title
+  t = tolower(bib_get("title"))
+  gsub(/[^a-z0-9]/, "", t)
+  if (t != "") {
+    if (t in BIB_TITLES_SEEN)
+      problem(key, "title duplicates " BIB_TITLES_SEEN[t])
+    else
+      BIB_TITLES_SEEN[t] = key
+  }
+}
+
+END { exit BIB_BAD }
diff --git a/lib/bib-key.awk b/lib/bib-key.awk
new file mode 100644
index 0000000..41534ba
--- /dev/null
+++ b/lib/bib-key.awk
@@ -0,0 +1,69 @@
+# bib-key.awk - rekey every entry with a generated citation key
+#
+# Requires bib-parse.awk and bib-canon.awk. Keys have the form
+# <surname><year><word>, e.g. knuth1984literate.
+
+function bib_pass(raw) {
+  if (bib_out_n++)
+    print ""
+  print raw
+}
+
+# hook: emit the entry under a generated key (the incoming key is discarded)
+function bib_entry(type, key,    k, n) {
+  if (bib_out_n++)
+    print ""
+  k = bib_mkkey()
+  # the first use keeps the bare key; repeats get b..z, then -27, -28, ...
+  # so the 27th and later collisions no longer reuse the bare key
+  n = ++BIB_KEYS_SEEN[k]
+  if (n > 1)
+    k = k ((n <= 26) ? substr("bcdefghijklmnopqrstuvwxyz", n - 1, 1) : "-" n)
+  bib_emit(type, k)
+}
+
+# key for the current entry: first author's (or editor's) surname + 4-digit year + first significant title word
+function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
+  a = bib_get("author")
+  if (a == "")
+    a = bib_get("editor")
+  y = bib_get("year")
+  t = bib_get("title")
+
+  # surname of the first author; whitespace is collapsed first because field
+  # values keep their source line breaks, which would hide " and " separators
+  gsub(/[ \t\r\n]+/, " ", a)
+  if (match(a, / [Aa][Nn][Dd] /))
+    a = substr(a, 1, RSTART - 1)
+  gsub(/[{}]/, "", a)
+  a = bib_trim(a)
+  if (index(a, ",") > 0)
+    surname = substr(a, 1, index(a, ",") - 1)
+  else {
+    n = split(a, parts, /[ \t]+/)
+    surname = (n > 0) ? parts[n] : ""
+  }
+  gsub(/[^A-Za-z0-9]/, "", surname)
+  surname = tolower(surname)
+  if (surname == "")
+    surname = "anon"
+
+  # four-digit year, or nothing when the field holds none
+  y = (match(y, /[0-9][0-9][0-9][0-9]/) ? substr(y, RSTART, 4) : "")
+
+  # first significant word of the title
+  gsub(/[{}]/, "", t)
+  word = ""
+  n = split(tolower(t), parts, /[^a-z0-9]+/)
+  for (i = 1; i <= n; i++) {
+    w = parts[i]
+    if (w == "" || w == "a" || w == "an" || w == "the" || w == "on" ||
+        w == "of" || w == "in" || w == "for" || w == "and" || w == "to" ||
+        w == "with" || w == "from" || w == "by" || w == "at" || w == "is")
+      continue
+    word = w
+    break
+  }
+
+  return surname y word
+}
diff --git a/lib/bib-ls.awk b/lib/bib-ls.awk
new file mode 100644
index 0000000..909b654
--- /dev/null
+++ b/lib/bib-ls.awk
@@ -0,0 +1,25 @@
+# bib-ls.awk - list database entries
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+#   long - 0: print one key per line
+#          1: print key, type, author, year and title, tab-separated
+
+function bib_pass(raw) { }
+
+function bib_entry(type, key,    a, t) {
+  if (long + 0 == 0) {
+    print key
+    return
+  }
+  a = bib_get("author")
+  if (a == "")
+    a = bib_get("editor")
+  gsub(/[{}]/, "", a)
+  gsub(/[ \t\r\n]+/, " ", a)
+  if (match(a, / [Aa][Nn][Dd] /))
+    a = substr(a, 1, RSTART - 1) " et al."
+  t = bib_get("title")
+  gsub(/[{}]/, "", t)
+  gsub(/[ \t\r\n]+/, " ", t)
+  printf "%s\t%s\t%s\t%s\t%s\n", key, type, a, bib_get("year"), t
+}
diff --git a/lib/bib-lskeys.awk b/lib/bib-lskeys.awk
new file mode 100644
index 0000000..1932ced
--- /dev/null
+++ b/lib/bib-lskeys.awk
@@ -0,0 +1,9 @@
+# bib-lskeys.awk - print the key of every entry, one per line
+#
+# Requires bib-parse.awk.
+
+function bib_pass(raw) { }
+
+function bib_entry(type, key) {
+  print key
+}
diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk
new file mode 100644
index 0000000..e5bf9fa
--- /dev/null
+++ b/lib/bib-parse.awk
@@ -0,0 +1,216 @@
+# bib-parse.awk - shared bibtex parsing library for bibutils
+#
+# Consumers must define two hook functions:
+#   bib_entry(type, key) - called once per regular entry. The fields are
+#                          available in BIB_N, BIB_NAME[], BIB_VAL[] and
+#                          BIB_KIND[]; the raw source text of the entry
+#                          is in BIB_RAW.
+#   bib_pass(raw)        - called for @string and @preamble blocks with
+#                          their raw source text.
+#
+# BIB_KIND[j] is "s" for ordinary string values (content stored without
+# delimiters; re-wrap in braces on output), "n" for bare numbers, and
+# "r" for raw values (macros, # concatenation) which should be emitted
+# verbatim.
+
+{ bib_buf = bib_buf $0 "\n" }
+
+END { bib_main(bib_buf) }
+
+function bib_main(s,    i) {
+  i = 1
+  while (i <= length(s)) {
+    if (substr(s, i, 1) == "@")
+      i = bib_entry_at(s, i)
+    else
+      i++
+  }
+}
+
+function bib_ws(s, i) {
+  while (i <= length(s) && substr(s, i, 1) ~ /[ \t\r\n]/)
+    i++
+  return i
+}
+
+function bib_trim(t) {
+  sub(/^[ \t\r\n]+/, "", t)
+  sub(/[ \t\r\n]+$/, "", t)
+  return t
+}
+
+# balanced {...} group starting at i; inner content goes to BIB_PIECE,
+# returns the index just past the closing brace
+function bib_braced(s, i,    depth, start, c) {
+  start = i
+  depth = 0
+  while (i <= length(s)) {
+    c = substr(s, i, 1)
+    i++
+    if (c == "{")
+      depth++
+    else if (c == "}") {
+      depth--
+      if (depth == 0)
+        break
+    }
+  }
+  BIB_PIECE = substr(s, start + 1, i - start - 2)
+  return i
+}
+
+# "..." group starting at i; braces protect embedded quotes
+function bib_quoted(s, i,    depth, start, c) {
+  start = i
+  i++
+  depth = 0
+  while (i <= length(s)) {
+    c = substr(s, i, 1)
+    if (c == "{")
+      depth++
+    else if (c == "}")
+      depth--
+    else if (c == "\"" && depth == 0) {
+      i++
+      break
+    }
+    i++
+  }
+  BIB_PIECE = substr(s, start + 1, i - start - 2)
+  return i
+}
+
+# skip a balanced op...cl group starting at i (i must be at op)
+function bib_skip_group(s, i, op, cl,    depth, c) {
+  depth = 0
+  while (i <= length(s)) {
+    c = substr(s, i, 1)
+    i++
+    if (c == op)
+      depth++
+    else if (c == cl) {
+      depth--
+      if (depth == 0)
+        break
+    }
+  }
+  return i
+}
+
+# field value at i, handling # concatenation; sets BIB_VALUE and
+# BIB_VKIND, returns the index just past the value
+function bib_value(s, i,    start, c, piece, pieces, kind) {
+  start = i
+  pieces = 0
+  kind = ""
+  BIB_VALUE = ""
+  while (1) {
+    c = substr(s, i, 1)
+    if (c == "{") {
+      i = bib_braced(s, i)
+      BIB_VALUE = BIB_VALUE BIB_PIECE
+      if (kind == "")
+        kind = "s"
+    } else if (c == "\"") {
+      i = bib_quoted(s, i)
+      BIB_VALUE = BIB_VALUE BIB_PIECE
+      if (kind == "")
+        kind = "s"
+    } else {
+      piece = ""
+      while (i <= length(s) && substr(s, i, 1) !~ /[,#}) \t\r\n]/) {
+        piece = piece substr(s, i, 1)
+        i++
+      }
+      BIB_VALUE = BIB_VALUE piece
+      kind = (piece ~ /^[0-9]+$/) ? "n" : "r"
+    }
+    pieces++
+    i = bib_ws(s, i)
+    if (substr(s, i, 1) == "#")
+      i = bib_ws(s, i + 1)
+    else
+      break
+  }
+  if (pieces > 1)
+    kind = "r"
+  if (kind == "r")
+    BIB_VALUE = bib_trim(substr(s, start, i - start))
+  BIB_VKIND = kind
+  return i
+}
+
+# parse the construct whose "@" is at i; returns the index past it
+function bib_entry_at(s, i,    at, type, opener, closer, key, name, c) {
+  at = i
+  i++
+  type = ""
+  while (i <= length(s) && substr(s, i, 1) ~ /[A-Za-z]/) {
+    type = type substr(s, i, 1)
+    i++
+  }
+  type = tolower(type)
+  i = bib_ws(s, i)
+  c = substr(s, i, 1)
+  if (c == "{") {
+    opener = "{"
+    closer = "}"
+  } else if (c == "(") {
+    opener = "("
+    closer = ")"
+  } else
+    return i                       # stray @, not an entry
+
+  if (type == "comment")
+    return bib_skip_group(s, i, opener, closer)
+  if (type == "string" || type == "preamble") {
+    i = bib_skip_group(s, i, opener, closer)
+    bib_pass(bib_trim(substr(s, at, i - at)))
+    return i
+  }
+
+  i++                              # consume opener
+  i = bib_ws(s, i)
+  key = ""
+  while (i <= length(s) && substr(s, i, 1) !~ /[, \t\r\n})]/) {
+    key = key substr(s, i, 1)
+    i++
+  }
+  i = bib_ws(s, i)
+  if (substr(s, i, 1) == ",")
+    i++
+
+  BIB_N = 0
+  while (1) {
+    i = bib_ws(s, i)
+    c = substr(s, i, 1)
+    if (c == "" || c == closer) {
+      if (c == closer)
+        i++
+      break
+    }
+    if (c == ",") {
+      i++
+      continue
+    }
+    name = ""
+    while (i <= length(s) && substr(s, i, 1) !~ /[=, \t\r\n})]/) {
+      name = name substr(s, i, 1)
+      i++
+    }
+    i = bib_ws(s, i)
+    if (substr(s, i, 1) != "=") {  # malformed; skip a char and resync
+      i++
+      continue
+    }
+    i = bib_ws(s, i + 1)
+    i = bib_value(s, i)
+    BIB_N++
+    BIB_NAME[BIB_N] = tolower(name)
+    BIB_VAL[BIB_N] = BIB_VALUE
+    BIB_KIND[BIB_N] = BIB_VKIND
+  }
+  BIB_RAW = bib_trim(substr(s, at, i - at))
+  bib_entry(type, key)
+  return i
+}
diff --git a/lib/bib-select.awk b/lib/bib-select.awk
new file mode 100644
index 0000000..1900390
--- /dev/null
+++ b/lib/bib-select.awk
@@ -0,0 +1,29 @@
+# bib-select.awk - emit entries selected by key, canonically
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+#   keys   - comma-separated list of entry keys
+#   invert - 0: emit entries whose key is in the list
+#            1: emit entries whose key is NOT in the list
+#
+# With keys="" and invert=1 this acts as a canonicalizing filter for
+# everything. @string and @preamble blocks always pass through.
+
+BEGIN {
+  bib_sel_n = split(keys, bib_sel_k, ",")
+  for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++)
+    BIB_SEL[bib_sel_k[bib_sel_i]] = 1
+}
+
+function bib_pass(raw) {
+  if (bib_out_n++)
+    print ""
+  print raw
+}
+
+function bib_entry(type, key) {
+  if ((key in BIB_SEL) != invert + 0) {
+    if (bib_out_n++)
+      print ""
+    bib_emit(type, key)
+  }
+}
diff --git a/lib/bib2ref.awk b/lib/bib2ref.awk
new file mode 100644
index 0000000..4d9e595
--- /dev/null
+++ b/lib/bib2ref.awk
@@ -0,0 +1,52 @@
+# bib2ref.awk - convert bibtex entries to refer records
+#
+# Requires bib-parse.awk and bib-canon.awk.
+
+function bib_pass(raw) { }  # intentionally empty: raw pass-through blocks produce no refer output
+
+function r_field(tag, v) {  # print "%tag value"; print nothing if value is empty after cleanup
+  gsub(/[{}]/, "", v)         # drop bibtex grouping braces
+  gsub(/[ \t\r\n]+/, " ", v)  # collapse each whitespace run to one space
+  v = bib_trim(v)
+  if (v != "")                # tested after cleanup so "{ }" cannot print a bare "%tag "
+    printf "%%%s %s\n", tag, v
+}
+
+function r_names(tag, v,    n, parts, i) {  # one "%tag name" line per " and "-separated name
+  gsub(/[{}]/, "", v)
+  gsub(/[ \t\r\n]+/, " ", v)
+  n = split(v, parts, / +[Aa][Nn][Dd] +/)
+  for (i = 1; i <= n; i++)
+    if ((parts[i] = bib_trim(parts[i])) != "")  # trim once, reuse below
+      printf "%%%s %s\n", tag, parts[i]
+}
+
+function bib_entry(type, key,    d, p, m) {  # write one refer record; type and key are not used
+  if (bib_out_n++)
+    print ""  # blank line between records
+  r_names("A", bib_get("author"))
+  r_names("E", bib_get("editor"))
+  r_field("T", bib_get("title"))
+  r_field("J", bib_get("journal"))
+  r_field("B", bib_get("booktitle"))
+  d = bib_get("year")
+  m = bib_get("month")
+  if (m != "")
+    d = (d != "") ? m " " d : m  # %D is "month year", or the month alone when year is empty
+  r_field("D", d)
+  r_field("V", bib_get("volume"))
+  r_field("N", bib_get("number"))
+  p = bib_get("pages")
+  gsub(/--/, "-", p)  # each "--" in the page range becomes a single dash
+  r_field("P", p)
+  if (bib_get("publisher") != "")  # %I takes the first non-empty of publisher, institution, school
+    r_field("I", bib_get("publisher"))
+  else if (bib_get("institution") != "")
+    r_field("I", bib_get("institution"))
+  else if (bib_get("school") != "")
+    r_field("I", bib_get("school"))
+  r_field("C", bib_get("address"))
+  r_field("K", bib_get("keywords"))
+  r_field("X", bib_get("abstract"))
+  r_field("O", bib_get("note"))
+}
diff --git a/lib/ref2bib.awk b/lib/ref2bib.awk
new file mode 100644
index 0000000..422fdd7
--- /dev/null
+++ b/lib/ref2bib.awk
@@ -0,0 +1,107 @@
+# ref2bib.awk - convert refer records to bibtex entries
+#
+# Standalone (does not use bib-parse.awk). Records are separated by
+# blank lines. Output keys are FIXME; pipe through bib-key.
+
+BEGIN {
+  FS = "\n"  # each line of a record is one field
+  RS = ""    # paragraph mode: blank lines separate records
+}
+
+function r_trim(t) {  # return t without leading/trailing blanks, tabs and CRs
+  sub(/[ \t\r]+$/, "", t)
+  sub(/^[ \t\r]+/, "", t)
+  return t
+}
+
+function r_emit(name, v) {  # print one "  name = {v}," field line; empty v prints nothing
+  if (v == "") return
+  printf "  %s = {%s},\n", name, v
+}
+
+{  # one refer record per invocation (RS = "" paragraph mode, one line per field)
+  split("", val)  # clear per-record state left over from the previous record
+  na = 0
+  ne = 0
+  split("", A)
+  split("", E)
+  lasttag = ""
+  for (i = 1; i <= NF; i++) {
+    line = $i
+    if (substr(line, 1, 1) == "%") {
+      tag = substr(line, 2, 1)
+      v = r_trim(substr(line, 3))
+      if (tag == "A")
+        A[++na] = v
+      else if (tag == "E")
+        E[++ne] = v
+      else
+        val[tag] = v  # any other tag keeps only its last occurrence
+      lasttag = tag
+    } else if (lasttag == "A")  # a line without "%" continues the previous field
+      A[na] = A[na] " " r_trim(line)
+    else if (lasttag == "E")
+      E[ne] = E[ne] " " r_trim(line)
+    else if (lasttag != "")
+      val[lasttag] = val[lasttag] " " r_trim(line)
+  }
+  if (na == 0 && ne == 0 && !("T" in val))
+    next  # no author, editor or title: emit nothing for this record
+
+  # guess an entry type from the fields present
+  if ("J" in val)
+    type = "article"
+  else if ("B" in val)
+    type = (val["B"] ~ /[Pp]roceedings|[Cc]onference|[Ss]ymposium|[Ww]orkshop/) \
+        ? "inproceedings" : "incollection"
+  else if ("R" in val)
+    type = "techreport"
+  else if ("I" in val)
+    type = "book"
+  else
+    type = "misc"
+
+  if (out_n++)
+    print ""
+  printf "@%s{FIXME,\n", type  # placeholder key; the file header says to pipe through bib-key
+
+  authors = ""
+  for (i = 1; i <= na; i++)
+    authors = (i == 1) ? A[i] : authors " and " A[i]
+  r_emit("author", authors)
+  editors = ""
+  for (i = 1; i <= ne; i++)
+    editors = (i == 1) ? E[i] : editors " and " E[i]
+  r_emit("editor", editors)
+
+  r_emit("title", val["T"])
+  r_emit("journal", val["J"])
+  r_emit("booktitle", val["B"])
+  # %D: the first 4-digit run is the year; any text left over is emitted as month
+  d = val["D"]
+  if (match(d, /[0-9][0-9][0-9][0-9]/)) {
+    r_emit("year", substr(d, RSTART, 4))
+    m = r_trim(substr(d, 1, RSTART - 1) substr(d, RSTART + 4))
+    if (m != "")
+      r_emit("month", m)
+  } else
+    r_emit("year", d)
+
+  r_emit("volume", val["V"])
+  r_emit("number", val["N"])
+  p = val["P"]
+  gsub(/-+/, "--", p)
+  r_emit("pages", p)
+  r_emit(type == "techreport" ? "institution" : "publisher", val["I"])
+  r_emit("address", val["C"])
+  if ("R" in val) {  # %R report number: becomes "number" unless %N already supplied one
+    if (val["N"] != "")  # not ("N" in val): reading val["N"] above created that element
+      r_emit("note", val["R"])
+    else
+      r_emit("number", val["R"])
+  }
+  r_emit("keywords", val["K"])
+  r_emit("abstract", val["X"])
+  r_emit("note", val["O"])
+  print "}"
+}
diff --git a/tests/integration.sh b/tests/integration.sh
new file mode 100755
index 0000000..ea847e4
--- /dev/null
+++ b/tests/integration.sh
@@ -0,0 +1,123 @@
+#!/bin/sh
+# integration.sh - end-to-end test against a real LaTeX document
+#
+# Requires pdflatex and bibtex; skipped otherwise. Set BIBTEST_NET=1 to
+# also exercise bib-fetch against doi.org (needs network access).
+
+ROOT=$(cd "$(dirname "$0")/.." && pwd)  # repository root: the parent of this script's directory
+PATH=$ROOT:$PATH  # make the bib-* commands in the repository root resolvable by name
+LSKEYS="awk -f $ROOT/lib/bib-parse.awk -f $ROOT/lib/bib-lskeys.awk"
+
+command -v pdflatex > /dev/null 2>&1 && command -v bibtex > /dev/null 2>&1 || {
+  printf 'integration: pdflatex/bibtex not found, skipping\n' >&2
+  exit 0
+}
+
+tmpd=$(mktemp -d) || exit 1  # scratch directory; everything below runs inside it
+trap 'rm -rf "$tmpd"' EXIT; trap 'exit 130' INT; trap 'exit 143' TERM  # a signal must exit; the EXIT trap then cleans up
+cd "$tmpd" || exit 1
+
+pass=0
+fail=0
+ok() { pass=$((pass + 1)); printf 'ok   - %s\n' "$1"; }  # count a pass and report description $1
+not_ok() { fail=$((fail + 1)); printf 'FAIL - %s\n' "$1"; }  # count a failure and report description $1
+
+# ---- build a database with bib-gen | bib-add ---------------------------
+bib-gen -t article author='Donald E. Knuth' title='Literate Programming' \
+  journal='The Computer Journal' year=1984 volume=27 number=2 \
+  pages='97--111' | bib-add master.bib
+bib-gen -t article author='Alan M. Turing' \
+  title='Computing Machinery and Intelligence' journal='Mind' year=1950 \
+  volume=59 pages='433--460' | bib-add master.bib
+printf 'Claude E. Shannon\tA Mathematical Theory of Communication\tBell System Technical Journal\t1948
+Edsger W. Dijkstra\tGo To Statement Considered Harmful\tCommunications of the ACM\t1968
+' | bib-gen -F author,title,journal,year | bib-add master.bib
+
+n=$($LSKEYS master.bib | wc -l)
+[ "$n" -eq 4 ] && ok "database built with 4 entries" \
+               || not_ok "database built with 4 entries (got $n)"
+
+# ---- compile a document citing a subset --------------------------------
+cat > paper.tex <<'EOF'
+\documentclass{article}
+\begin{document}
+Machines may think~\cite{turing1950computing}; programs are
+literature~\cite{knuth1984literate}.
+
+DOI: 10.1093/comjnl/27.2.97
+\bibliographystyle{plain}
+\bibliography{master}
+\end{document}
+EOF
+pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1
+
+grep -q 'citation{turing1950computing}' paper.aux \
+  && ok "pdflatex produced citations in aux" \
+  || not_ok "pdflatex produced citations in aux"
+
+# ---- extract the cited subset and build against it ---------------------
+bib-extract paper.aux master.bib > paper.bib
+n=$($LSKEYS paper.bib | wc -l)
+[ "$n" -eq 2 ] && ok "bib-extract kept the 2 cited entries" \
+               || not_ok "bib-extract kept the 2 cited entries (got $n)"
+# point the aux file at the extracted subset, then run bibtex on it
+sed 's/\\bibdata{master}/\\bibdata{paper}/' paper.aux > tmp.aux \
+  && mv tmp.aux paper.aux
+bibtex paper > bibtex.log 2>&1
+grep -Eqi 'error|warning' bibtex.log \
+  && not_ok "bibtex accepts canonical output cleanly" \
+  || ok "bibtex accepts canonical output cleanly"
+
+pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1
+pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1
+if grep -qi 'undefined' paper.log; then
+  not_ok "document resolves all citations"
+else
+  ok "document resolves all citations"
+fi
+[ -s paper.pdf ] && ok "pdf produced" || not_ok "pdf produced"
+
+# ---- convert roundtrip --------------------------------------------------
+bib-convert master.bib | bib-convert > roundtrip.bib
+if [ "$($LSKEYS master.bib | sort)" = "$($LSKEYS roundtrip.bib | sort)" ]; then
+  ok "bibtex -> refer -> bibtex preserves all keys"
+else
+  not_ok "bibtex -> refer -> bibtex preserves all keys"
+fi
+
+# ---- bib-fetch against the built pdf (network) --------------------------
+if [ "$BIBTEST_NET" = 1 ]; then
+  if bib-fetch paper.pdf > fetched.bib 2> /dev/null; then
+    grep -q '^@article{knuth1984literate,' fetched.bib \
+      && ok "bib-fetch resolves DOI from built pdf" \
+      || not_ok "bib-fetch resolves DOI from built pdf"
+    if bib-fetch paper.pdf 2> /dev/null | bib-add master.bib 2> /dev/null; then
+      not_ok "fetched entry detected as duplicate"
+    else
+      ok "fetched entry detected as duplicate"
+    fi
+  else
+    not_ok "bib-fetch resolves DOI from built pdf"
+  fi
+  bib-fetch -a 1706.03762 2> /dev/null \
+      | grep -q '^@misc{vaswani[0-9]*attention,' \
+    && ok "bib-fetch resolves arXiv id" \
+    || not_ok "bib-fetch resolves arXiv id"
+  cat > arx.tex <<'EOF'
+\documentclass{article}
+\begin{document}
+A preprint without any DOI.
+
+arXiv:1706.03762v7 [cs.CL] 2 Aug 2023
+\end{document}
+EOF
+  pdflatex -interaction=batchmode arx.tex > /dev/null 2>&1
+  bib-fetch arx.pdf 2> /dev/null | grep -q 'eprint = {1706.03762}' \
+    && ok "bib-fetch extracts arXiv id from pdf" \
+    || not_ok "bib-fetch extracts arXiv id from pdf"
+else
+  printf 'skip - bib-fetch network tests (set BIBTEST_NET=1 to enable)\n'
+fi
+
+printf '\n%d passed, %d failed\n' "$pass" "$fail"
+[ "$fail" -eq 0 ]
diff --git a/tests/run-tests.sh b/tests/run-tests.sh
new file mode 100755
index 0000000..70721db
--- /dev/null
+++ b/tests/run-tests.sh
@@ -0,0 +1,187 @@
+#!/bin/sh
+# run-tests.sh - test suite for bibutils
+
+ROOT=$(cd "$(dirname "$0")/.." && pwd)  # repository root: the parent of this script's directory
+PATH=$ROOT:$PATH  # make the bib-* commands in the repository root resolvable by name
+tmpd=$(mktemp -d) || exit 1  # scratch directory for fixtures
+trap 'rm -rf "$tmpd"' EXIT; trap 'exit 130' INT; trap 'exit 143' TERM  # a signal must exit; the EXIT trap then cleans up
+
+pass=0
+fail=0
+
+ok() {  # record a pass: bump the counter and print an "ok" line for description $1
+  pass=$((pass + 1))
+  printf 'ok   - %s\n' "$1"
+}
+
+not_ok() {  # record a failure: bump the counter and print a "FAIL" line for description $1
+  fail=$((fail + 1))
+  printf 'FAIL - %s\n' "$1"
+}
+
+# check DESC CMD [ARG...] - run CMD with its output discarded; ok if it exits 0, else not_ok
+check() {
+  desc=$1  # plain global (not local); overwritten on every call
+  shift    # the remaining arguments are the command to run
+  if "$@" > /dev/null 2>&1; then
+    ok "$desc"
+  else
+    not_ok "$desc"
+  fi
+}
+
+entry='@ARTICLE{ junk-key ,
+  AUTHOR = "Donald E. Knuth",
+  Title={Literate   Programming},
+  JOURNAL  = {The Computer Journal},
+  Year = 1984, volume={27},
+  pages = {97--111}
+}'
+
+# ---- bib-key ----------------------------------------------------------
+out=$(printf '%s\n' "$entry" | bib-key)
+check "bib-key generates surname-year-word key" \
+  sh -c "printf '%s' '$out' | grep -q '^@article{knuth1984literate,'"
+
+# key collisions get letter suffixes
+out=$(printf '@inproceedings{a, author={J. Smith}, title={Fast Trees}, year=2020}
+@article{b, author={J. Smith}, title={Fast Trees Extended}, year=2020}
+@article{c, author={J. Smith}, title={Fast Tree Methods}, year=2020}\n' | bib-key)
+check "bib-key disambiguates colliding keys" \
+  sh -c "printf '%s' '$out' | grep -q '{smith2020fast,' &&
+         printf '%s' '$out' | grep -q '{smith2020fastb,' &&
+         printf '%s' '$out' | grep -q '{smith2020fastc,'"
+
+# ---- canonicalization via bib-add -------------------------------------
+db=$tmpd/refs.bib
+printf '%s\n' "$entry" | bib-add "$db"
+check "bib-add creates database" test -s "$db"
+check "bib-add lowercases field names" grep -q '  author = {Donald E. Knuth},' "$db"
+check "bib-add collapses whitespace in values" \
+  grep -q '  title = {Literate Programming},' "$db"
+check "bib-add keeps bare numbers bare" grep -q '  year = 1984,' "$db"
+
+# duplicate detection
+if printf '%s\n' "$entry" | bib-add "$db" 2> /dev/null; then
+  not_ok "bib-add rejects duplicate key"
+else
+  ok "bib-add rejects duplicate key"
+fi
+
+# forced replacement
+printf '%s\n' "$entry" | sed 's/1984/1985/' | bib-add -f "$db"
+check "bib-add -f replaces entry" grep -q '  year = 1985,' "$db"
+n=$(grep -c '^@article{junk-key,' "$db")
+[ "$n" = 1 ] && ok "bib-add -f leaves one copy" || not_ok "bib-add -f leaves one copy"
+
+# ---- bib-extract -------------------------------------------------------
+cat > "$tmpd/all.bib" <<'EOF'
+@article{alpha2020one, author = {A. Alpha}, title = {One}, year = 2020}
+@article{beta2021two, author = {B. Beta}, title = {Two}, year = 2021}
+@article{gamma2022three, author = {C. Gamma}, title = {Three}, year = 2022}
+EOF
+cat > "$tmpd/doc.aux" <<'EOF'
+\relax
+\citation{alpha2020one}
+\citation{gamma2022three,alpha2020one}
+\bibstyle{plain}
+EOF
+out=$(bib-extract "$tmpd/doc.aux" "$tmpd/all.bib")
+check "bib-extract keeps cited entries" \
+  sh -c "printf '%s' '$out' | grep -q alpha2020one"
+check "bib-extract keeps all cited entries" \
+  sh -c "printf '%s' '$out' | grep -q gamma2022three"
+if printf '%s' "$out" | grep -q beta2021two; then
+  not_ok "bib-extract drops uncited entries"
+else
+  ok "bib-extract drops uncited entries"
+fi
+
+# ---- bib-convert -------------------------------------------------------
+out=$(printf '%s\n' "$entry" | bib-convert)
+check "bib-convert emits refer author" \
+  sh -c "printf '%s' '$out' | grep -q '^%A Donald E. Knuth$'"
+check "bib-convert emits refer pages with single dash" \
+  sh -c "printf '%s' '$out' | grep -q '^%P 97-111$'"
+
+cat > "$tmpd/rec.ref" <<'EOF'
+%A Alan M. Turing
+%T Computing Machinery and Intelligence
+%J Mind
+%D 1950
+%V 59
+%P 433-460
+EOF
+out=$(bib-convert "$tmpd/rec.ref")
+check "bib-convert refer->bibtex type guess" \
+  sh -c "printf '%s' '$out' | grep -q '^@article{turing1950computing,'"
+check "bib-convert refer->bibtex pages" \
+  sh -c "printf '%s' '$out' | grep -q '  pages = {433--460},'"
+
+# ---- bib-gen -----------------------------------------------------------
+out=$(bib-gen -t book author='Xavier Yu' title='Some Title' year=2001 publisher='Pub')
+check "bib-gen argument mode" \
+  sh -c "printf '%s' '$out' | grep -q '^@book{yu2001some,'"
+
+out=$(printf 'A. Author\tNeat Paper\tGood Journal\t1999\n' \
+    | bib-gen -F author,title,journal,year)
+check "bib-gen batch mode" \
+  sh -c "printf '%s' '$out' | grep -q '^@article{author1999neat,'"
+
+# ---- bib-ls ------------------------------------------------------------
+out=$(bib-ls "$tmpd/all.bib")
+check "bib-ls lists keys" \
+  sh -c "[ \"\$(printf '%s\n' '$out' | wc -l)\" = 3 ]"
+out=$(bib-ls -l "$tmpd/all.bib")
+check "bib-ls -l shows details" \
+  sh -c "printf '%s' '$out' | grep -q 'beta2021two	article	B. Beta	2021	Two'"
+
+# ---- bib-check ---------------------------------------------------------
+cat > "$tmpd/bad.bib" <<'EOF'
+@article{good2020fine, author = {A. Good}, title = {Fine}, journal = {J}, year = 2020}
+@article{noj2020sad, author = {B. Sad}, title = {No Journal Here}, year = 2020}
+@misc{noj2020sad, title = {Dup Key}}
+@book{dup2021title, author = {C. Dup}, title = {FINE!}, publisher = {P}, year = 2021}
+EOF
+out=$(bib-check "$tmpd/bad.bib")
+if [ $? -ne 0 ]; then ok "bib-check exits nonzero on problems"; else not_ok "bib-check exits nonzero on problems"; fi
+check "bib-check finds missing field" \
+  sh -c "printf '%s' '$out' | grep -q 'noj2020sad: missing required field: journal'"
+check "bib-check finds duplicate key" \
+  sh -c "printf '%s' '$out' | grep -q 'noj2020sad: duplicate key'"
+check "bib-check finds duplicate title" \
+  sh -c "printf '%s' '$out' | grep -q 'dup2021title: title duplicates good2020fine'"
+cat > "$tmpd/clean.bib" <<'EOF'
+@article{a2020x, author = {A. A}, title = {X}, journal = {J}, year = 2020}
+@misc{b2021y, title = {Y}}
+EOF
+check "bib-check passes a clean db" bib-check "$tmpd/clean.bib"
+
+# ---- biblatex aux ------------------------------------------------------
+cat > "$tmpd/bl.aux" <<'EOF'
+\abx@aux@refcontext{nty/global//global/global}
+\abx@aux@cite{0}{beta2021two}
+EOF
+out=$(bib-extract "$tmpd/bl.aux" "$tmpd/all.bib")
+check "bib-extract reads biblatex aux" \
+  sh -c "printf '%s' '$out' | grep -q beta2021two"
+
+# ---- bib-util ----------------------------------------------------------
+out=$(printf '%s\n' "$entry" | bib-util key)
+check "bib-util dispatches" \
+  sh -c "printf '%s' '$out' | grep -q knuth1984literate"
+
+# ---- @string passthrough -----------------------------------------------
+cat > "$tmpd/str.bib" <<'EOF'
+@string{cj = {The Computer Journal}}
+@article{knuth1984literate, author = {D. Knuth}, journal = cj, year = 1984}
+EOF
+out=$(printf '\\citation{knuth1984literate}\n' > "$tmpd/s.aux"; \
+      bib-extract "$tmpd/s.aux" "$tmpd/str.bib")
+check "bib-extract passes @string through" \
+  sh -c "printf '%s' '$out' | grep -q '@string{cj'"
+check "macro field stays raw" \
+  sh -c "printf '%s' '$out' | grep -q '  journal = cj,'"
+
+printf '\n%d passed, %d failed\n' "$pass" "$fail"
+[ "$fail" -eq 0 ]
author	Douglas B. Rumbaugh <doug@douglasrumbaugh.com>	2026-06-06 12:02:41 -0400
committer	Douglas B. Rumbaugh <doug@douglasrumbaugh.com>	2026-06-06 12:02:41 -0400
commit	eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2 (patch)
tree	626d64c3574cfbc7cc38eae6d142ef22b21cf59b
parent	8351a1da3f56cde9939b934bc5533a95aff1c95e (diff)
download	bibutils-eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2.tar.gz