24 files changed, 1465 insertions, 3 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d38c149
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.swp
+*~
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..53b781e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2026, Douglas Brumbaugh
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..079df9f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,32 @@
+# Install locations. Override on the command line, e.g.
+# `make PREFIX=/opt/bib install`. DESTDIR (empty unless given) is prepended
+# to every install path so a package build can stage the tree elsewhere.
+PREFIX = /usr/local
+BINDIR = $(PREFIX)/bin
+LIBDIR = $(PREFIX)/share/bibutils
+
+SCRIPTS = bib-util bib-add bib-check bib-convert bib-extract bib-fetch \
+          bib-gen bib-key bib-ls
+LIBS = lib/bib-parse.awk lib/bib-canon.awk lib/bib-select.awk \
+       lib/bib-lskeys.awk lib/bib-key.awk lib/bib-ls.awk \
+       lib/bib-check.awk lib/bib2ref.awk lib/ref2bib.awk
+
+all:
+	@echo "nothing to build; run 'make test' or 'make install'"
+
+test:
+	tests/run-tests.sh
+	tests/integration.sh
+
+# `mkdir -p` already succeeds when the directories exist, so a failure here
+# is a real one (e.g. permissions) and must stop the install.
+install:
+	mkdir -p $(DESTDIR)$(BINDIR) $(DESTDIR)$(LIBDIR)
+	cp $(SCRIPTS) $(DESTDIR)$(BINDIR)
+	cp $(LIBS) $(DESTDIR)$(LIBDIR)
+
+uninstall:
+	cd $(DESTDIR)$(BINDIR) && rm -f $(SCRIPTS)
+	rm -rf $(DESTDIR)$(LIBDIR)
+
+.PHONY: all test install uninstall
diff --git a/README.md b/README.md
index d692649..53ccd48 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,19 @@ formatted bibtex entry will be emitted on stdout.
 
 ## bib-extract 
 A script which filters a bibtex database provided on stdin or as an argument
-and emits only those entries contained within a specified aux file.
+and emits only those entries contained within a specified aux file. Both
+classic bibtex and biblatex/biber aux files are understood, and \nocite{*}
+selects the whole database. (roff citation sources are planned but not yet
+supported.)
+
+## bib-ls
+List the entries in a database, one key per line, or with -l as
+tab-separated key, type, author, year and title.
+
+## bib-check
+Lint a database: reports missing required fields, duplicate keys,
+duplicate titles (likely duplicated entries) and empty field values.
+Exits nonzero if any problem was found.
 
 ## bib-key
 A script which accepts a bibtex entry on stdin, and emits it on stdout with
@@ -29,5 +41,32 @@ an automatically generated bibtex key.
 
 ## bib-fetch
 A script which accepts a pdf file as an input argument and will attempt to
-fetch a corresponding bibtex entry from crossref.org based on its DOI, if
-one is available.
+fetch a corresponding bibtex entry based on its DOI (via crossref.org) or,
+failing that, its arXiv id (via arxiv.org). An identifier can also be given
+directly with -d (DOI) or -a (arXiv id).
+
+## bib-convert
+Convert between bibtex and refer database formats. The direction is detected
+automatically from the input, or can be forced with -b (to bibtex) or -r
+(to refer).
+
+# Canonical form
+Entries that pass through these tools are canonicalized: lowercase entry
+types and field names, 2-space indentation, brace-delimited values with
+internal whitespace collapsed, bare numbers left bare, and macro
+references/concatenations preserved verbatim. @string and @preamble blocks
+pass through untouched.
+
+# Installation
+    make install            # PREFIX=/usr/local by default
+
+The scripts look for the shared awk library in $BIBUTILS_LIB, then in
+lib/ next to the script, then in /usr/local/share/bibutils. If installing
+with a non-default PREFIX, set BIBUTILS_LIB accordingly.
+
+# Dependencies
+POSIX shell and awk only, with two exceptions: bib-fetch requires curl,
+plus pdftotext (poppler) to extract a DOI or arXiv id from a pdf.
+
+# Tests
+    make test
diff --git a/bib-add b/bib-add
new file mode 100755
index 0000000..28ebf82
--- /dev/null
+++ b/bib-add
@@ -0,0 +1,80 @@
+#!/bin/sh
+# bib-add - insert bibtex entries from stdin into a database file
+#
+# usage: bib-add [-f] db.bib < entry
+#   -f  replace existing entries with the same key
+#
+# Incoming entries are canonicalized before being appended. Without -f
+# nothing is written when an incoming key already exists in the database.
+# Exits 1 on error and 2 on bad usage.
+
+usage() {
+  printf 'usage: bib-add [-f] db.bib < entry\n' >&2
+  exit 2
+}
+
+# shared awk library: $BIBUTILS_LIB, else lib/ beside this script, else
+# the default install location
+if [ -n "$BIBUTILS_LIB" ]; then
+  LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+  LIB=$(dirname "$0")/lib
+else
+  LIB=/usr/local/share/bibutils
+fi
+
+force=0
+while getopts f opt; do
+  case $opt in
+    f) force=1 ;;
+    *) usage ;;
+  esac
+done
+shift $((OPTIND - 1))
+[ $# -eq 1 ] || usage
+db=$1
+
+tmp=$(mktemp) && tmpkeys=$(mktemp) && tmpdb=$(mktemp) || exit 1
+# The temp files are removed by the EXIT trap. INT and TERM must exit
+# explicitly: a handler that merely returns lets the script carry on
+# with its temp files gone, and could append a partial result to $db.
+trap 'rm -f "$tmp" "$tmpkeys" "$tmpdb"' EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+# canonicalize the incoming entries
+awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" -f "$LIB/bib-select.awk" \
+    -v keys= -v invert=1 > "$tmp"
+
+if [ ! -s "$tmp" ]; then
+  printf 'bib-add: no entries on stdin\n' >&2
+  exit 1
+fi
+
+awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$tmp" > "$tmpkeys"
+
+if [ -f "$db" ]; then
+  # keys present both in the database and among the incoming entries
+  dups=$(awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$db" \
+      | grep -Fxf "$tmpkeys") || dups=
+  if [ -n "$dups" ]; then
+    if [ "$force" -eq 1 ]; then
+      # rewrite the database without the entries being replaced
+      keys=$(printf '%s\n' "$dups" | paste -sd, -)
+      awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+          -f "$LIB/bib-select.awk" -v keys="$keys" -v invert=1 \
+          "$db" > "$tmpdb" || exit 1
+      cp "$tmpdb" "$db" || exit 1
+    else
+      printf 'bib-add: duplicate keys in %s:\n' "$db" >&2
+      printf '%s\n' "$dups" >&2
+      exit 1
+    fi
+  fi
+fi
+
+# append, separated from any existing content by a blank line
+{
+  [ -s "$db" ] && echo ""
+  cat "$tmp"
+} >> "$db"
diff --git a/bib-check b/bib-check
new file mode 100755
index 0000000..062e157
--- /dev/null
+++ b/bib-check
@@ -0,0 +1,18 @@
+#!/bin/sh
+# bib-check - lint a bibtex database
+#
+# usage: bib-check [file ...]   (stdin if no file given)
+#
+# Problems reported: missing required fields, duplicate keys, duplicate
+# titles and empty field values. The exit status is nonzero when at
+# least one problem was reported.
+
+# library lookup order: $BIBUTILS_LIB, lib/ beside this script, default
+self_dir=$(dirname "$0")
+LIB=/usr/local/share/bibutils
+[ -d "$self_dir/lib" ] && LIB=$self_dir/lib
+[ -n "$BIBUTILS_LIB" ] && LIB=$BIBUTILS_LIB
+
+exec awk -f "$LIB/bib-parse.awk" \
+         -f "$LIB/bib-canon.awk" \
+         -f "$LIB/bib-check.awk" "$@"
diff --git a/bib-convert b/bib-convert
new file mode 100755
index 0000000..ef4c0b0
--- /dev/null
+++ b/bib-convert
@@ -0,0 +1,64 @@
+#!/bin/sh
+# bib-convert - convert between bibtex and refer database formats
+#
+# usage: bib-convert [-b | -r] [file]   (stdin if no file given)
+#   -b  force refer -> bibtex
+#   -r  force bibtex -> refer
+#
+# Without a flag the direction is detected from the input: text whose
+# first record starts with @ is taken as bibtex, with % as refer.
+
+usage() {
+  printf 'usage: bib-convert [-b | -r] [file]\n' >&2
+  exit 2
+}
+
+if [ -n "$BIBUTILS_LIB" ]; then
+  LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+  LIB=$(dirname "$0")/lib
+else
+  LIB=/usr/local/share/bibutils
+fi
+
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+mode=auto
+while getopts br opt; do
+  case $opt in
+    b) mode=tobib ;;
+    r) mode=toref ;;
+    *) usage ;;
+  esac
+done
+shift $((OPTIND - 1))
+[ $# -le 1 ] || usage
+
+# The input is spooled to a temp file because it may be read twice: once
+# to sniff the format and once to convert. The EXIT trap removes the file;
+# INT and TERM are turned into exits so that a signal stops the script
+# instead of letting it carry on after the handler returns.
+tmp=$(mktemp) || exit 1
+trap 'rm -f "$tmp"' EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+cat "$@" > "$tmp" || exit 1
+
+if [ "$mode" = auto ]; then
+  first=$(awk 'NF { sub(/^[ \t]+/, ""); print substr($0, 1, 1); exit }' "$tmp")
+  case $first in
+    @) mode=toref ;;
+    %) mode=tobib ;;
+    *) printf 'bib-convert: cannot detect input format\n' >&2; exit 1 ;;
+  esac
+fi
+
+# No exec here: replacing the shell process would skip the EXIT trap and
+# leave the temp file behind. The exit status is still that of the awk run.
+if [ "$mode" = toref ]; then
+  awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+      -f "$LIB/bib2ref.awk" "$tmp"
+else
+  awk -f "$LIB/ref2bib.awk" "$tmp" | "$bibkey"
+fi
diff --git a/bib-extract b/bib-extract
new file mode 100755
index 0000000..52aa85b
--- /dev/null
+++ b/bib-extract
@@ -0,0 +1,60 @@
+#!/bin/sh
+# bib-extract - emit only the database entries cited in an aux file
+#
+# usage: bib-extract file.aux [db.bib]   (db on stdin if omitted)
+# Keys are taken from \citation{...} and \abx@aux@cite{...} lines of the aux file.
+# roff/refer citation sources are planned but not yet supported.
+
+usage() {
+  printf 'usage: bib-extract file.aux [db.bib]\n' >&2
+  exit 2
+}
+
+if [ -n "$BIBUTILS_LIB" ]; then   # library lookup: env, lib/ beside script, default
+  LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+  LIB=$(dirname "$0")/lib
+else
+  LIB=/usr/local/share/bibutils
+fi
+
+[ $# -ge 1 ] && [ $# -le 2 ] || usage
+aux=$1
+shift   # what remains in "$@" is the optional database file
+[ -r "$aux" ] || { printf 'bib-extract: cannot read %s\n' "$aux" >&2; exit 1; }
+
+keys=$(awk '
+  # classic bibtex: \citation{key,key,...}
+  {
+    line = $0
+    while (match(line, /\\citation\{[^}]*\}/)) {
+      n = split(substr(line, RSTART + 10, RLENGTH - 11), a, ",")
+      for (i = 1; i <= n; i++)
+        if (a[i] != "")
+          print a[i]
+      line = substr(line, RSTART + RLENGTH)
+    }
+  }
+  # biblatex/biber: \abx@aux@cite{segment}{key} (older: one argument)
+  {
+    line = $0
+    while (match(line, /\\abx@aux@cite(\{[0-9]*\})?\{[^}]*\}/)) {
+      s = substr(line, RSTART, RLENGTH)
+      sub(/\}$/, "", s)
+      sub(/^.*\{/, "", s)
+      if (s != "")
+        print s
+      line = substr(line, RSTART + RLENGTH)
+    }
+  }' "$aux" | sort -u | paste -sd, -)   # unique keys, joined with commas
+
+[ -n "$keys" ] || exit 0   # nothing cited: emit nothing
+
+# \nocite{*} shows up as the key "*": select everything via an empty, inverted list
+case ",$keys," in
+  *,\*,*) keys= invert=1 ;;
+  *) invert=0 ;;
+esac
+
+exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+         -f "$LIB/bib-select.awk" -v keys="$keys" -v invert="$invert" "$@"
diff --git a/bib-fetch b/bib-fetch
new file mode 100755
index 0000000..62f7993
--- /dev/null
+++ b/bib-fetch
@@ -0,0 +1,92 @@
+#!/bin/sh
+# bib-fetch - fetch a bibtex entry for a pdf from its DOI or arXiv id
+#
+# usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]
+#
+# Unless given with -d or -a, an identifier is extracted from the first
+# pages of the pdf (requires pdftotext): a DOI if one is found, falling
+# back to an arXiv id. DOIs are resolved through doi.org content
+# negotiation (crossref et al.), arXiv ids through arxiv.org. The entry
+# is emitted canonically on stdout with a generated key.
+#
+# A DOI may carry a "doi:" label or be a resolver URL such as
+# https://doi.org/10.1000/x; an arXiv id may carry an "arXiv:" label.
+
+usage() {
+  printf 'usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]\n' >&2
+  exit 2
+}
+
+doi=
+arxiv=
+while getopts d:a: opt; do
+  case $opt in
+    d) doi=$OPTARG ;;
+    a) arxiv=$OPTARG ;;
+    *) usage ;;
+  esac
+done
+shift $((OPTIND - 1))
+[ -n "$doi" ] && [ -n "$arxiv" ] && usage
+
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+command -v curl > /dev/null 2>&1 || {
+  printf 'bib-fetch: curl is required\n' >&2
+  exit 1
+}
+
+if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+  [ $# -eq 1 ] || usage
+  pdf=$1
+  [ -r "$pdf" ] || { printf 'bib-fetch: cannot read %s\n' "$pdf" >&2; exit 1; }
+  command -v pdftotext > /dev/null 2>&1 || {
+    printf 'bib-fetch: pdftotext is required to extract an identifier\n' >&2
+    exit 1
+  }
+  ids=$(pdftotext -l 2 "$pdf" - 2> /dev/null | awk '
+    # first DOI-shaped string and first arXiv stamp in the extracted text;
+    # both are reported and the caller prefers the DOI
+    doi == "" {
+      if (match($0, /10\.[0-9][0-9][0-9][0-9][0-9]*\/[^ \t"<>]+/)) {
+        doi = substr($0, RSTART, RLENGTH)
+        sub(/[.,;)\]]+$/, "", doi)
+      }
+    }
+    arxiv == "" {
+      # modern ids (arXiv:2104.01234v2) and old style (arXiv:cs/0101001)
+      if (match($0, /[aA][rR][xX][iI][vV]:[0-9][0-9][0-9][0-9]\.[0-9][0-9][0-9][0-9][0-9]?(v[0-9]+)?/) ||
+          match($0, /[aA][rR][xX][iI][vV]:[a-z-]+(\.[A-Z][A-Z])?\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9](v[0-9]+)?/))
+        arxiv = substr($0, RSTART + 6, RLENGTH - 6)
+    }
+    END { printf "%s\t%s\n", doi, arxiv }')
+  doi=${ids%%	*}
+  arxiv=${ids#*	}
+  if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+    printf 'bib-fetch: no DOI or arXiv id found in %s\n' "$pdf" >&2
+    exit 1
+  fi
+fi
+
+if [ -n "$doi" ]; then
+  # strip a pasted "doi:" label or resolver URL so that only the bare DOI
+  # is appended to https://doi.org/ (mirrors the arXiv: handling below)
+  case $doi in
+    doi:*|DOI:*) doi=${doi#*:} ;;
+    http://*|https://*) doi=${doi#*://*/} ;;
+  esac
+  entry=$(curl -sSfL -H 'Accept: application/x-bibtex' \
+      "https://doi.org/$doi") || {
+    printf 'bib-fetch: failed to fetch entry for doi %s\n' "$doi" >&2
+    exit 1
+  }
+else
+  arxiv=${arxiv#arXiv:}
+  entry=$(curl -sSfL "https://arxiv.org/bibtex/$arxiv") || {
+    printf 'bib-fetch: failed to fetch entry for arXiv id %s\n' "$arxiv" >&2
+    exit 1
+  }
+fi
+
+printf '%s\n' "$entry" | "$bibkey"
diff --git a/bib-gen b/bib-gen
new file mode 100755
index 0000000..0fdd63a
--- /dev/null
+++ b/bib-gen
@@ -0,0 +1,101 @@
+#!/bin/sh
+# bib-gen - generate a bibtex entry
+#
+# usage: bib-gen [-t type] [field=value ...]
+#        bib-gen [-t type] -F field,field,...   (tab-separated stdin)
+#
+# With field=value arguments, one entry is built from them. With -F,
+# one entry is built per tab-separated line of stdin, columns matching
+# the listed fields. Otherwise the user is prompted interactively.
+# Entries are emitted on stdout with generated keys.
+
+usage() {
+  printf 'usage: bib-gen [-t type] [field=value ...]\n' >&2
+  printf '       bib-gen [-t type] -F field,field,... < data\n' >&2
+  exit 2
+}
+
+type=article
+fmt=
+while getopts t:F: opt; do
+  case $opt in
+    t) type=$OPTARG ;;
+    F) fmt=$OPTARG ;;
+    *) usage ;;
+  esac
+done
+shift $((OPTIND - 1))
+
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+# fields prompted for in interactive mode, per entry type
+fields_for() {
+  case $1 in
+    article)       echo "author title journal year volume number pages month doi" ;;
+    book)          echo "author title publisher year volume series address edition" ;;
+    inproceedings|conference)
+                   echo "author title booktitle year editor pages publisher doi" ;;
+    incollection)  echo "author title booktitle publisher year editor pages chapter" ;;
+    techreport)    echo "author title institution year number address month" ;;
+    phdthesis|mastersthesis)
+                   echo "author title school year address month" ;;
+    *)             echo "author title year howpublished note url" ;;
+  esac
+}
+
+if [ -n "$fmt" ]; then
+  # batch mode: tab-separated values on stdin
+  awk -F '\t' -v fmt="$fmt" -v type="$type" '
+    BEGIN { nf = split(fmt, F, ",") }
+    NF {
+      printf "@%s{FIXME,\n", type
+      for (i = 1; i <= nf && i <= NF; i++)
+        if ($i != "")
+          printf "  %s = {%s},\n", F[i], $i
+      print "}"
+    }' | "$bibkey"
+  exit $?
+fi
+
+# Name/value pairs are staged in a temp file, one tab-separated pair per
+# line. The EXIT trap removes it; INT and TERM are turned into exits so a
+# signal stops the script rather than letting it run on without the file.
+tmp=$(mktemp) || exit 1
+trap 'rm -f "$tmp"' EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+if [ $# -gt 0 ]; then
+  # argument mode: field=value pairs
+  for arg in "$@"; do
+    case $arg in
+      =*) usage ;;   # a field needs a name
+      *=*) printf '%s\t%s\n' "${arg%%=*}" "${arg#*=}" >> "$tmp" ;;
+      *) usage ;;
+    esac
+  done
+else
+  # interactive mode
+  printf 'entry type [%s]: ' "$type" >&2
+  read -r ans || exit 1
+  [ -n "$ans" ] && type=$ans
+  for f in $(fields_for "$type"); do
+    printf '%s: ' "$f" >&2
+    read -r ans || break
+    [ -n "$ans" ] && printf '%s\t%s\n' "$f" "$ans" >> "$tmp"
+  done
+fi
+
+if [ ! -s "$tmp" ]; then
+  printf 'bib-gen: no fields given\n' >&2
+  exit 1
+fi
+
+{
+  printf '@%s{FIXME,\n' "$type"
+  while IFS='	' read -r name value; do
+    printf '  %s = {%s},\n' "$name" "$value"
+  done < "$tmp"
+  printf '}\n'
+} | "$bibkey"
diff --git a/bib-key b/bib-key
new file mode 100755
index 0000000..ff3c363
--- /dev/null
+++ b/bib-key
@@ -0,0 +1,15 @@
+#!/bin/sh
+# bib-key - read bibtex entries and emit them with generated keys
+#
+# usage: bib-key [file ...]   (stdin if no file given)
+
+# library lookup order: $BIBUTILS_LIB, lib/ beside this script, default
+self_dir=$(dirname "$0")
+LIB=/usr/local/share/bibutils
+[ -d "$self_dir/lib" ] && LIB=$self_dir/lib
+[ -n "$BIBUTILS_LIB" ] && LIB=$BIBUTILS_LIB
+
+# parser first, then the canonical printer, then the rekeying hooks
+exec awk -f "$LIB/bib-parse.awk" \
+         -f "$LIB/bib-canon.awk" \
+         -f "$LIB/bib-key.awk" "$@"
diff --git a/bib-ls b/bib-ls
new file mode 100755
index 0000000..0ed7236
--- /dev/null
+++ b/bib-ls
@@ -0,0 +1,30 @@
+#!/bin/sh
+# bib-ls - list the entries in a bibtex database
+#
+# usage: bib-ls [-l] [file ...]   (stdin if no file given)
+#   -l  long format: key, type, author, year, title (tab-separated)
+
+usage() {
+  printf 'usage: bib-ls [-l] [file ...]\n' >&2
+  exit 2
+}
+
+# library lookup order: $BIBUTILS_LIB, lib/ beside this script, default
+self_dir=$(dirname "$0")
+LIB=/usr/local/share/bibutils
+[ -d "$self_dir/lib" ] && LIB=$self_dir/lib
+[ -n "$BIBUTILS_LIB" ] && LIB=$BIBUTILS_LIB
+
+long=0
+while getopts l flag; do
+  if [ "$flag" = l ]; then
+    long=1
+  else
+    usage
+  fi
+done
+shift $((OPTIND - 1))
+
+exec awk -f "$LIB/bib-parse.awk" \
+         -f "$LIB/bib-canon.awk" \
+         -f "$LIB/bib-ls.awk" -v long="$long" "$@"
diff --git a/bib-util b/bib-util
new file mode 100755
index 0000000..e807b03
--- /dev/null
+++ b/bib-util
@@ -0,0 +1,28 @@
+#!/bin/sh
+# bib-util - wrapper dispatching to the individual bibutils scripts
+#
+# usage: bib-util command [args ...]
+#
+# Each command maps to the bib-<command> script that lives in the same
+# directory as this wrapper; the remaining arguments are passed through.
+
+# print the synopsis and command list on stderr, then exit 2
+usage() {
+  printf 'usage: bib-util command [args ...]\n' >&2
+  printf 'commands: add check convert extract fetch gen key ls\n' >&2
+  exit 2
+}
+
+[ $# -lt 1 ] && usage
+
+# dispatch on the first argument
+case $1 in
+  help|-h|--help)
+    usage ;;
+  add|check|convert|extract|fetch|gen|key|ls)
+    sub=$1; shift
+    exec "$(dirname "$0")/bib-$sub" "$@" ;;
+  *)
+    printf 'bib-util: unknown command: %s\n' "$1" >&2
+    usage ;;
+esac
diff --git a/lib/bib-canon.awk b/lib/bib-canon.awk
new file mode 100644
index 0000000..d11e9cb
--- /dev/null
+++ b/lib/bib-canon.awk
@@ -0,0 +1,28 @@
+# bib-canon.awk - canonical output helpers for bibutils
+#
+# Requires bib-parse.awk. Provides bib_emit() to print the current
+# entry in canonical form, and bib_get() to look up a field value.
+
+# emit the entry being parsed in canonical layout: one "name = value,"
+# line per field, indented two spaces; string values are brace-wrapped
+function bib_emit(type, key,    n, val) {
+  printf "@%s{%s,\n", type, key
+  for (n = 1; n <= BIB_N; n++) {
+    val = BIB_VAL[n]
+    if (BIB_KIND[n] != "s") {      # numbers and raw values go out as-is
+      printf "  %s = %s,\n", BIB_NAME[n], val
+      continue
+    }
+    gsub(/[ \t\r\n]+/, " ", val)   # collapse each whitespace run
+    printf "  %s = {%s},\n", BIB_NAME[n], bib_trim(val)
+  }
+  print "}"
+}
+
+# first value stored under field `name` (lowercase) in the current entry;
+# the empty string when the entry has no such field
+function bib_get(name,    k) {
+  for (k = 1; k <= BIB_N && BIB_NAME[k] != name; k++)
+    ;
+  return (k <= BIB_N) ? BIB_VAL[k] : ""
+}
diff --git a/lib/bib-check.awk b/lib/bib-check.awk
new file mode 100644
index 0000000..4411a55
--- /dev/null
+++ b/lib/bib-check.awk
@@ -0,0 +1,69 @@
+# bib-check.awk - lint a bibtex database
+#
+# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
+# line on stdout:
+#   - missing fields required by the entry type
+#   - duplicate keys
+#   - entries whose titles normalize to the same string (likely dups)
+#   - empty field values
+# Exits 1 if any problem was found.
+
+BEGIN {
+  # required fields per entry type, following the standard BibTeX styles;
+  # "a|b" means at least one of a, b must be present
+  REQ["article"] = "author title journal year"
+  REQ["book"] = "author|editor title publisher year"
+  REQ["booklet"] = "title"
+  REQ["inbook"] = "author|editor title chapter|pages publisher year"
+  REQ["incollection"] = "author title booktitle publisher year"
+  REQ["inproceedings"] = REQ["conference"] = "author title booktitle year"
+  REQ["manual"] = "title"
+  REQ["mastersthesis"] = REQ["phdthesis"] = "author title school year"
+  REQ["proceedings"] = "title year"
+  REQ["techreport"] = "author title institution year"
+  REQ["unpublished"] = "author title note"
+}
+
+function bib_pass(raw) { }         # @string/@preamble blocks are not linted
+
+# print "key: msg" on stdout and flag the run as failed via BIB_BAD
+function problem(key, msg) {
+  BIB_BAD = 1; print key ": " msg
+}
+
+# lint one entry. The checks run in a fixed order, which is also the
+# order of the report: duplicate key, required fields, empty values,
+# duplicate (normalized) title.
+function bib_entry(type, key,    want, nwant, w, opt, nopt, o, ok, f, norm) {
+  if (key in BIB_KEYS_SEEN)
+    problem(key, "duplicate key")
+  BIB_KEYS_SEEN[key] = 1
+
+  # each REQ word is a field name or an "a|b" list of alternatives
+  nwant = (type in REQ) ? split(REQ[type], want, " ") : 0
+  for (w = 1; w <= nwant; w++) {
+    nopt = split(want[w], opt, "|")
+    ok = 0
+    for (o = 1; o <= nopt && !ok; o++)
+      ok = (bib_get(opt[o]) != "")
+    if (!ok)
+      problem(key, "missing required field: " want[w])
+  }
+
+  # fields that are present but blank
+  for (f = 1; f <= BIB_N; f++)
+    if (bib_trim(BIB_VAL[f]) == "")
+      problem(key, "empty field: " BIB_NAME[f])
+
+  # titles are compared in lowercase, on letters and digits only
+  norm = tolower(bib_get("title"))
+  gsub(/[^a-z0-9]/, "", norm)
+  if (norm == "")
+    return
+  if (norm in BIB_TITLES_SEEN)
+    problem(key, "title duplicates " BIB_TITLES_SEEN[norm])
+  else
+    BIB_TITLES_SEEN[norm] = key
+}
+
+END { exit BIB_BAD }               # BIB_BAD is set to 1 by problem(); unset otherwise
diff --git a/lib/bib-key.awk b/lib/bib-key.awk
new file mode 100644
index 0000000..41534ba
--- /dev/null
+++ b/lib/bib-key.awk
@@ -0,0 +1,69 @@
+# bib-key.awk - rekey every entry with a generated citation key
+#
+# Requires bib-parse.awk and bib-canon.awk. Keys have the form
+# <surname><year><word>, e.g. knuth1984literate.
+
+# emit a raw @string/@preamble block unchanged; every output block after
+# the first is preceded by a blank line
+function bib_pass(raw) {
+  printf "%s%s\n", (bib_out_n++ ? "\n" : ""), raw
+}
+
+# rekey and emit one entry. The 2nd..26th entries sharing a generated key
+# get the suffixes b..z; later ones get "-<n>" so the suffix never runs out.
+function bib_entry(type, key,    k, n) {
+  if (bib_out_n++) print ""
+  k = bib_mkkey()
+  if (k in BIB_KEYS_SEEN) {
+    n = ++BIB_KEYS_SEEN[k]
+    k = k ((n <= 26) ? substr("bcdefghijklmnopqrstuvwxyz", n - 1, 1) : "-" n)
+  } else
+    BIB_KEYS_SEEN[k] = 1
+  bib_emit(type, k)
+}
+
+# build a key of the form <surname><year><word> from the first author (or
+# editor), the first four-digit year and the first significant title word
+function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
+  a = bib_get("author")
+  if (a == "")
+    a = bib_get("editor")
+  y = bib_get("year")
+  t = bib_get("title")
+
+  # surname of the first author
+  gsub(/[ \t\r\n]+/, " ", a)       # so "and" beside a line break still splits
+  if (match(a, / [Aa][Nn][Dd] /))
+    a = substr(a, 1, RSTART - 1)
+  gsub(/[{}]/, "", a)
+  a = bib_trim(a)
+  if (index(a, ",") > 0)
+    surname = substr(a, 1, index(a, ",") - 1)
+  else {
+    n = split(a, parts, /[ \t]+/)
+    surname = (n > 0) ? parts[n] : ""
+  }
+  gsub(/[^A-Za-z0-9]/, "", surname)
+  surname = tolower(surname)
+  if (surname == "")
+    surname = "anon"
+
+  # four-digit year, or nothing when the field holds none
+  y = match(y, /[0-9][0-9][0-9][0-9]/) ? substr(y, RSTART, 4) : ""
+
+  # first significant word of the title
+  gsub(/[{}]/, "", t)
+  word = ""
+  n = split(tolower(t), parts, /[^a-z0-9]+/)
+  for (i = 1; i <= n; i++) {
+    w = parts[i]
+    if (w == "" || w == "a" || w == "an" || w == "the" || w == "on" ||
+        w == "of" || w == "in" || w == "for" || w == "and" || w == "to" ||
+        w == "with" || w == "from" || w == "by" || w == "at" || w == "is")
+      continue
+    word = w
+    break
+  }
+
+  return surname y word
+}
diff --git a/lib/bib-ls.awk b/lib/bib-ls.awk
new file mode 100644
index 0000000..909b654
--- /dev/null
+++ b/lib/bib-ls.awk
@@ -0,0 +1,25 @@
+# bib-ls.awk - list database entries
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+#   long - 0: print one key per line
+#          1: print key, type, author, year and title, tab-separated
+
+function bib_pass(raw) { }         # @string/@preamble blocks are not listed
+
+# one line per entry: the bare key, or with long set the key, type,
+# first author (editor as fallback), year and title, tab-separated
+function bib_entry(type, key,    who, ttl) {
+  if (!(long + 0)) {
+    print key
+    return
+  }
+  who = bib_get("author")
+  who = (who == "") ? bib_get("editor") : who
+  ttl = bib_get("title")
+  # drop braces and collapse whitespace runs in both display strings
+  gsub(/[{}]/, "", who); gsub(/[ \t\r\n]+/, " ", who)
+  gsub(/[{}]/, "", ttl); gsub(/[ \t\r\n]+/, " ", ttl)
+  if (match(who, / [Aa][Nn][Dd] /))     # several names: keep the first
+    who = substr(who, 1, RSTART - 1) " et al."
+  printf "%s\t%s\t%s\t%s\t%s\n", key, type, who, bib_get("year"), ttl
+}
diff --git a/lib/bib-lskeys.awk b/lib/bib-lskeys.awk
new file mode 100644
index 0000000..1932ced
--- /dev/null
+++ b/lib/bib-lskeys.awk
@@ -0,0 +1,9 @@
+# bib-lskeys.awk - print the key of every entry, one per line
+#
+# Requires bib-parse.awk.
+
+function bib_pass(raw) { }         # @string/@preamble blocks are ignored
+
+function bib_entry(type, key) {    # type is not used: only the key is listed
+  printf "%s\n", key
+}
diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk
new file mode 100644
index 0000000..e5bf9fa
--- /dev/null
+++ b/lib/bib-parse.awk
@@ -0,0 +1,216 @@
+# bib-parse.awk - shared bibtex parsing library for bibutils
+#
+# Consumers must define two hook functions:
+#   bib_entry(type, key) - called once per regular entry. The fields are
+#                          available in BIB_N, BIB_NAME[], BIB_VAL[] and
+#                          BIB_KIND[]; the raw source text of the entry
+#                          is in BIB_RAW.
+#   bib_pass(raw)        - called for @string and @preamble blocks with
+#                          their raw source text.
+#
+# BIB_KIND[j] is "s" for ordinary string values (content stored without
+# delimiters; re-wrap in braces on output), "n" for bare numbers, and
+# "r" for raw values (macros, # concatenation) which should be emitted
+# verbatim.
+
+{ bib_buf = bib_buf $0 "\n" }      # collect every input line into one buffer
+
+END { bib_main(bib_buf) }          # parse once all input has been read
+
+# scan the buffered input for "@" characters; each one is handed to
+# bib_entry_at(), which returns the index at which to resume. Any other
+# character is stepped over.
+function bib_main(s,    pos, len) {
+  len = length(s)
+  pos = 1
+  while (pos <= len)
+    pos = (substr(s, pos, 1) == "@") ? bib_entry_at(s, pos) : pos + 1
+}
+
+# index of the first non-whitespace character at or after position i
+function bib_ws(s, i) {
+  for (; i <= length(s) && substr(s, i, 1) ~ /[ \t\r\n]/; i++) ;
+  return i
+}
+
+# t with leading and trailing whitespace (blank, tab, CR, LF) removed
+function bib_trim(t) {
+  gsub(/^[ \t\r\n]+|[ \t\r\n]+$/, "", t)
+  return t
+}
+
+# read the balanced {...} group that opens at index i. The text between
+# the outer braces is left in BIB_PIECE; the result is the index of the
+# character after the closing brace (past the end if it never closes).
+function bib_braced(s, i,    from, nest, ch, len) {
+  from = i
+  nest = 0
+  len = length(s)
+  while (i <= len) {
+    ch = substr(s, i, 1)
+    i++
+    if (ch == "{")
+      nest++
+    else if (ch == "}" && --nest == 0)
+      break
+  }
+  # drop one character at each end: the outer braces
+  BIB_PIECE = substr(s, from + 1, i - from - 2)
+  return i
+}
+
+# read the "..." group whose opening quote is at index i; a quote inside
+# braces does not end it. Content goes to BIB_PIECE, returns the index past it.
+function bib_quoted(s, i,    from, nest, ch, len) {
+  from = i
+  nest = 0
+  len = length(s)
+  for (i++; i <= len; i++) {
+    ch = substr(s, i, 1)
+    if (ch == "\"" && nest == 0) {
+      i++
+      break
+    }
+    if (ch == "{")
+      nest++
+    else if (ch == "}")
+      nest--
+  }
+  BIB_PIECE = substr(s, from + 1, i - from - 2)
+  return i
+}
+
+# skip the balanced op...cl group that starts at index i (the character
+# at i must be op); returns the index just past the matching cl, or past
+# the end of s when the group never closes
+function bib_skip_group(s, i, op, cl,    nest, ch, len) {
+  nest = 0
+  len = length(s)
+  while (i <= len) {
+    ch = substr(s, i, 1)
+    i++
+    if (ch == op)
+      nest++
+    else if (ch == cl && --nest == 0)
+      break
+  }
+  return i
+}
+
+# parse the field value at i, following "#" concatenation; sets BIB_VALUE and
+# BIB_VKIND and returns the index of the first non-blank character after it
+function bib_value(s, i,    start, c, piece, pieces, kind) {
+  start = i                        # kept so raw values can be re-read verbatim
+  pieces = 0
+  kind = ""
+  BIB_VALUE = ""
+  while (1) {
+    c = substr(s, i, 1)
+    if (c == "{") {
+      i = bib_braced(s, i)
+      BIB_VALUE = BIB_VALUE BIB_PIECE
+      if (kind == "")
+        kind = "s"
+    } else if (c == "\"") {
+      i = bib_quoted(s, i)
+      BIB_VALUE = BIB_VALUE BIB_PIECE
+      if (kind == "")
+        kind = "s"
+    } else {                       # bare token: all digits is "n", else "r"
+      piece = ""
+      while (i <= length(s) && substr(s, i, 1) !~ /[,#}) \t\r\n]/) {
+        piece = piece substr(s, i, 1)
+        i++
+      }
+      BIB_VALUE = BIB_VALUE piece
+      kind = (piece ~ /^[0-9]+$/) ? "n" : "r"
+    }
+    pieces++
+    i = bib_ws(s, i)
+    if (substr(s, i, 1) == "#")    # another piece follows the "#"
+      i = bib_ws(s, i + 1)
+    else
+      break
+  }
+  if (pieces > 1)                  # any concatenation is treated as raw
+    kind = "r"
+  if (kind == "r")                 # raw values keep their source text
+    BIB_VALUE = bib_trim(substr(s, start, i - start))
+  BIB_VKIND = kind
+  return i
+}
+
+# parse the construct whose "@" is at index i; returns the index past it.
+# @comment is skipped, @string/@preamble go to bib_pass(); anything else is
+# parsed into BIB_N/BIB_NAME/BIB_VAL/BIB_KIND and handed to bib_entry().
+function bib_entry_at(s, i,    at, type, opener, closer, key, name, c) {
+  at = i
+  i++
+  type = ""
+  while (i <= length(s) && substr(s, i, 1) ~ /[A-Za-z]/) {
+    type = type substr(s, i, 1)
+    i++
+  }
+  type = tolower(type)
+  i = bib_ws(s, i)
+  c = substr(s, i, 1)
+  opener = c
+  if (c == "{") closer = "}"
+  else if (c == "(") closer = ")"
+  else return i                    # stray @, not an entry
+
+  if (type == "comment")
+    return bib_skip_group(s, i, opener, closer)
+  if (type == "string" || type == "preamble") {
+    i = bib_skip_group(s, i, opener, closer)
+    bib_pass(bib_trim(substr(s, at, i - at)))
+    return i
+  }
+
+  i++                              # consume opener
+  i = bib_ws(s, i)
+  key = ""
+  while (i <= length(s) && substr(s, i, 1) !~ /[, \t\r\n})]/) {
+    key = key substr(s, i, 1)
+    i++
+  }
+  i = bib_ws(s, i)
+  if (substr(s, i, 1) == ",")
+    i++
+
+  BIB_N = 0
+  while (1) {
+    i = bib_ws(s, i)
+    c = substr(s, i, 1)
+    if (c == "" || c == closer) {
+      if (c == closer)
+        i++
+      break
+    }
+    if (c == ",") {
+      i++
+      continue
+    }
+    name = ""
+    while (i <= length(s) && substr(s, i, 1) !~ /[=, \t\r\n})]/) {
+      name = name substr(s, i, 1)
+      i++
+    }
+    i = bib_ws(s, i)
+    if (substr(s, i, 1) != "=") {  # malformed field: resync
+      # Step only if the name scan consumed nothing. Otherwise i may sit on
+      # the closer, and skipping it would make this entry swallow the next.
+      if (name == "") i++
+      continue
+    }
+    i = bib_ws(s, i + 1)
+    i = bib_value(s, i)
+    BIB_N++
+    BIB_NAME[BIB_N] = tolower(name)
+    BIB_VAL[BIB_N] = BIB_VALUE
+    BIB_KIND[BIB_N] = BIB_VKIND
+  }
+  BIB_RAW = bib_trim(substr(s, at, i - at))
+  bib_entry(type, key)
+  return i
+}
diff --git a/lib/bib-select.awk b/lib/bib-select.awk
new file mode 100644
index 0000000..1900390
--- /dev/null
+++ b/lib/bib-select.awk
@@ -0,0 +1,29 @@
+# bib-select.awk - emit entries selected by key, canonically
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+#   keys   - comma-separated list of entry keys
+#   invert - 0: emit entries whose key is in the list
+#            1: emit entries whose key is NOT in the list
+#
+# With keys="" and invert=1 this acts as a canonicalizing filter for
+# everything. @string and @preamble blocks always pass through.
+
+BEGIN {
+  # turn the comma-separated keys variable into the lookup set BIB_SEL
+  bib_sel_n = split(keys, bib_sel_k, ",")
+  while (bib_sel_i++ < bib_sel_n) BIB_SEL[bib_sel_k[bib_sel_i]] = 1
+}
+
+# @string/@preamble blocks always pass through, unmodified; every output
+# block after the first is preceded by a blank line
+function bib_pass(raw) {
+  printf "%s%s\n", (bib_out_n++ ? "\n" : ""), raw
+}
+
+# emit the entry canonically unless its membership in the key list equals
+# invert: listed keys go out normally, unlisted ones when inverted
+function bib_entry(type, key) {
+  if ((key in BIB_SEL) == invert + 0)
+    return
+  printf "%s", (bib_out_n++ ? "\n" : "")
+  bib_emit(type, key)
+}
diff --git a/lib/bib2ref.awk b/lib/bib2ref.awk
new file mode 100644
index 0000000..4d9e595
--- /dev/null
+++ b/lib/bib2ref.awk
@@ -0,0 +1,52 @@
+# bib2ref.awk - convert bibtex entries to refer records
+#
+# Requires bib-parse.awk and bib-canon.awk.
+
+function bib_pass(raw) { }         # @string/@preamble blocks produce no refer output
+
+# print one refer line "%<tag> <value>" unless the value is empty;
+# braces are dropped and whitespace runs collapsed first
+function r_field(tag, v) {
+  if (v == "") return
+  gsub(/[{}]/, "", v); gsub(/[ \t\r\n]+/, " ", v)
+  printf "%%%s %s\n", tag, bib_trim(v)
+}
+
+# print one "%<tag> <name>" line per name in an "and"-separated list
+function r_names(tag, v,    cnt, who, k, one) {
+  gsub(/[{}]/, "", v); gsub(/[ \t\r\n]+/, " ", v)
+  cnt = split(v, who, / +[Aa][Nn][Dd] +/)
+  for (k = 1; k <= cnt; k++)
+    if ((one = bib_trim(who[k])) != "")
+      printf "%%%s %s\n", tag, one
+}
+
+# write the current entry as one refer record; every record after the
+# first is preceded by a blank line
+function bib_entry(type, key,    when, mon, pp, who) {
+  if (bib_out_n++) print ""
+  r_names("A", bib_get("author"))
+  r_names("E", bib_get("editor"))
+  r_field("T", bib_get("title"))
+  r_field("J", bib_get("journal"))
+  r_field("B", bib_get("booktitle"))
+  when = bib_get("year")
+  mon = bib_get("month")
+  if (mon != "")
+    when = (when == "") ? mon : mon " " when
+  r_field("D", when)
+  r_field("V", bib_get("volume"))
+  r_field("N", bib_get("number"))
+  pp = bib_get("pages")
+  gsub(/--/, "-", pp)              # double hyphen in a page range -> single
+  r_field("P", pp)
+  # %I: the first non-empty of publisher, institution, school
+  who = bib_get("publisher")
+  if (who == "") who = bib_get("institution")
+  if (who == "") who = bib_get("school")
+  r_field("I", who)
+  r_field("C", bib_get("address"))
+  r_field("K", bib_get("keywords"))
+  r_field("X", bib_get("abstract"))
+  r_field("O", bib_get("note"))
+}
diff --git a/lib/ref2bib.awk b/lib/ref2bib.awk
new file mode 100644
index 0000000..422fdd7
--- /dev/null
+++ b/lib/ref2bib.awk
@@ -0,0 +1,107 @@
+# ref2bib.awk - convert refer records to bibtex entries
+#
+# Standalone (does not use bib-parse.awk). Records are separated by
+# blank lines. Output keys are FIXME; pipe through bib-key.
+
+# Paragraph mode: with RS empty, awk treats each run of lines delimited by
+# blank lines as one record; FS = "\n" then makes every line of the record
+# its own field ($1 .. $NF).
+BEGIN { RS = ""; FS = "\n" }
+
+# r_trim: return t without leading/trailing spaces, tabs and carriage returns
+function r_trim(t) {
+  gsub(/^[ \t\r]+|[ \t\r]+$/, "", t)
+  return t
+}
+
+# r_emit: print an indented "name = {v}," bibtex field line; empty v prints nothing
+function r_emit(name, v) {
+  if (length(v)) printf "  %s = {%s},\n", name, v
+}
+
+# One refer record (a blank-line-separated paragraph) in, one bibtex entry out.
+# %A/%E accumulate, other tags keep their last value; other lines continue the previous tag.
+{
+  split("", val); split("", A); split("", E)
+  na = ne = 0; lasttag = ""
+  for (i = 1; i <= NF; i++) {
+    line = $i
+    if (substr(line, 1, 1) == "%") {
+      tag = substr(line, 2, 1)
+      v = r_trim(substr(line, 3))
+      if (tag == "A")
+        A[++na] = v
+      else if (tag == "E")
+        E[++ne] = v
+      else
+        val[tag] = v
+      lasttag = tag
+    } else if (lasttag == "A")
+      A[na] = A[na] " " r_trim(line)
+    else if (lasttag == "E")
+      E[ne] = E[ne] " " r_trim(line)
+    else if (lasttag != "")
+      val[lasttag] = val[lasttag] " " r_trim(line)
+  }
+  if (na == 0 && ne == 0 && !("T" in val)) next   # no author, editor or title: skip
+
+  # guess an entry type from the fields present
+  if ("J" in val)
+    type = "article"
+  else if ("B" in val)
+    type = (val["B"] ~ /[Pp]roceedings|[Cc]onference|[Ss]ymposium|[Ww]orkshop/) \
+        ? "inproceedings" : "incollection"
+  else if ("R" in val)
+    type = "techreport"
+  else if ("I" in val)
+    type = "book"
+  else
+    type = "misc"
+
+  if (out_n++) print ""
+  printf "@%s{FIXME,\n", type
+
+  authors = ""
+  for (i = 1; i <= na; i++)
+    authors = (i == 1) ? A[i] : authors " and " A[i]
+  r_emit("author", authors)
+  editors = ""
+  for (i = 1; i <= ne; i++)
+    editors = (i == 1) ? E[i] : editors " and " E[i]
+  r_emit("editor", editors)
+
+  r_emit("title", val["T"])
+  r_emit("journal", val["J"])
+  r_emit("booktitle", val["B"])
+
+  # %D: the first 4-digit run is the year, the text around it the month
+  d = val["D"]
+  if (match(d, /[0-9][0-9][0-9][0-9]/)) {
+    r_emit("year", substr(d, RSTART, 4))
+    r_emit("month", r_trim(substr(d, 1, RSTART - 1) substr(d, RSTART + 4)))
+  } else
+    r_emit("year", d)
+
+  # %R (report number) becomes "number" unless %N already gave one, in which
+  # case it joins the note.  Compare values here: `"N" in val` would be true
+  # after any read of val["N"], because reading an element creates it.
+  num = val["N"]
+  note = val["O"]
+  if (val["R"] != "") {
+    if (num == "")
+      num = val["R"]
+    else
+      note = (note == "") ? val["R"] : val["R"] "; " note
+  }
+  r_emit("volume", val["V"])
+  r_emit("number", num)
+  p = val["P"]
+  gsub(/-+/, "--", p)            # any run of dashes becomes a bibtex "--"
+  r_emit("pages", p)
+  r_emit(type == "techreport" ? "institution" : "publisher", val["I"])
+  r_emit("address", val["C"])
+  r_emit("keywords", val["K"])
+  r_emit("abstract", val["X"])
+  r_emit("note", note)           # a single note field, never two
+  print "}"
+}
diff --git a/tests/integration.sh b/tests/integration.sh
new file mode 100755
index 0000000..ea847e4
--- /dev/null
+++ b/tests/integration.sh
@@ -0,0 +1,123 @@
+#!/bin/sh
+# integration.sh - end-to-end test against a real LaTeX document
+#
+# Requires pdflatex and bibtex; skipped otherwise. Set BIBTEST_NET=1 to
+# also exercise bib-fetch against doi.org (needs network access).
+
+# Resolve the repository root from this script's location and put it first in PATH
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+PATH=$ROOT:$PATH
+LSKEYS="awk -f $ROOT/lib/bib-parse.awk -f $ROOT/lib/bib-lskeys.awk"
+for tool in pdflatex bibtex; do
+  command -v "$tool" > /dev/null 2>&1 && continue
+  printf 'integration: pdflatex/bibtex not found, skipping\n' >&2; exit 0
+done
+
+tmpd=$(mktemp -d) || exit 1
+trap 'rm -rf "$tmpd"' EXIT; trap 'exit 1' INT TERM   # a signal ends the run; EXIT then cleans up
+cd "$tmpd" || exit 1
+
+pass=0 fail=0   # running tallies, reported at the end of the script
+# ok/not_ok DESC: bump the matching tally, then print the result line for DESC
+ok() { : "$((pass += 1))"; printf 'ok   - %s\n' "$1"; }
+not_ok() { : "$((fail += 1))"; printf 'FAIL - %s\n' "$1"; }
+
+# ---- build a database with bib-gen | bib-add ---------------------------
+bib-gen -t article author='Donald E. Knuth' title='Literate Programming' \
+  journal='The Computer Journal' year=1984 volume=27 number=2 \
+  pages='97--111' | bib-add master.bib
+bib-gen -t article author='Alan M. Turing' \
+  title='Computing Machinery and Intelligence' journal='Mind' year=1950 \
+  volume=59 pages='433--460' | bib-add master.bib
+printf 'Claude E. Shannon\tA Mathematical Theory of Communication\tBell System Technical Journal\t1948
+Edsger W. Dijkstra\tGo To Statement Considered Harmful\tCommunications of the ACM\t1968
+' | bib-gen -F author,title,journal,year | bib-add master.bib
+
+n=$($LSKEYS master.bib | wc -l)
+[ "$n" -eq 4 ] && ok "database built with 4 entries" \
+               || not_ok "database built with 4 entries (got $n)"
+
+# ---- compile a document citing a subset --------------------------------
+# paper.tex cites two of the four entries and prints a DOI in its body text
+cat > paper.tex <<'EOF'
+\documentclass{article}
+\begin{document}
+Machines may think~\cite{turing1950computing}; programs are
+literature~\cite{knuth1984literate}.
+
+DOI: 10.1093/comjnl/27.2.97
+\bibliographystyle{plain}
+\bibliography{master}
+\end{document}
+EOF
+pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1
+
+desc="pdflatex produced citations in aux"
+if grep -q 'citation{turing1950computing}' paper.aux; then ok "$desc"; else not_ok "$desc"; fi
+
+# ---- extract the cited subset and build against it ---------------------
+bib-extract paper.aux master.bib > paper.bib
+n=$($LSKEYS paper.bib | wc -l)
+[ "$n" -eq 2 ] && ok "bib-extract kept the 2 cited entries" \
+               || not_ok "bib-extract kept the 2 cited entries (got $n)"
+
+sed 's/\\bibdata{master}/\\bibdata{paper}/' paper.aux > tmp.aux \
+  && mv tmp.aux paper.aux
+bibtex paper > bibtex.log 2>&1
+grep -qi 'error\|warning' bibtex.log \
+  && not_ok "bibtex accepts canonical output cleanly" \
+  || ok "bibtex accepts canonical output cleanly"
+
+pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1
+pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1
+if grep -qi 'undefined' paper.log; then
+  not_ok "document resolves all citations"
+else
+  ok "document resolves all citations"
+fi
+[ -s paper.pdf ] && ok "pdf produced" || not_ok "pdf produced"
+
+# ---- convert roundtrip --------------------------------------------------
+# bibtex -> refer -> bibtex: the sorted key lists before and after must match
+bib-convert master.bib | bib-convert > roundtrip.bib
+before=$($LSKEYS master.bib | sort)
+after=$($LSKEYS roundtrip.bib | sort)
+desc="bibtex -> refer -> bibtex preserves all keys"
+if [ "$before" = "$after" ]; then ok "$desc"; else not_ok "$desc"; fi
+
+# ---- bib-fetch against the built pdf (network) --------------------------
+if [ "$BIBTEST_NET" = 1 ]; then   # opt-in: everything in this branch needs network access
+  if bib-fetch paper.pdf > fetched.bib 2> /dev/null; then   # paper.tex printed a DOI into this pdf
+    grep -q '^@article{knuth1984literate,' fetched.bib \
+      && ok "bib-fetch resolves DOI from built pdf" \
+      || not_ok "bib-fetch resolves DOI from built pdf"
+    if bib-fetch paper.pdf 2> /dev/null | bib-add master.bib 2> /dev/null; then   # success means the duplicate was NOT caught
+      not_ok "fetched entry detected as duplicate"
+    else
+      ok "fetched entry detected as duplicate"
+    fi
+  else
+    not_ok "bib-fetch resolves DOI from built pdf"   # bib-fetch itself exited nonzero
+  fi
+  bib-fetch -a 1706.03762 2> /dev/null \
+      | grep -q '^@misc{vaswani[0-9]*attention,' \
+    && ok "bib-fetch resolves arXiv id" \
+    || not_ok "bib-fetch resolves arXiv id"
+  cat > arx.tex <<'EOF'
+\documentclass{article}
+\begin{document}
+A preprint without any DOI.
+
+arXiv:1706.03762v7 [cs.CL] 2 Aug 2023
+\end{document}
+EOF
+  pdflatex -interaction=batchmode arx.tex > /dev/null 2>&1   # should produce arx.pdf for the next check
+  bib-fetch arx.pdf 2> /dev/null | grep -q 'eprint = {1706.03762}' \
+    && ok "bib-fetch extracts arXiv id from pdf" \
+    || not_ok "bib-fetch extracts arXiv id from pdf"
+else
+  printf 'skip - bib-fetch network tests (set BIBTEST_NET=1 to enable)\n'   # reported, not counted
+fi
+
+printf '\n%d passed, %d failed\n' "$pass" "$fail"   # final tally
+test "$fail" -eq 0   # last command: its status is 0 only when nothing failed
diff --git a/tests/run-tests.sh b/tests/run-tests.sh
new file mode 100755
index 0000000..70721db
--- /dev/null
+++ b/tests/run-tests.sh
@@ -0,0 +1,187 @@
+#!/bin/sh
+# run-tests.sh - test suite for bibutils
+
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+PATH=$ROOT:$PATH
+tmpd=$(mktemp -d) || exit 1
+# EXIT removes the scratch dir; INT/TERM must exit so no test runs after cleanup
+trap 'rm -rf "$tmpd"' EXIT
+trap 'exit 1' INT TERM
+pass=0 fail=0
+
+# ok DESC: record one passing test and print its result line
+ok() {
+  pass=$((pass + 1)); printf 'ok   - %s\n' "$1"
+}
+
+# not_ok DESC: record one failing test and print its result line
+not_ok() {
+  fail=$((fail + 1)); printf 'FAIL - %s\n' "$1"
+}
+
+# check description command...  (passes if the command succeeds)
+check() {
+  desc=$1
+  shift
+  if "$@" > /dev/null 2>&1; then
+    ok "$desc"
+  else
+    not_ok "$desc"
+  fi
+}
+
+entry='@ARTICLE{ junk-key ,
+  AUTHOR = "Donald E. Knuth",
+  Title={Literate   Programming},
+  JOURNAL  = {The Computer Journal},
+  Year = 1984, volume={27},
+  pages = {97--111}
+}'   # shared fixture: mixed-case names, uneven spacing, quoted and bare values
+
+# ---- bib-key ----------------------------------------------------------
+out=$(printf '%s\n' "$entry" | bib-key)
+check "bib-key generates surname-year-word key" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" '^@article{knuth1984literate,'
+
+# key collisions get letter suffixes; $out is passed to the child shell as an
+# argument so quotes in tool output cannot break or inject into the command
+out=$(printf '@inproceedings{a, author={J. Smith}, title={Fast Trees}, year=2020}
+@article{b, author={J. Smith}, title={Fast Trees Extended}, year=2020}
+@article{c, author={J. Smith}, title={Fast Tree Methods}, year=2020}\n' | bib-key)
+check "bib-key disambiguates colliding keys" \
+  sh -c 'o=$1; shift; for p in "$@"; do printf "%s" "$o" | grep -q -- "$p" || exit 1; done' \
+  sh "$out" '{smith2020fast,' '{smith2020fastb,' '{smith2020fastc,'
+
+# ---- canonicalization via bib-add -------------------------------------
+# Adding the messy fixture is expected to store it in canonical form.
+db=$tmpd/refs.bib
+printf '%s\n' "$entry" | bib-add "$db"
+check "bib-add creates database" test -s "$db"
+check "bib-add lowercases field names" grep -q '  author = {Donald E. Knuth},' "$db"
+check "bib-add collapses whitespace in values" grep -q '  title = {Literate Programming},' "$db"
+check "bib-add keeps bare numbers bare" grep -q '  year = 1984,' "$db"
+
+# duplicate detection: a second add of the same key must exit nonzero
+if ! printf '%s\n' "$entry" | bib-add "$db" 2> /dev/null; then
+  ok "bib-add rejects duplicate key"
+else
+  not_ok "bib-add rejects duplicate key"
+fi
+
+# forced replacement: with -f the entry is expected to be replaced, not duplicated
+printf '%s\n' "$entry" | sed 's/1984/1985/' | bib-add -f "$db"
+check "bib-add -f replaces entry" grep -q '  year = 1985,' "$db"
+n=$(grep -c '^@article{junk-key,' "$db")
+check "bib-add -f leaves one copy" test "$n" = 1
+
+# ---- bib-extract -------------------------------------------------------
+cat > "$tmpd/all.bib" <<'EOF'
+@article{alpha2020one, author = {A. Alpha}, title = {One}, year = 2020}
+@article{beta2021two, author = {B. Beta}, title = {Two}, year = 2021}
+@article{gamma2022three, author = {C. Gamma}, title = {Three}, year = 2022}
+EOF
+cat > "$tmpd/doc.aux" <<'EOF'
+\relax
+\citation{alpha2020one}
+\citation{gamma2022three,alpha2020one}
+\bibstyle{plain}
+EOF
+out=$(bib-extract "$tmpd/doc.aux" "$tmpd/all.bib")
+check "bib-extract keeps cited entries" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" alpha2020one   # $out travels as $1, never as shell source
+check "bib-extract keeps all cited entries" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" gamma2022three
+if printf '%s' "$out" | grep -q beta2021two; then
+  not_ok "bib-extract drops uncited entries"
+else
+  ok "bib-extract drops uncited entries"
+fi
+
+# ---- bib-convert -------------------------------------------------------
+out=$(printf '%s\n' "$entry" | bib-convert)
+check "bib-convert emits refer author" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" '^%A Donald E. Knuth$'   # output and pattern go in as arguments
+check "bib-convert emits refer pages with single dash" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" '^%P 97-111$'
+
+cat > "$tmpd/rec.ref" <<'EOF'
+%A Alan M. Turing
+%T Computing Machinery and Intelligence
+%J Mind
+%D 1950
+%V 59
+%P 433-460
+EOF
+out=$(bib-convert "$tmpd/rec.ref")
+check "bib-convert refer->bibtex type guess" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" '^@article{turing1950computing,'
+check "bib-convert refer->bibtex pages" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" '  pages = {433--460},'
+
+# ---- bib-gen -----------------------------------------------------------
+out=$(bib-gen -t book author='Xavier Yu' title='Some Title' year=2001 publisher='Pub')
+check "bib-gen argument mode" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" '^@book{yu2001some,'   # output passed as $1, not spliced in
+
+out=$(printf 'A. Author\tNeat Paper\tGood Journal\t1999\n' \
+    | bib-gen -F author,title,journal,year)
+check "bib-gen batch mode" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" '^@article{author1999neat,'
+
+# ---- bib-ls ------------------------------------------------------------
+out=$(bib-ls "$tmpd/all.bib")
+check "bib-ls lists keys" \
+  sh -c '[ "$(printf "%s\n" "$1" | wc -l)" -eq 3 ]' sh "$out"   # numeric -eq: wc may pad its count with spaces
+out=$(bib-ls -l "$tmpd/all.bib")
+check "bib-ls -l shows details" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" 'beta2021two	article	B. Beta	2021	Two'
+
+# ---- bib-check ---------------------------------------------------------
+cat > "$tmpd/bad.bib" <<'EOF'
+@article{good2020fine, author = {A. Good}, title = {Fine}, journal = {J}, year = 2020}
+@article{noj2020sad, author = {B. Sad}, title = {No Journal Here}, year = 2020}
+@misc{noj2020sad, title = {Dup Key}}
+@book{dup2021title, author = {C. Dup}, title = {FINE!}, publisher = {P}, year = 2021}
+EOF
+out=$(bib-check "$tmpd/bad.bib")
+if [ $? -ne 0 ]; then ok "bib-check exits nonzero on problems"; else not_ok "bib-check exits nonzero on problems"; fi
+check "bib-check finds missing field" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" 'noj2020sad: missing required field: journal'   # report passed as $1
+check "bib-check finds duplicate key" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" 'noj2020sad: duplicate key'
+check "bib-check finds duplicate title" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" 'dup2021title: title duplicates good2020fine'
+cat > "$tmpd/clean.bib" <<'EOF'
+@article{a2020x, author = {A. A}, title = {X}, journal = {J}, year = 2020}
+@misc{b2021y, title = {Y}}
+EOF
+check "bib-check passes a clean db" bib-check "$tmpd/clean.bib"
+
+# ---- biblatex aux ------------------------------------------------------
+cat > "$tmpd/bl.aux" <<'EOF'
+\abx@aux@refcontext{nty/global//global/global}
+\abx@aux@cite{0}{beta2021two}
+EOF
+out=$(bib-extract "$tmpd/bl.aux" "$tmpd/all.bib")
+check "bib-extract reads biblatex aux" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" beta2021two   # output passed as $1, not spliced in
+
+# ---- bib-util ----------------------------------------------------------
+out=$(printf '%s\n' "$entry" | bib-util key)
+check "bib-util dispatches" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" knuth1984literate   # output passed as $1, not spliced in
+
+# ---- @string passthrough -----------------------------------------------
+cat > "$tmpd/str.bib" <<'EOF'
+@string{cj = {The Computer Journal}}
+@article{knuth1984literate, author = {D. Knuth}, journal = cj, year = 1984}
+EOF
+printf '\\citation{knuth1984literate}\n' > "$tmpd/s.aux"
+out=$(bib-extract "$tmpd/s.aux" "$tmpd/str.bib")
+check "bib-extract passes @string through" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" '@string{cj'   # output passed as $1, not spliced in
+check "macro field stays raw" \
+  sh -c 'printf "%s" "$1" | grep -q -- "$2"' sh "$out" '  journal = cj,'
+
+printf '\n%d passed, %d failed\n' "$pass" "$fail"   # final tally
+test "$fail" -eq 0   # last command: its status is 0 only when nothing failed