diff options
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | LICENSE | 29 | ||||
| -rw-r--r-- | Makefile | 27 | ||||
| -rw-r--r-- | README.md | 45 | ||||
| -rwxr-xr-x | bib-add | 67 | ||||
| -rwxr-xr-x | bib-check | 18 | ||||
| -rwxr-xr-x | bib-convert | 56 | ||||
| -rwxr-xr-x | bib-extract | 60 | ||||
| -rwxr-xr-x | bib-fetch | 82 | ||||
| -rwxr-xr-x | bib-gen | 95 | ||||
| -rwxr-xr-x | bib-key | 15 | ||||
| -rwxr-xr-x | bib-ls | 30 | ||||
| -rwxr-xr-x | bib-util | 28 | ||||
| -rw-r--r-- | lib/bib-canon.awk | 28 | ||||
| -rw-r--r-- | lib/bib-check.awk | 69 | ||||
| -rw-r--r-- | lib/bib-key.awk | 69 | ||||
| -rw-r--r-- | lib/bib-ls.awk | 25 | ||||
| -rw-r--r-- | lib/bib-lskeys.awk | 9 | ||||
| -rw-r--r-- | lib/bib-parse.awk | 216 | ||||
| -rw-r--r-- | lib/bib-select.awk | 29 | ||||
| -rw-r--r-- | lib/bib2ref.awk | 52 | ||||
| -rw-r--r-- | lib/ref2bib.awk | 107 | ||||
| -rwxr-xr-x | tests/integration.sh | 123 | ||||
| -rwxr-xr-x | tests/run-tests.sh | 187 |
24 files changed, 1465 insertions, 3 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d38c149 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.swp +*~ @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2026, Douglas Brumbaugh + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..079df9f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,37 @@
+# Installation layout.  Override on the command line, e.g.
+#   make PREFIX=/opt/bibutils install
+# DESTDIR is deliberately left undefined here (it expands to nothing) so
+# it can be supplied on the command line or from the environment to stage
+# the install under another root, e.g. when building a package.
+PREFIX = /usr/local
+BINDIR = $(PREFIX)/bin
+LIBDIR = $(PREFIX)/share/bibutils
+
+SCRIPTS = bib-util bib-add bib-check bib-convert bib-extract bib-fetch \
+          bib-gen bib-key bib-ls
+LIBS = lib/bib-parse.awk lib/bib-canon.awk lib/bib-select.awk \
+       lib/bib-lskeys.awk lib/bib-key.awk lib/bib-ls.awk \
+       lib/bib-check.awk lib/bib2ref.awk lib/ref2bib.awk
+
+# default goal: nothing is compiled, just point at the useful targets
+all:
+	@echo "nothing to build; run 'make test' or 'make install'"
+
+# run both test scripts; either one failing fails the target
+test:
+	tests/run-tests.sh
+	tests/integration.sh
+
+# copy the scripts and the awk library into place; a failing mkdir now
+# stops the install instead of being ignored
+install:
+	mkdir -p "$(DESTDIR)$(BINDIR)" "$(DESTDIR)$(LIBDIR)"
+	cp $(SCRIPTS) "$(DESTDIR)$(BINDIR)"
+	cp $(LIBS) "$(DESTDIR)$(LIBDIR)"
+
+# remove the installed scripts and the whole library directory
+uninstall:
+	cd "$(DESTDIR)$(BINDIR)" && rm -f $(SCRIPTS)
+	rm -rf "$(DESTDIR)$(LIBDIR)"
+
+.PHONY: all test install uninstall
@@ -21,7 +21,19 @@ formatted bibtex entry will be emitted on stdout.
 
 ## bib-extract
 A script which filters a bibtex database provided on stdin or as an argument
-and emits only those entries contained within a specified aux file.
+and emits only those entries contained within a specified aux file. Both
+classic bibtex and biblatex/biber aux files are understood, and \nocite{*}
+selects the whole database. (roff citation sources are planned but not yet
+supported.)
+
+## bib-ls
+List the entries in a database, one key per line, or with -l as
+tab-separated key, type, author, year and title.
+
+## bib-check
+Lint a database: reports missing required fields, duplicate keys,
+duplicate titles (likely duplicated entries) and empty field values.
+Exits nonzero if any problem was found.
 
 ## bib-key
 A script which accepts a bibtex entry on stdin, and emits it on stdout with
@@ -29,5 +41,32 @@ an automatically generated bibtex key.
 
 ## bib-fetch
 A script which accepts a pdf file as an input argument and will attempt to
-fetch a corresponding bibtex entry from crossref.org based on its DOI, if
-one is available.
+fetch a corresponding bibtex entry based on its DOI (via crossref.org) or, +failing that, its arXiv id (via arxiv.org). An identifier can also be given +directly with -d (DOI) or -a (arXiv id). + +## bib-convert +Convert between bibtex and refer database formats. The direction is detected +automatically from the input, or can be forced with -b (to bibtex) or -r +(to refer). + +# Canonical form +Entries that pass through these tools are canonicalized: lowercase entry +types and field names, 2-space indentation, brace-delimited values with +internal whitespace collapsed, bare numbers left bare, and macro +references/concatenations preserved verbatim. @string and @preamble blocks +pass through untouched. + +# Installation + make install # PREFIX=/usr/local by default + +The scripts look for the shared awk library in $BIBUTILS_LIB, then in +lib/ next to the script, then in /usr/local/share/bibutils. If installing +with a non-default PREFIX, set BIBUTILS_LIB accordingly. + +# Dependencies +POSIX shell and awk only, with two exceptions: bib-fetch requires curl, +plus pdftotext (poppler) for DOI extraction from pdfs. 
+ +# Tests + make test @@ -0,0 +1,67 @@ +#!/bin/sh +# bib-add - insert bibtex entries from stdin into a database file +# +# usage: bib-add [-f] db.bib < entry +# -f replace existing entries with the same key + +usage() { + printf 'usage: bib-add [-f] db.bib < entry\n' >&2 + exit 2 +} + +if [ -n "$BIBUTILS_LIB" ]; then + LIB=$BIBUTILS_LIB +elif [ -d "$(dirname "$0")/lib" ]; then + LIB=$(dirname "$0")/lib +else + LIB=/usr/local/share/bibutils +fi + +force=0 +while getopts f opt; do + case $opt in + f) force=1 ;; + *) usage ;; + esac +done +shift $((OPTIND - 1)) +[ $# -eq 1 ] || usage +db=$1 + +tmp=$(mktemp) && tmpkeys=$(mktemp) && tmpdb=$(mktemp) || exit 1 +trap 'rm -f "$tmp" "$tmpkeys" "$tmpdb"' EXIT INT TERM + +# canonicalize the incoming entries +awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" -f "$LIB/bib-select.awk" \ + -v keys= -v invert=1 > "$tmp" + +if [ ! -s "$tmp" ]; then + printf 'bib-add: no entries on stdin\n' >&2 + exit 1 +fi + +awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$tmp" > "$tmpkeys" + +if [ -f "$db" ]; then + dups=$(awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-lskeys.awk" "$db" \ + | grep -Fxf "$tmpkeys") || dups= + if [ -n "$dups" ]; then + if [ "$force" -eq 1 ]; then + # rewrite the database without the entries being replaced + keys=$(printf '%s\n' "$dups" | paste -sd, -) + awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \ + -f "$LIB/bib-select.awk" -v keys="$keys" -v invert=1 \ + "$db" > "$tmpdb" || exit 1 + cp "$tmpdb" "$db" + else + printf 'bib-add: duplicate keys in %s:\n' "$db" >&2 + printf '%s\n' "$dups" >&2 + exit 1 + fi + fi +fi + +{ + [ -s "$db" ] && echo "" + cat "$tmp" +} >> "$db" diff --git a/bib-check b/bib-check new file mode 100755 index 0000000..062e157 --- /dev/null +++ b/bib-check @@ -0,0 +1,18 @@ +#!/bin/sh +# bib-check - lint a bibtex database +# +# usage: bib-check [file ...] (stdin if no file given) +# +# Reports missing required fields, duplicate keys, duplicate titles and +# empty field values. 
Exits nonzero if any problem was found. + +if [ -n "$BIBUTILS_LIB" ]; then + LIB=$BIBUTILS_LIB +elif [ -d "$(dirname "$0")/lib" ]; then + LIB=$(dirname "$0")/lib +else + LIB=/usr/local/share/bibutils +fi + +exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \ + -f "$LIB/bib-check.awk" "$@" diff --git a/bib-convert b/bib-convert new file mode 100755 index 0000000..ef4c0b0 --- /dev/null +++ b/bib-convert @@ -0,0 +1,56 @@ +#!/bin/sh +# bib-convert - convert between bibtex and refer database formats +# +# usage: bib-convert [-b | -r] [file] (stdin if no file given) +# -b force refer -> bibtex +# -r force bibtex -> refer +# +# Without a flag the direction is detected from the input: text whose +# first record starts with @ is taken as bibtex, with % as refer. + +usage() { + printf 'usage: bib-convert [-b | -r] [file]\n' >&2 + exit 2 +} + +if [ -n "$BIBUTILS_LIB" ]; then + LIB=$BIBUTILS_LIB +elif [ -d "$(dirname "$0")/lib" ]; then + LIB=$(dirname "$0")/lib +else + LIB=/usr/local/share/bibutils +fi + +bibkey=$(dirname "$0")/bib-key +[ -x "$bibkey" ] || bibkey=bib-key + +mode=auto +while getopts br opt; do + case $opt in + b) mode=tobib ;; + r) mode=toref ;; + *) usage ;; + esac +done +shift $((OPTIND - 1)) +[ $# -le 1 ] || usage + +tmp=$(mktemp) || exit 1 +trap 'rm -f "$tmp"' EXIT INT TERM +cat "$@" > "$tmp" + +if [ "$mode" = auto ]; then + first=$(awk 'NF { sub(/^[ \t]+/, ""); print substr($0, 1, 1); exit }' "$tmp") + case $first in + @) mode=toref ;; + %) mode=tobib ;; + *) printf 'bib-convert: cannot detect input format\n' >&2; exit 1 ;; + esac +fi + +if [ "$mode" = toref ]; then + exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \ + -f "$LIB/bib2ref.awk" "$tmp" +else + awk -f "$LIB/ref2bib.awk" "$tmp" | "$bibkey" +fi diff --git a/bib-extract b/bib-extract new file mode 100755 index 0000000..52aa85b --- /dev/null +++ b/bib-extract @@ -0,0 +1,60 @@ +#!/bin/sh +# bib-extract - emit only the database entries cited in an aux file +# +# usage: bib-extract 
file.aux [db.bib] (db on stdin if omitted) +# +# roff/refer citation sources are planned but not yet supported. + +usage() { + printf 'usage: bib-extract file.aux [db.bib]\n' >&2 + exit 2 +} + +if [ -n "$BIBUTILS_LIB" ]; then + LIB=$BIBUTILS_LIB +elif [ -d "$(dirname "$0")/lib" ]; then + LIB=$(dirname "$0")/lib +else + LIB=/usr/local/share/bibutils +fi + +[ $# -ge 1 ] && [ $# -le 2 ] || usage +aux=$1 +shift +[ -r "$aux" ] || { printf 'bib-extract: cannot read %s\n' "$aux" >&2; exit 1; } + +keys=$(awk ' + # classic bibtex: \citation{key,key,...} + { + line = $0 + while (match(line, /\\citation\{[^}]*\}/)) { + n = split(substr(line, RSTART + 10, RLENGTH - 11), a, ",") + for (i = 1; i <= n; i++) + if (a[i] != "") + print a[i] + line = substr(line, RSTART + RLENGTH) + } + } + # biblatex/biber: \abx@aux@cite{segment}{key} (older: one argument) + { + line = $0 + while (match(line, /\\abx@aux@cite(\{[0-9]*\})?\{[^}]*\}/)) { + s = substr(line, RSTART, RLENGTH) + sub(/\}$/, "", s) + sub(/^.*\{/, "", s) + if (s != "") + print s + line = substr(line, RSTART + RLENGTH) + } + }' "$aux" | sort -u | paste -sd, -) + +[ -n "$keys" ] || exit 0 + +# \nocite{*} cites everything: emit the whole database +case ",$keys," in + *,\*,*) keys= invert=1 ;; + *) invert=0 ;; +esac + +exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \ + -f "$LIB/bib-select.awk" -v keys="$keys" -v invert="$invert" "$@" diff --git a/bib-fetch b/bib-fetch new file mode 100755 index 0000000..62f7993 --- /dev/null +++ b/bib-fetch @@ -0,0 +1,82 @@ +#!/bin/sh +# bib-fetch - fetch a bibtex entry for a pdf from its DOI or arXiv id +# +# usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf] +# +# Unless given with -d or -a, an identifier is extracted from the first +# pages of the pdf (requires pdftotext): a DOI if one is found, falling +# back to an arXiv id. DOIs are resolved through doi.org content +# negotiation (crossref et al.), arXiv ids through arxiv.org. 
The entry
+# is emitted canonically on stdout with a generated key.
+
+usage() {
+    printf 'usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]\n' >&2
+    exit 2
+}
+
+doi=
+arxiv=
+while getopts d:a: opt; do
+    case $opt in
+        d) doi=$OPTARG ;;
+        a) arxiv=$OPTARG ;;
+        *) usage ;;
+    esac
+done
+shift $((OPTIND - 1))
+# -d and -a are mutually exclusive
+[ -n "$doi" ] && [ -n "$arxiv" ] && usage
+
+# prefer the bib-key sitting next to this script, else the one on PATH
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+command -v curl > /dev/null 2>&1 || {
+    printf 'bib-fetch: curl is required\n' >&2
+    exit 1
+}
+
+if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+    # no identifier given: look for one in the first two pages of the pdf
+    [ $# -eq 1 ] || usage
+    pdf=$1
+    [ -r "$pdf" ] || { printf 'bib-fetch: cannot read %s\n' "$pdf" >&2; exit 1; }
+    command -v pdftotext > /dev/null 2>&1 || {
+        printf 'bib-fetch: pdftotext is required to extract an identifier\n' >&2
+        exit 1
+    }
+    ids=$(pdftotext -l 2 "$pdf" - 2> /dev/null | awk '
+        # keep the first DOI-looking string and the first arXiv stamp
+        doi == "" {
+            if (match($0, /10\.[0-9][0-9][0-9][0-9][0-9]*\/[^ \t"<>]+/)) {
+                doi = substr($0, RSTART, RLENGTH)
+                sub(/[.,;)\]]+$/, "", doi)
+            }
+        }
+        arxiv == "" {
+            # modern ids (arXiv:2104.01234v2) and old style (arXiv:cs/0101001)
+            if (match($0, /[aA][rR][xX][iI][vV]:[0-9][0-9][0-9][0-9]\.[0-9][0-9][0-9][0-9][0-9]?(v[0-9]+)?/) ||
+                match($0, /[aA][rR][xX][iI][vV]:[a-z-]+(\.[A-Z][A-Z])?\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9](v[0-9]+)?/))
+                arxiv = substr($0, RSTART + 6, RLENGTH - 6)
+        }
+        END { printf "%s\t%s\n", doi, arxiv }')
+    # awk printed "<doi><TAB><arxiv>" (either side may be empty), so the
+    # split has to happen on that tab, not on a space.  The tab is taken
+    # from printf so the separator stays visible and cannot be lost when
+    # this script is reformatted.
+    tab=$(printf '\t')
+    doi=${ids%%"$tab"*}
+    arxiv=${ids#*"$tab"}
+    if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+        printf 'bib-fetch: no DOI or arXiv id found in %s\n' "$pdf" >&2
+        exit 1
+    fi
+fi
+
+# a DOI wins over an arXiv id when both are known
+if [ -n "$doi" ]; then
+    entry=$(curl -sSfL -H 'Accept: application/x-bibtex' \
+        "https://doi.org/$doi") || {
+        printf 'bib-fetch: failed to fetch entry for doi %s\n' "$doi" >&2
+        exit 1
+    }
+else
+    arxiv=${arxiv#arXiv:}
+    entry=$(curl -sSfL "https://arxiv.org/bibtex/$arxiv") || {
+        printf 'bib-fetch: failed to fetch entry for arXiv id %s\n' "$arxiv" >&2
+        
exit 1 + } +fi + +printf '%s\n' "$entry" | "$bibkey" @@ -0,0 +1,95 @@ +#!/bin/sh +# bib-gen - generate a bibtex entry +# +# usage: bib-gen [-t type] [field=value ...] +# bib-gen [-t type] -F field,field,... (tab-separated stdin) +# +# With field=value arguments, one entry is built from them. With -F, +# one entry is built per tab-separated line of stdin, columns matching +# the listed fields. Otherwise the user is prompted interactively. +# Entries are emitted on stdout with generated keys. + +usage() { + printf 'usage: bib-gen [-t type] [field=value ...]\n' >&2 + printf ' bib-gen [-t type] -F field,field,... < data\n' >&2 + exit 2 +} + +type=article +fmt= +while getopts t:F: opt; do + case $opt in + t) type=$OPTARG ;; + F) fmt=$OPTARG ;; + *) usage ;; + esac +done +shift $((OPTIND - 1)) + +bibkey=$(dirname "$0")/bib-key +[ -x "$bibkey" ] || bibkey=bib-key + +# fields prompted for in interactive mode, per entry type +fields_for() { + case $1 in + article) echo "author title journal year volume number pages month doi" ;; + book) echo "author title publisher year volume series address edition" ;; + inproceedings|conference) + echo "author title booktitle year editor pages publisher doi" ;; + incollection) echo "author title booktitle publisher year editor pages chapter" ;; + techreport) echo "author title institution year number address month" ;; + phdthesis|mastersthesis) + echo "author title school year address month" ;; + *) echo "author title year howpublished note url" ;; + esac +} + +if [ -n "$fmt" ]; then + # batch mode: tab-separated values on stdin + awk -F '\t' -v fmt="$fmt" -v type="$type" ' + BEGIN { nf = split(fmt, F, ",") } + NF { + printf "@%s{FIXME,\n", type + for (i = 1; i <= nf && i <= NF; i++) + if ($i != "") + printf " %s = {%s},\n", F[i], $i + print "}" + }' | "$bibkey" + exit $? 
+fi
+
+# scratch file holding one "name<TAB>value" line per field
+tmp=$(mktemp) || exit 1
+trap 'rm -f "$tmp"' EXIT INT TERM
+
+if [ $# -gt 0 ]; then
+    # argument mode: field=value pairs
+    for arg in "$@"; do
+        case $arg in
+            *=*) printf '%s\t%s\n' "${arg%%=*}" "${arg#*=}" >> "$tmp" ;;
+            *) usage ;;
+        esac
+    done
+else
+    # interactive mode
+    printf 'entry type [%s]: ' "$type" >&2
+    read -r ans || exit 1
+    [ -n "$ans" ] && type=$ans
+    for f in $(fields_for "$type"); do
+        printf '%s: ' "$f" >&2
+        read -r ans || break
+        [ -n "$ans" ] && printf '%s\t%s\n' "$f" "$ans" >> "$tmp"
+    done
+fi
+
+if [ ! -s "$tmp" ]; then
+    printf 'bib-gen: no fields given\n' >&2
+    exit 1
+fi
+
+# The scratch file is tab-separated, so it must be read back with a tab
+# as the only field separator; splitting on spaces would cut a value such
+# as "Donald E. Knuth" in the wrong place.  The tab comes from printf so
+# it stays visible in the source.
+tab=$(printf '\t')
+{
+    printf '@%s{FIXME,\n' "$type"
+    while IFS=$tab read -r name value; do
+        printf ' %s = {%s},\n' "$name" "$value"
+    done < "$tmp"
+    printf '}\n'
+} | "$bibkey"
@@ -0,0 +1,15 @@
+#!/bin/sh
+# bib-key - read bibtex entries and emit them with generated keys
+#
+# usage: bib-key [file ...]    (stdin if no file given)
+
+if [ -n "$BIBUTILS_LIB" ]; then
+    LIB=$BIBUTILS_LIB
+elif [ -d "$(dirname "$0")/lib" ]; then
+    LIB=$(dirname "$0")/lib
+else
+    LIB=/usr/local/share/bibutils
+fi
+
+exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \
+    -f "$LIB/bib-key.awk" "$@"
@@ -0,0 +1,30 @@
+#!/bin/sh
+# bib-ls - list the entries in a bibtex database
+#
+# usage: bib-ls [-l] [file ...] 
(stdin if no file given) +# -l long format: key, type, author, year, title (tab-separated) + +usage() { + printf 'usage: bib-ls [-l] [file ...]\n' >&2 + exit 2 +} + +if [ -n "$BIBUTILS_LIB" ]; then + LIB=$BIBUTILS_LIB +elif [ -d "$(dirname "$0")/lib" ]; then + LIB=$(dirname "$0")/lib +else + LIB=/usr/local/share/bibutils +fi + +long=0 +while getopts l opt; do + case $opt in + l) long=1 ;; + *) usage ;; + esac +done +shift $((OPTIND - 1)) + +exec awk -f "$LIB/bib-parse.awk" -f "$LIB/bib-canon.awk" \ + -f "$LIB/bib-ls.awk" -v long="$long" "$@" diff --git a/bib-util b/bib-util new file mode 100755 index 0000000..e807b03 --- /dev/null +++ b/bib-util @@ -0,0 +1,28 @@ +#!/bin/sh +# bib-util - wrapper dispatching to the individual bibutils scripts +# +# usage: bib-util command [args ...] + +usage() { + printf 'usage: bib-util command [args ...]\n' >&2 + printf 'commands: add check convert extract fetch gen key ls\n' >&2 + exit 2 +} + +[ $# -ge 1 ] || usage +cmd=$1 +shift + +dir=$(dirname "$0") +case $cmd in + add|check|convert|extract|fetch|gen|key|ls) + exec "$dir/bib-$cmd" "$@" + ;; + help|-h|--help) + usage + ;; + *) + printf 'bib-util: unknown command: %s\n' "$cmd" >&2 + usage + ;; +esac diff --git a/lib/bib-canon.awk b/lib/bib-canon.awk new file mode 100644 index 0000000..d11e9cb --- /dev/null +++ b/lib/bib-canon.awk @@ -0,0 +1,28 @@ +# bib-canon.awk - canonical output helpers for bibutils +# +# Requires bib-parse.awk. Provides bib_emit() to print the current +# entry in canonical form, and bib_get() to look up a field value. 
+
+# Print the current entry (BIB_N fields held in BIB_NAME[], BIB_VAL[] and
+# BIB_KIND[]) in canonical form under the given type and key: one field
+# per line with a 2-space indent.  Values of kind "s" have runs of
+# whitespace collapsed to one space, are trimmed and wrapped in braces;
+# values of any other kind are printed exactly as stored.
+#   type, key - written into the "@type{key," header line as given
+#   j, v      - locals
+function bib_emit(type, key,    j, v) {
+    printf "@%s{%s,\n", type, key
+    for (j = 1; j <= BIB_N; j++) {
+        v = BIB_VAL[j]
+        if (BIB_KIND[j] == "s") {
+            gsub(/[ \t\r\n]+/, " ", v)
+            v = bib_trim(v)
+            # two spaces, as the canonical form is documented
+            printf "  %s = {%s},\n", BIB_NAME[j], v
+        } else
+            printf "  %s = %s,\n", BIB_NAME[j], v
+    }
+    print "}"
+}
+
+# Value of the first field called `name` in the current entry, "" if the
+# entry has no such field.  The comparison is exact, so `name` must be
+# given in lowercase to match the stored field names.
+#   j - local
+function bib_get(name,    j) {
+    for (j = 1; j <= BIB_N; j++)
+        if (BIB_NAME[j] == name)
+            return BIB_VAL[j]
+    return ""
+}
diff --git a/lib/bib-check.awk b/lib/bib-check.awk
new file mode 100644
index 0000000..4411a55
--- /dev/null
+++ b/lib/bib-check.awk
@@ -0,0 +1,69 @@
+# bib-check.awk - lint a bibtex database
+#
+# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
+# line on stdout:
+#   - missing fields required by the entry type
+#   - duplicate keys
+#   - entries whose titles normalize to the same string (likely dups)
+#   - empty field values
+# Exits 1 if any problem was found.
+ +BEGIN { + REQ["article"] = "author title journal year" + REQ["book"] = "author|editor title publisher year" + REQ["booklet"] = "title" + REQ["inbook"] = "author|editor title publisher year" + REQ["incollection"] = "author title booktitle publisher year" + REQ["inproceedings"] = "author title booktitle year" + REQ["conference"] = "author title booktitle year" + REQ["manual"] = "title" + REQ["mastersthesis"] = "author title school year" + REQ["phdthesis"] = "author title school year" + REQ["proceedings"] = "title year" + REQ["techreport"] = "author title institution year" + REQ["unpublished"] = "author title note" +} + +function bib_pass(raw) { } + +function problem(key, msg) { + printf "%s: %s\n", key, msg + BIB_BAD = 1 +} + +function bib_entry(type, key, n, req, i, alts, na, j, found, t, k) { + if (key in BIB_KEYS_SEEN) + problem(key, "duplicate key") + BIB_KEYS_SEEN[key] = 1 + + # required fields ("a|b" means at least one of a, b) + if (type in REQ) { + n = split(REQ[type], req, " ") + for (i = 1; i <= n; i++) { + na = split(req[i], alts, "|") + found = 0 + for (j = 1; j <= na; j++) + if (bib_get(alts[j]) != "") + found = 1 + if (!found) + problem(key, "missing required field: " req[i]) + } + } + + # empty values + for (i = 1; i <= BIB_N; i++) + if (bib_trim(BIB_VAL[i]) == "") + problem(key, "empty field: " BIB_NAME[i]) + + # likely duplicate entries: same normalized title + t = tolower(bib_get("title")) + gsub(/[^a-z0-9]/, "", t) + if (t != "") { + if (t in BIB_TITLES_SEEN) + problem(key, "title duplicates " BIB_TITLES_SEEN[t]) + else + BIB_TITLES_SEEN[t] = key + } +} + +END { exit BIB_BAD } diff --git a/lib/bib-key.awk b/lib/bib-key.awk new file mode 100644 index 0000000..41534ba --- /dev/null +++ b/lib/bib-key.awk @@ -0,0 +1,69 @@ +# bib-key.awk - rekey every entry with a generated citation key +# +# Requires bib-parse.awk and bib-canon.awk. Keys have the form +# <surname><year><word>, e.g. knuth1984literate. 
+
+# @string and @preamble blocks are printed as they came in, separated
+# from whatever was printed before by one blank line.
+function bib_pass(raw) {
+    if (bib_out_n++)
+        print ""
+    print raw
+}
+
+# Letter sequence used to tell colliding keys apart:
+# 1 -> a, 2 -> b, ... 26 -> z, 27 -> aa, 28 -> ab, ...
+#   s - local
+function bib_suffix(n,    s) {
+    s = ""
+    while (n > 0) {
+        n--
+        s = substr("abcdefghijklmnopqrstuvwxyz", n % 26 + 1, 1) s
+        n = int(n / 26)
+    }
+    return s
+}
+
+# Print the current entry under a generated key; the key found in the
+# input is not used.  The first entry producing a given key keeps it
+# bare, later ones get b, c, ... z, aa, ab, ... appended.  Every key
+# that was printed is remembered in BIB_KEYS_SEEN, so a suffixed key can
+# never clash with a key generated for another entry, and the sequence
+# no longer runs dry after z.
+#   base, k - locals
+function bib_entry(type, key,    base, k) {
+    if (bib_out_n++)
+        print ""
+    base = bib_mkkey()
+    k = base
+    # BIB_KEY_DUPS[base] counts the suffixes handed out for this base;
+    # keep drawing until the candidate is unused
+    while (k in BIB_KEYS_SEEN)
+        k = base bib_suffix((++BIB_KEY_DUPS[base]) + 1)
+    BIB_KEYS_SEEN[k] = 1
+    bib_emit(type, k)
+}
+
+# Build <surname><year><word> for the current entry: the lowercased
+# surname of the first author (editor if there is no author, "anon" if
+# neither), the first four-digit run of the year field, and the first
+# title word that is not a common stop word.  Missing parts are left out.
+function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
+    a = bib_get("author")
+    if (a == "")
+        a = bib_get("editor")
+    y = bib_get("year")
+    t = bib_get("title")
+
+    # surname of the first author.  Field values are stored as written,
+    # so fold line breaks and tabs to single spaces first; otherwise an
+    # "and" at a line end is missed and a name broken across lines is
+    # glued together.
+    gsub(/[ \t\r\n]+/, " ", a)
+    if (match(a, / [Aa][Nn][Dd] /))
+        a = substr(a, 1, RSTART - 1)
+    gsub(/[{}]/, "", a)
+    a = bib_trim(a)
+    if (index(a, ",") > 0)
+        # "Surname, Given" form
+        surname = substr(a, 1, index(a, ",") - 1)
+    else {
+        # "Given Surname" form: take the last word
+        n = split(a, parts, /[ \t\r\n]+/)
+        surname = (n > 0) ? parts[n] : ""
+    }
+    gsub(/[^A-Za-z0-9]/, "", surname)
+    surname = tolower(surname)
+    if (surname == "")
+        surname = "anon"
+
+    # four-digit year
+    if (match(y, /[0-9][0-9][0-9][0-9]/))
+        y = substr(y, RSTART, 4)
+    else
+        y = ""
+
+    # first significant word of the title
+    gsub(/[{}]/, "", t)
+    word = ""
+    n = split(tolower(t), parts, /[^a-z0-9]+/)
+    for (i = 1; i <= n; i++) {
+        w = parts[i]
+        if (w == "" || w == "a" || w == "an" || w == "the" || w == "on" ||
+            w == "of" || w == "in" || w == "for" || w == "and" || w == "to" ||
+            w == "with" || w == "from" || w == "by" || w == "at" || w == "is")
+            continue
+        word = w
+        break
+    }
+
+    return surname y word
+}
diff --git a/lib/bib-ls.awk b/lib/bib-ls.awk
new file mode 100644
index 0000000..909b654
--- /dev/null
+++ b/lib/bib-ls.awk
@@ -0,0 +1,25 @@
+# bib-ls.awk - list database entries
+#
+# Requires bib-parse.awk and bib-canon.awk. 
Variables (set with -v): +# long - 0: print one key per line +# 1: print key, type, author, year and title, tab-separated + +function bib_pass(raw) { } + +function bib_entry(type, key, a, t) { + if (long + 0 == 0) { + print key + return + } + a = bib_get("author") + if (a == "") + a = bib_get("editor") + gsub(/[{}]/, "", a) + gsub(/[ \t\r\n]+/, " ", a) + if (match(a, / [Aa][Nn][Dd] /)) + a = substr(a, 1, RSTART - 1) " et al." + t = bib_get("title") + gsub(/[{}]/, "", t) + gsub(/[ \t\r\n]+/, " ", t) + printf "%s\t%s\t%s\t%s\t%s\n", key, type, a, bib_get("year"), t +} diff --git a/lib/bib-lskeys.awk b/lib/bib-lskeys.awk new file mode 100644 index 0000000..1932ced --- /dev/null +++ b/lib/bib-lskeys.awk @@ -0,0 +1,9 @@ +# bib-lskeys.awk - print the key of every entry, one per line +# +# Requires bib-parse.awk. + +function bib_pass(raw) { } + +function bib_entry(type, key) { + print key +} diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk new file mode 100644 index 0000000..e5bf9fa --- /dev/null +++ b/lib/bib-parse.awk @@ -0,0 +1,216 @@ +# bib-parse.awk - shared bibtex parsing library for bibutils +# +# Consumers must define two hook functions: +# bib_entry(type, key) - called once per regular entry. The fields are +# available in BIB_N, BIB_NAME[], BIB_VAL[] and +# BIB_KIND[]; the raw source text of the entry +# is in BIB_RAW. +# bib_pass(raw) - called for @string and @preamble blocks with +# their raw source text. +# +# BIB_KIND[j] is "s" for ordinary string values (content stored without +# delimiters; re-wrap in braces on output), "n" for bare numbers, and +# "r" for raw values (macros, # concatenation) which should be emitted +# verbatim. 
+ +{ bib_buf = bib_buf $0 "\n" } + +END { bib_main(bib_buf) } + +function bib_main(s, i) { + i = 1 + while (i <= length(s)) { + if (substr(s, i, 1) == "@") + i = bib_entry_at(s, i) + else + i++ + } +} + +function bib_ws(s, i) { + while (i <= length(s) && substr(s, i, 1) ~ /[ \t\r\n]/) + i++ + return i +} + +function bib_trim(t) { + sub(/^[ \t\r\n]+/, "", t) + sub(/[ \t\r\n]+$/, "", t) + return t +} + +# balanced {...} group starting at i; inner content goes to BIB_PIECE, +# returns the index just past the closing brace +function bib_braced(s, i, depth, start, c) { + start = i + depth = 0 + while (i <= length(s)) { + c = substr(s, i, 1) + i++ + if (c == "{") + depth++ + else if (c == "}") { + depth-- + if (depth == 0) + break + } + } + BIB_PIECE = substr(s, start + 1, i - start - 2) + return i +} + +# "..." group starting at i; braces protect embedded quotes +function bib_quoted(s, i, depth, start, c) { + start = i + i++ + depth = 0 + while (i <= length(s)) { + c = substr(s, i, 1) + if (c == "{") + depth++ + else if (c == "}") + depth-- + else if (c == "\"" && depth == 0) { + i++ + break + } + i++ + } + BIB_PIECE = substr(s, start + 1, i - start - 2) + return i +} + +# skip a balanced op...cl group starting at i (i must be at op) +function bib_skip_group(s, i, op, cl, depth, c) { + depth = 0 + while (i <= length(s)) { + c = substr(s, i, 1) + i++ + if (c == op) + depth++ + else if (c == cl) { + depth-- + if (depth == 0) + break + } + } + return i +} + +# field value at i, handling # concatenation; sets BIB_VALUE and +# BIB_VKIND, returns the index just past the value +function bib_value(s, i, start, c, piece, pieces, kind) { + start = i + pieces = 0 + kind = "" + BIB_VALUE = "" + while (1) { + c = substr(s, i, 1) + if (c == "{") { + i = bib_braced(s, i) + BIB_VALUE = BIB_VALUE BIB_PIECE + if (kind == "") + kind = "s" + } else if (c == "\"") { + i = bib_quoted(s, i) + BIB_VALUE = BIB_VALUE BIB_PIECE + if (kind == "") + kind = "s" + } else { + piece = "" + while (i <= 
length(s) && substr(s, i, 1) !~ /[,#}) \t\r\n]/) { + piece = piece substr(s, i, 1) + i++ + } + BIB_VALUE = BIB_VALUE piece + kind = (piece ~ /^[0-9]+$/) ? "n" : "r" + } + pieces++ + i = bib_ws(s, i) + if (substr(s, i, 1) == "#") + i = bib_ws(s, i + 1) + else + break + } + if (pieces > 1) + kind = "r" + if (kind == "r") + BIB_VALUE = bib_trim(substr(s, start, i - start)) + BIB_VKIND = kind + return i +} + +# parse the construct whose "@" is at i; returns the index past it +function bib_entry_at(s, i, at, type, opener, closer, key, name, c) { + at = i + i++ + type = "" + while (i <= length(s) && substr(s, i, 1) ~ /[A-Za-z]/) { + type = type substr(s, i, 1) + i++ + } + type = tolower(type) + i = bib_ws(s, i) + c = substr(s, i, 1) + if (c == "{") { + opener = "{" + closer = "}" + } else if (c == "(") { + opener = "(" + closer = ")" + } else + return i # stray @, not an entry + + if (type == "comment") + return bib_skip_group(s, i, opener, closer) + if (type == "string" || type == "preamble") { + i = bib_skip_group(s, i, opener, closer) + bib_pass(bib_trim(substr(s, at, i - at))) + return i + } + + i++ # consume opener + i = bib_ws(s, i) + key = "" + while (i <= length(s) && substr(s, i, 1) !~ /[, \t\r\n})]/) { + key = key substr(s, i, 1) + i++ + } + i = bib_ws(s, i) + if (substr(s, i, 1) == ",") + i++ + + BIB_N = 0 + while (1) { + i = bib_ws(s, i) + c = substr(s, i, 1) + if (c == "" || c == closer) { + if (c == closer) + i++ + break + } + if (c == ",") { + i++ + continue + } + name = "" + while (i <= length(s) && substr(s, i, 1) !~ /[=, \t\r\n})]/) { + name = name substr(s, i, 1) + i++ + } + i = bib_ws(s, i) + if (substr(s, i, 1) != "=") { # malformed; skip a char and resync + i++ + continue + } + i = bib_ws(s, i + 1) + i = bib_value(s, i) + BIB_N++ + BIB_NAME[BIB_N] = tolower(name) + BIB_VAL[BIB_N] = BIB_VALUE + BIB_KIND[BIB_N] = BIB_VKIND + } + BIB_RAW = bib_trim(substr(s, at, i - at)) + bib_entry(type, key) + return i +} diff --git a/lib/bib-select.awk 
b/lib/bib-select.awk new file mode 100644 index 0000000..1900390 --- /dev/null +++ b/lib/bib-select.awk @@ -0,0 +1,29 @@ +# bib-select.awk - emit entries selected by key, canonically +# +# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v): +# keys - comma-separated list of entry keys +# invert - 0: emit entries whose key is in the list +# 1: emit entries whose key is NOT in the list +# +# With keys="" and invert=1 this acts as a canonicalizing filter for +# everything. @string and @preamble blocks always pass through. + +BEGIN { + bib_sel_n = split(keys, bib_sel_k, ",") + for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++) + BIB_SEL[bib_sel_k[bib_sel_i]] = 1 +} + +function bib_pass(raw) { + if (bib_out_n++) + print "" + print raw +} + +function bib_entry(type, key) { + if ((key in BIB_SEL) != invert + 0) { + if (bib_out_n++) + print "" + bib_emit(type, key) + } +} diff --git a/lib/bib2ref.awk b/lib/bib2ref.awk new file mode 100644 index 0000000..4d9e595 --- /dev/null +++ b/lib/bib2ref.awk @@ -0,0 +1,52 @@ +# bib2ref.awk - convert bibtex entries to refer records +# +# Requires bib-parse.awk and bib-canon.awk. + +function bib_pass(raw) { } + +function r_field(tag, v) { + if (v != "") { + gsub(/[{}]/, "", v) + gsub(/[ \t\r\n]+/, " ", v) + printf "%%%s %s\n", tag, bib_trim(v) + } +} + +function r_names(tag, v, n, parts, i) { + gsub(/[{}]/, "", v) + gsub(/[ \t\r\n]+/, " ", v) + n = split(v, parts, / +[Aa][Nn][Dd] +/) + for (i = 1; i <= n; i++) + if (bib_trim(parts[i]) != "") + printf "%%%s %s\n", tag, bib_trim(parts[i]) +} + +function bib_entry(type, key, d, p, m) { + if (bib_out_n++) + print "" + r_names("A", bib_get("author")) + r_names("E", bib_get("editor")) + r_field("T", bib_get("title")) + r_field("J", bib_get("journal")) + r_field("B", bib_get("booktitle")) + d = bib_get("year") + m = bib_get("month") + if (m != "") + d = (d != "") ? 
m " " d : m
+    r_field("D", d)
+    r_field("V", bib_get("volume"))
+    r_field("N", bib_get("number"))
+    p = bib_get("pages")
+    gsub(/--/, "-", p)
+    r_field("P", p)
+    if (bib_get("publisher") != "")
+        r_field("I", bib_get("publisher"))
+    else if (bib_get("institution") != "")
+        r_field("I", bib_get("institution"))
+    else if (bib_get("school") != "")
+        r_field("I", bib_get("school"))
+    r_field("C", bib_get("address"))
+    r_field("K", bib_get("keywords"))
+    r_field("X", bib_get("abstract"))
+    r_field("O", bib_get("note"))
+}
diff --git a/lib/ref2bib.awk b/lib/ref2bib.awk
new file mode 100644
index 0000000..422fdd7
--- /dev/null
+++ b/lib/ref2bib.awk
@@ -0,0 +1,125 @@
+# ref2bib.awk - convert refer records to bibtex entries
+#
+# Standalone (does not use bib-parse.awk). Records are separated by
+# blank lines. Output keys are FIXME; pipe through bib-key.
+
+# paragraph mode: blank-line separated records, one field per line
+BEGIN {
+    RS = ""
+    FS = "\n"
+}
+
+# strip leading and trailing blanks, tabs and carriage returns
+function r_trim(t) {
+    sub(/^[ \t\r]+/, "", t)
+    sub(/[ \t\r]+$/, "", t)
+    return t
+}
+
+# print one bibtex field line; empty values are skipped
+function r_emit(name, v) {
+    if (v != "")
+        printf " %s = {%s},\n", name, v
+}
+
+# one refer record in, one bibtex entry out
+{
+    # gather the record: %A and %E may repeat and are collected in A[]
+    # and E[]; any other %<tag> is stored in val[tag].  A line that does
+    # not start with % continues the value of the tag before it.
+    split("", val)
+    na = 0
+    ne = 0
+    split("", A)
+    split("", E)
+    lasttag = ""
+    for (i = 1; i <= NF; i++) {
+        line = $i
+        if (substr(line, 1, 1) == "%") {
+            tag = substr(line, 2, 1)
+            v = r_trim(substr(line, 3))
+            if (tag == "A")
+                A[++na] = v
+            else if (tag == "E")
+                E[++ne] = v
+            else
+                val[tag] = v
+            lasttag = tag
+        } else if (lasttag == "A")
+            A[na] = A[na] " " r_trim(line)
+        else if (lasttag == "E")
+            E[ne] = E[ne] " " r_trim(line)
+        else if (lasttag != "")
+            val[lasttag] = val[lasttag] " " r_trim(line)
+    }
+    # skip records with no author, editor or title
+    if (na == 0 && ne == 0 && !("T" in val))
+        next
+
+    # guess an entry type from the fields present
+    if ("J" in val)
+        type = "article"
+    else if ("B" in val)
+        type = (val["B"] ~ /[Pp]roceedings|[Cc]onference|[Ss]ymposium|[Ww]orkshop/) \
+            ? "inproceedings" : "incollection"
+    else if ("R" in val)
+        type = "techreport"
+    else if ("I" in val)
+        type = "book"
+    else
+        type = "misc"
+
+    if (out_n++)
+        print ""
+    printf "@%s{FIXME,\n", type
+
+    authors = ""
+    for (i = 1; i <= na; i++)
+        authors = (i == 1) ? A[i] : authors " and " A[i]
+    r_emit("author", authors)
+    editors = ""
+    for (i = 1; i <= ne; i++)
+        editors = (i == 1) ? E[i] : editors " and " E[i]
+    r_emit("editor", editors)
+
+    r_emit("title", val["T"])
+    r_emit("journal", val["J"])
+    r_emit("booktitle", val["B"])
+
+    # the first 4-digit run in %D is the year, whatever else is there
+    # becomes the month
+    d = val["D"]
+    if (match(d, /[0-9][0-9][0-9][0-9]/)) {
+        r_emit("year", substr(d, RSTART, 4))
+        m = r_trim(substr(d, 1, RSTART - 1) substr(d, RSTART + 4))
+        if (m != "")
+            r_emit("month", m)
+    } else
+        r_emit("year", d)
+
+    r_emit("volume", val["V"])
+    r_emit("number", val["N"])
+    p = val["P"]
+    gsub(/-+/, "--", p)     # any run of dashes becomes a bibtex "--"
+    r_emit("pages", p)
+    r_emit(type == "techreport" ? "institution" : "publisher", val["I"])
+    r_emit("address", val["C"])
+
+    # %R (report number) becomes the number field unless %N already
+    # supplied one; then it is folded into the note, together with %O,
+    # so that the entry never carries two note fields.  The test is on
+    # the value, not `"N" in val`: the val["N"] reference above has
+    # already created that element, so membership would always hold.
+    note = val["O"]
+    rep = val["R"]
+    if (rep != "") {
+        if (val["N"] != "")
+            note = (note != "") ? rep "; " note : rep
+        else
+            r_emit("number", rep)
+    }
+    r_emit("keywords", val["K"])
+    r_emit("abstract", val["X"])
+    r_emit("note", note)
+    print "}"
+}
diff --git a/tests/integration.sh b/tests/integration.sh
new file mode 100755
index 0000000..ea847e4
--- /dev/null
+++ b/tests/integration.sh
@@ -0,0 +1,123 @@
+#!/bin/sh
+# integration.sh - end-to-end test against a real LaTeX document
+#
+# Requires pdflatex and bibtex; skipped otherwise. Set BIBTEST_NET=1 to
+# also exercise bib-fetch against doi.org (needs network access).
+
+ROOT=$(cd "$(dirname "$0")/.." 
&& pwd) +PATH=$ROOT:$PATH +LSKEYS="awk -f $ROOT/lib/bib-parse.awk -f $ROOT/lib/bib-lskeys.awk" + +command -v pdflatex > /dev/null 2>&1 && command -v bibtex > /dev/null 2>&1 || { + printf 'integration: pdflatex/bibtex not found, skipping\n' >&2 + exit 0 +} + +tmpd=$(mktemp -d) || exit 1 +trap 'rm -rf "$tmpd"' EXIT INT TERM +cd "$tmpd" || exit 1 + +pass=0 +fail=0 +ok() { pass=$((pass + 1)); printf 'ok - %s\n' "$1"; } +not_ok() { fail=$((fail + 1)); printf 'FAIL - %s\n' "$1"; } + +# ---- build a database with bib-gen | bib-add --------------------------- +bib-gen -t article author='Donald E. Knuth' title='Literate Programming' \ + journal='The Computer Journal' year=1984 volume=27 number=2 \ + pages='97--111' | bib-add master.bib +bib-gen -t article author='Alan M. Turing' \ + title='Computing Machinery and Intelligence' journal='Mind' year=1950 \ + volume=59 pages='433--460' | bib-add master.bib +printf 'Claude E. Shannon\tA Mathematical Theory of Communication\tBell System Technical Journal\t1948 +Edsger W. Dijkstra\tGo To Statement Considered Harmful\tCommunications of the ACM\t1968 +' | bib-gen -F author,title,journal,year | bib-add master.bib + +n=$($LSKEYS master.bib | wc -l) +[ "$n" -eq 4 ] && ok "database built with 4 entries" \ + || not_ok "database built with 4 entries (got $n)" + +# ---- compile a document citing a subset -------------------------------- +cat > paper.tex <<'EOF' +\documentclass{article} +\begin{document} +Machines may think~\cite{turing1950computing}; programs are +literature~\cite{knuth1984literate}. 
+ +DOI: 10.1093/comjnl/27.2.97 +\bibliographystyle{plain} +\bibliography{master} +\end{document} +EOF +pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1 + +grep -q 'citation{turing1950computing}' paper.aux \ + && ok "pdflatex produced citations in aux" \ + || not_ok "pdflatex produced citations in aux" + +# ---- extract the cited subset and build against it --------------------- +bib-extract paper.aux master.bib > paper.bib +n=$($LSKEYS paper.bib | wc -l) +[ "$n" -eq 2 ] && ok "bib-extract kept the 2 cited entries" \ + || not_ok "bib-extract kept the 2 cited entries (got $n)" + +sed 's/\\bibdata{master}/\\bibdata{paper}/' paper.aux > tmp.aux \ + && mv tmp.aux paper.aux +bibtex paper > bibtex.log 2>&1 +grep -qi 'error\|warning' bibtex.log \ + && not_ok "bibtex accepts canonical output cleanly" \ + || ok "bibtex accepts canonical output cleanly" + +pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1 +pdflatex -interaction=batchmode paper.tex > /dev/null 2>&1 +if grep -qi 'undefined' paper.log; then + not_ok "document resolves all citations" +else + ok "document resolves all citations" +fi +[ -s paper.pdf ] && ok "pdf produced" || not_ok "pdf produced" + +# ---- convert roundtrip -------------------------------------------------- +bib-convert master.bib | bib-convert > roundtrip.bib +if [ "$($LSKEYS master.bib | sort)" = "$($LSKEYS roundtrip.bib | sort)" ]; then + ok "bibtex -> refer -> bibtex preserves all keys" +else + not_ok "bibtex -> refer -> bibtex preserves all keys" +fi + +# ---- bib-fetch against the built pdf (network) -------------------------- +if [ "$BIBTEST_NET" = 1 ]; then + if bib-fetch paper.pdf > fetched.bib 2> /dev/null; then + grep -q '^@article{knuth1984literate,' fetched.bib \ + && ok "bib-fetch resolves DOI from built pdf" \ + || not_ok "bib-fetch resolves DOI from built pdf" + if bib-fetch paper.pdf 2> /dev/null | bib-add master.bib 2> /dev/null; then + not_ok "fetched entry detected as duplicate" + else + ok "fetched entry 
detected as duplicate" + fi + else + not_ok "bib-fetch resolves DOI from built pdf" + fi + bib-fetch -a 1706.03762 2> /dev/null \ + | grep -q '^@misc{vaswani[0-9]*attention,' \ + && ok "bib-fetch resolves arXiv id" \ + || not_ok "bib-fetch resolves arXiv id" + cat > arx.tex <<'EOF' +\documentclass{article} +\begin{document} +A preprint without any DOI. + +arXiv:1706.03762v7 [cs.CL] 2 Aug 2023 +\end{document} +EOF + pdflatex -interaction=batchmode arx.tex > /dev/null 2>&1 + bib-fetch arx.pdf 2> /dev/null | grep -q 'eprint = {1706.03762}' \ + && ok "bib-fetch extracts arXiv id from pdf" \ + || not_ok "bib-fetch extracts arXiv id from pdf" +else + printf 'skip - bib-fetch network tests (set BIBTEST_NET=1 to enable)\n' +fi + +printf '\n%d passed, %d failed\n' "$pass" "$fail" +[ "$fail" -eq 0 ] diff --git a/tests/run-tests.sh b/tests/run-tests.sh new file mode 100755 index 0000000..70721db --- /dev/null +++ b/tests/run-tests.sh @@ -0,0 +1,187 @@ +#!/bin/sh +# run-tests.sh - test suite for bibutils + +ROOT=$(cd "$(dirname "$0")/.." && pwd) +PATH=$ROOT:$PATH +tmpd=$(mktemp -d) || exit 1 +trap 'rm -rf "$tmpd"' EXIT INT TERM + +pass=0 +fail=0 + +ok() { + pass=$((pass + 1)) + printf 'ok - %s\n' "$1" +} + +not_ok() { + fail=$((fail + 1)) + printf 'FAIL - %s\n' "$1" +} + +# check description command... (passes if the command succeeds) +check() { + desc=$1 + shift + if "$@" > /dev/null 2>&1; then + ok "$desc" + else + not_ok "$desc" + fi +} + +entry='@ARTICLE{ junk-key , + AUTHOR = "Donald E. Knuth", + Title={Literate Programming}, + JOURNAL = {The Computer Journal}, + Year = 1984, volume={27}, + pages = {97--111} +}' + +# ---- bib-key ---------------------------------------------------------- +out=$(printf '%s\n' "$entry" | bib-key) +check "bib-key generates surname-year-word key" \ + sh -c "printf '%s' '$out' | grep -q '^@article{knuth1984literate,'" + +# key collisions get letter suffixes +out=$(printf '@inproceedings{a, author={J. 
Smith}, title={Fast Trees}, year=2020} +@article{b, author={J. Smith}, title={Fast Trees Extended}, year=2020} +@article{c, author={J. Smith}, title={Fast Tree Methods}, year=2020}\n' | bib-key) +check "bib-key disambiguates colliding keys" \ + sh -c "printf '%s' '$out' | grep -q '{smith2020fast,' && + printf '%s' '$out' | grep -q '{smith2020fastb,' && + printf '%s' '$out' | grep -q '{smith2020fastc,'" + +# ---- canonicalization via bib-add ------------------------------------- +db=$tmpd/refs.bib +printf '%s\n' "$entry" | bib-add "$db" +check "bib-add creates database" test -s "$db" +check "bib-add lowercases field names" grep -q ' author = {Donald E. Knuth},' "$db" +check "bib-add collapses whitespace in values" \ + grep -q ' title = {Literate Programming},' "$db" +check "bib-add keeps bare numbers bare" grep -q ' year = 1984,' "$db" + +# duplicate detection +if printf '%s\n' "$entry" | bib-add "$db" 2> /dev/null; then + not_ok "bib-add rejects duplicate key" +else + ok "bib-add rejects duplicate key" +fi + +# forced replacement +printf '%s\n' "$entry" | sed 's/1984/1985/' | bib-add -f "$db" +check "bib-add -f replaces entry" grep -q ' year = 1985,' "$db" +n=$(grep -c '^@article{junk-key,' "$db") +[ "$n" = 1 ] && ok "bib-add -f leaves one copy" || not_ok "bib-add -f leaves one copy" + +# ---- bib-extract ------------------------------------------------------- +cat > "$tmpd/all.bib" <<'EOF' +@article{alpha2020one, author = {A. Alpha}, title = {One}, year = 2020} +@article{beta2021two, author = {B. Beta}, title = {Two}, year = 2021} +@article{gamma2022three, author = {C. 
Gamma}, title = {Three}, year = 2022} +EOF +cat > "$tmpd/doc.aux" <<'EOF' +\relax +\citation{alpha2020one} +\citation{gamma2022three,alpha2020one} +\bibstyle{plain} +EOF +out=$(bib-extract "$tmpd/doc.aux" "$tmpd/all.bib") +check "bib-extract keeps cited entries" \ + sh -c "printf '%s' '$out' | grep -q alpha2020one" +check "bib-extract keeps all cited entries" \ + sh -c "printf '%s' '$out' | grep -q gamma2022three" +if printf '%s' "$out" | grep -q beta2021two; then + not_ok "bib-extract drops uncited entries" +else + ok "bib-extract drops uncited entries" +fi + +# ---- bib-convert ------------------------------------------------------- +out=$(printf '%s\n' "$entry" | bib-convert) +check "bib-convert emits refer author" \ + sh -c "printf '%s' '$out' | grep -q '^%A Donald E. Knuth$'" +check "bib-convert emits refer pages with single dash" \ + sh -c "printf '%s' '$out' | grep -q '^%P 97-111$'" + +cat > "$tmpd/rec.ref" <<'EOF' +%A Alan M. Turing +%T Computing Machinery and Intelligence +%J Mind +%D 1950 +%V 59 +%P 433-460 +EOF +out=$(bib-convert "$tmpd/rec.ref") +check "bib-convert refer->bibtex type guess" \ + sh -c "printf '%s' '$out' | grep -q '^@article{turing1950computing,'" +check "bib-convert refer->bibtex pages" \ + sh -c "printf '%s' '$out' | grep -q ' pages = {433--460},'" + +# ---- bib-gen ----------------------------------------------------------- +out=$(bib-gen -t book author='Xavier Yu' title='Some Title' year=2001 publisher='Pub') +check "bib-gen argument mode" \ + sh -c "printf '%s' '$out' | grep -q '^@book{yu2001some,'" + +out=$(printf 'A. 
Author\tNeat Paper\tGood Journal\t1999\n' \ + | bib-gen -F author,title,journal,year) +check "bib-gen batch mode" \ + sh -c "printf '%s' '$out' | grep -q '^@article{author1999neat,'" + +# ---- bib-ls ------------------------------------------------------------ +out=$(bib-ls "$tmpd/all.bib") +check "bib-ls lists keys" \ + sh -c "[ \"\$(printf '%s\n' '$out' | wc -l)\" = 3 ]" +out=$(bib-ls -l "$tmpd/all.bib") +check "bib-ls -l shows details" \ + sh -c "printf '%s' '$out' | grep -q 'beta2021two article B. Beta 2021 Two'" + +# ---- bib-check --------------------------------------------------------- +cat > "$tmpd/bad.bib" <<'EOF' +@article{good2020fine, author = {A. Good}, title = {Fine}, journal = {J}, year = 2020} +@article{noj2020sad, author = {B. Sad}, title = {No Journal Here}, year = 2020} +@misc{noj2020sad, title = {Dup Key}} +@book{dup2021title, author = {C. Dup}, title = {FINE!}, publisher = {P}, year = 2021} +EOF +out=$(bib-check "$tmpd/bad.bib") +if [ $? -ne 0 ]; then ok "bib-check exits nonzero on problems"; else not_ok "bib-check exits nonzero on problems"; fi +check "bib-check finds missing field" \ + sh -c "printf '%s' '$out' | grep -q 'noj2020sad: missing required field: journal'" +check "bib-check finds duplicate key" \ + sh -c "printf '%s' '$out' | grep -q 'noj2020sad: duplicate key'" +check "bib-check finds duplicate title" \ + sh -c "printf '%s' '$out' | grep -q 'dup2021title: title duplicates good2020fine'" +cat > "$tmpd/clean.bib" <<'EOF' +@article{a2020x, author = {A. 
A}, title = {X}, journal = {J}, year = 2020} +@misc{b2021y, title = {Y}} +EOF +check "bib-check passes a clean db" bib-check "$tmpd/clean.bib" + +# ---- biblatex aux ------------------------------------------------------ +cat > "$tmpd/bl.aux" <<'EOF' +\abx@aux@refcontext{nty/global//global/global} +\abx@aux@cite{0}{beta2021two} +EOF +out=$(bib-extract "$tmpd/bl.aux" "$tmpd/all.bib") +check "bib-extract reads biblatex aux" \ + sh -c "printf '%s' '$out' | grep -q beta2021two" + +# ---- bib-util ---------------------------------------------------------- +out=$(printf '%s\n' "$entry" | bib-util key) +check "bib-util dispatches" \ + sh -c "printf '%s' '$out' | grep -q knuth1984literate" + +# ---- @string passthrough ----------------------------------------------- +cat > "$tmpd/str.bib" <<'EOF' +@string{cj = {The Computer Journal}} +@article{knuth1984literate, author = {D. Knuth}, journal = cj, year = 1984} +EOF +out=$(printf '\\citation{knuth1984literate}\n' > "$tmpd/s.aux"; \ + bib-extract "$tmpd/s.aux" "$tmpd/str.bib") +check "bib-extract passes @string through" \ + sh -c "printf '%s' '$out' | grep -q '@string{cj'" +check "macro field stays raw" \ + sh -c "printf '%s' '$out' | grep -q ' journal = cj,'" + +printf '\n%d passed, %d failed\n' "$pass" "$fail" +[ "$fail" -eq 0 ] |