Initial implementation (only a few years later!)

This is pure Claude. I'd written out the plan for this suite of scripts eons ago, but never found the time to actually do it. Remembered it this morning, pointed Claude at the README, and had something that appears to work in minutes. Caveat emptor: the design is mine, but the code is purely LLM generated at this point.
author: Douglas B. Rumbaugh <doug@douglasrumbaugh.com> 2026-06-06 12:02:41 -0400
committer: Douglas B. Rumbaugh <doug@douglasrumbaugh.com> 2026-06-06 12:02:41 -0400
commit: eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2 (patch)
tree: 626d64c3574cfbc7cc38eae6d142ef22b21cf59b /lib
parent: 8351a1da3f56cde9939b934bc5533a95aff1c95e (diff)
download: bibutils-eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2.tar.gz
9 files changed, 604 insertions, 0 deletions
diff --git a/lib/bib-canon.awk b/lib/bib-canon.awk
new file mode 100644
index 0000000..d11e9cb
--- /dev/null
+++ b/lib/bib-canon.awk
@@ -0,0 +1,28 @@
+# bib-canon.awk - canonical output helpers for bibutils
+#
+# Requires bib-parse.awk. Provides bib_emit() to print the current
+# entry in canonical form, and bib_get() to look up a field value.
+
+# print the current entry canonically: lowercase type and field names,
+# 2-space indent, brace-delimited values with whitespace collapsed
+function bib_emit(type, key,    j, v) {
+  printf "@%s{%s,\n", type, key
+  for (j = 1; j <= BIB_N; j++) {
+    v = BIB_VAL[j]
+    if (BIB_KIND[j] == "s") {
+      gsub(/[ \t\r\n]+/, " ", v)
+      v = bib_trim(v)
+      printf "  %s = {%s},\n", BIB_NAME[j], v
+    } else
+      printf "  %s = %s,\n", BIB_NAME[j], v
+  }
+  print "}"
+}
+
+# value of field `name` (lowercase) in the current entry, "" if absent
+function bib_get(name,    j) {
+  for (j = 1; j <= BIB_N; j++)
+    if (BIB_NAME[j] == name)
+      return BIB_VAL[j]
+  return ""
+}
diff --git a/lib/bib-check.awk b/lib/bib-check.awk
new file mode 100644
index 0000000..4411a55
--- /dev/null
+++ b/lib/bib-check.awk
@@ -0,0 +1,69 @@
+# bib-check.awk - lint a bibtex database
+#
+# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
+# line on stdout:
+#   - missing fields required by the entry type
+#   - duplicate keys
+#   - entries whose titles normalize to the same string (likely dups)
+#   - empty field values
+# Exits 1 if any problem was found.
+
+BEGIN {
+  REQ["article"] = "author title journal year"
+  REQ["book"] = "author|editor title publisher year"
+  REQ["booklet"] = "title"
+  REQ["inbook"] = "author|editor title publisher year"
+  REQ["incollection"] = "author title booktitle publisher year"
+  REQ["inproceedings"] = "author title booktitle year"
+  REQ["conference"] = "author title booktitle year"
+  REQ["manual"] = "title"
+  REQ["mastersthesis"] = "author title school year"
+  REQ["phdthesis"] = "author title school year"
+  REQ["proceedings"] = "title year"
+  REQ["techreport"] = "author title institution year"
+  REQ["unpublished"] = "author title note"
+}
+
+function bib_pass(raw) { }
+
+function problem(key, msg) {
+  printf "%s: %s\n", key, msg
+  BIB_BAD = 1
+}
+
+function bib_entry(type, key,    n, req, i, alts, na, j, found, t, k) {
+  if (key in BIB_KEYS_SEEN)
+    problem(key, "duplicate key")
+  BIB_KEYS_SEEN[key] = 1
+
+  # required fields ("a|b" means at least one of a, b)
+  if (type in REQ) {
+    n = split(REQ[type], req, " ")
+    for (i = 1; i <= n; i++) {
+      na = split(req[i], alts, "|")
+      found = 0
+      for (j = 1; j <= na; j++)
+        if (bib_get(alts[j]) != "")
+          found = 1
+      if (!found)
+        problem(key, "missing required field: " req[i])
+    }
+  }
+
+  # empty values
+  for (i = 1; i <= BIB_N; i++)
+    if (bib_trim(BIB_VAL[i]) == "")
+      problem(key, "empty field: " BIB_NAME[i])
+
+  # likely duplicate entries: same normalized title
+  t = tolower(bib_get("title"))
+  gsub(/[^a-z0-9]/, "", t)
+  if (t != "") {
+    if (t in BIB_TITLES_SEEN)
+      problem(key, "title duplicates " BIB_TITLES_SEEN[t])
+    else
+      BIB_TITLES_SEEN[t] = key
+  }
+}
+
+END { exit BIB_BAD }
diff --git a/lib/bib-key.awk b/lib/bib-key.awk
new file mode 100644
index 0000000..41534ba
--- /dev/null
+++ b/lib/bib-key.awk
@@ -0,0 +1,69 @@
+# bib-key.awk - rekey every entry with a generated citation key
+#
+# Requires bib-parse.awk and bib-canon.awk. Keys have the form
+# <surname><year><word>, e.g. knuth1984literate.
+
+function bib_pass(raw) {
+  if (bib_out_n++)
+    print ""
+  print raw
+}
+
+function bib_entry(type, key,    k, n) {   # hook: re-emit the entry under a generated key
+  if (bib_out_n++)                         # blank line between output blocks, not before the first
+    print ""
+  k = bib_mkkey()
+  # disambiguate collisions with b, c, ... suffixes (2nd use -> "b", 26th -> "z")
+  if (k in BIB_KEYS_SEEN) {
+    n = ++BIB_KEYS_SEEN[k]
+    k = k substr("bcdefghijklmnopqrstuvwxyz", n - 1, 1)   # NOTE(review): "" from the 27th use on
+  } else
+    BIB_KEYS_SEEN[k] = 1                   # NOTE(review): suffixed keys are never recorded here
+  bib_emit(type, k)
+}
+
+# Build the key <surname><year><word> for the current entry from its author
+# (or editor), year and title; falls back to "anon" / "" for missing parts.
+function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
+  a = bib_get("author")
+  if (a == "")
+    a = bib_get("editor")
+  y = bib_get("year")
+  t = bib_get("title")
+
+  # surname of the first author (collapse whitespace first: lists wrap lines)
+  gsub(/[ \t\r\n]+/, " ", a)
+  if (match(a, / [Aa][Nn][Dd] /))
+    a = substr(a, 1, RSTART - 1)
+  gsub(/[{}]/, "", a)
+  a = bib_trim(a)
+  if (index(a, ",") > 0)
+    surname = substr(a, 1, index(a, ",") - 1)    # "Last, First"
+  else {
+    n = split(a, parts, /[ \t\r\n]+/)            # "First Last"
+    surname = (n > 0) ? parts[n] : ""
+  }
+  gsub(/[^A-Za-z0-9]/, "", surname)
+  surname = tolower(surname)
+  if (surname == "")
+    surname = "anon"
+
+  # first run of four digits in the year field, "" if there is none
+  y = match(y, /[0-9][0-9][0-9][0-9]/) ? substr(y, RSTART, 4) : ""
+
+  # first significant word of the title (skips common stop words)
+  gsub(/[{}]/, "", t)
+  word = ""
+  n = split(tolower(t), parts, /[^a-z0-9]+/)
+  for (i = 1; i <= n; i++) {
+    w = parts[i]
+    if (w == "" || w == "a" || w == "an" || w == "the" || w == "on" ||
+        w == "of" || w == "in" || w == "for" || w == "and" || w == "to" ||
+        w == "with" || w == "from" || w == "by" || w == "at" || w == "is")
+      continue
+    word = w
+    break
+  }
+
+  return surname y word
+}
diff --git a/lib/bib-ls.awk b/lib/bib-ls.awk
new file mode 100644
index 0000000..909b654
--- /dev/null
+++ b/lib/bib-ls.awk
@@ -0,0 +1,25 @@
+# bib-ls.awk - list database entries
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+#   long - 0: print one key per line
+#          1: print key, type, author, year and title, tab-separated
+
+function bib_pass(raw) { }
+
+function bib_entry(type, key,    a, t) {
+  if (long + 0 == 0) {
+    print key
+    return
+  }
+  a = bib_get("author")
+  if (a == "")
+    a = bib_get("editor")
+  gsub(/[{}]/, "", a)
+  gsub(/[ \t\r\n]+/, " ", a)
+  if (match(a, / [Aa][Nn][Dd] /))
+    a = substr(a, 1, RSTART - 1) " et al."
+  t = bib_get("title")
+  gsub(/[{}]/, "", t)
+  gsub(/[ \t\r\n]+/, " ", t)
+  printf "%s\t%s\t%s\t%s\t%s\n", key, type, a, bib_get("year"), t
+}
diff --git a/lib/bib-lskeys.awk b/lib/bib-lskeys.awk
new file mode 100644
index 0000000..1932ced
--- /dev/null
+++ b/lib/bib-lskeys.awk
@@ -0,0 +1,9 @@
+# bib-lskeys.awk - print the key of every entry, one per line
+#
+# Requires bib-parse.awk.
+
+function bib_pass(raw) { }
+
+function bib_entry(type, key) {
+  print key
+}
diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk
new file mode 100644
index 0000000..e5bf9fa
--- /dev/null
+++ b/lib/bib-parse.awk
@@ -0,0 +1,216 @@
+# bib-parse.awk - shared bibtex parsing library for bibutils
+#
+# Consumers must define two hook functions:
+#   bib_entry(type, key) - called once per regular entry. The fields are
+#                          available in BIB_N, BIB_NAME[], BIB_VAL[] and
+#                          BIB_KIND[]; the raw source text of the entry
+#                          is in BIB_RAW.
+#   bib_pass(raw)        - called for @string and @preamble blocks with
+#                          their raw source text.
+#
+# BIB_KIND[j] is "s" for ordinary string values (content stored without
+# delimiters; re-wrap in braces on output), "n" for bare numbers, and
+# "r" for raw values (macros, # concatenation) which should be emitted
+# verbatim.
+
+{ bib_buf = bib_buf $0 "\n" }
+
+END { bib_main(bib_buf) }
+
+function bib_main(s,    i) {
+  i = 1
+  while (i <= length(s)) {
+    if (substr(s, i, 1) == "@")
+      i = bib_entry_at(s, i)
+    else
+      i++
+  }
+}
+
+function bib_ws(s, i) {
+  while (i <= length(s) && substr(s, i, 1) ~ /[ \t\r\n]/)
+    i++
+  return i
+}
+
+function bib_trim(t) {
+  sub(/^[ \t\r\n]+/, "", t)
+  sub(/[ \t\r\n]+$/, "", t)
+  return t
+}
+
+# balanced {...} group starting at i; inner content goes to BIB_PIECE,
+# returns the index just past the closing brace
+function bib_braced(s, i,    depth, start, c) {
+  start = i
+  depth = 0
+  while (i <= length(s)) {
+    c = substr(s, i, 1)
+    i++
+    if (c == "{")
+      depth++
+    else if (c == "}") {
+      depth--
+      if (depth == 0)
+        break
+    }
+  }
+  BIB_PIECE = substr(s, start + 1, i - start - 2)
+  return i
+}
+
+# "..." group starting at i; braces protect embedded quotes
+function bib_quoted(s, i,    depth, start, c) {
+  start = i
+  i++
+  depth = 0
+  while (i <= length(s)) {
+    c = substr(s, i, 1)
+    if (c == "{")
+      depth++
+    else if (c == "}")
+      depth--
+    else if (c == "\"" && depth == 0) {
+      i++
+      break
+    }
+    i++
+  }
+  BIB_PIECE = substr(s, start + 1, i - start - 2)
+  return i
+}
+
+# skip a balanced op...cl group starting at i (i must be at op)
+function bib_skip_group(s, i, op, cl,    depth, c) {
+  depth = 0
+  while (i <= length(s)) {
+    c = substr(s, i, 1)
+    i++
+    if (c == op)
+      depth++
+    else if (c == cl) {
+      depth--
+      if (depth == 0)
+        break
+    }
+  }
+  return i
+}
+
+# field value at i, handling # concatenation; sets BIB_VALUE and
+# BIB_VKIND, returns the index just past the value
+function bib_value(s, i,    start, c, piece, pieces, kind) {
+  start = i                        # remembered so raw values can be re-sliced from the source
+  pieces = 0
+  kind = ""
+  BIB_VALUE = ""
+  while (1) {
+    c = substr(s, i, 1)
+    if (c == "{") {
+      i = bib_braced(s, i)
+      BIB_VALUE = BIB_VALUE BIB_PIECE    # content only, outer braces dropped
+      if (kind == "")
+        kind = "s"
+    } else if (c == "\"") {
+      i = bib_quoted(s, i)
+      BIB_VALUE = BIB_VALUE BIB_PIECE    # content only, quotes dropped
+      if (kind == "")
+        kind = "s"
+    } else {
+      piece = ""                   # bare token: runs up to , # } ) or whitespace
+      while (i <= length(s) && substr(s, i, 1) !~ /[,#}) \t\r\n]/) {
+        piece = piece substr(s, i, 1)
+        i++
+      }
+      BIB_VALUE = BIB_VALUE piece
+      kind = (piece ~ /^[0-9]+$/) ? "n" : "r"   # all digits -> number, otherwise raw
+    }
+    pieces++
+    i = bib_ws(s, i)
+    if (substr(s, i, 1) == "#")    # concatenation: skip the operator and read another piece
+      i = bib_ws(s, i + 1)
+    else
+      break
+  }
+  if (pieces > 1)                  # any concatenation is treated as raw
+    kind = "r"
+  if (kind == "r")                 # raw values keep their source text (delimiters and # included)
+    BIB_VALUE = bib_trim(substr(s, start, i - start))
+  BIB_VKIND = kind
+  return i
+}
+
+# parse the construct whose "@" is at i; returns the index past it
+function bib_entry_at(s, i,    at, type, opener, closer, key, name, c) {
+  at = i
+  i++
+  type = ""
+  while (i <= length(s) && substr(s, i, 1) ~ /[A-Za-z]/) {
+    type = type substr(s, i, 1)
+    i++
+  }
+  type = tolower(type)
+  i = bib_ws(s, i)
+  c = substr(s, i, 1)
+  if (c == "{") {
+    opener = "{"
+    closer = "}"
+  } else if (c == "(") {
+    opener = "("
+    closer = ")"
+  } else
+    return i                       # stray @, not an entry
+
+  if (type == "comment")
+    return bib_skip_group(s, i, opener, closer)
+  if (type == "string" || type == "preamble") {
+    i = bib_skip_group(s, i, opener, closer)
+    bib_pass(bib_trim(substr(s, at, i - at)))
+    return i
+  }
+
+  i++                              # consume opener
+  i = bib_ws(s, i)
+  key = ""
+  while (i <= length(s) && substr(s, i, 1) !~ /[, \t\r\n})]/) {
+    key = key substr(s, i, 1)
+    i++
+  }
+  i = bib_ws(s, i)
+  if (substr(s, i, 1) == ",")
+    i++
+
+  BIB_N = 0
+  while (1) {
+    i = bib_ws(s, i)
+    c = substr(s, i, 1)
+    if (c == "" || c == closer) {
+      if (c == closer)
+        i++
+      break
+    }
+    if (c == ",") {
+      i++
+      continue
+    }
+    name = ""
+    while (i <= length(s) && substr(s, i, 1) !~ /[=, \t\r\n})]/) {
+      name = name substr(s, i, 1)
+      i++
+    }
+    i = bib_ws(s, i)
+    if (substr(s, i, 1) != "=") {  # malformed; skip a char and resync
+      i++
+      continue
+    }
+    i = bib_ws(s, i + 1)
+    i = bib_value(s, i)
+    BIB_N++
+    BIB_NAME[BIB_N] = tolower(name)
+    BIB_VAL[BIB_N] = BIB_VALUE
+    BIB_KIND[BIB_N] = BIB_VKIND
+  }
+  BIB_RAW = bib_trim(substr(s, at, i - at))
+  bib_entry(type, key)
+  return i
+}
diff --git a/lib/bib-select.awk b/lib/bib-select.awk
new file mode 100644
index 0000000..1900390
--- /dev/null
+++ b/lib/bib-select.awk
@@ -0,0 +1,29 @@
+# bib-select.awk - emit entries selected by key, canonically
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+#   keys   - comma-separated list of entry keys
+#   invert - 0: emit entries whose key is in the list
+#            1: emit entries whose key is NOT in the list
+#
+# With keys="" and invert=1 this acts as a canonicalizing filter for
+# everything. @string and @preamble blocks always pass through.
+
+BEGIN {
+  bib_sel_n = split(keys, bib_sel_k, ",")
+  for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++)
+    BIB_SEL[bib_sel_k[bib_sel_i]] = 1
+}
+
+function bib_pass(raw) {
+  if (bib_out_n++)
+    print ""
+  print raw
+}
+
+function bib_entry(type, key) {
+  if ((key in BIB_SEL) != invert + 0) {
+    if (bib_out_n++)
+      print ""
+    bib_emit(type, key)
+  }
+}
diff --git a/lib/bib2ref.awk b/lib/bib2ref.awk
new file mode 100644
index 0000000..4d9e595
--- /dev/null
+++ b/lib/bib2ref.awk
@@ -0,0 +1,52 @@
+# bib2ref.awk - convert bibtex entries to refer records
+#
+# Requires bib-parse.awk and bib-canon.awk.
+
+function bib_pass(raw) { }
+
+function r_field(tag, v) {
+  if (v != "") {
+    gsub(/[{}]/, "", v)
+    gsub(/[ \t\r\n]+/, " ", v)
+    printf "%%%s %s\n", tag, bib_trim(v)
+  }
+}
+
+function r_names(tag, v,    n, parts, i) {
+  gsub(/[{}]/, "", v)
+  gsub(/[ \t\r\n]+/, " ", v)
+  n = split(v, parts, / +[Aa][Nn][Dd] +/)
+  for (i = 1; i <= n; i++)
+    if (bib_trim(parts[i]) != "")
+      printf "%%%s %s\n", tag, bib_trim(parts[i])
+}
+
+function bib_entry(type, key,    d, p, m) {
+  if (bib_out_n++)
+    print ""
+  r_names("A", bib_get("author"))
+  r_names("E", bib_get("editor"))
+  r_field("T", bib_get("title"))
+  r_field("J", bib_get("journal"))
+  r_field("B", bib_get("booktitle"))
+  d = bib_get("year")
+  m = bib_get("month")
+  if (m != "")
+    d = (d != "") ? m " " d : m
+  r_field("D", d)
+  r_field("V", bib_get("volume"))
+  r_field("N", bib_get("number"))
+  p = bib_get("pages")
+  gsub(/--/, "-", p)
+  r_field("P", p)
+  if (bib_get("publisher") != "")
+    r_field("I", bib_get("publisher"))
+  else if (bib_get("institution") != "")
+    r_field("I", bib_get("institution"))
+  else if (bib_get("school") != "")
+    r_field("I", bib_get("school"))
+  r_field("C", bib_get("address"))
+  r_field("K", bib_get("keywords"))
+  r_field("X", bib_get("abstract"))
+  r_field("O", bib_get("note"))
+}
diff --git a/lib/ref2bib.awk b/lib/ref2bib.awk
new file mode 100644
index 0000000..422fdd7
--- /dev/null
+++ b/lib/ref2bib.awk
@@ -0,0 +1,107 @@
+# ref2bib.awk - convert refer records to bibtex entries
+#
+# Standalone (does not use bib-parse.awk). Records are separated by
+# blank lines. Output keys are FIXME; pipe through bib-key.
+
+BEGIN {
+  RS = ""
+  FS = "\n"
+}
+
+function r_trim(t) {
+  sub(/^[ \t\r]+/, "", t)
+  sub(/[ \t\r]+$/, "", t)
+  return t
+}
+
+function r_emit(name, v) {
+  if (v != "")
+    printf "  %s = {%s},\n", name, v
+}
+
+# per record: collect %-tagged fields (untagged lines continue the last tag), emit one entry
+{
+  split("", val)
+  na = ne = 0
+  split("", A)
+  split("", E)
+  lasttag = ""
+  for (i = 1; i <= NF; i++) {
+    line = $i
+    if (substr(line, 1, 1) == "%") {
+      tag = substr(line, 2, 1)
+      v = r_trim(substr(line, 3))
+      if (tag == "A")
+        A[++na] = v
+      else if (tag == "E")
+        E[++ne] = v
+      else
+        val[tag] = v
+      lasttag = tag
+    } else if (lasttag == "A")
+      A[na] = A[na] " " r_trim(line)
+    else if (lasttag == "E")
+      E[ne] = E[ne] " " r_trim(line)
+    else if (lasttag != "")
+      val[lasttag] = val[lasttag] " " r_trim(line)
+  }
+  if (na == 0 && ne == 0 && !("T" in val))
+    next
+
+  # guess an entry type from the fields present
+  if ("J" in val)
+    type = "article"
+  else if ("B" in val)
+    type = (val["B"] ~ /[Pp]roceedings|[Cc]onference|[Ss]ymposium|[Ww]orkshop/) \
+        ? "inproceedings" : "incollection"
+  else if ("R" in val)
+    type = "techreport"
+  else if ("I" in val)
+    type = "book"
+  else
+    type = "misc"
+
+  if (out_n++)
+    print ""
+  printf "@%s{FIXME,\n", type
+
+  authors = ""
+  for (i = 1; i <= na; i++)
+    authors = (i == 1) ? A[i] : authors " and " A[i]
+  r_emit("author", authors)
+  editors = ""
+  for (i = 1; i <= ne; i++)
+    editors = (i == 1) ? E[i] : editors " and " E[i]
+  r_emit("editor", editors)
+
+  r_emit("title", val["T"])
+  r_emit("journal", val["J"])
+  r_emit("booktitle", val["B"])
+
+  d = val["D"]
+  if (match(d, /[0-9][0-9][0-9][0-9]/)) {
+    r_emit("year", substr(d, RSTART, 4))
+    m = r_trim(substr(d, 1, RSTART - 1) substr(d, RSTART + 4))
+    if (m != "")
+      r_emit("month", m)
+  } else
+    r_emit("year", d)
+
+  r_emit("volume", val["V"])
+  r_emit("number", val["N"])
+  p = val["P"]
+  gsub(/-+/, "--", p)
+  r_emit("pages", p)
+  r_emit(type == "techreport" ? "institution" : "publisher", val["I"])
+  r_emit("address", val["C"])
+  if ("R" in val) {
+    if (val["N"] != "")    # not `"N" in val`: the val["N"] reference above already created it
+      r_emit("note", val["R"])
+    else
+      r_emit("number", val["R"])
+  }
+  r_emit("keywords", val["K"])
+  r_emit("abstract", val["X"])
+  r_emit("note", val["O"])
+  print "}"
+}
author	Douglas B. Rumbaugh <doug@douglasrumbaugh.com>	2026-06-06 12:02:41 -0400
committer	Douglas B. Rumbaugh <doug@douglasrumbaugh.com>	2026-06-06 12:02:41 -0400
commit	eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2 (patch)
tree	626d64c3574cfbc7cc38eae6d142ef22b21cf59b /lib
parent	8351a1da3f56cde9939b934bc5533a95aff1c95e (diff)
download	bibutils-eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2.tar.gz