diff options
| author | Douglas B. Rumbaugh <doug@douglasrumbaugh.com> | 2026-06-06 12:02:41 -0400 |
|---|---|---|
| committer | Douglas B. Rumbaugh <doug@douglasrumbaugh.com> | 2026-06-06 12:02:41 -0400 |
| commit | eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2 (patch) | |
| tree | 626d64c3574cfbc7cc38eae6d142ef22b21cf59b /lib | |
| parent | 8351a1da3f56cde9939b934bc5533a95aff1c95e (diff) | |
| download | bibutils-eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2.tar.gz | |
Initial implementation (only a few years later!)
This is pure Claude. I'd written out the plan for
this suite of scripts eons ago, but never found the
time to actually do it. Remembered it this morning,
pointed Claude at the README, and had something
that appears to work in minutes.
caveat emptor: the design is mine, but the code is
purely LLM generated at this point.
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/bib-canon.awk | 28 | ||||
| -rw-r--r-- | lib/bib-check.awk | 69 | ||||
| -rw-r--r-- | lib/bib-key.awk | 69 | ||||
| -rw-r--r-- | lib/bib-ls.awk | 25 | ||||
| -rw-r--r-- | lib/bib-lskeys.awk | 9 | ||||
| -rw-r--r-- | lib/bib-parse.awk | 216 | ||||
| -rw-r--r-- | lib/bib-select.awk | 29 | ||||
| -rw-r--r-- | lib/bib2ref.awk | 52 | ||||
| -rw-r--r-- | lib/ref2bib.awk | 107 |
9 files changed, 604 insertions, 0 deletions
diff --git a/lib/bib-canon.awk b/lib/bib-canon.awk
new file mode 100644
index 0000000..d11e9cb
--- /dev/null
+++ b/lib/bib-canon.awk
@@ -0,0 +1,28 @@
+# bib-canon.awk - canonical output helpers for bibutils
+#
+# Requires bib-parse.awk. Provides bib_emit() to print the current
+# entry in canonical form, and bib_get() to look up a field value.
+
+# Print the current entry canonically: "@type{key," on the first line,
+# then one field per line with a 2-space indent, then a closing "}".
+#
+#   type - entry type printed after "@" (printed as given)
+#   key  - citation key printed after the opening brace
+#
+# The fields are read from BIB_N, BIB_NAME[], BIB_VAL[] and BIB_KIND[].
+# Values of kind "s" have every run of whitespace collapsed to a single
+# space, are trimmed with bib_trim() and are wrapped in braces; values
+# of any other kind are printed exactly as stored. j and v are locals.
+function bib_emit(type, key,    j, v) {
+    printf "@%s{%s,\n", type, key
+    for (j = 1; j <= BIB_N; j++) {
+        v = BIB_VAL[j]
+        if (BIB_KIND[j] == "s") {
+            gsub(/[ \t\r\n]+/, " ", v)
+            v = bib_trim(v)
+            # two leading spaces: the documented canonical indent
+            printf "  %s = {%s},\n", BIB_NAME[j], v
+        } else
+            printf "  %s = %s,\n", BIB_NAME[j], v
+    }
+    print "}"
+}
+
+# Return the value of field `name` in the current entry, or "" if no
+# field has that name. The comparison is exact, so `name` must be given
+# in the same case as the names stored in BIB_NAME[]. When a name occurs
+# more than once the first occurrence wins. An absent field and a field
+# whose value is "" are indistinguishable to the caller. j is a local.
+function bib_get(name,    j) {
+    for (j = 1; j <= BIB_N; j++)
+        if (BIB_NAME[j] == name)
+            return BIB_VAL[j]
+    return ""
+}
diff --git a/lib/bib-check.awk b/lib/bib-check.awk
new file mode 100644
index 0000000..4411a55
--- /dev/null
+++ b/lib/bib-check.awk
@@ -0,0 +1,69 @@
+# bib-check.awk - lint a bibtex database
+#
+# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
+# line on stdout:
+#   - missing fields required by the entry type
+#   - duplicate keys
+#   - entries whose titles normalize to the same string (likely dups)
+#   - empty field values
+# Exits 1 if any problem was found.
+
+# Required fields per entry type. Each value is a space-separated list
+# of requirements; a requirement of the form "a|b" is satisfied by
+# either field. Types that are not listed have no required fields.
+BEGIN {
+    REQ["article"]       = "author title journal year"
+    REQ["book"]          = "author|editor title publisher year"
+    REQ["booklet"]       = "title"
+    REQ["inbook"]        = "author|editor title publisher year"
+    REQ["incollection"]  = "author title booktitle publisher year"
+    REQ["inproceedings"] = "author title booktitle year"
+    REQ["conference"]    = "author title booktitle year"
+    REQ["manual"]        = "title"
+    REQ["mastersthesis"] = "author title school year"
+    REQ["phdthesis"]     = "author title school year"
+    REQ["proceedings"]   = "title year"
+    REQ["techreport"]    = "author title institution year"
+    REQ["unpublished"]   = "author title note"
+}
+
+# Parser hook: @string and @preamble blocks are not linted.
+function bib_pass(raw) { }
+
+# Record a problem: flag the run as failed (BIB_BAD becomes the exit
+# status) and print "<key>: <msg>" on stdout.
+function problem(key, msg) {
+    BIB_BAD = 1
+    printf("%s: %s\n", key, msg)
+}
+
+# Parser hook: lint the current entry. Runs four checks in order:
+# duplicate key, required fields for `type`, empty field values, and a
+# title that normalizes to the same string as an earlier entry's title.
+# Everything after `key` in the parameter list is a local.
+function bib_entry(type, key,    nreq, spec, r, alt, nalt, a, ok, title, f) {
+    # a key that has been seen before is reported on every repeat
+    if (key in BIB_KEYS_SEEN)
+        problem(key, "duplicate key")
+    BIB_KEYS_SEEN[key] = 1
+
+    # required fields; an unlisted type yields zero requirements
+    nreq = (type in REQ) ? split(REQ[type], spec, " ") : 0
+    for (r = 1; r <= nreq; r++) {
+        nalt = split(spec[r], alt, "|")
+        ok = 0
+        # stop at the first alternative that has a non-empty value
+        for (a = 1; a <= nalt && !ok; a++)
+            ok = (bib_get(alt[a]) != "")
+        if (!ok)
+            problem(key, "missing required field: " spec[r])
+    }
+
+    # fields that are present but blank once trimmed
+    for (f = 1; f <= BIB_N; f++) {
+        if (bib_trim(BIB_VAL[f]) != "")
+            continue
+        problem(key, "empty field: " BIB_NAME[f])
+    }
+
+    # likely duplicate entries: compare titles reduced to lowercase
+    # letters and digits; entries without such a title are skipped
+    title = tolower(bib_get("title"))
+    gsub(/[^a-z0-9]/, "", title)
+    if (title == "")
+        return
+    if (title in BIB_TITLES_SEEN)
+        problem(key, "title duplicates " BIB_TITLES_SEEN[title])
+    else
+        BIB_TITLES_SEEN[title] = key
+}
+
+END { exit BIB_BAD }
diff --git a/lib/bib-key.awk b/lib/bib-key.awk
new file mode 100644
index 0000000..41534ba
--- /dev/null
+++ b/lib/bib-key.awk
@@ -0,0 +1,69 @@
+# bib-key.awk - rekey every entry with a generated citation key
+#
+# Requires bib-parse.awk and bib-canon.awk. Keys have the form
+# <surname><year><word>, e.g. knuth1984literate.
+
+# Parser hook: print an @string / @preamble block unchanged. Every
+# output block after the first is preceded by one blank line.
+function bib_pass(raw) {
+    if (bib_out_n++)
+        print ""
+    print raw
+}
+
+# Parser hook: print the current entry canonically under a generated
+# key. The entry's original `key` is ignored. k, n and base are locals.
+#
+# Every key handed out is recorded in BIB_KEYS_SEEN, including suffixed
+# ones, so a later entry whose natural key happens to equal an earlier
+# suffixed key is disambiguated as well.
+function bib_entry(type, key,    k, n, base) {
+    if (bib_out_n++)
+        print ""
+    k = bib_mkkey()
+    if (k in BIB_KEYS_SEEN) {
+        # BIB_KEYS_SEEN[base] counts the keys issued for this base, so
+        # the search for a free suffix resumes where it last stopped
+        base = k
+        n = BIB_KEYS_SEEN[base]
+        while (k in BIB_KEYS_SEEN) {
+            n++
+            k = base bib_key_suffix(n)
+        }
+        BIB_KEYS_SEEN[base] = n
+    }
+    BIB_KEYS_SEEN[k] = 1
+    bib_emit(type, k)
+}
+
+# Suffix for the n-th key (n >= 2) sharing one base: "b" through "z"
+# for n = 2..26, then "z1", "z2", ... so the suffix is never empty.
+function bib_key_suffix(n) {
+    if (n <= 26)
+        return substr("bcdefghijklmnopqrstuvwxyz", n - 1, 1)
+    return "z" (n - 26)
+}
+
+# Build <surname><year><word> for the current entry and return it.
+#   surname - from the first name in "author", falling back to "editor":
+#             the text before the first comma if there is one, otherwise
+#             the last whitespace-separated word; lowercased and reduced
+#             to letters and digits; "anon" if nothing is left
+#   year    - the first run of four digits in "year", else empty
+#   word    - the first word of "title" that is not in the stop list
+#             below, else empty
+# All parameters are locals.
+function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
+    a = bib_get("author")
+    if (a == "")
+        a = bib_get("editor")
+    y = bib_get("year")
+    t = bib_get("title")
+
+    # surname of the first author; collapse whitespace first so that a
+    # name list broken across lines ("X and\n  Y") still splits at "and"
+    gsub(/[ \t\r\n]+/, " ", a)
+    if (match(a, / [Aa][Nn][Dd] /))
+        a = substr(a, 1, RSTART - 1)
+    gsub(/[{}]/, "", a)
+    a = bib_trim(a)
+    if (index(a, ",") > 0)
+        surname = substr(a, 1, index(a, ",") - 1)
+    else {
+        n = split(a, parts, /[ \t]+/)
+        surname = (n > 0) ? parts[n] : ""
+    }
+    gsub(/[^A-Za-z0-9]/, "", surname)
+    surname = tolower(surname)
+    if (surname == "")
+        surname = "anon"
+
+    # four-digit year
+    if (match(y, /[0-9][0-9][0-9][0-9]/))
+        y = substr(y, RSTART, 4)
+    else
+        y = ""
+
+    # first significant word of the title
+    gsub(/[{}]/, "", t)
+    word = ""
+    n = split(tolower(t), parts, /[^a-z0-9]+/)
+    for (i = 1; i <= n; i++) {
+        w = parts[i]
+        if (w == "" || w == "a" || w == "an" || w == "the" || w == "on" ||
+            w == "of" || w == "in" || w == "for" || w == "and" || w == "to" ||
+            w == "with" || w == "from" || w == "by" || w == "at" || w == "is")
+            continue
+        word = w
+        break
+    }
+
+    return surname y word
+}
diff --git a/lib/bib-ls.awk b/lib/bib-ls.awk
new file mode 100644
index 0000000..909b654
--- /dev/null
+++ b/lib/bib-ls.awk
@@ -0,0 +1,25 @@
+# bib-ls.awk - list database entries
+#
+# Requires bib-parse.awk and bib-canon.awk.
Variables (set with -v):
+#   long - 0: print one key per line
+#          1: print key, type, author, year and title, tab-separated
+
+# Parser hook: @string and @preamble blocks are never listed.
+function bib_pass(raw) { }
+
+# Return v with all braces removed and every run of whitespace
+# collapsed to a single space.
+function bib_ls_flat(v) {
+    gsub(/[{}]/, "", v)
+    gsub(/[ \t\r\n]+/, " ", v)
+    return v
+}
+
+# Parser hook: list the current entry. In short mode (long is zero or
+# unset) only the key is printed. In long mode the line holds key, type,
+# author, year and title separated by tabs; the author column falls back
+# to "editor" when "author" is empty, and a list of several names is cut
+# down to the first one followed by " et al.". who and what are locals.
+function bib_entry(type, key,    who, what) {
+    if (long + 0 != 0) {
+        who = bib_get("author")
+        if (who == "")
+            who = bib_get("editor")
+        who = bib_ls_flat(who)
+        if (match(who, / [Aa][Nn][Dd] /))
+            who = substr(who, 1, RSTART - 1) " et al."
+        what = bib_ls_flat(bib_get("title"))
+        printf "%s\t%s\t%s\t%s\t%s\n", key, type, who, bib_get("year"), what
+    } else
+        print key
+}
diff --git a/lib/bib-lskeys.awk b/lib/bib-lskeys.awk
new file mode 100644
index 0000000..1932ced
--- /dev/null
+++ b/lib/bib-lskeys.awk
@@ -0,0 +1,9 @@
+# bib-lskeys.awk - print the key of every entry, one per line
+#
+# Requires bib-parse.awk.
+
+# Parser hook: @string and @preamble blocks have no key to print.
+function bib_pass(raw) { }
+
+# Parser hook: print the key of the current entry; `type` is unused.
+function bib_entry(type, key) {
+    print key
+}
diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk
new file mode 100644
index 0000000..e5bf9fa
--- /dev/null
+++ b/lib/bib-parse.awk
@@ -0,0 +1,216 @@
+# bib-parse.awk - shared bibtex parsing library for bibutils
+#
+# Consumers must define two hook functions:
+#   bib_entry(type, key) - called once per regular entry. The fields are
+#                          available in BIB_N, BIB_NAME[], BIB_VAL[] and
+#                          BIB_KIND[]; the raw source text of the entry
+#                          is in BIB_RAW.
+#   bib_pass(raw)        - called for @string and @preamble blocks with
+#                          their raw source text.
+#
+# BIB_KIND[j] is "s" for ordinary string values (content stored without
+# delimiters; re-wrap in braces on output), "n" for bare numbers, and
+# "r" for raw values (macros, # concatenation) which should be emitted
+# verbatim.
+
+# Accumulate every input line (newline restored) in bib_buf; the whole
+# input is parsed in one go from the END rule.
+{ bib_buf = bib_buf $0 "\n" }
+
+END { bib_main(bib_buf) }
+
+# Scan s for "@" characters and hand each one to bib_entry_at(); every
+# other character is skipped. i is a local.
+function bib_main(s,    i) {
+    i = 1
+    while (i <= length(s)) {
+        if (substr(s, i, 1) == "@")
+            i = bib_entry_at(s, i)
+        else
+            i++
+    }
+}
+
+# Return the index of the first character at or after i that is not a
+# space, tab, CR or LF (length(s) + 1 if there is none).
+function bib_ws(s, i) {
+    while (i <= length(s) && substr(s, i, 1) ~ /[ \t\r\n]/)
+        i++
+    return i
+}
+
+# Return t with leading and trailing spaces, tabs, CRs and LFs removed.
+function bib_trim(t) {
+    sub(/^[ \t\r\n]+/, "", t)
+    sub(/[ \t\r\n]+$/, "", t)
+    return t
+}
+
+# Balanced {...} group starting at i (s[i] must be "{"). The inner
+# content goes to BIB_PIECE; returns the index just past the closing
+# brace. If the input ends before the group is closed, BIB_PIECE holds
+# everything after the opening brace and length(s) + 1 is returned.
+# depth, start, c and closed are locals.
+function bib_braced(s, i,    depth, start, c, closed) {
+    start = i
+    depth = 0
+    closed = 0
+    while (i <= length(s)) {
+        c = substr(s, i, 1)
+        i++
+        if (c == "{")
+            depth++
+        else if (c == "}") {
+            depth--
+            if (depth == 0) {
+                closed = 1
+                break
+            }
+        }
+    }
+    # only subtract the closing brace when one was actually consumed
+    BIB_PIECE = substr(s, start + 1, i - start - 1 - closed)
+    return i
+}
+
+# "..." group starting at i (s[i] must be the opening quote). Braces
+# protect embedded quotes: a quote only closes the group at brace depth
+# zero. The inner content goes to BIB_PIECE; returns the index just past
+# the closing quote, or length(s) + 1 if the group is never closed.
+# depth, start, c and closed are locals.
+function bib_quoted(s, i,    depth, start, c, closed) {
+    start = i
+    i++
+    depth = 0
+    closed = 0
+    while (i <= length(s)) {
+        c = substr(s, i, 1)
+        if (c == "{")
+            depth++
+        else if (c == "}") {
+            # ignore an unmatched "}" instead of going negative, which
+            # would keep the closing quote from ever being recognised
+            if (depth > 0)
+                depth--
+        } else if (c == "\"" && depth == 0) {
+            i++
+            closed = 1
+            break
+        }
+        i++
+    }
+    # only subtract the closing quote when one was actually consumed
+    BIB_PIECE = substr(s, start + 1, i - start - 1 - closed)
+    return i
+}
+
+# Skip a balanced op...cl group starting at i (i must be at op) and
+# return the index just past its closing delimiter, or length(s) + 1 if
+# the group is never closed. depth and c are locals.
+function bib_skip_group(s, i, op, cl,    depth, c) {
+    depth = 0
+    while (i <= length(s)) {
+        c = substr(s, i, 1)
+        i++
+        if (c == op)
+            depth++
+        else if (c == cl) {
+            depth--
+            if (depth == 0)
+                break
+        }
+    }
+    return i
+}
+
+# Field value at i, handling # concatenation; sets BIB_VALUE and
+# BIB_VKIND and returns the index just past the value (and any
+# whitespace following it).
+#
+# A value is one or more pieces joined by "#". A piece is a braced
+# group, a quoted group, or a bare token that runs up to the next comma,
+# "#", "}", ")" or whitespace. A single braced or quoted piece gives
+# kind "s" with the delimiters stripped; a single all-digit token gives
+# kind "n"; anything else, including every multi-piece value, gives
+# kind "r" with BIB_VALUE set to the trimmed source text of the whole
+# value. start, c, piece, pieces and kind are locals.
+function bib_value(s, i,    start, c, piece, pieces, kind) {
+    start = i
+    pieces = 0
+    kind = ""
+    BIB_VALUE = ""
+    while (1) {
+        c = substr(s, i, 1)
+        if (c == "{") {
+            i = bib_braced(s, i)
+            BIB_VALUE = BIB_VALUE BIB_PIECE
+            if (kind == "")
+                kind = "s"
+        } else if (c == "\"") {
+            i = bib_quoted(s, i)
+            BIB_VALUE = BIB_VALUE BIB_PIECE
+            if (kind == "")
+                kind = "s"
+        } else {
+            piece = ""
+            while (i <= length(s) && substr(s, i, 1) !~ /[,#}) \t\r\n]/) {
+                piece = piece substr(s, i, 1)
+                i++
+            }
+            BIB_VALUE = BIB_VALUE piece
+            kind = (piece ~ /^[0-9]+$/) ? "n" : "r"
+        }
+        pieces++
+        i = bib_ws(s, i)
+        if (substr(s, i, 1) == "#")
+            i = bib_ws(s, i + 1)
+        else
+            break
+    }
+    if (pieces > 1)
+        kind = "r"
+    if (kind == "r")
+        BIB_VALUE = bib_trim(substr(s, start, i - start))
+    BIB_VKIND = kind
+    return i
+}
+
+# Parse the construct whose "@" is at i; returns the index past it.
+#
+# The type is the run of letters after "@", lowercased. It must be
+# followed (after optional whitespace) by "{" or "("; otherwise the "@"
+# is treated as stray text and nothing is reported. @comment groups are
+# skipped, @string and @preamble groups are passed to bib_pass() as
+# trimmed source text, and every other type is parsed as an entry: the
+# key, then name = value fields until the closing delimiter or the end
+# of input. The fields are stored in BIB_N, BIB_NAME[] (lowercased),
+# BIB_VAL[] and BIB_KIND[], the trimmed source text in BIB_RAW, and
+# bib_entry(type, key) is called. Everything after i in the parameter
+# list is a local.
+function bib_entry_at(s, i,    at, type, opener, closer, key, name, c) {
+    at = i
+    i++
+    type = ""
+    while (i <= length(s) && substr(s, i, 1) ~ /[A-Za-z]/) {
+        type = type substr(s, i, 1)
+        i++
+    }
+    type = tolower(type)
+    i = bib_ws(s, i)
+    c = substr(s, i, 1)
+    if (c == "{") {
+        opener = "{"
+        closer = "}"
+    } else if (c == "(") {
+        opener = "("
+        closer = ")"
+    } else
+        return i    # stray @, not an entry
+
+    if (type == "comment")
+        return bib_skip_group(s, i, opener, closer)
+    if (type == "string" || type == "preamble") {
+        i = bib_skip_group(s, i, opener, closer)
+        bib_pass(bib_trim(substr(s, at, i - at)))
+        return i
+    }
+
+    i++    # consume opener
+    i = bib_ws(s, i)
+    key = ""
+    while (i <= length(s) && substr(s, i, 1) !~ /[, \t\r\n})]/) {
+        key = key substr(s, i, 1)
+        i++
+    }
+    i = bib_ws(s, i)
+    if (substr(s, i, 1) == ",")
+        i++
+
+    # entries past BIB_N in the field arrays are stale data from earlier
+    # entries; consumers must only read indices 1..BIB_N
+    BIB_N = 0
+    while (1) {
+        i = bib_ws(s, i)
+        c = substr(s, i, 1)
+        if (c == "" || c == closer) {
+            if (c == closer)
+                i++
+            break
+        }
+        if (c == ",") {
+            i++
+            continue
+        }
+        name = ""
+        while (i <= length(s) && substr(s, i, 1) !~ /[=, \t\r\n})]/) {
+            name = name substr(s, i, 1)
+            i++
+        }
+        i = bib_ws(s, i)
+        if (substr(s, i, 1) != "=") {    # malformed; skip a char and resync
+            i++
+            continue
+        }
+        i = bib_ws(s, i + 1)
+        i = bib_value(s, i)
+        BIB_N++
+        BIB_NAME[BIB_N] = tolower(name)
+        BIB_VAL[BIB_N] = BIB_VALUE
+        BIB_KIND[BIB_N] = BIB_VKIND
+    }
+    BIB_RAW = bib_trim(substr(s, at, i - at))
+    bib_entry(type, key)
+    return i
+}
diff --git a/lib/bib-select.awk
b/lib/bib-select.awk
new file mode 100644
index 0000000..1900390
--- /dev/null
+++ b/lib/bib-select.awk
@@ -0,0 +1,29 @@
+# bib-select.awk - emit entries selected by key, canonically
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+#   keys   - comma-separated list of entry keys; whitespace around a
+#            key and empty list items are ignored
+#   invert - 0: emit entries whose key is in the list
+#            1: emit entries whose key is NOT in the list
+#
+# With keys="" and invert=1 this acts as a canonicalizing filter for
+# everything. @string and @preamble blocks always pass through.
+
+# Build the lookup set BIB_SEL from the `keys` variable.
+BEGIN {
+    bib_sel_n = split(keys, bib_sel_k, ",")
+    for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++) {
+        bib_sel_key = bib_sel_k[bib_sel_i]
+        # accept "a, b" as well as "a,b"
+        gsub(/^[ \t\r\n]+|[ \t\r\n]+$/, "", bib_sel_key)
+        if (bib_sel_key != "")
+            BIB_SEL[bib_sel_key] = 1
+    }
+}
+
+# Parser hook: print an @string / @preamble block unchanged. Every
+# output block after the first is preceded by one blank line.
+function bib_pass(raw) {
+    if (bib_out_n++)
+        print ""
+    print raw
+}
+
+# Parser hook: print the current entry canonically when its membership
+# in the key list differs from `invert` (in the list with invert=0, not
+# in the list with invert=1).
+function bib_entry(type, key) {
+    if ((key in BIB_SEL) != invert + 0) {
+        if (bib_out_n++)
+            print ""
+        bib_emit(type, key)
+    }
+}
diff --git a/lib/bib2ref.awk b/lib/bib2ref.awk
new file mode 100644
index 0000000..4d9e595
--- /dev/null
+++ b/lib/bib2ref.awk
@@ -0,0 +1,52 @@
+# bib2ref.awk - convert bibtex entries to refer records
+#
+# Requires bib-parse.awk and bib-canon.awk.
+
+# Parser hook: @string and @preamble blocks have no refer equivalent
+# and are dropped.
+function bib_pass(raw) { }
+
+# Print one refer line "%<tag> <value>" unless v is empty. Braces are
+# removed from the value, runs of whitespace become one space and the
+# result is trimmed.
+function r_field(tag, v) {
+    if (v != "") {
+        gsub(/[{}]/, "", v)
+        gsub(/[ \t\r\n]+/, " ", v)
+        printf "%%%s %s\n", tag, bib_trim(v)
+    }
+}
+
+# Print one "%<tag> <name>" line per name in the bibtex name list v.
+# Names are separated by the word "and" (any case) surrounded by spaces;
+# blank names are skipped. n, parts and i are locals.
+function r_names(tag, v,    n, parts, i) {
+    gsub(/[{}]/, "", v)
+    gsub(/[ \t\r\n]+/, " ", v)
+    n = split(v, parts, / +[Aa][Nn][Dd] +/)
+    for (i = 1; i <= n; i++)
+        if (bib_trim(parts[i]) != "")
+            printf "%%%s %s\n", tag, bib_trim(parts[i])
+}
+
+# Parser hook: print the current entry as one refer record. Records
+# after the first are preceded by a blank line. `type` and `key` are not
+# used. d, p and m are locals.
+function bib_entry(type, key,    d, p, m) {
+    if (bib_out_n++)
+        print ""
+    r_names("A", bib_get("author"))
+    r_names("E", bib_get("editor"))
+    r_field("T", bib_get("title"))
+    r_field("J", bib_get("journal"))
+    r_field("B", bib_get("booktitle"))
+    # %D carries "month year" when both are present
+    d = bib_get("year")
+    m = bib_get("month")
+    if (m != "")
+        d = (d != "") ? m " " d : m
+    r_field("D", d)
+    r_field("V", bib_get("volume"))
+    r_field("N", bib_get("number"))
+    # any run of hyphens ("--", "---") becomes a single "-"
+    p = bib_get("pages")
+    gsub(/-+/, "-", p)
+    r_field("P", p)
+    # %I takes the first of publisher, institution, school that is set
+    if (bib_get("publisher") != "")
+        r_field("I", bib_get("publisher"))
+    else if (bib_get("institution") != "")
+        r_field("I", bib_get("institution"))
+    else if (bib_get("school") != "")
+        r_field("I", bib_get("school"))
+    r_field("C", bib_get("address"))
+    r_field("K", bib_get("keywords"))
+    r_field("X", bib_get("abstract"))
+    r_field("O", bib_get("note"))
+}
diff --git a/lib/ref2bib.awk b/lib/ref2bib.awk
new file mode 100644
index 0000000..422fdd7
--- /dev/null
+++ b/lib/ref2bib.awk
@@ -0,0 +1,107 @@
+# ref2bib.awk - convert refer records to bibtex entries
+#
+# Standalone (does not use bib-parse.awk). Records are separated by
+# blank lines. Output keys are FIXME; pipe through bib-key.
+
+# Paragraph mode: one record per blank-line-separated block, one field
+# per line.
+BEGIN {
+    RS = ""
+    FS = "\n"
+}
+
+# Return t with leading and trailing spaces, tabs and CRs removed.
+function r_trim(t) {
+    sub(/^[ \t\r]+/, "", t)
+    sub(/[ \t\r]+$/, "", t)
+    return t
+}
+
+# Print one bibtex field line (2-space indent, brace-delimited value)
+# unless v is empty.
+function r_emit(name, v) {
+    if (v != "")
+        printf "  %s = {%s},\n", name, v
+}
+
+# Convert one refer record. %A and %E lines accumulate in A[] and E[];
+# every other tag keeps the value of its last occurrence in val[]. A
+# line that does not start with "%" continues the previous tag's value.
+{
+    split("", val)
+    na = 0
+    ne = 0
+    split("", A)
+    split("", E)
+    lasttag = ""
+    for (i = 1; i <= NF; i++) {
+        line = $i
+        if (substr(line, 1, 1) == "%") {
+            tag = substr(line, 2, 1)
+            v = r_trim(substr(line, 3))
+            if (tag == "A")
+                A[++na] = v
+            else if (tag == "E")
+                E[++ne] = v
+            else
+                val[tag] = v
+            lasttag = tag
+        } else if (lasttag == "A")
+            A[na] = A[na] " " r_trim(line)
+        else if (lasttag == "E")
+            E[ne] = E[ne] " " r_trim(line)
+        else if (lasttag != "")
+            val[lasttag] = val[lasttag] " " r_trim(line)
+    }
+    # a record without author, editor or title is not a reference
+    if (na == 0 && ne == 0 && !("T" in val))
+        next
+
+    # guess an entry type from the fields present
+    if ("J" in val)
+        type = "article"
+    else if ("B" in val)
+        type = (val["B"] ~ /[Pp]roceedings|[Cc]onference|[Ss]ymposium|[Ww]orkshop/) \
+            ? "inproceedings" : "incollection"
+    else if ("R" in val)
+        type = "techreport"
+    else if ("I" in val)
+        type = "book"
+    else
+        type = "misc"
+
+    if (out_n++)
+        print ""
+    printf "@%s{FIXME,\n", type
+
+    authors = ""
+    for (i = 1; i <= na; i++)
+        authors = (i == 1) ? A[i] : authors " and " A[i]
+    r_emit("author", authors)
+    editors = ""
+    for (i = 1; i <= ne; i++)
+        editors = (i == 1) ? E[i] : editors " and " E[i]
+    r_emit("editor", editors)
+
+    r_emit("title", val["T"])
+    r_emit("journal", val["J"])
+    r_emit("booktitle", val["B"])
+
+    # %D: the first four-digit run is the year, whatever surrounds it is
+    # the month; without such a run the whole value is used as the year
+    d = val["D"]
+    if (match(d, /[0-9][0-9][0-9][0-9]/)) {
+        r_emit("year", substr(d, RSTART, 4))
+        m = r_trim(substr(d, 1, RSTART - 1) substr(d, RSTART + 4))
+        if (m != "")
+            r_emit("month", m)
+    } else
+        r_emit("year", d)
+
+    r_emit("volume", val["V"])
+    r_emit("number", val["N"])
+    p = val["P"]
+    gsub(/-+/, "--", p)
+    r_emit("pages", p)
+    r_emit(type == "techreport" ? "institution" : "publisher", val["I"])
+    r_emit("address", val["C"])
+
+    # %R (report number) becomes "number" unless %N already supplied one,
+    # in which case it is folded into the note. %R and %O are merged so
+    # that the entry never carries two "note" fields.
+    note = val["O"]
+    if ("R" in val) {
+        if ("N" in val)
+            note = (note != "") ? (val["R"] "; " note) : val["R"]
+        else
+            r_emit("number", val["R"])
+    }
+    r_emit("keywords", val["K"])
+    r_emit("abstract", val["X"])
+    r_emit("note", note)
+    print "}"
+}
|