aboutsummaryrefslogtreecommitdiffstats
path: root/lib/bib-check.awk
diff options
context:
space:
mode:
authorDouglas B. Rumbaugh <doug@douglasrumbaugh.com>2026-06-06 12:02:41 -0400
committerDouglas B. Rumbaugh <doug@douglasrumbaugh.com>2026-06-06 12:02:41 -0400
commiteabf1f6d74dac497ce31e3e2f441cfa25e9f74f2 (patch)
tree626d64c3574cfbc7cc38eae6d142ef22b21cf59b /lib/bib-check.awk
parent8351a1da3f56cde9939b934bc5533a95aff1c95e (diff)
downloadbibutils-eabf1f6d74dac497ce31e3e2f441cfa25e9f74f2.tar.gz
Initial implementation (only a few years later!)
This is pure Claude. I'd written out the plan for this suite of scripts eons ago, but never found the time to actual do it. Remembered it this morning, pointed Claude at the README, and had something that appears to work in minutes. caveat emptor: the design is mine, but the code is purely LLM generated at this point.
Diffstat (limited to 'lib/bib-check.awk')
-rw-r--r--lib/bib-check.awk69
1 files changed, 69 insertions, 0 deletions
diff --git a/lib/bib-check.awk b/lib/bib-check.awk
new file mode 100644
index 0000000..4411a55
--- /dev/null
+++ b/lib/bib-check.awk
@@ -0,0 +1,69 @@
+# bib-check.awk - lint a bibtex database
+#
+# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
+# line on stdout:
+# - missing fields required by the entry type
+# - duplicate keys
+# - entries whose titles normalize to the same string (likely dups)
+# - empty field values
+# Exits 1 if any problem was found.
+
+BEGIN {
+ REQ["article"] = "author title journal year"
+ REQ["book"] = "author|editor title publisher year"
+ REQ["booklet"] = "title"
+ REQ["inbook"] = "author|editor title publisher year"
+ REQ["incollection"] = "author title booktitle publisher year"
+ REQ["inproceedings"] = "author title booktitle year"
+ REQ["conference"] = "author title booktitle year"
+ REQ["manual"] = "title"
+ REQ["mastersthesis"] = "author title school year"
+ REQ["phdthesis"] = "author title school year"
+ REQ["proceedings"] = "title year"
+ REQ["techreport"] = "author title institution year"
+ REQ["unpublished"] = "author title note"
+}
+
+function bib_pass(raw) { }
+
+function problem(key, msg) {
+ printf "%s: %s\n", key, msg
+ BIB_BAD = 1
+}
+
+function bib_entry(type, key, n, req, i, alts, na, j, found, t, k) {
+ if (key in BIB_KEYS_SEEN)
+ problem(key, "duplicate key")
+ BIB_KEYS_SEEN[key] = 1
+
+ # required fields ("a|b" means at least one of a, b)
+ if (type in REQ) {
+ n = split(REQ[type], req, " ")
+ for (i = 1; i <= n; i++) {
+ na = split(req[i], alts, "|")
+ found = 0
+ for (j = 1; j <= na; j++)
+ if (bib_get(alts[j]) != "")
+ found = 1
+ if (!found)
+ problem(key, "missing required field: " req[i])
+ }
+ }
+
+ # empty values
+ for (i = 1; i <= BIB_N; i++)
+ if (bib_trim(BIB_VAL[i]) == "")
+ problem(key, "empty field: " BIB_NAME[i])
+
+ # likely duplicate entries: same normalized title
+ t = tolower(bib_get("title"))
+ gsub(/[^a-z0-9]/, "", t)
+ if (t != "") {
+ if (t in BIB_TITLES_SEEN)
+ problem(key, "title duplicates " BIB_TITLES_SEEN[t])
+ else
+ BIB_TITLES_SEEN[t] = key
+ }
+}
+
+END { exit BIB_BAD }