From c102ab995f9a86a77e40b9a952b2b23c0bd7de74 Mon Sep 17 00:00:00 2001
From: "Douglas B. Rumbaugh" <doug@douglasrumbaugh.com>
Date: Sat, 6 Jun 2026 13:44:00 -0400
Subject: Fuzzing with associated fixes

---
 lib/bib-key.awk    |  4 ++--
 lib/bib-parse.awk  |  5 ++++-
 lib/bib-select.awk | 30 ++++++++++++++++++++----------
 lib/bib-strip.awk  | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+), 13 deletions(-)
 create mode 100644 lib/bib-strip.awk

(limited to 'lib')

diff --git a/lib/bib-key.awk b/lib/bib-key.awk
index 4223155..3f4117f 100644
--- a/lib/bib-key.awk
+++ b/lib/bib-key.awk
@@ -38,7 +38,7 @@ function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
     surname = (n > 0) ? parts[n] : ""
   }
   gsub(/[^A-Za-z0-9]/, "", surname)
-  surname = tolower(surname)
+  surname = tolower(substr(surname, 1, 30))
   if (surname == "")
     surname = "anon"
 
@@ -62,5 +62,5 @@ function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
     break
   }
 
-  return surname y word
+  return surname y substr(word, 1, 30)
 }
diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk
index e5bf9fa..e83cb07 100644
--- a/lib/bib-parse.awk
+++ b/lib/bib-parse.awk
@@ -4,7 +4,8 @@
 #   bib_entry(type, key) - called once per regular entry. The fields are
 #                          available in BIB_N, BIB_NAME[], BIB_VAL[] and
 #                          BIB_KIND[]; the raw source text of the entry
-#                          is in BIB_RAW.
+#                          is in BIB_RAW, and its position in the input
+#                          buffer bib_buf is BIB_START..BIB_END-1.
 #   bib_pass(raw)        - called for @string and @preamble blocks with
 #                          their raw source text.
 #
@@ -211,6 +212,8 @@ function bib_entry_at(s, i,    at, type, opener, closer, key, name, c) {
     BIB_KIND[BIB_N] = BIB_VKIND
   }
   BIB_RAW = bib_trim(substr(s, at, i - at))
+  BIB_START = at
+  BIB_END = i
   bib_entry(type, key)
   return i
 }
diff --git a/lib/bib-select.awk b/lib/bib-select.awk
index 9aa5a37..3ebd16f 100644
--- a/lib/bib-select.awk
+++ b/lib/bib-select.awk
@@ -1,21 +1,31 @@
 # bib-select.awk - emit entries selected by key, canonically
 #
 # Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
-#   keys   - comma-separated list of entry keys; a key of "*" selects
-#            every entry (as produced by \nocite{*})
-#   invert - 0: emit entries whose key is in the list
-#            1: emit entries whose key is NOT in the list
+#   keys    - comma-separated list of entry keys; a key of "*" selects
+#             every entry (as produced by \nocite{*})
+#   keyfile - file with one key per line, for key lists too large to
+#             pass on the command line; merged with keys
+#   invert  - 0: emit entries whose key is in the list
+#             1: emit entries whose key is NOT in the list
 #
-# With keys="" and invert=1 this acts as a canonicalizing filter for
+# With no keys and invert=1 this acts as a canonicalizing filter for
 # everything. @string and @preamble blocks always pass through.
 
+function bib_sel_add(k) {    # register one requested key k
+  if (k != "*")
+    BIB_SEL[k] = 1           # ordinary key: remember it by name
+  else
+    BIB_SEL_ALL = 1          # "*" is the wildcard that selects every entry
+}
+
 BEGIN {
   bib_sel_n = split(keys, bib_sel_k, ",")
-  for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++) {
-    if (bib_sel_k[bib_sel_i] == "*")
-      BIB_SEL_ALL = 1
-    else
-      BIB_SEL[bib_sel_k[bib_sel_i]] = 1
+  for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++)
+    bib_sel_add(bib_sel_k[bib_sel_i])    # keys from the comma-separated keys variable
+  if (keyfile != "") {    # optional second source of keys
+    while ((getline bib_sel_line < keyfile) > 0)    # > 0 stops on end of file (0) and on read error (-1)
+      bib_sel_add(bib_sel_line)    # one key per line, merged with those from keys
+    close(keyfile)
   }
 }
 
diff --git a/lib/bib-strip.awk b/lib/bib-strip.awk
new file mode 100644
index 0000000..cecca3e
--- /dev/null
+++ b/lib/bib-strip.awk
@@ -0,0 +1,49 @@
+# bib-strip.awk - remove entries by key, preserving all other bytes
+#
+# Requires bib-parse.awk. Variables (set with -v):
+#   keys    - comma-separated list of entry keys to remove
+#   keyfile - file with one key per line, for key lists too large to
+#             pass on the command line; merged with keys
+#
+# Unlike bib-select.awk, which re-emits entries canonically, this
+# splices the matched entries' source spans out of the input and
+# leaves everything else - comments, formatting, @string blocks -
+# byte-for-byte intact. Used by bib-add -f so that replacing one
+# entry never rewrites the rest of the database.
+#
+# This END block runs after bib-parse.awk's (END blocks execute in
+# the order their files are given to awk), so the spans recorded by
+# the hooks below are complete by the time output happens.
+
+BEGIN {    # build BIB_DROP, the set of keys whose entries are removed
+  if (keyfile != "") {    # optional file source, one key per line
+    while ((getline bib_strip_line < keyfile) > 0)
+      BIB_DROP[bib_strip_line] = 1
+    close(keyfile)
+  }
+  bib_strip_n = split(keys, bib_strip_k, ",")    # comma-separated keys variable
+  for (bib_strip_i = 1; bib_strip_i <= bib_strip_n; bib_strip_i++)
+    BIB_DROP[bib_strip_k[bib_strip_i]] = 1
+}
+
+function bib_pass(raw) { }    # intentionally empty: nothing is recorded here, so END copies these blocks through untouched
+
+function bib_entry(type, key) {    # hook: record the source span of each entry to drop
+  if (!(key in BIB_DROP))
+    return                         # entry is kept; END copies its bytes verbatim
+  BIB_NSPAN++
+  BIB_SPAN_S[BIB_NSPAN] = BIB_START    # first position of the entry in bib_buf
+  BIB_SPAN_E[BIB_NSPAN] = BIB_END      # position just past the entry
+}
+
+END {    # print bib_buf minus the recorded spans; bib_strip_* names avoid clashes in awk's single global namespace
+  bib_strip_pos = 1    # next position of bib_buf not yet written or discarded
+  for (bib_strip_j = 1; bib_strip_j <= BIB_NSPAN; bib_strip_j++) {
+    printf "%s", substr(bib_buf, bib_strip_pos, BIB_SPAN_S[bib_strip_j] - bib_strip_pos)
+    bib_strip_pos = BIB_SPAN_E[bib_strip_j]    # resume just past the removed entry
+    # swallow the whitespace that followed it; substr() past the end is "", which ends the loop
+    while (substr(bib_buf, bib_strip_pos, 1) ~ /[ \t\r\n]/)
+      bib_strip_pos++
+  }
+  printf "%s", substr(bib_buf, bib_strip_pos)    # everything after the last removed entry
+}
-- 
cgit v1.2.3