9 files changed, 604 insertions, 0 deletions
diff --git a/lib/bib-canon.awk b/lib/bib-canon.awk
new file mode 100644
index 0000000..d11e9cb
--- /dev/null
+++ b/lib/bib-canon.awk
@@ -0,0 +1,28 @@
+# bib-canon.awk - canonical output helpers for bibutils
+#
+# Requires bib-parse.awk. Provides bib_emit() to print the current
+# entry in canonical form, and bib_get() to look up a field value.
+
+# Emit the current entry in canonical layout. String-kind ("s") values are
+# whitespace-collapsed, trimmed and re-braced; other kinds print as stored.
+function bib_emit(type, key,    idx, text) {
+  printf "@%s{%s,\n", type, key
+  for (idx = 1; idx <= BIB_N; idx++) {
+    text = BIB_VAL[idx]
+    if (BIB_KIND[idx] != "s") {
+      printf "  %s = %s,\n", BIB_NAME[idx], text
+      continue
+    }
+    gsub(/[ \t\r\n]+/, " ", text)
+    printf "  %s = {%s},\n", BIB_NAME[idx], bib_trim(text)
+  }
+  print "}"
+}
+
+# Look up `name` (already lowercase) among the current entry's fields and
+# return the first matching value; "" when no field has that name.
+function bib_get(name,    pos) {
+  pos = 1
+  while (pos <= BIB_N && BIB_NAME[pos] != name) pos++
+  return (pos <= BIB_N) ? BIB_VAL[pos] : ""
+}
diff --git a/lib/bib-check.awk b/lib/bib-check.awk
new file mode 100644
index 0000000..4411a55
--- /dev/null
+++ b/lib/bib-check.awk
@@ -0,0 +1,69 @@
+# bib-check.awk - lint a bibtex database
+#
+# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
+# line on stdout:
+#   - missing fields required by the entry type
+#   - duplicate keys
+#   - entries whose titles normalize to the same string (likely dups)
+#   - empty field values
+# Exits 1 if any problem was found.
+
+BEGIN {
+  # Required fields per entry type, as a space-separated list of names;
+  # "a|b" means at least one of a, b must be present. Types that share a
+  # list are assigned together. Types absent from REQ are not checked
+  # for required fields.
+  REQ["article"] = "author title journal year"
+  REQ["book"] = REQ["inbook"] = "author|editor title publisher year"
+  REQ["booklet"] = REQ["manual"] = "title"
+  REQ["incollection"] = "author title booktitle publisher year"
+  REQ["inproceedings"] = REQ["conference"] = "author title booktitle year"
+  REQ["mastersthesis"] = REQ["phdthesis"] = "author title school year"
+  REQ["proceedings"] = "title year"
+  REQ["techreport"] = "author title institution year"
+  REQ["unpublished"] = "author title note"
+}
+
+function bib_pass(raw) { }  # hook required by bib-parse.awk; the linter ignores @string/@preamble blocks
+
+function problem(key, msg) {  # record one finding for the entry `key`
+  BIB_BAD = 1                  # used as the exit status by the END rule
+  printf "%s: %s\n", key, msg  # one problem per line on stdout
+}
+
+# Lint hook, called by the parser once per regular entry. Reports, via
+# problem(): a repeated key, unmet required fields, empty or
+# whitespace-only values, and a title that normalizes to one seen earlier.
+function bib_entry(type, key,    nreq, need, r, opts, nopt, o, f, norm) {
+  if (key in BIB_KEYS_SEEN)
+    problem(key, "duplicate key")
+  BIB_KEYS_SEEN[key] = 1
+
+  # each required slot is satisfied by any one non-empty alternative
+  nreq = (type in REQ) ? split(REQ[type], need, " ") : 0
+  for (r = 1; r <= nreq; r++) {
+    nopt = split(need[r], opts, "|")
+    for (o = 1; o <= nopt && bib_get(opts[o]) == ""; o++)
+      ;
+    if (o > nopt)
+      problem(key, "missing required field: " need[r])
+  }
+
+  # fields whose value is empty once surrounding whitespace is trimmed
+  for (f = 1; f <= BIB_N; f++)
+    if (bib_trim(BIB_VAL[f]) == "")
+      problem(key, "empty field: " BIB_NAME[f])
+
+  # titles equal after lowercasing and dropping non-alphanumerics are
+  # flagged against the first key that used them
+  norm = tolower(bib_get("title"))
+  gsub(/[^a-z0-9]/, "", norm)
+  if (norm == "")
+    return
+  if (norm in BIB_TITLES_SEEN)
+    problem(key, "title duplicates " BIB_TITLES_SEEN[norm])
+  else
+    BIB_TITLES_SEEN[norm] = key
+}
+
+END { exit BIB_BAD }  # BIB_BAD is set to 1 by problem(), so any finding makes the exit status 1
diff --git a/lib/bib-key.awk b/lib/bib-key.awk
new file mode 100644
index 0000000..41534ba
--- /dev/null
+++ b/lib/bib-key.awk
@@ -0,0 +1,69 @@
+# bib-key.awk - rekey every entry with a generated citation key
+#
+# Requires bib-parse.awk and bib-canon.awk. Keys have the form
+# <surname><year><word>, e.g. knuth1984literate.
+
+# Pass-through hook: copy a @string/@preamble block to the output verbatim,
+# preceded by a blank line unless it is the first thing printed.
+function bib_pass(raw) {
+  printf "%s%s\n", (bib_out_n++ ? "\n" : ""), raw
+}
+
+# Rekey hook: emit the current entry under a generated key. A key already
+# handed out gets the first free suffix b, c, ..., z, then -27, -28, ...
+# so that every emitted key is unique.
+function bib_entry(type, key,    base, k, n) {
+  if (bib_out_n++)
+    print ""
+  base = k = bib_mkkey()
+  for (n = 2; (k in BIB_KEYS_SEEN); n++)
+    k = base ((n <= 26) ? substr("bcdefghijklmnopqrstuvwxyz", n - 1, 1) : "-" n)
+  BIB_KEYS_SEEN[k] = 1     # record suffixed keys too, so they cannot recur
+  bib_emit(type, k)
+}
+
+# Key for the current entry: <surname><year><word>, from author/editor, year, title.
+function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
+  a = bib_get("author")
+  if (a == "")
+    a = bib_get("editor")
+  y = bib_get("year")
+  t = bib_get("title")
+
+  # surname of the first author; field values keep their source line
+  # breaks, so collapse whitespace before looking for " and "
+  gsub(/[ \t\r\n]+/, " ", a)
+  if (match(a, / [Aa][Nn][Dd] /))
+    a = substr(a, 1, RSTART - 1)
+  gsub(/[{}]/, "", a)
+  a = bib_trim(a)
+  if (index(a, ",") > 0)
+    surname = substr(a, 1, index(a, ",") - 1)
+  else {
+    n = split(a, parts, /[ \t]+/)
+    surname = (n > 0) ? parts[n] : ""
+  }
+  gsub(/[^A-Za-z0-9]/, "", surname)
+  surname = tolower(surname)
+  if (surname == "")
+    surname = "anon"
+
+  # first four-digit run in the year field, "" if there is none
+  y = match(y, /[0-9][0-9][0-9][0-9]/) ? substr(y, RSTART, 4) : ""
+
+  # first significant word of the title
+  gsub(/[{}]/, "", t)
+  word = ""
+  n = split(tolower(t), parts, /[^a-z0-9]+/)
+  for (i = 1; i <= n; i++) {
+    w = parts[i]
+    if (w == "" || w == "a" || w == "an" || w == "the" || w == "on" ||
+        w == "of" || w == "in" || w == "for" || w == "and" || w == "to" ||
+        w == "with" || w == "from" || w == "by" || w == "at" || w == "is")
+      continue
+    word = w
+    break
+  }
+
+  return surname y word
+}
diff --git a/lib/bib-ls.awk b/lib/bib-ls.awk
new file mode 100644
index 0000000..909b654
--- /dev/null
+++ b/lib/bib-ls.awk
@@ -0,0 +1,25 @@
+# bib-ls.awk - list database entries
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+#   long - 0: print one key per line
+#          1: print key, type, author, year and title, tab-separated
+
+function bib_pass(raw) { }  # hook required by bib-parse.awk; @string/@preamble blocks are not listed
+
+# Listing hook: print the key alone when `long` is numerically zero,
+# otherwise key, type, author (or editor), year and title, tab-separated.
+function bib_entry(type, key,    who, title) {
+  if (long + 0 == 0) { print key; return }
+  title = bib_get("title")
+  who = bib_get("author")
+  if (who == "")
+    who = bib_get("editor")
+  gsub(/[{}]/, "", who)
+  gsub(/[{}]/, "", title)
+  gsub(/[ \t\r\n]+/, " ", who)
+  gsub(/[ \t\r\n]+/, " ", title)
+  # several names: keep only the first and mark the rest as "et al."
+  if (match(who, / [Aa][Nn][Dd] /))
+    who = substr(who, 1, RSTART - 1) " et al."
+  printf "%s\t%s\t%s\t%s\t%s\n", key, type, who, bib_get("year"), title
+}
diff --git a/lib/bib-lskeys.awk b/lib/bib-lskeys.awk
new file mode 100644
index 0000000..1932ced
--- /dev/null
+++ b/lib/bib-lskeys.awk
@@ -0,0 +1,9 @@
+# bib-lskeys.awk - print the key of every entry, one per line
+#
+# Requires bib-parse.awk.
+
+function bib_pass(raw) { }  # hook required by bib-parse.awk; @string/@preamble blocks have no key to print
+
+# Entry hook: the key is the only output; the entry type and its fields
+# are ignored.
+function bib_entry(type, key) { print key }
diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk
new file mode 100644
index 0000000..e5bf9fa
--- /dev/null
+++ b/lib/bib-parse.awk
@@ -0,0 +1,216 @@
+# bib-parse.awk - shared bibtex parsing library for bibutils
+#
+# Consumers must define two hook functions:
+#   bib_entry(type, key) - called once per regular entry. The fields are
+#                          available in BIB_N, BIB_NAME[], BIB_VAL[] and
+#                          BIB_KIND[]; the raw source text of the entry
+#                          is in BIB_RAW.
+#   bib_pass(raw)        - called for @string and @preamble blocks with
+#                          their raw source text.
+#
+# BIB_KIND[j] is "s" for ordinary string values (content stored without
+# delimiters; re-wrap in braces on output), "n" for bare numbers, and
+# "r" for raw values (macros, # concatenation) which should be emitted
+# verbatim.
+
+{ bib_buf = bib_buf $0 "\n" }  # slurp: append every input record plus a newline to one buffer
+
+END { bib_main(bib_buf) }      # parse the whole buffer once all input has been read
+
+# Driver: walk the buffer one character at a time looking for "@". Each
+# "@" is handed to bib_entry_at(), and scanning resumes at the index it
+# returns; every other character is skipped, so text between entries is
+# ignored.
+function bib_main(s,    pos, len) {
+  len = length(s)
+  for (pos = 1; pos <= len; )
+    pos = (substr(s, pos, 1) == "@") ? bib_entry_at(s, pos) : pos + 1
+}
+
+function bib_ws(s, i) {  # first index >= i that is not whitespace (may be past the end of s)
+  for (; i <= length(s); i++)
+    if (substr(s, i, 1) !~ /[ \t\r\n]/) break
+  return i
+}
+
+# Return t without leading and trailing blanks, tabs, CRs and newlines.
+function bib_trim(t) {
+  gsub(/^[ \t\r\n]+|[ \t\r\n]+$/, "", t)
+  return t
+}
+
+# Scan the balanced {...} group whose opening brace is at i. The text
+# between the outermost braces (nested braces included) is left in
+# BIB_PIECE and the index just past the closing brace is returned. If
+# the input ends before the group closes, scanning stops at the end of
+# the input.
+function bib_braced(s, i,    nest, first, ch, len) {
+  first = i
+  nest = 0
+  len = length(s)
+  while (i <= len) {
+    ch = substr(s, i++, 1)
+    if (ch == "{")
+      nest++
+    else if (ch == "}" && --nest == 0)
+      break
+  }
+  BIB_PIECE = substr(s, first + 1, i - first - 2)
+  return i
+}
+
+# Scan the "..." group whose opening quote is at i. A quote only closes
+# the group at brace depth 0, so braces protect embedded quotes. The
+# text between the quotes is left in BIB_PIECE and the index just past
+# the closing quote is returned.
+function bib_quoted(s, i,    nest, first, ch, len) {
+  first = i++
+  nest = 0
+  len = length(s)
+  while (i <= len) {
+    ch = substr(s, i++, 1)
+    if (ch == "\"" && nest == 0)
+      break
+    if (ch == "{")
+      nest++
+    else if (ch == "}")
+      nest--
+  }
+  BIB_PIECE = substr(s, first + 1, i - first - 2)
+  return i
+}
+
+# Skip the balanced op...cl group that starts at i (s[i] must be op) and
+# return the index just past its matching closer. Nested op/cl pairs are
+# counted; if the input ends first, the index past the end of s is
+# returned.
+function bib_skip_group(s, i, op, cl,    nest, ch, len) {
+  nest = 0
+  len = length(s)
+  while (i <= len) {
+    ch = substr(s, i++, 1)
+    if (ch == op)
+      nest++
+    else if (ch == cl && --nest == 0)
+      break
+  }
+  return i
+}
+
+# field value at i, handling # concatenation; sets BIB_VALUE and BIB_VKIND
+# ("s", "n" or "r"), returns the index past the value and trailing whitespace
+function bib_value(s, i,    start, c, piece, pieces, kind) {
+  start = i
+  pieces = 0
+  kind = ""
+  BIB_VALUE = ""
+  while (1) {                    # one iteration per #-separated piece
+    c = substr(s, i, 1)
+    if (c == "{") {
+      i = bib_braced(s, i)
+      BIB_VALUE = BIB_VALUE BIB_PIECE
+      if (kind == "")
+        kind = "s"
+    } else if (c == "\"") {
+      i = bib_quoted(s, i)
+      BIB_VALUE = BIB_VALUE BIB_PIECE
+      if (kind == "")
+        kind = "s"
+    } else {                     # bare token: runs up to a delimiter or whitespace
+      piece = ""
+      while (i <= length(s) && substr(s, i, 1) !~ /[,#}) \t\r\n]/) {
+        piece = piece substr(s, i, 1)
+        i++
+      }
+      BIB_VALUE = BIB_VALUE piece
+      kind = (piece ~ /^[0-9]+$/) ? "n" : "r"   # all digits: number, else raw
+    }
+    pieces++
+    i = bib_ws(s, i)
+    if (substr(s, i, 1) == "#")  # concatenation: another piece follows
+      i = bib_ws(s, i + 1)
+    else
+      break
+  }
+  if (pieces > 1)                # any concatenation makes the whole value raw
+    kind = "r"
+  if (kind == "r")               # raw values are the trimmed source text, delimiters included
+    BIB_VALUE = bib_trim(substr(s, start, i - start))
+  BIB_VKIND = kind
+  return i
+}
+
+# parse the construct whose "@" is at i (entry, @comment, @string, @preamble); returns the index past it
+function bib_entry_at(s, i,    at, type, opener, closer, key, name, c) {
+  at = i
+  i++
+  type = ""
+  while (i <= length(s) && substr(s, i, 1) ~ /[A-Za-z]/) {   # type name: letters only
+    type = type substr(s, i, 1)
+    i++
+  }
+  type = tolower(type)
+  i = bib_ws(s, i)
+  c = substr(s, i, 1)
+  if (c == "{") {                  # body may be delimited by {...} or (...)
+    opener = "{"
+    closer = "}"
+  } else if (c == "(") {
+    opener = "("
+    closer = ")"
+  } else
+    return i                       # stray @, not an entry
+
+  if (type == "comment")           # skipped entirely, no hook is called
+    return bib_skip_group(s, i, opener, closer)
+  if (type == "string" || type == "preamble") {
+    i = bib_skip_group(s, i, opener, closer)
+    bib_pass(bib_trim(substr(s, at, i - at)))   # raw text from "@" through the closer
+    return i
+  }
+
+  i++                              # consume opener
+  i = bib_ws(s, i)
+  key = ""
+  while (i <= length(s) && substr(s, i, 1) !~ /[, \t\r\n})]/) {   # key ends at comma, whitespace or a closer
+    key = key substr(s, i, 1)
+    i++
+  }
+  i = bib_ws(s, i)
+  if (substr(s, i, 1) == ",")
+    i++
+
+  BIB_N = 0                        # arrays are not cleared; only indexes 1..BIB_N are valid
+  while (1) {                      # one iteration per field, separator or resync step
+    i = bib_ws(s, i)
+    c = substr(s, i, 1)
+    if (c == "" || c == closer) {  # end of input or end of entry
+      if (c == closer)
+        i++
+      break
+    }
+    if (c == ",") {
+      i++
+      continue
+    }
+    name = ""
+    while (i <= length(s) && substr(s, i, 1) !~ /[=, \t\r\n})]/) {
+      name = name substr(s, i, 1)
+      i++
+    }
+    i = bib_ws(s, i)
+    if (substr(s, i, 1) != "=") {  # malformed; skip a char and resync
+      i++
+      continue
+    }
+    i = bib_ws(s, i + 1)
+    i = bib_value(s, i)
+    BIB_N++
+    BIB_NAME[BIB_N] = tolower(name)   # field names are stored lowercase
+    BIB_VAL[BIB_N] = BIB_VALUE
+    BIB_KIND[BIB_N] = BIB_VKIND
+  }
+  BIB_RAW = bib_trim(substr(s, at, i - at))
+  bib_entry(type, key)             # consumer hook; type is lowercase, key is as written
+  return i
+}
diff --git a/lib/bib-select.awk b/lib/bib-select.awk
new file mode 100644
index 0000000..1900390
--- /dev/null
+++ b/lib/bib-select.awk
@@ -0,0 +1,29 @@
+# bib-select.awk - emit entries selected by key, canonically
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+#   keys   - comma-separated list of entry keys
+#   invert - 0: emit entries whose key is in the list
+#            1: emit entries whose key is NOT in the list
+#
+# With keys="" and invert=1 this acts as a canonicalizing filter for
+# everything. @string and @preamble blocks always pass through.
+
+BEGIN {  # build the BIB_SEL lookup set from the comma-separated `keys` list
+  gsub(/[ \t\r\n]+/, "", keys)  # accept "a, b": parsed keys never contain whitespace
+  bib_sel_n = split(keys, bib_sel_k, ",")
+  for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++) BIB_SEL[bib_sel_k[bib_sel_i]] = 1
+}
+
+# Pass-through hook for @string/@preamble: print the raw block unchanged.
+function bib_pass(raw) {
+  if (bib_out_n++) raw = "\n" raw   # blank separator line once output has begun
+  print raw
+}
+
+# Entry hook: emit the entry when its list membership differs from the
+# numeric value of `invert`, with a blank line between output blocks.
+function bib_entry(type, key) {
+  if ((key in BIB_SEL) == invert + 0) return
+  if (bib_out_n++) print ""
+  bib_emit(type, key)
+}
diff --git a/lib/bib2ref.awk b/lib/bib2ref.awk
new file mode 100644
index 0000000..4d9e595
--- /dev/null
+++ b/lib/bib2ref.awk
@@ -0,0 +1,52 @@
+# bib2ref.awk - convert bibtex entries to refer records
+#
+# Requires bib-parse.awk and bib-canon.awk.
+
+function bib_pass(raw) { }  # hook required by bib-parse.awk; @string/@preamble blocks produce no refer output
+
+# Print "%<tag> <value>" with braces stripped and whitespace collapsed; "" prints nothing.
+function r_field(tag, v) {
+  if (v == "") return
+  gsub(/[{}]/, "", v)
+  gsub(/[ \t\r\n]+/, " ", v)
+  printf "%%%s %s\n", tag, bib_trim(v)
+}
+
+# Print one "%<tag> <name>" line per non-empty name in an "A and B and ..." list.
+function r_names(tag, v,    count, who, k, name) {
+  gsub(/[{}]/, "", v)
+  gsub(/[ \t\r\n]+/, " ", v)
+  count = split(v, who, / +[Aa][Nn][Dd] +/)
+  for (k = 1; k <= count; k++)
+    if ((name = bib_trim(who[k])) != "") printf "%%%s %s\n", tag, name
+}
+
+# Conversion hook: print the current entry as one refer record (blank line between records).
+function bib_entry(type, key,    when, pp, mon, pub) {
+  if (bib_out_n++) print ""
+  r_names("A", bib_get("author"))
+  r_names("E", bib_get("editor"))
+  r_field("T", bib_get("title"))
+  r_field("J", bib_get("journal"))
+  r_field("B", bib_get("booktitle"))
+  # %D is "month year", or whichever of the two is present
+  when = bib_get("year")
+  mon = bib_get("month")
+  if (mon != "")
+    when = (when == "") ? mon : mon " " when
+  r_field("D", when)
+  r_field("V", bib_get("volume"))
+  r_field("N", bib_get("number"))
+  pp = bib_get("pages")
+  gsub(/--/, "-", pp)
+  r_field("P", pp)
+  # %I is the first non-empty of publisher, institution, school
+  pub = bib_get("publisher")
+  if (pub == "") pub = bib_get("institution")
+  if (pub == "") pub = bib_get("school")
+  r_field("I", pub)
+  r_field("C", bib_get("address"))
+  r_field("K", bib_get("keywords"))
+  r_field("X", bib_get("abstract"))
+  r_field("O", bib_get("note"))
+}
diff --git a/lib/ref2bib.awk b/lib/ref2bib.awk
new file mode 100644
index 0000000..422fdd7
--- /dev/null
+++ b/lib/ref2bib.awk
@@ -0,0 +1,107 @@
+# ref2bib.awk - convert refer records to bibtex entries
+#
+# Standalone (does not use bib-parse.awk). Records are separated by
+# blank lines. Output keys are FIXME; pipe through bib-key.
+
+# An empty RS selects paragraph mode, so each blank-line-separated refer
+# record is one awk record; FS = "\n" then makes every line of the
+# record its own field.
+BEGIN { RS = ""; FS = "\n" }
+
+# Return t without leading and trailing blanks, tabs and carriage returns.
+function r_trim(t) {
+  gsub(/^[ \t\r]+|[ \t\r]+$/, "", t)
+  return t
+}
+
+function r_emit(name, v) {  # one "  name = {v}," field line; empty v prints nothing
+  if (v == "") return
+  printf "  %s = {%s},\n", name, v
+}
+
+{  # runs once per refer record; each field ($i) is one line of the record
+  split("", val)                 # clear per-record state
+  na = ne = 0
+  split("", A); split("", E)
+  lasttag = ""
+  for (i = 1; i <= NF; i++) {
+    line = $i
+    if (substr(line, 1, 1) == "%") {
+      tag = substr(line, 2, 1)   # single-letter refer tag
+      v = r_trim(substr(line, 3))
+      if (tag == "A")
+        A[++na] = v
+      else if (tag == "E")
+        E[++ne] = v
+      else
+        val[tag] = v             # a repeated tag keeps only its last value
+      lasttag = tag
+    } else if (lasttag == "A")   # continuation line: append to the previous tag's value
+      A[na] = A[na] " " r_trim(line)
+    else if (lasttag == "E")
+      E[ne] = E[ne] " " r_trim(line)
+    else if (lasttag != "")
+      val[lasttag] = val[lasttag] " " r_trim(line)
+  }
+  if (na == 0 && ne == 0 && !("T" in val))   # no author, editor or title: skip the record
+    next
+
+  # guess an entry type from the fields present
+  if ("J" in val)
+    type = "article"
+  else if ("B" in val)
+    type = (val["B"] ~ /[Pp]roceedings|[Cc]onference|[Ss]ymposium|[Ww]orkshop/) \
+        ? "inproceedings" : "incollection"
+  else if ("R" in val)
+    type = "techreport"
+  else if ("I" in val)
+    type = "book"
+  else
+    type = "misc"
+
+  if (out_n++)
+    print ""
+  printf "@%s{FIXME,\n", type
+
+  authors = ""
+  for (i = 1; i <= na; i++)
+    authors = (i == 1) ? A[i] : authors " and " A[i]
+  r_emit("author", authors)
+  editors = ""
+  for (i = 1; i <= ne; i++)
+    editors = (i == 1) ? E[i] : editors " and " E[i]
+  r_emit("editor", editors)
+
+  r_emit("title", val["T"])
+  r_emit("journal", val["J"])
+  r_emit("booktitle", val["B"])
+
+  d = val["D"]
+  if (match(d, /[0-9][0-9][0-9][0-9]/)) {   # first four-digit run is the year, the rest the month
+    r_emit("year", substr(d, RSTART, 4))
+    m = r_trim(substr(d, 1, RSTART - 1) substr(d, RSTART + 4))
+    if (m != "")
+      r_emit("month", m)
+  } else
+    r_emit("year", d)
+
+  r_emit("volume", val["V"])
+  r_emit("number", val["N"])
+  p = val["P"]
+  gsub(/-+/, "--", p)
+  r_emit("pages", p)
+  r_emit(type == "techreport" ? "institution" : "publisher", val["I"])
+  r_emit("address", val["C"])
+  if ("R" in val) {
+    # reading val["N"] above created that element, so test its value
+    # rather than using ("N" in val), which would always be true here
+    if (val["N"] != "")
+      r_emit("note", val["R"])   # number is already taken by %N
+    else
+      r_emit("number", val["R"])
+  }
+  r_emit("keywords", val["K"])
+  r_emit("abstract", val["X"])
+  r_emit("note", val["O"])
+  print "}"
+}