aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/bib-canon.awk28
-rw-r--r--lib/bib-check.awk69
-rw-r--r--lib/bib-key.awk69
-rw-r--r--lib/bib-ls.awk25
-rw-r--r--lib/bib-lskeys.awk9
-rw-r--r--lib/bib-parse.awk216
-rw-r--r--lib/bib-select.awk29
-rw-r--r--lib/bib2ref.awk52
-rw-r--r--lib/ref2bib.awk107
9 files changed, 604 insertions, 0 deletions
diff --git a/lib/bib-canon.awk b/lib/bib-canon.awk
new file mode 100644
index 0000000..d11e9cb
--- /dev/null
+++ b/lib/bib-canon.awk
@@ -0,0 +1,28 @@
+# bib-canon.awk - canonical output helpers for bibutils
+#
+# Requires bib-parse.awk. Provides bib_emit() to print the current
+# entry in canonical form, and bib_get() to look up a field value.
+
+# bib_emit(type, key) - write the current entry to stdout in canonical form.
+#
+# Reads the field arrays BIB_N, BIB_NAME[], BIB_VAL[] and BIB_KIND[].
+# Fields of kind "s" have runs of whitespace collapsed to one space, are
+# trimmed, and are wrapped in braces; every other kind is printed as
+# stored. type and the field names are printed exactly as given.
+# idx and text are locals.
+function bib_emit(type, key,    idx, text) {
+    printf "@%s{%s,\n", type, key
+    idx = 0
+    while (++idx <= BIB_N) {
+        text = BIB_VAL[idx]
+        if (BIB_KIND[idx] != "s") {
+            # numbers and raw values go out verbatim
+            printf " %s = %s,\n", BIB_NAME[idx], text
+            continue
+        }
+        gsub(/[ \t\r\n]+/, " ", text)
+        printf " %s = {%s},\n", BIB_NAME[idx], bib_trim(text)
+    }
+    print "}"
+}
+
+# bib_get(name) - look up a field of the current entry.
+#
+# Returns BIB_VAL[] of the first field whose BIB_NAME[] equals `name`
+# (compared exactly, so pass the name in lowercase), or "" when no field
+# in 1..BIB_N matches. idx is a local.
+function bib_get(name,    idx) {
+    idx = 1
+    while (idx <= BIB_N) {
+        if (BIB_NAME[idx] == name)
+            return BIB_VAL[idx]
+        idx++
+    }
+    return ""
+}
diff --git a/lib/bib-check.awk b/lib/bib-check.awk
new file mode 100644
index 0000000..4411a55
--- /dev/null
+++ b/lib/bib-check.awk
@@ -0,0 +1,69 @@
+# bib-check.awk - lint a bibtex database
+#
+# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
+# line on stdout:
+# - missing fields required by the entry type
+# - duplicate keys
+# - entries whose titles normalize to the same string (likely dups)
+# - empty field values
+# Exits 1 if any problem was found.
+
+# Required fields per entry type, following the standard BibTeX entry
+# definitions. Each value is a space-separated list of requirements; a
+# requirement written "a|b" is satisfied by any one of its alternatives.
+# Types not listed here (e.g. misc) have no required fields.
+BEGIN {
+    REQ["article"] = "author title journal year"
+    REQ["book"] = "author|editor title publisher year"
+    REQ["booklet"] = "title"
+    # inbook additionally needs a chapter and/or a page range
+    REQ["inbook"] = "author|editor title chapter|pages publisher year"
+    REQ["incollection"] = "author title booktitle publisher year"
+    REQ["inproceedings"] = "author title booktitle year"
+    REQ["conference"] = "author title booktitle year"
+    REQ["manual"] = "title"
+    REQ["mastersthesis"] = "author title school year"
+    REQ["phdthesis"] = "author title school year"
+    REQ["proceedings"] = "title year"
+    REQ["techreport"] = "author title institution year"
+    REQ["unpublished"] = "author title note"
+}
+
+# Parser hook for @string and @preamble blocks: the linter ignores them.
+function bib_pass(raw) {
+}
+
+# problem(key, msg) - report one finding as "<key>: <msg>" on stdout and
+# set BIB_BAD, which the END rule uses as the exit status.
+function problem(key, msg) {
+    BIB_BAD = 1
+    printf("%s: %s\n", key, msg)
+}
+
+# Parser hook, called once per regular entry: runs every lint check on the
+# current entry and reports findings through problem().
+#
+# Checks, in order: duplicate key, required fields for `type` (from REQ),
+# empty field values, and titles that normalize to the same string as an
+# earlier entry. Everything after `key` in the parameter list is a local.
+function bib_entry(type, key,    nreq, req, r, alts, nalt, a, ok, norm, k) {
+    # duplicate keys
+    if (key in BIB_KEYS_SEEN)
+        problem(key, "duplicate key")
+    BIB_KEYS_SEEN[key] = 1
+
+    # required fields; "a|b" is satisfied by either a or b
+    if (type in REQ) {
+        nreq = split(REQ[type], req, " ")
+        r = 0
+        while (++r <= nreq) {
+            ok = 0
+            nalt = split(req[r], alts, "|")
+            for (a = 1; a <= nalt; a++) {
+                if (bib_get(alts[a]) != "")
+                    ok = 1
+            }
+            if (ok == 0)
+                problem(key, "missing required field: " req[r])
+        }
+    }
+
+    # fields that are present but blank
+    for (r = 1; r <= BIB_N; r++) {
+        if (bib_trim(BIB_VAL[r]) == "")
+            problem(key, "empty field: " BIB_NAME[r])
+    }
+
+    # likely duplicates: compare titles reduced to lowercase letters/digits
+    norm = tolower(bib_get("title"))
+    gsub(/[^a-z0-9]/, "", norm)
+    if (norm == "")
+        return
+    if (!(norm in BIB_TITLES_SEEN)) {
+        # first entry with this title; remember who owns it
+        BIB_TITLES_SEEN[norm] = key
+        return
+    }
+    problem(key, "title duplicates " BIB_TITLES_SEEN[norm])
+}
+
+# Exit status: BIB_BAD is set to 1 by problem() and is otherwise unset.
+END {
+    exit BIB_BAD
+}
diff --git a/lib/bib-key.awk b/lib/bib-key.awk
new file mode 100644
index 0000000..41534ba
--- /dev/null
+++ b/lib/bib-key.awk
@@ -0,0 +1,69 @@
+# bib-key.awk - rekey every entry with a generated citation key
+#
+# Requires bib-parse.awk and bib-canon.awk. Keys have the form
+# <surname><year><word>, e.g. knuth1984literate.
+
+# Parser hook for @string and @preamble blocks: copy the raw text through.
+# bib_out_n counts blocks written so far; a blank line separates each
+# block from the one before it.
+function bib_pass(raw) {
+    if (bib_out_n > 0)
+        print ""
+    bib_out_n++
+    print raw
+}
+
+# Suffix for the m-th collision of a generated key (m >= 1): b, c, ..., z,
+# then ba, bb, ... The unsuffixed key plays the role of "a", so this is
+# simply m written in base 26 with the digits a-z.
+function bib_keysuffix(m,    out) {
+    out = ""
+    while (m > 0) {
+        out = substr("abcdefghijklmnopqrstuvwxyz", m % 26 + 1, 1) out
+        m = int(m / 26)
+    }
+    return out
+}
+
+# Parser hook, called once per regular entry: re-emit the entry canonically
+# under a generated key. The original `key` is discarded.
+#
+# BIB_KEYS_SEEN[base] counts how many entries have produced the generated
+# key `base`. Every key actually emitted is also recorded there, so a
+# suffixed key can never clash with a later entry whose generated key
+# happens to be the same string. base, k and n are locals.
+function bib_entry(type, key,    k, base, n) {
+    if (bib_out_n++)
+        print ""
+    base = bib_mkkey()
+    k = base
+    if (base in BIB_KEYS_SEEN) {
+        # disambiguate collisions with b, c, ... suffixes; keep going past
+        # z and skip any candidate that is itself already taken
+        do {
+            n = ++BIB_KEYS_SEEN[base]
+            k = base bib_keysuffix(n - 1)
+        } while (k in BIB_KEYS_SEEN)
+    }
+    BIB_KEYS_SEEN[k] = 1
+    bib_emit(type, k)
+}
+
+# bib_mkkey() - build a citation key <surname><year><word> for the current
+# entry, e.g. knuth1984literate.
+#
+#   surname - family name of the first author (or first editor when there
+#             is no author), lowercased and reduced to letters and digits;
+#             "anon" when nothing usable remains
+#   year    - first run of four digits in the year field, "" if none
+#   word    - first word of the title that is not a common stop word,
+#             "" if none
+#
+# All parameters are locals.
+function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
+    a = bib_get("author")
+    if (a == "")
+        a = bib_get("editor")
+    y = bib_get("year")
+    t = bib_get("title")
+
+    # surname of the first author. Field values keep their source line
+    # breaks, so collapse whitespace first: otherwise "X and\nY" is not
+    # recognised as a name list and "and\nY" ends up in the surname.
+    gsub(/[ \t\r\n]+/, " ", a)
+    if (match(a, / [Aa][Nn][Dd] /))
+        a = substr(a, 1, RSTART - 1)
+    gsub(/[{}]/, "", a)
+    a = bib_trim(a)
+    if (index(a, ",") > 0)
+        # "Last, First" form
+        surname = substr(a, 1, index(a, ",") - 1)
+    else {
+        # "First Last" form: take the last word
+        n = split(a, parts, /[ \t]+/)
+        surname = (n > 0) ? parts[n] : ""
+    }
+    gsub(/[^A-Za-z0-9]/, "", surname)
+    surname = tolower(surname)
+    if (surname == "")
+        surname = "anon"
+
+    # four-digit year
+    if (match(y, /[0-9][0-9][0-9][0-9]/))
+        y = substr(y, RSTART, 4)
+    else
+        y = ""
+
+    # first significant word of the title
+    gsub(/[{}]/, "", t)
+    word = ""
+    n = split(tolower(t), parts, /[^a-z0-9]+/)
+    for (i = 1; i <= n; i++) {
+        w = parts[i]
+        if (w == "" || w == "a" || w == "an" || w == "the" || w == "on" ||
+            w == "of" || w == "in" || w == "for" || w == "and" || w == "to" ||
+            w == "with" || w == "from" || w == "by" || w == "at" || w == "is")
+            continue
+        word = w
+        break
+    }
+
+    return surname y word
+}
diff --git a/lib/bib-ls.awk b/lib/bib-ls.awk
new file mode 100644
index 0000000..909b654
--- /dev/null
+++ b/lib/bib-ls.awk
@@ -0,0 +1,25 @@
+# bib-ls.awk - list database entries
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+# long - 0: print one key per line
+# 1: print key, type, author, year and title, tab-separated
+
+# Parser hook for @string and @preamble blocks: they are not listed.
+function bib_pass(raw) {
+}
+
+# Parser hook, called once per regular entry: print one listing line.
+#
+# With long == 0 only the key is printed. Otherwise the line holds key,
+# type, author, year and title separated by tabs. The author column falls
+# back to the editor field, and a name list is cut to the first name plus
+# " et al.". Braces are removed and whitespace runs collapsed in the
+# author and title columns. a and t are locals.
+function bib_entry(type, key,    a, t) {
+    if (long + 0 != 0) {
+        a = bib_get("author")
+        if (a == "")
+            a = bib_get("editor")
+        gsub(/[{}]/, "", a)
+        gsub(/[ \t\r\n]+/, " ", a)
+        if (match(a, / [Aa][Nn][Dd] /))
+            a = substr(a, 1, RSTART - 1) " et al."
+
+        t = bib_get("title")
+        gsub(/[{}]/, "", t)
+        gsub(/[ \t\r\n]+/, " ", t)
+
+        printf "%s\t%s\t%s\t%s\t%s\n", key, type, a, bib_get("year"), t
+        return
+    }
+    print key
+}
diff --git a/lib/bib-lskeys.awk b/lib/bib-lskeys.awk
new file mode 100644
index 0000000..1932ced
--- /dev/null
+++ b/lib/bib-lskeys.awk
@@ -0,0 +1,9 @@
+# bib-lskeys.awk - print the key of every entry, one per line
+#
+# Requires bib-parse.awk.
+
+# Parser hook for @string and @preamble blocks: they have no key to print.
+function bib_pass(raw) {
+}
+
+# Parser hook, called once per regular entry: print its key on its own line.
+function bib_entry(type, key)
+{
+    print key
+}
diff --git a/lib/bib-parse.awk b/lib/bib-parse.awk
new file mode 100644
index 0000000..e5bf9fa
--- /dev/null
+++ b/lib/bib-parse.awk
@@ -0,0 +1,216 @@
+# bib-parse.awk - shared bibtex parsing library for bibutils
+#
+# Consumers must define two hook functions:
+# bib_entry(type, key) - called once per regular entry. The fields are
+# available in BIB_N, BIB_NAME[], BIB_VAL[] and
+# BIB_KIND[]; the raw source text of the entry
+# is in BIB_RAW.
+# bib_pass(raw) - called for @string and @preamble blocks with
+# their raw source text.
+#
+# BIB_KIND[j] is "s" for ordinary string values (content stored without
+# delimiters; re-wrap in braces on output), "n" for bare numbers, and
+# "r" for raw values (macros, # concatenation) which should be emitted
+# verbatim.
+
+# Accumulate every input line (newline restored) in bib_buf; parsing is
+# deferred until all input has been read.
+{
+    bib_buf = bib_buf $0 "\n"
+}
+
+# Parse the accumulated text once input is exhausted.
+END {
+    bib_main(bib_buf)
+}
+
+# bib_main(s) - scan the whole text s, handing every "@" to bib_entry_at()
+# and skipping all other characters one at a time. bib_entry_at() returns
+# the index at which scanning resumes. i is a local.
+function bib_main(s,    i) {
+    for (i = 1; i <= length(s); )
+        i = (substr(s, i, 1) == "@") ? bib_entry_at(s, i) : i + 1
+}
+
+# bib_ws(s, i) - return the index of the first character at or after i
+# that is not a space, tab, CR or LF; length(s) + 1 if there is none.
+function bib_ws(s, i) {
+    for (; i <= length(s); i++)
+        if (substr(s, i, 1) !~ /[ \t\r\n]/)
+            break
+    return i
+}
+
+# bib_trim(t) - return t without leading and trailing spaces, tabs, CRs
+# and LFs. Interior whitespace is left alone.
+function bib_trim(t) {
+    gsub(/^[ \t\r\n]+|[ \t\r\n]+$/, "", t)
+    return t
+}
+
+# bib_braced(s, i) - read the balanced {...} group whose "{" is at i.
+#
+# Stores the text between the outer braces in BIB_PIECE (nested braces are
+# kept) and returns the index just past the closing brace. If the input
+# ends before the group is closed, BIB_PIECE holds everything after the
+# opening brace and length(s) + 1 is returned.
+# depth, start, c, len and closed are locals.
+function bib_braced(s, i,    depth, start, c, len, closed) {
+    start = i
+    len = length(s)
+    depth = 0
+    closed = 0
+    while (i <= len) {
+        c = substr(s, i, 1)
+        i++
+        if (c == "{")
+            depth++
+        else if (c == "}" && --depth == 0) {
+            closed = 1
+            break
+        }
+    }
+    # i is one past the last character consumed; leave out the closing
+    # brace only when one was actually found
+    BIB_PIECE = substr(s, start + 1, i - start - 1 - closed)
+    return i
+}
+
+# bib_quoted(s, i) - read the "..." group whose opening quote is at i.
+#
+# A double quote inside braces does not end the string. Stores the text
+# between the quotes in BIB_PIECE and returns the index just past the
+# closing quote. If the input ends first, BIB_PIECE holds everything after
+# the opening quote and length(s) + 1 is returned. An unmatched "}" is
+# treated as ordinary text rather than pushing the brace depth below zero,
+# which would stop the closing quote from ever being recognised.
+# depth, start, c, len and closed are locals.
+function bib_quoted(s, i,    depth, start, c, len, closed) {
+    start = i
+    len = length(s)
+    depth = 0
+    closed = 0
+    for (i++; i <= len; i++) {
+        c = substr(s, i, 1)
+        if (c == "{")
+            depth++
+        else if (c == "}") {
+            if (depth > 0)
+                depth--
+        } else if (c == "\"" && depth == 0) {
+            closed = 1
+            i++
+            break
+        }
+    }
+    # i is one past the last character consumed; leave out the closing
+    # quote only when one was actually found
+    BIB_PIECE = substr(s, start + 1, i - start - 1 - closed)
+    return i
+}
+
+# bib_skip_group(s, i, op, cl) - step over the balanced group that starts
+# with the character op at index i and ends with the matching cl.
+# Returns the index just past the closing character, or length(s) + 1 if
+# the group never closes. depth and c are locals.
+function bib_skip_group(s, i, op, cl,    depth, c) {
+    for (depth = 0; i <= length(s); ) {
+        c = substr(s, i++, 1)
+        if (c == op) {
+            depth++
+            continue
+        }
+        if (c == cl && --depth == 0)
+            break
+    }
+    return i
+}
+
+# bib_value(s, i) - read the field value that starts at index i.
+#
+# A value is one or more pieces joined by "#". A piece is a {...} group,
+# a "..." group, or a bare token running up to a comma, "#", "}", ")" or
+# whitespace.
+#
+# Results are left in two globals:
+#   BIB_VKIND - "s" for a single braced/quoted piece, "n" for a single
+#               all-digit bare token, "r" for anything else (any other
+#               bare token, or any "#" concatenation)
+#   BIB_VALUE - for "s" and "n" the piece content without delimiters; for
+#               "r" the trimmed source text of the whole value
+#
+# Returns the index of the first non-whitespace character after the value.
+# start, c, piece, pieces and kind are locals.
+function bib_value(s, i,    start, c, piece, pieces, kind) {
+    start = i
+    BIB_VALUE = ""
+    kind = ""
+    pieces = 0
+    do {
+        if (pieces > 0)
+            i = bib_ws(s, i + 1)    # step over the "#" that brought us back
+        c = substr(s, i, 1)
+        if (c == "{" || c == "\"") {
+            i = (c == "{") ? bib_braced(s, i) : bib_quoted(s, i)
+            BIB_VALUE = BIB_VALUE BIB_PIECE
+            if (kind == "")
+                kind = "s"
+        } else {
+            # bare token: a number or a macro name
+            piece = ""
+            for (; i <= length(s) && substr(s, i, 1) !~ /[,#}) \t\r\n]/; i++)
+                piece = piece substr(s, i, 1)
+            BIB_VALUE = BIB_VALUE piece
+            kind = (piece ~ /^[0-9]+$/) ? "n" : "r"
+        }
+        pieces++
+        i = bib_ws(s, i)
+    } while (substr(s, i, 1) == "#")
+
+    # any concatenation is raw; raw values keep their source spelling
+    BIB_VKIND = (pieces > 1) ? "r" : kind
+    if (BIB_VKIND == "r")
+        BIB_VALUE = bib_trim(substr(s, start, i - start))
+    return i
+}
+
+# parse the construct whose "@" is at i; returns the index past it
+#
+# Dispatch by the (lowercased) word after "@":
+#   comment           - the group is skipped, no hook is called
+#   string, preamble  - the raw text is handed to bib_pass()
+#   anything else     - parsed as a regular entry, then bib_entry(type, key)
+# For a regular entry the fields are left in BIB_N, BIB_NAME[] (lowercased),
+# BIB_VAL[] and BIB_KIND[], and the trimmed source text in BIB_RAW. Only
+# BIB_N is reset per entry; array slots above BIB_N may hold stale data.
+# Both {...} and (...) delimiters are accepted. Everything after `i` in the
+# parameter list is a local.
+function bib_entry_at(s, i, at, type, opener, closer, key, name, c) {
+ at = i
+ i++
+ type = ""
+ # the entry type is the run of ASCII letters after the "@"
+ while (i <= length(s) && substr(s, i, 1) ~ /[A-Za-z]/) {
+ type = type substr(s, i, 1)
+ i++
+ }
+ type = tolower(type)
+ i = bib_ws(s, i)
+ c = substr(s, i, 1)
+ if (c == "{") {
+ opener = "{"
+ closer = "}"
+ } else if (c == "(") {
+ opener = "("
+ closer = ")"
+ } else
+ return i # stray @, not an entry
+
+ if (type == "comment")
+ return bib_skip_group(s, i, opener, closer)
+ if (type == "string" || type == "preamble") {
+ i = bib_skip_group(s, i, opener, closer)
+ bib_pass(bib_trim(substr(s, at, i - at)))
+ return i
+ }
+
+ i++ # consume opener
+ i = bib_ws(s, i)
+ # the key runs up to a comma, whitespace or a closing delimiter
+ key = ""
+ while (i <= length(s) && substr(s, i, 1) !~ /[, \t\r\n})]/) {
+ key = key substr(s, i, 1)
+ i++
+ }
+ i = bib_ws(s, i)
+ if (substr(s, i, 1) == ",")
+ i++
+
+ BIB_N = 0
+ # field loop: ends at this entry's closer or at end of input
+ while (1) {
+ i = bib_ws(s, i)
+ c = substr(s, i, 1)
+ if (c == "" || c == closer) {
+ if (c == closer)
+ i++
+ break
+ }
+ if (c == ",") {
+ i++
+ continue
+ }
+ name = ""
+ while (i <= length(s) && substr(s, i, 1) !~ /[=, \t\r\n})]/) {
+ name = name substr(s, i, 1)
+ i++
+ }
+ i = bib_ws(s, i)
+ if (substr(s, i, 1) != "=") { # malformed; skip a char and resync
+ i++
+ continue
+ }
+ i = bib_ws(s, i + 1)
+ i = bib_value(s, i) # sets BIB_VALUE and BIB_VKIND
+ BIB_N++
+ BIB_NAME[BIB_N] = tolower(name)
+ BIB_VAL[BIB_N] = BIB_VALUE
+ BIB_KIND[BIB_N] = BIB_VKIND
+ }
+ BIB_RAW = bib_trim(substr(s, at, i - at))
+ bib_entry(type, key)
+ return i
+}
diff --git a/lib/bib-select.awk b/lib/bib-select.awk
new file mode 100644
index 0000000..1900390
--- /dev/null
+++ b/lib/bib-select.awk
@@ -0,0 +1,29 @@
+# bib-select.awk - emit entries selected by key, canonically
+#
+# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
+# keys - comma-separated list of entry keys
+# invert - 0: emit entries whose key is in the list
+# 1: emit entries whose key is NOT in the list
+#
+# With keys="" and invert=1 this acts as a canonicalizing filter for
+# everything. @string and @preamble blocks always pass through.
+
+# Turn the comma-separated `keys` variable into the lookup set BIB_SEL.
+# Items are used exactly as split, so "a, b" selects the key " b".
+BEGIN {
+    bib_sel_n = split(keys, bib_sel_k, ",")
+    bib_sel_i = 1
+    while (bib_sel_i <= bib_sel_n) {
+        BIB_SEL[bib_sel_k[bib_sel_i]] = 1
+        bib_sel_i++
+    }
+}
+
+# Parser hook for @string and @preamble blocks: always copied through.
+# bib_out_n counts blocks written so far; a blank line separates each
+# block from the one before it.
+function bib_pass(raw) {
+    if (bib_out_n > 0)
+        print ""
+    bib_out_n++
+    print raw
+}
+
+# Parser hook, called once per regular entry: emit the entry canonically
+# when its membership in BIB_SEL differs from the numeric value of
+# `invert` (listed and invert=0, or unlisted and invert=1).
+function bib_entry(type, key) {
+    if ((key in BIB_SEL) == invert + 0)
+        return
+    if (bib_out_n++)
+        print ""
+    bib_emit(type, key)
+}
diff --git a/lib/bib2ref.awk b/lib/bib2ref.awk
new file mode 100644
index 0000000..4d9e595
--- /dev/null
+++ b/lib/bib2ref.awk
@@ -0,0 +1,52 @@
+# bib2ref.awk - convert bibtex entries to refer records
+#
+# Requires bib-parse.awk and bib-canon.awk.
+
+# Parser hook for @string and @preamble blocks: nothing is output for them.
+function bib_pass(raw) {
+}
+
+# r_field(tag, v) - print one refer line "%<tag> <value>".
+#
+# Braces are removed from v, whitespace runs collapsed to one space and
+# the ends trimmed. Nothing is printed when the cleaned value is empty, so
+# a value such as "{}" or " " does not produce a bare "%<tag> " line.
+function r_field(tag, v) {
+    gsub(/[{}]/, "", v)
+    gsub(/[ \t\r\n]+/, " ", v)
+    v = bib_trim(v)
+    if (v != "")
+        printf "%%%s %s\n", tag, v
+}
+
+# r_names(tag, v) - print one "%<tag> <name>" line per name in the list v.
+#
+# Braces are removed and whitespace collapsed, then the list is split on
+# the word "and" (any case) surrounded by spaces. Blank names are skipped.
+# n, parts, i and who are locals.
+function r_names(tag, v,    n, parts, i, who) {
+    gsub(/[{}]/, "", v)
+    gsub(/[ \t\r\n]+/, " ", v)
+    n = split(v, parts, / +[Aa][Nn][Dd] +/)
+    for (i = 1; i <= n; i++) {
+        who = bib_trim(parts[i])
+        if (who == "")
+            continue
+        printf "%%%s %s\n", tag, who
+    }
+}
+
+# Parser hook, called once per regular entry: print it as a refer record.
+# Records are separated by a blank line.
+#
+# Field mapping:
+#   author -> %A (one per name)      editor -> %E (one per name)
+#   title -> %T   journal -> %J      booktitle -> %B
+#   month + year -> %D               volume -> %V   number -> %N
+#   pages -> %P ("--" becomes "-")
+#   publisher, else institution, else school -> %I
+#   address -> %C   keywords -> %K   abstract -> %X   note -> %O
+# d, p, m and pub are locals.
+function bib_entry(type, key,    d, p, m, pub) {
+    if (bib_out_n++)
+        print ""
+
+    r_names("A", bib_get("author"))
+    r_names("E", bib_get("editor"))
+    r_field("T", bib_get("title"))
+    r_field("J", bib_get("journal"))
+    r_field("B", bib_get("booktitle"))
+
+    # date: "month year", or whichever of the two is present
+    m = bib_get("month")
+    d = bib_get("year")
+    if (m != "" && d != "")
+        d = m " " d
+    else if (m != "")
+        d = m
+    r_field("D", d)
+
+    r_field("V", bib_get("volume"))
+    r_field("N", bib_get("number"))
+
+    p = bib_get("pages")
+    gsub(/--/, "-", p)
+    r_field("P", p)
+
+    # issuer: first non-empty of publisher, institution, school
+    pub = bib_get("publisher")
+    if (pub == "")
+        pub = bib_get("institution")
+    if (pub == "")
+        pub = bib_get("school")
+    r_field("I", pub)
+
+    r_field("C", bib_get("address"))
+    r_field("K", bib_get("keywords"))
+    r_field("X", bib_get("abstract"))
+    r_field("O", bib_get("note"))
+}
diff --git a/lib/ref2bib.awk b/lib/ref2bib.awk
new file mode 100644
index 0000000..422fdd7
--- /dev/null
+++ b/lib/ref2bib.awk
@@ -0,0 +1,107 @@
+# ref2bib.awk - convert refer records to bibtex entries
+#
+# Standalone (does not use bib-parse.awk). Records are separated by
+# blank lines. Output keys are FIXME; pipe through bib-key.
+
+# Paragraph mode: an empty RS makes blank lines the record separator, and
+# with FS set to newline each line of a record becomes one field.
+BEGIN {
+    FS = "\n"
+    RS = ""
+}
+
+# r_trim(t) - return t without leading and trailing spaces, tabs and CRs.
+function r_trim(t) {
+    gsub(/^[ \t\r]+|[ \t\r]+$/, "", t)
+    return t
+}
+
+# r_emit(name, v) - print the bibtex field line "name = {v}," on stdout;
+# prints nothing when v is empty.
+function r_emit(name, v) {
+    if (v == "")
+        return
+    printf " %s = {%s},\n", name, v
+}
+
+# Main rule, run once per refer record (one field per input line).
+#
+# Collects the %-tagged lines of the record, guesses a bibtex entry type
+# from the tags present and prints the entry with the placeholder key
+# FIXME. %A and %E may repeat and are joined with " and "; for any other
+# tag the last occurrence wins. A line that does not start with "%" is a
+# continuation of the previous tag. Records with no author, editor or
+# title are dropped.
+{
+    # reset per-record state
+    split("", val)
+    split("", A)
+    split("", E)
+    na = 0
+    ne = 0
+    lasttag = ""
+
+    for (i = 1; i <= NF; i++) {
+        line = $i
+        if (substr(line, 1, 1) == "%") {
+            tag = substr(line, 2, 1)
+            v = r_trim(substr(line, 3))
+            if (tag == "A")
+                A[++na] = v
+            else if (tag == "E")
+                E[++ne] = v
+            else
+                val[tag] = v
+            lasttag = tag
+        } else if (lasttag == "A")
+            A[na] = A[na] " " r_trim(line)
+        else if (lasttag == "E")
+            E[ne] = E[ne] " " r_trim(line)
+        else if (lasttag != "")
+            val[lasttag] = val[lasttag] " " r_trim(line)
+    }
+    if (na == 0 && ne == 0 && !("T" in val))
+        next
+
+    # guess an entry type from the fields present
+    if ("J" in val)
+        type = "article"
+    else if ("B" in val)
+        type = (val["B"] ~ /[Pp]roceedings|[Cc]onference|[Ss]ymposium|[Ww]orkshop/) \
+            ? "inproceedings" : "incollection"
+    else if ("R" in val)
+        type = "techreport"
+    else if ("I" in val)
+        type = "book"
+    else
+        type = "misc"
+
+    if (out_n++)
+        print ""
+    printf "@%s{FIXME,\n", type
+
+    authors = ""
+    for (i = 1; i <= na; i++)
+        authors = (i == 1) ? A[i] : authors " and " A[i]
+    r_emit("author", authors)
+    editors = ""
+    for (i = 1; i <= ne; i++)
+        editors = (i == 1) ? E[i] : editors " and " E[i]
+    r_emit("editor", editors)
+
+    r_emit("title", val["T"])
+    r_emit("journal", val["J"])
+    r_emit("booktitle", val["B"])
+
+    # %D: the first four-digit run is the year; whatever surrounds it is
+    # taken as the month
+    d = val["D"]
+    if (match(d, /[0-9][0-9][0-9][0-9]/)) {
+        r_emit("year", substr(d, RSTART, 4))
+        m = r_trim(substr(d, 1, RSTART - 1) substr(d, RSTART + 4))
+        if (m != "")
+            r_emit("month", m)
+    } else
+        r_emit("year", d)
+
+    r_emit("volume", val["V"])
+    r_emit("number", val["N"])
+    p = val["P"]
+    gsub(/-+/, "--", p)
+    r_emit("pages", p)
+    r_emit(type == "techreport" ? "institution" : "publisher", val["I"])
+    r_emit("address", val["C"])
+
+    # %R (report number) becomes `number` unless %N already supplied one,
+    # in which case it is kept as a note. The test is on the value rather
+    # than ("N" in val): the val["N"] reference above has already created
+    # that element, so a membership test here would always succeed.
+    if ("R" in val) {
+        if (val["N"] != "")
+            r_emit("note", val["R"])
+        else
+            r_emit("number", val["R"])
+    }
+    r_emit("keywords", val["K"])
+    r_emit("abstract", val["X"])
+    r_emit("note", val["O"])
+    print "}"
+}