diff options
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/bib-canon.awk | 28 | ||||
| -rw-r--r-- | lib/bib-check.awk | 69 | ||||
| -rw-r--r-- | lib/bib-key.awk | 69 | ||||
| -rw-r--r-- | lib/bib-ls.awk | 25 | ||||
| -rw-r--r-- | lib/bib-lskeys.awk | 9 | ||||
| -rw-r--r-- | lib/bib-parse.awk | 216 | ||||
| -rw-r--r-- | lib/bib-select.awk | 29 | ||||
| -rw-r--r-- | lib/bib2ref.awk | 52 | ||||
| -rw-r--r-- | lib/ref2bib.awk | 107 |
9 files changed, 604 insertions, 0 deletions
# ==== lib/bib-canon.awk (new file) ====
# bib-canon.awk - canonical output helpers for bibutils
#
# Requires bib-parse.awk. Provides bib_emit() to print the current
# entry in canonical form, and bib_get() to look up a field value.

# bib_emit(type, key) - print the current entry in canonical form.
#
#   type, key - written verbatim into the "@type{key," header line
#   j, v      - locals
#
# Reads the fields of the current entry from BIB_N, BIB_NAME[],
# BIB_VAL[] and BIB_KIND[].  Each field is printed on its own line with
# a 2-space indent.  Values of kind "s" have every run of whitespace
# collapsed to one space, are trimmed with bib_trim(), and are wrapped
# in braces; values of any other kind are printed exactly as stored.
# The entry is closed with a line containing only "}".
function bib_emit(type, key,    j, v) {
    printf "@%s{%s,\n", type, key
    for (j = 1; j <= BIB_N; j++) {
        v = BIB_VAL[j]
        if (BIB_KIND[j] == "s") {
            gsub(/[ \t\r\n]+/, " ", v)
            v = bib_trim(v)
            # two spaces: the canonical form is documented as 2-space indent
            printf "  %s = {%s},\n", BIB_NAME[j], v
        } else
            printf "  %s = %s,\n", BIB_NAME[j], v
    }
    print "}"
}

# bib_get(name) - value of field `name` in the current entry.
#
#   name - field name, compared with == against BIB_NAME[] (so it must
#          be given in the same case the names are stored in)
#   j    - local
#
# Returns BIB_VAL[] of the first matching field, or "" if no field of
# that name exists.  An absent field and a field whose value is ""
# are therefore indistinguishable to the caller.
function bib_get(name,    j) {
    for (j = 1; j <= BIB_N; j++)
        if (BIB_NAME[j] == name)
            return BIB_VAL[j]
    return ""
}

# ==== lib/bib-check.awk (new file) ====
# bib-check.awk - lint a bibtex database
#
# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
# line on stdout:
#   - missing fields required by the entry type
#   - duplicate keys
#   - entries whose titles normalize to the same string (likely dups)
#   - empty field values
# Exits 1 if any problem was found.

# Required fields per entry type.  Each value is a space-separated list
# of requirements; a requirement written "a|b" is satisfied by either
# field.  Entry types with no row here get no required-field check.
BEGIN {
    REQ["article"] = "author title journal year"
    REQ["book"] = "author|editor title publisher year"
    REQ["booklet"] = "title"
    REQ["inbook"] = "author|editor title publisher year"
    REQ["incollection"] = "author title booktitle publisher year"
    REQ["inproceedings"] = "author title booktitle year"
    REQ["conference"] = "author title booktitle year"
    REQ["manual"] = "title"
    REQ["mastersthesis"] = "author title school year"
    REQ["phdthesis"] = "author title school year"
    REQ["proceedings"] = "title year"
    REQ["techreport"] = "author title institution year"
    REQ["unpublished"] = "author title note"
}

# Parser hook for @string/@preamble blocks: nothing to lint, so ignore.
function bib_pass(raw) { }

# Print one "key: message" line and remember that a problem was seen,
# so that the END rule exits with a non-zero status.
function problem(key, msg) {
    printf "%s: %s\n", key, msg
    BIB_BAD = 1
}

# True (1) if at least one of the "|"-separated field names in `spec`
# has a non-empty value in the current entry, else 0.
function bib_check_any(spec,    names, count, a) {
    count = split(spec, names, "|")
    for (a = 1; a <= count; a++)
        if (bib_get(names[a]) != "")
            return 1
    return 0
}

# Parser hook, called once per regular entry.  Runs every check on the
# current entry and reports each finding through problem().
function bib_entry(type, key,    nreq, specs, r, f, norm) {
    # duplicate keys: every key is recorded, reported from the 2nd use on
    if (key in BIB_KEYS_SEEN)
        problem(key, "duplicate key")
    BIB_KEYS_SEEN[key] = 1

    # required fields for this entry type, if the type is known
    if (type in REQ) {
        nreq = split(REQ[type], specs, " ")
        for (r = 1; r <= nreq; r++)
            if (!bib_check_any(specs[r]))
                problem(key, "missing required field: " specs[r])
    }

    # fields that are present but contain only whitespace
    for (f = 1; f <= BIB_N; f++)
        if (bib_trim(BIB_VAL[f]) == "")
            problem(key, "empty field: " BIB_NAME[f])

    # likely duplicates: titles that are equal once lowercased and
    # stripped of everything except a-z and 0-9
    norm = tolower(bib_get("title"))
    gsub(/[^a-z0-9]/, "", norm)
    if (norm == "")
        return
    if (norm in BIB_TITLES_SEEN)
        problem(key, "title duplicates " BIB_TITLES_SEEN[norm])
    else
        BIB_TITLES_SEEN[norm] = key
}

END { exit BIB_BAD }

# ==== lib/bib-key.awk (new file) ====
# bib-key.awk - rekey every entry with a generated citation key
#
# Requires bib-parse.awk and bib-canon.awk. Keys have the form
# <surname><year><word>, e.g. knuth1984literate.

# Parser hook for @string/@preamble blocks: copy them through verbatim,
# separated from the previous output block by one blank line.
function bib_pass(raw) {
    if (bib_out_n++)
        print ""
    print raw
}

# Parser hook, called once per regular entry.  Re-emits the entry in
# canonical form under a generated key; the original `key` is discarded.
#
# Collisions are disambiguated with the suffixes b, c, ..., z, aa, ab,
# ...  BIB_KEYS_SEEN[] holds every key already emitted; for a base key
# its value doubles as the counter that selects the next suffix.  The
# loop re-checks the suffixed candidate, so a suffixed key can never
# duplicate a key that was generated naturally for another entry.
function bib_entry(type, key,    k, base) {
    if (bib_out_n++)
        print ""
    base = bib_mkkey()
    k = base
    while (k in BIB_KEYS_SEEN)
        k = base bib_key_suffix(++BIB_KEYS_SEEN[base])
    BIB_KEYS_SEEN[k] = 1
    bib_emit(type, k)
}

# Letter suffix for the n-th use of a key (n >= 2): 2 -> "b", ...,
# 26 -> "z", 27 -> "aa", 28 -> "ab", ... (bijective base 26), so the
# supply of suffixes never runs out.
function bib_key_suffix(n,    s) {
    s = ""
    while (n > 0) {
        n--
        s = substr("abcdefghijklmnopqrstuvwxyz", n % 26 + 1, 1) s
        n = int(n / 26)
    }
    return s
}

# Build <surname><year><word> from the current entry.
#
#   surname - family name of the first author (editor if there is no
#             author), lowercased, restricted to A-Z a-z 0-9; "anon"
#             if nothing is left
#   year    - first run of four digits in the year field, else ""
#   word    - first title word that is not in the stop list below,
#             else ""
function bib_mkkey(    a, y, t, surname, word, n, parts, i, w) {
    a = bib_get("author")
    if (a == "")
        a = bib_get("editor")
    y = bib_get("year")
    t = bib_get("title")

    # surname of the first author; whitespace is normalized first so
    # that an "and" separator next to a line break is still recognized
    gsub(/[ \t\r\n]+/, " ", a)
    if (match(a, / [Aa][Nn][Dd] /))
        a = substr(a, 1, RSTART - 1)
    gsub(/[{}]/, "", a)
    a = bib_trim(a)
    if (index(a, ",") > 0)
        # "Surname, Given" form
        surname = substr(a, 1, index(a, ",") - 1)
    else {
        # "Given Surname" form: take the last word
        n = split(a, parts, /[ \t]+/)
        surname = (n > 0) ? parts[n] : ""
    }
    gsub(/[^A-Za-z0-9]/, "", surname)
    surname = tolower(surname)
    if (surname == "")
        surname = "anon"

    # four-digit year
    if (match(y, /[0-9][0-9][0-9][0-9]/))
        y = substr(y, RSTART, 4)
    else
        y = ""

    # first significant word of the title
    gsub(/[{}]/, "", t)
    word = ""
    n = split(tolower(t), parts, /[^a-z0-9]+/)
    for (i = 1; i <= n; i++) {
        w = parts[i]
        if (w == "" || w == "a" || w == "an" || w == "the" || w == "on" ||
            w == "of" || w == "in" || w == "for" || w == "and" || w == "to" ||
            w == "with" || w == "from" || w == "by" || w == "at" || w == "is")
            continue
        word = w
        break
    }

    return surname y word
}

# ==== lib/bib-ls.awk (new file) ====
# bib-ls.awk - list database entries
#
# Requires bib-parse.awk and bib-canon.awk.
# Variables (set with -v):
#   long - 0: print one key per line
#          1: print key, type, author, year and title, tab-separated

# Parser hook for @string/@preamble blocks: not listed.
function bib_pass(raw) { }

# Drop all braces from `v` and squeeze each run of whitespace to one
# space; returns the flattened copy.
function bib_ls_flat(v) {
    gsub(/[{}]/, "", v)
    gsub(/[ \t\r\n]+/, " ", v)
    return v
}

# Parser hook, called once per regular entry.  Prints either the bare
# key or, when `long` is numerically non-zero, one tab-separated line:
# key, type, first author (editor if no author; " et al." appended when
# more names follow), year, title.
function bib_entry(type, key,    who, title) {
    if (!(long + 0)) {
        print key
        return
    }

    who = bib_get("author")
    if (who == "")
        who = bib_get("editor")
    who = bib_ls_flat(who)
    if (match(who, / [Aa][Nn][Dd] /))
        who = substr(who, 1, RSTART - 1) " et al."

    title = bib_ls_flat(bib_get("title"))

    printf "%s\t%s\t%s\t%s\t%s\n", key, type, who, bib_get("year"), title
}

# ==== lib/bib-lskeys.awk (new file) ====
# bib-lskeys.awk - print the key of every entry, one per line
#
# Requires bib-parse.awk.

# Parser hook for @string/@preamble blocks: they have no key, skip them.
function bib_pass(raw) { }

# Parser hook, called once per regular entry: print its key.
function bib_entry(type, key) {
    print key
}

# ==== lib/bib-parse.awk (new file) ====
# bib-parse.awk - shared bibtex parsing library for bibutils
#
# Consumers must define two hook functions:
#   bib_entry(type, key) - called once per regular entry. The fields are
#                          available in BIB_N, BIB_NAME[], BIB_VAL[] and
#                          BIB_KIND[]; the raw source text of the entry
#                          is in BIB_RAW.
#   bib_pass(raw)        - called for @string and @preamble blocks with
#                          their raw source text.
#
# BIB_KIND[j] is "s" for ordinary string values (content stored without
# delimiters; re-wrap in braces on output), "n" for bare numbers, and
# "r" for raw values (macros, # concatenation) which should be emitted
# verbatim.

# Accumulate the whole input; parsing happens in END because an entry
# may span any number of lines.
{ bib_buf = bib_buf $0 "\n" }

END { bib_main(bib_buf) }

# Scan `s` for "@" and hand each one to bib_entry_at(); every other
# character outside an entry is skipped.
function bib_main(s,    i) {
    i = 1
    while (i <= length(s)) {
        if (substr(s, i, 1) == "@")
            i = bib_entry_at(s, i)
        else
            i++
    }
}

# Index of the first non-whitespace character of `s` at or after i
# (length(s) + 1 if there is none).
function bib_ws(s, i) {
    while (i <= length(s) && substr(s, i, 1) ~ /[ \t\r\n]/)
        i++
    return i
}

# `t` without leading and trailing whitespace.
function bib_trim(t) {
    sub(/^[ \t\r\n]+/, "", t)
    sub(/[ \t\r\n]+$/, "", t)
    return t
}

# balanced {...} group starting at i; inner content goes to BIB_PIECE,
# returns the index just past the closing brace.  If the input ends
# before the group is closed, BIB_PIECE is everything after the opening
# brace and the return value is length(s) + 1.
function bib_braced(s, i,    depth, start, c, closed) {
    start = i
    depth = 0
    closed = 0
    while (i <= length(s)) {
        c = substr(s, i, 1)
        i++
        if (c == "{")
            depth++
        else if (c == "}") {
            depth--
            if (depth == 0) {
                closed = 1
                break
            }
        }
    }
    # exclude the closing brace only if one was actually consumed
    BIB_PIECE = substr(s, start + 1, i - start - 1 - closed)
    return i
}

# "..." group starting at i; braces protect embedded quotes.  Inner
# content goes to BIB_PIECE; returns the index just past the closing
# quote.  If the input ends before the closing quote, BIB_PIECE is
# everything after the opening quote and the return value is
# length(s) + 1.
function bib_quoted(s, i,    depth, start, c, closed) {
    start = i
    i++
    depth = 0
    closed = 0
    while (i <= length(s)) {
        c = substr(s, i, 1)
        if (c == "{")
            depth++
        else if (c == "}")
            depth--
        else if (c == "\"" && depth == 0) {
            i++
            closed = 1
            break
        }
        i++
    }
    # exclude the closing quote only if one was actually consumed
    BIB_PIECE = substr(s, start + 1, i - start - 1 - closed)
    return i
}

# skip a balanced op...cl group starting at i (i must be at op);
# returns the index just past the matching cl, or length(s) + 1 if the
# group is never closed
function bib_skip_group(s, i, op, cl,    depth, c) {
    depth = 0
    while (i <= length(s)) {
        c = substr(s, i, 1)
        i++
        if (c == op)
            depth++
        else if (c == cl) {
            depth--
            if (depth == 0)
                break
        }
    }
    return i
}

# field value at i, handling # concatenation; sets BIB_VALUE and
# BIB_VKIND, returns the index just past the value (and past any
# whitespace that follows it).
#
# A value is one or more pieces joined by "#".  A piece is a braced
# group, a quoted group, or a bare token that runs up to the next
# comma, "#", "}", ")" or whitespace.  The resulting kind is:
#   "s" - a single braced or quoted piece; BIB_VALUE is its content
#   "n" - a single bare token made only of digits
#   "r" - any other bare token, or two or more pieces; BIB_VALUE is
#         then the trimmed source text of the whole value
function bib_value(s, i,    start, c, piece, pieces, kind) {
    start = i
    pieces = 0
    kind = ""
    BIB_VALUE = ""
    while (1) {
        c = substr(s, i, 1)
        if (c == "{") {
            i = bib_braced(s, i)
            BIB_VALUE = BIB_VALUE BIB_PIECE
            if (kind == "")
                kind = "s"
        } else if (c == "\"") {
            i = bib_quoted(s, i)
            BIB_VALUE = BIB_VALUE BIB_PIECE
            if (kind == "")
                kind = "s"
        } else {
            piece = ""
            while (i <= length(s) && substr(s, i, 1) !~ /[,#}) \t\r\n]/) {
                piece = piece substr(s, i, 1)
                i++
            }
            BIB_VALUE = BIB_VALUE piece
            kind = (piece ~ /^[0-9]+$/) ? "n" : "r"
        }
        pieces++
        i = bib_ws(s, i)
        if (substr(s, i, 1) == "#")
            i = bib_ws(s, i + 1)
        else
            break
    }
    if (pieces > 1)
        kind = "r"
    if (kind == "r")
        BIB_VALUE = bib_trim(substr(s, start, i - start))
    BIB_VKIND = kind
    return i
}

# parse the construct whose "@" is at i; returns the index past it.
#
#   @comment{...}            - skipped
#   @string / @preamble      - passed to bib_pass() as raw text
#   anything else            - parsed into BIB_N, BIB_NAME[], BIB_VAL[],
#                              BIB_KIND[] and BIB_RAW, then passed to
#                              bib_entry(type, key)
# The body may be delimited by {...} or (...).  An "@" that is not
# followed by letters and then an opener is treated as stray text.
# Type and field names are lowercased; the key is kept as written.
function bib_entry_at(s, i,    at, type, opener, closer, key, name, c) {
    at = i
    i++
    type = ""
    while (i <= length(s) && substr(s, i, 1) ~ /[A-Za-z]/) {
        type = type substr(s, i, 1)
        i++
    }
    type = tolower(type)
    i = bib_ws(s, i)
    c = substr(s, i, 1)
    if (c == "{") {
        opener = "{"
        closer = "}"
    } else if (c == "(") {
        opener = "("
        closer = ")"
    } else
        return i    # stray @, not an entry

    if (type == "comment")
        return bib_skip_group(s, i, opener, closer)
    if (type == "string" || type == "preamble") {
        i = bib_skip_group(s, i, opener, closer)
        bib_pass(bib_trim(substr(s, at, i - at)))
        return i
    }

    i++    # consume opener
    i = bib_ws(s, i)
    key = ""
    while (i <= length(s) && substr(s, i, 1) !~ /[, \t\r\n})]/) {
        key = key substr(s, i, 1)
        i++
    }
    i = bib_ws(s, i)
    if (substr(s, i, 1) == ",")
        i++

    # name = value pairs until the closer (or end of input)
    BIB_N = 0
    while (1) {
        i = bib_ws(s, i)
        c = substr(s, i, 1)
        if (c == "" || c == closer) {
            if (c == closer)
                i++
            break
        }
        if (c == ",") {
            i++
            continue
        }
        name = ""
        while (i <= length(s) && substr(s, i, 1) !~ /[=, \t\r\n})]/) {
            name = name substr(s, i, 1)
            i++
        }
        i = bib_ws(s, i)
        if (substr(s, i, 1) != "=") {    # malformed; skip a char and resync
            i++
            continue
        }
        i = bib_ws(s, i + 1)
        i = bib_value(s, i)
        BIB_N++
        BIB_NAME[BIB_N] = tolower(name)
        BIB_VAL[BIB_N] = BIB_VALUE
        BIB_KIND[BIB_N] = BIB_VKIND
    }
    BIB_RAW = bib_trim(substr(s, at, i - at))
    bib_entry(type, key)
    return i
}
# ==== lib/bib-select.awk (new file) ====
# bib-select.awk - emit entries selected by key, canonically
#
# Requires bib-parse.awk and bib-canon.awk. Variables (set with -v):
#   keys   - comma-separated list of entry keys
#   invert - 0: emit entries whose key is in the list
#            1: emit entries whose key is NOT in the list
#
# With keys="" and invert=1 this acts as a canonicalizing filter for
# everything. @string and @preamble blocks always pass through.

# Build the lookup set BIB_SEL[] from `keys`.  Whitespace around each
# listed key is ignored, so -v keys="a, b" selects both a and b (the
# parser never produces a key that contains whitespace).
BEGIN {
    bib_sel_n = split(keys, bib_sel_k, ",")
    for (bib_sel_i = 1; bib_sel_i <= bib_sel_n; bib_sel_i++)
        BIB_SEL[bib_trim(bib_sel_k[bib_sel_i])] = 1
}

# Parser hook for @string/@preamble blocks: copy them through verbatim,
# separated from the previous output block by one blank line.
function bib_pass(raw) {
    if (bib_out_n++)
        print ""
    print raw
}

# Parser hook, called once per regular entry: emit it canonically when
# its membership in the key list differs from `invert`.
function bib_entry(type, key) {
    if ((key in BIB_SEL) != invert + 0) {
        if (bib_out_n++)
            print ""
        bib_emit(type, key)
    }
}

# ==== lib/bib2ref.awk (new file) ====
# bib2ref.awk - convert bibtex entries to refer records
#
# Requires bib-parse.awk and bib-canon.awk.

# Parser hook for @string/@preamble blocks: no refer equivalent, drop.
function bib_pass(raw) { }

# Print one "%<tag> <value>" line, with braces removed and whitespace
# collapsed.  Nothing is printed when `v` is "".
function r_field(tag, v) {
    if (v != "") {
        gsub(/[{}]/, "", v)
        gsub(/[ \t\r\n]+/, " ", v)
        printf "%%%s %s\n", tag, bib_trim(v)
    }
}

# Print one "%<tag> <name>" line per name in the " and "-separated
# list `v` (braces removed, whitespace collapsed, empty names skipped).
function r_names(tag, v,    n, parts, i) {
    gsub(/[{}]/, "", v)
    gsub(/[ \t\r\n]+/, " ", v)
    n = split(v, parts, / +[Aa][Nn][Dd] +/)
    for (i = 1; i <= n; i++)
        if (bib_trim(parts[i]) != "")
            printf "%%%s %s\n", tag, bib_trim(parts[i])
}

# Parser hook, called once per regular entry: write it as one refer
# record, records separated by a blank line.
function bib_entry(type, key,    d, p, m) {
    if (bib_out_n++)
        print ""
    r_names("A", bib_get("author"))
    r_names("E", bib_get("editor"))
    r_field("T", bib_get("title"))
    r_field("J", bib_get("journal"))
    r_field("B", bib_get("booktitle"))
    # %D is "month year", or whichever of the two is present
    d = bib_get("year")
    m = bib_get("month")
    if (m != "")
        d = (d != "") ? m " " d : m
    r_field("D", d)
    r_field("V", bib_get("volume"))
    r_field("N", bib_get("number"))
    # bibtex page ranges use "--", refer uses "-"
    p = bib_get("pages")
    gsub(/--/, "-", p)
    r_field("P", p)
    # %I: the first non-empty of publisher, institution, school
    if (bib_get("publisher") != "")
        r_field("I", bib_get("publisher"))
    else if (bib_get("institution") != "")
        r_field("I", bib_get("institution"))
    else if (bib_get("school") != "")
        r_field("I", bib_get("school"))
    r_field("C", bib_get("address"))
    r_field("K", bib_get("keywords"))
    r_field("X", bib_get("abstract"))
    r_field("O", bib_get("note"))
}

# ==== lib/ref2bib.awk (new file) ====
# ref2bib.awk - convert refer records to bibtex entries
#
# Standalone (does not use bib-parse.awk). Records are separated by
# blank lines. Output keys are FIXME; pipe through bib-key.

# Paragraph mode: one awk record per refer record, one awk field per
# input line.
BEGIN {
    RS = ""
    FS = "\n"
}

# `t` without leading and trailing spaces, tabs and carriage returns.
function r_trim(t) {
    sub(/^[ \t\r]+/, "", t)
    sub(/[ \t\r]+$/, "", t)
    return t
}

# Print one brace-delimited bibtex field line; nothing when `v` is "".
function r_emit(name, v) {
    if (v != "")
        printf " %s = {%s},\n", name, v
}

# Convert one refer record.
#
# Lines beginning with "%" start a tagged field (the tag is the single
# character after "%"); any other line continues the previous field.
# %A and %E may repeat and are collected in A[] and E[]; for every
# other tag the last occurrence wins and is stored in val[tag].
# Records with no author, no editor and no title are skipped.
{
    split("", val)
    na = 0
    ne = 0
    split("", A)
    split("", E)
    lasttag = ""
    for (i = 1; i <= NF; i++) {
        line = $i
        if (substr(line, 1, 1) == "%") {
            tag = substr(line, 2, 1)
            v = r_trim(substr(line, 3))
            if (tag == "A")
                A[++na] = v
            else if (tag == "E")
                E[++ne] = v
            else
                val[tag] = v
            lasttag = tag
        } else if (lasttag == "A")
            A[na] = A[na] " " r_trim(line)
        else if (lasttag == "E")
            E[ne] = E[ne] " " r_trim(line)
        else if (lasttag != "")
            val[lasttag] = val[lasttag] " " r_trim(line)
    }
    if (na == 0 && ne == 0 && !("T" in val))
        next

    # guess an entry type from the fields present.  These `in` tests
    # must stay ahead of the val[...] reads below: merely reading
    # val["X"] creates that element and would make `"X" in val` true.
    if ("J" in val)
        type = "article"
    else if ("B" in val)
        type = (val["B"] ~ /[Pp]roceedings|[Cc]onference|[Ss]ymposium|[Ww]orkshop/) \
            ? "inproceedings" : "incollection"
    else if ("R" in val)
        type = "techreport"
    else if ("I" in val)
        type = "book"
    else
        type = "misc"

    if (out_n++)
        print ""
    printf "@%s{FIXME,\n", type

    authors = ""
    for (i = 1; i <= na; i++)
        authors = (i == 1) ? A[i] : authors " and " A[i]
    r_emit("author", authors)
    editors = ""
    for (i = 1; i <= ne; i++)
        editors = (i == 1) ? E[i] : editors " and " E[i]
    r_emit("editor", editors)

    r_emit("title", val["T"])
    r_emit("journal", val["J"])
    r_emit("booktitle", val["B"])

    # %D: the first four-digit run is the year, whatever text remains
    # around it becomes the month
    d = val["D"]
    if (match(d, /[0-9][0-9][0-9][0-9]/)) {
        r_emit("year", substr(d, RSTART, 4))
        m = r_trim(substr(d, 1, RSTART - 1) substr(d, RSTART + 4))
        if (m != "")
            r_emit("month", m)
    } else
        r_emit("year", d)

    r_emit("volume", val["V"])
    r_emit("number", val["N"])
    # refer page ranges use "-", bibtex uses "--"
    p = val["P"]
    gsub(/-+/, "--", p)
    r_emit("pages", p)
    r_emit(type == "techreport" ? "institution" : "publisher", val["I"])
    r_emit("address", val["C"])

    # %R (report number) becomes `number` when %N supplied none, and is
    # folded into the note otherwise.  The test is on the value of
    # val["N"], not on `"N" in val`: the r_emit("number", val["N"])
    # call above has already created that element.  %R and %O are
    # merged so that an entry never carries two `note` fields.
    note = val["O"]
    report = val["R"]
    if (report != "") {
        if (val["N"] == "")
            r_emit("number", report)
        else if (note == "")
            note = report
        else
            note = report "; " note
    }
    r_emit("keywords", val["K"])
    r_emit("abstract", val["X"])
    r_emit("note", note)
    print "}"
}