diff options
Diffstat (limited to 'lib/bib-parse.awk')
| -rw-r--r-- | lib/bib-parse.awk | 216 |
1 files changed, 216 insertions, 0 deletions
# bib-parse.awk - shared bibtex parsing library for bibutils
#
# Consumers must define two hook functions:
#   bib_entry(type, key) - called once per regular entry. The fields are
#                          available in BIB_N, BIB_NAME[], BIB_VAL[] and
#                          BIB_KIND[]; the raw source text of the entry
#                          is in BIB_RAW.
#   bib_pass(raw)        - called for @string and @preamble blocks with
#                          their raw source text.
#
# BIB_KIND[j] is "s" for ordinary string values (content stored without
# delimiters; re-wrap in braces on output), "n" for bare numbers, and
# "r" for raw values (macros, # concatenation) which should be emitted
# verbatim.
#
# The whole input is buffered first and parsed from the END rule, so the
# hooks only fire once every input record has been read.

# Append each input record to the buffer, followed by a newline.
{ bib_buf = bib_buf $0 "\n" }

END { bib_main(bib_buf) }

# Scan s for "@" characters and hand each one to bib_entry_at(), which
# returns the index at which scanning resumes. Text between constructs is
# ignored.
function bib_main(s,    i, n) {
    n = length(s)
    i = 1
    while (i <= n) {
        if (substr(s, i, 1) == "@")
            i = bib_entry_at(s, i)
        else
            i++
    }
}

# Return the index of the first character at or after i that is not a
# space, tab, CR or LF (length(s) + 1 if there is none).
function bib_ws(s, i,    n) {
    n = length(s)
    while (i <= n && substr(s, i, 1) ~ /[ \t\r\n]/)
        i++
    return i
}

# Return t with leading and trailing whitespace removed.
function bib_trim(t) {
    sub(/^[ \t\r\n]+/, "", t)
    sub(/[ \t\r\n]+$/, "", t)
    return t
}

# balanced {...} group starting at i; inner content goes to BIB_PIECE,
# returns the index just past the closing brace. If the input ends before
# the group is closed, BIB_PIECE holds everything after the opening brace
# and length(s) + 1 is returned.
function bib_braced(s, i,    depth, start, c, n, closed) {
    start = i
    depth = 0
    closed = 0
    n = length(s)
    while (i <= n) {
        c = substr(s, i, 1)
        i++
        if (c == "{")
            depth++
        else if (c == "}") {
            depth--
            if (depth == 0) {
                closed = 1
                break
            }
        }
    }
    # i is one past the last consumed character. Always drop the opening
    # brace; drop the closing brace only when one was actually consumed,
    # so an unterminated group does not lose its last character.
    BIB_PIECE = substr(s, start + 1, i - start - 1 - closed)
    return i
}

# "..." group starting at i; braces protect embedded quotes. Inner content
# goes to BIB_PIECE, returns the index just past the closing quote (or
# length(s) + 1 if the string is never closed).
function bib_quoted(s, i,    depth, start, c, n, closed) {
    start = i
    i++                         # step over the opening quote
    depth = 0
    closed = 0
    n = length(s)
    while (i <= n) {
        c = substr(s, i, 1)
        i++
        if (c == "{")
            depth++
        else if (c == "}") {
            # A stray "}" must not push depth below zero, otherwise the
            # closing quote would never be seen at depth 0 and the value
            # would run on to the end of the input.
            if (depth > 0)
                depth--
        } else if (c == "\"" && depth == 0) {
            closed = 1
            break
        }
    }
    # Same delimiter accounting as bib_braced(): the closing quote is
    # only subtracted when it was found.
    BIB_PIECE = substr(s, start + 1, i - start - 1 - closed)
    return i
}

# skip a balanced op...cl group starting at i (i must be at op); returns
# the index just past the matching cl, or length(s) + 1 if unterminated.
# Only op and cl are counted; quotes inside the group are not interpreted.
function bib_skip_group(s, i, op, cl,    depth, c, n) {
    depth = 0
    n = length(s)
    while (i <= n) {
        c = substr(s, i, 1)
        i++
        if (c == op)
            depth++
        else if (c == cl) {
            depth--
            if (depth == 0)
                break
        }
    }
    return i
}

# field value at i, handling # concatenation; sets BIB_VALUE and
# BIB_VKIND, returns the index just past the value (and past any
# whitespace that follows it).
function bib_value(s, i,    start, c, piece, pieces, kind, n) {
    start = i
    pieces = 0
    kind = ""
    BIB_VALUE = ""
    n = length(s)
    while (1) {
        c = substr(s, i, 1)
        if (c == "{") {
            i = bib_braced(s, i)
            BIB_VALUE = BIB_VALUE BIB_PIECE
            if (kind == "")
                kind = "s"
        } else if (c == "\"") {
            i = bib_quoted(s, i)
            BIB_VALUE = BIB_VALUE BIB_PIECE
            if (kind == "")
                kind = "s"
        } else {
            # bare token: a number or a macro name
            piece = ""
            while (i <= n && substr(s, i, 1) !~ /[,#}) \t\r\n]/) {
                piece = piece substr(s, i, 1)
                i++
            }
            BIB_VALUE = BIB_VALUE piece
            kind = (piece ~ /^[0-9]+$/) ? "n" : "r"
        }
        pieces++
        i = bib_ws(s, i)
        if (substr(s, i, 1) == "#")
            i = bib_ws(s, i + 1)
        else
            break
    }
    # Any concatenation is kept verbatim, whatever its parts were.
    if (pieces > 1)
        kind = "r"
    # Raw values are reported exactly as written in the source.
    if (kind == "r")
        BIB_VALUE = bib_trim(substr(s, start, i - start))
    BIB_VKIND = kind
    return i
}

# parse the construct whose "@" is at i; returns the index past it.
#   @comment            - skipped silently
#   @string, @preamble  - raw text passed to bib_pass()
#   anything else       - fields parsed into BIB_N / BIB_NAME[] /
#                         BIB_VAL[] / BIB_KIND[], raw text stored in
#                         BIB_RAW, then bib_entry(type, key) is called
# The entry type and the field names are lower-cased; the key is not.
function bib_entry_at(s, i,    at, type, opener, closer, key, name, c, n) {
    n = length(s)
    at = i
    i++
    type = ""
    while (i <= n && substr(s, i, 1) ~ /[A-Za-z]/) {
        type = type substr(s, i, 1)
        i++
    }
    type = tolower(type)
    i = bib_ws(s, i)
    c = substr(s, i, 1)
    if (c == "{") {
        opener = "{"
        closer = "}"
    } else if (c == "(") {
        opener = "("
        closer = ")"
    } else
        return i                # stray @, not an entry

    if (type == "comment")
        return bib_skip_group(s, i, opener, closer)
    if (type == "string" || type == "preamble") {
        i = bib_skip_group(s, i, opener, closer)
        bib_pass(bib_trim(substr(s, at, i - at)))
        return i
    }

    i++                         # consume opener
    i = bib_ws(s, i)
    key = ""
    while (i <= n && substr(s, i, 1) !~ /[, \t\r\n})]/) {
        key = key substr(s, i, 1)
        i++
    }
    i = bib_ws(s, i)
    if (substr(s, i, 1) == ",")
        i++

    # Drop the previous entry's fields so that no stale slots above BIB_N
    # remain visible. split("", arr) is the portable way to empty an array.
    split("", BIB_NAME)
    split("", BIB_VAL)
    split("", BIB_KIND)
    BIB_N = 0
    while (1) {
        i = bib_ws(s, i)
        c = substr(s, i, 1)
        if (c == "" || c == closer) {
            if (c == closer)
                i++
            break
        }
        if (c == ",") {
            i++
            continue
        }
        name = ""
        while (i <= n && substr(s, i, 1) !~ /[=, \t\r\n})]/) {
            name = name substr(s, i, 1)
            i++
        }
        i = bib_ws(s, i)
        if (substr(s, i, 1) != "=") {   # malformed; skip a char and resync
            i++
            continue
        }
        i = bib_ws(s, i + 1)
        i = bib_value(s, i)
        BIB_N++
        BIB_NAME[BIB_N] = tolower(name)
        BIB_VAL[BIB_N] = BIB_VALUE
        BIB_KIND[BIB_N] = BIB_VKIND
    }
    BIB_RAW = bib_trim(substr(s, at, i - at))
    bib_entry(type, key)
    return i
}