# bib-check.awk - lint a bibtex database
#
# Requires bib-parse.awk and bib-canon.awk. Reports, one problem per
# line on stdout:
#   - missing fields required by the entry type
#   - duplicate keys
#   - entries whose titles normalize to the same string (likely dups)
#   - empty field values
# Exits 1 if any problem was found.

# Required-field table, indexed by lowercase entry type.
#
# Each value is a space-separated list of requirements; a requirement
# written "a|b" is satisfied when at least one of the fields a, b is
# present. Types that do not appear here (e.g. "misc", which has no
# required fields in the standard styles) get no required-field check.
BEGIN {
  REQ["article"] = "author title journal year"
  REQ["book"] = "author|editor title publisher year"
  REQ["booklet"] = "title"
  # inbook additionally needs a chapter and/or a page range
  REQ["inbook"] = "author|editor title chapter|pages publisher year"
  REQ["incollection"] = "author title booktitle publisher year"
  REQ["inproceedings"] = "author title booktitle year"
  # "conference" is the same as "inproceedings"
  REQ["conference"] = "author title booktitle year"
  REQ["manual"] = "title"
  REQ["mastersthesis"] = "author title school year"
  REQ["phdthesis"] = "author title school year"
  REQ["proceedings"] = "title year"
  REQ["techreport"] = "author title institution year"
  REQ["unpublished"] = "author title note"
}

# bib_pass(raw) - deliberately does nothing with its argument.
# NOTE(review): presumably a hook that bib-parse.awk calls for text it
# does not parse as an entry; the linter has nothing to report for it.
# Confirm against the parser.
function bib_pass(raw) {
  # intentionally empty
}

# problem(key, msg) - report one lint finding.
#
# Writes "KEY: MSG" followed by a newline to stdout and raises the
# global BIB_BAD flag, which the END rule uses as the exit status.
function problem(key, msg) {
  BIB_BAD = 1
  printf("%s: %s\n", key, msg)
}

# bib_entry(type, key) - lint one entry and report findings via problem().
#
#   type  entry type (e.g. "article"); matched case-insensitively
#         against the REQ table
#   key   citation key, used as the prefix of every report line
#
# The entry's fields are read through bib_get() and the BIB_N /
# BIB_NAME / BIB_VAL arrays.
# NOTE(review): those are presumably filled in by bib-parse.awk before
# this function is called - confirm against the parser.
#
# Checks, in order: duplicate key, required fields for the type, empty
# field values, and a title that normalizes to one already seen.
# Everything after `key` in the parameter list is a local.
function bib_entry(type, key,    n, req, i, alts, na, j, found, t, kind) {
  if (key in BIB_KEYS_SEEN)
    problem(key, "duplicate key")
  BIB_KEYS_SEEN[key] = 1

  # required fields ("a|b" means at least one of a, b)
  # REQ is keyed by lowercase type names, and BibTeX entry types are
  # case-insensitive, so fold case here; otherwise "@Article" would
  # silently skip this check if the parser hands the type over as written.
  kind = tolower(type)
  if (kind in REQ) {
    n = split(REQ[kind], req, " ")
    for (i = 1; i <= n; i++) {
      na = split(req[i], alts, "|")
      found = 0
      for (j = 1; j <= na; j++)
        if (bib_get(alts[j]) != "")
          found = 1
      if (!found)
        problem(key, "missing required field: " req[i])
    }
  }

  # empty values (whitespace-only counts as empty)
  for (i = 1; i <= BIB_N; i++)
    if (bib_trim(BIB_VAL[i]) == "")
      problem(key, "empty field: " BIB_NAME[i])

  # likely duplicate entries: same normalized title
  # Normalization lowercases and then drops everything except a-z and
  # 0-9, so case, spacing, punctuation and braces do not matter.
  t = tolower(bib_get("title"))
  gsub(/[^a-z0-9]/, "", t)
  # A title with nothing left after normalization is not compared.
  if (t != "") {
    if (t in BIB_TITLES_SEEN)
      problem(key, "title duplicates " BIB_TITLES_SEEN[t])
    else
      BIB_TITLES_SEEN[t] = key   # remember the first key seen with this title
  }
}

# Exit status is the BIB_BAD flag: problem() sets it to 1, and an unset
# flag evaluates to 0.
END {
  exit BIB_BAD
}