1 files changed, 82 insertions, 0 deletions
diff --git a/bib-fetch b/bib-fetch
new file mode 100755
index 0000000..62f7993
--- /dev/null
+++ b/bib-fetch
@@ -0,0 +1,82 @@
+#!/bin/sh
+# bib-fetch - fetch a bibtex entry for a pdf from its DOI or arXiv id
+#
+# usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]
+#
+# Unless given with -d or -a, an identifier is extracted from the first
+# two pages of the pdf (requires pdftotext): a DOI if one is found, else
+# an arXiv id. DOIs are resolved through doi.org content negotiation
+# (crossref et al.), arXiv ids through arxiv.org. The entry is emitted
+# canonically on stdout with a generated key.
+
+usage() { # print the synopsis on stderr, then exit with status 2
+  printf '%s\n' 'usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]' >&2
+  exit 2
+}
+
+doi= arxiv= # identifiers from -d / -a; empty means "not given"
+while getopts d:a: flag; do
+  case "$flag" in
+    d) doi=$OPTARG ;;
+    a) arxiv=$OPTARG ;;
+    *) usage ;; # unknown option or missing option argument
+  esac
+done
+shift "$((OPTIND - 1))" # drop the parsed options, leaving the operands
+# -d and -a are mutually exclusive
+if [ -n "$doi" ] && [ -n "$arxiv" ]; then usage; fi
+
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+command -v curl > /dev/null 2>&1 || {
+  printf 'bib-fetch: curl is required\n' >&2
+  exit 1
+}
+
+if [ -z "$doi" ] && [ -z "$arxiv" ]; then # no id given: take one from the pdf
+  [ $# -eq 1 ] || usage
+  pdf=$1
+  [ -r "$pdf" ] || { printf 'bib-fetch: cannot read %s\n' "$pdf" >&2; exit 1; }
+  command -v pdftotext > /dev/null 2>&1 ||
+    { printf 'bib-fetch: pdftotext is required to extract an identifier\n' >&2; exit 1; }
+  # stderr is silenced, so note a pdftotext failure and report it below
+  reason='no DOI or arXiv id found in'
+  text=$(pdftotext -l 2 "$pdf" - 2> /dev/null) || reason='pdftotext failed on'
+  ids=$(printf '%s\n' "$text" | awk '
+    # first DOI-shaped string on any line, and first arXiv stamp; neither
+    # can contain a space, so one separates them in the output
+    doi == "" && match($0, /10\.[0-9][0-9][0-9][0-9][0-9]*\/[^ \t"<>]+/) {
+      doi = substr($0, RSTART, RLENGTH)
+      sub(/[.,;)\]]+$/, "", doi) # trailing sentence punctuation
+    }
+    arxiv == "" {
+      # modern ids (arXiv:2104.01234v2) and old style (arXiv:cs/0101001)
+      if (match($0, /[aA][rR][xX][iI][vV]:[0-9][0-9][0-9][0-9]\.[0-9][0-9][0-9][0-9][0-9]?(v[0-9]+)?/) ||
+          match($0, /[aA][rR][xX][iI][vV]:[a-z-]+(\.[A-Z][A-Z])?\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9](v[0-9]+)?/))
+        arxiv = substr($0, RSTART + 6, RLENGTH - 6) # skip the 6-char arXiv: prefix
+    }
+    END { printf "%s %s\n", doi, arxiv }')
+  doi=${ids%% *}
+  arxiv=${ids#* }
+  if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+    printf 'bib-fetch: %s %s\n' "$reason" "$pdf" >&2
+    exit 1
+  fi
+fi
+
+if [ -n "$doi" ]; then
+  entry=$(curl -sSfL -H 'Accept: application/x-bibtex' \
+      "https://doi.org/$doi") || {
+    printf 'bib-fetch: failed to fetch entry for doi %s\n' "$doi" >&2
+    exit 1
+  }
+else
+  arxiv=${arxiv#arXiv:}
+  entry=$(curl -sSfL "https://arxiv.org/bibtex/$arxiv") || {
+    printf 'bib-fetch: failed to fetch entry for arXiv id %s\n' "$arxiv" >&2
+    exit 1
+  }
+fi
+
+printf '%s\n' "$entry" | "$bibkey"