about summary refs log tree commit diff stats
path: root/bib-fetch
diff options
context:
space:
mode:
Diffstat (limited to 'bib-fetch')
-rwxr-xr-x bib-fetch 82
1 files changed, 82 insertions, 0 deletions
diff --git a/bib-fetch b/bib-fetch
new file mode 100755
index 0000000..62f7993
--- /dev/null
+++ b/bib-fetch
@@ -0,0 +1,82 @@
+#!/bin/sh
+# bib-fetch - fetch a bibtex entry for a pdf from its DOI or arXiv id
+#
+# usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]
+#
+# Unless given with -d or -a, an identifier is extracted from the first
+# pages of the pdf (requires pdftotext): a DOI if one is found, falling
+# back to an arXiv id. DOIs are resolved through doi.org content
+# negotiation (crossref et al.), arXiv ids through arxiv.org. The entry
+# is emitted canonically on stdout with a generated key.
+
+# Write the synopsis to stderr and exit with status 2.
+usage() {
+    printf '%s\n' 'usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]' >&2; exit 2
+}
+
+# Option parsing: -d takes a DOI, -a an arXiv id; anything else is an error.
+doi= arxiv=
+while getopts d:a: flag; do
+    if [ "$flag" = d ]; then doi=$OPTARG
+    elif [ "$flag" = a ]; then arxiv=$OPTARG
+    else usage
+    fi
+done
+shift "$((OPTIND - 1))"
+# The two identifier options are mutually exclusive.
+if [ -n "$doi" ] && [ -n "$arxiv" ]; then usage; fi
+
+bibkey=$(dirname "$0")/bib-key
+[ -x "$bibkey" ] || bibkey=bib-key
+
+command -v curl > /dev/null 2>&1 || {
+ printf 'bib-fetch: curl is required\n' >&2
+ exit 1
+}
+
+# No identifier was given: extract one from the first two pages of the pdf.
+if [ -z "$doi" ] && [ -z "$arxiv" ]; then
+    [ $# -eq 1 ] || usage
+    pdf=$1
+    [ -r "$pdf" ] || { printf 'bib-fetch: cannot read %s\n' "$pdf" >&2; exit 1; }
+    command -v pdftotext > /dev/null 2>&1 ||
+        { printf 'bib-fetch: pdftotext is required to extract an identifier\n' >&2; exit 1; }
+    # awk prints "<doi> TAB <arxiv id>"; a field is empty when nothing matched.
+    ids=$(pdftotext -l 2 "$pdf" - 2> /dev/null | awk '
+        # keep the first DOI and the first arXiv stamp found on any line
+        doi == "" {
+            if (match($0, /10\.[0-9][0-9][0-9][0-9][0-9]*\/[^ \t"<>]+/)) {
+                doi = substr($0, RSTART, RLENGTH)
+                sub(/[.,;)\]]+$/, "", doi)   # drop trailing punctuation
+            }
+        }
+        arxiv == "" {
+            # modern ids (arXiv:2104.01234v2) and old style (arXiv:cs/0101001)
+            if (match($0, /[aA][rR][xX][iI][vV]:[0-9][0-9][0-9][0-9]\.[0-9][0-9][0-9][0-9][0-9]?(v[0-9]+)?/) ||
+                match($0, /[aA][rR][xX][iI][vV]:[a-z-]+(\.[A-Z][A-Z])?\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9](v[0-9]+)?/))
+                arxiv = substr($0, RSTART + 6, RLENGTH - 6)   # skip the "arXiv:" prefix
+        }
+        END { printf "%s\t%s\n", doi, arxiv }')
+    # Split on the tab separator; printf builds it so no literal tab is needed.
+    tab=$(printf '\t')
+    doi=${ids%%"$tab"*}
+    arxiv=${ids#*"$tab"}
+    [ -n "$doi$arxiv" ] ||
+        { printf 'bib-fetch: no DOI or arXiv id found in %s\n' "$pdf" >&2; exit 1; }
+fi
+
+if [ -n "$doi" ]; then
+ entry=$(curl -sSfL -H 'Accept: application/x-bibtex' \
+ "https://doi.org/$doi") || {
+ printf 'bib-fetch: failed to fetch entry for doi %s\n' "$doi" >&2
+ exit 1
+ }
+else
+ arxiv=${arxiv#arXiv:}
+ entry=$(curl -sSfL "https://arxiv.org/bibtex/$arxiv") || {
+ printf 'bib-fetch: failed to fetch entry for arXiv id %s\n' "$arxiv" >&2
+ exit 1
+ }
+fi
+
+printf '%s\n' "$entry" | "$bibkey"