diff options
Diffstat (limited to 'bib-fetch')
| -rwxr-xr-x | bib-fetch | 82 |
1 files changed, 82 insertions, 0 deletions
#!/bin/sh
# bib-fetch - fetch a bibtex entry for a pdf from its DOI or arXiv id
#
# usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]
#
# Unless given with -d or -a, an identifier is extracted from the first
# two pages of the pdf (requires pdftotext): a DOI if one is found,
# falling back to an arXiv id. DOIs are resolved through doi.org content
# negotiation (crossref et al.), arXiv ids through arxiv.org. The fetched
# entry is piped through bib-key (looked up next to this script, else on
# PATH), which is expected to emit it canonically on stdout with a
# generated key.
#
# exit status: 2 on usage errors, 1 on failures detected by this script,
# otherwise the status of bib-key.

# Print the usage line on stderr and exit 2.
usage() {
  printf 'usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]\n' >&2
  exit 2
}

# Print "bib-fetch: <message>" on stderr and exit 1.
die() {
  printf 'bib-fetch: %s\n' "$*" >&2
  exit 1
}

# Print "<doi><TAB><arxiv-id>" (either field may be empty) for the pdf
# named by $1, scanning the text of its first two pages.
extract_ids() {
  # a leading "-" would otherwise be parsed by pdftotext as an option
  case $1 in
    -*) set -- "./$1" ;;
  esac
  # pdftotext diagnostics are discarded on purpose: an unparsable pdf
  # yields no identifiers and the caller reports that instead.
  pdftotext -l 2 "$1" - 2> /dev/null | awk '
    # Remember the first DOI-looking token (on any line) and the first
    # arXiv stamp; the caller prefers the DOI when both are present.
    doi == "" && match($0, /10\.[0-9][0-9][0-9][0-9][0-9]*\/[^ \t"<>]+/) {
      doi = substr($0, RSTART, RLENGTH)
      # drop sentence punctuation glued to the end of the match; "]" is
      # listed first in the bracket expression so that it is literal in
      # every awk (a backslash-escaped "]" is not guaranteed by POSIX)
      sub(/[].,;)]+$/, "", doi)
    }
    arxiv == "" {
      # modern ids (arXiv:2104.01234v2) and old style (arXiv:cs/0101001);
      # the 6 skipped characters are the "arXiv:" prefix
      if (match($0, /[aA][rR][xX][iI][vV]:[0-9][0-9][0-9][0-9]\.[0-9][0-9][0-9][0-9][0-9]?(v[0-9]+)?/) ||
          match($0, /[aA][rR][xX][iI][vV]:[a-z-]+(\.[A-Z][A-Z])?\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9](v[0-9]+)?/))
        arxiv = substr($0, RSTART + 6, RLENGTH - 6)
    }
    END { printf "%s\t%s\n", doi, arxiv }'
}

doi=
arxiv=
while getopts d:a: opt; do
  case $opt in
    d) doi=$OPTARG ;;
    a) arxiv=$OPTARG ;;
    *) usage ;;
  esac
done
shift $((OPTIND - 1))
# -d and -a are mutually exclusive
if [ -n "$doi" ] && [ -n "$arxiv" ]; then
  usage
fi

# prefer the bib-key shipped alongside this script, else rely on PATH
bibkey=$(dirname "$0")/bib-key
[ -x "$bibkey" ] || bibkey=bib-key

# check every dependency before touching the network
command -v curl > /dev/null 2>&1 || die 'curl is required'
command -v "$bibkey" > /dev/null 2>&1 || die 'bib-key is required'

if [ -z "$doi" ] && [ -z "$arxiv" ]; then
  [ $# -eq 1 ] || usage
  pdf=$1
  [ -r "$pdf" ] || die "cannot read $pdf"
  command -v pdftotext > /dev/null 2>&1 ||
    die 'pdftotext is required to extract an identifier'

  # Split on the tab printed by awk. The tab is built explicitly rather
  # than typed literally so it cannot silently degrade into a space.
  tab=$(printf '\t')
  ids=$(extract_ids "$pdf")
  doi=${ids%%"$tab"*}
  arxiv=${ids#*"$tab"}
  if [ -z "$doi" ] && [ -z "$arxiv" ]; then
    die "no DOI or arXiv id found in $pdf"
  fi
fi

if [ -n "$doi" ]; then
  # accept a DOI pasted with its usual prefix or as a resolver URL
  case $doi in
    [dD][oO][iI]:*)
      doi=${doi#*:}
      ;;
    http://doi.org/* | https://doi.org/* | \
      http://dx.doi.org/* | https://dx.doi.org/*)
      doi=${doi#*doi.org/}
      ;;
  esac
  what="doi $doi"
  entry=$(curl -sSfL -H 'Accept: application/x-bibtex' \
    "https://doi.org/$doi") || die "failed to fetch entry for $what"
else
  # same case-insensitive "arXiv:" prefix the pdf extractor recognises
  case $arxiv in
    [aA][rR][xX][iI][vV]:*) arxiv=${arxiv#*:} ;;
  esac
  what="arXiv id $arxiv"
  entry=$(curl -sSfL "https://arxiv.org/bibtex/$arxiv") ||
    die "failed to fetch entry for $what"
fi

# curl -f only rejects HTTP errors; an empty body or an HTML landing page
# still counts as success. Require something that looks like a bibtex
# entry before handing the text to bib-key.
printf '%s\n' "$entry" | grep -q '^[[:space:]]*@[A-Za-z]' ||
  die "no bibtex entry returned for $what"

printf '%s\n' "$entry" | "$bibkey"