#!/bin/sh
# bib-fetch - fetch a bibtex entry for a pdf from its DOI or arXiv id
#
# usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]
#
# Unless given with -d or -a, an identifier is extracted from the first
# two pages of the pdf (requires pdftotext): a DOI if one is found,
# falling back to an arXiv id. DOIs are resolved through doi.org content
# negotiation (crossref et al.), arXiv ids through arxiv.org. The entry
# is emitted canonically on stdout with a generated key: the fetched
# text is piped through bib-key, taken from the directory of this script
# when an executable is there, otherwise looked up on PATH.
#
# exit status: 2 on a usage error; 1 when a required tool is missing,
# the pdf is unreadable, no identifier is found or the fetch fails;
# otherwise the status of bib-key.

# Print the synopsis on stderr and exit with status 2.
usage() {
  printf 'usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]\n' >&2
  exit 2
}

doi=
arxiv=
while getopts d:a: opt; do
  case $opt in
    d) doi=$OPTARG ;;
    a) arxiv=$OPTARG ;;
    *) usage ;;
  esac
done
shift $((OPTIND - 1))

# -d and -a are mutually exclusive.
[ -n "$doi" ] && [ -n "$arxiv" ] && usage

# Prefer the bib-key that sits next to this script, else rely on PATH.
bibkey=$(dirname "$0")/bib-key
[ -x "$bibkey" ] || bibkey=bib-key

command -v curl > /dev/null 2>&1 || {
  printf 'bib-fetch: curl is required\n' >&2
  exit 1
}

if [ -z "$doi" ] && [ -z "$arxiv" ]; then
  # No identifier on the command line: dig one out of the pdf text.
  [ $# -eq 1 ] || usage
  pdf=$1
  [ -r "$pdf" ] || { printf 'bib-fetch: cannot read %s\n' "$pdf" >&2; exit 1; }
  command -v pdftotext > /dev/null 2>&1 || {
    printf 'bib-fetch: pdftotext is required to extract an identifier\n' >&2
    exit 1
  }

  # A path starting with "-" could be taken for an option by pdftotext;
  # anchor it to the current directory for that call only.
  case $pdf in
    -*) pdfarg=./$pdf ;;
    *) pdfarg=$pdf ;;
  esac

  # awk reports "<doi>TAB<arxiv>"; either field may be empty.
  # pdftotext diagnostics are dropped on purpose: a pdf that cannot be
  # converted simply yields no identifier and is reported below.
  ids=$(pdftotext -l 2 "$pdfarg" - 2> /dev/null | awk '
    # first DOI-looking string on any line, first arXiv stamp;
    # both are reported and the caller prefers the DOI
    doi == "" {
      if (match($0, /10\.[0-9][0-9][0-9][0-9][0-9]*\/[^ \t"<>]+/)) {
        doi = substr($0, RSTART, RLENGTH)
        # drop sentence punctuation glued to the end of the match
        sub(/[.,;)\]]+$/, "", doi)
      }
    }
    arxiv == "" {
      # modern ids (arXiv:2104.01234v2) and old style (arXiv:cs/0101001)
      if (match($0, /[aA][rR][xX][iI][vV]:[0-9][0-9][0-9][0-9]\.[0-9][0-9][0-9][0-9][0-9]?(v[0-9]+)?/) ||
          match($0, /[aA][rR][xX][iI][vV]:[a-z-]+(\.[A-Z][A-Z])?\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9](v[0-9]+)?/))
        # skip the 6-character "arXiv:" prefix
        arxiv = substr($0, RSTART + 6, RLENGTH - 6)
    }
    END { printf "%s\t%s\n", doi, arxiv }')

  # Split on the tab awk wrote. The tab is kept in a variable so the
  # separator survives editors and tools that rewrite literal tabs;
  # neither field can contain a tab itself.
  tab=$(printf '\t')
  doi=${ids%%"$tab"*}
  arxiv=${ids#*"$tab"}

  if [ -z "$doi" ] && [ -z "$arxiv" ]; then
    printf 'bib-fetch: no DOI or arXiv id found in %s\n' "$pdf" >&2
    exit 1
  fi
fi

if [ -n "$doi" ]; then
  # doi.org content negotiation: the Accept header selects bibtex output.
  entry=$(curl -sSfL -H 'Accept: application/x-bibtex' \
    "https://doi.org/$doi") || {
    printf 'bib-fetch: failed to fetch entry for doi %s\n' "$doi" >&2
    exit 1
  }
else
  # Tolerate an id passed as "arXiv:NNNN.NNNNN".
  arxiv=${arxiv#arXiv:}
  entry=$(curl -sSfL "https://arxiv.org/bibtex/$arxiv") || {
    printf 'bib-fetch: failed to fetch entry for arXiv id %s\n' "$arxiv" >&2
    exit 1
  }
fi

# bib-key reads the raw entry on stdin; its exit status becomes ours.
printf '%s\n' "$entry" | "$bibkey"