bib-fetch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

#!/bin/sh
# bib-fetch - fetch a bibtex entry for a pdf from its DOI or arXiv id
#
# usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]
#
# Unless given with -d or -a, an identifier is extracted from the first
# two pages of the pdf (requires pdftotext): a DOI if one is found,
# falling back to an arXiv id. DOIs are resolved through doi.org content
# negotiation (crossref et al.), arXiv ids through arxiv.org. The entry
# is passed through bib-key, which emits it canonically on stdout with a
# generated key.

# Write the one-line synopsis to stderr and terminate with status 2.
usage() {
  printf '%s\n' 'usage: bib-fetch [-d doi | -a arxiv-id] [file.pdf]' 1>&2
  exit 2
}

# Command-line options: -d and -a each take an identifier and exclude
# one another; anything getopts rejects prints the synopsis. Afterwards
# the positional parameters hold only the operands.
doi=''
arxiv=''
while getopts d:a: flag; do
  case "$flag" in
    a) arxiv=$OPTARG ;;
    d) doi=$OPTARG ;;
    *) usage ;;
  esac
done
shift "$((OPTIND - 1))"
if [ -n "$doi" ] && [ -n "$arxiv" ]; then
  usage
fi

# Locate the bib-key filter: prefer an executable copy installed next to
# this script, otherwise fall back to whatever PATH provides. The
# fallback is verified here so that a missing bib-key is reported before
# any network request is made rather than after the entry was fetched.
bibkey=$(dirname "$0")/bib-key
if [ ! -x "$bibkey" ]; then
  bibkey=bib-key
  command -v "$bibkey" > /dev/null 2>&1 || {
    printf 'bib-fetch: bib-key is required\n' >&2
    exit 1
  }
fi

# curl performs every lookup; without it nothing below can work.
command -v curl > /dev/null 2>&1 || {
  printf 'bib-fetch: curl is required\n' >&2
  exit 1
}

# No identifier on the command line: pull one out of the pdf text.
# Requires exactly one operand (the pdf). On success doi and/or arxiv
# are set; exits 1 when the pdf cannot be read or converted, when
# pdftotext is missing, or when neither kind of identifier is found.
if [ -z "$doi" ] && [ -z "$arxiv" ]; then
  [ $# -eq 1 ] || usage
  pdf=$1
  [ -r "$pdf" ] || { printf 'bib-fetch: cannot read %s\n' "$pdf" >&2; exit 1; }
  command -v pdftotext > /dev/null 2>&1 || {
    printf 'bib-fetch: pdftotext is required to extract an identifier\n' >&2
    exit 1
  }
  # A name starting with "-" could be taken for an option by pdftotext;
  # anchoring it to the current directory keeps it an operand. Messages
  # keep using the name exactly as the user typed it.
  case $pdf in
    -*) src=./$pdf ;;
    *) src=$pdf ;;
  esac
  # Convert first, scan second: inside a pipeline the exit status of
  # pdftotext would be lost and a failed conversion would be misreported
  # as "no DOI or arXiv id found". Only the first two pages are converted;
  # stderr stays suppressed as before.
  text=$(pdftotext -l 2 "$src" - 2> /dev/null) || {
    printf 'bib-fetch: cannot extract text from %s\n' "$pdf" >&2
    exit 1
  }
  # Field separator between the two identifiers, kept in a variable so
  # the expansions below do not depend on an invisible literal tab.
  tab=$(printf '\t')
  ids=$(printf '%s\n' "$text" | awk '
    # first DOI and first arXiv stamp found on any line; both are
    # printed tab-separated and the caller prefers the DOI
    doi == "" {
      if (match($0, /10\.[0-9][0-9][0-9][0-9][0-9]*\/[^ \t"<>]+/)) {
        doi = substr($0, RSTART, RLENGTH)
        # drop sentence punctuation that was glued to the end of the DOI
        sub(/[.,;)\]]+$/, "", doi)
      }
    }
    arxiv == "" {
      # modern ids (arXiv:2104.01234v2) and old style (arXiv:cs/0101001)
      if (match($0, /[aA][rR][xX][iI][vV]:[0-9][0-9][0-9][0-9]\.[0-9][0-9][0-9][0-9][0-9]?(v[0-9]+)?/) ||
          match($0, /[aA][rR][xX][iI][vV]:[a-z-]+(\.[A-Z][A-Z])?\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9](v[0-9]+)?/))
        # skip the 6-character "arXiv:" label
        arxiv = substr($0, RSTART + 6, RLENGTH - 6)
    }
    END { printf "%s\t%s\n", doi, arxiv }')
  doi=${ids%%"$tab"*}
  arxiv=${ids#*"$tab"}
  if [ -z "$doi" ] && [ -z "$arxiv" ]; then
    printf 'bib-fetch: no DOI or arXiv id found in %s\n' "$pdf" >&2
    exit 1
  fi
fi

# Fetch the BibTeX entry for the chosen identifier (a DOI wins over an
# arXiv id) and pipe it through bib-key. Exits 1 when the download
# fails; otherwise the script ends with the status of bib-key.
if [ -n "$doi" ]; then
  # Accept decorated spellings: DOI names begin with "10.", so a leading
  # "doi:" label or resolver URL can be dropped without ambiguity.
  case $doi in
    [Dd][Oo][Ii]:*) doi=${doi#????} ;;
    http*://doi.org/* | http*://dx.doi.org/*) doi=${doi#*doi.org/} ;;
  esac
  # "#" and "?" would cut the path short and a blank makes the URL
  # malformed, so these (and the quote/angle characters) are
  # percent-encoded. "%" is deliberately left alone so that a DOI passed
  # in already encoded is not encoded twice.
  doi_path=$(printf '%s\n' "$doi" |
    sed -e 's/#/%23/g' -e 's/?/%3F/g' -e 's/ /%20/g' \
        -e 's/"/%22/g' -e 's/</%3C/g' -e 's/>/%3E/g')
  # -g: brackets and braces inside a DOI are literal characters, not
  # curl URL-glob ranges.
  entry=$(curl -sSfLg -H 'Accept: application/x-bibtex' \
      "https://doi.org/$doi_path") || {
    printf 'bib-fetch: failed to fetch entry for doi %s\n' "$doi" >&2
    exit 1
  }
else
  # Strip an "arXiv:" label in any capitalisation, the same spellings the
  # pdf scan accepts.
  case $arxiv in
    [aA][rR][xX][iI][vV]:*) arxiv=${arxiv#??????} ;;
  esac
  entry=$(curl -sSfL "https://arxiv.org/bibtex/$arxiv") || {
    printf 'bib-fetch: failed to fetch entry for arXiv id %s\n' "$arxiv" >&2
    exit 1
  }
fi

printf '%s\n' "$entry" | "$bibkey"