-
Notifications
You must be signed in to change notification settings - Fork 2
/
Dockerfile.dataseer
150 lines (115 loc) · 4.72 KB
/
Dockerfile.dataseer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -------------------
# build builder image
# -------------------
# Build stage: JDK 8 image in which the GROBID modules and dataseer-ml are compiled.
FROM openjdk:8u275-jdk AS builder
USER root
# unzip is installed for this stage; the apt package lists are removed in the
# same layer so they are not stored in it.
RUN apt-get update && \
    apt-get -y --no-install-recommends install unzip && \
    rm -rf /var/lib/apt/lists/*
WORKDIR /opt/grobid-source
# gradle wrapper and build scripts are copied before the sources, so that
# source-only changes do not invalidate these layers
COPY gradle/ ./gradle/
COPY gradlew ./
COPY gradle.properties ./
COPY build.gradle ./
COPY settings.gradle ./
# source
COPY grobid-home/ ./grobid-home/
COPY grobid-core/ ./grobid-core/
#COPY grobid-service/ ./grobid-service/
COPY grobid-trainer/ ./grobid-trainer/
COPY dataseer-ml/ ./dataseer-ml/
# Remove the native libraries that are not used (pdf2xml and the lin-32,
# mac-64 and win-* variants of pdfalto and lib) before packaging.
RUN rm -rf \
      grobid-home/pdf2xml \
      grobid-home/pdfalto/lin-32 \
      grobid-home/pdfalto/mac-64 \
      grobid-home/pdfalto/win-* \
      grobid-home/lib/lin-32 \
      grobid-home/lib/win-* \
      grobid-home/lib/mac-64
# Build and install the GROBID modules with the wrapper in the current working directory
RUN ./gradlew clean assemble install --no-daemon --info --stacktrace
# Build dataseer-ml with its own wrapper; the path is absolute instead of a
# relative WORKDIR so it does not depend on the previous working directory
WORKDIR /opt/grobid-source/dataseer-ml
RUN ./gradlew clean install --no-daemon --info --stacktrace
WORKDIR /opt/grobid
#RUN unzip -o /opt/grobid-source/grobid-service/build/distributions/grobid-service-*.zip && \
# mv grobid-service* grobid-service
# Unpack the grobid-home distribution into /opt/grobid and make the pdfalto
# directory readable/executable
RUN unzip -o /opt/grobid-source/grobid-home/build/distributions/grobid-home-*.zip && \
    chmod -R 755 /opt/grobid/grobid-home/pdfalto
# /opt/grobid-source is kept on purpose: the runtime stage copies files from it.
# The former `RUN rm -rf grobid-source` ran in /opt/grobid and targeted
# /opt/grobid/grobid-source, a path no instruction in this stage creates, so it
# removed nothing and has been dropped.
# -------------------
# build runtime image
# -------------------
# use NVIDIA Container Toolkit to automatically recognize possible GPU drivers on the host machine
FROM tensorflow/tensorflow:2.7.0-gpu
# No `CMD nvidia-smi` here: only the last CMD of a stage takes effect, and this
# stage defines its own CMD at the end, so the earlier one was never run.
# setting locale is likely useless but to be sure
ENV LANG=C.UTF-8
# update NVIDIA Cuda key (following a key rotation in April 2022)
# Done as one layer, in the original order: install wget, delete the old key,
# remove the cuda and nvidia-ml source lists, install the new keyring package,
# then delete the downloaded .deb so it is not kept in the image.
RUN apt-get install -y wget \
    && apt-key del 7fa2af80 \
    && rm /etc/apt/sources.list.d/cuda.list \
    && rm /etc/apt/sources.list.d/nvidia-ml.list \
    && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
    && dpkg -i cuda-keyring_1.0-1_all.deb \
    && rm cuda-keyring_1.0-1_all.deb
# Java 8 (JRE and JDK), Python 3 and the native build tools, installed in one
# layer; the apt cache and package lists are removed in that same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        apt-utils \
        build-essential \
        ca-certificates-java \
        curl \
        gcc \
        gfortran \
        libxml2 \
        musl \
        openjdk-8-jdk \
        openjdk-8-jre-headless \
        python3 \
        python3-dev \
        python3-pip \
        python3-setuptools \
        unzip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /opt/grobid
# bring in the content of /opt/grobid prepared by the builder stage
COPY --from=builder /opt/grobid .
# upgrade pip; --no-cache-dir keeps pip's download cache out of the image layers
RUN python3 -m pip install --no-cache-dir --upgrade pip
# install DeLFT via pypi
RUN pip3 install --no-cache-dir requests delft==0.3.3
# link the data directory to /data
# One link is enough: the working directory is /opt/grobid, so a second
# `ln -s /data ./data` would resolve the link just created and add a
# self-referencing /data/data -> /data entry inside the data directory.
RUN mkdir -p /data \
    && ln -s /data /opt/grobid/data
# Runtime environment, set from /opt/grobid:
#   PYTHONWARNINGS=ignore  disables python warnings (and fixes logging)
#   JAVA_OPTS=-Xmx4g       JVM option string with a 4g maximum heap setting
WORKDIR /opt/grobid
ENV PYTHONWARNINGS="ignore" \
    JAVA_OPTS=-Xmx4g
# Add Tini and use it as the container entrypoint
ENV TINI_VERSION=v0.18.0
# NOTE(review): the binary is downloaded without checksum verification —
# consider `ADD --checksum=sha256:...` once the release digest is confirmed
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
# the image's CMD is appended after "--" and started by tini
ENTRYPOINT ["/tini", "-s", "--"]
# install jep (and temporarily the matching JDK)
# The JDK archive is downloaded and unpacked under /tmp, used as JAVA_HOME for
# the jep installation only, and removed again in the same layer.
ENV TEMP_JDK_HOME=/tmp/jdk-8
ENV JDK_URL=https://github.com/AdoptOpenJDK/openjdk8-upstream-binaries/releases/download/jdk8u212-b04/OpenJDK8U-x64_linux_8u212b04.tar.gz
# `mkdir -p` (not `mkdir -`, which also creates a directory literally named "-"
# in the working directory); --no-cache-dir keeps pip's cache out of the layer
RUN curl --fail --show-error --location -q ${JDK_URL} -o /tmp/openjdk.tar.gz \
    && ls -lh /tmp/openjdk.tar.gz \
    && mkdir -p "${TEMP_JDK_HOME}" \
    && tar --extract \
        --file /tmp/openjdk.tar.gz \
        --directory "${TEMP_JDK_HOME}" \
        --strip-components 1 \
        --no-same-owner \
    && JAVA_HOME=${TEMP_JDK_HOME} pip3 install --no-cache-dir jep==4.0.2 \
    && rm -f /tmp/openjdk.tar.gz \
    && rm -rf "${TEMP_JDK_HOME}"
# put the jep directory under python3.8 dist-packages first on the library search path
ENV LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/jep:${LD_LIBRARY_PATH}
# remove the libjep.so shipped in grobid-home; presumably the pip-installed jep
# on LD_LIBRARY_PATH is the one meant to be loaded instead — confirm
RUN rm /opt/grobid/grobid-home/lib/lin-64/jep/libjep.so
# preload embeddings, for GROBID all the RNN models use glove-840B (default for the script), ELMo is currently not loaded
# download GROBID fine-tuned models based on SciBERT if selected
COPY --from=builder /opt/grobid-source/grobid-home/scripts/preload_embeddings.py .
# embeddings will be loaded when building and running tests
# /opt/delft is made an alias of /opt/grobid
RUN ln -s /opt/grobid /opt/delft
# dataseer-ml project tree as left by the builder stage
COPY --from=builder /opt/grobid-source/dataseer-ml /opt/grobid/dataseer-ml
#COPY --from=builder /root/.m2/repository/org /opt/grobid/dataseer-ml/lib/org
# COPY creates the missing /opt/grobid/delft/delft directories on its own, so
# no separate `RUN mkdir` layers are needed
COPY --from=builder /opt/grobid-source/grobid-home/config/resources-registry.json /opt/grobid/delft/delft/resources-registry.json
# the remaining instructions run from the dataseer-ml project directory
WORKDIR /opt/grobid/dataseer-ml
# trigger gradle wrapper install
RUN ./gradlew --version
# run the copyModels task, then delete resources/models in the same layer
RUN ./gradlew copyModels && rm -rf resources/models
# this will build and load embeddings on the image forever :)
RUN ./gradlew clean build
# default command in exec form; it is handed to the /tini ENTRYPOINT as arguments
CMD ["./gradlew", "run"]
# NOTE(review): declared after the last instruction and not referenced anywhere
# in the visible file — presumably kept so that `--build-arg GROBID_VERSION=...`
# is accepted by the build; confirm or remove
ARG GROBID_VERSION