This repository has been archived by the owner on Feb 13, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 8
/
Dockerfile
66 lines (53 loc) · 2.19 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# CentOS 7 base. NOTE(review): this tag is effectively frozen (distro EOL);
# pin by digest for full reproducibility if this image is still rebuilt.
FROM centos:7
LABEL maintainer="amiyaguchi@mozilla.com"
# UTF-8 locale so Python 3 and the JVM handle non-ASCII data.
# key=value form: the legacy space-separated ENV syntax is deprecated.
ENV LANG=en_US.utf8
# Yum repo definition that makes `yum install google-cloud-sdk` resolvable below.
COPY ./google-cloud-sdk.repo /etc/yum.repos.d/
# Install all system packages in a single layer: epel-release first so EPEL
# packages resolve in the second install, then clean the yum cache in the
# SAME layer so it never persists into the image. Packages sorted
# alphabetically for diffability.
RUN yum install -y epel-release \
    && yum install -y \
        google-cloud-sdk \
        java-1.8.0-openjdk \
        jq \
        msgpack \
        nspr \
        nss \
        parallel \
        python36 \
        rsync \
        tree \
        wget \
        which \
    && yum clean all \
    && rm -rf /var/cache/yum
# Opt out of gcloud usage reporting so builds and CI don't phone home.
RUN gcloud config set disable_usage_reporting true
# Dedicated non-root user/group with a stable numeric UID/GID (10001) so
# runtimes (e.g. Kubernetes runAsNonRoot) can verify it; no login shell,
# home directory doubles as the application directory /app.
RUN groupadd --gid 10001 app && \
useradd -g app --uid 10001 --shell /usr/sbin/nologin --create-home \
--home-dir /app app
# Absolute WORKDIR; all later relative paths (requirements, scripts) resolve here.
WORKDIR /app
# Copy only the dependency manifests first so this layer stays cached until
# the requirements files change (app source changes don't reinstall deps).
COPY requirements.txt requirements-dev.txt ./
# NOTE(review): ENV performs no tilde expansion and POSIX shells do not
# expand `~` inside PATH entries during command lookup, so the original
# `~/.local/bin` was a literal, dead path. The app user's home is /app
# (see useradd above), so spell the path out explicitly.
ENV PATH="$PATH:/app/.local/bin"
# python36 ships without pip; bootstrap it via ensurepip, then install the
# pinned requirements. --no-cache-dir keeps pip's download cache out of the
# image layer.
RUN python3 -m ensurepip && \
    pip3 install --no-cache-dir --upgrade pip wheel && \
    pip3 install --no-cache-dir -r requirements.txt -r requirements-dev.txt
# PySpark is installed by requirements.txt into site-packages; jars dropped
# into ${SPARK_HOME}/jars land on Spark's default classpath.
ENV SPARK_HOME=/usr/local/lib/python3.6/site-packages/pyspark
ENV PYSPARK_PYTHON=python3
# Install libraries for interacting with cloud storage. We utilize the s3a
# adaptor for cross-cloud compatibility, but use of the gcs connector may be
# more performant when running directly in GCP.
# https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage
# NOTE(review): the "-latest" gcs-connector jar is unpinned; pin a versioned
# jar for reproducible builds.
RUN gsutil cp gs://hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar "${SPARK_HOME}/jars"
# Fetch both AWS jars in one layer (wget accepts multiple URLs); the sdk
# bundle version is presumably chosen to match hadoop-aws 3.2.0 — verify
# against hadoop-aws's declared dependency before bumping either.
RUN wget --directory-prefix "${SPARK_HOME}/jars/" \
        https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar \
        https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar
# Use the MinIO client for cross platform behavior, even with self-hosting.
# Download and mark executable in the SAME layer so the non-executable
# intermediate file never persists as its own layer.
# NOTE(review): this URL is unversioned and unchecksummed — pin an archived
# release and verify its sha256 for reproducible, tamper-evident builds.
RUN wget --directory-prefix /usr/local/bin https://dl.min.io/client/mc/release/linux-amd64/mc \
    && chmod +x /usr/local/bin/mc
# COPY, not ADD: plain local files need none of ADD's tar/URL magic (DL3020).
COPY . /app
# Symlink the spark config into SPARK_HOME so it can be updated via volume mounts
RUN ln -s /app/config/spark "${SPARK_HOME}/conf"
# build the binary egg for distribution on Spark clusters; the editable
# install also exposes the prio / prio-processor entry points used by CMD.
RUN python3 setup.py bdist_egg && pip3 install -e .
# Hand the whole tree — including build artifacts created as root above —
# to the runtime user, then drop privileges for everything that follows.
RUN chown -R app:app /app
USER app
# Default command: run the test suite and smoke-check both CLIs. Shell form
# is intentional here — the && chain requires a shell (equivalent to
# CMD ["/bin/sh", "-c", "..."]).
CMD pytest -v tests && \
    scripts/test-cli-integration && \
    prio --help && \
    prio-processor --help