Skip to content

Commit

Permalink
Fix build
Browse files Browse the repository at this point in the history
  * Spark jars should have a dedicated artifact id
  * Use python 3 instead of python 2. This requires making /usr/bin/python point to python3
  * Add -l option to the docker useradd command to prevent a bug with long uids moby/moby#5419
  * Remove \\\ in the maven settings.xml file in the container otherwise the variables criteo.repo.username|password are not correctly used and lead to 401 issues when uploading files to nexus
  * Fix hive dependency. The groupId has to be org.spark-project.hive and the version 1.2.1.spark2
  * Use set -e in the build script to fail immediately when an error occurs in any command
  * Spark scala is a profile and cannot be activated with -D java option but rather with -P maven option
  * Use --no-transfer-progress in maven commands to make output readable
  * All mvn commands were using \\ instead of \ leading to bad command interpretation by bash
  * Fix tar command used to build the 'jar-only' tgz
  * Fix mvn jar:jar deploy:deploy parameters which are not exactly the same as the ones of mvn deploy:deploy-file
  * Upgrade pip in the venv otherwise pyarrow cannot be installed
  * Fix altDeploymentRepository declaration
  * Set pypandoc version to 1.5 as some functions have been removed in newer versions
  • Loading branch information
w.montaz committed May 16, 2023
1 parent 3b23a78 commit 632109a
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 60 deletions.
11 changes: 7 additions & 4 deletions external/docker/criteo-build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ RUN yum install -y \
java-1.8.0-openjdk \
pinentry-curses \
pkgconfig \
python2-pip \
python2-virtualenv \
python3-pip \
python3-virtualenv \
rsync \
ShellCheck \
sudo \
Expand Down Expand Up @@ -86,11 +86,14 @@ ENV MAVEN_OPTS -Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g


RUN groupadd --non-unique -g ${GROUP_ID} ${USER_NAME}
RUN useradd -g ${GROUP_ID} -u ${USER_ID} -k /root -m ${USER_NAME}
RUN useradd -l -g ${GROUP_ID} -u ${USER_ID} -k /root -m ${USER_NAME}
RUN echo "${USER_NAME} ALL=NOPASSWD: ALL" > "/etc/sudoers.d/spark-build-${USER_ID}"
ENV HOME /home/${USER_NAME}
RUN mkdir /home/${USER_NAME}/.m2 && chown ${USER_NAME}: /home/${USER_NAME}/.m2
RUN echo '<settings><mirrors><mirror><id>criteo</id><mirrorOf>*</mirrorOf><url>http://nexus.criteo.prod/content/groups/criteodev</url></mirror></mirrors><servers><server><id>criteo</id><username>\\\${criteo.repo.username}</username><password>\\\${criteo.repo.password}</password></server></servers></settings>' > /home/${USER_NAME}/.m2/settings.xml
RUN echo '<settings><mirrors><mirror><id>criteo</id><mirrorOf>*</mirrorOf><url>http://nexus.criteo.prod/content/groups/criteodev</url></mirror></mirrors><servers><server><id>criteo</id><username>${criteo.repo.username}</username><password>${criteo.repo.password}</password></server></servers></settings>' > /home/${USER_NAME}/.m2/settings.xml

# Alias python3 to python otherwise python 2 is called
RUN mv /usr/bin/python /usr/bin/python2
RUN ln -s /usr/bin/python3 /usr/bin/python

RUN rm -f /var/log/faillog /var/log/lastlog
2 changes: 1 addition & 1 deletion external/docker/criteo-build/build_config.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
HDP_VERSION=3.3.0-criteo-20230320100819
HIVE_VERSION=1.2.1
HIVE_VERSION=1.2.1.spark2
102 changes: 50 additions & 52 deletions external/docker/criteo-build/build_script.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
set -x
set -e

MAVEN_USER=$1
MAVEN_PASSWORD=$2
Expand All @@ -20,7 +21,7 @@ if [ ${SCALA_RELEASE} == "2.12" ]; then
MVN_SCALA_PROPERTY="-Pscala-2.12"
elif [ ${SCALA_RELEASE} == "2.11" ]; then
./dev/change-scala-version.sh 2.11
MVN_SCALA_PROPERTY="-Dscala-2.11"
MVN_SCALA_PROPERTY="-Pscala-2.11"
else
echo "[ERROR] Scala release not provided"
exit 1
Expand All @@ -36,83 +37,80 @@ MVN_HDP_ARTIFACT_VERSION="${MVN_ARTIFACT_VERSION}-${HDP_VERSION}"
SHUFFLE_SERVICE_JAR_FILE="dist/yarn/spark-${CRITEO_VERSION}-yarn-shuffle.jar"
MVN_COMMON_PROPERTIES="-Dhive.version=${HIVE_VERSION} ${MVN_SCALA_PROPERTY}"
MVN_COMMON_PROPERTIES_NO_TESTS="${MVN_COMMON_PROPERTIES} -DskipTests"
MVN_COMMON_NEXUS_PROPERTIES="-DrepositoryId=criteo -Dcriteo.repo.username=${MAVEN_USER} -Dcriteo.repo.password=${MAVEN_PASSWORD} -DretryFailedDeploymentCount=3"
MVN_COMMON_DEPLOY_FILE_PROPERTIES="-Durl=${NEXUS_ARTIFACT_URL} -DrepositoryId=criteo -Dcriteo.repo.username=${MAVEN_USER} -Dcriteo.repo.password=${MAVEN_PASSWORD} -DretryFailedDeploymentCount=3"

# do some house cleaning
mvn clean
mvn --no-transfer-progress clean
rm -f spark-*.tgz
rm -f dist/python/dist/*
rm -f python/dist/*

# change version
mvn versions:set -DnewVersion=${CRITEO_VERSION}
mvn --no-transfer-progress versions:set -DnewVersion=${CRITEO_VERSION}

# Build distribution with hadoop
./dev/make-distribution.sh --pip --name ${SCALA_RELEASE}-${HDP_VERSION} --tgz -Phive -Phive-thriftserver -Pyarn -Dhadoop.version=${HDP_VERSION} ${MVN_COMMON_PROPERTIES_NO_TESTS}

./dev/make-distribution.sh --pip --name ${SCALA_RELEASE}-${HDP_VERSION} --tgz --no-transfer-progress -Phive -Phive-thriftserver -Pyarn -Dhadoop.version=${HDP_VERSION} ${MVN_COMMON_PROPERTIES_NO_TESTS}
# tgz artifact deployment
mvn deploy:deploy-file \\
--batch-mode \\
-DgroupId=com.criteo.tarballs \\
-DartifactId=spark \\
-Dversion=${MVN_HDP_ARTIFACT_VERSION} \\
-Dpackaging=tar.gz \\
-Dfile=${SPARK_HDP_ARTIFACT_FILE} \\
-Durl=${NEXUS_ARTIFACT_URL} \\
${MVN_COMMON_NEXUS_PROPERTIES}
mvn deploy:deploy-file \
--batch-mode \
-DgroupId=com.criteo.tarballs \
-DartifactId=spark \
-Dversion=${MVN_HDP_ARTIFACT_VERSION} \
-Dpackaging=tar.gz \
-Dfile=${SPARK_HDP_ARTIFACT_FILE} \
${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# Build distribution without hadoop
./dev/make-distribution.sh --pip --name ${SCALA_RELEASE} --tgz -Phive -Phive-thriftserver -Pyarn -Phadoop-provided ${MVN_COMMON_PROPERTIES}
./dev/make-distribution.sh --pip --name ${SCALA_RELEASE} --tgz --no-transfer-progress -Phive -Phive-thriftserver -Pyarn -Phadoop-provided ${MVN_COMMON_PROPERTIES}
# tgz artifact deployment
mvn deploy:deploy-file \\
--batch-mode \\
-DgroupId=com.criteo.tarballs \\
-DartifactId=spark \\
-Dversion=${MVN_ARTIFACT_VERSION} \\
-Dpackaging=tar.gz \\
-Dfile=${SPARK_ARTIFACT_FILE} \\
-Durl=${NEXUS_ARTIFACT_URL} \\
${MVN_COMMON_NEXUS_PROPERTIES}
mvn deploy:deploy-file \
--batch-mode \
-DgroupId=com.criteo.tarballs \
-DartifactId=spark \
-Dversion=${MVN_ARTIFACT_VERSION} \
-Dpackaging=tar.gz \
-Dfile=${SPARK_ARTIFACT_FILE} \
${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# Create archive with jars only
cd dist/jars && tar -czf ${OLDPWD}/${SPARK_JARS_ARTIFACT_FILE} dist/jars; cd $OLDPWD
cd dist/jars && tar -czf ${OLDPWD}/${SPARK_JARS_ARTIFACT_FILE} *.jar; cd $OLDPWD

# Deploy tgz jars only artifact
mvn deploy:deploy-file \\
--batch-mode \\
-DgroupId=com.criteo.tarballs \\
-DartifactId=spark \\
-Dversion=${MVN_ARTIFACT_VERSION} \\
-Dpackaging=tar.gz \\
-Dfile=${SPARK_JARS_ARTIFACT_FILE} \\
-Durl=${NEXUS_ARTIFACT_URL} \\
${MVN_COMMON_NEXUS_PROPERTIES}
mvn deploy:deploy-file \
--batch-mode \
-DgroupId=com.criteo.tarballs \
-DartifactId=spark-jars \
-Dversion=${MVN_ARTIFACT_VERSION} \
-Dpackaging=tar.gz \
-Dfile=${SPARK_JARS_ARTIFACT_FILE} \
${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# shuffle service deployment
mvn deploy:deploy-file \\
--batch-mode \\
-DgroupId=org.apache.spark \\
-DartifactId=yarn-shuffle_${SCALA_RELEASE} \\
-Dversion=${CRITEO_VERSION} \\
-Dpackaging=jar \\
-Dfile=${SHUFFLE_SERVICE_JAR_FILE} \\
-Durl=${NEXUS_ARTIFACT_URL} \\
${MVN_COMMON_NEXUS_PROPERTIES}
mvn deploy:deploy-file \
--batch-mode \
-DgroupId=org.apache.spark \
-DartifactId=yarn-shuffle_${SCALA_RELEASE} \
-Dversion=${CRITEO_VERSION} \
-Dpackaging=jar \
-Dfile=${SHUFFLE_SERVICE_JAR_FILE} \
${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# jar artifacts (for parent poms) deployment
mvn jar:jar deploy:deploy \\
--batch-mode \\
-Phive -Phive-thriftserver \\
-Pyarn \\
-Phadoop-provided \\
-DaltDeploymentRepository=criteo::${NEXUS_ARTIFACT_URL} \\
${MVN_COMMON_NEXUS_PROPERTIES}
mvn jar:jar deploy:deploy \
--batch-mode \
-Phive -Phive-thriftserver \
-Pyarn \
-Phadoop-provided \
-DaltDeploymentRepository=criteo::default::${NEXUS_ARTIFACT_URL} \
-Dcriteo.repo.username=${MAVEN_USER} \
-Dcriteo.repo.password=${MAVEN_PASSWORD}

# python deployment
pyspark_version=${SPARK_RELEASE}+criteo_${SCALA_RELEASE}.${TIMESTAMP}
sed -i "s/__version__ = \\\".*\\\"/__version__ = \\\"${pyspark_version}\\\"/g" python/pyspark/version.py
python2.7 -m venv venv
python -m venv venv
source venv/bin/activate
pip install --upgrade pip
pip install -r python/requirements.txt
cd python
python setup.py bdist_wheel
Expand Down
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@
<flume.version>1.6.0</flume.version>
<zookeeper.version>3.4.14</zookeeper.version>
<curator.version>2.13.0</curator.version>
<hive.group>org.apache.hive</hive.group>
<hive.group>org.spark-project.hive</hive.group>
<!-- Version used in Maven Hive dependency -->
<hive.version>1.2.1.spark2</hive.version>
<!-- Version used for internal directory structure -->
Expand Down Expand Up @@ -2632,7 +2632,7 @@
<profile>
<id>hive-1.2</id>
<properties>
<hive.group>org.apache.hive</hive.group>
<hive.group>org.spark-project.hive</hive.group>
<hive.classifier></hive.classifier>
<!-- Version used in Maven Hive dependency -->
<hive.version>1.2.1.spark2</hive.version>
Expand Down
2 changes: 1 addition & 1 deletion python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
wheel
numpy
pandas
pypandoc
pypandoc==1.5
py4j==0.10.7
pyarrow
twine
Expand Down

0 comments on commit 632109a

Please sign in to comment.