Fix build
  * Spark jars should have a dedicated artifactId
  * Use Python 3 instead of Python 2. This requires making /usr/bin/python point to python3
  * Add the -l option to the useradd command in the Dockerfile to prevent a bug with long UIDs (moby/moby#5419)
  * Remove the \\\ escaping in the container's Maven settings.xml; otherwise the criteo.repo.username|password variables are not substituted correctly, which leads to 401 errors when uploading files to Nexus
  * Fix the Hive dependency: the groupId has to be org.spark-project.hive and the artifact version 1.2.1.spark2
  * Use set -e in the build script to fail immediately when any command errors
  * The Scala version is a Maven profile, so it cannot be activated with a -D Java option; it needs the -P Maven option (see the sketch after this list)
  * Use --no-transfer-progress in Maven commands to keep the output readable
  * All mvn commands used \\ instead of \ for line continuations, leading to bad command interpretation by bash
  * Fix the tar command used to build the 'jar-only' tgz
  * Fix the mvn jar:jar deploy:deploy parameters, which are not exactly the same as those of mvn deploy:deploy-file
  * Upgrade pip in the venv; otherwise pyarrow cannot be installed
  * Fix the altDeploymentRepository declaration (also covered in the sketch after this list)
  * Pin pypandoc to 1.5, as some functions have been removed in newer versions
  * Set back the version of build-helper-maven-plugin and change maven-shade-plugin (an error occurred during git apply of the patch)
  * Remove leftover 2.4.3-criteo versions in pom.xml
  * Fix a compilation error in sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
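
For reference, a minimal sketch of the corrected Maven usage described above — the profile names follow the build script below, while the repository URL and goals shown here are illustrative, not the exact build commands:

# Profiles such as scala-2.11 must be activated with -P; -D only sets properties.
mvn --no-transfer-progress -Pscala-2.11 -Phive -Pyarn -DskipTests package

# With maven-deploy-plugin 2.x, altDeploymentRepository takes the form
# id::layout::url; the "default" layout token was missing before this fix.
mvn jar:jar deploy:deploy \
    --batch-mode \
    -DaltDeploymentRepository=criteo::default::http://nexus.example/repo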
w.montaz committed May 16, 2023
1 parent 14f4997 commit 10307ce
Showing 16 changed files with 77 additions and 74 deletions.
2 changes: 1 addition & 1 deletion common/kvstore/pom.xml
@@ -22,7 +22,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/avro/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

11 changes: 7 additions & 4 deletions external/docker/criteo-build/Dockerfile
@@ -57,8 +57,8 @@ RUN yum install -y \
java-1.8.0-openjdk \
pinentry-curses \
pkgconfig \
-python2-pip \
-python2-virtualenv \
+python3-pip \
+python3-virtualenv \
rsync \
ShellCheck \
sudo \
@@ -86,11 +86,14 @@ ENV MAVEN_OPTS -Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g


RUN groupadd --non-unique -g ${GROUP_ID} ${USER_NAME}
-RUN useradd -g ${GROUP_ID} -u ${USER_ID} -k /root -m ${USER_NAME}
+RUN useradd -l -g ${GROUP_ID} -u ${USER_ID} -k /root -m ${USER_NAME}
RUN echo "${USER_NAME} ALL=NOPASSWD: ALL" > "/etc/sudoers.d/spark-build-${USER_ID}"
ENV HOME /home/${USER_NAME}
RUN mkdir /home/${USER_NAME}/.m2 && chown ${USER_NAME}: /home/${USER_NAME}/.m2
-RUN echo '<settings><mirrors><mirror><id>criteo</id><mirrorOf>*</mirrorOf><url>http://nexus.criteo.prod/content/groups/criteodev</url></mirror></mirrors><servers><server><id>criteo</id><username>\\\${criteo.repo.username}</username><password>\\\${criteo.repo.password}</password></server></servers></settings>' > /home/${USER_NAME}/.m2/settings.xml
+RUN echo '<settings><mirrors><mirror><id>criteo</id><mirrorOf>*</mirrorOf><url>http://nexus.criteo.prod/content/groups/criteodev</url></mirror></mirrors><servers><server><id>criteo</id><username>${criteo.repo.username}</username><password>${criteo.repo.password}</password></server></servers></settings>' > /home/${USER_NAME}/.m2/settings.xml

+# Alias python3 to python otherwise python 2 is called
+RUN mv /usr/bin/python /usr/bin/python2
+RUN ln -s /usr/bin/python3 /usr/bin/python

RUN rm -f /var/log/faillog /var/log/lastlog
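
The mv/ln pair above is one way to repoint /usr/bin/python. A hypothetical equivalent on a CentOS 8 base image, assuming the unversioned python command there is managed by alternatives, would be:

# Assumes an alternatives-managed python stub (RHEL/CentOS 8 style)
RUN alternatives --set python /usr/bin/python3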
2 changes: 1 addition & 1 deletion external/docker/criteo-build/build_config.sh
@@ -1,2 +1,2 @@
HDP_VERSION=3.3.0-criteo-20230320100819
-HIVE_VERSION=1.2.1
+HIVE_VERSION=1.2.1.spark2
101 changes: 50 additions & 51 deletions external/docker/criteo-build/build_script.sh
@@ -1,4 +1,5 @@
set -x
+set -e

MAVEN_USER=$1
MAVEN_PASSWORD=$2
@@ -20,7 +21,7 @@ if [ ${SCALA_RELEASE} == "2.12" ]; then
MVN_SCALA_PROPERTY="-Pscala-2.12"
elif [ ${SCALA_RELEASE} == "2.11" ]; then
./dev/change-scala-version.sh 2.11
MVN_SCALA_PROPERTY="-Dscala-2.11"
MVN_SCALA_PROPERTY="-Pscala-2.11"
else
echo "[ERROR] Scala release not provided"
exit 1
@@ -36,83 +37,81 @@ MVN_HDP_ARTIFACT_VERSION="${MVN_ARTIFACT_VERSION}-${HDP_VERSION}"
SHUFFLE_SERVICE_JAR_FILE="dist/yarn/spark-${CRITEO_VERSION}-yarn-shuffle.jar"
MVN_COMMON_PROPERTIES="-Dhive.version=${HIVE_VERSION} ${MVN_SCALA_PROPERTY}"
MVN_COMMON_PROPERTIES_NO_TESTS="${MVN_COMMON_PROPERTIES} -DskipTests"
-MVN_COMMON_NEXUS_PROPERTIES="-DrepositoryId=criteo -Dcriteo.repo.username=${MAVEN_USER} -Dcriteo.repo.password=${MAVEN_PASSWORD} -DretryFailedDeploymentCount=3"
+MVN_COMMON_DEPLOY_FILE_PROPERTIES="-Durl=${NEXUS_ARTIFACT_URL} -DrepositoryId=criteo -Dcriteo.repo.username=${MAVEN_USER} -Dcriteo.repo.password=${MAVEN_PASSWORD} -DretryFailedDeploymentCount=3"

# do some house cleaning
-mvn clean
+mvn --no-transfer-progress clean
rm -f spark-*.tgz
rm -f dist/python/dist/*
rm -f python/dist/*

# change version
-mvn versions:set -DnewVersion=${CRITEO_VERSION}
+mvn --no-transfer-progress versions:set -DnewVersion=${CRITEO_VERSION}

# Build distribution with hadoop
-./dev/make-distribution.sh --pip --name ${SCALA_RELEASE}-${HDP_VERSION} --tgz -Phive -Phive-thriftserver -Pyarn -Dhadoop.version=${HDP_VERSION} ${MVN_COMMON_PROPERTIES_NO_TESTS}
+./dev/make-distribution.sh --pip --name ${SCALA_RELEASE}-${HDP_VERSION} --tgz -ntp -Phive -Phive-thriftserver -Pyarn -Dhadoop.version=${HDP_VERSION} ${MVN_COMMON_PROPERTIES_NO_TESTS}

# tgz artifact deployment
-mvn deploy:deploy-file \\
-    --batch-mode \\
-    -DgroupId=com.criteo.tarballs \\
-    -DartifactId=spark \\
-    -Dversion=${MVN_HDP_ARTIFACT_VERSION} \\
-    -Dpackaging=tar.gz \\
-    -Dfile=${SPARK_HDP_ARTIFACT_FILE} \\
-    -Durl=${NEXUS_ARTIFACT_URL} \\
-    ${MVN_COMMON_NEXUS_PROPERTIES}
+mvn deploy:deploy-file \
+    --batch-mode \
+    -DgroupId=com.criteo.tarballs \
+    -DartifactId=spark \
+    -Dversion=${MVN_HDP_ARTIFACT_VERSION} \
+    -Dpackaging=tar.gz \
+    -Dfile=${SPARK_HDP_ARTIFACT_FILE} \
+    ${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# Build distribution without hadoop
-./dev/make-distribution.sh --pip --name ${SCALA_RELEASE} --tgz -Phive -Phive-thriftserver -Pyarn -Phadoop-provided ${MVN_COMMON_PROPERTIES}
+./dev/make-distribution.sh --pip --name ${SCALA_RELEASE} --tgz -ntp -Phive -Phive-thriftserver -Pyarn -Phadoop-provided ${MVN_COMMON_PROPERTIES}
# tgz artifact deployment
-mvn deploy:deploy-file \\
-    --batch-mode \\
-    -DgroupId=com.criteo.tarballs \\
-    -DartifactId=spark \\
-    -Dversion=${MVN_ARTIFACT_VERSION} \\
-    -Dpackaging=tar.gz \\
-    -Dfile=${SPARK_ARTIFACT_FILE} \\
-    -Durl=${NEXUS_ARTIFACT_URL} \\
-    ${MVN_COMMON_NEXUS_PROPERTIES}
+mvn deploy:deploy-file \
+    --batch-mode \
+    -DgroupId=com.criteo.tarballs \
+    -DartifactId=spark \
+    -Dversion=${MVN_ARTIFACT_VERSION} \
+    -Dpackaging=tar.gz \
+    -Dfile=${SPARK_ARTIFACT_FILE} \
+    ${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# Create archive with jars only
-cd dist/jars && tar -czf ${OLDPWD}/${SPARK_JARS_ARTIFACT_FILE} dist/jars; cd $OLDPWD
+cd dist/jars && tar -czf ${OLDPWD}/${SPARK_JARS_ARTIFACT_FILE} *.jar; cd $OLDPWD

# Deploy tgz jars only artifact
-mvn deploy:deploy-file \\
-    --batch-mode \\
-    -DgroupId=com.criteo.tarballs \\
-    -DartifactId=spark \\
-    -Dversion=${MVN_ARTIFACT_VERSION} \\
-    -Dpackaging=tar.gz \\
-    -Dfile=${SPARK_JARS_ARTIFACT_FILE} \\
-    -Durl=${NEXUS_ARTIFACT_URL} \\
-    ${MVN_COMMON_NEXUS_PROPERTIES}
+mvn deploy:deploy-file \
+    --batch-mode \
+    -DgroupId=com.criteo.tarballs \
+    -DartifactId=spark-jars \
+    -Dversion=${MVN_ARTIFACT_VERSION} \
+    -Dpackaging=tar.gz \
+    -Dfile=${SPARK_JARS_ARTIFACT_FILE} \
+    ${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# shuffle service deployment
-mvn deploy:deploy-file \\
-    --batch-mode \\
-    -DgroupId=org.apache.spark \\
-    -DartifactId=yarn-shuffle_${SCALA_RELEASE} \\
-    -Dversion=${CRITEO_VERSION} \\
-    -Dpackaging=jar \\
-    -Dfile=${SHUFFLE_SERVICE_JAR_FILE} \\
-    -Durl=${NEXUS_ARTIFACT_URL} \\
-    ${MVN_COMMON_NEXUS_PROPERTIES}
+mvn deploy:deploy-file \
+    --batch-mode \
+    -DgroupId=org.apache.spark \
+    -DartifactId=yarn-shuffle_${SCALA_RELEASE} \
+    -Dversion=${CRITEO_VERSION} \
+    -Dpackaging=jar \
+    -Dfile=${SHUFFLE_SERVICE_JAR_FILE} \
+    ${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# jar artifacts (for parent poms) deployment
-mvn jar:jar deploy:deploy \\
-    --batch-mode \\
-    -Phive -Phive-thriftserver \\
-    -Pyarn \\
-    -Phadoop-provided \\
-    -DaltDeploymentRepository=criteo::${NEXUS_ARTIFACT_URL} \\
-    ${MVN_COMMON_NEXUS_PROPERTIES}
+mvn jar:jar deploy:deploy \
+    --batch-mode \
+    -Phive -Phive-thriftserver \
+    -Pyarn \
+    -Phadoop-provided \
+    -DaltDeploymentRepository=criteo::default::${NEXUS_ARTIFACT_URL} \
+    -Dcriteo.repo.username=${MAVEN_USER} \
+    -Dcriteo.repo.password=${MAVEN_PASSWORD}

# python deployment
pyspark_version=${SPARK_RELEASE}+criteo_${SCALA_RELEASE}.${TIMESTAMP}
sed -i "s/__version__ = \\\".*\\\"/__version__ = \\\"${pyspark_version}\\\"/g" python/pyspark/version.py
-python2.7 -m venv venv
+python -m venv venv
source venv/bin/activate
+pip install --upgrade pip
pip install -r python/requirements.txt
cd python
python setup.py bdist_wheel
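
A quick local sanity check of the wheel built above — a sketch, assuming the wheel lands in python/dist/ and the venv is still active:

pip install dist/pyspark-*.whl
python -c "import pyspark; print(pyspark.__version__)"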
2 changes: 1 addition & 1 deletion external/flume-assembly/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/flume-sink/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/flume/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/kafka-0-10-assembly/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/kafka-0-10-sql/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/kafka-0-10/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/kafka-0-8-assembly/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/kafka-0-8/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
-<version>2.4.3-criteo</version>
+<version>2.4.3</version>
<relativePath>../../pom.xml</relativePath>
</parent>

7 changes: 4 additions & 3 deletions pom.xml
@@ -124,7 +124,7 @@
<flume.version>1.6.0</flume.version>
<zookeeper.version>3.4.14</zookeeper.version>
<curator.version>2.13.0</curator.version>
-<hive.group>org.apache.hive</hive.group>
+<hive.group>org.spark-project.hive</hive.group>
<!-- Version used in Maven Hive dependency -->
<hive.version>1.2.1.spark2</hive.version>
<!-- Version used for internal directory structure -->
@@ -2161,7 +2161,7 @@
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
-<version>3.4.1</version>
+<version>3.0.0</version>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
@@ -2555,6 +2555,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
+<version>3.4.1</version>
<configuration>
<shadedArtifactAttached>false</shadedArtifactAttached>
<artifactSet>
@@ -2827,7 +2828,7 @@
<profile>
<id>hive-1.2</id>
<properties>
-<hive.group>org.apache.hive</hive.group>
+<hive.group>org.spark-project.hive</hive.group>
<hive.classifier></hive.classifier>
<!-- Version used in Maven Hive dependency -->
<hive.version>1.2.1.spark2</hive.version>
2 changes: 1 addition & 1 deletion python/requirements.txt
@@ -1,7 +1,7 @@
wheel
numpy
pandas
-pypandoc
+pypandoc==1.5
py4j==0.10.7
pyarrow
twine
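
The pypandoc pin above guards against the removal of the long-deprecated convert() helper, which newer pypandoc releases dropped and which Spark 2.x's setup.py presumably still calls — these are likely the removed functions the commit message refers to. A hypothetical reproduction of the failure:

# Works with pypandoc==1.5; raises AttributeError on releases that removed convert()
python -c "import pypandoc; pypandoc.convert('README.md', 'rst')"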
8 changes: 4 additions & 4 deletions sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -290,8 +290,8 @@ class SQLConfSuite extends QueryTest with SharedSQLContext {
assert(spark.sessionState.conf.getConfString(fallback.key, "lzo") === "lzo")

val displayValue = spark.sessionState.conf.getAllDefinedConfs
-.find { case (key, _, _) => key == fallback.key }
-.map { case (_, v, _) => v }
+.find { case (key, _, _, _) => key == fallback.key }
+.map { case (_, v, _, _) => v }
.get
assert(displayValue === fallback.defaultValueString)

@@ -302,8 +302,8 @@ class SQLConfSuite extends QueryTest with SharedSQLContext {
assert(spark.sessionState.conf.getConfString(fallback.key) === "lzo")

val newDisplayValue = spark.sessionState.conf.getAllDefinedConfs
-.find { case (key, _, _) => key == fallback.key }
-.map { case (_, v, _) => v }
+.find { case (key, _, _, _) => key == fallback.key }
+.map { case (_, v, _, _) => v }
.get
assert(newDisplayValue === "lzo")

