Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
Move user code background (#1461)
Browse files Browse the repository at this point in the history
* move user command to background, remove kill -9

* reduce check time to 30s

* mount yarn liveness file readonly
  • Loading branch information
mzmssg committed Oct 10, 2018
1 parent 57cde2d commit 5be7ffe
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 23 deletions.
36 changes: 26 additions & 10 deletions src/rest-server/src/templates/dockerContainerScript.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ function exit_handler()
{
printf "%s %s\n" \
"[DEBUG]" "Docker container exit handler: EXIT signal received in docker container, exiting ..."
kill 0
}

set -x
Expand All @@ -36,11 +35,7 @@ trap exit_handler EXIT


# Create this container's marker file /alive/docker_$PAI_CONTAINER_ID, or
# refresh its timestamps if it already exists.
touch "/alive/docker_$PAI_CONTAINER_ID"
# Background watchdog, re-evaluated every 20 seconds: when the modification
# time of /alive/yarn_$PAI_CONTAINER_ID lies more than 60 seconds in the past,
# SIGKILL is sent to the processes pkill selects via "--ns 1" (those sharing
# namespaces with PID 1).
# NOTE(review): presumably the YARN-side script keeps refreshing that file as
# a heartbeat -- the writer is not visible here; confirm.
while /bin/true; do
[ $(( $(date +%s) - $(stat -c %Y /alive/yarn_$PAI_CONTAINER_ID) )) -gt 60 ] \
&& pkill -9 --ns 1
sleep 20
done &



# Record the directory the script was started from and export it so that
# child processes see it. Assignment and export are kept separate so a
# failure of the command substitution is not masked by the exit status of
# `export` itself.
PAI_WORK_DIR="$(pwd)"
export PAI_WORK_DIR
Expand Down Expand Up @@ -183,8 +178,29 @@ fi
# Write env to system-wide environment
# Only the lines of `env` output that match the pattern are kept, and
# /etc/environment is overwritten (truncated), not appended to.
# NOTE(review): "^" anchors only the PAI alternative; PATH, PREFIX, JAVA,
# HADOOP, NVIDIA and CUDA match anywhere in a line, including inside a
# variable's value -- confirm that this is intended.
env | grep -E "^PAI|PATH|PREFIX|JAVA|HADOOP|NVIDIA|CUDA" > /etc/environment

# Run the user command inline, bracketed by START/END markers on stdout.
# {{{ taskData.command }}} is a mustache placeholder that is replaced with the
# task's command text when this template is rendered.
# On a non-zero status the script exits immediately with that status, so the
# END marker is printed only after a successful command.
# NOTE(review): if the rendered command spans several statements, `|| exit $?`
# guards only the last one -- confirm against how commands are submitted.
printf "%s %s\n\n" "[INFO]" "USER COMMAND START"
{{{ taskData.command }}} || exit $?
printf "\n%s %s\n\n" "[INFO]" "USER COMMAND END"
# Runs the templated user command between START/END log markers.
#
# {{{ taskData.command }}} is a mustache placeholder that is replaced with the
# task's command text when this template is rendered.
#
# Arguments: none
# Outputs:   the START/END markers plus whatever the user command prints
# Exit:      uses `exit`, not `return` -- the user command's non-zero status
#            on failure, otherwise 0. The function is launched with `&`, so
#            the exit ends only that background job, not the calling script.
# NOTE(review): if the rendered command spans several statements, `|| exit $?`
# guards only the last one -- confirm against how commands are submitted.
function run_user_command()
{
printf "%s %s\n\n" "[INFO]" "USER COMMAND START"
{{{ taskData.command }}} || exit $?
# Reached only when the user command succeeded.
printf "\n%s %s\n\n" "[INFO]" "USER COMMAND END"
exit 0
}

# Launch the user command as a background job so this script can keep
# watching the YARN liveness file while the command runs.
run_user_command &
user_command_pid=$!

# Liveness file whose modification time is compared with the current time,
# the age (seconds) at which it counts as stale, and the polling interval.
yarn_alive_file="/alive/yarn_${PAI_CONTAINER_ID}"
yarn_alive_timeout=30
yarn_alive_poll_interval=20

# Poll for as long as the user command is still running and the liveness file
# is fresh. `kill -0` sends no signal; it only tests that the PID exists.
while kill -0 "$user_command_pid" 2>/dev/null; do
  # A liveness file that cannot be stat'ed is treated the same as a stale one
  # instead of relying on the arithmetic error an empty operand would cause.
  yarn_alive_mtime=$(stat -c %Y "$yarn_alive_file") || break
  if [ "$(( $(date +%s) - yarn_alive_mtime ))" -ge "$yarn_alive_timeout" ]; then
    break
  fi
  sleep "$yarn_alive_poll_interval"
done

if kill -0 "$user_command_pid" 2>/dev/null; then
  # The user command is still alive, so the loop ended because the liveness
  # file went stale (or is missing); this is reported as the job being killed.
  echo "job has been killed, docker container exiting"
  exit 0
fi

# The user command has finished: reap it and propagate its exit status.
wait "$user_command_pid"
user_command_exitcode=$?
echo "job has finished with exit code $user_command_exitcode"
exit "$user_command_exitcode"
18 changes: 5 additions & 13 deletions src/rest-server/src/templates/yarnContainerScript.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -41,23 +41,14 @@ function exit_handler()
local handler="Yarn container exit handler"
debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..."

debug_log "$handler" "clean the container code"
rm -fr tmp/pai-root/code 2>/dev/null

debug_log "$handler" "trying to kill docker container $docker_name"
pid=$(docker inspect --format={{{ inspectFormat }}} $docker_name 2>/dev/null)
if [ $pid ]; then
kill -9 $pid &&\
debug_log "$handler" "docker caontainer $docker_name killed successfully." ||\
debug_log "$handler" "tries to kill the container $docker_name but failed. Maybe it has already exited."
else
debug_log "$handler" "docker container $docker_name has already exited"
fi

debug_log "$handler" "write exit code to file"
debug_log "$handler" "yarn container exit code: $rc"
debug_log "$handler" "exit code file path: /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"
echo $rc > "/var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"

debug_log "$handler" "clean the container code"
rm -fr tmp/pai-root/code 2>/dev/null

exit $rc
}

Expand Down Expand Up @@ -265,6 +256,7 @@ docker run --name $docker_name \
--device=/dev/fuse \
--security-opt apparmor:unconfined \
--volume /tmp/pai-root/alive/$APP_ID:/alive \
--volume /tmp/pai-root/alive/$APP_ID/yarn_$CONTAINER_ID:/alive/yarn_$CONTAINER_ID:ro \
--volume /tmp/pai-root/log/$APP_ID/$CONTAINER_ID:/pai/log \
--volume $container_local_dir/$bootstrap_dir:/pai/bootstrap:ro \
--volume $container_local_dir/$code_dir:/pai/code:ro \
Expand Down

0 comments on commit 5be7ffe

Please sign in to comment.