Skip to content

Orquesta CI

Orquesta CI #4054

# We run orquesta integration tests as part of a separate workflow.
# Orquesta tests have a lot of race conditions which result in intermittent failures and timeouts.
# Utilizing separate workflow allows us to re-run just this workflow / job on failure instead of
# wasting time and resources by needing to re-run all the jobs.
# Workflow name as displayed in the GitHub Actions UI.
name: Orquesta CI

# Events that start this workflow: pushes to release branches / version tags,
# pull requests targeting those branches, and a nightly schedule.
on:
  push:
    branches:
      # only on merges to master branch
      - master
      # and version branches, which only include minor versions (eg: v3.4)
      - v[0-9]+.[0-9]+
    tags:
      # also version tags, which include bugfix releases (eg: v3.4.0)
      - v[0-9]+.[0-9]+.[0-9]+
  pull_request:
    # The activity-type filter key is `types` (plural); the previous `type` key is not
    # part of the workflow syntax. `synchronize` is listed explicitly so that new commits
    # pushed to an open PR keep triggering this workflow once the filter actually applies.
    types: [opened, synchronize, reopened, edited]
    branches:
      # Only for PRs targeting those branches
      - master
      - v[0-9]+.[0-9]+
  schedule:
    # run every night at midnight (GitHub evaluates cron schedules in UTC)
    - cron: '0 0 * * *'
jobs:
  # TODO: Fix the required checks!
  # When the pre_job triggers and skips builds, it prevents merging the PR because
  # the required checks are reported as skipped instead of passed.
  #
  # Special job which automatically cancels old runs for the same branch, prevents runs for the
  # same file set which has already passed, etc.
  pre_job:
    name: 'Skip Duplicate Jobs Pre Job'
    runs-on: 'ubuntu-20.04'
    outputs:
      # Re-export the skip_check step's result so jobs that `needs: pre_job` can read it.
      should_skip: '${{ steps.skip_check.outputs.should_skip }}'
    steps:
      - id: skip_check
        # Pinned to a full commit SHA; corresponds to release v3.3.0.
        uses: fkirc/skip-duplicate-actions@4c656bbdb6906310fa6213604828008bc28fe55d
        with:
          cancel_others: 'true'
          github_token: '${{ github.token }}'
integration-tests:
  # Runs only after pre_job has finished.
  needs:
    - pre_job
  # NOTE: We always want to run job on master since we run some additional checks there (code
  # coverage, etc)
  # The skip condition below is currently disabled (commented out), so this job does not
  # consult pre_job's should_skip output.
  # if: ${{ needs.pre_job.outputs.should_skip != 'true' || github.ref == 'refs/heads/master' }}
  name: '${{ matrix.name }} - Python ${{ matrix.python-version-short }}'
  runs-on: 'ubuntu-20.04'
  strategy:
    # Keep the other matrix entry running even if one of them fails.
    fail-fast: false
    matrix:
      # NOTE: We need to use full Python version as part of Python deps cache key otherwise
      # setup virtualenv step will fail.
      include:
        # Python 3.8 variant
        - name: 'Integration Tests (Orquesta)'
          task: 'ci-orquesta'
          nosetests_node_total: 1
          nosetests_node_index: 0
          python-version-short: '3.8'
          python-version: '3.8.10'
        # Python 3.9 variant
        - name: 'Integration Tests (Orquesta)'
          task: 'ci-orquesta'
          nosetests_node_total: 1
          nosetests_node_index: 0
          python-version-short: '3.9'
          python-version: '3.9.14'
# Service containers started alongside the job; ports are published on the runner host.
services:
  mongo:
    image: 'mongo:4.4'
    ports:
      - '27017:27017'
  rabbitmq:
    image: 'rabbitmq:3.8-management'
    # Give the container a fixed name.
    options: '--name rabbitmq'
    ports:
      - '5671:5671/tcp'    # AMQP SSL port
      - '5672:5672/tcp'    # AMQP standard port
      - '15672:15672/tcp'  # Management: HTTP, CLI
  # Used for the coordination backend for integration tests
  # NOTE: To speed things up, we only start redis for integration tests
  # where it's needed
  # redis:
  #   # Docker Hub image
  #   image: redis
  #   # Set health checks to wait until redis has started
  #   options: >-
  #     --name "redis"
  #     --health-cmd "redis-cli ping"
  #     --health-interval 10s
  #     --health-timeout 5s
  #     --health-retries 5
  #   ports:
  #     - 6379:6379/tcp
# Environment variables shared by every step of the job.
env:
  # Values taken from the current matrix entry.
  TASK: '${{ matrix.task }}'
  NODE_TOTAL: '${{ matrix.nosetests_node_total }}'
  NODE_INDEX: '${{ matrix.nosetests_node_index }}'

  # We need to explicitly specify terminal width otherwise some CLI tests fail on container
  # environments where small terminal size is used.
  COLUMNS: '120'

  # CI st2.conf (with ST2_CI_USER user instead of stanley)
  ST2_CONF: 'conf/st2.ci.conf'

  # Tell StackStorm that we are indeed in CI mode, previously we hard coded a Travis specific
  # environment variable in our test code, making it a PITA when we switch CI providers.
  # Now, we simply set this environment variable here in the CI portion of our testing and
  # it avoids any CI provider type lock-in.
  ST2_CI: 'true'

  # Name of the user who is running the CI (on GitHub Actions this is 'runner')
  ST2_CI_USER: 'runner'

  # GitHub is juggling how to set vars for multiple shells. Protect our PATH assumptions.
  PATH: '/home/runner/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin'
steps:
  # Fetch the repository contents into the workspace.
  - name: Checkout repository
    uses: actions/checkout@v4

  # Repository-provided script that prepares the runner environment.
  - name: Custom Environment Setup
    run: |
      ./scripts/github/setup-environment.sh

  # Install the exact interpreter version requested by the matrix entry.
  - name: 'Set up Python (${{ matrix.python-version }})'
    uses: actions/setup-python@v5
    with:
      python-version: '${{ matrix.python-version }}'
# Cache pip downloads and the virtualenv directories. The key combines the runner OS,
# a manual cache version (v5), the full Python version and a hash of the dependency files.
- name: Cache Python Dependencies
  uses: actions/cache@v4
  with:
    path: |
      ~/.cache/pip
      virtualenv
      ~/virtualenv
    # TODO: maybe make the virtualenv a partial cache to exclude st2*?
    #   !virtualenv/lib/python*/site-packages/st2*
    #   !virtualenv/bin/st2*
    key: "${{ runner.os }}-v5-python-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'test-requirements.txt', 'lockfiles/*.lock') }}"
    # Don't use alternative key as if requirements.txt has altered we
    # don't want to retrieve previous cache
    # restore-keys: |
    #   ${{ runner.os }}-v5-python-${{ matrix.python }}-
# Cache the APT package directory, keyed on the hash of the package list file.
# restore-keys supplies a key prefix to fall back on when there is no exact match.
- name: Cache APT Dependencies
  id: cache-apt-deps
  uses: actions/cache@v4
  with:
    path: |
      ~/apt_cache
    key: "${{ runner.os }}-v8-apt-${{ hashFiles('scripts/github/apt-packages.txt') }}"
    restore-keys: |
      ${{ runner.os }}-v8-apt-

- name: Install APT Depedencies
  env:
    # Expose the cache step's hit/miss result to the install script's environment.
    CACHE_HIT: '${{steps.cache-apt-deps.outputs.cache-hit}}'
  run: |
    # install dev dependencies for Python YAML and LDAP packages
    # https://github.com/StackStorm/st2-auth-ldap
    ./scripts/github/install-apt-packages-use-cache.sh
# Each of the next two steps delegates to a repository-provided script.
- name: Install virtualenv
  run: |
    ./scripts/github/install-virtualenv.sh

- name: Install requirements
  run: |
    ./scripts/ci/install-requirements.sh

# Derive the CI config from the dev config and run the itest user key script.
- name: Setup Integration Tests
  run: |
    # prep a ci-specific dev conf file that uses runner instead of stanley
    # this user is the username of the user in GitHub actions, used for SSH, etc during
    # integration tests (important)
    cp conf/st2.dev.conf "${ST2_CONF}" ; sed -i -e "s/stanley/${ST2_CI_USER}/" "${ST2_CONF}"
    sudo -E ./scripts/ci/add-itest-user-key.sh
# Start redis by hand (published on localhost:6379 only) and poll until docker reports
# the container as running; the step timeout bounds the wait.
- name: Run Redis Service Container
  timeout-minutes: 2
  run: |
    docker run --rm --detach -p 127.0.0.1:6379:6379/tcp --name redis redis:latest
    until [ "$(docker inspect -f {{.State.Running}} redis)" == "true" ]; do sleep 0.1; done
# ST2_CI_REPO_PATH is not defined in this job's env block, so it is presumably exported by an
# earlier step (TODO confirm). It is echoed for debugging and passed explicitly through sudo.
- name: Permissions Workaround
  run: |
    echo "$ST2_CI_REPO_PATH"
    sudo ST2_CI_REPO_PATH="${ST2_CI_REPO_PATH}" scripts/ci/permissions-workaround.sh

# Log the tool versions in use, via a repository-provided script.
- name: Print versions
  run: |
    ./scripts/ci/print-versions.sh
# Run the make target selected by the matrix (TASK), retrying on failure.
# Each attempt is capped at 10 minutes by `timeout`; the step as a whole is capped at
# 41 minutes, which covers MAX_ATTEMPTS attempts plus the delays between them.
- name: make
  timeout-minutes: 41
  env:
    MAX_ATTEMPTS: '3'
    RETRY_DELAY: '5'
  # use: script -e -c to print colors
  run: |
    # There is a race in some orquesta integration tests so they tend to fail quite often.
    # To avoid needing to re-run the whole workflow in such a case, we retry this
    # specific step. This saves us a bunch of time manually re-running the whole workflow.
    # TODO: Try to identify problematic tests (iirc mostly orquesta ones) and only retry /
    # re-run those.

    # Failures are handled explicitly below, so don't let the shell abort on the first one.
    set +e
    for i in $(seq 1 "${MAX_ATTEMPTS}"); do
      echo "Attempt: ${i}/${MAX_ATTEMPTS}"
      # `script -e` propagates the exit code of the wrapped command.
      script -e -c "timeout 10m make ${TASK}" && exit 0
      exit_code=$?
      if [ "${i}" -lt "${MAX_ATTEMPTS}" ]; then
        echo "Command failed / timed out (exit_code=${exit_code}), will retry in ${RETRY_DELAY} seconds..."
        sleep "${RETRY_DELAY}"
      else
        # Last attempt: nothing left to retry, so don't announce a retry or sleep.
        echo "Command failed / timed out (exit_code=${exit_code})."
      fi
    done
    echo "Failed after ${MAX_ATTEMPTS} attempts, failing the job."
    exit 1
# The next two steps run only when an earlier step has failed: bundle everything under
# logs/ into a single archive and publish it as a per-Python-version artifact.
- name: Compress Service Logs Before upload
  if: '${{ failure() }}'
  run: |
    tar cvzpf logs.tar.gz logs/*

- name: Upload StackStorm services Logs
  if: '${{ failure() }}'
  uses: actions/upload-artifact@v4
  with:
    name: 'logs-py${{ matrix.python-version }}'
    path: 'logs.tar.gz'
    retention-days: 7

# Always remove the manually started redis container; `|| true` keeps this cleanup
# from failing the job if the removal command itself fails.
- name: Stop Redis Service Container
  if: '${{ always() }}'
  run: docker rm --force redis || true
slack-notification:
  name: 'Slack notification for failed master builds'
  # Run even when the integration-tests job failed; the failure is what we report.
  if: always()
  needs: integration-tests
  runs-on: 'ubuntu-20.04'
  steps:
    # this step creates an environment variable WORKFLOW_CONCLUSION and is the most reliable
    # way to check the status of previous jobs
    - name: Workflow conclusion
      uses: technote-space/workflow-conclusion-action@v2

    # Notify only when the workflow failed on the master branch.
    - name: CI Run Failure Slack Notification
      if: "${{ env.WORKFLOW_CONCLUSION == 'failure' && github.ref == 'refs/heads/master' }}"
      uses: voxmedia/github-action-slack-notify-build@v1
      env:
        SLACK_BOT_TOKEN: '${{ secrets.SLACK_BOT_TOKEN }}'
      with:
        channel: 'development'
        status: 'FAILED'
        color: 'danger'
# HELPER FOR FUTURE DEVELOPERS:
# If your GitHub Actions job is failing and you need to debug it, by default there is
# no way to SSH into the container.
# The step below can be uncommented and will stop here and allow you to SSH in.
# When this step is reached, simply refresh the GitHub Actions output for this build
# and this SSH command will be printed every 5 seconds to the output.
# Once you are done debugging in your SSH session, simply: touch /continue
# and this will continue the build.
#
# - name: Setup tmate session for debugging failed jobs (allows SSH into the container)
# uses: mxschmitt/action-tmate@v3
# if: "${{ failure() }}"
#