From 567507d4b99622f7e42f449acabf40af485c3b27 Mon Sep 17 00:00:00 2001 From: Jackson Maxfield Brown Date: Sun, 12 Jun 2022 08:25:12 -0700 Subject: [PATCH 1/6] Fan out index upload to single file chunks (#1) --- .../workflows/event-index-pipeline.yml | 105 ++++++++++++++++-- 1 file changed, 96 insertions(+), 9 deletions(-) diff --git a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml index c114314b..88bffc3e 100644 --- a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml +++ b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml @@ -1,9 +1,8 @@ name: Event Index on: - push: - branches: - - main + workflow_dispatch: + schedule: # # https://pubs.opengroup.org/onlinepubs/9699919799/utilities/crontab.html#tag_20_25_07 @@ -12,39 +11,127 @@ on: # We offset from the hour and half hour to go easy on the servers :) - cron: '26 3 * * 4' +# We doubly fan out +# We first generate indexs for uni, bi, and trigrams with a matrix +# Each index is split into chunks of 50,000 grams +# Then we fan out by every chunk and upload + jobs: - index-events: + generate-index-chunks: runs-on: ubuntu-latest strategy: matrix: - # We fan out on n-gram to make it possible to run on GitHub Actions n-gram: [1, 2, 3] steps: + # Setup Runner - uses: actions/checkout@v2 - uses: actions/setup-python@v1 with: python-version: 3.9 + # Setup GCloud / Creds - name: Setup gcloud uses: google-github-actions/setup-gcloud@v0 with: project_id: {{ cookiecutter.infrastructure_slug }} service_account_key: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} export_default_credentials: true + - name: Dump Credentials to JSON + run: | + echo "$GOOGLE_CREDS" > python/google-creds.json + env: + GOOGLE_CREDS: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} + # Installs - name: Install Python Dependencies run: | cd 
python/ pip install . + + # Index + - name: Index Events {% raw %}${{ matrix.n-gram }}{% endraw %}-grams + run: | + cd python/ + {% raw %}run_cdp_event_index_generation event-index-config.json \ + --n_grams ${{ matrix.n-gram }} \ + --parallel{% endraw %} + + # Storage and Outputs + - name: Store Index Chunks to Artifacts + uses: actions/upload-artifact@v3 + with: + path: python/index/* + if-no-files-found: error + retention-days: 6 + - name: Store Index Fileset to Outputs + run: | + {% raw %}cd python/index/ + output=$(python -c 'import os, json; print(json.dumps(os.listdir(".")))') + echo "::set-output name=ngram-${{ matrix.n-gram }}-chunkset::$output"{% endraw %} + + combine-matrix-ngram-chunksets: + needs: generate-index-chunks + runs-on: ubuntu-latest + steps: + # Setup Runner + - uses: actions/checkout@v2 + - uses: actions/setup-python@v1 + with: + python-version: 3.9 + + # Process + - name: Combine Chunksets + id: 'combine-index-chunksets' + run: | + {% raw %} echo 'print('${{ needs.generate-index-chunks.outputs.ngram-1-chunkset }}' + '${{ needs.generate-index-chunks.outputs.ngram-2-chunkset }}' + '${{ needs.generate-index-chunks.outputs.ngram-3-chunkset }}')' >> print-combined-chunkset.py + output=$(python print-combined-chunkset.py) + echo "::set-output name=combined-chunkset::$output"{% endraw %} + + upload-index-chunks: + needs: combine-matrix-ngram-chunksets + runs-on: ubuntu-latest + strategy: + max-parallel: 6 + matrix: + filename: {% raw %}${{ fromJson(needs.combine-matrix-ngram-chunksets.outputs.combined-chunkset) }}{% endraw %} + + steps: + # Setup Runner + - uses: actions/checkout@v2 + - uses: actions/setup-python@v1 + with: + python-version: 3.9 + + # Setup GCloud / Creds + - name: Setup gcloud + uses: google-github-actions/setup-gcloud@v0 + with: + project_id: {{ cookiecutter.infrastructure_slug }} + service_account_key: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} + export_default_credentials: true - name: Dump Credentials to JSON run: 
| echo "$GOOGLE_CREDS" > python/google-creds.json env: GOOGLE_CREDS: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} - - name: Index Events {% raw %}${{ matrix.n-gram }}{% endraw %}-grams + + # Installs + - name: Install Python Dependencies run: | cd python/ - {% raw %}run_cdp_event_index event-index-config.json \ - --n_grams ${{ matrix.n-gram }} \ - --parallel{% endraw %} + pip install . + + # Download Chunk File + - uses: actions/download-artifact@v3 + with: + name: {% raw %}${{ matrix.filename }}{% endraw %} + path: python/index/ + + # Upload Index Chunk + - name: Process Upload + run: | + cd python/ + {% raw %}upload_cdp_event_index_chunk event-index-config.json \ + index/${{ matrix.filename }} \ + --parallel{% endraw %} \ No newline at end of file From 2811b98d16d3af17b7f42c5224a0dca924474b1d Mon Sep 17 00:00:00 2001 From: JacksonMaxfield Date: Sun, 12 Jun 2022 08:31:43 -0700 Subject: [PATCH 2/6] Install branch of cdp-backend --- .../.github/workflows/event-index-pipeline.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml index 88bffc3e..8ecffbb0 100644 --- a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml +++ b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml @@ -49,6 +49,13 @@ jobs: cd python/ pip install . 
+ - uses: actions/checkout@v2 + repository: 'councildataproject/cdp-backend' + ref: 'feature/reduce-ngram-index' + path: 'cdp-backend-branch' + - run: | + pip install ./cdp-backend-branch[pipeline] + # Index - name: Index Events {% raw %}${{ matrix.n-gram }}{% endraw %}-grams run: | From d652e1e9fe7a26e3b576e985784cfc310944b903 Mon Sep 17 00:00:00 2001 From: JacksonMaxfield Date: Sun, 12 Jun 2022 08:45:17 -0700 Subject: [PATCH 3/6] Fast fail false --- .../.github/workflows/event-index-pipeline.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml index 8ecffbb0..23e06877 100644 --- a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml +++ b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml @@ -22,6 +22,7 @@ jobs: strategy: matrix: n-gram: [1, 2, 3] + fail-fast: false steps: # Setup Runner @@ -49,6 +50,14 @@ jobs: cd python/ pip install . + - uses: actions/checkout@v2 + with: + repository: 'councildataproject/cdp-backend' + ref: 'feature/reduce-ngram-index' + path: 'cdp-backend-branch' + - run: | + pip install ./cdp-backend-branch[pipeline] + - uses: actions/checkout@v2 repository: 'councildataproject/cdp-backend' ref: 'feature/reduce-ngram-index' @@ -102,6 +111,7 @@ jobs: max-parallel: 6 matrix: filename: {% raw %}${{ fromJson(needs.combine-matrix-ngram-chunksets.outputs.combined-chunkset) }}{% endraw %} + fail-fast: false steps: # Setup Runner @@ -129,6 +139,14 @@ jobs: cd python/ pip install . 
+ - uses: actions/checkout@v2 + with: + repository: 'councildataproject/cdp-backend' + ref: 'feature/reduce-ngram-index' + path: 'cdp-backend-branch' + - run: | + pip install ./cdp-backend-branch[pipeline] + # Download Chunk File - uses: actions/download-artifact@v3 with: From 2aeaa42f448b8efb62ecf41492d40850a129e357 Mon Sep 17 00:00:00 2001 From: JacksonMaxfield Date: Sun, 12 Jun 2022 08:47:37 -0700 Subject: [PATCH 4/6] Fix fast fail indent --- .../.github/workflows/event-index-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml index 23e06877..9db16fd3 100644 --- a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml +++ b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml @@ -111,7 +111,7 @@ jobs: max-parallel: 6 matrix: filename: {% raw %}${{ fromJson(needs.combine-matrix-ngram-chunksets.outputs.combined-chunkset) }}{% endraw %} - fail-fast: false + fail-fast: false steps: # Setup Runner From e0f02a969cac3398ddeb217dbb1d82fe2ad09186 Mon Sep 17 00:00:00 2001 From: JacksonMaxfield Date: Mon, 13 Jun 2022 15:44:09 -0700 Subject: [PATCH 5/6] Working event index --- .../workflows/event-index-pipeline.yml | 94 +++++++------------ 1 file changed, 35 insertions(+), 59 deletions(-) diff --git a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml index 9db16fd3..8f541c20 100644 --- a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml +++ b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml @@ -15,7 +15,7 @@ on: # We first generate indexs for uni, bi, and trigrams with a matrix # Each index is split into chunks 
of 50,000 grams # Then we fan out by every chunk and upload - +{% raw %} jobs: generate-index-chunks: runs-on: ubuntu-latest @@ -24,6 +24,11 @@ jobs: n-gram: [1, 2, 3] fail-fast: false + outputs: + ngram-1-chunks: ${{ steps.output-index-chunks.outputs.ngram-1-chunks }} + ngram-2-chunks: ${{ steps.output-index-chunks.outputs.ngram-2-chunks }} + ngram-3-chunks: ${{ steps.output-index-chunks.outputs.ngram-3-chunks }} + steps: # Setup Runner - uses: actions/checkout@v2 @@ -35,14 +40,14 @@ jobs: - name: Setup gcloud uses: google-github-actions/setup-gcloud@v0 with: - project_id: {{ cookiecutter.infrastructure_slug }} - service_account_key: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} + project_id: {% endraw %}{{ cookiecutter.infrastructure_slug }}{% raw %} + service_account_key: ${{ secrets.GOOGLE_CREDENTIALS }} export_default_credentials: true - name: Dump Credentials to JSON run: | echo "$GOOGLE_CREDS" > python/google-creds.json env: - GOOGLE_CREDS: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} + GOOGLE_CREDS: ${{ secrets.GOOGLE_CREDENTIALS }} # Installs - name: Install Python Dependencies @@ -50,45 +55,29 @@ jobs: cd python/ pip install . 
- - uses: actions/checkout@v2 - with: - repository: 'councildataproject/cdp-backend' - ref: 'feature/reduce-ngram-index' - path: 'cdp-backend-branch' - - run: | - pip install ./cdp-backend-branch[pipeline] - - - uses: actions/checkout@v2 - repository: 'councildataproject/cdp-backend' - ref: 'feature/reduce-ngram-index' - path: 'cdp-backend-branch' - - run: | - pip install ./cdp-backend-branch[pipeline] - # Index - - name: Index Events {% raw %}${{ matrix.n-gram }}{% endraw %}-grams + - name: Index Events ${{ matrix.n-gram }}-grams run: | cd python/ - {% raw %}run_cdp_event_index_generation event-index-config.json \ + run_cdp_event_index_generation event-index-config.json \ --n_grams ${{ matrix.n-gram }} \ - --parallel{% endraw %} + --store_remote \ + --parallel - # Storage and Outputs - - name: Store Index Chunks to Artifacts - uses: actions/upload-artifact@v3 - with: - path: python/index/* - if-no-files-found: error - retention-days: 6 + # Store generated files to step output - name: Store Index Fileset to Outputs + id: output-index-chunks run: | - {% raw %}cd python/index/ + cd python/index/ output=$(python -c 'import os, json; print(json.dumps(os.listdir(".")))') - echo "::set-output name=ngram-${{ matrix.n-gram }}-chunkset::$output"{% endraw %} + echo "::set-output name=ngram-${{ matrix.n-gram }}-chunks::$output" - combine-matrix-ngram-chunksets: + combine-matrix-ngram-chunks: needs: generate-index-chunks runs-on: ubuntu-latest + outputs: + all-chunks: ${{ steps.combine-index-chunks.outputs.combined-chunks }} + steps: # Setup Runner - uses: actions/checkout@v2 @@ -97,20 +86,20 @@ jobs: python-version: 3.9 # Process - - name: Combine Chunksets - id: 'combine-index-chunksets' + - name: Combine Chunks + id: 'combine-index-chunks' run: | - {% raw %} echo 'print('${{ needs.generate-index-chunks.outputs.ngram-1-chunkset }}' + '${{ needs.generate-index-chunks.outputs.ngram-2-chunkset }}' + '${{ needs.generate-index-chunks.outputs.ngram-3-chunkset }}')' >> 
print-combined-chunkset.py - output=$(python print-combined-chunkset.py) - echo "::set-output name=combined-chunkset::$output"{% endraw %} + echo 'print(${{ needs.generate-index-chunks.outputs.ngram-1-chunks }} + ${{ needs.generate-index-chunks.outputs.ngram-2-chunks }} + ${{ needs.generate-index-chunks.outputs.ngram-3-chunks }})' >> print-combined-chunks.py + output=$(python print-combined-chunks.py) + echo "::set-output name=combined-chunks::$output" upload-index-chunks: - needs: combine-matrix-ngram-chunksets + needs: combine-matrix-ngram-chunks runs-on: ubuntu-latest strategy: max-parallel: 6 matrix: - filename: {% raw %}${{ fromJson(needs.combine-matrix-ngram-chunksets.outputs.combined-chunkset) }}{% endraw %} + filename: ${{ fromJson(needs.combine-matrix-ngram-chunks.outputs.all-chunks) }} fail-fast: false steps: @@ -124,39 +113,26 @@ jobs: - name: Setup gcloud uses: google-github-actions/setup-gcloud@v0 with: - project_id: {{ cookiecutter.infrastructure_slug }} - service_account_key: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} + project_id: {% endraw %}{{ cookiecutter.infrastructure_slug }}{% raw %} + service_account_key: ${{ secrets.GOOGLE_CREDENTIALS }} export_default_credentials: true - name: Dump Credentials to JSON run: | echo "$GOOGLE_CREDS" > python/google-creds.json env: - GOOGLE_CREDS: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} + GOOGLE_CREDS: ${{ secrets.GOOGLE_CREDENTIALS }} # Installs - name: Install Python Dependencies run: | cd python/ pip install . 
- - - uses: actions/checkout@v2 - with: - repository: 'councildataproject/cdp-backend' - ref: 'feature/reduce-ngram-index' - path: 'cdp-backend-branch' - - run: | - pip install ./cdp-backend-branch[pipeline] - - # Download Chunk File - - uses: actions/download-artifact@v3 - with: - name: {% raw %}${{ matrix.filename }}{% endraw %} - path: python/index/ # Upload Index Chunk - name: Process Upload run: | cd python/ - {% raw %}upload_cdp_event_index_chunk event-index-config.json \ - index/${{ matrix.filename }} \ - --parallel{% endraw %} \ No newline at end of file + process_cdp_event_index_chunk event-index-config.json \ + ${{ matrix.filename }} \ + --parallel +{% endraw %} \ No newline at end of file From f0e633411ed646b1ef976aa7f6062faaeb532696 Mon Sep 17 00:00:00 2001 From: JacksonMaxfield Date: Mon, 13 Jun 2022 15:46:01 -0700 Subject: [PATCH 6/6] Upgrade frontend --- {{ cookiecutter.hosting_github_repo_name }}/web/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/{{ cookiecutter.hosting_github_repo_name }}/web/package.json b/{{ cookiecutter.hosting_github_repo_name }}/web/package.json index 93b2a914..9abd6cde 100644 --- a/{{ cookiecutter.hosting_github_repo_name }}/web/package.json +++ b/{{ cookiecutter.hosting_github_repo_name }}/web/package.json @@ -10,7 +10,7 @@ "deploy": "gh-pages -d build" }, "dependencies": { - "@councildataproject/cdp-frontend": "3.1.0", + "@councildataproject/cdp-frontend": "3.1.2", "react": "^16.13.1", "react-dom": "^16.13.1" },