Merge pull request #1913 from DaveLak/refactor-oss-fuzz-scripts-to-use-new-qa-assets-repo-layout

Update OSS-Fuzz Scripts to Use New QA-Assets Repo Structure
gitpython-developers · May 8, 2024 · a5815b6 · a5815b6
2 parents cd490f8 + 2cfd200
commit a5815b6
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 114 deletions.
diff --git a/fuzzing/README.md b/fuzzing/README.md
@@ -76,25 +76,6 @@ Contains Python files for each fuzz test.
   reason, fuzz tests should gracefully handle anticipated exception cases with a `try`/`except` block to avoid false
   positives that halt the fuzzing engine.
 
-### Dictionaries (`dictionaries/`)
-
-Provides hints to the fuzzing engine about inputs that might trigger unique code paths. Each fuzz target may have a
-corresponding `.dict` file. For information about dictionary syntax, refer to
-the [LibFuzzer documentation on the subject](https://llvm.org/docs/LibFuzzer.html#dictionaries).
-
-**Things to Know**:
-
-- OSS-Fuzz loads dictionary files per fuzz target if one exists with the same name, all others are ignored.
-- Most entries in the dictionary files found here are escaped hex or Unicode values that were recommended by the fuzzing
-  engine after previous runs.
-- A default set of dictionary entries are created for all fuzz targets as part of the build process, regardless of an
-  existing file here.
-- Development or updates to dictionaries should reflect the varied formats and edge cases relevant to the
-  functionalities under test.
-- Example dictionaries (some of which are used to build the default dictionaries mentioned above) can be found here:
-  - [AFL++ dictionary repository](https://github.com/AFLplusplus/AFLplusplus/tree/stable/dictionaries#readme)
-  - [Google/fuzzing dictionary repository](https://github.com/google/fuzzing/tree/master/dictionaries)
-
 ### OSS-Fuzz Scripts (`oss-fuzz-scripts/`)
 
 Includes scripts for building and integrating fuzz targets with OSS-Fuzz:

diff --git a/fuzzing/dictionaries/fuzz_blob.dict b/fuzzing/dictionaries/fuzz_blob.dict
diff --git a/fuzzing/dictionaries/fuzz_config.dict b/fuzzing/dictionaries/fuzz_config.dict
diff --git a/fuzzing/oss-fuzz-scripts/build.sh b/fuzzing/oss-fuzz-scripts/build.sh
@@ -7,34 +7,13 @@ set -euo pipefail
 
 python3 -m pip install .
 
-# Directory to look in for dictionaries, options files, and seed corpora:
-SEED_DATA_DIR="$SRC/seed_data"
-
-find "$SEED_DATA_DIR" \( -name '*_seed_corpus.zip' -o -name '*.options' -o -name '*.dict' \) \
-  ! \( -name '__base.*' \) -exec printf 'Copying: %s\n' {} \; \
+find "$SRC" -maxdepth 1 \
+  \( -name '*_seed_corpus.zip' -o -name '*.options' -o -name '*.dict' \) \
+  -exec printf '[%s] Copying: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" {} \; \
   -exec chmod a-x {} \; \
   -exec cp {} "$OUT" \;
 
 # Build fuzzers in $OUT.
 find "$SRC/gitpython/fuzzing" -name 'fuzz_*.py' -print0 | while IFS= read -r -d '' fuzz_harness; do
   compile_python_fuzzer "$fuzz_harness" --add-binary="$(command -v git):."
-
-  common_base_dictionary_filename="$SEED_DATA_DIR/__base.dict"
-  if [[ -r "$common_base_dictionary_filename" ]]; then
-    # Strip the `.py` extension from the filename and replace it with `.dict`.
-    fuzz_harness_dictionary_filename="$(basename "$fuzz_harness" .py).dict"
-    output_file="$OUT/$fuzz_harness_dictionary_filename"
-
-    printf 'Appending %s to %s\n' "$common_base_dictionary_filename" "$output_file"
-    if [[ -s "$output_file" ]]; then
-      # If a dictionary file for this fuzzer already exists and is not empty,
-      # we append a new line to the end of it before appending any new entries.
-      #
-      # LibFuzzer will happily ignore multiple empty lines in a dictionary but fail with an error
-      # if any single line has incorrect syntax (e.g., if we accidentally add two entries to the same line.)
-      # See docs for valid syntax: https://llvm.org/docs/LibFuzzer.html#id32
-      echo >>"$output_file"
-    fi
-    cat "$common_base_dictionary_filename" >>"$output_file"
-  fi
 done
diff --git a/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh b/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh
@@ -9,23 +9,20 @@ set -euo pipefail
 # Prerequisites #
 #################
 
-for cmd in python3 git wget rsync; do
+for cmd in python3 git wget zip; do
   command -v "$cmd" >/dev/null 2>&1 || {
     printf '[%s] Required command %s not found, exiting.\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$cmd" >&2
     exit 1
   }
 done
 
-SEED_DATA_DIR="$SRC/seed_data"
-mkdir -p "$SEED_DATA_DIR"
-
 #############
 # Functions #
 #############
 
 download_and_concatenate_common_dictionaries() {
   # Assign the first argument as the target file where all contents will be concatenated
-  target_file="$1"
+  local target_file="$1"
 
   # Shift the arguments so the first argument (target_file path) is removed
   # and only URLs are left for the loop below.
@@ -38,22 +35,61 @@ download_and_concatenate_common_dictionaries() {
   done
 }
 
-fetch_seed_corpora() {
-  # Seed corpus zip files are hosted in a separate repository to avoid additional bloat in this repo.
-  git clone --depth 1 https://github.com/gitpython-developers/qa-assets.git qa-assets &&
-    rsync -avc qa-assets/gitpython/corpra/ "$SEED_DATA_DIR/" &&
-    rm -rf qa-assets # Clean up the cloned repo to keep the Docker image as slim as possible.
+create_seed_corpora_zips() {
+  local seed_corpora_dir="$1"
+  local output_zip
+  for dir in "$seed_corpora_dir"/*; do
+    if [ -d "$dir" ] && [ -n "$dir" ]; then
+      output_zip="$SRC/$(basename "$dir")_seed_corpus.zip"
+      printf '[%s] Zipping the contents of %s into %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$dir" "$output_zip"
+      zip -jur "$output_zip" "$dir"/*
+    fi
+  done
+}
+
+prepare_dictionaries_for_fuzz_targets() {
+  local dictionaries_dir="$1"
+  local fuzz_targets_dir="$2"
+  local common_base_dictionary_filename="$WORK/__base.dict"
+
+  printf '[%s] Copying .dict files from %s to %s\n' "$(date '+%Y-%m-%d %H:%M:%S')"  "$dictionaries_dir" "$SRC/"
+  cp -v "$dictionaries_dir"/*.dict "$SRC/"
+
+  download_and_concatenate_common_dictionaries "$common_base_dictionary_filename" \
+    "https://raw.githubusercontent.com/google/fuzzing/master/dictionaries/utf8.dict" \
+    "https://raw.githubusercontent.com/google/fuzzing/master/dictionaries/url.dict"
+
+  find "$fuzz_targets_dir" -name 'fuzz_*.py' -print0 | while IFS= read -r -d '' fuzz_harness; do
+    if [[ -r "$common_base_dictionary_filename" ]]; then
+      # Strip the `.py` extension from the filename and replace it with `.dict`.
+      fuzz_harness_dictionary_filename="$(basename "$fuzz_harness" .py).dict"
+      local output_file="$SRC/$fuzz_harness_dictionary_filename"
+
+      printf '[%s] Appending %s to %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$common_base_dictionary_filename" "$output_file"
+      if [[ -s "$output_file" ]]; then
+        # If a dictionary file for this fuzzer already exists and is not empty,
+        # we append a new line to the end of it before appending any new entries.
+        #
+        # LibFuzzer will happily ignore multiple empty lines in a dictionary but fail with an error
+        # if any single line has incorrect syntax (e.g., if we accidentally add two entries to the same line.)
+        # See docs for valid syntax: https://llvm.org/docs/LibFuzzer.html#id32
+        echo >>"$output_file"
+      fi
+      cat "$common_base_dictionary_filename" >>"$output_file"
+    fi
+  done
 }
 
 ########################
 # Main execution logic #
 ########################
+# Seed corpora and dictionaries are hosted in a separate repository to avoid additional bloat in this repo.
+# We clone into the $WORK directory because OSS-Fuzz cleans it up after building the image, keeping the image small.
+git clone --depth 1 https://github.com/gitpython-developers/qa-assets.git "$WORK/qa-assets"
 
-fetch_seed_corpora
+create_seed_corpora_zips "$WORK/qa-assets/gitpython/corpora"
 
-download_and_concatenate_common_dictionaries "$SEED_DATA_DIR/__base.dict" \
-  "https://raw.githubusercontent.com/google/fuzzing/master/dictionaries/utf8.dict" \
-  "https://raw.githubusercontent.com/google/fuzzing/master/dictionaries/url.dict"
+prepare_dictionaries_for_fuzz_targets "$WORK/qa-assets/gitpython/dictionaries" "$SRC/gitpython/fuzzing"
 
 # The OSS-Fuzz base image has outdated dependencies by default so we upgrade them below.
 python3 -m pip install --upgrade pip