feat(regTests): upload only failed test logs on ci and clean up logging (#2547)

* upload only failed test logs
* remove printing log names for passed tests
* print slow tests with --durations
* separate regression and unit logs for CI workflow
2024-12-14 11:58:02 +00:00 · 2024-02-21 10:35:07 +02:00 · 2024-02-21 10:35:07 +02:00 · f32156788e
commit f32156788e
parent 8fd880c844
5 changed files with 52 additions and 17 deletions
--- a/.github/actions/regression-tests/action.yml
+++ b/.github/actions/regression-tests/action.yml
@ -37,7 +37,7 @@ runs:
        export DRAGONFLY_PATH="${GITHUB_WORKSPACE}/${{inputs.build-folder-name}}/${{inputs.dfly-executable}}"
        export UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1 # to crash on errors

-        timeout 20m pytest -m "${{inputs.filter}}" --color=yes --json-report --json-report-file=report.json dragonfly --ignore=dragonfly/replication_test.py --log-cli-level=INFO || code=$?
+        timeout 20m pytest -m "${{inputs.filter}}" --durations=10 --color=yes --json-report --json-report-file=report.json dragonfly --ignore=dragonfly/replication_test.py --log-cli-level=INFO || code=$?

        # timeout returns 124 if we exceeded the timeout duration
        if [[ $code -eq 124 ]]; then
@ -61,7 +61,7 @@ runs:
        export DRAGONFLY_PATH="${GITHUB_WORKSPACE}/${{inputs.build-folder-name}}/${{inputs.dfly-executable}}"

        run_pytest_with_args() {
-          timeout 20m pytest -m "${{inputs.filter}}" --color=yes --json-report \
+          timeout 20m pytest -m "${{inputs.filter}}" --durations=10 --color=yes --json-report \
            --json-report-file=rep1_report.json dragonfly/replication_test.py --log-cli-level=INFO \
            --df alsologtostderr $1 $2 || code=$?

@ -87,14 +87,17 @@ runs:
        TIMEDOUT_STEP_1: ${{ steps.first.outputs.TIMEDOUT }}
        TIMEDOUT_STEP_2: ${{ steps.second.outputs.TIMEDOUT }}
      run: |
-        echo "🪵🪵🪵🪵🪵🪵 Latest log before timeout 🪵🪵🪵🪵🪵🪵\n\n"
-        ls -t /tmp/dragonfly*log*INFO* | head -n 1 | xargs cat
-        echo "🪵🪵🪵🪵🪵🪵 Latest log before timeout end 🪵🪵🪵🪵🪵🪵\n\n"
-
        if [[ "${{ env.TIMEDOUT_STEP_1 }}" -eq 1 ]] || [[ "${{ env.TIMEDOUT_STEP_2 }}" -eq 1 ]]; then
          echo "🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑"
          echo "🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 TESTS TIMEDOUT 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑"
          echo "🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑"
+
+          # The folder only exists if an earlier test failure already created it, so create it here
+          # when it is missing. Keep this line; -p makes it a no-op when the folder is already there.
+          mkdir -p /tmp/failed
+          # Copy over the logs of the test that timed out. We need this because the exception/failure
+          # handlers do not run when the shell command `timeout` sends a SIGTERM to terminate the pytest process.
+          cat /tmp/last_test_log_files.txt | xargs -I {} cp {} /tmp/failed/
        fi

    - name: Send notification on failure
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -103,6 +103,7 @@ jobs:
            -L
          cd ${GITHUB_WORKSPACE}/build && pwd
          du -hcs _deps/
+
      - name: Build
        run: |
          cd ${GITHUB_WORKSPACE}/build
@ -110,11 +111,13 @@ jobs:
          df -h
          echo "-----------------------------"
          ninja src/all
+
      - name: PostFail
        if: failure()
        run: |
          echo "disk space is:"
          df -h
+
      - name: C++ Unit Tests
        run: |
          cd ${GITHUB_WORKSPACE}/build
@ -146,6 +149,14 @@ jobs:
          ./multi_test --multi_exec_mode=1
          ./multi_test --multi_exec_mode=3
          # GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1 CTEST_OUTPUT_ON_FAILURE=1 ninja server/test
+
+      - name: Upload unit logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v3
+        with:
+          name: unit_logs
+          path: /tmp/*INFO*
+
      - name: Run regression tests
        if: matrix.container == 'ubuntu-dev:20'
        uses: ./.github/actions/regression-tests
@ -154,12 +165,14 @@ jobs:
          run-only-on-ubuntu-latest: true
          build-folder-name: build
          filter: ${{ matrix.build-type == 'Release' && 'not slow' || '(not slow) and (not opt_only)' }}
-      - name: Upload logs on failure
+
+      - name: Upload regression logs on failure
        if: failure()
        uses: actions/upload-artifact@v3
        with:
-          name: unit_logs
-          path: /tmp/*INFO*
+          name: regression_logs
+          path: /tmp/failed/*
+

  lint-test-chart:
    runs-on: ubuntu-latest
--- a/.github/workflows/regression-tests.yml
+++ b/.github/workflows/regression-tests.yml
@ -52,7 +52,7 @@ jobs:
        uses: actions/upload-artifact@v3
        with:
          name: logs
-          path: /tmp/dragonfly.*
+          path: /tmp/failed/*

  lint-test-chart:
    runs-on: ubuntu-latest
--- a/tests/dragonfly/conftest.py
+++ b/tests/dragonfly/conftest.py
@ -14,6 +14,7 @@ import redis
 import pymemcache
 import random
 import subprocess
+import shutil
 from copy import deepcopy

 from pathlib import Path
@ -326,3 +327,24 @@ def with_ca_tls_client_args(with_tls_client_args, with_tls_ca_cert_args):
    args = deepcopy(with_tls_client_args)
    args["ssl_ca_certs"] = with_tls_ca_cert_args["ca_cert"]
    return args
+
+
+def copy_failed_logs_and_clean_tmp_folder(report):
+    """Copy the log files named in /tmp/last_test_log_files.txt into /tmp/failed."""
+    # NOTE: despite the function name, nothing is removed from /tmp here.
+    failed_path = "/tmp/failed"
+    os.makedirs(failed_path, exist_ok=True)  # no error if the folder is already there
+
+    # Context manager: the listing file is closed even if a copy below raises.
+    with open("/tmp/last_test_log_files.txt", "r") as last_log_file:
+        files = [line.rstrip("\n") for line in last_log_file]
+    logging.error(f"Test failed {report.nodeid} with logs: ")
+    for file in files:
+        logging.error(f"🪵🪵🪵🪵🪵🪵 {file} 🪵🪵🪵🪵🪵🪵")
+        shutil.copy(file, failed_path)
+
+
+def pytest_exception_interact(node, call, report):
+    """Pytest hook: copy the listed log files when the report is marked as failed."""
+    if report.failed:
+        copy_failed_logs_and_clean_tmp_folder(report)
--- a/tests/dragonfly/instance.py
+++ b/tests/dragonfly/instance.py
@ -45,9 +45,6 @@ class DflyStartException(Exception):
    pass


-uid_iterator = itertools.count()
-
-
 class DflyInstance:
    """
    Represents a runnable and stoppable Dragonfly instance
@ -143,11 +140,11 @@ class DflyInstance:
            raise DflyStartException("Process didn't start listening on port in time")

        self.log_files = self.get_logs_from_psutil()
-        id = next(uid_iterator)
-        logging.info(f"Starting instance with id {id} and port {self._port}")
-        logging.info(f"Log files are: ")
+
+        last_log_file = open("/tmp/last_test_log_files.txt", "w")
+
        for log in self.log_files:
-            logging.info(f"🪵🪵🪵🪵🪵🪵 {log} 🪵🪵🪵🪵🪵🪵")
+            last_log_file.write(log + "\n")

        # Remove first 6 lines - our default header with log locations (as it carries no useful information)
        # Next, replace log-level + date with port and colored arrow