From c4769d7777336f7ce9fe38a12656b8401c5f8765 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Thu, 30 Nov 2023 15:10:26 -0600
Subject: [PATCH] CI log improvements (#621)

* Add groups to script steps.

* [skip-tests] missing quote

* [skip-tests] Use function to only print group in GHA.

* Fix color.

* [skip-tests] Add group for env details.

* [skip-tests] Add group to run_as_coder repro instructions.

* Don't error on unbound.

* Don't print script args

* Color coder print message.

* Avoid unbound errors with GITHUB_ACTIONS.

* Don't run nvidia-smi manually in the test job.

* sccache stats group.

* Avoid sccache stats if sccache is not available.

* [skip-tests] Inject intentional error.

* Revert "[skip-tests] Inject intentional error."

This reverts commit 7270a0cd0c5efd3e14f09bdb6355731e3bdf4a15.

* Use preset name in group name.

* Parameterize color.

* Print sccache stats in group.

* Add problem matcher.

* Add problem matcher before moving repo files.

* Remove the cmake regexs for now.

* Try different problem matcher.

* Just remove problem matchers for now.

* Fix if

* Remove redundant sccache stats.

* Try adding problem matcher again.

* Fix problem-matcher file name.

* [skip-tests] Run smaller matrix for debug.

* Fix path.

* Use json array for matcher.

* Fix json array.

* [skip-tests] Disable verify devcontainers for now.

* disable verify-devcontainers

* Exclude home/coder from the path in the matcher.

* Try a different regex.

* Exclude leading slash.

* Run as coder user.

* Revert "Run as coder user."

This reverts commit dace5f6963f58c4f545d0bf962aac2b987b6d75d.

* Add ninja summary stats.

* Fix permissions of ninja summary script.

* Make color conditional upon status.

* Make sure to get correct build status.

* Exit if build failed.

* Fix if statement.

* Print when build fails.

* Disable exiting on non-zero return.

* Don't use local, it resets exit code.

* Fix variable name.

* Emit error.

* Make sccache stats part of group title.

* Make repro instructions a conditional step.

* Get rid of old code.

* Go back to putting the repro instructions in the command step.

* Don't output error::.

* Update problem matcher.

* Don't capture cmake output.

* Fix group name.

* Actually disable exiting on non-zero return.

* Add echo -e.

* Fix spacing.

* Redundant "build".

* Add space to fix emoji.

* Move end message logic into end group.

* Fix group name.

* Don't print in GHA on success.

* Fix emojis.

* Refactor group command logic into function.

* Docs.

* Return status from run_command.

* Revert test changes.

* Update repro instructions.

* Remove excess.

* Use print_env_details directly to avoid duplicates.

* Update problem-matcher.json

* Add timing to build/test scripts.
---
 .github/problem-matchers/problem-matcher.json |  14 +
 .github/workflows/build-and-test-linux.yml    |   1 -
 .github/workflows/run-as-coder.yml            |  22 +-
 ci/build_common.sh                            | 198 +++++++--
 ci/build_cub.sh                               |   4 +
 ci/build_libcudacxx.sh                        |   4 +
 ci/build_thrust.sh                            |   4 +
 ci/ninja_summary.py                           | 381 ++++++++++++++++++
 ci/nvrtc_libcudacxx.sh                        |   2 +
 ci/sccache_stats.sh                           |   8 +-
 ci/test_cub.sh                                |   4 +
 ci/test_libcudacxx.sh                         |   4 +
 ci/test_thrust.sh                             |   4 +
 13 files changed, 606 insertions(+), 44 deletions(-)
 create mode 100644 .github/problem-matchers/problem-matcher.json
 create mode 100755 ci/ninja_summary.py

diff --git a/.github/problem-matchers/problem-matcher.json b/.github/problem-matchers/problem-matcher.json
new file mode 100644
index 0000000000..f196a5c884
--- /dev/null
+++ b/.github/problem-matchers/problem-matcher.json
@@ -0,0 +1,14 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "nvcc",
+      "pattern": [
+        {
+          "regexp": "^\\/home\\/coder\\/(.+):(\\d+):(\\d+): (\\w+): \"(.+)\"$",
+          "severity": 4,
+          "message": 5
+        }
+      ]
+    }
+  ]
+}
diff --git a/.github/workflows/build-and-test-linux.yml b/.github/workflows/build-and-test-linux.yml
index 32cfc25951..6c5ba40061 100644
--- a/.github/workflows/build-and-test-linux.yml
+++ b/.github/workflows/build-and-test-linux.yml
@@ -44,5 +44,4 @@ jobs:
       runner: linux-${{inputs.cpu}}-gpu-v100-latest-1
       image: ${{inputs.container_image}}
       command: |
-        nvidia-smi
         ${{ inputs.test_script }}
diff --git a/.github/workflows/run-as-coder.yml b/.github/workflows/run-as-coder.yml
index 6d09fd220f..40bbf97b29 100644
--- a/.github/workflows/run-as-coder.yml
+++ b/.github/workflows/run-as-coder.yml
@@ -39,18 +39,30 @@ jobs:
         run: |
           cp -R cccl /home/coder/cccl
           chown -R coder:coder /home/coder/
+      - name: Add NVCC problem matcher
+        run: |
+          echo "::add-matcher::cccl/.github/problem-matchers/problem-matcher.json"
       - name: Configure credentials and environment variables for sccache
         uses: ./cccl/.github/actions/configure_cccl_sccache
       - name: Run command
         shell: su coder {0}
         run: |
-            set -exo pipefail
+            set -eo pipefail
             cd ~/cccl
+            echo -e "\e[1;34mRunning as 'coder' user in $(pwd):\e[0m"
+            echo -e "\e[1;34m${{inputs.command}}\e[0m"
             eval "${{inputs.command}}" || exit_code=$?
             if [ ! -z "$exit_code" ]; then
-                echo "::error::Error! To checkout the corresponding code and reproduce locally, run the following commands:"
-                echo "git clone --branch $GITHUB_REF_NAME --single-branch --recurse-submodules https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA"
-                echo "docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}"
-                exit $exit_code
+              echo -e "::group::️❗ \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m"
+              echo "::error:: To replicate this failure locally, follow the steps below:"
+              echo "1. Clone the repository, and navigate to the correct branch and commit:"
+              echo "   git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA"
+              echo ""
+              echo "2. Run the failed command inside the same Docker container used by the CI:"
+              echo "   docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}"
+              echo ""
+              echo "For additional information, see:"
+              echo "   - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md"
+              echo "   - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md"
             fi
 
diff --git a/ci/build_common.sh b/ci/build_common.sh
index b398d5e582..7959acfb59 100755
--- a/ci/build_common.sh
+++ b/ci/build_common.sh
@@ -37,7 +37,6 @@ function usage {
 # Copy the args into a temporary array, since we will modify them and
 # the parent script may still need them.
 args=("$@")
-echo "Args: ${args[@]}"
 while [ "${#args[@]}" -ne 0 ]; do
     case "${args[0]}" in
     -v | --verbose) VERBOSE=1; args=("${args[@]:1}");;
@@ -90,7 +89,6 @@ export CTEST_PARALLEL_LEVEL="1"
 export CXX="${HOST_COMPILER}"
 export CUDACXX="${CUDA_COMPILER}"
 export CUDAHOSTCXX="${HOST_COMPILER}"
-
 export CXX_STANDARD
 
 # Print "ARG=${ARG}" for all args.
@@ -107,67 +105,193 @@ function print_var_values() {
     done
 }
 
-echo "========================================"
-echo "pwd=$(pwd)"
-print_var_values \
-    BUILD_DIR \
-    CXX_STANDARD \
-    CXX \
-    CUDACXX \
-    CUDAHOSTCXX \
-    NVCC_VERSION \
-    CMAKE_BUILD_PARALLEL_LEVEL \
-    CTEST_PARALLEL_LEVEL \
-    CCCL_BUILD_INFIX \
-    GLOBAL_CMAKE_OPTIONS
-echo "========================================"
-echo
-echo "========================================"
-echo "Current commit is:"
-git log -1 || echo "Not a repository"
-echo "========================================"
-echo
+# begin_group: Start a named section of log output, possibly with color.
+# Usage: begin_group "Group Name" [Color]
+#   Group Name: A string specifying the name of the group.
+#   Color (optional): ANSI color code to set text color. Default is blue (1;34).
+function begin_group() {
+    # See options for colors here: https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124
+    local blue="34"
+    local name="${1:-}"
+    local color="${2:-$blue}"
+
+    if [ -n "${GITHUB_ACTIONS:-}" ]; then
+        echo -e "::group::\e[${color}m${name}\e[0m"
+    else
+        echo -e "\e[${color}m================== ${name} ======================\e[0m"
+    fi
+}
+
+# end_group: End a named section of log output and print status based on exit status.
+# Usage: end_group "Group Name" [Exit Status]
+#   Group Name: A string specifying the name of the group.
+#   Exit Status (optional): The exit status of the command run within the group. Default is 0.
+function end_group() {
+    local name="${1:-}"
+    local build_status="${2:-0}"
+    local duration="${3:-}"
+    local red="31"
+    local blue="34"
+
+    if [ -n "${GITHUB_ACTIONS:-}" ]; then
+        echo "::endgroup::"
+
+        if [ "$build_status" -ne 0 ]; then
+            echo -e "::error::\e[${red}m ${name} - Failed (⬆️ click above for full log ⬆️)\e[0m"
+        fi
+    else
+        if [ "$build_status" -ne 0 ]; then
+            echo -e "\e[${red}m================== End ${name} - Failed${duration:+ - Duration: ${duration}s} ==================\e[0m"
+        else
+            echo -e "\e[${blue}m================== End ${name} - Success${duration:+ - Duration: ${duration}s} ==================\n\e[0m"
+        fi
+    fi
+}
+
+declare -A command_durations
+
+# Runs a command within a named group, handles the exit status, and prints appropriate messages based on the result.
+# Usage: run_command "Group Name" command [arguments...]
+function run_command() {
+    local group_name="${1:-}"
+    shift
+    local command=("$@")
+    local status
+
+    begin_group "$group_name"
+    set +e
+    local start_time=$(date +%s)
+    "${command[@]}"
+    status=$?
+    local end_time=$(date +%s)
+    set -e
+    local duration=$((end_time - start_time))
+    end_group "$group_name" $status $duration
+    command_durations["$group_name"]=$duration
+    return $status
+}
+
+function string_width() {
+    local str="$1"
+    echo "$str" | awk '{print length}'
+}
+
+function print_time_summary() {
+    local max_length=0
+    local group
+
+    # Find the longest group name for formatting
+    for group in "${!command_durations[@]}"; do
+        local group_length=$(echo "$group" | awk '{print length}')
+        if [ "$group_length" -gt "$max_length" ]; then
+            max_length=$group_length
+        fi
+    done
+
+    echo "Time Summary:"
+    for group in "${!command_durations[@]}"; do
+        printf "%-${max_length}s : %s seconds\n" "$group" "${command_durations[$group]}"
+    done
+
+    # Clear the array of timing info
+    declare -gA command_durations=()
+}
+
+
+print_environment_details() {
+  begin_group "⚙️ Environment Details"
+
+  echo "pwd=$(pwd)"
+
+  print_var_values \
+      BUILD_DIR \
+      CXX_STANDARD \
+      CXX \
+      CUDACXX \
+      CUDAHOSTCXX \
+      NVCC_VERSION \
+      CMAKE_BUILD_PARALLEL_LEVEL \
+      CTEST_PARALLEL_LEVEL \
+      CCCL_BUILD_INFIX \
+      GLOBAL_CMAKE_OPTIONS
+
+  echo "Current commit is:"
+  git log -1 || echo "Not a repository"
+
+  if command -v nvidia-smi &> /dev/null; then
+    nvidia-smi
+  else
+    echo "nvidia-smi not found"
+  fi
+
+  end_group "⚙️ Environment Details"
+}
+
 
 function configure_preset()
 {
     local BUILD_NAME=$1
     local PRESET=$2
     local CMAKE_OPTIONS=$3
+    local GROUP_NAME="🛠️  CMake Configure ${BUILD_NAME}"
 
     pushd .. > /dev/null
-
-    cmake --preset=$PRESET --log-level=VERBOSE $GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS
-    echo "$BUILD_NAME configure complete."
-
+    run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE $GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS
+    status=$?
     popd > /dev/null
+    return $status
 }
 
-function build_preset()
-{
+function build_preset() {
     local BUILD_NAME=$1
     local PRESET=$2
+    local green="1;32"
+    local red="1;31"
+    local GROUP_NAME="🏗️  Build ${BUILD_NAME}"
 
     source "./sccache_stats.sh" "start"
-    pushd .. > /dev/null
-
-    cmake --build --preset=$PRESET -v
-    echo "$BUILD_NAME build complete."
 
+    pushd .. > /dev/null
+    run_command "$GROUP_NAME" cmake --build --preset=$PRESET -v
+    status=$?
     popd > /dev/null
-    source "./sccache_stats.sh" "end"
+
+    minimal_sccache_stats=$(source "./sccache_stats.sh" "end")
+
+    # Only print detailed stats in actions workflow
+    if [ -n "${GITHUB_ACTIONS:-}" ]; then
+        begin_group "💲 sccache stats"
+        echo "${minimal_sccache_stats}"
+        sccache -s
+        end_group
+
+        begin_group "🥷 ninja build times"
+        echo "The "weighted" time is the elapsed time of each build step divided by the number
+              of tasks that were running in parallel. This makes it an excellent approximation
+              of how "important" a slow step was. A link that is entirely or mostly serialized
+              will have a weighted time that is the same or similar to its elapsed time. A
+              compile that runs in parallel with 999 other compiles will have a weighted time
+              that is tiny."
+        ./ninja_summary.py -C ${BUILD_DIR}/${PRESET}
+        end_group
+    else
+      echo $minimal_sccache_stats
+    fi
+
+    return $status
 }
 
 function test_preset()
 {
     local BUILD_NAME=$1
     local PRESET=$2
+    local GROUP_NAME="🚀  Test ${BUILD_NAME}"
 
     pushd .. > /dev/null
-
-    ctest --preset=$PRESET
-    echo "$BUILD_NAME testing complete."
-
+    run_command "$GROUP_NAME" ctest --preset=$PRESET
+    status=$?
     popd > /dev/null
+    return $status
 }
 
 function configure_and_build_preset()
diff --git a/ci/build_cub.sh b/ci/build_cub.sh
index f31ec4fd2a..d587c2a6ad 100755
--- a/ci/build_cub.sh
+++ b/ci/build_cub.sh
@@ -2,6 +2,8 @@
 
 source "$(dirname "$0")/build_common.sh"
 
+print_environment_details
+
 # CUB benchmarks require at least CUDA nvcc 11.5 for int128
 # Returns "true" if the first version is greater than or equal to the second
 version_compare() {
@@ -35,3 +37,5 @@ CMAKE_OPTIONS="
 "
 
 configure_and_build_preset "CUB" "$PRESET" "$CMAKE_OPTIONS"
+
+print_time_summary
diff --git a/ci/build_libcudacxx.sh b/ci/build_libcudacxx.sh
index 656851253a..1dc26f3228 100755
--- a/ci/build_libcudacxx.sh
+++ b/ci/build_libcudacxx.sh
@@ -2,7 +2,11 @@
 
 source "$(dirname "$0")/build_common.sh"
 
+print_environment_details
+
 PRESET="libcudacxx-cpp${CXX_STANDARD}"
 CMAKE_OPTIONS=""
 
 configure_and_build_preset libcudacxx "$PRESET" "$CMAKE_OPTIONS"
+
+print_time_summary
diff --git a/ci/build_thrust.sh b/ci/build_thrust.sh
index 887f33b34b..6e4a82da0f 100755
--- a/ci/build_thrust.sh
+++ b/ci/build_thrust.sh
@@ -2,8 +2,12 @@
 
 source "$(dirname "$0")/build_common.sh"
 
+print_environment_details
+
 PRESET="thrust-cpp$CXX_STANDARD"
 
 CMAKE_OPTIONS=""
 
 configure_and_build_preset "Thrust" "$PRESET" "$CMAKE_OPTIONS"
+
+print_time_summary
diff --git a/ci/ninja_summary.py b/ci/ninja_summary.py
new file mode 100755
index 0000000000..f496db534b
--- /dev/null
+++ b/ci/ninja_summary.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+# Copyright (c) 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+r"""Summarize the last ninja build, invoked with ninja's -C syntax.
+
+This script is designed to be automatically run after each ninja build in
+order to summarize the build's performance. Making build performance information
+more visible should make it easier to notice anomalies and opportunities. To use
+this script on Windows just set NINJA_SUMMARIZE_BUILD=1 and run autoninja.bat.
+
+On Linux you can get autoninja to invoke this script using this syntax:
+
+$ NINJA_SUMMARIZE_BUILD=1 autoninja -C out/Default/ chrome
+
+You can also call this script directly using ninja's syntax to specify the
+output directory of interest:
+
+> python3 post_build_ninja_summary.py -C out/Default
+
+Typical output looks like this:
+
+>ninja -C out\debug_component base
+ninja.exe -C out\debug_component base -j 960 -l 48  -d keeprsp
+ninja: Entering directory `out\debug_component'
+[1 processes, 1/1 @ 0.3/s : 3.092s ] Regenerating ninja files
+Longest build steps:
+       0.1 weighted s to build obj/base/base/trace_log.obj (6.7 s elapsed time)
+       0.2 weighted s to build nasm.exe, nasm.exe.pdb (0.2 s elapsed time)
+       0.3 weighted s to build obj/base/base/win_util.obj (12.4 s elapsed time)
+       1.2 weighted s to build base.dll, base.dll.lib (1.2 s elapsed time)
+Time by build-step type:
+       0.0 s weighted time to generate 6 .lib files (0.3 s elapsed time sum)
+       0.1 s weighted time to generate 25 .stamp files (1.2 s elapsed time sum)
+       0.2 s weighted time to generate 20 .o files (2.8 s elapsed time sum)
+       1.7 s weighted time to generate 4 PEFile (linking) files (2.0 s elapsed
+time sum)
+      23.9 s weighted time to generate 770 .obj files (974.8 s elapsed time sum)
+26.1 s weighted time (982.9 s elapsed time sum, 37.7x parallelism)
+839 build steps completed, average of 32.17/s
+
+If no gn clean has been done then results will be for the last non-NULL
+invocation of ninja. Ideas for future statistics, and implementations are
+appreciated.
+
+The "weighted" time is the elapsed time of each build step divided by the number
+of tasks that were running in parallel. This makes it an excellent approximation
+of how "important" a slow step was. A link that is entirely or mostly serialized
+will have a weighted time that is the same or similar to its elapsed time. A
+compile that runs in parallel with 999 other compiles will have a weighted time
+that is tiny."""
+
+import argparse
+import errno
+import fnmatch
+import os
+import subprocess
+import sys
+
+# The number of long build times to report:
+long_count = 10
+# The number of long times by extension to report
+long_ext_count = 10
+
+
+class Target:
+    """Represents a single line read for a .ninja_log file."""
+    def __init__(self, start, end):
+        """Creates a target object by passing in the start/end times in seconds
+        as a float."""
+        self.start = start
+        self.end = end
+        # A list of targets, appended to by the owner of this object.
+        self.targets = []
+        self.weighted_duration = 0.0
+
+    def Duration(self):
+        """Returns the task duration in seconds as a float."""
+        return self.end - self.start
+
+    def SetWeightedDuration(self, weighted_duration):
+        """Sets the duration, in seconds, passed in as a float."""
+        self.weighted_duration = weighted_duration
+
+    def WeightedDuration(self):
+        """Returns the task's weighted duration in seconds as a float.
+
+        Weighted_duration takes the elapsed time of the task and divides it
+        by how many other tasks were running at the same time. Thus, it
+        represents the approximate impact of this task on the total build time,
+        with serialized or serializing steps typically ending up with much
+        longer weighted durations.
+        weighted_duration should always be the same or shorter than duration.
+        """
+        # Allow for modest floating-point errors
+        epsilon = 0.000002
+        if (self.weighted_duration > self.Duration() + epsilon):
+            print('%s > %s?' % (self.weighted_duration, self.Duration()))
+        assert (self.weighted_duration <= self.Duration() + epsilon)
+        return self.weighted_duration
+
+    def DescribeTargets(self):
+        """Returns a printable string that summarizes the targets."""
+        # Some build steps generate dozens of outputs - handle them sanely.
+        # The max_length was chosen so that it can fit most of the long
+        # single-target names, while minimizing word wrapping.
+        result = ', '.join(self.targets)
+        max_length = 65
+        if len(result) > max_length:
+            result = result[:max_length] + '...'
+        return result
+
+
+# Copied with some modifications from ninjatracing
+def ReadTargets(log, show_all):
+    """Reads all targets from .ninja_log file |log_file|, sorted by duration.
+
+    The result is a list of Target objects."""
+    header = log.readline()
+    # Handle empty ninja_log gracefully by silently returning an empty list of
+    # targets.
+    if not header:
+        return []
+    assert header == '# ninja log v5\n', \
+           'unrecognized ninja log version %r' % header
+    targets_dict = {}
+    last_end_seen = 0.0
+    for line in log:
+        parts = line.strip().split('\t')
+        if len(parts) != 5:
+            # If ninja.exe is rudely halted then the .ninja_log file may be
+            # corrupt. Silently continue.
+            continue
+        start, end, _, name, cmdhash = parts  # Ignore restat.
+        # Convert from integral milliseconds to float seconds.
+        start = int(start) / 1000.0
+        end = int(end) / 1000.0
+        if not show_all and end < last_end_seen:
+            # An earlier time stamp means that this step is the first in a new
+            # build, possibly an incremental build. Throw away the previous
+            # data so that this new build will be displayed independently.
+            # This has to be done by comparing end times because records are
+            # written to the .ninja_log file when commands complete, so end
+            # times are guaranteed to be in order, but start times are not.
+            targets_dict = {}
+        target = None
+        if cmdhash in targets_dict:
+            target = targets_dict[cmdhash]
+            if not show_all and (target.start != start or target.end != end):
+                # If several builds in a row just run one or two build steps
+                # then the end times may not go backwards so the last build may
+                # not be detected as such. However in many cases there will be a
+                # build step repeated in the two builds and the changed
+                # start/stop points for that command, identified by the hash,
+                # can be used to detect and reset the target dictionary.
+                targets_dict = {}
+                target = None
+        if not target:
+            targets_dict[cmdhash] = target = Target(start, end)
+        last_end_seen = end
+        target.targets.append(name)
+    return list(targets_dict.values())
+
+
+def GetExtension(target, extra_patterns):
+    """Return the file extension that best represents a target.
+
+  For targets that generate multiple outputs it is important to return a
+  consistent 'canonical' extension. Ultimately the goal is to group build steps
+  by type."""
+    for output in target.targets:
+        if extra_patterns:
+            for fn_pattern in extra_patterns.split(';'):
+                if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
+                    return fn_pattern
+        # Not a true extension, but a good grouping.
+        if output.endswith('type_mappings'):
+            extension = 'type_mappings'
+            break
+
+        # Capture two extensions if present. For example: file.javac.jar should
+        # be distinguished from file.interface.jar.
+        root, ext1 = os.path.splitext(output)
+        _, ext2 = os.path.splitext(root)
+        extension = ext2 + ext1  # Preserve the order in the file name.
+
+        if len(extension) == 0:
+            extension = '(no extension found)'
+
+        if ext1 in ['.pdb', '.dll', '.exe']:
+            extension = 'PEFile (linking)'
+            # Make sure that .dll and .exe are grouped together and that the
+            # .dll.lib files don't cause these to be listed as libraries
+            break
+        if ext1 in ['.so', '.TOC']:
+            extension = '.so (linking)'
+            # Attempt to identify linking, avoid identifying as '.TOC'
+            break
+        # Make sure .obj files don't get categorized as mojo files
+        if ext1 in ['.obj', '.o']:
+            break
+        # Jars are the canonical output of java targets.
+        if ext1 == '.jar':
+            break
+        # Normalize all mojo related outputs to 'mojo'.
+        if output.count('.mojom') > 0:
+            extension = 'mojo'
+            break
+    return extension
+
+
+def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
+    """Print a summary of the passed in list of Target objects."""
+
+    # Create a list that is in order by time stamp and has entries for the
+    # beginning and ending of each build step (one time stamp may have multiple
+    # entries due to multiple steps starting/stopping at exactly the same time).
+    # Iterate through this list, keeping track of which tasks are running at all
+    # times. At each time step calculate a running total for weighted time so
+    # that when each task ends its own weighted time can easily be calculated.
+    task_start_stop_times = []
+
+    earliest = -1
+    latest = 0
+    total_cpu_time = 0
+    for target in entries:
+        if earliest < 0 or target.start < earliest:
+            earliest = target.start
+        if target.end > latest:
+            latest = target.end
+        total_cpu_time += target.Duration()
+        task_start_stop_times.append((target.start, 'start', target))
+        task_start_stop_times.append((target.end, 'stop', target))
+    length = latest - earliest
+    weighted_total = 0.0
+
+    # Sort by the time/type records and ignore |target|
+    task_start_stop_times.sort(key=lambda times: times[:2])
+    # Now we have all task start/stop times sorted by when they happen. If a
+    # task starts and stops on the same time stamp then the start will come
+    # first because of the alphabet, which is important for making this work
+    # correctly.
+    # Track the tasks which are currently running.
+    running_tasks = {}
+    # Record the time we have processed up to so we know how to calculate time
+    # deltas.
+    last_time = task_start_stop_times[0][0]
+    # Track the accumulated weighted time so that it can efficiently be added
+    # to individual tasks.
+    last_weighted_time = 0.0
+    # Scan all start/stop events.
+    for event in task_start_stop_times:
+        time, action_name, target = event
+        # Accumulate weighted time up to now.
+        num_running = len(running_tasks)
+        if num_running > 0:
+            # Update the total weighted time up to this moment.
+            last_weighted_time += (time - last_time) / float(num_running)
+        if action_name == 'start':
+            # Record the total weighted task time when this task starts.
+            running_tasks[target] = last_weighted_time
+        if action_name == 'stop':
+            # Record the change in the total weighted task time while this task
+            # ran.
+            weighted_duration = last_weighted_time - running_tasks[target]
+            target.SetWeightedDuration(weighted_duration)
+            weighted_total += weighted_duration
+            del running_tasks[target]
+        last_time = time
+    assert (len(running_tasks) == 0)
+
+    # Warn if the sum of weighted times is off by more than half a second.
+    if abs(length - weighted_total) > 500:
+        print('Warning: Possible corrupt ninja log, results may be '
+              'untrustworthy. Length = %.3f, weighted total = %.3f' %
+              (length, weighted_total))
+
+    # Print the slowest build steps:
+    print('    Longest build steps:')
+    if elapsed_time_sorting:
+        entries.sort(key=lambda x: x.Duration())
+    else:
+        entries.sort(key=lambda x: x.WeightedDuration())
+    for target in entries[-long_count:]:
+        print('      %8.1f weighted s to build %s (%.1f s elapsed time)' %
+              (target.WeightedDuration(), target.DescribeTargets(),
+               target.Duration()))
+
+    # Sum up the time by file extension/type of the output file
+    count_by_ext = {}
+    time_by_ext = {}
+    weighted_time_by_ext = {}
+    # Scan through all of the targets to build up per-extension statistics.
+    for target in entries:
+        extension = GetExtension(target, extra_step_types)
+        time_by_ext[extension] = time_by_ext.get(extension,
+                                                 0) + target.Duration()
+        weighted_time_by_ext[extension] = weighted_time_by_ext.get(
+            extension, 0) + target.WeightedDuration()
+        count_by_ext[extension] = count_by_ext.get(extension, 0) + 1
+
+    print('    Time by build-step type:')
+    # Copy to a list with extension name and total time swapped, to (time, ext)
+    if elapsed_time_sorting:
+        weighted_time_by_ext_sorted = sorted(
+            (y, x) for (x, y) in time_by_ext.items())
+    else:
+        weighted_time_by_ext_sorted = sorted(
+            (y, x) for (x, y) in weighted_time_by_ext.items())
+    # Print the slowest build target types:
+    for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]:
+        print(
+            '      %8.1f s weighted time to generate %d %s files '
+            '(%1.1f s elapsed time sum)' %
+            (time, count_by_ext[extension], extension, time_by_ext[extension]))
+
+    print('    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
+          'parallelism)' %
+          (length, total_cpu_time, total_cpu_time * 1.0 / length))
+    print('    %d build steps completed, average of %1.2f/s' %
+          (len(entries), len(entries) / (length)))
+
+
+def main():
+    log_file = '.ninja_log'
+    metrics_file = 'siso_metrics.json'
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-C', dest='build_directory', help='Build directory.')
+    parser.add_argument(
+        '-s',
+        '--step-types',
+        help='semicolon separated fnmatch patterns for build-step grouping')
+    parser.add_argument(
+        '-e',
+        '--elapsed_time_sorting',
+        default=False,
+        action='store_true',
+        help='Sort output by elapsed time instead of weighted time')
+    parser.add_argument('--log-file',
+                        help="specific ninja log file to analyze.")
+    args, _extra_args = parser.parse_known_args()
+    if args.build_directory:
+        log_file = os.path.join(args.build_directory, log_file)
+        metrics_file = os.path.join(args.build_directory, metrics_file)
+    if args.log_file:
+        log_file = args.log_file
+    if not args.step_types:
+        # Offer a convenient way to add extra step types automatically,
+        # including when this script is run by autoninja. get() returns None if
+        # the variable isn't set.
+        args.step_types = os.environ.get('chromium_step_types')
+    if args.step_types:
+        # Make room for the extra build types.
+        global long_ext_count
+        long_ext_count += len(args.step_types.split(';'))
+
+    if os.path.exists(metrics_file):
+        # Automatically handle summarizing siso builds.
+        cmd = ['siso.bat' if 'win32' in sys.platform else 'siso']
+        cmd.extend(['metrics', 'summary'])
+        if args.build_directory:
+            cmd.extend(['-C', args.build_directory])
+        if args.step_types:
+            cmd.extend(['--step_types', args.step_types])
+        if args.elapsed_time_sorting:
+            cmd.append('--elapsed_time_sorting')
+        subprocess.run(cmd)
+    else:
+        try:
+            with open(log_file, 'r') as log:
+                entries = ReadTargets(log, False)
+                if entries:
+                    SummarizeEntries(entries, args.step_types,
+                                     args.elapsed_time_sorting)
+        except IOError:
+            print('Log file %r not found, no build summary created.' % log_file)
+            return errno.ENOENT
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/ci/nvrtc_libcudacxx.sh b/ci/nvrtc_libcudacxx.sh
index 4a0d9f6e89..a33fb14522 100755
--- a/ci/nvrtc_libcudacxx.sh
+++ b/ci/nvrtc_libcudacxx.sh
@@ -2,6 +2,8 @@
 
 source "$(dirname "$0")/build_common.sh"
 
+print_environment_details
+
 
 PRESET="libcudacxx-nvrtc-cpp${CXX_STANDARD}"
 CMAKE_OPTIONS=""
diff --git a/ci/sccache_stats.sh b/ci/sccache_stats.sh
index 8abb4125c2..3a3ebc421c 100755
--- a/ci/sccache_stats.sh
+++ b/ci/sccache_stats.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # This script prints the sccache hit rate between two calls to sccache --show-stats.
-# It should be sourced in your script before and after the operations you want to profile, 
+# It should be sourced in your script before and after the operations you want to profile,
 # with the 'start' or 'end' argument respectively.
 
 mode=$1
@@ -12,6 +12,12 @@ if [[ "$mode" != "start" && "$mode" != "end" ]]; then
     exit 1
 fi
 
+# Check if sccache is available
+if ! command -v sccache &> /dev/null; then
+    echo "Notice: sccache is not available. Skipping..."
+    exit 0
+fi
+
 case $mode in
   start)
     export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
diff --git a/ci/test_cub.sh b/ci/test_cub.sh
index b379cc2cbf..9fd9feff48 100755
--- a/ci/test_cub.sh
+++ b/ci/test_cub.sh
@@ -2,8 +2,12 @@
 
 source "$(dirname "$0")/build_common.sh"
 
+print_environment_details
+
 ./build_cub.sh "$@"
 
 PRESET="cub-cpp$CXX_STANDARD"
 
 test_preset CUB "${PRESET}"
+
+print_time_summary
diff --git a/ci/test_libcudacxx.sh b/ci/test_libcudacxx.sh
index c433199cc4..64736f430e 100755
--- a/ci/test_libcudacxx.sh
+++ b/ci/test_libcudacxx.sh
@@ -2,6 +2,8 @@
 
 source "$(dirname "$0")/build_common.sh"
 
+print_environment_details
+
 PRESET="libcudacxx-cpp${CXX_STANDARD}"
 CMAKE_OPTIONS=""
 
@@ -18,3 +20,5 @@ test_preset "libcudacxx (CTest)" ${CTEST_PRESET}
 source "./sccache_stats.sh" "start"
 test_preset "libcudacxx (lit)" ${LIT_PRESET}
 source "./sccache_stats.sh" "end"
+
+print_time_summary
diff --git a/ci/test_thrust.sh b/ci/test_thrust.sh
index a2895f9aea..1385ef560e 100755
--- a/ci/test_thrust.sh
+++ b/ci/test_thrust.sh
@@ -2,8 +2,12 @@
 
 source "$(dirname "$0")/build_common.sh"
 
+print_environment_details
+
 ./build_thrust.sh "$@"
 
 PRESET="thrust-cpp$CXX_STANDARD"
 
 test_preset "Thrust" ${PRESET}
+
+print_time_summary