From b9aeeac1f0618b206c4581685b43dff8e909fb58 Mon Sep 17 00:00:00 2001 From: Peter Arsenault Date: Thu, 26 Feb 2026 12:29:48 -0500 Subject: [PATCH] Add comprehensive package scanning support for Habitat, native, and modern installers - Implement Habitat package scanning with direct and transitive dependency support - Add Grype and Trivy vulnerability scanning with caching and retry logic - Support for Chef Infra Client, chef-ice, and other native/modern packages - Add CINC package scanning support - Implement download retry logic from Chef acceptance site - Add package size tracking and metadata generation - Support for HAB_AUTH_TOKEN for private channel access - Add full_scan mode for complete product rescanning - Include dpkg extraction support for tarballs - Add version matching for stable and current releases Signed-off-by: Peter Arsenault --- .../chef-download-grype-snapshot/.gitignore | 1 + .../chef-download-grype-snapshot/README.md | 504 ++++- .../chef-download-grype-snapshot/action.yml | 119 +- .../chef-download-grype-snapshot/run.py | 1692 +++++++++++++++-- .../run.py.backup | 501 +++++ 5 files changed, 2594 insertions(+), 223 deletions(-) create mode 100644 .github/actions/chef-download-grype-snapshot/.gitignore create mode 100644 .github/actions/chef-download-grype-snapshot/run.py.backup diff --git a/.github/actions/chef-download-grype-snapshot/.gitignore b/.github/actions/chef-download-grype-snapshot/.gitignore new file mode 100644 index 0000000..ed8ebf5 --- /dev/null +++ b/.github/actions/chef-download-grype-snapshot/.gitignore @@ -0,0 +1 @@ +__pycache__ \ No newline at end of file diff --git a/.github/actions/chef-download-grype-snapshot/README.md b/.github/actions/chef-download-grype-snapshot/README.md index ee045fd..6bdd5d1 100644 --- a/.github/actions/chef-download-grype-snapshot/README.md +++ b/.github/actions/chef-download-grype-snapshot/README.md @@ -1,11 +1,18 @@ -# Chef Download + Grype Snapshot Action +# Chef Download + Grype + Trivy Snapshot 
Action -Composite action that downloads Chef products from downloads.chef.io and runs Grype vulnerability scans. +Composite action that downloads Chef products and runs both Grype and Trivy vulnerability scans for comprehensive vulnerability detection. + +Supports three scan modes: +- **native**: Downloads packages from Chef download sites and scans them (supports both Grype and Trivy) +- **modern**: Downloads next-generation products (chef-ice) with flexible channel configurations +- **habitat**: Installs Habitat packages and scans each dependency separately (Grype only) ## Usage +### Native Mode - Standard Products + ```yaml -- name: Scan chef product +- name: Scan chef product (native) uses: chef/common-github-actions/.github/actions/chef-download-grype-snapshot@main with: product: chef @@ -17,25 +24,111 @@ Composite action that downloads Chef products from downloads.chef.io and runs Gr scan_mode: native scan_root: /opt/chef license_id: ${{ secrets.LICENSE_ID }} + enable_trivy: true + trivy_scanners: vuln +``` + +### Native Mode - CINC (Open Source) + +```yaml +- name: Scan CINC (open source Chef) + uses: chef/common-github-actions/.github/actions/chef-download-grype-snapshot@main + with: + product: chef # Mapped internally to 'cinc' + channel: stable + download_site: cinc + os: ubuntu + os_version: "24.04" + arch: x86_64 + scan_mode: native + scan_root: /opt/cinc + enable_trivy: true +``` + +### Modern Mode - Next-Gen Products (chef-ice) + +```yaml +- name: Scan chef-ice (modern product) + uses: chef/common-github-actions/.github/actions/chef-download-grype-snapshot@main + with: + product: chef-ice + channel: stable + download_site: commercial + os: linux + os_version: "" # Not needed for universal binaries + arch: x86_64 + package_manager: deb # Required: deb, rpm, or tar + scan_mode: modern + scan_root: /hab + license_id: ${{ secrets.LICENSE_ID }} + enable_trivy: true +``` + +### Modern Mode - With Base URL Override (current channel) + +```yaml +- name: Scan 
chef-ice current channel + uses: chef/common-github-actions/.github/actions/chef-download-grype-snapshot@main + with: + product: chef-ice + channel: current + download_site: commercial + os: linux + arch: x86_64 + package_manager: deb + scan_mode: modern + scan_root: /hab + license_id: ${{ secrets.CHEF_ACCEPTANCE_LICENSE_ID }} + base_url_override: https://commercial-acceptance.downloads.chef.co + enable_trivy: true +``` + +### Habitat Mode + +```yaml +- name: Scan chef habitat package + uses: chef/common-github-actions/.github/actions/chef-download-grype-snapshot@main + with: + product: chef-infra-client + channel: stable + os: ubuntu + os_version: "24.04" + arch: x86_64 + scan_mode: habitat + hab_ident: "chef/chef-infra-client" + hab_channel: stable + hab_auth_token: ${{ secrets.HAB_AUTH_TOKEN }} ``` ## Inputs | Input | Required | Default | Description | |-------|----------|---------|-------------| -| `product` | Yes | - | Chef product name (chef, chef-workstation, chef-server, etc.) | -| `channel` | Yes | - | Release channel (stable, current) | -| `download_site` | Yes | commercial | Download site (commercial or community) | -| `os` | Yes | ubuntu | OS platform | -| `os_version` | Yes | - | OS version (e.g., 24.04) | +| `product` | Yes | - | Chef product name (chef, chef-workstation, chef-server, inspec, chef-ice, etc.) 
| +| `channel` | Yes | - | Release channel (stable, current) or target channel for native/modern/habitat | +| `download_site` | Yes | commercial | Download site (commercial, community, or cinc) - native/modern modes only | +| `os` | Yes | ubuntu | OS platform (ubuntu for standard products, linux for universal binaries) | +| `os_version` | No | "" | OS version (e.g., 24.04) - optional for universal binaries like chef-ice | | `arch` | Yes | x86_64 | Architecture | -| `scan_mode` | Yes | native | Scan mode (native or habitat) | -| `scan_root` | Yes | - | Install root path for metadata (e.g., /opt/chef) | -| `resolve_version` | Yes | latest | Version resolution (latest or pinned) | -| `pinned_version` | No | "" | Specific version when resolve_version=pinned | -| `license_id` | No | "" | License ID for downloads (pass via secrets) | +| `package_manager` | No | "" | Package manager type (deb, rpm, tar) - required for universal binaries like chef-ice | +| `scan_mode` | Yes | native | Scan mode (native, modern, or habitat) | +| `scan_root` | Yes | - | Install root path for metadata (e.g., /opt/chef) - native/modern modes only | +| `resolve_version` | Yes | latest | Version resolution (latest or pinned) - native/modern modes only | +| `pinned_version` | No | "" | Specific version when resolve_version=pinned - native/modern modes only | +| `license_id` | No | "" | License ID for downloads (pass via secrets) - not required for CINC | +| `base_url_override` | No | "" | Override default base URL (e.g., https://commercial-acceptance.downloads.chef.co for current channel) | +| `hab_ident` | No | "" | Habitat package identifier (e.g., 'core/chef-infra-client') - habitat mode | +| `hab_channel` | No | stable | Habitat channel (stable, current, base-2025, etc.) 
- habitat mode | +| `hab_origin` | No | "" | Habitat origin (e.g., 'chef') - alternative to hab_ident - habitat mode | +| `hab_auth_token` | No | "" | Habitat Builder Personal Access Token for protected channels (pass via secrets) | | `out_dir` | No | out | Output directory for results | | `work_dir` | No | work | Working directory for temporary files | +| `enable_trivy` | No | true | Enable Trivy scanning alongside Grype (native/modern modes only) | +| `trivy_scanners` | No | vuln | Trivy scanner types (comma-separated: vuln, misconfig, secret, license) | +| `trivy_severity` | No | UNKNOWN,LOW,MEDIUM,HIGH,CRITICAL | Severity levels to report | +| `trivy_ignore_unfixed` | No | false | Ignore vulnerabilities without fixes | +| `trivy_timeout` | No | "" | Timeout for Trivy scan | +| `trivy_cache_dir` | No | "" | Directory for Trivy cache | ## Outputs @@ -44,25 +137,185 @@ Composite action that downloads Chef products from downloads.chef.io and runs Gr | `resolved_version` | The resolved product version that was scanned | | `download_url_redacted` | Download URL with license_id removed | +## Size Tracking + +**New in 2026**: The action now calculates and tracks the installed size of scanned products. 
+ +### Size Metrics + +For native and modern modes, the metadata.json includes a `size` section under `target`: + +```json +{ + "target": { + "product": "chef-workstation", + "channel": "stable", + "resolved_version": "25.12.1102", + "download": {...}, + "size": { + "package_bytes": 134217728, + "installed_bytes": 536870912, + "installed_human_readable": "512.00 MB", + "file_count": 12543 + } + } +} +``` + +### Size Fields + +- **package_bytes**: Downloaded package size in bytes (compressed .deb file) +- **installed_bytes**: Total size after extraction in bytes (actual disk footprint) +- **installed_human_readable**: Human-readable installed size (e.g., "512.00 MB") +- **file_count**: Number of files after extraction + +**For Habitat mode**, size information is tracked differently: +- Each dependency includes its individual installed size +- The index.json includes aggregate totals: + - **total_installed_bytes**: Combined size of all dependencies + - **total_installed_human_readable**: Human-readable total size + - **total_file_count**: Total files across all dependencies + +### Use Cases + +This size information helps answer: +- **Disk footprint**: How much space does the product consume after installation? +- **CVE scan scope**: What is the actual size of content being scanned for vulnerabilities? +- **Capacity planning**: How much storage is needed for deployments? +- **Trend analysis**: How does installed size change across versions and channels? + +### Analysis Tools + +Use the provided `calculate_installed_sizes.py` script to analyze sizes across all scanned products: + +```bash +python3 calculate_installed_sizes.py +``` + +This generates a summary table showing package sizes, installed sizes, and file counts for all scanned products. 
+ ## Output Files -The action generates two JSON files in the `out_dir`: +### Native and Modern Modes + +The action generates scanner-specific outputs in the `out_dir/scanners/` directory: + +**Scanner Outputs (Canonical):** +- **scanners/grype.latest.json**: Complete Grype scan results +- **scanners/grype.metadata.json**: Grype scan metadata (version, DB info, severity counts) +- **scanners/trivy.latest.json**: Complete Trivy scan results (if enabled) +- **scanners/trivy.metadata.json**: Trivy scan metadata (version, DB info, severity counts) +- **scanners/compare.json**: CVE-level comparison between Grype and Trivy results + +**Legacy Compatibility Files:** +For backward compatibility during migration: +- **latest.json**: Copy of `scanners/grype.latest.json` +- **metadata.json**: Copy of `scanners/grype.metadata.json` + +**Comparison Format:** +The `compare.json` file provides a CVE-level comparison: +```json +{ + "schema_version": "1.0", + "generated_at_utc": "2026-02-03T10:15:30Z", + "target": { + "product": "chef", + "channel": "stable", + "resolved_version": "18.5.0" + }, + "summary": { + "grype": { + "cve_count": 123, + "severity_counts": {"Critical": 5, "High": 20, ...} + }, + "trivy": { + "cve_count": 120, + "severity_counts": {"Critical": 4, "High": 18, ...} + } + }, + "diff": { + "only_in_grype": ["CVE-2023-1234", ...], + "only_in_trivy": ["CVE-2023-5678", ...], + "in_both": ["CVE-2023-9012", ...] 
+ } +} +``` + +### Habitat Mode -- **latest.json**: Complete Grype scan results -- **metadata.json**: Scan metadata including version, environment, and severity counts +The action generates an index file and per-dependency scans organized by type: + +- **index.json**: Rollup of all dependencies (direct and transitive) with aggregate counts and metadata +- **direct-deps////.json**: Grype scan results for each direct dependency +- **direct-deps////.metadata.json**: Metadata for each direct dependency +- **transitive-deps////.json**: Grype scan results for each transitive dependency +- **transitive-deps////.metadata.json**: Metadata for each transitive dependency + +Example structure: +``` +out/ +├── index.json +├── direct-deps/ +│ ├── core/ +│ │ ├── openssl/ +│ │ │ └── 3.0.13/ +│ │ │ ├── 20250101120000.json +│ │ │ └── 20250101120000.metadata.json +│ │ └── glibc/ +│ │ └── 2.39/ +│ │ ├── 20250105140500.json +│ │ └── 20250105140500.metadata.json +│ └── chef/ +│ └── chef-infra-client/ +│ └── 18.5.0/ +│ ├── 20250110120000.json +│ └── 20250110120000.metadata.json +└── transitive-deps/ + └── core/ + ├── gcc-libs/ + │ └── 9.5.0/ + │ ├── 20240105173910.json + │ └── 20240105173910.metadata.json + └── zlib/ + └── 1.3/ + ├── 20240105173710.json + └── 20240105173710.metadata.json +``` ## Requirements +### Native and Modern Modes - Ubuntu runner (uses `dpkg` for package extraction) - Grype is automatically installed if not present +- Trivy is automatically installed if not present - Valid license_id for the specified download_site: - - Commercial sites require a commercial license - - Community sites require a Free license + - **Commercial**: Requires a commercial license + - **Community**: Requires a Free license + - **CINC**: No license required (open source) -## Download Site Constraints +### Habitat Mode +- Linux or Windows runner +- Habitat CLI is automatically installed if not present +- Grype is automatically installed if not present +- Valid HAB_AUTH_TOKEN (passed via 
license_id) for licensed channels + +## Download Sites -- **Commercial**: Supports both `stable` and `current` channels -- **Community**: Only supports `stable` channel (API enforced) +### Native/Modern Mode +- **Commercial** (`commercial`): Chef commercial downloads at `https://chefdownload-commercial.chef.io` + - Supports `stable` and `current` channels + - Requires commercial license_id +- **Community** (`community`): Chef community downloads at `https://chefdownload-community.chef.io` + - Only supports `stable` channel (API enforced) + - Requires Free license_id +- **CINC** (`cinc`): Open source Chef at `https://omnitruck.cinc.sh` + - Only supports `stable` channel + - No license required + - Product name mapping: chef→cinc, inspec→cinc-auditor, chef-server→cinc-server, chef-workstation→cinc-workstation + +### Habitat Mode +- Channels are flexible: `stable`, `current`, `base-2025`, custom channels, etc. +- Licensed channels (e.g., `base-2025`) require HAB_AUTH_TOKEN via license_id input ## Error Handling @@ -101,7 +354,216 @@ jobs: path: out/ ``` +## Example: Habitat Mode with Multiple Packages + +```yaml +jobs: + habitat-scan: + runs-on: ubuntu-latest + strategy: + matrix: + package: + - { ident: "chef/chef-infra-client", channel: "stable" } + - { ident: "chef/inspec", channel: "stable" } + - { ident: "core/ruby", channel: "stable" } + steps: + - uses: chef/common-github-actions/.github/actions/chef-download-grype-snapshot@main + with: + product: ${{ matrix.package.ident }} + channel: ${{ matrix.package.channel }} + os: ubuntu + os_version: "24.04" + arch: x86_64 + scan_mode: habitat + hab_ident: ${{ matrix.package.ident }} + hab_channel: ${{ matrix.package.channel }} + hab_auth_token: ${{ secrets.HAB_AUTH_TOKEN }} + + - name: Upload results + uses: actions/upload-artifact@v4 + with: + name: habitat-scan-${{ matrix.package.ident }}-${{ matrix.package.channel }} + path: out/ +``` + +## Habitat Scan Path Conventions + +Habitat packages are scanned at their 
installation paths: + +- **Linux**: `/hab/pkgs/////` +- **Windows**: `C:\hab\pkgs\\\\\` + +Each dependency is scanned separately, and results are published to the data repo with this structure: + +``` +habitat//////// +├── .json ← Main package scan +├── .metadata.json ← Main package metadata +├── index.json ← Rollup of all dependencies +├── direct-deps/ ← Direct dependencies +│ ├── ///.json +│ └── ///.metadata.json +└── transitive-deps/ ← Transitive dependencies + ├── ///.json + └── ///.metadata.json +``` + +**Example:** +``` +habitat/inspec/stable/ubuntu/x86_64/chef/inspec/5.24.5/ +├── 20260128071642.json ← Main inspec scan +├── 20260128071642.metadata.json ← Main inspec metadata +├── index.json ← Rollup +├── direct-deps/ +│ ├── core/ruby31/3.1.7/20250728150529.json +│ ├── core/ruby31/3.1.7/20250728150529.metadata.json +│ ├── core/bash/5.1/20240105214248.json +│ └── core/bash/5.1/20240105214248.metadata.json +└── transitive-deps/ + ├── core/gcc-libs/9.5.0/20240105173910.json + ├── core/gcc-libs/9.5.0/20240105173910.metadata.json + ├── core/glibc/2.35/20240105171810.json + └── core/glibc/2.35/20240105171810.metadata.json +``` + ## Related Projects - [chef-vuln-scan-orchestrator](https://github.com/chef/chef-vuln-scan-orchestrator) - Orchestration workflow using this action - [chef-vuln-scan-data](https://github.com/chef/chef-vuln-scan-data) - Data repository for scan results + +# Scan Mode Support + +## Native Mode + +Native mode downloads and scans Chef product installers from Chef download sites. 
+ +### Standard Products +Products like chef, chef-server, and chef-workstation use OS-specific packages: +- **Commercial/Community**: Download URL `?p=ubuntu&pv=24.04&m=x86_64&v=latest` +- **CINC**: Fetches direct .deb URL from `/packages` endpoint (no query params) +- Output path: `native/{product}/{channel}/{download_site}/{os}/{os_version}/{arch}/` + +### Download Site Specifics +- **Commercial** (`chefdownload-commercial.chef.io`): Requires license_id, supports stable/current +- **Community** (`chefdownload-community.chef.io`): Requires Free license_id, stable only +- **CINC** (`omnitruck.cinc.sh`): No license, stable only, direct .deb downloads + +### Product Name Mapping (CINC only) +CINC uses different product names than Chef: +- `chef` → `cinc` +- `chef-server` → `cinc-server` +- `chef-workstation` → `cinc-workstation` +- `inspec` → `cinc-auditor` + +The scanner automatically maps these internally while preserving Chef product names in output paths. + +## Modern Mode + +Modern mode is for next-generation Chef products with flexible deployment configurations. + +### Features +- Supports channel-specific base URLs via `base_url_override` +- Supports channel-specific license IDs +- Universal binaries (no OS version) +- Platform-agnostic packages +- Output path: `modern/{product}/{channel}/{download_site}/{os}/{arch}/{package_manager}/` + +### chef-ice Example +- Download URL: `?p=linux&pm=deb&m=x86_64&v=latest` (no OS version) +- Requires `package_manager` input (deb, rpm, or tar) +- Scan root: `/hab` (contains bundled Habitat-based Chef 19) +- Can use different base URLs per channel (e.g., commercial-acceptance for current channel) + +## Habitat Mode + +Habitat mode installs and scans Habitat packages with per-dependency tracking. 
+ +### Features +- Installs Habitat CLI automatically if not present +- Installs specified Habitat package using `hab pkg install` +- Enumerates dependencies (direct and transitive separately) +- Scans each dependency at its install path +- Generates per-dependency JSON and metadata files +- Creates index.json rollup with aggregate counts +- Supports version-based cleanup to prevent historical accumulation +- Modified copy step to handle both native (latest.json/metadata.json) and habitat (index.json/deps/) outputs + +### 5. Updated targets.yml +Added example habitat targets: +- chef-habitat-infra-client +- chef-habitat-inspec +- core-habitat-ruby + +## Output Structure + +### Native Mode (unchanged) +``` +out/ +├── latest.json +└── metadata.json +``` + +Published to: `native///////` + +### Habitat Mode (new) +``` +out/ +├── index.json +└── deps/ + └── / + └── / + └── / + ├── .json + └── .metadata.json +``` + +Published to: `habitat/////` + +## Scan Paths + +### Linux +- Native: Extracted package directory +- Habitat: `/hab/pkgs/////` + +### Windows +- Habitat: `C:\hab\pkgs\\\\\` + +## Usage Example + +```yaml +- uses: chef/common-github-actions/.github/actions/chef-download-grype-snapshot@main + with: + product: chef-infra-client + channel: stable + os: ubuntu + os_version: "24.04" + arch: x86_64 + scan_mode: habitat + hab_ident: "chef/chef-infra-client" + hab_channel: stable + hab_auth_token: ${{ secrets.HAB_AUTH_TOKEN }} +``` + +## Testing Recommendations + +1. Start with `enabled: false` on habitat targets in targets.yml +2. Enable one target at a time for testing +3. Start with direct dependencies only (`transitive_deps: false`) +4. Monitor scan duration and data repo size +5. 
Add transitive dependencies later if needed + +## Implementation Notes + +- Habitat CLI is auto-installed using the official install script +- HAB_AUTH_TOKEN is passed via the `license_id` input for licensed channels +- Each dependency is scanned independently to allow granular tracking +- The index.json provides aggregate counts for quick "what changed" comparisons +- Per-dependency metadata enables detailed change tracking + +## Benefits + +1. **Granular vulnerability tracking**: See which dependency introduced which vulnerability +2. **Change attribution**: Know if vuln count changes are from the main package or a dependency +3. **Dependency awareness**: Track vulnerability landscape across the dependency tree +4. **Consistent structure**: Same JSON schema and metadata approach as native mode +5. **Flexible channels**: Support for stable, current, and custom channels like base-2025 diff --git a/.github/actions/chef-download-grype-snapshot/action.yml b/.github/actions/chef-download-grype-snapshot/action.yml index 5ab7580..7393d2a 100644 --- a/.github/actions/chef-download-grype-snapshot/action.yml +++ b/.github/actions/chef-download-grype-snapshot/action.yml @@ -16,15 +16,20 @@ inputs: description: "OS platform (ubuntu)" default: "ubuntu" os_version: - required: true - description: "OS version (e.g., 24.04)" + required: false + description: "OS version (e.g., 24.04) - optional for universal binaries like chef-ice" + default: "" arch: required: true description: "Architecture (x86_64)" default: "x86_64" + package_manager: + required: false + description: "Package manager type (deb, rpm, tar) - required for universal binaries like chef-ice" + default: "" scan_mode: required: true - description: "native|habitat (native for pilot)" + description: "native|modern|habitat (native for traditional products, modern for next-gen products, habitat for Habitat packages)" default: "native" scan_root: required: true @@ -41,6 +46,26 @@ inputs: required: false description: 
"License ID for commercial downloads (pass via secrets)" default: "" + base_url_override: + required: false + description: "Override the default base URL for downloads (e.g., https://commercial-acceptance.downloads.chef.co for current channel)" + default: "" + hab_ident: + required: false + description: "Habitat package identifier (e.g., 'core/chef-infra-client') - for habitat mode" + default: "" + hab_channel: + required: false + description: "Habitat channel (stable, current, base-2025, etc.) - for habitat mode" + default: "stable" + hab_origin: + required: false + description: "Habitat origin (e.g., 'chef') - alternative to hab_ident - for habitat mode" + default: "" + hab_auth_token: + required: false + description: "Habitat Builder Personal Access Token for protected channels (pass via secrets)" + default: "" out_dir: required: false description: "Output directory" @@ -49,6 +74,38 @@ inputs: required: false description: "Working directory" default: "work" + data_repo_path: + required: false + description: "Path to checked out data repository for version comparison (optional)" + default: "" + full_product_scan: + required: false + description: "Force full product scan, bypassing version check (default: true for scheduled runs, false for manual runs)" + default: "false" + enable_trivy: + required: false + description: "Enable Trivy scanning alongside Grype" + default: "true" + trivy_scanners: + required: false + description: "Trivy scanner types (comma-separated: vuln, misconfig, secret, license)" + default: "vuln" + trivy_severity: + required: false + description: "Severity levels to report (comma-separated)" + default: "UNKNOWN,LOW,MEDIUM,HIGH,CRITICAL" + trivy_ignore_unfixed: + required: false + description: "Ignore vulnerabilities without fixes" + default: "false" + trivy_timeout: + required: false + description: "Timeout for Trivy scan" + default: "" + trivy_cache_dir: + required: false + description: "Directory for Trivy cache" + default: "" outputs: 
resolved_version: @@ -61,6 +118,48 @@ outputs: runs: using: "composite" steps: + - name: Install Trivy + shell: bash + run: | + set -euo pipefail + # Tool versions (pinned for stability and cache reliability) + GRYPE_VERSION="0.109.0" + TRIVY_VERSION="0.58.1" + + # Export for use in run.py + echo "GRYPE_VERSION=${GRYPE_VERSION}" >> $GITHUB_ENV + + # Check if Trivy exists (may be from cache) + if [ -f /usr/local/bin/trivy ]; then + # Ensure executable permissions (cache may not preserve them) + sudo chmod +x /usr/local/bin/trivy 2>/dev/null || chmod +x /usr/local/bin/trivy + echo "✓ Trivy found in cache: $(trivy --version | head -n1)" + elif command -v trivy >/dev/null 2>&1; then + echo "✓ Trivy already installed: $(trivy --version | head -n1)" + else + echo "Installing Trivy ${TRIVY_VERSION}..." + # Retry logic for GitHub releases API (handles transient 502 errors) + MAX_RETRIES=5 + RETRY=0 + while [ $RETRY -lt $MAX_RETRIES ]; do + if curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin v${TRIVY_VERSION}; then + echo "✓ Trivy installed successfully" + break + else + RETRY=$((RETRY + 1)) + if [ $RETRY -lt $MAX_RETRIES ]; then + # Exponential backoff with jitter + SLEEP_TIME=$((2 ** RETRY + RANDOM % 2)) + echo "⚠️ Trivy installation failed (attempt $RETRY/$MAX_RETRIES), retrying in ${SLEEP_TIME}s..." 
+ sleep $SLEEP_TIME + else + echo "✗ Trivy installation failed after $MAX_RETRIES attempts" + exit 1 + fi + fi + done + fi + - name: Run snapshot logic id: run shell: bash @@ -71,13 +170,27 @@ runs: OS: ${{ inputs.os }} OS_VERSION: ${{ inputs.os_version }} ARCH: ${{ inputs.arch }} + PACKAGE_MANAGER: ${{ inputs.package_manager }} SCAN_MODE: ${{ inputs.scan_mode }} SCAN_ROOT: ${{ inputs.scan_root }} RESOLVE_VERSION: ${{ inputs.resolve_version }} PINNED_VERSION: ${{ inputs.pinned_version }} LICENSE_ID: ${{ inputs.license_id }} + BASE_URL_OVERRIDE: ${{ inputs.base_url_override }} + HAB_IDENT: ${{ inputs.hab_ident }} + HAB_CHANNEL: ${{ inputs.hab_channel }} + HAB_ORIGIN: ${{ inputs.hab_origin }} + HAB_AUTH_TOKEN: ${{ inputs.hab_auth_token }} OUT_DIR: ${{ inputs.out_dir }} WORK_DIR: ${{ inputs.work_dir }} + DATA_REPO_PATH: ${{ inputs.data_repo_path }} + FULL_PRODUCT_SCAN: ${{ inputs.full_product_scan }} + ENABLE_TRIVY: ${{ inputs.enable_trivy }} + TRIVY_SCANNERS: ${{ inputs.trivy_scanners }} + TRIVY_SEVERITY: ${{ inputs.trivy_severity }} + TRIVY_IGNORE_UNFIXED: ${{ inputs.trivy_ignore_unfixed }} + TRIVY_TIMEOUT: ${{ inputs.trivy_timeout }} + TRIVY_CACHE_DIR: ${{ inputs.trivy_cache_dir }} run: | set -euo pipefail if [ -n "${LICENSE_ID:-}" ]; then diff --git a/.github/actions/chef-download-grype-snapshot/run.py b/.github/actions/chef-download-grype-snapshot/run.py index 758adbd..01c7d63 100644 --- a/.github/actions/chef-download-grype-snapshot/run.py +++ b/.github/actions/chef-download-grype-snapshot/run.py @@ -1,4 +1,4 @@ -import os, json, subprocess, re +import os, json, subprocess, re, time, random from datetime import datetime, timezone from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode @@ -8,16 +8,180 @@ def env(k, d=""): def now_utc(): return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") -def run(cmd, check=True): - p = subprocess.run(cmd, text=True, capture_output=True) - if check and p.returncode != 0: - raise RuntimeError(f"Command 
failed: {cmd}\nstdout:\n{p.stdout}\nstderr:\n{p.stderr}") +def run(cmd, check=True, retry_config=None): + """ + Execute command with optional retry logic. + + Args: + cmd: Command to execute + check: Raise on non-zero exit + retry_config: Dict with retry settings (if None, no retry) + {"max_retries": 5, "base_delay": 2, "max_delay": 30} + """ + if retry_config is None: + # No retry - original behavior + p = subprocess.run(cmd, text=True, capture_output=True) + if check and p.returncode != 0: + raise RuntimeError(f"Command failed: {cmd}\nstdout:\n{p.stdout}\nstderr:\n{p.stderr}") + return p.returncode, p.stdout.strip(), p.stderr.strip() + + # Retry logic + max_retries = retry_config.get("max_retries", 5) + base_delay = retry_config.get("base_delay", 2) + max_delay = retry_config.get("max_delay", 30) + + last_error = None + for attempt in range(max_retries): + p = subprocess.run(cmd, text=True, capture_output=True) + + if p.returncode == 0: + return p.returncode, p.stdout.strip(), p.stderr.strip() + + last_error = f"Command failed: {cmd}\nstdout:\n{p.stdout}\nstderr:\n{p.stderr}" + + # Check if error is retryable + if not is_retryable_error(p.stderr, p.stdout): + if check: + raise RuntimeError(last_error) + return p.returncode, p.stdout.strip(), p.stderr.strip() + + # Calculate backoff with jitter + if attempt < max_retries - 1: # Don't sleep on last attempt + jitter = random.uniform(0, 1) + sleep_time = min((base_delay * (2 ** attempt)) + jitter, max_delay) + print(f"⚠️ Retryable error on attempt {attempt + 1}/{max_retries}") + print(f" Error: {p.stderr.strip()[:200]}") + print(f" Retrying in {sleep_time:.1f}s...") + time.sleep(sleep_time) + + # All retries exhausted + if check: + raise RuntimeError(f"Failed after {max_retries} attempts: {last_error}") return p.returncode, p.stdout.strip(), p.stderr.strip() +def is_retryable_error(stderr, stdout): + """ + Determine if error is retryable based on curl error codes and messages. 
+ + Retryable errors: + - (92) HTTP/2 stream errors + - (18) Partial file transfer + - (56) Failure receiving network data + - (7) Failed to connect + - (28) Timeout + - (52) Empty reply from server + - (55) Failed sending network data + + Non-retryable errors: + - 400, 401, 403, 404 (client errors) + """ + error_text = stderr.lower() + stdout.lower() + + # Retryable curl error codes + retryable_codes = [ + "(92)", # HTTP/2 stream error - THE MAIN ISSUE + "(18)", # Partial file + "(56)", # Recv error + "(7)", # Failed to connect + "(28)", # Timeout + "(52)", # Empty reply + "(55)", # Send error + "(16)", # HTTP/2 error + ] + + for code in retryable_codes: + if code in error_text: + return True + + # Check for HTTP server errors (5xx) + if any(code in error_text for code in ["500", "502", "503", "504"]): + return True + + # Non-retryable conditions + if any(code in error_text for code in ["401", "403", "404", "400"]): + return False + + # Default: don't retry unless explicitly identified as retryable + return False + def http_json(url): rc, out, err = run(["bash", "-lc", f"curl -fsSL '{url}'"], check=True) return json.loads(out) +def parse_version(version_str): + """ + Parse a semantic version string into comparable components. 
+ + Args: + version_str: Version string (e.g., "5.24.7", "6.8.24") + + Returns: + Tuple of (major, minor, patch) as integers, or None if parsing fails + """ + if not version_str: + return None + + # Remove any 'v' prefix + clean_ver = version_str.strip().lstrip('v') + + # Split on dots and take first 3 components + parts = clean_ver.split('.') + + try: + major = int(parts[0]) if len(parts) > 0 else 0 + minor = int(parts[1]) if len(parts) > 1 else 0 + + # Handle patch version (may have additional text like "7-rc1") + patch = 0 + if len(parts) > 2: + # Extract numeric part only + patch_str = parts[2].split('-')[0].split('+')[0] + patch = int(patch_str) if patch_str.isdigit() else 0 + + return (major, minor, patch) + except (ValueError, IndexError): + return None + +def get_major_version(version_str): + """ + Extract the major version number from a version string. + + Args: + version_str: Version string (e.g., "5.24.7") + + Returns: + Major version as integer, or None if parsing fails + """ + parsed = parse_version(version_str) + return parsed[0] if parsed else None + +def find_best_stable_version_for_major(all_stable_versions, target_major): + """ + Find the highest stable version matching a specific major version. 
+ + Args: + all_stable_versions: List of version strings from stable channel + target_major: Target major version number (int) + + Returns: + Highest matching version string, or None if no match found + """ + matching_versions = [] + + for ver_str in all_stable_versions: + parsed = parse_version(ver_str) + if parsed and parsed[0] == target_major: + matching_versions.append((parsed, ver_str)) + + if not matching_versions: + return None + + # Sort by (major, minor, patch) tuple - highest last + matching_versions.sort(key=lambda x: x[0]) + + # Return the version string of the highest match + return matching_versions[-1][1] + def ensure_dir(path): run(["bash","-lc", f"mkdir -p '{path}'"], check=True) @@ -25,233 +189,1363 @@ def write_text(path, content): with open(path, "w", encoding="utf-8") as f: f.write(content) +def get_directory_size(path): + """ + Calculate the total size of all files in a directory (recursively). + + Args: + path: Directory path to calculate size for + + Returns: + Dictionary with size information: + - bytes: Total size in bytes + - human_readable: Human-readable size string (e.g., "1.5 GB") + - file_count: Number of files + """ + total_size = 0 + file_count = 0 + + try: + for dirpath, dirnames, filenames in os.walk(path): + for filename in filenames: + filepath = os.path.join(dirpath, filename) + try: + # Use lstat to not follow symlinks (avoid double-counting) + total_size += os.lstat(filepath).st_size + file_count += 1 + except (OSError, FileNotFoundError): + # Skip files we can't stat (permissions, removed during walk, etc.) 
+ pass + + # Format human-readable size + size_bytes = total_size + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if size_bytes < 1024.0 or unit == 'TB': + human_readable = f"{size_bytes:.2f} {unit}" + break + size_bytes = size_bytes / 1024.0 + + # Return original bytes value + return { + "bytes": total_size, + "human_readable": human_readable, + "file_count": file_count + } + except Exception as e: + print(f"Warning: Error calculating directory size for {path}: {e}") + return { + "bytes": 0, + "human_readable": "0 B", + "file_count": 0, + "error": str(e) + } + +def download_with_fallback(url, output_path, timeout=300): + """ + Download file with HTTP/2 fallback to HTTP/1.1 and retry logic. + + This addresses the curl (92) HTTP/2 stream errors by: + 1. Trying HTTP/2 first with retries + 2. Falling back to HTTP/1.1 if HTTP/2 consistently fails + 3. Using exponential backoff with jitter + """ + print(f"Downloading: {output_path}") + print(f"URL (redacted): {url.split('?')[0]}...") + + # HTTP version strategies to try in order + http_strategies = [ + { + "name": "HTTP/2", + "flags": ["--http2"], + "retries": 3 # Try HTTP/2 3 times before falling back + }, + { + "name": "HTTP/1.1", + "flags": ["--http1.1"], + "retries": 5 # Try HTTP/1.1 more times as fallback + } + ] + + retry_config = { + "max_retries": 5, + "base_delay": 2, + "max_delay": 30 + } + + last_error = None + + for strategy in http_strategies: + print(f"Attempting download with {strategy['name']}...") + + for attempt in range(strategy["retries"]): + try: + cmd = [ + "bash", "-lc", + " ".join([ + "curl", + "-fsSL", + *strategy["flags"], + "--connect-timeout", "30", + "--max-time", str(timeout), + "--keepalive-time", "60", + "--tcp-nodelay", + "--compressed", + "-o", f"'{output_path}'", + f"'{url}'" + ]) + ] + + # Execute with retry logic + run(cmd, check=True, retry_config=retry_config) + + # Verify download + if os.path.exists(output_path): + size = os.path.getsize(output_path) + print(f"✓ Download 
successful ({size} bytes) using {strategy['name']}") + return True + else: + raise RuntimeError(f"Download completed but file not found: {output_path}") + + except RuntimeError as e: + last_error = e + error_str = str(e) + + # Check if this is a protocol error specific to current HTTP version + if "(92)" in error_str or "http/2" in error_str.lower(): + print(f"✗ {strategy['name']} protocol error, will try fallback strategy") + break # Move to next HTTP version + elif not is_retryable_error(error_str, ""): + # Non-retryable error, fail immediately + print(f"✗ Non-retryable error: {error_str[:200]}") + raise + else: + # Retryable error, continue with current strategy + if attempt < strategy["retries"] - 1: + jitter = random.uniform(0, 1) + sleep_time = min(2 ** attempt + jitter, 30) + print(f"⚠️ Attempt {attempt + 1}/{strategy['retries']} failed, retrying in {sleep_time:.1f}s...") + time.sleep(sleep_time) + else: + print(f"✗ All {strategy['retries']} attempts with {strategy['name']} failed") + break # Try next strategy + + # All strategies exhausted + raise RuntimeError( + f"Download failed after all retry strategies:\n" + f" URL: {url.split('?')[0]}\n" + f" Last error: {last_error}" + ) + +def map_cinc_product_name(product): + """ + Map Chef product names to CINC product names for API endpoints. + CINC (Cinc Is Not Chef) uses different package names. + """ + cinc_mapping = { + "chef": "cinc", + "chef-server": "cinc-server", + "chef-workstation": "cinc-workstation", + "inspec": "cinc-auditor" + } + return cinc_mapping.get(product, product) + +def check_existing_version(scan_mode, data_repo_path, product, channel, download_site, os_name, os_ver, arch, resolved_version=None, hab_ident=None): + """ + Check if existing scan data matches the resolved version. + Returns (should_skip, reason) tuple. 
+ For native mode: checks metadata.json in native/ path + For modern mode: checks metadata.json in modern/ path + For habitat mode: checks index.json and extracts version from directory structure + """ + if not data_repo_path or not os.path.exists(data_repo_path): + return False, "No existing data repository found" + + try: + if scan_mode == "native": + # Native mode: check metadata.json + metadata_path = os.path.join( + data_repo_path, + "native", + product, + channel, + download_site, + os_name, + os_ver, + arch, + "metadata.json" + ) + + if os.path.exists(metadata_path): + with open(metadata_path, "r", encoding="utf-8") as f: + metadata = json.load(f) + existing_version = metadata.get("target", {}).get("resolved_version", "") + + print(f"Version comparison: existing='{existing_version}' vs resolved='{resolved_version}'") + if existing_version == resolved_version: + return True, f"Version {resolved_version} already scanned (found in metadata.json)" + else: + return False, f"Version mismatch: existing={existing_version}, resolved={resolved_version}" + + return False, "No existing metadata found" + + elif scan_mode == "habitat": + # Habitat mode: check for existing version directory under habitat/{product}/{channel}/{os}/{arch}/ + # Structure: habitat/{product}/{channel}/{os}/{arch}/{origin}/{name}/{version}/index.json + hab_base_path = os.path.join( + data_repo_path, + "habitat", + product, + channel, + os_name, + arch + ) + + if not os.path.exists(hab_base_path): + return False, "No existing habitat data found" + + # Parse hab_ident to get origin/name for path lookup + # Expected format: origin/name or origin/name/version/release + if hab_ident: + parts = hab_ident.split("/") + if len(parts) >= 2: + origin, name = parts[0], parts[1] + origin_name_path = os.path.join(hab_base_path, origin, name) + + if os.path.exists(origin_name_path): + # Get list of existing version directories + existing_versions = [d for d in os.listdir(origin_name_path) + if 
os.path.isdir(os.path.join(origin_name_path, d))] + + # Check if any existing version has an index.json with matching version + for version_dir in existing_versions: + index_path = os.path.join(origin_name_path, version_dir, "index.json") + if os.path.exists(index_path): + with open(index_path, "r", encoding="utf-8") as f: + index = json.load(f) + existing_ident = index.get("target", {}).get("package", {}).get("ident", "") + + # Compare full ident (origin/name/version/release) + if resolved_version and existing_ident == resolved_version: + return True, f"Habitat package {resolved_version} already scanned (found in index.json)" + + return False, "No existing habitat scan or version mismatch" + + elif scan_mode == "modern": + # Modern mode: check metadata.json (same as native but under modern/ path) + metadata_path = os.path.join( + data_repo_path, + "modern", + product, + channel, + download_site, + os_name, + os_ver, + arch, + "metadata.json" + ) + + if os.path.exists(metadata_path): + with open(metadata_path, "r", encoding="utf-8") as f: + metadata = json.load(f) + existing_version = metadata.get("target", {}).get("resolved_version", "") + + print(f"Version comparison: existing='{existing_version}' vs resolved='{resolved_version}'") + if existing_version == resolved_version: + return True, f"Version {resolved_version} already scanned (found in metadata.json)" + else: + return False, f"Version mismatch: existing={existing_version}, resolved={resolved_version}" + + return False, "No existing metadata found" + + except Exception as e: + print(f"Warning: Error checking existing version: {e}") + return False, f"Error checking existing version: {e}" + + return False, "Unknown check result" + # Inputs product = env("PRODUCT") channel = env("CHANNEL") download_site = env("DOWNLOAD_SITE", "commercial") os_name = env("OS", "ubuntu") -os_ver = env("OS_VERSION") +os_ver = env("OS_VERSION", "") arch = env("ARCH", "x86_64") +package_manager = env("PACKAGE_MANAGER", "") 
scan_mode = env("SCAN_MODE", "native") scan_root = env("SCAN_ROOT", "") resolve_ver = env("RESOLVE_VERSION", "latest") pinned_ver = env("PINNED_VERSION", "") license_id = env("LICENSE_ID", "") +base_url_override = env("BASE_URL_OVERRIDE", "") out_dir = env("OUT_DIR", "out") work_dir = env("WORK_DIR", "work") - - -# Guard: commercial downloads require a license_id (fail fast with a clear error) -if download_site == "commercial" and not license_id.strip(): - raise RuntimeError( - "Commercial download_site requires LICENSE_ID, but it was empty. " - "Fix by scoping GA_DOWNLOAD_GRYPE_LICENSE_ID to the orchestrator repo and passing it into the composite action, " - "or switch DOWNLOAD_SITE to 'community' for targets that do not require licensing." - ) +data_repo_path = env("DATA_REPO_PATH", "") +full_product_scan = env("FULL_PRODUCT_SCAN", "false").lower() in ("true", "1", "yes") +hab_ident = env("HAB_IDENT", "") +hab_channel = env("HAB_CHANNEL", "stable") +hab_origin = env("HAB_ORIGIN", "") +hab_auth_token = env("HAB_AUTH_TOKEN", "") +enable_trivy = env("ENABLE_TRIVY", "true").lower() in ("true", "1", "yes") +trivy_scanners = env("TRIVY_SCANNERS", "vuln") +trivy_severity = env("TRIVY_SEVERITY", "UNKNOWN,LOW,MEDIUM,HIGH,CRITICAL") +trivy_ignore_unfixed = env("TRIVY_IGNORE_UNFIXED", "false").lower() in ("true", "1", "yes") +trivy_timeout = env("TRIVY_TIMEOUT", "") +trivy_cache_dir = env("TRIVY_CACHE_DIR", "") ensure_dir(out_dir) ensure_dir(work_dir) -# Choose base URL -base = "https://chefdownload-commercial.chef.io" if download_site == "commercial" else "https://chefdownload-community.chef.io" +# Create scanners output directory (for native/modern mode) +if scan_mode in ["native", "modern"]: + scanners_dir = os.path.join(out_dir, "scanners") + ensure_dir(scanners_dir) -# Resolve version -resolved_version = pinned_ver -if resolve_ver == "latest" or not resolved_version: - ver_url = f"{base}/{channel}/{product}/versions/latest" - # Both commercial and community require 
license_id, but different license types - if license_id: - ver_url += f"?license_id={license_id}" +# Branch based on scan_mode +if scan_mode == "habitat": + # HABITAT MODE: Install hab package, enumerate deps, scan each separately - try: - ver_doc = http_json(ver_url) - if isinstance(ver_doc, dict): - resolved_version = ( - ver_doc.get("version") - or ver_doc.get("latest") - or ver_doc.get("artifact_version") - or ver_doc.get("value") - ) - if not resolved_version: - resolved_version = str(ver_doc) + # Guard: habitat mode requires hab_ident or hab_origin + if not hab_ident.strip() and not hab_origin.strip(): + raise RuntimeError( + "Habitat scan_mode requires HAB_IDENT (e.g., 'core/chef-infra-client') or HAB_ORIGIN (e.g., 'chef'). " + "Set one in the target configuration." + ) + + # Ensure hab CLI is available + run(["bash", "-lc", "command -v hab >/dev/null 2>&1 || (curl -fsSL https://raw.githubusercontent.com/habitat-sh/habitat/master/components/hab/install.sh | sudo bash)"], check=True) + + # Accept the Chef License for Habitat (CI environment - create marker file for root) + run(["bash", "-lc", "sudo mkdir -p /hab/accepted-licenses && sudo touch /hab/accepted-licenses/habitat"], check=True) + + # Determine package identifier + pkg_to_install = hab_ident if hab_ident else f"{hab_origin}/{product}" + + # Install the package (with channel if specified) - requires sudo for /hab/pkgs/ access + # Note: Chef packages now require HAB_AUTH_TOKEN even for stable channel + install_cmd = f"sudo hab pkg install {pkg_to_install}" + if hab_channel and hab_channel != "stable": + install_cmd += f" --channel {hab_channel}" + + # Set HAB_AUTH_TOKEN if provided (required for protected packages including chef/* in stable) + if hab_auth_token: + if hab_channel and hab_channel != "stable": + install_cmd = f"sudo HAB_AUTH_TOKEN={hab_auth_token} hab pkg install {pkg_to_install} --channel {hab_channel}" else: - resolved_version = str(ver_doc) - except RuntimeError as e: - error_msg = 
str(e) - if "403" in error_msg or "401" in error_msg or "Missing license_id" in error_msg or "License Id is not valid" in error_msg or "Only Free license" in error_msg: - site_type = "commercial" if download_site == "commercial" else "community" - license_secret = "GA_DOWNLOAD_GRYPE_LICENSE_ID" if download_site == "commercial" else "GA_DOWNLOAD_GRYPE_LICENSE_ID_FREE" + install_cmd = f"sudo HAB_AUTH_TOKEN={hab_auth_token} hab pkg install {pkg_to_install}" + + run(["bash", "-lc", install_cmd], check=True) + + # Get installed package details + rc, out, err = run(["bash", "-lc", f"sudo hab pkg path {pkg_to_install}"], check=True) + installed_path = out.strip() + + # Parse origin/name/version/release from path + # Expected: /hab/pkgs//// or C:\hab\pkgs\\\\ + path_parts = installed_path.replace("\\", "/").split("/") + if len(path_parts) >= 4: + origin, name, version, release = path_parts[-4:] + resolved_version = f"{origin}/{name}/{version}/{release}" + else: + raise RuntimeError(f"Unable to parse habitat package path: {installed_path}") + + # Check if this version is already scanned (unless full_product_scan is enabled) + if not full_product_scan: + should_skip, skip_reason = check_existing_version( + scan_mode="habitat", + data_repo_path=data_repo_path, + product=product, + channel=hab_channel, + download_site="", # Not used for habitat + os_name=os_name, + os_ver=os_ver, + arch=arch, + resolved_version=resolved_version, + hab_ident=hab_ident + ) + + if should_skip: + print(f"SKIP: {skip_reason}") + print(f"::debug::Skipping habitat scan for {product} {hab_channel}: {skip_reason}") + # Write minimal outputs for workflow to continue + write_text(os.path.join(out_dir, "_resolved_version.txt"), resolved_version) + write_text(os.path.join(out_dir, "_download_url_redacted.txt"), f"habitat://{resolved_version}@{hab_channel}") + write_text(os.path.join(out_dir, "_skipped.txt"), "true") + exit(0) + else: + print(f"INFO: Full product scan enabled - bypassing version check") + + 
+ # Enumerate direct dependencies (from DEPS file) + main_ident = f"{origin}/{name}/{version}/{release}" + deps_file = f"{installed_path}/DEPS" + rc, out, err = run(["bash", "-lc", f"sudo cat {deps_file} 2>/dev/null || echo ''"], check=False) + if rc == 0 and out.strip(): + direct_dep_idents = [line.strip() for line in out.split("\n") if line.strip() and "/" in line] + else: + direct_dep_idents = [] + + # Enumerate transitive dependencies (full tree - includes direct deps per Habitat definition) + rc, out, err = run(["bash", "-lc", f"sudo hab pkg dependencies -t {pkg_to_install}"], check=True) + transitive_dep_idents = [line.strip() for line in out.split("\n") if line.strip() and "/" in line and line.strip() != main_ident] + + # Build combined list for scanning: main package + direct deps + all transitive deps + # Note: Direct deps will be scanned twice (once in direct-deps/, once in transitive-deps/) + # Tag each with its type for proper directory placement + deps_to_scan = [ + {"ident": main_ident, "type": "main"}, + ] + for ident in direct_dep_idents: + deps_to_scan.append({"ident": ident, "type": "direct"}) + for ident in transitive_dep_idents: + deps_to_scan.append({"ident": ident, "type": "transitive"}) + + # Ensure grype (may be restored from cache) + grype_version = os.getenv("GRYPE_VERSION", "0.109.0") + if os.path.isfile("/usr/local/bin/grype"): + # Ensure executable permissions (cache may not preserve them) + run(["chmod", "+x", "/usr/local/bin/grype"], check=False) + print("✓ Grype found in cache") + else: + rc, _, _ = run(["bash", "-lc", "command -v grype >/dev/null 2>&1"], check=False) + if rc == 0: + print("✓ Grype already installed") + else: + # Install with retry logic for GitHub releases API + print(f"Installing Grype {grype_version}...") + install_cmd = f"curl -sSfL https://raw.githubusercontent.com/anchore/grype/main/install.sh | sh -s -- -b /usr/local/bin v{grype_version}" + run(["bash", "-lc", install_cmd], check=True, 
retry_config={"max_retries": 5, "base_delay": 2, "max_delay": 30}) + + # Create main package directory structure: {origin}/{name}/{version}/ + main_pkg_dir = os.path.join(out_dir, origin, name, version) + ensure_dir(main_pkg_dir) + + # Log what we're scanning + print(f"Habitat scan: {main_ident}") + print(f"Channel: {hab_channel}") + print(f"Total dependencies to scan: {len(deps_to_scan)}") + print(f" - Main package: 1") + print(f" - Direct dependencies: {len(direct_dep_idents)}") + print(f" - Transitive dependencies: {len(transitive_dep_idents)}") + + # Scan each dependency separately + dep_results = [] + + for dep_info in deps_to_scan: + dep_ident = dep_info["ident"] + dep_type = dep_info["type"] + # Parse dependency ident: origin/name/version/release + dep_parts = dep_ident.split("/") + if len(dep_parts) != 4: + print(f"Skipping malformed dependency ident: {dep_ident}") + continue + + dep_origin, dep_name, dep_version, dep_release = dep_parts + + # Determine scan path + if os_name == "windows": + dep_scan_path = f"C:\\hab\\pkgs\\{dep_origin}\\{dep_name}\\{dep_version}\\{dep_release}" + else: + dep_scan_path = f"/hab/pkgs/{dep_origin}/{dep_name}/{dep_version}/{dep_release}" + + # Determine output location based on dependency type + if dep_type == "main": + # Main package files go directly in main_pkg_dir + dep_out_dir = main_pkg_dir + elif dep_type == "direct": + # Direct dependencies go under direct-deps/{origin}/{name}/{version}/ + dep_out_dir = os.path.join(main_pkg_dir, "direct-deps", dep_origin, dep_name, dep_version) + else: # transitive + # Transitive dependencies go under transitive-deps/{origin}/{name}/{version}/ + dep_out_dir = os.path.join(main_pkg_dir, "transitive-deps", dep_origin, dep_name, dep_version) + + ensure_dir(dep_out_dir) + + dep_json_path = os.path.join(dep_out_dir, f"{dep_release}.json") + dep_metadata_path = os.path.join(dep_out_dir, f"{dep_release}.metadata.json") + + # Calculate installed size for this Habitat package + dep_size = 
get_directory_size(dep_scan_path) + + # Run grype scan + try: + run(["bash", "-lc", f"grype dir:'{dep_scan_path}' --name '{dep_ident}' --output json > '{dep_json_path}'"], check=True) - if "Missing license_id" in error_msg: - raise RuntimeError( - f"LICENSE ERROR ({site_type}): Missing license_id parameter.\n" - f" Download site: {download_site}\n" - f" Required secret: {license_secret}\n" - f" Solution: Ensure the {license_secret} secret is set in the orchestrator repository" - ) from e - elif "License Id is not valid" in error_msg or "403" in error_msg: + # Parse and pretty-print + dep_doc = json.load(open(dep_json_path, "r", encoding="utf-8")) + json.dump(dep_doc, open(dep_json_path, "w", encoding="utf-8"), indent=2) + + # Count vulnerabilities by severity + dep_matches = dep_doc.get("matches", []) or [] + buckets = ["Critical", "High", "Medium", "Low", "Negligible", "Unknown"] + dep_sev_counts = {k: 0 for k in buckets} + + for m in dep_matches: + sev = (m.get("vulnerability", {}) or {}).get("severity", "Unknown") or "Unknown" + sev_norm = sev.strip().title() + if sev_norm in ("Negligible", "Minimal"): + sev_norm = "Negligible" + if sev_norm not in dep_sev_counts: + sev_norm = "Unknown" + dep_sev_counts[sev_norm] += 1 + + # Create per-dependency metadata + dep_metadata = { + "schema_version": "1.0", + "dependency": { + "ident": dep_ident, + "origin": dep_origin, + "name": dep_name, + "version": dep_version, + "release": dep_release, + "scan_path": dep_scan_path, + "size": { + "installed_bytes": dep_size["bytes"], + "installed_human_readable": dep_size["human_readable"], + "file_count": dep_size["file_count"] + } + }, + "scan": { + "timestamp_utc": now_utc(), + "matches_total": len(dep_matches), + "severity_counts": dep_sev_counts + } + } + json.dump(dep_metadata, open(dep_metadata_path, "w", encoding="utf-8"), indent=2) + + # Track for rollup + # Build json_path based on dependency type + if dep_type == "main": + json_rel_path = f"{dep_release}.json" + elif 
dep_type == "direct": + json_rel_path = f"direct-deps/{dep_origin}/{dep_name}/{dep_version}/{dep_release}.json" + else: # transitive + json_rel_path = f"transitive-deps/{dep_origin}/{dep_name}/{dep_version}/{dep_release}.json" + + dep_results.append({ + "ident": dep_ident, + "origin": dep_origin, + "name": dep_name, + "version": dep_version, + "release": dep_release, + "matches_total": len(dep_matches), + "severity_counts": dep_sev_counts, + "json_path": json_rel_path, + "dependency_type": dep_type, + "size": { + "installed_bytes": dep_size["bytes"], + "installed_human_readable": dep_size["human_readable"], + "file_count": dep_size["file_count"] + } + }) + + print(f"Scanned dependency: {dep_ident} - {dep_size['human_readable']} ({len(dep_matches)} matches)") + + except Exception as e: + print(f"Failed to scan dependency {dep_ident}: {e}") + # Continue with other dependencies + + # Create index.json rollup + # Grype version + DB status + grype_version = "" + rc, out, err = run(["bash", "-lc", "grype version"], check=False) + if rc == 0: + m = re.search(r"Version:\s*([0-9]+\.[0-9]+\.[0-9]+(?:[-+.\w]+)?)", out) + if m: + grype_version = m.group(1) + + db_info = {} + rc, out, err = run(["bash", "-lc", "grype db status -o json"], check=False) + if rc == 0 and out.startswith("{"): + try: + dbj = json.loads(out) + db_info["status_raw"] = dbj + for k in ("built", "builtAt", "lastBuilt", "updated", "updatedAt", "lastUpdated"): + if k in dbj: + db_info["built_utc"] = dbj.get(k) + break + for k in ("schemaVersion", "schema", "dbSchemaVersion"): + if k in dbj: + db_info["schema"] = dbj.get(k) + break + for k in ("checksum", "hash", "etag"): + if k in dbj: + db_info["checksum"] = dbj.get(k) + break + except Exception: + db_info["status_raw_text"] = out + + # Calculate aggregate counts + total_matches = sum(d["matches_total"] for d in dep_results) + aggregate_counts = {k: 0 for k in ["Critical", "High", "Medium", "Low", "Negligible", "Unknown"]} + for d in dep_results: + for 
sev, count in d["severity_counts"].items(): + aggregate_counts[sev] += count + + # Calculate aggregate size (total disk footprint of all dependencies) + total_size_bytes = sum(d.get("size", {}).get("installed_bytes", 0) for d in dep_results) + total_file_count = sum(d.get("size", {}).get("file_count", 0) for d in dep_results) + + # Format aggregate size + size_bytes = total_size_bytes + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if size_bytes < 1024.0 or unit == 'TB': + total_size_human = f"{size_bytes:.2f} {unit}" + break + size_bytes = size_bytes / 1024.0 + + # GitHub Actions context + gha_run_id = env("GITHUB_RUN_ID", "") + repo = env("GITHUB_REPOSITORY", "") + workflow = env("GITHUB_WORKFLOW", "") + sha = env("GITHUB_SHA", "") + + index = { + "schema_version": "1.0", + "snapshot": { + "timestamp_utc": now_utc(), + "run_id": f"gha-{gha_run_id}" if gha_run_id else "", + "pipeline": {"repo": repo, "workflow": workflow, "git_sha": sha} + }, + "target": { + "product": product, + "channel": hab_channel, + "package": { + "ident": main_ident, + "origin": origin, + "name": name, + "version": version, + "release": release + }, + "size": { + "total_installed_bytes": total_size_bytes, + "total_installed_human_readable": total_size_human, + "total_file_count": total_file_count + } + }, + "environment": { + "runner": env("RUNNER_OS", ""), + "os": os_name, + "os_version": os_ver, + "arch": arch + }, + "scan": { + "mode": "habitat", + "grype": {"version": grype_version, "db": db_info} + }, + "summary": { + "dependencies_scanned": len(dep_results), + "total_matches": total_matches, + "aggregate_severity_counts": aggregate_counts + }, + "dependencies": dep_results + } + + # Write index.json in the main package directory + index_path = os.path.join(main_pkg_dir, "index.json") + json.dump(index, open(index_path, "w", encoding="utf-8"), indent=2) + + # Write resolved_version for workflow outputs (keep in out_dir root for workflow to find) + write_text(os.path.join(out_dir, 
"_resolved_version.txt"), resolved_version) + write_text(os.path.join(out_dir, "_download_url_redacted.txt"), f"habitat://{main_ident}@{hab_channel}") + + print(f"Wrote habitat index: {index_path}") + print(f"Scanned {len(dep_results)} dependencies with {total_matches} total matches") + print(f"Total installed size: {total_size_human} ({total_file_count:,} files)") + print(f"::notice::✓ Habitat scan completed for {product} {hab_channel}: {main_ident} with {len(dep_results)} dependencies ({total_matches} total vulnerabilities, {total_size_human} disk footprint)") + +else: + # NATIVE/MODERN MODE: Download + extract + scan logic + # (modern mode is identical to native but uses /modern/ path for next-gen products) + + # Guard: commercial downloads require a license_id (fail fast with a clear error) + # CINC downloads don't require license_id + if download_site == "commercial" and not license_id.strip(): + raise RuntimeError( + "Commercial download_site requires LICENSE_ID, but it was empty. " + "Fix by scoping GA_DOWNLOAD_GRYPE_LICENSE_ID to the orchestrator repo and passing it into the composite action, " + "or switch DOWNLOAD_SITE to 'community' for targets that do not require licensing." + ) + + # Map product name for CINC downloads (chef -> cinc, inspec -> cinc-auditor, etc.) 
+ api_product = map_cinc_product_name(product) if download_site == "cinc" else product + + # Choose base URL (support override for alternative download sites) + if base_url_override: + base = base_url_override.rstrip("/") + print(f"Using base URL override: {base}") + elif download_site == "cinc": + base = "https://omnitruck.cinc.sh" + else: + base = "https://chefdownload-commercial.chef.io" if download_site == "commercial" else "https://chefdownload-community.chef.io" + + # Resolve version + resolved_version = pinned_ver + if resolve_ver == "latest" or not resolved_version: + # For stable channel, implement major version matching logic + # to ensure we compare stable against the same major version as current + if channel == "stable": + try: + print("🔍 Major version matching enabled for stable channel") + + # Step 1: Get the latest version from current channel + current_ver_url = f"{base}/current/{api_product}/versions/latest" + if license_id and download_site != "cinc": + current_ver_url += f"?license_id={license_id}" + + print(f"Fetching current channel latest: {current_ver_url.split('?')[0]}{'?license_id=***' if license_id and download_site != 'cinc' else ''}") + current_ver_doc = http_json(current_ver_url) + + current_version = None + if isinstance(current_ver_doc, dict): + current_version = ( + current_ver_doc.get("version") + or current_ver_doc.get("latest") + or current_ver_doc.get("artifact_version") + or current_ver_doc.get("value") + ) + if not current_version: + current_version = str(current_ver_doc) + else: + current_version = str(current_ver_doc).strip().strip('"') + + print(f"Current channel latest version: {current_version}") + + # Step 2: Extract major version from current + current_major = get_major_version(current_version) + + if current_major is not None: + print(f"Current channel major version: {current_major}") + + # Step 3: Get all stable versions + stable_all_url = f"{base}/stable/{api_product}/versions/all" + if license_id and download_site 
!= "cinc": + stable_all_url += f"?license_id={license_id}" + + print(f"Fetching all stable versions: {stable_all_url.split('?')[0]}{'?license_id=***' if license_id and download_site != 'cinc' else ''}") + stable_all_versions = http_json(stable_all_url) + + if isinstance(stable_all_versions, list) and stable_all_versions: + print(f"Found {len(stable_all_versions)} stable versions") + + # Step 4: Find the best matching version in stable + best_stable = find_best_stable_version_for_major(stable_all_versions, current_major) + + if best_stable: + print(f"✅ Best stable version matching major {current_major}: {best_stable}") + resolved_version = best_stable + else: + print(f"⚠️ No stable version found matching major {current_major}, falling back to /latest") + # Fall back to regular latest logic below + resolved_version = None + else: + print(f"⚠️ Could not fetch stable versions list, falling back to /latest") + resolved_version = None + else: + print(f"⚠️ Could not parse major version from current ({current_version}), falling back to /latest") + resolved_version = None + + except Exception as e: + print(f"⚠️ Major version matching failed: {e}") + print(f" Falling back to standard /latest endpoint") + resolved_version = None + + # Fall back to standard /latest logic if major version matching was skipped or failed + if not resolved_version: + ver_url = f"{base}/{channel}/{api_product}/versions/latest" + # Commercial and community require license_id, but CINC does not + if license_id and download_site != "cinc": + ver_url += f"?license_id={license_id}" + + print(f"Fetching latest version from: {ver_url.split('?')[0]}{'?license_id=***' if license_id and download_site != 'cinc' else ''}") + try: + ver_doc = http_json(ver_url) + print(f"API response type: {type(ver_doc).__name__}") + print(f"API response value: {ver_doc}") + if isinstance(ver_doc, dict): + resolved_version = ( + ver_doc.get("version") + or ver_doc.get("latest") + or ver_doc.get("artifact_version") + or 
ver_doc.get("value") + ) + if not resolved_version: + resolved_version = str(ver_doc) + else: + resolved_version = str(ver_doc).strip().strip('"') + except RuntimeError as e: + error_msg = str(e) + # CINC doesn't require licenses, so skip license-specific error handling + if download_site != "cinc" and ("403" in error_msg or "401" in error_msg or "Missing license_id" in error_msg or "License Id is not valid" in error_msg or "Only Free license" in error_msg): + site_type = "commercial" if download_site == "commercial" else "community" + license_secret = "GA_DOWNLOAD_GRYPE_LICENSE_ID" if download_site == "commercial" else "GA_DOWNLOAD_GRYPE_LICENSE_ID_FREE" + + if "Missing license_id" in error_msg: + raise RuntimeError( + f"LICENSE ERROR ({site_type}): Missing license_id parameter.\n" + f" Download site: {download_site}\n" + f" Required secret: {license_secret}\n" + f" Solution: Ensure the {license_secret} secret is set in the orchestrator repository" + ) from e + elif "License Id is not valid" in error_msg or "403" in error_msg: + raise RuntimeError( + f"LICENSE ERROR ({site_type}): Invalid or expired license_id.\n" + f" Download site: {download_site}\n" + f" Product: {product}, Channel: {channel}\n" + f" Secret used: {license_secret}\n" + f" Solution: Update the {license_secret} secret with a valid {'commercial' if download_site == 'commercial' else 'Free'} license" + ) from e + elif "Only Free license" in error_msg: + raise RuntimeError( + f"LICENSE ERROR (community): Wrong license type provided.\n" + f" Download site: community\n" + f" Error: Community downloads require a 'Free' license, but a commercial license was provided\n" + f" Solution: Update GA_DOWNLOAD_GRYPE_LICENSE_ID_FREE secret with a valid Free license (not commercial)" + ) from e + else: + raise RuntimeError( + f"LICENSE ERROR ({site_type}): Authentication failed.\n" + f" Download site: {download_site}\n" + f" Product: {product}, Channel: {channel}\n" + f" Secret used: {license_secret}\n" + f" 
Solution: Verify the {license_secret} secret contains a valid license for {download_site} downloads" + ) from e + raise + + print(f"Resolved version: '{resolved_version}' (type: {type(resolved_version).__name__})") + + # Check if this version is already scanned (unless full_product_scan is enabled) + if not full_product_scan: + should_skip, skip_reason = check_existing_version( + scan_mode=scan_mode, + data_repo_path=data_repo_path, + product=product, + channel=channel, + download_site=download_site, + os_name=os_name, + os_ver=os_ver, + arch=arch, + resolved_version=resolved_version, + hab_ident=None + ) + + if should_skip: + print(f"SKIP: {skip_reason}") + print(f"::debug::Skipping {scan_mode} scan for {product} {channel} ({download_site}): {skip_reason}") + # Write minimal outputs for workflow to continue + write_text(os.path.join(out_dir, "_resolved_version.txt"), resolved_version) + # Construct redacted URL for output + if download_site == "cinc": + # For CINC, construct a descriptive URL (actual URL would require fetching packages endpoint) + download_url_redacted = f"{base}/{channel}/{api_product}/packages (Platform: {os_name}/{os_ver}/{arch})" + else: + q_params = [("p", os_name), ("m", arch), ("v", resolved_version)] + if os_ver: + q_params.insert(1, ("pv", os_ver)) + if package_manager: + q_params.insert(2, ("pm", package_manager)) + parts = urlsplit(f"{base}/{channel}/{api_product}/download") + download_url_redacted = urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(q_params, doseq=True), parts.fragment)) + write_text(os.path.join(out_dir, "_download_url_redacted.txt"), download_url_redacted) + write_text(os.path.join(out_dir, "_skipped.txt"), "true") + exit(0) + else: + print(f"INFO: Full product scan enabled - bypassing version check") + + + # Construct download URL + # Support three patterns: + # 1. Standard Chef: ?p=ubuntu&pv=24.04&m=x86_64&v=latest (commercial/community) + # 2. 
Universal binaries: ?p=linux&pm=deb&m=x86_64&v=latest (chef-ice - no pv parameter) + # 3. CINC: Fetch from /packages endpoint and extract direct .deb URL + + if download_site == "cinc": + # CINC provides direct package URLs via /packages endpoint + packages_url = f"{base}/{channel}/{api_product}/packages" + print(f"Fetching CINC package info from: {packages_url}") + + try: + packages_doc = http_json(packages_url) + # Navigate: packages_doc[os][os_version][arch]["url"] + if os_name in packages_doc and os_ver in packages_doc[os_name] and arch in packages_doc[os_name][os_ver]: + pkg_info = packages_doc[os_name][os_ver][arch] + download_url = pkg_info.get("url", "") + if not download_url: + raise RuntimeError(f"No URL found in CINC package info for {os_name}/{os_ver}/{arch}") + + # Verify version matches + pkg_version = pkg_info.get("version", "") + if pkg_version and pkg_version != resolved_version: + print(f"Warning: Package version {pkg_version} differs from resolved version {resolved_version}") + else: raise RuntimeError( - f"LICENSE ERROR ({site_type}): Invalid or expired license_id.\n" - f" Download site: {download_site}\n" - f" Product: {product}, Channel: {channel}\n" - f" Secret used: {license_secret}\n" - f" Solution: Update the {license_secret} secret with a valid {'commercial' if download_site == 'commercial' else 'Free'} license" - ) from e - elif "Only Free license" in error_msg: + f"CINC package not found for platform combination.\n" + f" Product: {api_product} (Chef: {product})\n" + f" OS: {os_name} {os_ver}, Arch: {arch}\n" + f" Available in packages: {list(packages_doc.keys())}" + ) + except RuntimeError as e: + if "CINC package not found" in str(e): + raise + raise RuntimeError( + f"Failed to fetch CINC package information.\n" + f" Product: {api_product} (Chef: {product})\n" + f" URL: {packages_url}\n" + f" Error: {str(e)}" + ) from e + + download_url_redacted = download_url # CINC URLs don't contain secrets + else: + # Chef commercial/community 
pattern + download_url = f"{base}/{channel}/{api_product}/download?p={os_name}" + if os_ver: # Optional for universal binaries + download_url += f"&pv={os_ver}" + download_url += f"&m={arch}" + if package_manager: # Required for universal binaries like chef-ice + download_url += f"&pm={package_manager}" + download_url += f"&v={resolved_version}" + if license_id: + download_url += f"&license_id={license_id}" + + # Redact license_id (robust URL parsing) + parts = urlsplit(download_url) + q = [(k,v) for (k,v) in parse_qsl(parts.query, keep_blank_values=True) if k != "license_id"] + download_url_redacted = urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(q, doseq=True), parts.fragment)) + + # Persist small values for action outputs + write_text(os.path.join(out_dir, "_resolved_version.txt"), resolved_version) + write_text(os.path.join(out_dir, "_download_url_redacted.txt"), download_url_redacted) + + # Log what we're downloading + print(f"Downloading {product} {channel} version {resolved_version}") + print(f"Download URL: {download_url_redacted}") + print(f"Target: {os_name}{'/' + os_ver if os_ver else ''}/{arch}{'/' + package_manager if package_manager else ''}") + + # Download package with resilient retry logic + pkg_path = os.path.join(work_dir, "package_downloaded.deb") + try: + # Use new download_with_fallback function with HTTP/2 → HTTP/1.1 fallback + download_with_fallback(download_url, pkg_path, timeout=300) + print(f"Downloaded package: {os.path.getsize(pkg_path)} bytes") + except RuntimeError as e: + if "500" in str(e): + raise RuntimeError( + f"DOWNLOAD ERROR: Server error (500) when downloading {product}.\n" + f" Product: {product} v{resolved_version}\n" + f" Channel: {channel}, OS: {os_name} {os_ver}, Arch: {arch}\n" + f" Package manager: {package_manager or 'N/A'}\n" + f" Download site: {download_site}\n" + f" This may indicate:\n" + f" 1. Channel '{channel}' doesn't exist for this product\n" + f" 2. 
Server-side error with Chef downloads infrastructure\n" + f" Solution: Verify the channel is available for this product, or try again later" + ) from e + elif "403" in str(e) or "401" in str(e): + if download_site == "cinc": raise RuntimeError( - f"LICENSE ERROR (community): Wrong license type provided.\n" - f" Download site: community\n" - f" Error: Community downloads require a 'Free' license, but a commercial license was provided\n" - f" Solution: Update GA_DOWNLOAD_GRYPE_LICENSE_ID_FREE secret with a valid Free license (not commercial)" + f"DOWNLOAD ERROR (CINC): Failed to download package.\n" + f" Product: {product} ({api_product}) v{resolved_version}\n" + f" Channel: {channel}, OS: {os_name} {os_ver}\n" + f" This may indicate:\n" + f" 1. Package not available for this OS/version combination\n" + f" 2. Version {resolved_version} doesn't exist in {channel} channel\n" + f" Solution: Verify that the product/version/platform combination is valid" ) from e else: + site_type = "commercial" if download_site == "commercial" else "community" + license_secret = "GA_DOWNLOAD_GRYPE_LICENSE_ID" if download_site == "commercial" else "GA_DOWNLOAD_GRYPE_LICENSE_ID_FREE" raise RuntimeError( - f"LICENSE ERROR ({site_type}): Authentication failed.\n" + f"DOWNLOAD ERROR ({site_type}): Failed to download package.\n" + f" Product: {product} v{resolved_version}\n" + f" Channel: {channel}, OS: {os_name} {os_ver}\n" f" Download site: {download_site}\n" - f" Product: {product}, Channel: {channel}\n" - f" Secret used: {license_secret}\n" - f" Solution: Verify the {license_secret} secret contains a valid license for {download_site} downloads" + f" This may indicate:\n" + f" 1. Invalid or expired {license_secret} secret\n" + f" 2. Package not available for this OS/version combination\n" + f" 3. 
Version {resolved_version} doesn't exist in {channel} channel\n" + f" Solution: Verify license and that the product/version/platform combination is valid" ) from e raise -# Construct download URL -download_url = f"{base}/{channel}/{product}/download?p={os_name}&pv={os_ver}&m={arch}&v={resolved_version}" -if license_id: - download_url += f"&license_id={license_id}" - -# Redact license_id (robust URL parsing) -parts = urlsplit(download_url) -q = [(k,v) for (k,v) in parse_qsl(parts.query, keep_blank_values=True) if k != "license_id"] -download_url_redacted = urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(q, doseq=True), parts.fragment)) - -# Persist small values for action outputs -write_text(os.path.join(out_dir, "_resolved_version.txt"), resolved_version) -write_text(os.path.join(out_dir, "_download_url_redacted.txt"), download_url_redacted) - -# Download package -pkg_path = os.path.join(work_dir, "package_downloaded.deb") -try: - run(["bash","-lc", f"curl -fsSL -o '{pkg_path}' '{download_url}'"], check=True) -except RuntimeError as e: - if "403" in str(e) or "401" in str(e): - site_type = "commercial" if download_site == "commercial" else "community" - license_secret = "GA_DOWNLOAD_GRYPE_LICENSE_ID" if download_site == "commercial" else "GA_DOWNLOAD_GRYPE_LICENSE_ID_FREE" + # Validate downloaded file + if not os.path.exists(pkg_path): + raise RuntimeError(f"Download failed: package file not found at {pkg_path}") + + file_size = os.path.getsize(pkg_path) + if file_size < 1024: # Less than 1KB is likely an error page or empty file raise RuntimeError( - f"DOWNLOAD ERROR ({site_type}): Failed to download package.\n" + f"DOWNLOAD ERROR: Downloaded file is suspiciously small ({file_size} bytes).\n" f" Product: {product} v{resolved_version}\n" - f" Channel: {channel}, OS: {os_name} {os_ver}\n" - f" Download site: {download_site}\n" - f" This may indicate:\n" - f" 1. Invalid or expired {license_secret} secret\n" - f" 2. 
Package not available for this OS/version combination\n" - f" 3. Version {resolved_version} doesn't exist in {channel} channel\n" - f" Solution: Verify license and that the product/version/platform combination is valid" - ) from e - raise - -# Extract deterministically (pilot assumes Ubuntu .deb) -extract_dir = os.path.join(work_dir, "extracted") -run(["bash","-lc", f"rm -rf '{extract_dir}' && mkdir -p '{extract_dir}'"], check=True) -run(["bash","-lc", f"dpkg-deb -x '{pkg_path}' '{extract_dir}'"], check=True) - -# Ensure grype -run(["bash","-lc", "command -v grype >/dev/null 2>&1 || (curl -sSfL https://get.anchore.io/grype | sh -s -- -b /usr/local/bin)"], check=True) - -# Run grype scan to JSON (do not print findings to stdout) -latest_json_path = os.path.join(out_dir, "latest.json") -run(["bash","-lc", f"grype dir:'{extract_dir}' --name '{product}' --output json > '{latest_json_path}'"], check=True) - -# Parse counts and rewrite with pretty formatting -doc = json.load(open(latest_json_path, "r", encoding="utf-8")) -json.dump(doc, open(latest_json_path, "w", encoding="utf-8"), indent=2) -doc = json.load(open(latest_json_path, "r", encoding="utf-8")) -matches = doc.get("matches", []) or [] - -buckets = ["Critical","High","Medium","Low","Negligible","Unknown"] -sev_counts = {k: 0 for k in buckets} - -for m in matches: - sev = (m.get("vulnerability", {}) or {}).get("severity", "Unknown") or "Unknown" - sev_norm = sev.strip().title() - if sev_norm in ("Negligible","Minimal"): - sev_norm = "Negligible" - if sev_norm not in sev_counts: - sev_norm = "Unknown" - sev_counts[sev_norm] += 1 - -# Grype version + DB status (best effort) -grype_version = "" -rc, out, err = run(["bash","-lc", "grype version"], check=False) -if rc == 0: - m = re.search(r"Version:\s*([0-9]+\.[0-9]+\.[0-9]+(?:[-+.\w]+)?)", out) - if m: - grype_version = m.group(1) - -db_info = {} -rc, out, err = run(["bash","-lc", "grype db status -o json"], check=False) -if rc == 0 and out.startswith("{"): + f" 
Channel: {channel}, Download site: {download_site}\n" + f" This indicates an incomplete or corrupted download.\n" + f" Solution: Check network connectivity and retry. If issue persists, the package may not exist for this platform." + ) + + # Verify it's a valid debian package by checking for debian-binary member + rc, out, err = run(["bash","-lc", f"ar t '{pkg_path}' 2>/dev/null | grep -q 'debian-binary' && echo 'valid' || echo 'invalid'"], check=False) + if out.strip() != "valid": + raise RuntimeError( + f"DOWNLOAD ERROR: Downloaded file is not a valid Debian package.\n" + f" Product: {product} v{resolved_version}\n" + f" Channel: {channel}, Download site: {download_site}\n" + f" File size: {file_size} bytes\n" + f" This indicates a corrupted download or an error page was returned instead of the package.\n" + f" Solution: Retry the download. If issue persists, check if the package exists for this platform." + ) + + print(f"Downloaded package: {file_size} bytes") + + # Extract deterministically (pilot assumes Ubuntu .deb) + extract_dir = os.path.join(work_dir, "extracted") + run(["bash","-lc", f"rm -rf '{extract_dir}' && mkdir -p '{extract_dir}'"], check=True) + try: - dbj = json.loads(out) - db_info["status_raw"] = dbj - for k in ("built","builtAt","lastBuilt","updated","updatedAt","lastUpdated"): - if k in dbj: - db_info["built_utc"] = dbj.get(k) - break - for k in ("schemaVersion","schema","dbSchemaVersion"): - if k in dbj: - db_info["schema"] = dbj.get(k) - break - for k in ("checksum","hash","etag"): - if k in dbj: - db_info["checksum"] = dbj.get(k) - break - except Exception: - db_info["status_raw_text"] = out -else: - rc2, out2, err2 = run(["bash","-lc","grype db status"], check=False) - if rc2 == 0: - db_info["status_raw_text"] = out2 - m = re.search(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", out2) + run(["bash","-lc", f"dpkg-deb -x '{pkg_path}' '{extract_dir}'"], check=True) + except RuntimeError as e: + raise RuntimeError( + f"EXTRACTION ERROR: Failed to 
extract Debian package.\n" + f" Product: {product} v{resolved_version}\n" + f" Channel: {channel}, Download site: {download_site}\n" + f" File size: {file_size} bytes\n" + f" Error: {str(e)}\n" + f" This indicates a corrupted download or malformed package.\n" + f" Solution: The download will be retried on the next run. If issue persists, report to Chef support." + ) from e + + # Handle nested bundle extraction for migration packages (e.g., chef-ice) + # These packages contain a hab/migration/bundle/*.tar.gz with the actual software + bundle_glob = os.path.join(extract_dir, "hab", "migration", "bundle", "*.tar.gz") + rc, bundle_files, _ = run(["bash", "-lc", f"ls {bundle_glob} 2>/dev/null || true"], check=False) + if bundle_files.strip(): + bundle_tarball = bundle_files.strip().split('\n')[0] # Take first match + print(f"Detected migration bundle package: {os.path.basename(bundle_tarball)}") + print(f"Extracting nested Habitat package for scanning...") + + # Extract the bundle tarball into the extract_dir (will create hab/ structure) + try: + run(["bash", "-lc", f"tar -xzf '{bundle_tarball}' -C '{extract_dir}'"], check=True) + print(f"✓ Successfully extracted nested bundle") + except RuntimeError as e: + raise RuntimeError( + f"EXTRACTION ERROR: Failed to extract nested migration bundle.\n" + f" Product: {product} v{resolved_version}\n" + f" Bundle: {os.path.basename(bundle_tarball)}\n" + f" This is a migration package (e.g., chef-ice) with nested Habitat content.\n" + f" Error: {str(e)}" + ) from e + + # Calculate installed size (disk footprint after extraction) + print(f"Calculating installed size...") + installed_size = get_directory_size(extract_dir) + print(f"Installed size: {installed_size['human_readable']} ({installed_size['file_count']} files)") + + # Ensure grype (may be restored from cache) + grype_version = os.getenv("GRYPE_VERSION", "0.109.0") + if os.path.isfile("/usr/local/bin/grype"): + # Ensure executable permissions (cache may not preserve them) + 
run(["chmod", "+x", "/usr/local/bin/grype"], check=False) + print("✓ Grype found in cache") + else: + rc, _, _ = run(["bash","-lc", "command -v grype >/dev/null 2>&1"], check=False) + if rc == 0: + print("✓ Grype already installed") + else: + # Install with retry logic for GitHub releases API + print(f"Installing Grype {grype_version}...") + install_cmd = f"curl -sSfL https://raw.githubusercontent.com/anchore/grype/main/install.sh | sh -s -- -b /usr/local/bin v{grype_version}" + run(["bash","-lc", install_cmd], check=True, retry_config={"max_retries": 5, "base_delay": 2, "max_delay": 30}) + + # Run grype scan to JSON (do not print findings to stdout) + grype_latest_json = os.path.join(scanners_dir, "grype.latest.json") + run(["bash","-lc", f"grype dir:'{extract_dir}' --name '{product}' --output json > '{grype_latest_json}'"], check=True) + + # Parse counts and rewrite with pretty formatting + doc = json.load(open(grype_latest_json, "r", encoding="utf-8")) + json.dump(doc, open(grype_latest_json, "w", encoding="utf-8"), indent=2) + doc = json.load(open(grype_latest_json, "r", encoding="utf-8")) + matches = doc.get("matches", []) or [] + + buckets = ["Critical","High","Medium","Low","Negligible","Unknown"] + sev_counts = {k: 0 for k in buckets} + + for m in matches: + sev = (m.get("vulnerability", {}) or {}).get("severity", "Unknown") or "Unknown" + sev_norm = sev.strip().title() + if sev_norm in ("Negligible","Minimal"): + sev_norm = "Negligible" + if sev_norm not in sev_counts: + sev_norm = "Unknown" + sev_counts[sev_norm] += 1 + + # Grype version + DB status (best effort) + grype_version = "" + rc, out, err = run(["bash","-lc", "grype version"], check=False) + if rc == 0: + m = re.search(r"Version:\s*([0-9]+\.[0-9]+\.[0-9]+(?:[-+.\w]+)?)", out) if m: - db_info["built_utc"] = m.group(1) - -# Metadata -gha_run_id = env("GITHUB_RUN_ID", "") -repo = env("GITHUB_REPOSITORY", "") -workflow = env("GITHUB_WORKFLOW", "") -sha = env("GITHUB_SHA", "") - -metadata = { - 
"schema_version": "1.0", - "snapshot": { - "timestamp_utc": now_utc(), - "run_id": f"gha-{gha_run_id}" if gha_run_id else "", - "pipeline": {"repo": repo, "workflow": workflow, "git_sha": sha} - }, - "target": { - "product": product, - "channel": channel, - "resolved_version": resolved_version, - "download": {"site": download_site, "url_redacted": download_url_redacted} - }, - "environment": { - "runner": env("RUNNER_OS",""), - "os": os_name, - "os_version": os_ver, - "arch": arch - }, - "scan": { - "mode": scan_mode, - "scan_root": scan_root, - "grype": {"version": grype_version, "db": db_info}, - "options": {"output": "json"} - }, - "summary": { - "matches_total": len(matches), - "severity_counts": sev_counts + grype_version = m.group(1) + + db_info = {} + rc, out, err = run(["bash","-lc", "grype db status -o json"], check=False) + if rc == 0 and out.startswith("{"): + try: + dbj = json.loads(out) + db_info["status_raw"] = dbj + for k in ("built","builtAt","lastBuilt","updated","updatedAt","lastUpdated"): + if k in dbj: + db_info["built_utc"] = dbj.get(k) + break + for k in ("schemaVersion","schema","dbSchemaVersion"): + if k in dbj: + db_info["schema"] = dbj.get(k) + break + for k in ("checksum","hash","etag"): + if k in dbj: + db_info["checksum"] = dbj.get(k) + break + except Exception: + db_info["status_raw_text"] = out + else: + rc2, out2, err2 = run(["bash","-lc","grype db status"], check=False) + if rc2 == 0: + db_info["status_raw_text"] = out2 + m = re.search(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", out2) + if m: + db_info["built_utc"] = m.group(1) + + # Metadata + gha_run_id = env("GITHUB_RUN_ID", "") + repo = env("GITHUB_REPOSITORY", "") + workflow = env("GITHUB_WORKFLOW", "") + sha = env("GITHUB_SHA", "") + + grype_metadata = { + "schema_version": "1.0", + "snapshot": { + "timestamp_utc": now_utc(), + "run_id": f"gha-{gha_run_id}" if gha_run_id else "", + "pipeline": {"repo": repo, "workflow": workflow, "git_sha": sha} + }, + "target": { + "product": 
product, + "channel": channel, + "resolved_version": resolved_version, + "download": {"site": download_site, "url_redacted": download_url_redacted}, + "size": { + "package_bytes": file_size, + "installed_bytes": installed_size["bytes"], + "installed_human_readable": installed_size["human_readable"], + "file_count": installed_size["file_count"] + } + }, + "environment": { + "runner": env("RUNNER_OS",""), + "os": os_name, + "os_version": os_ver, + "arch": arch, + "package_manager": package_manager if package_manager else None + }, + "scan": { + "mode": scan_mode, + "scan_root": scan_root, + "grype": {"version": grype_version, "db": db_info}, + "options": {"output": "json"} + }, + "summary": { + "matches_total": len(matches), + "severity_counts": sev_counts + } } -} -metadata_path = os.path.join(out_dir, "metadata.json") -json.dump(metadata, open(metadata_path, "w", encoding="utf-8"), indent=2) -print("Wrote:", latest_json_path, metadata_path) \ No newline at end of file + grype_metadata_path = os.path.join(scanners_dir, "grype.metadata.json") + json.dump(grype_metadata, open(grype_metadata_path, "w", encoding="utf-8"), indent=2) + + # Legacy compatibility: copy Grype files to out/ root + import shutil + shutil.copy2(grype_latest_json, os.path.join(out_dir, "latest.json")) + shutil.copy2(grype_metadata_path, os.path.join(out_dir, "metadata.json")) + + print("Wrote Grype outputs:", grype_latest_json, grype_metadata_path) + print(f"::notice::✓ {scan_mode.title()} scan completed for {product} {channel} v{resolved_version}: {len(matches)} vulnerabilities found (Critical: {sev_counts['Critical']}, High: {sev_counts['High']}, Medium: {sev_counts['Medium']})") + + # ============================================================================ + # TRIVY SCANNING + # ============================================================================ + + if enable_trivy: + print("Running Trivy scan...") + + # Run Trivy filesystem scan + trivy_latest_json = os.path.join(scanners_dir, 
"trivy.latest.json") + trivy_cmd = f"trivy fs --format json --scanners {trivy_scanners} --severity {trivy_severity}" + + if trivy_ignore_unfixed: + trivy_cmd += " --ignore-unfixed" + if trivy_timeout: + trivy_cmd += f" --timeout {trivy_timeout}" + if trivy_cache_dir: + trivy_cmd += f" --cache-dir '{trivy_cache_dir}'" + + trivy_cmd += f" '{extract_dir}' > '{trivy_latest_json}'" + + try: + run(["bash", "-lc", trivy_cmd], check=True) + except RuntimeError as e: + print(f"WARNING: Trivy scan failed: {e}") + # Write empty results on failure + json.dump({"Results": [], "error": str(e)}, open(trivy_latest_json, "w", encoding="utf-8"), indent=2) + + # Parse Trivy results and pretty-print + trivy_doc = json.load(open(trivy_latest_json, "r", encoding="utf-8")) + json.dump(trivy_doc, open(trivy_latest_json, "w", encoding="utf-8"), indent=2) + trivy_doc = json.load(open(trivy_latest_json, "r", encoding="utf-8")) + + # Extract vulnerability counts from Trivy results (handle missing Results) + trivy_results = trivy_doc.get("Results", []) or [] + trivy_sev_counts = {k: 0 for k in ["Critical","High","Medium","Low","Negligible","Unknown"]} + trivy_cves = set() + + for result in trivy_results: + vulns = result.get("Vulnerabilities") or [] + for vuln in vulns: + cve_id = vuln.get("VulnerabilityID", "") + if cve_id: + trivy_cves.add(cve_id) + + sev = vuln.get("Severity", "Unknown") or "Unknown" + sev_norm = sev.strip().upper() + # Map Trivy severities to our buckets + if sev_norm in ("CRITICAL", "HIGH", "MEDIUM", "LOW", "UNKNOWN"): + trivy_sev_counts[sev_norm.title()] += 1 + elif sev_norm in ("NEGLIGIBLE", "MINIMAL"): + trivy_sev_counts["Negligible"] += 1 + else: + trivy_sev_counts["Unknown"] += 1 + + # Get Trivy version and DB info + trivy_version = "" + trivy_db_info = {} + + rc, out, err = run(["bash", "-lc", "trivy --version"], check=False) + if rc == 0: + # Parse version output + for line in out.split("\n"): + if "Version:" in line: + m = 
re.search(r"Version:\s*([0-9]+\.[0-9]+\.[0-9]+(?:[-+.\w]+)?)", line) + if m: + trivy_version = m.group(1) + elif "Vulnerability DB:" in line: + # Try to extract DB metadata from version output + if "Version:" in line: + m = re.search(r"Version:\s*(\d+)", line) + if m: + trivy_db_info["version"] = m.group(1) + if "UpdatedAt:" in line: + m = re.search(r"UpdatedAt:\s*([\d-]+T[\d:]+Z)", line) + if m: + trivy_db_info["updated_at"] = m.group(1) + if "NextUpdate:" in line: + m = re.search(r"NextUpdate:\s*([\d-]+T[\d:]+Z)", line) + if m: + trivy_db_info["next_update"] = m.group(1) + if "DownloadedAt:" in line: + m = re.search(r"DownloadedAt:\s*([\d-]+T[\d:]+Z)", line) + if m: + trivy_db_info["downloaded_at"] = m.group(1) + + # Build Trivy metadata + trivy_metadata = { + "schema_version": "1.0", + "snapshot": { + "timestamp_utc": now_utc(), + "run_id": f"gha-{gha_run_id}" if gha_run_id else "", + "pipeline": {"repo": repo, "workflow": workflow, "git_sha": sha} + }, + "target": { + "product": product, + "channel": channel, + "resolved_version": resolved_version, + "download": {"site": download_site, "url_redacted": download_url_redacted}, + "size": { + "package_bytes": file_size, + "installed_bytes": installed_size["bytes"], + "installed_human_readable": installed_size["human_readable"], + "file_count": installed_size["file_count"] + } + }, + "environment": { + "runner": env("RUNNER_OS",""), + "os": os_name, + "os_version": os_ver, + "arch": arch, + "package_manager": package_manager if package_manager else None + }, + "scan": { + "mode": scan_mode, + "scan_root": scan_root, + "trivy": { + "version": trivy_version, + "db": trivy_db_info, + "options": { + "scanners": trivy_scanners.split(","), + "severity": trivy_severity, + "ignore_unfixed": trivy_ignore_unfixed, + "format": "json" + } + } + }, + "summary": { + "vulnerabilities_total": sum(trivy_sev_counts.values()), + "severity_counts": trivy_sev_counts + } + } + + trivy_metadata_path = os.path.join(scanners_dir, 
"trivy.metadata.json") + json.dump(trivy_metadata, open(trivy_metadata_path, "w", encoding="utf-8"), indent=2) + print("Wrote Trivy outputs:", trivy_latest_json, trivy_metadata_path) + + # ============================================================================ + # COMPARISON (CVE-based) + # ============================================================================ + + # Extract CVEs from Grype + grype_cves = set() + for m in matches: + cve_id = (m.get("vulnerability", {}) or {}).get("id", "") + if cve_id and cve_id.startswith("CVE-"): + grype_cves.add(cve_id) + + # Compute set differences + only_in_grype = sorted(list(grype_cves - trivy_cves)) + only_in_trivy = sorted(list(trivy_cves - grype_cves)) + in_both = sorted(list(grype_cves & trivy_cves)) + + compare_doc = { + "schema_version": "1.0", + "generated_at_utc": now_utc(), + "target": { + "product": product, + "channel": channel, + "resolved_version": resolved_version + }, + "summary": { + "grype": { + "cve_count": len(grype_cves), + "severity_counts": sev_counts + }, + "trivy": { + "cve_count": len(trivy_cves), + "severity_counts": trivy_sev_counts + } + }, + "diff": { + "only_in_grype": only_in_grype, + "only_in_trivy": only_in_trivy, + "in_both": in_both + } + } + + compare_path = os.path.join(scanners_dir, "compare.json") + json.dump(compare_doc, open(compare_path, "w", encoding="utf-8"), indent=2) + print("Wrote comparison:", compare_path) + else: + print("Trivy scanning disabled (enable_trivy=false)") \ No newline at end of file diff --git a/.github/actions/chef-download-grype-snapshot/run.py.backup b/.github/actions/chef-download-grype-snapshot/run.py.backup new file mode 100644 index 0000000..5c37c21 --- /dev/null +++ b/.github/actions/chef-download-grype-snapshot/run.py.backup @@ -0,0 +1,501 @@ +import os, json, subprocess, re +from datetime import datetime, timezone +from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode + +def env(k, d=""): + return os.environ.get(k, d) + +def 
now_utc(): + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + +def run(cmd, check=True): + p = subprocess.run(cmd, text=True, capture_output=True) + if check and p.returncode != 0: + raise RuntimeError(f"Command failed: {cmd}\nstdout:\n{p.stdout}\nstderr:\n{p.stderr}") + return p.returncode, p.stdout.strip(), p.stderr.strip() + +def http_json(url): + rc, out, err = run(["bash", "-lc", f"curl -fsSL '{url}'"], check=True) + return json.loads(out) + +def ensure_dir(path): + run(["bash","-lc", f"mkdir -p '{path}'"], check=True) + +def write_text(path, content): + with open(path, "w", encoding="utf-8") as f: + f.write(content) + +# Inputs +product = env("PRODUCT") +channel = env("CHANNEL") +download_site = env("DOWNLOAD_SITE", "commercial") +os_name = env("OS", "ubuntu") +os_ver = env("OS_VERSION") +arch = env("ARCH", "x86_64") +scan_mode = env("SCAN_MODE", "native") +scan_root = env("SCAN_ROOT", "") +resolve_ver = env("RESOLVE_VERSION", "latest") +pinned_ver = env("PINNED_VERSION", "") +license_id = env("LICENSE_ID", "") +out_dir = env("OUT_DIR", "out") +work_dir = env("WORK_DIR", "work") +hab_ident = env("HAB_IDENT", "") +hab_channel = env("HAB_CHANNEL", "stable") +hab_origin = env("HAB_ORIGIN", "") +transitive_deps = env("TRANSITIVE_DEPS", "false").lower() == "true" + +ensure_dir(out_dir) +ensure_dir(work_dir) + +# Branch based on scan_mode +if scan_mode == "habitat": + # HABITAT MODE: Install hab package, enumerate deps, scan each separately + + # Guard: habitat mode requires hab_ident or hab_origin + if not hab_ident.strip() and not hab_origin.strip(): + raise RuntimeError( + "Habitat scan_mode requires HAB_IDENT (e.g., 'core/chef-infra-client') or HAB_ORIGIN (e.g., 'chef'). " + "Set one in the target configuration." 
+ ) +
+else: + # NATIVE MODE: Original download + extract + scan logic + + # Guard: commercial downloads require a license_id (fail fast with a clear error) + if download_site == "commercial" and not license_id.strip(): + raise RuntimeError( + "Commercial download_site requires LICENSE_ID, but it was empty. " + "Fix by scoping GA_DOWNLOAD_GRYPE_LICENSE_ID to the orchestrator repo and passing it into the composite action, " + "or switch DOWNLOAD_SITE to 'community' for targets that do not require licensing." + ) + + # Ensure hab CLI is available + run(["bash", "-lc", "command -v hab >/dev/null 2>&1 || (curl -fsSL https://raw.githubusercontent.com/habitat-sh/habitat/master/components/hab/install.sh | bash)"], check=True) + + # Determine package identifier + pkg_to_install = hab_ident if hab_ident else f"{hab_origin}/{product}" + + # Install the package (with channel if specified) + install_cmd = f"hab pkg install {pkg_to_install}" + if hab_channel and hab_channel != "stable": + install_cmd += f" --channel {hab_channel}" + if license_id: + # Set HAB_AUTH_TOKEN for licensed channels + install_cmd = f"HAB_AUTH_TOKEN={license_id} {install_cmd}" + + run(["bash", "-lc", install_cmd], check=True) + + # Get installed package details + rc, out, err = run(["bash", "-lc", f"hab pkg path {pkg_to_install}"], check=True) + installed_path = out.strip() + + # Parse origin/name/version/release from path + # Expected: /hab/pkgs/<origin>/<name>/<version>/<release> or C:\hab\pkgs\<origin>\<name>\<version>\<release> + path_parts = installed_path.replace("\\", "/").split("/") + if len(path_parts) >= 4: + origin, name, version, release = path_parts[-4:] + resolved_version = f"{version}/{release}" + else: + raise RuntimeError(f"Unable to parse habitat package path: {installed_path}") + + # Enumerate dependencies + deps_cmd = f"hab pkg deps {pkg_to_install}" + if transitive_deps: + # Include transitive dependencies + rc, out, err = run(["bash", "-lc", deps_cmd], check=True) + else: + # Direct dependencies only (tdeps output minus package itself) + rc, 
out, err = run(["bash", "-lc", f"hab pkg deps -r {pkg_to_install}"], check=True) + + dep_idents = [line.strip() for line in out.split("\n") if line.strip() and "/" in line] + + # Always include the main package itself + main_ident = f"{origin}/{name}/{version}/{release}" + if main_ident not in dep_idents: + dep_idents.insert(0, main_ident) + + # Ensure grype + run(["bash", "-lc", "command -v grype >/dev/null 2>&1 || (curl -sSfL https://get.anchore.io/grype | sh -s -- -b /usr/local/bin)"], check=True) + + # Scan each dependency separately + dep_results = [] + deps_dir = os.path.join(out_dir, "deps") + ensure_dir(deps_dir) + + for dep_ident in dep_idents: + # Parse dependency ident: origin/name/version/release + dep_parts = dep_ident.split("/") + if len(dep_parts) != 4: + print(f"Skipping malformed dependency ident: {dep_ident}") + continue + + dep_origin, dep_name, dep_version, dep_release = dep_parts + + # Determine scan path + if os_name == "windows": + dep_scan_path = f"C:\\hab\\pkgs\\{dep_origin}\\{dep_name}\\{dep_version}\\{dep_release}" + else: + dep_scan_path = f"/hab/pkgs/{dep_origin}/{dep_name}/{dep_version}/{dep_release}" + + # Create output directory structure: deps/<origin>/<name>/<version>/ + dep_out_dir = os.path.join(deps_dir, dep_origin, dep_name, dep_version) + ensure_dir(dep_out_dir) + + dep_json_path = os.path.join(dep_out_dir, f"{dep_release}.json") + dep_metadata_path = os.path.join(dep_out_dir, f"{dep_release}.metadata.json") + + # Run grype scan + try: + run(["bash", "-lc", f"grype dir:'{dep_scan_path}' --name '{dep_ident}' --output json > '{dep_json_path}'"], check=True) + + # Parse and pretty-print + dep_doc = json.load(open(dep_json_path, "r", encoding="utf-8")) + json.dump(dep_doc, open(dep_json_path, "w", encoding="utf-8"), indent=2) + + # Count vulnerabilities by severity + dep_matches = dep_doc.get("matches", []) or [] + buckets = ["Critical", "High", "Medium", "Low", "Negligible", "Unknown"] + dep_sev_counts = {k: 0 for k in buckets} + + for m in 
dep_matches: + sev = (m.get("vulnerability", {}) or {}).get("severity", "Unknown") or "Unknown" + sev_norm = sev.strip().title() + if sev_norm in ("Negligible", "Minimal"): + sev_norm = "Negligible" + if sev_norm not in dep_sev_counts: + sev_norm = "Unknown" + dep_sev_counts[sev_norm] += 1 + + # Create per-dependency metadata + dep_metadata = { + "schema_version": "1.0", + "dependency": { + "ident": dep_ident, + "origin": dep_origin, + "name": dep_name, + "version": dep_version, + "release": dep_release, + "scan_path": dep_scan_path + }, + "scan": { + "timestamp_utc": now_utc(), + "matches_total": len(dep_matches), + "severity_counts": dep_sev_counts + } + } + json.dump(dep_metadata, open(dep_metadata_path, "w", encoding="utf-8"), indent=2) + + # Track for rollup + dep_results.append({ + "ident": dep_ident, + "origin": dep_origin, + "name": dep_name, + "version": dep_version, + "release": dep_release, + "matches_total": len(dep_matches), + "severity_counts": dep_sev_counts, + "json_path": f"deps/{dep_origin}/{dep_name}/{dep_version}/{dep_release}.json" + }) + + print(f"Scanned dependency: {dep_ident} ({len(dep_matches)} matches)") + + except Exception as e: + print(f"Failed to scan dependency {dep_ident}: {e}") + # Continue with other dependencies + + # Create index.json rollup + # Grype version + DB status + grype_version = "" + rc, out, err = run(["bash", "-lc", "grype version"], check=False) + if rc == 0: + m = re.search(r"Version:\s*([0-9]+\.[0-9]+\.[0-9]+(?:[-+.\w]+)?)", out) + if m: + grype_version = m.group(1) + + db_info = {} + rc, out, err = run(["bash", "-lc", "grype db status -o json"], check=False) + if rc == 0 and out.startswith("{"): + try: + dbj = json.loads(out) + db_info["status_raw"] = dbj + for k in ("built", "builtAt", "lastBuilt", "updated", "updatedAt", "lastUpdated"): + if k in dbj: + db_info["built_utc"] = dbj.get(k) + break + for k in ("schemaVersion", "schema", "dbSchemaVersion"): + if k in dbj: + db_info["schema"] = dbj.get(k) + break + 
for k in ("checksum", "hash", "etag"): + if k in dbj: + db_info["checksum"] = dbj.get(k) + break + except Exception: + db_info["status_raw_text"] = out + + # Calculate aggregate counts + total_matches = sum(d["matches_total"] for d in dep_results) + aggregate_counts = {k: 0 for k in ["Critical", "High", "Medium", "Low", "Negligible", "Unknown"]} + for d in dep_results: + for sev, count in d["severity_counts"].items(): + aggregate_counts[sev] += count + + # GitHub Actions context + gha_run_id = env("GITHUB_RUN_ID", "") + repo = env("GITHUB_REPOSITORY", "") + workflow = env("GITHUB_WORKFLOW", "") + sha = env("GITHUB_SHA", "") + + index = { + "schema_version": "1.0", + "snapshot": { + "timestamp_utc": now_utc(), + "run_id": f"gha-{gha_run_id}" if gha_run_id else "", + "pipeline": {"repo": repo, "workflow": workflow, "git_sha": sha} + }, + "target": { + "product": product, + "channel": hab_channel, + "package": { + "ident": main_ident, + "origin": origin, + "name": name, + "version": version, + "release": release + } + }, + "environment": { + "runner": env("RUNNER_OS", ""), + "os": os_name, + "os_version": os_ver, + "arch": arch + }, + "scan": { + "mode": "habitat", + "transitive_deps": transitive_deps, + "grype": {"version": grype_version, "db": db_info} + }, + "summary": { + "dependencies_scanned": len(dep_results), + "total_matches": total_matches, + "aggregate_severity_counts": aggregate_counts + }, + "dependencies": dep_results + } + + index_path = os.path.join(out_dir, "index.json") + json.dump(index, open(index_path, "w", encoding="utf-8"), indent=2) + + # Write resolved_version for workflow outputs + write_text(os.path.join(out_dir, "_resolved_version.txt"), resolved_version) + write_text(os.path.join(out_dir, "_download_url_redacted.txt"), f"habitat://{main_ident}@{hab_channel}") + + print(f"Wrote habitat index: {index_path}") + print(f"Scanned {len(dep_results)} dependencies with {total_matches} total matches") + +else: + # NATIVE MODE: Original download + 
extract + scan logic + + # Choose base URL + base = "https://chefdownload-commercial.chef.io" if download_site == "commercial" else "https://chefdownload-community.chef.io" + + # Resolve version + resolved_version = pinned_ver + if resolve_ver == "latest" or not resolved_version: + ver_url = f"{base}/{channel}/{product}/versions/latest" + # Both commercial and community require license_id, but different license types + if license_id: + ver_url += f"?license_id={license_id}" + + try: + ver_doc = http_json(ver_url) + if isinstance(ver_doc, dict): + resolved_version = ( + ver_doc.get("version") + or ver_doc.get("latest") + or ver_doc.get("artifact_version") + or ver_doc.get("value") + ) + if not resolved_version: + resolved_version = str(ver_doc) + else: + resolved_version = str(ver_doc) + except RuntimeError as e: + error_msg = str(e) + if "403" in error_msg or "401" in error_msg or "Missing license_id" in error_msg or "License Id is not valid" in error_msg or "Only Free license" in error_msg: + site_type = "commercial" if download_site == "commercial" else "community" + license_secret = "GA_DOWNLOAD_GRYPE_LICENSE_ID" if download_site == "commercial" else "GA_DOWNLOAD_GRYPE_LICENSE_ID_FREE" + + if "Missing license_id" in error_msg: + raise RuntimeError( + f"LICENSE ERROR ({site_type}): Missing license_id parameter.\n" + f" Download site: {download_site}\n" + f" Required secret: {license_secret}\n" + f" Solution: Ensure the {license_secret} secret is set in the orchestrator repository" + ) from e + elif "License Id is not valid" in error_msg or "403" in error_msg: + raise RuntimeError( + f"LICENSE ERROR ({site_type}): Invalid or expired license_id.\n" + f" Download site: {download_site}\n" + f" Product: {product}, Channel: {channel}\n" + f" Secret used: {license_secret}\n" + f" Solution: Update the {license_secret} secret with a valid {'commercial' if download_site == 'commercial' else 'Free'} license" + ) from e + elif "Only Free license" in error_msg: + raise 
RuntimeError( + f"LICENSE ERROR (community): Wrong license type provided.\n" + f" Download site: community\n" + f" Error: Community downloads require a 'Free' license, but a commercial license was provided\n" + f" Solution: Update GA_DOWNLOAD_GRYPE_LICENSE_ID_FREE secret with a valid Free license (not commercial)" + ) from e + else: + raise RuntimeError( + f"LICENSE ERROR ({site_type}): Authentication failed.\n" + f" Download site: {download_site}\n" + f" Product: {product}, Channel: {channel}\n" + f" Secret used: {license_secret}\n" + f" Solution: Verify the {license_secret} secret contains a valid license for {download_site} downloads" + ) from e + raise + + # Construct download URL + download_url = f"{base}/{channel}/{product}/download?p={os_name}&pv={os_ver}&m={arch}&v={resolved_version}" + if license_id: + download_url += f"&license_id={license_id}" + + # Redact license_id (robust URL parsing) + parts = urlsplit(download_url) + q = [(k,v) for (k,v) in parse_qsl(parts.query, keep_blank_values=True) if k != "license_id"] + download_url_redacted = urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(q, doseq=True), parts.fragment)) + + # Persist small values for action outputs + write_text(os.path.join(out_dir, "_resolved_version.txt"), resolved_version) + write_text(os.path.join(out_dir, "_download_url_redacted.txt"), download_url_redacted) + + # Download package + pkg_path = os.path.join(work_dir, "package_downloaded.deb") + try: + run(["bash","-lc", f"curl -fsSL -o '{pkg_path}' '{download_url}'"], check=True) + except RuntimeError as e: + if "403" in str(e) or "401" in str(e): + site_type = "commercial" if download_site == "commercial" else "community" + license_secret = "GA_DOWNLOAD_GRYPE_LICENSE_ID" if download_site == "commercial" else "GA_DOWNLOAD_GRYPE_LICENSE_ID_FREE" + raise RuntimeError( + f"DOWNLOAD ERROR ({site_type}): Failed to download package.\n" + f" Product: {product} v{resolved_version}\n" + f" Channel: {channel}, OS: {os_name} 
{os_ver}\n" + f" Download site: {download_site}\n" + f" This may indicate:\n" + f" 1. Invalid or expired {license_secret} secret\n" + f" 2. Package not available for this OS/version combination\n" + f" 3. Version {resolved_version} doesn't exist in {channel} channel\n" + f" Solution: Verify license and that the product/version/platform combination is valid" + ) from e + raise + + # Extract deterministically (pilot assumes Ubuntu .deb) + extract_dir = os.path.join(work_dir, "extracted") + run(["bash","-lc", f"rm -rf '{extract_dir}' && mkdir -p '{extract_dir}'"], check=True) + run(["bash","-lc", f"dpkg-deb -x '{pkg_path}' '{extract_dir}'"], check=True) + + # Ensure grype + run(["bash","-lc", "command -v grype >/dev/null 2>&1 || (curl -sSfL https://get.anchore.io/grype | sh -s -- -b /usr/local/bin)"], check=True) + + # Run grype scan to JSON (do not print findings to stdout) + latest_json_path = os.path.join(out_dir, "latest.json") + run(["bash","-lc", f"grype dir:'{extract_dir}' --name '{product}' --output json > '{latest_json_path}'"], check=True) + + # Parse counts and rewrite with pretty formatting + doc = json.load(open(latest_json_path, "r", encoding="utf-8")) + json.dump(doc, open(latest_json_path, "w", encoding="utf-8"), indent=2) + doc = json.load(open(latest_json_path, "r", encoding="utf-8")) + matches = doc.get("matches", []) or [] + + buckets = ["Critical","High","Medium","Low","Negligible","Unknown"] + sev_counts = {k: 0 for k in buckets} + + for m in matches: + sev = (m.get("vulnerability", {}) or {}).get("severity", "Unknown") or "Unknown" + sev_norm = sev.strip().title() + if sev_norm in ("Negligible","Minimal"): + sev_norm = "Negligible" + if sev_norm not in sev_counts: + sev_norm = "Unknown" + sev_counts[sev_norm] += 1 + + # Grype version + DB status (best effort) + grype_version = "" + rc, out, err = run(["bash","-lc", "grype version"], check=False) + if rc == 0: + m = re.search(r"Version:\s*([0-9]+\.[0-9]+\.[0-9]+(?:[-+.\w]+)?)", out) + if m: + 
grype_version = m.group(1) + + db_info = {} + rc, out, err = run(["bash","-lc", "grype db status -o json"], check=False) + if rc == 0 and out.startswith("{"): + try: + dbj = json.loads(out) + db_info["status_raw"] = dbj + for k in ("built","builtAt","lastBuilt","updated","updatedAt","lastUpdated"): + if k in dbj: + db_info["built_utc"] = dbj.get(k) + break + for k in ("schemaVersion","schema","dbSchemaVersion"): + if k in dbj: + db_info["schema"] = dbj.get(k) + break + for k in ("checksum","hash","etag"): + if k in dbj: + db_info["checksum"] = dbj.get(k) + break + except Exception: + db_info["status_raw_text"] = out + else: + rc2, out2, err2 = run(["bash","-lc","grype db status"], check=False) + if rc2 == 0: + db_info["status_raw_text"] = out2 + m = re.search(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", out2) + if m: + db_info["built_utc"] = m.group(1) + + # Metadata + gha_run_id = env("GITHUB_RUN_ID", "") + repo = env("GITHUB_REPOSITORY", "") + workflow = env("GITHUB_WORKFLOW", "") + sha = env("GITHUB_SHA", "") + + metadata = { + "schema_version": "1.0", + "snapshot": { + "timestamp_utc": now_utc(), + "run_id": f"gha-{gha_run_id}" if gha_run_id else "", + "pipeline": {"repo": repo, "workflow": workflow, "git_sha": sha} + }, + "target": { + "product": product, + "channel": channel, + "resolved_version": resolved_version, + "download": {"site": download_site, "url_redacted": download_url_redacted} + }, + "environment": { + "runner": env("RUNNER_OS",""), + "os": os_name, + "os_version": os_ver, + "arch": arch + }, + "scan": { + "mode": scan_mode, + "scan_root": scan_root, + "grype": {"version": grype_version, "db": db_info}, + "options": {"output": "json"} + }, + "summary": { + "matches_total": len(matches), + "severity_counts": sev_counts + } + } + + metadata_path = os.path.join(out_dir, "metadata.json") + json.dump(metadata, open(metadata_path, "w", encoding="utf-8"), indent=2) + print("Wrote:", latest_json_path, metadata_path) \ No newline at end of file