diff --git a/.github/workflows/saist.yml b/.github/workflows/saist.yml index cf64459..e4428f9 100644 --- a/.github/workflows/saist.yml +++ b/.github/workflows/saist.yml @@ -19,7 +19,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.x" + python-version: "3.13" - name: Install Dependencies run: | diff --git a/README.md b/README.md index 65179ca..201db28 100644 --- a/README.md +++ b/README.md @@ -95,17 +95,23 @@ saist respects **file include/exclude rules** via two optional files in the root | File | Purpose | | ----------------- | ---------------------------------- | -| `saist.include` | List of glob patterns to **include** | -| `saist.ignore` | List of glob patterns to **ignore** | +| `saist.include` | List of `.gitignore`-style patterns to **include** | +| `saist.ignore` | List of `.gitignore`-style patterns to **ignore** | -- Patterns follow `glob` syntax (similar to `.gitignore`). +- Patterns follow `.gitignore` syntax. - If **`saist.include`** does not exist, default extensions are used (e.g., `.py`, `.js`, `.java`, `.go`, etc). - Examples: - `**/*.py` includes all Python files - `src/**/*.ts` includes TypeScript files inside `src` - - `tests/**` in `saist.ignore` will ignore the entire tests folder + - `build/` will ignore the entire build folder + - `*.log` will ignore all log files -> Note: saist currently does basic glob pattern matching. More advanced `.gitignore`-style support is coming soon! +You can also provide include/exclude patterns using the command-line arguments `--include` and `--exclude`. +- Patterns provided via command-line arguments are appended to any patterns loaded from the rule files. 
+- Examples: + - `--include '**/*.py' --include '**/*.ts'` includes all Python and TypeScript files + - `--include '**' --exclude '*.log'` includes all files except those ending in `.log` + - `--exclude 'node_modules/'` excludes the entire `node_modules` directory --- ### 📝 Example @@ -119,8 +125,8 @@ src/**/*.js `saist.ignore` ``` -tests/** -docs/** +tests/ +docs/ ``` This setup will: @@ -128,6 +134,7 @@ This setup will: - Ignore anything under `tests/` and `docs/` --- + ## 📄 PDF report generation saist allows you to generate PDF reports summarizing your findings, making it easier to share insights with your team. @@ -169,6 +176,11 @@ docker run -v$PWD/code:/code -v$PWD/reporting:/app/reporting punksecurity/saist | `--web` | Launch a local web server | | `--disable-tools` | Disable tool use during file analysis to reduce LLM token usage | | `--disable-caching` | Disable finding caching during file analysis | +| `--skip-line-length-check` | Skip checking files for a maximum line length | +| `--max-line-length` | Maximum allowed line length, files with lines longer than this value will be skipped | +| `-i, --include` | Pattern to explicitly include | +| `-e, --exclude` | Pattern to explicitly ignore | +| `--dry-run` | Exit after parsing configuration and collecting files, does not perform any analysis, useful for validating rules | | `--cache-folder` | Change the default cache folder | | `--csv` | Output findings to `findings.csv` | | `--pdf` | Output findings to PDF report (`report.pdf`) | diff --git a/requirements.txt b/requirements.txt index dfdd7ab..a1eaa77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ aiofiles==24.1.0 rich==14.0.0 flask==3.1.1 ollama==0.5.1 +gitignore-parser==0.1.13 diff --git a/saist.ignore b/saist.ignore index d97f542..138359c 100644 --- a/saist.ignore +++ b/saist.ignore @@ -1 +1 @@ -.github/** +.github/ diff --git a/saist/main.py b/saist/main.py index 92f458c..408922b 100755 --- a/saist/main.py +++ b/saist/main.py @@ 
-21,7 +21,7 @@ from scm import BaseScmAdapter from scm.adapters.git import GitAdapter from util.git import parse_unified_diff -from util.filtering import should_process +from util.filtering import FilterRules from util.prompts import prompts from scm.adapters.github import Github from scm import Scm @@ -195,12 +195,23 @@ async def main(): file_new_lines_text = {} app_files = [] + filter_rules = FilterRules(args.include, args.exclude) for f in changed_files: filename = f["filename"] patch_text = f.get("patch", "") - if not patch_text or not should_process(filename): + if not patch_text: + logging.debug(f"Skipped file {filename} as it contains no patch text") + continue + + if not filter_rules.filename_included(filename): + logging.debug(f"Skipped file {filename} as it is not included in rules") continue + if not args.skip_line_length_check: + if filter_rules.file_exceeds_line_length_limit(file_content=await scm.read_file_contents(filename), patch_text=patch_text, max_line_length=args.max_line_length): + logging.debug(f"Skipped file {filename} as it contains lines that exceed the maximum line length ({args.max_line_length})") + continue + line_map, new_lines_text = parse_unified_diff(patch_text) file_line_maps[filename] = line_map file_new_lines_text[filename] = new_lines_text @@ -211,6 +222,13 @@ async def main(): return print(f"✅ Prepared {len(app_files)} app files for analysis.\n") + app_filenames = list((filename for filename,_ in app_files)) + logging.debug(f"Files to process: {app_filenames}") + + if args.dry_run: + print("⚠️ --dry-run flag passed, exiting without analyzing files.") + exit(0) + # 3) Analyze each file in parallel print("🔍 Analyzing files for security issues...") max_workers = min(args.llm_rate_limit, len(app_files)) diff --git a/saist/util/argparsing.py b/saist/util/argparsing.py index 9e9d13a..1cdcace 100644 --- a/saist/util/argparsing.py +++ b/saist/util/argparsing.py @@ -2,7 +2,9 @@ from os import linesep, environ, cpu_count import sys from 
shutil import which +from dotenv import load_dotenv +load_dotenv(".env") runtime = environ.get("SAIST_COMMAND", f"{sys.argv[0]}") @@ -223,6 +225,29 @@ def __call__(self, parser, namespace, values, option_string=None): envvar="SAIST_PROJECT_NAME", action=EnvDefault, required=False, default=None ) +parser.add_argument( + "--skip-line-length-check", help = "Skip checking files for a maximum line length", + action='store_true', required=False + ) + +parser.add_argument( + "--max-line-length", type=int, help = "Maximum allowed line length, files with lines longer than this value will be skipped", + required=False, default=1000 + ) + +parser.add_argument( + '-i', '--include', action='append', help = "Pattern or filename to explicitly include, overrides saist.ignore", nargs=1, required=False +) + +parser.add_argument( + '-e', '--exclude', action='append', help = "Pattern or filename to explicitly ignore, overrides saist.include", nargs=1, required=False +) + +parser.add_argument( + "--dry-run", help = "Exit after parsing configuration and collecting files, does not perform any analysis. Useful for validating rules.", + action='store_true', required=False + ) + parser.add_argument( "-v", "--verbose", diff --git a/saist/util/filtering.py b/saist/util/filtering.py index 5074d9a..b0af7d4 100644 --- a/saist/util/filtering.py +++ b/saist/util/filtering.py @@ -1,10 +1,8 @@ import logging -import os -import fnmatch +from pathlib import Path +from gitignore_parser import parse_gitignore_str -#TODO: Make this recognise more gitignore patterns, like simple directory matching with / #TODO: Document in README -#TODO: Switch to verbose logging only DEFAULT_EXTENSIONS = [ ".c", ".cpp", ".h", ".hpp", @@ -16,43 +14,76 @@ logger = logging.getLogger(__name__) -def load_patterns(filename): - """ - Loads glob-style patterns from a file. - Returns a list of patterns. 
- """ - if not os.path.exists(filename): - return [] - - logger.debug(f"load_patterns: Found {filename}, processing...") - with open(filename, "r") as f: - lines = f.readlines() - - return [ - line.strip() - for line in lines - if line.strip() and not line.strip().startswith("#") - ] - -def pattern_match(filepath, patterns): - normalized_path = filepath.replace("\\", "/") - return any(fnmatch.fnmatch(normalized_path, pattern) for pattern in patterns) - -def should_process(filepath): - """ - Returns True if the file should be processed (included AND not ignored). - """ - logger.debug(f"should_process: {filepath}") - if not pattern_match(filepath, include_patterns): - return False - if pattern_match(filepath, ignore_patterns): +class FilterRules: + def __init__(self, include_patterns: list, exclude_patterns: list, include_rules_file: str | Path = "saist.include", exclude_rules_file: str | Path = "saist.ignore"): + """ + Load include/exclude rules from disk and command-line arguments + """ + # Load initial rules from disk + self.include_patterns = self.__load_rule_file(include_rules_file) + self.exclude_patterns = self.__load_rule_file(exclude_rules_file) + + logger.debug(f"Processing inclusion rules from {include_rules_file} and exclusion rules from {exclude_rules_file}") + + if not self.include_patterns: + # If no rules in saist.include, fallback to extension-based glob patterns like **/*.py + logger.debug("No saist.include, using defaults") + self.include_patterns = [f"*{ext}" for ext in DEFAULT_EXTENSIONS] + + # Extend list of patterns loaded from disk / defaults with additional patterns from CLI arguments + # The list comprehensions here flatten the pattern arguments into a single list as argparse will return a nested list + if include_patterns: + self.include_patterns.extend([pattern for item in include_patterns for pattern in item]) + + if exclude_patterns: + self.exclude_patterns.extend([pattern for item in exclude_patterns for pattern in item]) + + 
logger.debug(f"include_patterns: {self.include_patterns}\nignore_patterns:{self.exclude_patterns}") + + # Convert include/exclude pattern lists into a gitignore format for parsing with gitignore_parser + exclude_gitignore_str = "\n".join(self.exclude_patterns) + include_gitignore_str = "\n".join(self.include_patterns) + + # Define functions for checking filenames against exclusion/inclusion lists in gitignore format + self.__exclusion_matches = parse_gitignore_str(exclude_gitignore_str, base_dir=Path(exclude_rules_file).parent) + self.__inclusion_matches = parse_gitignore_str(include_gitignore_str, base_dir=Path(include_rules_file).parent) + + def __load_rule_file(self, file_path: str | Path): + """ + Load a exclude/include file from disk + """ + file_path = Path(file_path) + if file_path.exists(): + with open(file_path, 'r') as fp: + return fp.readlines() + else: + return [] + + def file_exceeds_line_length_limit(self, file_content: str, patch_text: str, max_line_length: int = 1000) -> bool: + """ + Checks if any line in the file exceeds max_length. + Returns True if all lines are within the limit, False otherwise. + """ + for line in file_content.splitlines(): + if len(line) > max_line_length: + return True + + for line in patch_text.splitlines(): + if len(line) > max_line_length: + return True + return False - return True -include_patterns = load_patterns("saist.include") -if not include_patterns: - # Fallback to extension-based glob patterns like **/*.py - include_patterns = [f"*{ext}" for ext in DEFAULT_EXTENSIONS] -ignore_patterns = load_patterns("saist.ignore") + def filename_included(self, filepath: str) -> bool: + """ + Returns True if the file is included in includelist and not explicitly ignored in ignorelist. + """ + if self.__exclusion_matches(filepath): + return False + if not self.__inclusion_matches(filepath): + return False + + return True + + -logger.info(f"include_patterns: {include_patterns}\nignore_patterns:{ignore_patterns}")