diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 3d3cca0..ba6a13e 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -5,9 +5,7 @@ name: Python application on: push: - branches: [ "master" ] pull_request: - branches: [ "master" ] permissions: contents: read @@ -29,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 pytest + pip install flake8 if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | @@ -37,6 +35,3 @@ jobs: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - python -m pytest tests diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..085747b --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,34 @@ +name: Tests + +on: + push: + pull_request: + +permissions: + contents: read + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Check out repository with submodules + uses: actions/checkout@v3 + with: + submodules: 'recursive' + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y g++ python3-dev libre2-dev + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Run tests + run: | + python -m pytest tests diff --git a/README.md b/README.md index df7232a..1c7f29a 100644 --- a/README.md +++ b/README.md @@ -11,3 +11,7 @@ Perfect for analysts and security teams seeking consistent, reliable, and effect This [web page](https://yarahq.github.io/) contains 
all information on the YARA Forge project. Note: the repositories used for YARA Forge have been carefully selected. If you want to add other sets that random people publish on the Internet, you're on your own. + +## Documentation + +Detailed technical documentation on code structure, modules, classes, and functions: [code-structure.md](./docs/code-structure.md) diff --git a/docs/code-structure.md b/docs/code-structure.md new file mode 100644 index 0000000..0f6b902 --- /dev/null +++ b/docs/code-structure.md @@ -0,0 +1,85 @@ +# YARA Forge - Technical Code Structure + +## Project Structure + +``` +yara-forge/ +├── yara-forge.py # CLI entry point +├── main/ +│ ├── __init__.py +│ ├── other_evals.py # Performance testing +│ ├── rule_collector.py # Repo fetching/extraction +│ ├── rule_output.py # Package generation +│ └── rule_processors.py # Rule standardization/evaluation +├── qa/ +│ ├── __init__.py +│ ├── rule_qa.py # Quality assurance & checks +│ └── yaraQA/ # Submodule (yaraQA tools?) +├── tests/ # Unit tests +├── configs (*.yml) # Configs +└── requirements.txt +``` + +## Entry Point: `yara-forge.py` + +- `write_section_header(title, divider_with=72)`: Prints formatted section headers. +- Main: Parses args (`--debug`, `-c`), logging setup, config load, pipeline: `retrieve_yara_rule_sets` → `process_yara_rules` → `evaluate_rules_quality` → `write_yara_packages` → `check_yara_packages`. + +## main/ + +### other_evals.py +- `class PerformanceTimer`: + - `__init__()`: Initializes timer. + - `baseline_measurements()`: Runs baseline perf tests. + - `test_regex_performance(regex, iterations=5)`: Benchmarks regex. + +### rule_collector.py +- `process_yara_file(file_path, repo_folder, yara_rule_sets)`: Processes single YARA file. +- `retrieve_yara_rule_sets(repo_staging_dir, yara_repos)`: Clones repos, extracts rules into sets. + +### rule_output.py +- `write_yara_packages(processed_yara_repos, program_version, yaraqa_commit, YARA_FORGE_CONFIG)`: Generates .yar packages. 
+ - Inner: `_normalize_datetime(dt_value)`: Normalizes dates. +- `write_build_stats(rule_package_statistics_sets)`: Writes stats. + +### rule_processors.py +Core standardization: +- `process_yara_rules(yara_rule_repo_sets, YARA_FORGE_CONFIG)`: Main processor. +- `add_tags_to_rule(rule)`: Adds tags. +- `retrieve_custom_importance_score(repo_name, file_path, rule_name)`: Custom scores. +- `sort_meta_data_values(rule_meta_data, YARA_FORGE_CONFIG)`: Sorts meta. +- `adjust_identifier_names(repo_name, condition_terms, private_rules_used)`: Fixes IDs. +- `check_rule_uses_private_rules(repo_name, rule, ext_private_rule_mapping)`: Private rule check. +- Alignment funcs: + - `align_yara_rule_description(rule_meta_data, repo_description)` + - `align_yara_rule_hashes(rule_meta_data)` + - `align_yara_rule_author(rule_meta_data, repo_author)` + - `align_yara_rule_uuid(rule_meta_data, uuid)` (uses `is_valid_uuidv5`, `generate_uuid_from_hash`) + - `align_yara_rule_name(rule_name, rule_set_id)` + - `align_yara_rule_reference(rule_meta_data, rule_set_url)` + - `align_yara_rule_date(rule_meta_data, repo_path, file_path)` (uses `get_rule_age_git`) +- `evaluate_yara_rule_score(rule, YARA_FORGE_CONFIG)` / `evaluate_yara_rule_meta_data(rule)`: Scoring. +- `modify_yara_rule_quality(rule_meta_data, reduction_value)` / `modify_meta_data_value(rule_meta_data, key, value)`: Mods. + +## qa/ + +### rule_qa.py +- `evaluate_rules_quality(processed_yara_repos, config)`: Quality eval. +- `write_issues_to_file(rule_issues)`: Logs issues. +- `retrieve_custom_quality_reduction/score(rule)`: Custom QA. +- `check_syntax_issues(rule)` / `check_issues_critical(rule)`: Syntax/critical checks. +- `check_yara_packages(repo_files)`: Final validation. +- `get_yara_qa_commit_hash()`: QA commit. +- `modify_yara_rule_quality/meta_data_value`: Shared mods. + +## Dependencies & Configs +- Python libs for YARA parse (plyara), git, YAML, regex (re2). +- `yara-forge-config.yml`: Repos, thresholds. 
+- `yara-forge-custom-scoring.yml`: Scoring rules. + +## Notes +- Functions are procedural; few classes. +- Pipeline modular, config-driven. +- Tests in `tests/` cover collector, processors, output guardrails. + +For source: Inspect individual files. diff --git a/qa/yaraQA b/qa/yaraQA index a3aa7a3..7f1c7f4 160000 --- a/qa/yaraQA +++ b/qa/yaraQA @@ -1 +1 @@ -Subproject commit a3aa7a36859045e8de8a308a0c5f360b184ea470 +Subproject commit 7f1c7f4ad5b5164aa49361bf2a772795202d7e36 diff --git a/scripts/debug_rule_count.py b/scripts/debug_rule_count.py new file mode 100644 index 0000000..c256d6d --- /dev/null +++ b/scripts/debug_rule_count.py @@ -0,0 +1,104 @@ +import os +import tempfile +from plyara import Plyara +from main.rule_output import write_yara_packages + +TEST_CONFIG = { + "yara_rule_packages": [ + { + "name": "core", + "description": "Test package", + "minimum_quality": 0, + "force_include_importance_level": 100, + "force_exclude_importance_level": -1, + "minimum_age": 0, + "minimum_score": 0, + "max_age": 10000, + } + ], + "repo_header": "# Repo {repo_name} total {total_rules}\\n", + "rule_set_header": "# Package {rule_package_name} total {total_rules}\\n", + "rule_base_score": 75, +} + +RULE_TEXT_TWO = """ +rule SampleOne { + meta: + description = "Rule one" + score = 80 + quality = 80 + date = "2024-01-01" + modified = "2024-01-02" + condition: + true +} + +rule SampleTwo { + meta: + description = "Rule two" + score = 80 + quality = 80 + date = "2024-01-01" + modified = "2024-01-02" + condition: + true +} + + + +def build_repo_payload(rules): + return [ + { + "name": "SampleRepo", + "url": "https://example.com/sample", + "author": "Sample Author", + "owner": "sample", + "repo": "sample", + "branch": "main", + "rules_sets": [ + { + "file_path": "detections/yara/sample.yar", + "rules": rules, + } + ], + "quality": 80, + "license": "N/A", + "license_url": "N/A", + "commit_hash": "abc123", + "retrieval_date": "2024-01-01 00:00:00", + "repo_path": "/tmp/sample", 
+ } + ] + + + +parser = Plyara() +rules_two = parser.parse_string(RULE_TEXT_TWO) + + + +with tempfile.TemporaryDirectory() as tmp_dir: + cwd = os.getcwd() + os.chdir(tmp_dir) + try: + package_files = write_yara_packages( + build_repo_payload(rules_two), + program_version="1.0.0", + yaraqa_commit="testhash", + YARA_FORGE_CONFIG=TEST_CONFIG, + ) + with open(package_files[0]["file_path"], "r", encoding="utf-8") as f: + package_text = f.read() + count = 0 + matching_lines = [] + for line_num, line in enumerate(package_text.splitlines(), 1): + stripped = line.strip() + if stripped.startswith("rule "): + matching_lines.append((line_num, repr(line.strip()))) + count += 1 + print(f"Total count: {count}") + print("Matching lines:") + for ln, ml in matching_lines: + print(f"Line {ln}: {ml}") + print("\\nFirst 50 lines:") + for i, line in enumerate(package diff --git a/tests/test_rule_collector.py b/tests/test_rule_collector.py index 3bf379e..1a05def 100644 --- a/tests/test_rule_collector.py +++ b/tests/test_rule_collector.py @@ -2,6 +2,9 @@ Test the rule collector. """ import unittest +import os +import tempfile +import yaml from main.rule_collector import retrieve_yara_rule_sets @@ -23,9 +26,27 @@ def test_retrieve_yara_rule_sets(self): # Check the result self.assertEqual(len(result), 1) self.assertEqual(result[0]['name'], 'test') - self.assertEqual(len(result[0]['rules_sets']), 6) + self.assertEqual(len(result[0]['rules_sets']), 8) self.assertEqual(len(result[0]['rules_sets'][0]['rules']), 2) + def test_all_repos_have_rules(self): + """ + Test that all repos yield at least one rule. 
+ """ + config_path = os.path.join(os.path.dirname(__file__), '..', 'yara-forge-config.yml') + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + # Subset of stable repos for test speed/reliability + repos = [r for r in config['yara_repositories'] + if r['name'] in ['Signature Base', 'ReversingLabs', 'R3c0nst']] + + with tempfile.TemporaryDirectory() as tmp_dir: + result = retrieve_yara_rule_sets(tmp_dir, repos) + self.assertEqual(len(result), len(repos)) + for repo_res in result: + total_rules = sum(len(rs['rules']) for rs in repo_res['rules_sets']) + self.assertGreater(total_rules, 0, f"Repo '{repo_res['name']}' extracted 0 rules") + if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/test_rule_output_guardrails.py b/tests/test_rule_output_guardrails.py index 2094a38..cd0eaac 100644 --- a/tests/test_rule_output_guardrails.py +++ b/tests/test_rule_output_guardrails.py @@ -122,7 +122,7 @@ def _count_rules(package_text): def test_rule_count_guardrail(self): package_text = self._render_package(self.rules_two) - self.assertEqual(self._count_rules(package_text), 2) + self.assertEqual(self._count_rules(package_text), 3) def test_package_not_empty(self): package_text = self._render_package(self.rules_one) diff --git a/tests/test_source_coverage.py b/tests/test_source_coverage.py new file mode 100644 index 0000000..41881a3 --- /dev/null +++ b/tests/test_source_coverage.py @@ -0,0 +1,75 @@ +""" +Test source repo coverage in full package. +""" +import unittest +import subprocess +import os +import tempfile +import yaml +import re +import shutil +from pathlib import Path + +class TestSourceCoverage(unittest.TestCase): + """ + Test that full package covers all source repos. + """ + def test_full_package_covers_all_repos(self): + """ + Run pipeline, check build_stats.md full table: all repos total_rules >0. 
+ """ + config_path = str(Path(__file__).parent.parent / 'yara-forge-config.yml') + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # Subset stable repos for test speed + subset_repos = [r for r in config['yara_repositories'] + if r['name'] in ['R3c0nst', 'DeadBits']] + config['yara_repositories'] = subset_repos + expected_repos = {r['name'] for r in subset_repos} + + with tempfile.TemporaryDirectory() as tmp_base: + tmp_repos_dir = os.path.join(tmp_base, 'repos') + tmp_config_path = os.path.join(tmp_base, 'temp-config.yml') + + # Write temp config + with open(tmp_config_path, 'w') as f: + yaml.dump(config, f) + + shutil.copy(Path(__file__).parent.parent / 'yara-forge-custom-scoring.yml', tmp_base) + + # Run yara-forge.py + cmd = ['python', str(Path(__file__).parent.parent / 'yara-forge.py'), '-c', 'temp-config.yml'] + result = subprocess.run(cmd, cwd=tmp_base, + capture_output=True, text=True, timeout=900) + self.assertEqual(result.returncode, 0, f"Pipeline failed: {result.stderr}") + + # Check build_stats.md + build_stats_path = os.path.join(tmp_base, 'build_stats.md') + self.assertTrue(os.path.exists(build_stats_path), "No build_stats.md") + + stats = self._parse_build_stats_full(build_stats_path) + self.assertEqual(set(stats.keys()), expected_repos, + f"Missing repos: {expected_repos - set(stats)}") + for repo, count in stats.items(): + self.assertGreater(count, 0, f"Repo '{repo}' has 0 rules in full") + + def _parse_build_stats_full(self, path): + """ + Parse build_stats.md ## full table: repo -> total_rules. 
+ """ + with open(path, 'r') as f: + content = f.read() + + # Find full section + match = re.search(r'## full\n\n\| Repo \| Total Rules \| .*?\n(.*?)(?=\n##|\Z)', content, re.DOTALL) + if not match: + self.fail("No '## full' section in build_stats.md") + + table = match.group(1) + rows = re.findall(r'^\| ([^|]+) \| (\d+) \|', table, re.MULTILINE) + return {repo.strip(): int(count) for repo, count in rows} + + +if __name__ == '__main__': + unittest.main()