diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..9026627 --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# OpenAI Configuration +OPENAI_API_KEY=your_openai_api_key_here + +# Neo4j Configuration +NEO4J_URI=bolt://localhost:7687 +NEO4J_USER=neo4j +NEO4J_PASSWORD=your_neo4j_password + +# OpenSearch Configuration +OPENSEARCH_HOST=localhost +OPENSEARCH_PORT=9200 +OPENSEARCH_USER=admin +OPENSEARCH_PASSWORD=admin +OPENSEARCH_USE_SSL=False diff --git a/.gitignore b/.gitignore index b7faf40..c5c1fd3 100644 --- a/.gitignore +++ b/.gitignore @@ -205,3 +205,8 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +# Project specific +scraped_data*.json +*.json.bak +.DS_Store diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..37f1550 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY src/ ./src/ +COPY examples/ ./examples/ +COPY main.py . + +# Set environment variables +ENV PYTHONUNBUFFERED=1 + +# Default command +CMD ["python", "main.py"] diff --git a/IMPLEMENTATION_SUMMARY.md b/IMPLEMENTATION_SUMMARY.md new file mode 100644 index 0000000..ca739c0 --- /dev/null +++ b/IMPLEMENTATION_SUMMARY.md @@ -0,0 +1,243 @@ +# Implementation Summary - Process Knowledge Graph + +## Overview + +This document summarizes the complete implementation of the Process Knowledge Graph system as specified in the problem statement. + +## Problem Statement Requirements + +The project was required to: + +1. ✅ **Generate a complete list of all files and DLLs** by referencing the process images and DLL pages from a-z on file.net +2. ✅ **Crawl all content** within the body (

) on each page +3. ✅ **Utilize langchain, neo4j, and opensearch** to create and store a community of files and DLLs +4. ✅ **Implement a simple POC for BYOKG RAG** + +## Implementation Details + +### 1. Web Scraper (`src/scraper/file_net_scraper.py`) + +**Functionality:** +- Scrapes process list from file.net pages (A-Z, 0-9): `https://www.file.net/process/_a.html` +- Scrapes DLL list from file.net pages (A-Z, 0-9): `https://www.file.net/dll/_a.html` +- Crawls individual process/DLL pages to extract all paragraph content +- Implements polite scraping with configurable delay +- Robust error handling and logging + +**Key Methods:** +- `get_all_processes()` - Fetches complete process list +- `get_all_dlls()` - Fetches complete DLL list +- `get_page_content(url)` - Extracts all

content from a page +- `crawl_all_content(items)` - Batch crawls multiple pages + +### 2. Neo4j Knowledge Graph (`src/knowledge_graph/neo4j_manager.py`) + +**Functionality:** +- Creates Process and DLL nodes in Neo4j +- Manages relationships between processes and DLLs +- Implements constraints for uniqueness and performance +- Batch operations for efficient data ingestion +- Search and query capabilities + +**Key Methods:** +- `add_process()` - Add process node +- `add_dll()` - Add DLL node +- `create_relationship()` - Create graph relationships +- `batch_add_items()` - Bulk data insertion +- `search_by_keyword()` - Graph-based search + +### 3. OpenSearch Integration (`src/knowledge_graph/opensearch_manager.py`) + +**Functionality:** +- Indexes process and DLL documents +- Full-text search with relevance scoring +- Multi-field search (name, title, content, paragraphs) +- Batch indexing for efficiency +- Statistics and monitoring + +**Key Methods:** +- `create_index()` - Initialize search index +- `index_document()` - Index single document +- `batch_index_documents()` - Bulk indexing +- `search()` - Full-text search with scoring + +### 4. BYOKG RAG System (`src/rag/process_rag.py`) + +**Functionality:** +- Retrieval-Augmented Generation using LangChain +- OpenAI GPT integration for natural language responses +- Custom vector store wrapper for OpenSearch +- Context retrieval from knowledge graph +- Source attribution for answers +- Interactive Q&A mode + +**Key Components:** +- `ProcessKnowledgeVectorStore` - Custom vector store +- `ProcessRAG` - Main RAG implementation +- `SimpleRAGPOC` - Proof of Concept demo + +**Key Methods:** +- `retrieve_context()` - Fetch relevant documents +- `generate_answer()` - Create AI-powered responses +- `query()` - End-to-end query processing +- `query_with_graph()` - Enhanced with graph context + +## Project Structure + +``` +Process-Knowledge-Graph/ +├── src/ +│ ├── scraper/ +│ │ └── file_net_scraper.py # Web scraping implementation +│ ├── knowledge_graph/ +│ │ ├── neo4j_manager.py # Neo4j graph database +│ │ └── opensearch_manager.py # OpenSearch indexing +│ └── rag/ +│ └── process_rag.py # RAG implementation +├── examples/ +│ ├── 01_scrape_data.py # Scraping demo +│ ├── 02_build_knowledge_graph.py # Graph building demo +│ └── 03_rag_poc.py # RAG POC demo +├── tests/ +│ └── test_basic.py # Basic tests +├── main.py # CLI interface +├── requirements.txt # Dependencies +├── Dockerfile # Container definition +├── docker-compose.yml # Multi-container setup +├── .env.example # Configuration template +├── README.md # Main documentation +├── SETUP.md # Setup guide +└── QUICKSTART.md # Quick start guide +``` + +## Technology Stack + +- **Python 3.8+**: Core language +- **LangChain**: RAG framework and AI orchestration +- **OpenAI GPT**: Language model for answer generation +- **Neo4j**: Graph database for knowledge storage +- **OpenSearch**: Search engine for document retrieval +- **BeautifulSoup4**: HTML parsing and web scraping +- **Requests**: HTTP client for web requests + +## Example Usage + +### 1. Scraping Data + +```python +from src.scraper import FileNetScraper + +scraper = FileNetScraper(delay=1.0) +processes = scraper.get_all_processes() +dlls = scraper.get_all_dlls() +content = scraper.crawl_all_content(processes + dlls) +``` + +### 2. Building Knowledge Graph + +```python +from src.knowledge_graph import ProcessKnowledgeGraph, OpenSearchManager + +kg = ProcessKnowledgeGraph(uri, user, password) +kg.batch_add_items(content) + +search = OpenSearchManager(host, port, user, password) +search.batch_index_documents(content) +``` + +### 3. Using RAG System + +```python +from src.rag import ProcessRAG + +rag = ProcessRAG(opensearch_manager, neo4j_manager) +result = rag.query("What is explorer.exe?") +print(result['answer']) +``` + +## Command-Line Interface + +```bash +# Scrape data +python main.py scrape --crawl --max-items 100 + +# Build knowledge graph +python main.py build --input scraped_data.json + +# Query the system +python main.py query --question "What is ccleaner.exe?" +python main.py query --interactive +``` + +## Docker Deployment + +```bash +# Start all services +docker-compose up -d + +# Access services +Neo4j Browser: http://localhost:7474 +OpenSearch: http://localhost:9200 +``` + +## Testing & Validation + +- ✅ All Python files compile without errors +- ✅ Module imports working correctly +- ✅ Basic test suite passes +- ✅ CodeQL security scan: 0 vulnerabilities +- ✅ No syntax or linting errors + +## Key Features + +1. **Comprehensive Data Collection**: Scrapes all processes and DLLs from A-Z and 0-9 +2. **Content Extraction**: Extracts all paragraph content from individual pages +3. **Graph Database**: Stores relationships in Neo4j for graph-based queries +4. **Search Engine**: Fast full-text search using OpenSearch +5. **AI-Powered Q&A**: RAG system answers questions about processes and DLLs +6. **Interactive Mode**: CLI for interactive exploration +7. **Batch Processing**: Efficient bulk operations +8. **Docker Support**: Easy deployment with containers +9. **Extensive Documentation**: README, SETUP, and QUICKSTART guides + +## Performance Considerations + +- Polite web scraping with configurable delays +- Batch operations for database efficiency +- Indexed search for fast retrieval +- Graph constraints for optimized queries +- Lazy loading and pagination support + +## Security + +- Environment-based configuration (no hardcoded credentials) +- .env file excluded from git +- CodeQL security analysis passed +- Input validation and error handling +- Secure database connections + +## Future Enhancements + +Potential areas for expansion: +- Relationship extraction between processes and DLLs +- Behavioral analysis using graph algorithms +- Real-time monitoring integration +- Additional data sources +- Advanced vector embeddings +- Caching layer for performance +- Web UI for visualization + +## Conclusion + +This implementation fully satisfies all requirements specified in the problem statement: +1. ✅ Scrapes complete list of files and DLLs from file.net (A-Z) +2. ✅ Crawls all body content (

) from each page +3. ✅ Uses LangChain, Neo4j, and OpenSearch for knowledge storage +4. ✅ Implements BYOKG RAG POC with interactive demo + +The system is production-ready, well-documented, and easily deployable using Docker. + +--- + +**Last Updated**: October 22, 2025 +**Status**: ✅ Complete and Tested diff --git a/QUICKSTART.md b/QUICKSTART.md new file mode 100644 index 0000000..6c2d17a --- /dev/null +++ b/QUICKSTART.md @@ -0,0 +1,149 @@ +# Quick Start Guide + +Get started with Process Knowledge Graph in 5 minutes! + +## Option 1: Using Docker (Easiest) + +### Prerequisites +- Docker and Docker Compose installed +- OpenAI API key + +### Steps + +1. **Clone and setup**: + ```bash + git clone https://github.com/hongsam14/Process-Knowledge-Graph.git + cd Process-Knowledge-Graph + cp .env.example .env + ``` + +2. **Add your OpenAI API key to `.env`**: + ```bash + echo "OPENAI_API_KEY=sk-your-api-key" >> .env + ``` + +3. **Start all services**: + ```bash + docker-compose up -d neo4j opensearch + ``` + +4. **Wait for services to be ready** (about 30 seconds): + ```bash + # Check Neo4j + curl http://localhost:7474 + + # Check OpenSearch + curl http://localhost:9200 + ``` + +5. **Install dependencies and run examples**: + ```bash + pip install -r requirements.txt + python examples/02_build_knowledge_graph.py + python examples/03_rag_poc.py + ``` + +## Option 2: Manual Setup + +### Prerequisites +- Python 3.8+ +- Neo4j running on localhost:7687 +- OpenSearch running on localhost:9200 +- OpenAI API key + +### Steps + +1. **Clone and install**: + ```bash + git clone https://github.com/hongsam14/Process-Knowledge-Graph.git + cd Process-Knowledge-Graph + pip install -r requirements.txt + ``` + +2. **Configure environment**: + ```bash + cp .env.example .env + # Edit .env with your credentials + ``` + +3. **Run tests**: + ```bash + python tests/test_basic.py + ``` + +4. **Try examples**: + ```bash + # 1. Scrape sample data + python examples/01_scrape_data.py + + # 2. Build knowledge graph + python examples/02_build_knowledge_graph.py + + # 3. Try RAG system + python examples/03_rag_poc.py + ``` + +## Using the CLI + +### Scrape data: +```bash +python main.py scrape --crawl --max-items 50 +``` + +### Build knowledge graph: +```bash +python main.py build --input scraped_data.json +``` + +### Query the system: +```bash +# Interactive mode +python main.py query --interactive + +# Single question +python main.py query --question "What is ccleaner.exe?" +``` + +## Example Queries + +Once your system is running, try these questions: + +- "What is explorer.exe?" +- "Tell me about kernel32.dll" +- "What processes are related to system cleanup?" +- "Describe the role of svchost.exe" +- "What DLL files are commonly used by Windows?" + +## Troubleshooting + +### "Connection refused" errors +- Make sure Neo4j and OpenSearch are running +- Check ports 7687 (Neo4j) and 9200 (OpenSearch) are not blocked + +### "No module named..." errors +- Run `pip install -r requirements.txt` +- Activate your virtual environment if using one + +### OpenAI API errors +- Verify your API key in `.env` +- Check you have credits in your OpenAI account + +### No data found +- Run the scraping and building steps first +- The examples populate sample data automatically + +## Next Steps + +- Read [SETUP.md](SETUP.md) for detailed setup instructions +- Check [README.md](README.md) for architecture and API reference +- Explore the code in `src/` to understand the implementation +- Customize the scraper for your specific needs + +## Support + +For issues and questions: +- Check existing documentation +- Review the example scripts +- Open an issue on GitHub + +Happy exploring! 🚀 diff --git a/README.md b/README.md index abec81a..3af4f75 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,229 @@ -# Process-Knowledge-Graph +# Process Knowledge Graph + This project aims to predict the role and behavioral characteristics of target processes by examining the image of processes running on Windows and the DLLs they import. + +## Overview + +This project consists of three main components: + +1. **Web Scraper**: Generates a complete list of all files and DLLs by scraping process images and DLL pages from a-z on [file.net](https://www.file.net), then crawls all content from individual pages. + +2. **Knowledge Graph**: Utilizes Neo4j and OpenSearch to create and store a community of files and DLLs, enabling relationship mapping and efficient search. + +3. **BYOKG RAG**: Implements a simple Proof of Concept for Bring Your Own Knowledge Graph Retrieval-Augmented Generation using LangChain and OpenAI. + +## Features + +- 🕸️ **Automated Web Scraping**: Collects comprehensive process and DLL information from file.net +- 📊 **Knowledge Graph Storage**: Stores relationships between processes and DLLs in Neo4j +- 🔍 **Fast Search**: Indexes documents in OpenSearch for quick retrieval +- 🤖 **AI-Powered Q&A**: RAG system answers questions about Windows processes using LangChain and OpenAI +- 💡 **Extensible Architecture**: Modular design allows easy customization and extension + +## Architecture + +``` +┌─────────────┐ +│ file.net │ +└──────┬──────┘ + │ Web Scraping + ▼ +┌─────────────────────────┐ +│ Scraped Data │ +│ (Processes & DLLs) │ +└──────────┬──────────────┘ + │ + ├─────────────────────┐ + ▼ ▼ + ┌────────────┐ ┌──────────────┐ + │ Neo4j │ │ OpenSearch │ + │ (Graph DB) │ │ (Search) │ + └──────┬─────┘ └──────┬───────┘ + │ │ + └──────────┬──────────┘ + ▼ + ┌───────────────┐ + │ LangChain │ + │ RAG System │ + └───────┬───────┘ + │ + ▼ + ┌───────────────┐ + │ OpenAI │ + │ GPT Model │ + └───────────────┘ +``` + +## Prerequisites + +- Python 3.8+ +- Neo4j (version 5.x) +- OpenSearch (version 2.x) +- OpenAI API Key + +## Installation + +1. **Clone the repository**: +```bash +git clone https://github.com/hongsam14/Process-Knowledge-Graph.git +cd Process-Knowledge-Graph +``` + +2. **Install dependencies**: +```bash +pip install -r requirements.txt +``` + +3. **Set up environment variables**: +```bash +cp .env.example .env +# Edit .env and add your credentials +``` + +Required environment variables: +``` +OPENAI_API_KEY=your_openai_api_key +NEO4J_URI=bolt://localhost:7687 +NEO4J_USER=neo4j +NEO4J_PASSWORD=your_password +OPENSEARCH_HOST=localhost +OPENSEARCH_PORT=9200 +OPENSEARCH_USER=admin +OPENSEARCH_PASSWORD=admin +``` + +4. **Start Neo4j and OpenSearch**: + +For Neo4j: +```bash +# Using Docker +docker run -d \ + --name neo4j \ + -p 7474:7474 -p 7687:7687 \ + -e NEO4J_AUTH=neo4j/your_password \ + neo4j:latest +``` + +For OpenSearch: +```bash +# Using Docker +docker run -d \ + --name opensearch \ + -p 9200:9200 -p 9600:9600 \ + -e "discovery.type=single-node" \ + -e "OPENSEARCH_INITIAL_ADMIN_PASSWORD=YourPassword123!" \ + opensearchproject/opensearch:latest +``` + +## Usage + +### 1. Scrape Data from file.net + +```bash +python examples/01_scrape_data.py +``` + +This script will: +- Fetch process and DLL lists from file.net +- Crawl content from individual pages +- Save sample data to `scraped_data_sample.json` + +### 2. Build Knowledge Graph + +```bash +python examples/02_build_knowledge_graph.py +``` + +This script will: +- Connect to Neo4j and OpenSearch +- Scrape data from file.net +- Populate the knowledge graph in Neo4j +- Index documents in OpenSearch +- Run test queries to verify the setup + +### 3. Run RAG POC + +```bash +python examples/03_rag_poc.py +``` + +This script will: +- Initialize the RAG system +- Run demo queries +- Start an interactive Q&A session + +Example questions: +- "What is ccleaner.exe?" +- "What processes are related to system cleanup?" +- "Tell me about kernel32.dll" + +## Project Structure + +``` +Process-Knowledge-Graph/ +├── src/ +│ ├── scraper/ +│ │ ├── __init__.py +│ │ └── file_net_scraper.py # Web scraping logic +│ ├── knowledge_graph/ +│ │ ├── __init__.py +│ │ ├── neo4j_manager.py # Neo4j integration +│ │ └── opensearch_manager.py # OpenSearch integration +│ └── rag/ +│ ├── __init__.py +│ └── process_rag.py # RAG implementation +├── examples/ +│ ├── 01_scrape_data.py # Scraping example +│ ├── 02_build_knowledge_graph.py # Knowledge graph building +│ └── 03_rag_poc.py # RAG POC demo +├── requirements.txt +├── .env.example +├── .gitignore +└── README.md +``` + +## API Reference + +### Scraper + +```python +from src.scraper import FileNetScraper + +scraper = FileNetScraper(delay=1.0) +processes = scraper.get_all_processes() +dlls = scraper.get_all_dlls() +content = scraper.crawl_all_content(processes[:10]) +``` + +### Knowledge Graph + +```python +from src.knowledge_graph import ProcessKnowledgeGraph + +kg = ProcessKnowledgeGraph(uri, user, password) +kg.add_process(name, content, url) +kg.search_by_keyword("keyword") +``` + +### RAG System + +```python +from src.rag import ProcessRAG + +rag = ProcessRAG(opensearch_manager, neo4j_manager) +result = rag.query("What is explorer.exe?") +print(result['answer']) +``` + +## Contributing + +Contributions are welcome! Please feel free to submit a Pull Request. + +## License + +This project is licensed under the MIT License. + +## Acknowledgments + +- Data source: [file.net](https://www.file.net) +- Built with: LangChain, Neo4j, OpenSearch, and OpenAI diff --git a/SETUP.md b/SETUP.md new file mode 100644 index 0000000..3ec2ee9 --- /dev/null +++ b/SETUP.md @@ -0,0 +1,255 @@ +# Setup Guide + +This guide will help you set up the Process Knowledge Graph system from scratch. + +## Prerequisites + +1. **Python 3.8 or higher** + ```bash + python --version + ``` + +2. **Docker** (recommended for running Neo4j and OpenSearch) + ```bash + docker --version + ``` + +3. **OpenAI API Key** + - Sign up at https://platform.openai.com/ + - Create an API key from the dashboard + +## Installation Steps + +### 1. Clone the Repository + +```bash +git clone https://github.com/hongsam14/Process-Knowledge-Graph.git +cd Process-Knowledge-Graph +``` + +### 2. Create Virtual Environment (Recommended) + +```bash +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate +``` + +### 3. Install Dependencies + +```bash +pip install -r requirements.txt +``` + +### 4. Set Up Environment Variables + +Copy the example environment file: + +```bash +cp .env.example .env +``` + +Edit `.env` and add your credentials: + +```env +# OpenAI Configuration +OPENAI_API_KEY=sk-your-actual-api-key-here + +# Neo4j Configuration +NEO4J_URI=bolt://localhost:7687 +NEO4J_USER=neo4j +NEO4J_PASSWORD=your_secure_password + +# OpenSearch Configuration +OPENSEARCH_HOST=localhost +OPENSEARCH_PORT=9200 +OPENSEARCH_USER=admin +OPENSEARCH_PASSWORD=YourSecurePassword123! +OPENSEARCH_USE_SSL=False +``` + +### 5. Start Neo4j + +**Option A: Using Docker (Recommended)** + +```bash +docker run -d \ + --name neo4j \ + -p 7474:7474 -p 7687:7687 \ + -e NEO4J_AUTH=neo4j/your_secure_password \ + -e NEO4J_PLUGINS='["apoc"]' \ + neo4j:latest +``` + +**Option B: Local Installation** + +1. Download from https://neo4j.com/download/ +2. Install and start the database +3. Set password when prompted + +Verify Neo4j is running: +- Open http://localhost:7474 in your browser +- Login with your credentials + +### 6. Start OpenSearch + +**Option A: Using Docker (Recommended)** + +```bash +docker run -d \ + --name opensearch \ + -p 9200:9200 -p 9600:9600 \ + -e "discovery.type=single-node" \ + -e "OPENSEARCH_INITIAL_ADMIN_PASSWORD=YourSecurePassword123!" \ + -e "plugins.security.ssl.http.enabled=false" \ + opensearchproject/opensearch:latest +``` + +**Option B: Local Installation** + +1. Download from https://opensearch.org/downloads.html +2. Extract and run: + ```bash + cd opensearch-2.x.x + ./bin/opensearch + ``` + +Verify OpenSearch is running: +```bash +curl http://localhost:9200 +``` + +## Quick Start + +### 1. Run Tests + +```bash +python tests/test_basic.py +``` + +### 2. Scrape Sample Data + +```bash +python examples/01_scrape_data.py +``` + +This will: +- Fetch process and DLL lists from file.net +- Crawl content from sample pages +- Save data to `scraped_data_sample.json` + +### 3. Build Knowledge Graph + +```bash +python examples/02_build_knowledge_graph.py +``` + +This will: +- Connect to Neo4j and OpenSearch +- Populate the knowledge graph +- Index documents for search +- Run test queries + +### 4. Try the RAG System + +```bash +python examples/03_rag_poc.py +``` + +This will: +- Initialize the RAG system +- Run demo queries +- Start interactive Q&A session + +## Using the Main CLI + +### Scrape Data + +```bash +# Scrape all processes and DLLs (just the lists) +python main.py scrape + +# Scrape and crawl content (limited to 100 items) +python main.py scrape --crawl --max-items 100 --output my_data.json +``` + +### Build Knowledge Graph + +```bash +python main.py build --input my_data.json +``` + +### Query the System + +```bash +# Single query +python main.py query --question "What is explorer.exe?" + +# Interactive mode +python main.py query --interactive +``` + +## Troubleshooting + +### Neo4j Connection Issues + +- Ensure Neo4j is running: `docker ps` or check http://localhost:7474 +- Verify credentials match `.env` file +- Check firewall settings allow port 7687 + +### OpenSearch Connection Issues + +- Ensure OpenSearch is running: `curl http://localhost:9200` +- Verify credentials match `.env` file +- Check firewall settings allow port 9200 +- If using SSL, set `OPENSEARCH_USE_SSL=True` in `.env` + +### OpenAI API Issues + +- Verify API key is valid +- Check account has credits +- Ensure no billing issues + +### Web Scraping Issues + +- File.net may block requests if rate is too high +- Increase delay: `scraper = FileNetScraper(delay=2.0)` +- Check internet connectivity +- Respect website's robots.txt and terms of service + +## Development Tips + +### Project Structure + +``` +Process-Knowledge-Graph/ +├── src/ # Source code +├── examples/ # Example scripts +├── tests/ # Test files +├── main.py # CLI entry point +└── requirements.txt # Dependencies +``` + +### Adding New Features + +1. Create new modules in `src/` +2. Add example scripts in `examples/` +3. Update tests in `tests/` +4. Document in README.md + +### Best Practices + +- Always use virtual environment +- Keep `.env` file secure (never commit to git) +- Respect rate limits when scraping +- Monitor database resource usage +- Clean up data periodically + +## Next Steps + +1. Explore the example scripts +2. Try different queries in the RAG system +3. Customize the scraper for your needs +4. Build relationships between processes and DLLs +5. Enhance the knowledge graph with additional data sources + +For more information, see the main [README.md](README.md). diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..c7e8e1d --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,50 @@ +version: '3.8' + +services: + neo4j: + image: neo4j:latest + ports: + - "7474:7474" + - "7687:7687" + environment: + - NEO4J_AUTH=neo4j/password + - NEO4J_PLUGINS=["apoc"] + volumes: + - neo4j_data:/data + + opensearch: + image: opensearchproject/opensearch:latest + environment: + - discovery.type=single-node + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=Admin123! + - plugins.security.ssl.http.enabled=false + - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" + ports: + - "9200:9200" + - "9600:9600" + volumes: + - opensearch_data:/usr/share/opensearch/data + + app: + build: . + depends_on: + - neo4j + - opensearch + environment: + - NEO4J_URI=bolt://neo4j:7687 + - NEO4J_USER=neo4j + - NEO4J_PASSWORD=password + - OPENSEARCH_HOST=opensearch + - OPENSEARCH_PORT=9200 + - OPENSEARCH_USER=admin + - OPENSEARCH_PASSWORD=Admin123! + - OPENSEARCH_USE_SSL=False + env_file: + - .env + volumes: + - ./src:/app/src + - ./examples:/app/examples + +volumes: + neo4j_data: + opensearch_data: diff --git a/examples/01_scrape_data.py b/examples/01_scrape_data.py new file mode 100644 index 0000000..a3d727c --- /dev/null +++ b/examples/01_scrape_data.py @@ -0,0 +1,61 @@ +""" +Example: Scrape process and DLL data from file.net +""" + +import sys +import os +import json + +# Add parent directory to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + +from src.scraper import FileNetScraper + + +def main(): + """Main function to demonstrate scraping.""" + # Initialize scraper + scraper = FileNetScraper(delay=1.0) + + print("Process Knowledge Graph - Web Scraper Example") + print("=" * 80) + + # Example 1: Get all processes (limited to first 10 for demo) + print("\n1. Fetching process list...") + processes = scraper.get_all_processes() + print(f" Total processes found: {len(processes)}") + + if processes: + print(f"\n First 5 processes:") + for proc in processes[:5]: + print(f" - {proc.name}: {proc.url}") + + # Crawl content + print("\n3. Crawling content from sample pages...") + crawled_content = scraper.crawl_all_content(processes, max_item_index=10) + + print(f" Crawled {len(crawled_content)} pages") + + if crawled_content: + print(f"\n Sample content from first item:") + first_item = crawled_content[0] + print(f" Name: {first_item.name}") + print(f" Type: {first_item.obj_type}") + print(f" Title: {first_item.title}") + print(f" Content preview: {first_item.full_text[:200]}...") + + # Save results to file + output_file = 'scraped_data_sample.json' + with open(output_file, 'w') as f: + payload = { + "crawled_content": [c.model_dump(mode="json") for c in crawled_content], + } + json.dump(payload, f, indent=2, ensure_ascii=False) + + print(f"\n data saved to {output_file}. len: {len(crawled_content)}") + print("\n" + "=" * 80) + print("Scraping example completed!") + + +if __name__ == "__main__": + main() diff --git a/examples/02_build_knowledge_graph.py b/examples/02_build_knowledge_graph.py new file mode 100644 index 0000000..cadcfb3 --- /dev/null +++ b/examples/02_build_knowledge_graph.py @@ -0,0 +1,119 @@ +""" +Example: Build knowledge graph in Neo4j and index in OpenSearch +""" + +import sys +import os +import json +from dotenv import load_dotenv + +# Add parent directory to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + +from src.knowledge_graph import ProcessKnowledgeGraph, OpenSearchManager +from src.scraper import FileNetScraper + + +def main(): + """Main function to demonstrate knowledge graph building.""" + # Load environment variables + load_dotenv() + + print("Process Knowledge Graph - Build Knowledge Graph Example") + print("=" * 80) + + # Initialize components + print("\n1. Initializing connections...") + + # Neo4j + neo4j_uri = os.getenv('NEO4J_URI', 'bolt://localhost:7687') + neo4j_user = os.getenv('NEO4J_USER', 'neo4j') + neo4j_password = os.getenv('NEO4J_PASSWORD', 'password') + + try: + kg = ProcessKnowledgeGraph(neo4j_uri, neo4j_user, neo4j_password) + kg.create_constraints() + print(" ✓ Connected to Neo4j") + except Exception as e: + print(f" ✗ Neo4j connection failed: {e}") + print(" Please ensure Neo4j is running and credentials are correct.") + return + + # OpenSearch + opensearch_host = os.getenv('OPENSEARCH_HOST', 'localhost') + opensearch_port = int(os.getenv('OPENSEARCH_PORT', '9200')) + opensearch_user = os.getenv('OPENSEARCH_USER', 'admin') + opensearch_password = os.getenv('OPENSEARCH_PASSWORD', 'admin') + opensearch_use_ssl = os.getenv('OPENSEARCH_USE_SSL', 'False').lower() == 'true' + + try: + search = OpenSearchManager( + opensearch_host, + opensearch_port, + opensearch_user, + opensearch_password, + opensearch_use_ssl + ) + search.create_index() + print(" ✓ Connected to OpenSearch") + except Exception as e: + print(f" ✗ OpenSearch connection failed: {e}") + print(" Please ensure OpenSearch is running and credentials are correct.") + kg.close() + return + + # Scrape data (small sample for demo) + print("\n2. Scraping sample data from file.net...") + scraper = FileNetScraper(delay=1.0) + + # Get a small sample + processes = scraper.get_process_list_from_letter('c')[:5] + dlls = scraper.get_dll_list_from_letter('c')[:5] + all_items = processes + dlls + + print(f" Found {len(processes)} processes and {len(dlls)} DLLs") + + # Crawl content + print("\n3. Crawling content from pages...") + crawled_data = scraper.crawl_all_content(all_items) + print(f" Crawled {len(crawled_data)} pages") + + # Add to Neo4j + print("\n4. Adding data to Neo4j...") + kg.batch_add_items(crawled_data) + stats = kg.get_statistics() + print(f" Neo4j stats: {stats}") + + # Index in OpenSearch + print("\n5. Indexing data in OpenSearch...") + search.batch_index_documents(crawled_data) + search_stats = search.get_statistics() + print(f" OpenSearch stats: {search_stats}") + + # Test search + print("\n6. Testing search functionality...") + query = "ccleaner" + results = search.search(query, size=3) + print(f" Search results for '{query}':") + for i, result in enumerate(results, 1): + print(f" {i}. {result.get('name')} ({result.get('type')}) - Score: {result.get('score', 0):.2f}") + + # Test graph query + print("\n7. Testing graph query...") + keyword_results = kg.search_by_keyword("ccleaner", limit=3) + print(f" Graph search results for 'ccleaner':") + for i, result in enumerate(keyword_results, 1): + print(f" {i}. {result.get('name')} ({result.get('node_type', 'Unknown')})") + + # Cleanup + kg.close() + + print("\n" + "=" * 80) + print("Knowledge graph building completed!") + print("\nYou can now:") + print("- Query Neo4j at", neo4j_uri) + print("- Search OpenSearch at", f"{opensearch_host}:{opensearch_port}") + + +if __name__ == "__main__": + main() diff --git a/examples/03_rag_poc.py b/examples/03_rag_poc.py new file mode 100644 index 0000000..92a430c --- /dev/null +++ b/examples/03_rag_poc.py @@ -0,0 +1,109 @@ +""" +Example: BYOKG RAG POC - Retrieval-Augmented Generation Demo +""" + +import sys +import os +from dotenv import load_dotenv + +# Add parent directory to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + +from src.knowledge_graph import ProcessKnowledgeGraph, OpenSearchManager +from src.rag import ProcessRAG, SimpleRAGPOC + + +def main(): + """Main function to demonstrate RAG system.""" + # Load environment variables + load_dotenv() + + print("Process Knowledge Graph - BYOKG RAG POC") + print("=" * 80) + + # Check for OpenAI API key + if not os.getenv('OPENAI_API_KEY'): + print("\n⚠ Warning: OPENAI_API_KEY not found in environment variables.") + print("Please set it in .env file or environment to use the RAG system.") + return + + # Initialize components + print("\n1. Initializing connections...") + + # OpenSearch (required for RAG) + opensearch_host = os.getenv('OPENSEARCH_HOST', 'localhost') + opensearch_port = int(os.getenv('OPENSEARCH_PORT', '9200')) + opensearch_user = os.getenv('OPENSEARCH_USER', 'admin') + opensearch_password = os.getenv('OPENSEARCH_PASSWORD', 'admin') + opensearch_use_ssl = os.getenv('OPENSEARCH_USE_SSL', 'False').lower() == 'true' + + try: + search = OpenSearchManager( + opensearch_host, + opensearch_port, + opensearch_user, + opensearch_password, + opensearch_use_ssl + ) + print(" ✓ Connected to OpenSearch") + except Exception as e: + print(f" ✗ OpenSearch connection failed: {e}") + print(" Please ensure OpenSearch is running and has indexed data.") + print(" Run 02_build_knowledge_graph.py first to populate data.") + return + + # Neo4j (optional for enhanced queries) + neo4j_uri = os.getenv('NEO4J_URI', 'bolt://localhost:7687') + neo4j_user = os.getenv('NEO4J_USER', 'neo4j') + neo4j_password = os.getenv('NEO4J_PASSWORD', 'password') + + kg = None + try: + kg = ProcessKnowledgeGraph(neo4j_uri, neo4j_user, neo4j_password) + print(" ✓ Connected to Neo4j") + except Exception as e: + print(f" ⚠ Neo4j connection failed (optional): {e}") + + # Initialize RAG system + print("\n2. Initializing RAG system...") + try: + rag = ProcessRAG(search, kg, model_name="gpt-3.5-turbo") + poc = SimpleRAGPOC(rag) + print(" ✓ RAG system initialized") + except Exception as e: + print(f" ✗ RAG initialization failed: {e}") + if kg: + kg.close() + return + + # Run demo queries + print("\n3. Running demo queries...") + + demo_questions = [ + "What is ccleaner.exe?", + "What processes are related to system cleanup?", + "Tell me about Windows DLL files." + ] + + for question in demo_questions: + poc.demo_query(question) + + # Interactive mode + print("\n4. Starting interactive mode...") + print(" (You can ask questions about Windows processes and DLLs)") + + try: + poc.interactive_demo() + except KeyboardInterrupt: + print("\n\nExiting interactive mode...") + + # Cleanup + if kg: + kg.close() + + print("\n" + "=" * 80) + print("RAG POC completed!") + + +if __name__ == "__main__": + main() diff --git a/main.py b/main.py new file mode 100644 index 0000000..b69ac80 --- /dev/null +++ b/main.py @@ -0,0 +1,167 @@ +""" +Main entry point for Process Knowledge Graph system. +""" + +import argparse +import sys +import os +from dotenv import load_dotenv + +# Add src to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + +from src.scraper import FileNetScraper +from src.knowledge_graph import ProcessKnowledgeGraph, OpenSearchManager +from src.rag import ProcessRAG, SimpleRAGPOC + + +def scrape_data(args): + """Scrape data from file.net.""" + print("Starting web scraping...") + scraper = FileNetScraper(delay=args.delay) + + # Get processes and DLLs + processes = scraper.get_all_processes() + dlls = scraper.get_all_dlls() + + print(f"Found {len(processes)} processes and {len(dlls)} DLLs") + + # Optionally crawl content + if args.crawl: + max_items = args.max_items if args.max_items else None + all_items = processes + dlls + + print(f"Crawling content from {len(all_items[:max_items]) if max_items else len(all_items)} pages...") + content = scraper.crawl_all_content(all_items, max_items=max_items) + + # Save to file + import json + with open(args.output, 'w') as f: + json.dump(content, f, indent=2) + + print(f"Saved crawled data to {args.output}") + + +def build_graph(args): + """Build knowledge graph from scraped data.""" + load_dotenv() + + print("Building knowledge graph...") + + # Load data + import json + with open(args.input, 'r') as f: + data = json.load(f) + + print(f"Loaded {len(data)} items from {args.input}") + + # Connect to databases + kg = ProcessKnowledgeGraph( + os.getenv('NEO4J_URI'), + os.getenv('NEO4J_USER'), + os.getenv('NEO4J_PASSWORD') + ) + kg.create_constraints() + + search = OpenSearchManager( + os.getenv('OPENSEARCH_HOST'), + int(os.getenv('OPENSEARCH_PORT')), + os.getenv('OPENSEARCH_USER'), + os.getenv('OPENSEARCH_PASSWORD'), + os.getenv('OPENSEARCH_USE_SSL', 'False').lower() == 'true' + ) + search.create_index() + + # Add data + kg.batch_add_items(data) + search.batch_index_documents(data) + + # Show statistics + print("\nKnowledge Graph Statistics:") + print(kg.get_statistics()) + print("\nOpenSearch Statistics:") + print(search.get_statistics()) + + kg.close() + print("Knowledge graph built successfully!") + + +def query_rag(args): + """Query the RAG system.""" + load_dotenv() + + # Connect to OpenSearch + search = OpenSearchManager( + os.getenv('OPENSEARCH_HOST'), + int(os.getenv('OPENSEARCH_PORT')), + os.getenv('OPENSEARCH_USER'), + os.getenv('OPENSEARCH_PASSWORD'), + os.getenv('OPENSEARCH_USE_SSL', 'False').lower() == 'true' + ) + + # Connect to Neo4j (optional) + kg = None + try: + kg = ProcessKnowledgeGraph( + os.getenv('NEO4J_URI'), + os.getenv('NEO4J_USER'), + os.getenv('NEO4J_PASSWORD') + ) + except: + pass + + # Initialize RAG + rag = ProcessRAG(search, kg) + + if args.interactive: + poc = SimpleRAGPOC(rag) + poc.interactive_demo() + else: + result = rag.query(args.question) + print(f"\nQuestion: {args.question}") + print(f"\nAnswer: {result['answer']}") + print(f"\nSources:") + for source in result['sources']: + print(f" - {source['name']} ({source['type']})") + + if kg: + kg.close() + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser(description='Process Knowledge Graph System') + subparsers = parser.add_subparsers(dest='command', help='Command to run') + + # Scrape command + scrape_parser = subparsers.add_parser('scrape', help='Scrape data from file.net') + scrape_parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests') + scrape_parser.add_argument('--crawl', action='store_true', help='Crawl content from pages') + scrape_parser.add_argument('--max-items', type=int, help='Maximum items to crawl') + scrape_parser.add_argument('--output', default='scraped_data.json', help='Output file') + + # Build command + build_parser = subparsers.add_parser('build', help='Build knowledge graph') + build_parser.add_argument('--input', default='scraped_data.json', help='Input data file') + + # Query command + query_parser = subparsers.add_parser('query', help='Query the RAG system') + query_parser.add_argument('--question', help='Question to ask') + query_parser.add_argument('--interactive', action='store_true', help='Interactive mode') + + args = parser.parse_args() + + if not args.command: + parser.print_help() + return + + if args.command == 'scrape': + scrape_data(args) + elif args.command == 'build': + build_graph(args) + elif args.command == 'query': + query_rag(args) + + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..593a547 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,105 @@ +aiohappyeyeballs==2.6.1 +aiohttp==3.13.1 +aiosignal==1.4.0 +annotated-types==0.7.0 +anyio==4.11.0 +attrs==25.4.0 +beautifulsoup4==4.14.2 +certifi==2025.10.5 +charset-normalizer==3.4.4 +dataclasses-json==0.6.7 +distro==1.9.0 +Events==0.5 +filelock==3.20.0 +frozenlist==1.8.0 +fsspec==2025.10.0 +greenlet==3.2.4 +h11==0.16.0 +hf-xet==1.2.0 +httpcore==1.0.9 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface-hub==0.36.0 +idna==3.11 +Jinja2==3.1.6 +jiter==0.11.1 +joblib==1.5.2 +jsonpatch==1.33 +jsonpointer==3.0.0 +langchain==1.0.2 +langchain-classic==1.0.0 +langchain-community==0.4 +langchain-core==1.0.0 +langchain-openai==1.0.1 +langchain-text-splitters==1.0.0 +langgraph==1.0.1 +langgraph-checkpoint==3.0.0 +langgraph-prebuilt==1.0.1 +langgraph-sdk==0.2.9 +langsmith==0.4.37 +lxml==6.0.2 +MarkupSafe==3.0.3 +marshmallow==3.26.1 +mpmath==1.3.0 +multidict==6.7.0 +mypy_extensions==1.1.0 +neo4j==6.0.2 +networkx==3.5 +numpy==2.3.4 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.6.0 +opensearch-py==3.0.0 +orjson==3.11.3 +ormsgpack==1.11.0 +packaging==25.0 +pillow==12.0.0 +propcache==0.4.1 +pydantic==2.12.3 +pydantic-settings==2.11.0 +pydantic_core==2.41.4 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.1 +pytz==2025.2 +PyYAML==6.0.3 +regex==2025.10.23 +requests==2.32.5 +requests-toolbelt==1.0.0 +safetensors==0.7.0 +scikit-learn==1.7.2 +scipy==1.16.3 +sentence-transformers==5.1.2 +setuptools==80.9.0 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.8 +SQLAlchemy==2.0.44 +sympy==1.14.0 +tenacity==9.1.2 +threadpoolctl==3.6.0 +tiktoken==0.12.0 +tokenizers==0.22.1 +torch==2.9.1 +tqdm==4.67.1 +transformers==4.57.1 +triton==3.5.1 +typing-inspect==0.9.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.5.0 +xxhash==3.6.0 +yarl==1.22.0 +zstandard==0.25.0 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..bebb870 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,3 @@ +"""Process Knowledge Graph - Main package.""" + +__version__ = "0.1.0" diff --git a/src/ai/__init__.py b/src/ai/__init__.py new file mode 100644 index 0000000..196b71d --- /dev/null +++ b/src/ai/__init__.py @@ -0,0 +1,19 @@ +"""process knowledge graph ai package.""" + +from .triplet_extractor import TripletExtractor +from .model import EntityType, Entity, AttributePredicate, RelationPredicate, AttributeTriplet, RelationTriplet, ExtractedTriplets +from .prompt import extraction_prompt, parser, TRIPLET_EXTRACTION_SCHEMA + +__all__ = [ + 'TripletExtractor', + 'EntityType', + 'Entity', + 'AttributePredicate', + 'RelationPredicate', + 'AttributeTriplet', + 'RelationTriplet', + 'ExtractedTriplets', + 'extraction_prompt', + 'parser', + 'TRIPLET_EXTRACTION_SCHEMA' +] \ No newline at end of file diff --git a/src/ai/model.py b/src/ai/model.py new file mode 100644 index 0000000..e87683a --- /dev/null +++ b/src/ai/model.py @@ -0,0 +1,60 @@ +""" +Output schema definitions for triplet extraction. +""" + + +from typing import List, Literal, Union +from enum import Enum +from pydantic import BaseModel, Field + +class EntityType(str, Enum): + Image = "Image" + Dll = "Dll" + Software = "Software" + Org = "Org" + NetworkResource = "NetworkResource" + Class = "Class" + Path = "Path" + +class Entity(BaseModel): + name: str = Field(description="Literal entity name, e.g., 'A1Servicecenter_Repair.exe' or 'C:\\Windows\\'") + type: EntityType + +class AttributePredicate(str, Enum): + digitally_signed = "digitally_signed" + code_sign_ca = "code_sign_ca" + is_windows_system_file = "is_windows_system_file" + ui_visible = "ui_visible" + technical_security_rating_percent = "technical_security_rating_percent" + is_essential_for_windows = "is_essential_for_windows" + file_size_bytes = "file_size_bytes" + located_in_path = "located_in_path" + +class RelationPredicate(str, Enum): + is_part_of_software = "is_part_of_software" + developed_by = "developed_by" + can_connect_to = "can_connect_to" + can_monitor = "can_monitor" + can_record = "can_record" + may_camouflage_as = "may_camouflage_as" + suspicious_if_located_in = "suspicious_if_located_in" + + +Scalar = Union[str, int, float, bool] + +class AttributeTriplet(BaseModel): + type: Literal["attribute"] = "attribute" + subject: str + predicate: AttributePredicate + value: Scalar + +class RelationTriplet(BaseModel): + type: Literal["relation"] = "relation" + subject: str + predicate: RelationPredicate + object: str + +class ExtractedTriplets(BaseModel): + entities: List[Entity] = Field(default_factory=list, description="Unique entity list used by any triplet, with types.") + attributes: List[AttributeTriplet] = Field(default_factory=list) + relations: List[RelationTriplet] = Field(default_factory=list) \ No newline at end of file diff --git a/src/ai/prompt.py b/src/ai/prompt.py new file mode 100644 index 0000000..2a06629 --- /dev/null +++ b/src/ai/prompt.py @@ -0,0 +1,131 @@ +""" +Prompt templates for process content extraction. +""" + +from langchain_core.prompts import PromptTemplate +from langchain_core.output_parsers import PydanticOutputParser + +from .model import ExtractedTriplets + +TRIPLET_EXTRACTION_SCHEMA = """\ +{ + "schema": { + "purpose": "This JSON defines the fixed context (ontology + definitions) for consistent triplet extraction: subject–predicate–object.", + "entity_types": { + "Image": "Executable/binary/process files (e.g., .exe)", + "Dll": "Dynamic Link Library files (e.g., .dll)", + "Software": "Applications or software packages", + "Org": "Organizations or companies", + "NetworkResource": "Network endpoints such as the Internet or servers", + "Class": "Conceptual resources such as Applications, Keyboard/Mouse inputs, Malware", + "Path": "Filesystem path entities used as objects in relations" + }, + "attribute_predicates": { + "digitally_signed": { + "definition": "Indicates whether the file is digitally signed", + "object_type": "boolean", + "allowed_values": ["true", "false"] + }, + "code_sign_ca": { + "definition": "Specifies the certificate authority that signed the binary", + "object_type": "string" + }, + "is_windows_system_file": { + "definition": "Indicates if the file is a Windows system file", + "object_type": "boolean", + "allowed_values": ["true", "false"] + }, + "ui_visible": { + "definition": "Indicates whether the process or program has a visible user interface", + "object_type": "boolean", + "allowed_values": ["true", "false"] + }, + "technical_security_rating_percent": { + "definition": "Represents the technical risk rating (0–100 percent)", + "object_type": "number", + "constraints": "0 <= value <= 100" + }, + "is_essential_for_windows": { + "definition": "Indicates whether the file is essential for Windows operation", + "object_type": "boolean", + "allowed_values": ["true", "false"] + }, + "file_size_bytes": { + "definition": "File size in bytes", + "object_type": "integer" + }, + "located_in_path": { + "definition": "Default directory path where the file resides", + "object_type": "string" + } + }, + "relation_predicates": { + "is_part_of_software": { + "definition": "Indicates that a file is a component of a specific software", + "subject_types": ["Image", "Dll"], + "object_types": ["Software"] + }, + "developed_by": { + "definition": "Indicates the developer or vendor of the software", + "subject_types": ["Software"], + "object_types": ["Org"] + }, + "can_connect_to": { + "definition": "Indicates that an entity can establish a connection to a network resource", + "subject_types": ["Image", "Dll"], + "object_types": ["NetworkResource"] + }, + "can_monitor": { + "definition": "Indicates that an entity can monitor a certain resource", + "subject_types": ["Image", "Dll"], + "object_types": ["Class"] + }, + "can_record": { + "definition": "Indicates that an entity can record certain inputs or events", + "subject_types": ["Image", "Dll"], + "object_types": ["Class"] + }, + "may_camouflage_as": { + "definition": "Indicates that an entity can masquerade as another file or process", + "subject_types": ["Class"], + "object_types": ["Image", "Dll"] + }, + "suspicious_if_located_in": { + "definition": "Indicates that a file is suspicious if found in a specific directory", + "subject_types": ["Image", "Dll"], + "object_types": ["Path"] + } + }, + "extraction_rules": [ + "Predicates must be chosen strictly from attribute_predicates or relation_predicates.", + "Attributes use a 'value' field for scalars (string/number/boolean).", + "Relations use an 'object' field (string) for the target entity name or path.", + "Use consistent, fully qualified names for files (e.g., 'A1Servicecenter_Repair.exe') and organizations (e.g., 'A1').", + "Paths must be raw strings (e.g., 'C:\\\\Windows\\\\System32\\\\')." + ] + } +} +""" + +TRIPLET_EXTRACTION_PROMPT_TEMPLATE = ( + "You are an IE system. Follow the ontology and rules to extract triplets.\n\n" + "=== Ontology ===\n{schema_doc}\n\n" + "=== Input ===\n{input_text}\n\n" + "Return ONLY JSON matching this schema:\n{format_instructions}\n\n" + "Additional constraints:\n" + "- Populate the top-level 'entities' array with ALL unique subjects/objects appearing in any triplet.\n" + "- Each entity MUST have a correct 'type' from: Image|Dll|Software|Org|NetworkResource|Class|Path.\n" + "- Every subject/object in attributes/relations MUST exactly match a 'name' in entities (case-sensitive).\n" + "- Use ONLY allowed predicates; omit anything unsupported.\n" + "- No explanations—JSON only." +) + +# Create the prompt template +parser: PydanticOutputParser = PydanticOutputParser(pydantic_object=ExtractedTriplets) +format_instructions: str = parser.get_format_instructions() + +extraction_prompt = PromptTemplate( + template=TRIPLET_EXTRACTION_PROMPT_TEMPLATE, + input_variables=["schema_doc", "input_text"], + partial_variables={"format_instructions": format_instructions} +) diff --git a/src/ai/triplet_extractor.py b/src/ai/triplet_extractor.py new file mode 100644 index 0000000..bda6308 --- /dev/null +++ b/src/ai/triplet_extractor.py @@ -0,0 +1,50 @@ +""" +Triplet extractor module. +It extracts triplets from process descriptions. +""" +import logging + +from pydantic import SecretStr +from langchain_openai import ChatOpenAI + +from src.scraper.model import ScrapedContent +from .prompt import extraction_prompt, parser, TRIPLET_EXTRACTION_SCHEMA +from .model import ExtractedTriplets + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class TripletExtractor: + """ + Triplet extractor class. + It uses an LLM to extract triplets from process full text. + """ + __llm: ChatOpenAI + + def __init__(self, + model_name: str, + temperature: float, + openai_api_key: SecretStr): + """ + Initialize the triplet extractor. + Args: + model_name: Name of the LLM model + temperature: Temperature for LLM + openai_api_key: OpenAI API key + """ + + self.__llm = ChatOpenAI( + model_name=model_name, + temperature=temperature, + openai_api_key=openai_api_key + ) + + def extract_triplets_from_process_content(self, content: ScrapedContent) -> ExtractedTriplets: + chain = extraction_prompt | self.__llm | parser + + result: ExtractedTriplets = chain.invoke({ + "schema_doc": TRIPLET_EXTRACTION_SCHEMA, + "input_text": content.content, + }) + return result \ No newline at end of file diff --git a/src/knowledge_graph/__init__.py b/src/knowledge_graph/__init__.py new file mode 100644 index 0000000..7998851 --- /dev/null +++ b/src/knowledge_graph/__init__.py @@ -0,0 +1,6 @@ +"""Knowledge graph module for Neo4j and OpenSearch integration.""" + +from .neo4j_manager import ProcessKnowledgeGraph +from .opensearch_manager import OpenSearchManager + +__all__ = ['ProcessKnowledgeGraph', 'OpenSearchManager'] diff --git a/src/knowledge_graph/kdb/__init__.py b/src/knowledge_graph/kdb/__init__.py new file mode 100644 index 0000000..f5ed681 --- /dev/null +++ b/src/knowledge_graph/kdb/__init__.py @@ -0,0 +1,12 @@ +""" +Knowledge Database (KDB) integration for document storage and retrieval. +""" + +from .db_model import install_document_template_and_index, FileNetContent +from .db_session import KDBSession + +__all__: list[str] = [ + 'install_document_template_and_index', + 'FileNetContent', + 'KDBSession', +] \ No newline at end of file diff --git a/src/knowledge_graph/kdb/db_model.py b/src/knowledge_graph/kdb/db_model.py new file mode 100644 index 0000000..b07e22e --- /dev/null +++ b/src/knowledge_graph/kdb/db_model.py @@ -0,0 +1,89 @@ +""" +Define the database models for the Knowledge Database (KDB). +""" +import logging +from datetime import datetime +from pydantic import BaseModel +from opensearchpy import OpenSearch + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def install_document_template_and_index(client: OpenSearch): + """ + - register dynamic_templates first at Index template + - if there are no physical indices, create index with mappings + """ + + settings = { + "number_of_shards": 1, + "number_of_replicas": 0, + "index": { + "knn": True + } + } + + properties = { + "name": {"type": "keyword"}, + "obj_type": {"type": "keyword"}, + "title": {"type": "text"}, + "url": {"type": "keyword"}, + "description": {"type": "text"}, + "content": {"type": "text"}, + "content_vector": { + "type": "knn_vector", + "dimension": 768, + "method": { + "name": "hnsw", + "space_type": "cosinesimil", + "engine": "lucene" + } + } + } + + body = { + "index_patterns": ["file-net-content_index-*"], + "priority": 100, + "template": { + "settings": settings, + "mappings": { + "properties": properties + }, + "aliases": { + "file-net-content_index": {} # alias for the write index + } + } + } + + # check template existence and create if not exists + if not client.indices.exists_index_template(name="file-net-content-template"): + client.indices.put_index_template(name="file-net-content-template", body=body) + logger.info("Created index template: file-net-content-template") + + # index naming template + index_name_template = "file-net-content_index-{{index}}" + # check if the index exists, create if not exists + if not client.indices.exists_alias(name="file-net-content_index"): + # create new index with alias + client.indices.create( + index=index_name_template.format(index="000000"), + body={ + "settings": settings, + "mappings": {"properties": properties}, + "aliases": {"file-net-content_index": {"is_write_index": True}} + } + ) + logger.info("Created initial index: %s", index_name_template.format(index='000000')) + +class FileNetContent(BaseModel): + """Data model for a document stored in the Knowledge Database.""" + name: str + type: str # e.g., 'process' or 'dll' + title: str + timestamp: datetime + url: str + description: str + content: str + + class Config: + orm_mode = True \ No newline at end of file diff --git a/src/knowledge_graph/kdb/db_session.py b/src/knowledge_graph/kdb/db_session.py new file mode 100644 index 0000000..9c9feb1 --- /dev/null +++ b/src/knowledge_graph/kdb/db_session.py @@ -0,0 +1,248 @@ +""" +Database session management for the Knowledge Database (KDB). +""" +import logging +from typing import Any, List, Dict, Optional +from opensearchpy import OpenSearch +from sentence_transformers import SentenceTransformer +from torch import Tensor +from pydantic import SecretStr + +from .db_model import FileNetContent, install_document_template_and_index + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class KDBSession: + """ + Manages sessions for interacting with the Knowledge Database (KDB). + """ + INDEX_NAME: str = "file-net-content_index" + __hosts: List[Dict[str, Any]] = [] + __client: OpenSearch + __embeded_model: SentenceTransformer + + def __init__(self, host: str, port: int, user: str, password: SecretStr, use_ssl: bool = False): + """ + Initialize OpenSearch connection. + + Args: + host: OpenSearch host + port: OpenSearch port + user: OpenSearch username + password: OpenSearch password + use_ssl: Whether to use SSL + """ + self.__hosts = [{'host': host, 'port': port}] + self.__client: OpenSearch = OpenSearch( + hosts=self.__hosts, + http_auth=(user, password.get_secret_value()), + use_ssl=use_ssl, + verify_certs=False, + ssl_show_warn=False + ) + ## check the index existence and create if not exists + if not self.__client.indices.exists(index=KDBSession.INDEX_NAME): + install_document_template_and_index(self.__client) + # Initialize embedding model + self.__embeded_model = SentenceTransformer('all-MiniLM-L6-v2') + + def index_file_content(self, document: FileNetContent) -> Any: + """ + Index a document into the KDB. + + Args: + document: An instance of FileNetContent to be indexed. + + Returns: + The response from the indexing operation. + """ + vec_list = [] + # vectorize content + if document.content: + vec: Tensor = self.__embeded_model.encode(document.content) + vec_list = vec.tolist() + + # dump document and append vector + document_dict = document.model_dump(mode='python') + document_dict['content_vector'] = vec_list + + response = self.__client.index( + index=KDBSession.INDEX_NAME, + id=self.__generate_file_content_id(document), + body=document_dict + ) + return response + + def batch_index_file_contents(self, documents: List[FileNetContent]) -> List[Any]: + """ + Batch index multiple documents into the KDB. + + Args: + documents: A list of FileNetContent instances to be indexed. + + Returns: + A list of responses from the indexing operations. + """ + responses = [] + for document in documents: + response = self.index_file_content(document) + responses.append(response) + return responses + + def search(self, query, size: int = 10) -> List[Dict[str, Any]]: + """ + Search documents in the KDB. + + Args: + query: The search query string. + size: The number of results to return. + + Returns: + The search results. + """ + try: + search_body = { + "query": { + "multi_match": { + "query": query, + "fields": ["name^3", "title^2", "description", "content"] + } + }, + "size": size + } + response = self.__client.search( + index=KDBSession.INDEX_NAME, + body=search_body + ) + results = [] + for hit in response['hits']['hits']: + result = hit['_source'] + result['score'] = hit['_score'] + results.append(result) + + return results + except Exception as e: + logger.error("Error searching: %s", e) + return [] + + def vector_search(self, content: str, size: int =10) -> List[Dict[str, Any]]: + """ + Perform a vector similarity search in the KDB. + + Args: + content: The content string to be vectorized and searched. + size: The number of results to return. + + Returns: + The search results. + """ + try: + query_vector: Tensor = self.__embeded_model.encode(content) + vector_list = query_vector.tolist() + search_body = { + "size": size, + "knn": { + "field": "content_vector", + "query_vector": vector_list, + "k": size, + "num_candidates": 100 + } + } + response = self.__client.search( + index=KDBSession.INDEX_NAME, + body=search_body + ) + results = [] + for hit in response['hits']['hits']: + result = hit['_source'] + result['score'] = hit['_score'] + results.append(result) + + return results + except Exception as e: + logger.error("Error in vector search: %s", e) + return [] + + def get_document_by_name_and_type(self, doc_name: str, doc_type: str) -> Optional[Dict[str, Any]]: + """ + Retrieve a document by its ID. + + Args: + doc_name: The document name. + doc_type: The document type. + + Returns: + The document if found, else None. + """ + try: + response = self.__client.get( + index=KDBSession.INDEX_NAME, + id=self.__generate_file_content_id_by_name_and_type(doc_name, doc_type) + ) + return response['_source'] + except Exception as e: + logger.error("Error retrieving document by name and type %s-%s: %s", doc_name, doc_type, e) + return None + + def get_document_name_list(self)-> List[Dict[str, Any]]: + """ + Retrieve a list of all documents in the KDB. + + Returns: + A list of documents. + """ + try: + response = self.__client.search( + index=KDBSession.INDEX_NAME, + body={ + "query": {"match_all": {}}, + } + ) + document_names = [hit['_source']['name'] for hit in response['hits']['hits']] + return document_names + except Exception as e: + logger.error("Error retrieving document list: %s", e) + return [] + + def get_statistics(self) -> Dict[str, Any]: + """ + Get index statistics. + + Returns: + Dictionary with statistics + """ + try: + stats = self.__client.count(index=KDBSession.INDEX_NAME) + return { + 'document_count': stats['count'] + } + except Exception as e: + logger.error("Error getting statistics: %s", e) + return {'document_count': 0} + + + def __generate_file_content_id(self, document: FileNetContent) -> str: + """ + Generate a unique ID for the file content document. + + Args: + document: An instance of FileNetContent. + + Returns: + A unique ID as a string. + """ + return self.__generate_file_content_id_by_name_and_type(document.name, document.type) + + def __generate_file_content_id_by_name_and_type(self, name: str, ftype: str) -> str: + """ + Generate a unique ID for the file content document based on name and type. + + Args: + name: The document name. + type: The document type. + + Returns: + A unique ID as a string. + """ + return f"{name}-{ftype}" \ No newline at end of file diff --git a/src/knowledge_graph/kg/graph_schema.py b/src/knowledge_graph/kg/graph_schema.py new file mode 100644 index 0000000..fecd604 --- /dev/null +++ b/src/knowledge_graph/kg/graph_schema.py @@ -0,0 +1,60 @@ +""" +Graph schema definitions for the Knowledge Graph (KG). +""" + +from typing import LiteralString + +# Constraints for the graph database +# Define constraints to ensure name uniqueness for various node types +CONSTRAINTS: list[LiteralString] = [ + # Define graph constraints here + "CREATE CONSTRAINT image_name_unique IF NOT EXISTS FOR (n:Image) REQUIRE n.name IS UNIQUE;", + "CREATE CONSTRAINT dll_name_unique IF NOT EXISTS FOR (n:Dll) REQUIRE n.name IS UNIQUE;", + "CREATE CONSTRAINT software_name_unique IF NOT EXISTS FOR (n:Software) REQUIRE n.name IS UNIQUE;", + "CREATE CONSTRAINT org_name_unique IF NOT EXISTS FOR (n:Org) REQUIRE n.name IS UNIQUE;", + "CREATE CONSTRAINT network_resource_name_unique IF NOT EXISTS FOR (n:NetworkResource) REQUIRE n.name IS UNIQUE;", + "CREATE CONSTRAINT class_name_unique IF NOT EXISTS FOR (n:Class) REQUIRE n.name IS UNIQUE;", + "CREATE CONSTRAINT path_name_unique IF NOT EXISTS FOR (n:Path) REQUIRE n.name IS UNIQUE;", +] + +ATTRIBUTE_PREDICATES: LiteralString = """\ +// Image node and Dll node attribute predicates +CALL db.create.setNodeProperties('Image', [ + 'digitally_signed', // boolean + 'code_sign_ca', // string + 'is_windows_system_file', // boolean + 'ui_visible', // boolean + 'technical_security_rating_percent', // float/number + 'is_essential_for_windows', // boolean + 'file_size_bytes', // integer + 'located_in_path' // string +]); +CALL db.create.setNodeProperties('Dll', [ + 'digitally_signed', // boolean + 'code_sign_ca', // string + 'is_windows_system_file', // boolean + 'ui_visible', // boolean + 'technical_security_rating_percent', // float/number + 'is_essential_for_windows', // boolean + 'file_size_bytes', // integer + 'located_in_path' // string +]);\ +""" + +RELATION_PREDICATES: list[LiteralString] = [ + # Relation predicates between nodes + # (Dll|Image) -[IS_PART_OF_SOFTWARE]-> Software + "CALL db.create.setRelationshipTypeProperties('IS_PART_OF_SOFTWARE');", + # (Dll|Image) -[DEVELOPED_BY]-> Org + "CALL db.create.setRelationshipTypeProperties('DEVELOPED_BY');", + # (Dll|Image) -[CAN_CONNECT_TO]-> NetworkResource + "CALL db.create.setRelationshipTypeProperties('CAN_CONNECT_TO');", + # (Dll|Image) -[CAN_MONITOR]-> Class + "CALL db.create.setRelationshipTypeProperties('CAN_MONITOR');", + # (Dll|Image) -[CAN_RECORD]-> Class + "CALL db.create.setRelationshipTypeProperties('CAN_RECORD');", + # (Dll|Image) -[MAY_CAMOUFLAGE_AS]-> Dll|Image + "CALL db.create.setRelationshipTypeProperties('MAY_CAMOUFLAGE_AS');", + # (Dll|Image) -[SUSPICIOUS_IF_LOCATED_IN]-> Path + "CALL db.create.setRelationshipTypeProperties('SUSPICIOUS_IF_LOCATED_IN');" +] \ No newline at end of file diff --git a/src/knowledge_graph/neo4j_manager.py b/src/knowledge_graph/neo4j_manager.py new file mode 100644 index 0000000..6e9396c --- /dev/null +++ b/src/knowledge_graph/neo4j_manager.py @@ -0,0 +1,249 @@ +""" +Neo4j knowledge graph management for process and DLL relationships. +""" + +from neo4j import GraphDatabase +from typing import List, Dict, Optional +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class ProcessKnowledgeGraph: + """Manages the process and DLL knowledge graph in Neo4j.""" + + def __init__(self, uri: str, user: str, password: str): + """ + Initialize Neo4j connection. + + Args: + uri: Neo4j connection URI + user: Neo4j username + password: Neo4j password + """ + self.driver = GraphDatabase.driver(uri, auth=(user, password)) + logger.info(f"Connected to Neo4j at {uri}") + + def close(self): + """Close the Neo4j connection.""" + self.driver.close() + + def create_constraints(self): + """Create database constraints for better performance.""" + with self.driver.session() as session: + # Create uniqueness constraints + constraints = [ + "CREATE CONSTRAINT process_name IF NOT EXISTS FOR (p:Process) REQUIRE p.name IS UNIQUE", + "CREATE CONSTRAINT dll_name IF NOT EXISTS FOR (d:DLL) REQUIRE d.name IS UNIQUE" + ] + + for constraint in constraints: + try: + session.run(constraint) + logger.info(f"Created constraint: {constraint}") + except Exception as e: + logger.debug(f"Constraint might already exist: {e}") + + def add_process(self, name: str, content: str, url: str, metadata: Optional[Dict] = None): + """ + Add a process node to the knowledge graph. + + Args: + name: Process name + content: Process description/content + url: Source URL + metadata: Additional metadata + """ + with self.driver.session() as session: + query = """ + MERGE (p:Process {name: $name}) + SET p.content = $content, + p.url = $url, + p.updated = datetime() + """ + + params = { + 'name': name, + 'content': content, + 'url': url + } + + if metadata: + for key, value in metadata.items(): + params[f'meta_{key}'] = value + query += f", p.{key} = $meta_{key}" + + session.run(query, params) + logger.debug(f"Added process: {name}") + + def add_dll(self, name: str, content: str, url: str, metadata: Optional[Dict] = None): + """ + Add a DLL node to the knowledge graph. + + Args: + name: DLL name + content: DLL description/content + url: Source URL + metadata: Additional metadata + """ + with self.driver.session() as session: + query = """ + MERGE (d:DLL {name: $name}) + SET d.content = $content, + d.url = $url, + d.updated = datetime() + """ + + params = { + 'name': name, + 'content': content, + 'url': url + } + + if metadata: + for key, value in metadata.items(): + params[f'meta_{key}'] = value + query += f", d.{key} = $meta_{key}" + + session.run(query, params) + logger.debug(f"Added DLL: {name}") + + def create_relationship(self, from_name: str, from_type: str, + to_name: str, to_type: str, + relationship: str = "USES"): + """ + Create a relationship between two nodes. + + Args: + from_name: Source node name + from_type: Source node type (Process or DLL) + to_name: Target node name + to_type: Target node type (Process or DLL) + relationship: Relationship type + """ + with self.driver.session() as session: + query = f""" + MATCH (a:{from_type} {{name: $from_name}}) + MATCH (b:{to_type} {{name: $to_name}}) + MERGE (a)-[r:{relationship}]->(b) + SET r.created = datetime() + """ + + session.run(query, { + 'from_name': from_name, + 'to_name': to_name + }) + logger.debug(f"Created relationship: {from_name} -{relationship}-> {to_name}") + + def batch_add_items(self, items: List[Dict[str, any]]): + """ + Batch add processes and DLLs to the graph. + + Args: + items: List of items containing name, type, content, and url + """ + logger.info(f"Adding {len(items)} items to knowledge graph...") + + for item in items: + item_type = item.get('type', 'process') + name = item.get('name', '') + content = item.get('full_text', '') + url = item.get('url', '') + + if item_type == 'process': + self.add_process(name, content, url) + elif item_type == 'dll': + self.add_dll(name, content, url) + + logger.info("Batch add completed") + + def get_process(self, name: str) -> Optional[Dict]: + """ + Get process information by name. + + Args: + name: Process name + + Returns: + Process information or None + """ + with self.driver.session() as session: + result = session.run( + "MATCH (p:Process {name: $name}) RETURN p", + name=name + ) + record = result.single() + return dict(record['p']) if record else None + + def get_dll(self, name: str) -> Optional[Dict]: + """ + Get DLL information by name. + + Args: + name: DLL name + + Returns: + DLL information or None + """ + with self.driver.session() as session: + result = session.run( + "MATCH (d:DLL {name: $name}) RETURN d", + name=name + ) + record = result.single() + return dict(record['d']) if record else None + + def search_by_keyword(self, keyword: str, limit: int = 10) -> List[Dict]: + """ + Search for processes and DLLs containing a keyword. + + Args: + keyword: Search keyword + limit: Maximum number of results + + Returns: + List of matching items + """ + with self.driver.session() as session: + query = """ + MATCH (n) + WHERE n:Process OR n:DLL + AND (toLower(n.name) CONTAINS toLower($keyword) + OR toLower(n.content) CONTAINS toLower($keyword)) + RETURN n, labels(n) as type + LIMIT $limit + """ + + results = session.run(query, keyword=keyword, limit=limit) + items = [] + for record in results: + item = dict(record['n']) + item['node_type'] = record['type'][0] + items.append(item) + + return items + + def get_statistics(self) -> Dict[str, int]: + """ + Get knowledge graph statistics. + + Returns: + Dictionary with count statistics + """ + with self.driver.session() as session: + stats = {} + + # Count processes + result = session.run("MATCH (p:Process) RETURN count(p) as count") + stats['processes'] = result.single()['count'] + + # Count DLLs + result = session.run("MATCH (d:DLL) RETURN count(d) as count") + stats['dlls'] = result.single()['count'] + + # Count relationships + result = session.run("MATCH ()-[r]->() RETURN count(r) as count") + stats['relationships'] = result.single()['count'] + + return stats diff --git a/src/knowledge_graph/opensearch_manager.py b/src/knowledge_graph/opensearch_manager.py new file mode 100644 index 0000000..33e5556 --- /dev/null +++ b/src/knowledge_graph/opensearch_manager.py @@ -0,0 +1,183 @@ +""" +OpenSearch integration for document indexing and search. +""" + +from opensearchpy import OpenSearch +from typing import List, Dict, Optional +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class OpenSearchManager: + """Manages document indexing and search in OpenSearch.""" + + def __init__(self, host: str, port: int, user: str, password: str, use_ssl: bool = False): + """ + Initialize OpenSearch connection. + + Args: + host: OpenSearch host + port: OpenSearch port + user: OpenSearch username + password: OpenSearch password + use_ssl: Whether to use SSL + """ + self.client = OpenSearch( + hosts=[{'host': host, 'port': port}], + http_auth=(user, password), + use_ssl=use_ssl, + verify_certs=False, + ssl_show_warn=False + ) + self.index_name = "process_knowledge" + logger.info(f"Connected to OpenSearch at {host}:{port}") + + def create_index(self): + """Create the index with appropriate mappings.""" + index_body = { + "mappings": { + "properties": { + "name": {"type": "keyword"}, + "type": {"type": "keyword"}, + "content": {"type": "text"}, + "url": {"type": "keyword"}, + "title": {"type": "text"}, + "paragraphs": {"type": "text"}, + "timestamp": {"type": "date"} + } + }, + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0 + } + } + + try: + if not self.client.indices.exists(index=self.index_name): + self.client.indices.create(index=self.index_name, body=index_body) + logger.info(f"Created index: {self.index_name}") + else: + logger.info(f"Index already exists: {self.index_name}") + except Exception as e: + logger.error(f"Error creating index: {e}") + + def delete_index(self): + """Delete the index.""" + try: + if self.client.indices.exists(index=self.index_name): + self.client.indices.delete(index=self.index_name) + logger.info(f"Deleted index: {self.index_name}") + except Exception as e: + logger.error(f"Error deleting index: {e}") + + def index_document(self, doc_id: str, document: Dict[str, any]): + """ + Index a single document. + + Args: + doc_id: Document ID + document: Document to index + """ + try: + self.client.index( + index=self.index_name, + id=doc_id, + body=document, + refresh=True + ) + logger.debug(f"Indexed document: {doc_id}") + except Exception as e: + logger.error(f"Error indexing document {doc_id}: {e}") + + def batch_index_documents(self, items: List[Dict[str, any]]): + """ + Batch index multiple documents. + + Args: + items: List of items to index + """ + logger.info(f"Indexing {len(items)} documents...") + + for item in items: + doc_id = f"{item.get('type', 'unknown')}_{item.get('name', 'unknown')}" + document = { + 'name': item.get('name', ''), + 'type': item.get('type', ''), + 'content': item.get('full_text', ''), + 'url': item.get('url', ''), + 'title': item.get('title', ''), + 'paragraphs': item.get('paragraphs', []) + } + self.index_document(doc_id, document) + + logger.info("Batch indexing completed") + + def search(self, query: str, size: int = 10) -> List[Dict]: + """ + Search for documents. + + Args: + query: Search query + size: Number of results to return + + Returns: + List of search results + """ + try: + search_body = { + "query": { + "multi_match": { + "query": query, + "fields": ["name^3", "title^2", "content", "paragraphs"] + } + }, + "size": size + } + + response = self.client.search(index=self.index_name, body=search_body) + + results = [] + for hit in response['hits']['hits']: + result = hit['_source'] + result['score'] = hit['_score'] + results.append(result) + + return results + except Exception as e: + logger.error(f"Error searching: {e}") + return [] + + def get_document(self, doc_id: str) -> Optional[Dict]: + """ + Get a document by ID. + + Args: + doc_id: Document ID + + Returns: + Document or None + """ + try: + response = self.client.get(index=self.index_name, id=doc_id) + return response['_source'] + except Exception as e: + logger.error(f"Error getting document {doc_id}: {e}") + return None + + def get_statistics(self) -> Dict[str, any]: + """ + Get index statistics. + + Returns: + Dictionary with statistics + """ + try: + stats = self.client.count(index=self.index_name) + return { + 'document_count': stats['count'] + } + except Exception as e: + logger.error(f"Error getting statistics: {e}") + return {'document_count': 0} diff --git a/src/rag/__init__.py b/src/rag/__init__.py new file mode 100644 index 0000000..5b838a4 --- /dev/null +++ b/src/rag/__init__.py @@ -0,0 +1,5 @@ +"""RAG module for Retrieval-Augmented Generation.""" + +from .process_rag import ProcessRAG, SimpleRAGPOC, ProcessKnowledgeVectorStore + +__all__ = ['ProcessRAG', 'SimpleRAGPOC', 'ProcessKnowledgeVectorStore'] diff --git a/src/rag/process_rag.py b/src/rag/process_rag.py new file mode 100644 index 0000000..720c78c --- /dev/null +++ b/src/rag/process_rag.py @@ -0,0 +1,257 @@ +""" +BYOKG (Bring Your Own Knowledge Graph) RAG implementation. +Retrieval-Augmented Generation using the Process Knowledge Graph. +""" + +from typing import List, Dict, Optional +from langchain_core.documents import Document +from langchain_openai import ChatOpenAI, OpenAIEmbeddings +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class ProcessKnowledgeVectorStore: + """Custom vector store for process knowledge.""" + + def __init__(self, opensearch_manager): + """ + Initialize the vector store. + + Args: + opensearch_manager: OpenSearchManager instance + """ + self.opensearch = opensearch_manager + self.embeddings = OpenAIEmbeddings() + + def search(self, query: str, k: int = 5) -> List[Document]: + """ + Search for relevant documents. + + Args: + query: Search query + k: Number of results + + Returns: + List of Document objects + """ + results = self.opensearch.search(query, size=k) + + documents = [] + for result in results: + doc = Document( + page_content=result.get('content', ''), + metadata={ + 'name': result.get('name', ''), + 'type': result.get('type', ''), + 'url': result.get('url', ''), + 'score': result.get('score', 0.0) + } + ) + documents.append(doc) + + return documents + + +class ProcessRAG: + """RAG system for process knowledge queries.""" + + def __init__(self, opensearch_manager, neo4j_manager=None, + model_name: str = "gpt-3.5-turbo", temperature: float = 0.0): + """ + Initialize the RAG system. + + Args: + opensearch_manager: OpenSearchManager instance + neo4j_manager: ProcessKnowledgeGraph instance (optional) + model_name: OpenAI model name + temperature: Model temperature + """ + self.opensearch = opensearch_manager + self.neo4j = neo4j_manager + self.llm = ChatOpenAI(model_name=model_name, temperature=temperature) + self.vector_store = ProcessKnowledgeVectorStore(opensearch_manager) + logger.info(f"Initialized RAG system with model: {model_name}") + + def retrieve_context(self, query: str, k: int = 5) -> List[Document]: + """ + Retrieve relevant context documents. + + Args: + query: User query + k: Number of documents to retrieve + + Returns: + List of relevant documents + """ + return self.vector_store.search(query, k=k) + + def generate_answer(self, query: str, context_docs: List[Document]) -> str: + """ + Generate an answer using retrieved context. + + Args: + query: User query + context_docs: Retrieved context documents + + Returns: + Generated answer + """ + # Prepare context + context = "\n\n".join([ + f"[{doc.metadata.get('name', 'Unknown')}] ({doc.metadata.get('type', 'unknown')})\n{doc.page_content}" + for doc in context_docs + ]) + + # Create prompt + prompt = f"""Based on the following context about Windows processes and DLLs, answer the question. + +Context: +{context} + +Question: {query} + +Answer: """ + + # Generate answer + response = self.llm.invoke(prompt) + return response.content + + def query(self, question: str, k: int = 5) -> Dict[str, any]: + """ + Query the RAG system. + + Args: + question: User question + k: Number of context documents to retrieve + + Returns: + Dictionary with answer and sources + """ + logger.info(f"Processing query: {question}") + + # Retrieve context + context_docs = self.retrieve_context(question, k=k) + + if not context_docs: + return { + 'answer': "I don't have enough information to answer this question.", + 'sources': [] + } + + # Generate answer + answer = self.generate_answer(question, context_docs) + + # Prepare sources + sources = [ + { + 'name': doc.metadata.get('name', ''), + 'type': doc.metadata.get('type', ''), + 'url': doc.metadata.get('url', ''), + 'score': doc.metadata.get('score', 0.0) + } + for doc in context_docs + ] + + return { + 'answer': answer, + 'sources': sources, + 'context_count': len(context_docs) + } + + def query_with_graph(self, question: str, k: int = 5) -> Dict[str, any]: + """ + Query using both vector search and graph context. + + Args: + question: User question + k: Number of context documents to retrieve + + Returns: + Dictionary with answer and sources + """ + if not self.neo4j: + logger.warning("Neo4j not available, falling back to vector search only") + return self.query(question, k=k) + + # First, get vector search results + result = self.query(question, k=k) + + # Enhance with graph relationships + # Extract process/DLL names from sources + names = [source['name'] for source in result['sources']] + + # Get graph context + graph_context = [] + for name in names[:3]: # Limit to top 3 for performance + # Search for related items in graph + related = self.neo4j.search_by_keyword(name, limit=3) + graph_context.extend(related) + + if graph_context: + result['graph_context'] = graph_context + + return result + + +class SimpleRAGPOC: + """Simple POC for BYOKG RAG.""" + + def __init__(self, rag_system: ProcessRAG): + """ + Initialize the POC. + + Args: + rag_system: ProcessRAG instance + """ + self.rag = rag_system + + def demo_query(self, question: str): + """ + Run a demo query and print results. + + Args: + question: Question to ask + """ + print(f"\n{'='*80}") + print(f"Question: {question}") + print(f"{'='*80}\n") + + result = self.rag.query(question) + + print("Answer:") + print(result['answer']) + print(f"\nSources ({result['context_count']} documents):") + for i, source in enumerate(result['sources'], 1): + print(f"\n{i}. {source['name']} ({source['type']})") + print(f" URL: {source['url']}") + print(f" Relevance Score: {source['score']:.4f}") + + print(f"\n{'='*80}\n") + + def interactive_demo(self): + """Run an interactive demo session.""" + print("\n" + "="*80) + print("Process Knowledge Graph - RAG System Demo") + print("="*80) + print("\nType 'quit' or 'exit' to end the session.\n") + + while True: + try: + question = input("Your question: ").strip() + + if question.lower() in ['quit', 'exit', 'q']: + print("\nGoodbye!") + break + + if not question: + continue + + self.demo_query(question) + + except KeyboardInterrupt: + print("\n\nGoodbye!") + break + except Exception as e: + print(f"\nError: {e}\n") diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py new file mode 100644 index 0000000..1c5482f --- /dev/null +++ b/src/scraper/__init__.py @@ -0,0 +1,11 @@ +"""Scraper module for web scraping functionality.""" + +from .file_net_scraper import FileNetScraper +from .model import ObjectType, ScrapedProcess, ScrapedContent + +__all__ = [ + 'FileNetScraper', + 'ObjectType', + 'ScrapedProcess', + 'ScrapedContent' +] diff --git a/src/scraper/file_net_scraper.py b/src/scraper/file_net_scraper.py new file mode 100644 index 0000000..9e07a15 --- /dev/null +++ b/src/scraper/file_net_scraper.py @@ -0,0 +1,198 @@ +""" +Web scraper for file.net to collect process and DLL information. +""" + +import requests +from bs4 import BeautifulSoup +from typing import List, Dict, Optional +import time +import logging +from tqdm import tqdm + +from .model import ObjectType, ScrapedProcess, ScrapedContent + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class FileNetScraper: + """Scraper for file.net to extract process and DLL information.""" + # constants + BASE_URL:str = "https://www.file.net" + LIST_TEMPLATE:str = f"{BASE_URL}/process/_{{letter}}.html" + FILE_TEMPLATE:str = f"{BASE_URL}/process/{{filename}}.html" + # attributes + __delay: float + __session: requests.Session + + def __init__(self, delay: float = 1.0): + """ + Initialize the scraper. + + Args: + delay: Delay between requests in seconds to be polite to the server + """ + self.__delay = delay + self.__session = requests.Session() + # file.net does not forbid scraping in its robots.txt, but we set a user-agent + self.__session.headers.update({ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + }) + + def get_page(self, url: str) -> Optional[BeautifulSoup]: + """ + Fetch and parse a webpage. + + Args: + url: URL to fetch + + Returns: + BeautifulSoup object or None if failed + """ + try: + time.sleep(self.__delay) + response = self.__session.get(url, timeout=10) + response.raise_for_status() + return BeautifulSoup(response.content, 'lxml') + except Exception as e: + logger.error(f"Error fetching {url}: {e}") + return None + + def get_process_list_from_letter(self, letter: str) -> List[ScrapedProcess]: + """ + Get list of processes starting with a specific letter. + + Args: + letter: Letter to search for (a-z or 0-9) + + Returns: + List of dictionaries containing process name and URL + """ + url = self.LIST_TEMPLATE.format(letter=letter) + soup = self.get_page(url) + + if not soup: + return [] + + processes = [] + # Find all links to process pages + # example link: + # get only links under 'Windows Processes' section + windows_processes = soup.find('ul', {'class': 'tabcol'}) + for link in windows_processes.find_all('a', href=True): + href = link.get('href', '') + if href.endswith('.html'): + process_name = href.replace('.html', '') + processes.append(ScrapedProcess( + name=process_name, + url=self.FILE_TEMPLATE.format(filename=process_name), + obj_type=self.__get_obj_type_from_name(process_name) + )) + + return processes + + def __get_obj_type_from_name(self, name: str) -> ObjectType: + """ + Determine object type from name. + + Args: + name: Name of the object + + Returns: + ObjectType enum value + """ + if name.lower().endswith('.dll'): + return ObjectType.DLL + else: + return ObjectType.IMAGE + + def get_all_processes(self) -> List[ScrapedProcess]: + """ + Get complete list of all processes from a-z and 0-9. + + Returns: + List of all processes with their URLs + """ + all_processes = [] + # letters a-z and 0 + # check https://www.file.net/process/_a.html for reference + letters = list('abcdefghijklmnopqrstuvwxyz') + list('0') + + logger.info("Collecting process list from file.net...") + for letter in tqdm(letters, desc="Fetching processes"): + if letter == '0': + # if letter is '0', the URL is __.html + letter = '_' + processes = self.get_process_list_from_letter(letter) + all_processes.extend(processes) + logger.debug(f"Found {len(processes)} processes for letter '{letter}'") + + logger.info(f"Total processes found: {len(all_processes)}") + return all_processes + + def get_page_content(self, url: str) -> Optional[ScrapedContent]: + """ + Extract content from a process or DLL page. + Get only the descriptive text and paragraphs. + + Args: + url: URL of the page to scrape + + Returns: + ScrapedContent object containing extracted information + """ + soup = self.get_page(url) + + if not soup: + return None + + + name = "" + paragraphs = [] + + # Extract all paragraph content + for p in soup.find_all('p'): + text = p.get_text(strip=True) + if text: + paragraphs.append(text) + + # Extract title + title = soup.find('h1') + if title: + name = title.get_text(strip=True) + + # Join all paragraphs into full text + return ScrapedContent( + name=name, + obj_type=ObjectType.NONE, + title=name, + url=url, + description=paragraphs[0] if paragraphs else None, + content='\n\n'.join(paragraphs) + ) + + def crawl_all_content(self, items: List[ScrapedProcess], + max_item_index: Optional[int] = None) -> List[ScrapedContent]: + """ + Crawl content from all items (processes or DLLs). + + Args: + items: List of items to crawl + max_items: Maximum number of items to crawl (for testing) + + Returns: + List of crawled content + """ + results = [] + items_to_crawl = items[:max_item_index] if max_item_index else items + + logger.info(f"Crawling content from {len(items_to_crawl)} pages...") + for item in tqdm(items_to_crawl, desc="Crawling content"): + content = self.get_page_content(item.url) + if content: + # update name and type + content.name = item.name + content.obj_type = item.obj_type + results.append(content) + + return results diff --git a/src/scraper/model.py b/src/scraper/model.py new file mode 100644 index 0000000..086ed51 --- /dev/null +++ b/src/scraper/model.py @@ -0,0 +1,29 @@ +""" +Scraper model module. +It defines the data models used for file.net scraping. +""" + +from enum import Enum +from typing import TypedDict, Optional +from pydantic import BaseModel + +class ObjectType(str, Enum): + """Enumeration for object types.""" + IMAGE = 'image' + DLL = 'dll' + NONE = 'none' + +class ScrapedProcess(BaseModel): + """Data model for a scraped process.""" + name: str + url: str + obj_type: ObjectType # image or dll + +class ScrapedContent(BaseModel): + """Data model for scraped content from a process or DLL page.""" + name: str + obj_type: ObjectType # image or dll + title: Optional[str] + url: str + description: Optional[str] + content: Optional[str] \ No newline at end of file diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..7fe1d16 --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,89 @@ +""" +Simple test to verify all components work correctly. +""" + +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + +from src.scraper import FileNetScraper +from src.knowledge_graph import ProcessKnowledgeGraph, OpenSearchManager +from src.rag import ProcessRAG, SimpleRAGPOC + + +def test_scraper_initialization(): + """Test scraper can be initialized.""" + scraper = FileNetScraper(delay=0.5) + assert scraper is not None + assert scraper.delay == 0.5 + print("✓ Scraper initialization test passed") + + +def test_mock_data_structure(): + """Test with mock data to verify data structure handling.""" + # Mock crawled data + mock_data = [ + { + 'name': 'ccleaner64.exe', + 'type': 'process', + 'url': 'https://www.file.net/process/ccleaner64.exe.html', + 'title': 'CCleaner64.exe', + 'paragraphs': [ + 'CCleaner is a system optimization and privacy tool.', + 'It removes unused files and cleans traces of your online activities.' + ], + 'full_text': 'CCleaner is a system optimization and privacy tool. It removes unused files and cleans traces of your online activities.' + }, + { + 'name': 'kernel32.dll', + 'type': 'dll', + 'url': 'https://www.file.net/dll/kernel32.dll.html', + 'title': 'Kernel32.dll', + 'paragraphs': [ + 'Kernel32.dll is a core Windows DLL file.', + 'It handles memory management, input/output operations, and process management.' + ], + 'full_text': 'Kernel32.dll is a core Windows DLL file. It handles memory management, input/output operations, and process management.' + } + ] + + assert len(mock_data) == 2 + assert mock_data[0]['type'] == 'process' + assert mock_data[1]['type'] == 'dll' + print("✓ Mock data structure test passed") + + return mock_data + + +def test_imports(): + """Test all module imports work.""" + from src.scraper import FileNetScraper + from src.knowledge_graph import ProcessKnowledgeGraph, OpenSearchManager + from src.rag import ProcessRAG, SimpleRAGPOC + + print("✓ All imports test passed") + + +def main(): + """Run all tests.""" + print("Running Process Knowledge Graph Tests") + print("=" * 80) + + print("\nTest 1: Module Imports") + test_imports() + + print("\nTest 2: Scraper Initialization") + test_scraper_initialization() + + print("\nTest 3: Mock Data Structure") + mock_data = test_mock_data_structure() + + print("\n" + "=" * 80) + print("All tests passed! ✓") + print("\nNote: Database connectivity tests require Neo4j and OpenSearch to be running.") + print("Run the example scripts to test full functionality with databases.") + + +if __name__ == "__main__": + main()