diff --git a/examples/semantic_search/README.md b/examples/semantic_search/README.md new file mode 100644 index 0000000..33a8f64 --- /dev/null +++ b/examples/semantic_search/README.md @@ -0,0 +1,56 @@ +# Semantic Search Example with sqlite-vector + +This example in Python demonstrates how to build a semantic search engine using the [sqlite-vector](https://github.com/sqliteai/sqlite-vector) extension and a Sentence Transformer model. It allows you to index and search documents using vector similarity, powered by an embedding model that runs locally. + +### How it works + +- **Embeddings**: Uses [sentence-transformers](https://huggingface.co/sentence-transformers) to generate dense vector representations (embeddings) for text. The default model is [`all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2), a fast, lightweight model (384 dimensions) suitable for semantic search and retrieval tasks. +- **Vector Store and Search**: Embeddings are stored in SQLite using the [`sqlite-vector`](https://github.com/sqliteai/sqlite-vector) extension, enabling fast similarity search (cosine distance) directly in the database. +- **Sample Data**: The `samples/` directory contains example documents you can index and search immediately. + +### Installation + +1. Download the `sqlite-vector` extension for your platform [here](https://github.com/sqliteai/sqlite-vector/releases). + +2. Extract the `vector.so` file into this example's directory (`examples/semantic_search`), next to `semsearch.py`; the extension is loaded from the directory you run the script in. + +3. Install the dependencies: + + +```bash +$ python -m venv venv + +$ source venv/bin/activate + +$ pip install -r requirements.txt +``` + +4. On first use, the required model will be downloaded automatically. 
+ +### Usage + +Use the interactive mode to keep the model in memory and run multiple queries efficiently: + +```bash +python semsearch.py --repl + +# Index a directory of documents +semsearch> index ./samples + +# Search for similar documents +semsearch> search "neural network architectures for image recognition" +``` + +### Example Queries + +Try these queries to test semantic similarity: + +- "neural network architectures for image recognition" +- "reinforcement learning in autonomous systems" +- "explainable artificial intelligence methods" +- "AI governance and regulatory compliance" +- "network intrusion detection systems" + +**Note:** +- Supported file extensions are `.md`, `.mdx`, `.txt`, `.py`, `.js`, `.html`, `.css`, `.sql`, `.json`, `.xml`. +- For more details, see the code in `semsearch.py` and `semantic_search.py`. \ No newline at end of file diff --git a/examples/semantic_search/requirements.txt b/examples/semantic_search/requirements.txt new file mode 100644 index 0000000..e282c66 --- /dev/null +++ b/examples/semantic_search/requirements.txt @@ -0,0 +1 @@ +sentence-transformers diff --git a/examples/semantic_search/samples/sample-1.md b/examples/semantic_search/samples/sample-1.md new file mode 100644 index 0000000..ad9b442 --- /dev/null +++ b/examples/semantic_search/samples/sample-1.md @@ -0,0 +1,3 @@ +# Article 1: Deep Learning Neural Networks + +Deep learning utilizes artificial neural networks with multiple layers to process and learn from vast amounts of data. These networks automatically discover intricate patterns and representations without manual feature engineering. Convolutional neural networks excel at image recognition tasks, while recurrent neural networks handle sequential data like text and speech. Popular frameworks include TensorFlow, PyTorch, and Keras. Deep learning has revolutionized computer vision, natural language processing, and speech recognition applications. 
diff --git a/examples/semantic_search/samples/sample-10.md b/examples/semantic_search/samples/sample-10.md new file mode 100644 index 0000000..74dc614 --- /dev/null +++ b/examples/semantic_search/samples/sample-10.md @@ -0,0 +1,3 @@ +# Article 10: Zero Trust Security Architecture + +Zero trust security operates on the principle of "never trust, always verify," requiring authentication and authorization for every access request regardless of location. This approach assumes breach scenarios and implements continuous verification throughout the network. Key components include identity verification, device compliance checking, least privilege access, and micro-segmentation. Zero trust frameworks help organizations protect against insider threats and advanced persistent attacks. diff --git a/examples/semantic_search/samples/sample-11.md b/examples/semantic_search/samples/sample-11.md new file mode 100644 index 0000000..e0db3ae --- /dev/null +++ b/examples/semantic_search/samples/sample-11.md @@ -0,0 +1,3 @@ +# Article 11: Incident Response and Recovery + +Effective incident response requires predefined procedures for detecting, containing, and recovering from security breaches. Response teams follow structured phases: preparation, identification, containment, eradication, recovery, and lessons learned. Critical activities include forensic analysis, stakeholder communication, system restoration, and process improvement. Regular tabletop exercises and response plan updates ensure organizations can quickly minimize damage and restore normal operations after security incidents. 
diff --git a/examples/semantic_search/samples/sample-12.md b/examples/semantic_search/samples/sample-12.md new file mode 100644 index 0000000..4f7133e --- /dev/null +++ b/examples/semantic_search/samples/sample-12.md @@ -0,0 +1,3 @@ +# Article 12: Machine Learning for Malware Detection + +Machine learning enhances malware detection by analyzing file characteristics, behavioral patterns, and network communications to identify threats. Static analysis examines file properties without execution, while dynamic analysis observes runtime behavior in controlled environments. Ensemble methods combining multiple algorithms improve detection accuracy and reduce false positives. AI-powered systems can identify zero-day threats and polymorphic malware that traditional signature-based solutions miss. diff --git a/examples/semantic_search/samples/sample-13.md b/examples/semantic_search/samples/sample-13.md new file mode 100644 index 0000000..5f8776c --- /dev/null +++ b/examples/semantic_search/samples/sample-13.md @@ -0,0 +1,3 @@ +# Article 13: Behavioral Analytics for Anomaly Detection + +Behavioral analytics leverages machine learning to establish baseline patterns of normal user and system behavior, flagging deviations that may indicate security threats. User and entity behavior analytics (UEBA) systems monitor login patterns, data access, and application usage to detect insider threats and compromised accounts. Machine learning models adapt to changing behavior patterns while maintaining sensitivity to subtle anomalies that human analysts might overlook. diff --git a/examples/semantic_search/samples/sample-14.md b/examples/semantic_search/samples/sample-14.md new file mode 100644 index 0000000..c3530b3 --- /dev/null +++ b/examples/semantic_search/samples/sample-14.md @@ -0,0 +1,3 @@ +# Article 14: AI-Driven Security Orchestration + +Security orchestration platforms integrate multiple security tools and automate incident response workflows using artificial intelligence. 
These systems correlate alerts from various sources, prioritize threats based on risk assessment, and execute automated remediation actions. Natural language processing helps analyze threat intelligence reports, while machine learning improves decision-making accuracy over time. Orchestration reduces response times and analyst workload while maintaining consistent security procedures. diff --git a/examples/semantic_search/samples/sample-15.md b/examples/semantic_search/samples/sample-15.md new file mode 100644 index 0000000..25a7f1b --- /dev/null +++ b/examples/semantic_search/samples/sample-15.md @@ -0,0 +1,3 @@ +# Article 15: Advanced Persistent Threats (APTs) + +Advanced persistent threats represent sophisticated, long-term cyberattacks typically conducted by nation-states or organized criminal groups. APTs use multiple attack vectors, maintain persistent access, and employ stealth techniques to avoid detection. Common tactics include spear-phishing, zero-day exploits, living-off-the-land techniques, and lateral movement within networks. Defense requires continuous monitoring, threat hunting, and intelligence-driven security strategies to detect and neutralize these patient adversaries. diff --git a/examples/semantic_search/samples/sample-16.md b/examples/semantic_search/samples/sample-16.md new file mode 100644 index 0000000..dd549e3 --- /dev/null +++ b/examples/semantic_search/samples/sample-16.md @@ -0,0 +1,3 @@ +# Article 16: Social Engineering Attack Vectors + +Social engineering exploits human psychology rather than technical vulnerabilities to gain unauthorized access to systems and information. Common techniques include phishing emails, pretexting phone calls, baiting with infected media, and physical tailgating. Attackers research targets through social media and public information to craft convincing scenarios. 
Defense requires security awareness training, verification procedures, and creating organizational cultures that encourage reporting suspicious communications. diff --git a/examples/semantic_search/samples/sample-17.md b/examples/semantic_search/samples/sample-17.md new file mode 100644 index 0000000..e9deae8 --- /dev/null +++ b/examples/semantic_search/samples/sample-17.md @@ -0,0 +1,3 @@ +# Article 17: Supply Chain Security Risks + +Supply chain attacks target third-party vendors and software dependencies to compromise multiple organizations simultaneously. Attackers may insert malicious code into legitimate software updates, compromise hardware during manufacturing, or exploit trusted vendor relationships. Notable incidents include SolarWinds and Kaseya attacks affecting thousands of organizations. Mitigation strategies include vendor risk assessment, software composition analysis, and zero-trust principles for third-party integrations. diff --git a/examples/semantic_search/samples/sample-18.md b/examples/semantic_search/samples/sample-18.md new file mode 100644 index 0000000..e92bf85 --- /dev/null +++ b/examples/semantic_search/samples/sample-18.md @@ -0,0 +1,3 @@ +# Article 18: Quantum Computing and Cryptography + +Quantum computing poses both opportunities and threats for cybersecurity. Quantum computers could break current cryptographic algorithms like RSA and ECC that secure internet communications and data protection. Organizations must prepare for post-quantum cryptography by implementing quantum-resistant algorithms. However, quantum technologies also enable quantum key distribution for theoretically unbreakable communication channels. The transition period requires careful planning and gradual migration strategies. 
diff --git a/examples/semantic_search/samples/sample-19.md b/examples/semantic_search/samples/sample-19.md new file mode 100644 index 0000000..72cb356 --- /dev/null +++ b/examples/semantic_search/samples/sample-19.md @@ -0,0 +1,3 @@ +# Article 19: Edge Computing Security Challenges + +Edge computing brings data processing closer to end users and devices, improving performance but creating new security challenges. Distributed edge nodes have limited security controls compared to centralized data centers. Attack surfaces expand across numerous endpoints with varying security capabilities. Key concerns include device authentication, data encryption, secure updates, and centralized security management. Zero-trust architectures and hardware-based security become essential for edge deployments. diff --git a/examples/semantic_search/samples/sample-2.md b/examples/semantic_search/samples/sample-2.md new file mode 100644 index 0000000..a32f61c --- /dev/null +++ b/examples/semantic_search/samples/sample-2.md @@ -0,0 +1,3 @@ +# Article 2: Natural Language Processing Fundamentals + +Natural language processing enables computers to understand, interpret, and generate human language. Key techniques include tokenization, part-of-speech tagging, named entity recognition, and sentiment analysis. Modern NLP leverages transformer architectures like BERT and GPT models for tasks such as language translation, text summarization, and question answering. Applications span chatbots, voice assistants, content moderation, and automated document analysis across various industries. diff --git a/examples/semantic_search/samples/sample-20.md b/examples/semantic_search/samples/sample-20.md new file mode 100644 index 0000000..cc7169c --- /dev/null +++ b/examples/semantic_search/samples/sample-20.md @@ -0,0 +1,3 @@ +# Article 20: IoT Security Vulnerabilities + +Internet of Things devices often have weak security controls due to cost constraints and rapid deployment cycles. 
Common vulnerabilities include default passwords, unencrypted communications, lack of update mechanisms, and insufficient access controls. IoT botnets can launch massive distributed denial-of-service attacks. Security strategies include network segmentation, device lifecycle management, security-by-design principles, and regulatory compliance requirements for IoT manufacturers and deployments. diff --git a/examples/semantic_search/samples/sample-3.md b/examples/semantic_search/samples/sample-3.md new file mode 100644 index 0000000..0d87337 --- /dev/null +++ b/examples/semantic_search/samples/sample-3.md @@ -0,0 +1,3 @@ +# Article 3: Computer Vision Applications + +Computer vision empowers machines to interpret and analyze visual information from images and videos. Core techniques include object detection, image classification, facial recognition, and motion tracking. Convolutional neural networks form the backbone of modern computer vision systems. Applications include autonomous vehicles, medical imaging diagnosis, quality control in manufacturing, augmented reality, and surveillance systems. Edge computing enables real-time computer vision processing on mobile devices. diff --git a/examples/semantic_search/samples/sample-4.md b/examples/semantic_search/samples/sample-4.md new file mode 100644 index 0000000..6592d85 --- /dev/null +++ b/examples/semantic_search/samples/sample-4.md @@ -0,0 +1,3 @@ +# Article 4: Reinforcement Learning Algorithms + +Reinforcement learning trains agents to make optimal decisions through trial and error interactions with environments. Agents receive rewards or penalties based on their actions, gradually learning policies that maximize cumulative rewards. Q-learning and policy gradient methods are fundamental approaches. Applications include game playing (AlphaGo), robotics control, autonomous driving, recommendation systems, and financial trading algorithms. The exploration-exploitation trade-off remains a central challenge. 
diff --git a/examples/semantic_search/samples/sample-5.md b/examples/semantic_search/samples/sample-5.md new file mode 100644 index 0000000..1636431 --- /dev/null +++ b/examples/semantic_search/samples/sample-5.md @@ -0,0 +1,3 @@ +# Article 5: Supervised vs Unsupervised Learning + +Supervised learning uses labeled training data to predict outcomes for new inputs, including classification and regression tasks. Common algorithms include decision trees, support vector machines, and random forests. Unsupervised learning discovers hidden patterns in unlabeled data through clustering, dimensionality reduction, and association rules. Semi-supervised learning combines both approaches when labeled data is scarce. Each paradigm serves different problem types and data availability scenarios. diff --git a/examples/semantic_search/samples/sample-6.md b/examples/semantic_search/samples/sample-6.md new file mode 100644 index 0000000..102564c --- /dev/null +++ b/examples/semantic_search/samples/sample-6.md @@ -0,0 +1,3 @@ +# Article 6: AI Ethics and Bias Mitigation + +Artificial intelligence systems can perpetuate or amplify human biases present in training data, leading to unfair outcomes across different demographic groups. Bias mitigation strategies include diverse dataset collection, algorithmic fairness constraints, and regular bias auditing. Ethical AI development requires transparency, accountability, and stakeholder involvement. Organizations must establish governance frameworks addressing privacy, consent, and algorithmic decision-making impacts on individuals and society. diff --git a/examples/semantic_search/samples/sample-7.md b/examples/semantic_search/samples/sample-7.md new file mode 100644 index 0000000..4cf0892 --- /dev/null +++ b/examples/semantic_search/samples/sample-7.md @@ -0,0 +1,3 @@ +# Article 7: Explainable AI and Interpretability + +Explainable AI focuses on making machine learning models more transparent and interpretable to human users. 
Black-box models like deep neural networks often lack interpretability, creating trust and accountability issues. Techniques include feature importance analysis, LIME (Local Interpretable Model-agnostic Explanations), and SHAP (SHapley Additive exPlanations). Interpretability is crucial for high-stakes applications like healthcare, finance, and criminal justice where decisions require justification. diff --git a/examples/semantic_search/samples/sample-8.md b/examples/semantic_search/samples/sample-8.md new file mode 100644 index 0000000..8117d30 --- /dev/null +++ b/examples/semantic_search/samples/sample-8.md @@ -0,0 +1,3 @@ +# Article 8: AI Regulation and Compliance + +Governments worldwide are developing regulatory frameworks for artificial intelligence deployment and development. The European Union's AI Act categorizes AI systems by risk levels, imposing strict requirements for high-risk applications. Compliance involves documentation, risk assessment, human oversight, and algorithmic auditing. Organizations must navigate evolving regulations while maintaining innovation capabilities. Privacy laws like GDPR also impact AI data processing and automated decision-making systems. diff --git a/examples/semantic_search/samples/sample-9.md b/examples/semantic_search/samples/sample-9.md new file mode 100644 index 0000000..356454f --- /dev/null +++ b/examples/semantic_search/samples/sample-9.md @@ -0,0 +1,3 @@ +# Article 9: Threat Detection and Prevention + +Cybersecurity threat detection employs various technologies to identify malicious activities before they cause damage. Intrusion detection systems monitor network traffic for suspicious patterns, while endpoint protection software guards individual devices. Behavioral analysis identifies anomalies in user activities that may indicate compromised accounts. Security information and event management (SIEM) platforms aggregate and analyze security logs from multiple sources to provide comprehensive threat visibility. 
# ---------------------------------------------------------------------------
# File: examples/semantic_search/semantic_search.py
# ---------------------------------------------------------------------------
"""Small semantic-search engine: sentence-transformers embeddings stored in
SQLite and queried through the sqlite-vector extension."""

import json
import os
import sqlite3
import sys
import time
from pathlib import Path
from typing import List, Tuple


class SemanticSearch:
    """Index text files as embedded chunks in SQLite and search them by vector distance.

    The embedding model and the database connection are both created lazily,
    on first use, and cached on the instance.

    Args:
        db_path: SQLite database file that holds the ``documents`` table.
        model_name: Name passed to ``SentenceTransformer``.
        extension_path: Path of the sqlite-vector loadable extension. The
            default omits the file suffix so that SQLite can append the
            platform's own (``.so`` / ``.dylib`` / ``.dll``) when loading.
    """

    # Lower-case file suffixes that index_directory() picks up.
    TEXT_EXTENSIONS = frozenset({
        '.txt', '.md', '.mdx', '.py', '.js',
        '.html', '.css', '.sql', '.json', '.xml',
    })

    def __init__(self, db_path: str = "semsearch.db",
                 model_name: str = "all-MiniLM-L6-v2",
                 extension_path: str = "./vector"):
        self.db_path = db_path
        self.model_name = model_name
        self.extension_path = extension_path
        self.model = None
        self.conn = None

    def _get_model(self):
        """Return the sentence-transformer model, loading it on first call."""
        if self.model is None:
            # Imported here so that commands which never embed text (for
            # example `stats`) do not have to import the heavy dependency.
            from sentence_transformers import SentenceTransformer

            print(f"Loading model {self.model_name}...")
            self.model = SentenceTransformer(self.model_name)
        return self.model

    def _get_connection(self):
        """Return the cached connection, opening and preparing it on first call.

        Opening means: connect, load the sqlite-vector extension, verify it
        with ``vector_version()`` and create/initialise the schema.

        Exits the process with status 1 (after printing a hint) when the
        extension cannot be loaded.
        """
        if self.conn is not None:
            return self.conn

        conn = sqlite3.connect(self.db_path)
        try:
            # AttributeError covers Python builds whose sqlite3 module was
            # compiled without loadable-extension support.
            conn.enable_load_extension(True)
            try:
                conn.load_extension(self.extension_path)
            finally:
                # Never leave extension loading enabled longer than needed.
                conn.enable_load_extension(False)
            conn.execute("SELECT vector_version()")
        except (sqlite3.Error, AttributeError) as exc:
            conn.close()
            print("Error: sqlite-vector extension not found.")
            print(
                "Download it from https://github.com/sqliteai/sqlite-vector/releases")
            print(f"Details: {exc}")
            sys.exit(1)

        # Publish the connection before creating the schema:
        # _create_schema() obtains it through _get_connection().
        self.conn = conn
        self._create_schema()
        return self.conn

    def _create_schema(self):
        """Create the documents table (if missing) and register its vector column."""
        conn = self._get_connection()
        cursor = conn.cursor()

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filepath TEXT NOT NULL,
                content TEXT NOT NULL,
                embedding BLOB,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        # The default model 'all-MiniLM-L6-v2' produces 384-dimensional
        # embeddings; a model with another output size needs a matching
        # dimension here.
        # NOTE(review): no `distance=` option is passed, so the extension's
        # default metric applies - confirm it is the metric the README
        # advertises (cosine distance).
        cursor.execute("""
            SELECT vector_init('documents', 'embedding', 'type=FLOAT32,dimension=384');
        """)

        conn.commit()

    def _quantize(self):
        """Run sqlite-vector quantization over the documents.embedding column."""
        conn = self._get_connection()
        conn.execute(
            "SELECT vector_quantize('documents', 'embedding');").fetchone()

    def _chunk_text(self, text: str, chunk_size: int = 250, overlap: int = 50) -> List[str]:
        """Split text into overlapping word windows.

        Args:
            text: Text to split on whitespace.
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Words shared between consecutive chunks; must satisfy
                ``0 <= overlap < chunk_size``.

        Returns:
            The chunks in document order. Text without any words is returned
            unchanged as a single-element list.

        Raises:
            ValueError: If ``chunk_size`` or ``overlap`` is out of range.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        words = text.split()
        if not words:
            return [text]

        chunks = []
        for start in range(0, len(words), chunk_size - overlap):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # Once a window reaches the last word, any later window would be
            # a strict suffix of this one and only add a duplicate row.
            if start + chunk_size >= len(words):
                break
        return chunks

    def index_file(self, filepath: str, *, quantize: bool = True) -> int:
        """Index a single file.

        Args:
            filepath: File to read (decoded as UTF-8, undecodable bytes dropped).
            quantize: Re-run vector quantization after inserting. Callers
                that index many files can pass ``False`` and quantize once.

        Returns:
            Number of chunks stored; 0 when the file is missing, unreadable,
            empty or already indexed.
        """
        if not os.path.exists(filepath):
            print(f"File not found: {filepath}")
            return 0

        conn = self._get_connection()

        known = conn.execute(
            "SELECT 1 FROM documents WHERE filepath = ? LIMIT 1", (filepath,)
        ).fetchone()
        if known is not None:
            print(f"File already indexed: {filepath}")
            return 0

        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read().strip()
        except OSError as e:
            print(f"Error reading {filepath}: {e}")
            return 0

        if not content:
            print(f"Empty file: {filepath}")
            return 0

        # Split content into chunks.
        # The default model truncates text after 256 word pieces
        chunks = self._chunk_text(content)

        # The model is only needed (and loaded) once there is text to embed;
        # all chunks of the file are encoded in one batch.
        embeddings = self._get_model().encode(chunks)
        rows = [
            (filepath, chunk, json.dumps(vector.tolist()))
            for chunk, vector in zip(chunks, embeddings)
        ]

        # One transaction per file: committed on success, rolled back on any
        # error, so a failure cannot leave a half-indexed file behind.
        with conn:
            conn.executemany(
                "INSERT INTO documents (filepath, content, embedding) VALUES (?, ?, vector_convert_f32(?))",
                rows,
            )

        if quantize:
            self._quantize()

        print(f"Indexed {filepath}: {len(rows)} chunks")
        return len(rows)

    def index_directory(self, directory: str) -> int:
        """Recursively index every file under *directory* whose suffix is in
        ``TEXT_EXTENSIONS`` and return the total number of chunks stored."""
        total_chunks = 0
        try:
            for root, dirs, files in os.walk(directory):
                dirs.sort()  # visit sub-directories in a stable order
                for name in sorted(files):
                    if Path(name).suffix.lower() in self.TEXT_EXTENSIONS:
                        total_chunks += self.index_file(
                            os.path.join(root, name), quantize=False)
        finally:
            # Quantize once for the whole run instead of once per file, and
            # also when the walk is interrupted after some files were stored.
            if total_chunks:
                self._quantize()

        return total_chunks

    def search(self, query: str, limit: int = 3) -> Tuple[float, List[Tuple[str, str, float]]]:
        """Return the chunks nearest to *query*.

        Args:
            query: Free-text query to embed.
            limit: Maximum number of chunks to return; must be positive.

        Returns:
            ``(elapsed_ms, results)`` where ``results`` is a list of
            ``(filepath, content, distance)`` tuples ordered by ascending
            distance and ``elapsed_ms`` is the time spent in the database
            query. An empty index yields ``(0.0, [])``.

        Raises:
            ValueError: If ``limit`` is less than 1.
        """
        if limit < 1:
            raise ValueError("limit must be a positive integer")

        conn = self._get_connection()

        # Nothing indexed yet: skip loading the model and scanning.
        if conn.execute("SELECT 1 FROM documents LIMIT 1").fetchone() is None:
            return (0.0, [])

        query_embedding = self._get_model().encode(query)
        query_json = json.dumps(query_embedding.tolist())

        # Nearest-neighbour scan through sqlite-vector; the explicit ORDER BY
        # keeps the ranking independent of how the join is executed.
        started = time.perf_counter()
        rows = conn.execute("""
            SELECT d.filepath, d.content, v.distance
            FROM documents AS d
            JOIN vector_quantize_scan('documents', 'embedding', vector_convert_f32(?), ?) AS v
            ON d.id = v.rowid
            ORDER BY v.distance;
        """, (query_json, limit)).fetchall()
        elapsed_ms = round((time.perf_counter() - started) * 1000, 2)

        return (elapsed_ms, list(rows))

    def stats(self):
        """Print the database path and the number of indexed files and chunks."""
        conn = self._get_connection()

        doc_count = conn.execute(
            "SELECT COUNT(*) FROM documents").fetchone()[0]
        file_count = conn.execute(
            "SELECT COUNT(DISTINCT filepath) FROM documents").fetchone()[0]

        print(f"Database: {self.db_path}")
        print(f"Files indexed: {file_count}")
        print(f"Document chunks: {doc_count}")

    def close(self):
        """Close the database connection if one is open; safe to call twice."""
        if self.conn:
            self.conn.close()
            self.conn = None
            print("Database connection closed.")


# ---------------------------------------------------------------------------
# File: examples/semantic_search/semsearch.py  (executable, mode 100755)
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Semantic Search CLI Tool using SQLite + sqlite-vector + sentence-transformers
Usage:
    semsearch "query text"              # Search for similar documents
    semsearch -i /path/to/documents     # Index documents from directory
    semsearch -i /path/to/file.txt      # Index single file
    semsearch --repl                    # Interactive mode
"""

import argparse
import os
import sys

# Number of characters of each chunk shown in result listings.
PREVIEW_CHARS = 200


def _positive_int(value: str) -> int:
    """argparse type: accept only integers >= 1."""
    number = int(value)
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {value}")
    return number


def _strip_quotes(text: str) -> str:
    """Remove one pair of matching quotes around *text*, so that the REPL
    accepts `search "some words"` exactly as the README shows it."""
    text = text.strip()
    if len(text) >= 2 and text[0] == text[-1] and text[0] in "\"'":
        return text[1:-1].strip()
    return text


def _preview(content: str, width: int = PREVIEW_CHARS) -> str:
    """Return *content* cut to *width* characters, with '...' when truncated."""
    return content[:width] + "..." if len(content) > width else content


def _print_results(query, elapsed_ms, results):
    """Print the ranked result list returned by SemanticSearch.search()."""
    if not results:
        print("No results found.")
        return

    print(f"Results for: '{query}' in {elapsed_ms}ms\n")
    for rank, (filepath, content, distance) in enumerate(results, 1):
        # The value is the scan's distance column: smaller means closer.
        print(f"{rank}. {filepath} (distance: {distance:.3f})")
        print(f" {_preview(content)}\n")


def _index_path(searcher, path):
    """Index *path*, which may be a single file or a directory."""
    if os.path.isdir(path):
        total = searcher.index_directory(path)
        print(f"Total chunks indexed: {total}")
    else:
        searcher.index_file(path)


def _run_command(searcher, cmd, limit):
    """Execute one REPL command line (anything except exit/quit)."""
    verb, _, argument = cmd.partition(" ")
    argument = _strip_quotes(argument)

    if cmd == "help":
        print("Commands: search <query>, index <path>, stats, exit")
    elif cmd == "stats":
        searcher.stats()
    elif verb == "search" and argument:
        elapsed_ms, results = searcher.search(argument, limit)
        _print_results(argument, elapsed_ms, results)
    elif verb == "index" and argument:
        _index_path(searcher, argument)
    else:
        print("Unknown command. Type 'help' for available commands.")


def _run_repl(searcher, limit):
    """Read and execute commands until exit/quit, Ctrl-C or end of input."""
    print("Entering interactive mode (keep the model in memory).\nType 'help' for commands, 'exit' to quit.")
    while True:
        try:
            cmd = input("semsearch> ").strip()
            if not cmd:
                continue
            if cmd in {"exit", "quit"}:
                break
            _run_command(searcher, cmd, limit)
        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop: once stdin is exhausted (Ctrl-D or
            # piped input) every further input() call raises it again.
            print("\nExiting REPL.")
            break
        except Exception as e:
            # Keep the session alive after a failed command.
            print(f"Error: {e}")


def main():
    """Parse the command line and run the requested action(s)."""
    parser = argparse.ArgumentParser(
        description="Semantic search using SQLite + sqlite-vector",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  semsearch "machine learning algorithms"
  semsearch -i /path/to/documents
  semsearch -i document.txt
  semsearch --stats
        """
    )

    parser.add_argument("query", nargs="?", help="Search query")
    parser.add_argument("-i", "--index", metavar="PATH",
                        help="Index file or directory")
    parser.add_argument("--limit", type=_positive_int, default=5,
                        help="Number of results to return (default: 5)")
    parser.add_argument("--db", default="semsearch.db",
                        help="Database file path (default: semsearch.db)")
    parser.add_argument("--model", default="all-MiniLM-L6-v2",
                        help="Sentence transformer model (default: all-MiniLM-L6-v2)")
    parser.add_argument("--stats", action="store_true",
                        help="Show database statistics")
    parser.add_argument("--repl", action="store_true",
                        help="Run in interactive (keep model in memory)")

    args = parser.parse_args()

    if not any([args.query, args.index, args.stats, args.repl]):
        parser.print_help()
        return

    # Imported after argument handling so that --help and usage errors are
    # answered without loading the search module.
    from semantic_search import SemanticSearch

    searcher = SemanticSearch(args.db, args.model)

    try:
        if args.stats:
            searcher.stats()
        elif args.index:
            _index_path(searcher, args.index)
        elif args.query:
            elapsed_ms, results = searcher.search(args.query, args.limit)
            _print_results(args.query, elapsed_ms, results)

        if args.repl:
            _run_repl(searcher, args.limit)

    except KeyboardInterrupt:
        print("\nOperation cancelled.")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)
    finally:
        # Single cleanup point for every exit path, including sys.exit().
        searcher.close()


if __name__ == "__main__":
    main()