PDF-Text-Analyzer/search.py at main · cortega26/PDF-Text-Analyzer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import hashlib
import nltk
from collections import defaultdict
from typing import Dict, Any, List, Set
from typing import Dict, Any, List, Set
from utils import setup_logging

logger = setup_logging(__name__)

class PdfSearchEngine:
    """Search engine for processed PDF content."""

    def __init__(self):
        self.index = defaultdict(list)
        self.documents: Dict[str, Dict[str, Any]] = {}

    def add_document(self, url: str, analysis_results: Dict[str, Any], metadata: Dict[str, Any], full_text: str = "") -> None:
        """Add a document to the search index using analysis results."""
        doc_id = hashlib.md5(url.encode()).hexdigest()

        # Use full text if available, otherwise fallback to preview
        content = full_text if full_text else analysis_results.get('text_preview', '')

        self.documents[doc_id] = {
            'url': url,
            'metadata': metadata,
            'content': content,
            'keywords': analysis_results.get('keywords', []),
            'matching_keywords': analysis_results.get('matching_keywords', []),
            'search_term_count': analysis_results.get('search_term_count', 0),
            'language': analysis_results.get('language', 'unknown')
        }

        # Index words from content
        words = set(word.lower() for word in nltk.word_tokenize(content))
        for word in words:
            self.index[word].append(doc_id)

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Search for documents matching query."""
        query_words = set(word.lower() for word in nltk.word_tokenize(query))

        # Calculate document scores
        doc_scores = defaultdict(float)
        for word in query_words:
            matching_docs = self.index.get(word, [])
            word_score = 1.0 / (len(matching_docs) if matching_docs else 1.0)
            for doc_id in matching_docs:
                doc_scores[doc_id] += word_score

        # Sort documents by score
        sorted_docs = sorted(
            doc_scores.items(),
            key=lambda x: x[1],
            reverse=True
        )[:limit]

        # Format results
        results = []
        for doc_id, score in sorted_docs:
            doc = self.documents[doc_id]
            snippet = self._generate_snippet(doc['content'], query_words)

            results.append({
                'url': doc['url'],
                'metadata': doc['metadata'],
                'relevance_score': round(score, 3),
                'snippet': snippet,
                'language': doc['language'],
                'search_term_count': doc['search_term_count'],
                'matching_keywords': [
                    {'keyword': kw, 'score': score}
                    for kw, score in doc['matching_keywords']
                ]
            })

        return results

    def _generate_snippet(self, content: str, query_words: Set[str],
                         context_words: int = 10) -> str:
        """Generate a relevant text snippet containing query words."""
        words = content.split()
        best_snippet = ""
        max_matches = 0

        # Slide a window over the text to find the best matching context
        for i in range(len(words)):
            window = words[i:i + context_words * 2]
            if not window:
                break

            # Count query word matches in this window
            matches = sum(1 for word in window
                         if word.lower() in query_words)

            # Update best snippet if this window has more matches
            if matches > max_matches:
                max_matches = matches
                best_snippet = ' '.join(window)

        # Add ellipsis if we have a snippet
        return f"{best_snippet}..." if best_snippet else ""