PDF-Text-Analyzer/text_analysis.py at main · cortega26/PDF-Text-Analyzer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List, Tuple
from utils import setup_logging

# Module-level logger, obtained from the project's setup_logging helper
# and keyed by this module's name.
logger = setup_logging(__name__)

class ContentAnalyzer:
    """Analyzes text content using various NLP techniques.

    Provides TF-IDF keyword extraction and a Flesch Reading Ease score.
    The language code given at construction selects the NLTK stop-word list
    and the Punkt sentence tokenizer model.
    """

    def __init__(self, language: str):
        """Set up stop words and the TF-IDF vectorizer for *language*.

        Args:
            language: Language code such as ``"en"`` or ``"fr"``. Codes that
                are not in ``lang_mapping`` are treated as English.
        """
        self.language = language

        # Map detected language code to NLTK language name
        self.lang_mapping = {
            "en": "english", "fr": "french", "de": "german", "es": "spanish",
            "it": "italian", "pt": "portuguese", "nl": "dutch", "sv": "swedish",
            "no": "norwegian", "fi": "finnish", "ru": "russian"
        }
        # Kept on the instance so sentence tokenization uses the same language.
        self.nltk_lang = self.lang_mapping.get(language, "english")

        try:
            self.stop_words = list(nltk.corpus.stopwords.words(self.nltk_lang))
        except LookupError:
            # The NLTK stop-word corpus is not installed; fall back to the
            # English list that ships with scikit-learn.
            logger.warning(
                f"NLTK stopwords for '{self.nltk_lang}' not found; "
                "falling back to scikit-learn's built-in English list"
            )
            self.stop_words = 'english'

        self.vectorizer = TfidfVectorizer(
            stop_words=self.stop_words,
            max_features=1000,
            ngram_range=(1, 2)
        )

    def extract_keywords(self, text: str, top_n: int = 10) -> List[Tuple[str, float]]:
        """Extract important keywords using TF-IDF.

        The vectorizer is re-fitted on *text* alone on every call.

        Args:
            text: Document to analyze.
            top_n: Maximum number of keywords to return.

        Returns:
            Up to ``top_n`` ``(term, score)`` pairs sorted by descending
            score. An empty list is returned for blank text, a non-positive
            ``top_n``, or when extraction fails (the failure is logged).
        """
        # Nothing to extract: skip the vectorizer so blank input does not
        # surface as a logged "empty vocabulary" error, and so a negative
        # top_n does not slice from the end of the ranking.
        if top_n <= 0 or not text or not text.strip():
            return []

        try:
            tfidf_matrix = self.vectorizer.fit_transform([text])
            feature_names = self.vectorizer.get_feature_names_out()
            ranked = sorted(
                zip(feature_names, tfidf_matrix.toarray()[0]),
                key=lambda pair: pair[1],
                reverse=True,
            )
            # Convert numpy scalars to the plain types the signature promises.
            return [(str(term), float(score)) for term, score in ranked[:top_n]]
        except Exception as e:
            logger.error(f"Keyword extraction failed: {e}")
            return []

    def calculate_readability_score(self, text: str) -> float:
        """Calculate text readability using Flesch Reading Ease.

        Words are whitespace-separated tokens; sentences come from NLTK's
        Punkt tokenizer for the analyzer's language. The formula constants
        and the syllable heuristic are the English ones.

        Args:
            text: Text to score.

        Returns:
            The score clamped to ``[0.0, 100.0]`` and rounded to two
            decimals, or ``0.0`` for empty text or when the calculation
            fails (the failure is logged).
        """
        try:
            words = text.split()
            if not words:
                return 0.0

            sentences = nltk.sent_tokenize(text, language=self.nltk_lang)
            if not sentences:
                return 0.0

            word_count = len(words)
            sentence_count = len(sentences)
            syllable_count = sum(self._count_syllables(word) for word in words)

            score = (
                206.835
                - 1.015 * (word_count / sentence_count)
                - 84.6 * (syllable_count / word_count)
            )

            return round(max(0.0, min(100.0, score)), 2)
        except Exception as e:
            logger.error(f"Readability calculation failed: {e}")
            return 0.0

    @staticmethod
    def _count_syllables(word: str) -> int:
        """Count syllables in a word.

        Heuristic: each run of consecutive vowels (``aeiouy``) is one
        syllable, minus one for a silent ``-e``/``-es``/``-ed`` ending.
        Punctuation and digits attached to the token are ignored.

        Args:
            word: A single whitespace-delimited token.

        Returns:
            ``0`` for an empty or whitespace-only token, otherwise at
            least ``1``.
        """
        token = word.lower().strip()
        if not token:
            return 0

        # Drop punctuation/digits so "made." is analyzed like "made".
        letters = "".join(ch for ch in token if ch.isalpha())
        vowels = set("aeiouy")

        count = 0
        prev_is_vowel = False
        for ch in letters:
            is_vowel = ch in vowels
            if is_vowel and not prev_is_vowel:
                count += 1
            prev_is_vowel = is_vowel

        if count > 1 and ContentAnalyzer._has_silent_ending(letters):
            count -= 1

        # Tokens without any vowel group (e.g. "123", "hmm") still count as one.
        return max(1, count)

    @staticmethod
    def _has_silent_ending(letters: str) -> bool:
        """Return True if *letters* ends in an unpronounced -e, -es or -ed."""
        vowels = "aeiouy"

        if letters.endswith("e"):
            stem, suffix = letters[:-1], "e"
        elif letters.endswith(("es", "ed")):
            stem, suffix = letters[:-2], letters[-2:]
        else:
            return False

        if not stem or stem[-1] in vowels:
            # The "e" is part of a vowel cluster that was counted only once
            # (coffee, agreed), so there is nothing extra to subtract.
            return False
        if stem[-1] == "l" and len(stem) >= 2 and stem[-2] not in vowels:
            # Consonant + "le" is its own syllable: table, apples, settled.
            return False
        if suffix == "ed" and stem[-1] in "td":
            # "-ted" / "-ded" are pronounced: wanted, needed.
            return False
        if suffix == "es" and (stem[-1] in "sxzcg" or stem.endswith(("ch", "sh"))):
            # Sibilant + "es" is pronounced: boxes, wishes, pages.
            return False
        return True