-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_analysis.py
More file actions
86 lines (69 loc) · 3.01 KB
/
text_analysis.py
File metadata and controls
86 lines (69 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List, Tuple
from utils import setup_logging
# Module-level logger for this file, obtained from the project-local
# `setup_logging` helper (imported from `utils`) and keyed by the module name.
logger = setup_logging(__name__)
class ContentAnalyzer:
    """Analyzes text content using various NLP techniques.

    Offers TF-IDF keyword extraction and a Flesch Reading Ease score.

    Attributes:
        language: Language code given to the constructor (e.g. ``"en"``).
        lang_mapping: Maps supported language codes to NLTK language names.
        stop_words: NLTK stop-word list for the mapped language, or the
            string ``'english'`` when the NLTK corpus lookup fails.
        vectorizer: ``TfidfVectorizer`` limited to 1000 features over
            unigrams and bigrams, using ``stop_words``.
    """

    # Characters the syllable heuristic treats as vowels.
    _VOWELS = frozenset("aeiouy")

    def __init__(self, language: str):
        """Set up stop words and the TF-IDF vectorizer for *language*.

        Args:
            language: Detected language code. Codes missing from
                ``lang_mapping`` are treated as English.
        """
        self.language = language
        # Map detected language code to NLTK language name
        self.lang_mapping = {
            "en": "english", "fr": "french", "de": "german", "es": "spanish",
            "it": "italian", "pt": "portuguese", "nl": "dutch", "sv": "swedish",
            "no": "norwegian", "fi": "finnish", "ru": "russian",
        }
        nltk_lang = self.lang_mapping.get(language, "english")
        try:
            self.stop_words = list(nltk.corpus.stopwords.words(nltk_lang))
        except LookupError:
            # The NLTK stop-word corpus is not available. Fall back to the
            # string 'english', which TfidfVectorizer accepts as its built-in
            # English list, and say so instead of degrading silently.
            logger.warning(
                "NLTK stop words for %r unavailable; "
                "falling back to the built-in English stop-word list",
                nltk_lang,
            )
            self.stop_words = 'english'
        self.vectorizer = TfidfVectorizer(
            stop_words=self.stop_words,
            max_features=1000,
            ngram_range=(1, 2),
        )

    def extract_keywords(self, text: str, top_n: int = 10) -> List[Tuple[str, float]]:
        """Extract important keywords using TF-IDF.

        The vectorizer is re-fitted on *text* alone at every call.

        Args:
            text: Document to analyse.
            top_n: Maximum number of keywords to return. ``None`` returns
                every feature; zero or a negative value returns ``[]``.

        Returns:
            ``(term, score)`` pairs sorted by descending score, with scores
            as plain Python floats. An empty list is returned for blank
            input or when extraction fails (the failure is logged).
        """
        try:
            # Blank input is not an error: skip the vectorizer (which would
            # raise on an empty vocabulary) and avoid a spurious error log.
            if not text or not text.strip():
                return []
            # A negative slice bound would silently drop items from the end
            # rather than limit the result, so treat it like zero.
            if top_n is not None and top_n <= 0:
                return []
            tfidf_matrix = self.vectorizer.fit_transform([text])
            feature_names = self.vectorizer.get_feature_names_out()
            weights = tfidf_matrix.toarray()[0]
            ranked = sorted(
                zip(feature_names, weights),
                key=lambda pair: pair[1],
                reverse=True,
            )
            # Convert NumPy scalars so the result matches the annotation.
            return [(term, float(weight)) for term, weight in ranked[:top_n]]
        except Exception as e:
            logger.error(f"Keyword extraction failed: {e}")
            return []

    def calculate_readability_score(self, text: str) -> float:
        """Calculate text readability using Flesch Reading Ease.

        Args:
            text: Text to score.

        Returns:
            The score clamped to ``[0.0, 100.0]`` and rounded to two
            decimals. ``0.0`` is returned when the text has no words or
            sentences, or when the calculation fails (the failure is logged).
        """
        try:
            # Tokens without any letter or digit (stray dashes, bullets,
            # ellipses) are not words and must not inflate the word count.
            words = [
                token for token in text.split()
                if any(ch.isalnum() for ch in token)
            ]
            if not words:
                return 0.0
            sentences = self._split_sentences(text)
            if not sentences:
                return 0.0
            syllable_count = sum(self._count_syllables(word) for word in words)
            return self._flesch_reading_ease(
                len(words), len(sentences), syllable_count
            )
        except Exception as e:
            logger.error(f"Readability calculation failed: {e}")
            return 0.0

    def _split_sentences(self, text: str) -> List[str]:
        """Split *text* into sentences using the analyzer's language."""
        nltk_lang = self.lang_mapping.get(self.language, "english")
        try:
            return nltk.sent_tokenize(text, language=nltk_lang)
        except LookupError:
            # No sentence model for that language is installed; fall back to
            # NLTK's default tokenizer rather than failing the whole score.
            return nltk.sent_tokenize(text)

    @staticmethod
    def _flesch_reading_ease(word_count: int, sentence_count: int,
                             syllable_count: int) -> float:
        """Apply the Flesch Reading Ease formula to pre-computed counts.

        Both *word_count* and *sentence_count* must be non-zero. The result
        is clamped to ``[0.0, 100.0]`` and rounded to two decimals.
        """
        score = 206.835 - 1.015 * (word_count / sentence_count)
        score -= 84.6 * (syllable_count / word_count)
        return round(max(0.0, min(100.0, score)), 2)

    @staticmethod
    def _count_syllables(word: str) -> int:
        """Count syllables in a word.

        Heuristic: each run of consecutive vowels (``aeiouy``) counts as one
        syllable, and one is subtracted for a trailing ``e``/``es``/``ed``
        when more than one run was found. Punctuation surrounding the word
        is ignored.

        Returns:
            ``0`` for a token with no letters or digits, otherwise at
            least ``1``.
        """
        token = word.lower().strip()
        # Whitespace-split tokens keep adjacent punctuation ("made.",
        # '"time,'), which would defeat the suffix check below. Trim
        # non-alphanumeric characters from both ends only, so inner hyphens
        # and apostrophes still separate vowel runs as before.
        edge_chars = {ch for ch in token if not ch.isalnum()}
        if edge_chars:
            token = token.strip("".join(edge_chars))
        if not token:
            return 0
        vowels = ContentAnalyzer._VOWELS
        # Pair each character with its predecessor; the leading space stands
        # in for "no previous character" and is never a vowel.
        count = sum(
            1
            for prev_char, char in zip(" " + token, token)
            if char in vowels and prev_char not in vowels
        )
        if token.endswith(('e', 'es', 'ed')) and count > 1:
            count -= 1
        return max(1, count)