-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyzer.py
More file actions
156 lines (127 loc) · 6.56 KB
/
analyzer.py
File metadata and controls
156 lines (127 loc) · 6.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Before running, name your word frequency file to "word_frequency.json" to build on top.
# Otherwise the function will create a blank new word frequency file and then start counting.
# After analysis, a new word_freq file will be created with its date of creation in its file name.
import datetime  # For naming output files
import json
import os

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
# NLTK's English stopword list; stored as a set for O(1) membership tests
# in the per-token hot loop (the original list was O(n) per lookup).
stopwords = set(stopwords.words("english"))

# Mapping of word variants -> root form (e.g. "running" -> "run"),
# prepared externally and loaded once at startup.
with open('word_families_mapping.json', 'r', encoding='utf-8') as reference_f:
    word_families_mapping = json.load(reference_f)
variants = word_families_mapping.keys() # Used to reduce words to their root forms

# Try to open existing word frequency file so a new run builds on top of it.
# BUGFIX: the original bare `except:` swallowed every error (including
# KeyboardInterrupt); catch only a missing or unreadable JSON file.
try:
    with open('word_frequency.json', "r") as collection_f:
        word_freq = json.load(collection_f)
except (FileNotFoundError, json.JSONDecodeError):
    print("Previous word frequency dict not found!!")
    word_freq = {}
# Read files and calculate word frequencies
# Analyze files in scraped_articles folder by default
def read_files_by_year(year1, year2):
    """Count word-family frequencies in scraped_articles/<year> for year1..year2 (inclusive).

    Updates the module-level ``word_freq`` dict in place:
    ``{root_word: {"freq": int, "existing": [variant, ...]}}``.
    Reads the module-level ``word_cap`` (maximum number of times a single
    word family is counted per article, to reduce bias) and sets
    ``article_counter`` to the number of .txt files processed.
    """
    global word_freq
    global word_cap  # Maximum count of a specific word per article. To reduce bias.
    global article_counter
    article_counter = 0
    for folder_num in range(year1, year2 + 1):  # Include both start and end years.
        directory = "scraped_articles" + "/" + str(folder_num)
        print("analyzing year:", folder_num)
        for root, dirs, files in os.walk(directory):
            for file in files:
                if not file.endswith('.txt'):  # Only read text files
                    continue
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                tokens = word_tokenize(text)
                word_cap_dict = {}  # Per-article counts, reset for each article.
                article_counter += 1
                for token in tokens:
                    # BUGFIX: lowercase *before* filtering, so capitalized
                    # stopwords ("The", "And") are caught by the stopword
                    # check and wordnet is queried consistently.
                    token = token.lower()
                    if not all([
                        token.isalpha(),
                        len(token) > 2,
                        bool(wordnet.synsets(token)),  # Is a dictionary word
                        token not in stopwords  # Is not a stopword
                    ]):
                        continue
                    # Reduce word to its root; tokens outside the mapping are ignored.
                    if token in variants:
                        root_word = word_families_mapping[token]
                        capped = 0
                        if root_word not in word_cap_dict:
                            word_cap_dict[root_word] = 1
                        elif word_cap_dict[root_word] == word_cap:
                            # Reached the per-article cap: don't increment freq,
                            # but still record the variant spelling below.
                            # (BUGFIX: the original `continue`d here, so capped
                            # variants were never recorded — now consistent
                            # with read_files_all.)
                            capped = 1
                        else:
                            word_cap_dict[root_word] += 1
                        if root_word not in word_freq:
                            word_freq[root_word] = {"freq": 1, "existing": []}
                        elif not capped:
                            word_freq[root_word]["freq"] += 1
                        # Track which surface forms mapped to this root word.
                        existing_variants = word_freq[root_word]["existing"]
                        if token not in existing_variants:
                            existing_variants.append(token)
def read_files_all(directory="scraped_articles"):
    """Count word-family frequencies in every .txt file under *directory*.

    *directory* defaults to "scraped_articles" (backward-compatible: callers
    passing an explicit path behave exactly as before; this also repairs the
    no-argument call in the script's "all years" branch).

    Updates the module-level ``word_freq`` dict in place:
    ``{root_word: {"freq": int, "existing": [variant, ...]}}``.
    Reads the module-level ``word_cap`` (maximum number of times a single
    word family is counted per article) and sets ``article_counter``.
    """
    global word_freq
    global word_cap
    global article_counter
    article_counter = 0
    for root, dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith('.txt'):  # Only read text files
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            tokens = word_tokenize(text)
            word_cap_dict = {}  # Per-article counts, reset for each article.
            article_counter += 1
            for token in tokens:
                # BUGFIX: lowercase *before* filtering, so capitalized
                # stopwords ("The", "And") are caught by the stopword check
                # and wordnet is queried consistently.
                token = token.lower()
                if not all([
                    token.isalpha(),
                    len(token) > 2,
                    bool(wordnet.synsets(token)),  # Is a dictionary word
                    token not in stopwords  # Is not a stopword
                ]):
                    continue
                # Reduce word to its root; tokens outside the mapping are ignored.
                if token in variants:
                    root_word = word_families_mapping[token]
                    capped = 0
                    if root_word not in word_cap_dict:
                        word_cap_dict[root_word] = 1
                    elif word_cap_dict[root_word] == word_cap:
                        # Reached the per-article cap: don't increment freq,
                        # but still record the variant spelling below.
                        capped = 1
                    else:
                        word_cap_dict[root_word] += 1
                    if root_word not in word_freq:
                        word_freq[root_word] = {"freq": 1, "existing": []}
                    elif not capped:
                        word_freq[root_word]["freq"] += 1
                    # Track which surface forms mapped to this root word.
                    existing_variants = word_freq[root_word]["existing"]
                    if token not in existing_variants:
                        existing_variants.append(token)
# --- Interactive entry point ---------------------------------------------
# Input year range (or "all"), then the per-passage word cap, run the
# analysis, and save the sorted results to a timestamped JSON file.
input_year_result = input("Input starting and ending years to analyze (separated by space). Type 'all' to analyze all years.")
if input_year_result == "all":
    print("Analyze all years.")
    # Input word cap
    word_cap = int(input("Input the maximum number a word can be counted per passage: (recommend:3)"))
    print("Word count cap number per passage:", word_cap)
    # BUGFIX: the original called read_files_all() without the required
    # directory argument (TypeError) and then exit()-ed before the save
    # step, so "all years" results were never written to disk. Pass the
    # default folder and fall through to the shared save step instead.
    read_files_all("scraped_articles")
else:
    year1, year2 = map(int, input_year_result.split())
    print("Analyze years:", year1, "to", year2)
    # Input word cap
    word_cap = int(input("Input the maximum number a word can be counted per passage: (recommend:3)"))
    print("Word count cap number per passage:", word_cap)
    read_files_by_year(year1, year2)

# Sort by descending frequency so the most common families lead the file.
word_freq = dict(sorted(word_freq.items(), key=lambda item: item[1]["freq"], reverse=True))
# Save the word frequency data to a JSON file named with the creation time.
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"word_freq {current_time}.json"
with open(output_file, 'w') as f:
    json.dump(word_freq, f, indent = 4)
print(f"Word frequency data saved to {output_file}")
print(f"Articles analyzed: {article_counter}")