From af34aa34000ffd506ffaa80d5fcfd6616d2fd5a9 Mon Sep 17 00:00:00 2001
From: alina <a.sarzhanova@g.nsu.ru>
Date: Thu, 27 May 2021 17:11:10 +0700
Subject: [PATCH 1/7] TF-IDF

---
 tf-idf_test.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 tf-idf_test.py

diff --git a/tf-idf_test.py b/tf-idf_test.py
new file mode 100644
index 0000000..e0d52cb
--- /dev/null
+++ b/tf-idf_test.py
@@ -0,0 +1,13 @@
+import tf-idf_1.py
+import unittest
+
+
+class Tests(unittest.TestCase):
+    def test_tf_idf(self):
+        a = text.tf_idf('ничуть не бывало', text._texts)
+        b = [('ничуть', 1.330379145040104), ('не', 1.330379145040104), ('бывало', 1.330379145040104)]
+        self.assertEqual(a, b)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file

From 0aa1ea34677fe8f2511392ed6f6cd19e1e670e9a Mon Sep 17 00:00:00 2001
From: alina <a.sarzhanova@g.nsu.ru>
Date: Thu, 27 May 2021 17:23:08 +0700
Subject: [PATCH 2/7] TF-IDF

---
 tf-idf_test.py |  6 +++---
 tfidf_1.py     | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 tfidf_1.py

diff --git a/tf-idf_test.py b/tf-idf_test.py
index e0d52cb..2a5ca0e 100644
--- a/tf-idf_test.py
+++ b/tf-idf_test.py
@@ -1,11 +1,11 @@
-import tf-idf_1.py
 import unittest
+from tfidf_1 import text
 
 
 class Tests(unittest.TestCase):
     def test_tf_idf(self):
-        a = text.tf_idf('ничуть не бывало', text._texts)
-        b = [('ничуть', 1.330379145040104), ('не', 1.330379145040104), ('бывало', 1.330379145040104)]
+        a = text.tf_idf('народные любимцы', text._texts)
+        b = [('народные', 1.995568717560156), ('любимцы', 1.995568717560156)]
         self.assertEqual(a, b)
 
 
diff --git a/tfidf_1.py b/tfidf_1.py
new file mode 100644
index 0000000..c05c4c9
--- /dev/null
+++ b/tfidf_1.py
@@ -0,0 +1,52 @@
+import xml.etree.ElementTree as ET
+import re
+import json
+import collections
+import math
+import os.path
+
+
+class Texts:
+
+    def __init__(self, path_to_corpus):
+        self._texts = []
+        tree = ET.parse(path_to_corpus)
+        root = tree.getroot()
+        for text in root.iter('text'):
+            sent = []
+            for source in text.iter('source'):
+                sent.append(source.text)
+            for i in sent:
+                self._texts.append(re.sub(r'[^\w\s]', '', i.lower()))
+
+    def tf_idf(self, text, corpus):
+        tf_idf = []
+        tf = []
+        idf = {}
+        text = text.split()
+        _tf_txt = collections.Counter(text)
+        for item in _tf_txt:
+            tf.append((item, _tf_txt[item] / len(text)))
+
+        if os.path.isfile('idf.json'):
+            with open('idf.json', 'r', encoding='utf-8') as f_idf:
+                idf = json.load(f_idf)
+        else:
+            words = str(corpus)
+            words = set(re.sub(r'[^\w\s]', '', words).split())
+            for i in words:
+                for text in corpus:
+                    if i in text:
+                        a = sum([1])
+                if a != 0:
+                    idf[i] = math.log10(len(corpus) / a)
+            f = open('idf.json', 'w', encoding='utf-8')
+            f.write(json.dumps(idf))
+        for i in tf:
+            tf_idf.append((i[0], i[1]*idf.get(i[0])))
+        return tf_idf
+
+
+text = Texts('annot.opcorpora.no_ambig.xml')
+print(text._texts[10])
+print(text.tf_idf('народные любимцы' ,text._texts))
\ No newline at end of file

From 1ec8a18fdde2f25aa6168c4785966c0ffa11e084 Mon Sep 17 00:00:00 2001
From: alina <a.sarzhanova@g.nsu.ru>
Date: Fri, 28 May 2021 01:05:13 +0700
Subject: [PATCH 3/7] =?UTF-8?q?TF-IDF=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD?=
 =?UTF-8?q?=D0=B5=D0=BD=D0=BD=D1=8B=D0=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tf-idf_test.py |  6 ++++--
 tfidf_1.py     | 21 ++++++++++++++-------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/tf-idf_test.py b/tf-idf_test.py
index 2a5ca0e..2e6c6d5 100644
--- a/tf-idf_test.py
+++ b/tf-idf_test.py
@@ -1,10 +1,12 @@
 import unittest
-from tfidf_1 import text
+from tfidf_1 import Texts
+
+text = Texts('annot.opcorpora.no_ambig.xml')
 
 
 class Tests(unittest.TestCase):
     def test_tf_idf(self):
-        a = text.tf_idf('народные любимцы', text._texts)
+        a = text.tf_idf(text.get_text(10), text.get_corpus())
         b = [('народные', 1.995568717560156), ('любимцы', 1.995568717560156)]
         self.assertEqual(a, b)
 
diff --git a/tfidf_1.py b/tfidf_1.py
index c05c4c9..b0aa0b2 100644
--- a/tfidf_1.py
+++ b/tfidf_1.py
@@ -18,6 +18,7 @@ def __init__(self, path_to_corpus):
                 sent.append(source.text)
             for i in sent:
                 self._texts.append(re.sub(r'[^\w\s]', '', i.lower()))
+        self.f = open('idf.json', 'r', encoding='utf-8')
 
     def tf_idf(self, text, corpus):
         tf_idf = []
@@ -29,24 +30,30 @@ def tf_idf(self, text, corpus):
             tf.append((item, _tf_txt[item] / len(text)))
 
         if os.path.isfile('idf.json'):
-            with open('idf.json', 'r', encoding='utf-8') as f_idf:
-                idf = json.load(f_idf)
+            idf = json.load(self.f)
         else:
             words = str(corpus)
             words = set(re.sub(r'[^\w\s]', '', words).split())
+            a = 0
             for i in words:
                 for text in corpus:
                     if i in text:
-                        a = sum([1])
+                        a += 1
                 if a != 0:
                     idf[i] = math.log10(len(corpus) / a)
-            f = open('idf.json', 'w', encoding='utf-8')
-            f.write(json.dumps(idf))
+            self.f.write(json.dumps(idf))
         for i in tf:
             tf_idf.append((i[0], i[1]*idf.get(i[0])))
         return tf_idf
 
+    def get_text(self, num):
+        if num < len(self._texts):
+            return self._texts[num]
+
+    def get_corpus(self):
+        return self._texts
+
 
 text = Texts('annot.opcorpora.no_ambig.xml')
-print(text._texts[10])
-print(text.tf_idf('народные любимцы' ,text._texts))
\ No newline at end of file
+print(text.get_text(10))
+print(text.tf_idf(text.get_text(10), text.get_corpus()))
\ No newline at end of file

From f77273b5843d45493c1d6b466aa2ed0ed0369fef Mon Sep 17 00:00:00 2001
From: alina <a.sarzhanova@g.nsu.ru>
Date: Fri, 28 May 2021 11:44:54 +0700
Subject: [PATCH 4/7] =?UTF-8?q?tf-idf=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD?=
 =?UTF-8?q?=D0=B5=D0=BD=D0=BD=D1=8B=D0=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tf-idf_test.py |  7 ++++---
 tfidf_1.py     | 18 ++++++++----------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/tf-idf_test.py b/tf-idf_test.py
index 2e6c6d5..1ad473a 100644
--- a/tf-idf_test.py
+++ b/tf-idf_test.py
@@ -1,12 +1,13 @@
 import unittest
 from tfidf_1 import Texts
 
-text = Texts('annot.opcorpora.no_ambig.xml')
-
 
 class Tests(unittest.TestCase):
+    def setUp(self):
+        self.text = Texts('annot.opcorpora.no_ambig.xml')
+
     def test_tf_idf(self):
-        a = text.tf_idf(text.get_text(10), text.get_corpus())
+        a = self.text.tf_idf('народные любимцы', self.text.get_corpus())
         b = [('народные', 1.995568717560156), ('любимцы', 1.995568717560156)]
         self.assertEqual(a, b)
 
diff --git a/tfidf_1.py b/tfidf_1.py
index b0aa0b2..94295e2 100644
--- a/tfidf_1.py
+++ b/tfidf_1.py
@@ -18,32 +18,30 @@ def __init__(self, path_to_corpus):
                 sent.append(source.text)
             for i in sent:
                 self._texts.append(re.sub(r'[^\w\s]', '', i.lower()))
-        self.f = open('idf.json', 'r', encoding='utf-8')
+        with open('idf.json', 'r', encoding='utf-8') as f:
+            self.idf = json.load(f)
 
     def tf_idf(self, text, corpus):
         tf_idf = []
         tf = []
-        idf = {}
         text = text.split()
         _tf_txt = collections.Counter(text)
         for item in _tf_txt:
             tf.append((item, _tf_txt[item] / len(text)))
 
-        if os.path.isfile('idf.json'):
-            idf = json.load(self.f)
-        else:
+        if not os.path.isfile('idf.json'):
             words = str(corpus)
             words = set(re.sub(r'[^\w\s]', '', words).split())
-            a = 0
             for i in words:
+                a = 0
                 for text in corpus:
                     if i in text:
                         a += 1
                 if a != 0:
-                    idf[i] = math.log10(len(corpus) / a)
-            self.f.write(json.dumps(idf))
+                    self.idf[i] = math.log10(len(corpus) / a)
+            self.f.write(json.dumps(self.idf))
         for i in tf:
-            tf_idf.append((i[0], i[1]*idf.get(i[0])))
+            tf_idf.append((i[0], i[1]*self.idf.get(i[0])))
         return tf_idf
 
     def get_text(self, num):
@@ -56,4 +54,4 @@ def get_corpus(self):
 
 text = Texts('annot.opcorpora.no_ambig.xml')
 print(text.get_text(10))
-print(text.tf_idf(text.get_text(10), text.get_corpus()))
\ No newline at end of file
+print(text.tf_idf('народные любимцы', text.get_corpus()))

From f60ef66ac84fba4d65fec8f9e9d03109b30bf93d Mon Sep 17 00:00:00 2001
From: alina <a.sarzhanova@g.nsu.ru>
Date: Fri, 28 May 2021 15:16:52 +0700
Subject: [PATCH 5/7] =?UTF-8?q?tf-idf=20=D0=BF=D0=BE=D0=BB=D1=83=D1=87?=
 =?UTF-8?q?=D0=B0=D0=B5=D1=82=D1=81=D1=8F=20=D1=82=D0=B0=D0=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tfidf_1.py | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/tfidf_1.py b/tfidf_1.py
index 94295e2..508bb62 100644
--- a/tfidf_1.py
+++ b/tfidf_1.py
@@ -18,30 +18,39 @@ def __init__(self, path_to_corpus):
                 sent.append(source.text)
             for i in sent:
                 self._texts.append(re.sub(r'[^\w\s]', '', i.lower()))
-        with open('idf.json', 'r', encoding='utf-8') as f:
-            self.idf = json.load(f)
+        if os.path.exists('idf_file'):
+            self._idf = self._load_idf()
+        else:
+            self._idf = self._count_idf()
 
-    def tf_idf(self, text, corpus):
+    def _load_idf(self):
+        with open('idf.json', 'r', encoding='utf-8') as f_idf:
+            return json.load(f_idf)
+
+    def _count_idf(self):
+        idf = {}
+        words = str(self._texts)
+        words = set(re.sub(r'[^\w\s]', '', words).split())
+        for i in words:
+            a = 0
+            for text in self._texts:
+                if i in text:
+                    a += 1
+            if a != 0:
+                idf[i] = math.log10(len(self._texts) / a)
+        with open('idf.json', 'w', encoding='utf-8') as f_idf:
+            f_idf.write(json.dumps(idf))
+        return idf
+
+    def tf_idf(self, text):
         tf_idf = []
         tf = []
         text = text.split()
         _tf_txt = collections.Counter(text)
         for item in _tf_txt:
             tf.append((item, _tf_txt[item] / len(text)))
-
-        if not os.path.isfile('idf.json'):
-            words = str(corpus)
-            words = set(re.sub(r'[^\w\s]', '', words).split())
-            for i in words:
-                a = 0
-                for text in corpus:
-                    if i in text:
-                        a += 1
-                if a != 0:
-                    self.idf[i] = math.log10(len(corpus) / a)
-            self.f.write(json.dumps(self.idf))
         for i in tf:
-            tf_idf.append((i[0], i[1]*self.idf.get(i[0])))
+            tf_idf.append((i[0], i[1]*self._idf.get(i[0])))
         return tf_idf
 
     def get_text(self, num):
@@ -54,4 +63,4 @@ def get_corpus(self):
 
 text = Texts('annot.opcorpora.no_ambig.xml')
 print(text.get_text(10))
-print(text.tf_idf('народные любимцы', text.get_corpus()))
+print(text.tf_idf('народные любимцы'))

From 18ccd8b443552e7376a97c0fdb6c15407ccc0bb8 Mon Sep 17 00:00:00 2001
From: alina <a.sarzhanova@g.nsu.ru>
Date: Fri, 28 May 2021 15:32:10 +0700
Subject: [PATCH 6/7] =?UTF-8?q?tf-idf=20=D0=BF=D0=BE=D0=BB=D1=83=D1=87?=
 =?UTF-8?q?=D0=B0=D0=B5=D1=82=D1=81=D1=8F=20=D1=82=D0=B0=D0=BA=3F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tf-idf_test.py | 6 +++---
 tfidf_1.py     | 3 ---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/tf-idf_test.py b/tf-idf_test.py
index 1ad473a..b3fffc2 100644
--- a/tf-idf_test.py
+++ b/tf-idf_test.py
@@ -2,13 +2,13 @@
 from tfidf_1 import Texts
 
 
-class Tests(unittest.TestCase):
+class TestTFIDF(unittest.TestCase):
     def setUp(self):
         self.text = Texts('annot.opcorpora.no_ambig.xml')
 
     def test_tf_idf(self):
-        a = self.text.tf_idf('народные любимцы', self.text.get_corpus())
-        b = [('народные', 1.995568717560156), ('любимцы', 1.995568717560156)]
+        a = self.text.tf_idf('народные любимцы')
+        b = [('народные', 1.7570080902003247), ('любимцы', 1.995568717560156)]
         self.assertEqual(a, b)
 
 
diff --git a/tfidf_1.py b/tfidf_1.py
index 508bb62..88b0183 100644
--- a/tfidf_1.py
+++ b/tfidf_1.py
@@ -57,9 +57,6 @@ def get_text(self, num):
         if num < len(self._texts):
             return self._texts[num]
 
-    def get_corpus(self):
-        return self._texts
-
 
 text = Texts('annot.opcorpora.no_ambig.xml')
 print(text.get_text(10))

From 0b2f6a58e43c227e985efdbf8a67f8aa1ce0e5c3 Mon Sep 17 00:00:00 2001
From: alina <a.sarzhanova@g.nsu.ru>
Date: Fri, 28 May 2021 18:07:43 +0700
Subject: [PATCH 7/7] tf-idf

---
 tfidf_1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tfidf_1.py b/tfidf_1.py
index 88b0183..8930bd4 100644
--- a/tfidf_1.py
+++ b/tfidf_1.py
@@ -18,7 +18,7 @@ def __init__(self, path_to_corpus):
                 sent.append(source.text)
             for i in sent:
                 self._texts.append(re.sub(r'[^\w\s]', '', i.lower()))
-        if os.path.exists('idf_file'):
+        if os.path.exists('idf.json'):  # same file _load_idf reads and _count_idf writes
             self._idf = self._load_idf()
         else:
             self._idf = self._count_idf()