From af34aa34000ffd506ffaa80d5fcfd6616d2fd5a9 Mon Sep 17 00:00:00 2001 From: alina Date: Thu, 27 May 2021 17:11:10 +0700 Subject: [PATCH 1/7] TF-TDF --- tf-idf_test.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tf-idf_test.py diff --git a/tf-idf_test.py b/tf-idf_test.py new file mode 100644 index 0000000..e0d52cb --- /dev/null +++ b/tf-idf_test.py @@ -0,0 +1,13 @@ +import tf-idf_1.py +import unittest + + +class Tests(unittest.TestCase): + def test_tf_idf(self): + a = text.tf_idf('ничуть не бывало', text._texts) + b = [('ничуть', 1.330379145040104), ('не', 1.330379145040104), ('бывало', 1.330379145040104)] + self.assertEqual(a, b) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 0aa1ea34677fe8f2511392ed6f6cd19e1e670e9a Mon Sep 17 00:00:00 2001 From: alina Date: Thu, 27 May 2021 17:23:08 +0700 Subject: [PATCH 2/7] TF-TDF --- tf-idf_test.py | 6 +++--- tfidf_1.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 tfidf_1.py diff --git a/tf-idf_test.py b/tf-idf_test.py index e0d52cb..2a5ca0e 100644 --- a/tf-idf_test.py +++ b/tf-idf_test.py @@ -1,11 +1,11 @@ -import tf-idf_1.py import unittest +from tfidf_1 import text class Tests(unittest.TestCase): def test_tf_idf(self): - a = text.tf_idf('ничуть не бывало', text._texts) - b = [('ничуть', 1.330379145040104), ('не', 1.330379145040104), ('бывало', 1.330379145040104)] + a = text.tf_idf('народные любимцы', text._texts) + b = [('народные', 1.995568717560156), ('любимцы', 1.995568717560156)] self.assertEqual(a, b) diff --git a/tfidf_1.py b/tfidf_1.py new file mode 100644 index 0000000..c05c4c9 --- /dev/null +++ b/tfidf_1.py @@ -0,0 +1,52 @@ +import xml.etree.ElementTree as ET +import re +import json +import collections +import math +import os.path + + +class Texts: + + def __init__(self, path_to_corpus): + self._texts = [] + tree = ET.parse(path_to_corpus) + root = tree.getroot() + for text in root.iter('text'): + sent = [] + for source in text.iter('source'): + sent.append(source.text) + for i in sent: + self._texts.append(re.sub(r'[^\w\s]', '', i.lower())) + + def tf_idf(self, text, corpus): + tf_idf = [] + tf = [] + idf = {} + text = text.split() + _tf_txt = collections.Counter(text) + for item in _tf_txt: + tf.append((item, _tf_txt[item] / len(text))) + + if os.path.isfile('idf.json'): + with open('idf.json', 'r', encoding='utf-8') as f_idf: + idf = json.load(f_idf) + else: + words = str(corpus) + words = set(re.sub(r'[^\w\s]', '', words).split()) + for i in words: + for text in corpus: + if i in text: + a = sum([1]) + if a != 0: + idf[i] = math.log10(len(corpus) / a) + f = open('idf.json', 'w', encoding='utf-8') + f.write(json.dumps(idf)) + for i in tf: + tf_idf.append((i[0], i[1]*idf.get(i[0]))) + return tf_idf + + +text = Texts('annot.opcorpora.no_ambig.xml') +print(text._texts[10]) +print(text.tf_idf('народные любимцы' ,text._texts)) \ No newline at end of file From 1ec8a18fdde2f25aa6168c4785966c0ffa11e084 Mon Sep 17 00:00:00 2001 From: alina Date: Fri, 28 May 2021 01:05:13 +0700 Subject: [PATCH 3/7] =?UTF-8?q?TF-IDF=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD?= =?UTF-8?q?=D0=B5=D0=BD=D0=BD=D1=8B=D0=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf-idf_test.py | 6 ++++-- tfidf_1.py | 21 ++++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tf-idf_test.py b/tf-idf_test.py index 2a5ca0e..2e6c6d5 100644 --- a/tf-idf_test.py +++ b/tf-idf_test.py @@ -1,10 +1,12 @@ import unittest -from tfidf_1 import text +from tfidf_1 import Texts + +text = Texts('annot.opcorpora.no_ambig.xml') class Tests(unittest.TestCase): def test_tf_idf(self): - a = text.tf_idf('народные любимцы', text._texts) + a = text.tf_idf(text.get_text(10), text.get_corpus()) b = [('народные', 1.995568717560156), ('любимцы', 1.995568717560156)] self.assertEqual(a, b) diff --git a/tfidf_1.py b/tfidf_1.py index c05c4c9..b0aa0b2 100644 --- a/tfidf_1.py +++ b/tfidf_1.py @@ -18,6 +18,7 @@ def __init__(self, path_to_corpus): sent.append(source.text) for i in sent: self._texts.append(re.sub(r'[^\w\s]', '', i.lower())) + self.f = open('idf.json', 'r', encoding='utf-8') def tf_idf(self, text, corpus): tf_idf = [] @@ -29,24 +30,30 @@ def tf_idf(self, text, corpus): tf.append((item, _tf_txt[item] / len(text))) if os.path.isfile('idf.json'): - with open('idf.json', 'r', encoding='utf-8') as f_idf: - idf = json.load(f_idf) + idf = json.load(self.f) else: words = str(corpus) words = set(re.sub(r'[^\w\s]', '', words).split()) + a = 0 for i in words: for text in corpus: if i in text: - a = sum([1]) + a += 1 if a != 0: idf[i] = math.log10(len(corpus) / a) - f = open('idf.json', 'w', encoding='utf-8') - f.write(json.dumps(idf)) + self.f.write(json.dumps(idf)) for i in tf: tf_idf.append((i[0], i[1]*idf.get(i[0]))) return tf_idf + def get_text(self, num): + if num < len(self._texts): + return self._texts[num] + + def get_corpus(self): + return self._texts + text = Texts('annot.opcorpora.no_ambig.xml') -print(text._texts[10]) -print(text.tf_idf('народные любимцы' ,text._texts)) \ No newline at end of file +print(text.get_text(10)) +print(text.tf_idf(text.get_text(10), text.get_corpus())) \ No newline at end of file From f77273b5843d45493c1d6b466aa2ed0ed0369fef Mon Sep 17 00:00:00 2001 From: alina Date: Fri, 28 May 2021 11:44:54 +0700 Subject: [PATCH 4/7] =?UTF-8?q?tf-idf=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD?= =?UTF-8?q?=D0=B5=D0=BD=D0=BD=D1=8B=D0=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf-idf_test.py | 7 ++++--- tfidf_1.py | 18 ++++++++---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tf-idf_test.py b/tf-idf_test.py index 2e6c6d5..1ad473a 100644 --- a/tf-idf_test.py +++ b/tf-idf_test.py @@ -1,12 +1,13 @@ import unittest from tfidf_1 import Texts -text = Texts('annot.opcorpora.no_ambig.xml') - class Tests(unittest.TestCase): + def setUp(self): + self.text = Texts('annot.opcorpora.no_ambig.xml') + def test_tf_idf(self): - a = text.tf_idf(text.get_text(10), text.get_corpus()) + a = self.text.tf_idf('народные любимцы', self.text.get_corpus()) b = [('народные', 1.995568717560156), ('любимцы', 1.995568717560156)] self.assertEqual(a, b) diff --git a/tfidf_1.py b/tfidf_1.py index b0aa0b2..94295e2 100644 --- a/tfidf_1.py +++ b/tfidf_1.py @@ -18,32 +18,30 @@ def __init__(self, path_to_corpus): sent.append(source.text) for i in sent: self._texts.append(re.sub(r'[^\w\s]', '', i.lower())) - self.f = open('idf.json', 'r', encoding='utf-8') + with open('idf.json', 'r', encoding='utf-8') as f: + self.idf = json.load(f) def tf_idf(self, text, corpus): tf_idf = [] tf = [] - idf = {} text = text.split() _tf_txt = collections.Counter(text) for item in _tf_txt: tf.append((item, _tf_txt[item] / len(text))) - if os.path.isfile('idf.json'): - idf = json.load(self.f) - else: + if not os.path.isfile('idf.json'): words = str(corpus) words = set(re.sub(r'[^\w\s]', '', words).split()) - a = 0 for i in words: + a = 0 for text in corpus: if i in text: a += 1 if a != 0: - idf[i] = math.log10(len(corpus) / a) - self.f.write(json.dumps(idf)) + self.idf[i] = math.log10(len(corpus) / a) + self.f.write(json.dumps(self.idf)) for i in tf: - tf_idf.append((i[0], i[1]*idf.get(i[0]))) + tf_idf.append((i[0], i[1]*self.idf.get(i[0]))) return tf_idf def get_text(self, num): @@ -56,4 +54,4 @@ def get_corpus(self): text = Texts('annot.opcorpora.no_ambig.xml') print(text.get_text(10)) -print(text.tf_idf(text.get_text(10), text.get_corpus())) \ No newline at end of file +print(text.tf_idf('народные любимцы', text.get_corpus())) From f60ef66ac84fba4d65fec8f9e9d03109b30bf93d Mon Sep 17 00:00:00 2001 From: alina Date: Fri, 28 May 2021 15:16:52 +0700 Subject: [PATCH 5/7] =?UTF-8?q?tf-idf=20=D0=BF=D0=BE=D0=BB=D1=83=D1=87?= =?UTF-8?q?=D0=B0=D0=B5=D1=82=D1=81=D1=8F=20=D1=82=D0=B0=D0=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tfidf_1.py | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/tfidf_1.py b/tfidf_1.py index 94295e2..508bb62 100644 --- a/tfidf_1.py +++ b/tfidf_1.py @@ -18,30 +18,39 @@ def __init__(self, path_to_corpus): sent.append(source.text) for i in sent: self._texts.append(re.sub(r'[^\w\s]', '', i.lower())) - with open('idf.json', 'r', encoding='utf-8') as f: - self.idf = json.load(f) + if os.path.exists('idf_file'): + self._idf = self._load_idf() + else: + self._idf = self._count_idf() - def tf_idf(self, text, corpus): + def _load_idf(self): + with open('idf.json', 'r', encoding='utf-8') as f_idf: + return json.load(f_idf) + + def _count_idf(self): + idf = {} + words = str(self._texts) + words = set(re.sub(r'[^\w\s]', '', words).split()) + for i in words: + a = 0 + for text in self._texts: + if i in text: + a += 1 + if a != 0: + idf[i] = math.log10(len(self._texts) / a) + with open('idf.json', 'w', encoding='utf-8') as f_idf: + f_idf.write(json.dumps(idf)) + return idf + + def tf_idf(self, text): tf_idf = [] tf = [] text = text.split() _tf_txt = collections.Counter(text) for item in _tf_txt: tf.append((item, _tf_txt[item] / len(text))) - - if not os.path.isfile('idf.json'): - words = str(corpus) - words = set(re.sub(r'[^\w\s]', '', words).split()) - for i in words: - a = 0 - for text in corpus: - if i in text: - a += 1 - if a != 0: - self.idf[i] = math.log10(len(corpus) / a) - self.f.write(json.dumps(self.idf)) for i in tf: - tf_idf.append((i[0], i[1]*self.idf.get(i[0]))) + tf_idf.append((i[0], i[1]*self._idf.get(i[0]))) return tf_idf def get_text(self, num): @@ -54,4 +63,4 @@ def get_corpus(self): text = Texts('annot.opcorpora.no_ambig.xml') print(text.get_text(10)) -print(text.tf_idf('народные любимцы', text.get_corpus())) +print(text.tf_idf('народные любимцы')) From 18ccd8b443552e7376a97c0fdb6c15407ccc0bb8 Mon Sep 17 00:00:00 2001 From: alina Date: Fri, 28 May 2021 15:32:10 +0700 Subject: [PATCH 6/7] =?UTF-8?q?tf-idf=20=D0=BF=D0=BE=D0=BB=D1=83=D1=87?= =?UTF-8?q?=D0=B0=D0=B5=D1=82=D1=81=D1=8F=20=D1=82=D0=B0=D0=BA=3F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf-idf_test.py | 6 +++--- tfidf_1.py | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tf-idf_test.py b/tf-idf_test.py index 1ad473a..b3fffc2 100644 --- a/tf-idf_test.py +++ b/tf-idf_test.py @@ -2,13 +2,13 @@ from tfidf_1 import Texts -class Tests(unittest.TestCase): +class TestTFIDF(unittest.TestCase): def setUp(self): self.text = Texts('annot.opcorpora.no_ambig.xml') def test_tf_idf(self): - a = self.text.tf_idf('народные любимцы', self.text.get_corpus()) - b = [('народные', 1.995568717560156), ('любимцы', 1.995568717560156)] + a = self.text.tf_idf('народные любимцы') + b = [('народные', 1.7570080902003247), ('любимцы', 1.995568717560156)] self.assertEqual(a, b) diff --git a/tfidf_1.py b/tfidf_1.py index 508bb62..88b0183 100644 --- a/tfidf_1.py +++ b/tfidf_1.py @@ -57,9 +57,6 @@ def get_text(self, num): if num < len(self._texts): return self._texts[num] - def get_corpus(self): - return self._texts - text = Texts('annot.opcorpora.no_ambig.xml') print(text.get_text(10)) From 0b2f6a58e43c227e985efdbf8a67f8aa1ce0e5c3 Mon Sep 17 00:00:00 2001 From: alina Date: Fri, 28 May 2021 18:07:43 +0700 Subject: [PATCH 7/7] tf-idf --- tfidf_1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tfidf_1.py b/tfidf_1.py index 88b0183..8930bd4 100644 --- a/tfidf_1.py +++ b/tfidf_1.py @@ -18,7 +18,7 @@ def __init__(self, path_to_corpus): sent.append(source.text) for i in sent: self._texts.append(re.sub(r'[^\w\s]', '', i.lower())) - if os.path.exists('idf_file'): + if os.path.exists('idf_json'): self._idf = self._load_idf() else: self._idf = self._count_idf()