From 29f92878486e6702dbdd6958f06c56053b37c265 Mon Sep 17 00:00:00 2001
From: Neelanshi Varia <neelanshiV2@gmail.com>
Date: Tue, 21 Jan 2020 23:09:00 -0600
Subject: [PATCH 1/7] Add <s></s> tags and remove extra single quotes

Remove the doubled single quotes ('') that the scraped text carries at the beginning and end of paragraphs, and insert </s> and <s> tags at sentence boundaries ('. ').
---
 project-1/cleanup.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 project-1/cleanup.py
diff --git a/project-1/cleanup.py b/project-1/cleanup.py
new file mode 100644
index 0000000..728e159
--- /dev/null
+++ b/project-1/cleanup.py
@@ -0,0 +1,19 @@
+fin = open("raw_text.txt", "rt")
+fout = open("out.txt", "wt")
+
+for line in fin:
+	fout.write(line.replace('\'\'', ' '))
+
+fin.close()
+fout.close()
+
+fin = open("out.txt", "rt")
+fout = open("out2.txt", "wt")
+
+for line in fin:
+    fout.write(line.replace('. ', '</s>. <s>'))
+
+fin.close()
+fout.close()
+
+#would be great to find a better of doing it without multiple files

From 450ac291aedb2aafe4c18d834829190ed3daa3ce Mon Sep 17 00:00:00 2001
From: Neelanshi Varia <neelanshiV2@gmail.com>
Date: Wed, 22 Jan 2020 01:07:18 -0600
Subject: [PATCH 2/7] Inserting ' @-@ ' in place of special characters

---
 project-1/cleanup.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/project-1/cleanup.py b/project-1/cleanup.py
index 728e159..029fc58 100644
--- a/project-1/cleanup.py
+++ b/project-1/cleanup.py
@@ -7,13 +7,19 @@
 fin.close()
 fout.close()
 
-fin = open("out.txt", "rt")
-fout = open("out2.txt", "wt")
+import re
+original_string = open('out.txt').read()
+new_string = re.sub('[ ](?=[ ])|[^-_,A-Za-z0-9 ]+', ' @-@ ', original_string)
+open('out2.txt', 'w').write(new_string)
+
+fin = open("out2.txt", "rt")
+fout = open("out3.txt", "wt")
 
 for line in fin:
-    fout.write(line.replace('. ', '</s>. <s>'))
+    fout.write(line.replace('. ', '</s> <s>'))
 
 fin.close()
 fout.close()
 
+
 #would be great to find a better of doing it without multiple files

From f72c55b344346e8050752f2c143edbacc2cd1884 Mon Sep 17 00:00:00 2001
From: Neelanshi Varia <neelanshiV2@gmail.com>
Date: Wed, 22 Jan 2020 01:20:03 -0600
Subject: [PATCH 3/7] Keep periods instead of replacing them with @-@

---
 project-1/cleanup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/project-1/cleanup.py b/project-1/cleanup.py
index 029fc58..ae63565 100644
--- a/project-1/cleanup.py
+++ b/project-1/cleanup.py
@@ -9,7 +9,7 @@
 
 import re
 original_string = open('out.txt').read()
-new_string = re.sub('[ ](?=[ ])|[^-_,A-Za-z0-9 ]+', ' @-@ ', original_string)
+new_string = re.sub('[ ](?=[ ])|[^-_,A-Za-z0-9 .]+', '@-@', original_string)
 open('out2.txt', 'w').write(new_string)
 
 fin = open("out2.txt", "rt")

From befbfab758388e661590b63d0b3f27e8d98bc4e9 Mon Sep 17 00:00:00 2001
From: Neelanshi Varia <neelanshiV2@gmail.com>
Date: Wed, 22 Jan 2020 10:54:06 -0600
Subject: [PATCH 4/7] Keep ?!;: and replace hyphens and underscores with @-@

---
 project-1/cleanup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/project-1/cleanup.py b/project-1/cleanup.py
index ae63565..ac67427 100644
--- a/project-1/cleanup.py
+++ b/project-1/cleanup.py
@@ -9,7 +9,7 @@
 
 import re
 original_string = open('out.txt').read()
-new_string = re.sub('[ ](?=[ ])|[^-_,A-Za-z0-9 .]+', '@-@', original_string)
+new_string = re.sub('[ ](?=[ ])|[^,A-Za-z0-9 .?!;:]+', '@-@', original_string)
 open('out2.txt', 'w').write(new_string)
 
 fin = open("out2.txt", "rt")

From cbc7b93267288d4c47a42127d075dbae763ae852 Mon Sep 17 00:00:00 2001
From: Neelanshi Varia <neelanshiV2@gmail.com>
Date: Wed, 22 Jan 2020 15:33:03 -0600
Subject: [PATCH 5/7] Tokenise sentences and clean up the file

---
 project-1/cleanup.py | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/project-1/cleanup.py b/project-1/cleanup.py
index ac67427..605877c 100644
--- a/project-1/cleanup.py
+++ b/project-1/cleanup.py
@@ -1,25 +1,51 @@
+from nltk import tokenize
+import nltk
+import re
+nltk.download('punkt')
+
+#remove quotes resulting from <p> elements of text scraping
 fin = open("raw_text.txt", "rt")
 fout = open("out.txt", "wt")
 
 for line in fin:
-	fout.write(line.replace('\'\'', ' '))
+	fout.write(line.replace('\'\'', ''))
 
 fin.close()
 fout.close()
 
-import re
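+#replace every run of characters other than letters, digits, space and -_,.?!:; with @-@ (a space followed by another space, and newlines, are replaced too)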
 original_string = open('out.txt').read()
-new_string = re.sub('[ ](?=[ ])|[^,A-Za-z0-9 .?!;:]+', '@-@', original_string)
+new_string = re.sub('[ ](?=[ ])|[^-_,A-Za-z0-9 .?!,:;]+', '@-@', original_string)
 open('out2.txt', 'w').write(new_string)
 
+#add spaces between sentences which don't have a space between period and capital word due to the fetching pattern
 fin = open("out2.txt", "rt")
-fout = open("out3.txt", "wt")
+fout = open("out.txt", "wt")
+for line in fin:
+    fout.write(re.sub('([.])([A-Z])', r'\1 \2', line))
+fin.close()
+fout.close()
 
+#tokenise and add <s> </s> tags
+fin = open("out.txt", "rt")
+fout = open("out2.txt", "wt")
 for line in fin:
-    fout.write(line.replace('. ', '</s> <s>'))
+    y = tokenize.sent_tokenize(line)
 
+for line in y:
+    fout.write('<s> ')
+    fout.write(line)
+    fout.write(' </s> ')
 fin.close()
 fout.close()
 
+#remove double hyphens
+fin = open("out2.txt", "rt")
+fout = open("out_final.txt", "wt")
+
+for line in fin:
+    fout.write(line.replace('--', ''))
+fin.close()
+fout.close()
 
-#would be great to find a better of doing it without multiple files
+#final output file to consider is out_final.txt

From febbe6bd60bdcb8c1ce90ba112b6f4bbbc6e8eee Mon Sep 17 00:00:00 2001
From: Neelanshi Varia <neelanshiV2@gmail.com>
Date: Wed, 22 Jan 2020 17:54:02 -0600
Subject: [PATCH 6/7] Divides the corpus to train, test and valid

---
 project-1/cleanup.py | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/project-1/cleanup.py b/project-1/cleanup.py
index 605877c..fa0469d 100644
--- a/project-1/cleanup.py
+++ b/project-1/cleanup.py
@@ -4,7 +4,7 @@
 nltk.download('punkt')
 
 #remove quotes resulting from <p> elements of text scraping
-fin = open("raw_text.txt", "rt")
+fin = open("elections_raw_text.txt", "rt")
 fout = open("out.txt", "wt")
 
 for line in fin:
@@ -48,4 +48,40 @@
 fin.close()
 fout.close()
 
-#final output file to consider is out_final.txt
+num_words = 0
+num_lines = 0
+
+with open("out_final.txt", 'r') as f:
+    for line in f:
+        words = line.split()
+        num_words += len(words)
+
+print("Number of total tokens",num_words)
+
+#divide the corpus into three files: train (first 70% of the tokens), test (next 15%) and valid (the rest)
+ftr = open("group3_train.txt", "wt")
+fts = open("group3_test.txt", "wt")
+fv = open("group3_valid.txt", "wt")
+
+tr = round(0.7*num_words)
+ts = round(0.85*num_words)
+
+with open("out_final.txt", "rt") as f:
+    data = f.read().split()
+
+train_data = data[:tr]  # tokens [0, tr)
+test_data = data[tr:ts]  # slice ends are exclusive, so start at tr; tr+1 dropped the token at index tr
+valid_data = data[ts:]  # likewise start at ts so the token at index ts is kept
+
+for line in train_data:
+    ftr.write(line + ' ')
+
+for line in test_data:
+    fts.write(line + ' ')
+
+for line in valid_data:
+    fv.write(line + ' ')
+
+ftr.close()
+fts.close()
+fv.close()

From 0ddba6ff8e2754faa63a91413143f14c4ca506cb Mon Sep 17 00:00:00 2001
From: Neelanshi Varia <neelanshiV2@gmail.com>
Date: Wed, 22 Jan 2020 20:11:08 -0600
Subject: [PATCH 7/7] Append the second topic's text to the train, test and valid files and print token counts

---
 project-1/cleanup.py | 113 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/project-1/cleanup.py b/project-1/cleanup.py
index fa0469d..a9fb23c 100644
--- a/project-1/cleanup.py
+++ b/project-1/cleanup.py
@@ -85,3 +85,116 @@
 ftr.close()
 fts.close()
 fv.close()
+
+
+
+
+#######################################################
+#append the second topic's raw text
+
+fin = open("foreign_rel_raw_text.txt", "rt")
+fout = open("out.txt", "wt")
+
+for line in fin:
+	fout.write(line.replace('\'\'', ''))
+
+fin.close()
+fout.close()
+
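+#replace every run of characters other than letters, digits, space and -_,.?!:; with @-@ (a space followed by another space, and newlines, are replaced too)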
+original_string = open('out.txt').read()
+new_string = re.sub('[ ](?=[ ])|[^-_,A-Za-z0-9 .?!,:;]+', '@-@', original_string)
+open('out2.txt', 'w').write(new_string)
+
+#add spaces between sentences which don't have a space between period and capital word due to the fetching pattern
+fin = open("out2.txt", "rt")
+fout = open("out.txt", "wt")
+for line in fin:
+    fout.write(re.sub('([.])([A-Z])', r'\1 \2', line))
+fin.close()
+fout.close()
+
+#tokenise and add <s> </s> tags
+fin = open("out.txt", "rt")
+fout = open("out2.txt", "wt")
+for line in fin:
+    #tag the sentences of every input line inside the loop; consuming y after the loop only saw the last line (or a stale y if the file was empty)
+    for sentence in tokenize.sent_tokenize(line):
+        fout.write('<s> ')
+        fout.write(sentence)
+        fout.write(' </s> ')
+
+fin.close()
+fout.close()
+
+#remove double hyphens
+fin = open("out2.txt", "rt")
+fout = open("out_final.txt", "wt")
+
+for line in fin:
+    fout.write(line.replace('--', ''))
+fin.close()
+fout.close()
+
+num_words = 0
+num_lines = 0
+
+
+#divide this topic's tokens into three parts and append them to the existing train, test and valid files
+ftr = open("group3_train.txt", "a")
+fts = open("group3_test.txt", "a")
+fv = open("group3_valid.txt", "a")
+
+tr = 7000000
+ts = 7700000
+
+with open("out_final.txt", "rt") as f:
+    data = f.read().split()
+
+train_data = data[:tr]  # tokens [0, tr)
+test_data = data[tr:ts]  # slice ends are exclusive, so start at tr; tr+1 dropped the token at index tr
+valid_data = data[ts:]  # likewise start at ts so the token at index ts is kept
+
+for line in train_data:
+    ftr.write(line + ' ')
+
+for line in test_data:
+    fts.write(line + ' ')
+
+for line in valid_data:
+    fv.write(line + ' ')
+
+ftr.close()
+fts.close()
+fv.close()
+
+with open("out_final.txt", 'r') as f:
+    for line in f:
+        words = line.split()
+        num_words += len(words)
+
+print("Number of total tokens",num_words)
+
+num_words = 0
+with open("group3_train.txt", 'r') as f:
+    for line in f:
+        words = line.split()
+        num_words += len(words)
+
+print("Number of total tokens for train",num_words)
+
+num_words = 0
+with open("group3_test.txt", 'r') as f:
+    for line in f:
+        words = line.split()
+        num_words += len(words)
+
+print("Number of total tokens for test",num_words)
+
+num_words = 0
+with open("group3_valid.txt", 'r') as f:
+    for line in f:
+        words = line.split()
+        num_words += len(words)
+
+print("Number of total tokens for valid",num_words)