"""Clean scraped raw text and split it into train/test/valid corpora.

Two topic files are processed in turn.  Each one is cleaned (doubled single
quotes removed, disallowed characters replaced by ``@-@``, a space inserted
after a period that is directly followed by a capital letter, sentences
delimited, double hyphens removed) and the result is written to
``out_final.txt``.  Its whitespace-separated tokens are then distributed
over ``group3_train.txt``, ``group3_test.txt`` and ``group3_valid.txt``:
the first topic creates those files, the second topic appends to them.

The cleaning steps run in memory, so the former ``out.txt`` / ``out2.txt``
scratch files are no longer produced.
"""
import re

FIRST_TOPIC_RAW = "elections_raw_text.txt"
SECOND_TOPIC_RAW = "foreign_rel_raw_text.txt"
CLEAN_FILE = "out_final.txt"
TRAIN_FILE = "group3_train.txt"
TEST_FILE = "group3_test.txt"
VALID_FILE = "group3_valid.txt"

# Fixed token indices at which the second topic is cut into train/test/valid.
SECOND_TOPIC_TRAIN_END = 7000000
SECOND_TOPIC_TEST_END = 7700000

# Matches a space that is directly followed by another space, or a run of
# characters other than letters, digits, space and - _ , . ? ! : ;
# Newlines are not in the allowed set, so they are replaced as well and the
# text becomes a single line after this substitution.
SPECIAL_CHARS = re.compile('[ ](?=[ ])|[^-_,A-Za-z0-9 .?!,:;]+')

# A period glued to the capital letter that follows it.
MISSING_SPACE = re.compile('([.])([A-Z])')

# NOTE(review): the history describes this step as "add tags", but the
# literals written around each sentence are single spaces.  Presumably
# sentence-boundary markup was lost somewhere -- confirm before changing.
SENTENCE_PREFIX = ' '
SENTENCE_SUFFIX = ' '


def clean_text(raw_text, sent_tokenize):
    """Return *raw_text* cleaned and with every sentence delimited.

    Args:
        raw_text: The scraped text of one topic.
        sent_tokenize: Callable that takes a string and returns a list of
            sentence strings (``nltk.tokenize.sent_tokenize`` in ``main``).

    Returns:
        The cleaned text as one string; empty when no sentences are found.
    """
    text = raw_text.replace("''", '')
    text = SPECIAL_CHARS.sub('@-@', text)
    text = MISSING_SPACE.sub(r'\1 \2', text)
    # The whole text is tokenised at once.  The original tokenised inside
    # the read loop but wrote the sentences after it, so only the last line
    # read was kept and an empty file raised NameError.
    delimited = ''.join(
        SENTENCE_PREFIX + sentence + SENTENCE_SUFFIX
        for sentence in sent_tokenize(text)
    )
    return delimited.replace('--', '')


def split_tokens(tokens, train_end, test_end):
    """Cut *tokens* into contiguous train, test and valid lists.

    The slices share their boundaries, so every token lands in exactly one
    part.  (The original sliced ``[train_end+1:test_end]`` and
    ``[test_end+1:]``, which dropped the tokens at both boundary indices.)

    Args:
        tokens: List of tokens.
        train_end: Index at which the train part ends and test begins.
        test_end: Index at which the test part ends and valid begins.

    Returns:
        A ``(train, test, valid)`` tuple of lists.
    """
    return tokens[:train_end], tokens[train_end:test_end], tokens[test_end:]


def build_clean_corpus(raw_path, sent_tokenize):
    """Clean the file at *raw_path*, store it in CLEAN_FILE, return its tokens."""
    with open(raw_path, "rt") as raw_file:
        cleaned = clean_text(raw_file.read(), sent_tokenize)
    with open(CLEAN_FILE, "wt") as clean_file:
        clean_file.write(cleaned)
    return cleaned.split()


def write_splits(tokens, train_end, test_end, mode):
    """Write the three parts of *tokens* to the group3 files.

    Each token is followed by a single space.  *mode* is the ``open`` mode:
    ``"wt"`` to create the files, ``"a"`` to append to them.
    """
    parts = split_tokens(tokens, train_end, test_end)
    for path, part in zip((TRAIN_FILE, TEST_FILE, VALID_FILE), parts):
        with open(path, mode) as out_file:
            out_file.write(''.join(token + ' ' for token in part))


def count_tokens(path):
    """Return the number of whitespace-separated tokens in the file at *path*."""
    with open(path, 'r') as handle:
        return sum(len(line.split()) for line in handle)


def main():
    """Run the pipeline for both topics and print the token statistics."""
    # Imported here so the helpers above stay importable without NLTK and
    # the model download only happens when the script is actually run.
    import nltk
    from nltk import tokenize

    nltk.download('punkt')

    # First topic: 70% train, 15% test, 15% valid; creates the output files.
    tokens = build_clean_corpus(FIRST_TOPIC_RAW, tokenize.sent_tokenize)
    num_words = len(tokens)
    print("Number of total tokens", num_words)
    write_splits(tokens, round(0.7 * num_words), round(0.85 * num_words), "wt")

    # Second topic: fixed boundaries; appended to the same output files.
    tokens = build_clean_corpus(SECOND_TOPIC_RAW, tokenize.sent_tokenize)
    write_splits(tokens, SECOND_TOPIC_TRAIN_END, SECOND_TOPIC_TEST_END, "a")
    print("Number of total tokens", len(tokens))

    print("Number of total tokens for train", count_tokens(TRAIN_FILE))
    print("Number of total tokens for test", count_tokens(TEST_FILE))
    print("Number of total tokens for valid", count_tokens(VALID_FILE))


if __name__ == "__main__":
    main()