From db76f8527299c33e0edf90d0d88a75f8ddb47466 Mon Sep 17 00:00:00 2001 From: Joseph Mellor Date: Sun, 5 Sep 2021 17:09:21 -0500 Subject: [PATCH 1/6] Added typos for all letters and some punctuation. Also updated prompts. --- bot/prompt.py | 67 ++++++++++--------- bot/typos.py | 180 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 206 insertions(+), 41 deletions(-) diff --git a/bot/prompt.py b/bot/prompt.py index e0bddf8..6e07f69 100644 --- a/bot/prompt.py +++ b/bot/prompt.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import random from collections import Counter -from .typos import add_typos +from typos import add_typos def random_select_weighted_list(ls): return random.choices([k[1] for k in ls], weights = [k[0] for k in ls], k = 1)[0] @@ -126,13 +126,13 @@ def random_select_weighted_list(ls): (8.0, 'violated'), (8.0, 'disregarded'), (8.0, 'disobeyed'), - (8.0, 'assisted someone in violating'), - (8.0, 'assisted someone in breaking'), - (8.0, 'assisted someone in disobeying'), - (2.0, 'helped someone violate'), - (2.0, 'helped someone disobey'), - (2.0, 'helped someone break'), - (2.0, 'helped violate'), + (4.0, 'assisted someone in violating'), + (4.0, 'assisted someone in breaking'), + (4.0, 'assisted someone in disobeying'), + (3.0, 'helped someone violate'), + (3.0, 'helped someone disobey'), + (3.0, 'helped someone break'), + (3.0, 'helped violate'), (0.4, 'helped someone get an abortion in violation of'), (0.4, 'helped someone have an abortion in violation of'), (0.4, 'helped someone get an abortion, violating'), @@ -209,11 +209,12 @@ def random_select_weighted_list(ls): future_time_frames.extend(['next ' + k for k in days_of_the_week]) future_time_frames.extend(['on ' + k for k in days_of_the_week]) abortion_ban_words = [ - 'abortion ban', 'ban on abortion', 'new abortion law', 'law on abortion', 'recent abortion law', 'abortion restrictions', - 'restrictions on abortion', 'law' + 'abortion ban', 'ban on abortion', 'law on abortion', 'recent abortion law', 'abortion restrictions', 'restrictions on abortion', ] -abortion_ban_words = [*["Texas's " + k for k in abortion_ban_words], *["the " + k for k in abortion_ban_words]] -abortion_ban_words.extend(['Texas law', 'the new law']) +abortion_ban_words = [ *["recently passed " + k for k in abortion_ban_words], *[k for k in abortion_ban_words] ] +abortion_ban_words = [ *["Texas's " + k for k in abortion_ban_words], *["the " + k for k in abortion_ban_words] ] +abortion_ban_words.extend(['Texas law', 'the new law', 'Texas law on abortion', 'the Texas law on abortion', 'the Texas abortion law', + 'the new Texas abortion law', 'the recently passed Texas abortion law', 'new abortion law']) def gen_abortion_prompt_I(accused): abortion_prompt = 'I ' @@ -249,7 +250,10 @@ def gen_abortion_prompt_My(accused): abortion_prompt += '.' return abortion_prompt +counter = 0 + def gen_abortion_prompt(): + global counter accused_family_person = random_select_weighted_list(my_family_possessive_adj) accused_family_person += random_select_weighted_list(my_family_words) accused_nonfamily_person = random_select_weighted_list(my_nonfamily_possessive_adj) @@ -262,11 +266,14 @@ def gen_abortion_prompt(): (0.5, accused_teacher) ]) abortion_prompts = [ - (5.2, gen_abortion_prompt_I(accused)), - (2.6, gen_abortion_prompt_My(accused)) + (1.0, gen_abortion_prompt_I(accused)), + (1.0, gen_abortion_prompt_My(accused)) ] - return random_select_weighted_list(abortion_prompts) - #return add_typos(random_select_weighted_list(abortion_prompts)) + #return random_select_weighted_list(abortion_prompts) + counter += 1 + if random.random() < 0.001: + print('\r\x1b[K' + str(counter), end='') + return add_typos(random_select_weighted_list(abortion_prompts)) bigram_counter = Counter() trigram_counter = Counter() @@ -285,21 +292,21 @@ def check_ngram_frequency(prompt): quadgram_counter[cur_quadgram] += 1 return prompt +def write_ngram_to_file(counter, filename, total): + with open(filename, 'w') as writer: + for k, v in counter.most_common(): + writer.write( "{} {}\n".format(k, float(v) / total) ) + if float(v) / total < 0.001: + break + if __name__ == "__main__": total_number = 2000000 - sample_abortion_prompts = { check_ngram_frequency(gen_abortion_prompt()) for k in range(total_number) } + sample_abortion_prompts = [ check_ngram_frequency(gen_abortion_prompt()) for k in range(total_number) ] for k in sorted(list(sample_abortion_prompts)[:200], key = lambda o: random.random()): print(k) - print('Duplicates: ' + str(total_number - len(sample_abortion_prompts))) - print('Unique: ' + str(len(sample_abortion_prompts))) - print('I [think]: ' + str(len([k for k in sample_abortion_prompts if 'I' in k]))) - print('Other: ' + str(len(sample_abortion_prompts) - len([k for k in sample_abortion_prompts if 'I' in k]))) - with open('bigram_freq.txt', 'w') as writer: - for k,v in bigram_counter.most_common(): - writer.write( "{} {}\n".format(k,v) ) - with open('trigram_freq.txt', 'w') as writer: - for k,v in trigram_counter.most_common(): - writer.write( "{} {}\n".format(k,v) ) - with open('quadgram_freq.txt', 'w') as writer: - for k,v in quadgram_counter.most_common(): - writer.write( "{} {}\n".format(k,v) ) + unique = len(set(sample_abortion_prompts)) + print('Duplicates: ' + str(total_number - unique)) + print('Unique: ' + str(unique)) + write_ngram_to_file(bigram_counter, 'bigram_freq.txt', len(sample_abortion_prompts)) + write_ngram_to_file(trigram_counter, 'trigram_freq.txt', len(sample_abortion_prompts)) + write_ngram_to_file(quadgram_counter, 'quadgram_freq.txt', len(sample_abortion_prompts)) diff --git a/bot/typos.py b/bot/typos.py index 2c9c1a5..145071f 100644 --- a/bot/typos.py +++ b/bot/typos.py @@ -1,14 +1,19 @@ #!/usr/bin/env python3 import random +import re def random_select_weighted_list(ls): return random.choices([k[1] for k in ls], weights = [k[0] for k in ls], k = 1)[0] def gen_typo_odds(): - return 1.0 - (random.betavariate(16, 2) * 0.3 + 0.6995) + return 1.0 - (random.betavariate(0.5, 0.15) * 0.3 + 0.699995) def add_typos(in_str): typo_odds = gen_typo_odds() + # Outside of English transliterations, the letter 'q' is always followed by + # a 'u'. By combining them into one letter, typos are easier to introduce. + in_str = re.sub('qu', 'q', in_str) + in_str = re.sub("'s", "'", in_str) sub_dict = { 'a': [ # First element is always 1 - typo_odds and it's always the correct @@ -16,17 +21,170 @@ def add_typos(in_str): # They can also be multiple letters or zero letters. (1 - typo_odds, 'a'), # Make sure that these add up to 1.0. - (0.1 * typo_odds, 's'), - (0.2 * typo_odds, 'q'), - (0.1 * typo_odds, 'w'), - (0.5 * typo_odds, 'e'), + (0.2 * typo_odds, 'e'), (0.1 * typo_odds, 'i') ], + 'b': [ + (1 - typo_odds, 'b'), + (0.2 * typo_odds, 'p'), + (0.1 * typo_odds, 'h'), + (0.1 * typo_odds, 'n'), + ], + 'c': [ + (1 - typo_odds, 'c'), + (0.1 * typo_odds, 'ts'), + (0.3 * typo_odds, 's'), + (0.3 * typo_odds, 'k'), + ], + 'd': [ + (1 - typo_odds, 'd'), + (0.5 * typo_odds, 't'), + ], + 'e': [ + (1 - typo_odds, 'e'), + (0.1 * typo_odds, 'a'), + (0.5 * typo_odds, 'i'), + ], + 'f': [ + (1 - typo_odds, 'f'), + (0.5 * typo_odds, 'v'), + ], + 'g': [ + (1 - typo_odds, 'g'), + (0.1 * typo_odds, 'k'), + ], + 'h': [ + (1 - typo_odds, 'h'), + (0.2 * typo_odds, ''), + ], + 'i': [ + (1 - typo_odds, 'i'), + (0.3 * typo_odds, 'e'), + (0.1 * typo_odds, 'a'), + (0.01 * typo_odds, 'k'), + ], + 'j': [ + (1 - typo_odds, 'j'), + (0.01 * typo_odds, 'ch'), + ], + 'k': [ + (1 - typo_odds, 'k'), + (0.1 * typo_odds, 'g'), + (0.4 * typo_odds, 'c') + ], + 'l': [ + (1 - typo_odds, 'l'), + (0.03 * typo_odds, 'r'), + (0.03 * typo_odds, 'w'), + ], + 'm': [ + (1 - typo_odds, 'm'), + (0.5 * typo_odds, 'n'), + ], + 'n': [ + (1 - typo_odds, 'n'), + (0.5 * typo_odds, 'n'), + (0.2 * typo_odds, 'b') + ], + 'o': [ + (1 - typo_odds, 'o'), + (0.2 * typo_odds, 'u'), + (0.1 * typo_odds, 'p') + ], + 'p': [ + (1 - typo_odds, 'p'), + (0.5 * typo_odds, 'b'), + (0.2 * typo_odds, 'o'), + ], + 'q': [ + (1 - typo_odds, 'qu'), + (0.8 * typo_odds, 'kw'), + ], + 'r': [ + (1 - typo_odds, 'r'), + (0.3 * typo_odds, 'l'), + (0.1 * typo_odds, 't') + ], + 's': [ + (1 - typo_odds, 's'), + (0.1 * typo_odds, 'sh'), + (0.2 * typo_odds, 'c'), + (0.2 * typo_odds, 'z') + ], + 't': [ + (1 - typo_odds, 't'), + (0.5 * typo_odds, 'd'), + (0.3 * typo_odds, 'th'), + (0.1 * typo_odds, 'r') + ], + 'u': [ + (1 - typo_odds, 'u'), + (0.01 * typo_odds, 'yu'), + (0.2 * typo_odds, 'o'), + (0.1 * typo_odds, ''), + ], + 'v': [ + (1 - typo_odds, 'v'), + (typo_odds, 'f') + ], + 'w': [ + (1 - typo_odds, 'w'), + (0.5 * typo_odds, ''), + ], + 'x': [ + (1 - typo_odds, 'x'), + (0.6 * typo_odds, 'ks'), + (0.1 * typo_odds, 'z'), + ], + 'y': [ + (1 - typo_odds, 'y'), + (0.5 * typo_odds, 'u'), + (0.2 * typo_odds, 'h'), + (0.1 * typo_odds, 'j') + ], + 'z': [ + (1 - typo_odds, 'z'), + (0.9 * typo_odds, 's'), + (0.05 * typo_odds, 'x') + ], + ',': [ + (1 - typo_odds ** 0.5, ','), + (0.9 * typo_odds ** 0.5, ''), + (0.1 * typo_odds ** 0.5, '.'), + ], + '.': [ + (1 - typo_odds ** 0.75, '.'), + (0.3 * typo_odds ** 0.75, ''), + (0.3 * typo_odds ** 0.75, ','), + ], + "'": [ + (1 - typo_odds ** 0.5, "'s"), + (0.7 * typo_odds ** 0.5, "s"), + (0.3 * typo_odds ** 0.5, "s'") + ] } out_str = '' - for k in in_str: - if k in sub_dict: - out_str += random_select_weighted_list(sub_dict[k]) - else: - out_str += k - return out_str + words = in_str.split() + out_words = [] + misspelled_words = {} + for word in words: + prev_char_in = '' + prev_char_out = '' + out_word = '' + if word in misspelled_words and random.random() < 0.5: + out_words.append(misspelled_words[word]) + continue + for k in word: + if k in sub_dict: + if k == prev_char_in: + out_word += prev_char_out + else: + prev_char_out = random_select_weighted_list(sub_dict[k]) + out_word += prev_char_out + else: + out_word += k + prev_char_in = k + if out_word != word: + misspelled_words[word] = out_word + out_words.append(out_word) + return ' '.join(out_words) From fbb54137ec236d628840692662c32116f2e2fa4b Mon Sep 17 00:00:00 2001 From: Joseph Mellor Date: Sun, 5 Sep 2021 18:28:30 -0500 Subject: [PATCH 2/6] Added more variation in the kinds of typos and used better probability distributions --- bot/typos.py | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/bot/typos.py b/bot/typos.py index 145071f..949149e 100644 --- a/bot/typos.py +++ b/bot/typos.py @@ -10,6 +10,9 @@ def gen_typo_odds(): def add_typos(in_str): typo_odds = gen_typo_odds() + space_typo_odds = gen_typo_odds() / 1.5 + lowercase_odds = 0.5 * (gen_typo_odds() ** 0.5) + punct_typo_odds = gen_typo_odds() + typo_odds ** 0.5 # Outside of English transliterations, the letter 'q' is always followed by # a 'u'. By combining them into one letter, typos are easier to introduce. in_str = re.sub('qu', 'q', in_str) @@ -148,22 +151,26 @@ def add_typos(in_str): (0.05 * typo_odds, 'x') ], ',': [ - (1 - typo_odds ** 0.5, ','), - (0.9 * typo_odds ** 0.5, ''), - (0.1 * typo_odds ** 0.5, '.'), + (1 - punct_typo_odds, ','), + (0.9 * punct_typo_odds, ''), + (0.1 * punct_typo_odds, '.'), ], '.': [ - (1 - typo_odds ** 0.75, '.'), - (0.3 * typo_odds ** 0.75, ''), - (0.3 * typo_odds ** 0.75, ','), + (1 - punct_typo_odds, '.'), + (0.3 * punct_typo_odds, ''), + (0.3 * punct_typo_odds, ','), ], "'": [ - (1 - typo_odds ** 0.5, "'s"), - (0.7 * typo_odds ** 0.5, "s"), - (0.3 * typo_odds ** 0.5, "s'") - ] + (1 - punct_typo_odds, "'s"), + (0.7 * punct_typo_odds, "s"), + (0.3 * punct_typo_odds, "s'") + ], } - out_str = '' + space_subs = [ + (1 - space_typo_odds, ' '), + (0.9 * space_typo_odds, ''), + (0.1 * space_typo_odds, ' ') + ] words = in_str.split() out_words = [] misspelled_words = {} @@ -187,4 +194,13 @@ def add_typos(in_str): if out_word != word: misspelled_words[word] = out_word out_words.append(out_word) - return ' '.join(out_words) + intermediate_str = ' '.join(out_words) + if lowercase_odds < 0.1: + intermediate_str = intermediate_str.lower() + out_str = '' + for k in intermediate_str: + if k == ' ': + out_str += random_select_weighted_list(space_subs) + else: + out_str += k + return out_str From a5fc892eea7faf2d4c97f3396aa5932bc3323784 Mon Sep 17 00:00:00 2001 From: Joseph Mellor Date: Sun, 5 Sep 2021 18:29:13 -0500 Subject: [PATCH 3/6] Fixed local import --- bot/prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/prompt.py b/bot/prompt.py index 6e07f69..615531c 100644 --- a/bot/prompt.py +++ b/bot/prompt.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import random from collections import Counter -from typos import add_typos +from .typos import add_typos def random_select_weighted_list(ls): return random.choices([k[1] for k in ls], weights = [k[0] for k in ls], k = 1)[0] From 0028c5645dc4ef4db3db70a7fab44c33db7e7adf Mon Sep 17 00:00:00 2001 From: Joseph Mellor Date: Sun, 5 Sep 2021 19:47:45 -0500 Subject: [PATCH 4/6] Added arguments for controlling typo frequencies --- bot/typos.py | 390 ++++++++++++++++++++++++++------------------------- 1 file changed, 197 insertions(+), 193 deletions(-) diff --git a/bot/typos.py b/bot/typos.py index 949149e..9e5641e 100644 --- a/bot/typos.py +++ b/bot/typos.py @@ -3,204 +3,208 @@ import re def random_select_weighted_list(ls): - return random.choices([k[1] for k in ls], weights = [k[0] for k in ls], k = 1)[0] + return random.choices([k[1] for k in ls], weights = [k[0] for k in ls], k = 1)[0] def gen_typo_odds(): - return 1.0 - (random.betavariate(0.5, 0.15) * 0.3 + 0.699995) + return 1.0 - (random.betavariate(0.5, 0.15) * 0.3 + 0.699995) -def add_typos(in_str): +def add_typos(in_str, typo_odds = -1, space_typo_odds = -1, lowercase_odds = -1, punct_typo_odds = -1): + if typo_odds < 0: typo_odds = gen_typo_odds() + if space_typo_odds < 0: space_typo_odds = gen_typo_odds() / 1.5 + if lowercase_odds < 0: lowercase_odds = 0.5 * (gen_typo_odds() ** 0.5) + if punct_typo_odds < 0: punct_typo_odds = gen_typo_odds() + typo_odds ** 0.5 - # Outside of English transliterations, the letter 'q' is always followed by - # a 'u'. By combining them into one letter, typos are easier to introduce. - in_str = re.sub('qu', 'q', in_str) - in_str = re.sub("'s", "'", in_str) - sub_dict = { - 'a': [ - # First element is always 1 - typo_odds and it's always the correct - # letter. The rest of the elements are incorrect substitutions. - # They can also be multiple letters or zero letters. - (1 - typo_odds, 'a'), - # Make sure that these add up to 1.0. - (0.2 * typo_odds, 'e'), - (0.1 * typo_odds, 'i') - ], - 'b': [ - (1 - typo_odds, 'b'), - (0.2 * typo_odds, 'p'), - (0.1 * typo_odds, 'h'), - (0.1 * typo_odds, 'n'), - ], - 'c': [ - (1 - typo_odds, 'c'), - (0.1 * typo_odds, 'ts'), - (0.3 * typo_odds, 's'), - (0.3 * typo_odds, 'k'), - ], - 'd': [ - (1 - typo_odds, 'd'), - (0.5 * typo_odds, 't'), - ], - 'e': [ - (1 - typo_odds, 'e'), - (0.1 * typo_odds, 'a'), - (0.5 * typo_odds, 'i'), - ], - 'f': [ - (1 - typo_odds, 'f'), - (0.5 * typo_odds, 'v'), - ], - 'g': [ - (1 - typo_odds, 'g'), - (0.1 * typo_odds, 'k'), - ], - 'h': [ - (1 - typo_odds, 'h'), - (0.2 * typo_odds, ''), - ], - 'i': [ - (1 - typo_odds, 'i'), - (0.3 * typo_odds, 'e'), - (0.1 * typo_odds, 'a'), - (0.01 * typo_odds, 'k'), - ], - 'j': [ - (1 - typo_odds, 'j'), - (0.01 * typo_odds, 'ch'), - ], - 'k': [ - (1 - typo_odds, 'k'), - (0.1 * typo_odds, 'g'), - (0.4 * typo_odds, 'c') - ], - 'l': [ - (1 - typo_odds, 'l'), - (0.03 * typo_odds, 'r'), - (0.03 * typo_odds, 'w'), - ], - 'm': [ - (1 - typo_odds, 'm'), - (0.5 * typo_odds, 'n'), - ], - 'n': [ - (1 - typo_odds, 'n'), - (0.5 * typo_odds, 'n'), - (0.2 * typo_odds, 'b') - ], - 'o': [ - (1 - typo_odds, 'o'), - (0.2 * typo_odds, 'u'), - (0.1 * typo_odds, 'p') - ], - 'p': [ - (1 - typo_odds, 'p'), - (0.5 * typo_odds, 'b'), - (0.2 * typo_odds, 'o'), - ], - 'q': [ - (1 - typo_odds, 'qu'), - (0.8 * typo_odds, 'kw'), - ], - 'r': [ - (1 - typo_odds, 'r'), - (0.3 * typo_odds, 'l'), - (0.1 * typo_odds, 't') - ], - 's': [ - (1 - typo_odds, 's'), - (0.1 * typo_odds, 'sh'), - (0.2 * typo_odds, 'c'), - (0.2 * typo_odds, 'z') - ], - 't': [ - (1 - typo_odds, 't'), - (0.5 * typo_odds, 'd'), - (0.3 * typo_odds, 'th'), - (0.1 * typo_odds, 'r') - ], - 'u': [ - (1 - typo_odds, 'u'), - (0.01 * typo_odds, 'yu'), - (0.2 * typo_odds, 'o'), - (0.1 * typo_odds, ''), - ], - 'v': [ - (1 - typo_odds, 'v'), - (typo_odds, 'f') - ], - 'w': [ - (1 - typo_odds, 'w'), - (0.5 * typo_odds, ''), - ], - 'x': [ - (1 - typo_odds, 'x'), - (0.6 * typo_odds, 'ks'), - (0.1 * typo_odds, 'z'), - ], - 'y': [ - (1 - typo_odds, 'y'), - (0.5 * typo_odds, 'u'), - (0.2 * typo_odds, 'h'), - (0.1 * typo_odds, 'j') - ], - 'z': [ - (1 - typo_odds, 'z'), - (0.9 * typo_odds, 's'), - (0.05 * typo_odds, 'x') - ], - ',': [ - (1 - punct_typo_odds, ','), - (0.9 * punct_typo_odds, ''), - (0.1 * punct_typo_odds, '.'), - ], - '.': [ - (1 - punct_typo_odds, '.'), - (0.3 * punct_typo_odds, ''), - (0.3 * punct_typo_odds, ','), - ], - "'": [ - (1 - punct_typo_odds, "'s"), - (0.7 * punct_typo_odds, "s"), - (0.3 * punct_typo_odds, "s'") - ], - } - space_subs = [ - (1 - space_typo_odds, ' '), - (0.9 * space_typo_odds, ''), - (0.1 * space_typo_odds, ' ') - ] - words = in_str.split() - out_words = [] - misspelled_words = {} - for word in words: - prev_char_in = '' - prev_char_out = '' - out_word = '' - if word in misspelled_words and random.random() < 0.5: - out_words.append(misspelled_words[word]) - continue - for k in word: - if k in sub_dict: - if k == prev_char_in: - out_word += prev_char_out - else: - prev_char_out = random_select_weighted_list(sub_dict[k]) - out_word += prev_char_out - else: - out_word += k - prev_char_in = k - if out_word != word: - misspelled_words[word] = out_word - out_words.append(out_word) - intermediate_str = ' '.join(out_words) - if lowercase_odds < 0.1: - intermediate_str = intermediate_str.lower() - out_str = '' - for k in intermediate_str: - if k == ' ': - out_str += random_select_weighted_list(space_subs) + # Outside of English transliterations, the letter 'q' is always followed by + # a 'u'. By combining them into one letter, typos are easier to introduce. + in_str = re.sub('qu', 'q', in_str) + in_str = re.sub("'s", "'", in_str) + sub_dict = { + 'a': [ + # First element is always 1 - typo_odds and it's always the correct + # letter. The rest of the elements are incorrect substitutions. + # They can also be multiple letters or zero letters. + (1 - typo_odds, 'a'), + # Make sure that these add up to 1.0. + (0.2 * typo_odds, 'e'), + (0.1 * typo_odds, 'i') + ], + 'b': [ + (1 - typo_odds, 'b'), + (0.2 * typo_odds, 'p'), + (0.1 * typo_odds, 'h'), + (0.1 * typo_odds, 'n'), + ], + 'c': [ + (1 - typo_odds, 'c'), + (0.1 * typo_odds, 'ts'), + (0.3 * typo_odds, 's'), + (0.3 * typo_odds, 'k'), + ], + 'd': [ + (1 - typo_odds, 'd'), + (0.5 * typo_odds, 't'), + ], + 'e': [ + (1 - typo_odds, 'e'), + (0.1 * typo_odds, 'a'), + (0.5 * typo_odds, 'i'), + ], + 'f': [ + (1 - typo_odds, 'f'), + (0.5 * typo_odds, 'v'), + ], + 'g': [ + (1 - typo_odds, 'g'), + (0.1 * typo_odds, 'k'), + ], + 'h': [ + (1 - typo_odds, 'h'), + (0.2 * typo_odds, ''), + ], + 'i': [ + (1 - typo_odds, 'i'), + (0.3 * typo_odds, 'e'), + (0.1 * typo_odds, 'a'), + (0.01 * typo_odds, 'k'), + ], + 'j': [ + (1 - typo_odds, 'j'), + (0.01 * typo_odds, 'ch'), + ], + 'k': [ + (1 - typo_odds, 'k'), + (0.1 * typo_odds, 'g'), + (0.4 * typo_odds, 'c') + ], + 'l': [ + (1 - typo_odds, 'l'), + (0.03 * typo_odds, 'r'), + (0.03 * typo_odds, 'w'), + ], + 'm': [ + (1 - typo_odds, 'm'), + (0.5 * typo_odds, 'n'), + ], + 'n': [ + (1 - typo_odds, 'n'), + (0.5 * typo_odds, 'n'), + (0.2 * typo_odds, 'b') + ], + 'o': [ + (1 - typo_odds, 'o'), + (0.2 * typo_odds, 'u'), + (0.1 * typo_odds, 'p') + ], + 'p': [ + (1 - typo_odds, 'p'), + (0.5 * typo_odds, 'b'), + (0.2 * typo_odds, 'o'), + ], + 'q': [ + (1 - typo_odds, 'qu'), + (0.8 * typo_odds, 'kw'), + ], + 'r': [ + (1 - typo_odds, 'r'), + (0.3 * typo_odds, 'l'), + (0.1 * typo_odds, 't') + ], + 's': [ + (1 - typo_odds, 's'), + (0.1 * typo_odds, 'sh'), + (0.2 * typo_odds, 'c'), + (0.2 * typo_odds, 'z') + ], + 't': [ + (1 - typo_odds, 't'), + (0.5 * typo_odds, 'd'), + (0.3 * typo_odds, 'th'), + (0.1 * typo_odds, 'r') + ], + 'u': [ + (1 - typo_odds, 'u'), + (0.01 * typo_odds, 'yu'), + (0.2 * typo_odds, 'o'), + (0.1 * typo_odds, ''), + ], + 'v': [ + (1 - typo_odds, 'v'), + (typo_odds, 'f') + ], + 'w': [ + (1 - typo_odds, 'w'), + (0.5 * typo_odds, ''), + ], + 'x': [ + (1 - typo_odds, 'x'), + (0.6 * typo_odds, 'ks'), + (0.1 * typo_odds, 'z'), + ], + 'y': [ + (1 - typo_odds, 'y'), + (0.5 * typo_odds, 'u'), + (0.2 * typo_odds, 'h'), + (0.1 * typo_odds, 'j') + ], + 'z': [ + (1 - typo_odds, 'z'), + (0.9 * typo_odds, 's'), + (0.05 * typo_odds, 'x') + ], + ',': [ + (1 - punct_typo_odds, ','), + (0.9 * punct_typo_odds, ''), + (0.1 * punct_typo_odds, '.'), + ], + '.': [ + (1 - punct_typo_odds, '.'), + (0.3 * punct_typo_odds, ''), + (0.3 * punct_typo_odds, ','), + ], + "'": [ + (1 - punct_typo_odds, "'s"), + (0.7 * punct_typo_odds, "s"), + (0.3 * punct_typo_odds, "s'") + ], + } + space_subs = [ + (1 - space_typo_odds, ' '), + (0.9 * space_typo_odds, ''), + (0.1 * space_typo_odds, ' ') + ] + words = in_str.split() + out_words = [] + misspelled_words = {} + for word in words: + prev_char_in = '' + prev_char_out = '' + out_word = '' + if word in misspelled_words and random.random() < 0.5: + out_words.append(misspelled_words[word]) + continue + for k in word: + if k in sub_dict: + if k == prev_char_in: + out_word += prev_char_out else: - out_str += k - return out_str + prev_char_out = random_select_weighted_list(sub_dict[k]) + out_word += prev_char_out + else: + out_word += k + prev_char_in = k + if out_word != word: + misspelled_words[word] = out_word + out_words.append(out_word) + intermediate_str = ' '.join(out_words) + if lowercase_odds < 0.1: + intermediate_str = intermediate_str.lower() + out_str = '' + for k in intermediate_str: + if k == ' ': + out_str += random_select_weighted_list(space_subs) + else: + out_str += k + return out_str From 4f62d91fc45cd3b79794a413441c84a59b55f5dc Mon Sep 17 00:00:00 2001 From: Joseph Mellor Date: Tue, 7 Sep 2021 16:59:28 -0500 Subject: [PATCH 5/6] Updated typo generation --- bot/prompt.py | 1 - bot/typos.py | 85 +++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 72 insertions(+), 14 deletions(-) diff --git a/bot/prompt.py b/bot/prompt.py index 51ef28b..2f43b69 100644 --- a/bot/prompt.py +++ b/bot/prompt.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import random from collections import Counter -from typos import add_typos def random_select_weighted_list(ls): return random.choices([k[1] for k in ls], weights = [k[0] for k in ls], k = 1)[0] diff --git a/bot/typos.py b/bot/typos.py index 9e5641e..107c273 100644 --- a/bot/typos.py +++ b/bot/typos.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 import random import re +#import seaborn as sb +#import matplotlib.pyplot as plt def random_select_weighted_list(ls): return random.choices([k[1] for k in ls], weights = [k[0] for k in ls], k = 1)[0] @@ -8,19 +10,38 @@ def random_select_weighted_list(ls): def gen_typo_odds(): return 1.0 - (random.betavariate(0.5, 0.15) * 0.3 + 0.699995) +#typo_data = [] +#space_data = [] +#punct_data = [] + def add_typos(in_str, typo_odds = -1, space_typo_odds = -1, lowercase_odds = -1, punct_typo_odds = -1): + #global typo_data + #global space_data + #global punct_data if typo_odds < 0: - typo_odds = gen_typo_odds() + if random.random() < 0.1: + typo_odds = 0.0 + else: + typo_odds = gen_typo_odds() + # typo_data.append(typo_odds) if space_typo_odds < 0: - space_typo_odds = gen_typo_odds() / 1.5 + space_typo_odds = gen_typo_odds() / 2.0 + # space_data.append(space_typo_odds) if lowercase_odds < 0: - lowercase_odds = 0.5 * (gen_typo_odds() ** 0.5) + lowercase_odds = random.random() + uppercase_odds = random.random() if punct_typo_odds < 0: - punct_typo_odds = gen_typo_odds() + typo_odds ** 0.5 + punct_typo_odds = (gen_typo_odds() + typo_odds ** 0.5) / 2.0 + # punct_data.append(punct_typo_odds) + in_str = re.sub('ies ', '\u2605 ', in_str) + in_str = re.sub('es ', '\u2604 ', in_str) + in_str = re.sub('ie ', '\u2606 ', in_str) + in_str = re.sub('ie', '\u2603', in_str) + in_str = re.sub('ei', '\u2602', in_str) # Outside of English transliterations, the letter 'q' is always followed by # a 'u'. By combining them into one letter, typos are easier to introduce. - in_str = re.sub('qu', 'q', in_str) - in_str = re.sub("'s", "'", in_str) + in_str = re.sub('qu', '\u2601', in_str) + in_str = re.sub("'s", "\u2600", in_str) sub_dict = { 'a': [ # First element is always 1 - typo_odds and it's always the correct @@ -103,9 +124,10 @@ def add_typos(in_str, typo_odds = -1, space_typo_odds = -1, lowercase_odds = -1, (0.5 * typo_odds, 'b'), (0.2 * typo_odds, 'o'), ], - 'q': [ + '\u2601': [ (1 - typo_odds, 'qu'), - (0.8 * typo_odds, 'kw'), + (0.9 * typo_odds, 'kw'), + (0.1 * typo_odds, 'q'), ], 'r': [ (1 - typo_odds, 'r'), @@ -114,8 +136,6 @@ def add_typos(in_str, typo_odds = -1, space_typo_odds = -1, lowercase_odds = -1, ], 's': [ (1 - typo_odds, 's'), - (0.1 * typo_odds, 'sh'), - (0.2 * typo_odds, 'c'), (0.2 * typo_odds, 'z') ], 't': [ @@ -154,6 +174,26 @@ def add_typos(in_str, typo_odds = -1, space_typo_odds = -1, lowercase_odds = -1, (0.9 * typo_odds, 's'), (0.05 * typo_odds, 'x') ], + "\u2602": [ + (1 - typo_odds, 'ei'), + (1.0 * typo_odds, 'ie') + ], + "\u2603": [ + (1 - typo_odds, 'ie'), + (1.0 * typo_odds, 'ei') + ], + "\u2604": [ + (1 - typo_odds, 'es'), + (1.0 * typo_odds, 's') + ], + "\u2605": [ + (1 - typo_odds, 'ies'), + (1.0 * typo_odds, 'ys') + ], + "\u2606": [ + (1 - typo_odds, 'ie'), + (1.0 * typo_odds, 'y') + ], ',': [ (1 - punct_typo_odds, ','), (0.9 * punct_typo_odds, ''), @@ -162,13 +202,20 @@ def add_typos(in_str, typo_odds = -1, space_typo_odds = -1, lowercase_odds = -1, '.': [ (1 - punct_typo_odds, '.'), (0.3 * punct_typo_odds, ''), + (0.1 * punct_typo_odds, ' .'), + (0.01 * punct_typo_odds, '>'), (0.3 * punct_typo_odds, ','), ], - "'": [ + "\u2600": [ (1 - punct_typo_odds, "'s"), (0.7 * punct_typo_odds, "s"), - (0.3 * punct_typo_odds, "s'") + (0.1 * punct_typo_odds, "s'"), + (0.2 * punct_typo_odds, "'") ], + "'": [ + (1 - punct_typo_odds, "'"), + (1.0 * punct_typo_odds, "") + ] } space_subs = [ (1 - space_typo_odds, ' '), @@ -187,7 +234,7 @@ def add_typos(in_str, typo_odds = -1, space_typo_odds = -1, lowercase_odds = -1, continue for k in word: if k in sub_dict: - if k == prev_char_in: + if k == prev_char_in and random.random() > typo_odds: out_word += prev_char_out else: prev_char_out = random_select_weighted_list(sub_dict[k]) @@ -201,6 +248,8 @@ def add_typos(in_str, typo_odds = -1, space_typo_odds = -1, lowercase_odds = -1, intermediate_str = ' '.join(out_words) if lowercase_odds < 0.1: intermediate_str = intermediate_str.lower() + elif uppercase_odds < 0.1: + intermediate_str = intermediate_str.upper() out_str = '' for k in intermediate_str: if k == ' ': @@ -208,3 +257,13 @@ def add_typos(in_str, typo_odds = -1, space_typo_odds = -1, lowercase_odds = -1, else: out_str += k return out_str + +#def test_plot(): +# global typo_data +# global space_data +# global punct_data +# sb.kdeplot(typo_data, label="Typos") +# sb.kdeplot(space_data, label="Spaces") +# sb.kdeplot(punct_data, label="Punctuation") +# plt.legend() +# plt.show() From 9f795609451a19c7747be80dcd8af91adbb2f448 Mon Sep 17 00:00:00 2001 From: Joseph Mellor Date: Wed, 8 Sep 2021 19:30:30 -0500 Subject: [PATCH 6/6] Made everything four spaces --- bot/prompt.py | 588 +++++++++++++++++++++++++------------------------- 1 file changed, 294 insertions(+), 294 deletions(-) diff --git a/bot/prompt.py b/bot/prompt.py index 2f43b69..c134030 100644 --- a/bot/prompt.py +++ b/bot/prompt.py @@ -6,367 +6,367 @@ def random_select_weighted_list(ls): return random.choices([k[1] for k in ls], weights = [k[0] for k in ls], k = 1)[0] suspect_words = [ - (1.0, 'suspect'), - (0.3, 'have reason to suspect'), - (1.0, 'believe'), - (0.3, 'have reason to believe'), - (1.0, 'think'), - (0.5, 'have evidence'), - (0.2, 'have strong evidence'), - (1.0, 'am convinced'), - (1.0, 'am certain'), - (1.0, 'can prove'), - (1.0, 'have proof'), + (1.0, 'suspect'), + (0.3, 'have reason to suspect'), + (1.0, 'believe'), + (0.3, 'have reason to believe'), + (1.0, 'think'), + (0.5, 'have evidence'), + (0.2, 'have strong evidence'), + (1.0, 'am convinced'), + (1.0, 'am certain'), + (1.0, 'can prove'), + (1.0, 'have proof'), ] my_family_words = [ - (0.5, 'father'), - (0.5, 'mother'), - (1.0, 'brother'), - (1.0, 'sister'), - (0.5, 'older brother'), - (0.5, 'older sister'), - (0.5, 'younger brother'), - (0.5, 'younger sister'), - (1.0, 'cousin'), - (2.0, 'aunt'), - (0.5, 'uncle'), - (0.6, 'daughter'), - (0.6, 'son'), - (0.2, 'step-son'), - (0.1, 'step son'), - (0.2, 'step-daughter'), - (0.1, 'step daughter'), - (0.7, 'nephew'), - (0.7, 'niece'), - (0.5, 'grandmother'), - (0.2, 'grandma'), - (0.5, 'grandfather'), - (0.2, 'grandpa'), - (0.5, 'granddad'), - (1.0, 'grandson'), - (1.0, 'granddaughter'), - (1.0, 'son-in-law'), - (1.0, 'daughter-in-law'), - (1.0, 'mother-in-law'), - (1.0, 'father-in-law'), - (0.1, 'half-brother'), - (0.1, 'half-sister'), + (0.5, 'father'), + (0.5, 'mother'), + (1.0, 'brother'), + (1.0, 'sister'), + (0.5, 'older brother'), + (0.5, 'older sister'), + (0.5, 'younger brother'), + (0.5, 'younger sister'), + (1.0, 'cousin'), + (2.0, 'aunt'), + (0.5, 'uncle'), + (0.6, 'daughter'), + (0.6, 'son'), + (0.2, 'step-son'), + (0.1, 'step son'), + (0.2, 'step-daughter'), + (0.1, 'step daughter'), + (0.7, 'nephew'), + (0.7, 'niece'), + (0.5, 'grandmother'), + (0.2, 'grandma'), + (0.5, 'grandfather'), + (0.2, 'grandpa'), + (0.5, 'granddad'), + (1.0, 'grandson'), + (1.0, 'granddaughter'), + (1.0, 'son-in-law'), + (1.0, 'daughter-in-law'), + (1.0, 'mother-in-law'), + (1.0, 'father-in-law'), + (0.1, 'half-brother'), + (0.1, 'half-sister'), ] subjects = [ - 'science', - 'math', - 'history', - 'social studies', - 'chemistry', - 'algebra', - 'Spanish', - 'calculus', - 'art', - 'music', - 'gym', - 'English', - 'language arts', - 'composition', - 'geometry', - 'statistics', - 'physics', - 'earth science', - 'economics', - 'geography', - 'government', - 'French', - 'business', + 'science', + 'math', + 'history', + 'social studies', + 'chemistry', + 'algebra', + 'Spanish', + 'calculus', + 'art', + 'music', + 'gym', + 'English', + 'language arts', + 'composition', + 'geometry', + 'statistics', + 'physics', + 'earth science', + 'economics', + 'geography', + 'government', + 'French', + 'business', ] my_teacher_words = [ - (1.0, 'teacher'), - *[(0.2, k + ' teacher') for k in subjects], - (0.5, 'tutor'), - *[(0.1, k + ' tutor') for k in subjects], - (1.0, 'babysitter'), - (1.0, 'instructor'), - *[(0.2, k + ' instructor') for k in subjects], - (0.5, 'professor'), - *[(0.1, k + ' professor') for k in subjects], + (1.0, 'teacher'), + *[(0.2, k + ' teacher') for k in subjects], + (0.5, 'tutor'), + *[(0.1, k + ' tutor') for k in subjects], + (1.0, 'babysitter'), + (1.0, 'instructor'), + *[(0.2, k + ' instructor') for k in subjects], + (0.5, 'professor'), + *[(0.1, k + ' professor') for k in subjects], ] my_nonfamily_words = [ - (2.0, 'neighbor'), - (0.6, 'next-door neighbor'), - (1.5, 'boss'), - (0.7, 'landlord'), - (1.5, 'doctor'), - (0.7, 'employee'), - (0.5, 'roommate'), - (1.0, 'friend'), - (1.0, 'girlfriend'), - (1.0, 'boyfriend'), - (0.3, 'maid'), - (0.2, 'live-in maid'), - (0.1, 'live in maid'), - (0.2, 'housekeeper'), - (0.1, 'cleaning lady'), - (2.0, 'ex'), - (0.3, 'therapist'), - (0.5, 'supervisor'), - (1.0, 'employer'), - (0.2, 'lawyer'), - (0.4, 'dentist'), - (0.2, 'plumber'), - (1.5, 'pastor'), - (0.5, 'deacon'), - (0.8, 'priest'), - (0.2, 'accountant'), + (2.0, 'neighbor'), + (0.6, 'next-door neighbor'), + (1.5, 'boss'), + (0.7, 'landlord'), + (1.5, 'doctor'), + (0.7, 'employee'), + (0.5, 'roommate'), + (1.0, 'friend'), + (1.0, 'girlfriend'), + (1.0, 'boyfriend'), + (0.3, 'maid'), + (0.2, 'live-in maid'), + (0.1, 'live in maid'), + (0.2, 'housekeeper'), + (0.1, 'cleaning lady'), + (2.0, 'ex'), + (0.3, 'therapist'), + (0.5, 'supervisor'), + (1.0, 'employer'), + (0.2, 'lawyer'), + (0.4, 'dentist'), + (0.2, 'plumber'), + (1.5, 'pastor'), + (0.5, 'deacon'), + (0.8, 'priest'), + (0.2, 'accountant'), ] my_family_possessive_adj = [(k[0], k[1] + "'s ") for k in my_nonfamily_words] my_family_possessive_adj.append((20.0, '')) my_nonfamily_possessive_adj = [(k[0], k[1] + "'s ") for k in my_family_words] my_nonfamily_possessive_adj.append((20.0, '')) my_teacher_possessive_adj = [ - (0.2, 'younger brother'), - (0.1, 'older brother'), - (0.2, 'younger sister'), - (0.1, 'older sister'), - (0.8, 'brother'), - (0.4, 'step-brother'), - (0.8, 'sister'), - (0.4, 'step-sister'), - (1.0, 'cousin'), - (2.0, 'daughter'), - (2.0, 'son'), - (0.4, 'step-son'), - (0.2, 'step son'), - (0.4, 'step-daughter'), - (0.2, 'step daughter'), - (0.7, 'nephew'), - (0.7, 'niece'), + (0.2, 'younger brother'), + (0.1, 'older brother'), + (0.2, 'younger sister'), + (0.1, 'older sister'), + (0.8, 'brother'), + (0.4, 'step-brother'), + (0.8, 'sister'), + (0.4, 'step-sister'), + (1.0, 'cousin'), + (2.0, 'daughter'), + (2.0, 'son'), + (0.4, 'step-son'), + (0.2, 'step son'), + (0.4, 'step-daughter'), + (0.2, 'step daughter'), + (0.7, 'nephew'), + (0.7, 'niece'), ] my_teacher_possessive_adj = [ (k[0], k[1] + "'s ") for k in my_teacher_possessive_adj ] violated_words = [ - (8.0, 'violated'), - (8.0, 'disregarded'), - (8.0, 'disobeyed'), - (4.0, 'assisted someone in violating'), - (4.0, 'assisted someone in breaking'), - (4.0, 'assisted someone in disobeying'), - (3.0, 'helped someone violate'), - (3.0, 'helped someone disobey'), - (3.0, 'helped someone break'), - (3.0, 'helped violate'), - (3.0, 'helped break'), - (3.0, 'helped disobey'), + (8.0, 'violated'), + (8.0, 'disregarded'), + (8.0, 'disobeyed'), + (4.0, 'assisted someone in violating'), + (4.0, 'assisted someone in breaking'), + (4.0, 'assisted someone in disobeying'), + (3.0, 'helped someone violate'), + (3.0, 'helped someone disobey'), + (3.0, 'helped someone break'), + (3.0, 'helped violate'), + (3.0, 'helped break'), + (3.0, 'helped disobey'), - (0.4, 'helped someone have an abortion, violating'), - (0.4, 'helped someone have an abortion, breaking'), - (0.4, 'helped someone have an abortion in violation of'), + (0.4, 'helped someone have an abortion, violating'), + (0.4, 'helped someone have an abortion, breaking'), + (0.4, 'helped someone have an abortion in violation of'), - (0.4, 'helped someone get an abortion, violating'), - (0.4, 'helped someone get an abortion, breaking'), - (0.4, 'helped someone get an abortion in violation of'), + (0.4, 'helped someone get an abortion, violating'), + (0.4, 'helped someone get an abortion, breaking'), + (0.4, 'helped someone get an abortion in violation of'), - (0.4, 'helped someone to get an abortion, violating'), - (0.4, 'helped someone to get an abortion, breaking'), - (0.4, 'helped someone to get an abortion in violation of'), + (0.4, 'helped someone to get an abortion, violating'), + (0.4, 'helped someone to get an abortion, breaking'), + (0.4, 'helped someone to get an abortion in violation of'), - (0.1, 'helped someone kill her child and violate'), - (0.1, 'helped someone kill her baby and violate'), - (0.1, 'helped someone kill a child and violate'), - (0.1, 'helped someone kill a baby and violate'), + (0.1, 'helped someone kill her child and violate'), + (0.1, 'helped someone kill her baby and violate'), + (0.1, 'helped someone kill a child and violate'), + (0.1, 'helped someone kill a baby and violate'), - (0.1, 'helped someone kill a child and disobey'), - (0.1, 'helped someone kill a baby and disobey'), - (0.1, 'helped someone kill her child and disobey'), - (0.1, 'helped someone kill her baby and disobey'), + (0.1, 'helped someone kill a child and disobey'), + (0.1, 'helped someone kill a baby and disobey'), + (0.1, 'helped someone kill her child and disobey'), + (0.1, 'helped someone kill her baby and disobey'), - (0.4, 'helped someone abort her child and violate'), - (0.4, 'helped someone abort her baby and violate'), - (0.4, 'helped someone abort her child and disobey'), - (0.4, 'helped someone abort her baby and disobey'), + (0.4, 'helped someone abort her child and violate'), + (0.4, 'helped someone abort her baby and violate'), + (0.4, 'helped someone abort her child and disobey'), + (0.4, 'helped someone abort her baby and disobey'), - (0.1, 'helped someone murder her child and violate'), - (0.1, 'helped someone murder her baby and violate'), + (0.1, 'helped someone murder her child and violate'), + (0.1, 'helped someone murder her baby and violate'), - (0.1, 'helped someone murder a child and disobey'), - (0.1, 'helped someone murder a baby and disobey'), - (0.1, 'helped someone murder her child and disobey'), - (0.1, 'helped someone murder her baby and disobey'), + (0.1, 'helped someone murder a child and disobey'), + (0.1, 'helped someone murder a baby and disobey'), + (0.1, 'helped someone murder her child and disobey'), + (0.1, 'helped someone murder her baby and disobey'), - (0.1, 'helped someone murder her child in violation of'), - (0.1, 'helped someone murder her baby in violation of'), - (0.1, 'helped someone murder a child in violation of'), - (0.1, 'helped someone murder a baby in violation of'), + (0.1, 'helped someone murder her child in violation of'), + (0.1, 'helped someone murder her baby in violation of'), + (0.1, 'helped someone murder a child in violation of'), + (0.1, 'helped someone murder a baby in violation of'), - (0.1, 'helped someone kill her child in violation of'), - (0.1, 'helped someone kill her baby in violation of'), - (0.1, 'helped someone kill a child in violation of'), - (0.1, 'helped someone kill a baby in violation of'), + (0.1, 'helped someone kill her child in violation of'), + (0.1, 'helped someone kill her baby in violation of'), + (0.1, 'helped someone kill a child in violation of'), + (0.1, 'helped someone kill a baby in violation of'), - (0.1, 'helped someone kill her child, violating'), - (0.1, 'helped someone kill her baby, violating'), - (0.1, 'helped someone kill a child, violating'), - (0.1, 'helped someone kill a baby, violating'), + (0.1, 'helped someone kill her child, violating'), + (0.1, 'helped someone kill her baby, violating'), + (0.1, 'helped someone kill a child, violating'), + (0.1, 'helped someone kill a baby, violating'), - (0.1, 'helped someone murder a child, violating'), - (0.1, 'helped someone murder a baby, violating'), - (0.1, 'helped someone murder her child, violating'), - (0.1, 'helped someone murder her baby, violating'), + (0.1, 'helped someone murder a child, violating'), + (0.1, 'helped someone murder a baby, violating'), + (0.1, 'helped someone murder her child, violating'), + (0.1, 'helped someone murder her baby, violating'), - (0.1, 'aided in the killing of a child, violating'), - (0.1, 'aided in the killing of a baby, violating'), - (0.1, 'aided in the killing of her child, violating'), - (0.1, 'aided in the killing of her baby, violating'), - (0.1, 'aided her in killing her baby, violating'), - (0.1, 'aided her in killing her child, violating'), + (0.1, 'aided in the killing of a child, violating'), + (0.1, 'aided in the killing of a baby, violating'), + (0.1, 'aided in the killing of her child, violating'), + (0.1, 'aided in the killing of her baby, violating'), + (0.1, 'aided her in killing her baby, violating'), + (0.1, 'aided her in killing her child, violating'), - (0.1, 'aided in the killing of a child, disobeying'), - (0.1, 'aided in the killing of a baby, disobeying'), - (0.1, 'aided her in killing her baby, disobeying'), - (0.1, 'aided her in killing her child, disobeying'), + (0.1, 'aided in the killing of a child, disobeying'), + (0.1, 'aided in the killing of a baby, disobeying'), + (0.1, 'aided her in killing her baby, disobeying'), + (0.1, 'aided her in killing her child, disobeying'), - (0.1, 'aided in the killing of a child, breaking'), - (0.1, 'aided in the killing of a baby, breaking'), - (0.1, 'aided in the killing of her child, breaking'), - (0.1, 'aided in the killing of her baby, breaking'), - (0.1, 'aided in the killing of a child, breaking'), - (0.1, 'aided in the killing of a baby, breaking'), + (0.1, 'aided in the killing of a child, breaking'), + (0.1, 'aided in the killing of a baby, breaking'), + (0.1, 'aided in the killing of her child, breaking'), + (0.1, 'aided in the killing of her baby, breaking'), + (0.1, 'aided in the killing of a child, breaking'), + (0.1, 'aided in the killing of a baby, breaking'), - (0.1, 'aided her in killing her baby, breaking'), - (0.1, 'aided her in killing her child, breaking'), - (0.1, 'aided her in killing her baby, breaking'), - (0.1, 'aided her in killing her child, breaking'), + (0.1, 'aided her in killing her baby, breaking'), + (0.1, 'aided her in killing her child, breaking'), + (0.1, 'aided her in killing her baby, breaking'), + (0.1, 'aided her in killing her child, breaking'), - (0.1, 'aided in the killing of a child, in violation of'), - (0.1, 'aided in the killing of a baby, in violation of'), - (0.1, 'aided in the killing of her child, in violation of'), - (0.1, 'aided in the killing of her baby, in violation of'), - (0.1, 'aided in the killing of a child, in violation of'), - (0.1, 'aided in the killing of a baby, in violation of'), + (0.1, 'aided in the killing of a child, in violation of'), + (0.1, 'aided in the killing of a baby, in violation of'), + (0.1, 'aided in the killing of her child, in violation of'), + (0.1, 'aided in the killing of her baby, in violation of'), + (0.1, 'aided in the killing of a child, in violation of'), + (0.1, 'aided in the killing of a baby, in violation of'), - (0.1, 'aided her in killing her baby, in violation of'), - (0.1, 'aided her in killing her child, in violation of'), - (0.1, 'aided her in killing her baby, in violation of'), - (0.1, 'aided her in killing her child, in violation of'), + (0.1, 'aided her in killing her baby, in violation of'), + (0.1, 'aided her in killing her child, in violation of'), + (0.1, 'aided her in killing her baby, in violation of'), + (0.1, 'aided her in killing her child, in violation of'), - (0.1, 'aided in the killing of a child and violated'), - (0.1, 'aided in the killing of a baby and violated'), - (0.1, 'aided in the killing of her child and violated'), - (0.1, 'aided in the killing of her baby and violated'), + (0.1, 'aided in the killing of a child and violated'), + (0.1, 'aided in the killing of a baby and violated'), + (0.1, 'aided in the killing of her child and violated'), + (0.1, 'aided in the killing of her baby and violated'), - (0.1, 'aided her in killing her baby and violated'), - (0.1, 'aided her in killing her child and violated'), + (0.1, 'aided her in killing her baby and violated'), + (0.1, 'aided her in killing her child and violated'), ] days_of_the_week = [ - 'Sunday', - 'Monday', - 'Tuesday', - 'Wednesday', - 'Thursday', - 'Friday', - 'Saturday', + 'Sunday', + 'Monday', + 'Tuesday', + 'Wednesday', + 'Thursday', + 'Friday', + 'Saturday', ] got_words = [ - 'got', 'had', 'helped someone get', 'assisted someone in getting', 'helped someone have' + 'got', 'had', 'helped someone get', 'assisted someone in getting', 'helped someone have' ] past_time_frames = [ - 'last week', 'last month', 'this week', 'this month', 'yesterday', 'a week ago', 'two weeks ago', 'two days ago', 'on the weekend', - 'this weekend', 'last weekend' + 'last week', 'last month', 'this week', 'this month', 'yesterday', 'a week ago', 'two weeks ago', 'two days ago', 'on the weekend', + 'this weekend', 'last weekend' ] past_time_frames.extend([ 'last ' + k for k in days_of_the_week ]) past_time_frames.extend([ 'on ' + k for k in days_of_the_week ]) will_get_words = [ - 'is getting', 'will get', 'plans on having', 'is trying to get', 'is trying to have', 'will try to get', 'is helping someone get', 'is planning to get', 'is planning on getting', - 'plans to get' + 'is getting', 'will get', 'plans on having', 'is trying to get', 'is trying to have', 'will try to get', 'is helping someone get', 'is planning to get', 'is planning on getting', + 'plans to get' ] past_time_frames.extend(['last ' + k for k in days_of_the_week]) past_time_frames.extend(['on ' + k for k in days_of_the_week]) will_get_words = ['is getting', 'will get', 'plans on having', 'is trying to get', 'is trying to have', 'will try to get'] future_time_frames = [ - 'next week', 'this week', 'tomorrow', 'two days from now', 'a week from now', 'after she leaves work', 'after work', 'on the weekend', - 'this weekend', 'next weekend' + 'next week', 'this week', 'tomorrow', 'two days from now', 'a week from now', 'after she leaves work', 'after work', 'on the weekend', + 'this weekend', 'next weekend' ] future_time_frames.extend(['next ' + k for k in days_of_the_week]) future_time_frames.extend(['on ' + k for k in days_of_the_week]) abortion_ban_words = [ - 'abortion ban', 'ban on abortion', 'law on abortion', 'recent abortion law', 'abortion restrictions', 'restrictions on abortion', + 'abortion ban', 'ban on abortion', 'law on abortion', 'recent abortion law', 'abortion restrictions', 'restrictions on abortion', ] abortion_ban_words = [ *["recently passed " + k for k in abortion_ban_words], *[k for k in abortion_ban_words] ] abortion_ban_words = [ *["Texas's " + k for k in abortion_ban_words], *["the " + k for k in abortion_ban_words] ] abortion_ban_words.extend(['Texas law', 'the new law', 'Texas law on abortion', 'the Texas law on abortion', 'the Texas abortion law', - 'the new Texas abortion law', 'the recently passed Texas abortion law', 'new abortion law']) + 'the new Texas abortion law', 'the recently passed Texas abortion law', 'new abortion law']) def gen_abortion_prompt_I(accused): - abortion_prompt = 'I ' - abortion_prompt += random_select_weighted_list(suspect_words) - abortion_prompt += random.choices([' that', ''], weights=[0.75, 0.25], k=1)[0] - abortion_prompt += ' my ' - abortion_prompt += accused - abortion_prompt += random.choice([' has ', ' ']) - abortion_prompt += random_select_weighted_list(violated_words) - abortion_prompt += ' ' - abortion_prompt += random.choice(abortion_ban_words) - abortion_prompt += '.' - return abortion_prompt + abortion_prompt = 'I ' + abortion_prompt += random_select_weighted_list(suspect_words) + abortion_prompt += random.choices([' that', ''], weights=[0.75, 0.25], k=1)[0] + abortion_prompt += ' my ' + abortion_prompt += accused + abortion_prompt += random.choice([' has ', ' ']) + abortion_prompt += random_select_weighted_list(violated_words) + abortion_prompt += ' ' + abortion_prompt += random.choice(abortion_ban_words) + abortion_prompt += '.' + return abortion_prompt def gen_abortion_prompt_My(accused): - abortion_prompt = 'My ' - abortion_prompt += accused - abortion_prompt += ' ' - past = random.random() > 0.5 - if past: - abortion_prompt += random.choice(got_words) - else: - abortion_prompt += random.choice(will_get_words) - abortion_prompt += ' an' - abortion_prompt += random.choices(['', ' illegal', ' unlawful'], weights=[0.625, 0.375 / 2.0, 0.375 / 2.0], k=1)[0] - abortion_prompt += ' abortion' - if random.random() > 0.5: + abortion_prompt = 'My ' + abortion_prompt += accused abortion_prompt += ' ' + past = random.random() > 0.5 if past: - abortion_prompt += random.choice(past_time_frames) + abortion_prompt += random.choice(got_words) else: - abortion_prompt += random.choice(future_time_frames) - abortion_prompt += '.' - return abortion_prompt + abortion_prompt += random.choice(will_get_words) + abortion_prompt += ' an' + abortion_prompt += random.choices(['', ' illegal', ' unlawful'], weights=[0.625, 0.375 / 2.0, 0.375 / 2.0], k=1)[0] + abortion_prompt += ' abortion' + if random.random() > 0.5: + abortion_prompt += ' ' + if past: + abortion_prompt += random.choice(past_time_frames) + else: + abortion_prompt += random.choice(future_time_frames) + abortion_prompt += '.' + return abortion_prompt counter = 0 def gen_abortion_prompt(): - global counter - accused_family_person = random_select_weighted_list(my_family_possessive_adj) - accused_family_person += random_select_weighted_list(my_family_words) - accused_nonfamily_person = random_select_weighted_list(my_nonfamily_possessive_adj) - accused_nonfamily_person += random_select_weighted_list(my_nonfamily_words) - accused_teacher = random_select_weighted_list(my_teacher_possessive_adj) - accused_teacher += random_select_weighted_list(my_teacher_words) - accused = random_select_weighted_list([ - (1.0, accused_family_person), - (1.5, accused_nonfamily_person), - (0.5, accused_teacher) - ]) - abortion_prompts = [ - (1.0, gen_abortion_prompt_I(accused)), - (1.0, gen_abortion_prompt_My(accused)) - ] - counter += 1 - if random.random() < 0.001: - print('\r\x1b[K' + str(counter), end='') - return random_select_weighted_list(abortion_prompts) - #return add_typos(random_select_weighted_list(abortion_prompts)) + global counter + accused_family_person = random_select_weighted_list(my_family_possessive_adj) + accused_family_person += random_select_weighted_list(my_family_words) + accused_nonfamily_person = random_select_weighted_list(my_nonfamily_possessive_adj) + accused_nonfamily_person += random_select_weighted_list(my_nonfamily_words) + accused_teacher = random_select_weighted_list(my_teacher_possessive_adj) + accused_teacher += random_select_weighted_list(my_teacher_words) + accused = random_select_weighted_list([ + (1.0, accused_family_person), + (1.5, accused_nonfamily_person), + (0.5, accused_teacher) + ]) + abortion_prompts = [ + (1.0, gen_abortion_prompt_I(accused)), + (1.0, gen_abortion_prompt_My(accused)) + ] + counter += 1 + if random.random() < 0.001: + print('\r\x1b[K' + str(counter), end='') + return random_select_weighted_list(abortion_prompts) + #return add_typos(random_select_weighted_list(abortion_prompts)) bigram_counter = Counter() trigram_counter = Counter() quadgram_counter = Counter() def check_ngram_frequency(prompt): - words = prompt.split() - for i in range(len(words) - 1): - cur_bigram = ' '.join(words[i:i+2]) - bigram_counter[cur_bigram] += 1 - for i in range(len(words) - 2): - cur_trigram = ' '.join(words[i:i+3]) - trigram_counter[cur_trigram] += 1 - for i in range(len(words) - 3): - cur_quadgram = ' '.join(words[i:i+4]) - quadgram_counter[cur_quadgram] += 1 - return prompt + words = prompt.split() + for i in range(len(words) - 1): + cur_bigram = ' '.join(words[i:i+2]) + bigram_counter[cur_bigram] += 1 + for i in range(len(words) - 2): + cur_trigram = ' '.join(words[i:i+3]) + trigram_counter[cur_trigram] += 1 + for i in range(len(words) - 3): + cur_quadgram = ' '.join(words[i:i+4]) + quadgram_counter[cur_quadgram] += 1 + return prompt def write_ngram_to_file(counter, filename, total): with open(filename, 'w') as writer: @@ -376,13 +376,13 @@ def write_ngram_to_file(counter, filename, total): break if __name__ == "__main__": - total_number = 2000000 - sample_abortion_prompts = [ check_ngram_frequency(gen_abortion_prompt()) for k in range(total_number) ] - for k in sorted(list(sample_abortion_prompts)[:200], key = lambda o: random.random()): - print(k) - unique = len(set(sample_abortion_prompts)) - print('Duplicates: ' + str(total_number - unique)) - print('Unique: ' + str(unique)) - write_ngram_to_file(bigram_counter, 'bigram_freq.txt', len(sample_abortion_prompts)) - write_ngram_to_file(trigram_counter, 'trigram_freq.txt', len(sample_abortion_prompts)) - write_ngram_to_file(quadgram_counter, 'quadgram_freq.txt', len(sample_abortion_prompts)) + total_number = 2000000 + sample_abortion_prompts = [ check_ngram_frequency(gen_abortion_prompt()) for k in range(total_number) ] + for k in sorted(list(sample_abortion_prompts)[:200], key = lambda o: random.random()): + print(k) + unique = len(set(sample_abortion_prompts)) + print('Duplicates: ' + str(total_number - unique)) + print('Unique: ' + str(unique)) + write_ngram_to_file(bigram_counter, 'bigram_freq.txt', len(sample_abortion_prompts)) + write_ngram_to_file(trigram_counter, 'trigram_freq.txt', len(sample_abortion_prompts)) + write_ngram_to_file(quadgram_counter, 'quadgram_freq.txt', len(sample_abortion_prompts))