-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconversationFeatures.py
More file actions
executable file
·124 lines (91 loc) · 4.26 KB
/
conversationFeatures.py
File metadata and controls
executable file
·124 lines (91 loc) · 4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
import sys
import csv
import datetime
import argparse
from statistics import mean, median
from collections import defaultdict
import matplotlib
"""
Generates a .csv of features of a text conversation.
Author: Julie Medero
"""
class Utterance:
    """A single message in a text conversation.

    Attributes:
        speaker: Identifier of whoever sent the message.
        dt: ``datetime.datetime`` the message was sent, or ``None`` when no
            timestamp was supplied.
        body: Text of the message.
    """

    # Layout expected for string timestamps, e.g. "01/31/2020 14:05".
    DATE_FORMAT = "%m/%d/%Y %H:%M"

    def __init__(self, speaker="", dt=None, body=""):
        """Build an utterance.

        Args:
            speaker: Identifier of the sender.
            dt: Timestamp as a string in ``DATE_FORMAT``, an existing
                ``datetime.datetime``, or ``None`` for "unknown".
            body: Text of the message.

        Raises:
            ValueError: If ``dt`` is a string that does not match
                ``DATE_FORMAT``.
        """
        self.speaker = speaker
        if dt is None or isinstance(dt, datetime.datetime):
            # Nothing to parse: keep the default usable and accept values
            # that are already datetimes.
            self.dt = dt
        else:
            self.dt = datetime.datetime.strptime(dt, self.DATE_FORMAT)
        self.body = body

    def __repr__(self):
        return "{}(speaker={!r}, dt={!r}, body={!r})".format(
            type(self).__name__, self.speaker, self.dt, self.body)
def main(args):
    """Write per-speaker conversation statistics for each input file as CSV.

    Each input file is read as tab-separated rows of
    ``speaker, timestamp, body``. For every speaker in a file one output row
    is written with the mean/median of:

    * utterance length in whitespace-separated words,
    * number of consecutive utterances per uninterrupted run,
    * number of words per uninterrupted run,
    * response time, measured from the start of the previous speaker's run
      to the first utterance of this speaker's reply.

    Args:
        args: Namespace with ``output_file`` (a writable text file) and
            ``input_files`` (an iterable of open, readable text files, each
            exposing a ``name`` attribute).

    Files that contain no utterances produce no rows. A speaker with no
    recorded response time gets empty response-time cells.
    """
    writer = csv.writer(args.output_file)
    headers = ("File",
               "Speaker",
               "Mean Utterance Length",
               "Median Utterance Length",
               "Mean Consecutive Utterances",
               "Median Consecutive Utterances",
               "Mean Consecutive Words",
               "Median Consecutive Words",
               "Mean Response Time",
               "Median Response Time")
    writer.writerow(headers)

    for input_file in args.input_files:
        reader = csv.reader(input_file, delimiter='\t', quotechar='"')
        # Blank lines come back from csv.reader as empty rows; skip them so
        # the three-way unpacking below does not fail on them.
        utterances = [Utterance(speaker, dt, body)
                      for speaker, dt, body in (row for row in reader if row)]
        if not utterances:
            # Nothing to summarise for this file.
            continue

        utterance_lengths = defaultdict(list)
        consecutive_texts = defaultdict(list)
        consecutive_words = defaultdict(list)
        response_time = defaultdict(list)

        # State of the run (uninterrupted sequence by one speaker) in progress.
        last_speaker = utterances[0].speaker
        last_sequence_start = utterances[0].dt
        this_dt = last_sequence_start
        num_utterances = 0
        consecutive_body_length = 0

        for utterance in utterances:
            this_speaker = utterance.speaker
            this_dt = utterance.dt
            this_body_length = len(utterance.body.split())
            utterance_lengths[this_speaker].append(this_body_length)

            if this_speaker != last_speaker:
                # The previous run just ended: record it before counting the
                # current utterance, which belongs to the new run.
                consecutive_texts[last_speaker].append(num_utterances)
                consecutive_words[last_speaker].append(consecutive_body_length)
                response_time[this_speaker].append(this_dt - last_sequence_start)
                num_utterances = 0
                consecutive_body_length = 0
                last_sequence_start = this_dt

            num_utterances += 1
            consecutive_body_length += this_body_length
            last_speaker = this_speaker

        # Close out the run that was still open when the file ended.
        consecutive_texts[last_speaker].append(num_utterances)
        consecutive_words[last_speaker].append(consecutive_body_length)
        # NOTE(review): this records the duration of the final run as a
        # "response time" for the last speaker; kept for output
        # compatibility -- confirm whether it is intended.
        response_time[last_speaker].append(this_dt - last_sequence_start)

        for speaker, lengths in utterance_lengths.items():
            avg_len = mean(lengths)
            med_len = median(lengths)
            avg_consec_utterances = mean(consecutive_texts[speaker])
            med_consec_utterances = median(consecutive_texts[speaker])
            avg_consec_words = mean(consecutive_words[speaker])
            med_consec_words = median(consecutive_words[speaker])

            responses = response_time.get(speaker, [])
            if responses:
                total_response_time = sum(responses, datetime.timedelta())
                avg_response_time = total_response_time / len(responses)
                # statistics.median works on timedeltas and averages the two
                # middle values for even counts, like the other medians here.
                median_response_time = median(responses)
            else:
                # E.g. a speaker who opened the conversation and never
                # replied; csv writes None as an empty cell.
                avg_response_time = None
                median_response_time = None

            row = (input_file.name, speaker, avg_len, med_len,
                   avg_consec_utterances, med_consec_utterances,
                   avg_consec_words, med_consec_words,
                   avg_response_time, median_response_time)
            writer.writerow(row)
def csv_readable(string):
    """Open the file at path ``string`` for reading by the ``csv`` module.

    Newline translation is disabled (``newline=''``) so the csv reader sees
    line endings exactly as stored. Returns the open text file object.
    """
    handle = open(string, mode='r', newline='')
    return handle
def csv_writeable(string):
    """Open the file at path ``string`` for writing by the ``csv`` module.

    The file is created or truncated. Newline translation is disabled
    (``newline=''``) so the csv writer's own line terminators are written
    unchanged. Returns the open text file object.
    """
    handle = open(string, mode='w', newline='')
    return handle
if __name__ == "__main__":
    # Command line: one or more input files, optional -o/--output target
    # (defaults to standard output).
    parser = argparse.ArgumentParser()
    parser.add_argument('input_files', metavar='FILE', nargs='+',
                        help='csv file(s) to analyze', type=csv_readable)
    parser.add_argument('-o', '--output', dest='output_file',
                        help='csv file to save analysis to',
                        type=csv_writeable, default=sys.stdout)
    args = parser.parse_args()
    try:
        main(args)
    finally:
        # argparse opened these files via the type= callbacks; close them
        # here so handles are released and output is flushed even if
        # main() raises.
        for handle in args.input_files:
            handle.close()
        # Never close the interpreter's own stdout.
        if args.output_file is not sys.stdout:
            args.output_file.close()