-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathpreprocess.py
More file actions
42 lines (37 loc) · 1.51 KB
/
preprocess.py
File metadata and controls
42 lines (37 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import os, sys
import subprocess
import glob
# Length of every output clip, in seconds.
CHUNK_SECONDS = 8
# Temporary files created inside the output directory and removed at the end.
LIST_NAME = 'preprocess_file_list.txt'
ALL_AUDIO_NAME = 'preprocess_all_audio.wav'
PROMPTS_NAME = 'prompts.txt'


def _find_wav_files(root):
    """Return the absolute path of every .wav file below ``root``.

    Directories and files are visited in sorted order so that repeated runs
    concatenate the audio in the same sequence.
    """
    found = []
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames.sort()  # in-place sort steers the order os.walk descends in
        for filename in sorted(filenames):
            # Test the extension, not a substring: "take.wav.bak" is not audio.
            if filename.lower().endswith('.wav'):
                # Absolute paths, because the concat demuxer resolves relative
                # entries against the list file's location rather than the
                # current working directory.
                found.append(os.path.abspath(os.path.join(dirpath, filename)))
    return found


def _concat_entry(path):
    """Format ``path`` as one ``file '...'`` line for ffmpeg's concat demuxer."""
    # Inside single quotes ffmpeg reads text literally, so a quote in the path
    # is written as: close the quote, emit an escaped quote, reopen the quote.
    return "file '{}'\n".format(path.replace("'", "'\\''"))


def _probe_duration(path):
    """Return the duration of ``path`` in seconds as reported by ffprobe.

    Raises subprocess.CalledProcessError if ffprobe exits with an error and
    ValueError if its output is not a number.
    """
    raw = subprocess.check_output([
        'ffprobe', '-i', path, '-show_entries', 'format=duration',
        '-v', 'quiet', '-of', 'csv=p=0'
    ])
    return float(raw.decode('utf-8').strip())


def main(argv=None):
    """Cut all .wav audio under a directory into fixed-length mono clips.

    Usage: ``preprocess.py RAW_DATA_DIR``

    The clips are written to ``<RAW_DATA_DIR>_parts`` as ``p0.wav``,
    ``p1.wav``, ... (mono, 16 kHz) together with a ``prompts.txt`` that lists
    each clip name followed by a tab and the word ``none``.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Raises:
        SystemExit: if the directory argument is missing or no .wav file is
            found (nothing is created in either case).
        OSError: if the output directory already exists.
        subprocess.CalledProcessError: if concatenation or probing fails.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: {} RAW_DATA_DIR'.format(
            argv[0] if argv else 'preprocess.py'))

    raw_data_dir = str(argv[1])
    # normpath drops a trailing slash, so "data/" maps to the sibling
    # "data_parts" instead of "data/_parts" inside the input tree.
    output_dir = '{}_parts'.format(os.path.normpath(raw_data_dir))

    wav_files = _find_wav_files(raw_data_dir)
    if not wav_files:
        sys.exit('no .wav files found under {}'.format(raw_data_dir))

    # Raises if the directory already exists, which also keeps clips from an
    # earlier run out of prompts.txt.
    os.makedirs(output_dir)
    print(raw_data_dir)
    print(output_dir)

    list_path = os.path.join(output_dir, LIST_NAME)
    all_audio_path = os.path.join(output_dir, ALL_AUDIO_NAME)

    # Step 1: write all filenames to a list
    with open(list_path, 'w') as f:
        f.writelines(_concat_entry(path) for path in wav_files)

    # Step 2: concatenate everything into one massive wav file
    # (argument lists instead of shell strings, so paths containing spaces or
    # shell metacharacters are passed through untouched)
    subprocess.check_call([
        'ffmpeg', '-f', 'concat', '-safe', '0', '-i', list_path,
        all_audio_path
    ])
    length = _probe_duration(all_audio_path)

    # Step 3: split the big file into fixed-length chunks
    # NOTE(review): the "- 1" skips the last full chunk as well as the partial
    # tail; kept exactly as the original computed it - confirm it is intended.
    for i in range(int(length) // CHUNK_SECONDS - 1):
        chunk_path = os.path.join(output_dir, 'p{}.wav'.format(i))
        status = subprocess.call([
            'ffmpeg', '-ss', str(CHUNK_SECONDS * i), '-t', str(CHUNK_SECONDS),
            '-i', all_audio_path, '-ac', '1', '-ab', '16k', '-ar', '16000',
            chunk_path
        ])
        if status != 0:
            # A failed chunk does not abort the run, but it is reported.
            sys.stderr.write('warning: ffmpeg exited with {} for {}\n'.format(
                status, chunk_path))

    # Step 4: clean up temp files
    os.remove(all_audio_path)
    os.remove(list_path)

    # Only clips that actually exist on disk are listed.
    with open(os.path.join(output_dir, PROMPTS_NAME), 'w') as f:
        for part in sorted(glob.glob(os.path.join(output_dir, '*.wav'))):
            f.write('{}\tnone\n'.format(os.path.basename(part)))


if __name__ == '__main__':
    main()