From 961706b27ef792a4a69aa5d79dec60d46d22a27b Mon Sep 17 00:00:00 2001 From: YiwenShaoStephen Date: Wed, 1 Nov 2017 19:38:29 -0400 Subject: [PATCH 1/3] enable parallel jobs in feature extraction; minor changes on formatting --- .../s5/local/augment_and_make_feature_vect.py | 288 ------------------ egs/iam/s5/local/extract_feature.sh | 70 +++++ egs/iam/s5/local/make_feature_vect.py | 246 +++++++++++++-- egs/iam/s5/local/prepare_lexicon.py | 51 ++-- egs/iam/s5/local/process_augment_data.py | 75 +++++ egs/iam/s5/local/process_data.py | 112 +++---- .../s5/local/unk_arc_post_to_transcription.py | 105 +++---- egs/iam/s5/run.sh | 38 +-- 8 files changed, 516 insertions(+), 469 deletions(-) delete mode 100755 egs/iam/s5/local/augment_and_make_feature_vect.py create mode 100755 egs/iam/s5/local/extract_feature.sh create mode 100755 egs/iam/s5/local/process_augment_data.py diff --git a/egs/iam/s5/local/augment_and_make_feature_vect.py b/egs/iam/s5/local/augment_and_make_feature_vect.py deleted file mode 100755 index b1c179d71ed..00000000000 --- a/egs/iam/s5/local/augment_and_make_feature_vect.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python -import random -import argparse -import os -import sys -import scipy.io as sio -import numpy as np -from scipy import misc -from scipy.ndimage.interpolation import affine_transform -import math -from signal import signal, SIGPIPE, SIG_DFL -signal(SIGPIPE, SIG_DFL) - -parser = argparse.ArgumentParser( - description="""Generates and saves the feature vectors""") -parser.add_argument( - 'dir', type=str, help='directory of images.scp and is also output directory') -parser.add_argument('--seg', type=str, default='1', - help='JOB number of images.JOB.scp if run in parallel mode') -parser.add_argument('--out-ark', type=str, default='-', - help='where to write the output feature file') -parser.add_argument('--scale-size', type=int, default=40, - help='size to scale the height of all images') -parser.add_argument('--padding', type=int, default=5, - help='size to scale the height of all images') -parser.add_argument('--vertical-shift', type=int, default=10, - help='total number of padding pixel per column') -args = parser.parse_args() - - -def write_kaldi_matrix(file_handle, matrix, key): - file_handle.write(key + " [ ") - num_rows = len(matrix) - if num_rows == 0: - raise Exception("Matrix is empty") - num_cols = len(matrix[0]) - - for row_index in range(len(matrix)): - if num_cols != len(matrix[row_index]): - raise Exception("All the rows of a matrix are expected to " - "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) - if row_index != num_rows - 1: - file_handle.write("\n") - file_handle.write(" ]\n") - - -def get_scaled_image(im): - scale_size = args.scale_size - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - padding_x = max(5, int((args.padding / 100) * im.shape[1])) - padding_y = im.shape[0] - im_pad = np.concatenate( - (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) - im_pad1 = np.concatenate( - (im_pad, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) - return im_pad1 - - -def contrast_normalization(im, low_pct, high_pct): - element_number = im.size - rows = im.shape[0] - cols = im.shape[1] - im_contrast = np.zeros(shape=im.shape) - low_index = int(low_pct * element_number) - high_index = int(high_pct * element_number) - sorted_im = np.sort(im, axis=None) - low_thred = sorted_im[low_index] - high_thred = sorted_im[high_index] - for i in range(rows): - for j in range(cols): - if im[i, j] > high_thred: - im_contrast[i, j] = 255 # lightest to white - elif im[i, j] < low_thred: - im_contrast[i, j] = 0 # darkest to black - else: - # linear normalization - im_contrast[i, j] = (im[i, j] - low_thred) * \ - 255 / (high_thred - low_thred) - return im_contrast - - -def geometric_moment(frame, p, q): - m = 0 - for i in range(frame.shape[1]): - for j in range(frame.shape[0]): - m += (i ** p) * (j ** q) * frame[i][i] - return m - - -def central_moment(frame, p, q): - u = 0 - x_bar = geometric_moment(frame, 1, 0) / \ - geometric_moment(frame, 0, 0) # m10/m00 - y_bar = geometric_moment(frame, 0, 1) / \ - geometric_moment(frame, 0, 0) # m01/m00 - for i in range(frame.shape[1]): - for j in range(frame.shape[0]): - u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] - return u - - -def height_normalization(frame, w, h): - frame_normalized = np.zeros(shape=(h, w)) - alpha = 4 - x_bar = geometric_moment(frame, 1, 0) / \ - geometric_moment(frame, 0, 0) # m10/m00 - y_bar = geometric_moment(frame, 0, 1) / \ - geometric_moment(frame, 0, 0) # m01/m00 - sigma_x = (alpha * ((central_moment(frame, 2, 0) / - geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) - sigma_y = (alpha * ((central_moment(frame, 0, 2) / - geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) - for x in range(w): - for y in range(h): - i = int((x / w - 0.5) * sigma_x + x_bar) - j = int((y / h - 0.5) * sigma_y + y_bar) - frame_normalized[x][y] = frame[i][j] - return frame_normalized - - -def find_slant(im): - rows = im.shape[0] - cols = im.shape[1] - sum_max = 0 - slant_degree = 0 - for shear_degree in range(-45, 45, 5): - sum = 0 - shear_rad = shear_degree / 360.0 * 2 * math.pi - shear_matrix = np.array([[1, 0], - [np.tan(shear_rad), 1]]) - sheared_im = affine_transform(im, shear_matrix, cval=255.0) - for j in range(cols): - foreground = (sheared_im[:, j] < 100) - number = np.sum(foreground) - # print(number) - if number != 0: - start_point = -1 - end_point = -1 - start_point = 0 - for i in range(rows): - if foreground[i] == 1: - start_point = i - break - for i in range(rows - 1, -1, -1): - if foreground[i] == 1: - end_point = i - break - length = end_point - start_point + 1 - #print(number, length) - if length == number: - sum = sum + number * number - #print(shear_degree, sum) - if sum > sum_max: - sum_max = sum - slant_degree = shear_degree - return slant_degree - - -def deslant(im, shear): - padding_x = int(abs(np.tan(shear)) * im.shape[0]) - padding_y = im.shape[0] - if shear > 0: - im_pad = np.concatenate( - (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) - else: - im_pad = np.concatenate( - (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) - - shear_matrix = np.array([[1, 0], - [np.tan(shear), 1]]) - # sheared_im = affine_transform(image, shear_matrix, output_shape=( - # im.shape[0], im.shape[1] + abs(int(im.shape[0] * np.tan(shear)))), cval=128.0) - sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) - return sheared_im - - -def vertical_shift(im, mode='mid'): - total = args.vertical_shift - if mode == 'mid': - top = total / 2 - bottom = total - top - elif mode == 'top': # more padding on top - top = random.randint(total / 2, total) - bottom = total - top - elif mode == 'bottom': # more padding on bottom - top = random.randint(0, total / 2) - bottom = total - top - width = im.shape[1] - im_pad = np.concatenate( - (255 * np.ones((top, width), dtype=int), im), axis=0) - im_pad = np.concatenate( - (im_pad, 255 * np.ones((bottom, width), dtype=int)), axis=0) - return im_pad - - -def image_augment(im, out_fh, image_id): - random.seed(1) - shift_setting = ['mid', 'top', 'bottom'] - image_shift_id = [] - for i in range(3): - image_shift_id.append(image_id + '_shift' + str(i + 1)) - im_shift = vertical_shift(im, shift_setting[i]) - im_scaled = get_scaled_image(im_shift) - data = np.transpose(im_scaled, (1, 0)) - data = np.divide(data, 255.0) - new_scp_list.append(image_id + '_shift' + str(i + 1)) - write_kaldi_matrix(out_fh, data, image_shift_id[i]) - - -# main # -new_scp_list = list() -text_file = os.path.join(args.dir, 'backup', 'text') -text_dict = dict() # stores imageID and text - -with open(text_file) as text_fh: - for uttID_text in text_fh: - uttID_text = uttID_text.strip() - uttID_text_vect = uttID_text.split(" ") - uttID = uttID_text_vect[0] - imageID = uttID.split("_")[1] - text_vect = uttID_text_vect[1:] - text = " ".join(text_vect) - text_dict[imageID] = text - -utt2spk_file = os.path.join(args.dir, 'backup', 'utt2spk') -uttID_spk_dict = dict() # stores imageID and speaker - -with open(utt2spk_file) as utt2spk_fh: - for uttID_spk in utt2spk_fh: - uttID_spk = uttID_spk.strip() - uttID_spk_vect = uttID_spk.split(" ") - uttID = uttID_spk_vect[0] - imageID = uttID.split("_")[1] - spk = uttID_spk_vect[1] - uttID_spk_dict[imageID] = spk - -image_file = os.path.join(args.dir, 'backup', 'images.scp') -uttID_path_dict = dict() # stores imageID and image path - -with open(image_file) as image_fh: - for uttID_path in image_fh: - uttID_path = uttID_path.strip() - uttID_path_vect = uttID_path.split(" ") - uttID = uttID_path_vect[0] - imageID = uttID.split("_")[1] - path = uttID_path_vect[1] - uttID_path_dict[imageID] = path - -scp_name = 'images.scp' -data_list_path = os.path.join(args.dir, 'backup', scp_name) - -if args.out_ark == '-': - out_fh = sys.stdout -else: - out_fh = open(args.out_ark, 'wb') - -text_file = os.path.join(args.dir, 'text') -text_fh = open(text_file, 'w+') - -utt2spk_file = os.path.join(args.dir, 'utt2spk') -utt2spk_fh = open(utt2spk_file, 'w+') - -image_file = os.path.join(args.dir, 'images.scp') -image_fh = open(image_file, 'w+') - -with open(data_list_path) as f: - for line in f: - line = line.strip() - line_vect = line.split(' ') - image_id = line_vect[0] - image_path = line_vect[1] - im = misc.imread(image_path) - #im_contrast = contrast_normalization(im, 0.05, 0.2) - #shear = (find_slant(im_contrast) / 360.0) * 2 * math.pi - im_scaled = get_scaled_image(im) - image_augment(im_scaled, out_fh, image_id) - -for uttID in new_scp_list: - imageID = uttID.split("_")[1] - text_fh.write(uttID + ' ' + text_dict[imageID] + '\n') - utt2spk_fh.write(uttID + ' ' + uttID_spk_dict[imageID] + '\n') - image_fh.write(uttID + ' ' + uttID_path_dict[imageID] + '\n') diff --git a/egs/iam/s5/local/extract_feature.sh b/egs/iam/s5/local/extract_feature.sh new file mode 100755 index 00000000000..d7b4ba79a54 --- /dev/null +++ b/egs/iam/s5/local/extract_feature.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +nj=4 +cmd=run.pl +compress=true +scale_size=40 +vertical_shift=10 +horizontal_shear=45 +augment=false +echo "$0 $@" + +. utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +logdir=$data/log + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +if [ -f $data/feats.scp ]; then + mkdir -p $data/.backup + echo "$0: moving $data/feats.scp to $data/.backup" + mv $data/feats.scp $data/.backup +fi + +if [ $augment = true ] && [[ $data = *'train'* ]]; then + if [ ! -d $data/backup ]; then + mkdir -p $data/backup + mv $data/text $data/utt2spk $data/images.scp $data/backup/ + else + cp $data/backup/* $data + fi +fi + + +scp=$data/images.scp +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +utils/split_scp.pl $scp $split_scps || exit 1; + + +# add ,p to the input rspecifier so that we can just skip over +# utterances that have bad wave data. +$cmd JOB=1:$nj $logdir/extract_feature.JOB.log \ + local/make_feature_vect.py $logdir --job JOB --scale-size $scale_size --augment $augment --horizontal-shear $horizontal_shear \| \ + copy-feats --compress=$compress --compression-method=7 ark:- \ + ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp \ + || exit 1; + +# concatenate the .scp files together. +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 + +# re-map utt2spk, images.scp and text if doing image augmentation +# on training set +if [ $augment = true ] && [[ $data = *'train'* ]]; then + local/process_augment_data.py $data + utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt +fi + +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` +if [ $nf -ne $nu ]; then + echo "It seems not all of the feature files were successfully processed ($nf != $nu);" + echo "consider using utils/fix_data_dir.sh $data" +fi diff --git a/egs/iam/s5/local/make_feature_vect.py b/egs/iam/s5/local/make_feature_vect.py index dd35f1b14c7..697f9d92b86 100755 --- a/egs/iam/s5/local/make_feature_vect.py +++ b/egs/iam/s5/local/make_feature_vect.py @@ -1,20 +1,34 @@ #!/usr/bin/env python - +import random import argparse import os import sys import scipy.io as sio import numpy as np from scipy import misc - +from scipy.ndimage.interpolation import affine_transform +import math from signal import signal, SIGPIPE, SIG_DFL -signal(SIGPIPE,SIG_DFL) +signal(SIGPIPE, SIG_DFL) -parser = argparse.ArgumentParser(description="""Generates and saves the feature vectors""") -parser.add_argument('dir', type=str, help='directory of images.scp and is also output directory') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file') -parser.add_argument('--scale-size', type=int, default=40, help='size to scale the height of all images') -parser.add_argument('--padding', type=int, default=5, help='size to scale the height of all images') +parser = argparse.ArgumentParser( + description="""Generates and saves the feature vectors""") +parser.add_argument( + 'dir', type=str, help='directory of images.scp and is also output directory') +parser.add_argument('--job', type=str, default='', + help='JOB number of images.JOB.scp') +parser.add_argument('--out-ark', type=str, default='-', + help='where to write the output feature file') +parser.add_argument('--scale-size', type=int, default=40, + help='size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='size to scale the height of all images') +parser.add_argument('--augment', type=str, default='false', + help='whether or not to do image augmentation on training set') +parser.add_argument('--vertical-shift', type=int, default=10, + help='total number of padding pixel per column') +parser.add_argument('--horizontal-shear', type=int, default=45, + help='maximum horizontal shearing degree') args = parser.parse_args() @@ -34,38 +48,212 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") + def get_scaled_image(im): scale_size = args.scale_size - sx = im.shape[1] - sy = im.shape[0] + sx = im.shape[1] # width + sy = im.shape[0] # height scale = (1.0 * scale_size) / sy nx = int(scale_size) ny = int(scale * sx) im = misc.imresize(im, (nx, ny)) - padding_x = max(5,int((args.padding/100)*im.shape[1])) + padding_x = max(5, int((args.padding / 100) * im.shape[1])) padding_y = im.shape[0] - im_pad = np.concatenate((255 * np.ones((padding_y,padding_x), dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad,255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + im_pad1 = np.concatenate( + (im_pad, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) return im_pad1 -### main ### -data_list_path = os.path.join(args.dir,'images.scp') +def contrast_normalization(im, low_pct, high_pct): + element_number = im.size + rows = im.shape[0] + cols = im.shape[1] + im_contrast = np.zeros(shape=im.shape) + low_index = int(low_pct * element_number) + high_index = int(high_pct * element_number) + sorted_im = np.sort(im, axis=None) + low_thred = sorted_im[low_index] + high_thred = sorted_im[high_index] + for i in range(rows): + for j in range(cols): + if im[i, j] > high_thred: + im_contrast[i, j] = 255 # lightest to white + elif im[i, j] < low_thred: + im_contrast[i, j] = 0 # darkest to black + else: + # linear normalization + im_contrast[i, j] = (im[i, j] - low_thred) * \ + 255 / (high_thred - low_thred) + return im_contrast + + +def geometric_moment(frame, p, q): + m = 0 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + m += (i ** p) * (j ** q) * frame[i][i] + return m + + +def central_moment(frame, p, q): + u = 0 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] + return u + + +def height_normalization(frame, w, h): + frame_normalized = np.zeros(shape=(h, w)) + alpha = 4 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + sigma_x = (alpha * ((central_moment(frame, 2, 0) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) + sigma_y = (alpha * ((central_moment(frame, 0, 2) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) + for x in range(w): + for y in range(h): + i = int((x / w - 0.5) * sigma_x + x_bar) + j = int((y / h - 0.5) * sigma_y + y_bar) + frame_normalized[x][y] = frame[i][j] + return frame_normalized + + +def find_slant_project(im): + rows = im.shape[0] + cols = im.shape[1] + std_max = 0 + alpha_max = 0 + col_disp = np.zeros(90, int) + proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int) + for r in range(rows): + for alpha in range(-45, 45, 1): + col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi)) + for c in range(cols): + if im[r, c] < 100: + for alpha in range(-45, 45, 1): + proj[alpha + 45, c + col_disp[alpha] + rows] += 1 + for alpha in range(-45, 45, 1): + proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10) + proj_std = np.std(proj_histogram) + if proj_std > std_max: + std_max = proj_std + alpha_max = alpha + proj_std = np.std(proj, axis=1) + return -alpha_max + + +def horizontal_shear(im, degree): + rad = degree / 180.0 * math.pi + padding_x = int(abs(np.tan(rad)) * im.shape[0]) + padding_y = im.shape[0] + if rad > 0: + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + elif rad < 0: + im_pad = np.concatenate( + (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + else: + im_pad = im + shear_matrix = np.array([[1, 0], + [np.tan(rad), 1]]) + # sheared_im = affine_transform(image, shear_matrix, output_shape=( + # im.shape[0], im.shape[1] + abs(int(im.shape[0] * np.tan(shear)))), cval=128.0) + sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) + return sheared_im + + +def vertical_shift(im, mode='mid'): + total = args.vertical_shift + if mode == 'mid': + top = total / 2 + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total / 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total / 2) + bottom = total - top + width = im.shape[1] + im_pad = np.concatenate( + (255 * np.ones((top, width), dtype=int) - + np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) + im_pad = np.concatenate( + (im_pad, 255 * np.ones((bottom, width), dtype=int) - + np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) + return im_pad + + +def image_augment(im, out_fh, image_id): + # shift_setting = ['mid', 'top', 'bottom'] + slant_degree = find_slant_project(im) + shear_degrees = [0, random.randint(0, args.horizontal_shear), + random.randint(-args.horizontal_shear, 0)] + im_deslanted = horizontal_shear(im, slant_degree) + image_shear_id = [] + for i in range(3): + image_shear_id.append(image_id + '_shear' + str(i + 1)) + im_shear = horizontal_shear(im_deslanted, shear_degrees[i]) + data = np.transpose(im_shear, (1, 0)) + data = np.divide(data, 255.0) + write_kaldi_matrix(out_fh, data, image_shear_id[i]) + + # image_shift_id.append(image_id + '_shift' + str(i + 1)) + # im_shift = vertical_shift(im, shift_setting[i]) + # data = np.transpose(im_shift, (1, 0)) + # data = np.divide(data, 255.0) + # new_scp_list.append(image_id + '_shift' + str(i + 1)) + # write_kaldi_matrix(out_fh, data, image_shift_id[i]) + + +# main # + +random.seed(1) + +scp_name = 'images.' + args.job + '.scp' # parallel +data_list_path = os.path.join(args.dir, scp_name) +# output dir of feature matrix if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') - -with open(data_list_path) as f: - for line in f: - line = line.strip() - line_vect = line.split(' ') - image_id = line_vect[0] - image_path = line_vect[1] - im = misc.imread(image_path) - im_scale = get_scaled_image(im) - - data = np.transpose(im_scale, (1, 0)) - data = np.divide(data, 255.0) - write_kaldi_matrix(out_fh, data, image_id) + out_fh = open(args.out_ark, 'wb') + + +if (args.augment == 'true') and ('train' in args.dir): + # only do image augmentation for training data + with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_contrast = contrast_normalization(im, 0.05, 0.2) + im_scaled = get_scaled_image(im) + image_augment(im_scaled, out_fh, image_id) +else: # settings for without augmentation or test data + with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scaled = get_scaled_image(im) + im_contrast = contrast_normalization(im_scaled, 0.05, 0.2) + # slant_degree = find_slant_project(im_contrast) + # im_sheared = horizontal_shear(im_contrast, slant_degree) + # im_padded = vertical_shift(im_scaled, 10) + data = np.transpose(im_contrast, (1, 0)) + data = np.divide(data, 255.0) + write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/iam/s5/local/prepare_lexicon.py b/egs/iam/s5/local/prepare_lexicon.py index 86298c45733..6a31e635a18 100755 --- a/egs/iam/s5/local/prepare_lexicon.py +++ b/egs/iam/s5/local/prepare_lexicon.py @@ -4,9 +4,11 @@ import os import sys -parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser = argparse.ArgumentParser( + description="""Creates the list of characters and wor ds in lexicon""") parser.add_argument('database_path', type=str, help='path to train text file') -parser.add_argument('test_text', type=str, help='path to test text file to include it in lexicon') +parser.add_argument('test_text', type=str, + help='path to test text file to include it in lexicon') parser.add_argument('dir', type=str, help='output path') args = parser.parse_args() @@ -14,33 +16,32 @@ char = {} lex = {} -text_path = os.path.join(args.database_path,'text') +text_path = os.path.join(args.database_path, 'text') with open(text_path) as f: - for line in f: - line = line.strip() - line_vect = line.split(' ') - for i in range(1,len(line_vect)): - characters = list(line_vect[i]) - entry = " ".join(characters) - entry = entry.replace("#", "") - if line_vect[i]: - lex[line_vect[i]] = entry - -if args.test_text > 1: - text_path = os.path.join(args.test_text,'text') - with open(text_path) as f: for line in f: - line = line.strip() - line_vect = line.split(' ') - for i in range(1,len(line_vect)): - characters = list(line_vect[i]) - entry = " ".join(characters) - entry = entry.replace("#", "") - if line_vect[i]: - lex[line_vect[i]] = entry + line = line.strip() + line_vect = line.split(' ') + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + entry = " ".join(characters) + entry = entry.replace("#", "") + if line_vect[i]: + lex[line_vect[i]] = entry +if args.test_text > 1: + text_path = os.path.join(args.test_text, 'text') + with open(text_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + entry = " ".join(characters) + entry = entry.replace("#", "") + if line_vect[i]: + lex[line_vect[i]] = entry lex_file = os.path.join(args.dir, 'lexicon.txt') lex_fh = open(lex_file, 'w+') for key in sorted(lex): - lex_fh.write(key + " " + lex[key] + "\n") + lex_fh.write(key + " " + lex[key] + "\n") diff --git a/egs/iam/s5/local/process_augment_data.py b/egs/iam/s5/local/process_augment_data.py new file mode 100755 index 00000000000..e5f98b44f46 --- /dev/null +++ b/egs/iam/s5/local/process_augment_data.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +import os +import argparse + +parser = argparse.ArgumentParser( + description="""Regenerate images.scp, text, utt2spk and spk2utt from feats.scp for augment data""") +parser.add_argument( + 'dir', type=str, help='directory of images.scp') +args = parser.parse_args() + + +text_file = os.path.join(args.dir, 'backup', 'text') +#text_file = os.path.join(args.dir, 'text.txt') +text_dict = dict() # stores imageID and text + +with open(text_file) as text_fh: + for uttID_text in text_fh: + uttID_text = uttID_text.strip() + uttID_text_vect = uttID_text.split(" ") + uttID = uttID_text_vect[0] + imageID = uttID.split("_")[1] + text_vect = uttID_text_vect[1:] + text = " ".join(text_vect) + text_dict[imageID] = text + # print "%s: %s" % (imageID, text) + +utt2spk_file = os.path.join(args.dir, 'backup', 'utt2spk') +#utt2spk_file = os.path.join(args.dir, 'utt2spk') +uttID_spk_dict = dict() # stores imageID and speaker + +with open(utt2spk_file) as utt2spk_fh: + for uttID_spk in utt2spk_fh: + uttID_spk = uttID_spk.strip() + uttID_spk_vect = uttID_spk.split(" ") + uttID = uttID_spk_vect[0] + imageID = uttID.split("_")[1] + spk = uttID_spk_vect[1] + uttID_spk_dict[imageID] = spk + # print "%s: %s" % (imageID, spk) + +image_file = os.path.join(args.dir, 'backup', 'images.scp') +#image_file = os.path.join(args.dir, 'images.scp') +uttID_path_dict = dict() # stores imageID and image path + +with open(image_file) as image_fh: + for uttID_path in image_fh: + uttID_path = uttID_path.strip() + uttID_path_vect = uttID_path.split(" ") + uttID = uttID_path_vect[0] + imageID = uttID.split("_")[1] + path = uttID_path_vect[1] + uttID_path_dict[imageID] = path + # print "%s: %s" % (imageID, path) + + +image_file = os.path.join(args.dir + '/', 'images.scp') +image_fh = open(image_file, 'w+') + +text_file = os.path.join(args.dir + '/', 'text') +text_fh = open(text_file, 'w+') + +utt2spk_file = os.path.join(args.dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w+') + +print('generate new files') +feats_scp_file = os.path.join(args.dir, 'feats.scp') +with open(feats_scp_file) as feats_scp_fh: + for uttID_image_path in feats_scp_fh: + uttID_image_path = uttID_image_path.strip() + uttID_path_vect = uttID_image_path.split(" ") + uttID = uttID_path_vect[0] + imageID = uttID.split("_")[1] + text_fh.write(uttID + ' ' + text_dict[imageID] + '\n') + utt2spk_fh.write(uttID + ' ' + uttID_spk_dict[imageID] + '\n') + image_fh.write(uttID + ' ' + uttID_path_dict[imageID] + '\n') diff --git a/egs/iam/s5/local/process_data.py b/egs/iam/s5/local/process_data.py index f9838d34563..ca954abac50 100755 --- a/egs/iam/s5/local/process_data.py +++ b/egs/iam/s5/local/process_data.py @@ -8,19 +8,19 @@ import xml.dom.minidom as minidom parser = argparse.ArgumentParser(description="""Creates text utt2spk - and image file """) + and image file """) parser.add_argument('database_path', type=str, help='path to downloaded iam data') parser.add_argument('out_dir', type=str, help='where to write output files') +parser.add_argument('--model_type', type=str, default='word', + choices=['word', 'character'], + help='word model or character model') parser.add_argument('dataset_dir', type=str, help='directory containing dataset') parser.add_argument('--dataset', type=str, default='new_trainset', - choices=['new_trainset', 'new_testset','new_valset'], - help='choose new_trainset, testset') -parser.add_argument('--model_type', type=str,default='word', - choices=['word', 'character'], - help='word model or character model') + choices=['new_trainset', 'new_testset', 'new_valset'], + help='choose new_trainset, new_testset, new_valset') args = parser.parse_args() ### main ### @@ -37,61 +37,65 @@ args.dataset + '.txt') text_file_path = os.path.join(args.database_path, - 'ascii','lines.txt') + 'ascii', 'lines.txt') text_dict = {} + + def process_text_file_for_word_model(): - with open (text_file_path, 'rt') as in_file: - for line in in_file: - if line[0]=='#': - continue - line = line.strip() - line_vect = line.split(' ') - text_vect = line.split(' ')[8:] - text = "".join(text_vect) - text = text.replace("|", " ") - text_dict[line_vect[0]] = text + with open(text_file_path, 'rt') as in_file: + for line in in_file: + if line[0] == '#': + continue + line = line.strip() + line_vect = line.split(' ') + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[line_vect[0]] = text + def process_text_file_for_char_model(): - with open (text_file_path, 'rt') as in_file: - for line in in_file: - if line[0]=='#': - continue - line = line.strip() - line_vect = line.split(' ') - text_vect = line.split(' ')[8:] - text = "".join(text_vect) - characters = list(text) - spaced_characters = " ".join(characters) - spaced_characters = spaced_characters.replace("|", "SIL") - spaced_characters = "SIL " + spaced_characters - spaced_characters = spaced_characters + " SIL" - text_dict[line_vect[0]] = spaced_characters + with open(text_file_path, 'rt') as in_file: + for line in in_file: + if line[0] == '#': + continue + line = line.strip() + line_vect = line.split(' ') + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + characters = list(text) + spaced_characters = " ".join(characters) + spaced_characters = spaced_characters.replace("|", "SIL") + spaced_characters = "SIL " + spaced_characters + spaced_characters = spaced_characters + " SIL" + text_dict[line_vect[0]] = spaced_characters -if args.model_type=='word': - print 'processing word model' - process_text_file_for_word_model() +if args.model_type == 'word': + print 'processing word model' + process_text_file_for_word_model() else: - print 'processing char model' - process_text_file_for_char_model() + print 'processing char model' + process_text_file_for_char_model() with open(dataset_path) as f: - for line in f: - line = line.strip() - line_vect = line.split('-') - xml_file = line_vect[0] + '-' + line_vect[1] - xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') - img_num = line[-3:] - doc = minidom.parse(xml_path) + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) - form_elements = doc.getElementsByTagName('form')[0] - writer_id = form_elements.getAttribute('writer-id') - outerfolder = form_elements.getAttribute('id')[0:3] - innerfolder = form_elements.getAttribute('id') - lines_path = os.path.join(args.database_path, 'lines', outerfolder, innerfolder, innerfolder) - image_file_path = lines_path + img_num + '.png' - text = text_dict[line] - utt_id = writer_id + '_' + line - text_fh.write(utt_id + ' ' + text + '\n') - utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') - image_fh.write(utt_id + ' ' + image_file_path + '\n') + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join( + args.database_path, 'lines', outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/s5/local/unk_arc_post_to_transcription.py b/egs/iam/s5/local/unk_arc_post_to_transcription.py index c27bf226cf9..ee09db54702 100755 --- a/egs/iam/s5/local/unk_arc_post_to_transcription.py +++ b/egs/iam/s5/local/unk_arc_post_to_transcription.py @@ -5,82 +5,85 @@ import sys import numpy as np from scipy import misc -parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") +parser = argparse.ArgumentParser( + description="""uses phones to convert unk to word""") parser.add_argument('phones', type=str, help='phones and phonesID') parser.add_argument('words', type=str, help='word and wordID') parser.add_argument('unk', type=str, default='-', help='location of unk file') -parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +parser.add_argument('--input-ark', type=str, default='-', + help='where to read the input data') +parser.add_argument('--out-ark', type=str, default='-', + help='where to write the output data') args = parser.parse_args() ### main ### phone_fh = open(args.phones, 'r') word_fh = open(args.words, 'r') -unk_fh = open(args.unk,'r') +unk_fh = open(args.unk, 'r') if args.input_ark == '-': input_fh = sys.stdin else: - input_fh = open(args.input_ark,'r') + input_fh = open(args.input_ark, 'r') if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = open(args.out_ark, 'wb') -phone_dict = dict()# stores phoneID and phone mapping +phone_dict = dict() # stores phoneID and phone mapping phone_data_vect = phone_fh.read().strip().split("\n") for key_val in phone_data_vect: - key_val = key_val.split(" ") - phone_dict[key_val[1]] = key_val[0] + key_val = key_val.split(" ") + phone_dict[key_val[1]] = key_val[0] word_dict = dict() word_data_vect = word_fh.read().strip().split("\n") for key_val in word_data_vect: - key_val = key_val.split(" ") - word_dict[key_val[1]] = key_val[0] + key_val = key_val.split(" ") + word_dict[key_val[1]] = key_val[0] unk_val = unk_fh.read().strip().split(" ")[0] utt_word_dict = dict() -utt_phone_dict = dict()# stores utteranceID and phoneID +utt_phone_dict = dict() # stores utteranceID and phoneID unk_word_dict = dict() -count=0 +count = 0 for line in input_fh: - line_vect = line.strip().split("\t") - if len(line_vect) < 6: - print "IndexError" - print line_vect - continue - uttID = line_vect[0] - word = line_vect[4] - phones = line_vect[5] - if uttID in utt_word_dict.keys(): - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - else: - count = 0 - utt_word_dict[uttID] = dict() - utt_phone_dict[uttID] = dict() - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - if word == unk_val: # get character sequence for unk - phone_key_vect = phones.split(" ") - phone_val_vect = list() - for pkey in phone_key_vect: - phone_val_vect.append(phone_dict[pkey]) - phone_2_word = list() - for phone_val in phone_val_vect: - phone_2_word.append(phone_val.split('_')[0]) - phone_2_word = ''.join(phone_2_word) - utt_word_dict[uttID][count] = phone_2_word - else: - if word == '0': - word_val = ' ' + line_vect = line.strip().split("\t") + if len(line_vect) < 6: + print "IndexError" + print line_vect + continue + uttID = line_vect[0] + word = line_vect[4] + phones = line_vect[5] + if uttID in utt_word_dict.keys(): + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones else: - word_val = word_dict[word] - utt_word_dict[uttID][count] = word_val - count += 1 + count = 0 + utt_word_dict[uttID] = dict() + utt_phone_dict[uttID] = dict() + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + if word == unk_val: # get character sequence for unk + phone_key_vect = phones.split(" ") + phone_val_vect = list() + for pkey in phone_key_vect: + phone_val_vect.append(phone_dict[pkey]) + phone_2_word = list() + for phone_val in phone_val_vect: + phone_2_word.append(phone_val.split('_')[0]) + phone_2_word = ''.join(phone_2_word) + utt_word_dict[uttID][count] = phone_2_word + else: + if word == '0': + word_val = ' ' + else: + word_val = word_dict[word] + utt_word_dict[uttID][count] = word_val + count += 1 transcription = "" for key in sorted(utt_word_dict.iterkeys()): - transcription = key - for index in sorted(utt_word_dict[key].iterkeys()): - value = utt_word_dict[key][index] - transcription = transcription + " " + value - out_fh.write(transcription + '\n') + transcription = key + for index in sorted(utt_word_dict[key].iterkeys()): + value = utt_word_dict[key][index] + transcription = transcription + " " + value + out_fh.write(transcription + '\n') diff --git a/egs/iam/s5/run.sh b/egs/iam/s5/run.sh index d1eeda7e0d1..27b72ff1fff 100755 --- a/egs/iam/s5/run.sh +++ b/egs/iam/s5/run.sh @@ -5,37 +5,28 @@ nj=20 color=1 data_dir=data exp_dir=exp -augment=false +augment=true + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . utils/parse_options.sh # e.g. this parses the --stage option if supplied. if [ $stage -le 0 ]; then + # data preparation local/prepare_data.sh --nj $nj --dir $data_dir fi -mkdir -p $data_dir/{train,test}/data if [ $stage -le 1 ]; then - local/make_feature_vect.py $data_dir/test --scale-size 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:$data_dir/test/data/images.ark,$data_dir/test/feats.scp || exit 1 - steps/compute_cmvn_stats.sh $data_dir/test || exit 1; - - if [ $augment = true ]; then - # create a backup directory to store text, utt2spk and image.scp file - mkdir -p $data_dir/train/backup - mv $data_dir/train/text $data_dir/train/utt2spk $data_dir/train/images.scp $data_dir/train/backup/ - local/augment_and_make_feature_vect.py $data_dir/train --scale-size 40 --vertical-shift 10 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:$data_dir/train/data/images.ark,$data_dir/train/feats.scp || exit 1 - utils/utt2spk_to_spk2utt.pl $data_dir/train/utt2spk > $data_dir/train/spk2utt - else - local/make_feature_vect.py $data_dir/train --scale-size 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:$data_dir/train/data/images.ark,$data_dir/train/feats.scp || exit 1 - fi - steps/compute_cmvn_stats.sh $data_dir/train || exit 1; + for f in test; do + local/extract_feature.sh --nj $nj --cmd $cmd \ + --scale_size 40 \ + --augment $augment \ + $data_dir/$f + + steps/compute_cmvn_stats.sh $data_dir/$f || exit 1; + done fi +exit 0 numSilStates=4 numStates=8 @@ -149,8 +140,11 @@ if [ $stage -le 12 ]; then $exp_dir/tri3_ali fi + +affix=_aug +nnet3_affix=_shear + affix=_1a -nnet3_affix= if [ $stage -le 13 ]; then local/chain/run_cnn_1a.sh --stage 0 \ --gmm tri3 \ From b154f3b6713fd7ceabc4f6f54ac0502671da81fd Mon Sep 17 00:00:00 2001 From: YiwenShaoStephen Date: Wed, 1 Nov 2017 20:03:42 -0400 Subject: [PATCH 2/3] bug fixed in run.sh --- egs/iam/s5/local/make_feature_vect.py | 8 ++++---- egs/iam/s5/local/prepare_lexicon.py | 4 ++-- egs/iam/s5/run.sh | 15 ++++++++------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/egs/iam/s5/local/make_feature_vect.py b/egs/iam/s5/local/make_feature_vect.py index 697f9d92b86..8d00b56e8a5 100755 --- a/egs/iam/s5/local/make_feature_vect.py +++ b/egs/iam/s5/local/make_feature_vect.py @@ -11,10 +11,10 @@ from signal import signal, SIGPIPE, SIG_DFL signal(SIGPIPE, SIG_DFL) -parser = argparse.ArgumentParser( - description="""Generates and saves the feature vectors""") -parser.add_argument( - 'dir', type=str, help='directory of images.scp and is also output directory') +parser = argparse.ArgumentParser(description="""Generates and saves + the feature vectors""") +parser.add_argument('dir', type=str, + help='directory of images.scp and is also output directory') parser.add_argument('--job', type=str, default='', help='JOB number of images.JOB.scp') parser.add_argument('--out-ark', type=str, default='-', diff --git a/egs/iam/s5/local/prepare_lexicon.py b/egs/iam/s5/local/prepare_lexicon.py index 6a31e635a18..5844f809e1d 100755 --- a/egs/iam/s5/local/prepare_lexicon.py +++ b/egs/iam/s5/local/prepare_lexicon.py @@ -4,8 +4,8 @@ import os import sys -parser = argparse.ArgumentParser( - description="""Creates the list of characters and wor ds in lexicon""") +parser = argparse.ArgumentParser(description="""Creates the list of characters + and words in lexicon""") parser.add_argument('database_path', type=str, help='path to train text file') parser.add_argument('test_text', type=str, help='path to test text file to include it in lexicon') diff --git a/egs/iam/s5/run.sh b/egs/iam/s5/run.sh index 27b72ff1fff..661ef97517f 100755 --- a/egs/iam/s5/run.sh +++ b/egs/iam/s5/run.sh @@ -5,7 +5,7 @@ nj=20 color=1 data_dir=data exp_dir=exp -augment=true +augment=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -16,17 +16,18 @@ if [ $stage -le 0 ]; then local/prepare_data.sh --nj $nj --dir $data_dir fi + if [ $stage -le 1 ]; then - for f in test; do + for f in train test; do + mkdir -p $data_dir/$f/data local/extract_feature.sh --nj $nj --cmd $cmd \ - --scale_size 40 \ - --augment $augment \ + --scale_size 40 \ + --augment $augment \ $data_dir/$f steps/compute_cmvn_stats.sh $data_dir/$f || exit 1; done fi -exit 0 numSilStates=4 numStates=8 @@ -141,8 +142,8 @@ if [ $stage -le 12 ]; then fi -affix=_aug -nnet3_affix=_shear +affix=_1a +nnet3_affix= affix=_1a if [ $stage -le 13 ]; then From 0f77a74150a7fa17bc92d50c1144123d2e9daa51 Mon Sep 17 00:00:00 2001 From: YiwenShaoStephen Date: Wed, 1 Nov 2017 20:14:23 -0400 Subject: [PATCH 3/3] fix bugs in make_feature_vect.py --- egs/iam/s5/local/make_feature_vect.py | 5 ++--- egs/iam/s5/run.sh | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/egs/iam/s5/local/make_feature_vect.py b/egs/iam/s5/local/make_feature_vect.py index 8d00b56e8a5..ecf4da02462 100755 --- a/egs/iam/s5/local/make_feature_vect.py +++ b/egs/iam/s5/local/make_feature_vect.py @@ -211,7 +211,6 @@ def image_augment(im, out_fh, image_id): # im_shift = vertical_shift(im, shift_setting[i]) # data = np.transpose(im_shift, (1, 0)) # data = np.divide(data, 255.0) - # new_scp_list.append(image_id + '_shift' + str(i + 1)) # write_kaldi_matrix(out_fh, data, image_shift_id[i]) @@ -250,10 +249,10 @@ def image_augment(im, out_fh, image_id): image_path = line_vect[1] im = misc.imread(image_path) im_scaled = get_scaled_image(im) - im_contrast = contrast_normalization(im_scaled, 0.05, 0.2) + # im_contrast = contrast_normalization(im_scaled, 0.05, 0.2) # slant_degree = find_slant_project(im_contrast) # im_sheared = horizontal_shear(im_contrast, slant_degree) # im_padded = vertical_shift(im_scaled, 10) - data = np.transpose(im_contrast, (1, 0)) + data = np.transpose(im_scaled, (1, 0)) data = np.divide(data, 255.0) write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/iam/s5/run.sh b/egs/iam/s5/run.sh index 661ef97517f..6c94f732c86 100755 --- a/egs/iam/s5/run.sh +++ b/egs/iam/s5/run.sh @@ -141,11 +141,8 @@ if [ $stage -le 12 ]; then $exp_dir/tri3_ali fi - affix=_1a nnet3_affix= - -affix=_1a if [ $stage -le 13 ]; then local/chain/run_cnn_1a.sh --stage 0 \ --gmm tri3 \