From 8adc972acf3b5a4b8ab917eb7bde3b488fab3978 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Sat, 16 Dec 2017 08:47:31 -0600 Subject: [PATCH 1/8] closes split CSV files --- __init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/__init__.py b/__init__.py index e69ab3c..3574909 100644 --- a/__init__.py +++ b/__init__.py @@ -79,19 +79,25 @@ def csvsplit(reader, max_size, encoding, tmp_dir): writer = None current_size = 0 split_filenames = [] + fout = None # break CSV file into smaller merge files for row in reader: if not writer: filename = os.path.join(tmp_dir, 'split{}.csv'.format(len(split_filenames))) - writer = csv.writer(open(filename, 'w', newline='\n', encoding=encoding)) + fout = open(filename, 'w', newline='\n', encoding=encoding) + writer = csv.writer(fout) split_filenames.append(filename) writer.writerow(row) current_size += sys.getsizeof(row) if current_size > max_size: writer = None + fout.close() current_size = 0 + + if not fout.closed: + fout.close() return split_filenames From c39d96aebd9f331203e1edc3432ac14f4827a2ef Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Dec 2017 09:26:46 -0600 Subject: [PATCH 2/8] ignores vim .swp files --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c5236de..4270fed 100644 --- a/.gitignore +++ b/.gitignore @@ -92,4 +92,7 @@ ENV/ # Rope project settings .ropeproject -.idea \ No newline at end of file +.idea + +# Vim swp files +*.swp From d8a1513e69902fbd50b221d0f8fbcf1081bb3c01 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Dec 2017 11:23:28 -0600 Subject: [PATCH 3/8] refactors project to add unit tests --- .travis.yml | 14 +++++ __init__.py | 139 +-------------------------------------------- csvsorter.py | 140 ++++++++++++++++++++++++++++++++++++++++++++++ tests/__init__.py | 0 tests/context.py | 11 ++++ tests/test.py | 72 ++++++++++++++++++++++++ 6 files changed, 239 insertions(+), 137 deletions(-) create mode 100644 .travis.yml create mode 100644 csvsorter.py create mode 100644 tests/__init__.py create mode 100644 tests/context.py create mode 100644 tests/test.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..d80ea52 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,14 @@ +dist: trusty +sudo: false +language: python +python: + - "3.2" + - "3.3" + - "3.4" + - "3.5" + - "3.6" + - "nightly" + +script: + - python -m unittest + diff --git a/__init__.py b/__init__.py index 3574909..83cd173 100644 --- a/__init__.py +++ b/__init__.py @@ -1,144 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import os, sys, csv, heapq, shutil from optparse import OptionParser - -class CsvSortError(Exception): - pass - - -def csvsort(input_filename, columns, output_filename='', max_size=100, has_header=True, delimiter=',', quoting=csv.QUOTE_MINIMAL, encoding='utf-8'): - """Sort the CSV file on disk rather than in memory - The merge sort algorithm is used to break the file into smaller sub files and - - :param input_filename: the CSV filename to sort - :param columns: a list of column to sort on (can be 0 based indices or header keys) - :param output_filename: optional filename for sorted file. If not given then input file will be overriden. - :param max_size: the maximum size (in MB) of CSV file to load in memory at once - :param has_header: whether the CSV contains a header to keep separated from sorting - :param delimiter: character used to separate fields, default ',' - :param quoting: type of quoting used in the output - :param encoding: file encoding used in input/output files - """ - tmp_dir = '.csvsorter.{}'.format(os.getpid()) - os.makedirs(tmp_dir, exist_ok=True) - - try: - with open(input_filename, 'r', encoding=encoding) as input_fp: - reader = csv.reader(input_fp, delimiter=delimiter) - if has_header: - header = next(reader) - else: - header = None - - columns = parse_columns(columns, header) - - filenames = csvsplit(reader, max_size, encoding, tmp_dir) - for filename in filenames: - memorysort(filename, columns, encoding) - sorted_filename = mergesort(filenames, columns, tmp_dir=tmp_dir, encoding=encoding) - - # XXX make more efficient by passing quoting, delimiter, and moving result - # generate the final output file - with open(output_filename or input_filename, 'w', newline='\n', encoding=encoding) as output_fp: - writer = csv.writer(output_fp, delimiter=delimiter, quoting=quoting) - if header: - writer.writerow(header) - with open(sorted_filename, 'r', encoding=encoding) as sorted_fp: - rows = csv.reader(sorted_fp) - writer.writerows(rows) - finally: - shutil.rmtree(tmp_dir, ignore_errors=True) - - -def parse_columns(columns, header): - """check the provided column headers - """ - for i, column in enumerate(columns): - if isinstance(column, int): - if (header and column >= len(header)): - raise CsvSortError('Column index is out of range: "{}"'.format(column)) - else: - # find index of column from header - if header: - if column in header: - columns[i] = header.index(column) - else: - raise CsvSortError('Column name is not found in header: "{}"'.format(column)) - else: - raise CsvSortError('CSV needs a header to find index of this column name: "{}"'.format(column)) - return columns - - -def csvsplit(reader, max_size, encoding, tmp_dir): - """Split into smaller CSV files of maximum size and return the list of filenames - """ - max_size = max_size * 1024 * 1024 # convert to bytes - writer = None - current_size = 0 - split_filenames = [] - fout = None - - # break CSV file into smaller merge files - for row in reader: - if not writer: - filename = os.path.join(tmp_dir, 'split{}.csv'.format(len(split_filenames))) - fout = open(filename, 'w', newline='\n', encoding=encoding) - writer = csv.writer(fout) - split_filenames.append(filename) - - writer.writerow(row) - current_size += sys.getsizeof(row) - if current_size > max_size: - writer = None - fout.close() - current_size = 0 - - if not fout.closed: - fout.close() - return split_filenames - - -def memorysort(filename, columns, encoding): - """Sort this CSV file in memory on the given columns - """ - with open(filename, encoding=encoding) as input_fp: - rows = list(csv.reader(input_fp)) - rows.sort(key=lambda row: [row[column] for column in columns]) - with open(filename, 'w', newline='\n', encoding=encoding) as output_fp: - writer = csv.writer(output_fp) - writer.writerows(rows) - - -def yield_csv_rows(filename, columns, encoding): - """Iterator to sort CSV rows - """ - with open(filename, 'r', encoding=encoding) as fp: - for row in csv.reader(fp): - yield row - - -def mergesort(sorted_filenames, columns, nway=2, tmp_dir='', encoding='utf-8'): - """Merge these 2 sorted csv files into a single output file - """ - merge_n = 0 - while len(sorted_filenames) > 1: - merge_filenames, sorted_filenames = sorted_filenames[:nway], sorted_filenames[nway:] - - output_filename = os.path.join(tmp_dir, 'merge{}.csv'.format(merge_n)) - with open(output_filename, 'w', newline='\n', encoding=encoding) as output_fp: - writer = csv.writer(output_fp) - merge_n += 1 - rows = (yield_csv_rows(filename, columns, encoding) for filename in merge_filenames) - writer.writerows(heapq.merge(*rows)) - sorted_filenames.append(output_filename) - - for filename in merge_filenames: - os.remove(filename) - return sorted_filenames[0] - +import csvsorter def main(): parser = OptionParser() @@ -156,7 +21,7 @@ def main(): else: # escape backslashes args.columns = [int(column) if column.isdigit() else column for column in args.columns] - csvsort(input_files[0], columns=args.columns, max_size=args.max_size, has_header=args.has_header, delimiter=args.delimiter, encoding=args.encoding) + csvsorter.csvsort(input_files[0], columns=args.columns, max_size=args.max_size, has_header=args.has_header, delimiter=args.delimiter, encoding=args.encoding) if __name__ == '__main__': diff --git a/csvsorter.py b/csvsorter.py new file mode 100644 index 0000000..83e94fc --- /dev/null +++ b/csvsorter.py @@ -0,0 +1,140 @@ + + +import os, sys, csv, heapq, shutil + +class CsvSortError(Exception): + pass + + +def csvsort(input_filename, columns, output_filename='', max_size=100, has_header=True, delimiter=',', quoting=csv.QUOTE_MINIMAL, encoding='utf-8'): + """Sort the CSV file on disk rather than in memory + The merge sort algorithm is used to break the file into smaller sub files and + + :param input_filename: the CSV filename to sort + :param columns: a list of column to sort on (can be 0 based indices or header keys) + :param output_filename: optional filename for sorted file. If not given then input file will be overriden. + :param max_size: the maximum size (in MB) of CSV file to load in memory at once + :param has_header: whether the CSV contains a header to keep separated from sorting + :param delimiter: character used to separate fields, default ',' + :param quoting: type of quoting used in the output + :param encoding: file encoding used in input/output files + """ + tmp_dir = '.csvsorter.{}'.format(os.getpid()) + os.makedirs(tmp_dir, exist_ok=True) + + try: + with open(input_filename, 'r', encoding=encoding) as input_fp: + reader = csv.reader(input_fp, delimiter=delimiter) + if has_header: + header = next(reader) + else: + header = None + + columns = parse_columns(columns, header) + + filenames = csvsplit(reader, max_size, encoding, tmp_dir) + for filename in filenames: + memorysort(filename, columns, encoding) + sorted_filename = mergesort(filenames, columns, tmp_dir=tmp_dir, encoding=encoding) + + # XXX make more efficient by passing quoting, delimiter, and moving result + # generate the final output file + with open(output_filename or input_filename, 'w', newline='\n', encoding=encoding) as output_fp: + writer = csv.writer(output_fp, delimiter=delimiter, quoting=quoting) + if header: + writer.writerow(header) + with open(sorted_filename, 'r', encoding=encoding) as sorted_fp: + rows = csv.reader(sorted_fp) + writer.writerows(rows) + finally: + shutil.rmtree(tmp_dir, ignore_errors=True) + + +def parse_columns(columns, header): + """check the provided column headers + """ + for i, column in enumerate(columns): + if isinstance(column, int): + if (header and column >= len(header)): + raise CsvSortError('Column index is out of range: "{}"'.format(column)) + else: + # find index of column from header + if header: + if column in header: + columns[i] = header.index(column) + else: + raise CsvSortError('Column name is not found in header: "{}"'.format(column)) + else: + raise CsvSortError('CSV needs a header to find index of this column name: "{}"'.format(column)) + return columns + + +def csvsplit(reader, max_size, encoding, tmp_dir): + """Split into smaller CSV files of maximum size and return the list of filenames + """ + max_size = max_size * 1024 * 1024 # convert to bytes + writer = None + current_size = 0 + split_filenames = [] + fout = None + + # break CSV file into smaller merge files + for row in reader: + if not writer: + filename = os.path.join(tmp_dir, 'split{}.csv'.format(len(split_filenames))) + fout = open(filename, 'w', newline='\n', encoding=encoding) + writer = csv.writer(fout) + split_filenames.append(filename) + + writer.writerow(row) + current_size += sys.getsizeof(row) + if current_size > max_size: + writer = None + fout.close() + current_size = 0 + + if not fout.closed: + fout.close() + return split_filenames + + +def memorysort(filename, columns, encoding): + """Sort this CSV file in memory on the given columns + """ + with open(filename, encoding=encoding) as input_fp: + rows = list(csv.reader(input_fp)) + rows.sort(key=lambda row: [row[column] for column in columns]) + with open(filename, 'w', newline='\n', encoding=encoding) as output_fp: + writer = csv.writer(output_fp) + writer.writerows(rows) + + +def yield_csv_rows(filename, columns, encoding): + """Iterator to sort CSV rows + """ + with open(filename, 'r', encoding=encoding) as fp: + for row in csv.reader(fp): + yield row + + +def mergesort(sorted_filenames, columns, nway=2, tmp_dir='', encoding='utf-8'): + """Merge these 2 sorted csv files into a single output file + """ + merge_n = 0 + while len(sorted_filenames) > 1: + merge_filenames, sorted_filenames = sorted_filenames[:nway], sorted_filenames[nway:] + + output_filename = os.path.join(tmp_dir, 'merge{}.csv'.format(merge_n)) + with open(output_filename, 'w', newline='\n', encoding=encoding) as output_fp: + writer = csv.writer(output_fp) + merge_n += 1 + rows = (yield_csv_rows(filename, columns, encoding) for filename in merge_filenames) + keyfunc = lambda row: [row[column] for column in columns] + writer.writerows(heapq.merge(*rows, key=keyfunc)) + sorted_filenames.append(output_filename) + + for filename in merge_filenames: + os.remove(filename) + return sorted_filenames[0] + + diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/context.py b/tests/context.py new file mode 100644 index 0000000..d86d6f3 --- /dev/null +++ b/tests/context.py @@ -0,0 +1,11 @@ + +# +# Inserts module at beginning of sys.path in order to test source code, not +# installed package. +# + +import os +import sys +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +import csvsorter diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..cafa1b3 --- /dev/null +++ b/tests/test.py @@ -0,0 +1,72 @@ + +import os +import unittest + +from .context import csvsorter + +import uuid + +class TestCSVSorter(unittest.TestCase): + + def setUp(self): + # Create CSV file 5MB in size, reverse sorted + self.tmp_name = str(uuid.uuid4().hex) + '.csv' + self.num_lines = 200000 + with open(self.tmp_name, 'w') as fout: + for line in range(self.num_lines, 0, -1): + print('{},{},{},{}'.format(line+4, line+3, line+2, line+1), file=fout) + + def tearDown(self): + os.remove(self.tmp_name) + + + def check_file_sorted(self): + with open(self.tmp_name, 'r') as fin: + prev_line = fin.readline() + for line in fin: + self.assertTrue(prev_line < line) + prev_line = line + + def check_col_sorted(self, col): + with open(self.tmp_name, 'r') as fin: + prev_line = fin.readline().split(',') + for line in fin: + line = line.split(',') + self.assertTrue(prev_line[col] <= line[col]) + prev_line = line + + + def test_memorysort_allcols(self): + csvsorter.memorysort(self.tmp_name, [0,1,2,3], 'utf-8') + self.check_file_sorted() + + def test_memorysort_onecol(self): + csvsorter.memorysort(self.tmp_name, [3], 'utf-8') + self.check_col_sorted(3) + + def test_csvsort(self): + # sort and force merges + csvsorter.csvsort(self.tmp_name, [0,1,2,3], max_size=1, has_header=False) + self.check_file_sorted() + + # make sure all the lines are here after merging + linecount = 0 + with open(self.tmp_name, 'r') as fin: + for line in fin: + linecount += 1 + self.assertEqual(self.num_lines, linecount) + + def test_csvsort_onecol(self): + # sort and force merges + csvsorter.csvsort(self.tmp_name, [3], max_size=1, has_header=False) + self.check_col_sorted(3) + + # make sure all the lines are here after merging + linecount = 0 + with open(self.tmp_name, 'r') as fin: + for line in fin: + linecount += 1 + self.assertEqual(self.num_lines, linecount) + + + From 364a210f41a708bd33731c274c4324ee7506aae1 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Dec 2017 11:27:45 -0600 Subject: [PATCH 4/8] better col_sorted check --- tests/test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test.py b/tests/test.py index cafa1b3..7368d12 100644 --- a/tests/test.py +++ b/tests/test.py @@ -29,11 +29,11 @@ def check_file_sorted(self): def check_col_sorted(self, col): with open(self.tmp_name, 'r') as fin: - prev_line = fin.readline().split(',') - for line in fin: - line = line.split(',') - self.assertTrue(prev_line[col] <= line[col]) - prev_line = line + sorted_lines = fin.readlines() + gold = sorted(sorted_lines, key=lambda x : x.split(',')[col]) + + for x in range(len(sorted_lines)): + self.assertEqual(sorted_lines[x], gold[x]) def test_memorysort_allcols(self): From f676fe99d8f908b42a107c01c7e3fe5dc57f7870 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Dec 2017 11:29:24 -0600 Subject: [PATCH 5/8] python versions -- heap.merge() does not support key before 3.5 --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index d80ea52..d002c7a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,6 @@ dist: trusty sudo: false language: python python: - - "3.2" - - "3.3" - - "3.4" - "3.5" - "3.6" - "nightly" From b68dafb26d6eae380192fec76904e157a3983991 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Dec 2017 15:03:06 -0600 Subject: [PATCH 6/8] adds test with CSV header --- tests/test.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 7368d12..44d6588 100644 --- a/tests/test.py +++ b/tests/test.py @@ -27,9 +27,11 @@ def check_file_sorted(self): self.assertTrue(prev_line < line) prev_line = line - def check_col_sorted(self, col): + def check_col_sorted(self, col, skip_header=False): with open(self.tmp_name, 'r') as fin: sorted_lines = fin.readlines() + if skip_header: + sorted_lines.pop(0) gold = sorted(sorted_lines, key=lambda x : x.split(',')[col]) for x in range(len(sorted_lines)): @@ -55,7 +57,7 @@ def test_csvsort(self): for line in fin: linecount += 1 self.assertEqual(self.num_lines, linecount) - + def test_csvsort_onecol(self): # sort and force merges csvsorter.csvsort(self.tmp_name, [3], max_size=1, has_header=False) @@ -67,6 +69,21 @@ def test_csvsort_onecol(self): for line in fin: linecount += 1 self.assertEqual(self.num_lines, linecount) - + + def test_header(self): + # sort and force merges + csvsorter.csvsort(self.tmp_name, [3], max_size=1, has_header=True) + self.check_col_sorted(3, skip_header=True) + + # make sure all the lines are present (header not missing) + with open(self.tmp_name, 'r') as fin: + header = fin.readline() + self.assertEqual(header, '{},{},{},{}\n'.format( + self.num_lines+4, self.num_lines+3, self.num_lines+2, + self.num_lines+1)) + linecount = 1 + for line in fin: + linecount += 1 + self.assertEqual(self.num_lines, linecount) From c19710d6e26f26050752a3bacbf8982a9ef6ec55 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Dec 2017 15:38:53 -0600 Subject: [PATCH 7/8] fixes import --- __init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/__init__.py b/__init__.py index 83cd173..16a5a1a 100644 --- a/__init__.py +++ b/__init__.py @@ -3,7 +3,7 @@ from optparse import OptionParser -import csvsorter +from .csvsorter import csvsort def main(): parser = OptionParser() @@ -21,8 +21,10 @@ def main(): else: # escape backslashes args.columns = [int(column) if column.isdigit() else column for column in args.columns] - csvsorter.csvsort(input_files[0], columns=args.columns, max_size=args.max_size, has_header=args.has_header, delimiter=args.delimiter, encoding=args.encoding) + csvsort(input_files[0], columns=args.columns, max_size=args.max_size, has_header=args.has_header, delimiter=args.delimiter, encoding=args.encoding) if __name__ == '__main__': main() + + From e59efe3a3d17041b971be71f188ac745ac392f0d Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Dec 2017 15:39:25 -0600 Subject: [PATCH 8/8] 4-space indents --- tests/test.py | 140 ++++++++++++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 62 deletions(-) diff --git a/tests/test.py b/tests/test.py index cafa1b3..2043116 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,72 +1,88 @@ import os +import uuid import unittest from .context import csvsorter -import uuid - class TestCSVSorter(unittest.TestCase): - def setUp(self): - # Create CSV file 5MB in size, reverse sorted - self.tmp_name = str(uuid.uuid4().hex) + '.csv' - self.num_lines = 200000 - with open(self.tmp_name, 'w') as fout: - for line in range(self.num_lines, 0, -1): - print('{},{},{},{}'.format(line+4, line+3, line+2, line+1), file=fout) - - def tearDown(self): - os.remove(self.tmp_name) - - - def check_file_sorted(self): - with open(self.tmp_name, 'r') as fin: - prev_line = fin.readline() - for line in fin: - self.assertTrue(prev_line < line) - prev_line = line - - def check_col_sorted(self, col): - with open(self.tmp_name, 'r') as fin: - prev_line = fin.readline().split(',') - for line in fin: - line = line.split(',') - self.assertTrue(prev_line[col] <= line[col]) - prev_line = line - - - def test_memorysort_allcols(self): - csvsorter.memorysort(self.tmp_name, [0,1,2,3], 'utf-8') - self.check_file_sorted() - - def test_memorysort_onecol(self): - csvsorter.memorysort(self.tmp_name, [3], 'utf-8') - self.check_col_sorted(3) - - def test_csvsort(self): - # sort and force merges - csvsorter.csvsort(self.tmp_name, [0,1,2,3], max_size=1, has_header=False) - self.check_file_sorted() - - # make sure all the lines are here after merging - linecount = 0 - with open(self.tmp_name, 'r') as fin: - for line in fin: - linecount += 1 - self.assertEqual(self.num_lines, linecount) - - def test_csvsort_onecol(self): - # sort and force merges - csvsorter.csvsort(self.tmp_name, [3], max_size=1, has_header=False) - self.check_col_sorted(3) - - # make sure all the lines are here after merging - linecount = 0 - with open(self.tmp_name, 'r') as fin: - for line in fin: - linecount += 1 - self.assertEqual(self.num_lines, linecount) - + def setUp(self): + # Create CSV file 5MB in size, reverse sorted + self.tmp_name = str(uuid.uuid4().hex) + '.csv' + self.num_lines = 200000 + with open(self.tmp_name, 'w') as fout: + for line in range(self.num_lines, 0, -1): + print('{},{},{},{}'.format(line+4, line+3, line+2, line+1), file=fout) + + def tearDown(self): + os.remove(self.tmp_name) + + + def check_file_sorted(self): + with open(self.tmp_name, 'r') as fin: + prev_line = fin.readline() + for line in fin: + self.assertTrue(prev_line < line) + prev_line = line + + def check_col_sorted(self, col, skip_header=False): + with open(self.tmp_name, 'r') as fin: + sorted_lines = fin.readlines() + if skip_header: + sorted_lines.pop(0) + gold = sorted(sorted_lines, key=lambda x : x.split(',')[col]) + + for x in range(len(sorted_lines)): + self.assertEqual(sorted_lines[x], gold[x]) + + + def test_memorysort_allcols(self): + csvsorter.memorysort(self.tmp_name, [0,1,2,3], 'utf-8') + self.check_file_sorted() + + def test_memorysort_onecol(self): + csvsorter.memorysort(self.tmp_name, [3], 'utf-8') + self.check_col_sorted(3) + + def test_csvsort(self): + # sort and force merges + csvsorter.csvsort(self.tmp_name, [0,1,2,3], max_size=1, has_header=False) + self.check_file_sorted() + + # make sure all the lines are here after merging + linecount = 0 + with open(self.tmp_name, 'r') as fin: + for line in fin: + linecount += 1 + self.assertEqual(self.num_lines, linecount) + + def test_csvsort_onecol(self): + # sort and force merges + csvsorter.csvsort(self.tmp_name, [3], max_size=1, has_header=False) + self.check_col_sorted(3) + + # make sure all the lines are here after merging + linecount = 0 + with open(self.tmp_name, 'r') as fin: + for line in fin: + linecount += 1 + self.assertEqual(self.num_lines, linecount) + + def test_header(self): + # sort and force merges + csvsorter.csvsort(self.tmp_name, [3], max_size=1, has_header=True) + self.check_col_sorted(3, skip_header=True) + + # make sure all the lines are present (header not missing) + with open(self.tmp_name, 'r') as fin: + header = fin.readline() + self.assertEqual(header, '{},{},{},{}\n'.format( + self.num_lines+4, self.num_lines+3, self.num_lines+2, + self.num_lines+1)) + linecount = 1 + for line in fin: + linecount += 1 + self.assertEqual(self.num_lines, linecount)