diff --git a/apply_model.py b/apply_model.py index 18de12e..e646fa5 100644 --- a/apply_model.py +++ b/apply_model.py @@ -136,7 +136,7 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ #read in bedfile, grab reads from valid chromosomes chrom='' - reader = pd.read_csv(f, usecols=[0, 1, 2, 3, me_col], names=['chrom', 'start', 'end', 'rid', 'me'], sep='\t', comment='#', chunksize=chunk_size) + reader = pd.read_csv(f, usecols=[0, 1, 2, 3, 5, me_col], names=['chrom', 'start', 'end', 'rid', 'strand', 'me'], sep='\t', comment='#', chunksize=chunk_size) with tqdm(total=len(chromlist), leave=True) as pbar: i=-1 @@ -161,13 +161,15 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ no_me_b12['blockCount'] = 1 no_me_b12['blockStarts'] = 1 no_me_b12['blockSizes'] = no_me_b12['end'] - no_me_b12['start'] + no_me_b12['score'] = '.' if not circle: no_me_b12['blockSizes'] = no_me_b12['end'] - no_me_b12['start'] else: no_me_b12['blockSizes'] = no_me_b12['end']*3 - no_me_b12['start'] - - no_me_b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockSizes', 'blockStarts'] + + no_me_b12 = no_me_b12.rename(columns={'rid': 'name'}) + no_me_b12 = no_me_b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] chrom = chunk['chrom'].iloc[0] #generate bed12 for reads with methylation @@ -176,7 +178,8 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ b12['thickStart'] = b12['start'] b12['thickEnd'] = b12['end'] b12['itemRgb'] = '255,0,0' - + b12['score'] = '.' + #grab methylations chunk = chunk['me'].str.split(pat=',', expand=True) @@ -223,7 +226,7 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ #combine, sort bed12s b12 = b12.rename(columns={'rid': 'name'}) - b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockStarts', 'blockSizes'] + b12 = b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] b12 = pd.concat([b12, no_me_b12]) b12 = b12.sort_values(by=['chrom', 'start']) @@ -283,4 +286,4 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ fout.write(fin.read()) os.remove(tmp_file) -os.rmdir(tmp_dir) \ No newline at end of file +os.rmdir(tmp_dir) diff --git a/apply_model_multiprocess.py b/apply_model_multiprocess.py index c69ab5c..c68bdfd 100644 --- a/apply_model_multiprocess.py +++ b/apply_model_multiprocess.py @@ -135,12 +135,16 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si no_me_b12['itemRgb'] = '255,0,0' no_me_b12['blockCount'] = 1 no_me_b12['blockStarts'] = 1 + no_me_b12['score'] = '.' + if not circle: no_me_b12['blockSizes'] = no_me_b12['end'] - no_me_b12['start'] else: no_me_b12['blockSizes'] = no_me_b12['end']*3 - no_me_b12['start'] - no_me_b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockSizes', 'blockStarts'] + no_me_b12 = no_me_b12.rename(columns={'rid': 'name'}) + no_me_b12 = no_me_b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] + chrom = chunk['chrom'].iloc[0] else: no_me_b12=pd.DataFrame() @@ -157,6 +161,7 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si b12['thickStart'] = b12['start'] b12['thickEnd'] = b12['end'] b12['itemRgb'] = '255,0,0' + b12['score'] = '.' # Grab methylations chunk = chunk['me'].str.split(pat=',', expand=True) @@ -199,7 +204,7 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si # Combine, sort bed12s b12 = b12.rename(columns={'rid': 'name'}) - b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockStarts', 'blockSizes'] + b12 = b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] b12 = pd.concat([b12, no_me_b12]) b12 = b12.sort_values(by=['chrom', 'start']) @@ -243,9 +248,9 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ try: # read in fibertools output bedfile in chunks if min_me > 0: - reader = pd.read_csv(f, usecols=[0, 1, 2, 3, 13, 14, me_col], names=['chrom', 'start', 'end', 'rid', 'at_ct','me_ct','me'], sep='\t', comment='#', chunksize=chunk_size) + reader = pd.read_csv(f, usecols=[0, 1, 2, 3, 5, 13, 14, me_col], names=['chrom', 'start', 'end', 'rid', 'strand', 'at_ct','me_ct','me'], sep='\t', comment='#', chunksize=chunk_size) else: - reader = pd.read_csv(f, usecols=[0, 1, 2, 3, me_col], names=['chrom', 'start', 'end', 'rid', 'me'], sep='\t', comment='#', chunksize=chunk_size) + reader = pd.read_csv(f, usecols=[0, 1, 2, 3, 5, me_col], names=['chrom', 'start', 'end', 'rid', 'strand', 'me'], sep='\t', comment='#', chunksize=chunk_size) #assign each chunk to a pool with Pool(core_count) as pool: for i, chunk in enumerate(reader): @@ -322,4 +327,5 @@ def combine_temp_files(chromlist, tmp_dir, outdir, dataset): #this consistently fails on my tests because of permissions, but it's not a huge issue #os.rmdir(tmp_dir) -logging.info("Temporary directory removed and script completed.") \ No newline at end of file +logging.info("Temporary directory removed and script completed.") +