From fdce6cde55835d29f363f0ba7f667d9a5d7ca74b Mon Sep 17 00:00:00 2001 From: Edmri Date: Tue, 28 Oct 2025 11:43:04 +0100 Subject: [PATCH 1/3] fix: reformat output bed to respect convention --- apply_model.py | 17 ++++++++++------- apply_model_multiprocess.py | 14 +++++++++----- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/apply_model.py b/apply_model.py index 18de12e..f241e9b 100644 --- a/apply_model.py +++ b/apply_model.py @@ -136,7 +136,7 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ #read in bedfile, grab reads from valid chromosomes chrom='' - reader = pd.read_csv(f, usecols=[0, 1, 2, 3, me_col], names=['chrom', 'start', 'end', 'rid', 'me'], sep='\t', comment='#', chunksize=chunk_size) + reader = pd.read_csv(f, usecols=[0, 1, 2, 3, 5, me_col], names=['chrom', 'start', 'end', 'rid', 'strand', 'me'], sep='\t', comment='#', chunksize=chunk_size) with tqdm(total=len(chromlist), leave=True) as pbar: i=-1 @@ -154,6 +154,7 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ chunk=chunk.loc[~chunk['rid'].isin(train_rids)] #generate bed12 for reads with no methylation (so, all one footprint) + no_me_b12['score'] = '.' no_me_b12 = chunk.loc[chunk['me'] == '.'].drop('me', axis=1) no_me_b12['thickStart'] = no_me_b12['start'] no_me_b12['thickEnd'] = no_me_b12['end'] @@ -161,13 +162,14 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ no_me_b12['blockCount'] = 1 no_me_b12['blockStarts'] = 1 no_me_b12['blockSizes'] = no_me_b12['end'] - no_me_b12['start'] - + if not circle: no_me_b12['blockSizes'] = no_me_b12['end'] - no_me_b12['start'] else: no_me_b12['blockSizes'] = no_me_b12['end']*3 - no_me_b12['start'] - - no_me_b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockSizes', 'blockStarts'] + + no_me_b12 = no_me_b12.rename(columns={'rid': 'name'}) + no_me_b12 = no_me_b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] chrom = chunk['chrom'].iloc[0] #generate bed12 for reads with methylation @@ -176,7 +178,8 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ b12['thickStart'] = b12['start'] b12['thickEnd'] = b12['end'] b12['itemRgb'] = '255,0,0' - + b12['score'] = '.' + #grab methylations chunk = chunk['me'].str.split(pat=',', expand=True) @@ -223,7 +226,7 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ #combine, sort bed12s b12 = b12.rename(columns={'rid': 'name'}) - b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockStarts', 'blockSizes'] + b12 = b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] b12 = pd.concat([b12, no_me_b12]) b12 = b12.sort_values(by=['chrom', 'start']) @@ -283,4 +286,4 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ fout.write(fin.read()) os.remove(tmp_file) -os.rmdir(tmp_dir) \ No newline at end of file +os.rmdir(tmp_dir) diff --git a/apply_model_multiprocess.py b/apply_model_multiprocess.py index c69ab5c..7c3a6fe 100644 --- a/apply_model_multiprocess.py +++ b/apply_model_multiprocess.py @@ -129,6 +129,7 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si if min_me == 0: # Generate bed12 for reads with no methylation + no_me_b12['score'] = '.' no_me_b12 = chunk.loc[chunk['me'] == '.'].drop('me', axis=1) no_me_b12['thickStart'] = no_me_b12['start'] no_me_b12['thickEnd'] = no_me_b12['end'] @@ -140,7 +141,9 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si else: no_me_b12['blockSizes'] = no_me_b12['end']*3 - no_me_b12['start'] - no_me_b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockSizes', 'blockStarts'] + no_me_b12 = no_me_b12.rename(columns={'rid': 'name'}) + no_me_b12 = no_me_b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] + chrom = chunk['chrom'].iloc[0] else: no_me_b12=pd.DataFrame() @@ -157,6 +160,7 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si b12['thickStart'] = b12['start'] b12['thickEnd'] = b12['end'] b12['itemRgb'] = '255,0,0' + b12['score'] = '.' # Grab methylations chunk = chunk['me'].str.split(pat=',', expand=True) @@ -199,7 +203,7 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si # Combine, sort bed12s b12 = b12.rename(columns={'rid': 'name'}) - b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockStarts', 'blockSizes'] + b12 = b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] b12 = pd.concat([b12, no_me_b12]) b12 = b12.sort_values(by=['chrom', 'start']) @@ -243,9 +247,9 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ try: # read in fibertools output bedfile in chunks if min_me > 0: - reader = pd.read_csv(f, usecols=[0, 1, 2, 3, 13, 14, me_col], names=['chrom', 'start', 'end', 'rid', 'at_ct','me_ct','me'], sep='\t', comment='#', chunksize=chunk_size) + reader = pd.read_csv(f, usecols=[0, 1, 2, 3, 5, 13, 14, me_col], names=['chrom', 'start', 'end', 'rid', 'strand', 'at_ct','me_ct','me'], sep='\t', comment='#', chunksize=chunk_size) else: - reader = pd.read_csv(f, usecols=[0, 1, 2, 3, me_col], names=['chrom', 'start', 'end', 'rid', 'me'], sep='\t', comment='#', chunksize=chunk_size) + reader = pd.read_csv(f, usecols=[0, 1, 2, 3, 5, me_col], names=['chrom', 'start', 'end', 'rid', 'strand', 'me'], sep='\t', comment='#', chunksize=chunk_size) #assign each chunk to a pool with Pool(core_count) as pool: for i, chunk in enumerate(reader): @@ -322,4 +326,4 @@ def combine_temp_files(chromlist, tmp_dir, outdir, dataset): #this consistently fails on my tests because of permissions, but it's not a huge issue #os.rmdir(tmp_dir) -logging.info("Temporary directory removed and script completed.") \ No newline at end of file +logging.info("Temporary directory removed and script completed.") From 71547f7620c6f3bfcd89896c2614e58e1aa6b7e7 Mon Sep 17 00:00:00 2001 From: Edmri Date: Tue, 28 Oct 2025 12:25:10 +0100 Subject: [PATCH 2/3] fix: score for no_me_b12 --- apply_model.py | 4 ++-- apply_model_multiprocess.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apply_model.py b/apply_model.py index f241e9b..e646fa5 100644 --- a/apply_model.py +++ b/apply_model.py @@ -154,7 +154,6 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ chunk=chunk.loc[~chunk['rid'].isin(train_rids)] #generate bed12 for reads with no methylation (so, all one footprint) - no_me_b12['score'] = '.' no_me_b12 = chunk.loc[chunk['me'] == '.'].drop('me', axis=1) no_me_b12['thickStart'] = no_me_b12['start'] no_me_b12['thickEnd'] = no_me_b12['end'] @@ -162,7 +161,8 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_ no_me_b12['blockCount'] = 1 no_me_b12['blockStarts'] = 1 no_me_b12['blockSizes'] = no_me_b12['end'] - no_me_b12['start'] - + no_me_b12['score'] = '.' + if not circle: no_me_b12['blockSizes'] = no_me_b12['end'] - no_me_b12['start'] else: diff --git a/apply_model_multiprocess.py b/apply_model_multiprocess.py index 7c3a6fe..5f5a0b7 100644 --- a/apply_model_multiprocess.py +++ b/apply_model_multiprocess.py @@ -129,13 +129,14 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si if min_me == 0: # Generate bed12 for reads with no methylation - no_me_b12['score'] = '.' no_me_b12 = chunk.loc[chunk['me'] == '.'].drop('me', axis=1) no_me_b12['thickStart'] = no_me_b12['start'] no_me_b12['thickEnd'] = no_me_b12['end'] no_me_b12['itemRgb'] = '255,0,0' no_me_b12['blockCount'] = 1 no_me_b12['blockStarts'] = 1 + no_me_b12['score'] = '.' + if not circle: no_me_b12['blockSizes'] = no_me_b12['end'] - no_me_b12['start'] else: From 7c19f9a111e00dbaf5bfc9270ca48dbccd0d8eda Mon Sep 17 00:00:00 2001 From: Edmri Date: Thu, 30 Oct 2025 10:01:55 +0100 Subject: [PATCH 3/3] Update apply_model_multiprocess.py fix: wrong tab for no_me_b12 column order --- apply_model_multiprocess.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apply_model_multiprocess.py b/apply_model_multiprocess.py index 5f5a0b7..c68bdfd 100644 --- a/apply_model_multiprocess.py +++ b/apply_model_multiprocess.py @@ -143,8 +143,8 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si no_me_b12['blockSizes'] = no_me_b12['end']*3 - no_me_b12['start'] no_me_b12 = no_me_b12.rename(columns={'rid': 'name'}) - no_me_b12 = no_me_b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] - chrom = chunk['chrom'].iloc[0] + no_me_b12 = no_me_b12[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']] + chrom = chunk['chrom'].iloc[0] else: no_me_b12=pd.DataFrame() @@ -328,3 +328,4 @@ def combine_temp_files(chromlist, tmp_dir, outdir, dataset): #this consistently fails on my tests because of permissions, but it's not a huge issue #os.rmdir(tmp_dir) logging.info("Temporary directory removed and script completed.") +