From 59833313a6e92f025bd9724cafd663b93dc5b770 Mon Sep 17 00:00:00 2001 From: Adam Yala Date: Thu, 9 Aug 2018 14:03:04 -0400 Subject: [PATCH 1/4] prostate core vs noncore --- oncotext/utils/postprocess.py | 7 +++++++ oncotext/utils/preprocess.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/oncotext/utils/postprocess.py b/oncotext/utils/postprocess.py index 863d39a..2f49622 100644 --- a/oncotext/utils/postprocess.py +++ b/oncotext/utils/postprocess.py @@ -113,6 +113,13 @@ def generate_automatic_feilds(reportDB, organ, config): elif organ == "OrganProstate": for r in reportDB: + if r['BiopsyType'] == 'Core': + r['OrganProstateCore'] = '1' + r['OrganProstateNonCore'] = '0' + else: + r['OrganProstateCore'] = '0' + r['OrganProstateNonCore'] = '1' + if r['ProstateCa'] == '0': numerical = [k for k in config['POST_DIAGNOSES']['OrganProstate'] if config['POST_DIAGNOSES']['OrganProstate'][k] == ["NUM"]] for k in numerical: diff --git a/oncotext/utils/preprocess.py b/oncotext/utils/preprocess.py index 32445fd..d9d78d5 100644 --- a/oncotext/utils/preprocess.py +++ b/oncotext/utils/preprocess.py @@ -171,7 +171,7 @@ def segment_prostate(report, raw_text_key, preprocessed_text_key, segment_id_key def segment_reports(r, organ, raw_text_key, preprocessed_text_key, side_key, segment_id_key, segment_type_key, logger): if organ == "OrganBreast": segmented_reports = segment_breast(r, raw_text_key, preprocessed_text_key, side_key, logger) - elif organ == "OrganProstate": + elif organ == "OrganProstateCore": segmented_reports = segment_prostate(r, raw_text_key, preprocessed_text_key, segment_id_key, segment_type_key, logger) else: r[preprocessed_text_key] = preprocess_text(r[raw_text_key]) From ff94948c09946ecbdb6a67708b18d88c228f7009 Mon Sep 17 00:00:00 2001 From: Adam Yala Date: Mon, 13 Aug 2018 18:02:58 -0400 Subject: [PATCH 2/4] test --- config.py | 2 +- oncotext/utils/postprocess.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/config.py b/config.py index 5e5688a..e12d070 100644 --- a/config.py +++ b/config.py @@ -92,7 +92,7 @@ class Config(object): 'patience': 5, 'snapshot': None, 'objective':'cross_entropy', - 'num_gpus': 1, + 'num_gpus': 2, 'save_dir': 'snapshot', 'model_dir': SNAPSHOT_DIR+'/{}', 'model_file': 'oncotext_{}.pt', diff --git a/oncotext/utils/postprocess.py b/oncotext/utils/postprocess.py index 2f49622..e8638e8 100644 --- a/oncotext/utils/postprocess.py +++ b/oncotext/utils/postprocess.py @@ -219,9 +219,9 @@ def apply_rules(reportDB, trainDB, organ, config, logger): ''' logger.info("postprocess - apply corrections") reportDB = apply_corrections(reportDB, trainDB, config, logger) - logger.info("postprocess - generate automatic fields") - reportDB = generate_automatic_feilds(reportDB, organ, config) - logger.info("postprocess - aggregate episodes") - reportDB = aggregate_episodes(reportDB, organ, config, logger) + # logger.info("postprocess - generate automatic fields") + # reportDB = generate_automatic_feilds(reportDB, organ, config) + # logger.info("postprocess - aggregate episodes") + # reportDB = aggregate_episodes(reportDB, organ, config, logger) return reportDB From ebca6f5c0a5381ebc5e6c6ea1e284cae08094e1f Mon Sep 17 00:00:00 2001 From: Adam Yala Date: Fri, 17 Aug 2018 16:07:24 -0400 Subject: [PATCH 3/4] many changes --- .gitignore | 2 +- config.py | 6 +++--- oncotext/utils/generic.py | 4 ++-- oncotext/utils/parsing.py | 2 +- oncotext/utils/postprocess.py | 24 +++++++++++++++--------- scripts/app.py | 2 +- text_nn | 2 +- 7 files changed, 24 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 960d824..f9fce9b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # large files snapshot -LOGS +LOGS* LOGS_Tagging *.bin secrets diff --git a/config.py b/config.py index e12d070..e2a3779 100644 --- a/config.py +++ b/config.py @@ -49,7 +49,7 @@ class Config(object): PRUNE_KEY = "OrganBreast" PRUNE_AFTER_PREDICT = False - COLUMN_KEYS = parsing.parse_XLS( os.environ['CONFIG_XLSX']) + COLUMN_KEYS = parsing.parse_XLS(os.environ['CONFIG_XLSX']) DIAGNOSES = {o: {} for o in COLUMN_KEYS.keys()} for organ in DIAGNOSES: @@ -69,7 +69,7 @@ class Config(object): MARKERS = ['ER', "ER_Intensity", 'PR', "PR_Intensity", "her2", 'Her2Fish', "Her2_IHC", 'PositiveLN', 'ECE', 'ITC', 'BVI', 'LVI'] RATIONALE_NET_CONFIG = { - 'cuda': CUDA, + 'cuda': CUDA=='true', 'num_workers': 8, 'train_batch_size': 32, 'pred_batch_size': 200, @@ -92,7 +92,7 @@ class Config(object): 'patience': 5, 'snapshot': None, 'objective':'cross_entropy', - 'num_gpus': 2, + 'num_gpus': 1, 'save_dir': 'snapshot', 'model_dir': SNAPSHOT_DIR+'/{}', 'model_file': 'oncotext_{}.pt', diff --git a/oncotext/utils/generic.py b/oncotext/utils/generic.py index 07f1e12..3b5936a 100644 --- a/oncotext/utils/generic.py +++ b/oncotext/utils/generic.py @@ -22,8 +22,8 @@ def hasCat(r, cat, loose=False): return False -def contains_annotations(reports, config): - diagnoses = config['DIAGNOSES'] +def contains_annotations(reports, organ, config): + diagnoses = config['DIAGNOSES'][organ] for r in reports: if hasCat(r, diagnoses, loose=True): return True diff --git a/oncotext/utils/parsing.py b/oncotext/utils/parsing.py index 15d2bb0..7a1bf39 100644 --- a/oncotext/utils/parsing.py +++ b/oncotext/utils/parsing.py @@ -13,7 +13,7 @@ def parse_XLS(path): values = [str(cell.value) for cell in row if cell.value is not None] if len(values) > 1: data[sheet.title][values[0]] = values[1: ] - elif len(values) == 0: + elif len(values) == 1: data[sheet.title][values[0]] = [] return data diff --git a/oncotext/utils/postprocess.py b/oncotext/utils/postprocess.py index e8638e8..ae0ad69 100644 --- a/oncotext/utils/postprocess.py +++ b/oncotext/utils/postprocess.py @@ -111,15 +111,21 @@ def generate_automatic_feilds(reportDB, organ, config): if 'CancerInvasive' in r and r['CancerInvasive'] == '0': r['GradeMaxInvasive'] = '9' - elif organ == "OrganProstate": + elif organ == "Meta": for r in reportDB: - if r['BiopsyType'] == 'Core': - r['OrganProstateCore'] = '1' - r['OrganProstateNonCore'] = '0' + if r['OrganProstate'] == '1': + if r['BiopsyType'] == 'Core': + r['OrganProstateCore'] = '1' + r['OrganProstateNonCore'] = '0' + else: + r['OrganProstateCore'] = '0' + r['OrganProstateNonCore'] = '1' else: r['OrganProstateCore'] = '0' - r['OrganProstateNonCore'] = '1' + r['OrganProstateNonCore'] = '0' + elif organ == "OrganProstateCore" or organ == "OrganProstateNonCore": + for r in reportDB: if r['ProstateCa'] == '0': numerical = [k for k in config['POST_DIAGNOSES']['OrganProstate'] if config['POST_DIAGNOSES']['OrganProstate'][k] == ["NUM"]] for k in numerical: @@ -219,9 +225,9 @@ def apply_rules(reportDB, trainDB, organ, config, logger): ''' logger.info("postprocess - apply corrections") reportDB = apply_corrections(reportDB, trainDB, config, logger) - # logger.info("postprocess - generate automatic fields") - # reportDB = generate_automatic_feilds(reportDB, organ, config) - # logger.info("postprocess - aggregate episodes") - # reportDB = aggregate_episodes(reportDB, organ, config, logger) + logger.info("postprocess - generate automatic fields") + reportDB = generate_automatic_feilds(reportDB, organ, config) + logger.info("postprocess - aggregate episodes") + reportDB = aggregate_episodes(reportDB, organ, config, logger) return reportDB diff --git a/scripts/app.py b/scripts/app.py index f6bf2ae..2679eb9 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -66,7 +66,7 @@ def addTrainData(): config['SEGMENT_TYPE_KEY'], logger) - if len(data) == 0 or not generic.contains_annotations(data, config): + if len(data) == 0 or not generic.contains_annotations(data, organ, config): logger.warn("addTrain[ - did not include any reports with labels. No op.") return NOP_MSG diff --git a/text_nn b/text_nn index 57716b0..05733b7 160000 --- a/text_nn +++ b/text_nn @@ -1 +1 @@ -Subproject commit 57716b0526001b3a68a822c84dba87eee9d29607 +Subproject commit 05733b7cf20b5b739a520e8c5b092ef932306fcc From 72eb09ac471f69a2d6ce2793748c52e35d36e9ed Mon Sep 17 00:00:00 2001 From: Adam Yala Date: Mon, 22 Oct 2018 00:48:27 -0400 Subject: [PATCH 4/4] misc debug --- config.py | 4 +++- oncotext/evaluation.py | 7 +++---- oncotext/utils/postprocess.py | 2 +- oncotext/utils/preprocess.py | 6 +++++- scripts/app.py | 4 ++-- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/config.py b/config.py index e2a3779..e71ec19 100644 --- a/config.py +++ b/config.py @@ -19,6 +19,7 @@ class Config(object): PICKLE_DIR = os.environ['PICKLE_DIR'] SNAPSHOT_DIR = os.environ['SNAPSHOT_DIR'] CUDA = os.environ['CUDA'] + print('CUDA', CUDA, CUDA=='true') DB_TRAIN_PATH = os.path.join(PICKLE_DIR, "reportDBAPI_train.p") DB_BASE_PATH = os.path.join(PICKLE_DIR, "reportDB_base_train.p") @@ -61,6 +62,7 @@ class Config(object): post_diagnoses['OrganBreast']['ER_Intensity'] = ['0', '1', '2', '3', '9'] post_diagnoses['OrganBreast']['PR_Intensity'] = ['0', '1', '2', '3', '9'] POST_DIAGNOSES = post_diagnoses + ORGANS = post_diagnoses.keys() CANCERS = ['ILC', 'DCIS', 'IDC', 'TubularCancer', 'CancerInvasive', 'CancerInvNOS', 'CancerNotOfBreastOrigin'] @@ -69,7 +71,7 @@ class Config(object): MARKERS = ['ER', "ER_Intensity", 'PR', "PR_Intensity", "her2", 'Her2Fish', "Her2_IHC", 'PositiveLN', 'ECE', 'ITC', 'BVI', 'LVI'] RATIONALE_NET_CONFIG = { - 'cuda': CUDA=='true', + 'cuda': True, 'num_workers': 8, 'train_batch_size': 32, 'pred_batch_size': 200, diff --git a/oncotext/evaluation.py b/oncotext/evaluation.py index 48bd70a..48fd554 100644 --- a/oncotext/evaluation.py +++ b/oncotext/evaluation.py @@ -10,8 +10,7 @@ import oncotext.utils.generic as generic import sklearn.metrics -def score_on_test_set(reports, test_set, config, logger): - organ = generic.getOrgan(test_set[0], config) +def score_on_test_set(reports, test_set, organ, config, logger): gold_reports = preprocess.apply_rules( test_set, organ, @@ -83,7 +82,7 @@ def score_on_test_set(reports, test_set, config, logger): return results, keys -def evaluate(reportDB, eval_sets, config, logger): +def evaluate(reportDB, eval_sets, organ, config, logger): all_results = {} for file_name in eval_sets: @@ -91,7 +90,7 @@ def evaluate(reportDB, eval_sets, config, logger): relevant_reports = [r for r in reportDB if r['filename'] == file_name] logger.info("Scoring reportDB against test_set {}".format(file_name)) logger.info("Scoring reportDB has {} records matching test_set {} records".format(len(relevant_reports), len(test_set))) - results, result_keys = score_on_test_set(relevant_reports, test_set, config, logger) + results, result_keys = score_on_test_set(relevant_reports, test_set, organ, config, logger) all_results[file_name]= results return all_results diff --git a/oncotext/utils/postprocess.py b/oncotext/utils/postprocess.py index ae0ad69..77b4875 100644 --- a/oncotext/utils/postprocess.py +++ b/oncotext/utils/postprocess.py @@ -127,7 +127,7 @@ def generate_automatic_feilds(reportDB, organ, config): elif organ == "OrganProstateCore" or organ == "OrganProstateNonCore": for r in reportDB: if r['ProstateCa'] == '0': - numerical = [k for k in config['POST_DIAGNOSES']['OrganProstate'] if config['POST_DIAGNOSES']['OrganProstate'][k] == ["NUM"]] + numerical = [k for k in config['POST_DIAGNOSES']['OrganProstateCore'] if config['POST_DIAGNOSES']['OrganProstateCore'][k] == ["NUM"]] for k in numerical: r[k] = '0' diff --git a/oncotext/utils/preprocess.py b/oncotext/utils/preprocess.py index d9d78d5..e6eb4df 100644 --- a/oncotext/utils/preprocess.py +++ b/oncotext/utils/preprocess.py @@ -149,7 +149,8 @@ def segment_prostate(report, raw_text_key, preprocessed_text_key, segment_id_key segments[0][0] += line+"\n" segments[0][1] = alpha[0]+"." else: - segments.append([line+"\n", alpha[len(segments)]+"."]) + segment_name = alpha[len(segments)%len(alpha)] + segments.append([line+"\n", segment_name + "."]) else: segments[-1][0] += line+"\n" @@ -231,6 +232,9 @@ def apply_rules(reports, organ, raw_text_key, preprocessed_text_key, time_key, s r[raw_text_key] = r[raw_text_key] if raw_text_key in r else r[preprocessed_text_key] r[raw_text_key] = remove_bad_chars(r[raw_text_key]) + if organ == 'Meta': + r[preprocessed_text_key] = r[raw_text_key] + if preprocessed_text_key in r: r[preprocessed_text_key] = preprocess_text(r[preprocessed_text_key]) preprocessed_reports.append(r) diff --git a/scripts/app.py b/scripts/app.py index 2679eb9..5230ed4 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -187,7 +187,7 @@ def predict(): eval_sets = json.loads(request.data.decode()) except Exception as e: eval_sets = {} - logger.warn("No eval sets provided for prediction!") + logger.warn("No eval sets provided for prediction!", e) filename = DB_UNLABLED_PATH+"_"+organ+".p" db_unlabeled = pickle.load(open(filename, 'rb')) @@ -216,7 +216,7 @@ def predict(): config, logger) - results = evaluation.evaluate(reportDB, eval_sets, config, logger) + results = evaluation.evaluate(reportDB, eval_sets, organ, config, logger) return json.dumps({'reportDB': json_utils.make_json_compliant(reportDB), 'results': results,