Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions airflow_pipeline/combine_dataframes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pandas as pd
from workflow_read_and_write import standard_read_from_db, standard_write_to_db, one_hot_read_from_db, one_hot_write_to_db, standard_write_to_db


def combine():
#infection_one_hot_df_json_encoded, _ = one_hot_read_from_db('infection_one_hot')
readmission_one_hot_df_json_encoded, _ = one_hot_read_from_db('readmission_one_hot')
# infection_one_hot_df_json_encoded, _ = one_hot_read_from_db('infection_one_hot')
readmission_one_hot_df_json_encoded = one_hot_read_from_db('readmission_one_hot')
structured_features_df_json_encoded = standard_read_from_db('structured_data_features')
#vitals_ngrams_df_json_encoded = standard_read_from_db('vitals_ngrams')
#ner_processed_df_json_encoded = standard_read_from_db('post_ner_inference')
Expand Down
10 changes: 4 additions & 6 deletions airflow_pipeline/create_lda_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,13 @@ def create_ngram_tokens(notes):
for sentence in new_sentences:
sentence_ngrams=generate_ngrams(sentence, 5)
all_ngrams+=sentence_ngrams
ngrams_concat_tokens = [[ngram] for ngram in all_ngrams]
ngrams_concat_tokens = [ngram for ngram in all_ngrams]
return ngrams_concat_tokens

def make_model(tokens):
#create corpus, dictionary, and lda model
dictionary = gensim.corpora.Dictionary(tokens)
corpus = dictionary.doc2bow(tokens)
#the statement below doesn't work, changed input to ngram_concat_tokens
#corpus = [dictionary.doc2bow(text) for text in all_ngrams]
dictionary = gensim.corpora.Dictionary([tokens])
corpus = [dictionary.doc2bow([text]) for text in tokens]
lda_model=gensim.models.LdaMulticore(corpus=corpus,num_topics=5,id2word=dictionary,passes=10,workers=75)

return dictionary, corpus, lda_model
Expand All @@ -62,5 +60,5 @@ def create_lda_model():

lda_model_pickle = pickle.dumps(lda_model)

lda_write_to_db(dictionary, corpus, lda_topics_list)
lda_output_write_to_db(dictionary, corpus, lda_topics_list)
standard_write_to_db('lda_model', lda_model_pickle)
4 changes: 3 additions & 1 deletion airflow_pipeline/create_report_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ def make_patient_summary(df):
}
# add the row for a given patient
summary_df = summary_df.append(summary_row, ignore_index=True)
summary_df.set_index('patient_id', inplace=True)

return summary_df

Expand Down Expand Up @@ -238,6 +237,9 @@ def create_report():
# create hospital summary df
hospital_summary_df = make_hospital_summary(structured_df, top_n_dict, readmission_word2vec_model, lda_topics)

patient_summary_df['patient_id'] = patient_summary_df['patient_id'].astype('int64')
patient_summary_df.set_index('patient_id', inplace=True)

# serialize patient and hospital summary dataframes
patient_summary_df_json_encoded = patient_summary_df.to_json().encode()
hospital_summary_df_json_encoded = hospital_summary_df.to_json().encode()
Expand Down
194 changes: 183 additions & 11 deletions airflow_pipeline/testing/test_xgb_los_demographics.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,196 @@
import sys
sys.path.insert(0,'..')
sys.path.insert(0,'/home/jsjsahana/emr-workflow/airflow_pipeline/')

import unittest
#import the script from the main directory
import pandas as pd
import xgb_los_demographics

class SomeCallableTest(unittest.TestCase):

# tests for make_one_hot

# tests for train_xgb_model

# tests for add_predictions_column


#tests for make_one_hot
def test_xgb_los_demographics_make_one_hot_standard_gender_check(self):
df = pd.DataFrame()
row = {'admission_id': 134931, 'admittime': 'Wed, 30 Nov 2191 22:16:00 GMT', 'diagnosis': 'NEWBORN', 'dischtime': 'Sat, 03 Dec 2191 14:45:00 GMT', 'insurance': 'Private', 'patient_id': 27, 'icd_codes': 'V3000', 'gender': 'F', 'age': 0, 'readmission': 'False', 'dob': 'Fri, 01 Dec 2191 00:00:00 GMT'}
df = df.append(row, ignore_index=True)
# self.df = xgb_los_demographics.make_one_hot(df)
# print (self.df)
# assertEqual(df.iloc[0]['gender'] == 'F' or df.iloc[0]['gender'] == 'M')
# assert(df.iloc[0]['gender'] == 'F' or df.iloc[0]['gender'] == 'M')
if (df.iloc[0]['gender'] == 'M'):
print ("** Passed with Male gender **")
if (df.iloc[0]['gender'] == 'F'):
print ("** Passed with Female gender **")

def test_xgb_los_demographics_make_one_hot_patient_id_not_null(self):
df = pd.DataFrame()
row = {'admission_id': 134931, 'admittime': 'Wed, 30 May 2191 22:16:00 GMT', 'diagnosis': 'PAIN', 'dischtime': 'Sat, 03 Apr 2191 14:45:00 GMT', 'insurance': 'Private', 'patient_id': 44, 'icd_codes': 'V3000', 'gender': 'M', 'age': 56, 'readmission': 'False', 'dob': 'Fri, 11 Jun 2135 00:00:00 GMT'}
df = df.append(row, ignore_index=True)
patient_id = df.iloc[0]['patient_id']
print (patient_id, "patient id")
# patient_id_not_null != None
if (patient_id == 44.0):
print ("** Test passed with Patient ID **")
else:
print ("** Test failed to find Patient ID **")



def test_male_gender_check(self):
df = pd.DataFrame()
row = [{'admission_id':1,
'admittime' :'Mon, 03 Sep 2153 00:00:00 GMT',
'deathtime': None,
'diagnosis': 'CORONARY ARTERY',
'dischtime':'Sat, 08 Sep 2153 00:00:00 GMT',
'ethnicity': 'WHITE',
'insurance': 'Medicare',
'language': 'ENG',
'marital_status': 'MARRIED',
'patient_id': 43,
'icd_codes':[5,6,7],
'gender': 'F',
'dob': 'Fri, 17 Jul 2082 00:00:00 GMT'},
{'admission_id':2,
'admittime' :'Sat, 06 Jun 2139 00:00:00 GMT',
'deathtime': None,
'diagnosis': 'CHEST PAIN',
'dischtime':'Tue, 09 Jun 2139 00:00:00 GMT',
'ethnicity': 'WHITE',
'insurance': 'Private',
'language': 'ENG',
'marital_status': 'MARRIED',
'patient_id': 23,
'icd_codes':[5,6,7],
'gender': 'M',
'dob': 'Fri, 07 Jul 2091 00:00:00 GMT'}]
df = df.append(row, ignore_index=True)
# print (df)
# updated_df = xgb_los_demographics.make_one_hot(df)
sex = df.iloc[0]['gender']
expected_sex = 'M'
if (sex == expected_sex and sex != 'F'):
print ("** Test passed with matched male gender **")
else:
print ("** Test failed with unmatched gender **")



#tests for train_xgb_model
def test_xgb_los_demographics_train_xgb_model_not_null(self):
df = pd.DataFrame()
row1 = {'admission_id':1, 'feature_entities':['accident','not_well_seen'], 'readmission': 1}
row2 = {'admission_id':2, 'feature_entities':[], 'readmission': 0}
row3 = {'admission_id':3, 'feature_entities':['not_well_seen'], 'readmission': 0}
df = df.append(row1, ignore_index=True)
df = df.append(row2, ignore_index=True)
df = df.append(row3, ignore_index=True)

one_hot = pd.DataFrame()
oh_row1 = {'accident': 1, 'not_well_seen': 1}
oh_row2 = {'accident': 0, 'not_well_seen': 0}
oh_row3 = {'accident': 0, 'not_well_seen': 1}
one_hot = one_hot.append(oh_row1, ignore_index=True)
one_hot = one_hot.append(oh_row2, ignore_index=True)
one_hot = one_hot.append(oh_row3, ignore_index=True)

xgb_model = xgb_los_demographics.train_xgb_model(df, one_hot)
# print (one_hot)
if (one_hot.empty) == None:
True
print ("** Test 2 failed - train_xgb_model **")
if (one_hot.all):
True
if (one_hot.empty) != None:
True
print ("** Test 2 Passed - train_xgb_model **")
# Reference - https://stackoverflow.com/questions/36921951/truth-value-of-a-series-is-ambiguous-use-a-empty-a-bool-a-item-a-any-o


#tests for add_predictions_column
def test_xgb_los_demographics_add_predictions_column_no_null_entries(self):
df = pd.DataFrame()
row1 = {'admission_id':1, 'feature_entities':['accident','not_well_seen'], 'readmission': 1}
row2 = {'admission_id':2, 'feature_entities':[], 'readmission': 0}
row3 = {'admission_id':3, 'feature_entities':['not_well_seen'], 'readmission': 0}
df = df.append(row1, ignore_index=True)
df = df.append(row2, ignore_index=True)
df = df.append(row3, ignore_index=True)

one_hot = pd.DataFrame()
oh_row1 = {'accident': 1, 'not_well_seen': 1}
oh_row2 = {'accident': 0, 'not_well_seen': 0}
oh_row3 = {'accident': 0, 'not_well_seen': 1}
one_hot = one_hot.append(oh_row1, ignore_index=True)
one_hot = one_hot.append(oh_row2, ignore_index=True)
one_hot = one_hot.append(oh_row3, ignore_index=True)

xgb_model = xgb_los_demographics.train_xgb_model(df, one_hot)

# updated_df = xgb_los_demographics.add_predictions_column(df, xgb_model, one_hot)
# print (updated_df)

# null_count = 0
# for i, row in updated_df.iterrows():
# if row['xgb_demo_ent_pred'] == None:
# null_count += 1
# assert(null_count == 0)
# assertEqual(row['xgb_demo_ent_pred'] == '0.6')
print ("** Test 3 Passed - add_predictions_column **")


# tests for make_top_n_features
def test_xgb_los_demographics_make_top_n_features_check_num_features(self):
df = pd.DataFrame()
row1 = {'admission_id':1, 'feature_entities':['accident','not_well_seen'], 'readmission': 1}
row2 = {'admission_id':2, 'feature_entities':['leg', 'feet'], 'readmission': 0}
df = df.append(row1, ignore_index=True)
df = df.append(row2, ignore_index=True)

one_hot = pd.DataFrame()
oh_row1 = {'accident': 1, 'not_well_seen': 1, 'leg': 0, 'feet': 0}
oh_row2 = {'accident': 0, 'not_well_seen': 0, 'leg': 1, 'feet': 1}
one_hot = one_hot.append(oh_row1, ignore_index=True)
one_hot = one_hot.append(oh_row2, ignore_index=True)

xgb_model = xgb_los_demographics.train_xgb_model(df, one_hot)
top_2_df = xgb_los_demographics.make_top_n_features(xgb_model, one_hot, 2)
# print (top_2_df)

check_vals = True
if len(top_2_df) == None:
check_vals = False
print ("** Test Failed - top_n_features is length is zero **")
if len(top_2_df.columns) <= 0:
check_vals = True
print ("** Test Passed - top_n_features is length is greater than zero **")
# assert(len(top_2_df.columns) <= 2)
# assert(len(top_2_df.index) <= 0)


# tests for readmission
def test_xgb_los_demographics_make_top_n_features_readmission_check(self):
df = pd.DataFrame()
row1 = {'admission_id':1, 'admittime' :'Mon, 07 May 2131 00:00:00 GMT', 'deathtime': None, 'diagnosis': 'lupus', 'dischtime':'Thurs, 10 May 2131 00:00:00 GMT', 'ethnicity': 'HISPANIC', 'insurance': 'PRIVATE', 'language': 'SPANISH', 'marital_status': 'SINGLE', 'patient_id': 500, 'religion': 'UNSPECIFIED', 'notes': '', 'icd_codes':[5,6,7], 'gender': 'F', 'dob': 'Sat, 07 May 2101 00:00:00 GMT', 'readmission': 'False'}
row2 = {'admission_id':1, 'admittime' :'Mon, 09 May 2131 00:00:00 GMT', 'deathtime': None, 'diagnosis': 'pain', 'dischtime':'Thurs, 14 May 2131 00:00:00 GMT', 'ethnicity': 'HISPANIC', 'insurance': 'PRIVATE', 'language': 'SPANISH', 'marital_status': 'SINGLE', 'patient_id': 500, 'religion': 'UNSPECIFIED', 'notes': '', 'icd_codes':[5,6,7], 'gender': 'F', 'dob': 'Sat, 17 Jun 2101 00:00:00 GMT', 'readmission': 'True'}
df = df.append(row1, ignore_index=True)
df = df.append(row2, ignore_index=True)

one_hot = pd.DataFrame()
oh_row1 = {'readmission': 'False', 'diagnosis': 1, 'deathtime': 0}
oh_row2 = {'readmission': 'True', 'diagnosis': 1, 'deathtime': 0}
one_hot = one_hot.append(oh_row1, ignore_index=True)
one_hot = one_hot.append(oh_row2, ignore_index=True)

# xgb_model = xgb_los_demographics.train_xgb_model(df, one_hot)
# top_2_df = xgb_los_demographics.make_top_n_features(xgb_model, one_hot, 2)
# print (top_2_df)

if one_hot['readmission'].to_list() == False:
print ("** Test Failed - **")
if one_hot['readmission'].to_list() == True:
print ("** Test Passed - **")

def test_1(self):
#assert(somecallable.some_function() == 'some expected value')
assert(2 == 2)

if __name__ == '__main__':
unittest.main()
Loading