diff --git a/dataset-construction/requirements.txt b/dataset-construction/requirements.txt index f0d23e4..cd44e01 100644 --- a/dataset-construction/requirements.txt +++ b/dataset-construction/requirements.txt @@ -1,3 +1,5 @@ tqdm pymongo -numpy \ No newline at end of file +numpy +nltk==3.7 +strsim==0.0.3 diff --git a/dataset-construction/setup.sh b/dataset-construction/setup.sh new file mode 100644 index 0000000..0738087 --- /dev/null +++ b/dataset-construction/setup.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +pip3 install -r requirements.txt +python3 -m nltk.downloader punkt diff --git a/dataset-construction/src/ndb_data/construction/make_database_initial.py b/dataset-construction/src/ndb_data/construction/make_database_initial.py index d24385e..23c71f2 100644 --- a/dataset-construction/src/ndb_data/construction/make_database_initial.py +++ b/dataset-construction/src/ndb_data/construction/make_database_initial.py @@ -31,8 +31,6 @@ from similarity.normalized_levenshtein import NormalizedLevenshtein from tqdm import tqdm -from ndb_data.util.log_helper import setup_logging - detok = TreebankWordDetokenizer() logger = logging.getLogger(__name__) @@ -71,7 +69,6 @@ def normalize_subject(subject_name, fact): if __name__ == "__main__": - setup_logging() parser = ArgumentParser() parser.add_argument("cache_dir") parser.add_argument("out_file")