From 34f1a0673b1e7db16abffccef3e00042839f9284 Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Wed, 7 Oct 2020 14:36:53 -0600
Subject: [PATCH 1/8] added dictionary of max_token_lengths

---
 deepform/data/add_features.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index 330a39a..6829911 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -45,6 +45,15 @@ class TokenType(Enum):
     "gross_amount": dollar_similarity,
 }
 
+MAX_TOKENS_BY_TARGET = {
+    # Each label column, and the maximum expected tokens that it uses.
+    "contract_num": 3,
+    "advertiser": 11,
+    "flight_from": 3,
+    "flight_to": 3,
+    "gross_amount": 3,
+}
+
 
 def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
     """Split data into individual documents, add features, and write to parquet."""
@@ -137,10 +146,10 @@ def label_tokens(tokens, labels, max_token_count):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
         match_fn = LABEL_COLS[col_name]
-
+        max_token_count = MAX_TOKENS_BY_TARGET[col_name]
         if col_name == "advertiser":
             tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count, match_fn
+                tokens.token.to_numpy(), label_value, max_token_count[0], match_fn
             )
         else:
             tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))

From 60ce87614d9b84524d9d7604362f5b76be63fb10 Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Wed, 7 Oct 2020 14:49:29 -0600
Subject: [PATCH 2/8] removed offending subscript

---
 deepform/data/add_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index 6829911..ef61a4b 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -149,7 +149,7 @@ def label_tokens(tokens, labels, max_token_count):
         max_token_count = MAX_TOKENS_BY_TARGET[col_name]
         if col_name == "advertiser":
             tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count[0], match_fn
+                tokens.token.to_numpy(), label_value, max_token_count, match_fn
             )
         else:
             tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
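PATCH 2/8 above is needed because the value looked up from MAX_TOKENS_BY_TARGET is already a plain int, so the [0] subscript introduced in PATCH 1/8 raises a TypeError at runtime. A small illustration of the intended lookup, separate from the patches themselves (label_multitoken's signature is inferred from the call sites in the diff):

    # MAX_TOKENS_BY_TARGET maps each label column to an int span limit.
    limit = MAX_TOKENS_BY_TARGET["advertiser"]  # 11
    # limit[0] would raise TypeError: 'int' object is not subscriptable;
    # the int itself is what label_multitoken expects as its span limit.
    tokens[col_name] = label_multitoken(
        tokens.token.to_numpy(), label_value, limit, match_fn
    )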
From f8c4f7681381031a16e545f8febfaf65c20d9c0c Mon Sep 17 00:00:00 2001
From: Hugh Wimberly
Date: Wed, 7 Oct 2020 15:08:07 -0700
Subject: [PATCH 3/8] Allow token length to be passed in from the command line

---
 deepform/data/add_features.py | 43 ++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index ef61a4b..ed3f6e4 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -55,7 +55,9 @@ class TokenType(Enum):
 }
 
 
-def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
+def extend_and_write_docs(
+    source_dir, manifest, pq_index, out_path, max_token_count=None
+):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
@@ -107,7 +109,7 @@ def pq_index_and_dir(pq_index, pq_path=None):
     return pq_index, pq_path
 
 
-def process_document_tokens(token_file, dest_file, labels, max_token_count):
+def process_document_tokens(token_file, dest_file, labels, max_token_count=None):
     """Filter out short tokens, add computed features, and return index info."""
     slug = token_file.stem
     doc = pd.read_parquet(token_file).reset_index(drop=True)
@@ -142,17 +144,18 @@ def process_document_tokens(token_file, dest_file, labels, max_token_count):
     return {"slug": slug, "length": len(doc), **labels, **best_matches}
 
 
-def label_tokens(tokens, labels, max_token_count):
+def label_tokens(tokens, labels, max_token_count=None):
+    if max_token_count is None:
+        max_token_count = MAX_TOKENS_BY_TARGET
+    max_token_count = {**MAX_TOKENS_BY_TARGET, **max_token_count}
+
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
-        match_fn = LABEL_COLS[col_name]
-        max_token_count = MAX_TOKENS_BY_TARGET[col_name]
-        if col_name == "advertiser":
-            tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count, match_fn
-            )
-        else:
-            tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
+        match_fn = LABEL_COLS.get(col_name, default=default_similarity)
+        span_limit = max_token_count.get(col_name, default=3)
+        tokens[col_name] = label_multitoken(
+            tokens.token.to_numpy(), label_value, span_limit, match_fn
+        )
 
     return tokens
 
@@ -214,12 +217,12 @@ def add_base_features(token_df):
         default=TRAINING_DIR,
         help="directory of parquet files",
     )
-    parser.add_argument(
-        "--max-token-count",
-        type=int,
-        default=5,
-        help="maximum number of contiguous tokens to match against each label",
-    )
+    for key in LABEL_COLS:
+        parser.add_argument(
+            f"--max-span-{key}",
+            type=int,
+            help=f"maximum number of contiguous tokens to match {key} against",
+        )
     parser.add_argument("--log-level", dest="log_level", default="INFO")
     args = parser.parse_args()
     logger.setLevel(args.log_level.upper())
@@ -230,4 +233,8 @@ def add_base_features(token_df):
     indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir)
     index.parent.mkdir(parents=True, exist_ok=True)
    outdir.mkdir(parents=True, exist_ok=True)
-    extend_and_write_docs(indir, manifest, index, outdir, args.max_token_count)
+
+    max_token_count = {
+        key[11:]: value for key, value in args.items() if key.startswith("--max-span-")
+    }
+    extend_and_write_docs(indir, manifest, index, outdir, max_token_count)
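The next patch reverts this change. For reference, the idea it sketches (one --max-span-&lt;label&gt; flag per label column, merged over the MAX_TOKENS_BY_TARGET defaults) could be wired up roughly as below. This is a hypothetical sketch, not code from the series; it relies on argparse deriving the attribute name max_span_&lt;label&gt; from each flag and on reading the parsed values back through vars(args) rather than args.items():

    # Hypothetical wiring for per-label span limits on the command line.
    for key in LABEL_COLS:
        parser.add_argument(
            f"--max-span-{key}",  # argparse stores this as args.max_span_<key>
            type=int,
            help=f"maximum number of contiguous tokens to match {key} against",
        )
    args = parser.parse_args()
    overrides = {
        name[len("max_span_"):]: value
        for name, value in vars(args).items()
        if name.startswith("max_span_") and value is not None
    }
    # Labels without a flag keep their MAX_TOKENS_BY_TARGET defaults.
    max_token_count = {**MAX_TOKENS_BY_TARGET, **overrides}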
From 79190b4e391fff165cc9c2638a16a2cc23cf7428 Mon Sep 17 00:00:00 2001
From: Hugh Wimberly
Date: Wed, 7 Oct 2020 15:12:18 -0700
Subject: [PATCH 4/8] Revert "Allow token length to be passed in from the command line"

This reverts commit f8c4f7681381031a16e545f8febfaf65c20d9c0c.
---
 deepform/data/add_features.py | 43 +++++++++++++++--------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index ed3f6e4..ef61a4b 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -55,9 +55,7 @@ class TokenType(Enum):
 }
 
 
-def extend_and_write_docs(
-    source_dir, manifest, pq_index, out_path, max_token_count=None
-):
+def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
@@ -109,7 +107,7 @@ def pq_index_and_dir(pq_index, pq_path=None):
     return pq_index, pq_path
 
 
-def process_document_tokens(token_file, dest_file, labels, max_token_count=None):
+def process_document_tokens(token_file, dest_file, labels, max_token_count):
     """Filter out short tokens, add computed features, and return index info."""
     slug = token_file.stem
     doc = pd.read_parquet(token_file).reset_index(drop=True)
@@ -144,18 +142,17 @@ def process_document_tokens(token_file, dest_file, labels, max_token_count=None)
     return {"slug": slug, "length": len(doc), **labels, **best_matches}
 
 
-def label_tokens(tokens, labels, max_token_count=None):
-    if max_token_count is None:
-        max_token_count = MAX_TOKENS_BY_TARGET
-    max_token_count = {**MAX_TOKENS_BY_TARGET, **max_token_count}
-
+def label_tokens(tokens, labels, max_token_count):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
-        match_fn = LABEL_COLS.get(col_name, default=default_similarity)
-        span_limit = max_token_count.get(col_name, default=3)
-        tokens[col_name] = label_multitoken(
-            tokens.token.to_numpy(), label_value, span_limit, match_fn
-        )
+        match_fn = LABEL_COLS[col_name]
+        max_token_count = MAX_TOKENS_BY_TARGET[col_name]
+        if col_name == "advertiser":
+            tokens[col_name] = label_multitoken(
+                tokens.token.to_numpy(), label_value, max_token_count, match_fn
+            )
+        else:
+            tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
 
     return tokens
 
@@ -217,12 +214,12 @@ def add_base_features(token_df):
         default=TRAINING_DIR,
         help="directory of parquet files",
     )
-    for key in LABEL_COLS:
-        parser.add_argument(
-            f"--max-span-{key}",
-            type=int,
-            help=f"maximum number of contiguous tokens to match {key} against",
-        )
+    parser.add_argument(
+        "--max-token-count",
+        type=int,
+        default=5,
+        help="maximum number of contiguous tokens to match against each label",
+    )
     parser.add_argument("--log-level", dest="log_level", default="INFO")
     args = parser.parse_args()
     logger.setLevel(args.log_level.upper())
@@ -233,8 +230,4 @@ def add_base_features(token_df):
     indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir)
     index.parent.mkdir(parents=True, exist_ok=True)
     outdir.mkdir(parents=True, exist_ok=True)
-
-    max_token_count = {
-        key[11:]: value for key, value in args.items() if key.startswith("--max-span-")
-    }
-    extend_and_write_docs(indir, manifest, index, outdir, max_token_count)
+    extend_and_write_docs(indir, manifest, index, outdir, args.max_token_count)
From 6c25da44d060ae63f637d4a3018345362ff2d3a0 Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Sat, 10 Oct 2020 13:32:45 -0600
Subject: [PATCH 5/8] updated makefile to use 3_year_manifest

---
 Makefile                                 | 7 ++++++-
 deepform/{ => data}/combine_manifests.py | 0
 2 files changed, 6 insertions(+), 1 deletion(-)
 rename deepform/{ => data}/combine_manifests.py (100%)

diff --git a/Makefile b/Makefile
index 56cb53c..3478319 100644
--- a/Makefile
+++ b/Makefile
@@ -62,10 +62,15 @@ data/token_frequency.csv: data/tokenized ## Produce token frequency csv file
 	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
 		python -m deepform.data.create_vocabulary
 
+data/3_year_manifest.csv: data/2012_manifest.tsv data/2014_manifest.tsv data/2020_manifest.csv ## combine manifests from three yuears into one manifest with all three years data
+	docker build -t $(CONTAINER) .
+	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
+		python -m deepform.data.combine_manifests
+
 data/doc_index.parquet: data/tokenized data/token_frequency.csv ## Create the training data from the token files and label manifest
 	docker build -t $(CONTAINER) .
 	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
-		python -m deepform.data.add_features data/fcc-data-2020-labeled-manifest.csv
+		python -m deepform.data.add_features data/3_year_manifest.csv
 
 .PHONY: train
 train: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run full model training

diff --git a/deepform/combine_manifests.py b/deepform/data/combine_manifests.py
similarity index 100%
rename from deepform/combine_manifests.py
rename to deepform/data/combine_manifests.py

From 017cf07f25eb2cb3caaa724093223b150c820efc Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Sat, 10 Oct 2020 12:34:41 -0700
Subject: [PATCH 6/8] Update deepform/data/add_features.py

Co-authored-by: Hugh Wimberly
---
 deepform/data/add_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index ef61a4b..dd9653d 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -146,7 +146,7 @@ def label_tokens(tokens, labels, max_token_count):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
         match_fn = LABEL_COLS[col_name]
-        max_token_count = MAX_TOKENS_BY_TARGET[col_name]
+        max_token_count = MAX_TOKENS_BY_TARGET.get(col_name, default=3)
         if col_name == "advertiser":
             tokens[col_name] = label_multitoken(
                 tokens.token.to_numpy(), label_value, max_token_count, match_fn

From 9038f1bc956fc06d85d15059dd5000b41c59db3a Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Sat, 10 Oct 2020 12:38:40 -0700
Subject: [PATCH 7/8] Update deepform/data/add_features.py

Co-authored-by: Hugh Wimberly
---
 deepform/data/add_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index dd9653d..5c66d51 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -55,7 +55,7 @@ class TokenType(Enum):
 }
 
 
-def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
+def extend_and_write_docs(source_dir, manifest, pq_index, out_path):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
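One note on the lookup suggested in PATCH 6/8: dict.get does not accept its fallback as a keyword argument, so MAX_TOKENS_BY_TARGET.get(col_name, default=3) raises a TypeError. The intended per-label lookup with a default of 3 would pass the fallback positionally; a one-line illustration, not taken from the patches:

    # dict.get takes its fallback positionally; unknown labels fall back to 3 tokens.
    max_token_count = MAX_TOKENS_BY_TARGET.get(col_name, 3)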
From 37f0cccd7d0a9d3401ba009446a1ee6b044a8dfc Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Sat, 10 Oct 2020 13:42:12 -0600
Subject: [PATCH 8/8] removing max_token_count from args, removing conditional on advertiser

---
 deepform/data/add_features.py | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index ef61a4b..84dbe90 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -55,7 +55,7 @@ class TokenType(Enum):
 }
 
 
-def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
+def extend_and_write_docs(source_dir, manifest, pq_index, out_path):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
@@ -76,7 +76,6 @@ def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_co
                 "token_file": token_files[slug],
                 "dest_file": out_path / f"{slug}.parquet",
                 "labels": labels,
-                "max_token_count": max_token_count,
             }
         )
 
@@ -107,12 +106,12 @@ def pq_index_and_dir(pq_index, pq_path=None):
     return pq_index, pq_path
 
 
-def process_document_tokens(token_file, dest_file, labels, max_token_count):
+def process_document_tokens(token_file, dest_file, labels):
     """Filter out short tokens, add computed features, and return index info."""
     slug = token_file.stem
    doc = pd.read_parquet(token_file).reset_index(drop=True)
 
-    doc = label_tokens(doc, labels, max_token_count)
+    doc = label_tokens(doc, labels)
 
     # Strip whitespace off all tokens.
     doc["token"] = doc.token.str.strip()
@@ -142,18 +141,15 @@ def process_document_tokens(token_file, dest_file, labels):
     return {"slug": slug, "length": len(doc), **labels, **best_matches}
 
 
-def label_tokens(tokens, labels, max_token_count):
+def label_tokens(tokens, labels):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
         match_fn = LABEL_COLS[col_name]
         max_token_count = MAX_TOKENS_BY_TARGET[col_name]
-        if col_name == "advertiser":
-            tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count, match_fn
-            )
-        else:
-            tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
+        tokens[col_name] = label_multitoken(
+            tokens.token.to_numpy(), label_value, max_token_count, match_fn
+        )
 
     return tokens
 
 
@@ -214,12 +210,7 @@ def add_base_features(token_df):
         default=TRAINING_DIR,
         help="directory of parquet files",
     )
-    parser.add_argument(
-        "--max-token-count",
-        type=int,
-        default=5,
-        help="maximum number of contiguous tokens to match against each label",
-    )
+
     parser.add_argument("--log-level", dest="log_level", default="INFO")
     args = parser.parse_args()
     logger.setLevel(args.log_level.upper())
@@ -230,4 +221,4 @@ def add_base_features(token_df):
     indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir)
     index.parent.mkdir(parents=True, exist_ok=True)
     outdir.mkdir(parents=True, exist_ok=True)
-    extend_and_write_docs(indir, manifest, index, outdir, args.max_token_count)
+    extend_and_write_docs(indir, manifest, index, outdir)
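Taken together (and noting that PATCH 8/8 applies to ef61a4b, the state before PATCH 6/8 and 7/8, per its index line), the series leaves label_tokens treating every label column the same way: look up its span limit in MAX_TOKENS_BY_TARGET and score it with label_multitoken. Reassembled from the hunks above as a sketch, not a verbatim copy of the resulting file:

    def label_tokens(tokens, labels):
        # Score each label column of the manifest against the document tokens.
        for col_name, label_value in labels.items():
            tokens[col_name] = 0.0
            match_fn = LABEL_COLS[col_name]
            # Per-label cap on how many contiguous tokens a match may span.
            max_token_count = MAX_TOKENS_BY_TARGET[col_name]
            tokens[col_name] = label_multitoken(
                tokens.token.to_numpy(), label_value, max_token_count, match_fn
            )

        return tokens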