diff --git a/Makefile b/Makefile
index 56cb53c..3478319 100644
--- a/Makefile
+++ b/Makefile
@@ -62,10 +62,15 @@ data/token_frequency.csv: data/tokenized ## Produce token frequency csv file
 	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
 		python -m deepform.data.create_vocabulary
 
+data/3_year_manifest.csv: data/2012_manifest.tsv data/2014_manifest.tsv data/2020_manifest.csv ## Combine the manifests from three years into a single manifest covering all three
+	docker build -t $(CONTAINER) .
+	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
+		python -m deepform.data.combine_manifests
+
 data/doc_index.parquet: data/tokenized data/token_frequency.csv ## Create the training data from the token files and label manifest
 	docker build -t $(CONTAINER) .
 	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
-		python -m deepform.data.add_features data/fcc-data-2020-labeled-manifest.csv
+		python -m deepform.data.add_features data/3_year_manifest.csv
 
 .PHONY: train
 train: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run full model training
diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index 330a39a..14aa542 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -45,8 +45,17 @@ class TokenType(Enum):
     "gross_amount": dollar_similarity,
 }
 
+MAX_TOKENS_BY_TARGET = {
+    # Each label column, and the maximum number of tokens its value is expected to span.
+    "contract_num": 3,
+    "advertiser": 11,
+    "flight_from": 3,
+    "flight_to": 3,
+    "gross_amount": 3,
+}
+
 
-def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
+def extend_and_write_docs(source_dir, manifest, pq_index, out_path):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
@@ -67,7 +76,6 @@
                 "token_file": token_files[slug],
                 "dest_file": out_path / f"{slug}.parquet",
                 "labels": labels,
-                "max_token_count": max_token_count,
             }
         )
 
@@ -98,12 +106,12 @@ def pq_index_and_dir(pq_index, pq_path=None):
     return pq_index, pq_path
 
 
-def process_document_tokens(token_file, dest_file, labels, max_token_count):
+def process_document_tokens(token_file, dest_file, labels):
     """Filter out short tokens, add computed features, and return index info."""
     slug = token_file.stem
 
     doc = pd.read_parquet(token_file).reset_index(drop=True)
-    doc = label_tokens(doc, labels, max_token_count)
+    doc = label_tokens(doc, labels)
 
     # Strip whitespace off all tokens.
     doc["token"] = doc.token.str.strip()
@@ -133,18 +141,16 @@
     return {"slug": slug, "length": len(doc), **labels, **best_matches}
 
 
-def label_tokens(tokens, labels, max_token_count):
+def label_tokens(tokens, labels):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
         match_fn = LABEL_COLS[col_name]
 
-        if col_name == "advertiser":
-            tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count, match_fn
-            )
-        else:
-            tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
+        max_token_count = MAX_TOKENS_BY_TARGET.get(col_name, 3)
+        tokens[col_name] = label_multitoken(
+            tokens.token.to_numpy(), label_value, max_token_count, match_fn
+        )
 
     return tokens
 
 
@@ -205,12 +211,7 @@ def add_base_features(token_df):
         default=TRAINING_DIR,
         help="directory of parquet files",
     )
-    parser.add_argument(
-        "--max-token-count",
-        type=int,
-        default=5,
-        help="maximum number of contiguous tokens to match against each label",
-    )
+    parser.add_argument("--log-level", dest="log_level", default="INFO")
 
     args = parser.parse_args()
     logger.setLevel(args.log_level.upper())
@@ -221,4 +222,4 @@
     indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir)
     index.parent.mkdir(parents=True, exist_ok=True)
     outdir.mkdir(parents=True, exist_ok=True)
-    extend_and_write_docs(indir, manifest, index, outdir, args.max_token_count)
+    extend_and_write_docs(indir, manifest, index, outdir)
diff --git a/deepform/combine_manifests.py b/deepform/data/combine_manifests.py
similarity index 100%
rename from deepform/combine_manifests.py
rename to deepform/data/combine_manifests.py