From 34f1a0673b1e7db16abffccef3e00042839f9284 Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Wed, 7 Oct 2020 14:36:53 -0600
Subject: [PATCH 1/8] added dictionary of max_token_lengths

---
 deepform/data/add_features.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index 330a39a..6829911 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -45,6 +45,15 @@ class TokenType(Enum):
     "gross_amount": dollar_similarity,
 }
 
+MAX_TOKENS_BY_TARGET = {
+    # Each label column, and the maximum expected tokens that it uses.
+    "contract_num": 3,
+    "advertiser": 11,
+    "flight_from": 3,
+    "flight_to": 3,
+    "gross_amount": 3,
+}
+
 
 def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
     """Split data into individual documents, add features, and write to parquet."""
@@ -137,10 +146,10 @@ def label_tokens(tokens, labels, max_token_count):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
         match_fn = LABEL_COLS[col_name]
-
+        max_token_count = MAX_TOKENS_BY_TARGET[col_name]
         if col_name == "advertiser":
             tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count, match_fn
+                tokens.token.to_numpy(), label_value, max_token_count[0], match_fn
             )
         else:
             tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))

From 60ce87614d9b84524d9d7604362f5b76be63fb10 Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Wed, 7 Oct 2020 14:49:29 -0600
Subject: [PATCH 2/8] removed offending subscript

---
 deepform/data/add_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index 6829911..ef61a4b 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -149,7 +149,7 @@ def label_tokens(tokens, labels, max_token_count):
         max_token_count = MAX_TOKENS_BY_TARGET[col_name]
         if col_name == "advertiser":
             tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count[0], match_fn
+                tokens.token.to_numpy(), label_value, max_token_count, match_fn
             )
         else:
             tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
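PATCH 2/8 above is needed because the value looked up from MAX_TOKENS_BY_TARGET is already a plain int, so the [0] subscript introduced in PATCH 1/8 raises a TypeError at runtime. A small illustration of the intended lookup, separate from the patches themselves (label_multitoken's signature is inferred from the call sites in the diff):

    # MAX_TOKENS_BY_TARGET maps each label column to an int span limit.
    limit = MAX_TOKENS_BY_TARGET["advertiser"]  # 11
    # limit[0] would raise TypeError: 'int' object is not subscriptable;
    # the int itself is what label_multitoken expects as its span limit.
    tokens[col_name] = label_multitoken(
        tokens.token.to_numpy(), label_value, limit, match_fn
    )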
From f8c4f7681381031a16e545f8febfaf65c20d9c0c Mon Sep 17 00:00:00 2001
From: Hugh Wimberly
Date: Wed, 7 Oct 2020 15:08:07 -0700
Subject: [PATCH 3/8] Allow token length to be passed in from the command line

---
 deepform/data/add_features.py | 43 ++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index ef61a4b..ed3f6e4 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -55,7 +55,9 @@ class TokenType(Enum):
 }
 
 
-def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
+def extend_and_write_docs(
+    source_dir, manifest, pq_index, out_path, max_token_count=None
+):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
@@ -107,7 +109,7 @@ def pq_index_and_dir(pq_index, pq_path=None):
     return pq_index, pq_path
 
 
-def process_document_tokens(token_file, dest_file, labels, max_token_count):
+def process_document_tokens(token_file, dest_file, labels, max_token_count=None):
     """Filter out short tokens, add computed features, and return index info."""
     slug = token_file.stem
     doc = pd.read_parquet(token_file).reset_index(drop=True)
@@ -142,17 +144,18 @@ def process_document_tokens(token_file, dest_file, labels, max_token_count):
     return {"slug": slug, "length": len(doc), **labels, **best_matches}
 
 
-def label_tokens(tokens, labels, max_token_count):
+def label_tokens(tokens, labels, max_token_count=None):
+    if max_token_count is None:
+        max_token_count = MAX_TOKENS_BY_TARGET
+    max_token_count = {**MAX_TOKENS_BY_TARGET, **max_token_count}
+
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
-        match_fn = LABEL_COLS[col_name]
-        max_token_count = MAX_TOKENS_BY_TARGET[col_name]
-        if col_name == "advertiser":
-            tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count, match_fn
-            )
-        else:
-            tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
+        match_fn = LABEL_COLS.get(col_name, default=default_similarity)
+        span_limit = max_token_count.get(col_name, default=3)
+        tokens[col_name] = label_multitoken(
+            tokens.token.to_numpy(), label_value, span_limit, match_fn
+        )
 
     return tokens
 
@@ -214,12 +217,12 @@ def add_base_features(token_df):
         default=TRAINING_DIR,
         help="directory of parquet files",
     )
-    parser.add_argument(
-        "--max-token-count",
-        type=int,
-        default=5,
-        help="maximum number of contiguous tokens to match against each label",
-    )
+    for key in LABEL_COLS:
+        parser.add_argument(
+            f"--max-span-{key}",
+            type=int,
+            help=f"maximum number of contiguous tokens to match {key} against",
+        )
     parser.add_argument("--log-level", dest="log_level", default="INFO")
     args = parser.parse_args()
     logger.setLevel(args.log_level.upper())
@@ -230,4 +233,8 @@ def add_base_features(token_df):
     indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir)
     index.parent.mkdir(parents=True, exist_ok=True)
    outdir.mkdir(parents=True, exist_ok=True)
-    extend_and_write_docs(indir, manifest, index, outdir, args.max_token_count)
+
+    max_token_count = {
+        key[11:]: value for key, value in args.items() if key.startswith("--max-span-")
+    }
+    extend_and_write_docs(indir, manifest, index, outdir, max_token_count)
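The next patch reverts this change. For reference, the idea it sketches (one --max-span-&lt;label&gt; flag per label column, merged over the MAX_TOKENS_BY_TARGET defaults) could be wired up roughly as below. This is a hypothetical sketch, not code from the series; it relies on argparse deriving the attribute name max_span_&lt;label&gt; from each flag and on reading the parsed values back through vars(args) rather than args.items():

    # Hypothetical wiring for per-label span limits on the command line.
    for key in LABEL_COLS:
        parser.add_argument(
            f"--max-span-{key}",  # argparse stores this as args.max_span_<key>
            type=int,
            help=f"maximum number of contiguous tokens to match {key} against",
        )
    args = parser.parse_args()
    overrides = {
        name[len("max_span_"):]: value
        for name, value in vars(args).items()
        if name.startswith("max_span_") and value is not None
    }
    # Labels without a flag keep their MAX_TOKENS_BY_TARGET defaults.
    max_token_count = {**MAX_TOKENS_BY_TARGET, **overrides}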
From 79190b4e391fff165cc9c2638a16a2cc23cf7428 Mon Sep 17 00:00:00 2001
From: Hugh Wimberly
Date: Wed, 7 Oct 2020 15:12:18 -0700
Subject: [PATCH 4/8] Revert "Allow token length to be passed in from the command line"

This reverts commit f8c4f7681381031a16e545f8febfaf65c20d9c0c.
---
 deepform/data/add_features.py | 43 +++++++++++++++--------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index ed3f6e4..ef61a4b 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -55,9 +55,7 @@ class TokenType(Enum):
 }
 
 
-def extend_and_write_docs(
-    source_dir, manifest, pq_index, out_path, max_token_count=None
-):
+def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
@@ -109,7 +107,7 @@ def pq_index_and_dir(pq_index, pq_path=None):
     return pq_index, pq_path
 
 
-def process_document_tokens(token_file, dest_file, labels, max_token_count=None):
+def process_document_tokens(token_file, dest_file, labels, max_token_count):
     """Filter out short tokens, add computed features, and return index info."""
     slug = token_file.stem
     doc = pd.read_parquet(token_file).reset_index(drop=True)
@@ -144,18 +142,17 @@ def process_document_tokens(token_file, dest_file, labels, max_token_count=None)
     return {"slug": slug, "length": len(doc), **labels, **best_matches}
 
 
-def label_tokens(tokens, labels, max_token_count=None):
-    if max_token_count is None:
-        max_token_count = MAX_TOKENS_BY_TARGET
-    max_token_count = {**MAX_TOKENS_BY_TARGET, **max_token_count}
-
+def label_tokens(tokens, labels, max_token_count):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
-        match_fn = LABEL_COLS.get(col_name, default=default_similarity)
-        span_limit = max_token_count.get(col_name, default=3)
-        tokens[col_name] = label_multitoken(
-            tokens.token.to_numpy(), label_value, span_limit, match_fn
-        )
+        match_fn = LABEL_COLS[col_name]
+        max_token_count = MAX_TOKENS_BY_TARGET[col_name]
+        if col_name == "advertiser":
+            tokens[col_name] = label_multitoken(
+                tokens.token.to_numpy(), label_value, max_token_count, match_fn
+            )
+        else:
+            tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
 
     return tokens
 
@@ -217,12 +214,12 @@ def add_base_features(token_df):
         default=TRAINING_DIR,
         help="directory of parquet files",
     )
-    for key in LABEL_COLS:
-        parser.add_argument(
-            f"--max-span-{key}",
-            type=int,
-            help=f"maximum number of contiguous tokens to match {key} against",
-        )
+    parser.add_argument(
+        "--max-token-count",
+        type=int,
+        default=5,
+        help="maximum number of contiguous tokens to match against each label",
+    )
     parser.add_argument("--log-level", dest="log_level", default="INFO")
     args = parser.parse_args()
     logger.setLevel(args.log_level.upper())
@@ -233,8 +230,4 @@ def add_base_features(token_df):
     indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir)
     index.parent.mkdir(parents=True, exist_ok=True)
     outdir.mkdir(parents=True, exist_ok=True)
-
-    max_token_count = {
-        key[11:]: value for key, value in args.items() if key.startswith("--max-span-")
-    }
-    extend_and_write_docs(indir, manifest, index, outdir, max_token_count)
+    extend_and_write_docs(indir, manifest, index, outdir, args.max_token_count)
From 6c25da44d060ae63f637d4a3018345362ff2d3a0 Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Sat, 10 Oct 2020 13:32:45 -0600
Subject: [PATCH 5/8] updated makefile to use 3_year_manifest

---
 Makefile                                 | 7 ++++++-
 deepform/{ => data}/combine_manifests.py | 0
 2 files changed, 6 insertions(+), 1 deletion(-)
 rename deepform/{ => data}/combine_manifests.py (100%)

diff --git a/Makefile b/Makefile
index 56cb53c..3478319 100644
--- a/Makefile
+++ b/Makefile
@@ -62,10 +62,15 @@ data/token_frequency.csv: data/tokenized ## Produce token frequency csv file
 	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
 		python -m deepform.data.create_vocabulary
 
+data/3_year_manifest.csv: data/2012_manifest.tsv data/2014_manifest.tsv data/2020_manifest.csv ## combine manifests from three yuears into one manifest with all three years data
+	docker build -t $(CONTAINER) .
+	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
+		python -m deepform.data.combine_manifests
+
 data/doc_index.parquet: data/tokenized data/token_frequency.csv ## Create the training data from the token files and label manifest
 	docker build -t $(CONTAINER) .
 	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
-		python -m deepform.data.add_features data/fcc-data-2020-labeled-manifest.csv
+		python -m deepform.data.add_features data/3_year_manifest.csv
 
 .PHONY: train
 train: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run full model training

diff --git a/deepform/combine_manifests.py b/deepform/data/combine_manifests.py
similarity index 100%
rename from deepform/combine_manifests.py
rename to deepform/data/combine_manifests.py

From 017cf07f25eb2cb3caaa724093223b150c820efc Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Sat, 10 Oct 2020 12:34:41 -0700
Subject: [PATCH 6/8] Update deepform/data/add_features.py

Co-authored-by: Hugh Wimberly
---
 deepform/data/add_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index ef61a4b..dd9653d 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -146,7 +146,7 @@ def label_tokens(tokens, labels, max_token_count):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
         match_fn = LABEL_COLS[col_name]
-        max_token_count = MAX_TOKENS_BY_TARGET[col_name]
+        max_token_count = MAX_TOKENS_BY_TARGET.get(col_name, default=3)
         if col_name == "advertiser":
             tokens[col_name] = label_multitoken(
                 tokens.token.to_numpy(), label_value, max_token_count, match_fn

From 9038f1bc956fc06d85d15059dd5000b41c59db3a Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Sat, 10 Oct 2020 12:38:40 -0700
Subject: [PATCH 7/8] Update deepform/data/add_features.py

Co-authored-by: Hugh Wimberly
---
 deepform/data/add_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index dd9653d..5c66d51 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -55,7 +55,7 @@ class TokenType(Enum):
 }
 
 
-def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
+def extend_and_write_docs(source_dir, manifest, pq_index, out_path):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
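One note on the lookup suggested in PATCH 6/8: dict.get does not accept its fallback as a keyword argument, so MAX_TOKENS_BY_TARGET.get(col_name, default=3) raises a TypeError. The intended per-label lookup with a default of 3 would pass the fallback positionally; a one-line illustration, not taken from the patches:

    # dict.get takes its fallback positionally; unknown labels fall back to 3 tokens.
    max_token_count = MAX_TOKENS_BY_TARGET.get(col_name, 3)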
From 37f0cccd7d0a9d3401ba009446a1ee6b044a8dfc Mon Sep 17 00:00:00 2001
From: Gray Davidson
Date: Sat, 10 Oct 2020 13:42:12 -0600
Subject: [PATCH 8/8] removing max_token_count from args, removing conditional on advertiser

---
 deepform/data/add_features.py | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index ef61a4b..84dbe90 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -55,7 +55,7 @@ class TokenType(Enum):
 }
 
 
-def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
+def extend_and_write_docs(source_dir, manifest, pq_index, out_path):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
@@ -76,7 +76,6 @@ def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_co
                 "token_file": token_files[slug],
                 "dest_file": out_path / f"{slug}.parquet",
                 "labels": labels,
-                "max_token_count": max_token_count,
             }
         )
 
@@ -107,12 +106,12 @@ def pq_index_and_dir(pq_index, pq_path=None):
     return pq_index, pq_path
 
 
-def process_document_tokens(token_file, dest_file, labels, max_token_count):
+def process_document_tokens(token_file, dest_file, labels):
     """Filter out short tokens, add computed features, and return index info."""
     slug = token_file.stem
    doc = pd.read_parquet(token_file).reset_index(drop=True)
 
-    doc = label_tokens(doc, labels, max_token_count)
+    doc = label_tokens(doc, labels)
 
     # Strip whitespace off all tokens.
     doc["token"] = doc.token.str.strip()
@@ -142,18 +141,15 @@ def process_document_tokens(token_file, dest_file, labels):
     return {"slug": slug, "length": len(doc), **labels, **best_matches}
 
 
-def label_tokens(tokens, labels, max_token_count):
+def label_tokens(tokens, labels):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
         match_fn = LABEL_COLS[col_name]
         max_token_count = MAX_TOKENS_BY_TARGET[col_name]
-        if col_name == "advertiser":
-            tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count, match_fn
-            )
-        else:
-            tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
+        tokens[col_name] = label_multitoken(
+            tokens.token.to_numpy(), label_value, max_token_count, match_fn
+        )
 
     return tokens
 
 
@@ -214,12 +210,7 @@ def add_base_features(token_df):
         default=TRAINING_DIR,
         help="directory of parquet files",
     )
-    parser.add_argument(
-        "--max-token-count",
-        type=int,
-        default=5,
-        help="maximum number of contiguous tokens to match against each label",
-    )
+
     parser.add_argument("--log-level", dest="log_level", default="INFO")
     args = parser.parse_args()
     logger.setLevel(args.log_level.upper())
@@ -230,4 +221,4 @@ def add_base_features(token_df):
     indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir)
     index.parent.mkdir(parents=True, exist_ok=True)
     outdir.mkdir(parents=True, exist_ok=True)
-    extend_and_write_docs(indir, manifest, index, outdir, args.max_token_count)
+    extend_and_write_docs(indir, manifest, index, outdir)
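Taken together (and noting that PATCH 8/8 applies to ef61a4b, the state before PATCH 6/8 and 7/8, per its index line), the series leaves label_tokens treating every label column the same way: look up its span limit in MAX_TOKENS_BY_TARGET and score it with label_multitoken. Reassembled from the hunks above as a sketch, not a verbatim copy of the resulting file:

    def label_tokens(tokens, labels):
        # Score each label column of the manifest against the document tokens.
        for col_name, label_value in labels.items():
            tokens[col_name] = 0.0
            match_fn = LABEL_COLS[col_name]
            # Per-label cap on how many contiguous tokens a match may span.
            max_token_count = MAX_TOKENS_BY_TARGET[col_name]
            tokens[col_name] = label_multitoken(
                tokens.token.to_numpy(), label_value, max_token_count, match_fn
            )

        return tokens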