Makefile (7 changes: 6 additions & 1 deletion)
@@ -62,10 +62,15 @@ data/token_frequency.csv: data/tokenized ## Produce token frequency csv file
 	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
 		python -m deepform.data.create_vocabulary
 
+data/3_year_manifest.csv: data/2012_manifest.tsv data/2014_manifest.tsv data/2020_manifest.csv ## Combine manifests from three years into one manifest with all three years of data
+	docker build -t $(CONTAINER) .
+	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
+		python -m deepform.data.combine_manifests
+
 data/doc_index.parquet: data/tokenized data/token_frequency.csv ## Create the training data from the token files and label manifest
 	docker build -t $(CONTAINER) .
 	docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \
-		python -m deepform.data.add_features data/fcc-data-2020-labeled-manifest.csv
+		python -m deepform.data.add_features data/3_year_manifest.csv
 
 .PHONY: train
 train: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run full model training
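
Reviewer note: deepform.data.combine_manifests itself is not part of this diff. Below is a minimal sketch of the merge the new target implies, assuming pandas and that the three manifests carry compatible label columns; the file handling here is an assumption, not the actual module.

# Hypothetical sketch only -- the real deepform/data/combine_manifests.py may differ.
from pathlib import Path

import pandas as pd

DATA = Path("/data")  # matches the bind-mount target used in the Makefile


def combine_manifests():
    # The Makefile dependencies imply TSV manifests for 2012/2014 and a CSV for 2020.
    frames = [
        pd.read_csv(DATA / "2012_manifest.tsv", sep="\t"),
        pd.read_csv(DATA / "2014_manifest.tsv", sep="\t"),
        pd.read_csv(DATA / "2020_manifest.csv"),
    ]
    # Stack the three years into one manifest; assumes the label columns line up.
    combined = pd.concat(frames, ignore_index=True, sort=False)
    combined.to_csv(DATA / "3_year_manifest.csv", index=False)


if __name__ == "__main__":
    combine_manifests()
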
deepform/data/add_features.py (37 changes: 19 additions & 18 deletions)
@@ -45,8 +45,17 @@ class TokenType(Enum):
     "gross_amount": dollar_similarity,
 }
 
+MAX_TOKENS_BY_TARGET = {
+    # Each label column, mapped to the maximum number of tokens it is expected to span.
+    "contract_num": 3,
+    "advertiser": 11,
+    "flight_from": 3,
+    "flight_to": 3,
+    "gross_amount": 3,
+}
+
 
-def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
+def extend_and_write_docs(source_dir, manifest, pq_index, out_path):
     """Split data into individual documents, add features, and write to parquet."""
 
     token_files = {p.stem: p for p in source_dir.glob("*.parquet")}
@@ -67,7 +76,6 @@ def extend_and_write_docs(source_dir, manifest, pq_index, out_path, max_token_count):
                 "token_file": token_files[slug],
                 "dest_file": out_path / f"{slug}.parquet",
                 "labels": labels,
-                "max_token_count": max_token_count,
             }
         )
 
@@ -98,12 +106,12 @@ def pq_index_and_dir(pq_index, pq_path=None):
     return pq_index, pq_path
 
 
-def process_document_tokens(token_file, dest_file, labels, max_token_count):
+def process_document_tokens(token_file, dest_file, labels):
     """Filter out short tokens, add computed features, and return index info."""
     slug = token_file.stem
     doc = pd.read_parquet(token_file).reset_index(drop=True)
 
-    doc = label_tokens(doc, labels, max_token_count)
+    doc = label_tokens(doc, labels)
 
     # Strip whitespace off all tokens.
     doc["token"] = doc.token.str.strip()
@@ -133,18 +141,16 @@ def process_document_tokens(token_file, dest_file, labels, max_token_count):
     return {"slug": slug, "length": len(doc), **labels, **best_matches}
 
 
-def label_tokens(tokens, labels, max_token_count):
+def label_tokens(tokens, labels):
     for col_name, label_value in labels.items():
         tokens[col_name] = 0.0
         match_fn = LABEL_COLS[col_name]
 
-        if col_name == "advertiser":
-            tokens[col_name] = label_multitoken(
-                tokens.token.to_numpy(), label_value, max_token_count, match_fn
-            )
-        else:
-            tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,))
+        max_token_count = MAX_TOKENS_BY_TARGET.get(col_name, 3)
 
+        tokens[col_name] = label_multitoken(
+            tokens.token.to_numpy(), label_value, max_token_count, match_fn
+        )
     return tokens
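
Reviewer note: label_multitoken is defined elsewhere in this module and is unchanged by this diff. For context, here is a rough stand-in showing the kind of sliding-window scoring its call signature suggests; the name and scoring details are assumptions.

import numpy as np


def label_multitoken_sketch(tokens, label_value, max_token_count, match_fn):
    # Hypothetical stand-in, not the real label_multitoken: give each token the best
    # match_fn similarity between the label and any window of up to max_token_count
    # consecutive tokens that contains it.
    scores = np.zeros(len(tokens))
    for width in range(1, max_token_count + 1):
        for start in range(len(tokens) - width + 1):
            window = " ".join(tokens[start : start + width])
            score = match_fn(window, label_value)
            end = start + width
            # Each token keeps the best score of any window it appears in.
            scores[start:end] = np.maximum(scores[start:end], score)
    return scores
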


@@ -205,12 +211,7 @@ def add_base_features(token_df):
         default=TRAINING_DIR,
         help="directory of parquet files",
     )
-    parser.add_argument(
-        "--max-token-count",
-        type=int,
-        default=5,
-        help="maximum number of contiguous tokens to match against each label",
-    )
+
     parser.add_argument("--log-level", dest="log_level", default="INFO")
     args = parser.parse_args()
     logger.setLevel(args.log_level.upper())
@@ -221,4 +222,4 @@ def add_base_features(token_df):
     indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir)
     index.parent.mkdir(parents=True, exist_ok=True)
     outdir.mkdir(parents=True, exist_ok=True)
-    extend_and_write_docs(indir, manifest, index, outdir, args.max_token_count)
+    extend_and_write_docs(indir, manifest, index, outdir)
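
Reviewer note: with the flag gone, training data is driven entirely by the combined manifest: the Makefile invocation reduces to python -m deepform.data.add_features data/3_year_manifest.csv, and per-label window sizes now come from MAX_TOKENS_BY_TARGET rather than the command line.
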
File renamed without changes.