Skip to content
This repository was archived by the owner on Jul 7, 2025. It is now read-only.
This repository was archived by the owner on Jul 7, 2025. It is now read-only.

Zero Division Error: two empty strings. #53

@kotoroshinoto

Description

@kotoroshinoto

I'm not entirely sure which records in my dataset caused this, so I can't offer much help in that regard, but there should probably be a guard against passing two empty strings to the function, along with a default score or behavior for cases like this.

/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in dedupe_dataframe(df, field_properties, canonicalize, config_name, update_model, threshold, sample_size, n_cores)
247 # Train or load the model
248 deduper = _train(settings_file, training_file, data_d, field_properties,
--> 249 sample_size, update_model, n_cores)
250
251 # Cluster the records

/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in _train(settings_file, training_file, data, field_properties, sample_size, update_model, n_cores)
106
107 # Launch active learning
--> 108 deduper = _active_learning(data, sample_size, deduper, training_file, settings_file)
109
110 else:

/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in _active_learning(data, sample_size, deduper, training_file, settings_file)
39 # To train dedupe, we feed it a sample of records.
40 sample_num = math.floor(len(data) * sample_size)
---> 41 deduper.prepare_training(data, sample_size=sample_num)
42
43 print('Starting active labeling...')

/opt/conda/lib/python3.7/site-packages/dedupe/api.py in prepare_training(self, data, training_file, sample_size, blocked_proportion)
1292 if training_file:
1293 self._read_training(training_file)
-> 1294 self._sample(data, sample_size, blocked_proportion)
1295
1296 def _sample(self,

/opt/conda/lib/python3.7/site-packages/dedupe/api.py in _sample(self, data, sample_size, blocked_proportion)
1322 blocked_proportion,
1323 sample_size,
-> 1324 index_include=examples)
1325
1326 self.active_learner.mark(examples, y)

/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in __init__(self, data_model, data, blocked_proportion, sample_size, index_include)
440
441 self.classifier = RLRLearner(self.data_model)
--> 442 self.classifier.candidates = self.candidates
443
444 self._common_init()

/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in candidates(self, new_candidates)
54 self._candidates = new_candidates
55
---> 56 self.distances = self.transform(self._candidates)
57
58 random_pair = random.choice(self._candidates)

/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in transform(self, pairs)
62
63 def transform(self, pairs):
---> 64 return self.data_model.distances(pairs)
65
66 def fit(self, X, y):

/opt/conda/lib/python3.7/site-packages/dedupe/datamodel.py in distances(self, record_pairs)
83 if record_1[field] is not None and record_2[field] is not None:
84 distances[i, start:stop] = compare(record_1[field],
---> 85 record_2[field])
86 elif hasattr(compare, 'missing'):
87 distances[i, start:stop] = compare(record_1[field],

affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()

affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()

ZeroDivisionError: normalizedAffineGapDistance cannot take two empty strings

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions