I'm not entirely sure which records in my dataset caused this, so I can't offer much help in that regard, but there should probably be a guard against passing two empty strings to the function, along with a default score or fallback behavior for cases like this.
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in dedupe_dataframe(df, field_properties, canonicalize, config_name, update_model, threshold, sample_size, n_cores)
247 # Train or load the model
248 deduper = _train(settings_file, training_file, data_d, field_properties,
--> 249 sample_size, update_model, n_cores)
250
251 # Cluster the records
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in _train(settings_file, training_file, data, field_properties, sample_size, update_model, n_cores)
106
107 # Launch active learning
--> 108 deduper = _active_learning(data, sample_size, deduper, training_file, settings_file)
109
110 else:
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in _active_learning(data, sample_size, deduper, training_file, settings_file)
39 # To train dedupe, we feed it a sample of records.
40 sample_num = math.floor(len(data) * sample_size)
---> 41 deduper.prepare_training(data, sample_size=sample_num)
42
43 print('Starting active labeling...')
/opt/conda/lib/python3.7/site-packages/dedupe/api.py in prepare_training(self, data, training_file, sample_size, blocked_proportion)
1292 if training_file:
1293 self._read_training(training_file)
-> 1294 self._sample(data, sample_size, blocked_proportion)
1295
1296 def _sample(self,
/opt/conda/lib/python3.7/site-packages/dedupe/api.py in _sample(self, data, sample_size, blocked_proportion)
1322 blocked_proportion,
1323 sample_size,
-> 1324 index_include=examples)
1325
1326 self.active_learner.mark(examples, y)
/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in __init__(self, data_model, data, blocked_proportion, sample_size, index_include)
440
441 self.classifier = RLRLearner(self.data_model)
--> 442 self.classifier.candidates = self.candidates
443
444 self._common_init()
/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in candidates(self, new_candidates)
54 self._candidates = new_candidates
55
---> 56 self.distances = self.transform(self._candidates)
57
58 random_pair = random.choice(self._candidates)
/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in transform(self, pairs)
62
63 def transform(self, pairs):
---> 64 return self.data_model.distances(pairs)
65
66 def fit(self, X, y):
/opt/conda/lib/python3.7/site-packages/dedupe/datamodel.py in distances(self, record_pairs)
83 if record_1[field] is not None and record_2[field] is not None:
84 distances[i, start:stop] = compare(record_1[field],
---> 85 record_2[field])
86 elif hasattr(compare, 'missing'):
87 distances[i, start:stop] = compare(record_1[field],
affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()
affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()
ZeroDivisionError: normalizedAffineGapDistance cannot take two empty strings
I'm not entirely sure which records in my dataset caused this, so I can't offer much help in that regard, but there should probably be a guard against passing two empty strings to the function, along with a default score or fallback behavior for cases like this.
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in dedupe_dataframe(df, field_properties, canonicalize, config_name, update_model, threshold, sample_size, n_cores)
247 # Train or load the model
248 deduper = _train(settings_file, training_file, data_d, field_properties,
--> 249 sample_size, update_model, n_cores)
250
251 # Cluster the records
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in _train(settings_file, training_file, data, field_properties, sample_size, update_model, n_cores)
106
107 # Launch active learning
--> 108 deduper = _active_learning(data, sample_size, deduper, training_file, settings_file)
109
110 else:
/opt/conda/lib/python3.7/site-packages/pandas_dedupe/dedupe_dataframe.py in _active_learning(data, sample_size, deduper, training_file, settings_file)
39 # To train dedupe, we feed it a sample of records.
40 sample_num = math.floor(len(data) * sample_size)
---> 41 deduper.prepare_training(data, sample_size=sample_num)
42
43 print('Starting active labeling...')
/opt/conda/lib/python3.7/site-packages/dedupe/api.py in prepare_training(self, data, training_file, sample_size, blocked_proportion)
1292 if training_file:
1293 self._read_training(training_file)
-> 1294 self._sample(data, sample_size, blocked_proportion)
1295
1296 def _sample(self,
/opt/conda/lib/python3.7/site-packages/dedupe/api.py in _sample(self, data, sample_size, blocked_proportion)
1322 blocked_proportion,
1323 sample_size,
-> 1324 index_include=examples)
1325
1326 self.active_learner.mark(examples, y)
/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in __init__(self, data_model, data, blocked_proportion, sample_size, index_include)
440
441 self.classifier = RLRLearner(self.data_model)
--> 442 self.classifier.candidates = self.candidates
443
444 self._common_init()
/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in candidates(self, new_candidates)
54 self._candidates = new_candidates
55
---> 56 self.distances = self.transform(self._candidates)
57
58 random_pair = random.choice(self._candidates)
/opt/conda/lib/python3.7/site-packages/dedupe/labeler.py in transform(self, pairs)
62
63 def transform(self, pairs):
---> 64 return self.data_model.distances(pairs)
65
66 def fit(self, X, y):
/opt/conda/lib/python3.7/site-packages/dedupe/datamodel.py in distances(self, record_pairs)
83 if record_1[field] is not None and record_2[field] is not None:
84 distances[i, start:stop] = compare(record_1[field],
---> 85 record_2[field])
86 elif hasattr(compare, 'missing'):
87 distances[i, start:stop] = compare(record_1[field],
affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()
affinegap/affinegap.pyx in affinegap.affinegap.normalizedAffineGapDistance()
ZeroDivisionError: normalizedAffineGapDistance cannot take two empty strings