From b6354fd6ad719e15c0a34900978540ac1807898d Mon Sep 17 00:00:00 2001
From: R-Palazzo
Date: Mon, 23 Oct 2023 14:52:54 -0600
Subject: [PATCH] logger + nans

KeyUniqueness.compute_breakdown no longer raises InvalidDataError when the
real data contains duplicates. It now checks the real data for duplicates
and for NA values and, if either is found, logs
'The real data contains NA or duplicate values.' through the module-level
LOGGER at INFO level, then goes on to compute the score.

The score now also counts NA values in the synthetic data as non-unique:
it is 1 minus the fraction of synthetic rows that are duplicated or NA.

The unit tests are updated to match: the expected score in
test_compute_breakdown changes from 0.6 to 0.5, and the duplicate-real-data
test asserts on the patched LOGGER instead of expecting an exception.
---
Review notes (ignored by git am; not part of the commit message):
* The score still divides by len(synthetic_data) and an empty synthetic
  series is not guarded. This is unchanged by this patch.
* Callers that caught InvalidDataError from compute_breakdown will no
  longer see it. TODO: confirm none exist outside the files touched here.
* Line structure of this patch was restored from a whitespace-collapsed
  copy. Leading indentation of context lines and the position of one blank
  context line in the last hunk were inferred from the hunk line counts;
  verify against the target tree before applying.

 .../single_column/statistical/key_uniqueness.py | 13 ++++++++-----
 .../statistical/test_key_uniqueness.py          | 15 ++++++++-------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/sdmetrics/single_column/statistical/key_uniqueness.py b/sdmetrics/single_column/statistical/key_uniqueness.py
index be25c0e8..4b3bda41 100644
--- a/sdmetrics/single_column/statistical/key_uniqueness.py
+++ b/sdmetrics/single_column/statistical/key_uniqueness.py
@@ -1,9 +1,11 @@
 """Key Uniqueness Metric."""
+import logging
 
-from sdmetrics.errors import InvalidDataError
 from sdmetrics.goal import Goal
 from sdmetrics.single_column.base import SingleColumnMetric
 
+LOGGER = logging.getLogger(__name__)
+
 
 class KeyUniqueness(SingleColumnMetric):
     """Key uniqueness metric.
@@ -41,11 +43,12 @@ def compute_breakdown(cls, real_data, synthetic_data):
                 The score breakdown of the key uniqueness metric.
         """
         has_duplicates = real_data.duplicated().any()
-        if has_duplicates:
-            raise InvalidDataError('The real data contains NA or duplicate values.')
+        has_nans = real_data.isna().any()
+        if has_duplicates or has_nans:
+            LOGGER.info('The real data contains NA or duplicate values.')
 
-        duplicates_synthetic = synthetic_data.duplicated()
-        score = 1 - duplicates_synthetic.sum() / len(synthetic_data)
+        nans_or_duplicates_synthetic = synthetic_data.duplicated() | synthetic_data.isna()
+        score = 1 - nans_or_duplicates_synthetic.sum() / len(synthetic_data)
 
         return {'score': score}
 
diff --git a/tests/unit/single_column/statistical/test_key_uniqueness.py b/tests/unit/single_column/statistical/test_key_uniqueness.py
index 797e75e9..d95fa7eb 100644
--- a/tests/unit/single_column/statistical/test_key_uniqueness.py
+++ b/tests/unit/single_column/statistical/test_key_uniqueness.py
@@ -2,9 +2,7 @@
 
 import numpy as np
 import pandas as pd
-import pytest
 
-from sdmetrics.errors import InvalidDataError
 from sdmetrics.single_column.statistical import KeyUniqueness
 
 
@@ -22,19 +20,22 @@ def test_compute_breakdown(self):
         result = metric.compute_breakdown(real_data, synthetic_data)
 
         # Assert
-        assert result == {'score': 0.6}
+        assert result == {'score': 0.5}
 
-    def test_compute_breakdown_with_duplicates_in_real_data(self):
+    @patch('sdmetrics.single_column.statistical.key_uniqueness.LOGGER')
+    def test_compute_breakdown_with_duplicates_in_real_data(self, logger_mock):
         """Test the ``compute_breakdown`` method with duplicates in the real data."""
         # Setup
         real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2)
         synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None])
         metric = KeyUniqueness()
 
-        # Run and Assert
+        # Run
+        metric.compute_breakdown(real_data, synthetic_data)
+
+        # Assert
         expected_message = 'The real data contains NA or duplicate values.'
-        with pytest.raises(InvalidDataError, match=expected_message):
-            metric.compute_breakdown(real_data, synthetic_data)
+        logger_mock.info.assert_called_once_with(expected_message)
 
     @patch('sdmetrics.single_column.statistical.key_uniqueness.KeyUniqueness.compute_breakdown')
     def test_compute(self, compute_breakdown_mock):