Skip to content

Commit

Permalink
logger + nans
Browse files (browse the repository at this point in the history)
  • Loading branch information
R-Palazzo committed Oct 23, 2023
1 parent 7b360b5 commit b6354fd
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 12 deletions.
13 changes: 8 additions & 5 deletions sdmetrics/single_column/statistical/key_uniqueness.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Key Uniqueness Metric."""
import logging

from sdmetrics.errors import InvalidDataError
from sdmetrics.goal import Goal
from sdmetrics.single_column.base import SingleColumnMetric

LOGGER = logging.getLogger(__name__)


class KeyUniqueness(SingleColumnMetric):
"""Key uniqueness metric.
Expand Down
Expand Up
@@ -41,11 +43,12 @@ def compute_breakdown(cls, real_data, synthetic_data):
The score breakdown of the key uniqueness metric.
"""
has_duplicates = real_data.duplicated().any()
if has_duplicates:
raise InvalidDataError('The real data contains NA or duplicate values.')
has_nans = real_data.isna().any()
if has_duplicates or has_nans:
LOGGER.info('The real data contains NA or duplicate values.')

duplicates_synthetic = synthetic_data.duplicated()
score = 1 - duplicates_synthetic.sum() / len(synthetic_data)
nans_or_duplicates_synthetic = synthetic_data.duplicated() | synthetic_data.isna()
score = 1 - nans_or_duplicates_synthetic.sum() / len(synthetic_data)

return {'score': score}

Expand Down
15 changes: 8 additions & 7 deletions tests/unit/single_column/statistical/test_key_uniqueness.py
Original file line number Diff line number Diff line change
Expand Up
@@ -2,9 +2,7 @@

import numpy as np
import pandas as pd
import pytest

from sdmetrics.errors import InvalidDataError
from sdmetrics.single_column.statistical import KeyUniqueness


Expand All
@@ -22,19 +20,22 @@ def test_compute_breakdown(self):
result = metric.compute_breakdown(real_data, synthetic_data)

# Assert
assert result == {'score': 0.6}
assert result == {'score': 0.5}

def test_compute_breakdown_with_duplicates_in_real_data(self):
@patch('sdmetrics.single_column.statistical.key_uniqueness.LOGGER')
def test_compute_breakdown_with_duplicates_in_real_data(self, logger_mock):
"""Test the ``compute_breakdown`` method with duplicates in the real data."""
# Setup
real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2)
synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None])
metric = KeyUniqueness()

# Run and Assert
# Run
metric.compute_breakdown(real_data, synthetic_data)

# Assert
expected_message = 'The real data contains NA or duplicate values.'
with pytest.raises(InvalidDataError, match=expected_message):
metric.compute_breakdown(real_data, synthetic_data)
logger_mock.info.assert_called_once_with(expected_message)

@patch('sdmetrics.single_column.statistical.key_uniqueness.KeyUniqueness.compute_breakdown')
def test_compute(self, compute_breakdown_mock):
Expand Down

0 comments on commit b6354fd

Please sign in to comment.