Skip to content

Commit

Permalink
logger + test
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo committed Jun 23, 2023
1 parent ef91b68 commit cdb5787
Show file tree
Hide file tree
Showing 2 changed files with 203 additions and 57 deletions.
27 changes: 15 additions & 12 deletions sdmetrics/reports/single_table/_properties/column_pair_trends.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import itertools
import logging

import numpy as np
import pandas as pd
Expand All @@ -9,6 +10,8 @@
from sdmetrics.reports.single_table._properties import BaseSingleTableProperty
from sdmetrics.utils import create_unique_name, is_datetime

LOGGER = logging.getLogger(__name__)


class ColumnPairTrends(BaseSingleTableProperty):
"""Column pair trends property.
Expand Down Expand Up @@ -64,6 +67,7 @@ def _convert_datetime_columns_to_numeric(self, data, metadata):
except Exception as e:
message = f'Error: {type(e).__name__} {e}'
self._columns_datetime_conversion_failed[column_name] = message
LOGGER.debug(message)
continue

return data
Expand Down Expand Up @@ -94,6 +98,7 @@ def _discretize_column(self, column_name, data, bin_edges=None):
except Exception as e:
message = f'Error: {type(e).__name__} {e}'
self._columns_discretization_failed[column_name] = message
LOGGER.debug(message)

return column_result, bin_edges

Expand Down Expand Up @@ -147,7 +152,7 @@ def _get_metric(self, sdtype_col_1, sdtype_col_2):

return metric

def _get_columns_data(self, column_name_1, column_name_2, real_data, synthetic_data, metadata):
def _get_columns_data(self, column_name_1, column_name_2, data, metadata):
"""Get the data for the property.
If one is comparing a continuous column to a discrete column, use the discrete version
Expand All @@ -158,10 +163,8 @@ def _get_columns_data(self, column_name_1, column_name_2, real_data, synthetic_d
The name of the first column
column_name_2 (str):
The name of the second column
real_data (pandas.DataFrame):
The real data
synthetic_data (pandas.DataFrame):
The synthetic data
data (pandas.DataFrame):
The data
metadata (dict):
The metadata of the table
"""
Expand All @@ -176,10 +179,7 @@ def _get_columns_data(self, column_name_1, column_name_2, real_data, synthetic_d
else:
col_name_2 = create_unique_name(column_name_2 + '_discrete', metadata['columns'])

columns_real = real_data[[col_name_1, col_name_2]]
columns_synthetic = synthetic_data[[col_name_1, col_name_2]]

return columns_real, columns_synthetic
return data[[col_name_1, col_name_2]]

def _required_preprocessing(self, sdtype_col_1, sdtype_col_2):
"""Check if a processing of one of the columns was required to compute the metric.
Expand Down Expand Up @@ -274,9 +274,11 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar):
if error:
raise Exception('Preprocessing failed')

columns_real, columns_synthetic = self._get_columns_data(
column_name_1, column_name_2, processed_real_data,
processed_synthetic_data, metadata
columns_real = self._get_columns_data(
column_name_1, column_name_2, processed_real_data, metadata
)
columns_synthetic = self._get_columns_data(
column_name_1, column_name_2, processed_synthetic_data, metadata
)

score_breakdown = metric.compute_breakdown(
Expand All @@ -296,6 +298,7 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar):
synthetic_correlation = np.nan
if not str(e) == 'Preprocessing failed':
error = f'Error: {type(e).__name__} {e}'
LOGGER.debug(error)

column_names_1.append(column_name_1)
column_names_2.append(column_name_2)
Expand Down
233 changes: 188 additions & 45 deletions tests/unit/reports/single_table/_properties/test_column_pair_trends.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,79 @@
from unittest.mock import Mock, patch, call
import itertools
from unittest.mock import Mock, patch

import pandas as pd
import numpy as np
import pandas as pd

from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends


class TestColumnPairTrends:

def test__convert_datetime_columns_to_numeric(self):
    """Check the in-place datetime conversion done by ``_convert_datetime_columns_to_numeric``.

    The datetime column without missing values is expected to end up as ``int64``,
    the one containing ``None`` as ``float64``, and the column holding an
    unparsable value must be recorded in ``_columns_datetime_conversion_failed``
    together with its error message.
    """
    # Setup
    datetime_meta = {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'}
    metadata = {
        'columns': {
            'col1': {'sdtype': 'numerical'},
            'col2': {'sdtype': 'boolean'},
            'col3': {'sdtype': 'categorical'},
            'col4': dict(datetime_meta),
            'col5': dict(datetime_meta),
            'col6': dict(datetime_meta),
        }
    }
    data = pd.DataFrame({
        'col1': [1, 2, 3],
        'col2': [False, True, True],
        'col3': ['a', 'b', 'c'],
        'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-01']),
        'col5': [None, '2020-01-02', '2020-01-03'],
        'col6': ['error', '2020-01-02', '2020-01-03'],
    })
    instance = ColumnPairTrends()

    # Run
    instance._convert_datetime_columns_to_numeric(data, metadata)

    # Assert
    expected_message = (
        'Error: ValueError time data "error" doesn\'t match format "%Y-%m-%d", at '
        'position 0. You might want to try:\n - passing `format` if your strings'
        " have a consistent format;\n - passing `format=\'ISO8601\'` if your"
        ' strings are all ISO8601 but not necessarily in exactly the same format;\n'
        " - passing `format=\'mixed\'`, and the format will be inferred for each"
        ' element individually. You might want to use `dayfirst` alongside this.'
    )
    failures = instance._columns_datetime_conversion_failed

    assert data['col4'].dtype == np.int64
    assert data['col5'].dtype == np.float64
    assert 'col6' in failures
    assert failures['col6'] == expected_message

def test__discretize_column(self):
    """Check ``_discretize_column`` on a failing, an integer and a float column.

    The string column must be registered in ``_columns_discretization_failed``.
    The float column is discretized with the bin edges returned for the integer
    column, so both results are expressed against the same bins.
    """
    # Setup
    frame = pd.DataFrame({
        'err_col': ['a', 'b', 'c', 'd', 'e'],
        'int_col': [1, 2, 3, None, 5],
        'float_col': [1.1, np.nan, 3.3, 4.4, 5.5],
    })
    instance = ColumnPairTrends()

    # Run
    instance._discretize_column('err_col', frame['err_col'])
    int_result, int_edges = instance._discretize_column('int_col', frame['int_col'])
    float_result, _ = instance._discretize_column(
        'float_col', frame['float_col'], int_edges
    )

    # Assert
    assert 'err_col' in instance._columns_discretization_failed
    assert list(int_result) == [1, 3, 6, 11, 11]
    assert list(float_result) == [1, 11, 6, 9, 11]

def test__get_processed_data(self):
"""Test the ``_get_processed_data`` method."""
# Setup
Expand All @@ -32,7 +97,9 @@ def test__get_processed_data(self):
processed_data = cpt_property._get_processed_data(data, metadata)

# Assert
expected_datetime = pd.to_numeric(pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']))
expected_datetime = pd.to_numeric(
pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03'])
)
expected_processed_real = pd.DataFrame({
'col1': [1, 2, 3],
'col2': [False, True, True],
Expand Down Expand Up @@ -83,7 +150,10 @@ def test__get_processed_data_with_nans(self):
pd.testing.assert_frame_equal(processed_data, expected_processed_real)

def test__get_metric(self):
"""Test the ``_get_metric`` method."""
"""Test the ``_get_metric`` method.
The method should return the correct metric for each combination of column types.
"""
# Setup
cpt = ColumnPairTrends()

Expand All @@ -99,11 +169,100 @@ def test__get_metric(self):
cpt._get_metric('categorical', 'categorical').__name__ == 'ContingencySimilarity'
cpt._get_metric('boolean', 'boolean').__name__ == 'ContingencySimilarity'

@patch('sdmetrics.reports.single_table._properties.column_pair_trends'
'.CorrelationSimilarity.compute_breakdown')
@patch('sdmetrics.reports.single_table._properties.column_pair_trends'
'.ContingencySimilarity.compute_breakdown')
def test__generate_details(self, contingency_compute_mock, correlation_compute_mock):
def test__get_columns_data(self):
    """Test the ``_get_columns_data`` method.

    For each pair of columns declared in the metadata, the method should return
    the two-column slice of ``data`` listed in ``expected_return``: the original
    column or its ``_discrete`` counterpart, depending on the pair of sdtypes.

    The method name starts with ``test`` so that pytest collects it.
    """
    # Setup
    data = pd.DataFrame({
        'col1': [1, 2, 3],
        'col2': [False, True, True],
        'col3': ['a', 'b', 'c'],
        'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']),
        'col1_discrete': [1, 6, 11],
        'col4_discrete': [1, 6, 11],
    })
    metadata = {
        'columns': {
            'col1': {'sdtype': 'numerical'},
            'col2': {'sdtype': 'boolean'},
            'col3': {'sdtype': 'categorical'},
            'col4': {'sdtype': 'datetime'},
        }
    }

    cpt_property = ColumnPairTrends()

    expected_return = [
        data[['col1_discrete', 'col2']],
        data[['col1_discrete', 'col3']],
        data[['col1', 'col4']],
        data[['col2', 'col3']],
        data[['col2', 'col4_discrete']],
        data[['col3', 'col4_discrete']],
    ]

    # Pairs are built from the metadata columns, not from ``data.columns``: the
    # ``*_discrete`` helper columns have no metadata entry and are not part of
    # the six expected pairs.
    column_pairs = list(itertools.combinations(metadata['columns'], 2))

    # Run and Assert
    # Guard against ``zip`` silently truncating if the two lists ever diverge.
    assert len(column_pairs) == len(expected_return)
    for (col1, col2), expected in zip(column_pairs, expected_return):
        columns_data = cpt_property._get_columns_data(col1, col2, data, metadata)
        pd.testing.assert_frame_equal(columns_data, expected)

def test_required_preprocessing(self):
    """Test the ``_required_preprocessing`` method.

    Each case pairs two sdtypes with the boolean the method is expected to
    return for them; ``True`` means one of the columns has been preprocessed.
    """
    # Setup
    cases = [
        (('datetime', 'datetime'), True),
        (('numerical', 'numerical'), False),
        (('datetime', 'numerical'), True),
        (('datetime', 'categorical'), True),
        (('datetime', 'boolean'), True),
        (('numerical', 'categorical'), True),
        (('numerical', 'boolean'), True),
        (('categorical', 'boolean'), False),
        (('categorical', 'categorical'), False),
        (('boolean', 'boolean'), False),
    ]
    instance = ColumnPairTrends()

    # Run and Assert
    for (first_sdtype, second_sdtype), expected in cases:
        outcome = instance._required_preprocessing(first_sdtype, second_sdtype)
        assert outcome == expected

def test_preprocessing_failed(self):
    """Test the ``_preprocessing_failed`` method.

    With one recorded datetime-conversion failure and one recorded
    discretization failure, the method should return the stored message for a
    pair involving the failed column, and ``None`` for a pair with no failure.
    """
    # Setup
    instance = ColumnPairTrends()
    instance._columns_datetime_conversion_failed = {'col1': 'Error1'}
    instance._columns_discretization_failed = {'col3': 'Error3'}

    failing_cases = [
        (('col1', 'col2', 'datetime', 'datetime'), 'Error1'),
        (('col2', 'col1', 'datetime', 'datetime'), 'Error1'),
        (('col3', 'col4', 'numerical', 'boolean'), 'Error3'),
    ]

    # Run and Assert
    for arguments, expected_message in failing_cases:
        assert instance._preprocessing_failed(*arguments) == expected_message

    no_failure = instance._preprocessing_failed('col2', 'col4', 'datetime', 'boolean')
    assert no_failure is None

@patch('sdmetrics.reports.single_table._properties.column_pair_trends.'
'ContingencySimilarity.compute_breakdown')
@patch('sdmetrics.reports.single_table._properties.column_pair_trends.'
'CorrelationSimilarity.compute_breakdown')
def test__generate_details(self, correlation_compute_mock, contingency_compute_mock):
"""Test the ``_generate_details`` method."""
# Setup
real_data = pd.DataFrame({
Expand All @@ -114,62 +273,45 @@ def test__generate_details(self, contingency_compute_mock, correlation_compute_m
})

synthetic_data = pd.DataFrame({
'col1': [4, 5, 6],
'col1': [3, 1, 2],
'col2': [False, False, True],
'col3': ['a', 'b', 'b'],
'col4': pd.to_datetime(['2020-01-04', '2020-01-05', '2020-01-06']),
'col4': pd.to_datetime(['2020-01-03', '2020-01-01', '2020-01-02']),
})

metadata = {'columns': {
'col1': {'sdtype': 'numerical'},
'col2': {'sdtype': 'boolean'},
'col3': {'sdtype': 'categorical'},
'col4': {'sdtype': 'datetime'},
}}
metadata = {
'columns': {
'col1': {'sdtype': 'numerical'},
'col2': {'sdtype': 'boolean'},
'col3': {'sdtype': 'categorical'},
'col4': {'sdtype': 'datetime'},
}
}

processed_real = pd.DataFrame({
'col1': [1, 2, 3],
'col2': [False, True, True],
'col3': ['a', 'b', 'c'],
'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']),
'col1_discrete': [1, 2, 3],
'col4_discrete': [0, 1, 2],
'col4': [1577836800000000000, 1577923200000000000, 1578009600000000000],
'col1_discrete': [1, 6, 11],
'col4_discrete': [1, 6, 11],
})

processed_synthetic = pd.DataFrame({
'col1': [4, 5, 6],
'col1': [3, 1, 2],
'col2': [False, False, True],
'col3': ['a', 'b', 'b'],
'col4': pd.to_datetime(['2020-01-04', '2020-01-05', '2020-01-06']),
'col1_discrete': [4, 5, 6],
'col4_discrete': [3, 4, 5],
'col4': [1578009600000000000, 1577836800000000000, 1577923200000000000],
'col1_discrete': [11, 1, 6],
'col4_discrete': [11, 1, 6],
})

cpt_property = ColumnPairTrends()

mock_processed_data = Mock()
cpt_property._get_processed_data = mock_processed_data

mock_get_columns_data = Mock()
cpt_property._get_columns_data = mock_get_columns_data


# Run
cpt_property._generate_details(real_data, synthetic_data, metadata, None)

# Assert
mock_processed_data.assert_has_calls(
[call(real_data, metadata), call(synthetic_data, metadata)]
)

columns = ['col1', 'col2', 'col3', 'col4']
list_calls = [
call(col_name_1, col_name_2, processed_real, processed_synthetic, metadata)
for col_name_1, col_name_2 in itertools.combinations(columns, 2)
]
mock_get_columns_data.assert_has_calls(list_calls)


_, correlation_kwargs = correlation_compute_mock.call_args
assert correlation_kwargs['real_data'].equals(processed_real[['col1', 'col4']])
assert correlation_kwargs['synthetic_data'].equals(processed_synthetic[['col1', 'col4']])
Expand Down Expand Up @@ -267,11 +409,12 @@ def test_get_visualization(self, mock_make_subplots):
cpt_property._update_layout = mock__update_layout

# Run
cpt_property.get_visualization()
result = cpt_property.get_visualization()

# Assert
assert mock__get_correlation_matrix.call_count == 3
mock_make_subplots.assert_called()
assert cpt_property._get_heatmap.call_count == 3
assert fig_mock.add_trace.call_count == 3
cpt_property._update_layout.assert_called()
cpt_property._update_layout.assert_called_once_with(fig_mock)
assert result == fig_mock

0 comments on commit cdb5787

Please sign in to comment.