Skip to content

Commit

Permalink
address comments 2
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo committed Jun 22, 2023
1 parent 8ec3992 commit e78d307
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 59 deletions.
18 changes: 9 additions & 9 deletions sdmetrics/reports/single_table/_properties/column_shapes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
Expand Down Expand Up @@ -39,7 +37,7 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
The progress bar to use. Defaults to tqdm.
"""
column_names, metric_names, scores = [], [], []
warning_messages = []
error_messages = []
for column_name in metadata['columns']:
sdtype = metadata['columns'][column_name]['sdtype']
try:
Expand All @@ -48,14 +46,14 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
column_score = metric.compute(
real_data[column_name], synthetic_data[column_name]
)
error_message = None
else:
continue

except Exception as e:
column_score = np.nan
warning_messages.append(
f"Unable to compute Column Shape for column '{column_name}'. "
f'Encountered Error: {type(e).__name__} {e}'
error_message = (
f'Error: {type(e).__name__} {e}'
)
finally:
if progress_bar:
Expand All @@ -64,19 +62,21 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
column_names.append(column_name)
metric_names.append(metric.__name__)
scores.append(column_score)
error_messages.append(error_message)

if progress_bar:
progress_bar.close()

for message in warning_messages:
warnings.warn(message)

result = pd.DataFrame({
'Column': column_names,
'Metric': metric_names,
'Score': scores,
'Error': error_messages,
})

if result['Error'].isna().all():
result = result.drop('Error', axis=1)

return result

def get_visualization(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import re

import pandas as pd

from sdmetrics.demos import load_demo
Expand Down Expand Up @@ -38,8 +36,8 @@ def test_get_score(self):
pd.testing.assert_frame_equal(column_shape_property._details, expected_details)
assert score == 0.816

def test_get_score_warnings(self, recwarn):
"""Test the ``get_score`` method when the metrics are raising erros for some columns."""
def test_get_score_warnings(self):
"""Test the ``get_score`` method when the metrics are raising errors for some columns."""
# Setup
real_data, synthetic_data, metadata = load_demo('single_table')

Expand All @@ -49,22 +47,22 @@ def test_get_score_warnings(self, recwarn):
# Run
column_shape_property = ColumnShapes()

expected_message_1 = re.escape(
"Unable to compute Column Shape for column 'start_date'. Encountered Error:"
" TypeError '<' not supported between instances of 'Timestamp' and 'int'"
expected_message_1 = (
"Error: TypeError '<' not supported between instances of 'Timestamp' and 'int'"
)
expected_message_2 = re.escape(
"Unable to compute Column Shape for column 'employability_perc'. "
"Encountered Error: TypeError '<' not supported between instances of 'str' and 'float'"
expected_message_2 = (
"Error: TypeError '<' not supported between instances of 'str' and 'float'"
)

score = column_shape_property.get_score(real_data, synthetic_data, metadata)

# Assert
assert re.match(expected_message_1, str(recwarn[0].message))
assert re.match(expected_message_2, str(recwarn[1].message))

details = column_shape_property._details
column_names_nan = list(details.loc[pd.isna(details['Score'])]['Column'])
details_nan = details.loc[pd.isna(details['Score'])]
column_names_nan = details_nan['Column'].tolist()
error_messages = details_nan['Error'].tolist()
assert column_names_nan == ['start_date', 'employability_perc']
assert error_messages[0] == expected_message_1
assert error_messages[1] == expected_message_2
assert score == 0.826
157 changes: 120 additions & 37 deletions tests/unit/reports/single_table/_properties/test_column_shapes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@

import re
from unittest.mock import Mock, call, patch

import numpy as np
import pandas as pd
import pytest

from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes

Expand Down Expand Up @@ -53,49 +51,134 @@ def test__generate_details(self, tv_complement_compute_mock, ks_complement_compu
ks_complement_compute_mock.assert_has_calls(expected_calls_ksc)
tv_complement_compute_mock.assert_has_calls(expected_calls_tvc)

def test__generate_details_warning(self):
"""Test the ``_generate_details`` method."""
@patch('sdmetrics.reports.single_table._properties.column_shapes.KSComplement.compute')
@patch('sdmetrics.reports.single_table._properties.column_shapes.TVComplement.compute')
def test__generate_details_with_nans(
self, tv_complement_compute_mock, ks_complement_compute_mock
):
"""Test the ``_generate_details`` method when there are NaNs in the data."""
# Setup
real_data = pd.DataFrame({
'col1': [1, None, 3],
'col2': [False, True, np.nan],
'col3': [None, 'b', 'c'],
'col4': pd.to_datetime(['2020-01-01', np.nan, '2020-01-03'])
})
synthetic_data = pd.DataFrame({
'col1': [1, 2, 3],
'col2': [False, True, True],
'col3': ['a', None, 'c'],
'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03'])
})
metadata = {
'columns': {
'col1': {'sdtype': 'numerical'},
'col2': {'sdtype': 'boolean'},
'col3': {'sdtype': 'categorical'},
'col4': {'sdtype': 'datetime'}
}
}

# Run
column_shape_property = ColumnShapes()
column_shape_property._generate_details(real_data, synthetic_data, metadata)

# Assert
expected_calls_ksc = [
call(real_data['col1'], synthetic_data['col1']),
call(real_data['col4'], synthetic_data['col4']),
]
expected_calls_tvc = [
call(real_data['col2'], synthetic_data['col2']),
call(real_data['col3'], synthetic_data['col3']),
]

ks_complement_compute_mock.assert_has_calls(expected_calls_ksc)
tv_complement_compute_mock.assert_has_calls(expected_calls_tvc)

def test__generate_details_error(self):
"""Test the ``_generate_details`` method with the error column."""
# Setup
real_data = pd.DataFrame({'col1': [1, '2', 3]})
synthetic_data = pd.DataFrame({'col1': [4, 5, 6]})
metadata = {'columns': {'col1': {'sdtype': 'numerical'}}}

# Run and Assert
column_shape_property = ColumnShapes()
expected_message = re.escape(
"Unable to compute Column Shape for column 'col1'. Encountered Error: "
"TypeError '<' not supported between instances of 'str' and 'int'"
)
with pytest.warns(UserWarning, match=expected_message):
column_shape_property._generate_details(real_data, synthetic_data, metadata)

@patch('sdmetrics.reports.single_table._properties.column_shapes.px')
def test_get_visualization(self, mock_px):
"""Test the ``get_visualization`` method."""
# Setup
column_shape_property = ColumnShapes()
# Run
result = column_shape_property._generate_details(real_data, synthetic_data, metadata)

column_shape_property._details = {
'Column': ['Column1', 'Column2'],
'Score': [0.7, 0.3],
'Metric': ['KSComplement', 'TVComplement']
}
# Assert
expected_message = (
"Error: TypeError '<' not supported between instances of 'str' and 'int'"
)
result_nan = result.loc[pd.isna(result['Score'])]
column_names_nan = result_nan['Column'].tolist()
error_message = result_nan['Error'].tolist()

assert column_names_nan == ['col1']
assert error_message == [expected_message]

@patch('sdmetrics.reports.single_table._properties.column_shapes.px')
def test_get_visualization(self, mock_px):
"""Test the ``get_visualization`` method."""
# Setup
column_shape_property = ColumnShapes()

mock_df = pd.DataFrame({
'Column': ['Column1', 'Column2'],
'Score': [0.7, 0.3],
'Metric': ['KSComplement', 'TVComplement']
})
column_shape_property._details = mock_df

mock__compute_average = Mock(return_value=0.5)
column_shape_property._compute_average = mock__compute_average

mock_bar = Mock()
mock_px.bar.return_value = mock_bar

# Run
column_shape_property.get_visualization()

# Assert
mock__compute_average.assert_called_once()

# Expected call
expected_kwargs = {
'data_frame': mock_df,
'x': 'Column',
'y': 'Score',
'title': (
'Data Quality: Column Shapes (Average'
f'Score={mock__compute_average.return_value})'
),
'category_orders': {'group': mock_df['Column'].tolist()},
'color': 'Metric',
'color_discrete_map': {
'KSComplement': '#000036',
'TVComplement': '#03AFF1',
},
'pattern_shape': 'Metric',
'pattern_shape_sequence': ['', '/'],
'hover_name': 'Column',
'hover_data': {
'Column': False,
'Metric': True,
'Score': True,
},
}

mock__compute_average = Mock(return_value=0.5)
column_shape_property._compute_average = mock__compute_average
# Check call_args of mock_px.bar
_, kwargs = mock_px.bar.call_args

mock_bar = Mock()
mock_update_yaxes = Mock()
mock_update_layout = Mock()
mock_px.bar.return_value = mock_bar
mock_bar.update_yaxes.return_value = mock_update_yaxes
mock_bar.update_layout.return_value = mock_update_layout
# Check DataFrame separately
assert kwargs.pop('data_frame').equals(expected_kwargs.pop('data_frame'))

# Run
column_shape_property.get_visualization()
# Check other arguments
assert kwargs == expected_kwargs

# Assert
mock__compute_average.assert_called_once()
mock_px.bar.assert_called_once()
mock_bar.update_yaxes.assert_called_once()
mock_bar.update_layout.assert_called_once()
mock_bar.update_yaxes.assert_called_once_with(range=[0, 1])
mock_bar.update_layout.assert_called_once_with(
xaxis_categoryorder='total ascending', plot_bgcolor='#F5F5F8', margin={'t': 150}
)

0 comments on commit e78d307

Please sign in to comment.