Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create single table Boundaries property #400

Merged
merged 5 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sdmetrics/reports/single_table/_properties/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Single table properties for sdmetrics."""

from sdmetrics.reports.single_table._properties.base import BaseSingleTableProperty
from sdmetrics.reports.single_table._properties.boundary import Boundary
from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends
from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes
from sdmetrics.reports.single_table._properties.coverage import Coverage
Expand All @@ -10,4 +11,5 @@
'ColumnShapes',
'ColumnPairTrends',
'Coverage',
'Boundary',
]
109 changes: 109 additions & 0 deletions sdmetrics/reports/single_table/_properties/boundary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import numpy as np
import pandas as pd
import plotly.express as px

from sdmetrics.reports.single_table._properties import BaseSingleTableProperty
from sdmetrics.single_column import BoundaryAdherence


class Boundary(BaseSingleTableProperty):
    """Single table ``Boundary`` property.

    Measures how well the synthetic data respects the boundaries of the real
    data: ``BoundaryAdherence`` is computed for every numerical and datetime
    column and the property score is the average over those columns. Columns
    of any other sdtype are ignored.
    """

    metric = BoundaryAdherence

    def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None):
        """Compute the per-column details table for the boundary property.

        Args:
            real_data (pandas.DataFrame):
                The real data.
            synthetic_data (pandas.DataFrame):
                The synthetic data.
            metadata (dict):
                The metadata of the table.
            progress_bar (tqdm.tqdm or None):
                The progress bar to use. Defaults to tqdm.

        Returns:
            pandas.DataFrame
        """
        rows = []
        for column_name, column_meta in metadata['columns'].items():
            # Only numerical and datetime columns are scored, but the progress
            # bar advances for every column in the metadata either way.
            if column_meta['sdtype'] not in ('numerical', 'datetime'):
                if progress_bar:
                    progress_bar.update()
                continue

            score = np.nan
            error_message = None
            try:
                score = self.metric.compute(
                    real_data[column_name], synthetic_data[column_name]
                )
            except Exception as exc:
                # Record the failure instead of aborting the whole property.
                error_message = f'Error: {type(exc).__name__} {exc}'

            if progress_bar:
                progress_bar.update()

            rows.append((column_name, self.metric.__name__, score, error_message))

        result = pd.DataFrame(rows, columns=['Column', 'Metric', 'Score', 'Error'])

        # Only keep the 'Error' column when at least one column actually failed.
        if result['Error'].isna().all():
            result = result.drop('Error', axis=1)

        return result

    def get_visualization(self):
        """Create a bar plot showing the per-column boundary scores.

        Returns:
            plotly.graph_objects._figure.Figure.
        """
        average_score = self._compute_average()

        fig = px.bar(
            data_frame=self._details,
            x='Column',
            y='Score',
            title=f'Data Diagnostics: Column Boundary (Average Score={round(average_score, 2)})',
            category_orders={'group': list(self._details['Column'])},
            color='Metric',
            color_discrete_map={
                'BoundaryAdherence': '#000036',
            },
            pattern_shape='Metric',
            pattern_shape_sequence=['', '/'],
            hover_name='Column',
            hover_data={
                'Column': False,
                'Metric': True,
                'Score': True,
            },
        )

        # Scores live in [0, 1]; pin the axis so bars are comparable across runs.
        fig.update_yaxes(range=[0, 1], title_text='Diagnostic Score')

        fig.update_layout(
            xaxis_categoryorder='total ascending',
            plot_bgcolor='#F5F5F8',
            margin={'t': 150},
        )

        return fig
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pandas as pd

from sdmetrics.demos import load_demo
from sdmetrics.reports.single_table._properties import Boundary


class TestBoundary:
    """Integration tests for the single table ``Boundary`` property."""

    def test_get_score(self):
        """Test the ``get_score`` method."""
        # Setup
        real_data, synthetic_data, metadata = load_demo(modality='single_table')
        boundary_property = Boundary()

        # Run
        score = boundary_property.get_score(real_data, synthetic_data, metadata)

        # Assert
        # Average of the per-column BoundaryAdherence scores listed below.
        assert score == 0.92

        expected_details = pd.DataFrame({
            'Column': [
                'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'second_perc',
                'degree_perc', 'experience_years', 'employability_perc', 'mba_perc'
            ],
            'Metric': ['BoundaryAdherence'] * 10,
            'Score': [
                0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0,
                0.8651162790697674, 0.9255813953488372, 0.9441860465116279, 1.0,
                0.8883720930232558, 0.8930232558139535
            ]
        })

        # No 'Error' column expected: every column computed successfully.
        pd.testing.assert_frame_equal(boundary_property._details, expected_details)

    def test_get_score_error(self):
        """Test the ``get_score`` method with errors."""
        # Setup
        real_data, synthetic_data, metadata = load_demo(modality='single_table')
        # Corrupt one datetime and one numerical value so BoundaryAdherence
        # fails for exactly those two columns.
        # NOTE(review): chained assignment (``df[col].iloc[i] = value``) may not
        # propagate to the DataFrame under pandas copy-on-write — TODO confirm
        # this still corrupts the columns on the pandas versions being tested.
        real_data['start_date'].iloc[0] = 0
        real_data['employability_perc'].iloc[2] = 'a'

        boundary_property = Boundary()

        # Run
        score = boundary_property.get_score(real_data, synthetic_data, metadata)

        # Assert
        expected_message_1 = (
            "Error: TypeError '<=' not supported between instances of 'int' and 'Timestamp'"
        )
        expected_message_2 = (
            "Error: TypeError '<=' not supported between instances of 'float' and 'str'"
        )
        details = boundary_property._details
        # Failed columns get a NaN score plus the formatted error message.
        details_nan = details.loc[pd.isna(details['Score'])]
        column_names_nan = details_nan['Column'].tolist()
        error_messages = details_nan['Error'].tolist()
        assert column_names_nan == ['start_date', 'employability_perc']
        assert error_messages[0] == expected_message_1
        assert error_messages[1] == expected_message_2
        # The overall score still averages to 0.93 despite the two failures.
        assert score == 0.93
138 changes: 138 additions & 0 deletions tests/unit/reports/single_table/_properties/test_boundary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from unittest.mock import Mock, call, patch

import numpy as np
import pandas as pd

from sdmetrics.reports.single_table._properties.boundary import Boundary


class TestBoundary:
    """Unit tests for the single table ``Boundary`` property.

    Fix: GitHub review-thread text had been pasted into the middle of
    ``test__generate_details`` (between the decorator and the docstring),
    making the class syntactically invalid; it is removed here. The test
    logic itself is unchanged.
    """

    @patch('sdmetrics.reports.single_table._properties.boundary.BoundaryAdherence.compute')
    def test__generate_details(self, boundary_adherence_mock):
        """Test the ``_generate_details`` method."""
        # Setup
        real_data = pd.DataFrame({
            'col1': [1, 2, np.nan],
            'col2': [False, True, True],
            'col3': [None, 'b', 'c'],
            'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03'])
        })
        synthetic_data = pd.DataFrame({
            'col1': [1, 2, 3],
            'col2': [False, True, True],
            'col3': ['a', 'b', 'c'],
            'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03'])
        })
        metadata = {
            'columns': {
                'col1': {'sdtype': 'numerical'},
                'col2': {'sdtype': 'boolean'},
                'col3': {'sdtype': 'categorical'},
                'col4': {'sdtype': 'datetime'}
            }
        }

        # Run
        boundary_property = Boundary()
        boundary_property._generate_details(real_data, synthetic_data, metadata)

        # Assert: only the numerical and datetime columns are scored.
        expected_calls_boundary = [
            call(real_data[column], synthetic_data[column]) for column in ('col1', 'col4')
        ]
        boundary_adherence_mock.assert_has_calls(expected_calls_boundary)

    @patch('sdmetrics.reports.single_table._properties.boundary.BoundaryAdherence.compute')
    def test__generate_details_error(self, boundary_adherence_mock):
        """Test the ``_generate_details`` method when the metric raises an error."""
        # Setup
        boundary_adherence_mock.side_effect = ValueError('Mock Error')
        real_data = pd.DataFrame({
            'col1': [1, 2, np.nan],
        })
        synthetic_data = pd.DataFrame({
            'col1': [1, 2, 3]
        })
        metadata = {
            'columns': {
                'col1': {'sdtype': 'numerical'}
            }
        }

        # Run
        boundary_property = Boundary()
        details = boundary_property._generate_details(real_data, synthetic_data, metadata)

        # Assert: the failure is recorded as a NaN score plus an error message.
        boundary_adherence_mock.assert_has_calls([
            call(real_data['col1'], synthetic_data['col1']),
        ])
        expected_details = pd.DataFrame({
            'Column': ['col1'],
            'Metric': ['BoundaryAdherence'],
            'Score': [np.nan],
            'Error': ['Error: ValueError Mock Error']
        })

        pd.testing.assert_frame_equal(details, expected_details)

    @patch('sdmetrics.reports.single_table._properties.boundary.px')
    def test_get_visualization(self, mock_px):
        """Test the ``get_visualization`` method."""
        # Setup
        boundary_property = Boundary()

        mock_df = pd.DataFrame({
            'Column': ['Column1', 'Column2'],
            'Score': [0.7, 0.3],
            'Metric': ['Rangeboundary', 'Categoryboundary']
        })
        boundary_property._details = mock_df

        mock__compute_average = Mock(return_value=0.5)
        boundary_property._compute_average = mock__compute_average

        mock_fig = Mock()
        mock_px.bar.return_value = mock_fig

        # Run
        boundary_property.get_visualization()

        # Assert
        mock__compute_average.assert_called_once()

        expected_kwargs = {
            'data_frame': mock_df,
            'x': 'Column',
            'y': 'Score',
            'title': f'Data Diagnostics: Column Boundary (Average Score={0.5})',
            'category_orders': {'group': list(mock_df['Column'])},
            'color': 'Metric',
            'color_discrete_map': {
                'BoundaryAdherence': '#000036',
            },
            'pattern_shape': 'Metric',
            'pattern_shape_sequence': ['', '/'],
            'hover_name': 'Column',
            'hover_data': {
                'Column': False,
                'Metric': True,
                'Score': True,
            },
        }

        _, kwargs = mock_px.bar.call_args

        # DataFrames don't compare with ``==``; check them with ``.equals``
        # and compare the remaining keyword arguments directly.
        assert kwargs.pop('data_frame').equals(expected_kwargs.pop('data_frame'))
        assert kwargs == expected_kwargs

        mock_fig.update_yaxes.assert_called_once_with(range=[0, 1], title_text='Diagnostic Score')
        mock_fig.update_layout.assert_called_once_with(
            xaxis_categoryorder='total ascending', plot_bgcolor='#F5F5F8', margin={'t': 150}
        )
Loading