From 96a5ea076e7fa3e7d92b2f844f3e84ec6c072b42 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 20 Jul 2023 16:05:14 +0100 Subject: [PATCH 1/5] def 1 --- .../reports/single_table/_properties/boundary.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 sdmetrics/reports/single_table/_properties/boundary.py diff --git a/sdmetrics/reports/single_table/_properties/boundary.py b/sdmetrics/reports/single_table/_properties/boundary.py new file mode 100644 index 00000000..eebda1c4 --- /dev/null +++ b/sdmetrics/reports/single_table/_properties/boundary.py @@ -0,0 +1,15 @@ +import numpy as np +import pandas as pd +import plotly.express as px + +from sdmetrics.reports.single_table._properties import BaseSingleTableProperty +from sdmetrics.single_column import BoundaryAdherence + + +class Boundary(BaseSingleTableProperty): + """Boundary property class for single table.""" + + metric = BoundaryAdherence + + def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None): + return super()._generate_details(real_data, synthetic_data, metadata, progress_bar) From ce56068fe055fe88bc8216eb53bde583f46bb6ce Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 20 Jul 2023 16:58:33 +0100 Subject: [PATCH 2/5] def 2 + test --- .../single_table/_properties/__init__.py | 2 + .../single_table/_properties/boundary.py | 98 ++++++++++++++++- .../single_table/_properties/test_boundary.py | 62 +++++++++++ .../single_table/_properties/test_boundary.py | 102 ++++++++++++++++++ 4 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 tests/integration/reports/single_table/_properties/test_boundary.py create mode 100644 tests/unit/reports/single_table/_properties/test_boundary.py diff --git a/sdmetrics/reports/single_table/_properties/__init__.py b/sdmetrics/reports/single_table/_properties/__init__.py index 288bcfec..efeccb53 100644 --- a/sdmetrics/reports/single_table/_properties/__init__.py +++ b/sdmetrics/reports/single_table/_properties/__init__.py @@ -1,6 +1,7 @@ """Single table properties for sdmetrics.""" from sdmetrics.reports.single_table._properties.base import BaseSingleTableProperty +from sdmetrics.reports.single_table._properties.boundary import Boundary from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes from sdmetrics.reports.single_table._properties.coverage import Coverage @@ -10,4 +11,5 @@ 'ColumnShapes', 'ColumnPairTrends', 'Coverage', + 'Boundary', ] diff --git a/sdmetrics/reports/single_table/_properties/boundary.py b/sdmetrics/reports/single_table/_properties/boundary.py index eebda1c4..8cbb92ce 100644 --- a/sdmetrics/reports/single_table/_properties/boundary.py +++ b/sdmetrics/reports/single_table/_properties/boundary.py @@ -7,9 +7,103 @@ class Boundary(BaseSingleTableProperty): - """Boundary property class for single table.""" + """Boundary property class for single table. + + This property assesses the boundary adherence of the synthetic data over the real data. + The BoundaryAdherence metric is computed column-wise and the final score is the average + over all columns. This metric is computed over numerical and datetime columns only. + The other column types are ignored by this property. + """ metric = BoundaryAdherence def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None): - return super()._generate_details(real_data, synthetic_data, metadata, progress_bar) + """Generate the _details dataframe for the boundary property. + + Args: + real_data (pandas.DataFrame): + The real data + synthetic_data (pandas.DataFrame): + The synthetic data + metadata (dict): + The metadata of the table + progress_bar (tqdm.tqdm or None): + The progress bar to use. Defaults to tqdm. + + Returns: + pandas.DataFrame + """ + column_names, metric_names, scores = [], [], [] + error_messages = [] + for column_name in metadata['columns']: + sdtype = metadata['columns'][column_name]['sdtype'] + try: + if sdtype in ('numerical', 'datetime'): + column_score = self.metric.compute( + real_data[column_name], synthetic_data[column_name] + ) + error_message = None + else: + continue + + except Exception as e: + column_score = np.nan + error_message = f'Error: {type(e).__name__} {e}' + finally: + if progress_bar: + progress_bar.update() + + column_names.append(column_name) + metric_names.append(self.metric.__name__) + scores.append(column_score) + error_messages.append(error_message) + + result = pd.DataFrame({ + 'Column': column_names, + 'Metric': metric_names, + 'Score': scores, + 'Error': error_messages, + }) + + if result['Error'].isna().all(): + result = result.drop('Error', axis=1) + + return result + + def get_visualization(self): + """Create a plot to show the column boundary scores. + + Returns: + plotly.graph_objects._figure.Figure + """ + average_score = self._compute_average() + + fig = px.bar( + data_frame=self._details, + x='Column', + y='Score', + title=f'Data Diagnostics: Column Boundary (Average Score={round(average_score, 2)})', + category_orders={'group': list(self._details['Column'])}, + color='Metric', + color_discrete_map={ + 'BoundaryAdherence': '#000036', + }, + pattern_shape='Metric', + pattern_shape_sequence=['', '/'], + hover_name='Column', + hover_data={ + 'Column': False, + 'Metric': True, + 'Score': True, + }, + ) + + fig.update_yaxes(range=[0, 1], title_text='Diagnostic Score') + + fig.update_layout( + xaxis_categoryorder='total ascending', + plot_bgcolor='#F5F5F8', + margin={'t': 150}, + ) + + return fig diff --git a/tests/integration/reports/single_table/_properties/test_boundary.py b/tests/integration/reports/single_table/_properties/test_boundary.py new file mode 100644 index 00000000..51722856 --- /dev/null +++ b/tests/integration/reports/single_table/_properties/test_boundary.py @@ -0,0 +1,62 @@ +import pandas as pd + +from sdmetrics.demos import load_demo +from sdmetrics.reports.single_table._properties import Boundary + + +class TestBoundary: + + def test_get_score(self): + """Test the ``get_score`` method.""" + # Setup + real_data, synthetic_data, metadata = load_demo(modality='single_table') + boundary_property = Boundary() + + # Run + score = boundary_property.get_score(real_data, synthetic_data, metadata) + + # Assert + assert score == 0.92 + + expected_details = pd.DataFrame({ + 'Column': [ + 'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'second_perc', + 'degree_perc', 'experience_years', 'employability_perc', 'mba_perc' + ], + 'Metric': ['BoundaryAdherence'] * 10, + 'Score': [ + 0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, + 0.8651162790697674, 0.9255813953488372, 0.9441860465116279, 1.0, + 0.8883720930232558, 0.8930232558139535 + ] + }) + + pd.testing.assert_frame_equal(boundary_property._details, expected_details) + + def test_get_score_error(self): + """Test the ``get_score`` method with errors.""" + # Setup + real_data, synthetic_data, metadata = load_demo(modality='single_table') + real_data['start_date'].iloc[0] = 0 + real_data['employability_perc'].iloc[2] = 'a' + + boundary_property = Boundary() + + # Run + score = boundary_property.get_score(real_data, synthetic_data, metadata) + + # Assert + expected_message_1 = ( + "Error: TypeError '<=' not supported between instances of 'int' and 'Timestamp'" + ) + expected_message_2 = ( + "Error: TypeError '<=' not supported between instances of 'float' and 'str'" + ) + details = boundary_property._details + details_nan = details.loc[pd.isna(details['Score'])] + column_names_nan = details_nan['Column'].tolist() + error_messages = details_nan['Error'].tolist() + assert column_names_nan == ['start_date', 'employability_perc'] + assert error_messages[0] == expected_message_1 + assert error_messages[1] == expected_message_2 + assert score == 0.93 diff --git a/tests/unit/reports/single_table/_properties/test_boundary.py b/tests/unit/reports/single_table/_properties/test_boundary.py new file mode 100644 index 00000000..7dedb765 --- /dev/null +++ b/tests/unit/reports/single_table/_properties/test_boundary.py @@ -0,0 +1,102 @@ +from unittest.mock import Mock, call, patch + +import numpy as np +import pandas as pd + +from sdmetrics.reports.single_table._properties.boundary import Boundary + + +class TestBoundary: + + @patch('sdmetrics.reports.single_table._properties.boundary.BoundaryAdherence.compute') + def test__generate_details(self, boundary_adherence_mock): + """Test the ``_generate_details`` method.""" + # Setup + real_data = pd.DataFrame({ + 'col1': [1, 2, np.nan], + 'col2': [False, True, True], + 'col3': [None, 'b', 'c'], + 'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']) + }) + synthetic_data = pd.DataFrame({ + 'col1': [1, 2, 3], + 'col2': [False, True, True], + 'col3': ['a', 'b', 'c'], + 'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']) + }) + metadata = { + 'columns': { + 'col1': {'sdtype': 'numerical'}, + 'col2': {'sdtype': 'boolean'}, + 'col3': {'sdtype': 'categorical'}, + 'col4': {'sdtype': 'datetime'} + } + } + + # Run + boundary_property = Boundary() + boundary_property._generate_details(real_data, synthetic_data, metadata) + + # Assert + expected_calls_boundary = [ + call(real_data['col1'], synthetic_data['col1']), + call(real_data['col4'], synthetic_data['col4']), + ] + + boundary_adherence_mock.assert_has_calls(expected_calls_boundary) + + @patch('sdmetrics.reports.single_table._properties.boundary.px') + def test_get_visualization(self, mock_px): + """Test the ``get_visualization`` method.""" + # Setup + boundary_property = Boundary() + + mock_df = pd.DataFrame({ + 'Column': ['Column1', 'Column2'], + 'Score': [0.7, 0.3], + 'Metric': ['Rangeboundary', 'Categoryboundary'] + }) + boundary_property._details = mock_df + + mock__compute_average = Mock(return_value=0.5) + boundary_property._compute_average = mock__compute_average + + mock_fig = Mock() + mock_px.bar.return_value = mock_fig + + # Run + boundary_property.get_visualization() + + # Assert + mock__compute_average.assert_called_once() + + expected_kwargs = { + 'data_frame': mock_df, + 'x': 'Column', + 'y': 'Score', + 'title': f'Data Diagnostics: Column Boundary (Average Score={0.5})', + 'category_orders': {'group': list(mock_df['Column'])}, + 'color': 'Metric', + 'color_discrete_map': { + 'BoundaryAdherence': '#000036', + }, + 'pattern_shape': 'Metric', + 'pattern_shape_sequence': ['', '/'], + 'hover_name': 'Column', + 'hover_data': { + 'Column': False, + 'Metric': True, + 'Score': True, + }, + } + + # Check call_args of mock_px.bar + _, kwargs = mock_px.bar.call_args + + assert kwargs.pop('data_frame').equals(expected_kwargs.pop('data_frame')) + assert kwargs == expected_kwargs + + mock_fig.update_yaxes.assert_called_once_with(range=[0, 1], title_text='Diagnostic Score') + mock_fig.update_layout.assert_called_once_with( + xaxis_categoryorder='total ascending', plot_bgcolor='#F5F5F8', margin={'t': 150} + ) From 9b6550abfb822c49c309cf551ba722b2f9cbb3ef Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 24 Jul 2023 12:36:42 +0100 Subject: [PATCH 3/5] docstring --- sdmetrics/reports/single_table/_properties/boundary.py | 8 ++++---- .../reports/single_table/_properties/test_boundary.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sdmetrics/reports/single_table/_properties/boundary.py b/sdmetrics/reports/single_table/_properties/boundary.py index 8cbb92ce..f71c772f 100644 --- a/sdmetrics/reports/single_table/_properties/boundary.py +++ b/sdmetrics/reports/single_table/_properties/boundary.py @@ -22,11 +22,11 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No Args: real_data (pandas.DataFrame): - The real data + The real data. synthetic_data (pandas.DataFrame): - The synthetic data + The synthetic data. metadata (dict): - The metadata of the table + The metadata of the table. progress_bar (tqdm.tqdm or None): The progress bar to use. Defaults to tqdm. @@ -74,7 +74,7 @@ def get_visualization(self): """Create a plot to show the column boundary scores. Returns: - plotly.graph_objects._figure.Figure + plotly.graph_objects._figure.Figure. """ average_score = self._compute_average() diff --git a/tests/unit/reports/single_table/_properties/test_boundary.py b/tests/unit/reports/single_table/_properties/test_boundary.py index 7dedb765..85bfb869 100644 --- a/tests/unit/reports/single_table/_properties/test_boundary.py +++ b/tests/unit/reports/single_table/_properties/test_boundary.py @@ -90,7 +90,6 @@ def test_get_visualization(self, mock_px): }, } - # Check call_args of mock_px.bar _, kwargs = mock_px.bar.call_args assert kwargs.pop('data_frame').equals(expected_kwargs.pop('data_frame')) From 3b2aa78bd6d2e0a346a53976ce46bb8377433548 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 24 Jul 2023 18:00:20 +0100 Subject: [PATCH 4/5] add test error --- .../single_table/_properties/test_boundary.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/unit/reports/single_table/_properties/test_boundary.py b/tests/unit/reports/single_table/_properties/test_boundary.py index 85bfb869..7008e477 100644 --- a/tests/unit/reports/single_table/_properties/test_boundary.py +++ b/tests/unit/reports/single_table/_properties/test_boundary.py @@ -45,6 +45,43 @@ def test__generate_details(self, boundary_adherence_mock): boundary_adherence_mock.assert_has_calls(expected_calls_boundary) + @patch('sdmetrics.reports.single_table._properties.boundary.BoundaryAdherence.compute') + def test__generate_details_error(self, boundary_adherence_mock): + """Test the ``_generate_details`` method.""" + # Setup + + boundary_adherence_mock.side_effect = ValueError('Mock Error') + real_data = pd.DataFrame({ + 'col1': [1, 2, np.nan], + }) + synthetic_data = pd.DataFrame({ + 'col1': [1, 2, 3] + }) + metadata = { + 'columns': { + 'col1': {'sdtype': 'numerical'} + } + } + + # Run + boundary_property = Boundary() + details = boundary_property._generate_details(real_data, synthetic_data, metadata) + + # Assert + expected_calls_boundary = [ + call(real_data['col1'], synthetic_data['col1']), + ] + + boundary_adherence_mock.assert_has_calls(expected_calls_boundary) + expected_details = pd.DataFrame({ + 'Column': ['col1'], + 'Metric': ['BoundaryAdherence'], + 'Score': [np.nan], + 'Error': ['Error: ValueError Mock Error'] + }) + + pd.testing.assert_frame_equal(details, expected_details) + @patch('sdmetrics.reports.single_table._properties.boundary.px') def test_get_visualization(self, mock_px): """Test the ``get_visualization`` method.""" From 8ebc13500361f7aba76e7774151397bee9f536c1 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 25 Jul 2023 10:08:42 +0100 Subject: [PATCH 5/5] docstring --- tests/unit/reports/single_table/_properties/test_boundary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/reports/single_table/_properties/test_boundary.py b/tests/unit/reports/single_table/_properties/test_boundary.py index 7008e477..66967c61 100644 --- a/tests/unit/reports/single_table/_properties/test_boundary.py +++ b/tests/unit/reports/single_table/_properties/test_boundary.py @@ -47,7 +47,7 @@ def test__generate_details(self, boundary_adherence_mock): @patch('sdmetrics.reports.single_table._properties.boundary.BoundaryAdherence.compute') def test__generate_details_error(self, boundary_adherence_mock): - """Test the ``_generate_details`` method.""" + """Test the ``_generate_details`` method when the metric raises an error.""" # Setup boundary_adherence_mock.side_effect = ValueError('Mock Error')