Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create single table Boundaries property #400

Merged
merged 5 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sdmetrics/reports/single_table/_properties/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Single table properties for sdmetrics."""

from sdmetrics.reports.single_table._properties.base import BaseSingleTableProperty
from sdmetrics.reports.single_table._properties.boundary import Boundary
from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends
from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes
from sdmetrics.reports.single_table._properties.coverage import Coverage
Expand All @@ -10,4 +11,5 @@
'ColumnShapes',
'ColumnPairTrends',
'Coverage',
'Boundary',
]
109 changes: 109 additions & 0 deletions sdmetrics/reports/single_table/_properties/boundary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import numpy as np
import pandas as pd
import plotly.express as px

from sdmetrics.reports.single_table._properties import BaseSingleTableProperty
from sdmetrics.single_column import BoundaryAdherence


class Boundary(BaseSingleTableProperty):
    """Single table ``Boundary`` property.

    Measures how well the synthetic data respects the boundaries of the real
    data: ``BoundaryAdherence`` is computed for every numerical and datetime
    column and the property score is the average over those columns. Columns
    of any other sdtype are ignored.
    """

    metric = BoundaryAdherence

    def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None):
        """Compute the per-column details table for the boundary property.

        Args:
            real_data (pandas.DataFrame):
                The real data.
            synthetic_data (pandas.DataFrame):
                The synthetic data.
            metadata (dict):
                The metadata of the table.
            progress_bar (tqdm.tqdm or None):
                The progress bar to use. Defaults to tqdm.

        Returns:
            pandas.DataFrame
        """
        rows = []
        for column_name, column_meta in metadata['columns'].items():
            # Only numerical and datetime columns are scored, but the progress
            # bar advances for every column in the metadata either way.
            if column_meta['sdtype'] not in ('numerical', 'datetime'):
                if progress_bar:
                    progress_bar.update()
                continue

            score = np.nan
            error_message = None
            try:
                score = self.metric.compute(
                    real_data[column_name], synthetic_data[column_name]
                )
            except Exception as exc:
                # Record the failure instead of aborting the whole property.
                error_message = f'Error: {type(exc).__name__} {exc}'

            if progress_bar:
                progress_bar.update()

            rows.append((column_name, self.metric.__name__, score, error_message))

        result = pd.DataFrame(rows, columns=['Column', 'Metric', 'Score', 'Error'])

        # Only keep the 'Error' column when at least one column actually failed.
        if result['Error'].isna().all():
            result = result.drop('Error', axis=1)

        return result

    def get_visualization(self):
        """Create a bar plot showing the per-column boundary scores.

        Returns:
            plotly.graph_objects._figure.Figure.
        """
        average_score = self._compute_average()

        fig = px.bar(
            data_frame=self._details,
            x='Column',
            y='Score',
            title=f'Data Diagnostics: Column Boundary (Average Score={round(average_score, 2)})',
            category_orders={'group': list(self._details['Column'])},
            color='Metric',
            color_discrete_map={
                'BoundaryAdherence': '#000036',
            },
            pattern_shape='Metric',
            pattern_shape_sequence=['', '/'],
            hover_name='Column',
            hover_data={
                'Column': False,
                'Metric': True,
                'Score': True,
            },
        )

        # Scores live in [0, 1]; pin the axis so bars are comparable across runs.
        fig.update_yaxes(range=[0, 1], title_text='Diagnostic Score')

        fig.update_layout(
            xaxis_categoryorder='total ascending',
            plot_bgcolor='#F5F5F8',
            margin={'t': 150},
        )

        return fig
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pandas as pd

from sdmetrics.demos import load_demo
from sdmetrics.reports.single_table._properties import Boundary


class TestBoundary:
    """Integration tests for the single table ``Boundary`` property."""

    def test_get_score(self):
        """Test the ``get_score`` method."""
        # Setup
        real_data, synthetic_data, metadata = load_demo(modality='single_table')
        boundary_property = Boundary()

        # Run
        score = boundary_property.get_score(real_data, synthetic_data, metadata)

        # Assert
        # Average of the per-column BoundaryAdherence scores listed below.
        assert score == 0.92

        expected_details = pd.DataFrame({
            'Column': [
                'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'second_perc',
                'degree_perc', 'experience_years', 'employability_perc', 'mba_perc'
            ],
            'Metric': ['BoundaryAdherence'] * 10,
            'Score': [
                0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0,
                0.8651162790697674, 0.9255813953488372, 0.9441860465116279, 1.0,
                0.8883720930232558, 0.8930232558139535
            ]
        })

        # No 'Error' column expected: every column computed successfully.
        pd.testing.assert_frame_equal(boundary_property._details, expected_details)

    def test_get_score_error(self):
        """Test the ``get_score`` method with errors."""
        # Setup
        real_data, synthetic_data, metadata = load_demo(modality='single_table')
        # Corrupt one datetime and one numerical value so BoundaryAdherence
        # fails for exactly those two columns.
        # NOTE(review): chained assignment (``df[col].iloc[i] = value``) may not
        # propagate to the DataFrame under pandas copy-on-write — TODO confirm
        # this still corrupts the columns on the pandas versions being tested.
        real_data['start_date'].iloc[0] = 0
        real_data['employability_perc'].iloc[2] = 'a'

        boundary_property = Boundary()

        # Run
        score = boundary_property.get_score(real_data, synthetic_data, metadata)

        # Assert
        expected_message_1 = (
            "Error: TypeError '<=' not supported between instances of 'int' and 'Timestamp'"
        )
        expected_message_2 = (
            "Error: TypeError '<=' not supported between instances of 'float' and 'str'"
        )
        details = boundary_property._details
        # Failed columns get a NaN score plus the formatted error message.
        details_nan = details.loc[pd.isna(details['Score'])]
        column_names_nan = details_nan['Column'].tolist()
        error_messages = details_nan['Error'].tolist()
        assert column_names_nan == ['start_date', 'employability_perc']
        assert error_messages[0] == expected_message_1
        assert error_messages[1] == expected_message_2
        # The overall score still averages to 0.93 despite the two failures.
        assert score == 0.93
138 changes: 138 additions & 0 deletions tests/unit/reports/single_table/_properties/test_boundary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from unittest.mock import Mock, call, patch

import numpy as np
import pandas as pd

from sdmetrics.reports.single_table._properties.boundary import Boundary


class TestBoundary:
    """Unit tests for the single table ``Boundary`` property.

    Fix: GitHub review-thread text had been pasted into the middle of
    ``test__generate_details`` (between the decorator and the docstring),
    making the class syntactically invalid; it is removed here. The test
    logic itself is unchanged.
    """

    @patch('sdmetrics.reports.single_table._properties.boundary.BoundaryAdherence.compute')
    def test__generate_details(self, boundary_adherence_mock):
        """Test the ``_generate_details`` method."""
        # Setup
        real_data = pd.DataFrame({
            'col1': [1, 2, np.nan],
            'col2': [False, True, True],
            'col3': [None, 'b', 'c'],
            'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03'])
        })
        synthetic_data = pd.DataFrame({
            'col1': [1, 2, 3],
            'col2': [False, True, True],
            'col3': ['a', 'b', 'c'],
            'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03'])
        })
        metadata = {
            'columns': {
                'col1': {'sdtype': 'numerical'},
                'col2': {'sdtype': 'boolean'},
                'col3': {'sdtype': 'categorical'},
                'col4': {'sdtype': 'datetime'}
            }
        }

        # Run
        boundary_property = Boundary()
        boundary_property._generate_details(real_data, synthetic_data, metadata)

        # Assert: only the numerical and datetime columns are scored.
        expected_calls_boundary = [
            call(real_data[column], synthetic_data[column]) for column in ('col1', 'col4')
        ]
        boundary_adherence_mock.assert_has_calls(expected_calls_boundary)

    @patch('sdmetrics.reports.single_table._properties.boundary.BoundaryAdherence.compute')
    def test__generate_details_error(self, boundary_adherence_mock):
        """Test the ``_generate_details`` method when the metric raises an error."""
        # Setup
        boundary_adherence_mock.side_effect = ValueError('Mock Error')
        real_data = pd.DataFrame({
            'col1': [1, 2, np.nan],
        })
        synthetic_data = pd.DataFrame({
            'col1': [1, 2, 3]
        })
        metadata = {
            'columns': {
                'col1': {'sdtype': 'numerical'}
            }
        }

        # Run
        boundary_property = Boundary()
        details = boundary_property._generate_details(real_data, synthetic_data, metadata)

        # Assert: the failure is recorded as a NaN score plus an error message.
        boundary_adherence_mock.assert_has_calls([
            call(real_data['col1'], synthetic_data['col1']),
        ])
        expected_details = pd.DataFrame({
            'Column': ['col1'],
            'Metric': ['BoundaryAdherence'],
            'Score': [np.nan],
            'Error': ['Error: ValueError Mock Error']
        })

        pd.testing.assert_frame_equal(details, expected_details)

    @patch('sdmetrics.reports.single_table._properties.boundary.px')
    def test_get_visualization(self, mock_px):
        """Test the ``get_visualization`` method."""
        # Setup
        boundary_property = Boundary()

        mock_df = pd.DataFrame({
            'Column': ['Column1', 'Column2'],
            'Score': [0.7, 0.3],
            'Metric': ['Rangeboundary', 'Categoryboundary']
        })
        boundary_property._details = mock_df

        mock__compute_average = Mock(return_value=0.5)
        boundary_property._compute_average = mock__compute_average

        mock_fig = Mock()
        mock_px.bar.return_value = mock_fig

        # Run
        boundary_property.get_visualization()

        # Assert
        mock__compute_average.assert_called_once()

        expected_kwargs = {
            'data_frame': mock_df,
            'x': 'Column',
            'y': 'Score',
            'title': f'Data Diagnostics: Column Boundary (Average Score={0.5})',
            'category_orders': {'group': list(mock_df['Column'])},
            'color': 'Metric',
            'color_discrete_map': {
                'BoundaryAdherence': '#000036',
            },
            'pattern_shape': 'Metric',
            'pattern_shape_sequence': ['', '/'],
            'hover_name': 'Column',
            'hover_data': {
                'Column': False,
                'Metric': True,
                'Score': True,
            },
        }

        _, kwargs = mock_px.bar.call_args

        # DataFrames don't compare with ``==``; check them with ``.equals``
        # and compare the remaining keyword arguments directly.
        assert kwargs.pop('data_frame').equals(expected_kwargs.pop('data_frame'))
        assert kwargs == expected_kwargs

        mock_fig.update_yaxes.assert_called_once_with(range=[0, 1], title_text='Diagnostic Score')
        mock_fig.update_layout.assert_called_once_with(
            xaxis_categoryorder='total ascending', plot_bgcolor='#F5F5F8', margin={'t': 150}
        )
Loading