address comments 2

sdv-dev · Jun 22, 2023 · e78d307 · e78d307
1 parent 8ec3992
commit e78d307
Show file tree

Hide file tree

Showing 3 changed files with 140 additions and 59 deletions.
diff --git a/sdmetrics/reports/single_table/_properties/column_shapes.py b/sdmetrics/reports/single_table/_properties/column_shapes.py
@@ -1,5 +1,3 @@
-import warnings
-
 import numpy as np
 import pandas as pd
 import plotly.express as px
@@ -39,7 +37,7 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
                 The progress bar to use. Defaults to tqdm.
         """
         column_names, metric_names, scores = [], [], []
-        warning_messages = []
+        error_messages = []
         for column_name in metadata['columns']:
             sdtype = metadata['columns'][column_name]['sdtype']
             try:
@@ -48,14 +46,14 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
                     column_score = metric.compute(
                         real_data[column_name], synthetic_data[column_name]
                     )
+                    error_message = None
                 else:
                     continue
 
             except Exception as e:
                 column_score = np.nan
-                warning_messages.append(
-                        f"Unable to compute Column Shape for column '{column_name}'. "
-                        f'Encountered Error: {type(e).__name__} {e}'
+                error_message = (
+                        f'Error: {type(e).__name__} {e}'
                 )
             finally:
                 if progress_bar:
@@ -64,19 +62,21 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
             column_names.append(column_name)
             metric_names.append(metric.__name__)
             scores.append(column_score)
+            error_messages.append(error_message)
 
         if progress_bar:
             progress_bar.close()
 
-        for message in warning_messages:
-            warnings.warn(message)
-
         result = pd.DataFrame({
             'Column': column_names,
             'Metric': metric_names,
             'Score': scores,
+            'Error': error_messages,
         })
 
+        if result['Error'].isna().all():
+            result = result.drop('Error', axis=1)
+
         return result
 
     def get_visualization(self):

diff --git a/tests/integration/reports/single_table/_properties/test_column_shapes.py b/tests/integration/reports/single_table/_properties/test_column_shapes.py
@@ -1,5 +1,3 @@
-import re
-
 import pandas as pd
 
 from sdmetrics.demos import load_demo
@@ -38,8 +36,8 @@ def test_get_score(self):
         pd.testing.assert_frame_equal(column_shape_property._details, expected_details)
         assert score == 0.816
 
-    def test_get_score_warnings(self, recwarn):
-        """Test the ``get_score`` method when the metrics are raising erros for some columns."""
+    def test_get_score_warnings(self):
+        """Test the ``get_score`` method when the metrics are raising errors for some columns."""
         # Setup
         real_data, synthetic_data, metadata = load_demo('single_table')
 
@@ -49,22 +47,22 @@ def test_get_score_warnings(self, recwarn):
         # Run
         column_shape_property = ColumnShapes()
 
-        expected_message_1 = re.escape(
-            "Unable to compute Column Shape for column 'start_date'. Encountered Error:"
-            " TypeError '<' not supported between instances of 'Timestamp' and 'int'"
+        expected_message_1 = (
+            "Error: TypeError '<' not supported between instances of 'Timestamp' and 'int'"
         )
-        expected_message_2 = re.escape(
-            "Unable to compute Column Shape for column 'employability_perc'. "
-            "Encountered Error: TypeError '<' not supported between instances of 'str' and 'float'"
+        expected_message_2 = (
+            "Error: TypeError '<' not supported between instances of 'str' and 'float'"
         )
 
         score = column_shape_property.get_score(real_data, synthetic_data, metadata)
 
         # Assert
-        assert re.match(expected_message_1, str(recwarn[0].message))
-        assert re.match(expected_message_2, str(recwarn[1].message))
 
         details = column_shape_property._details
-        column_names_nan = list(details.loc[pd.isna(details['Score'])]['Column'])
+        details_nan = details.loc[pd.isna(details['Score'])]
+        column_names_nan = details_nan['Column'].tolist()
+        error_messages = details_nan['Error'].tolist()
         assert column_names_nan == ['start_date', 'employability_perc']
+        assert error_messages[0] == expected_message_1
+        assert error_messages[1] == expected_message_2
         assert score == 0.826
diff --git a/tests/unit/reports/single_table/_properties/test_column_shapes.py b/tests/unit/reports/single_table/_properties/test_column_shapes.py
@@ -1,9 +1,7 @@
-
-import re
 from unittest.mock import Mock, call, patch
 
+import numpy as np
 import pandas as pd
-import pytest
 
 from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes
 
@@ -53,49 +51,134 @@ def test__generate_details(self, tv_complement_compute_mock, ks_complement_compu
         ks_complement_compute_mock.assert_has_calls(expected_calls_ksc)
         tv_complement_compute_mock.assert_has_calls(expected_calls_tvc)
 
-    def test__generate_details_warning(self):
-        """Test the ``_generate_details`` method."""
+    @patch('sdmetrics.reports.single_table._properties.column_shapes.KSComplement.compute')
+    @patch('sdmetrics.reports.single_table._properties.column_shapes.TVComplement.compute')
+    def test__generate_details_with_nans(
+        self, tv_complement_compute_mock, ks_complement_compute_mock
+    ):
+        """Test the ``_generate_details`` method when there are NaNs in the data."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [1, None, 3],
+            'col2': [False, True, np.nan],
+            'col3': [None, 'b', 'c'],
+            'col4': pd.to_datetime(['2020-01-01', np.nan, '2020-01-03'])
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [1, 2, 3],
+            'col2': [False, True, True],
+            'col3': ['a', None, 'c'],
+            'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03'])
+        })
+        metadata = {
+            'columns': {
+                'col1': {'sdtype': 'numerical'},
+                'col2': {'sdtype': 'boolean'},
+                'col3': {'sdtype': 'categorical'},
+                'col4': {'sdtype': 'datetime'}
+            }
+        }
+
+        # Run
+        column_shape_property = ColumnShapes()
+        column_shape_property._generate_details(real_data, synthetic_data, metadata)
+
+        # Assert
+        expected_calls_ksc = [
+            call(real_data['col1'], synthetic_data['col1']),
+            call(real_data['col4'], synthetic_data['col4']),
+        ]
+        expected_calls_tvc = [
+            call(real_data['col2'], synthetic_data['col2']),
+            call(real_data['col3'], synthetic_data['col3']),
+        ]
+
+        ks_complement_compute_mock.assert_has_calls(expected_calls_ksc)
+        tv_complement_compute_mock.assert_has_calls(expected_calls_tvc)
+
+    def test__generate_details_error(self):
+        """Test the ``_generate_details`` method with the error column."""
         # Setup
         real_data = pd.DataFrame({'col1': [1, '2', 3]})
         synthetic_data = pd.DataFrame({'col1': [4, 5, 6]})
         metadata = {'columns': {'col1': {'sdtype': 'numerical'}}}
 
-        # Run and Assert
         column_shape_property = ColumnShapes()
-        expected_message = re.escape(
-            "Unable to compute Column Shape for column 'col1'. Encountered Error: "
-            "TypeError '<' not supported between instances of 'str' and 'int'"
-        )
-        with pytest.warns(UserWarning, match=expected_message):
-            column_shape_property._generate_details(real_data, synthetic_data, metadata)
 
-    @patch('sdmetrics.reports.single_table._properties.column_shapes.px')
-    def test_get_visualization(self, mock_px):
-        """Test the ``get_visualization`` method."""
-        # Setup
-        column_shape_property = ColumnShapes()
+        # Run
+        result = column_shape_property._generate_details(real_data, synthetic_data, metadata)
 
-        column_shape_property._details = {
-            'Column': ['Column1', 'Column2'],
-            'Score': [0.7, 0.3],
-            'Metric': ['KSComplement', 'TVComplement']
-        }
+        # Assert
+        expected_message = (
+            "Error: TypeError '<' not supported between instances of 'str' and 'int'"
+        )
+        result_nan = result.loc[pd.isna(result['Score'])]
+        column_names_nan = result_nan['Column'].tolist()
+        error_message = result_nan['Error'].tolist()
+
+        assert column_names_nan == ['col1']
+        assert error_message == [expected_message]
+
+    @patch('sdmetrics.reports.single_table._properties.column_shapes.px')
+    def test_get_visualization(self, mock_px):
+        """Test the ``get_visualization`` method."""
+        # Setup
+        column_shape_property = ColumnShapes()
+
+        mock_df = pd.DataFrame({
+            'Column': ['Column1', 'Column2'],
+            'Score': [0.7, 0.3],
+            'Metric': ['KSComplement', 'TVComplement']
+        })
+        column_shape_property._details = mock_df
+
+        mock__compute_average = Mock(return_value=0.5)
+        column_shape_property._compute_average = mock__compute_average
+
+        mock_bar = Mock()
+        mock_px.bar.return_value = mock_bar
+
+        # Run
+        column_shape_property.get_visualization()
+
+        # Assert
+        mock__compute_average.assert_called_once()
+
+        # Expected call
+        expected_kwargs = {
+            'data_frame': mock_df,
+            'x': 'Column',
+            'y': 'Score',
+            'title': (
+                'Data Quality: Column Shapes (Average '
+                f'Score={mock__compute_average.return_value})'
+            ),
+            'category_orders': {'group': mock_df['Column'].tolist()},
+            'color': 'Metric',
+            'color_discrete_map': {
+                'KSComplement': '#000036',
+                'TVComplement': '#03AFF1',
+            },
+            'pattern_shape': 'Metric',
+            'pattern_shape_sequence': ['', '/'],
+            'hover_name': 'Column',
+            'hover_data': {
+                'Column': False,
+                'Metric': True,
+                'Score': True,
+            },
+        }
 
-        mock__compute_average = Mock(return_value=0.5)
-        column_shape_property._compute_average = mock__compute_average
+        # Check call_args of mock_px.bar
+        _, kwargs = mock_px.bar.call_args
 
-        mock_bar = Mock()
-        mock_update_yaxes = Mock()
-        mock_update_layout = Mock()
-        mock_px.bar.return_value = mock_bar
-        mock_bar.update_yaxes.return_value = mock_update_yaxes
-        mock_bar.update_layout.return_value = mock_update_layout
+        # Check DataFrame separately
+        assert kwargs.pop('data_frame').equals(expected_kwargs.pop('data_frame'))
 
-        # Run
-        column_shape_property.get_visualization()
+        # Check other arguments
+        assert kwargs == expected_kwargs
 
-        # Assert
-        mock__compute_average.assert_called_once()
-        mock_px.bar.assert_called_once()
-        mock_bar.update_yaxes.assert_called_once()
-        mock_bar.update_layout.assert_called_once()
+        mock_bar.update_yaxes.assert_called_once_with(range=[0, 1])
+        mock_bar.update_layout.assert_called_once_with(
+            xaxis_categoryorder='total ascending', plot_bgcolor='#F5F5F8', margin={'t': 150}
+        )