Contracts: Handle struct column specified both at root and nested levels + arrays of structs #806

Merged · 11 commits · Jul 11, 2023
7 changes: 7 additions & 0 deletions .changes/unreleased/Fixes-20230630-213112.yaml
@@ -0,0 +1,7 @@
kind: Fixes
body: 'Contracts: Handle struct column specified both at root and nested levels +
arrays of structs'
time: 2023-06-30T21:31:12.63257-04:00
custom:
Author: michelleark
Issue: 781 782
77 changes: 60 additions & 17 deletions dbt/adapters/bigquery/column.py
@@ -5,6 +5,8 @@

from google.cloud.bigquery import SchemaField

_PARENT_DATA_TYPE_KEY = "__parent_data_type"

Self = TypeVar("Self", bound="BigQueryColumn")


@@ -131,7 +133,7 @@ def column_to_bq_schema(self) -> SchemaField:
def get_nested_column_data_types(
columns: Dict[str, Dict[str, Any]],
constraints: Optional[Dict[str, str]] = None,
) -> Dict[str, Dict[str, str]]:
) -> Dict[str, Dict[str, Optional[str]]]:
"""
columns:
* Dictionary where keys are of flat columns names and values are dictionary of column attributes
@@ -159,16 +161,16 @@ def get_nested_column_data_types(
"""
constraints = constraints or {}

nested_column_data_types: Dict[str, Union[str, Dict]] = {}
nested_column_data_types: Dict[str, Optional[Union[str, Dict]]] = {}
for column in columns.values():
_update_nested_column_data_types(
column["name"],
column["data_type"],
column.get("data_type"),
constraints.get(column["name"]),
nested_column_data_types,
)

formatted_nested_column_data_types: Dict[str, Dict[str, str]] = {}
formatted_nested_column_data_types: Dict[str, Dict[str, Optional[str]]] = {}
for column_name, unformatted_column_type in nested_column_data_types.items():
formatted_nested_column_data_types[column_name] = {
"name": column_name,
@@ -191,9 +193,9 @@ def get_nested_column_data_types(

def _update_nested_column_data_types(
column_name: str,
column_data_type: str,
column_data_type: Optional[str],
column_rendered_constraint: Optional[str],
nested_column_data_types: Dict[str, Union[str, Dict]],
nested_column_data_types: Dict[str, Optional[Union[str, Dict]]],
) -> None:
"""
Recursively update nested_column_data_types given a column_name, column_data_type, and optional column_rendered_constraint.
@@ -215,15 +217,38 @@ def _update_nested_column_data_types(

if len(column_name_parts) == 1:
# Base case: column is not nested - store its data_type concatenated with constraint if provided.
nested_column_data_types[root_column_name] = (
column_data_type
if column_rendered_constraint is None
else f"{column_data_type} {column_rendered_constraint}"
column_data_type_and_constraints = (
(
column_data_type
if column_rendered_constraint is None
else f"{column_data_type} {column_rendered_constraint}"
)
if column_data_type
else None
)

if existing_nested_column_data_type := nested_column_data_types.get(root_column_name):
assert isinstance(existing_nested_column_data_type, dict) # keeping mypy happy
# entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY"
existing_nested_column_data_type.update(
{_PARENT_DATA_TYPE_KEY: column_data_type_and_constraints}
)
else:
nested_column_data_types.update({root_column_name: column_data_type_and_constraints})
else:
# Initialize nested dictionary
if root_column_name not in nested_column_data_types:
nested_column_data_types[root_column_name] = {}
parent_data_type = nested_column_data_types.get(root_column_name)
if isinstance(parent_data_type, dict):
# nested dictionary already initialized
pass
elif parent_data_type is None:
# initialize nested dictionary
nested_column_data_types.update({root_column_name: {}})
else:
# a parent specified its base type -- preserve its data_type and potential rendered constraints
# this is used to specify a top-level 'struct' or 'array' field with its own description, constraints, etc
nested_column_data_types.update(
{root_column_name: {_PARENT_DATA_TYPE_KEY: parent_data_type}}
)

# Recursively process rest of remaining column name
remaining_column_name = ".".join(column_name_parts[1:])
@@ -237,7 +262,9 @@ def _update_nested_column_data_types(
)


def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, Any]]) -> str:
def _format_nested_data_type(
unformatted_nested_data_type: Optional[Union[str, Dict[str, Any]]]
) -> Optional[str]:
"""
Recursively format a (STRUCT) data type given an arbitrarily nested data type structure.

@@ -249,11 +276,27 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str,
>>> BigQueryAdapter._format_nested_data_type({'c': 'string not_null', 'd': {'e': 'string'}})
'struct<c string not_null, d struct<e string>>'
"""
if isinstance(unformatted_nested_data_type, str):
if unformatted_nested_data_type is None:
return None
elif isinstance(unformatted_nested_data_type, str):
return unformatted_nested_data_type
else:
parent_data_type, *parent_constraints = unformatted_nested_data_type.pop(
_PARENT_DATA_TYPE_KEY, ""
).split() or [None]
Contributor:

Do you need the or [None] here? I thought I tried this in a toy example where .pop() returned "" and it still worked as anticipated. But I might have missed some edge case.

Contributor Author:

unfortunately not :( it broke a couple unit tests with the following error:

>>> foo, *bar = "".split()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ValueError: not enough values to unpack (expected at least 1, got 0)
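
(Illustration, not part of the thread: a quick REPL check of the fallback, assuming the same starred-unpacking pattern as in the diff. The or [None] keeps the unpacking from failing on an empty split and simply yields None for the parent type.)

>>> foo, *bar = "".split() or [None]
>>> foo, bar
(None, [])
>>> foo, *bar = "array not null".split() or [None]
>>> foo, bar
('array', ['not', 'null'])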


formatted_nested_types = [
f"{column_name} {_format_nested_data_type(column_type)}"
f"{column_name} {_format_nested_data_type(column_type) or ''}".strip()
for column_name, column_type in unformatted_nested_data_type.items()
]
return f"""struct<{", ".join(formatted_nested_types)}>"""

formatted_nested_type = f"""struct<{", ".join(formatted_nested_types)}>"""

if parent_data_type and parent_data_type.lower() == "array":
formatted_nested_type = f"""array<{formatted_nested_type}>"""

if parent_constraints:
parent_constraints = " ".join(parent_constraints)
formatted_nested_type = f"""{formatted_nested_type} {parent_constraints}"""

return formatted_nested_type
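
(Illustrative sketch, not part of the diff: given an intermediate dict of the shape _update_nested_column_data_types builds — including the __parent_data_type key introduced above — the new branches wrap the struct in array<...> and re-append the parent's rendered constraint.)

>>> _format_nested_data_type({"__parent_data_type": "array not null", "nested": "string"})
'array<struct<nested string>> not null'
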
2 changes: 1 addition & 1 deletion dbt/adapters/bigquery/impl.py
@@ -300,7 +300,7 @@ def nest_column_data_types(
cls,
columns: Dict[str, Dict[str, Any]],
constraints: Optional[Dict[str, str]] = None,
) -> Dict[str, Dict[str, str]]:
) -> Dict[str, Dict[str, Optional[str]]]:
return get_nested_column_data_types(columns, constraints)

def get_columns_in_relation(self, relation: BigQueryRelation) -> List[BigQueryColumn]:
10 changes: 10 additions & 0 deletions dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql
@@ -5,6 +5,16 @@
{%- endmacro -%}

{% macro bigquery__get_empty_schema_sql(columns) %}
{%- set col_err = [] -%}
{% for col in columns.values() %}
{%- if col['data_type'] is not defined -%}
{{ col_err.append(col['name']) }}
{%- endif -%}
{%- endfor -%}
{%- if (col_err | length) > 0 -%}
{{ exceptions.column_type_missing(column_names=col_err) }}
{%- endif -%}
Contributor Author (@MichelleArk, Jul 10, 2023):

This PR introduces some unsatisfying duplication in the bigquery version of the get_empty_schema_sql macro.

However, the check has to happen before the columns get nested so that the error message can name the exact field. If it were deferred to the handling in default__get_empty_schema_sql, the columns would already be nested and it would no longer be possible to determine which nested field had a missing data_type value.

Once the nested handling of model contracts moves into core, this duplication should go away and the check should live only in the default__get_empty_schema_sql implementation.
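
(A rough sketch with hypothetical input, using get_nested_column_data_types from this diff, of why the pre-nesting check is needed: after nesting, the missing data_type on b.nested can no longer be attributed to a specific field.)

>>> from dbt.adapters.bigquery.column import get_nested_column_data_types
>>> flat = {
...     "b.nested": {"name": "b.nested"},  # data_type missing here
...     "b.other": {"name": "b.other", "data_type": "string"},
... }
>>> get_nested_column_data_types(flat)
{'b': {'name': 'b', 'data_type': 'struct<nested, other string>'}}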


{%- set columns = adapter.nest_column_data_types(columns) -%}
{{ return(dbt.default__get_empty_schema_sql(columns)) }}
{% endmacro %}
111 changes: 111 additions & 0 deletions tests/unit/test_column.py
@@ -14,6 +14,12 @@
None,
{"a": {"name": "a", "data_type": "string"}},
),
# Flat column - missing data_type
(
{"a": {"name": "a"}},
None,
{"a": {"name": "a", "data_type": None}},
),
# Flat column - with constraints
(
{"a": {"name": "a", "data_type": "string"}},
@@ -32,18 +38,75 @@
None,
{"b": {"name": "b", "data_type": "struct<nested string>"}},
),
# Single nested column, 1 level - missing data_type
(
{"b.nested": {"name": "b.nested"}},
None,
{"b": {"name": "b", "data_type": "struct<nested>"}},
),
# Single nested column, 1 level - with constraints
(
{"b.nested": {"name": "b.nested", "data_type": "string"}},
{"b.nested": "not null"},
{"b": {"name": "b", "data_type": "struct<nested string not null>"}},
),
# Single nested column, 1 level - with constraints, missing data_type (constraints not valid without data_type)
(
{"b.nested": {"name": "b.nested"}},
{"b.nested": "not null"},
{"b": {"name": "b", "data_type": "struct<nested>"}},
),
# Single nested column, 1 level - with constraints + other keys
(
{"b.nested": {"name": "b.nested", "data_type": "string", "other": "unpreserved"}},
{"b.nested": "not null"},
{"b": {"name": "b", "data_type": "struct<nested string not null>"}},
),
# Single nested column, 1 level - with corresponding parent column
(
{
"b": {"name": "b", "data_type": "struct"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
None,
{"b": {"name": "b", "data_type": "struct<nested string>"}},
),
# Single nested column, 1 level - with corresponding parent column specified last
(
{
"b.nested": {"name": "b.nested", "data_type": "string"},
"b": {"name": "b", "data_type": "struct"},
},
None,
{"b": {"name": "b", "data_type": "struct<nested string>"}},
),
# Single nested column, 1 level - with corresponding parent column + parent constraint
(
{
"b": {"name": "b", "data_type": "struct"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
{"b": "not null"},
{"b": {"name": "b", "data_type": "struct<nested string> not null"}},
),
# Single nested column, 1 level - with corresponding parent column as array
(
{
"b": {"name": "b", "data_type": "array"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
None,
{"b": {"name": "b", "data_type": "array<struct<nested string>>"}},
),
# Single nested column, 1 level - with corresponding parent column as array + constraint
(
{
"b": {"name": "b", "data_type": "array"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
{"b": "not null"},
{"b": {"name": "b", "data_type": "array<struct<nested string>> not null"}},
),
# Multiple nested columns, 1 level
(
{
@@ -106,6 +169,28 @@
},
},
),
# Nested columns, multiple levels - missing data_type
(
{
"b.user.name.first": {
"name": "b.user.name.first",
"data_type": "string",
},
"b.user.name.last": {
"name": "b.user.name.last",
"data_type": "string",
},
"b.user.id": {"name": "b.user.id", "data_type": "int64"},
"b.user.country": {"name": "b.user.country"}, # missing data_type
},
None,
{
"b": {
"name": "b",
"data_type": "struct<user struct<name struct<first string, last string>, id int64, country>>",
},
},
),
# Nested columns, multiple levels - with constraints!
(
{
@@ -128,6 +213,32 @@
},
},
),
# Nested columns, multiple levels - with parent arrays and constraints!
(
{
"b.user.names": {
"name": "b.user.names",
"data_type": "array",
},
"b.user.names.first": {
"name": "b.user.names.first",
"data_type": "string",
},
"b.user.names.last": {
"name": "b.user.names.last",
"data_type": "string",
},
"b.user.id": {"name": "b.user.id", "data_type": "int64"},
"b.user.country": {"name": "b.user.country", "data_type": "string"},
},
{"b.user.names.first": "not null", "b.user.id": "unique"},
{
"b": {
"name": "b",
"data_type": "struct<user struct<names array<struct<first string not null, last string>>, id int64 unique, country string>>",
},
},
),
],
)
def test_get_nested_column_data_types(columns, constraints, expected_nested_columns):