Skip to content

Commit

Permalink
Contracts: Handle struct column specified both at root and nested lev…
Browse files Browse the repository at this point in the history
…els + arrays of structs (#806)
  • Loading branch information
MichelleArk committed Jul 11, 2023
1 parent 2f80a27 commit 291713c
Show file tree
Hide file tree
Showing 5 changed files with 189 additions and 18 deletions.
7 changes: 7 additions & 0 deletions .changes/unreleased/Fixes-20230630-213112.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
kind: Fixes
body: 'Contracts: Handle struct column specified both at root and nested levels +
arrays of structs'
time: 2023-06-30T21:31:12.63257-04:00
custom:
Author: michelleark
Issue: 781 782
77 changes: 60 additions & 17 deletions dbt/adapters/bigquery/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from google.cloud.bigquery import SchemaField

_PARENT_DATA_TYPE_KEY = "__parent_data_type"

Self = TypeVar("Self", bound="BigQueryColumn")


Expand Down Expand Up @@ -131,7 +133,7 @@ def column_to_bq_schema(self) -> SchemaField:
def get_nested_column_data_types(
columns: Dict[str, Dict[str, Any]],
constraints: Optional[Dict[str, str]] = None,
) -> Dict[str, Dict[str, str]]:
) -> Dict[str, Dict[str, Optional[str]]]:
"""
columns:
* Dictionary where keys are flat column names and values are dictionaries of column attributes
Expand Down Expand Up @@ -159,16 +161,16 @@ def get_nested_column_data_types(
"""
constraints = constraints or {}

nested_column_data_types: Dict[str, Union[str, Dict]] = {}
nested_column_data_types: Dict[str, Optional[Union[str, Dict]]] = {}
for column in columns.values():
_update_nested_column_data_types(
column["name"],
column["data_type"],
column.get("data_type"),
constraints.get(column["name"]),
nested_column_data_types,
)

formatted_nested_column_data_types: Dict[str, Dict[str, str]] = {}
formatted_nested_column_data_types: Dict[str, Dict[str, Optional[str]]] = {}
for column_name, unformatted_column_type in nested_column_data_types.items():
formatted_nested_column_data_types[column_name] = {
"name": column_name,
Expand All @@ -191,9 +193,9 @@ def get_nested_column_data_types(

def _update_nested_column_data_types(
column_name: str,
column_data_type: str,
column_data_type: Optional[str],
column_rendered_constraint: Optional[str],
nested_column_data_types: Dict[str, Union[str, Dict]],
nested_column_data_types: Dict[str, Optional[Union[str, Dict]]],
) -> None:
"""
Recursively update nested_column_data_types given a column_name, column_data_type, and optional column_rendered_constraint.
Expand All @@ -215,15 +217,38 @@ def _update_nested_column_data_types(

if len(column_name_parts) == 1:
# Base case: column is not nested - store its data_type concatenated with constraint if provided.
nested_column_data_types[root_column_name] = (
column_data_type
if column_rendered_constraint is None
else f"{column_data_type} {column_rendered_constraint}"
column_data_type_and_constraints = (
(
column_data_type
if column_rendered_constraint is None
else f"{column_data_type} {column_rendered_constraint}"
)
if column_data_type
else None
)

if existing_nested_column_data_type := nested_column_data_types.get(root_column_name):
assert isinstance(existing_nested_column_data_type, dict) # keeping mypy happy
# entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY"
existing_nested_column_data_type.update(
{_PARENT_DATA_TYPE_KEY: column_data_type_and_constraints}
)
else:
nested_column_data_types.update({root_column_name: column_data_type_and_constraints})
else:
# Initialize nested dictionary
if root_column_name not in nested_column_data_types:
nested_column_data_types[root_column_name] = {}
parent_data_type = nested_column_data_types.get(root_column_name)
if isinstance(parent_data_type, dict):
# nested dictionary already initialized
pass
elif parent_data_type is None:
# initialize nested dictionary
nested_column_data_types.update({root_column_name: {}})
else:
# a parent specified its base type -- preserve its data_type and potential rendered constraints
# this is used to specify a top-level 'struct' or 'array' field with its own description, constraints, etc
nested_column_data_types.update(
{root_column_name: {_PARENT_DATA_TYPE_KEY: parent_data_type}}
)

# Recursively process rest of remaining column name
remaining_column_name = ".".join(column_name_parts[1:])
Expand All @@ -237,7 +262,9 @@ def _update_nested_column_data_types(
)


def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, Any]]) -> str:
def _format_nested_data_type(
unformatted_nested_data_type: Optional[Union[str, Dict[str, Any]]]
) -> Optional[str]:
"""
Recursively format a (STRUCT) data type given an arbitrarily nested data type structure.
Expand All @@ -249,11 +276,27 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str,
>>> BigQueryAdapter._format_nested_data_type({'c': 'string not_null', 'd': {'e': 'string'}})
'struct<c string not_null, d struct<e string>>'
"""
if isinstance(unformatted_nested_data_type, str):
if unformatted_nested_data_type is None:
return None
elif isinstance(unformatted_nested_data_type, str):
return unformatted_nested_data_type
else:
parent_data_type, *parent_constraints = unformatted_nested_data_type.pop(
_PARENT_DATA_TYPE_KEY, ""
).split() or [None]

formatted_nested_types = [
f"{column_name} {_format_nested_data_type(column_type)}"
f"{column_name} {_format_nested_data_type(column_type) or ''}".strip()
for column_name, column_type in unformatted_nested_data_type.items()
]
return f"""struct<{", ".join(formatted_nested_types)}>"""

formatted_nested_type = f"""struct<{", ".join(formatted_nested_types)}>"""

if parent_data_type and parent_data_type.lower() == "array":
formatted_nested_type = f"""array<{formatted_nested_type}>"""

if parent_constraints:
parent_constraints = " ".join(parent_constraints)
formatted_nested_type = f"""{formatted_nested_type} {parent_constraints}"""

return formatted_nested_type
2 changes: 1 addition & 1 deletion dbt/adapters/bigquery/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ def nest_column_data_types(
cls,
columns: Dict[str, Dict[str, Any]],
constraints: Optional[Dict[str, str]] = None,
) -> Dict[str, Dict[str, str]]:
) -> Dict[str, Dict[str, Optional[str]]]:
return get_nested_column_data_types(columns, constraints)

def get_columns_in_relation(self, relation: BigQueryRelation) -> List[BigQueryColumn]:
Expand Down
10 changes: 10 additions & 0 deletions dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@
{%- endmacro -%}

{% macro bigquery__get_empty_schema_sql(columns) %}
{#-
  BigQuery implementation of get_empty_schema_sql.

  columns: mapping whose values are column dicts; each is read for its
  'name' and 'data_type' entries.

  Steps:
    1. Collect the names of all columns that have no 'data_type' and report
       them together through exceptions.column_type_missing.
    2. Pass the columns through adapter.nest_column_data_types.
    3. Return whatever dbt.default__get_empty_schema_sql produces for the
       nested columns.
-#}
{%- set col_err = [] -%}
{#- Gather every offending column first so a single error can list them all. -#}
{% for col in columns.values() %}
{%- if col['data_type'] is not defined -%}
{{ col_err.append(col['name']) }}
{%- endif -%}
{%- endfor -%}
{#- NOTE(review): column_type_missing presumably raises and aborts rendering; confirm against dbt's exceptions API. -#}
{%- if (col_err | length) > 0 -%}
{{ exceptions.column_type_missing(column_names=col_err) }}
{%- endif -%}

{#- The check above runs before `columns` is rebound, so the reported names are the original ones from the input mapping. -#}
{%- set columns = adapter.nest_column_data_types(columns) -%}
{{ return(dbt.default__get_empty_schema_sql(columns)) }}
{% endmacro %}
Expand Down
111 changes: 111 additions & 0 deletions tests/unit/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
None,
{"a": {"name": "a", "data_type": "string"}},
),
# Flat column - missing data_type
(
{"a": {"name": "a"}},
None,
{"a": {"name": "a", "data_type": None}},
),
# Flat column - with constraints
(
{"a": {"name": "a", "data_type": "string"}},
Expand All @@ -32,18 +38,75 @@
None,
{"b": {"name": "b", "data_type": "struct<nested string>"}},
),
# Single nested column, 1 level - missing data_type
(
{"b.nested": {"name": "b.nested"}},
None,
{"b": {"name": "b", "data_type": "struct<nested>"}},
),
# Single nested column, 1 level - with constraints
(
{"b.nested": {"name": "b.nested", "data_type": "string"}},
{"b.nested": "not null"},
{"b": {"name": "b", "data_type": "struct<nested string not null>"}},
),
# Single nested column, 1 level - with constraints, missing data_type (constraints not valid without data_type)
(
{"b.nested": {"name": "b.nested"}},
{"b.nested": "not null"},
{"b": {"name": "b", "data_type": "struct<nested>"}},
),
# Single nested column, 1 level - with constraints + other keys
(
{"b.nested": {"name": "b.nested", "data_type": "string", "other": "unpreserved"}},
{"b.nested": "not null"},
{"b": {"name": "b", "data_type": "struct<nested string not null>"}},
),
# Single nested column, 1 level - with corresponding parent column
(
{
"b": {"name": "b", "data_type": "struct"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
None,
{"b": {"name": "b", "data_type": "struct<nested string>"}},
),
# Single nested column, 1 level - with corresponding parent column specified last
(
{
"b.nested": {"name": "b.nested", "data_type": "string"},
"b": {"name": "b", "data_type": "struct"},
},
None,
{"b": {"name": "b", "data_type": "struct<nested string>"}},
),
# Single nested column, 1 level - with corresponding parent column + parent constraint
(
{
"b": {"name": "b", "data_type": "struct"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
{"b": "not null"},
{"b": {"name": "b", "data_type": "struct<nested string> not null"}},
),
# Single nested column, 1 level - with corresponding parent column as array
(
{
"b": {"name": "b", "data_type": "array"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
None,
{"b": {"name": "b", "data_type": "array<struct<nested string>>"}},
),
# Single nested column, 1 level - with corresponding parent column as array + constraint
(
{
"b": {"name": "b", "data_type": "array"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
{"b": "not null"},
{"b": {"name": "b", "data_type": "array<struct<nested string>> not null"}},
),
# Multiple nested columns, 1 level
(
{
Expand Down Expand Up @@ -106,6 +169,28 @@
},
},
),
# Nested columns, multiple levels - missing data_type
(
{
"b.user.name.first": {
"name": "b.user.name.first",
"data_type": "string",
},
"b.user.name.last": {
"name": "b.user.name.last",
"data_type": "string",
},
"b.user.id": {"name": "b.user.id", "data_type": "int64"},
"b.user.country": {"name": "b.user.country"}, # missing data_type
},
None,
{
"b": {
"name": "b",
"data_type": "struct<user struct<name struct<first string, last string>, id int64, country>>",
},
},
),
# Nested columns, multiple levels - with constraints!
(
{
Expand All @@ -128,6 +213,32 @@
},
},
),
# Nested columns, multiple levels - with parent arrays and constraints!
(
{
"b.user.names": {
"name": "b.user.names",
"data_type": "array",
},
"b.user.names.first": {
"name": "b.user.names.first",
"data_type": "string",
},
"b.user.names.last": {
"name": "b.user.names.last",
"data_type": "string",
},
"b.user.id": {"name": "b.user.id", "data_type": "int64"},
"b.user.country": {"name": "b.user.country", "data_type": "string"},
},
{"b.user.names.first": "not null", "b.user.id": "unique"},
{
"b": {
"name": "b",
"data_type": "struct<user struct<names array<struct<first string not null, last string>>, id int64 unique, country string>>",
},
},
),
],
)
def test_get_nested_column_data_types(columns, constraints, expected_nested_columns):
Expand Down

0 comments on commit 291713c

Please sign in to comment.