From e8327900cb13c394f0f6aa4278eb565f6ef49675 Mon Sep 17 00:00:00 2001
From: Vasenkov Ilia
Date: Thu, 12 Sep 2024 10:49:56 +0200
Subject: [PATCH] Track encryption per data part instead of per backup

Store an "encrypted" flag in part and backup metadata instead of the
backup-wide "is_encryption_enabled" flag, so that parts deduplicated from
older backups keep their own encryption state. Create statements, access
control objects, UDFs and named collections are now always uploaded
encrypted; only data parts honour the stored flag. The config option
encryption.is_enabled is renamed to encryption.enabled.
---
 ch_backup/backup/deduplication.py             |   8 +-
 ch_backup/backup/layout.py                    |  36 ++---
 ch_backup/backup/metadata/backup_metadata.py  |   8 +-
 ch_backup/backup/metadata/part_metadata.py    |  16 ++-
 ch_backup/backup/metadata/table_metadata.py   |   1 +
 ch_backup/ch_backup.py                        |   4 +-
 ch_backup/clickhouse/control.py               |  12 +-
 ch_backup/clickhouse/models.py                |   3 +
 ch_backup/config.py                           |   2 +-
 ch_backup/logic/table.py                      |   2 +-
 .../partially_encrypted_backups.feature       | 131 +++----------
 tests/integration/steps/clickhouse.py         |   9 +-
 tests/unit/test_pipeline.py                   |   6 +-
 13 files changed, 80 insertions(+), 158 deletions(-)

diff --git a/ch_backup/backup/deduplication.py b/ch_backup/backup/deduplication.py
index b703a825..725751b1 100644
--- a/ch_backup/backup/deduplication.py
+++ b/ch_backup/backup/deduplication.py
@@ -15,6 +15,7 @@
 from ch_backup.util import Slotted, utcnow
 
 
+# pylint: disable=too-many-instance-attributes
 class PartDedupInfo(Slotted):
     """
     Information about data part to use for deduplication / creation incremental backups.
@@ -31,6 +32,7 @@ class PartDedupInfo(Slotted):
         "tarball",
         "disk_name",
         "verified",
+        "encrypted",
     )
 
     # pylint: disable=too-many-arguments
@@ -46,6 +48,7 @@ def __init__(
         tarball: bool,
         disk_name: str,
         verified: bool,
+        encrypted: bool,
     ) -> None:
         self.database = database
         self.table = table
@@ -57,13 +60,14 @@ def __init__(
         self.tarball = tarball
         self.disk_name = disk_name
         self.verified = verified
+        self.encrypted = encrypted
 
     def to_sql(self):
         """
         Convert to string to use it in insert query
         """
         files_array = "[" + ",".join(f"'{file}'" for file in self.files) + "]"
-        return f"('{self.database}','{self.table}','{self.name}','{self.backup_path}','{self.checksum}',{self.size},{files_array},{int(self.tarball)},'{self.disk_name}',{int(self.verified)})"
+        return f"('{self.database}','{self.table}','{self.name}','{self.backup_path}','{self.checksum}',{self.size},{files_array},{int(self.tarball)},'{self.disk_name}',{int(self.verified)},{int(self.encrypted)})"
 
 
 TableDedupReferences = Set[str]
@@ -204,6 +208,7 @@ def _populate_dedup_info(
                 tarball=part.tarball,
                 disk_name=part.disk_name,
                 verified=verified,
+                encrypted=part.encrypted,
             )
             table_dedup_info.add(part.name)
 
@@ -247,6 +252,7 @@ def deduplicate_parts(
             files=existing_part["files"],
             tarball=existing_part["tarball"],
             disk_name=existing_part["disk_name"],
+            encrypted=existing_part.get("encrypted", True),
         )
 
         if not existing_part["verified"]:
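Note on the dedup row format: to_sql() renders one tuple per part for insertion
into the deduplication table, and the new flag is serialized as a trailing 0/1
column after verified. A minimal sketch, not part of the patch, assuming the
constructor keywords match the slot names; all field values are invented:

    from ch_backup.backup.deduplication import PartDedupInfo

    part = PartDedupInfo(
        database="test_db",
        table="table_01",
        name="20240912_1_1_0",
        backup_path="ch_backup/backup_1",
        checksum="deadbeef",
        size=1024,
        files=["checksums.txt", "columns.txt"],
        tarball=True,
        disk_name="default",
        verified=True,
        encrypted=False,
    )
    # The rendered tuple ends with ,1,'default',1,0): tarball=1, verified=1, encrypted=0.
    print(part.to_sql())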
"{remote_path}"' @@ -151,7 +151,7 @@ def upload_access_control_files( local_path, remote_path, files=file_names, - encryption=backup_meta.is_encryption_enabled, + encryption=True, ) except Exception as e: @@ -169,7 +169,7 @@ def upload_udf( self._storage_loader.upload_data( data=metadata, remote_path=remote_path, - encryption=backup_meta.is_encryption_enabled, + encryption=True, ) except Exception as e: msg = f'Failed to upload udf metadata "{remote_path}"' @@ -201,7 +201,7 @@ def upload_data_part( files=fpart.files, remote_path=remote_path, is_async=True, - encryption=backup_meta.is_encryption_enabled, + encryption=backup_meta.encrypted, delete=True, callback=callback, ) @@ -260,7 +260,7 @@ def upload_named_collections_create_statement( self._storage_loader.upload_file( local_path, remote_path=remote_path, - encryption=backup_meta.is_encryption_enabled, + encryption=True, ) except Exception as e: msg = f"Failed to create async upload of {remote_path}" @@ -277,7 +277,7 @@ def get_udf_create_statement( ) return self._storage_loader.download_data( remote_path, - encryption=backup_meta.is_encryption_enabled, + encryption=True, # backup_meta.encrypted, ) def get_local_nc_create_statement(self, nc_name: str) -> Optional[str]: @@ -306,7 +306,7 @@ def get_named_collection_create_statement( remote_path = _named_collections_data_path(backup_meta.path, filename) return self._storage_loader.download_data( remote_path, - encryption=backup_meta.is_encryption_enabled, + encryption=True, ) def get_backup_names(self) -> Sequence[str]: @@ -395,10 +395,7 @@ def get_database_create_statement( Download and return database create statement. """ remote_path = _db_metadata_path(backup_meta.path, db_name) - return self._storage_loader.download_data( - remote_path, - encryption=backup_meta.is_encryption_enabled, - ) + return self._storage_loader.download_data(remote_path, encryption=True) def write_database_metadata(self, db: Database, db_sql: str) -> None: """ @@ -417,10 +414,7 @@ def get_table_create_statement( Download and return table create statement. 
""" remote_path = _table_metadata_path(backup_meta.path, db_name, table_name) - return self._storage_loader.download_data( - remote_path, - encryption=backup_meta.is_encryption_enabled, - ) + return self._storage_loader.download_data(remote_path, encryption=True) def download_access_control_file( self, local_path: str, backup_meta: BackupMetadata, file_name: str @@ -439,7 +433,7 @@ def download_access_control_file( self._storage_loader.download_file( remote_path, local_path, - encryption=backup_meta.is_encryption_enabled, + encryption=True, ) except Exception as e: msg = f"Failed to download access control metadata file {remote_path}" @@ -461,7 +455,7 @@ def download_access_control( self._storage_loader.download_files( remote_path, local_path, - encryption=backup_meta.is_encryption_enabled, + encryption=True, ) except Exception as e: msg = f"Failed to download access control metadata file {remote_path}" @@ -498,7 +492,7 @@ def download_data_part( remote_path=remote_path, local_path=fs_part_path, is_async=True, - encryption=backup_meta.is_encryption_enabled, + encryption=part.encrypted, ) except Exception as e: msg = f"Failed to download part tarball file {remote_path}" @@ -513,7 +507,7 @@ def download_data_part( remote_path=remote_path, local_path=local_path, is_async=True, - encryption=backup_meta.is_encryption_enabled, + encryption=part.encrypted, ) except Exception as e: msg = f"Failed to download part file {remote_path}" diff --git a/ch_backup/backup/metadata/backup_metadata.py b/ch_backup/backup/metadata/backup_metadata.py index 57b2cee1..cfab0a59 100644 --- a/ch_backup/backup/metadata/backup_metadata.py +++ b/ch_backup/backup/metadata/backup_metadata.py @@ -48,7 +48,7 @@ def __init__( hostname: str = None, labels: dict = None, schema_only: bool = False, - is_encryption_enabled: bool = True, + encrypted: bool = True, ) -> None: self.name = name self.path = path @@ -62,7 +62,7 @@ def __init__( self.size = 0 self.real_size = 0 self.schema_only = schema_only - self.is_encryption_enabled = is_encryption_enabled + self.encrypted = encrypted self.cloud_storage: CloudStorageMetadata = CloudStorageMetadata() self._state = BackupState.CREATING @@ -147,7 +147,7 @@ def dump(self, light: bool = False) -> dict: # to replace 'date_fmt' with 'time_format'. "date_fmt": self.time_format, "schema_only": self.schema_only, - "is_encryption_enabled": self.is_encryption_enabled, + "encrypted": self.encrypted, }, } @@ -199,7 +199,7 @@ def load(cls, data: dict) -> "BackupMetadata": backup.labels = meta["labels"] backup.version = meta["version"] backup.schema_only = meta.get("schema_only", False) - backup.is_encryption_enabled = meta.get("is_encryption_enabled", True) + backup.encrypted = meta.get("encrypted", True) # TODO remove after a several weeks/months, when backups rotated # OR NOT TODO, because of backward compatibility backup._user_defined_functions = data.get( diff --git a/ch_backup/backup/metadata/part_metadata.py b/ch_backup/backup/metadata/part_metadata.py index 74013d99..48c94126 100644 --- a/ch_backup/backup/metadata/part_metadata.py +++ b/ch_backup/backup/metadata/part_metadata.py @@ -13,7 +13,7 @@ class RawMetadata(Slotted): Raw metadata for ClickHouse data part. 
""" - __slots__ = "checksum", "size", "files", "tarball", "link", "disk_name" + __slots__ = "checksum", "size", "files", "tarball", "link", "disk_name", "encrypted" def __init__( self, @@ -23,6 +23,7 @@ def __init__( tarball: bool, link: str = None, disk_name: str = None, + encrypted: bool = True, ) -> None: self.checksum = checksum self.size = size @@ -30,6 +31,7 @@ def __init__( self.tarball = tarball self.link = link self.disk_name = disk_name + self.encrypted = encrypted class PartMetadata(Slotted): @@ -51,12 +53,13 @@ def __init__( tarball: bool, link: str = None, disk_name: str = None, + encrypted: bool = True, ) -> None: self.database: str = database self.table: str = table self.name: str = name self.raw_metadata: RawMetadata = RawMetadata( - checksum, size, files, tarball, link, disk_name + checksum, size, files, tarball, link, disk_name, encrypted ) @property @@ -101,6 +104,13 @@ def tarball(self) -> bool: """ return self.raw_metadata.tarball + @property + def encrypted(self) -> bool: + """ + Returns true if part is encrypted + """ + return self.raw_metadata.encrypted + @classmethod def load( cls, db_name: str, table_name: str, part_name: str, raw_metadata: dict @@ -118,6 +128,7 @@ def load( tarball=raw_metadata.get("tarball", False), link=raw_metadata["link"], disk_name=raw_metadata.get("disk_name", "default"), + encrypted=raw_metadata.get("encrypted", True), ) @classmethod @@ -134,4 +145,5 @@ def from_frozen_part(cls, frozen_part: FrozenPart) -> "PartMetadata": files=frozen_part.files, tarball=True, disk_name=frozen_part.disk_name, + encrypted=frozen_part.encrypted, ) diff --git a/ch_backup/backup/metadata/table_metadata.py b/ch_backup/backup/metadata/table_metadata.py index 7fbd38ce..23390fe1 100644 --- a/ch_backup/backup/metadata/table_metadata.py +++ b/ch_backup/backup/metadata/table_metadata.py @@ -70,6 +70,7 @@ def add_part(self, part: PartMetadata) -> None: "link": part.link, "tarball": part.tarball, "disk_name": part.disk_name, + "encrypted": part.encrypted, } @classmethod diff --git a/ch_backup/ch_backup.py b/ch_backup/ch_backup.py index 05534456..48c47e82 100644 --- a/ch_backup/ch_backup.py +++ b/ch_backup/ch_backup.py @@ -150,8 +150,8 @@ def backup( ch_version=self._context.ch_ctl.get_version(), time_format=self._context.config["time_format"], schema_only=sources.schema_only, - is_encryption_enabled=self._config.get(EncryptStage.stype, {}).get( - "is_enabled", True + encrypted=self._config.get(EncryptStage.stype, {}).get( + "enabled", True ), ) diff --git a/ch_backup/clickhouse/control.py b/ch_backup/clickhouse/control.py index c01c88b1..9c592432 100644 --- a/ch_backup/clickhouse/control.py +++ b/ch_backup/clickhouse/control.py @@ -15,7 +15,7 @@ from pkg_resources import parse_version from ch_backup import logging -from ch_backup.backup.metadata import TableMetadata +from ch_backup.backup.metadata import BackupMetadata, TableMetadata from ch_backup.backup.restore_context import RestoreContext from ch_backup.calculators import calc_aligned_files_size from ch_backup.clickhouse.client import ClickhouseClient @@ -182,7 +182,8 @@ files Array(String), tarball Bool, disk_name String, - verified Bool + verified Bool, + encrypted Nullable(Bool) ) ENGINE = MergeTree() ORDER BY (database, table, name, checksum) @@ -724,13 +725,15 @@ def get_zookeeper_admin_uuid(self) -> Dict[str, str]: @staticmethod def scan_frozen_parts( - table: Table, disk: Disk, data_path: str, backup_name: str + table: Table, disk: Disk, data_path: str, backup_meta: BackupMetadata, ) -> Iterable[FrozenPart]: 
""" Yield frozen parts from specific disk and path. """ table_relative_path = os.path.relpath(data_path, disk.path) - path = os.path.join(disk.path, "shadow", backup_name, table_relative_path) + path = os.path.join( + disk.path, "shadow", backup_meta.get_sanitized_name(), table_relative_path + ) if not os.path.exists(path): logging.debug("Shadow path {} is empty", path) @@ -757,6 +760,7 @@ def scan_frozen_parts( checksum, size, rel_paths, + backup_meta.encrypted, ) @staticmethod diff --git a/ch_backup/clickhouse/models.py b/ch_backup/clickhouse/models.py index 10679373..5b277852 100644 --- a/ch_backup/clickhouse/models.py +++ b/ch_backup/clickhouse/models.py @@ -236,6 +236,7 @@ class FrozenPart(Slotted): "checksum", "size", "files", + "encrypted", ) def __init__( @@ -248,6 +249,7 @@ def __init__( checksum: str, size: int, files: List[str], + encrypted: bool, ): super().__init__() self.database = database @@ -258,3 +260,4 @@ def __init__( self.checksum = checksum self.size = size self.files = files + self.encrypted = encrypted diff --git a/ch_backup/config.py b/ch_backup/config.py index 96dabceb..35389800 100644 --- a/ch_backup/config.py +++ b/ch_backup/config.py @@ -131,7 +131,7 @@ def _as_seconds(t: str) -> int: "queue_size": 10, }, "encryption": { - "is_enabled": True, + "enabled": True, "type": "nacl", # Chunk size used when encrypting / decrypting data, in bytes. "chunk_size": parse_size("8 MiB"), diff --git a/ch_backup/logic/table.py b/ch_backup/logic/table.py index ebd1516a..906275ac 100644 --- a/ch_backup/logic/table.py +++ b/ch_backup/logic/table.py @@ -475,7 +475,7 @@ def deduplicate_parts_in_batch( dedup_batch_size = context.config["deduplication_batch_size"] for data_path, disk in table.paths_with_disks: for fpart in context.ch_ctl.scan_frozen_parts( - table, disk, data_path, backup_name + table, disk, data_path, context.backup_meta ): logging.debug("Working on {}", fpart) if disk.type == "s3": diff --git a/tests/integration/features/partially_encrypted_backups.feature b/tests/integration/features/partially_encrypted_backups.feature index 8440686f..3577e5e8 100644 --- a/tests/integration/features/partially_encrypted_backups.feature +++ b/tests/integration/features/partially_encrypted_backups.feature @@ -6,132 +6,41 @@ Feature: Support partially encrypted backups And a working zookeeper on zookeeper01 And a working clickhouse on clickhouse01 And a working clickhouse on clickhouse02 + And clickhouse on clickhouse01 has test schema - Scenario: Restore backup with disabled encryption + Scenario: Create and restore backup with encryption check Given ch-backup configuration on clickhouse01 """ encryption: - type: noop - is_enabled: False + type: nacl + enabled: True + key: odaYtYjhvmeP8GO7vwWlXsViiDbgu4Ti """ Given ch-backup configuration on clickhouse02 """ encryption: - type: noop - is_enabled: False - """ - Given we have executed queries on clickhouse01 - """ - CREATE DATABASE test_db; - - CREATE TABLE test_db.table_01 ON CLUSTER 'default' (date Date, n Int32) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_db/table', '{replica}') PARTITION BY date ORDER BY date; - INSERT INTO test_db.table_01 SELECT today(), number FROM system.numbers LIMIT 10; - """ - When we create clickhouse01 clickhouse backup - When we restore clickhouse backup #0 to clickhouse02 - Then clickhouse02 has same schema as clickhouse01 - And we got same clickhouse data at clickhouse01 clickhouse02 - And metadata of clickhouse01 backup #0 contains - """ - is_encryption_enabled: False - """ - - Scenario: 
diff --git a/tests/integration/features/partially_encrypted_backups.feature b/tests/integration/features/partially_encrypted_backups.feature
index 8440686f..3577e5e8 100644
--- a/tests/integration/features/partially_encrypted_backups.feature
+++ b/tests/integration/features/partially_encrypted_backups.feature
@@ -6,132 +6,41 @@ Feature: Support partially encrypted backups
     And a working zookeeper on zookeeper01
     And a working clickhouse on clickhouse01
     And a working clickhouse on clickhouse02
+    And clickhouse on clickhouse01 has test schema
 
-  Scenario: Restore backup with disabled encryption
+  Scenario: Create and restore backup with encryption check
     Given ch-backup configuration on clickhouse01
     """
     encryption:
-        type: noop
-        is_enabled: False
+        type: nacl
+        enabled: True
+        key: odaYtYjhvmeP8GO7vwWlXsViiDbgu4Ti
     """
     Given ch-backup configuration on clickhouse02
     """
     encryption:
-        type: noop
-        is_enabled: False
-    """
-    Given we have executed queries on clickhouse01
-    """
-    CREATE DATABASE test_db;
-
-    CREATE TABLE test_db.table_01 ON CLUSTER 'default' (date Date, n Int32)
-    ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_db/table', '{replica}') PARTITION BY date ORDER BY date;
-    INSERT INTO test_db.table_01 SELECT today(), number FROM system.numbers LIMIT 10;
-    """
-    When we create clickhouse01 clickhouse backup
-    When we restore clickhouse backup #0 to clickhouse02
-    Then clickhouse02 has same schema as clickhouse01
-    And we got same clickhouse data at clickhouse01 clickhouse02
-    And metadata of clickhouse01 backup #0 contains
-    """
-    is_encryption_enabled: False
-    """
-
-  Scenario: Restore backup after encryption disabled with no metadata
-    Given ch-backup configuration on clickhouse01
-    """
-    encryption:
-        type: noop
-    """
-    Given ch-backup configuration on clickhouse02
-    """
-    encryption:
-        type: noop
-        is_enabled: True
-    """
-    Given we have executed queries on clickhouse01
-    """
-    CREATE DATABASE test_db;
-
-    CREATE TABLE test_db.table_01 ON CLUSTER 'default' (date Date, n Int32)
-    ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_db/table', '{replica}') PARTITION BY date ORDER BY date;
-    INSERT INTO test_db.table_01 SELECT today(), number FROM system.numbers LIMIT 10;
+        type: nacl
+        enabled: False
+        key: odaYtYjhvmeP8GO7vwWlXsViiDbgu4Ti
     """
+    And clickhouse01 has test clickhouse data test1
     When we create clickhouse01 clickhouse backup
-    When metadata paths of clickhouse01 backup #0 was deleted
-    """
-    - databases.test_db.engine
-    - databases.test_db.metadata_path
-    """
-    Given file "metadata/test_db/table_01.sql" in clickhouse01 backup #0 data set to
-    """
-    CREATE TABLE test_db.table_01 ON CLUSTER 'default' (date Date, n Int32)
-    ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_db/table', '{replica}') PARTITION BY date ORDER BY date
-    """
-    When we restore clickhouse backup #0 to clickhouse02
-    Then clickhouse02 has same schema as clickhouse01
-    And we got same clickhouse data at clickhouse01 clickhouse02
-    And metadata of clickhouse01 backup #0 contains
-    """
-    is_encryption_enabled: True
-    """
-    When we update ch-backup configuration on clickhouse01
-    """
-    encryption:
-        type: noop
-        is_enabled: False
-    """
-    When we update ch-backup configuration on clickhouse02
-    """
-    encryption:
-        type: noop
-        is_enabled: False
-    """
-    When we restore clickhouse backup #0 to clickhouse02
-    Then clickhouse02 has same schema as clickhouse01
-    And we got not same clickhouse data at clickhouse01 clickhouse02
-    And metadata of clickhouse01 backup #0 contains
-    """
-    is_encryption_enabled: True
-    """
-
-  Scenario: Restore backup after encryption disabled with valid metadata
+    Then we got the following backups on clickhouse01
+      | num | state   | data_count | link_count |
+      | 0   | created | 4          | 0          |
     Given ch-backup configuration on clickhouse01
     """
     encryption:
-        type: noop
-    """
-    Given ch-backup configuration on clickhouse02
-    """
-    encryption:
-        type: noop
-        is_enabled: True
-    """
-    Given we have executed queries on clickhouse01
-    """
-    CREATE DATABASE test_db;
-
-    CREATE TABLE test_db.table_01 ON CLUSTER 'default' (date Date, n Int32)
-    ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_db/table', '{replica}') PARTITION BY date ORDER BY date;
-    INSERT INTO test_db.table_01 SELECT today(), number FROM system.numbers LIMIT 10;
+        type: nacl
+        enabled: False
+        key: odaYtYjhvmeP8GO7vwWlXsViiDbgu4Ti
     """
+    And clickhouse01 has test clickhouse data test2
     When we create clickhouse01 clickhouse backup
-    When we update ch-backup configuration on clickhouse01
-    """
-    encryption:
-        type: noop
-        is_enabled: False
-    """
-    When we update ch-backup configuration on clickhouse02
-    """
-    encryption:
-        type: noop
-        is_enabled: False
-    """
+    Then we got the following backups on clickhouse01
+      | num | state   | data_count | link_count |
+      | 0   | created | 4          | 4          |
+      | 1   | created | 4          | 0          |
     When we restore clickhouse backup #0 to clickhouse02
     Then clickhouse02 has same schema as clickhouse01
     And we got same clickhouse data at clickhouse01 clickhouse02
-    And metadata of clickhouse01 backup #0 contains
-    """
-    is_encryption_enabled: True
-    """
diff --git a/tests/integration/steps/clickhouse.py b/tests/integration/steps/clickhouse.py
index a81860ba..91fffe64 100644
--- a/tests/integration/steps/clickhouse.py
+++ b/tests/integration/steps/clickhouse.py
@@ -4,7 +4,7 @@
 import yaml
 from behave import given, then, when
-from hamcrest import assert_that, calling, equal_to, has_length, raises
+from hamcrest import assert_that, equal_to, has_length
 from tenacity import retry, stop_after_attempt, wait_fixed
 
 from tests.integration.modules.clickhouse import ClickhouseClient
 from tests.integration.modules.docker import get_container, put_file
@@ -130,13 +130,6 @@ def step_same_clickhouse_data(context, nodes):
         assert_that(node_data, equal_to(node1_data))
 
 
-@then("we got not same clickhouse data at {nodes}")
-def step_not_same_clickhouse_data(context, nodes):
-    assert_that(
-        calling(step_same_clickhouse_data).with_args(context, nodes), raises(Exception)
-    )
-
-
 @then("{node1:w} has the subset of {node2:w} data")
 def step_has_subset_data(context, node1, node2):
     options = yaml.load(context.text, yaml.SafeLoader)
diff --git a/tests/unit/test_pipeline.py b/tests/unit/test_pipeline.py
index 9a3f3701..8433ce38 100644
--- a/tests/unit/test_pipeline.py
+++ b/tests/unit/test_pipeline.py
@@ -91,7 +91,7 @@ def tmp_dir_path(dir_path=None):
     ),
     encrypt_conf=st.fixed_dictionaries(
         {
-            "is_enabled": st.booleans(),
+            "enabled": st.booleans(),
             "buffer_size": st.integers(1, 1024),
             "chunk_size": st.integers(1, 1024),
             "type": st.just("nacl"),
@@ -120,7 +120,7 @@ def tmp_dir_path(dir_path=None):
     file_size=1024,
     read_conf={"chunk_size": 128},
     encrypt_conf={
-        "is_enabled": True,
+        "enabled": True,
         "buffer_size": 512,
         "chunk_size": 256,
         "type": "nacl",
@@ -215,7 +215,7 @@ def run_backward_pl(in_file_name, out_file_name, read_conf, encrypt_conf, write_
     incoming_chunk_size=st.integers(1, 1024),
     conf=st.fixed_dictionaries(
         {
-            "is_enabled": st.booleans(),
+            "enabled": st.booleans(),
             "buffer_size": st.integers(1, 1024),
             "chunk_size": st.integers(1, 1024),
             "type": st.just("nacl"),