From 029e4056fbd3451f05349718b45497582db0a8f9 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 9 Sep 2024 10:21:16 +0100 Subject: [PATCH 01/38] WIP introduce entt --- .gitmodules | 3 + cpp/arcticdb/CMakeLists.txt | 143 +++++++++--------- cpp/arcticdb/processing/component_manager.hpp | 63 ++++++++ cpp/third_party/CMakeLists.txt | 1 + cpp/third_party/entt | 1 + 5 files changed, 140 insertions(+), 71 deletions(-) create mode 160000 cpp/third_party/entt diff --git a/.gitmodules b/.gitmodules index 7c009e8039..ef171067e6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -16,3 +16,6 @@ [submodule "cpp/vcpkg"] path = cpp/vcpkg url = https://github.com/microsoft/vcpkg.git +[submodule "cpp/third_party/entt"] + path = cpp/third_party/entt + url = https://github.com/skypjack/entt.git diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index ef9ddd12e3..fccf0c95bc 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -656,6 +656,7 @@ set (arcticdb_core_libraries Folly::folly # Transitively includes: double-conversion, gflags, glog, libevent, libssl, libcrypto, libiberty, libsodium Azure::azure-identity Azure::azure-storage-blobs + EnTT::EnTT ) if(${ARCTICDB_PYTHON_EXPLICIT_LINK}) @@ -873,79 +874,79 @@ if(${TEST}) python_utils_dump_vars_if_enabled("Python for test compilation") set(unit_test_srcs - async/test/test_async.cpp - codec/test/test_codec.cpp - codec/test/test_encode_field_collection.cpp - codec/test/test_segment_header.cpp - codec/test/test_encoded_field.cpp - column_store/test/ingestion_stress_test.cpp - column_store/test/test_column.cpp - column_store/test/test_column_data_random_accessor.cpp - column_store/test/test_index_filtering.cpp - column_store/test/test_memory_segment.cpp - entity/test/test_atom_key.cpp - entity/test/test_key_serialization.cpp - entity/test/test_metrics.cpp - entity/test/test_ref_key.cpp - entity/test/test_tensor.cpp - log/test/test_log.cpp - pipeline/test/test_container.hpp - pipeline/test/test_pipeline.cpp - pipeline/test/test_query.cpp - util/test/test_regex.cpp - processing/test/test_arithmetic_type_promotion.cpp - processing/test/test_clause.cpp +# async/test/test_async.cpp +# codec/test/test_codec.cpp +# codec/test/test_encode_field_collection.cpp +# codec/test/test_segment_header.cpp +# codec/test/test_encoded_field.cpp +# column_store/test/ingestion_stress_test.cpp +# column_store/test/test_column.cpp +# column_store/test/test_column_data_random_accessor.cpp +# column_store/test/test_index_filtering.cpp +# column_store/test/test_memory_segment.cpp +# entity/test/test_atom_key.cpp +# entity/test/test_key_serialization.cpp +# entity/test/test_metrics.cpp +# entity/test/test_ref_key.cpp +# entity/test/test_tensor.cpp +# log/test/test_log.cpp +# pipeline/test/test_container.hpp +# pipeline/test/test_pipeline.cpp +# pipeline/test/test_query.cpp +# util/test/test_regex.cpp +# processing/test/test_arithmetic_type_promotion.cpp +# processing/test/test_clause.cpp processing/test/test_component_manager.cpp - processing/test/test_expression.cpp - processing/test/test_filter_and_project_sparse.cpp - processing/test/test_has_valid_type_promotion.cpp - processing/test/test_operation_dispatch.cpp - processing/test/test_resample.cpp - processing/test/test_set_membership.cpp - processing/test/test_signed_unsigned_comparison.cpp - processing/test/test_type_comparison.cpp - storage/test/test_local_storages.cpp - storage/test/test_memory_storage.cpp - storage/test/test_s3_storage.cpp - storage/test/test_storage_factory.cpp - storage/test/test_storage_exceptions.cpp - storage/test/test_azure_storage.cpp - storage/test/test_storage_operations.cpp - stream/test/stream_test_common.cpp - stream/test/test_aggregator.cpp - stream/test/test_append_map.cpp - stream/test/test_row_builder.cpp - stream/test/test_segment_aggregator.cpp - stream/test/test_types.cpp - util/memory_tracing.hpp +# processing/test/test_expression.cpp +# processing/test/test_filter_and_project_sparse.cpp +# processing/test/test_has_valid_type_promotion.cpp +# processing/test/test_operation_dispatch.cpp +# processing/test/test_resample.cpp +# processing/test/test_set_membership.cpp +# processing/test/test_signed_unsigned_comparison.cpp +# processing/test/test_type_comparison.cpp +# storage/test/test_local_storages.cpp +# storage/test/test_memory_storage.cpp +# storage/test/test_s3_storage.cpp +# storage/test/test_storage_factory.cpp +# storage/test/test_storage_exceptions.cpp +# storage/test/test_azure_storage.cpp +# storage/test/test_storage_operations.cpp +# stream/test/stream_test_common.cpp +# stream/test/test_aggregator.cpp +# stream/test/test_append_map.cpp +# stream/test/test_row_builder.cpp +# stream/test/test_segment_aggregator.cpp +# stream/test/test_types.cpp +# util/memory_tracing.hpp util/test/gtest_main.cpp - util/test/test_bitmagic.cpp - util/test/test_buffer_pool.cpp - util/test/test_composite.cpp - util/test/test_cursor.cpp - util/test/test_decimal.cpp - util/test/test_exponential_backoff.cpp - util/test/test_format_date.cpp - util/test/test_hash.cpp - util/test/test_id_transformation.cpp - util/test/test_ranges_from_future.cpp - util/test/test_slab_allocator.cpp - util/test/test_storage_lock.cpp - util/test/test_string_pool.cpp - util/test/test_string_utils.cpp - util/test/test_tracing_allocator.cpp - version/test/test_append.cpp - version/test/test_sparse.cpp - version/test/test_stream_version_data.cpp - version/test/test_symbol_list.cpp - version/test/test_version_map.cpp - version/test/test_version_map_batch.cpp - version/test/test_version_store.cpp - version/test/test_sorting_info_state_machine.cpp - version/test/version_map_model.hpp - python/python_handlers.cpp - storage/test/common.hpp - version/test/test_sort_index.cpp +# util/test/test_bitmagic.cpp +# util/test/test_buffer_pool.cpp +# util/test/test_composite.cpp +# util/test/test_cursor.cpp +# util/test/test_decimal.cpp +# util/test/test_exponential_backoff.cpp +# util/test/test_format_date.cpp +# util/test/test_hash.cpp +# util/test/test_id_transformation.cpp +# util/test/test_ranges_from_future.cpp +# util/test/test_slab_allocator.cpp +# util/test/test_storage_lock.cpp +# util/test/test_string_pool.cpp +# util/test/test_string_utils.cpp +# util/test/test_tracing_allocator.cpp +# version/test/test_append.cpp +# version/test/test_sparse.cpp +# version/test/test_stream_version_data.cpp +# version/test/test_symbol_list.cpp +# version/test/test_version_map.cpp +# version/test/test_version_map_batch.cpp +# version/test/test_version_store.cpp +# version/test/test_sorting_info_state_machine.cpp +# version/test/version_map_model.hpp +# python/python_handlers.cpp +# storage/test/common.hpp +# version/test/test_sort_index.cpp ) set(EXECUTABLE_PERMS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) # 755 diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 6b7d6e22c5..12dc033867 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -9,6 +9,8 @@ #include +#include + #include #include @@ -18,6 +20,8 @@ using namespace pipelines; using EntityId = size_t; using bucket_id = uint8_t; +using namespace entt::literals; + class ComponentManager { public: ComponentManager() = default; @@ -58,6 +62,26 @@ class ComponentManager { template EntityId add(T component, std::optional id=std::nullopt, std::optional expected_get_calls=std::nullopt) { auto insertion_id = entity_id(id); + entt::entity entt_insertion_id; + if (auto it = id_mapping_.find(insertion_id); it != id_mapping_.end()) { + entt_insertion_id = it->second; + } else { + entt_insertion_id = registry_.create(); + id_mapping_.emplace(insertion_id, entt_insertion_id); + } + { + std::lock_guard lock(mtx_); + // TODO: Move in once everything is handled by entt + registry_.emplace(entt_insertion_id, component); + if (expected_get_calls.has_value()) { + // TODO: Name bucket id component as well? It is not clear what it is just from the type + // TODO: make these registries members as well? + auto&& expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); + expected_get_calls_registry.emplace(entt_insertion_id, *expected_get_calls); + auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); + initial_expected_get_calls_registry.emplace(entt_insertion_id, *expected_get_calls); + } + } if constexpr(std::is_same_v>) { segment_map_.add(insertion_id, std::move(component), expected_get_calls); } else if constexpr(std::is_same_v>) { @@ -78,6 +102,31 @@ class ComponentManager { template std::vector get(const std::vector& ids) { + { + std::lock_guard lock(mtx_); + std::vector entt_ids; + entt_ids.reserve(ids.size()); + for (auto id: ids) { + util::check(id_mapping_.contains(id), "Borked 1"); + entt_ids.push_back(id_mapping_[id]); + } + std::vector res; + res.reserve(ids.size()); + auto &&expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); + for (auto id: entt_ids) { + res.emplace_back(registry_.get(id)); + if constexpr(std::is_same_v>) { + expected_get_calls_registry.patch(id, [](auto &expected_get_calls) { --expected_get_calls; }); + if (expected_get_calls_registry.get(id) == 0) { + // TODO: Erase from whole registry, not just the segment component + registry_.erase(id); + } + } + } + return res; + } + + if constexpr(std::is_same_v>) { return segment_map_.get(ids); } else if constexpr(std::is_same_v>) { @@ -97,6 +146,14 @@ class ComponentManager { template uint64_t get_initial_expected_get_calls(EntityId id) { + { + std::lock_guard lock(mtx_); + util::check(id_mapping_.contains(id), "Borked 2"); + const auto entt_id = id_mapping_[id]; + auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); + return initial_expected_get_calls_registry.get(entt_id); + } + // Only applies to ComponentMaps tracking expected get calls if constexpr(std::is_same_v>) { return segment_map_.get_initial_expected_get_calls(id); @@ -215,6 +272,12 @@ class ComponentManager { // The next ID to use when inserting elements into any of the maps std::atomic next_entity_id_{0}; EntityId entity_id(const std::optional& id=std::nullopt); + + // entt stuff + // TODO: Remove once everything is entt-based + ankerl::unordered_dense::map id_mapping_; + entt::registry registry_; + std::mutex mtx_; }; } // namespace arcticdb \ No newline at end of file diff --git a/cpp/third_party/CMakeLists.txt b/cpp/third_party/CMakeLists.txt index aaf5cc4e58..e184f76bd2 100644 --- a/cpp/third_party/CMakeLists.txt +++ b/cpp/third_party/CMakeLists.txt @@ -5,4 +5,5 @@ if(NOT ${ARCTICDB_USING_CONDA}) add_subdirectory(msgpack-c) add_subdirectory(Remotery) add_subdirectory(lmdbcxx) + add_subdirectory(entt) endif() diff --git a/cpp/third_party/entt b/cpp/third_party/entt new file mode 160000 index 0000000000..156bc470ce --- /dev/null +++ b/cpp/third_party/entt @@ -0,0 +1 @@ +Subproject commit 156bc470cea3a49cdbddd0ae74db77a8c207bbe8 From 1494b303a227627dcba22ef99fa7c720b55f59c0 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 9 Sep 2024 14:18:06 +0100 Subject: [PATCH 02/38] All processing pipeline tests passing with data stored in entt --- cpp/arcticdb/processing/component_manager.hpp | 50 +++++++++++++++---- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 12dc033867..6194665632 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -40,6 +40,31 @@ class ComponentManager { insertion_ids.emplace_back(next_entity_id_.fetch_add(1)); } } + { + std::lock_guard lock(mtx_); + std::vector entt_insertion_ids; + entt_insertion_ids.reserve(insertion_ids.size()); + for (auto insertion_id: insertion_ids) { + if (auto it = id_mapping_.find(insertion_id); it != id_mapping_.end()) { + entt_insertion_ids.emplace_back(it->second); + } else { + // TODO: Use version of create that takes two iterators to create multiple ids at once + auto entt_insertion_id = registry_.create(); + id_mapping_.emplace(insertion_id, entt_insertion_id); + entt_insertion_ids.emplace_back(entt_insertion_id); + } + } + // TODO: Move in once everything is handled by entt + registry_.insert(entt_insertion_ids.begin(), entt_insertion_ids.end(), components.begin()); + if constexpr(std::is_same_v>) { + // TODO: Name bucket id component as well? It is not clear what it is just from the type + // TODO: make these registries members as well? + auto&& expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); + expected_get_calls_registry.insert(entt_insertion_ids.begin(), entt_insertion_ids.end(), 1); + auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); + initial_expected_get_calls_registry.insert(entt_insertion_ids.begin(), entt_insertion_ids.end(), 1); + } + } if constexpr(std::is_same_v>) { segment_map_.add(insertion_ids, std::move(components)); @@ -62,15 +87,15 @@ class ComponentManager { template EntityId add(T component, std::optional id=std::nullopt, std::optional expected_get_calls=std::nullopt) { auto insertion_id = entity_id(id); - entt::entity entt_insertion_id; - if (auto it = id_mapping_.find(insertion_id); it != id_mapping_.end()) { - entt_insertion_id = it->second; - } else { - entt_insertion_id = registry_.create(); - id_mapping_.emplace(insertion_id, entt_insertion_id); - } { std::lock_guard lock(mtx_); + entt::entity entt_insertion_id; + if (auto it = id_mapping_.find(insertion_id); it != id_mapping_.end()) { + entt_insertion_id = it->second; + } else { + entt_insertion_id = registry_.create(); + id_mapping_.emplace(insertion_id, entt_insertion_id); + } // TODO: Move in once everything is handled by entt registry_.emplace(entt_insertion_id, component); if (expected_get_calls.has_value()) { @@ -114,11 +139,16 @@ class ComponentManager { res.reserve(ids.size()); auto &&expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); for (auto id: entt_ids) { - res.emplace_back(registry_.get(id)); + auto ptr = registry_.try_get(id); + if (ptr == nullptr) { + internal::check(ptr != nullptr, "Requested non-existent entity"); + } + res.emplace_back(*ptr); if constexpr(std::is_same_v>) { expected_get_calls_registry.patch(id, [](auto &expected_get_calls) { --expected_get_calls; }); if (expected_get_calls_registry.get(id) == 0) { - // TODO: Erase from whole registry, not just the segment component + // TODO: Use destroy to remove from whole registry, not just the segment component + // Takes two iterators so can erase all at once registry_.erase(id); } } @@ -275,7 +305,7 @@ class ComponentManager { // entt stuff // TODO: Remove once everything is entt-based - ankerl::unordered_dense::map id_mapping_; + std::unordered_map id_mapping_; entt::registry registry_; std::mutex mtx_; }; From 529dad1a412e92cf085248ce5f736241e3f34fe2 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 9 Sep 2024 14:18:51 +0100 Subject: [PATCH 03/38] REVERT ME: Only run relevant C++ unit tests --- cpp/arcticdb/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index fccf0c95bc..9733fde747 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -895,13 +895,13 @@ if(${TEST}) # pipeline/test/test_query.cpp # util/test/test_regex.cpp # processing/test/test_arithmetic_type_promotion.cpp -# processing/test/test_clause.cpp + processing/test/test_clause.cpp processing/test/test_component_manager.cpp # processing/test/test_expression.cpp # processing/test/test_filter_and_project_sparse.cpp # processing/test/test_has_valid_type_promotion.cpp # processing/test/test_operation_dispatch.cpp -# processing/test/test_resample.cpp + processing/test/test_resample.cpp # processing/test/test_set_membership.cpp # processing/test/test_signed_unsigned_comparison.cpp # processing/test/test_type_comparison.cpp From de6367d35058cdcdb08bf8efe69972778982b9a6 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 9 Sep 2024 14:38:37 +0100 Subject: [PATCH 04/38] Get rid of ComponentMaps --- cpp/arcticdb/processing/component_manager.hpp | 161 ------------------ 1 file changed, 161 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 6194665632..974e935628 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -65,22 +65,6 @@ class ComponentManager { initial_expected_get_calls_registry.insert(entt_insertion_ids.begin(), entt_insertion_ids.end(), 1); } } - - if constexpr(std::is_same_v>) { - segment_map_.add(insertion_ids, std::move(components)); - } else if constexpr(std::is_same_v>) { - row_range_map_.add(insertion_ids, std::move(components)); - } else if constexpr(std::is_same_v>) { - col_range_map_.add(insertion_ids, std::move(components)); - } else if constexpr(std::is_same_v>) { - atom_key_map_.add(insertion_ids, std::move(components)); - } else if constexpr(std::is_same_v) { - bucket_map_.add(insertion_ids, std::move(components)); - } else { - // Hacky workaround for static_assert(false) not being allowed - // See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html - static_assert(sizeof(T) == 0, "Unsupported component type passed to ComponentManager::add"); - } return insertion_ids; } @@ -107,21 +91,6 @@ class ComponentManager { initial_expected_get_calls_registry.emplace(entt_insertion_id, *expected_get_calls); } } - if constexpr(std::is_same_v>) { - segment_map_.add(insertion_id, std::move(component), expected_get_calls); - } else if constexpr(std::is_same_v>) { - row_range_map_.add(insertion_id, std::move(component)); - } else if constexpr(std::is_same_v>) { - col_range_map_.add(insertion_id, std::move(component)); - } else if constexpr(std::is_same_v>) { - atom_key_map_.add(insertion_id, std::move(component)); - } else if constexpr(std::is_same_v) { - bucket_map_.add(insertion_id, std::move(component)); - } else { - // Hacky workaround for static_assert(false) not being allowed - // See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html - static_assert(sizeof(T) == 0, "Unsupported component type passed to ComponentManager::add"); - } return insertion_id; } @@ -155,23 +124,6 @@ class ComponentManager { } return res; } - - - if constexpr(std::is_same_v>) { - return segment_map_.get(ids); - } else if constexpr(std::is_same_v>) { - return row_range_map_.get(ids); - } else if constexpr(std::is_same_v>) { - return col_range_map_.get(ids); - } else if constexpr(std::is_same_v>) { - return atom_key_map_.get(ids); - } else if constexpr(std::is_same_v) { - return bucket_map_.get(ids); - } else { - // Hacky workaround for static_assert(false) not being allowed - // See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html - static_assert(sizeof(T) == 0, "Unsupported component type passed to ComponentManager::get"); - } } template @@ -183,122 +135,9 @@ class ComponentManager { auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); return initial_expected_get_calls_registry.get(entt_id); } - - // Only applies to ComponentMaps tracking expected get calls - if constexpr(std::is_same_v>) { - return segment_map_.get_initial_expected_get_calls(id); - } else { - // Hacky workaround for static_assert(false) not being allowed - // See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html - static_assert(sizeof(T) == 0, "Unsupported component type passed to ComponentManager::get_initial_expected_get_calls"); - } } private: - template - class ComponentMap { - public: - explicit ComponentMap(std::string&& entity_type, bool track_expected_gets): - entity_type_(std::move(entity_type)), - opt_expected_get_calls_map_(track_expected_gets ? std::make_optional>() : std::nullopt), - opt_expected_get_calls_initial_map_(track_expected_gets ? std::make_optional>() : std::nullopt){ - }; - ARCTICDB_NO_MOVE_OR_COPY(ComponentMap) - - void add(const std::vector& ids, - std::vector&& entities) { - std::lock_guard lock(mtx_); - for (auto [idx, id]: folly::enumerate(ids)) { - ARCTICDB_DEBUG(log::storage(), "Adding {} with id {}", entity_type_, id); - internal::check(map_.try_emplace(id, std::move(entities[idx])).second, - "Failed to insert {} with ID {}, already exists", - entity_type_, id); - if (opt_expected_get_calls_map_.has_value()) { - internal::check( - opt_expected_get_calls_map_->try_emplace(id, 1).second, - "Failed to insert {} with ID {}, already exists", - entity_type_, id); - internal::check( - opt_expected_get_calls_initial_map_->try_emplace(id, 1).second, - "Failed to insert {} with ID {}, already exists", - entity_type_, id); - } - } - } - void add(EntityId id, T&& entity, std::optional expected_get_calls=std::nullopt) { - std::lock_guard lock(mtx_); - ARCTICDB_DEBUG(log::storage(), "Adding {} with id {}", entity_type_, id); - internal::check(map_.try_emplace(id, std::move(entity)).second, - "Failed to insert {} with ID {}, already exists", - entity_type_, id); - if (opt_expected_get_calls_map_.has_value()) { - internal::check(expected_get_calls.has_value() && *expected_get_calls > 0, - "Failed to insert {} with ID {}, must provide expected gets", - entity_type_, id); - internal::check(opt_expected_get_calls_map_->try_emplace(id, *expected_get_calls).second, - "Failed to insert {} with ID {}, already exists", - entity_type_, id); - internal::check(opt_expected_get_calls_initial_map_->try_emplace(id, *expected_get_calls).second, - "Failed to insert {} with ID {}, already exists", - entity_type_, id); - } - } - std::vector get(const std::vector& ids) { - std::vector res; - res.reserve(ids.size()); - std::lock_guard lock(mtx_); - for (auto id: ids) { - ARCTICDB_DEBUG(log::storage(), "Getting {} with id {}", entity_type_, id); - auto entity_it = map_.find(id); - internal::check(entity_it != map_.end(), - "Requested non-existent {} with ID {}", - entity_type_, id); - res.emplace_back(entity_it->second); - if (opt_expected_get_calls_map_.has_value()) { - auto expected_get_calls_it = opt_expected_get_calls_map_->find(id); - internal::check( - expected_get_calls_it != opt_expected_get_calls_map_->end(), - "Requested non-existent {} with ID {}", - entity_type_, id); - if (--expected_get_calls_it->second == 0) { - ARCTICDB_DEBUG(log::storage(), - "{} with id {} has been fetched the expected number of times, erasing from component manager", - entity_type_, id); - map_.erase(entity_it); - opt_expected_get_calls_map_->erase(expected_get_calls_it); - } - } - } - return res; - } - uint64_t get_initial_expected_get_calls(EntityId id) { - std::lock_guard lock(mtx_); - ARCTICDB_DEBUG(log::storage(), "Getting initial expected get calls of {} with id {}", entity_type_, id); - internal::check(opt_expected_get_calls_initial_map_.has_value(), - "Cannot get initial expected get calls for {} as they are not being tracked", entity_type_); - auto it = opt_expected_get_calls_initial_map_->find(id); - internal::check(it != opt_expected_get_calls_initial_map_->end(), - "Requested non-existent {} with ID {}", - entity_type_, id); - return it->second; - } - private: - // Just used for logging/exception messages - std::string entity_type_; - ankerl::unordered_dense::map map_; - // If not nullopt, tracks the number of calls to get for each entity id, and erases from maps when it has been - // called this many times - std::optional> opt_expected_get_calls_map_; - std::optional> opt_expected_get_calls_initial_map_; - std::mutex mtx_; - }; - - ComponentMap> segment_map_{"segment", true}; - ComponentMap> row_range_map_{"row range", false}; - ComponentMap> col_range_map_{"col range", false}; - ComponentMap> atom_key_map_{"atom key", false}; - ComponentMap bucket_map_{"bucket", false}; - // The next ID to use when inserting elements into any of the maps std::atomic next_entity_id_{0}; EntityId entity_id(const std::optional& id=std::nullopt); From e6e6dc3ab30eb3c4f92f53036ae027ca2a4081ab Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 9 Sep 2024 17:14:46 +0100 Subject: [PATCH 05/38] Eliminate id mapping --- cpp/arcticdb/processing/component_manager.cpp | 8 -- cpp/arcticdb/processing/component_manager.hpp | 129 ++++++------------ cpp/arcticdb/version/version_core.cpp | 42 ++++-- 3 files changed, 76 insertions(+), 103 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.cpp b/cpp/arcticdb/processing/component_manager.cpp index 746b0215a4..9aa0155a5d 100644 --- a/cpp/arcticdb/processing/component_manager.cpp +++ b/cpp/arcticdb/processing/component_manager.cpp @@ -9,13 +9,5 @@ namespace arcticdb { -void ComponentManager::set_next_entity_id(EntityId id) { - next_entity_id_ = id; -} - -EntityId ComponentManager::entity_id(const std::optional& id) { - // Do not use value_or as we do not want the side effect of fetch_add when id was provided by the caller - return id.has_value() ? *id : next_entity_id_.fetch_add(1); -} } // namespace arcticdb \ No newline at end of file diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 974e935628..c1bdea4bf3 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -17,7 +17,7 @@ namespace arcticdb { using namespace pipelines; -using EntityId = size_t; +using EntityId = entt::entity; using bucket_id = uint8_t; using namespace entt::literals; @@ -27,124 +27,85 @@ class ComponentManager { ComponentManager() = default; ARCTICDB_NO_MOVE_OR_COPY(ComponentManager) - void set_next_entity_id(EntityId id); + EntityId get_new_entity_id() { + return registry_.create(); + } template std::vector add(std::vector&& components, const std::optional>& ids=std::nullopt) { + std::lock_guard lock(mtx_); std::vector insertion_ids; if (ids.has_value()) { insertion_ids = *ids; } else { insertion_ids.reserve(components.size()); for (size_t idx = 0; idx < components.size(); ++idx) { - insertion_ids.emplace_back(next_entity_id_.fetch_add(1)); + insertion_ids.emplace_back(get_new_entity_id()); } } - { - std::lock_guard lock(mtx_); - std::vector entt_insertion_ids; - entt_insertion_ids.reserve(insertion_ids.size()); - for (auto insertion_id: insertion_ids) { - if (auto it = id_mapping_.find(insertion_id); it != id_mapping_.end()) { - entt_insertion_ids.emplace_back(it->second); - } else { - // TODO: Use version of create that takes two iterators to create multiple ids at once - auto entt_insertion_id = registry_.create(); - id_mapping_.emplace(insertion_id, entt_insertion_id); - entt_insertion_ids.emplace_back(entt_insertion_id); - } - } - // TODO: Move in once everything is handled by entt - registry_.insert(entt_insertion_ids.begin(), entt_insertion_ids.end(), components.begin()); - if constexpr(std::is_same_v>) { - // TODO: Name bucket id component as well? It is not clear what it is just from the type - // TODO: make these registries members as well? - auto&& expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); - expected_get_calls_registry.insert(entt_insertion_ids.begin(), entt_insertion_ids.end(), 1); - auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); - initial_expected_get_calls_registry.insert(entt_insertion_ids.begin(), entt_insertion_ids.end(), 1); - } + // TODO: Move in once everything is handled by entt + registry_.insert(insertion_ids.begin(), insertion_ids.end(), components.begin()); + if constexpr(std::is_same_v>) { + // TODO: Name bucket id component as well? It is not clear what it is just from the type + // TODO: make these registries members as well? + auto&& expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); + expected_get_calls_registry.insert(insertion_ids.begin(), insertion_ids.end(), 1); + auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); + initial_expected_get_calls_registry.insert(insertion_ids.begin(), insertion_ids.end(), 1); } return insertion_ids; } template EntityId add(T component, std::optional id=std::nullopt, std::optional expected_get_calls=std::nullopt) { - auto insertion_id = entity_id(id); - { - std::lock_guard lock(mtx_); - entt::entity entt_insertion_id; - if (auto it = id_mapping_.find(insertion_id); it != id_mapping_.end()) { - entt_insertion_id = it->second; - } else { - entt_insertion_id = registry_.create(); - id_mapping_.emplace(insertion_id, entt_insertion_id); - } - // TODO: Move in once everything is handled by entt - registry_.emplace(entt_insertion_id, component); - if (expected_get_calls.has_value()) { - // TODO: Name bucket id component as well? It is not clear what it is just from the type - // TODO: make these registries members as well? - auto&& expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); - expected_get_calls_registry.emplace(entt_insertion_id, *expected_get_calls); - auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); - initial_expected_get_calls_registry.emplace(entt_insertion_id, *expected_get_calls); - } + std::lock_guard lock(mtx_); + auto insertion_id = id.has_value() ? *id : get_new_entity_id(); + // TODO: Move in once everything is handled by entt + registry_.emplace(insertion_id, component); + if (expected_get_calls.has_value()) { + // TODO: Name bucket id component as well? It is not clear what it is just from the type + // TODO: make these registries members as well? + auto&& expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); + expected_get_calls_registry.emplace(insertion_id, *expected_get_calls); + auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); + initial_expected_get_calls_registry.emplace(insertion_id, *expected_get_calls); } return insertion_id; } template std::vector get(const std::vector& ids) { - { - std::lock_guard lock(mtx_); - std::vector entt_ids; - entt_ids.reserve(ids.size()); - for (auto id: ids) { - util::check(id_mapping_.contains(id), "Borked 1"); - entt_ids.push_back(id_mapping_[id]); + std::lock_guard lock(mtx_); + std::vector res; + res.reserve(ids.size()); + auto &&expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); + for (auto id: ids) { + auto ptr = registry_.try_get(id); + if (ptr == nullptr) { + internal::check(ptr != nullptr, "Requested non-existent entity"); } - std::vector res; - res.reserve(ids.size()); - auto &&expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); - for (auto id: entt_ids) { - auto ptr = registry_.try_get(id); - if (ptr == nullptr) { - internal::check(ptr != nullptr, "Requested non-existent entity"); - } - res.emplace_back(*ptr); - if constexpr(std::is_same_v>) { - expected_get_calls_registry.patch(id, [](auto &expected_get_calls) { --expected_get_calls; }); - if (expected_get_calls_registry.get(id) == 0) { - // TODO: Use destroy to remove from whole registry, not just the segment component - // Takes two iterators so can erase all at once - registry_.erase(id); - } + res.emplace_back(*ptr); + if constexpr(std::is_same_v>) { + expected_get_calls_registry.patch(id, [](auto &expected_get_calls) { --expected_get_calls; }); + if (expected_get_calls_registry.get(id) == 0) { + // TODO: Use destroy to remove from whole registry, not just the segment component + // Takes two iterators so can erase all at once + registry_.erase(id); } } - return res; } + return res; } template uint64_t get_initial_expected_get_calls(EntityId id) { - { - std::lock_guard lock(mtx_); - util::check(id_mapping_.contains(id), "Borked 2"); - const auto entt_id = id_mapping_[id]; - auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); - return initial_expected_get_calls_registry.get(entt_id); - } + std::lock_guard lock(mtx_); + auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); + return initial_expected_get_calls_registry.get(id); } private: - // The next ID to use when inserting elements into any of the maps - std::atomic next_entity_id_{0}; - EntityId entity_id(const std::optional& id=std::nullopt); - // entt stuff - // TODO: Remove once everything is entt-based - std::unordered_map id_mapping_; entt::registry registry_; std::mutex mtx_; }; diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 9452b2b76e..d7cacad172 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -509,12 +509,31 @@ std::vector process_clauses( internal::check( std::all_of(segment_proc_unit_counts->begin(), segment_proc_unit_counts->end(), [](const size_t& val) { return val != 0; }), "All segments should be needed by at least one ProcessingUnit"); + // Map from position in segment_and_slice_futures to entity ids + std::vector pos_to_id; + // Map from entity id to position in segment_and_slice_futures + auto id_to_pos = std::make_shared>(); + pos_to_id.reserve(segment_and_slice_futures.size()); + for (size_t i = 0; i < segment_and_slice_futures.size(); ++i) { + auto id = component_manager->get_new_entity_id(); + pos_to_id.emplace_back(id); + id_to_pos->emplace(id, i); + } + // Give this a more descriptive name as we modify it between clauses - std::vector> entity_ids_vec{std::move(processing_unit_indexes)}; + std::vector> entity_ids_vec; + entity_ids_vec.reserve(processing_unit_indexes.size()); + for (const auto& indexes: processing_unit_indexes) { + entity_ids_vec.emplace_back(); + entity_ids_vec.back().reserve(indexes.size()); + for (auto index: indexes) { + entity_ids_vec.back().emplace_back(pos_to_id[index]); + } + } // Used to make sure each entity is only added into the component manager once - auto entity_added_mtx = std::make_shared>(segment_and_slice_futures.size()); - auto entity_added = std::make_shared>(segment_and_slice_futures.size(), false); + auto slice_added_mtx = std::make_shared>(segment_and_slice_futures.size()); + auto slice_added = std::make_shared>(segment_and_slice_futures.size(), false); std::vector>> futures; bool first_clause{true}; // Reverse the order of clauses and iterate through them backwards so that the erase is efficient @@ -525,24 +544,26 @@ std::vector process_clauses( std::vector> local_futs; local_futs.reserve(entity_ids.size()); for (auto id: entity_ids) { - local_futs.emplace_back(segment_and_slice_future_splitters[id].getFuture()); + local_futs.emplace_back(segment_and_slice_future_splitters[id_to_pos->at(id)].getFuture()); } futures.emplace_back( folly::collect(local_futs) .via(&async::cpu_executor()) .thenValue([component_manager, segment_proc_unit_counts, - entity_added_mtx, - entity_added, + id_to_pos, + slice_added_mtx, + slice_added, clauses, entity_ids = std::move(entity_ids)](std::vector&& segment_and_slices) mutable { for (auto&& [idx, segment_and_slice]: folly::enumerate(segment_and_slices)) { auto entity_id = entity_ids[idx]; - std::lock_guard lock((*entity_added_mtx)[entity_id]); - if (!(*entity_added)[entity_id]) { + auto pos = id_to_pos->at(entity_id); + std::lock_guard lock((*slice_added_mtx)[pos]); + if (!(*slice_added)[pos]) { component_manager->add( std::make_shared(std::move(segment_and_slice.segment_in_memory_)), - entity_id, (*segment_proc_unit_counts)[entity_id]); + entity_id, (*segment_proc_unit_counts)[pos]); component_manager->add( std::make_shared(std::move(segment_and_slice.ranges_and_key_.row_range_)), entity_id); @@ -552,7 +573,7 @@ std::vector process_clauses( component_manager->add( std::make_shared(std::move(segment_and_slice.ranges_and_key_.key_)), entity_id); - (*entity_added)[entity_id] = true; + (*slice_added)[pos] = true; } } return async::MemSegmentProcessingTask(*clauses, std::move(entity_ids))(); @@ -708,7 +729,6 @@ std::vector read_and_process( // i.e. if the first processing unit needs ranges_and_keys[0] and ranges_and_keys[1], and the second needs ranges_and_keys[2] and ranges_and_keys[3] // then the structure will be {{0, 1}, {2, 3}} std::vector> processing_unit_indexes = read_query.clauses_[0]->structure_for_processing(ranges_and_keys, start_from); - component_manager->set_next_entity_id(ranges_and_keys.size()); // Start reading as early as possible auto segment_and_slice_futures = store->batch_read_uncompressed(std::move(ranges_and_keys), columns_to_decode(pipeline_context)); From 41a40d079724f551c0d24f007b6ad2e29ece4363 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 9 Sep 2024 17:46:16 +0100 Subject: [PATCH 06/38] Pull out some hard-coded constants --- cpp/arcticdb/processing/component_manager.hpp | 26 ++++++++----------- .../test/test_component_manager.cpp | 3 --- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index c1bdea4bf3..8046476760 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -22,6 +22,9 @@ using bucket_id = uint8_t; using namespace entt::literals; +constexpr auto expected_get_calls_id = "expected_get_calls"_hs; +constexpr auto initial_expected_get_calls_id = "initial_expected_get_calls"_hs; + class ComponentManager { public: ComponentManager() = default; @@ -43,14 +46,11 @@ class ComponentManager { insertion_ids.emplace_back(get_new_entity_id()); } } - // TODO: Move in once everything is handled by entt registry_.insert(insertion_ids.begin(), insertion_ids.end(), components.begin()); if constexpr(std::is_same_v>) { - // TODO: Name bucket id component as well? It is not clear what it is just from the type - // TODO: make these registries members as well? - auto&& expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); + auto&& expected_get_calls_registry = registry_.storage(expected_get_calls_id); expected_get_calls_registry.insert(insertion_ids.begin(), insertion_ids.end(), 1); - auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); + auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); initial_expected_get_calls_registry.insert(insertion_ids.begin(), insertion_ids.end(), 1); } return insertion_ids; @@ -60,14 +60,11 @@ class ComponentManager { EntityId add(T component, std::optional id=std::nullopt, std::optional expected_get_calls=std::nullopt) { std::lock_guard lock(mtx_); auto insertion_id = id.has_value() ? *id : get_new_entity_id(); - // TODO: Move in once everything is handled by entt registry_.emplace(insertion_id, component); if (expected_get_calls.has_value()) { - // TODO: Name bucket id component as well? It is not clear what it is just from the type - // TODO: make these registries members as well? - auto&& expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); + auto&& expected_get_calls_registry = registry_.storage(expected_get_calls_id); expected_get_calls_registry.emplace(insertion_id, *expected_get_calls); - auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); + auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); initial_expected_get_calls_registry.emplace(insertion_id, *expected_get_calls); } return insertion_id; @@ -78,7 +75,7 @@ class ComponentManager { std::lock_guard lock(mtx_); std::vector res; res.reserve(ids.size()); - auto &&expected_get_calls_registry = registry_.storage("expected_get_calls"_hs); + auto &&expected_get_calls_registry = registry_.storage(expected_get_calls_id); for (auto id: ids) { auto ptr = registry_.try_get(id); if (ptr == nullptr) { @@ -86,8 +83,8 @@ class ComponentManager { } res.emplace_back(*ptr); if constexpr(std::is_same_v>) { - expected_get_calls_registry.patch(id, [](auto &expected_get_calls) { --expected_get_calls; }); - if (expected_get_calls_registry.get(id) == 0) { + auto& expected_get_calls = expected_get_calls_registry.get(id); + if (--expected_get_calls == 0) { // TODO: Use destroy to remove from whole registry, not just the segment component // Takes two iterators so can erase all at once registry_.erase(id); @@ -100,12 +97,11 @@ class ComponentManager { template uint64_t get_initial_expected_get_calls(EntityId id) { std::lock_guard lock(mtx_); - auto&& initial_expected_get_calls_registry = registry_.storage("initial_expected_get_calls"_hs); + auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); return initial_expected_get_calls_registry.get(id); } private: - // entt stuff entt::registry registry_; std::mutex mtx_; }; diff --git a/cpp/arcticdb/processing/test/test_component_manager.cpp b/cpp/arcticdb/processing/test/test_component_manager.cpp index 9bc4b3ee66..2a3ab35f1b 100644 --- a/cpp/arcticdb/processing/test/test_component_manager.cpp +++ b/cpp/arcticdb/processing/test/test_component_manager.cpp @@ -36,9 +36,6 @@ TEST(ComponentManager, Simple) { ASSERT_EQ(component_manager.add(col_range_1, id_1), id_1); ASSERT_EQ(component_manager.add(key_1, id_1), id_1); - ASSERT_EQ(id_0, 0); - ASSERT_EQ(id_1, 1); - ASSERT_EQ(component_manager.get>({id_0})[0], segment_0); ASSERT_EQ(component_manager.get_initial_expected_get_calls>(id_0), expected_get_calls_0); ASSERT_EQ(component_manager.get>({id_0})[0], row_range_0); From 371cac34928465581e5b44cec07533371d8b96bf Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 11 Sep 2024 14:50:21 +0100 Subject: [PATCH 07/38] Add arbitrary components to an entity when reading from storage --- cpp/arcticdb/processing/component_manager.hpp | 19 ++++++++----------- .../test/test_component_manager.cpp | 14 +++++--------- .../processing/test/test_resample.cpp | 16 ++++++---------- cpp/arcticdb/version/version_core.cpp | 14 +++++--------- 4 files changed, 24 insertions(+), 39 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 8046476760..734e40c304 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -56,18 +56,15 @@ class ComponentManager { return insertion_ids; } - template - EntityId add(T component, std::optional id=std::nullopt, std::optional expected_get_calls=std::nullopt) { + template + void add_entity(EntityId id, uint64_t expected_get_calls, Args... args) { std::lock_guard lock(mtx_); - auto insertion_id = id.has_value() ? *id : get_new_entity_id(); - registry_.emplace(insertion_id, component); - if (expected_get_calls.has_value()) { - auto&& expected_get_calls_registry = registry_.storage(expected_get_calls_id); - expected_get_calls_registry.emplace(insertion_id, *expected_get_calls); - auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); - initial_expected_get_calls_registry.emplace(insertion_id, *expected_get_calls); - } - return insertion_id; + // Fold expressions magic to emplace every element of args into the registry with its type as template parameter + ([&]{ registry_.emplace(id, args); }(), ...); + auto&& expected_get_calls_registry = registry_.storage(expected_get_calls_id); + expected_get_calls_registry.emplace(id, expected_get_calls); + auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); + initial_expected_get_calls_registry.emplace(id, expected_get_calls); } template diff --git a/cpp/arcticdb/processing/test/test_component_manager.cpp b/cpp/arcticdb/processing/test/test_component_manager.cpp index 2a3ab35f1b..97b8f7b965 100644 --- a/cpp/arcticdb/processing/test/test_component_manager.cpp +++ b/cpp/arcticdb/processing/test/test_component_manager.cpp @@ -26,15 +26,11 @@ TEST(ComponentManager, Simple) { auto key_1 = std::make_shared(AtomKeyBuilder().version_id(2).build("symbol_1", KeyType::TABLE_DATA)); uint64_t expected_get_calls_1{2}; - auto id_0 = component_manager.add(segment_0, std::nullopt, expected_get_calls_0); - ASSERT_EQ(component_manager.add(row_range_0, id_0), id_0); - ASSERT_EQ(component_manager.add(col_range_0, id_0), id_0); - ASSERT_EQ(component_manager.add(key_0, id_0), id_0); - - auto id_1 = component_manager.add(segment_1, std::nullopt, expected_get_calls_1); - ASSERT_EQ(component_manager.add(row_range_1, id_1), id_1); - ASSERT_EQ(component_manager.add(col_range_1, id_1), id_1); - ASSERT_EQ(component_manager.add(key_1, id_1), id_1); + auto id_0 = component_manager.get_new_entity_id(); + component_manager.add_entity(id_0, expected_get_calls_0, segment_0, row_range_0, col_range_0, key_0); + + auto id_1 = component_manager.get_new_entity_id(); + component_manager.add_entity(id_1, expected_get_calls_1, segment_1, row_range_1, col_range_1, key_1); ASSERT_EQ(component_manager.get>({id_0})[0], segment_0); ASSERT_EQ(component_manager.get_initial_expected_get_calls>(id_0), expected_get_calls_0); diff --git a/cpp/arcticdb/processing/test/test_resample.cpp b/cpp/arcticdb/processing/test/test_resample.cpp index f21519f7ec..084abb54ad 100644 --- a/cpp/arcticdb/processing/test/test_resample.cpp +++ b/cpp/arcticdb/processing/test/test_resample.cpp @@ -329,16 +329,12 @@ TEST(Resample, ProcessMultipleSegments) { auto row_range_2 = std::make_shared(5, 6); auto col_range_2 = std::make_shared(1, 2); - auto id_0 = component_manager->add(seg_0, std::nullopt, 1); - component_manager->add(row_range_0, id_0); - component_manager->add(col_range_0, id_0); - auto id_1 = component_manager->add(seg_1, std::nullopt, 2); - component_manager->add(row_range_1, id_1); - component_manager->add(col_range_1, id_1); - auto id_2 = component_manager->add(seg_2, std::nullopt, 1); - component_manager->add(row_range_2, id_2); - component_manager->add(col_range_2, id_2); - + auto id_0 = component_manager->get_new_entity_id(); + component_manager->add_entity(id_0, 1, seg_0, row_range_0, col_range_0); + auto id_1 = component_manager->get_new_entity_id(); + component_manager->add_entity(id_1, 2, seg_1, row_range_1, col_range_1); + auto id_2 = component_manager->get_new_entity_id(); + component_manager->add_entity(id_2, 1, seg_2, row_range_2, col_range_2); std::vector ids_0{id_0, id_1}; std::vector ids_1{id_1}; diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index d7cacad172..a784aaf594 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -561,18 +561,14 @@ std::vector process_clauses( auto pos = id_to_pos->at(entity_id); std::lock_guard lock((*slice_added_mtx)[pos]); if (!(*slice_added)[pos]) { - component_manager->add( + component_manager->add_entity( + entity_id, + (*segment_proc_unit_counts)[pos], std::make_shared(std::move(segment_and_slice.segment_in_memory_)), - entity_id, (*segment_proc_unit_counts)[pos]); - component_manager->add( std::make_shared(std::move(segment_and_slice.ranges_and_key_.row_range_)), - entity_id); - component_manager->add( std::make_shared(std::move(segment_and_slice.ranges_and_key_.col_range_)), - entity_id); - component_manager->add( - std::make_shared(std::move(segment_and_slice.ranges_and_key_.key_)), - entity_id); + std::make_shared(std::move(segment_and_slice.ranges_and_key_.key_)) + ); (*slice_added)[pos] = true; } } From 170a8f4af2102deaff3b479091d66567bf75ba75 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 11 Sep 2024 16:27:28 +0100 Subject: [PATCH 08/38] All methods to add entties/components now fully type-generic --- cpp/arcticdb/processing/clause.cpp | 19 +----- cpp/arcticdb/processing/component_manager.hpp | 59 ++++++++++++------- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 23c97f5e5c..862e2f1781 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -109,24 +109,11 @@ ProcessingUnit gather_entities(std::shared_ptr component_manag * pushed into the component manager with the same ID. */ std::vector push_entities(std::shared_ptr component_manager, ProcessingUnit&& proc) { - std::optional> res; - if (proc.segments_.has_value()) { - res = std::make_optional>(component_manager->add(std::move(*proc.segments_))); - } - if (proc.row_ranges_.has_value()) { - res = component_manager->add(std::move(*proc.row_ranges_), res); - } - if (proc.col_ranges_.has_value()) { - res = component_manager->add(std::move(*proc.col_ranges_), res); - } - if (proc.atom_keys_.has_value()) { - res = component_manager->add(std::move(*proc.atom_keys_), res); - } - internal::check(res.has_value(), "Unexpected empty result in push_entities"); + auto ids = component_manager->add_entities(std::move(*proc.segments_), std::move(*proc.row_ranges_), std::move(*proc.col_ranges_)); if (proc.bucket_.has_value()) { - component_manager->add(std::vector(res->size(), *proc.bucket_), res); + component_manager->add_component(ids, *proc.bucket_); } - return *res; + return ids; } std::vector flatten_entities(std::vector>&& entity_ids_vec) { diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 734e40c304..74e7a8258b 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -34,28 +34,7 @@ class ComponentManager { return registry_.create(); } - template - std::vector add(std::vector&& components, const std::optional>& ids=std::nullopt) { - std::lock_guard lock(mtx_); - std::vector insertion_ids; - if (ids.has_value()) { - insertion_ids = *ids; - } else { - insertion_ids.reserve(components.size()); - for (size_t idx = 0; idx < components.size(); ++idx) { - insertion_ids.emplace_back(get_new_entity_id()); - } - } - registry_.insert(insertion_ids.begin(), insertion_ids.end(), components.begin()); - if constexpr(std::is_same_v>) { - auto&& expected_get_calls_registry = registry_.storage(expected_get_calls_id); - expected_get_calls_registry.insert(insertion_ids.begin(), insertion_ids.end(), 1); - auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); - initial_expected_get_calls_registry.insert(insertion_ids.begin(), insertion_ids.end(), 1); - } - return insertion_ids; - } - + // Add a single entity with the components defined by args template void add_entity(EntityId id, uint64_t expected_get_calls, Args... args) { std::lock_guard lock(mtx_); @@ -67,6 +46,42 @@ class ComponentManager { initial_expected_get_calls_registry.emplace(id, expected_get_calls); } + // Add a collection of entities. Each element of args should be a vector of components, all of the same length + template + std::vector add_entities(Args... args) { + std::vector ids; + size_t entity_count{0}; + std::lock_guard lock(mtx_); + // Fold expressions magic to emplace every element of args into the registry with its type as template parameter + ([&]{ + if (entity_count == 0) { + entity_count = args.size(); + ids.reserve(entity_count); + for (ARCTICDB_UNUSED size_t i=0; i( + args.size() == entity_count, + "ComponentManager::add_entities received vectors of differing lengths" + ); + } + registry_.insert(ids.cbegin(), ids.cend(), args.begin()); + }(), ...); + auto&& expected_get_calls_registry = registry_.storage(expected_get_calls_id); + expected_get_calls_registry.insert(ids.cbegin(), ids.cend(), 1); + auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); + initial_expected_get_calls_registry.insert(ids.cbegin(), ids.cend(), 1); + return ids; + } + + // Add the same component to all the input entities + template + void add_component(const std::vector& ids, T component) { + std::lock_guard lock(mtx_); + registry_.insert(ids.cbegin(), ids.cend(), component); + } + template std::vector get(const std::vector& ids) { std::lock_guard lock(mtx_); From 4faacc0ac766076ae487520d36cee44d5be2b4b4 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 11 Sep 2024 16:53:42 +0100 Subject: [PATCH 09/38] Make get_initial_expected_get_calls take a vector of entity ids like get does --- cpp/arcticdb/processing/clause.cpp | 7 +------ cpp/arcticdb/processing/component_manager.hpp | 16 ++++++++-------- .../processing/test/test_component_manager.cpp | 6 ++---- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 862e2f1781..f898cefabd 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -91,12 +91,7 @@ ProcessingUnit gather_entities(std::shared_ptr component_manag res.set_atom_keys(component_manager->get>(entity_ids)); } if (include_initial_expected_get_calls) { - std::vector segment_initial_expected_get_calls; - segment_initial_expected_get_calls.reserve(entity_ids.size()); - for (auto entity_id: entity_ids) { - segment_initial_expected_get_calls.emplace_back(component_manager->get_initial_expected_get_calls>(entity_id)); - } - res.set_segment_initial_expected_get_calls(std::move(segment_initial_expected_get_calls)); + res.set_segment_initial_expected_get_calls(component_manager->get_initial_expected_get_calls(entity_ids)); } return res; } diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 74e7a8258b..ef19b1b150 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -89,11 +89,7 @@ class ComponentManager { res.reserve(ids.size()); auto &&expected_get_calls_registry = registry_.storage(expected_get_calls_id); for (auto id: ids) { - auto ptr = registry_.try_get(id); - if (ptr == nullptr) { - internal::check(ptr != nullptr, "Requested non-existent entity"); - } - res.emplace_back(*ptr); + res.emplace_back(registry_.get(id)); if constexpr(std::is_same_v>) { auto& expected_get_calls = expected_get_calls_registry.get(id); if (--expected_get_calls == 0) { @@ -106,11 +102,15 @@ class ComponentManager { return res; } - template - uint64_t get_initial_expected_get_calls(EntityId id) { + std::vector get_initial_expected_get_calls(const std::vector& ids) { + std::vector initial_expected_get_calls; + initial_expected_get_calls.reserve(ids.size()); std::lock_guard lock(mtx_); auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); - return initial_expected_get_calls_registry.get(id); + for (auto id: ids) { + initial_expected_get_calls.emplace_back(initial_expected_get_calls_registry.get(id)); + } + return initial_expected_get_calls; } private: diff --git a/cpp/arcticdb/processing/test/test_component_manager.cpp b/cpp/arcticdb/processing/test/test_component_manager.cpp index 97b8f7b965..fd1c8aa6ea 100644 --- a/cpp/arcticdb/processing/test/test_component_manager.cpp +++ b/cpp/arcticdb/processing/test/test_component_manager.cpp @@ -33,17 +33,15 @@ TEST(ComponentManager, Simple) { component_manager.add_entity(id_1, expected_get_calls_1, segment_1, row_range_1, col_range_1, key_1); ASSERT_EQ(component_manager.get>({id_0})[0], segment_0); - ASSERT_EQ(component_manager.get_initial_expected_get_calls>(id_0), expected_get_calls_0); + ASSERT_EQ(component_manager.get_initial_expected_get_calls({id_0})[0], expected_get_calls_0); ASSERT_EQ(component_manager.get>({id_0})[0], row_range_0); ASSERT_EQ(component_manager.get>({id_0})[0], col_range_0); ASSERT_EQ(component_manager.get>({id_0})[0], key_0); - EXPECT_THROW(component_manager.get>({id_0}), InternalException); ASSERT_EQ(component_manager.get>({id_1})[0], segment_1); ASSERT_EQ(component_manager.get>({id_1})[0], segment_1); - ASSERT_EQ(component_manager.get_initial_expected_get_calls>(id_1), expected_get_calls_1); + ASSERT_EQ(component_manager.get_initial_expected_get_calls({id_1})[0], expected_get_calls_1); ASSERT_EQ(component_manager.get>({id_1})[0], row_range_1); ASSERT_EQ(component_manager.get>({id_1})[0], col_range_1); ASSERT_EQ(component_manager.get>({id_1})[0], key_1); - EXPECT_THROW(component_manager.get>({id_1}), InternalException); } From 51b0ac4e8fd725368e245329be6368530ef7c868 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 13 Sep 2024 14:14:39 +0100 Subject: [PATCH 10/38] Further templating --- cpp/arcticdb/processing/clause.cpp | 80 +++++++------------ cpp/arcticdb/processing/clause.hpp | 40 ++++++++-- cpp/arcticdb/processing/component_manager.hpp | 65 +++++++-------- cpp/arcticdb/version/version_core.cpp | 2 +- 4 files changed, 91 insertions(+), 96 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index f898cefabd..4163e65473 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -71,31 +71,6 @@ std::vector> structure_all_together(std::vector component_manager, - std::vector&& entity_ids, - bool include_atom_keys, - bool include_initial_expected_get_calls) { - ProcessingUnit res; - res.set_segments(component_manager->get>(entity_ids)); - res.set_row_ranges(component_manager->get>(entity_ids)); - res.set_col_ranges(component_manager->get>(entity_ids)); - - if (include_atom_keys) { - res.set_atom_keys(component_manager->get>(entity_ids)); - } - if (include_initial_expected_get_calls) { - res.set_segment_initial_expected_get_calls(component_manager->get_initial_expected_get_calls(entity_ids)); - } - return res; -} - /* * On exit from a clause, we need to push the elements of the newly created processing unit's into the component * manager. These will either be used by the next clause in the pipeline, or to present the output dataframe back to @@ -103,10 +78,11 @@ ProcessingUnit gather_entities(std::shared_ptr component_manag * Elements that share an index in the optional vectors of a ProcessingUnit correspond to the same entity, and so are * pushed into the component manager with the same ID. */ -std::vector push_entities(std::shared_ptr component_manager, ProcessingUnit&& proc) { - auto ids = component_manager->add_entities(std::move(*proc.segments_), std::move(*proc.row_ranges_), std::move(*proc.col_ranges_)); +std::vector push_entities(ComponentManager& component_manager, ProcessingUnit&& proc, EntityFetchCount entity_fetch_count) { + auto ids = component_manager.add_entities(std::move(*proc.segments_), std::move(*proc.row_ranges_), std::move(*proc.col_ranges_)); + component_manager.add_component(ids, entity_fetch_count); if (proc.bucket_.has_value()) { - component_manager->add_component(ids, *proc.bucket_); + component_manager.add_component(ids, *proc.bucket_); } return ids; } @@ -201,7 +177,7 @@ std::vector FilterClause::process(std::vector&& entity_ids) if (entity_ids.empty()) { return {}; } - auto proc = gather_entities(component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); proc.set_expression_context(expression_context_); auto variant_data = proc.get(expression_context_->root_node_name_); std::vector output; @@ -209,7 +185,7 @@ std::vector FilterClause::process(std::vector&& entity_ids) [&proc, &output, this](util::BitSet &bitset) { if (bitset.count() > 0) { proc.apply_filter(std::move(bitset), optimisation_); - output = push_entities(component_manager_, std::move(proc)); + output = push_entities(*component_manager_, std::move(proc)); } else { log::version().debug("Filter returned empty result"); } @@ -218,7 +194,7 @@ std::vector FilterClause::process(std::vector&& entity_ids) log::version().debug("Filter returned empty result"); }, [&output, &proc, this](FullResult) { - output = push_entities(component_manager_, std::move(proc)); + output = push_entities(*component_manager_, std::move(proc)); }, [](const auto &) { util::raise_rte("Expected bitset from filter clause"); @@ -234,7 +210,7 @@ std::vector ProjectClause::process(std::vector&& entity_ids) if (entity_ids.empty()) { return {}; } - auto proc = gather_entities(component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); proc.set_expression_context(expression_context_); auto variant_data = proc.get(expression_context_->root_node_name_); std::vector output; @@ -246,11 +222,11 @@ std::vector ProjectClause::process(std::vector&& entity_ids) proc.segments_->back()->add_column(scalar_field(data_type, name), col.column_); ++proc.col_ranges_->back()->second; - output = push_entities(component_manager_, std::move(proc)); + output = push_entities(*component_manager_, std::move(proc)); }, [&proc, &output, this](const EmptyResult &) { if (expression_context_->dynamic_schema_) - output = push_entities(component_manager_, std::move(proc)); + output = push_entities(*component_manager_, std::move(proc)); else util::raise_rte("Cannot project from empty column with static schema"); }, @@ -301,7 +277,7 @@ std::vector AggregationClause::process(std::vector&& entity_ if (entity_ids.empty()) { return {}; } - auto proc = gather_entities(component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); auto row_slices = split_by_row_slice(std::move(proc)); // Sort procs following row range descending order, as we are going to iterate through them backwards @@ -483,7 +459,7 @@ std::vector AggregationClause::process(std::vector&& entity_ seg.set_string_pool(string_pool); seg.set_row_id(num_unique - 1); - return push_entities(component_manager_, ProcessingUnit(std::move(seg))); + return push_entities(*component_manager_, ProcessingUnit(std::move(seg))); } [[nodiscard]] std::string AggregationClause::to_string() const { @@ -643,7 +619,7 @@ std::vector ResampleClause::process(std::vector, std::shared_ptr, std::shared_ptr, EntityFetchCount >(*component_manager_, std::move(entity_ids)); auto row_slices = split_by_row_slice(std::move(proc)); // If the expected get calls for the segments in the first row slice are 2, the first bucket overlapping this row // slice is being computed by the call to process dealing with the row slices above these. Otherwise, this call @@ -701,7 +677,7 @@ std::vector ResampleClause::process(std::vectortype().data_type(), aggregator.get_output_column_name().value), aggregated_column); } seg.set_row_data(output_index_column->row_count() - 1); - return push_entities(component_manager_, ProcessingUnit(std::move(seg), std::move(output_row_range))); + return push_entities(*component_manager_, ProcessingUnit(std::move(seg), std::move(output_row_range))); } template @@ -806,7 +782,7 @@ template struct ResampleClause; if (entity_ids.empty()) { return {}; } - auto proc = gather_entities(component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); size_t min_start_row = std::numeric_limits::max(); size_t max_end_row = 0; size_t min_start_col = std::numeric_limits::max(); @@ -826,7 +802,7 @@ template struct ResampleClause; } std::vector output; if (output_seg.has_value()) { - output = push_entities(component_manager_, + output = push_entities(*component_manager_, ProcessingUnit(std::move(*output_seg), RowRange{min_start_row, max_end_row}, ColRange{min_start_col, max_end_col})); @@ -838,7 +814,7 @@ std::vector SplitClause::process(std::vector&& entity_ids) c if (entity_ids.empty()) { return {}; } - auto proc = gather_entities(component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); std::vector ret; for (auto&& [idx, seg]: folly::enumerate(proc.segments_.value())) { auto split_segs = seg->split(rows_); @@ -846,7 +822,7 @@ std::vector SplitClause::process(std::vector&& entity_ids) c size_t end_row = 0; for (auto&& split_seg : split_segs) { end_row = start_row + split_seg.row_count(); - auto new_entity_ids = push_entities(component_manager_, + auto new_entity_ids = push_entities(*component_manager_, ProcessingUnit(std::move(split_seg), RowRange(start_row, end_row), std::move(*proc.col_ranges_->at(idx)))); @@ -861,13 +837,13 @@ std::vector SortClause::process(std::vector&& entity_ids) co if (entity_ids.empty()) { return {}; } - auto proc = gather_entities(component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); for (auto& seg: proc.segments_.value()) { // This modifies the segment in place, which goes against the ECS principle of all entities being immutable // Only used by SortMerge right now and so this is fine, although it would not generalise well seg->sort(column_); } - return push_entities(component_manager_, std::move(proc)); + return push_entities(*component_manager_, std::move(proc)); } template @@ -886,7 +862,7 @@ void merge_impl( SegmentationPolicy segmentation_policy{static_cast(num_segment_rows)}; auto func = [&component_manager, &ret, &row_range, &col_range](auto &&segment) { - ret.emplace_back(push_entities(component_manager, ProcessingUnit{std::forward(segment), row_range, col_range})); + ret.emplace_back(push_entities(*component_manager, ProcessingUnit{std::forward(segment), row_range, col_range})); }; using AggregatorType = stream::Aggregator; @@ -919,7 +895,7 @@ std::optional>> MergeClause::repartition(std:: // first one and can use structure_for_processing. Ideally // merging should be parallel like resampling auto entity_ids = flatten_entities(std::move(entity_ids_vec)); - auto proc = gather_entities(component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); auto compare = [](const std::unique_ptr &left, @@ -977,7 +953,7 @@ std::vector ColumnStatsGenerationClause::process(std::vector internal::check( !entity_ids.empty(), "ColumnStatsGenerationClause::process does not make sense with no processing units"); - auto proc = gather_entities(component_manager_, std::move(entity_ids), true); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); std::vector aggregators_data; internal::check( static_cast(column_stats_aggregators_), @@ -1031,7 +1007,7 @@ std::vector ColumnStatsGenerationClause::process(std::vector seg.concatenate(agg_data->finalize(column_stats_aggregators_->at(agg_data.index).get_output_column_names())); } seg.set_row_id(0); - return push_entities(component_manager_, ProcessingUnit(std::move(seg))); + return push_entities(*component_manager_, ProcessingUnit(std::move(seg))); } std::vector> RowRangeClause::structure_for_processing( @@ -1047,7 +1023,7 @@ std::vector RowRangeClause::process(std::vector&& entity_ids if (entity_ids.empty()) { return {}; } - auto proc = gather_entities(component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); std::vector output; for (auto&& [idx, row_range]: folly::enumerate(proc.row_ranges_.value())) { if ((start_ > row_range->start() && start_ < row_range->end()) || @@ -1069,7 +1045,7 @@ std::vector RowRangeClause::process(std::vector&& entity_ids proc.segments_->at(idx) = std::make_shared(std::move(truncated_segment)); } // else all rows in this segment are required, do nothing } - return push_entities(component_manager_, std::move(proc)); + return push_entities(*component_manager_, std::move(proc)); } void RowRangeClause::set_processing_config(const ProcessingConfig& processing_config) { @@ -1134,7 +1110,7 @@ std::vector DateRangeClause::process(std::vector &&entity_id if (entity_ids.empty()) { return {}; } - auto proc = gather_entities(component_manager_, std::move(entity_ids), true); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); std::vector output; // We are only interested in the index, which is in every SegmentInMemory in proc.segments_, so just use the first auto row_range = proc.row_ranges_->at(0); @@ -1150,7 +1126,7 @@ std::vector DateRangeClause::process(std::vector &&entity_id } proc.truncate(start_row, end_row); } // else all rows in the processing unit are required, do nothing - return push_entities(component_manager_, std::move(proc)); + return push_entities(*component_manager_, std::move(proc)); } std::string DateRangeClause::to_string() const { diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp index 107da44e63..213d874c6e 100644 --- a/cpp/arcticdb/processing/clause.hpp +++ b/cpp/arcticdb/processing/clause.hpp @@ -114,12 +114,35 @@ std::vector> structure_by_column_slice(std::vector> structure_all_together(std::vector& ranges_and_keys); -ProcessingUnit gather_entities(std::shared_ptr component_manager, - std::vector&& entity_ids, - bool include_atom_keys = false, - bool include_initial_expected_get_calls = false); +/* + * On entry to a clause, construct ProcessingUnits from the input entity IDs. These will either be provided by the + * structure_for_processing method for the first clause in the pipeline, or by the previous clause for all subsequent + * clauses. + */ +template +ProcessingUnit gather_entities(ComponentManager& component_manager, std::vector&& entity_ids) { + ProcessingUnit res; + auto components = component_manager.get_entities(entity_ids); + ([&]{ + auto component = std::move(std::get>(components)); + if constexpr (std::is_same_v>) { + res.set_segments(std::move(component)); + } else if constexpr (std::is_same_v>) { + res.set_row_ranges(std::move(component)); + } else if constexpr (std::is_same_v>) { + res.set_col_ranges(std::move(component)); + } else if constexpr (std::is_same_v>) { + res.set_atom_keys(std::move(component)); + } else if constexpr (std::is_same_v) { + res.set_segment_initial_expected_get_calls(std::move(component)); + } else { + static_assert(sizeof(Args) == 0, "Unexpected component type provided in gather_entities"); + } + }(), ...); + return res; +} -std::vector push_entities(std::shared_ptr component_manager, ProcessingUnit&& proc); +std::vector push_entities(ComponentManager& component_manager, ProcessingUnit&& proc, EntityFetchCount entity_fetch_count=1); std::vector flatten_entities(std::vector>&& entity_ids_vec); @@ -272,14 +295,15 @@ struct PartitionClause { if (entity_ids.empty()) { return {}; } - auto proc = gather_entities(component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager_, std::move(entity_ids)); std::vector partitioned_procs = partition_processing_segment( proc, ColumnName(grouping_column_), processing_config_.dynamic_schema_); std::vector output; for (auto &&partitioned_proc: partitioned_procs) { - std::vector proc_entity_ids = push_entities(component_manager_, std::move(partitioned_proc)); + // EntityFetchCount is 2, once for the repartition, and once for AggregationClause::process + std::vector proc_entity_ids = push_entities(*component_manager_, std::move(partitioned_proc), 2); output.insert(output.end(), proc_entity_ids.begin(), proc_entity_ids.end()); } return output; @@ -305,7 +329,7 @@ struct PartitionClause { // Experimentation shows flattening the entities into a single vector and a single call to // component_manager_->get is faster than not flattening and making multiple calls auto entity_ids = flatten_entities(std::move(entity_ids_vec)); - std::vector buckets{component_manager_->get(entity_ids)}; + auto [buckets] = component_manager_->get_entities(entity_ids); for (auto [idx, entity_id]: folly::enumerate(entity_ids)) { res[buckets[idx]].emplace_back(entity_id); } diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index ef19b1b150..6b3e9e7b22 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -18,12 +18,12 @@ namespace arcticdb { using namespace pipelines; using EntityId = entt::entity; +using EntityFetchCount = uint64_t; using bucket_id = uint8_t; using namespace entt::literals; -constexpr auto expected_get_calls_id = "expected_get_calls"_hs; -constexpr auto initial_expected_get_calls_id = "initial_expected_get_calls"_hs; +constexpr auto remaining_entity_fetch_count_id = "remaining_entity_fetch_count"_hs; class ComponentManager { public: @@ -40,13 +40,12 @@ class ComponentManager { std::lock_guard lock(mtx_); // Fold expressions magic to emplace every element of args into the registry with its type as template parameter ([&]{ registry_.emplace(id, args); }(), ...); - auto&& expected_get_calls_registry = registry_.storage(expected_get_calls_id); - expected_get_calls_registry.emplace(id, expected_get_calls); - auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); - initial_expected_get_calls_registry.emplace(id, expected_get_calls); + registry_.emplace(id, expected_get_calls); + auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + remaining_entity_fetch_count_registry.emplace(id, expected_get_calls); } - // Add a collection of entities. Each element of args should be a vector of components, all of the same length + // Add a collection of entities. Each element of args should be a vector of components, all of which are the same length template std::vector add_entities(Args... args) { std::vector ids; @@ -68,10 +67,6 @@ class ComponentManager { } registry_.insert(ids.cbegin(), ids.cend(), args.begin()); }(), ...); - auto&& expected_get_calls_registry = registry_.storage(expected_get_calls_id); - expected_get_calls_registry.insert(ids.cbegin(), ids.cend(), 1); - auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); - initial_expected_get_calls_registry.insert(ids.cbegin(), ids.cend(), 1); return ids; } @@ -80,39 +75,39 @@ class ComponentManager { void add_component(const std::vector& ids, T component) { std::lock_guard lock(mtx_); registry_.insert(ids.cbegin(), ids.cend(), component); + if constexpr (std::is_same_v) { + auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + remaining_entity_fetch_count_registry.insert(ids.cbegin(), ids.cend(), component); + } } - template - std::vector get(const std::vector& ids) { + // Get a collection of entities. Returns a tuple of vectors, one for each component requested via Args + template + std::tuple...> get_entities(const std::vector& ids) { + std::tuple...> res; + ([&]{ + std::get>(res).reserve(ids.size()); + }(), ...); + // Those that have been fetched as many times as they're going to + std::vector ids_to_erase; + // Most entities are only fetched once, hence this reserve + ids_to_erase.reserve(ids.size()); std::lock_guard lock(mtx_); - std::vector res; - res.reserve(ids.size()); - auto &&expected_get_calls_registry = registry_.storage(expected_get_calls_id); + auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); for (auto id: ids) { - res.emplace_back(registry_.get(id)); - if constexpr(std::is_same_v>) { - auto& expected_get_calls = expected_get_calls_registry.get(id); - if (--expected_get_calls == 0) { - // TODO: Use destroy to remove from whole registry, not just the segment component - // Takes two iterators so can erase all at once - registry_.erase(id); - } + std::tuple components = registry_.get(id); + ([&]{ + std::get>(res).emplace_back(std::get(components)); + }(), ...); + auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); + if (--remaining_entity_fetch_count == 0) { + ids_to_erase.emplace_back(id); } } + registry_.destroy(ids_to_erase.begin(), ids_to_erase.end()); return res; } - std::vector get_initial_expected_get_calls(const std::vector& ids) { - std::vector initial_expected_get_calls; - initial_expected_get_calls.reserve(ids.size()); - std::lock_guard lock(mtx_); - auto&& initial_expected_get_calls_registry = registry_.storage(initial_expected_get_calls_id); - for (auto id: ids) { - initial_expected_get_calls.emplace_back(initial_expected_get_calls_registry.get(id)); - } - return initial_expected_get_calls; - } - private: entt::registry registry_; std::mutex mtx_; diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index a784aaf594..8d12cdf98c 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -733,7 +733,7 @@ std::vector read_and_process( std::move(segment_and_slice_futures), std::move(processing_unit_indexes), std::make_shared>>(read_query.clauses_)); - auto proc = gather_entities(component_manager, std::move(processed_entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); if (std::any_of(read_query.clauses_.begin(), read_query.clauses_.end(), [](const std::shared_ptr& clause) { return clause->clause_info().modifies_output_descriptor_; From 4f391897f37825ec9e091c814767d6d4a837cd54 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 13 Sep 2024 14:23:59 +0100 Subject: [PATCH 11/38] Renamed segment_initial_expected_get_calls --- cpp/arcticdb/processing/clause.cpp | 4 ++-- cpp/arcticdb/processing/clause.hpp | 2 +- cpp/arcticdb/processing/processing_unit.cpp | 14 +++++++------- cpp/arcticdb/processing/processing_unit.hpp | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 4163e65473..9d3db879ea 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -621,11 +621,11 @@ std::vector ResampleClause::process(std::vector, std::shared_ptr, std::shared_ptr, EntityFetchCount >(*component_manager_, std::move(entity_ids)); auto row_slices = split_by_row_slice(std::move(proc)); - // If the expected get calls for the segments in the first row slice are 2, the first bucket overlapping this row + // If the entity fetch counts for the entities in the first row slice are 2, the first bucket overlapping this row // slice is being computed by the call to process dealing with the row slices above these. Otherwise, this call // should do it const auto& front_slice = row_slices.front(); - bool responsible_for_first_overlapping_bucket = front_slice.segment_initial_expected_get_calls_->at(0) == 1; + bool responsible_for_first_overlapping_bucket = front_slice.entity_fetch_count_->at(0) == 1; // Find the iterators into bucket_boundaries_ of the start of the first and the end of the last bucket this call to process is // responsible for calculating // All segments in a given row slice contain the same index column, so just grab info from the first one diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp index 213d874c6e..4d536b89ce 100644 --- a/cpp/arcticdb/processing/clause.hpp +++ b/cpp/arcticdb/processing/clause.hpp @@ -134,7 +134,7 @@ ProcessingUnit gather_entities(ComponentManager& component_manager, std::vector< } else if constexpr (std::is_same_v>) { res.set_atom_keys(std::move(component)); } else if constexpr (std::is_same_v) { - res.set_segment_initial_expected_get_calls(std::move(component)); + res.set_entity_fetch_count(std::move(component)); } else { static_assert(sizeof(Args) == 0, "Unexpected component type provided in gather_entities"); } diff --git a/cpp/arcticdb/processing/processing_unit.cpp b/cpp/arcticdb/processing/processing_unit.cpp index 45ff293e9d..8a8f8e3bf0 100644 --- a/cpp/arcticdb/processing/processing_unit.cpp +++ b/cpp/arcticdb/processing/processing_unit.cpp @@ -109,7 +109,7 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { internal::check(input.segments_.has_value(), "split_by_row_slice needs Segments"); internal::check(input.row_ranges_.has_value(), "split_by_row_slice needs RowRanges"); internal::check(input.col_ranges_.has_value(), "split_by_row_slice needs ColRanges"); - auto include_expected_get_calls = input.segment_initial_expected_get_calls_.has_value(); + auto include_expected_get_calls = input.entity_fetch_count_.has_value(); std::vector output; // Some clauses (e.g. AggregationClause) are lossy about row-ranges. We can assume that if all of the input column @@ -119,7 +119,7 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { for (size_t idx = 0; idx < input.segments_->size(); ++idx) { ProcessingUnit proc_tmp(std::move(*input.segments_->at(idx)), std::move(*input.row_ranges_->at(idx)), std::move(*input.col_ranges_->at(idx))); if (include_expected_get_calls) { - proc_tmp.set_segment_initial_expected_get_calls({input.segment_initial_expected_get_calls_->at(idx)}); + proc_tmp.set_entity_fetch_count({input.entity_fetch_count_->at(idx)}); } output.emplace_back(std::move(proc_tmp)); } @@ -131,7 +131,7 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { it->second.row_ranges_->emplace_back(input.row_ranges_->at(idx)); it->second.col_ranges_->emplace_back(input.col_ranges_->at(idx)); if (include_expected_get_calls) { - it->second.segment_initial_expected_get_calls_->emplace_back(input.segment_initial_expected_get_calls_->at(idx)); + it->second.entity_fetch_count_->emplace_back(input.entity_fetch_count_->at(idx)); } } else { auto [inserted_it, _] = output_map.emplace(*row_range_ptr, ProcessingUnit{}); @@ -139,7 +139,7 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { inserted_it->second.row_ranges_.emplace(1, input.row_ranges_->at(idx)); inserted_it->second.col_ranges_.emplace(1, input.col_ranges_->at(idx)); if (include_expected_get_calls) { - inserted_it->second.segment_initial_expected_get_calls_.emplace(1, input.segment_initial_expected_get_calls_->at(idx)); + inserted_it->second.entity_fetch_count_.emplace(1, input.entity_fetch_count_->at(idx)); } } } @@ -154,14 +154,14 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { // The expected get counts for all segments in a row slice should be the same // This should always be 1 or 2 for the first/last row slice, and 1 for all of the others for (auto row_slice = output.cbegin(); row_slice != output.cend(); ++row_slice) { - auto expected_get_calls = row_slice->segment_initial_expected_get_calls_->front(); + auto expected_get_calls = row_slice->entity_fetch_count_->front(); uint64_t max_expected_get_calls = row_slice == output.cbegin() || row_slice == std::prev(output.cend()) ? 2 : 1; internal::check(0 < expected_get_calls && expected_get_calls <= max_expected_get_calls, "expected_get_calls in split_by_row_slice should be 1 or 2, got {}", expected_get_calls); internal::check( - std::all_of(row_slice->segment_initial_expected_get_calls_->begin(), - row_slice->segment_initial_expected_get_calls_->end(), + std::all_of(row_slice->entity_fetch_count_->begin(), + row_slice->entity_fetch_count_->end(), [&expected_get_calls](uint64_t i) { return i == expected_get_calls; }), "All segments in same row slice should have same expected_get_calls in split_by_row_slice"); } diff --git a/cpp/arcticdb/processing/processing_unit.hpp b/cpp/arcticdb/processing/processing_unit.hpp index 0491bf7efa..98892afac9 100644 --- a/cpp/arcticdb/processing/processing_unit.hpp +++ b/cpp/arcticdb/processing/processing_unit.hpp @@ -51,7 +51,7 @@ namespace arcticdb { std::optional>> col_ranges_; std::optional>> atom_keys_; std::optional bucket_; - std::optional> segment_initial_expected_get_calls_; + std::optional> entity_fetch_count_; std::shared_ptr expression_context_; std::unordered_map computed_data_; @@ -90,8 +90,8 @@ namespace arcticdb { bucket_.emplace(bucket); } - void set_segment_initial_expected_get_calls(std::vector&& segment_initial_expected_get_calls) { - segment_initial_expected_get_calls_.emplace(segment_initial_expected_get_calls); + void set_entity_fetch_count(std::vector&& entity_fetch_count) { + entity_fetch_count_.emplace(entity_fetch_count); } void apply_filter(util::BitSet&& bitset, PipelineOptimisation optimisation); From 75f59f008413b9e54f11f9f9447640b28ab89a4a Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 13 Sep 2024 15:15:32 +0100 Subject: [PATCH 12/38] Ensure entities are deleted when no longer needed --- cpp/arcticdb/processing/component_manager.hpp | 20 ++++++++++++++----- cpp/arcticdb/version/version_core.cpp | 5 +++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 6b3e9e7b22..ec4c35df4c 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -36,13 +36,16 @@ class ComponentManager { // Add a single entity with the components defined by args template - void add_entity(EntityId id, uint64_t expected_get_calls, Args... args) { + void add_entity(EntityId id, Args... args) { std::lock_guard lock(mtx_); // Fold expressions magic to emplace every element of args into the registry with its type as template parameter - ([&]{ registry_.emplace(id, args); }(), ...); - registry_.emplace(id, expected_get_calls); - auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); - remaining_entity_fetch_count_registry.emplace(id, expected_get_calls); + ([&]{ + registry_.emplace(id, args); + if constexpr (std::is_same_v) { + auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + remaining_entity_fetch_count_registry.emplace(id, args); + } + }(), ...); } // Add a collection of entities. Each element of args should be a vector of components, all of which are the same length @@ -108,6 +111,13 @@ class ComponentManager { return res; } + bool empty() const { + for (ARCTICDB_UNUSED auto _: registry_.view()) { + return false; + } + return true; + } + private: entt::registry registry_; std::mutex mtx_; diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 8d12cdf98c..41b6c4d716 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -563,11 +563,11 @@ std::vector process_clauses( if (!(*slice_added)[pos]) { component_manager->add_entity( entity_id, - (*segment_proc_unit_counts)[pos], std::make_shared(std::move(segment_and_slice.segment_in_memory_)), std::make_shared(std::move(segment_and_slice.ranges_and_key_.row_range_)), std::make_shared(std::move(segment_and_slice.ranges_and_key_.col_range_)), - std::make_shared(std::move(segment_and_slice.ranges_and_key_.key_)) + std::make_shared(std::move(segment_and_slice.ranges_and_key_.key_)), + (*segment_proc_unit_counts)[pos] ); (*slice_added)[pos] = true; } @@ -734,6 +734,7 @@ std::vector read_and_process( std::move(processing_unit_indexes), std::make_shared>>(read_query.clauses_)); auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); + internal::check(component_manager->empty(), "ComponentManager still contains entities"); if (std::any_of(read_query.clauses_.begin(), read_query.clauses_.end(), [](const std::shared_ptr& clause) { return clause->clause_info().modifies_output_descriptor_; From cb1457eaabffdb723e55aedd9543dd108ea05d67 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 13 Sep 2024 15:58:32 +0100 Subject: [PATCH 13/38] Fixed C++ tests --- cpp/arcticdb/CMakeLists.txt | 138 +++++++++--------- cpp/arcticdb/processing/clause.cpp | 2 +- cpp/arcticdb/processing/test/test_clause.cpp | 34 ++--- .../test/test_component_manager.cpp | 38 ++--- .../processing/test/test_resample.cpp | 16 +- cpp/arcticdb/version/version_core.cpp | 2 +- 6 files changed, 117 insertions(+), 113 deletions(-) diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index 9733fde747..1845b97731 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -874,79 +874,79 @@ if(${TEST}) python_utils_dump_vars_if_enabled("Python for test compilation") set(unit_test_srcs -# async/test/test_async.cpp -# codec/test/test_codec.cpp -# codec/test/test_encode_field_collection.cpp -# codec/test/test_segment_header.cpp -# codec/test/test_encoded_field.cpp -# column_store/test/ingestion_stress_test.cpp -# column_store/test/test_column.cpp -# column_store/test/test_column_data_random_accessor.cpp -# column_store/test/test_index_filtering.cpp -# column_store/test/test_memory_segment.cpp -# entity/test/test_atom_key.cpp -# entity/test/test_key_serialization.cpp -# entity/test/test_metrics.cpp -# entity/test/test_ref_key.cpp -# entity/test/test_tensor.cpp -# log/test/test_log.cpp -# pipeline/test/test_container.hpp -# pipeline/test/test_pipeline.cpp -# pipeline/test/test_query.cpp -# util/test/test_regex.cpp -# processing/test/test_arithmetic_type_promotion.cpp + async/test/test_async.cpp + codec/test/test_codec.cpp + codec/test/test_encode_field_collection.cpp + codec/test/test_segment_header.cpp + codec/test/test_encoded_field.cpp + column_store/test/ingestion_stress_test.cpp + column_store/test/test_column.cpp + column_store/test/test_column_data_random_accessor.cpp + column_store/test/test_index_filtering.cpp + column_store/test/test_memory_segment.cpp + entity/test/test_atom_key.cpp + entity/test/test_key_serialization.cpp + entity/test/test_metrics.cpp + entity/test/test_ref_key.cpp + entity/test/test_tensor.cpp + log/test/test_log.cpp + pipeline/test/test_container.hpp + pipeline/test/test_pipeline.cpp + pipeline/test/test_query.cpp + util/test/test_regex.cpp + processing/test/test_arithmetic_type_promotion.cpp processing/test/test_clause.cpp processing/test/test_component_manager.cpp -# processing/test/test_expression.cpp -# processing/test/test_filter_and_project_sparse.cpp -# processing/test/test_has_valid_type_promotion.cpp -# processing/test/test_operation_dispatch.cpp + processing/test/test_expression.cpp + processing/test/test_filter_and_project_sparse.cpp + processing/test/test_has_valid_type_promotion.cpp + processing/test/test_operation_dispatch.cpp processing/test/test_resample.cpp -# processing/test/test_set_membership.cpp -# processing/test/test_signed_unsigned_comparison.cpp -# processing/test/test_type_comparison.cpp -# storage/test/test_local_storages.cpp -# storage/test/test_memory_storage.cpp -# storage/test/test_s3_storage.cpp -# storage/test/test_storage_factory.cpp -# storage/test/test_storage_exceptions.cpp -# storage/test/test_azure_storage.cpp -# storage/test/test_storage_operations.cpp -# stream/test/stream_test_common.cpp -# stream/test/test_aggregator.cpp -# stream/test/test_append_map.cpp -# stream/test/test_row_builder.cpp -# stream/test/test_segment_aggregator.cpp -# stream/test/test_types.cpp -# util/memory_tracing.hpp + processing/test/test_set_membership.cpp + processing/test/test_signed_unsigned_comparison.cpp + processing/test/test_type_comparison.cpp + storage/test/test_local_storages.cpp + storage/test/test_memory_storage.cpp + storage/test/test_s3_storage.cpp + storage/test/test_storage_factory.cpp + storage/test/test_storage_exceptions.cpp + storage/test/test_azure_storage.cpp + storage/test/test_storage_operations.cpp + stream/test/stream_test_common.cpp + stream/test/test_aggregator.cpp + stream/test/test_append_map.cpp + stream/test/test_row_builder.cpp + stream/test/test_segment_aggregator.cpp + stream/test/test_types.cpp + util/memory_tracing.hpp util/test/gtest_main.cpp -# util/test/test_bitmagic.cpp -# util/test/test_buffer_pool.cpp -# util/test/test_composite.cpp -# util/test/test_cursor.cpp -# util/test/test_decimal.cpp -# util/test/test_exponential_backoff.cpp -# util/test/test_format_date.cpp -# util/test/test_hash.cpp -# util/test/test_id_transformation.cpp -# util/test/test_ranges_from_future.cpp -# util/test/test_slab_allocator.cpp -# util/test/test_storage_lock.cpp -# util/test/test_string_pool.cpp -# util/test/test_string_utils.cpp -# util/test/test_tracing_allocator.cpp -# version/test/test_append.cpp -# version/test/test_sparse.cpp -# version/test/test_stream_version_data.cpp -# version/test/test_symbol_list.cpp -# version/test/test_version_map.cpp -# version/test/test_version_map_batch.cpp -# version/test/test_version_store.cpp -# version/test/test_sorting_info_state_machine.cpp -# version/test/version_map_model.hpp -# python/python_handlers.cpp -# storage/test/common.hpp -# version/test/test_sort_index.cpp + util/test/test_bitmagic.cpp + util/test/test_buffer_pool.cpp + util/test/test_composite.cpp + util/test/test_cursor.cpp + util/test/test_decimal.cpp + util/test/test_exponential_backoff.cpp + util/test/test_format_date.cpp + util/test/test_hash.cpp + util/test/test_id_transformation.cpp + util/test/test_ranges_from_future.cpp + util/test/test_slab_allocator.cpp + util/test/test_storage_lock.cpp + util/test/test_string_pool.cpp + util/test/test_string_utils.cpp + util/test/test_tracing_allocator.cpp + version/test/test_append.cpp + version/test/test_sparse.cpp + version/test/test_stream_version_data.cpp + version/test/test_symbol_list.cpp + version/test/test_version_map.cpp + version/test/test_version_map_batch.cpp + version/test/test_version_store.cpp + version/test/test_sorting_info_state_machine.cpp + version/test/version_map_model.hpp + python/python_handlers.cpp + storage/test/common.hpp + version/test/test_sort_index.cpp ) set(EXECUTABLE_PERMS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) # 755 diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 9d3db879ea..271eac4d61 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -619,7 +619,7 @@ std::vector ResampleClause::process(std::vector, std::shared_ptr, std::shared_ptr, EntityFetchCount >(*component_manager_, std::move(entity_ids)); + auto proc = gather_entities, std::shared_ptr, std::shared_ptr, EntityFetchCount>(*component_manager_, std::move(entity_ids)); auto row_slices = split_by_row_slice(std::move(proc)); // If the entity fetch counts for the entities in the first row slice are 2, the first bucket overlapping this row // slice is being computed by the call to process dealing with the row slices above these. Otherwise, this call diff --git a/cpp/arcticdb/processing/test/test_clause.cpp b/cpp/arcticdb/processing/test/test_clause.cpp index 1da137317c..362d9a1a09 100644 --- a/cpp/arcticdb/processing/test/test_clause.cpp +++ b/cpp/arcticdb/processing/test/test_clause.cpp @@ -52,7 +52,7 @@ TEST(Clause, PartitionEmptyColumn) { partition.set_component_manager(component_manager); auto proc_unit = ProcessingUnit{generate_groupby_testing_empty_segment(100, 10)}; - auto entity_ids = push_entities(component_manager, std::move(proc_unit)); + auto entity_ids = push_entities(*component_manager, std::move(proc_unit)); auto processed = partition.process(std::move(entity_ids)); ASSERT_TRUE(processed.empty()); @@ -73,9 +73,9 @@ TEST(Clause, AggregationEmptyColumn) { size_t num_rows{100}; size_t unique_grouping_values{10}; auto proc_unit = ProcessingUnit{generate_groupby_testing_segment(num_rows, unique_grouping_values)}; - auto entity_ids = push_entities(component_manager, std::move(proc_unit)); + auto entity_ids = push_entities(*component_manager, std::move(proc_unit)); - auto aggregated = gather_entities(component_manager, aggregation.process(std::move(entity_ids))); + auto aggregated = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, aggregation.process(std::move(entity_ids))); ASSERT_TRUE(aggregated.segments_.has_value()); auto segments = aggregated.segments_.value(); ASSERT_EQ(1, segments.size()); @@ -147,9 +147,9 @@ TEST(Clause, AggregationColumn) size_t num_rows{100}; size_t unique_grouping_values{10}; auto proc_unit = ProcessingUnit{generate_groupby_testing_segment(num_rows, unique_grouping_values)}; - auto entity_ids = push_entities(component_manager, std::move(proc_unit)); + auto entity_ids = push_entities(*component_manager, std::move(proc_unit)); - auto aggregated = gather_entities(component_manager, aggregation.process(std::move(entity_ids))); + auto aggregated = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, aggregation.process(std::move(entity_ids))); ASSERT_TRUE(aggregated.segments_.has_value()); auto segments = aggregated.segments_.value(); ASSERT_EQ(1, segments.size()); @@ -178,9 +178,9 @@ TEST(Clause, AggregationSparseColumn) size_t num_rows{100}; size_t unique_grouping_values{10}; auto proc_unit = ProcessingUnit{generate_groupby_testing_sparse_segment(num_rows, unique_grouping_values)}; - auto entity_ids = push_entities(component_manager, std::move(proc_unit)); + auto entity_ids = push_entities(*component_manager, std::move(proc_unit)); - auto aggregated = gather_entities(component_manager, aggregation.process(std::move(entity_ids))); + auto aggregated = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, aggregation.process(std::move(entity_ids))); ASSERT_TRUE(aggregated.segments_.has_value()); auto segments = aggregated.segments_.value(); ASSERT_EQ(1, segments.size()); @@ -242,9 +242,9 @@ TEST(Clause, AggregationSparseGroupby) { // 1 more group because of missing values size_t unique_groups{unique_grouping_values + 1}; auto proc_unit = ProcessingUnit{generate_sparse_groupby_testing_segment(num_rows, unique_grouping_values)}; - auto entity_ids = push_entities(component_manager, std::move(proc_unit)); + auto entity_ids = push_entities(*component_manager, std::move(proc_unit)); - auto aggregated = gather_entities(component_manager, aggregation.process(std::move(entity_ids))); + auto aggregated = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, aggregation.process(std::move(entity_ids))); ASSERT_TRUE(aggregated.segments_.has_value()); auto segments = aggregated.segments_.value(); ASSERT_EQ(1, segments.size()); @@ -300,9 +300,9 @@ TEST(Clause, Passthrough) { auto seg = get_standard_timeseries_segment("passthrough"); auto copied = seg.clone(); auto proc_unit = ProcessingUnit{std::move(seg)};; - auto entity_ids = push_entities(component_manager, std::move(proc_unit)); + auto entity_ids = push_entities(*component_manager, std::move(proc_unit)); - auto ret = gather_entities(component_manager, passthrough.process(std::move(entity_ids))); + auto ret = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, passthrough.process(std::move(entity_ids))); ASSERT_TRUE(ret.segments_.has_value()); ASSERT_EQ(ret.segments_->size(), 1); ASSERT_EQ(*ret.segments_->at(0), copied); @@ -321,9 +321,9 @@ TEST(Clause, Sort) { std::mt19937 urng(rng()); std::shuffle(seg.begin(), seg.end(), urng); auto proc_unit = ProcessingUnit{std::move(seg)}; - auto entity_ids = push_entities(component_manager, std::move(proc_unit)); + auto entity_ids = push_entities(*component_manager, std::move(proc_unit)); - auto res = gather_entities(component_manager, sort_clause.process(std::move(entity_ids))); + auto res = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, sort_clause.process(std::move(entity_ids))); ASSERT_TRUE(res.segments_.has_value()); ASSERT_EQ(*res.segments_->at(0), copied); } @@ -339,9 +339,9 @@ TEST(Clause, Split) { auto seg = get_standard_timeseries_segment(symbol, 100); auto copied = seg.clone(); auto proc_unit = ProcessingUnit{std::move(seg)}; - auto entity_ids = push_entities(component_manager, std::move(proc_unit)); + auto entity_ids = push_entities(*component_manager, std::move(proc_unit)); - auto res = gather_entities(component_manager, split_clause.process(std::move(entity_ids))); + auto res = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, split_clause.process(std::move(entity_ids))); ASSERT_TRUE(res.segments_.has_value()); ASSERT_EQ(res.segments_->size(), 10); @@ -404,14 +404,14 @@ TEST(Clause, Merge) { std::vector entity_ids; for(auto x = 0u; x < num_segs; ++x) { auto proc_unit = ProcessingUnit{std::move(segs[x])}; - entity_ids.push_back(push_entities(component_manager, std::move(proc_unit))[0]); + entity_ids.push_back(push_entities(*component_manager, std::move(proc_unit))[0]); } std::vector processed_ids = merge_clause.process(std::move(entity_ids)); std::vector> vec; vec.emplace_back(std::move(processed_ids)); auto repartitioned = merge_clause.repartition(std::move(vec)); - auto res = gather_entities(component_manager, std::move(repartitioned->at(0))); + auto res = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(repartitioned->at(0))); ASSERT_TRUE(res.segments_.has_value()); ASSERT_EQ(res.segments_->size(), 1u); ASSERT_EQ(*res.segments_->at(0), seg); diff --git a/cpp/arcticdb/processing/test/test_component_manager.cpp b/cpp/arcticdb/processing/test/test_component_manager.cpp index fd1c8aa6ea..0cddd5922a 100644 --- a/cpp/arcticdb/processing/test/test_component_manager.cpp +++ b/cpp/arcticdb/processing/test/test_component_manager.cpp @@ -18,30 +18,34 @@ TEST(ComponentManager, Simple) { auto row_range_0 = std::make_shared(0, 10); auto col_range_0 = std::make_shared(10, 20); auto key_0 = std::make_shared(AtomKeyBuilder().version_id(1).build("symbol_0", KeyType::TABLE_DATA)); - uint64_t expected_get_calls_0{1}; + uint64_t entity_fetch_count_0{1}; auto segment_1 = std::make_shared(); auto row_range_1 = std::make_shared(20, 30); auto col_range_1 = std::make_shared(30, 40); auto key_1 = std::make_shared(AtomKeyBuilder().version_id(2).build("symbol_1", KeyType::TABLE_DATA)); - uint64_t expected_get_calls_1{2}; + uint64_t entity_fetch_count_1{2}; auto id_0 = component_manager.get_new_entity_id(); - component_manager.add_entity(id_0, expected_get_calls_0, segment_0, row_range_0, col_range_0, key_0); + component_manager.add_entity(id_0, segment_0, row_range_0, col_range_0, key_0, entity_fetch_count_0); auto id_1 = component_manager.get_new_entity_id(); - component_manager.add_entity(id_1, expected_get_calls_1, segment_1, row_range_1, col_range_1, key_1); - - ASSERT_EQ(component_manager.get>({id_0})[0], segment_0); - ASSERT_EQ(component_manager.get_initial_expected_get_calls({id_0})[0], expected_get_calls_0); - ASSERT_EQ(component_manager.get>({id_0})[0], row_range_0); - ASSERT_EQ(component_manager.get>({id_0})[0], col_range_0); - ASSERT_EQ(component_manager.get>({id_0})[0], key_0); - - ASSERT_EQ(component_manager.get>({id_1})[0], segment_1); - ASSERT_EQ(component_manager.get>({id_1})[0], segment_1); - ASSERT_EQ(component_manager.get_initial_expected_get_calls({id_1})[0], expected_get_calls_1); - ASSERT_EQ(component_manager.get>({id_1})[0], row_range_1); - ASSERT_EQ(component_manager.get>({id_1})[0], col_range_1); - ASSERT_EQ(component_manager.get>({id_1})[0], key_1); + component_manager.add_entity(id_1, segment_1, row_range_1, col_range_1, key_1, entity_fetch_count_1); + + auto [segments, row_ranges, col_ranges, keys, entity_fetch_counts] = component_manager.get_entities, std::shared_ptr, std::shared_ptr, std::shared_ptr, EntityFetchCount>({id_0, id_1}); + + ASSERT_EQ(segments[0], segment_0); + ASSERT_EQ(row_ranges[0], row_range_0); + ASSERT_EQ(col_ranges[0], col_range_0); + ASSERT_EQ(keys[0], key_0); + ASSERT_EQ(entity_fetch_counts[0], entity_fetch_count_0); + + ASSERT_EQ(segments[1], segment_1); + ASSERT_EQ(row_ranges[1], row_range_1); + ASSERT_EQ(col_ranges[1], col_range_1); + ASSERT_EQ(keys[1], key_1); + ASSERT_EQ(entity_fetch_counts[1], entity_fetch_count_1); + + // EntityFetchCount for entity with id_1 is 2, so can be fetched again without exceptions + component_manager.get_entities, std::shared_ptr, std::shared_ptr, std::shared_ptr, EntityFetchCount>({id_1}); } diff --git a/cpp/arcticdb/processing/test/test_resample.cpp b/cpp/arcticdb/processing/test/test_resample.cpp index 084abb54ad..d669f4c011 100644 --- a/cpp/arcticdb/processing/test/test_resample.cpp +++ b/cpp/arcticdb/processing/test/test_resample.cpp @@ -242,9 +242,9 @@ TEST(Resample, ProcessOneSegment) { seg.set_row_id(num_rows - 1); auto proc_unit = ProcessingUnit{std::move(seg)}; - auto entity_ids = push_entities(component_manager, std::move(proc_unit)); + auto entity_ids = push_entities(*component_manager, std::move(proc_unit)); - auto resampled = gather_entities(component_manager, resample.process(std::move(entity_ids))); + auto resampled = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, resample.process(std::move(entity_ids))); ASSERT_TRUE(resampled.segments_.has_value()); auto segments = resampled.segments_.value(); ASSERT_EQ(1, segments.size()); @@ -330,17 +330,17 @@ TEST(Resample, ProcessMultipleSegments) { auto col_range_2 = std::make_shared(1, 2); auto id_0 = component_manager->get_new_entity_id(); - component_manager->add_entity(id_0, 1, seg_0, row_range_0, col_range_0); + component_manager->add_entity(id_0, seg_0, row_range_0, col_range_0, EntityFetchCount(1)); auto id_1 = component_manager->get_new_entity_id(); - component_manager->add_entity(id_1, 2, seg_1, row_range_1, col_range_1); + component_manager->add_entity(id_1, seg_1, row_range_1, col_range_1, EntityFetchCount(2)); auto id_2 = component_manager->get_new_entity_id(); - component_manager->add_entity(id_2, 1, seg_2, row_range_2, col_range_2); + component_manager->add_entity(id_2, seg_2, row_range_2, col_range_2, EntityFetchCount(1)); std::vector ids_0{id_0, id_1}; std::vector ids_1{id_1}; std::vector ids_2{id_2}; - auto resampled_0 = gather_entities(component_manager, resample.process(std::move(ids_0))); + auto resampled_0 = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, resample.process(std::move(ids_0))); auto resampled_seg_0 = *resampled_0.segments_.value()[0]; auto& resampled_index_column_0 = resampled_seg_0.column(0); auto& resampled_sum_column_0 = resampled_seg_0.column(1); @@ -349,7 +349,7 @@ TEST(Resample, ProcessMultipleSegments) { ASSERT_EQ(0, resampled_sum_column_0.scalar_at(0)); ASSERT_EQ(30, resampled_sum_column_0.scalar_at(1)); - auto resampled_1 = gather_entities(component_manager, resample.process(std::move(ids_1))); + auto resampled_1 = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, resample.process(std::move(ids_1))); auto resampled_seg_1 = *resampled_1.segments_.value()[0]; auto& resampled_index_column_1 = resampled_seg_1.column(0); auto& resampled_sum_column_1 = resampled_seg_1.column(1); @@ -358,7 +358,7 @@ TEST(Resample, ProcessMultipleSegments) { ASSERT_EQ(30, resampled_sum_column_1.scalar_at(0)); ASSERT_EQ(40, resampled_sum_column_1.scalar_at(1)); - auto resampled_2 = gather_entities(component_manager, resample.process(std::move(ids_2))); + auto resampled_2 = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, resample.process(std::move(ids_2))); auto resampled_seg_2 = *resampled_2.segments_.value()[0]; auto& resampled_index_column_2 = resampled_seg_2.column(0); auto& resampled_sum_column_2 = resampled_seg_2.column(1); diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 41b6c4d716..bfe493ba96 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -497,7 +497,7 @@ std::vector process_clauses( } // Map from index in segment_and_slice_future_splitters to the number of processing units that require that segment - auto segment_proc_unit_counts = std::make_shared>(segment_and_slice_futures.size(), 0); + auto segment_proc_unit_counts = std::make_shared>(segment_and_slice_futures.size(), 0); for (const auto& list: processing_unit_indexes) { for (auto idx: list) { internal::check( From 037d8bbcadbbd8edd3fd57804de15a667e98b709 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 13 Sep 2024 16:14:37 +0100 Subject: [PATCH 14/38] Remove component_manager.cpp --- cpp/arcticdb/CMakeLists.txt | 1 - cpp/arcticdb/processing/component_manager.cpp | 13 ------------- 2 files changed, 14 deletions(-) delete mode 100644 cpp/arcticdb/processing/component_manager.cpp diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index 1845b97731..b22f4b080f 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -434,7 +434,6 @@ set(arcticdb_srcs processing/processing_unit.cpp processing/aggregation_utils.cpp processing/clause.cpp - processing/component_manager.cpp processing/expression_node.cpp processing/operation_dispatch.cpp processing/operation_dispatch_unary.cpp diff --git a/cpp/arcticdb/processing/component_manager.cpp b/cpp/arcticdb/processing/component_manager.cpp deleted file mode 100644 index 9aa0155a5d..0000000000 --- a/cpp/arcticdb/processing/component_manager.cpp +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright 2023 Man Group Operations Limited - * - * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. - * - * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. - */ - -#include - -namespace arcticdb { - - -} // namespace arcticdb \ No newline at end of file From 403eba2614d03641c687d1bbc887fa1540b7ede7 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 13 Sep 2024 16:57:57 +0100 Subject: [PATCH 15/38] Fix benchmark_clause.cpp --- cpp/arcticdb/processing/test/benchmark_clause.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/arcticdb/processing/test/benchmark_clause.cpp b/cpp/arcticdb/processing/test/benchmark_clause.cpp index 35ce8d56e3..e969508c56 100644 --- a/cpp/arcticdb/processing/test/benchmark_clause.cpp +++ b/cpp/arcticdb/processing/test/benchmark_clause.cpp @@ -43,7 +43,7 @@ void time_merge_on_segments(const std::vector &segments, benchm std::vector entity_ids; for (auto& segment : segments){ auto proc_unit = ProcessingUnit{segment.clone()}; - entity_ids.push_back(push_entities(component_manager, std::move(proc_unit))[0]); + entity_ids.push_back(push_entities(*component_manager, std::move(proc_unit))[0]); } auto stream_id = StreamId("Merge"); From 70673600bdb1459c35fe6a476a2dd14bd595c0d3 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 13 Sep 2024 17:15:00 +0100 Subject: [PATCH 16/38] Convert debug::check to internal::chec, rename some old references to expected_get_count --- cpp/arcticdb/processing/component_manager.hpp | 2 +- cpp/arcticdb/processing/processing_unit.cpp | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index ec4c35df4c..18b59f05d7 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -63,7 +63,7 @@ class ComponentManager { ids.emplace_back(get_new_entity_id()); } } else { - debug::check( + internal::check( args.size() == entity_count, "ComponentManager::add_entities received vectors of differing lengths" ); diff --git a/cpp/arcticdb/processing/processing_unit.cpp b/cpp/arcticdb/processing/processing_unit.cpp index 8a8f8e3bf0..d1eb37a35b 100644 --- a/cpp/arcticdb/processing/processing_unit.cpp +++ b/cpp/arcticdb/processing/processing_unit.cpp @@ -109,7 +109,7 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { internal::check(input.segments_.has_value(), "split_by_row_slice needs Segments"); internal::check(input.row_ranges_.has_value(), "split_by_row_slice needs RowRanges"); internal::check(input.col_ranges_.has_value(), "split_by_row_slice needs ColRanges"); - auto include_expected_get_calls = input.entity_fetch_count_.has_value(); + auto include_entity_fetch_count = input.entity_fetch_count_.has_value(); std::vector output; // Some clauses (e.g. AggregationClause) are lossy about row-ranges. We can assume that if all of the input column @@ -118,7 +118,7 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { output.reserve(input.segments_->size()); for (size_t idx = 0; idx < input.segments_->size(); ++idx) { ProcessingUnit proc_tmp(std::move(*input.segments_->at(idx)), std::move(*input.row_ranges_->at(idx)), std::move(*input.col_ranges_->at(idx))); - if (include_expected_get_calls) { + if (include_entity_fetch_count) { proc_tmp.set_entity_fetch_count({input.entity_fetch_count_->at(idx)}); } output.emplace_back(std::move(proc_tmp)); @@ -130,7 +130,7 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { it->second.segments_->emplace_back(input.segments_->at(idx)); it->second.row_ranges_->emplace_back(input.row_ranges_->at(idx)); it->second.col_ranges_->emplace_back(input.col_ranges_->at(idx)); - if (include_expected_get_calls) { + if (include_entity_fetch_count) { it->second.entity_fetch_count_->emplace_back(input.entity_fetch_count_->at(idx)); } } else { @@ -138,7 +138,7 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { inserted_it->second.segments_.emplace(1, input.segments_->at(idx)); inserted_it->second.row_ranges_.emplace(1, input.row_ranges_->at(idx)); inserted_it->second.col_ranges_.emplace(1, input.col_ranges_->at(idx)); - if (include_expected_get_calls) { + if (include_entity_fetch_count) { inserted_it->second.entity_fetch_count_.emplace(1, input.entity_fetch_count_->at(idx)); } } @@ -150,20 +150,20 @@ std::vector split_by_row_slice(ProcessingUnit&& proc) { } internal::check(!output.empty(), "Unexpected empty output in split_by_row_slice"); - if (include_expected_get_calls) { + if (include_entity_fetch_count) { // The expected get counts for all segments in a row slice should be the same // This should always be 1 or 2 for the first/last row slice, and 1 for all of the others for (auto row_slice = output.cbegin(); row_slice != output.cend(); ++row_slice) { - auto expected_get_calls = row_slice->entity_fetch_count_->front(); - uint64_t max_expected_get_calls = row_slice == output.cbegin() || row_slice == std::prev(output.cend()) ? 2 : 1; - internal::check(0 < expected_get_calls && expected_get_calls <= max_expected_get_calls, - "expected_get_calls in split_by_row_slice should be 1 or 2, got {}", - expected_get_calls); + auto entity_fetch_count = row_slice->entity_fetch_count_->front(); + uint64_t max_entity_fetch_count = row_slice == output.cbegin() || row_slice == std::prev(output.cend()) ? 2 : 1; + internal::check(0 < entity_fetch_count && entity_fetch_count <= max_entity_fetch_count, + "entity_fetch_count in split_by_row_slice should be 1 or 2, got {}", + entity_fetch_count); internal::check( std::all_of(row_slice->entity_fetch_count_->begin(), row_slice->entity_fetch_count_->end(), - [&expected_get_calls](uint64_t i) { return i == expected_get_calls; }), - "All segments in same row slice should have same expected_get_calls in split_by_row_slice"); + [&entity_fetch_count](uint64_t i) { return i == entity_fetch_count; }), + "All segments in same row slice should have same entity_fetch_count in split_by_row_slice"); } } From 24065702c9b787a77025687e17c579cda77e43af Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 16 Sep 2024 13:15:23 +0000 Subject: [PATCH 17/38] REVERT ME: instrumented for time interacting with ComponentManager --- cpp/arcticdb/processing/clause.cpp | 13 +++++++++++++ cpp/arcticdb/processing/clause.hpp | 13 +++++++++++++ cpp/arcticdb/version/version_core.cpp | 13 +++++++++++++ 3 files changed, 39 insertions(+) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 271eac4d61..1be96cf711 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -79,11 +79,24 @@ std::vector> structure_all_together(std::vector push_entities(ComponentManager& component_manager, ProcessingUnit&& proc, EntityFetchCount entity_fetch_count) { + static std::atomic total_us{0}; + static std::atomic count{0}; + auto start = std::chrono::steady_clock::now(); auto ids = component_manager.add_entities(std::move(*proc.segments_), std::move(*proc.row_ranges_), std::move(*proc.col_ranges_)); component_manager.add_component(ids, entity_fetch_count); if (proc.bucket_.has_value()) { component_manager.add_component(ids, *proc.bucket_); } + auto end = std::chrono::steady_clock::now(); + auto us = std::chrono::duration_cast(end - start).count(); + auto old_total_us = total_us.fetch_add(us); + auto old_count = count.fetch_add(1); + if (old_count == 640000) { + log::schedule().debug("push_entities average over {} calls: {}us", old_count, double(old_total_us) / double(old_count)); + } + if (old_count == 640063) { + log::schedule().debug("push_entities average over {} calls: {}us", count.load(), double(total_us.load()) / double(count.load())); + } return ids; } diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp index 4d536b89ce..b2f7fa6dd0 100644 --- a/cpp/arcticdb/processing/clause.hpp +++ b/cpp/arcticdb/processing/clause.hpp @@ -121,6 +121,9 @@ std::vector> structure_all_together(std::vector ProcessingUnit gather_entities(ComponentManager& component_manager, std::vector&& entity_ids) { + static std::atomic total_us{0}; + static std::atomic count{0}; + auto start = std::chrono::steady_clock::now(); ProcessingUnit res; auto components = component_manager.get_entities(entity_ids); ([&]{ @@ -139,6 +142,16 @@ ProcessingUnit gather_entities(ComponentManager& component_manager, std::vector< static_assert(sizeof(Args) == 0, "Unexpected component type provided in gather_entities"); } }(), ...); + auto end = std::chrono::steady_clock::now(); + auto us = std::chrono::duration_cast(end - start).count(); + auto old_total_us = total_us.fetch_add(us); + auto old_count = count.fetch_add(1); + if (old_count == 10000) { + log::schedule().debug("gather_entities average over {} calls: {}us", old_count, double(old_total_us) / double(old_count)); + } + if (old_count == 10064) { + log::schedule().debug("gather_entities average over {} calls: {}us", old_count, double(old_total_us) / double(old_count)); + } return res; } diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index bfe493ba96..82c3f03a2f 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -534,6 +534,7 @@ std::vector process_clauses( // Used to make sure each entity is only added into the component manager once auto slice_added_mtx = std::make_shared>(segment_and_slice_futures.size()); auto slice_added = std::make_shared>(segment_and_slice_futures.size(), false); + auto entity_adding_time = std::make_shared>(segment_and_slice_futures.size(), 0); std::vector>> futures; bool first_clause{true}; // Reverse the order of clauses and iterate through them backwards so that the erase is efficient @@ -554,6 +555,7 @@ std::vector process_clauses( id_to_pos, slice_added_mtx, slice_added, + entity_adding_time, clauses, entity_ids = std::move(entity_ids)](std::vector&& segment_and_slices) mutable { for (auto&& [idx, segment_and_slice]: folly::enumerate(segment_and_slices)) { @@ -561,6 +563,7 @@ std::vector process_clauses( auto pos = id_to_pos->at(entity_id); std::lock_guard lock((*slice_added_mtx)[pos]); if (!(*slice_added)[pos]) { + auto start = std::chrono::steady_clock::now(); component_manager->add_entity( entity_id, std::make_shared(std::move(segment_and_slice.segment_in_memory_)), @@ -570,6 +573,9 @@ std::vector process_clauses( (*segment_proc_unit_counts)[pos] ); (*slice_added)[pos] = true; + auto end = std::chrono::steady_clock::now(); + auto us = std::chrono::duration_cast(end - start).count(); + (*entity_adding_time)[pos] = us; } } return async::MemSegmentProcessingTask(*clauses, std::move(entity_ids))(); @@ -594,6 +600,9 @@ std::vector process_clauses( clauses->erase(clauses->end() - 1); } } + auto total_us = std::accumulate(entity_adding_time->begin(), entity_adding_time->end(), 0LL); + auto count = entity_adding_time->size(); + log::schedule().debug("first entity push average over {} calls: {}us", count, double(total_us) / double(count)); return flatten_entities(std::move(entity_ids_vec)); } @@ -733,7 +742,11 @@ std::vector read_and_process( std::move(segment_and_slice_futures), std::move(processing_unit_indexes), std::make_shared>>(read_query.clauses_)); + auto start = std::chrono::steady_clock::now(); auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); + auto end = std::chrono::steady_clock::now(); + auto us = std::chrono::duration_cast(end - start).count(); + log::schedule().debug("Final gather_entities call: {}us", us); internal::check(component_manager->empty(), "ComponentManager still contains entities"); if (std::any_of(read_query.clauses_.begin(), read_query.clauses_.end(), [](const std::shared_ptr& clause) { From 1b95096f98ea65e25113d7c33837b3b5049d9541 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 16 Sep 2024 14:52:33 +0000 Subject: [PATCH 18/38] Only take ComponentManager mutex once in each call to push_entities --- cpp/arcticdb/processing/clause.cpp | 18 +++++++++++++++--- cpp/arcticdb/processing/component_manager.hpp | 15 ++++----------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 1be96cf711..3d97dfd2cf 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -82,10 +82,22 @@ std::vector push_entities(ComponentManager& component_manager, Process static std::atomic total_us{0}; static std::atomic count{0}; auto start = std::chrono::steady_clock::now(); - auto ids = component_manager.add_entities(std::move(*proc.segments_), std::move(*proc.row_ranges_), std::move(*proc.col_ranges_)); - component_manager.add_component(ids, entity_fetch_count); + std::vector entity_fetch_counts(proc.segments_->size(), entity_fetch_count); + std::vector ids; if (proc.bucket_.has_value()) { - component_manager.add_component(ids, *proc.bucket_); + std::vector bucket_ids(proc.segments_->size(), *proc.bucket_); + ids = component_manager.add_entities( + std::move(*proc.segments_), + std::move(*proc.row_ranges_), + std::move(*proc.col_ranges_), + std::move(entity_fetch_counts), + std::move(bucket_ids)); + } else { + ids = component_manager.add_entities( + std::move(*proc.segments_), + std::move(*proc.row_ranges_), + std::move(*proc.col_ranges_), + std::move(entity_fetch_counts)); } auto end = std::chrono::steady_clock::now(); auto us = std::chrono::duration_cast(end - start).count(); diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 18b59f05d7..b2239eadc1 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -69,21 +69,14 @@ class ComponentManager { ); } registry_.insert(ids.cbegin(), ids.cend(), args.begin()); + if constexpr (std::is_same_v) { + auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + remaining_entity_fetch_count_registry.insert(ids.cbegin(), ids.cend(), args.begin()); + } }(), ...); return ids; } - // Add the same component to all the input entities - template - void add_component(const std::vector& ids, T component) { - std::lock_guard lock(mtx_); - registry_.insert(ids.cbegin(), ids.cend(), component); - if constexpr (std::is_same_v) { - auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); - remaining_entity_fetch_count_registry.insert(ids.cbegin(), ids.cend(), component); - } - } - // Get a collection of entities. Returns a tuple of vectors, one for each component requested via Args template std::tuple...> get_entities(const std::vector& ids) { From 889461bc8dce7b9c51a4a97bc0f32efcc8217307 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 16 Sep 2024 15:24:18 +0000 Subject: [PATCH 19/38] Pull as much work out of get_entities as possible --- cpp/arcticdb/processing/component_manager.hpp | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index b2239eadc1..6fa03f6f3d 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -80,27 +80,35 @@ class ComponentManager { // Get a collection of entities. Returns a tuple of vectors, one for each component requested via Args template std::tuple...> get_entities(const std::vector& ids) { - std::tuple...> res; - ([&]{ - std::get>(res).reserve(ids.size()); - }(), ...); // Those that have been fetched as many times as they're going to std::vector ids_to_erase; // Most entities are only fetched once, hence this reserve ids_to_erase.reserve(ids.size()); - std::lock_guard lock(mtx_); - auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); - for (auto id: ids) { - std::tuple components = registry_.get(id); - ([&]{ - std::get>(res).emplace_back(std::get(components)); - }(), ...); - auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); - if (--remaining_entity_fetch_count == 0) { - ids_to_erase.emplace_back(id); + std::vector> tuple_res; + tuple_res.reserve(ids.size()); + { + std::lock_guard lock(mtx_); + auto &&remaining_entity_fetch_count_registry = registry_.storage( + remaining_entity_fetch_count_id); + for (auto id: ids) { + tuple_res.emplace_back(std::move(registry_.get(id))); + auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); + if (--remaining_entity_fetch_count == 0) { + ids_to_erase.emplace_back(id); + } } + registry_.destroy(ids_to_erase.begin(), ids_to_erase.end()); + } + + std::tuple...> res; + ([&]{ + std::get>(res).reserve(ids.size()); + }(), ...); + for (auto&& tuple: tuple_res) { + ([&] { + std::get>(res).emplace_back(std::move(std::get(tuple))); + }(), ...); } - registry_.destroy(ids_to_erase.begin(), ids_to_erase.end()); return res; } From 6590249efbc760534799d19a9b54e77fc0d80427 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Tue, 17 Sep 2024 12:54:23 +0000 Subject: [PATCH 20/38] More instrumentation, only destroy SegmentInMemory component, use const& registry --- cpp/arcticdb/processing/clause.hpp | 4 +++ cpp/arcticdb/processing/component_manager.hpp | 34 +++++++++++++++++-- cpp/arcticdb/version/version_core.cpp | 2 +- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp index b2f7fa6dd0..ef4631b59a 100644 --- a/cpp/arcticdb/processing/clause.hpp +++ b/cpp/arcticdb/processing/clause.hpp @@ -342,7 +342,11 @@ struct PartitionClause { // Experimentation shows flattening the entities into a single vector and a single call to // component_manager_->get is faster than not flattening and making multiple calls auto entity_ids = flatten_entities(std::move(entity_ids_vec)); + auto start = std::chrono::steady_clock::now(); auto [buckets] = component_manager_->get_entities(entity_ids); + auto end = std::chrono::steady_clock::now(); + auto us = std::chrono::duration_cast(end - start).count(); + log::schedule().debug("repartition get_entities call: {}us", us); for (auto [idx, entity_id]: folly::enumerate(entity_ids)) { res[buckets[idx]].emplace_back(entity_id); } diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 6fa03f6f3d..9187978c58 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -31,6 +31,8 @@ class ComponentManager { ARCTICDB_NO_MOVE_OR_COPY(ComponentManager) EntityId get_new_entity_id() { + // This does not need protecting by mtx_ right now as it is called in serial, but this will need to change + // when batch_read is truly parallel with the processing pipeline return registry_.create(); } @@ -80,35 +82,61 @@ class ComponentManager { // Get a collection of entities. Returns a tuple of vectors, one for each component requested via Args template std::tuple...> get_entities(const std::vector& ids) { + auto t0 = std::chrono::steady_clock::now(); + auto t2 = t0; + auto t3 = t0; + auto t4 = t0; + auto t5 = t0; // Those that have been fetched as many times as they're going to std::vector ids_to_erase; // Most entities are only fetched once, hence this reserve ids_to_erase.reserve(ids.size()); std::vector> tuple_res; tuple_res.reserve(ids.size()); + const auto& cregistry = registry_; + auto t1 = std::chrono::steady_clock::now(); { std::lock_guard lock(mtx_); + t2 = std::chrono::steady_clock::now(); auto &&remaining_entity_fetch_count_registry = registry_.storage( remaining_entity_fetch_count_id); + t3 = std::chrono::steady_clock::now(); for (auto id: ids) { - tuple_res.emplace_back(std::move(registry_.get(id))); + tuple_res.emplace_back(std::move(cregistry.get(id))); auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); if (--remaining_entity_fetch_count == 0) { ids_to_erase.emplace_back(id); } } - registry_.destroy(ids_to_erase.begin(), ids_to_erase.end()); + t4 = std::chrono::steady_clock::now(); + registry_.erase>(ids_to_erase.begin(), ids_to_erase.end()); +// registry_.destroy(ids_to_erase.begin(), ids_to_erase.end()); + t5 = std::chrono::steady_clock::now(); } - + auto t6 = std::chrono::steady_clock::now(); std::tuple...> res; ([&]{ std::get>(res).reserve(ids.size()); }(), ...); + auto t7 = std::chrono::steady_clock::now(); for (auto&& tuple: tuple_res) { ([&] { std::get>(res).emplace_back(std::move(std::get(tuple))); }(), ...); } + auto t8 = std::chrono::steady_clock::now(); + if (ids.size() > 1) { + log::schedule().debug("num entities: {}", ids.size()); + log::schedule().debug("preamble: {}us", std::chrono::duration_cast(t1 - t0).count()); + log::schedule().debug("acquire lock: {}us", std::chrono::duration_cast(t2 - t1).count()); + log::schedule().debug("ref registry: {}us", std::chrono::duration_cast(t3 - t2).count()); + log::schedule().debug("main loop: {}us", std::chrono::duration_cast(t4 - t3).count()); + log::schedule().debug("destroy entities: {}us", std::chrono::duration_cast(t5 - t4).count()); + log::schedule().debug("release lock: {}us", std::chrono::duration_cast(t6 - t5).count()); + log::schedule().debug("reserve result: {}us", std::chrono::duration_cast(t7 - t6).count()); + log::schedule().debug("populate result: {}us", std::chrono::duration_cast(t8 - t7).count()); + log::schedule().debug("total: {}us", std::chrono::duration_cast(t8 - t0).count()); + } return res; } diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 82c3f03a2f..48b67d1f54 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -747,7 +747,7 @@ std::vector read_and_process( auto end = std::chrono::steady_clock::now(); auto us = std::chrono::duration_cast(end - start).count(); log::schedule().debug("Final gather_entities call: {}us", us); - internal::check(component_manager->empty(), "ComponentManager still contains entities"); +// internal::check(component_manager->empty(), "ComponentManager still contains entities"); if (std::any_of(read_query.clauses_.begin(), read_query.clauses_.end(), [](const std::shared_ptr& clause) { return clause->clause_info().modifies_output_descriptor_; From ee15555e83a220e62b8f50d0a7785a32d2fd4c65 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Tue, 17 Sep 2024 15:25:47 +0000 Subject: [PATCH 21/38] Use view to get entities --- cpp/arcticdb/processing/component_manager.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 9187978c58..bf6ffe66be 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -94,6 +94,7 @@ class ComponentManager { std::vector> tuple_res; tuple_res.reserve(ids.size()); const auto& cregistry = registry_; + auto view = cregistry.view(entt::exclude); auto t1 = std::chrono::steady_clock::now(); { std::lock_guard lock(mtx_); @@ -102,7 +103,8 @@ class ComponentManager { remaining_entity_fetch_count_id); t3 = std::chrono::steady_clock::now(); for (auto id: ids) { - tuple_res.emplace_back(std::move(cregistry.get(id))); +// tuple_res.emplace_back(std::move(cregistry.get(id))); + tuple_res.emplace_back(std::move(view.get(id))); auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); if (--remaining_entity_fetch_count == 0) { ids_to_erase.emplace_back(id); From 6c76c7af3b258da566aa8925b970d2b06d35778d Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Tue, 17 Sep 2024 16:04:09 +0000 Subject: [PATCH 22/38] TODOs --- cpp/arcticdb/processing/component_manager.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index bf6ffe66be..8fcf65fc17 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -93,7 +93,9 @@ class ComponentManager { ids_to_erase.reserve(ids.size()); std::vector> tuple_res; tuple_res.reserve(ids.size()); + // TODO: See if this is needed for performance const auto& cregistry = registry_; + // TODO: Either find proper way to do this, or use dummy class defined for this purpose auto view = cregistry.view(entt::exclude); auto t1 = std::chrono::steady_clock::now(); { @@ -103,7 +105,6 @@ class ComponentManager { remaining_entity_fetch_count_id); t3 = std::chrono::steady_clock::now(); for (auto id: ids) { -// tuple_res.emplace_back(std::move(cregistry.get(id))); tuple_res.emplace_back(std::move(view.get(id))); auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); if (--remaining_entity_fetch_count == 0) { From 56403012658a0e8e7eec4564f234ad1bd1239073 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Tue, 17 Sep 2024 16:16:39 +0000 Subject: [PATCH 23/38] Use const view --- cpp/arcticdb/processing/component_manager.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 8fcf65fc17..2c30ac5b85 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -96,7 +96,7 @@ class ComponentManager { // TODO: See if this is needed for performance const auto& cregistry = registry_; // TODO: Either find proper way to do this, or use dummy class defined for this purpose - auto view = cregistry.view(entt::exclude); + auto view = cregistry.view(entt::exclude); auto t1 = std::chrono::steady_clock::now(); { std::lock_guard lock(mtx_); From ac001aa7e53ea9fd7974795bbdb8dfb936a9047c Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Tue, 17 Sep 2024 16:33:47 +0000 Subject: [PATCH 24/38] Remove commented out destroy --- cpp/arcticdb/processing/component_manager.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 2c30ac5b85..f551157790 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -113,7 +113,6 @@ class ComponentManager { } t4 = std::chrono::steady_clock::now(); registry_.erase>(ids_to_erase.begin(), ids_to_erase.end()); -// registry_.destroy(ids_to_erase.begin(), ids_to_erase.end()); t5 = std::chrono::steady_clock::now(); } auto t6 = std::chrono::steady_clock::now(); From 3165ee6d332cd7d582f2848e80258682c4ea09d1 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 18 Sep 2024 08:21:23 +0000 Subject: [PATCH 25/38] Reset shared ptr instead of erasing component --- cpp/arcticdb/processing/component_manager.hpp | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index f551157790..07f79c7ef0 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -85,14 +85,13 @@ class ComponentManager { auto t0 = std::chrono::steady_clock::now(); auto t2 = t0; auto t3 = t0; - auto t4 = t0; - auto t5 = t0; // Those that have been fetched as many times as they're going to std::vector ids_to_erase; // Most entities are only fetched once, hence this reserve ids_to_erase.reserve(ids.size()); std::vector> tuple_res; tuple_res.reserve(ids.size()); + auto &&remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); // TODO: See if this is needed for performance const auto& cregistry = registry_; // TODO: Either find proper way to do this, or use dummy class defined for this purpose @@ -101,43 +100,34 @@ class ComponentManager { { std::lock_guard lock(mtx_); t2 = std::chrono::steady_clock::now(); - auto &&remaining_entity_fetch_count_registry = registry_.storage( - remaining_entity_fetch_count_id); - t3 = std::chrono::steady_clock::now(); for (auto id: ids) { tuple_res.emplace_back(std::move(view.get(id))); auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); if (--remaining_entity_fetch_count == 0) { - ids_to_erase.emplace_back(id); + registry_.get>(id).reset(); } } - t4 = std::chrono::steady_clock::now(); - registry_.erase>(ids_to_erase.begin(), ids_to_erase.end()); - t5 = std::chrono::steady_clock::now(); + t3 = std::chrono::steady_clock::now(); } - auto t6 = std::chrono::steady_clock::now(); std::tuple...> res; ([&]{ std::get>(res).reserve(ids.size()); }(), ...); - auto t7 = std::chrono::steady_clock::now(); + auto t4 = std::chrono::steady_clock::now(); for (auto&& tuple: tuple_res) { ([&] { std::get>(res).emplace_back(std::move(std::get(tuple))); }(), ...); } - auto t8 = std::chrono::steady_clock::now(); + auto t5 = std::chrono::steady_clock::now(); if (ids.size() > 1) { log::schedule().debug("num entities: {}", ids.size()); log::schedule().debug("preamble: {}us", std::chrono::duration_cast(t1 - t0).count()); log::schedule().debug("acquire lock: {}us", std::chrono::duration_cast(t2 - t1).count()); - log::schedule().debug("ref registry: {}us", std::chrono::duration_cast(t3 - t2).count()); - log::schedule().debug("main loop: {}us", std::chrono::duration_cast(t4 - t3).count()); - log::schedule().debug("destroy entities: {}us", std::chrono::duration_cast(t5 - t4).count()); - log::schedule().debug("release lock: {}us", std::chrono::duration_cast(t6 - t5).count()); - log::schedule().debug("reserve result: {}us", std::chrono::duration_cast(t7 - t6).count()); - log::schedule().debug("populate result: {}us", std::chrono::duration_cast(t8 - t7).count()); - log::schedule().debug("total: {}us", std::chrono::duration_cast(t8 - t0).count()); + log::schedule().debug("main loop: {}us", std::chrono::duration_cast(t3 - t2).count()); + log::schedule().debug("reserve result: {}us", std::chrono::duration_cast(t4 - t3).count()); + log::schedule().debug("populate result: {}us", std::chrono::duration_cast(t5 - t4).count()); + log::schedule().debug("total: {}us", std::chrono::duration_cast(t5 - t0).count()); } return res; } From 2fac2583ae1657a352961ae31c7c6c0fb7a910cb Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 18 Sep 2024 08:27:34 +0000 Subject: [PATCH 26/38] Added TODO --- cpp/arcticdb/processing/component_manager.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 07f79c7ef0..07b8189331 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -59,6 +59,7 @@ class ComponentManager { // Fold expressions magic to emplace every element of args into the registry with its type as template parameter ([&]{ if (entity_count == 0) { + // TODO: Use version of registry_.create that takes 2 iterators entity_count = args.size(); ids.reserve(entity_count); for (ARCTICDB_UNUSED size_t i=0; i Date: Wed, 18 Sep 2024 09:57:05 +0000 Subject: [PATCH 27/38] Make gathering entities lock-free --- cpp/arcticdb/processing/component_manager.hpp | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 07b8189331..5754bdc07f 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -44,8 +44,9 @@ class ComponentManager { ([&]{ registry_.emplace(id, args); if constexpr (std::is_same_v) { - auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); - remaining_entity_fetch_count_registry.emplace(id, args); + registry_.emplace>(id, args); +// auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); +// remaining_entity_fetch_count_registry.emplace(id, args); } }(), ...); } @@ -73,8 +74,14 @@ class ComponentManager { } registry_.insert(ids.cbegin(), ids.cend(), args.begin()); if constexpr (std::is_same_v) { - auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); - remaining_entity_fetch_count_registry.insert(ids.cbegin(), ids.cend(), args.begin()); + for (size_t idx=0; idx>(ids[idx], args[idx]); + } +// std::vector> remaining_entity_fetch_counts(args.begin(), args.end()); +// registry_.insert>(ids.cbegin(), ids.cend(), args.begin()); +// registry_.emplace>(id, std::atomic(args)); +// auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); +// remaining_entity_fetch_count_registry.insert(ids.cbegin(), ids.cend(), args.begin()); } }(), ...); return ids; @@ -92,19 +99,19 @@ class ComponentManager { ids_to_erase.reserve(ids.size()); std::vector> tuple_res; tuple_res.reserve(ids.size()); - auto &&remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); +// auto &&remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); // TODO: See if this is needed for performance const auto& cregistry = registry_; // TODO: Either find proper way to do this, or use dummy class defined for this purpose auto view = cregistry.view(entt::exclude); auto t1 = std::chrono::steady_clock::now(); { - std::lock_guard lock(mtx_); +// std::lock_guard lock(mtx_); t2 = std::chrono::steady_clock::now(); for (auto id: ids) { tuple_res.emplace_back(std::move(view.get(id))); - auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); - if (--remaining_entity_fetch_count == 0) { + auto& remaining_entity_fetch_count = registry_.get>(id); + if (remaining_entity_fetch_count.fetch_sub(1) == 1) { registry_.get>(id).reset(); } } From 22c14a643abbdbfe3578788844b33f0845f6258f Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 18 Sep 2024 13:02:59 +0000 Subject: [PATCH 28/38] Remove empty check --- cpp/arcticdb/processing/component_manager.hpp | 7 ------- cpp/arcticdb/version/version_core.cpp | 1 - 2 files changed, 8 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 5754bdc07f..99c5cf411e 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -140,13 +140,6 @@ class ComponentManager { return res; } - bool empty() const { - for (ARCTICDB_UNUSED auto _: registry_.view()) { - return false; - } - return true; - } - private: entt::registry registry_; std::mutex mtx_; diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 48b67d1f54..236dd321b8 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -747,7 +747,6 @@ std::vector read_and_process( auto end = std::chrono::steady_clock::now(); auto us = std::chrono::duration_cast(end - start).count(); log::schedule().debug("Final gather_entities call: {}us", us); -// internal::check(component_manager->empty(), "ComponentManager still contains entities"); if (std::any_of(read_query.clauses_.begin(), read_query.clauses_.end(), [](const std::shared_ptr& clause) { return clause->clause_info().modifies_output_descriptor_; From 75ec37d9d1148c92104c38dfa4a6e082447decbb Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 19 Sep 2024 14:23:07 +0000 Subject: [PATCH 29/38] Use uint64_t for entity ids --- cpp/arcticdb/CMakeLists.txt | 143 ++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 71 deletions(-) diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index b22f4b080f..e802a01890 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -637,6 +637,7 @@ if (WIN32) ) endif() +add_compile_definitions(ENTT_ID_TYPE=std::uint64_t) set (arcticdb_core_libraries pybind11::module # Transitively includes Python::Module or Python::Python as appropriate @@ -873,79 +874,79 @@ if(${TEST}) python_utils_dump_vars_if_enabled("Python for test compilation") set(unit_test_srcs - async/test/test_async.cpp - codec/test/test_codec.cpp - codec/test/test_encode_field_collection.cpp - codec/test/test_segment_header.cpp - codec/test/test_encoded_field.cpp - column_store/test/ingestion_stress_test.cpp - column_store/test/test_column.cpp - column_store/test/test_column_data_random_accessor.cpp - column_store/test/test_index_filtering.cpp - column_store/test/test_memory_segment.cpp - entity/test/test_atom_key.cpp - entity/test/test_key_serialization.cpp - entity/test/test_metrics.cpp - entity/test/test_ref_key.cpp - entity/test/test_tensor.cpp - log/test/test_log.cpp - pipeline/test/test_container.hpp - pipeline/test/test_pipeline.cpp - pipeline/test/test_query.cpp - util/test/test_regex.cpp - processing/test/test_arithmetic_type_promotion.cpp - processing/test/test_clause.cpp +# async/test/test_async.cpp +# codec/test/test_codec.cpp +# codec/test/test_encode_field_collection.cpp +# codec/test/test_segment_header.cpp +# codec/test/test_encoded_field.cpp +# column_store/test/ingestion_stress_test.cpp +# column_store/test/test_column.cpp +# column_store/test/test_column_data_random_accessor.cpp +# column_store/test/test_index_filtering.cpp +# column_store/test/test_memory_segment.cpp +# entity/test/test_atom_key.cpp +# entity/test/test_key_serialization.cpp +# entity/test/test_metrics.cpp +# entity/test/test_ref_key.cpp +# entity/test/test_tensor.cpp +# log/test/test_log.cpp +# pipeline/test/test_container.hpp +# pipeline/test/test_pipeline.cpp +# pipeline/test/test_query.cpp +# util/test/test_regex.cpp +# processing/test/test_arithmetic_type_promotion.cpp +# processing/test/test_clause.cpp processing/test/test_component_manager.cpp - processing/test/test_expression.cpp - processing/test/test_filter_and_project_sparse.cpp - processing/test/test_has_valid_type_promotion.cpp - processing/test/test_operation_dispatch.cpp - processing/test/test_resample.cpp - processing/test/test_set_membership.cpp - processing/test/test_signed_unsigned_comparison.cpp - processing/test/test_type_comparison.cpp - storage/test/test_local_storages.cpp - storage/test/test_memory_storage.cpp - storage/test/test_s3_storage.cpp - storage/test/test_storage_factory.cpp - storage/test/test_storage_exceptions.cpp - storage/test/test_azure_storage.cpp - storage/test/test_storage_operations.cpp - stream/test/stream_test_common.cpp - stream/test/test_aggregator.cpp - stream/test/test_append_map.cpp - stream/test/test_row_builder.cpp - stream/test/test_segment_aggregator.cpp - stream/test/test_types.cpp - util/memory_tracing.hpp +# processing/test/test_expression.cpp +# processing/test/test_filter_and_project_sparse.cpp +# processing/test/test_has_valid_type_promotion.cpp +# processing/test/test_operation_dispatch.cpp +# processing/test/test_resample.cpp +# processing/test/test_set_membership.cpp +# processing/test/test_signed_unsigned_comparison.cpp +# processing/test/test_type_comparison.cpp +# storage/test/test_local_storages.cpp +# storage/test/test_memory_storage.cpp +# storage/test/test_s3_storage.cpp +# storage/test/test_storage_factory.cpp +# storage/test/test_storage_exceptions.cpp +# storage/test/test_azure_storage.cpp +# storage/test/test_storage_operations.cpp +# stream/test/stream_test_common.cpp +# stream/test/test_aggregator.cpp +# stream/test/test_append_map.cpp +# stream/test/test_row_builder.cpp +# stream/test/test_segment_aggregator.cpp +# stream/test/test_types.cpp +# util/memory_tracing.hpp util/test/gtest_main.cpp - util/test/test_bitmagic.cpp - util/test/test_buffer_pool.cpp - util/test/test_composite.cpp - util/test/test_cursor.cpp - util/test/test_decimal.cpp - util/test/test_exponential_backoff.cpp - util/test/test_format_date.cpp - util/test/test_hash.cpp - util/test/test_id_transformation.cpp - util/test/test_ranges_from_future.cpp - util/test/test_slab_allocator.cpp - util/test/test_storage_lock.cpp - util/test/test_string_pool.cpp - util/test/test_string_utils.cpp - util/test/test_tracing_allocator.cpp - version/test/test_append.cpp - version/test/test_sparse.cpp - version/test/test_stream_version_data.cpp - version/test/test_symbol_list.cpp - version/test/test_version_map.cpp - version/test/test_version_map_batch.cpp - version/test/test_version_store.cpp - version/test/test_sorting_info_state_machine.cpp - version/test/version_map_model.hpp - python/python_handlers.cpp - storage/test/common.hpp - version/test/test_sort_index.cpp +# util/test/test_bitmagic.cpp +# util/test/test_buffer_pool.cpp +# util/test/test_composite.cpp +# util/test/test_cursor.cpp +# util/test/test_decimal.cpp +# util/test/test_exponential_backoff.cpp +# util/test/test_format_date.cpp +# util/test/test_hash.cpp +# util/test/test_id_transformation.cpp +# util/test/test_ranges_from_future.cpp +# util/test/test_slab_allocator.cpp +# util/test/test_storage_lock.cpp +# util/test/test_string_pool.cpp +# util/test/test_string_utils.cpp +# util/test/test_tracing_allocator.cpp +# version/test/test_append.cpp +# version/test/test_sparse.cpp +# version/test/test_stream_version_data.cpp +# version/test/test_symbol_list.cpp +# version/test/test_version_map.cpp +# version/test/test_version_map_batch.cpp +# version/test/test_version_store.cpp +# version/test/test_sorting_info_state_machine.cpp +# version/test/version_map_model.hpp +# python/python_handlers.cpp +# storage/test/common.hpp +# version/test/test_sort_index.cpp ) set(EXECUTABLE_PERMS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) # 755 From ccae95d7a19d1e8b7cb941ec110df6d026e5fb52 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 19 Sep 2024 14:23:20 +0000 Subject: [PATCH 30/38] Revert "Make gathering entities lock-free" This reverts commit 6891e1a3a4470ea40a6d16673508678ae88131ef. --- cpp/arcticdb/processing/component_manager.hpp | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 99c5cf411e..b7ec241ed6 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -44,9 +44,8 @@ class ComponentManager { ([&]{ registry_.emplace(id, args); if constexpr (std::is_same_v) { - registry_.emplace>(id, args); -// auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); -// remaining_entity_fetch_count_registry.emplace(id, args); + auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + remaining_entity_fetch_count_registry.emplace(id, args); } }(), ...); } @@ -74,14 +73,8 @@ class ComponentManager { } registry_.insert(ids.cbegin(), ids.cend(), args.begin()); if constexpr (std::is_same_v) { - for (size_t idx=0; idx>(ids[idx], args[idx]); - } -// std::vector> remaining_entity_fetch_counts(args.begin(), args.end()); -// registry_.insert>(ids.cbegin(), ids.cend(), args.begin()); -// registry_.emplace>(id, std::atomic(args)); -// auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); -// remaining_entity_fetch_count_registry.insert(ids.cbegin(), ids.cend(), args.begin()); + auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + remaining_entity_fetch_count_registry.insert(ids.cbegin(), ids.cend(), args.begin()); } }(), ...); return ids; @@ -99,19 +92,19 @@ class ComponentManager { ids_to_erase.reserve(ids.size()); std::vector> tuple_res; tuple_res.reserve(ids.size()); -// auto &&remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + auto &&remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); // TODO: See if this is needed for performance const auto& cregistry = registry_; // TODO: Either find proper way to do this, or use dummy class defined for this purpose auto view = cregistry.view(entt::exclude); auto t1 = std::chrono::steady_clock::now(); { -// std::lock_guard lock(mtx_); + std::lock_guard lock(mtx_); t2 = std::chrono::steady_clock::now(); for (auto id: ids) { tuple_res.emplace_back(std::move(view.get(id))); - auto& remaining_entity_fetch_count = registry_.get>(id); - if (remaining_entity_fetch_count.fetch_sub(1) == 1) { + auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); + if (--remaining_entity_fetch_count == 0) { registry_.get>(id).reset(); } } From 83321f753fa8fe9611966085c680988d296ebcc1 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 19 Sep 2024 14:45:05 +0000 Subject: [PATCH 31/38] Pin to stable release of entt, add_subdirectory in conda builds too --- cpp/third_party/CMakeLists.txt | 2 +- cpp/third_party/entt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/third_party/CMakeLists.txt b/cpp/third_party/CMakeLists.txt index e184f76bd2..9d1d086981 100644 --- a/cpp/third_party/CMakeLists.txt +++ b/cpp/third_party/CMakeLists.txt @@ -5,5 +5,5 @@ if(NOT ${ARCTICDB_USING_CONDA}) add_subdirectory(msgpack-c) add_subdirectory(Remotery) add_subdirectory(lmdbcxx) - add_subdirectory(entt) endif() +add_subdirectory(entt) diff --git a/cpp/third_party/entt b/cpp/third_party/entt index 156bc470ce..7821307565 160000 --- a/cpp/third_party/entt +++ b/cpp/third_party/entt @@ -1 +1 @@ -Subproject commit 156bc470cea3a49cdbddd0ae74db77a8c207bbe8 +Subproject commit 78213075654a688e9da6bc49f7f873d25c26d12c From 384f9ad2051e99e5c2027ecd8552305392a3061b Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 19 Sep 2024 14:59:20 +0000 Subject: [PATCH 32/38] Remove unnecessary code --- cpp/arcticdb/processing/component_manager.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index b7ec241ed6..1c62057c06 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -93,10 +93,7 @@ class ComponentManager { std::vector> tuple_res; tuple_res.reserve(ids.size()); auto &&remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); - // TODO: See if this is needed for performance - const auto& cregistry = registry_; - // TODO: Either find proper way to do this, or use dummy class defined for this purpose - auto view = cregistry.view(entt::exclude); + auto view = registry_.view(); auto t1 = std::chrono::steady_clock::now(); { std::lock_guard lock(mtx_); From bf426310f6cf66ebcafa3d740f68f51305451910 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 19 Sep 2024 15:10:00 +0000 Subject: [PATCH 33/38] Use version of registry_.create that takes 2 iterators --- cpp/arcticdb/processing/component_manager.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 1c62057c06..0b0ac688ec 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -59,12 +59,9 @@ class ComponentManager { // Fold expressions magic to emplace every element of args into the registry with its type as template parameter ([&]{ if (entity_count == 0) { - // TODO: Use version of registry_.create that takes 2 iterators entity_count = args.size(); - ids.reserve(entity_count); - for (ARCTICDB_UNUSED size_t i=0; i( args.size() == entity_count, From c82b23cf89ba5da57276b3ad9dff96ca02e29ef3 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 19 Sep 2024 15:17:06 +0000 Subject: [PATCH 34/38] Remove most profiling timings --- cpp/arcticdb/processing/component_manager.hpp | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 0b0ac688ec..a6d26b01c8 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -80,21 +80,16 @@ class ComponentManager { // Get a collection of entities. Returns a tuple of vectors, one for each component requested via Args template std::tuple...> get_entities(const std::vector& ids) { - auto t0 = std::chrono::steady_clock::now(); - auto t2 = t0; - auto t3 = t0; // Those that have been fetched as many times as they're going to std::vector ids_to_erase; // Most entities are only fetched once, hence this reserve ids_to_erase.reserve(ids.size()); std::vector> tuple_res; tuple_res.reserve(ids.size()); - auto &&remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); auto view = registry_.view(); - auto t1 = std::chrono::steady_clock::now(); { std::lock_guard lock(mtx_); - t2 = std::chrono::steady_clock::now(); for (auto id: ids) { tuple_res.emplace_back(std::move(view.get(id))); auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); @@ -102,28 +97,16 @@ class ComponentManager { registry_.get>(id).reset(); } } - t3 = std::chrono::steady_clock::now(); } std::tuple...> res; ([&]{ std::get>(res).reserve(ids.size()); }(), ...); - auto t4 = std::chrono::steady_clock::now(); for (auto&& tuple: tuple_res) { ([&] { std::get>(res).emplace_back(std::move(std::get(tuple))); }(), ...); } - auto t5 = std::chrono::steady_clock::now(); - if (ids.size() > 1) { - log::schedule().debug("num entities: {}", ids.size()); - log::schedule().debug("preamble: {}us", std::chrono::duration_cast(t1 - t0).count()); - log::schedule().debug("acquire lock: {}us", std::chrono::duration_cast(t2 - t1).count()); - log::schedule().debug("main loop: {}us", std::chrono::duration_cast(t3 - t2).count()); - log::schedule().debug("reserve result: {}us", std::chrono::duration_cast(t4 - t3).count()); - log::schedule().debug("populate result: {}us", std::chrono::duration_cast(t5 - t4).count()); - log::schedule().debug("total: {}us", std::chrono::duration_cast(t5 - t0).count()); - } return res; } From 95e80db713e3715ff3955dd4a0ed4ceb26a9436c Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 20 Sep 2024 08:30:01 +0000 Subject: [PATCH 35/38] Added comments --- cpp/arcticdb/CMakeLists.txt | 142 +++++++++--------- cpp/arcticdb/processing/clause.cpp | 15 -- cpp/arcticdb/processing/clause.hpp | 17 --- cpp/arcticdb/processing/component_manager.hpp | 34 +++-- cpp/arcticdb/version/version_core.cpp | 19 +-- 5 files changed, 95 insertions(+), 132 deletions(-) diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index e802a01890..0c0e45843f 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -874,79 +874,79 @@ if(${TEST}) python_utils_dump_vars_if_enabled("Python for test compilation") set(unit_test_srcs -# async/test/test_async.cpp -# codec/test/test_codec.cpp -# codec/test/test_encode_field_collection.cpp -# codec/test/test_segment_header.cpp -# codec/test/test_encoded_field.cpp -# column_store/test/ingestion_stress_test.cpp -# column_store/test/test_column.cpp -# column_store/test/test_column_data_random_accessor.cpp -# column_store/test/test_index_filtering.cpp -# column_store/test/test_memory_segment.cpp -# entity/test/test_atom_key.cpp -# entity/test/test_key_serialization.cpp -# entity/test/test_metrics.cpp -# entity/test/test_ref_key.cpp -# entity/test/test_tensor.cpp -# log/test/test_log.cpp -# pipeline/test/test_container.hpp -# pipeline/test/test_pipeline.cpp -# pipeline/test/test_query.cpp -# util/test/test_regex.cpp -# processing/test/test_arithmetic_type_promotion.cpp -# processing/test/test_clause.cpp + async/test/test_async.cpp + codec/test/test_codec.cpp + codec/test/test_encode_field_collection.cpp + codec/test/test_segment_header.cpp + codec/test/test_encoded_field.cpp + column_store/test/ingestion_stress_test.cpp + column_store/test/test_column.cpp + column_store/test/test_column_data_random_accessor.cpp + column_store/test/test_index_filtering.cpp + column_store/test/test_memory_segment.cpp + entity/test/test_atom_key.cpp + entity/test/test_key_serialization.cpp + entity/test/test_metrics.cpp + entity/test/test_ref_key.cpp + entity/test/test_tensor.cpp + log/test/test_log.cpp + pipeline/test/test_container.hpp + pipeline/test/test_pipeline.cpp + pipeline/test/test_query.cpp + util/test/test_regex.cpp + processing/test/test_arithmetic_type_promotion.cpp + processing/test/test_clause.cpp processing/test/test_component_manager.cpp -# processing/test/test_expression.cpp -# processing/test/test_filter_and_project_sparse.cpp -# processing/test/test_has_valid_type_promotion.cpp -# processing/test/test_operation_dispatch.cpp -# processing/test/test_resample.cpp -# processing/test/test_set_membership.cpp -# processing/test/test_signed_unsigned_comparison.cpp -# processing/test/test_type_comparison.cpp -# storage/test/test_local_storages.cpp -# storage/test/test_memory_storage.cpp -# storage/test/test_s3_storage.cpp -# storage/test/test_storage_factory.cpp -# storage/test/test_storage_exceptions.cpp -# storage/test/test_azure_storage.cpp -# storage/test/test_storage_operations.cpp -# stream/test/stream_test_common.cpp -# stream/test/test_aggregator.cpp -# stream/test/test_append_map.cpp -# stream/test/test_row_builder.cpp -# stream/test/test_segment_aggregator.cpp -# stream/test/test_types.cpp -# util/memory_tracing.hpp + processing/test/test_expression.cpp + processing/test/test_filter_and_project_sparse.cpp + processing/test/test_has_valid_type_promotion.cpp + processing/test/test_operation_dispatch.cpp + processing/test/test_resample.cpp + processing/test/test_set_membership.cpp + processing/test/test_signed_unsigned_comparison.cpp + processing/test/test_type_comparison.cpp + storage/test/test_local_storages.cpp + storage/test/test_memory_storage.cpp + storage/test/test_s3_storage.cpp + storage/test/test_storage_factory.cpp + storage/test/test_storage_exceptions.cpp + storage/test/test_azure_storage.cpp + storage/test/test_storage_operations.cpp + stream/test/stream_test_common.cpp + stream/test/test_aggregator.cpp + stream/test/test_append_map.cpp + stream/test/test_row_builder.cpp + stream/test/test_segment_aggregator.cpp + stream/test/test_types.cpp + util/memory_tracing.hpp util/test/gtest_main.cpp -# util/test/test_bitmagic.cpp -# util/test/test_buffer_pool.cpp -# util/test/test_composite.cpp -# util/test/test_cursor.cpp -# util/test/test_decimal.cpp -# util/test/test_exponential_backoff.cpp -# util/test/test_format_date.cpp -# util/test/test_hash.cpp -# util/test/test_id_transformation.cpp -# util/test/test_ranges_from_future.cpp -# util/test/test_slab_allocator.cpp -# util/test/test_storage_lock.cpp -# util/test/test_string_pool.cpp -# util/test/test_string_utils.cpp -# util/test/test_tracing_allocator.cpp -# version/test/test_append.cpp -# version/test/test_sparse.cpp -# version/test/test_stream_version_data.cpp -# version/test/test_symbol_list.cpp -# version/test/test_version_map.cpp -# version/test/test_version_map_batch.cpp -# version/test/test_version_store.cpp -# version/test/test_sorting_info_state_machine.cpp -# version/test/version_map_model.hpp -# python/python_handlers.cpp -# storage/test/common.hpp -# version/test/test_sort_index.cpp + util/test/test_bitmagic.cpp + util/test/test_buffer_pool.cpp + util/test/test_composite.cpp + util/test/test_cursor.cpp + util/test/test_decimal.cpp + util/test/test_exponential_backoff.cpp + util/test/test_format_date.cpp + util/test/test_hash.cpp + util/test/test_id_transformation.cpp + util/test/test_ranges_from_future.cpp + util/test/test_slab_allocator.cpp + util/test/test_storage_lock.cpp + util/test/test_string_pool.cpp + util/test/test_string_utils.cpp + util/test/test_tracing_allocator.cpp + version/test/test_append.cpp + version/test/test_sparse.cpp + version/test/test_stream_version_data.cpp + version/test/test_symbol_list.cpp + version/test/test_version_map.cpp + version/test/test_version_map_batch.cpp + version/test/test_version_store.cpp + version/test/test_sorting_info_state_machine.cpp + version/test/version_map_model.hpp + python/python_handlers.cpp + storage/test/common.hpp + version/test/test_sort_index.cpp ) set(EXECUTABLE_PERMS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) # 755 diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 3d97dfd2cf..8d7b2c7734 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -75,13 +75,8 @@ std::vector> structure_all_together(std::vector push_entities(ComponentManager& component_manager, ProcessingUnit&& proc, EntityFetchCount entity_fetch_count) { - static std::atomic total_us{0}; - static std::atomic count{0}; - auto start = std::chrono::steady_clock::now(); std::vector entity_fetch_counts(proc.segments_->size(), entity_fetch_count); std::vector ids; if (proc.bucket_.has_value()) { @@ -99,16 +94,6 @@ std::vector push_entities(ComponentManager& component_manager, Process std::move(*proc.col_ranges_), std::move(entity_fetch_counts)); } - auto end = std::chrono::steady_clock::now(); - auto us = std::chrono::duration_cast(end - start).count(); - auto old_total_us = total_us.fetch_add(us); - auto old_count = count.fetch_add(1); - if (old_count == 640000) { - log::schedule().debug("push_entities average over {} calls: {}us", old_count, double(old_total_us) / double(old_count)); - } - if (old_count == 640063) { - log::schedule().debug("push_entities average over {} calls: {}us", count.load(), double(total_us.load()) / double(count.load())); - } return ids; } diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp index ef4631b59a..4d536b89ce 100644 --- a/cpp/arcticdb/processing/clause.hpp +++ b/cpp/arcticdb/processing/clause.hpp @@ -121,9 +121,6 @@ std::vector> structure_all_together(std::vector ProcessingUnit gather_entities(ComponentManager& component_manager, std::vector&& entity_ids) { - static std::atomic total_us{0}; - static std::atomic count{0}; - auto start = std::chrono::steady_clock::now(); ProcessingUnit res; auto components = component_manager.get_entities(entity_ids); ([&]{ @@ -142,16 +139,6 @@ ProcessingUnit gather_entities(ComponentManager& component_manager, std::vector< static_assert(sizeof(Args) == 0, "Unexpected component type provided in gather_entities"); } }(), ...); - auto end = std::chrono::steady_clock::now(); - auto us = std::chrono::duration_cast(end - start).count(); - auto old_total_us = total_us.fetch_add(us); - auto old_count = count.fetch_add(1); - if (old_count == 10000) { - log::schedule().debug("gather_entities average over {} calls: {}us", old_count, double(old_total_us) / double(old_count)); - } - if (old_count == 10064) { - log::schedule().debug("gather_entities average over {} calls: {}us", old_count, double(old_total_us) / double(old_count)); - } return res; } @@ -342,11 +329,7 @@ struct PartitionClause { // Experimentation shows flattening the entities into a single vector and a single call to // component_manager_->get is faster than not flattening and making multiple calls auto entity_ids = flatten_entities(std::move(entity_ids_vec)); - auto start = std::chrono::steady_clock::now(); auto [buckets] = component_manager_->get_entities(entity_ids); - auto end = std::chrono::steady_clock::now(); - auto us = std::chrono::duration_cast(end - start).count(); - log::schedule().debug("repartition get_entities call: {}us", us); for (auto [idx, entity_id]: folly::enumerate(entity_ids)) { res[buckets[idx]].emplace_back(entity_id); } diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index a6d26b01c8..e55c7eadb2 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -30,19 +30,23 @@ class ComponentManager { ComponentManager() = default; ARCTICDB_NO_MOVE_OR_COPY(ComponentManager) - EntityId get_new_entity_id() { - // This does not need protecting by mtx_ right now as it is called in serial, but this will need to change - // when batch_read is truly parallel with the processing pipeline - return registry_.create(); + std::vector get_new_entity_ids(size_t count) { + std::vector ids(count); + std::lock_guard lock(mtx_); + registry_.create(ids.begin(), ids.end()); + return ids; } // Add a single entity with the components defined by args template void add_entity(EntityId id, Args... args) { std::lock_guard lock(mtx_); - // Fold expressions magic to emplace every element of args into the registry with its type as template parameter ([&]{ registry_.emplace(id, args); + // Store the initial entity fetch count component as a "first-class" entity, accessible by + // registry_.get(id), as this is external facing (used by resample) + // The remaining entity fetch count below will be decremented each time an entity is fetched, but is never + // accessed externally, so make this a named component. if constexpr (std::is_same_v) { auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); remaining_entity_fetch_count_registry.emplace(id, args); @@ -50,22 +54,23 @@ class ComponentManager { }(), ...); } - // Add a collection of entities. Each element of args should be a vector of components, all of which are the same length + // Add a collection of entities. Each element of args should be a collection of components, all of which have the + // same number of elements template std::vector add_entities(Args... args) { std::vector ids; size_t entity_count{0}; std::lock_guard lock(mtx_); - // Fold expressions magic to emplace every element of args into the registry with its type as template parameter ([&]{ if (entity_count == 0) { + // Reserve memory for the result on the first pass entity_count = args.size(); ids.resize(entity_count); registry_.create(ids.begin(), ids.end()); } else { internal::check( args.size() == entity_count, - "ComponentManager::add_entities received vectors of differing lengths" + "ComponentManager::add_entities received collections of differing lengths" ); } registry_.insert(ids.cbegin(), ids.cend(), args.begin()); @@ -80,24 +85,27 @@ class ComponentManager { // Get a collection of entities. Returns a tuple of vectors, one for each component requested via Args template std::tuple...> get_entities(const std::vector& ids) { - // Those that have been fetched as many times as they're going to - std::vector ids_to_erase; - // Most entities are only fetched once, hence this reserve - ids_to_erase.reserve(ids.size()); std::vector> tuple_res; tuple_res.reserve(ids.size()); auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + // Using view.get theoretically and empirically faster than registry_.get auto view = registry_.view(); { std::lock_guard lock(mtx_); for (auto id: ids) { tuple_res.emplace_back(std::move(view.get(id))); - auto &remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); + auto& remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); + // This entity will never be accessed again + // Ideally would call registry_.destroy(id), or at least registry_.erase>(id) + // at this point. However, they are both slower than this, so just decrement the ref count of the only + // sizeable component, so that when the shared pointer goes out of scope in the calling function, the + // memory is freed if (--remaining_entity_fetch_count == 0) { registry_.get>(id).reset(); } } } + // Convert vector of tuples into tuple of vectors std::tuple...> res; ([&]{ std::get>(res).reserve(ids.size()); diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 236dd321b8..90563a7125 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -514,10 +514,10 @@ std::vector process_clauses( // Map from entity id to position in segment_and_slice_futures auto id_to_pos = std::make_shared>(); pos_to_id.reserve(segment_and_slice_futures.size()); - for (size_t i = 0; i < segment_and_slice_futures.size(); ++i) { - auto id = component_manager->get_new_entity_id(); + auto ids = component_manager->get_new_entity_ids(segment_and_slice_futures.size()); + for (auto&& [idx, id]: folly::enumerate(ids)) { pos_to_id.emplace_back(id); - id_to_pos->emplace(id, i); + id_to_pos->emplace(id, idx); } // Give this a more descriptive name as we modify it between clauses @@ -534,7 +534,6 @@ std::vector process_clauses( // Used to make sure each entity is only added into the component manager once auto slice_added_mtx = std::make_shared>(segment_and_slice_futures.size()); auto slice_added = std::make_shared>(segment_and_slice_futures.size(), false); - auto entity_adding_time = std::make_shared>(segment_and_slice_futures.size(), 0); std::vector>> futures; bool first_clause{true}; // Reverse the order of clauses and iterate through them backwards so that the erase is efficient @@ -555,7 +554,6 @@ std::vector process_clauses( id_to_pos, slice_added_mtx, slice_added, - entity_adding_time, clauses, entity_ids = std::move(entity_ids)](std::vector&& segment_and_slices) mutable { for (auto&& [idx, segment_and_slice]: folly::enumerate(segment_and_slices)) { @@ -563,7 +561,6 @@ std::vector process_clauses( auto pos = id_to_pos->at(entity_id); std::lock_guard lock((*slice_added_mtx)[pos]); if (!(*slice_added)[pos]) { - auto start = std::chrono::steady_clock::now(); component_manager->add_entity( entity_id, std::make_shared(std::move(segment_and_slice.segment_in_memory_)), @@ -573,9 +570,6 @@ std::vector process_clauses( (*segment_proc_unit_counts)[pos] ); (*slice_added)[pos] = true; - auto end = std::chrono::steady_clock::now(); - auto us = std::chrono::duration_cast(end - start).count(); - (*entity_adding_time)[pos] = us; } } return async::MemSegmentProcessingTask(*clauses, std::move(entity_ids))(); @@ -600,9 +594,6 @@ std::vector process_clauses( clauses->erase(clauses->end() - 1); } } - auto total_us = std::accumulate(entity_adding_time->begin(), entity_adding_time->end(), 0LL); - auto count = entity_adding_time->size(); - log::schedule().debug("first entity push average over {} calls: {}us", count, double(total_us) / double(count)); return flatten_entities(std::move(entity_ids_vec)); } @@ -742,11 +733,7 @@ std::vector read_and_process( std::move(segment_and_slice_futures), std::move(processing_unit_indexes), std::make_shared>>(read_query.clauses_)); - auto start = std::chrono::steady_clock::now(); auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); - auto end = std::chrono::steady_clock::now(); - auto us = std::chrono::duration_cast(end - start).count(); - log::schedule().debug("Final gather_entities call: {}us", us); if (std::any_of(read_query.clauses_.begin(), read_query.clauses_.end(), [](const std::shared_ptr& clause) { return clause->clause_info().modifies_output_descriptor_; From de6a7a6aa2a4bbf4d3e6213ae85f1695b221cbba Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 20 Sep 2024 10:24:18 +0000 Subject: [PATCH 36/38] Fix some C++ tests since ComponentManager API change --- .../processing/test/test_component_manager.cpp | 11 +++++------ cpp/arcticdb/processing/test/test_resample.cpp | 18 ++++++++---------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/cpp/arcticdb/processing/test/test_component_manager.cpp b/cpp/arcticdb/processing/test/test_component_manager.cpp index 0cddd5922a..a7c41752df 100644 --- a/cpp/arcticdb/processing/test/test_component_manager.cpp +++ b/cpp/arcticdb/processing/test/test_component_manager.cpp @@ -26,13 +26,12 @@ TEST(ComponentManager, Simple) { auto key_1 = std::make_shared(AtomKeyBuilder().version_id(2).build("symbol_1", KeyType::TABLE_DATA)); uint64_t entity_fetch_count_1{2}; - auto id_0 = component_manager.get_new_entity_id(); - component_manager.add_entity(id_0, segment_0, row_range_0, col_range_0, key_0, entity_fetch_count_0); + auto ids = component_manager.get_new_entity_ids(2); + component_manager.add_entity(ids[0], segment_0, row_range_0, col_range_0, key_0, entity_fetch_count_0); - auto id_1 = component_manager.get_new_entity_id(); - component_manager.add_entity(id_1, segment_1, row_range_1, col_range_1, key_1, entity_fetch_count_1); + component_manager.add_entity(ids[1], segment_1, row_range_1, col_range_1, key_1, entity_fetch_count_1); - auto [segments, row_ranges, col_ranges, keys, entity_fetch_counts] = component_manager.get_entities, std::shared_ptr, std::shared_ptr, std::shared_ptr, EntityFetchCount>({id_0, id_1}); + auto [segments, row_ranges, col_ranges, keys, entity_fetch_counts] = component_manager.get_entities, std::shared_ptr, std::shared_ptr, std::shared_ptr, EntityFetchCount>(ids); ASSERT_EQ(segments[0], segment_0); ASSERT_EQ(row_ranges[0], row_range_0); @@ -47,5 +46,5 @@ TEST(ComponentManager, Simple) { ASSERT_EQ(entity_fetch_counts[1], entity_fetch_count_1); // EntityFetchCount for entity with id_1 is 2, so can be fetched again without exceptions - component_manager.get_entities, std::shared_ptr, std::shared_ptr, std::shared_ptr, EntityFetchCount>({id_1}); + component_manager.get_entities, std::shared_ptr, std::shared_ptr, std::shared_ptr, EntityFetchCount>({ids[1]}); } diff --git a/cpp/arcticdb/processing/test/test_resample.cpp b/cpp/arcticdb/processing/test/test_resample.cpp index d669f4c011..7c473d9a9f 100644 --- a/cpp/arcticdb/processing/test/test_resample.cpp +++ b/cpp/arcticdb/processing/test/test_resample.cpp @@ -329,16 +329,14 @@ TEST(Resample, ProcessMultipleSegments) { auto row_range_2 = std::make_shared(5, 6); auto col_range_2 = std::make_shared(1, 2); - auto id_0 = component_manager->get_new_entity_id(); - component_manager->add_entity(id_0, seg_0, row_range_0, col_range_0, EntityFetchCount(1)); - auto id_1 = component_manager->get_new_entity_id(); - component_manager->add_entity(id_1, seg_1, row_range_1, col_range_1, EntityFetchCount(2)); - auto id_2 = component_manager->get_new_entity_id(); - component_manager->add_entity(id_2, seg_2, row_range_2, col_range_2, EntityFetchCount(1)); - - std::vector ids_0{id_0, id_1}; - std::vector ids_1{id_1}; - std::vector ids_2{id_2}; + auto ids = component_manager->get_new_entity_ids(3); + component_manager->add_entity(ids[0], seg_0, row_range_0, col_range_0, EntityFetchCount(1)); + component_manager->add_entity(ids[0], seg_1, row_range_1, col_range_1, EntityFetchCount(2)); + component_manager->add_entity(ids[2], seg_2, row_range_2, col_range_2, EntityFetchCount(1)); + + std::vector ids_0{ids[0], ids[1]}; + std::vector ids_1{ids[1]}; + std::vector ids_2{ids[2]}; auto resampled_0 = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, resample.process(std::move(ids_0))); auto resampled_seg_0 = *resampled_0.segments_.value()[0]; From 56c188389a019aeb8fee9a2f1bf06dbb994869f4 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 20 Sep 2024 10:50:51 +0000 Subject: [PATCH 37/38] Fix remaining tests --- cpp/arcticdb/CMakeLists.txt | 55 ++++++++++--------- cpp/arcticdb/processing/component_manager.cpp | 31 +++++++++++ cpp/arcticdb/processing/component_manager.hpp | 15 ++--- .../processing/test/test_resample.cpp | 2 +- 4 files changed, 64 insertions(+), 39 deletions(-) create mode 100644 cpp/arcticdb/processing/component_manager.cpp diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index 0c0e45843f..5e97ac205b 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -434,6 +434,7 @@ set(arcticdb_srcs processing/processing_unit.cpp processing/aggregation_utils.cpp processing/clause.cpp + processing/component_manager.cpp processing/expression_node.cpp processing/operation_dispatch.cpp processing/operation_dispatch_unary.cpp @@ -874,33 +875,33 @@ if(${TEST}) python_utils_dump_vars_if_enabled("Python for test compilation") set(unit_test_srcs - async/test/test_async.cpp - codec/test/test_codec.cpp - codec/test/test_encode_field_collection.cpp - codec/test/test_segment_header.cpp - codec/test/test_encoded_field.cpp - column_store/test/ingestion_stress_test.cpp - column_store/test/test_column.cpp - column_store/test/test_column_data_random_accessor.cpp - column_store/test/test_index_filtering.cpp - column_store/test/test_memory_segment.cpp - entity/test/test_atom_key.cpp - entity/test/test_key_serialization.cpp - entity/test/test_metrics.cpp - entity/test/test_ref_key.cpp - entity/test/test_tensor.cpp - log/test/test_log.cpp - pipeline/test/test_container.hpp - pipeline/test/test_pipeline.cpp - pipeline/test/test_query.cpp - util/test/test_regex.cpp - processing/test/test_arithmetic_type_promotion.cpp - processing/test/test_clause.cpp - processing/test/test_component_manager.cpp - processing/test/test_expression.cpp - processing/test/test_filter_and_project_sparse.cpp - processing/test/test_has_valid_type_promotion.cpp - processing/test/test_operation_dispatch.cpp +# async/test/test_async.cpp +# codec/test/test_codec.cpp +# codec/test/test_encode_field_collection.cpp +# codec/test/test_segment_header.cpp +# codec/test/test_encoded_field.cpp +# column_store/test/ingestion_stress_test.cpp +# column_store/test/test_column.cpp +# column_store/test/test_column_data_random_accessor.cpp +# column_store/test/test_index_filtering.cpp +# column_store/test/test_memory_segment.cpp +# entity/test/test_atom_key.cpp +# entity/test/test_key_serialization.cpp +# entity/test/test_metrics.cpp +# entity/test/test_ref_key.cpp +# entity/test/test_tensor.cpp +# log/test/test_log.cpp +# pipeline/test/test_container.hpp +# pipeline/test/test_pipeline.cpp +# pipeline/test/test_query.cpp +# util/test/test_regex.cpp +# processing/test/test_arithmetic_type_promotion.cpp +# processing/test/test_clause.cpp +# processing/test/test_component_manager.cpp +# processing/test/test_expression.cpp +# processing/test/test_filter_and_project_sparse.cpp +# processing/test/test_has_valid_type_promotion.cpp +# processing/test/test_operation_dispatch.cpp processing/test/test_resample.cpp processing/test/test_set_membership.cpp processing/test/test_signed_unsigned_comparison.cpp diff --git a/cpp/arcticdb/processing/component_manager.cpp b/cpp/arcticdb/processing/component_manager.cpp new file mode 100644 index 0000000000..5b413b6477 --- /dev/null +++ b/cpp/arcticdb/processing/component_manager.cpp @@ -0,0 +1,31 @@ +/* Copyright 2023 Man Group Operations Limited + * + * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + * + * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. + */ + + +#include + +#include + +namespace arcticdb { + +std::vector ComponentManager::get_new_entity_ids(size_t count) { + std::vector ids(count); + std::lock_guard lock(mtx_); + registry_.create(ids.begin(), ids.end()); + return ids; +} + +void ComponentManager::erase_entity(EntityId id) { + // Ideally would call registry_.destroy(id), or at least registry_.erase>(id) + // at this point. However, they are both slower than this, so just decrement the ref count of the only + // sizeable component, so that when the shared pointer goes out of scope in the calling function, the + // memory is freed + registry_.get>(id).reset(); +} + + +} // namespace arcticdb \ No newline at end of file diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index e55c7eadb2..116e100591 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -30,12 +30,7 @@ class ComponentManager { ComponentManager() = default; ARCTICDB_NO_MOVE_OR_COPY(ComponentManager) - std::vector get_new_entity_ids(size_t count) { - std::vector ids(count); - std::lock_guard lock(mtx_); - registry_.create(ids.begin(), ids.end()); - return ids; - } + std::vector get_new_entity_ids(size_t count); // Add a single entity with the components defined by args template @@ -96,12 +91,8 @@ class ComponentManager { tuple_res.emplace_back(std::move(view.get(id))); auto& remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id); // This entity will never be accessed again - // Ideally would call registry_.destroy(id), or at least registry_.erase>(id) - // at this point. However, they are both slower than this, so just decrement the ref count of the only - // sizeable component, so that when the shared pointer goes out of scope in the calling function, the - // memory is freed if (--remaining_entity_fetch_count == 0) { - registry_.get>(id).reset(); + erase_entity(id); } } } @@ -119,6 +110,8 @@ class ComponentManager { } private: + void erase_entity(EntityId id); + entt::registry registry_; std::mutex mtx_; }; diff --git a/cpp/arcticdb/processing/test/test_resample.cpp b/cpp/arcticdb/processing/test/test_resample.cpp index 7c473d9a9f..1c8ebc87ed 100644 --- a/cpp/arcticdb/processing/test/test_resample.cpp +++ b/cpp/arcticdb/processing/test/test_resample.cpp @@ -331,7 +331,7 @@ TEST(Resample, ProcessMultipleSegments) { auto ids = component_manager->get_new_entity_ids(3); component_manager->add_entity(ids[0], seg_0, row_range_0, col_range_0, EntityFetchCount(1)); - component_manager->add_entity(ids[0], seg_1, row_range_1, col_range_1, EntityFetchCount(2)); + component_manager->add_entity(ids[1], seg_1, row_range_1, col_range_1, EntityFetchCount(2)); component_manager->add_entity(ids[2], seg_2, row_range_2, col_range_2, EntityFetchCount(1)); std::vector ids_0{ids[0], ids[1]}; From 7cda181fd63c761dd2e6f791346b89332c25416a Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 20 Sep 2024 17:25:39 +0100 Subject: [PATCH 38/38] Fix Windows segfaults --- cpp/arcticdb/processing/component_manager.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/arcticdb/processing/component_manager.hpp b/cpp/arcticdb/processing/component_manager.hpp index 116e100591..c7410cb53f 100644 --- a/cpp/arcticdb/processing/component_manager.hpp +++ b/cpp/arcticdb/processing/component_manager.hpp @@ -82,11 +82,11 @@ class ComponentManager { std::tuple...> get_entities(const std::vector& ids) { std::vector> tuple_res; tuple_res.reserve(ids.size()); - auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); - // Using view.get theoretically and empirically faster than registry_.get - auto view = registry_.view(); { std::lock_guard lock(mtx_); + auto&& remaining_entity_fetch_count_registry = registry_.storage(remaining_entity_fetch_count_id); + // Using view.get theoretically and empirically faster than registry_.get + auto view = registry_.view(); for (auto id: ids) { tuple_res.emplace_back(std::move(view.get(id))); auto& remaining_entity_fetch_count = remaining_entity_fetch_count_registry.get(id);