diff --git a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h index 42724e933cb220..00723bd82876e1 100644 --- a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h +++ b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h @@ -82,6 +82,8 @@ class reorder_factory { std::map> _cached_reorders; }; +int64_t get_convolution_channel_count(const convolution_node& conv_node, const layout& layout, bool is_input); + class layout_optimizer { public: enum class optimization_attributes_type { diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index ede2b2bb462c91..af36c51994866b 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -112,6 +112,21 @@ std::pair, bool> reorder_factory::get_weights_reorder } } +int64_t cldnn::get_convolution_channel_count(const convolution_node& conv_node, const layout& layout, bool is_input) { + auto channel_count = layout.get_partial_shape()[1].is_static() ? layout.get_partial_shape()[1].get_length() : -1; + if (channel_count == -1) { + auto weights_layout = conv_node.weights().get_output_layout(); + if (weights_layout.is_static()) { + const auto& shape = weights_layout.get_partial_shape(); + if (is_input) + channel_count = shape[conv_node.get_groups() > 1 ? 2 : 1].get_length(); + else + channel_count = shape[conv_node.get_groups() > 1 ? 
1 : 0].get_length(); + } + } + return channel_count; +} + bool layout_optimizer::is_format_supported(program_node& node, format::type fmt) { if (node.is_type() && fmt == format::byxf) return false; @@ -250,24 +265,9 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, return false; }; - auto get_conv_channel_count = [](const convolution_node& conv_node, const layout& layout, bool is_input) -> int64_t { - auto channel_count = layout.get_partial_shape()[1].is_static() ? layout.get_partial_shape()[1].get_length() : -1; - if (channel_count == -1) { - auto weights_layout = conv_node.weights().get_output_layout(); - if (weights_layout.is_static()) { - const auto& shape = weights_layout.get_partial_shape(); - if (is_input) - channel_count = shape[conv_node.get_groups() > 1 ? 2 : 1].get_length(); - else - channel_count = shape[conv_node.get_groups() > 1 ? 1 : 0].get_length(); - } - } - return channel_count; - }; - auto& conv_node = next.as(); - auto in_channel_count = get_conv_channel_count(conv_node, prev_output_layout, true); - auto out_channel_count = get_conv_channel_count(conv_node, next_output_layout, false); + auto in_channel_count = get_convolution_channel_count(conv_node, prev_output_layout, true); + auto out_channel_count = get_convolution_channel_count(conv_node, next_output_layout, false); if ((prev.is_dynamic() || next.is_dynamic()) && (in_channel_count == -1 || out_channel_count == -1)) return false; @@ -276,7 +276,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, if (next.get_preferred_impl_type() == impl_types::onednn && ((fmt_prev == format::byxf && fmt_next == format::byxf) || (fmt_prev == format::bfyx && fmt_next == format::byxf && - (prev_dt == data_types::f16 && get_conv_channel_count(conv_node, next.get_input_layout(0), false) <= 8))) && + (prev_dt == data_types::f16 && get_convolution_channel_count(conv_node, next.get_input_layout(0), false) <= 8))) && is_input_reorder(prev, next)) 
return true; @@ -989,22 +989,9 @@ void layout_optimizer::set_onednn_dyn_conv_preferred_format(convolution_node& no return (rank <= 4) ? cldnn::format::byxf : cldnn::format::bzyxf; }; - // Helper function to get channel count safely - auto get_channel_count = [](const layout& layout) -> int64_t { - return layout.get_partial_shape()[1].is_static() ? layout.get_partial_shape()[1].get_length() : -1; - }; - // Get channel counts once - int64_t input_channels = get_channel_count(input_layout); - int64_t output_channels = get_channel_count(output_layout); - auto weights_layout = node.weights().get_output_layout(); - // Try to get channel counts from weight layout - if (input_channels == -1 && weights_layout.is_static()) { - input_channels = weights_layout.get_partial_shape()[node.get_groups() > 1 ? 2 : 1].get_length(); - } - if (output_channels == -1 && weights_layout.is_static()) { - output_channels = weights_layout.get_partial_shape()[node.get_groups() > 1 ? 1 : 0].get_length(); - } + auto input_channels = get_convolution_channel_count(node, input_layout, true); + auto output_channels = get_convolution_channel_count(node, output_layout, false); if (i8_u8_input) { // Set default input format for i8/u8 input diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index fc6ed79aca5064..5aaab1b0e803de 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -606,7 +606,13 @@ bool primitive_inst::need_reset_output_memory() const { const bool is_user_onednn_impl = user_inst->get_node().get_preferred_impl_type() == impl_types::onednn; const bool is_user_conv = user_inst->get_node().is_type(); if (is_user_conv && is_user_onednn_impl) { + auto& conv_node = user_inst->get_node().as(); auto& output_layout = _impl_params->get_output_layout(0); + auto in_channel_count = get_convolution_channel_count(conv_node, output_layout, true); + // If the channel count 
is dynamic, we cannot verify feature alignment, + // so we conservatively do the reset and return true for this condition. + if (in_channel_count == -1) + return true; auto get_feature_block_size = [](format fmt) { int feature_block_size = 1; @@ -623,7 +629,7 @@ bool primitive_inst::need_reset_output_memory() const { auto feature_block_size = get_feature_block_size(fmt); // if layout is single blocked and feature size is not aligned with the blocking size, need to reset output so that we can guarantee zero-filling // NOTE: We may improve this logic to avoid reset if we are sure that it is not "corrupted" by other layers. - if (output_layout.feature() % feature_block_size != 0) { + if (in_channel_count % feature_block_size != 0) { return true; } } diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/mem_reset_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/mem_reset_test.cpp index ce821bda8fad95..44c602ca08ace1 100644 --- a/src/plugins/intel_gpu/tests/unit/module_tests/mem_reset_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/module_tests/mem_reset_test.cpp @@ -31,6 +31,7 @@ const std::string no_bias = ""; struct mem_reset_params { ov::Dimension::value_type in_channel; + bool is_dynamic; bool need_reset; }; @@ -43,20 +44,32 @@ TEST_P(mem_reset_test, need_reset_output_memory_test) { return; tests::random_generator rg(GET_SUITE_NAME); - ov::PartialShape input_pshape = {1, p.in_channel, 64, 64}; + + ov::PartialShape target_pshape = {1, p.in_channel, 64, 64}; + ov::PartialShape input_pshape; + + if (p.is_dynamic) { + for (size_t i = 0; i < target_pshape.size(); ++i) { + input_pshape.emplace_back(ov::Dimension()); + } + input_pshape[1] = target_pshape[1]; + } else { + input_pshape = target_pshape; + } + ov::PartialShape weights_pshape = {16, p.in_channel, 3, 3}; layout in_layout{ input_pshape, data_types::f16, format::bfyx }; layout weights_layout{ weights_pshape, data_types::f16, format::bfyx }; - auto input_data = rg.generate_random_1d(in_layout.count(), 
-1, 1); - auto input_mem = engine.allocate_memory(in_layout); + auto input_data = rg.generate_random_1d(ov::shape_size(target_pshape.get_shape()), -1, 1); + auto input_mem = engine.allocate_memory({ target_pshape, data_types::f16, format::bfyx }); set_values(input_mem, input_data); auto weights_data = rg.generate_random_1d(weights_layout.count(), -1, 1); auto weights_mem = engine.allocate_memory(weights_layout); set_values(weights_mem, weights_data); - auto input1 = input_layout("input1", input_mem->get_layout()); - auto input2 = input_layout("input2", input_mem->get_layout()); + auto input1 = input_layout("input1", in_layout); + auto input2 = input_layout("input2", in_layout); auto weights = data("weights", weights_mem); auto eltw = eltwise("eltwise", {input_info("input1"), input_info("input2")}, eltwise_mode::sum); auto eltw_reorder = reorder("reorder1", input_info("eltwise"), format::b_fs_yx_fsv16, data_types::f16 ); @@ -87,13 +100,19 @@ TEST_P(mem_reset_test, need_reset_output_memory_test) { auto outputs_test_blocked = network_test_blocked.execute(); - auto reorder_inst = network_test_blocked.get_primitive("reorder1"); + // An additional reorder is added and fused when force_implementations is enabled in dynamic mode + auto target_primitive_id = p.is_dynamic ? "reorder1_0_reorder_2" : "reorder1"; + auto reorder_inst = network_test_blocked.get_primitive(target_primitive_id); ASSERT_TRUE(PrimitiveInstTestHelper::need_reset_output_memory(reorder_inst) == p.need_reset); } INSTANTIATE_TEST_SUITE_P(smoke, mem_reset_test, testing::Values( - mem_reset_params{ 9, true }, // If tensor is not packed(not aligned to 16), need_reset_output_memory == true - mem_reset_params{ 16, false } + // static + mem_reset_params{ 9, false, true }, // If tensor is not packed(not aligned to 16), need_reset_output_memory == true + mem_reset_params{ 16, false, false }, + // dynamic + mem_reset_params{ 9, true, true }, + mem_reset_params{ 16, true, false } ) );