13 changes: 3 additions & 10 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -81,23 +81,16 @@ std::optional<size_t> determine_dynamic_batch_size(const IODescriptor& desc,
return std::nullopt;
}

if (!desc.shapeFromIRModel.has_value() || !desc.shapeFromIRModel.value().is_dynamic()) {
// Make sure that PLUGIN batch mode is currently active
if (*desc.shapeFromCompiler.begin() != intel_npu::utils::DEFAULT_BATCH_SIZE) {
return std::nullopt;
}

if (batchSize.has_value()) {
return batchSize.value();
}

if (tensor->get_shape().empty() || *desc.shapeFromCompiler.begin() != intel_npu::utils::DEFAULT_BATCH_SIZE) {
return std::nullopt;
}

if ((*desc.shapeFromIRModel)[intel_npu::utils::BATCH_AXIS].is_dynamic()) {
return tensor->get_shape()[intel_npu::utils::BATCH_AXIS];
}

return std::nullopt;
return tensor->get_shape()[intel_npu::utils::BATCH_AXIS];
}

} // namespace
269 changes: 252 additions & 17 deletions src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -57,16 +57,31 @@ std::shared_ptr<ov::Model> create_dummy_model(const std::vector<IODescriptor>& i
ov::ParameterVector parameters;
ov::ResultVector results;

// Helper function to check if a tensor was originally dynamic
auto wasOriginallyDynamic = [](const std::unordered_set<std::string>& tensorNames) -> bool {
for (const auto& name : tensorNames) {
if (name.find("_DYNBATCH_ORIG") != std::string::npos) {
return true;
}
}
return false;
};

for (const IODescriptor& inputDescriptor : inputDescriptors) {
if (inputDescriptor.isStateInput || inputDescriptor.isStateOutput || inputDescriptor.isShapeTensor ||
inputDescriptor.isInitInputWeights || inputDescriptor.isMainInputWeights) {
continue;
}

std::shared_ptr<ov::op::v0::Parameter> parameter = std::make_shared<ov::op::v0::Parameter>(
inputDescriptor.precision,
inputDescriptor.shapeFromIRModel.has_value() ? *inputDescriptor.shapeFromIRModel
: inputDescriptor.shapeFromCompiler);
auto shape = inputDescriptor.shapeFromIRModel.has_value() ? *inputDescriptor.shapeFromIRModel
: inputDescriptor.shapeFromCompiler;

if (wasOriginallyDynamic(inputDescriptor.outputTensorNames)) {
shape[intel_npu::utils::BATCH_AXIS] = ov::Dimension(-1);
}

std::shared_ptr<ov::op::v0::Parameter> parameter =
std::make_shared<ov::op::v0::Parameter>(inputDescriptor.precision, shape);

parameter->set_friendly_name(inputDescriptor.nodeFriendlyName);
parameter->output(0).get_tensor().set_names(inputDescriptor.outputTensorNames);
@@ -86,11 +101,17 @@ std::shared_ptr<ov::Model> create_dummy_model(const std::vector<IODescriptor>& i
std::shared_ptr<ov::Node> constantDummy =
std::make_shared<ov::op::v0::Constant>(outputDescriptor.precision, CONSTANT_NODE_DUMMY_SHAPE);

const std::shared_ptr<ov::descriptor::Tensor>& tensorDummy = std::make_shared<ov::descriptor::Tensor>(
outputDescriptor.precision,
outputDescriptor.shapeFromIRModel.has_value() ? *outputDescriptor.shapeFromIRModel
: outputDescriptor.shapeFromCompiler,
outputDescriptor.outputTensorNames);
auto shape = outputDescriptor.shapeFromIRModel.has_value() ? *outputDescriptor.shapeFromIRModel
: outputDescriptor.shapeFromCompiler;

if (wasOriginallyDynamic(outputDescriptor.outputTensorNames)) {
shape[intel_npu::utils::BATCH_AXIS] = ov::Dimension(-1);
}

const std::shared_ptr<ov::descriptor::Tensor>& tensorDummy =
std::make_shared<ov::descriptor::Tensor>(outputDescriptor.precision,
shape,
outputDescriptor.outputTensorNames);

auto& result = results.emplace_back(std::make_shared<ov::op::v0::Result>(constantDummy));
result->output(0).set_tensor_ptr(tensorDummy);
@@ -516,9 +537,165 @@ ov::Any Plugin::get_property(const std::string& name, const ov::AnyMap& argument
return _properties->get_property(name, arguments);
}

// Helper function to check if shape has dynamic dimensions other than batch dimension
bool hasOtherDynamicDims(const ov::PartialShape& shape) {
for (size_t dim_idx = 1; dim_idx < shape.size(); dim_idx++) {
if (shape[dim_idx].is_dynamic()) {
return true; // Found dynamic dimension other than batch
}
}
return false;
}

bool checkModelDynamicDims(const std::shared_ptr<const ov::Model>& model) {
// Check parameters (inputs)
const auto& params = model->get_parameters();
for (const auto& param : params) {
const auto& shape = param->get_partial_shape();
if (hasOtherDynamicDims(shape)) {
return true;
}
}

// Check results (outputs)
const auto& results = model->get_results();
for (const auto& result : results) {
const auto& shape = result->get_output_partial_shape(0);
if (hasOtherDynamicDims(shape)) {
return true;
}
}

return false;
}

bool validateModelBatch(const std::shared_ptr<const ov::Model>& model, Logger logger) {
std::set<ov::Output<const ov::Node>> batchedInputs;
std::set<ov::Output<const ov::Node>> batchedOutputs;
std::set<size_t> sBatchSize;

// Limitation: Plugin batching is not supported when there are dynamic
// dimensions other than the batch dimension.
if (checkModelDynamicDims(model)) {
Contributor
As we discussed offline, the problem with the simultaneous existence of batch and dynamic dimensions is that we would have a "gap" in the plain tensor data after the end position of one batch portion and before the start position of the next.

That IMHO makes it harder to find where the next batch portion starts, and we cannot simply locate batch I in the whole tensor by computing I * get_size() / N.
Also, there is an open issue to overcome such a limitation.

What if we do not introduce that limitation at all, as it manifests a lot of brittleness in the code and introduces very specific checks like hasOtherDynamicDims?
It's a rare situation that we have N != 1 and a dynamic shape together; it only adds complexity for finding the next batch portion in a result tensor, and it will be covered soon by an additional issue.

What do you think?

Contributor Author
> What if we do not introduce that limitation at all, as it manifests a lot of brittleness in the code and introduces very specific checks like hasOtherDynamicDims?
> It's a rare situation that we have N != 1 and a dynamic shape together; it only adds complexity for finding the next batch portion in a result tensor, and it will be covered soon by an additional issue.

Sure, we can do it separately in E#179728. Thanks!
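
For context, a minimal sketch (not part of the diff) of the offset arithmetic this thread refers to, assuming a dense row-major tensor with the batch on axis 0 and equally sized batch portions; the helper name is made up for illustration:

#include <cstddef>

// Hypothetical helper: byte offset of batch element `i` in a dense tensor of
// `total_bytes` that holds `batch_size` equally sized batch portions. This is
// the "I * get_size() / N" arithmetic mentioned above.
inline std::size_t dense_batch_offset(std::size_t i, std::size_t total_bytes, std::size_t batch_size) {
    return i * (total_bytes / batch_size);
}

// With extra dynamic dimensions the per-batch portion size is no longer
// constant, so this simple arithmetic cannot locate the next batch portion,
// which is the "gap" problem described above.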

return false;
}

const auto& params = model->get_parameters();
for (size_t input_id = 0; input_id < params.size(); input_id++) {
const auto& input = params[input_id];
const auto& shape = input->get_partial_shape();
ov::Layout layout = ov::layout::get_layout(input);

// Batching on plugin is working only when batching is found on 0th dimension
if ((shape.size() &&
shape[intel_npu::utils::BATCH_AXIS].get_max_length() != intel_npu::utils::DEFAULT_BATCH_SIZE) ||
(ov::layout::has_batch(layout) && ov::layout::batch_idx(layout) == intel_npu::utils::BATCH_AXIS)) {
const auto& staticShape = shape.is_dynamic() ? shape.get_max_shape() : input->get_shape();
batchedInputs.insert(params[input_id]->output(0));

if (shape.rank().is_dynamic()) {
OPENVINO_THROW("Shapes with dynamic rank are not supported.");
} else {
sBatchSize.insert(staticShape[intel_npu::utils::BATCH_AXIS]);
}
} else {
// gather some diagnostic info
std::optional<size_t> batch_dim_index_detected;
for (size_t i = 1; i < shape.size(); i++) {
if (shape[i].has_symbol()) {
batch_dim_index_detected = i;
break;
}
}
std::stringstream sstream;
sstream << "Only networks with inputs batched by 0th dimension are supported. ";
if (batch_dim_index_detected.has_value()) {
sstream << "The batch has been detected on: " << batch_dim_index_detected.value()
<< " dimension instead. ";
} else {
sstream << "The batch hasn't been detected at all. ";
}
sstream << "Please check input id: " << input_id << " by the name: " << input->get_friendly_name()
<< ", layout: " << layout.to_string() << ", is_dynamic: " << shape.is_dynamic();
logger.info("%s", sstream.str());
return false;
}
}
for (const auto& output : model->get_results()) {
const auto& shape = output->get_output_partial_shape(0);
ov::Layout layout = ov::layout::get_layout(output);

// Batching on plugin is working only when batching is found on 0th dimension
if ((shape.size() &&
shape[intel_npu::utils::BATCH_AXIS].get_max_length() != intel_npu::utils::DEFAULT_BATCH_SIZE) ||
(ov::layout::has_batch(layout) && ov::layout::batch_idx(layout) == intel_npu::utils::BATCH_AXIS)) {
const auto& node = output->input_value(0);
const auto& staticShape = shape.is_dynamic() ? shape.get_max_shape() : output->get_shape();
batchedOutputs.insert(ov::Output<const ov::Node>(node.get_node(), node.get_index()));

if (shape.rank().is_dynamic()) {
OPENVINO_THROW("Shapes with dynamic rank are not supported.");
} else {
sBatchSize.insert(staticShape[intel_npu::utils::BATCH_AXIS]);
}
} else {
logger.info("Only networks with outputs batched by 0th dimension are supported. Please check an output by "
"the name: %s, layout: %s",
output->get_friendly_name(),
layout.to_string());
return false;
}
}
if (!batchedInputs.size() || !batchedOutputs.size()) {
logger.info(
"Only networks with inputs/outputs featuring batched dim are supported! Got inputs: %ld, outputs: %ld",
batchedInputs.size(),
batchedOutputs.size());
return false;
}

if (sBatchSize.size() != 1) {
logger.info("Batching size shall have same value for all tensors! Got unique batch sizes number: %ld",
sBatchSize.size());
return false;
}

auto node_info_printer = [&logger](const auto& ov_node, std::string nodeType) {
logger.info("%s: %s has shape value: %s",
nodeType,
ov_node.get_any_name(),
ov_node.get_partial_shape().to_string());
};

for (const auto& ov_node : batchedInputs) {
node_info_printer(ov_node, "Input");
}
for (const auto& ov_node : batchedOutputs) {
node_info_printer(ov_node, "Output");
}

return true;
}

void deBatchModel(std::shared_ptr<ov::Model>& model, ov::Dimension newBatch) {
size_t inputIdx = 0;
std::map<std::string, ov::PartialShape> newShapes;
for (auto&& item : model->get_parameters()) {
auto layout = item->get_layout();
auto partShape = item->get_partial_shape();
if (ov::layout::has_batch(layout)) {
partShape[ov::layout::batch_idx(layout)] = newBatch;
}
newShapes.emplace(item->get_friendly_name(), partShape);
inputIdx++;
}
model->reshape(newShapes);
}

std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<const ov::Model>& model,
const ov::AnyMap& properties) const {
OV_ITT_SCOPED_TASK(itt::domains::NPUPlugin, "Plugin::compile_model");
auto modelForCompilation = model->clone();

// Before going any further: if
// ... 1 - NPUW mode is activated
@@ -560,21 +737,79 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
auto device = _backend == nullptr ? nullptr : _backend->getDevice(localConfig.get<DEVICE_ID>());
localConfig.update({{ov::intel_npu::platform.name(), platform}});

if (localConfig.isAvailable(ov::intel_npu::batch_mode.name()) &&
!localConfig.has(ov::intel_npu::batch_mode.name())) {
auto updateBatchMode = [&](ov::intel_npu::BatchMode mode) {
std::stringstream strStream;
strStream << ov::intel_npu::BatchMode::AUTO;
strStream << mode;
_logger.info("Setting batching mode to %s.", strStream.str());
localConfig.update({{ov::intel_npu::batch_mode.name(), strStream.str()}});
};

if (localConfig.isAvailable(ov::intel_npu::batch_mode.name()) &&
!localConfig.has(ov::intel_npu::batch_mode.name())) {
updateBatchMode(ov::intel_npu::BatchMode::AUTO);
}

if (localConfig.isAvailable(ov::intel_npu::batch_mode.name()) && !model->get_variables().empty()) {
if (localConfig.get<BATCH_MODE>() == ov::intel_npu::BatchMode::PLUGIN) {
OPENVINO_THROW("This model contains states, thus it is not supported when handling batching on the plugin");
}

std::stringstream strStream;
strStream << ov::intel_npu::BatchMode::COMPILER;
localConfig.update({{ov::intel_npu::batch_mode.name(), strStream.str()}});
updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
}

if (localConfig.isAvailable(ov::intel_npu::batch_mode.name())) {
Contributor
Tip:
I checked, and it seems that all of the new logic requires the localConfig.isAvailable condition to be met, so it looks like we can wrap all these lines under a single if:

if (localConfig.isAvailable(ov::intel_npu::batch_mode.name())) {
    // ... all new code
}

Perhaps it improves readability.
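
A rough sketch of what that single wrapping if might look like, reusing the names already introduced in this diff (purely a suggestion, not the final code):

if (localConfig.isAvailable(ov::intel_npu::batch_mode.name())) {
    // Default to AUTO only when the user did not set the property explicitly.
    if (!localConfig.has(ov::intel_npu::batch_mode.name())) {
        updateBatchMode(ov::intel_npu::BatchMode::AUTO);
    }

    // Stateful models cannot be batched on the plugin side.
    if (!model->get_variables().empty()) {
        if (localConfig.get<BATCH_MODE>() == ov::intel_npu::BatchMode::PLUGIN) {
            OPENVINO_THROW("This model contains states, thus it is not supported when handling batching on the plugin");
        }
        updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
    }

    // ... followed by the dynamic-batch validation / de-batching logic added below ...
}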

bool autoOrPluginBatch = localConfig.get<BATCH_MODE>() == ov::intel_npu::BatchMode::PLUGIN ||
localConfig.get<BATCH_MODE>() == ov::intel_npu::BatchMode::AUTO;
if (modelForCompilation->is_dynamic()) { // Avoiding risks with static models. TODO: common solution.
try {
const bool pluginBatchingIsSupported = validateModelBatch(modelForCompilation, _logger);

if (autoOrPluginBatch && pluginBatchingIsSupported) {
_logger.info("Attempting to handle batching on the plugin side.");

// Store dynamic batch info in tensor names BEFORE reshaping
auto encodeDynamicBatchInfo = [](std::shared_ptr<ov::Model> model) {
// Encode info in input tensor names
for (auto& input : model->inputs()) {
std::string originalName = input.get_any_name();
std::string newName = originalName + "_DYNBATCH_ORIG";
input.get_tensor().set_names({newName});
}

// Encode info in output tensor names
for (auto& output : model->outputs()) {
std::string originalName = output.get_any_name();
std::string newName = originalName + "_DYNBATCH_ORIG";
output.get_tensor().set_names({newName});
}
};

try {
encodeDynamicBatchInfo(modelForCompilation);
ov::set_batch(modelForCompilation, ov::Dimension(1));
updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
} catch (const std::exception& ex) {
_logger.warning("The plugin couldn't resize a batched model due to exception: %s.\n"
"Trying to debatch it...",
ex.what());
encodeDynamicBatchInfo(modelForCompilation);
deBatchModel(modelForCompilation, ov::Dimension(1));
if (!modelForCompilation) {
OPENVINO_THROW("Cannot debatch a model");
}
_logger.info("The model has been debatched successfully");
updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
}
} else {
_logger.info("Batching will be handed by compiler.");
updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
}
} catch (const std::exception& ex) {
_logger.info("Couldn't validate and reshape the model. Batching will be handed by compiler.",
ex.what());
updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
}
}
}

// Update stepping w/ information from driver, unless provided by user or we are off-device
@@ -625,10 +860,10 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
_logger.debug("performing compile");

if (!localConfig.get<WEIGHTLESS_BLOB>()) {
graph = compiler->compile(model->clone(), localConfig);
graph = compiler->compile(modelForCompilation->clone(), localConfig);
} else {
check_weightless_cache_attribute_occurrence(model);
graph = compiler->compileWS(model->clone(), localConfig);
graph = compiler->compileWS(modelForCompilation->clone(), localConfig);
}
} catch (const std::exception& ex) {
OPENVINO_THROW(ex.what());