Skip to content

onnx

onnx

Attributes

__doctitle__ module-attribute

__doctitle__ = 'ONNX Runtime'

logger module-attribute

logger = getLogger('inferflow.runtime.onnx')

__all__ module-attribute

__all__ = ['ONNXRuntimeMixin', 'ONNXRuntime']

Classes

ONNXRuntimeMixin

Shared ONNX runtime logic for sync and async implementations.

This mixin provides common ONNX-specific logic that is shared between synchronous and asynchronous runtime implementations. It handles:

- Execution provider selection (CPU, CUDA)
- Input precision conversion
- Output parsing
- Batch output splitting

This mixin is pure logic with no I/O operations, making it safe to reuse across sync and async implementations.

Attributes:

Name Type Description
device Any

Device configuration (provided by subclass).

precision Precision

Precision configuration (provided by subclass).

Example
# In sync runtime
class ONNXRuntime(
    ONNXRuntimeMixin, RuntimeConfigMixin, BatchableRuntime
):
    def load(self):
        providers = self._get_onnx_providers()  # Use mixin
        # ...


# In async runtime
class ONNXRuntime(
    ONNXRuntimeMixin, RuntimeConfigMixin, BatchableRuntime
):
    async def load(self):
        providers = (
            self._get_onnx_providers()
        )  # Same mixin!
        # ...
Attributes
device instance-attribute
device: Any
precision instance-attribute
precision: Precision

ONNXRuntime

ONNXRuntime(model_path: str | PathLike[str], device: str | Device, precision: Precision = FP32, warmup_iterations: int = 3, warmup_shape: tuple[int, ...] = (1, 3, 224, 224), providers: list[str] | None = None)

Bases: RuntimeConfigMixin, ONNXRuntimeMixin, BatchableRuntime[ndarray, Any]

ONNX Runtime for model inference (sync version).

Supports
  • ONNX (.onnx) models
  • CPU, CUDA execution providers
  • FP32, FP16 precision
  • Batch inference
  • Automatic warmup

This is the synchronous version. For async support, see inferflow.asyncio.runtime.onnx.ONNXRuntime.

Attributes:

Name Type Description
session InferenceSession | None

Loaded ONNX inference session (None before load()).

input_name str | None

Name of the model's input tensor.

output_names list[str] | None

Names of the model's output tensors.

providers

List of execution providers to use.

Parameters:

Name Type Description Default
model_path str | PathLike[str]

Path to ONNX model file.

required
device str | Device

Device to run inference on (e.g. "cpu" or "cuda:0").

required
precision Precision

Model precision (default: FP32).

FP32
warmup_iterations int

Number of warmup iterations (default: 3).

3
warmup_shape tuple[int, ...]

Input shape for warmup (default: (1, 3, 224, 224)).

(1, 3, 224, 224)
providers list[str] | None

ONNX execution providers (default: auto-detect).

None

Raises:

Type Description
FileNotFoundError

If model file does not exist.

ImportError

If onnxruntime is not installed.

Example
import inferflow as iff
import numpy as np

# Initialize runtime
runtime = iff.ONNXRuntime(
    model_path="model.onnx",
    device="cuda:0",
    precision=iff.Precision.FP16,
)

# Single inference
with runtime:
    input_array = np.random.randn(1, 3, 224, 224).astype(
        np.float32
    )
    output = runtime.infer(input_array)

# Batch inference
with runtime:
    batch = [
        np.random.randn(1, 3, 224, 224).astype(np.float32),
        np.random.randn(1, 3, 224, 224).astype(np.float32),
    ]
    outputs = runtime.infer_batch(batch)
Source code in inferflow/runtime/onnx.py
def __init__(
    self,
    model_path: str | os.PathLike[str],
    device: str | Device,
    precision: Precision = Precision.FP32,
    warmup_iterations: int = 3,
    warmup_shape: tuple[int, ...] = (1, 3, 224, 224),
    providers: list[str] | None = None,
):
    """Initialize the synchronous ONNX runtime.

    Stores the base runtime configuration via the parent constructor,
    resolves the ONNX execution providers, and prepares the (initially
    empty) session state that load() will populate.

    Args:
        model_path: Path to ONNX model file.
        device: Device to run inference on.
        precision: Model precision (default: FP32).
        warmup_iterations: Number of warmup iterations (default: 3).
        warmup_shape: Input shape for warmup (default: (1, 3, 224, 224)).
        providers: ONNX execution providers (default: auto-detect).
    """
    # Base config handling (path validation, device parsing, ...) is
    # delegated to the parent mixin/runtime classes.
    super().__init__(
        model_path=model_path,
        device=device,
        precision=precision,
        warmup_iterations=warmup_iterations,
        warmup_shape=warmup_shape,
    )

    # Session state is populated lazily by load(); None means "not loaded".
    self.session: ort.InferenceSession | None = None
    self.input_name: str | None = None
    self.output_names: list[str] | None = None

    # Resolve execution providers with the shared mixin logic
    # (auto-detects CPU/CUDA when providers is None).
    self.providers = self._get_onnx_providers(providers)

    logger.info(
        f"ONNXRuntime initialized: "
        f"model={self.model_path}, device={self.device}, "
        f"providers={self.providers}, precision={self.precision.value}"
    )
Attributes
providers instance-attribute
providers = _get_onnx_providers(providers)
session instance-attribute
session: InferenceSession | None = None
input_name instance-attribute
input_name: str | None = None
output_names instance-attribute
output_names: list[str] | None = None
Functions
load
load() -> None

Load ONNX model and prepare for inference.

Performs
  • Configure session options
  • Load model from disk
  • Extract input/output names
  • Warmup inference

Raises:

Type Description
FileNotFoundError

If model file does not exist.

RuntimeError

If ONNX Runtime fails to load model.

Source code in inferflow/runtime/onnx.py
def load(self) -> None:
    """Load ONNX model and prepare for inference.

    Performs:
        - Configure session options
        - Load model from disk
        - Extract input/output names
        - Warmup inference

    Raises:
        FileNotFoundError: If model file does not exist.
        RuntimeError: If ONNX Runtime fails to load model.
    """
    logger.info(f"Loading ONNX model from {self.model_path}")

    # Enable all graph-level optimizations (constant folding, node
    # fusion, layout transforms) for best inference throughput.
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

    # Create the session with the providers resolved at __init__ time.
    self.session = ort.InferenceSession(
        str(self.model_path),
        sess_options=sess_options,
        providers=self.providers,
    )

    # Cache the model's I/O tensor names for infer()/infer_batch().
    self.input_name = self.session.get_inputs()[0].name
    self.output_names = [output.name for output in self.session.get_outputs()]

    # Log the providers actually activated -- these may differ from the
    # requested list (e.g. CUDA requested but unavailable falls back to CPU).
    # Fixed: removed the stray double space after "Model loaded:".
    logger.info(
        f"Model loaded: input={self.input_name}, "
        f"outputs={self.output_names}, "
        f"providers={self.session.get_providers()}"
    )

    # Warm up so the first real inference call is not slowed by lazy
    # provider/kernel initialization.
    self._warmup()
infer
infer(input: ndarray) -> Any

Run inference on a single input.

Automatically handles
  • Converting to correct dtype

Parameters:

Name Type Description Default
input ndarray

Input numpy array.

required

Returns:

Type Description
Any

Output array or tuple of arrays (if multi-output model).

Raises:

Type Description
RuntimeError

If model is not loaded.

Example
with runtime:
    input = np.random.randn(1, 3, 224, 224).astype(
        np.float32
    )
    output = runtime.infer(input)
Source code in inferflow/runtime/onnx.py
def infer(self, input: np.ndarray) -> t.Any:
    """Run inference on a single input.

    Automatically handles:
        - Converting to correct dtype

    Args:
        input: Input numpy array.

    Returns:
        Output array or tuple of arrays (if multi-output model).

    Raises:
        RuntimeError: If model is not loaded.

    Example:
        ```python
        with runtime:
            input = np.random.randn(1, 3, 224, 224).astype(
                np.float32
            )
            output = runtime.infer(input)
        ```
    """
    # Guard: every piece of session state must have been set by load().
    loaded = (
        self.session is not None
        and self.input_name is not None
        and self.output_names is not None
    )
    if not loaded:
        raise RuntimeError("Model not loaded. Call load() first.")

    # Convert to the dtype the model expects (shared mixin helper).
    prepared = self._prepare_onnx_input(input)

    # Execute the session, then normalize the raw output list into a
    # single array or a tuple of arrays (shared mixin helper).
    raw_outputs = self.session.run(
        self.output_names, {self.input_name: prepared}
    )
    return self._parse_onnx_output(list(raw_outputs))
infer_batch
infer_batch(inputs: list[ndarray]) -> list[Any]

Run inference on a batch of inputs.

Concatenates inputs into a single batch array for efficient processing, then splits the output back into individual results.

Parameters:

Name Type Description Default
inputs list[ndarray]

List of input arrays. Each should have shape (1, C, H, W).

required

Returns:

Type Description
list[Any]

List of outputs, one per input. Each maintains batch dimension.

Raises:

Type Description
RuntimeError

If model is not loaded.

Example
with runtime:
    batch = [
        np.random.randn(1, 3, 224, 224).astype(np.float32),
        np.random.randn(1, 3, 224, 224).astype(np.float32),
    ]
    outputs = runtime.infer_batch(batch)
Source code in inferflow/runtime/onnx.py
def infer_batch(self, inputs: list[np.ndarray]) -> list[t.Any]:
    """Run inference on a batch of inputs.

    Concatenates inputs into a single batch array for efficient
    processing, then splits the output back into individual results.

    Args:
        inputs: List of input arrays. Each should have shape (1, C, H, W).

    Returns:
        List of outputs, one per input. Each maintains batch dimension.
        An empty input list yields an empty list.

    Raises:
        RuntimeError: If model is not loaded.

    Example:
        ```python
        with runtime:
            batch = [
                np.random.randn(1, 3, 224, 224).astype(np.float32),
                np.random.randn(1, 3, 224, 224).astype(np.float32),
            ]
            outputs = runtime.infer_batch(batch)
        ```
    """
    if self.session is None:
        raise RuntimeError("Model not loaded. Call load() first.")

    # Edge case: np.concatenate raises ValueError on an empty sequence.
    if not inputs:
        return []

    # Stack the individual (1, ...) arrays into one (N, ...) batch.
    batch = np.concatenate(inputs, axis=0)

    # Run batch inference. infer() already applies _prepare_onnx_input(),
    # so preparing the batch here as well (as the previous implementation
    # did) was redundant double work.
    batch_output = self.infer(batch)

    # Split the batched output back into one result per input (mixin helper).
    return self._split_onnx_batch_output(batch_output, len(inputs))
unload
unload() -> None

Unload model and free resources.

Performs
  • Release session from memory

Safe to call multiple times.

Source code in inferflow/runtime/onnx.py
def unload(self) -> None:
    """Unload model and free resources.

    Performs:
        - Release session from memory

    Safe to call multiple times.
    """
    logger.info("Unloading model")
    # Drop every reference populated by load(); the ONNX session itself
    # is released once garbage-collected. Re-running this on an already
    # unloaded runtime is harmless (attributes are simply set to None again).
    for attr in ("session", "input_name", "output_names"):
        setattr(self, attr, None)
    logger.info("Model unloaded")