Skip to content

I/O Operations

io

Classes

Functions

load_dataset

load_dataset(path: str | PathLike[str], name: str | None = None, format: str = 'auto', **kwargs: Any) -> Dataset

Load dataset from file or directory.

Parameters:

Name Type Description Default
path str | PathLike[str]

Path to dataset

required
name str | None

Optional name for the dataset

None
format str

Dataset format ('coco', 'yolo', or 'auto' to detect)

'auto'
**kwargs Any

Additional format-specific parameters

{}

Returns:

Type Description
Dataset

Loaded Dataset

Raises:

Type Description
ValueError

If format is unsupported or auto-detection fails

FileNotFoundError

If path doesn't exist

Source code in boxlab/dataset/io.py
def load_dataset(
    path: str | os.PathLike[str],
    name: str | None = None,
    format: str = "auto",
    **kwargs: t.Any,
) -> Dataset:
    """Read a dataset from disk with the loader registered for ``format``.

    Args:
        path: Location of the dataset (file or directory). It is converted
            to a ``pathlib.Path`` before being handed to the loader.
        name: Optional name for the dataset, forwarded to the loader.
        format: Dataset format ('coco', 'yolo', or 'auto' to detect). With
            'auto' the format is chosen by ``_detect_format``.
        **kwargs: Additional format-specific parameters, passed through to
            the loader unchanged.

    Returns:
        The Dataset returned by the selected loader.

    Raises:
        FileNotFoundError: If ``path`` doesn't exist.
        ValueError: If format is unsupported or auto-detection fails.
    """
    from pathlib import Path

    dataset_path = Path(path)
    if not dataset_path.exists():
        raise FileNotFoundError(f"Path not found: {dataset_path}")

    # "auto" is a placeholder rather than a loader key, so resolve it to a
    # concrete format name before consulting the loader registry.
    resolved = format
    if resolved == "auto":
        resolved = _detect_format(dataset_path)
        logger.info(f"Auto-detected format: {resolved}")

    # The registry signals an unknown key with KeyError; re-raise it as the
    # ValueError this function documents, listing what is available.
    try:
        loader = get_loader(resolved)
    except KeyError as e:
        raise ValueError(
            f"Unsupported format: {resolved}. Available formats: {list_loaders()}"
        ) from e

    return loader.load(dataset_path, name, **kwargs)

export_dataset

export_dataset(dataset: Dataset, output_dir: str | PathLike[str], format: str, split_ratio: SplitRatio | None = None, seed: int | None = None, naming: str | NamingStrategy = 'original', copy_images: bool = True, **kwargs: Any) -> None

Export dataset to specified format.

Parameters:

Name Type Description Default
dataset Dataset

Dataset to export

required
output_dir str | PathLike[str]

Output directory path

required
format str

Export format ('coco', 'yolo', etc.)

required
split_ratio SplitRatio | None

Optional train/val/test split ratios

None
seed int | None

Random seed for reproducibility

None
naming str | NamingStrategy

File naming strategy ('original', 'prefix', 'uuid', 'sequential') or a NamingStrategy instance

'original'
copy_images bool

Whether to copy image files

True
**kwargs Any

Additional format-specific parameters

{}

Raises:

Type Description
ValueError

If format is unsupported

Examples:

>>> # Export to COCO format
>>> export_dataset(dataset, "./output", format="coco")
>>> # Export to YOLO with splits
>>> export_dataset(
...     dataset,
...     "./output",
...     format="yolo",
...     split_ratio=SplitRatio(train=0.7, val=0.2, test=0.1),
...     naming="prefix",
... )
Source code in boxlab/dataset/io.py
def export_dataset(
    dataset: Dataset,
    output_dir: str | os.PathLike[str],
    format: str,
    split_ratio: SplitRatio | None = None,
    seed: int | None = None,
    naming: str | NamingStrategy = "original",
    copy_images: bool = True,
    **kwargs: t.Any,
) -> None:
    """Write a dataset to ``output_dir`` using the exporter for ``format``.

    Args:
        dataset: Dataset to export
        output_dir: Output directory path
        format: Export format ('coco', 'yolo', etc.), used as the key for
            the exporter lookup
        split_ratio: Optional train/val/test split ratios, forwarded to the
            exporter
        seed: Random seed for reproducibility, forwarded to the exporter
        naming: File naming strategy ('original', 'prefix', 'uuid',
            'sequential') or a NamingStrategy instance; resolved through
            ``_get_naming_strategy`` before the export starts
        copy_images: Whether to copy image files
        **kwargs: Additional format-specific parameters, passed through to
            the exporter unchanged

    Raises:
        ValueError: If format is unsupported

    Examples:
        >>> # Export to COCO format
        >>> export_dataset(dataset, "./output", format="coco")

        >>> # Export to YOLO with splits
        >>> export_dataset(
        ...     dataset,
        ...     "./output",
        ...     format="yolo",
        ...     split_ratio=SplitRatio(train=0.7, val=0.2, test=0.1),
        ...     naming="prefix",
        ... )
    """
    # Look the exporter up first so an unknown format is reported before
    # any naming-strategy resolution takes place.
    try:
        writer = get_exporter(format)
    except KeyError as e:
        raise ValueError(
            f"Unsupported format: {format}. Available formats: {list_exporters()}"
        ) from e

    strategy = _get_naming_strategy(naming)

    writer.export(
        dataset,
        output_dir,
        split_ratio=split_ratio,
        seed=seed,
        naming_strategy=strategy,
        copy_images=copy_images,
        **kwargs,
    )

get_supported_formats

get_supported_formats() -> dict[str, dict[str, Any]]

Get information about all supported formats.

Returns:

Type Description
dict[str, dict[str, Any]]

Dictionary with loader and exporter information

Examples:

>>> formats = get_supported_formats()
>>> print(formats["loaders"])
{'coco': {...}, 'yolo': {...}}
Source code in boxlab/dataset/io.py
def get_supported_formats() -> dict[str, dict[str, t.Any]]:
    """Collect the registry's description of every loader and exporter.

    Returns:
        Dictionary with two keys: ``"loaders"`` holds the result of the
        registry's ``get_loader_info()`` and ``"exporters"`` the result of
        ``get_exporter_info()``.

    Examples:
        >>> formats = get_supported_formats()
        >>> print(formats["loaders"])
        {'coco': {...}, 'yolo': {...}}
    """
    # Imported at call time, as in the rest of this module's registry access.
    from boxlab.dataset.plugins.registry import get_exporter_info
    from boxlab.dataset.plugins.registry import get_loader_info

    loader_info = get_loader_info()
    exporter_info = get_exporter_info()
    return {"loaders": loader_info, "exporters": exporter_info}

merge

merge(*datasets: Dataset, name: str = 'merged_dataset', resolve_conflicts: Literal['skip', 'rename', 'error'] = 'skip', preserve_sources: bool = True, fix_duplicates: bool = True) -> Dataset

Merge multiple datasets into one.

Parameters:

Name Type Description Default
*datasets Dataset

Datasets to merge

()
name str

Name for the merged dataset

'merged_dataset'
resolve_conflicts Literal['skip', 'rename', 'error']

How to handle category conflicts

'skip'
preserve_sources bool

Whether to preserve source information

True
fix_duplicates bool

Whether to fix duplicate category names

True

Returns:

Type Description
Dataset

Merged dataset

Source code in boxlab/dataset/io.py
def merge(
    *datasets: Dataset,
    name: str = "merged_dataset",
    resolve_conflicts: t.Literal["skip", "rename", "error"] = "skip",
    preserve_sources: bool = True,
    fix_duplicates: bool = True,
) -> Dataset:
    """Fold several datasets, in argument order, into one new dataset.

    A fresh ``Dataset`` named ``name`` is created and each input is merged
    into it in turn through ``Dataset.merge``.

    Args:
        *datasets: Datasets to merge
        name: Name for the merged dataset
        resolve_conflicts: How to handle category conflicts; forwarded to
            ``Dataset.merge``
        preserve_sources: Whether to preserve source information; forwarded
            to ``Dataset.merge``
        fix_duplicates: Whether to fix duplicate category names. When true,
            ``fix_duplicate_categories()`` is called on every input dataset
            before merging (NOTE(review): this is invoked on the caller's
            objects, so the inputs are presumably modified - confirm).

    Returns:
        Merged dataset

    Raises:
        DatasetMergeError: If no datasets are provided.
    """
    import logging

    from boxlab.exceptions import DatasetMergeError

    log = logging.getLogger(__name__)
    total = len(datasets)

    if total == 0:
        raise DatasetMergeError("No datasets provided for merging")

    log.info(f"Merging {total} datasets into '{name}'")

    # Clean up every input first so the merge loop only sees datasets whose
    # duplicate categories have already been handled.
    if fix_duplicates:
        for source in datasets:
            source.fix_duplicate_categories()

    combined = Dataset(name=name)
    for position, source in enumerate(datasets, start=1):
        log.debug(f"Merging dataset {position}/{total}: {source.name}")
        # Dataset.merge returns the result, so rebind the accumulator.
        combined = combined.merge(
            source,
            resolve_conflicts=resolve_conflicts,
            preserve_sources=preserve_sources,
        )

    log.info(f"Successfully merged {total} datasets")

    return combined

options: show_root_heading: true show_source: true heading_level: 2 members_order: source show_signature_annotations: true separate_signature: true

Overview

The I/O module provides high-level convenience functions for common dataset operations. It simplifies loading, exporting, and merging datasets with automatic format detection and sensible defaults.

Key Concepts

Automatic Format Detection

The load_dataset() function can automatically detect the format based on file structure:

  • .json files → COCO format
  • Directories with a .yaml/.yml file plus images/ and labels/ subdirectories → YOLO format

Naming Strategies

When exporting datasets, you can control how output files are named:

Strategy Pattern Example
original Keep original name image001.jpg
uuid Random UUID a1b2c3d4-e5f6-7890.jpg
uuid_prefix UUID + source prefix camera1_a1b2c3d4.jpg
sequential Sequential numbers 000001.jpg
sequential_prefix Numbers + source prefix camera1_000001.jpg

Conflict Resolution

When merging datasets with duplicate category names:

  • skip: Keep the first occurrence
  • rename: Add _other suffix to duplicates
  • error: Raise an exception

Common Workflows

Convert Between Formats

from boxlab.dataset.io import load_dataset, export_dataset

# Load COCO dataset
dataset = load_dataset("coco/instances.json", format="coco")

# Export to YOLO format
export_dataset(dataset, "output/yolo", format="yolo")

Split Dataset for Training

from boxlab.dataset.io import load_dataset, export_dataset
from boxlab.dataset.types import SplitRatio

# Load dataset
dataset = load_dataset("my_dataset/")

# Export with 70/20/10 split
split_ratio = SplitRatio(train=0.7, val=0.2, test=0.1)
export_dataset(
    dataset,
    "output/split_data",
    format="yolo",
    split_ratio=split_ratio,
    seed=42  # Reproducible split
)

Combine Multiple Datasets

from boxlab.dataset.io import load_dataset, merge, export_dataset

# Load datasets from different sources
ds1 = load_dataset("source1/")
ds2 = load_dataset("source2/")
ds3 = load_dataset("source3/")

# Merge all datasets
merged = merge(ds1, ds2, ds3, name="combined_dataset")

# Export merged dataset
export_dataset(merged, "output/merged", format="coco")

Format Support Discovery

List Available Formats

from boxlab.dataset.io import get_supported_formats

formats = get_supported_formats()

print("Available Loaders:")
for name, info in formats["loaders"].items():
    print(f"  • {name}: {info['description']}")

print("\nAvailable Exporters:")
for name, info in formats["exporters"].items():
    print(f"  • {name}: {info['description']}")

Check Format Capabilities

from boxlab.dataset.io import get_supported_formats

formats = get_supported_formats()

# Check COCO loader extensions
coco_info = formats["loaders"]["coco"]
print(f"COCO supports: {coco_info['supported_extensions']}")

# Check YOLO exporter defaults
yolo_info = formats["exporters"]["yolo"]
print(f"YOLO defaults: {yolo_info['default_config']}")

Error Handling

Handle Format Detection Failures

from boxlab.dataset.io import load_dataset

try:
    # Try auto-detection
    dataset = load_dataset("unknown_structure/")
except ValueError as e:
    print(f"Auto-detection failed: {e}")
    # Fall back to explicit format
    dataset = load_dataset("unknown_structure/", format="yolo")

Handle Merge Conflicts

from boxlab.dataset.io import merge
from boxlab.exceptions import CategoryConflictError

try:
    merged = merge(
        ds1, ds2,
        resolve_conflicts="error"  # Strict mode
    )
except CategoryConflictError as e:
    print(f"Category conflict: {e}")
    # Retry with automatic resolution
    merged = merge(ds1, ds2, resolve_conflicts="rename")

See Also