Source code for draugr.numpy_utilities.datasets.categorical.shallow_category

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import os

__author__ = "Christian Heider Nielsen"

from pathlib import Path
from typing import Dict, Sequence, Iterable, Union

from draugr.numpy_utilities.datasets.splitting import (
    SplitEnum,
    train_valid_test_split,
)
from draugr.numpy_utilities.datasets.defaults import DEFAULT_ACCEPTED_FILE_FORMATS
from warg import drop_unused_kws

__all__ = ["build_shallow_categorical_dataset"]


@drop_unused_kws
def build_shallow_categorical_dataset(
    directory: Union[Path, str],
    *,
    validation_percentage: float = 15,
    testing_percentage: float = 0,
    extensions: Iterable = DEFAULT_ACCEPTED_FILE_FORMATS,
    is_valid_file: callable = None,
    verbose: bool = False,
) -> Dict[str, Dict[SplitEnum, Sequence]]:
    """
    Collect sample files per category sub-folder and split them into subsets.

    Every immediate sub-directory of ``directory`` is treated as one category.
    Each category folder is walked recursively, and in every folder visited the
    files matching ``*.<extension>`` are gathered for that category. The
    resulting ``{category: [paths]}`` mapping is handed to
    ``train_valid_test_split``, whose result is returned as is.

    Categories are processed in sorted name order so that the mapping passed to
    the splitter does not depend on the listing order of the file system.

    :param directory: Root folder holding one sub-folder per category. Strings
        are converted to ``Path``.
    :param validation_percentage: Forwarded unchanged to
        ``train_valid_test_split``.
    :param testing_percentage: Forwarded unchanged to
        ``train_valid_test_split``.
    :param extensions: File extensions to look for, with or without a leading
        dot. A falsy value falls back to ``DEFAULT_ACCEPTED_FILE_FORMATS``.
    :param is_valid_file: Accepted for interface compatibility; this function
        does not currently apply it.
        NOTE(review): confirm the intended contract before wiring it in.
    :param verbose: When true, print the collected ``{category: [paths]}``
        mapping before splitting.
    :return: Whatever ``train_valid_test_split`` returns for the collected
        mapping.
    :raises FileNotFoundError: If ``directory`` does not exist.
    :raises NotADirectoryError: If ``directory`` exists but is not a directory.
    """
    if not isinstance(directory, Path):
        directory = Path(directory)

    if not extensions:
        extensions = DEFAULT_ACCEPTED_FILE_FORMATS

    if not directory.exists():
        logging.error(f"directory {directory} not found.")
        raise FileNotFoundError(f"directory {directory} not found.")

    # os.walk yields nothing for a non-directory, which would otherwise surface
    # as a bare StopIteration from next() below.
    if not directory.is_dir():
        logging.error(f"{directory} is not a directory.")
        raise NotADirectoryError(f"{directory} is not a directory.")

    # Normalise once, up front: stripping the dot *before* de-duplicating keeps
    # ".png" and "png" from being globbed twice, and consuming the iterable a
    # single time keeps one-shot iterables (generators) working for every
    # sub-directory. os.path.normcase only changes the text on Windows.
    normalised_extensions = sorted(
        {os.path.normcase(ext).lstrip(".") for ext in extensions}
    )

    # Only the immediate sub-directories of the root are categories; sorting
    # makes the category order reproducible across file systems.
    categories_dict = {
        category: [] for category in sorted(next(os.walk(str(directory)))[1])
    }

    logging.info(f"Found categories {categories_dict.keys()}")

    for c, samples in categories_dict.items():
        # os.walk also yields the category folder itself, so files placed
        # directly in it are picked up along with those in nested folders.
        for sub_directory in sorted(Path(x[0]) for x in os.walk(str(directory / c))):
            logging.info(f"Looking for samples in {sub_directory}")
            for extension in normalised_extensions:
                files = list(sub_directory.glob(f"*.{extension}"))
                logging.info(
                    f"Found {len(files)} samples of type {extension} for category {c}"
                )
                samples.extend(files)

    if verbose:
        print(categories_dict)

    return train_valid_test_split(
        categories_dict,
        testing_percentage=testing_percentage,
        validation_percentage=validation_percentage,
    )
if __name__ == "__main__":

    def asd() -> None:
        """
        Build the dataset from ``~/Data/mnist_png/training`` and print, for each
        category, the fraction of samples in the training, validation and
        testing subsets (in that order).

        :rtype: None
        """
        from draugr.visualisation import indent_lines
        from draugr.numpy_utilities.datasets.splitting import SplitEnum

        data_root = Path.home() / "Data" / "mnist_png" / "training"
        dataset = build_shallow_categorical_dataset(data_root, testing_percentage=0)

        # Fixed reporting order: training, validation, testing.
        subsets = (SplitEnum.training, SplitEnum.validation, SplitEnum.testing)

        for category, splits in dataset.items():
            counts = [len(splits[subset]) for subset in subsets]
            total = sum(counts)

            print(f"\n{category}:")
            for count in counts:
                print(indent_lines(count / total))

    asd()