Source code for draugr.numpy_utilities.datasets.categorical.deep_category
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import os
__author__ = "Christian Heider Nielsen"
from pathlib import Path
from draugr.numpy_utilities.datasets.splitting import train_valid_test_split
from draugr.numpy_utilities.datasets.defaults import DEFAULT_ACCEPTED_FILE_FORMATS
from typing import Iterable, Union
from warg import drop_unused_kws
__all__ = ["build_deep_categorical_dataset"]
@drop_unused_kws
def build_deep_categorical_dataset(
    directory: Union[Path, str],
    *,
    validation_percentage: float = 15,
    testing_percentage: float = 0,
    extensions: Iterable = DEFAULT_ACCEPTED_FILE_FORMATS,
    is_valid_file: callable = None,
) -> dict:
    """
    Collect files per category from a directory tree and split them.

    Every directory at or below ``directory`` that directly contains at least
    one file becomes a category, keyed by the directory's base name. For each
    category, files matching ``extensions`` are gathered from the category
    directory and from all of its sub directories. The resulting mapping of
    category name to file paths is handed to ``train_valid_test_split``.

    Note:
        Categories are keyed by base name only, so two directories with the
        same name at different places in the tree share one key; the one
        walked last supplies the files.

    Args:
        directory: Root folder to scan; a ``str`` is converted to ``Path``.
        validation_percentage: Forwarded unchanged to
            ``train_valid_test_split``.
        testing_percentage: Forwarded unchanged to ``train_valid_test_split``.
        extensions: File extensions to collect, with or without a leading dot.
        is_valid_file: Accepted for interface compatibility; not applied by
            this function at present.

    Returns:
        Whatever ``train_valid_test_split`` returns for the collected mapping
        (presumably one entry per category holding its training, validation
        and testing files -- confirm against that function).

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    if not isinstance(directory, Path):
        directory = Path(directory)

    if not directory.exists():
        logging.error(f"Image directory {directory} not found.")
        raise FileNotFoundError(f"Image directory {directory} not found.")

    # Map category name -> directory path for every directory that directly
    # holds files. os.path.basename is separator-aware, unlike split("/").
    category_roots = {
        os.path.basename(path): path
        for path, _sub_dirs, files in os.walk(str(directory))
        if files
    }
    categories_dict = {label: [] for label in category_roots}

    # Normalise the extensions once; stripping the dot before de-duplicating
    # keeps ".png" and "png" from globbing the same files twice.
    suffixes = sorted({os.path.normcase(ext).lstrip(".") for ext in extensions})

    for label, path in category_roots.items():
        for sub_directory in sorted(Path(x[0]) for x in os.walk(str(path))):
            logging.info(f"Looking for images in {sub_directory}")
            for suffix in suffixes:
                # Path.glob yields the matching paths; joining the pattern
                # onto the path (as before) produced a single non-iterable
                # Path and made list.extend raise TypeError.
                categories_dict[label].extend(
                    sorted(
                        match
                        for match in sub_directory.glob(f"*.{suffix}")
                        if match.is_file()
                    )
                )

    return train_valid_test_split(
        categories_dict,
        testing_percentage=testing_percentage,
        validation_percentage=validation_percentage,
    )
if __name__ == "__main__":

    def aiusdj() -> None:
        """
        Print, per category, the fraction of files in each split.

        Builds the dataset from ``~/Data/mnist_png/training`` and prints the
        training, validation and testing shares of every category.

        :rtype: None
        """
        from draugr.visualisation import indent_lines
        from draugr.numpy_utilities.datasets.splitting import SplitEnum

        a = build_deep_categorical_dataset(
            Path.home() / "Data" / "mnist_png" / "training", testing_percentage=0
        )

        for k in a:
            num_training = len(a[k][SplitEnum.training])
            num_validation = len(a[k][SplitEnum.validation])
            num_testing = len(a[k][SplitEnum.testing])
            total = num_training + num_validation + num_testing

            print(f"\n{k}:")
            if total == 0:
                # A category without any files has no meaningful fractions;
                # skip it rather than dividing by zero.
                print(indent_lines("no files found"))
                continue

            print(indent_lines(num_training / total))
            print(indent_lines(num_validation / total))
            print(indent_lines(num_testing / total))

    aiusdj()