# Source code for draugr.numpy_utilities.datasets.splitting

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = "Christian Heider Nielsen"
__doc__ = r"""

           Created on 25/03/2020
           """

import collections
import hashlib
import re
import sys
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Iterable, OrderedDict, Sequence

import numpy

__all__ = ["SplitEnum", "SplitIndexer", "train_valid_test_split", "select_split"]

from sorcery import assigned_names


class SplitEnum(Enum):
    """The three canonical dataset splits.

    Each member's value equals its name (e.g. ``SplitEnum.training.value ==
    "training"``), so ``SplitEnum("training")`` round-trips.
    """

    # Explicit values replace sorcery.assigned_names(), which introspected the
    # assignment targets and produced exactly these name-valued members.
    training = "training"
    validation = "validation"
    testing = "testing"
class SplitIndexer:
    """Partition a dataset of a given length into training/validation/testing
    index sets, sized by (normalised) percentages.

    Counts are floored, so a few trailing permutation indices may be absorbed
    by the validation slice (it spans from the end of training to the start of
    testing) rather than matching ``validation_num`` exactly.
    """

    # Maps each SplitEnum member to its string value, e.g. training -> "training".
    default_split_names = {i: i.value for i in SplitEnum}

    def __init__(
        self,
        dataset_length: int,
        training: float = 0.7,
        validation: float = 0.2,
        testing: float = 0.1,
    ):
        """
        :param dataset_length: total number of samples to index
        :param training: relative weight of the training split
        :param validation: relative weight of the validation split
        :param testing: relative weight of the testing split
        """
        self.total_num = dataset_length
        weights = numpy.array([training, validation, testing])
        # Normalise so the three fractions sum to 1 regardless of input scale.
        self.normalised_split = weights / sum(weights)
        (
            self.training_percentage,
            self.validation_percentage,
            self.testing_percentage,
        ) = self.normalised_split
        self.training_num, self.validation_num, self.testing_num = self.unnormalised(
            dataset_length
        )

    def shuffled_indices(self) -> dict:
        """Return a fresh random permutation, partitioned per split."""
        permutation = numpy.random.permutation(self.total_num).tolist()
        pickers = {
            SplitEnum.training: self.select_train_indices,
            SplitEnum.validation: self.select_validation_indices,
            SplitEnum.testing: self.select_testing_indices,
        }
        return {split: pick(permutation) for split, pick in pickers.items()}

    def select_train_indices(self, ind: Sequence) -> Sequence:
        """Return the leading ``training_num`` entries of *ind*."""
        return ind[: self.training_num]

    def select_validation_indices(self, ind: Sequence) -> Sequence:
        """Return the middle slice of *ind* (between training and testing)."""
        if not self.validation_num:
            return []
        # When there is no testing split, validation runs to the end.
        stop = -self.testing_num if self.testing_num else None
        return ind[self.training_num : stop]

    def select_testing_indices(self, ind: Sequence) -> Sequence:
        """Return the trailing ``testing_num`` entries of *ind*."""
        return ind[-self.testing_num :] if self.testing_num else []

    def unnormalised(self, num: int, floored: bool = True) -> numpy.ndarray:
        """Scale the normalised fractions up to counts out of *num*.

        :param num: total count to apportion
        :param floored: floor each count before the int cast (default True)
        :return: int array of (training, validation, testing) counts
        """
        scaled = self.normalised_split * num
        if floored:
            scaled = numpy.floor(scaled)
        return scaled.astype(int)

    def __repr__(self) -> str:
        # Keys iterate as SplitEnum members (dict iteration yields keys).
        return str(dict(zip(self.default_split_names, self.normalised_split)))

    def select_shuffled_split_indices(self, split: SplitEnum, seed: int = 0) -> object:
        """Return the seeded-shuffle indices for *split*.

        ``split=None`` returns the whole permutation; an unknown split raises
        ``NotImplementedError``.
        """
        numpy.random.seed(seed)  # legacy global seeding, kept for reproducibility
        permutation = numpy.random.permutation(self.total_num).tolist()
        if split is None:
            return permutation
        picker = {
            SplitEnum.training: self.select_train_indices,
            SplitEnum.validation: self.select_validation_indices,
            SplitEnum.testing: self.select_testing_indices,
        }.get(split)
        if picker is None:
            raise NotImplementedError
        return picker(permutation)
def train_valid_test_split(
    categories: Dict[str, Iterable[Path]],
    *,
    validation_percentage: float = 15,  # TODO: ACCEPT AND SQUEEZE ZERO-HUNDRED TO ZERO-ONE range!
    testing_percentage: float = 0,
    verbose: bool = False,
) -> OrderedDict:
    """Deterministically split each category's files into train/valid/test.

    A stable SHA-1 hash of ``category + file name`` (with any ``_nohash_``
    suffix stripped, so augmented variants of one sample land in one split)
    is mapped to [0, 100) and compared against the percentage thresholds.

    :param categories: mapping of category name -> iterable of file paths
    :param validation_percentage: share (0-100) routed to validation
    :param testing_percentage: share (0-100) routed to testing
    :param verbose: print the input mapping when True
    :return: ordered mapping category -> {SplitEnum: [paths]}
    """
    result = collections.OrderedDict()
    if verbose:
        print(categories)
    for category, files in categories.items():
        buckets = {
            SplitEnum.training: [],
            SplitEnum.validation: [],
            SplitEnum.testing: [],
        }
        for file_name in files:
            stable_key = bytes(
                re.sub(r"_nohash_.*$", "", f"{category}{file_name.name}"), "utf8"
            )
            # Hash -> [0, 100): stable across runs, independent of ordering.
            percentage_hash = (
                int(hashlib.sha1(stable_key).hexdigest(), 16) % (sys.maxsize + 1)
            ) * (100.0 / sys.maxsize)
            if percentage_hash >= validation_percentage + testing_percentage:
                buckets[SplitEnum.training].append(file_name)
            elif percentage_hash < testing_percentage:
                buckets[SplitEnum.testing].append(file_name)
            else:
                buckets[SplitEnum.validation].append(file_name)
        result[category] = buckets
    return result
def select_split(
    data_cat_split: Dict[Any, Dict[SplitEnum, Sequence]],
    split: SplitEnum,
    verbose: bool = False,
) -> Dict[Any, Sequence]:
    """Collapse a per-category split mapping down to one chosen split.

    :param data_cat_split: category -> {SplitEnum: sequence of items}
    :param split: which split to extract for every category
    :param verbose: print the input and each extracted sequence when True
    :return: category -> list of that category's items for *split*
    """
    if verbose:
        print(data_cat_split)
    selected = {}
    for category, per_split in data_cat_split.items():
        if verbose:
            print(per_split[split])
        selected[category] = list(per_split[split])
    return selected
if __name__ == "__main__":

    def asd():
        """Smoke-test SplitIndexer on a 100-element dataset, printing each view."""
        indexer = SplitIndexer(100)
        for shown in (
            indexer.default_split_names,
            indexer.shuffled_indices(),
            indexer.select_shuffled_split_indices(SplitEnum.training),
        ):
            print(shown)
        everything = indexer.select_shuffled_split_indices(None)
        print(everything, len(everything))

    def uihsad():
        """Show that a falsy placeholder is never coerced into a SplitEnum."""
        candidate = None
        if candidate:
            candidate = SplitEnum(candidate)
        print(candidate)

    uihsad()