Source code for mialab.utilities.file_access_utilities

"""This module contains utility functions and classes for accessing the file system."""
import abc
import enum
import os
import typing as t

import mialab.data.structure as structure


class FilePathGenerator(metaclass=abc.ABCMeta):
    """Abstract base class of all file path generators.

    A file path generator translates a human readable data identifier into the path of the corresponding
    data file. :py:class:`FileSystemDataCrawler` relies on an implementation of this class to obtain the
    paths from which the data can be loaded.
    """

    @staticmethod
    @abc.abstractmethod
    def get_full_file_path(id_: str, root_dir: str, file_key, file_extension: str) -> str:
        """Gets the full file path for a data file.

        Args:
            id_ (str): The data's identification.
            root_dir (str): The data file's root directory.
            file_key (object): A human readable identifier used to identify the data file.
            file_extension (str): The data's file extension.

        Returns:
            str: The data's full file path.

        Raises:
            NotImplementedError: Always, because the method has to be overridden by a concrete subclass.
        """
        raise NotImplementedError
class BrainImageFilePathGenerator(FilePathGenerator):
    """Represents a brain image file path generator.

    The generator maps a member of ``structure.BrainImageTypes`` to the file name used for that kind of
    image inside a data directory and returns the resulting path, which allows to load the image.
    """

    def __init__(self):
        """Initializes a new instance of the BrainImageFilePathGenerator class."""

    @staticmethod
    def get_full_file_path(id_: str, root_dir: str, file_key, file_extension: str) -> str:
        """Gets the full file path for an image.

        Args:
            id_ (str): The image identification (not used by this generator).
            root_dir (str): The image's root directory.
            file_key (object): A human readable identifier used to identify the image.
            file_extension (str): The image's file extension.

        Returns:
            str: The image's full file path.

        Raises:
            ValueError: If ``file_key`` is none of the handled image types.
        """
        image_types = structure.BrainImageTypes

        if file_key == image_types.T1w:
            return os.path.join(root_dir, 'T1native' + file_extension)
        if file_key == image_types.T2w:
            return os.path.join(root_dir, 'T2native' + file_extension)
        if file_key == image_types.GroundTruth:
            return os.path.join(root_dir, 'labels_native' + file_extension)
        if file_key == image_types.BrainMask:
            return os.path.join(root_dir, 'Brainmasknative' + file_extension)
        if file_key == image_types.RegistrationTransform:
            # the transform has a fixed file name; file_extension is not applied to it
            return os.path.join(root_dir, 'affine.txt')

        raise ValueError('Unknown key')
class DirectoryFilter(metaclass=abc.ABCMeta):
    """Abstract base class of all directory filters.

    :py:class:`FileSystemDataCrawler` hands the list of directories it found to an implementation of this
    class and continues with the list the filter returns.
    """

    @staticmethod
    @abc.abstractmethod
    def filter_directories(dirs: t.List[str]) -> t.List[str]:
        """Filters a list of directories.

        Args:
            dirs (List[str]): A list of directories.

        Returns:
            List[str]: The filtered list of directories.

        Raises:
            NotImplementedError: Always, because the method has to be overridden by a concrete subclass.
        """
        raise NotImplementedError
class DataDirectoryFilter(DirectoryFilter):
    """Represents a data directory filter.

    The filter currently keeps every directory, i.e., the list of directories is returned unchanged.
    """

    def __init__(self):
        """Initializes a new instance of the DataDirectoryFilter class."""

    @staticmethod
    def filter_directories(dirs: t.List[str]) -> t.List[str]:
        """Filters a list of directories.

        Args:
            dirs (List[str]): A list of directories.

        Returns:
            List[str]: The filtered list of directories (at the moment the very same list that was passed in).
        """
        # No filtering is applied for now. Directories could be excluded like this:
        #     return [dir_ for dir_ in dirs if 'atlas' not in dir_.lower()]
        return dirs
class FileSystemDataCrawler:
    """Represents a file system data crawler.

    The crawler inspects the direct subdirectories of a root directory, keeps those that contain at least one
    entry ending with the data file extension, and builds for each of them a dictionary with the paths of the
    requested data files. The result is stored in the ``data`` attribute.

    Examples:
        Suppose we have the following directory structure::

            /path/to/root_dir
                ./Patient1
                    ./Image.mha
                    ./GroundTruth.mha
                    ./some_text_file.txt
                ./Patient2
                    ./Image.mha
                    ./GroundTruth.mha
                    ./GroundTruthRater2.mha
                ./Atlas
                    ./Atlas.mha

        We can use the following code to load the images `Image.mha` and `GroundTruth.mha`
        in the directories `Patient1` and `Patient2`:

        >>> class MyImgType(enum.Enum):
        >>>     T1 = 1
        >>>     GroundTruth = 2
        >>>
        >>> class MyFilePathGenerator(FilePathGenerator):
        >>>     @staticmethod
        >>>     def get_full_file_path(id_: str, root_dir: str, file_key, file_extension: str) -> str:
        >>>         if file_key == MyImgType.T1:
        >>>             file_name = 'Image'
        >>>         elif file_key == MyImgType.GroundTruth:
        >>>             file_name = 'GroundTruth'
        >>>         else:
        >>>             raise ValueError('Unknown key')
        >>>
        >>>         return os.path.join(root_dir, file_name + file_extension)
        >>>
        >>> class MyDirFilter(DirectoryFilter):
        >>>     @staticmethod
        >>>     def filter_directories(dirs: t.List[str]) -> t.List[str]:
        >>>         return sorted([dir_ for dir_ in dirs if 'patient' in dir_.lower()])
        >>>
        >>> crawler = FileSystemDataCrawler('/path/to/root_dir',
        >>>                                 [MyImgType.T1, MyImgType.GroundTruth],
        >>>                                 MyFilePathGenerator(),
        >>>                                 MyDirFilter(),
        >>>                                 '.mha')
        >>> for id_, path in crawler.data.items():
        >>>     print(id_, path)

        Patient1 {'Patient1': '/path/to/root_dir/Patient1',
                  <MyImgType.T1: 1>: '/path/to/root_dir/Patient1/Image.mha',
                  <MyImgType.GroundTruth: 2>: '/path/to/root_dir/Patient1/GroundTruth.mha'}
        Patient2 {'Patient2': '/path/to/root_dir/Patient2',
                  <MyImgType.T1: 1>: '/path/to/root_dir/Patient2/Image.mha',
                  <MyImgType.GroundTruth: 2>: '/path/to/root_dir/Patient2/GroundTruth.mha'}
    """

    def __init__(self,
                 root_dir: str,
                 file_keys: list,
                 file_path_generator: 'FilePathGenerator',
                 dir_filter: t.Optional['DirectoryFilter'] = None,
                 file_extension: str = '.nii.gz'):
        """Initializes a new instance of the FileSystemDataCrawler class.

        The directories are crawled right away, so ``data`` is populated when the constructor returns.

        Args:
            root_dir (str): The path to the root directory, which contains subdirectories with the data.
            file_keys (list): A list of objects, which represent human readable data identifiers
                (one identifier for each data file to crawl).
            file_path_generator (FilePathGenerator): A file path generator, which converts a human readable
                data identifier to a data file path.
            dir_filter (DirectoryFilter): A directory filter, which filters a list of directories.
                If None, all subdirectories are considered.
            file_extension (str): The data file extension (with or without dot).

        Raises:
            ValueError: If root_dir is not an existing directory.
        """
        super().__init__()

        self.root_dir = root_dir
        self.dir_filter = dir_filter
        self.file_keys = file_keys
        self.file_path_generator = file_path_generator
        # normalize the extension such that it always starts with a dot
        self.file_extension = file_extension if file_extension.startswith('.') else '.' + file_extension

        # dict with key=id (i.e., directory name), value=dict with key=file_keys and value=path to file
        self.data = {}

        # dict with key=id (i.e., directory name), value=path to data directory
        data_dir = self._crawl_directories()
        self._crawl_data(data_dir)

    def _crawl_data(self, data_dir: dict):
        """Crawls the data inside the data directories and fills ``data``.

        Args:
            data_dir (dict): A dictionary where the keys are the directory names (ids) and the values the full
                paths to the directories.
        """
        for id_, path in data_dir.items():
            data_dict = {id_: path}  # init dict with id_ pointing to path
            for file_key in self.file_keys:
                data_dict[file_key] = self.file_path_generator.get_full_file_path(id_, path, file_key,
                                                                                  self.file_extension)
            self.data[id_] = data_dict

    def _crawl_directories(self) -> dict:
        """Crawls the directories, which contain data.

        Returns:
            dict: A dictionary where the keys are the directory names and the values the full path to the
            directory. Only directories containing at least one entry that ends with the data file extension
            are included.

        Raises:
            ValueError: If the root directory does not exist.
        """
        if not os.path.isdir(self.root_dir):
            raise ValueError('root_dir {} does not exist'.format(self.root_dir))

        # Search the root directory for data directories. The names are sorted because the order in which
        # the operating system lists directory entries is arbitrary; sorting makes the crawl reproducible.
        # Listing the directory directly (instead of next(os.walk(...))) lets an unreadable root directory
        # surface as the underlying OSError rather than as a bare StopIteration.
        data_dirs = sorted(name for name in os.listdir(self.root_dir)
                           if os.path.isdir(os.path.join(self.root_dir, name)))

        if self.dir_filter is not None:
            # filter the data directories; the order returned by the filter is kept
            data_dirs = self.dir_filter.filter_directories(data_dirs)

        crawled = {}
        for name in data_dirs:
            path = os.path.join(self.root_dir, name)
            if self._contains_data_file(path):
                crawled[name] = path
        return crawled

    def _contains_data_file(self, directory: str) -> bool:
        """Checks whether a directory contains at least one entry ending with the data file extension."""
        return any(entry.endswith(self.file_extension) for entry in os.listdir(directory))