# Source code for mialab.utilities.file_access_utilities

"""This module contains utility functions and classes for accessing the file system."""
import abc
import enum
import os
import typing as t

import mialab.data.structure as structure


class FilePathGenerator(metaclass=abc.ABCMeta):
    """Represents an abstract file path generator.

    This class is used in :py:class:`FileSystemDataCrawler` to convert a human-readable data identifier
    to a data file path, which allows the data to be loaded.
    """

    @staticmethod
    @abc.abstractmethod
    def get_full_file_path(id_: str, root_dir: str, file_key, file_extension: str) -> str:
        """Gets the full file path for a data file.

        Args:
            id_ (str): The data's identification. :py:class:`FileSystemDataCrawler` passes the name of
                the data directory here.
            root_dir (str): The data file's root directory. :py:class:`FileSystemDataCrawler` passes the
                full path of the data directory here.
            file_key (object): A human-readable identifier used to identify the data file.
            file_extension (str): The data's file extension.

        Returns:
            str: The data's full file path.

        Raises:
            NotImplementedError: Always; concrete subclasses have to override this method.
        """
        raise NotImplementedError()


class BrainImageFilePathGenerator(FilePathGenerator):
    """Represents a brain image file path generator.

    The generator is used to convert a human-readable image identifier to an image file path,
    which allows the image to be loaded.
    """

    def __init__(self):
        """Initializes a new instance of the BrainImageFilePathGenerator class."""

    @staticmethod
    def get_full_file_path(id_: str, root_dir: str, file_key, file_extension: str) -> str:
        """Gets the full file path for an image.

        Args:
            id_ (str): The image identification (not used to build the path).
            root_dir (str): The image's root directory.
            file_key (object): A human-readable identifier used to identify the image. It is compared
                against the members of ``structure.BrainImageTypes``.
            file_extension (str): The image's file extension. It is ignored for the registration
                transform, whose file name is fixed.

        Returns:
            str: The image's full file path.

        Raises:
            ValueError: If ``file_key`` does not match any of the supported image types.
        """
        image_types = structure.BrainImageTypes

        # image type -> file name without extension, checked in this order
        file_stems = (
            (image_types.T1w, 'T1native'),
            (image_types.T2w, 'T2native'),
            (image_types.GroundTruth, 'labels_native'),
            (image_types.BrainMask, 'Brainmasknative'),
        )
        for image_type, file_stem in file_stems:
            if file_key == image_type:
                return os.path.join(root_dir, file_stem + file_extension)

        # the registration transform has a fixed file name, independent of file_extension
        if file_key == image_types.RegistrationTransform:
            return os.path.join(root_dir, 'affine.txt')

        raise ValueError('Unknown key')


class DirectoryFilter(metaclass=abc.ABCMeta):
    """Represents an abstract directory filter.

    This class is used in :py:class:`FileSystemDataCrawler` to filter a list of directories.
    """

    @staticmethod
    @abc.abstractmethod
    def filter_directories(dirs: t.List[str]) -> t.List[str]:
        """Filters a list of directories.

        Args:
            dirs (List[str]): A list of directories. :py:class:`FileSystemDataCrawler` passes the names
                of the subdirectories of its root directory here.

        Returns:
            List[str]: The filtered list of directories.

        Raises:
            NotImplementedError: Always; concrete subclasses have to override this method.
        """
        raise NotImplementedError()


class DataDirectoryFilter(DirectoryFilter):
    """Represents a data directory filter.

    The filter is used to select the directories that are treated as data directories.
    Currently, it does not remove anything and returns the list of directories unchanged.
    """

    def __init__(self):
        """Initializes a new instance of the DataDirectoryFilter class."""

    @staticmethod
    def filter_directories(dirs: t.List[str]) -> t.List[str]:
        """Filters a list of directories.

        Args:
            dirs (List[str]): A list of directories.

        Returns:
            List[str]: The filtered list of directories (currently the unmodified input list).
        """
        # Currently, the directories are not filtered. The list could be filtered like this:
        # return [dir_ for dir_ in dirs if 'atlas' not in dir_.lower()]
        return dirs


class FileSystemDataCrawler:
    """Represents a file system data crawler.

    The crawler looks at the immediate subdirectories of a root directory, optionally filters them,
    keeps those that contain at least one file ending with the configured file extension, and builds a
    dictionary of file paths for each remaining directory. The result is available in ``data``.

    Examples:
        Suppose we have the following directory structure::

            /path/to/root_dir
                ./Patient1
                    ./Image.mha
                    ./GroundTruth.mha
                    ./some_text_file.txt
                ./Patient2
                    ./Image.mha
                    ./GroundTruth.mha
                    ./GroundTruthRater2.mha
                ./Atlas
                    ./Atlas.mha

        We can use the following code to load the images `Image.mha` and `GroundTruth.mha`
        in the directories `Patient1` and `Patient2`:

        >>> class MyImgType(enum.Enum):
        ...     T1 = 1
        ...     GroundTruth = 2
        ...
        >>> class MyFilePathGenerator(FilePathGenerator):
        ...     @staticmethod
        ...     def get_full_file_path(id_: str, root_dir: str, file_key, file_extension: str) -> str:
        ...         if file_key == MyImgType.T1:
        ...             file_name = 'Image'
        ...         elif file_key == MyImgType.GroundTruth:
        ...             file_name = 'GroundTruth'
        ...         else:
        ...             raise ValueError('Unknown key')
        ...
        ...         return os.path.join(root_dir, file_name + file_extension)
        ...
        >>> class MyDirFilter(DirectoryFilter):
        ...     @staticmethod
        ...     def filter_directories(dirs: t.List[str]) -> t.List[str]:
        ...         return sorted(dir_ for dir_ in dirs if 'patient' in dir_.lower())
        ...
        >>> crawler = FileSystemDataCrawler('/path/to/root_dir',
        ...                                 [MyImgType.T1, MyImgType.GroundTruth],
        ...                                 MyFilePathGenerator(),
        ...                                 MyDirFilter(),
        ...                                 '.mha')
        >>> for id_, path in crawler.data.items():
        ...     print(id_, path)
        Patient1 {'Patient1': '/path/to/root_dir/Patient1',
                  <MyImgType.T1: 1>: '/path/to/root_dir/Patient1/Image.mha',
                  <MyImgType.GroundTruth: 2>: '/path/to/root_dir/Patient1/GroundTruth.mha'}
        Patient2 {'Patient2': '/path/to/root_dir/Patient2',
                  <MyImgType.T1: 1>: '/path/to/root_dir/Patient2/Image.mha',
                  <MyImgType.GroundTruth: 2>: '/path/to/root_dir/Patient2/GroundTruth.mha'}
    """

    def __init__(self,
                 root_dir: str,
                 file_keys: list,
                 file_path_generator: 'FilePathGenerator',
                 dir_filter: t.Optional['DirectoryFilter'] = None,
                 file_extension: str = '.nii.gz'):
        """Initializes a new instance of the FileSystemDataCrawler class.

        The directories are crawled immediately, i.e., ``data`` is populated when the constructor returns.

        Args:
            root_dir (str): The path to the root directory, which contains subdirectories with the data.
            file_keys (list): A list of objects, which represent human-readable data identifiers
                (one identifier for each data file to crawl).
            file_path_generator (FilePathGenerator): A file path generator, which converts a human-readable
                data identifier to a data file path.
            dir_filter (DirectoryFilter): An optional directory filter, which filters a list of directories.
                If None, all subdirectories of the root directory are considered.
            file_extension (str): The data file extension (with or without dot).

        Raises:
            ValueError: If ``root_dir`` is not an existing directory.
        """
        super().__init__()

        self.root_dir = root_dir
        self.dir_filter = dir_filter
        self.file_keys = file_keys
        self.file_path_generator = file_path_generator
        # normalize the extension such that it always starts with a dot
        self.file_extension = file_extension if file_extension.startswith('.') else '.' + file_extension

        # dict with key=id (i.e., directory name), value=dict with key=id or file key and value=path
        self.data = {}

        # dict with key=id (i.e., directory name), value=path to data directory
        data_dir = self._crawl_directories()
        self._crawl_data(data_dir)

    def _crawl_data(self, data_dir: dict):
        """Crawls the data inside the data directories and stores the file paths in ``data``.

        Args:
            data_dir (dict): A dictionary where the keys are the directory names (ids) and the values the
                full paths to the directories.
        """
        for id_, path in data_dir.items():
            # the first entry maps the id itself to the path of the data directory
            paths = {id_: path}
            for file_key in self.file_keys:
                paths[file_key] = self.file_path_generator.get_full_file_path(
                    id_, path, file_key, self.file_extension)

            self.data[id_] = paths

    def _crawl_directories(self) -> dict:
        """Crawls the directories, which contain data.

        Returns:
            dict: A dictionary where the keys are the directory names and the values the full path to the directory.

        Raises:
            ValueError: If the root directory is not an existing directory.
        """
        if not os.path.isdir(self.root_dir):
            raise ValueError('root_dir {} does not exist'.format(self.root_dir))

        # Search the root directory for data directories (immediate subdirectories only).
        # os.listdir raises an OSError if the root directory cannot be read, instead of silently
        # yielding nothing.
        data_dirs = [entry for entry in os.listdir(self.root_dir)
                     if os.path.isdir(os.path.join(self.root_dir, entry))]

        if self.dir_filter is not None:
            # filter the data directories
            data_dirs = self.dir_filter.filter_directories(data_dirs)

        data_dir_paths = {}
        for data_dir in data_dirs:
            full_path = os.path.join(self.root_dir, data_dir)
            # keep only the directories that contain at least one data file
            if any(file_name.endswith(self.file_extension) for file_name in os.listdir(full_path)):
                data_dir_paths[data_dir] = full_path

        return data_dir_paths