"""This module contains utility functions and classes for accessing the file system."""
import abc
import enum
import os
import typing as t
import mialab.data.structure as structure
class FilePathGenerator(metaclass=abc.ABCMeta):
    """Abstract base for objects that turn a data identifier into a file path.

    A :py:class:`FileSystemDataCrawler` calls :py:meth:`get_full_file_path` for every
    data directory and file key to obtain the path of the data file to load.
    """

    @staticmethod
    @abc.abstractmethod
    def get_full_file_path(id_: str, root_dir: str, file_key, file_extension: str) -> str:
        """Builds the full path of a data file.

        Args:
            id_ (str): The data's identification.
            root_dir (str): The data file's root directory.
            file_key (object): A human readable identifier used to select the data file.
            file_extension (str): The data file's extension.

        Returns:
            str: The full path of the data file.

        Raises:
            NotImplementedError: Always; concrete generators must override this method.
        """
        raise NotImplementedError()
class BrainImageFilePathGenerator(FilePathGenerator):
    """File path generator for brain images.

    Translates a ``structure.BrainImageTypes`` key into the path of the matching
    file inside ``root_dir``, which allows to load the image.
    """

    def __init__(self):
        """Initializes a new instance of the BrainImageFilePathGenerator class."""

    @staticmethod
    def get_full_file_path(id_: str, root_dir: str, file_key, file_extension: str) -> str:
        """Builds the full file path for an image.

        Args:
            id_ (str): The image identification. Not used by this generator.
            root_dir (str): The image's root directory.
            file_key (object): A human readable identifier used to select the image;
                compared against the members of ``structure.BrainImageTypes``.
            file_extension (str): The image's file extension.

        Returns:
            str: The image's full file path.

        Raises:
            ValueError: If ``file_key`` matches none of the handled image types.
        """
        image_types = structure.BrainImageTypes

        def image_path(stem: str) -> str:
            # every image file is named <stem><file_extension> inside root_dir
            return os.path.join(root_dir, stem + file_extension)

        if file_key == image_types.T1w:
            return image_path('T1native')
        if file_key == image_types.T2w:
            return image_path('T2native')
        if file_key == image_types.GroundTruth:
            return image_path('labels_native')
        if file_key == image_types.BrainMask:
            return image_path('Brainmasknative')
        if file_key == image_types.RegistrationTransform:
            # the transform has a fixed file name; file_extension does not apply
            return os.path.join(root_dir, 'affine.txt')

        raise ValueError('Unknown key')
class DirectoryFilter(metaclass=abc.ABCMeta):
    """Abstract base for directory filters.

    A :py:class:`FileSystemDataCrawler` passes the list of directories it found to
    :py:meth:`filter_directories` and continues with the returned list.
    """

    @staticmethod
    @abc.abstractmethod
    def filter_directories(dirs: t.List[str]) -> t.List[str]:
        """Filters a list of directories.

        Args:
            dirs (List[str]): A list of directories.

        Returns:
            List[str]: The filtered list of directories.

        Raises:
            NotImplementedError: Always; concrete filters must override this method.
        """
        raise NotImplementedError()
class DataDirectoryFilter(DirectoryFilter):
    """Directory filter for data directories.

    The filter currently keeps every directory; it is the place to exclude
    directories from a list of data directories.
    """

    def __init__(self):
        """Initializes a new instance of the DataDirectoryFilter class."""

    @staticmethod
    def filter_directories(dirs: t.List[str]) -> t.List[str]:
        """Filters a list of directories.

        Args:
            dirs (List[str]): A list of directories.

        Returns:
            List[str]: The filtered list of directories. At the moment this is the
            list that was passed in, unchanged (the same object, not a copy).
        """
        # No filtering is applied at the moment. To drop directories, return a
        # filtered list instead, e.g.:
        #   return [d for d in dirs if 'atlas' not in d.lower()]
        return dirs
class FileSystemDataCrawler:
    """Represents a file system data crawler.

    The crawler inspects the direct subdirectories of a root directory, keeps those
    that contain at least one file with the configured extension, and builds for each
    of them a dictionary with the paths of the requested data files. The result is
    available in the ``data`` attribute once the instance has been created.

    Examples:
        Suppose we have the following directory structure::

            /path/to/root_dir
                ./Patient1
                    ./Image.mha
                    ./GroundTruth.mha
                    ./some_text_file.txt
                ./Patient2
                    ./Image.mha
                    ./GroundTruth.mha
                    ./GroundTruthRater2.mha
                ./Atlas
                    ./Atlas.mha

        We can use the following code to load the images `Image.mha` and `GroundTruth.mha`
        in the directories `Patient1` and `Patient2`:

        >>> class MyImgType(enum.Enum):
        ...     T1 = 1
        ...     GroundTruth = 2
        ...
        >>> class MyFilePathGenerator(FilePathGenerator):
        ...     @staticmethod
        ...     def get_full_file_path(id_: str, root_dir: str, file_key, file_extension: str) -> str:
        ...         if file_key == MyImgType.T1:
        ...             file_name = 'Image'
        ...         elif file_key == MyImgType.GroundTruth:
        ...             file_name = 'GroundTruth'
        ...         else:
        ...             raise ValueError('Unknown key')
        ...
        ...         return os.path.join(root_dir, file_name + file_extension)
        ...
        >>> class MyDirFilter(DirectoryFilter):
        ...     @staticmethod
        ...     def filter_directories(dirs: t.List[str]) -> t.List[str]:
        ...         return sorted([dir_ for dir_ in dirs if 'patient' in dir_.lower()])
        ...
        >>> crawler = FileSystemDataCrawler('/path/to/root_dir',
        ...                                 [MyImgType.T1, MyImgType.GroundTruth],
        ...                                 MyFilePathGenerator(),
        ...                                 MyDirFilter(),
        ...                                 '.mha')
        >>> for id_, path in crawler.data.items():
        ...     print(id_, path)
        Patient1 {'Patient1': '/path/to/root_dir/Patient1',
                  <MyImgType.T1: 1>: '/path/to/root_dir/Patient1/Image.mha',
                  <MyImgType.GroundTruth: 2>: '/path/to/root_dir/Patient1/GroundTruth.mha'}
        Patient2 {'Patient2': '/path/to/root_dir/Patient2',
                  <MyImgType.T1: 1>: '/path/to/root_dir/Patient2/Image.mha',
                  <MyImgType.GroundTruth: 2>: '/path/to/root_dir/Patient2/GroundTruth.mha'}
    """

    def __init__(self,
                 root_dir: str,
                 file_keys: list,
                 file_path_generator: 'FilePathGenerator',
                 dir_filter: 't.Optional[DirectoryFilter]' = None,
                 file_extension: str = '.nii.gz'):
        """Initializes a new instance of the FileSystemDataCrawler class.

        The directories are crawled immediately; ``data`` is populated when the
        constructor returns.

        Args:
            root_dir (str): The path to the root directory, which contains subdirectories with the data.
            file_keys (list): A list of objects, which represent human readable data identifiers
                (one identifier for each data file to crawl).
            file_path_generator (FilePathGenerator): A file path generator, which converts a human readable
                data identifier to a data file path.
            dir_filter (DirectoryFilter): A directory filter, which filters a list of directories.
                If omitted, no directory is filtered out.
            file_extension (str): The data file extension (with or without dot).

        Raises:
            ValueError: If ``root_dir`` is not an existing directory or cannot be read.
        """
        super().__init__()

        self.root_dir = root_dir
        self.dir_filter = dir_filter
        self.file_keys = file_keys
        self.file_path_generator = file_path_generator
        # normalize the extension so that it always carries the leading dot
        self.file_extension = file_extension if file_extension.startswith('.') else '.' + file_extension

        # dict with key=id (i.e, directory name), value=dict with key=file_keys and value=path to file
        self.data = {}

        # dict with key=id (i.e, directory name), value=path to data directory
        data_dir = self._crawl_directories()
        self._crawl_data(data_dir)

    def _crawl_data(self, data_dir: dict):
        """Crawls the data inside the data directories and fills ``self.data``.

        Args:
            data_dir (dict): A dictionary where the keys are the directory names (ids)
                and the values the full paths to the directories.
        """
        for id_, path in data_dir.items():
            data_dict = {id_: path}  # init dict with id_ pointing to path
            for file_key in self.file_keys:
                data_dict[file_key] = self.file_path_generator.get_full_file_path(
                    id_, path, file_key, self.file_extension)
            self.data[id_] = data_dict

    def _crawl_directories(self) -> dict:
        """Crawls the directories, which contain data.

        Only the direct subdirectories of ``self.root_dir`` are considered. A
        subdirectory is kept if it passes the directory filter (when one is set) and
        contains at least one entry whose name ends with ``self.file_extension``.

        Returns:
            dict: A dictionary where the keys are the directory names and the values the full path to the directory.

        Raises:
            ValueError: If ``self.root_dir`` is not an existing directory or cannot be read.
        """
        if not os.path.isdir(self.root_dir):
            raise ValueError('root_dir {} does not exist'.format(self.root_dir))

        # search the root directory for data directories
        try:
            data_dirs = next(os.walk(self.root_dir))[1]
        except StopIteration:
            # os.walk ignores listing errors by default and then yields nothing, which
            # would otherwise leak out of here as a bare StopIteration
            raise ValueError('root_dir {} cannot be read'.format(self.root_dir)) from None

        if self.dir_filter:
            # filter the data directories
            data_dirs = self.dir_filter.filter_directories(data_dirs)

        return {
            data_dir: os.path.join(self.root_dir, data_dir)
            for data_dir in data_dirs
            if any(file.endswith(self.file_extension)  # check if directory contains data files
                   for file in os.listdir(os.path.join(self.root_dir, data_dir)))
        }