Source code for gobbli.dataset.nested_file

from abc import abstractmethod
from pathlib import Path
from typing import List, Set, Tuple

from gobbli.dataset.base import BaseDataset


class NestedFileDataset(BaseDataset):
    """
    A dataset downloaded as an archive from some URL and composed of the following
    directory structure:

    <train_folder>/
      <label1>/
        data1
        data2
      <label2>/
        data1
        data2
    <test_folder>/
      <label1>/
        data1
        data2
      <label2>/
        data1
        data2

    Building the dataset flattens each of the two folders into a pair of files in
    the data directory: one holding the text of every data file and one holding the
    matching labels, with rows separated by :attr:`DELIMITER`.
    """

    TRAIN_X_FILE = "train_X.csv"
    TRAIN_Y_FILE = "train_y.csv"
    TEST_X_FILE = "test_X.csv"
    # NOTE: capital "Y", unlike TRAIN_Y_FILE.  Kept as-is because _is_built() looks
    # for exactly this name, so renaming it would orphan already-built datasets.
    TEST_Y_FILE = "test_Y.csv"

    # Null byte _shouldn't_ be embedded in any data files...
    DELIMITER = "\x00"

    # Pin the encoding of the combined files so a built dataset does not depend on
    # the locale of the machine that wrote or reads it.
    _ENCODING = "utf-8"

    @abstractmethod
    def labels(self) -> Set[str]:
        """
        Return the set of folder names that should be considered labels in each directory.
        """
        raise NotImplementedError

    @abstractmethod
    def download(self, data_dir: Path) -> Path:
        """
        Download and extract the dataset archive into the given data dir.
        Return the resulting path.
        """
        raise NotImplementedError

    @abstractmethod
    def folders(self) -> Tuple[Path, Path]:
        """
        Return relative paths to the train and test folders, respectively,
        from the top level of the extracted archive.
        """
        raise NotImplementedError

    @abstractmethod
    def read_source_file(self, file_path: Path) -> str:
        """
        Read the text from a source file.  Used to account for per-dataset
        encodings and other format differences.
        """
        raise NotImplementedError

    def _build(self):
        """
        Download the archive into the data directory, then combine the train and
        test folders into their respective X/y files.
        """
        data_dir = self.data_dir()
        data_dir.mkdir(exist_ok=True, parents=True)
        self.download(data_dir)

        train_folder, test_folder = self.folders()
        self._load_folder(
            data_dir / train_folder,
            data_dir / self.TRAIN_X_FILE,
            data_dir / self.TRAIN_Y_FILE,
        )
        self._load_folder(
            data_dir / test_folder,
            data_dir / self.TEST_X_FILE,
            data_dir / self.TEST_Y_FILE,
        )

    def _load_folder(self, folder: Path, X_file: Path, y_file: Path):
        """
        Combine a nested directory structure into a single output file.  Assume the
        directory names are category names, and each file in the directory is a separate
        row assigned to that category.  Write the texts to ``X_file`` and the matching
        category names, in the same order, to ``y_file``.
        """
        X: List[str] = []
        y: List[str] = []
        labels = self.labels()

        # iterdir() yields entries in arbitrary order; sort so that the row order
        # of the built files is the same on every machine.
        for category_dir in sorted(folder.iterdir()):
            category_name = category_dir.name
            # Skip folders not named for labels, and stray regular files that
            # merely share a label's name (iterating them would raise).
            if category_name not in labels or not category_dir.is_dir():
                continue

            for data_file in sorted(category_dir.iterdir()):
                X.append(self.read_source_file(data_file))
                y.append(category_name)

        X_file.write_text(
            NestedFileDataset.DELIMITER.join(X), encoding=self._ENCODING
        )
        y_file.write_text(
            NestedFileDataset.DELIMITER.join(y), encoding=self._ENCODING
        )

    def _read_data_file(self, filepath: Path) -> List[str]:
        """
        Read a combined file written by :meth:`_load_folder` and split it back
        into its rows.
        """
        text = filepath.read_text(encoding=self._ENCODING)
        return text.split(NestedFileDataset.DELIMITER)

    def _read_split_file(self, file_name: str, label_file_name: str) -> List[str]:
        """
        Return the rows of ``file_name`` from the data directory, or an empty list
        when the split has no rows.

        Zero rows are written as an empty file, which would otherwise split into
        a single bogus empty row.  The label file decides whether the split is
        empty: labels are directory names and therefore never empty, whereas a
        text file can legitimately consist of one empty document.
        """
        data_dir = self.data_dir()
        if (data_dir / label_file_name).stat().st_size == 0:
            return []
        return self._read_data_file(data_dir / file_name)

    def _is_built(self) -> bool:
        """
        Return True if all four combined files exist in the data directory.
        """
        data_dir = self.data_dir()
        data_files = (
            self.TRAIN_X_FILE,
            self.TRAIN_Y_FILE,
            self.TEST_X_FILE,
            self.TEST_Y_FILE,
        )
        return all((data_dir / data_file).exists() for data_file in data_files)

    def X_train(self) -> List[str]:
        """Return the texts of the training split, one entry per source file."""
        return self._read_split_file(self.TRAIN_X_FILE, self.TRAIN_Y_FILE)

    def y_train(self) -> List[str]:
        """Return the labels of the training split, aligned with :meth:`X_train`."""
        return self._read_split_file(self.TRAIN_Y_FILE, self.TRAIN_Y_FILE)

    def X_test(self) -> List[str]:
        """Return the texts of the test split, one entry per source file."""
        return self._read_split_file(self.TEST_X_FILE, self.TEST_Y_FILE)

    def y_test(self) -> List[str]:
        """Return the labels of the test split, aligned with :meth:`X_test`."""
        return self._read_split_file(self.TEST_Y_FILE, self.TEST_Y_FILE)