Source code for gobbli.dataset.imdb

from pathlib import Path
from typing import Set, Tuple

from gobbli.dataset.nested_file import NestedFileDataset
from gobbli.util import download_archive


class IMDBDataset(NestedFileDataset):
    """
    gobbli Dataset for the IMDB sentiment analysis problem.

    https://ai.stanford.edu/~amaas/data/sentiment/
    """

    def labels(self) -> Set[str]:
        """Return the set of label names used by this dataset: ``pos`` and ``neg``."""
        return {"pos", "neg"}

    def download(self, data_dir: Path) -> Path:
        """
        Fetch the IMDB archive via :func:`gobbli.util.download_archive`.

        Args:
          data_dir: Directory handed to ``download_archive`` as the download
            destination.

        Returns:
          Whatever path ``download_archive`` returns.
          NOTE(review): presumably the location of the extracted archive --
          confirm against ``download_archive``.
        """
        return download_archive(
            "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
            data_dir,
        )

    def folders(self) -> Tuple[Path, Path]:
        """
        Return the two dataset folders as relative paths.

        Returns:
          A 2-tuple of ``aclImdb/train`` and ``aclImdb/test``, in that order.
          NOTE(review): presumably resolved by the base class relative to the
          download directory -- confirm against ``NestedFileDataset``.
        """
        return Path("aclImdb/train"), Path("aclImdb/test")

    def read_source_file(self, file_path: Path) -> str:
        """
        Read one source file and return its full contents as text.

        Args:
          file_path: Path of the file to read.

        Returns:
          The decoded text of the file.
        """
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, so results are the same on every host
        # (e.g. Windows, where the default is frequently cp1252).
        return file_path.read_text(encoding="utf-8")