# Source code for gobbli.dataset.newsgroups

from pathlib import Path
from typing import Set, Tuple

from gobbli.dataset.nested_file import NestedFileDataset
from gobbli.util import download_archive


class NewsgroupsDataset(NestedFileDataset):
    """
    gobbli Dataset for the 20 Newsgroups problem.

    http://qwone.com/~jason/20Newsgroups/

    The methods below supply the dataset-specific pieces: the label names,
    the archive download, the train/test folder names, and how a single
    source file is read.
    """

    def labels(self) -> Set[str]:
        """
        Return the names of the 20 newsgroups.

        Returns:
          A newly built set of the 20 newsgroup name strings.
        """
        return {
            "alt.atheism",
            "comp.graphics",
            "comp.os.ms-windows.misc",
            "comp.sys.ibm.pc.hardware",
            "comp.sys.mac.hardware",
            "comp.windows.x",
            "misc.forsale",
            "rec.autos",
            "rec.motorcycles",
            "rec.sport.baseball",
            "rec.sport.hockey",
            "sci.crypt",
            "sci.electronics",
            "sci.med",
            "sci.space",
            "soc.religion.christian",
            "talk.politics.guns",
            "talk.politics.mideast",
            "talk.politics.misc",
            "talk.religion.misc",
        }

    def download(self, data_dir: Path) -> Path:
        """
        Fetch the "20news-bydate" archive via :func:`download_archive`.

        Args:
          data_dir: Directory handed to :func:`download_archive` as the
            download destination.

        Returns:
          The path returned by :func:`download_archive`.
          NOTE(review): presumably the location of the fetched/extracted
          archive -- confirm against ``gobbli.util.download_archive``.
        """
        return download_archive(
            "https://ndownloader.figshare.com/files/5975967",
            data_dir,
            filename="20news-bydate.tar.gz",
        )

    def folders(self) -> Tuple[Path, Path]:
        """
        Return the relative folder names of the two dataset splits.

        Returns:
          A 2-tuple: the train folder first, then the test folder.
        """
        return Path("20news-bydate-train"), Path("20news-bydate-test")

    def read_source_file(self, file_path: Path) -> str:
        """
        Read one source file and return its contents as text.

        Args:
          file_path: Path of the file to read.

        Returns:
          The file contents decoded as latin-1.
        """
        # latin-1 maps every possible byte to a code point, so decoding
        # cannot raise UnicodeDecodeError regardless of the file's bytes.
        return file_path.read_text(encoding="latin-1")