Source code for gobbli.model.random

from typing import List, Optional

import numpy as np

import gobbli.io
from gobbli.model.base import BaseModel
from gobbli.model.context import ContainerTaskContext
from gobbli.model.mixin import EmbedMixin, TrainMixin


class RandomEmbedder(BaseModel, TrainMixin, EmbedMixin):
    """
    Dummy embeddings generator that returns random numbers as embeddings and
    has a stub training method to create a uniform API with other embedding
    models. Useful for ensuring user code works with the gobbli input/output
    format without having to build a time-consuming model.
    """

    # Seed applied at the start of every embed call, so repeated calls on the
    # same input produce the same embeddings.
    SEED = 1234
    # Length of each generated embedding vector.
    DIMENSIONALITY = 32

    def init(self, params):
        """
        Accept model parameters. This model does nothing with them.
        """

    def _build(self):
        """
        No build step required for this model.
        """

    def tokenize(self, X: List[str]) -> List[List[str]]:
        """
        Return a tokenized list of documents.

        Each document is split on runs of whitespace; an empty or
        whitespace-only document yields an empty token list.
        """
        return [x.split() for x in X]

    def _train(
        self, train_input: gobbli.io.TrainInput, context: ContainerTaskContext
    ) -> gobbli.io.TrainOutput:
        """
        No training needed -- stubbed for API uniformity.

        Neither argument is read; the returned output carries zeroed metrics
        and an empty label list.
        """
        return gobbli.io.TrainOutput(
            valid_loss=0, valid_accuracy=0, train_loss=0, labels=[], multilabel=False
        )

    def _embed(
        self, embed_input: gobbli.io.EmbedInput, context: ContainerTaskContext
    ) -> gobbli.io.EmbedOutput:
        """
        Generate random embeddings.

        For each document in ``embed_input.X`` a random array of shape
        ``(number of tokens, DIMENSIONALITY)`` is drawn.  With ``MEAN``
        pooling that array is averaged over the token axis; with ``NONE``
        pooling the tokens are returned alongside the per-token embeddings.
        ``context`` is not used.
        """
        # A private RandomState seeded with SEED yields the same number stream
        # as ``np.random.seed(SEED)`` followed by ``np.random.rand``, but does
        # not reset NumPy's process-wide RNG as a side effect of embedding.
        rng = np.random.RandomState(RandomEmbedder.SEED)

        X_tokenized = self.tokenize(embed_input.X)
        mean_pooling = embed_input.pooling == gobbli.io.EmbedPooling.MEAN

        embeddings = []
        for tokens in X_tokenized:
            token_embeddings = rng.rand(
                # sequence length
                len(tokens),
                # dimensionality
                RandomEmbedder.DIMENSIONALITY,
            )
            if mean_pooling:
                # NOTE(review): a document with no tokens gives a (0, DIM)
                # array here, and its mean is all-NaN (NumPy emits a
                # RuntimeWarning). Left unchanged -- confirm whether callers
                # expect a different value for empty documents.
                token_embeddings = np.mean(token_embeddings, axis=0)
            embeddings.append(token_embeddings)

        embed_tokens = None  # type: Optional[List[List[str]]]
        if embed_input.pooling == gobbli.io.EmbedPooling.NONE:
            # Shallow copy so the output does not alias the local list.
            embed_tokens = X_tokenized[:]

        return gobbli.io.EmbedOutput(X_embedded=embeddings, embed_tokens=embed_tokens)