# Source code for gobbli.model.random
from typing import List, Optional
import numpy as np
import gobbli.io
from gobbli.model.base import BaseModel
from gobbli.model.context import ContainerTaskContext
from gobbli.model.mixin import EmbedMixin, TrainMixin
class RandomEmbedder(BaseModel, TrainMixin, EmbedMixin):
    """
    Dummy embeddings generator that returns random numbers as embeddings
    and has a stub training method to create a uniform API with other embedding
    models.

    Useful for ensuring user code works with the gobbli input/output format
    without having to build a time-consuming model.
    """

    # Fixed RNG seed, re-applied on every _embed call, so the generated
    # "embeddings" are reproducible across runs.
    SEED = 1234
    # Width (number of components) of each generated embedding vector.
    DIMENSIONALITY = 32

    def init(self, params):
        """
        No parameters to apply -- stubbed for API uniformity with other models.
        """
        pass

    def _build(self):
        """
        No build step required for this model.
        """

    def tokenize(self, X: List[str]) -> List[List[str]]:
        """
        Return a tokenized list of documents.

        Tokenization is plain whitespace splitting via :meth:`str.split`.
        """
        return [x.split() for x in X]

    def _train(
        self, train_input: gobbli.io.TrainInput, context: ContainerTaskContext
    ) -> gobbli.io.TrainOutput:
        """
        No training needed -- stubbed for API uniformity.

        Returns a TrainOutput with zeroed metrics and no labels.
        """
        return gobbli.io.TrainOutput(
            valid_loss=0, valid_accuracy=0, train_loss=0, labels=[], multilabel=False
        )

    def _embed(
        self, embed_input: gobbli.io.EmbedInput, context: ContainerTaskContext
    ) -> gobbli.io.EmbedOutput:
        """
        Generate random embeddings.

        Each document gets a (num_tokens, DIMENSIONALITY) array of uniform
        random values; if mean pooling is requested, the token axis is
        averaged down to a single DIMENSIONALITY-length vector.
        """
        # Seed on every call so output is deterministic regardless of how
        # many times or in what order _embed is invoked.
        np.random.seed(RandomEmbedder.SEED)

        X_tokenized = self.tokenize(embed_input.X)
        embeddings = []
        for tokens in X_tokenized:
            token_embeddings = np.random.rand(
                # sequence length
                len(tokens),
                # dimensionality
                RandomEmbedder.DIMENSIONALITY,
            )
            if embed_input.pooling == gobbli.io.EmbedPooling.MEAN:
                # Collapse the token axis to one vector per document.
                token_embeddings = np.mean(token_embeddings, axis=0)
            embeddings.append(token_embeddings)

        # Per-token lists are only meaningful in the output when no pooling
        # was applied; otherwise leave them unset.
        embed_tokens = None  # type: Optional[List[List[str]]]
        if embed_input.pooling == gobbli.io.EmbedPooling.NONE:
            embed_tokens = X_tokenized[:]

        return gobbli.io.EmbedOutput(X_embedded=embeddings, embed_tokens=embed_tokens)