Source code for gobbli.test.model.test_sklearn

from pathlib import Path

import joblib
import pytest
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from gobbli.model.sklearn import (
    SKLearnClassifier,
    make_cv_tfidf_logistic_regression,
    make_default_tfidf_logistic_regression,
    persist_estimator,
)
from gobbli.model.sklearn.model import _AT_LEAST_TWO_CLASSES_ERR_MSG, _SafeEstimator


def _make_test_estimator():
    """Build the plain estimator used by the dump/load/persist tests."""
    # A bare LogisticRegression is deliberate: composite estimators such as
    # a Pipeline or GridSearchCV may have estimator params which don't
    # compare cleanly.
    estimator = LogisticRegression()
    return estimator


def _assert_estimators_equal(clf1, clf2):
    """Assert that two estimators share a type and report equal params.

    Both arguments must expose ``get_params()``; an ``AssertionError`` is
    raised if the exact types differ or the returned params are unequal.
    """
    first_type, second_type = type(clf1), type(clf2)
    assert first_type is second_type

    first_params = clf1.get_params()
    second_params = clf2.get_params()
    assert first_params == second_params


def test_dump_estimator(tmpdir):
    """An estimator written by ``_dump_estimator`` can be read back by joblib."""
    dump_path = Path(tmpdir) / "test.joblib"
    clf = _make_test_estimator()

    # Write with the classifier's helper, read with plain joblib, so the
    # dump side is checked independently of ``_load_estimator``.
    SKLearnClassifier._dump_estimator(clf, dump_path)
    loaded_clf = joblib.load(dump_path)

    _assert_estimators_equal(clf, loaded_clf)


def test_load_estimator(tmpdir):
    """An estimator written by joblib can be read back by ``_load_estimator``."""
    dump_path = Path(tmpdir) / "test.joblib"
    clf = _make_test_estimator()

    # Write with plain joblib, read with the classifier's helper, so the
    # load side is checked independently of ``_dump_estimator``.
    joblib.dump(clf, dump_path)
    loaded_clf = SKLearnClassifier._load_estimator(dump_path)

    _assert_estimators_equal(clf, loaded_clf)


def test_persist_estimator(tmp_gobbli_dir):
    """``persist_estimator`` saves under the gobbli dir and the result reloads."""
    clf = _make_test_estimator()
    clf_path = persist_estimator(clf)

    # The returned path must live somewhere beneath the temporary gobbli
    # directory supplied by the fixture.
    assert tmp_gobbli_dir in clf_path.parents

    loaded_clf = SKLearnClassifier._load_estimator(clf_path)
    _assert_estimators_equal(clf, loaded_clf)


@pytest.mark.parametrize(
    "clf,err",
    [
        # Not an estimator
        (None, ValueError),
        # Also not an estimator
        (1, ValueError),
        # Estimator but not a classifier
        (LinearRegression(), ValueError),
        # Estimator with no predict_proba
        (LinearSVC(), ValueError),
        # Valid estimator
        (LogisticRegression(), None),
        # Invalid pipeline (not a classifier)
        (Pipeline([("linreg", LinearRegression())]), ValueError),
        # Invalid pipeline (no predict_proba)
        (Pipeline([("svc", LinearSVC())]), ValueError),
        # Valid pipeline
        (Pipeline([("logreg", LogisticRegression())]), None),
        # Invalid grid search (not a classifier)
        (GridSearchCV(LinearRegression(), {}), ValueError),
        # Invalid grid search (no predict_proba)
        (GridSearchCV(LinearSVC(), {}), ValueError),
        # Valid grid search
        (GridSearchCV(LogisticRegression(), {}), None),
        # Our helpers should both be valid
        (make_cv_tfidf_logistic_regression(), None),
        (make_default_tfidf_logistic_regression(), None),
    ],
)
def test_validate_estimator(clf, err):
    """Check ``_validate_estimator`` against valid and invalid candidates.

    ``err`` is the exception type the candidate is expected to trigger, or
    ``None`` when validation is expected to succeed without raising.
    """
    if err is None:
        # Valid candidates must simply pass through without an exception.
        SKLearnClassifier._validate_estimator(clf)
        return

    with pytest.raises(err):
        SKLearnClassifier._validate_estimator(clf)


def test_safe_estimator():
    """``_SafeEstimator`` fits and predicts on single-class training data."""
    clf = make_default_tfidf_logistic_regression()

    # Every training label is the same class.
    X_train = ["test", "test2"]
    y_train = ["a", "a"]

    # The unwrapped estimator rejects single-class data with the expected
    # error message.
    with pytest.raises(ValueError) as exc_info:
        clf.fit(X_train, y_train)
    assert _AT_LEAST_TWO_CLASSES_ERR_MSG in str(exc_info.value)

    # The wrapped estimator accepts the same data and reports the lone class.
    safe_clf = _SafeEstimator(clf)
    safe_clf.fit(X_train, y_train)
    assert safe_clf.classes_.tolist() == ["a"]

    y_pred = safe_clf.predict(X_train)
    assert y_pred.tolist() == ["a", "a"]

    # One probability column (the only class), always equal to 1.
    y_pred_proba = safe_clf.predict_proba(X_train)
    assert y_pred_proba.tolist() == [[1], [1]]


@pytest.mark.parametrize(
    "params,exception",
    [
        # Unknown param
        ({"unknown": None}, ValueError),
        # Bad type (estimator_path)
        ({"estimator_path": 1}, TypeError),
        # init loads the model path, and there isn't a good way to
        # reference a temp path from this param list, so we'll assume
        # other tests will catch a failure to initialize from a good
        # estimator path
    ],
)
def test_init(params, exception):
    """Constructing ``SKLearnClassifier`` with bad params raises ``exception``."""
    with pytest.raises(exception):
        SKLearnClassifier(**params)
Source code for gobbli.test.model.test_sklearn

Navigation

Related Topics