Source code for GPy.examples.non_gaussian

# Copyright (c) 2014, Alan Saul
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import GPy
import numpy as np

MPL_AVAILABLE = True
try:
    import matplotlib.pyplot as plt
except ImportError:
    MPL_AVAILABLE = False


[docs]def student_t_approx(optimize=True, plot=True):
    """
    Example of regressing with a student t likelihood using Laplace
    """
    real_std = 0.1
    # Start a function, any function
    X = np.linspace(0.0, np.pi * 2, 100)[:, None]
    Y = np.sin(X) + np.random.randn(*X.shape) * real_std
    Y = Y / Y.max()
    Yc = Y.copy()

    X_full = np.linspace(0.0, np.pi * 2, 500)[:, None]
    Y_full = np.sin(X_full)
    Y_full = Y_full / Y_full.max()

    # Slightly noisy data
    Yc[75:80] += 1

    # Very noisy data
    # Yc[10] += 100
    # Yc[25] += 10
    # Yc[23] += 10
    # Yc[26] += 1000
    # Yc[24] += 10
    # Yc = Yc/Yc.max()

    # Add student t random noise to datapoints
    deg_free = 1
    print("Real noise: ", real_std)
    initial_var_guess = 0.5
    edited_real_sd = initial_var_guess

    # Kernel object
    kernel1 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
    kernel2 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
    kernel3 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
    kernel4 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])

    # Gaussian GP model on clean data
    m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
    # optimize
    m1[".*white"].constrain_fixed(1e-5)
    m1.randomize()

    # Gaussian GP model on corrupt data
    m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
    m2[".*white"].constrain_fixed(1e-5)
    m2.randomize()

    # Student t GP model on clean data
    t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
    laplace_inf = GPy.inference.latent_function_inference.Laplace()
    m3 = GPy.core.GP(
        X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf
    )
    m3[".*t_scale2"].constrain_bounded(1e-6, 10.0)
    m3[".*white"].constrain_fixed(1e-5)
    m3.randomize()

    # Student t GP model on corrupt data
    t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
    laplace_inf = GPy.inference.latent_function_inference.Laplace()
    m4 = GPy.core.GP(
        X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf
    )
    m4[".*t_scale2"].constrain_bounded(1e-6, 10.0)
    m4[".*white"].constrain_fixed(1e-5)
    m4.randomize()
    print(m4)
    debug = True
    if debug:
        m4.optimize(messages=1)
        from matplotlib import pyplot as pb

        pb.plot(m4.X, m4.inference_method.f_hat)
        pb.plot(m4.X, m4.Y, "rx")
        m4.plot()
        print(m4)
        return m4

    if optimize:
        optimizer = "scg"
        print("Clean Gaussian")
        m1.optimize(optimizer, messages=1)
        print("Corrupt Gaussian")
        m2.optimize(optimizer, messages=1)
        print("Clean student t")
        m3.optimize(optimizer, messages=1)
        print("Corrupt student t")
        m4.optimize(optimizer, messages=1)

    if MPL_AVAILABLE and plot:
        plt.figure(1)
        plt.suptitle("Gaussian likelihood")
        ax = plt.subplot(211)
        m1.plot(ax=ax)
        plt.plot(X_full, Y_full)
        plt.ylim(-1.5, 1.5)
        plt.title("Gaussian clean")

        ax = plt.subplot(212)
        m2.plot(ax=ax)
        plt.plot(X_full, Y_full)
        plt.ylim(-1.5, 1.5)
        plt.title("Gaussian corrupt")

        plt.figure(2)
        plt.suptitle("Student-t likelihood")
        ax = plt.subplot(211)
        m3.plot(ax=ax)
        plt.plot(X_full, Y_full)
        plt.ylim(-1.5, 1.5)
        plt.title("Student-t rasm clean")

        ax = plt.subplot(212)
        m4.plot(ax=ax)
        plt.plot(X_full, Y_full)
        plt.ylim(-1.5, 1.5)
        plt.title("Student-t rasm corrupt")

    return m1, m2, m3, m4


[docs]def boston_example(optimize=True, plot=True):
    raise NotImplementedError("Needs updating")
    import sklearn
    from sklearn.cross_validation import KFold

    optimizer = "bfgs"
    messages = 0
    try:
        import pods
    except ImportError:
        print("pods unavailable, see https://github.com/sods/ods for example datasets")
        return
    data = pods.datasets.boston_housing()
    degrees_freedoms = [3, 5, 8, 10]
    X = data["X"].copy()
    Y = data["Y"].copy()
    X = X - X.mean(axis=0)
    X = X / X.std(axis=0)
    Y = Y - Y.mean()
    Y = Y / Y.std()
    num_folds = 10
    kf = KFold(len(Y), n_folds=num_folds, indices=True)
    num_models = (
        len(degrees_freedoms) + 3
    )  # 3 for baseline, gaussian, gaussian laplace approx
    score_folds = np.zeros((num_models, num_folds))
    pred_density = score_folds.copy()

    def rmse(Y, Ystar):
        return np.sqrt(np.mean((Y - Ystar) ** 2))

    for n, (train, test) in enumerate(kf):
        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
        print("Fold {}".format(n))

        noise = 1e-1  # np.exp(-2)
        rbf_len = 0.5
        data_axis_plot = 4
        kernelstu = (
            GPy.kern.RBF(X.shape[1])
            + GPy.kern.white(X.shape[1])
            + GPy.kern.bias(X.shape[1])
        )
        kernelgp = (
            GPy.kern.RBF(X.shape[1])
            + GPy.kern.white(X.shape[1])
            + GPy.kern.bias(X.shape[1])
        )

        # Baseline
        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))

        # Gaussian GP
        print("Gauss GP")
        mgp = GPy.models.GPRegression(
            X_train.copy(), Y_train.copy(), kernel=kernelgp.copy()
        )
        mgp.constrain_fixed(".*white", 1e-5)
        mgp[".*len"] = rbf_len
        mgp[".*noise"] = noise
        print(mgp)
        if optimize:
            mgp.optimize(optimizer=optimizer, messages=messages)
        Y_test_pred = mgp.predict(X_test)
        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
        pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
        print(mgp)
        print(pred_density)

        print("Gaussian Laplace GP")
        N, D = Y_train.shape
        g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(
            variance=noise, N=N, D=D
        )
        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
        mg = GPy.models.GPRegression(
            X_train.copy(),
            Y_train.copy(),
            kernel=kernelstu.copy(),
            likelihood=g_likelihood,
        )
        mg.constrain_positive("noise_variance")
        mg.constrain_fixed(".*white", 1e-5)
        mg["rbf_len"] = rbf_len
        mg["noise"] = noise
        print(mg)
        if optimize:
            mg.optimize(optimizer=optimizer, messages=messages)
        Y_test_pred = mg.predict(X_test)
        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
        pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
        print(pred_density)
        print(mg)

        for stu_num, df in enumerate(degrees_freedoms):
            # Student T
            print("Student-T GP {}df".format(df))
            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(
                deg_free=df, sigma2=noise
            )
            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
            mstu_t = GPy.models.GPRegression(
                X_train.copy(),
                Y_train.copy(),
                kernel=kernelstu.copy(),
                likelihood=stu_t_likelihood,
            )
            mstu_t.constrain_fixed(".*white", 1e-5)
            mstu_t.constrain_bounded(".*t_scale2", 0.0001, 1000)
            mstu_t["rbf_len"] = rbf_len
            mstu_t[".*t_scale2"] = noise
            print(mstu_t)
            if optimize:
                mstu_t.optimize(optimizer=optimizer, messages=messages)
            Y_test_pred = mstu_t.predict(X_test)
            score_folds[3 + stu_num, n] = rmse(Y_test, Y_test_pred[0])
            pred_density[3 + stu_num, n] = np.mean(
                mstu_t.log_predictive_density(X_test, Y_test)
            )
            print(pred_density)
            print(mstu_t)

    if MPL_AVAILABLE and plot:
        plt.figure()
        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
        plt.scatter(X_test[:, data_axis_plot], Y_test, c="r", marker="x")
        plt.title("GP gauss")

        plt.figure()
        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
        plt.scatter(X_test[:, data_axis_plot], Y_test, c="r", marker="x")
        plt.title("Lap gauss")

        plt.figure()
        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
        plt.scatter(X_test[:, data_axis_plot], Y_test, c="r", marker="x")
        plt.title("Stu t {}df".format(df))

    print("Average scores: {}".format(np.mean(score_folds, 1)))
    print("Average pred density: {}".format(np.mean(pred_density, 1)))

    if MPL_AVAILABLE and plot:
        # Plotting
        stu_t_legends = ["Student T, df={}".format(df) for df in degrees_freedoms]
        legends = ["Baseline", "Gaussian", "Laplace Approx Gaussian"] + stu_t_legends

        # Plot boxplots for RMSE density
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title("RMSE")
        bp = ax.boxplot(score_folds.T, notch=0, sym="+", vert=1, whis=1.5)
        plt.setp(bp["boxes"], color="black")
        plt.setp(bp["whiskers"], color="black")
        plt.setp(bp["fliers"], color="red", marker="+")
        xtickNames = plt.setp(ax, xticklabels=legends)
        plt.setp(xtickNames, rotation=45, fontsize=8)
        ax.set_ylabel("RMSE")
        ax.set_xlabel("Distribution")
        # Make grid and put it below boxes
        ax.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
        ax.set_axisbelow(True)

        # Plot boxplots for predictive density
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title("Predictive density")
        bp = ax.boxplot(pred_density[1:, :].T, notch=0, sym="+", vert=1, whis=1.5)
        plt.setp(bp["boxes"], color="black")
        plt.setp(bp["whiskers"], color="black")
        plt.setp(bp["fliers"], color="red", marker="+")
        xtickNames = plt.setp(ax, xticklabels=legends[1:])
        plt.setp(xtickNames, rotation=45, fontsize=8)
        ax.set_ylabel("Mean Log probability P(Y*|Y)")
        ax.set_xlabel("Distribution")
        # Make grid and put it below boxes
        ax.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
        ax.set_axisbelow(True)
    return mstu_t


# def precipitation_example():
#     import sklearn
#     from sklearn.cross_validation import KFold

#     data = datasets.boston_housing()
#     X = data["X"].copy()
#     Y = data["Y"].copy()
#     X = X - X.mean(axis=0)
#     X = X / X.std(axis=0)
#     Y = Y - Y.mean()
#     Y = Y / Y.std()
#     import ipdb

#     ipdb.set_trace()  # XXX BREAKPOINT
#     num_folds = 10
#     kf = KFold(len(Y), n_folds=num_folds, indices=True)
#     score_folds = np.zeros((4, num_folds))

#     def rmse(Y, Ystar):
#         return np.sqrt(np.mean((Y - Ystar) ** 2))

#     for train, test in kf:
#         for n, (train, test) in enumerate(kf):
#             X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
#             print("Fold {}".format(n))