Source code for forestatrisk.validate.model_validation

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# ==============================================================================
# author          :Ghislain Vieilledent
# email           :ghislain.vieilledent@cirad.fr, ghislainv@gmail.com
# web             :https://ecology.ghislainv.fr
# python_version  :>=2.7
# license         :GPLv3
# ==============================================================================

# Standard library imports
from __future__ import division, print_function  # Python 3 compatibility

# Third party imports
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Local application imports
from ..model import model_binomial_iCAR


# AUC (see Liu 2011)

[docs]
def computeAUC(pos_scores, neg_scores, n_sample=100000):
    """Compute the AUC index.

    Compute the Area Under the ROC Curve (AUC). See `Liu et al. 2011
    <https://doi.org/10.1111/j.1600-0587.2010.06354.x>`_\\ .

    :param pos_scores: Scores of positive observations.
    :param neg_scores: Scores of negative observations.
    :param n_samples: Number of samples to approximate AUC.

    :return: AUC value.

    """

    pos_scores = np.array(pos_scores, dtype=float)
    neg_scores = np.array(neg_scores, dtype=float)
    pos_sample = np.random.choice(pos_scores, size=n_sample, replace=True)
    neg_sample = np.random.choice(neg_scores, size=n_sample, replace=True)
    AUC = np.mean(1.0 * (pos_sample > neg_sample) + 0.5 * (pos_sample == neg_sample))

    return AUC



# accuracy_indices

[docs]
def accuracy_indices(pred, obs):
    """Compute accuracy indices.

    Compute the Overall Accuracy, the Figure of Merit, the
    Specificity, the Sensitivity, the True Skill Statistics and the
    Cohen's Kappa from a confusion matrix built on predictions
    vs. observations.

    :param pred: List of predictions.
    :param obs: List of observations.

    :return: A dictionnary of accuracy indices.

    """

    # Create pandas data-frame
    df = pd.DataFrame({"pred": pred, "obs": obs})

    # Confusion matrix
    n00 = sum((df["pred"] == 0) & (df["obs"] == 0))
    n10 = sum((df["pred"] == 1) & (df["obs"] == 0))
    n01 = sum((df["pred"] == 0) & (df["obs"] == 1))
    n11 = sum((df["pred"] == 1) & (df["obs"] == 1))

    # Accuracy indices
    N = n11 + n10 + n00 + n01
    OA = (n11 + n00) / N
    FOM = n11 / (n11 + n10 + n01)
    Sensitivity = n11 / (n11 + n01)
    Specificity = n00 / (n00 + n10)
    TSS = Sensitivity + Specificity - 1
    Prob_1and1 = (n11 + n10) * (n11 + n01)
    Prob_0and0 = (n00 + n01) * (n00 + n10)
    Expected_accuracy = (Prob_1and1 + Prob_0and0) / (N * N)
    Kappa = (OA - Expected_accuracy) / (1 - Expected_accuracy)

    r = {
        "OA": OA,
        "EA": Expected_accuracy,
        "FOM": FOM,
        "Sen": Sensitivity,
        "Spe": Specificity,
        "TSS": TSS,
        "K": Kappa,
    }

    return r



# cross_validation

[docs]
def cross_validation(
    data,
    formula,
    mod_type="icar",
    ratio=30,
    nrep=5,
    seed=1234,
    icar_args={
        "n_neighbors": None,
        "neighbors": None,
        "burnin": 1000,
        "mcmc": 1000,
        "thin": 1,
        "beta_start": 0,
    },
    rf_args={"n_estimators": 100, "n_jobs": None},
):
    """Model cross-validation

    Performs model cross-validation.

    :param data: Full dataset.
    :param formula: Model formula.
    :param mod_type: Model type, can be either "icar", "glm", or "rf".
    :param ratio: Percentage of data used for testing.
    :param nrep: Number of repetitions for cross-validation.
    :param seed: Seed for reproducibility.
    :param icar_args: Dictionnary of arguments for the binomial iCAR model.
    :param rf_args: Dictionnary of arguments for the random forest model.

    :return: A Pandas data frame with cross-validation results.

    """

    # Set random seed for reproducibility
    np.random.seed(seed)

    # Result table
    CV_df = pd.DataFrame(
        {"index": ["AUC", "OA", "EA", "FOM", "Sen", "Spe", "TSS", "K"]}
    )

    # Constants
    nobs = data.shape[0]
    nobs_test = int(round(nobs * (ratio / 100)))
    rows = np.arange(nobs)

    # Loop on repetitions
    for i in range(nrep):
        # Print message
        print("Repetition #: " + str(i + 1))

        # Data-sets for cross-validation
        rows_test = np.random.choice(rows, size=nobs_test, replace=False)
        rows_train = np.where(np.isin(rows, rows_test, invert=True))
        data_test = data.iloc[rows_test].copy()
        data_train = data.iloc[rows_train].copy()

        # True threshold in data_test (might be slightly different from 0.5)
        # nfor_test = sum(data_test.fcc23 == 1)
        ndefor_test = sum(data_test.fcc23 == 0)
        thresh_test = 1 - (ndefor_test / nobs_test)

        # Training matrices
        y, x = dmatrices(formula, data=data_train, NA_action="drop")
        Y_train = y[:, 0]
        X_train = x[:, :-1]  # We remove the last column (cells)
        # Test matrices
        y, x = dmatrices(formula, data=data_test, NA_action="drop")
        # Y_test = y[:, 0]
        X_test = x[:, :-1]  # We remove the last column (cells)

        # Compute deforestation probability
        # icar
        if mod_type == "icar":
            # Training the model
            mod_icar = model_binomial_iCAR(
                # Observations
                suitability_formula=formula,
                data=data_train,
                # Spatial structure
                n_neighbors=icar_args["n_neighbors"],
                neighbors=icar_args["neighbors"],
                # Chains
                burnin=icar_args["burnin"],
                mcmc=icar_args["mcmc"],
                thin=icar_args["thin"],
                # Starting values
                beta_start=icar_args["beta_start"],
            )
            # Predictions for the test dataset
            data_test["theta_pred"] = mod_icar.predict(new_data=data_test)
        # glm
        if mod_type == "glm":
            # Training the model
            glm = LogisticRegression(solver="lbfgs")
            mod_glm = glm.fit(X_train, Y_train)
            # Predictions for the test dataset
            data_test["theta_pred"] = mod_glm.predict_proba(X_test)[:, 1]
        # RF
        if mod_type == "rf":
            # Training the model
            rf = RandomForestClassifier(
                n_estimators=rf_args["n_estimators"], n_jobs=rf_args["n_jobs"]
            )
            mod_rf = rf.fit(X_train, Y_train)
            # Predictions for the test dataset
            data_test["theta_pred"] = mod_rf.predict_proba(X_test)[:, 1]

        # Transform probabilities into binary data
        proba_thresh = np.quantile(data_test["theta_pred"], thresh_test)
        data_test["pred"] = 0
        data_test.loc[data_test.theta_pred > proba_thresh, "pred"] = 1

        # AUC
        pos_scores = data_test.theta_pred[data_test.fcc23 == 0]
        neg_scores = data_test.theta_pred[data_test.fcc23 == 1]
        AUC = computeAUC(pos_scores, neg_scores)
        # Accuracy indices
        obs = 1 - data_test.fcc23
        pred = data_test.pred
        ai = accuracy_indices(obs, pred)

        # Tupple of indices
        acc_ind = (
            AUC,
            ai["OA"],
            ai["EA"],
            ai["FOM"],
            ai["Sen"],
            ai["Spe"],
            ai["TSS"],
            ai["K"],
        )

        # Results as data frame
        CV_df["rep" + str(i + 1)] = acc_ind

    # Mean over repetitions
    CV_values = CV_df.loc[:, CV_df.columns != "index"]
    CV_df["mean"] = np.mean(CV_values, axis=1)
    CV_df = CV_df.round(4)

    return CV_df



# End
forestatrisk

Navigation

Related Topics

Source code for forestatrisk.validate.model_validation