Source code for coniferest.isoforest

from .coniferest import Coniferest, ConiferestEvaluator

__all__ = ["IsolationForest"]



[docs]
class IsolationForest(Coniferest):
    """
    Isolation forest.

    A reimplementation of sklearn.ensemble.IsolationForest that trains and
    evaluates much faster. Evaluation (sample scoring) can additionally be
    spread over several threads.

    Parameters
    ----------
    n_trees : int, optional
        How many trees the forest is built from. Default is 100.

    n_subsamples : int, optional
        How many subsamples are used for building the trees. Default is 256.

    max_depth : int or None, optional
        Upper bound on tree depth. With None, `log2(n_subsamples)` is used.

    n_jobs : int, default=-1
        Thread count used for evaluation. -1 means all available CPUs.

    random_seed : int or None, optional
        Seed for reproducibility. With None, a random seed is used.

    sampletrees_per_batch : int, optional
        Forwarded unchanged to the `Coniferest` base class. Default is
        `1 << 20`. NOTE(review): presumably limits how many (sample, tree)
        pairs are handled per evaluation batch -- confirm against the base
        class documentation.
    """

    def __init__(
        self,
        n_trees=100,
        n_subsamples=256,
        max_depth=None,
        n_jobs=-1,
        random_seed=None,
        sampletrees_per_batch=1 << 20,
    ):
        # The forest starts without trees; `fit` is what populates them.
        base_options = {
            "trees": [],
            "n_subsamples": n_subsamples,
            "max_depth": max_depth,
            "n_jobs": n_jobs,
            "random_seed": random_seed,
            "sampletrees_per_batch": sampletrees_per_batch,
        }
        super().__init__(**base_options)

        self.n_trees = n_trees
        # No evaluator exists until `fit` has been called.
        self.evaluator = None


[docs]
    def fit(self, data, labels=None):
        """
        Grow `n_trees` trees from the data and set up the evaluator.

        Parameters
        ----------
        data
            2-d array with features.

        labels
            Ignored by this method. Defaults to None.

        Returns
        -------
        self
        """
        grown_trees = self.build_trees(data, self.n_trees)
        self.trees = grown_trees
        # The evaluator is constructed from the forest itself, so it is
        # created only after the freshly built trees have been attached.
        self.evaluator = ConiferestEvaluator(self)
        return self



[docs]
    def score_samples(self, samples):
        """
        Score the given samples with the fitted forest.

        The work is delegated to the evaluator, which is created by `fit`
        and is None before that.

        Parameters
        ----------
        samples
            2-d array with features.

        Returns
        -------
        1-d array with scores.
        """
        scorer = self.evaluator
        return scorer.score_samples(samples)



[docs]
    def fit_known(self, data, known_data=None, known_labels=None):
        """
        Fit the forest on `data`; the known samples are not used.

        Parameters
        ----------
        data
            Passed to `fit` as its `data` argument.

        known_data
            Unused. Defaults to None.

        known_labels
            Unused. Defaults to None.

        Returns
        -------
        The result of `self.fit(data)`.
        """
        fitted = self.fit(data)
        return fitted



[docs]
    def feature_signature(self, x):
        """
        Return the evaluator's feature signature for `x`.

        Parameters
        ----------
        x
            Handed to the evaluator unchanged.
            NOTE(review): presumably a 2-d array with features -- confirm
            against `ConiferestEvaluator.feature_signature`.

        Returns
        -------
        Whatever the evaluator's `feature_signature` returns.
        """
        evaluator = self.evaluator
        return evaluator.feature_signature(x)



[docs]
    def feature_importance(self, x):
        """
        Return the evaluator's feature importance for `x`.

        Parameters
        ----------
        x
            Handed to the evaluator unchanged.
            NOTE(review): presumably a 2-d array with features -- confirm
            against `ConiferestEvaluator.feature_importance`.

        Returns
        -------
        Whatever the evaluator's `feature_importance` returns.
        """
        evaluator = self.evaluator
        return evaluator.feature_importance(x)



[docs]
    def apply(self, x, output=None):
        """
        Run the samples through the forest and report the leaves they reach.

        Parameters
        ----------
        x : ndarray shape (n_samples, n_features)
            2-d array with features.
        output : {"dense", "sparse"} or None, default=None
            Passed to the evaluator as is.
            If "dense", a dense array of leaf indices per tree is returned.
            If "sparse", a sparse CSR matrix of shape (n_samples, n_leaves)
            is returned, where each row has non-zero entries for the leaves
            reached by the sample.
            NOTE(review): None is presumably treated as "dense" by the
            evaluator -- confirm against `ConiferestEvaluator.apply`.

        Returns
        -------
        x_leafs : ndarray of shape (n_samples, n_estimators) or csr_matrix of shape (n_samples, n_leaves)
            In dense format: for every datapoint in x and every tree in the
            forest, the index of the leaf the datapoint ends up in.
            With output="sparse": a sparse matrix holding 1.0 in the entries
            where the sample reaches the leaf.
        """
        leaf_lookup = self.evaluator.apply
        return leaf_lookup(x, output)



[docs]
    def distance(self, x, y=None, *, method=None):
        """
        Build a distance matrix between samples from leaf co-occurrence.

        The distance equals 1 minus the fraction of trees in which two
        samples fall into the same leaf, i.e. it measures how dissimilar
        the samples' paths through the forest are.

        Parameters
        ----------
        x : ndarray shape (n_samples_x, n_features) or (n_features,)
            Input samples. A 1-D input is treated as a single sample.
        y : ndarray shape (n_samples_y, n_features) or (n_features,), optional
            Second set of samples for pairwise distance. With None (default),
            distances are computed between all pairs in x.
        method : {"common_leaf_ratio"} or None, default=None
            Distance computation method, passed to the evaluator as is.
            Currently only "common_leaf_ratio" is supported.
            NOTE(review): None is presumably resolved to "common_leaf_ratio"
            by the evaluator -- confirm against `ConiferestEvaluator.distance`.

        Returns
        -------
        distances : ndarray shape (n_samples_x, n_samples_y)
            Distance matrix in which distances[i, j] is the distance between
            the i-th sample in x and the j-th sample in y.
            If y is None, a square symmetric matrix of shape
            (n_samples_x, n_samples_x) is returned.

        Raises
        ------
        ValueError
            If method is not one of the known methods.
        """
        evaluator = self.evaluator
        return evaluator.distance(x, y, method=method)
Source code for coniferest.isoforest

coniferest

Navigation

Related Topics