Source code for coniferest.isoforest

from .coniferest import Coniferest, ConiferestEvaluator

__all__ = ["IsolationForest"]


[docs] class IsolationForest(Coniferest): """ Isolation forest. This is a reimplementation of sklearn.ensemble.IsolationForest, which trains and evaluates much faster. It also supports multi-threading for evaluation (sample scoring). Parameters ---------- n_trees : int, optional Number of trees in forest to build. n_subsamples : int, optional Number of subsamples to use for building the trees. max_depth : int or None, optional Maximal tree depth. If None, `log2(n_subsamples)` is used. n_jobs : int, default=-1 Number of threads to use for evaluation. If -1, use all available CPUs. random_seed : int or None, optional Seed for reproducibility. If None, random seed is used. """ def __init__( self, n_trees=100, n_subsamples=256, max_depth=None, n_jobs=-1, random_seed=None, sampletrees_per_batch=1 << 20, ): super().__init__( trees=[], n_subsamples=n_subsamples, max_depth=max_depth, n_jobs=n_jobs, random_seed=random_seed, sampletrees_per_batch=sampletrees_per_batch, ) self.n_trees = n_trees self.evaluator = None
[docs] def fit(self, data, labels=None): """ Build the trees based on data. Parameters ---------- data 2-d array with features. labels Unused. Defaults to None. Returns ------- self """ self.trees = self.build_trees(data, self.n_trees) self.evaluator = ConiferestEvaluator(self) return self
[docs] def score_samples(self, samples): """ Compute scores for given samples. Parameters ---------- samples 2-d array with features. Returns ------- 1-d array with scores. """ return self.evaluator.score_samples(samples)
[docs] def fit_known(self, data, known_data=None, known_labels=None): return self.fit(data)
[docs] def feature_signature(self, x): return self.evaluator.feature_signature(x)
[docs] def feature_importance(self, x): return self.evaluator.feature_importance(x)
[docs] def apply(self, x, output=None): """ Apply the forest to X, return leaf indices. Parameters ---------- x : ndarray shape (n_samples, n_features) 2-d array with features. output : {"dense", "sparse"}, default="dense" If "dense", returns a dense array of leaf indices per tree. If "sparse", returns a sparse CSR matrix of shape (n_samples, n_leaves) where each row has non-zero entries for leaves reached by the sample. Returns ------- x_leafs : ndarray of shape (n_samples, n_estimators) or csr_matrix of shape (n_samples, n_leaves) For each datapoint x in X and for each tree in the forest, return the index of the leaf x ends up in (dense format). If output="sparse", returns a sparse matrix with 1.0 in entries where sample reaches the leaf. """ return self.evaluator.apply(x, output)
[docs] def distance(self, x, y=None, *, method=None): """ Compute distance matrix between samples based on leaf co-occurrence. The distance is defined as 1 minus the fraction of trees where two samples land in the same leaf. This gives a measure of dissimilarity between samples based on their paths through the forest. Parameters ---------- x : ndarray shape (n_samples_x, n_features) or (n_features,) Input samples. If 1-D, treated as a single sample. y : ndarray shape (n_samples_y, n_features) or (n_features,), optional Second set of samples for pairwise distance. If None (default), computes distances between all pairs in x. method : {"common_leaf_ratio"}, default="common_leaf_ratio" Distance computation method. Currently only "common_leaf_ratio" is supported. Returns ------- distances : ndarray shape (n_samples_x, n_samples_y) Distance matrix where distances[i, j] is the distance between the i-th sample in x and j-th sample in y. If y is None, returns a square symmetric matrix of shape (n_samples_x, n_samples_x). Raises ------ ValueError If method is not one of the known methods. """ return self.evaluator.distance(x, y, method=method)