from .coniferest import Coniferest, ConiferestEvaluator
__all__ = ["IsolationForest"]
# [docs] -- Sphinx "viewcode" link marker left over from HTML extraction; not part of the module.
class IsolationForest(Coniferest):
    """
    Isolation forest.

    This is a reimplementation of sklearn.ensemble.IsolationForest,
    which trains and evaluates much faster. It also supports multi-threading
    for evaluation (sample scoring).

    Parameters
    ----------
    n_trees : int, default=100
        Number of trees in forest to build.
    n_subsamples : int, default=256
        Number of subsamples to use for building the trees.
    max_depth : int or None, default=None
        Maximal tree depth. If None, `log2(n_subsamples)` is used.
    n_jobs : int, default=-1
        Number of threads to use for evaluation. If -1, use all available CPUs.
    random_seed : int or None, default=None
        Seed for reproducibility. If None, random seed is used.
    sampletrees_per_batch : int, default=1 << 20
        Forwarded unchanged to the ``Coniferest`` base class.
        NOTE(review): presumably limits the amount of sample-tree work done
        per evaluation batch -- confirm against the base class.
    """

    def __init__(
        self,
        n_trees=100,
        n_subsamples=256,
        max_depth=None,
        n_jobs=-1,
        random_seed=None,
        sampletrees_per_batch=1 << 20,
    ):
        # The forest starts empty; the trees are built by `fit`.
        super().__init__(
            trees=[],
            n_subsamples=n_subsamples,
            max_depth=max_depth,
            n_jobs=n_jobs,
            random_seed=random_seed,
            sampletrees_per_batch=sampletrees_per_batch,
        )
        self.n_trees = n_trees
        # Set by `fit`; None means "not fitted yet".
        self.evaluator = None

    def _fitted_evaluator(self):
        """
        Return the evaluator, failing with a clear message if `fit` was not called.

        Raises
        ------
        AttributeError
            If the forest has no evaluator yet. AttributeError is used on
            purpose: it is the exception type an unfitted forest produced
            before this check existed, so existing handlers keep working.
        """
        evaluator = self.evaluator
        if evaluator is None:
            raise AttributeError(
                "{} is not fitted yet; call fit() before using this method".format(
                    type(self).__name__
                )
            )
        return evaluator

    def fit(self, data, labels=None):
        """
        Build the trees based on data.

        Parameters
        ----------
        data
            2-d array with features.
        labels
            Unused. Defaults to None.

        Returns
        -------
        self
        """
        self.trees = self.build_trees(data, self.n_trees)
        # The evaluator is created from the forest once the new trees are in place.
        self.evaluator = ConiferestEvaluator(self)
        return self

    def score_samples(self, samples):
        """
        Compute scores for given samples.

        Parameters
        ----------
        samples
            2-d array with features.

        Returns
        -------
        1-d array with scores.

        Raises
        ------
        AttributeError
            If the forest has not been fitted yet.
        """
        return self._fitted_evaluator().score_samples(samples)

    def fit_known(self, data, known_data=None, known_labels=None):
        """
        Fit the forest on data, ignoring any known (labeled) samples.

        Parameters
        ----------
        data
            2-d array with features.
        known_data
            Unused. Defaults to None.
        known_labels
            Unused. Defaults to None.

        Returns
        -------
        self
        """
        return self.fit(data)

    def feature_signature(self, x):
        """
        Delegate to the fitted evaluator's `feature_signature`.

        Parameters
        ----------
        x
            Samples, passed to the evaluator unchanged.

        Returns
        -------
        Whatever `ConiferestEvaluator.feature_signature` returns for `x`.

        Raises
        ------
        AttributeError
            If the forest has not been fitted yet.
        """
        return self._fitted_evaluator().feature_signature(x)

    def feature_importance(self, x):
        """
        Delegate to the fitted evaluator's `feature_importance`.

        Parameters
        ----------
        x
            Samples, passed to the evaluator unchanged.

        Returns
        -------
        Whatever `ConiferestEvaluator.feature_importance` returns for `x`.

        Raises
        ------
        AttributeError
            If the forest has not been fitted yet.
        """
        return self._fitted_evaluator().feature_importance(x)

    def apply(self, x, output=None):
        """
        Apply the forest to X, return leaf indices.

        Parameters
        ----------
        x : ndarray shape (n_samples, n_features)
            2-d array with features.
        output : {"dense", "sparse"} or None, default=None
            If "dense", returns a dense array of leaf indices per tree.
            If "sparse", returns a sparse CSR matrix of shape (n_samples, n_leaves)
            where each row has non-zero entries for leaves reached by the sample.
            None is forwarded to the evaluator as is.
            NOTE(review): presumably None is treated as "dense" -- confirm
            against `ConiferestEvaluator.apply`.

        Returns
        -------
        x_leafs : ndarray of shape (n_samples, n_estimators) or csr_matrix of shape (n_samples, n_leaves)
            For each datapoint x in X and for each tree in the forest,
            return the index of the leaf x ends up in (dense format).
            If output="sparse", returns a sparse matrix with 1.0 in entries where
            sample reaches the leaf.

        Raises
        ------
        AttributeError
            If the forest has not been fitted yet.
        """
        return self._fitted_evaluator().apply(x, output)

    def distance(self, x, y=None, *, method=None):
        """
        Compute distance matrix between samples based on leaf co-occurrence.

        The distance is defined as 1 minus the fraction of trees where two samples
        land in the same leaf. This gives a measure of dissimilarity between
        samples based on their paths through the forest.

        Parameters
        ----------
        x : ndarray shape (n_samples_x, n_features) or (n_features,)
            Input samples. If 1-D, treated as a single sample.
        y : ndarray shape (n_samples_y, n_features) or (n_features,), optional
            Second set of samples for pairwise distance. If None (default),
            computes distances between all pairs in x.
        method : {"common_leaf_ratio"} or None, default=None
            Distance computation method. Currently only "common_leaf_ratio"
            is supported. None is forwarded to the evaluator as is.
            NOTE(review): presumably None selects "common_leaf_ratio" --
            confirm against `ConiferestEvaluator.distance`.

        Returns
        -------
        distances : ndarray shape (n_samples_x, n_samples_y)
            Distance matrix where distances[i, j] is the distance between
            the i-th sample in x and j-th sample in y.
            If y is None, returns a square symmetric matrix of shape
            (n_samples_x, n_samples_x).

        Raises
        ------
        ValueError
            If method is not one of the known methods.
        AttributeError
            If the forest has not been fitted yet.
        """
        return self._fitted_evaluator().distance(x, y, method=method)