import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import IsolationForest
def outliers_detect(s, method="zscore"):
    """
    Detect outliers in a pandas Series with the selected method.

    This is a thin dispatcher: it validates the input type and forwards
    the series to ``outliers_detect_zscore``, ``outliers_detect_iqr`` or
    ``outliers_detect_iforest`` using that function's default settings.

    Parameters
    ----------
    s : pandas.core.series.Series
        Pandas Series for which the outliers need to be found.
    method : str, default = "zscore"
        The algorithm/method used for outlier detection.
        One of 'zscore', 'iforest', 'iqr'.

    Returns
    -------
    numpy.array
        Boolean array with same length as the input,
        indices of outliers marked with True.

    Raises
    ------
    TypeError
        If `s` is not a pandas Series.
    ValueError
        If `method` is not one of the supported names.

    Examples
    --------
    >>> import pandas as pd
    >>> from eazieda.outliers_detect import outliers_detect
    >>> s = pd.Series([1,1,1,1,1,1,1,1,1,1,1e14])
    >>> outliers_detect(s)
    array([False, False, False, False, False, False, False, False, False,
           False,  True])
    """
    if not isinstance(s, pd.Series):
        raise TypeError("s should be a pandas series")

    # Each branch returns directly; anything that falls through is an
    # unsupported method name.
    if method == "zscore":
        return outliers_detect_zscore(s)
    if method == "iqr":
        return outliers_detect_iqr(s)
    if method == "iforest":
        return outliers_detect_iforest(s)

    raise ValueError("Invalid method. should be zscore, iqr or iforest")
def outliers_detect_iforest(s, random_state=None):
    """
    Detect outliers in a pandas Series using an isolation forest.

    The series is treated as a single feature column, an
    ``IsolationForest`` with default settings is fitted on it, and the
    samples the forest labels ``-1`` are reported as outliers.

    Parameters
    ----------
    s : pandas.core.series.Series
        Pandas Series for which the outliers need to be found.
    random_state : int, RandomState instance or None, default = None
        Forwarded to ``IsolationForest``. Pass an int to make the result
        reproducible; with None the result may differ between calls.

    Returns
    -------
    numpy.array
        Boolean array with same length as the input,
        indices of outliers marked with True. An empty series yields an
        empty boolean array.

    Examples
    --------
    >>> import pandas as pd
    >>> from eazieda.outliers_detect import outliers_detect_iforest
    >>> s = pd.Series([1,2,1,2,1, 1000])
    >>> outliers_detect_iforest(s, random_state=0)  # doctest: +SKIP
    array([False, False, False, False, False,  True])
    """
    values = s.to_numpy()

    # Nothing to fit on: return an empty mask, consistent with the other
    # detectors, instead of handing zero samples to the estimator.
    if values.size == 0:
        return np.zeros(0, dtype=bool)

    # scikit-learn estimators expect a 2-D (n_samples, n_features) input.
    features = values.reshape(-1, 1)
    labels = IsolationForest(random_state=random_state).fit_predict(features)

    # The estimator labels outliers -1 and inliers 1.
    return labels == -1
def outliers_detect_iqr(s, factor=1.5):
    """
    Detect outliers in a pandas Series using the inter-quartile range.

    A value is an outlier when it lies below ``q1 - factor * iqr`` or
    above ``q3 + factor * iqr``, where ``q1``/``q3`` are the 0.25 and
    0.75 quantiles of the series and ``iqr = q3 - q1``.

    Parameters
    ----------
    s : pandas.core.series.Series
        Pandas Series for which the outliers need to be found.
    factor : int or float, default = 1.5
        Multiplier applied to the inter-quartile range to set the fences.

    Returns
    -------
    numpy.array
        Boolean array with same length as the input,
        indices of outliers marked with True. Missing values are never
        marked as outliers.

    Examples
    --------
    >>> import pandas as pd
    >>> from eazieda.outliers_detect import outliers_detect_iqr
    >>> s = pd.Series([1,2,1,2,1, 1000])
    >>> outliers_detect_iqr(s)
    array([False, False, False, False, False,  True])
    """
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    spread = factor * (q3 - q1)

    below = s < (q1 - spread)
    above = s > (q3 + spread)

    # For nullable dtypes the comparison yields a pandas BooleanArray that
    # may hold <NA>; convert explicitly so callers always get a plain
    # numpy bool array, with missing entries treated as "not an outlier".
    return (below | above).to_numpy(dtype=bool, na_value=False)
def outliers_detect_zscore(s, threshold=3):
    """
    Detect outliers in a pandas Series using z-scores.

    A value is an outlier when the absolute value of its z-score is
    strictly greater than `threshold`.

    Parameters
    ----------
    s : pandas.core.series.Series
        Pandas Series for which the outliers need to be found.
    threshold : int or float, default = 3
        Absolute z-score above which a value is marked as an outlier.

    Returns
    -------
    numpy.array
        Boolean array with same length as the input,
        indices of outliers marked with True. Missing values are ignored
        when computing the mean and standard deviation and are never
        marked as outliers.

    Examples
    --------
    >>> import pandas as pd
    >>> from eazieda.outliers_detect import outliers_detect_zscore
    >>> s = pd.Series([1,1,1,1,1,1,1,1,1,1,1e14])
    >>> outliers_detect_zscore(s)
    array([False, False, False, False, False, False, False, False, False,
           False,  True])
    """
    # Work on a plain float array so missing values are uniformly NaN.
    values = s.to_numpy(dtype=float, na_value=np.nan)
    if values.size == 0:
        return np.zeros(0, dtype=bool)

    # With the default nan_policy a single NaN turns every z-score into
    # NaN, which silently hides all outliers; "omit" keeps NaN only at the
    # positions that were missing.
    with np.errstate(invalid="ignore", divide="ignore"):
        z = np.abs(stats.zscore(values, nan_policy="omit"))
        # NaN > threshold evaluates to False, so missing/undefined scores
        # are reported as non-outliers.
        flagged = z > threshold
    return flagged
def remove_outliers(s, outliers, inplace=False):
    """
    Drop outliers from the given series.

    Rows are removed by position, so a series whose index contains
    duplicate labels only loses the rows actually marked as outliers.

    Parameters
    ----------
    s : pandas.core.series.Series
        Pandas Series from which the outliers should be removed.
    outliers : numpy.array
        Boolean numpy array with the same length as s.
        Outliers should be marked with True.
    inplace : boolean, default = False
        Do the removal in place.

    Returns
    -------
    None or pd.Series
        Series with outliers removed. None if inplace=True.

    Raises
    ------
    TypeError
        If `s` is not a pandas Series or `outliers` is not a numpy array.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from eazieda.outliers_detect import remove_outliers
    >>> s = pd.Series([1,1e14])
    >>> outliers = np.array([False, True])
    >>> remove_outliers(s, outliers)
    0    1.0
    dtype: float64
    """
    if not isinstance(s, pd.Series):
        raise TypeError("s should be a pandas series")
    if not isinstance(outliers, np.ndarray):
        raise TypeError("outliers should be numpy array")

    # Resolve the selection to integer positions with the same indexing
    # semantics as ``s.index[outliers]``.
    positions = np.arange(len(s))[outliers]
    keep = np.ones(len(s), dtype=bool)
    keep[positions] = False

    if not inplace:
        # Boolean masking is positional, unlike a label-based drop, so
        # duplicate index labels cannot remove unmarked rows.
        return s[keep]

    if s.index.is_unique:
        s.drop(s.index[positions], inplace=True)
        return None

    # Duplicate labels: a label-based drop would also remove non-outliers
    # sharing a label with an outlier. Temporarily give every row a unique
    # positional label, drop by that, then restore the surviving labels.
    original_index = s.index
    s.index = pd.RangeIndex(len(s))
    s.drop(positions, inplace=True)
    s.index = original_index[keep]
    return None