# Source code for eazieda.missing_impute
import pandas as pd
import numpy as np
def missing_impute(data, method_num="mean", method_non_num="most_frequent"):
    """
    Return an imputed copy of ``data`` using the selected methods.

    Numerical columns (those matched by ``select_dtypes(include=np.number)``)
    are handled first, then every remaining column is treated as
    non-numerical. The input dataframe is never modified.

    Parameters
    ----------
    data : pandas.core.frame.DataFrame
        A Pandas Dataframe whose missing values need to be imputed
    method_num : str, default = "mean"
        The method used for imputing numerical missing values.
        One of 'drop', 'mean', 'median'
    method_non_num : str, default = "most_frequent"
        The method used for imputing non-numerical
        missing values. One of 'drop', 'most_frequent'

    Returns
    -------
    pandas.core.frame.DataFrame
        An imputed dataframe with a fresh 0..n-1 index

    Raises
    ------
    TypeError
        If ``data`` is not a Pandas DataFrame
    ValueError
        If ``method_num`` or ``method_non_num`` is not a supported option

    Examples
    --------
    >>> from eazieda.missing_impute import missing_impute
    >>> df = pd.DataFrame([[1, "x"], [np.nan, "y"], [2, np.nan], [3, "y"]],
    ...                   columns=['a', 'b'])
    >>> missing_impute(df)
         a  b
    0  1.0  x
    1  2.0  y
    2  2.0  y
    3  3.0  y
    """
    # Tests whether input data is of pd.DataFrame type
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Please pass in a Pandas DataFrame for `data`")

    # Tests whether input method_num is one of the option
    if method_num not in ["drop", "mean", "median"]:
        raise ValueError(
            "Please enter one the following option: 'drop', 'mean', 'median'"
        )

    # Tests whether input method_non_num is one of the option
    if method_non_num not in ["drop", "most_frequent"]:
        raise ValueError(
            "Please enter one the the option: 'drop', 'most_frequent'"
        )

    # filter out the numerical columns and non-numerical columns
    num_columns = data.select_dtypes(include=np.number).columns.tolist()
    non_num_columns = [col for col in data.columns if col not in num_columns]

    # Work on a copy so the caller's dataframe is left untouched
    imputed_df = data.copy()

    # impute for numerical columns:
    if method_num == "drop":
        imputed_df = imputed_df.dropna(axis=0, subset=num_columns)
    elif num_columns:
        numeric_part = imputed_df[num_columns]
        # One statistic per column, computed once for the whole sub-frame
        if method_num == "mean":
            fill_values = numeric_part.mean()
        else:
            fill_values = numeric_part.median()
        # fillna with a Series fills each column with its matching entry
        imputed_df[num_columns] = numeric_part.fillna(fill_values)

    # impute for non-numerical columns:
    if method_non_num == "drop":
        imputed_df = imputed_df.dropna(axis=0, subset=non_num_columns)
    else:
        for non_num_column in non_num_columns:
            column = imputed_df[non_num_column]
            # A column without any observed value has no mode; leave it
            # as-is instead of failing on an empty value_counts() index
            if not column.notna().any():
                continue
            # value_counts() is already sorted by descending frequency
            most_frequent = column.value_counts().index[0]
            imputed_df[non_num_column] = column.fillna(most_frequent)

    # Rows may have been dropped, so hand back a contiguous index
    return imputed_df.reset_index(drop=True)