Source code for eazieda.missing_impute

import pandas as pd
import numpy as np


def missing_impute(
    data: pd.DataFrame,
    method_num: str = "mean",
    method_non_num: str = "most_frequent",
) -> pd.DataFrame:
    """
    Return an imputed copy of `data` based on the selected methods.

    Numerical columns (as selected by ``select_dtypes(include=np.number)``)
    and all remaining columns are handled separately. The input dataframe
    is never modified; the returned dataframe has a fresh 0..n-1 index.

    Parameters
    ----------
    data : pandas.core.frame.DataFrame
        A Pandas DataFrame whose missing values need to be imputed.
    method_num : str, default = "mean"
        The method used for imputing numerical missing values.
        One of 'drop', 'mean', 'median'. 'drop' removes every row that has
        a missing value in any numerical column.
    method_non_num : str, default = "most_frequent"
        The method used for imputing non-numerical missing values.
        One of 'drop', 'most_frequent'. 'drop' removes every row that has
        a missing value in any non-numerical column.

    Returns
    -------
    pandas.core.frame.DataFrame
        An imputed dataframe with its index reset.

    Raises
    ------
    TypeError
        If `data` is not a Pandas DataFrame.
    ValueError
        If `method_num` or `method_non_num` is not one of the supported
        options.

    Notes
    -----
    A non-numerical column that contains no observed values has no most
    frequent value and is therefore returned unchanged.

    Examples
    --------
    >>> from eazieda.missing_impute import missing_impute
    >>> df = pd.DataFrame([[1, "x"], [np.nan, "y"], [2, np.nan], [3, "y"]],
    ...                   columns=['a', 'b'])
    >>> missing_impute(df)
         a  b
    0  1.0  x
    1  2.0  y
    2  2.0  y
    3  3.0  y
    """
    # Tests whether input data is of pd.DataFrame type
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Please pass in a Pandas DataFrame for `data`")

    # Tests whether input method_num is one of the option
    if method_num not in ["drop", "mean", "median"]:
        raise ValueError(
            "Please enter one the following option: 'drop', 'mean', 'median'"
        )

    # Tests whether input method_non_num is one of the option
    if method_non_num not in ["drop", "most_frequent"]:
        raise ValueError(
            "Please enter one the the option: 'drop', 'most_frequent'"
        )

    # Split the columns into numerical and non-numerical ones
    num_columns = data.select_dtypes(include=np.number).columns.tolist()
    non_num_columns = [col for col in data.columns if col not in num_columns]

    # Work on a copy so the caller's dataframe is left untouched
    imputed_df = data.copy()

    # Impute numerical columns
    if method_num == "drop":
        imputed_df = imputed_df.dropna(axis=0, subset=num_columns)
    elif num_columns:
        numeric = imputed_df[num_columns]
        # One statistic per column; fillna aligns the Series on column
        # names, so every numerical column is imputed in a single pass.
        fill_values = (
            numeric.mean() if method_num == "mean" else numeric.median()
        )
        imputed_df[num_columns] = numeric.fillna(fill_values)

    # Impute non-numerical columns
    if method_non_num == "drop":
        imputed_df = imputed_df.dropna(axis=0, subset=non_num_columns)
    else:
        for non_num_column in non_num_columns:
            # value_counts() ignores missing values and is already sorted
            # by descending frequency, so the first label is the mode.
            counts = imputed_df[non_num_column].value_counts()
            if counts.empty:
                # Nothing observed in this column: no value to impute with.
                continue
            imputed_df[non_num_column] = imputed_df[non_num_column].fillna(
                counts.index[0]
            )

    return imputed_df.reset_index(drop=True)