Source code for eazieda.eazieda

# from vega_datasets import data
# import pandas as pd
# import altair as alt
# import numpy as np


def corr_plot(
    data, features, method="pearson", plot_width=600, plot_height=400
):
    """
    Generates a correlation plot for a list of features in a given dataframe

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        The input dataframe
    features: list
        A list of strings that represents numerical feature names
        len(features) >= 2
    method: str, default = "pearson"
        The correlation method
        Other correlation methods are "spearman" or "kendall"
    plot_width: int, default = 600
        The width of the plot
    plot_height: int, default = 400
        The height of the plot

    Returns
    -------
    `altair plot`
        An interactive altair correlation plot

    Notes
    -----
    This function is currently a placeholder: its body is ``pass``, so
    calling it returns ``None`` until the implementation is added.

    Examples
    --------
    >>> from eazieda.eazieda import corr_plot
    >>> from vega_datasets import data
    >>> df = data.iris()
    >>> corr_plot(df, ["petal_length", "petal_width", "sepal_length"])
    """
    pass
def missing_impute(
    data, impute=False, method_num="mean", method_non_num="most_frequent"
):
    """
    Return the number/percentage of missing values for each column in the
    dataframe as well as giving the choice of imputing the missing values
    in place

    Parameters
    ----------
    data : pandas.core.frame.DataFrame
        A Pandas Dataframe for which the missing values need to be detected
    impute : bool, default = False
        Whether to impute the missing values in place.
    method_num : str, default = "mean"
        The method used for imputing numerical missing values
        This is only applicable if impute=True
        One of 'drop', 'mean', 'median'
    method_non_num: str, default = "most_frequent"
        The method used for imputing non-numerical missing values
        This is only applicable if impute=True
        One of 'drop', 'most_frequent'

    Returns
    -------
    pandas.core.frame.DataFrame
        A dataframe containing two columns: the number of missing values
        and the percentage of missing values for each column

    Notes
    -----
    This function is currently a placeholder: its body is ``pass``, so
    calling it returns ``None`` until the implementation is added.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from eazieda.eazieda import missing_impute
    >>> df = pd.DataFrame([[1, "x"], [np.nan, "y"], [2, np.nan], [3, "y"]],
    ...                   columns = ['a', 'b'])
    >>> missing_impute(df)
       n_missing percent
    a          1     25%
    b          1     25%
    """
    pass