Source code for eazieda.eazieda

# from vega_datasets import data
# import pandas as pd
# import altair as alt
# import numpy as np


[docs]def corr_plot(
        data,
        features,
        method="pearson",
        plot_width=600,
        plot_height=400):
    """
    Generates a correlation plot for a list of features in a given dataframe

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        The input dataframe
    features: list
        A list of strings that represents numerical feature names
        len(features) >=2 

    method: str, default = "pearson"
        The correlation method
        Other correlation methods are "spearman" or "kendall"

    plot_width: int, default = 600
        The width of the plot

    plot_height: int, default = 400
        The height of the plot

    Returns
    -------
    `altair plot`
        An interactive altair correlation plot

    Examples
    --------
    >>> from eazieda.eazieda import corr_plot
    >>> from vega_datasets import data
    >>> df = data.iris()
    >>> corr_plot(df, ["petal_length", "petal_width", "sepal_length"])
    """
    pass

[docs]def missing_impute(
        data,
        impute=False,
        method_num="mean",
        method_non_num="most_frequent"
):
    """
    Return the number/percentage of missing values for each column 
    in the dataframe as well as giving the
    choice of imputing the missing values in place

    Parameters
    ----------
    data : pandas.core.frame.DataFrame
        A Pandas Dataframe for which the missing values need to be detected

    impute : bool, default = False
        Whether to impute the missing values in place.

    method_num : str, default = "mean"
        The method used for imputing numerical missing values
        This is only applicable if impute=True
        One of 'drop', mean', 'median'

    method_non_num: str, default = "most_frequent"
        The method used for imputing non-numerical missing values 
        This is only applicable if impute=True
        One of 'drop', 'most_frequent'

    Returns
    -------
    pandas.core.frame.DataFrame
        A dataframe containing two columns: the number of missing values and 
        the percentage of missing values for each column

    Examples
    --------
    >>> from eazieda.eazieda import missing_impute
    >>> df = pd.DataFrame([[1, "x"], [np.nan, "y"], [2, np.nan], [3, "y"]], columns = ['a', 'b'])
    >>> missing_impute(df)
        n_missing	percent
    a	1	        25%
    b	1	        25%
    """
    pass