Source code for eazieda.corr_plot

import pandas as pd
import altair as alt
import numpy as np


def corr_plot(
    data, features=None, method="pearson", plot_width=500, plot_height=400
):
    """
    Generate a correlation plot for numeric features of a dataframe.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        The input dataframe
    features: list, default = None
        A list of strings that represents feature names
        len(features) >= 2
        None returns plot of all numeric features
    method: str, default = "pearson"
        The correlation method
        Other correlation methods are "spearman" or "kendall"
    plot_width: int, default = 500
        The width of the plot
    plot_height: int, default = 400
        The height of the plot

    Returns
    -------
    `altair plot`
        An interactive altair correlation plot

    Raises
    ------
    TypeError
        If `data` is not a DataFrame, `features` is not a list, `method`
        is not a str, or `plot_width` / `plot_height` is not an int
    ValueError
        If fewer than two features are passed, `method` is not one of the
        supported options, or fewer than two numeric columns remain
    KeyError
        If a name in `features` is not a column of `data`

    Examples
    --------
    >>> from eazieda.corr_plot import corr_plot
    >>> from vega_datasets import data
    >>> df = data.iris()
    >>> corr_plot(df, ["petal_length", "petal_width", "sepal_length"],
    ...           "pearson")
    """
    # ---- Input validation -------------------------------------------------
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Please pass in a Pandas DataFrame for `data`")

    if features is not None:
        if not isinstance(features, list):
            raise TypeError("Please pass in a list for `features`")
        if len(features) < 2:
            raise ValueError("At least two features should be selected")

    if not isinstance(method, str):
        raise TypeError("Please pass in a str for `method`")
    if method not in ("pearson", "spearman", "kendall"):
        # ValueError is still caught by callers using `except Exception`.
        raise ValueError(
            "Please pick a correlation method: 'pearson', 'spearman' or"
            " 'kendall'"
        )

    if not isinstance(plot_width, int) or not isinstance(plot_height, int):
        raise TypeError("Both plot_width and plot_height must be integers")

    # ---- Keep only the numeric columns -------------------------------------
    # `np.number` is used on both paths so that "all numeric features" means
    # the same thing whether or not `features` is given.
    candidates = data if features is None else data[features]
    numeric_data = candidates.select_dtypes(include=np.number)
    if numeric_data.shape[1] < 2:
        raise ValueError(
            "Dataframe should have at least two numerical features"
        )

    # ---- Long-format correlation table -------------------------------------
    corr_df = numeric_data.corr(method).stack().reset_index(name="corr")

    # Blank out only the self-correlations (the diagonal) so they do not
    # dominate the size/colour scales. Matching on the feature pair, rather
    # than on `corr == 1`, keeps perfectly correlated distinct features.
    on_diagonal = corr_df["level_0"] == corr_df["level_1"]
    corr_df.loc[on_diagonal, "corr"] = 0
    corr_df["abs"] = corr_df["corr"].abs()

    # ---- Correlation plot ---------------------------------------------------
    chart = (
        alt.Chart(
            corr_df, title=f"{method} Correlations Plot for Numerical Features"
        )
        .mark_circle()
        .encode(
            x=alt.X("level_0", title="Numerical Features"),
            y=alt.Y("level_1", title="Numerical Features"),
            size=alt.Size("abs", title="Correlation Size"),
            color=alt.Color(
                "corr",
                title="Correlation",
                scale=alt.Scale(scheme="blueorange"),
            ),
            tooltip=alt.Tooltip("corr"),
        )
        .properties(width=plot_width, height=plot_height)
    )

    return chart