我喜欢把所有东西都储存在 `pandas` 中,并尽可能地用 `DataFrame.plot()` 来绘制。
from matplotlib import pyplot as plt
from pandas.core.frame import DataFrame
import scipy.stats as stats
import statsmodels.api as sm
def linear_regression(df: DataFrame) -> DataFrame:
    """Perform a univariate regression and store results in a new data frame.

    The model is fitted with ``sm.OLS`` using column ``y`` as the response
    and column ``x`` as the only regressor. Note that ``sm.OLS`` does not add
    an intercept on its own and none is added here, so the fitted line goes
    through the origin.

    Args:
        df (DataFrame): original data set with columns ``x`` and ``y``.

    Returns:
        DataFrame: a copy of ``df`` with four extra columns:
        ``resid`` (raw residuals), ``fittedvalues``, ``resid_std``
        (the model's ``resid_pearson``) and ``leverage`` (diagonal of the
        hat matrix).
    """
    mod = sm.OLS(endog=df['y'], exog=df['x']).fit()
    influence = mod.get_influence()

    # Work on a copy so the caller's frame is left untouched.
    res = df.copy()
    res['resid'] = mod.resid
    res['fittedvalues'] = mod.fittedvalues
    res['resid_std'] = mod.resid_pearson
    res['leverage'] = influence.hat_matrix_diag
    return res
def plot_diagnosis(df: DataFrame):
    """Draw a 2x2 grid of regression diagnostic plots.

    The four panels are: residuals vs. fitted values, a Q-Q plot of the
    residuals, a scale-location plot, and standardized residuals vs.
    leverage. The figure is laid out with ``tight_layout`` but is neither
    shown nor returned.

    Args:
        df (DataFrame): frame holding the columns ``fittedvalues``,
            ``resid``, ``resid_std`` and ``leverage`` (the columns that
            ``linear_regression`` adds to its result).
    """
    # The style has to be active before the figure is created, otherwise the
    # already-built axes keep the previous look. Matplotlib 3.6 renamed the
    # bundled seaborn styles, so fall back to the new name when the old one
    # is not listed.
    style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
    plt.style.use(style)
    fig, axes = plt.subplots(nrows=2, ncols=2)

    # Residual against fitted values.
    df.plot.scatter(
        x='fittedvalues', y='resid', ax=axes[0, 0]
    )
    axes[0, 0].axhline(y=0, color='grey', linestyle='dashed')
    axes[0, 0].set_xlabel('Fitted Values')
    axes[0, 0].set_ylabel('Residuals')
    axes[0, 0].set_title('Residuals vs Fitted')

    # qqplot
    # NOTE(review): the reference distribution is Student's t (fitted to the
    # data) while the panel title says "Normal" -- confirm which is intended.
    sm.qqplot(
        df['resid'], dist=stats.t, fit=True, line='45',
        ax=axes[0, 1], c='#4C72B0'
    )
    axes[0, 1].set_title('Normal Q-Q')

    # The scale-location plot: the y axis is sqrt(|standardized residual|),
    # so transform the column instead of plotting it raw. The values are
    # non-negative, hence no zero reference line on this panel.
    scale_location = df.assign(
        sqrt_abs_resid_std=df['resid_std'].abs().pow(0.5)
    )
    scale_location.plot.scatter(
        x='fittedvalues', y='sqrt_abs_resid_std', ax=axes[1, 0]
    )
    axes[1, 0].set_xlabel('Fitted values')
    axes[1, 0].set_ylabel('Sqrt(|standardized residuals|)')
    axes[1, 0].set_title('Scale-Location')

    # Standardized residuals vs. leverage (untransformed residuals here).
    df.plot.scatter(
        x='leverage', y='resid_std', ax=axes[1, 1]
    )
    axes[1, 1].axhline(y=0, color='grey', linestyle='dashed')
    axes[1, 1].set_xlabel('Leverage')
    axes[1, 1].set_ylabel('Standardized residuals')
    axes[1, 1].set_title('Residuals vs Leverage')

    # Lay out this specific figure rather than whatever is "current".
    fig.tight_layout()