In [1]: import numpy as np
   ...: from scipy import stats
In [2]: import statsmodels.api as sm
In [3]: import matplotlib
In [4]: import matplotlib.pyplot as plt
In [5]: from statsmodels.sandbox.regression.predstd import wls_prediction_std
Fix a random seed so these examples are reproducible:
In [6]: np.random.seed(9876789)
In [7]: nsample = 50
In [8]: sig = 0.5
In [9]: x1 = np.linspace(0, 20, nsample)
In [10]: X = np.c_[x1, np.sin(x1), (x1-5)**2, np.ones(nsample)]
In [11]: beta = [0.5, 0.5, -0.02, 5.]
In [12]: y_true = np.dot(X, beta)
In [13]: y = y_true + sig * np.random.normal(size=nsample)
In [14]: plt.figure()
Out[14]: <matplotlib.figure.Figure at 0x7c20250>
In [15]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[15]:
[<matplotlib.lines.Line2D at 0x8265d10>,
<matplotlib.lines.Line2D at 0x82661d0>]
In [16]: res = sm.OLS(y, X).fit()
In [17]: print(res.params)
[ 0.47040466 0.2931004 -0.01826292 5.24935422]
In [18]: print(res.bse)
[ 0.02853117 0.11215937 0.00250506 0.18499717]
In [19]: print(res.predict())
[  4.79278116   5.17262168   5.52726298   5.84073136   6.10281792   6.31075592
   6.46967527   6.59175976   6.69424528   6.796588     6.91726779   7.07075203
   7.26511865   7.50072896   7.77016828   8.05946415   8.35038197   8.62342091
   8.8610178    9.05043274   9.18584224   9.26929595   9.31037998   9.32464188
   9.33103621   9.34881043   9.39434252   9.47845015   9.60461339   9.76840293
   9.95820778  10.15714295  10.34582361  10.50554994  10.62137947  10.68458209
  10.69407437  10.65659757  10.58561009  10.49907624  10.41651482  10.3557922
  10.33018692  10.34620807  10.4025259   10.49019023  10.594101    10.69548911
  10.77500019  10.81587443]
In [20]: prstd, iv_l, iv_u = wls_prediction_std(res)
In [21]: plt.plot(x1, res.fittedvalues, 'r--.')
Out[21]: [<matplotlib.lines.Line2D at 0x6d73e50>]
In [22]: plt.plot(x1, iv_u, 'r--')
Out[22]: [<matplotlib.lines.Line2D at 0x7b18dd0>]
In [23]: plt.plot(x1, iv_l, 'r--')
Out[23]: [<matplotlib.lines.Line2D at 0x7c203d0>]
In [24]: plt.title('blue: true, red: OLS')
Out[24]: <matplotlib.text.Text at 0x6d15d50>
In [25]: print res.summary()
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     175.9
Date:                Wed, 11 Jul 2012   Prob (F-statistic):           3.30e-25
Time:                        19:52:33   Log-Likelihood:                -38.308
No. Observations:                  50   AIC:                             84.62
Df Residuals:                      46   BIC:                             92.26
Df Model:                           3
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.4704      0.029     16.487      0.000         0.413     0.528
x2             0.2931      0.112      2.613      0.012         0.067     0.519
x3            -0.0183      0.003     -7.290      0.000        -0.023    -0.013
const          5.2494      0.185     28.375      0.000         4.877     5.622
==============================================================================
Omnibus:                        1.779   Durbin-Watson:                   2.359
Prob(Omnibus):                  0.411   Jarque-Bera (JB):                1.440
Skew:                           0.414   Prob(JB):                        0.487
Kurtosis:                       2.933   Cond. No.                         221.
==============================================================================
In [26]: sig = 1.
Now suppose the observations come from 3 groups:
In [27]: xg = np.zeros(nsample, int)
In [28]: xg[20:40] = 1
In [29]: xg[40:] = 2
In [30]: print(xg)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2]
In [31]: dummy = (xg[:,None] == np.unique(xg)).astype(float)
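The broadcast comparison xg[:, None] == np.unique(xg) creates one 0/1 indicator column per distinct group label. A minimal sketch on a toy label vector (illustration only, not part of the session above):

import numpy as np

g = np.array([0, 0, 1, 2, 2])                       # toy group labels, illustration only
dummies = (g[:, None] == np.unique(g)).astype(float)
# each row is the indicator vector of its group:
# [1,0,0], [1,0,0], [0,1,0], [0,0,1], [0,0,1]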
Use group 0 as the benchmark, so its dummy column is dropped:
In [32]: X = np.c_[x1, dummy[:,1:], np.ones(nsample)]
In [33]: beta = [1., 3, -3, 10]
In [34]: y_true = np.dot(X, beta)
In [35]: y = y_true + sig * np.random.normal(size=nsample)
In [36]: res2 = sm.OLS(y, X).fit()
In [37]: print(res2.params)
[ 0.98475721 3.44866047 -2.68110961 9.77981018]
In [38]: print(res2.bse)
[ 0.06801982 0.64590456 1.05239527 0.35213863]
In [39]: print(res.predict())
[  4.79278116   5.17262168   5.52726298   5.84073136   6.10281792   6.31075592
   6.46967527   6.59175976   6.69424528   6.796588     6.91726779   7.07075203
   7.26511865   7.50072896   7.77016828   8.05946415   8.35038197   8.62342091
   8.8610178    9.05043274   9.18584224   9.26929595   9.31037998   9.32464188
   9.33103621   9.34881043   9.39434252   9.47845015   9.60461339   9.76840293
   9.95820778  10.15714295  10.34582361  10.50554994  10.62137947  10.68458209
  10.69407437  10.65659757  10.58561009  10.49907624  10.41651482  10.3557922
  10.33018692  10.34620807  10.4025259   10.49019023  10.594101    10.69548911
  10.77500019  10.81587443]
In [40]: prstd, iv_l, iv_u = wls_prediction_std(res2)
In [41]: plt.figure()
Out[41]: <matplotlib.figure.Figure at 0x6d190d0>
In [42]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[42]:
[<matplotlib.lines.Line2D at 0x6d2e2d0>,
<matplotlib.lines.Line2D at 0x6d323d0>]
In [43]: plt.figure()
Out[43]: <matplotlib.figure.Figure at 0x6d19950>
In [44]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[44]:
[<matplotlib.lines.Line2D at 0x6d50650>,
<matplotlib.lines.Line2D at 0x6d50190>]
In [45]: plt.plot(x1, res2.fittedvalues, 'r--.')
Out[45]: [<matplotlib.lines.Line2D at 0x6d426d0>]
In [46]: plt.plot(x1, iv_u, 'r--')
Out[46]: [<matplotlib.lines.Line2D at 0x6d42650>]
In [47]: plt.plot(x1, iv_l, 'r--')
Out[47]: [<matplotlib.lines.Line2D at 0x6d2da50>]
In [48]: plt.title('blue: true, red: OLS')
Out[48]: <matplotlib.text.Text at 0x7c40f10>
In [49]: print(res.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     175.9
Date:                Wed, 11 Jul 2012   Prob (F-statistic):           3.30e-25
Time:                        19:52:35   Log-Likelihood:                -38.308
No. Observations:                  50   AIC:                             84.62
Df Residuals:                      46   BIC:                             92.26
Df Model:                           3
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.4704      0.029     16.487      0.000         0.413     0.528
x2             0.2931      0.112      2.613      0.012         0.067     0.519
x3            -0.0183      0.003     -7.290      0.000        -0.023    -0.013
const          5.2494      0.185     28.375      0.000         4.877     5.622
==============================================================================
Omnibus:                        1.779   Durbin-Watson:                   2.359
Prob(Omnibus):                  0.411   Jarque-Bera (JB):                1.440
Skew:                           0.414   Prob(JB):                        0.487
Kurtosis:                       2.933   Cond. No.                         221.
==============================================================================
In [50]: R = [[0, 1, 0, 0],
....: [0, 0, 1, 0]]
....:
F test of the joint hypothesis R * beta = 0, i.e. that the coefficients on both dummy variables are zero:
In [51]: print(res2.f_test(R))
<F test: F=array([[ 124.30756422]]), p=[[ 0.]], df_denom=46, df_num=2>
The null hypothesis that both dummy coefficients are zero (an identical constant in all 3 groups) is strongly rejected.
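For intuition, here is a rough sketch of the Wald form behind this F test, built only from quantities exposed on the fitted results (params, cov_params(), df_resid). It should reproduce the statistic above up to rounding, but it is an illustration, not the library's internal implementation:

import numpy as np
from scipy import stats

Rb  = np.dot(R, res2.params)                        # R * beta_hat, the tested linear combinations
RVR = np.dot(R, np.dot(res2.cov_params(), np.transpose(R)))
q   = len(R)                                        # number of restrictions (2 here)
F   = np.dot(Rb, np.linalg.solve(RVR, Rb)) / q      # Wald / F statistic
p   = stats.f.sf(F, q, res2.df_resid)               # upper-tail p-value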
In [52]: help(res2.f_test)
Help on method f_test in module statsmodels.base.model:

f_test(self, r_matrix, q_matrix=None, cov_p=None, scale=1.0, invcov=None) method of statsmodels.regression.linear_model.OLSResults instance
    Compute an F contrast / F-test for a contrast matrix.

    Here, matrix `r_matrix` is assumed to be non-singular. More precisely,

        r_matrix (pX pX.T) r_matrix.T

    is assumed invertible. Here, pX is the generalized inverse of the
    design matrix of the model. There can be problems in non-OLS models
    where the rank of the covariance of the noise is not full.

    Parameters
    ----------
    r_matrix : array-like
        q x p array where q is the number of restrictions to test and
        p is the number of regressors in the full model fit.
        If q is 1 then f_test(r_matrix).fvalue is equivalent to
        the square of t_test(r_matrix).t
    q_matrix : array-like
        q x 1 array that represents the sum of each linear restriction.
        Default is all zeros for each restriction.
    scale : float, optional
        Default is 1.0 for no scaling.
    invcov : array-like, optional
        A q x q matrix to specify an inverse covariance
        matrix based on a restrictions matrix.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load()
    >>> data.exog = sm.add_constant(data.exog)
    >>> results = sm.OLS(data.endog, data.exog).fit()
    >>> A = np.identity(len(results.params))
    >>> A = A[:-1,:]

    This tests that each coefficient is jointly statistically
    significantly different from zero.

    >>> print results.f_test(A)
    <F contrast: F=330.28533923463488, p=4.98403052872e-10,
     df_denom=9, df_num=6>

    Compare this to

    >>> results.F
    330.2853392346658
    >>> results.F_p
    4.98403096572e-10

    >>> B = np.array(([0,1,-1,0,0,0,0],[0,0,0,0,1,-1,0]))

    This tests that the coefficients on the 2nd and 3rd regressors are
    equal and, jointly, that the coefficients on the 5th and 6th regressors
    are equal.

    >>> print results.f_test(B)
    <F contrast: F=9.740461873303655, p=0.00560528853174, df_denom=9,
     df_num=2>

    See also
    --------
    statsmodels.contrasts
    statsmodels.model.t_test
t test of the null hypothesis that the effects of the 2nd and 3rd groups are equal, i.e. that their difference is zero:
In [53]: R = [0, 1, -1, 0]
In [54]: print(res2.t_test(R))
<T test: effect=array([ 6.12977008]), sd=array([[ 0.58029388]]), t=array([[ 10.5632168]]), p=array([[ 0.]]), df_denom=46>
The null hypothesis of equal group effects is strongly rejected: the two-sided p-value is essentially zero.
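The same contrast can be checked by hand from the coefficient vector and its covariance matrix; a minimal sketch (illustration only), which should match the effect, standard error, and t value reported above:

import numpy as np
from scipy import stats

r      = np.array([0, 1, -1, 0])                    # difference of the two dummy coefficients
effect = np.dot(r, res2.params)                     # estimated difference of group effects
se     = np.sqrt(np.dot(r, np.dot(res2.cov_params(), r)))
t      = effect / se                                # t statistic for the contrast
p_two_sided = 2 * stats.t.sf(abs(t), res2.df_resid)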
OLS with small group effects
In [55]: beta = [1., 0.3, -0.0, 10]
In [56]: y_true = np.dot(X, beta)
In [57]: y = y_true + sig * np.random.normal(size=nsample)
In [58]: res3 = sm.OLS(y, X).fit()
In [59]: print(res3.f_test(R))
<F test: F=array([[ 0.4929838]]), p=[[ 0.48613705]], df_denom=46, df_num=1>
With small group effects, the null hypothesis is not rejected (p ≈ 0.49).
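For comparison (not run in the original session), the joint two-restriction test of an identical constant across the 3 groups can be applied to res3 in the same way; a minimal sketch, output not reproduced here:

R_joint = [[0, 1, 0, 0],
           [0, 0, 1, 0]]                 # both dummy coefficients equal to zero
print(res3.f_test(R_joint))              # joint test of identical constants across the 3 groups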