
OLS Example with joint significance tests

In [1]: import numpy as np
   ...: from scipy import stats

In [2]: import statsmodels.api as sm

In [3]: import matplotlib
In [4]: import matplotlib.pyplot as plt

In [5]: from statsmodels.sandbox.regression.predstd import wls_prediction_std

fix a random seed so these examples are reproducible

In [6]: np.random.seed(9876789)

OLS: a non-linear curve that is linear in the parameters
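The data-generating process below is non-linear in x1 but linear in the parameters, which is all that OLS requires. With the values used in the code that follows (beta = (0.5, 0.5, -0.02, 5), sigma = 0.5) the model is

    y_i = \beta_1 x_i + \beta_2 \sin(x_i) + \beta_3 (x_i - 5)^2 + \beta_4 + \varepsilon_i,
    \qquad \varepsilon_i \sim N(0, \sigma^2)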

In [7]: nsample = 50

In [8]: sig = 0.5

In [9]: x1 = np.linspace(0, 20, nsample)

In [10]: X = np.c_[x1, np.sin(x1), (x1-5)**2, np.ones(nsample)]

In [11]: beta = [0.5, 0.5, -0.02, 5.]

In [12]: y_true = np.dot(X, beta)

In [13]: y = y_true + sig * np.random.normal(size=nsample)

In [14]: plt.figure()
Out[14]: <matplotlib.figure.Figure at 0x7c20250>

In [15]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[15]: 
[<matplotlib.lines.Line2D at 0x8265d10>,
 <matplotlib.lines.Line2D at 0x82661d0>]

In [16]: res = sm.OLS(y, X).fit()

In [17]: print res.params
[ 0.47040466  0.2931004  -0.01826292  5.24935422]

In [18]: print res.bse
[ 0.02853117  0.11215937  0.00250506  0.18499717]

In [19]: print res.predict()
[  4.79278116   5.17262168   5.52726298   5.84073136   6.10281792   6.31075592   6.46967527   6.59175976   6.69424528   6.796588     6.91726779   7.07075203   7.26511865   7.50072896   7.77016828
   8.05946415   8.35038197   8.62342091   8.8610178    9.05043274   9.18584224   9.26929595   9.31037998   9.32464188   9.33103621   9.34881043   9.39434252   9.47845015   9.60461339   9.76840293
   9.95820778  10.15714295  10.34582361  10.50554994  10.62137947  10.68458209  10.69407437  10.65659757  10.58561009  10.49907624  10.41651482  10.3557922   10.33018692  10.34620807  10.4025259
  10.49019023  10.594101    10.69548911  10.77500019  10.81587443]
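As a cross-check, the estimates and standard errors above can be reproduced from the normal equations. The following is a minimal sketch (beta_hat, sigma2_hat, and bse_hat are illustrative names introduced here, not part of the original example); it reuses X, y, and nsample from this session:

    # sketch: OLS via the normal equations (X'X) b = X'y
    XtX_inv = np.linalg.inv(np.dot(X.T, X))
    beta_hat = np.dot(XtX_inv, np.dot(X.T, y))                   # should match res.params
    resid = y - np.dot(X, beta_hat)
    sigma2_hat = np.dot(resid, resid) / (nsample - X.shape[1])   # residual variance, i.e. res.mse_resid
    bse_hat = np.sqrt(np.diag(sigma2_hat * XtX_inv))             # should match res.bse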

In [20]: prstd, iv_l, iv_u = wls_prediction_std(res)
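wls_prediction_std returns the standard deviation and (by default 95%) bounds for a new observation at each row of the design matrix. Assuming homoskedastic errors as in this example, the computation amounts to roughly the following sketch (the variable names here are illustrative only):

    # sketch: prediction interval for a new observation at each design point
    pred_var = res.mse_resid + (X * np.dot(res.cov_params(), X.T).T).sum(1)
    pred_std = np.sqrt(pred_var)                       # should match prstd
    t_crit = stats.t.isf(0.05 / 2., res.df_resid)      # two-sided 95% critical value
    lower = res.fittedvalues - t_crit * pred_std       # should match iv_l
    upper = res.fittedvalues + t_crit * pred_std       # should match iv_u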

In [21]: plt.plot(x1, res.fittedvalues, 'r--.')
Out[21]: [<matplotlib.lines.Line2D at 0x6d73e50>]

In [22]: plt.plot(x1, iv_u, 'r--')
Out[22]: [<matplotlib.lines.Line2D at 0x7b18dd0>]

In [23]: plt.plot(x1, iv_l, 'r--')
Out[23]: [<matplotlib.lines.Line2D at 0x7c203d0>]

In [24]: plt.title('blue: true,   red: OLS')
Out[24]: <matplotlib.text.Text at 0x6d15d50>

In [25]: print res.summary()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     175.9
Date:                Wed, 11 Jul 2012   Prob (F-statistic):           3.30e-25
Time:                        19:52:33   Log-Likelihood:                -38.308
No. Observations:                  50   AIC:                             84.62
Df Residuals:                      46   BIC:                             92.26
Df Model:                           3                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.4704      0.029     16.487      0.000         0.413     0.528
x2             0.2931      0.112      2.613      0.012         0.067     0.519
x3            -0.0183      0.003     -7.290      0.000        -0.023    -0.013
const          5.2494      0.185     28.375      0.000         4.877     5.622
==============================================================================
Omnibus:                        1.779   Durbin-Watson:                   2.359
Prob(Omnibus):                  0.411   Jarque-Bera (JB):                1.440
Skew:                           0.414   Prob(JB):                        0.487
Kurtosis:                       2.933   Cond. No.                         221.
==============================================================================
[Figure tut_ols_0.png: data points, true curve (blue), and OLS fitted values with prediction interval (red)]

OLS with dummy variables

In [26]: sig = 1.

suppose the observations come from 3 groups

In [27]: xg = np.zeros(nsample, int)

In [28]: xg[20:40] = 1

In [29]: xg[40:] = 2

In [30]: print xg
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2]

In [31]: dummy = (xg[:,None] == np.unique(xg)).astype(float)
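The comparison against np.unique builds a 50 x 3 indicator matrix with one column per group. An equivalent numpy-only construction, shown here only as a cross-check, is:

    dummy_alt = np.eye(len(np.unique(xg)))[xg]   # identity rows selected by group label
    # np.allclose(dummy, dummy_alt) is True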

use group 0 as the benchmark: only the dummies for groups 1 and 2 enter the design

In [32]: X = np.c_[x1, dummy[:,1:], np.ones(nsample)]

In [33]: beta = [1., 3, -3, 10]

In [34]: y_true = np.dot(X, beta)

In [35]: y = y_true + sig * np.random.normal(size=nsample)

In [36]: res2 = sm.OLS(y, X).fit()

In [37]: print res2.params
[ 0.98475721  3.44866047 -2.68110961  9.77981018]

In [38]: print res2.bse
[ 0.06801982  0.64590456  1.05239527  0.35213863]

In [39]: print res.predict()
[  4.79278116   5.17262168   5.52726298   5.84073136   6.10281792   6.31075592   6.46967527   6.59175976   6.69424528   6.796588     6.91726779   7.07075203   7.26511865   7.50072896   7.77016828
   8.05946415   8.35038197   8.62342091   8.8610178    9.05043274   9.18584224   9.26929595   9.31037998   9.32464188   9.33103621   9.34881043   9.39434252   9.47845015   9.60461339   9.76840293
   9.95820778  10.15714295  10.34582361  10.50554994  10.62137947  10.68458209  10.69407437  10.65659757  10.58561009  10.49907624  10.41651482  10.3557922   10.33018692  10.34620807  10.4025259
  10.49019023  10.594101    10.69548911  10.77500019  10.81587443]

In [40]: prstd, iv_l, iv_u = wls_prediction_std(res2)

In [41]: plt.figure()
Out[41]: <matplotlib.figure.Figure at 0x6d190d0>

In [42]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[42]: 
[<matplotlib.lines.Line2D at 0x6d2e2d0>,
 <matplotlib.lines.Line2D at 0x6d323d0>]

In [43]: plt.figure()
Out[43]: <matplotlib.figure.Figure at 0x6d19950>

In [44]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[44]: 
[<matplotlib.lines.Line2D at 0x6d50650>,
 <matplotlib.lines.Line2D at 0x6d50190>]

In [45]: plt.plot(x1, res2.fittedvalues, 'r--.')
Out[45]: [<matplotlib.lines.Line2D at 0x6d426d0>]

In [46]: plt.plot(x1, iv_u, 'r--')
Out[46]: [<matplotlib.lines.Line2D at 0x6d42650>]

In [47]: plt.plot(x1, iv_l, 'r--')
Out[47]: [<matplotlib.lines.Line2D at 0x6d2da50>]

In [48]: plt.title('blue: true,   red: OLS')
Out[48]: <matplotlib.text.Text at 0x7c40f10>

In [49]: print res.summary()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     175.9
Date:                Wed, 11 Jul 2012   Prob (F-statistic):           3.30e-25
Time:                        19:52:35   Log-Likelihood:                -38.308
No. Observations:                  50   AIC:                             84.62
Df Residuals:                      46   BIC:                             92.26
Df Model:                           3                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.4704      0.029     16.487      0.000         0.413     0.528
x2             0.2931      0.112      2.613      0.012         0.067     0.519
x3            -0.0183      0.003     -7.290      0.000        -0.023    -0.013
const          5.2494      0.185     28.375      0.000         4.877     5.622
==============================================================================
Omnibus:                        1.779   Durbin-Watson:                   2.359
Prob(Omnibus):                  0.411   Jarque-Bera (JB):                1.440
Skew:                           0.414   Prob(JB):                        0.487
Kurtosis:                       2.933   Cond. No.                         221.
==============================================================================

In [50]: R = [[0, 1, 0, 0],
   ....:       [0, 0, 1, 0]]
   ....: 
[Figure tut_ols_1.png: grouped data points, true line (blue), and OLS fitted values with prediction interval (red)]

F test of the joint hypothesis R * beta = 0, i.e. the coefficients on both dummy variables are equal to zero

In [51]: print res2.f_test(R)
<F test: F=array([[ 124.30756422]]), p=[[ 0.]], df_denom=46, df_num=2>

the null hypothesis of an identical constant across the 3 groups is strongly rejected
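The same F statistic can be reconstructed from the residual sums of squares of restricted and unrestricted fits. This is only an illustrative sketch; X_restricted and res_restricted are introduced here and are not part of the original example:

    # sketch: F = ((SSR_restricted - SSR_unrestricted) / q) / (SSR_unrestricted / df_resid)
    X_restricted = np.c_[x1, np.ones(nsample)]   # drop both dummy columns (q = 2 restrictions)
    res_restricted = sm.OLS(y, X_restricted).fit()
    q = 2
    F = ((res_restricted.ssr - res2.ssr) / q) / (res2.ssr / res2.df_resid)
    # F should agree with the value printed by res2.f_test(R) above (about 124.3)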

In [52]: help(res2.f_test)
Help on method f_test in module statsmodels.base.model:

f_test(self, r_matrix, q_matrix=None, cov_p=None, scale=1.0, invcov=None) method of statsmodels.regression.linear_model.OLSResults instance
    Compute an Fcontrast/F-test for a contrast matrix.
    
    Here, matrix `r_matrix` is assumed to be non-singular. More precisely,
    
    r_matrix (pX pX.T) r_matrix.T
    
    is assumed invertible. Here, pX is the generalized inverse of the
    design matrix of the model. There can be problems in non-OLS models
    where the rank of the covariance of the noise is not full.
    
    Parameters
    ----------
    r_matrix : array-like
        q x p array where q is the number of restrictions to test and
        p is the number of regressors in the full model fit.
        If q is 1 then f_test(r_matrix).fvalue is equivalent to
        the square of t_test(r_matrix).t
    q_matrix : array-like
        q x 1 array, that represents the sum of each linear restriction.
        Default is all zeros for each restriction.
    scale : float, optional
        Default is 1.0 for no scaling.
    invcov : array-like, optional
        A qxq matrix to specify an inverse covariance
        matrix based on a restrictions matrix.
    
    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load()
    >>> data.exog = sm.add_constant(data.exog)
    >>> results = sm.OLS(data.endog, data.exog).fit()
    >>> A = np.identity(len(results.params))
    >>> A = A[:-1,:]
    
    This tests that each coefficient is jointly statistically
    significantly different from zero.
    
    >>> print results.f_test(A)
    <F contrast: F=330.28533923463488, p=4.98403052872e-10,
     df_denom=9, df_num=6>
    
    Compare this to
    
    >>> results.F
    330.2853392346658
    >>> results.F_p
    4.98403096572e-10
    
    >>> B = np.array(([0,1,-1,0,0,0,0],[0,0,0,0,1,-1,0]))
    
    This tests that the coefficient on the 2nd and 3rd regressors are
    equal and jointly that the coefficient on the 5th and 6th regressors
    are equal.
    
    >>> print results.f_test(B)
    <F contrast: F=9.740461873303655, p=0.00560528853174, df_denom=9,
     df_num=2>
    
    See also
    --------
    statsmodels.contrasts
    statsmodels.model.t_test

t test of the null hypothesis that the 2nd and 3rd group effects are equal, i.e. R * beta = 0 with R = [0, 1, -1, 0]

In [53]: R = [0, 1, -1, 0]

In [54]: print res2.t_test(R)
<T test: effect=array([ 6.12977008]), sd=array([[ 0.58029388]]), t=array([[ 10.5632168]]), p=array([[ 0.]]), df_denom=46>

the null hypothesis that the two group effects are equal is strongly rejected, as expected since the true effects are 3 and -3
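The t statistic is just the estimated contrast divided by its standard error, both of which follow directly from the parameter estimates and their covariance matrix. A minimal sketch (effect, sd, and t_stat are illustrative names):

    # sketch: t statistic for the linear contrast r' beta
    r = np.array([0., 1., -1., 0.])
    effect = np.dot(r, res2.params)                          # about 6.13 here
    sd = np.sqrt(np.dot(r, np.dot(res2.cov_params(), r)))
    t_stat = effect / sd
    # t_stat should agree with the value printed by res2.t_test(R) above (about 10.56)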

OLS with small group effects

In [55]: beta = [1., 0.3, -0.0, 10]

In [56]: y_true = np.dot(X, beta)

In [57]: y = y_true + sig * np.random.normal(size=nsample)

In [58]: res3 = sm.OLS(y, X).fit()

In [59]: print res3.f_test(R)
<F test: F=array([[ 0.4929838]]), p=[[ 0.48613705]], df_denom=46, df_num=1>

with these small group effects, the null hypothesis that the 2nd and 3rd group effects are equal is not rejected (p is about 0.49)
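Because R encodes a single restriction here (df_num=1), the F statistic is simply the square of the corresponding t statistic, as noted in the f_test docstring above. This can be checked directly:

    print(res3.f_test(R))
    print(res3.t_test(R))
    # the F value from the first line equals the square of the t value from the second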
