001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.math.stat.regression; 019 import java.io.Serializable; 020 021 import org.apache.commons.math.MathException; 022 import org.apache.commons.math.MathRuntimeException; 023 import org.apache.commons.math.distribution.TDistribution; 024 import org.apache.commons.math.distribution.TDistributionImpl; 025 026 /** 027 * Estimates an ordinary least squares regression model 028 * with one independent variable. 029 * <p> 030 * <code> y = intercept + slope * x </code></p> 031 * <p> 032 * Standard errors for <code>intercept</code> and <code>slope</code> are 033 * available as well as ANOVA, r-square and Pearson's r statistics.</p> 034 * <p> 035 * Observations (x,y pairs) can be added to the model one at a time or they 036 * can be provided in a 2-dimensional array. The observations are not stored 037 * in memory, so there is no limit to the number of observations that can be 038 * added to the model.</p> 039 * <p> 040 * <strong>Usage Notes</strong>: <ul> 041 * <li> When there are fewer than two observations in the model, or when 042 * there is no variation in the x values (i.e. all x values are the same) 043 * all statistics return <code>NaN</code>. At least two observations with 044 * different x coordinates are requred to estimate a bivariate regression 045 * model. 046 * </li> 047 * <li> getters for the statistics always compute values based on the current 048 * set of observations -- i.e., you can get statistics, then add more data 049 * and get updated statistics without using a new instance. There is no 050 * "compute" method that updates all statistics. Each of the getters performs 051 * the necessary computations to return the requested statistic.</li> 052 * </ul></p> 053 * 054 * @version $Revision: 773189 $ $Date: 2009-05-09 05:57:04 -0400 (Sat, 09 May 2009) $ 055 */ 056 public class SimpleRegression implements Serializable { 057 058 /** Serializable version identifier */ 059 private static final long serialVersionUID = -3004689053607543335L; 060 061 /** the distribution used to compute inference statistics. */ 062 private TDistribution distribution; 063 064 /** sum of x values */ 065 private double sumX = 0d; 066 067 /** total variation in x (sum of squared deviations from xbar) */ 068 private double sumXX = 0d; 069 070 /** sum of y values */ 071 private double sumY = 0d; 072 073 /** total variation in y (sum of squared deviations from ybar) */ 074 private double sumYY = 0d; 075 076 /** sum of products */ 077 private double sumXY = 0d; 078 079 /** number of observations */ 080 private long n = 0; 081 082 /** mean of accumulated x values, used in updating formulas */ 083 private double xbar = 0; 084 085 /** mean of accumulated y values, used in updating formulas */ 086 private double ybar = 0; 087 088 // ---------------------Public methods-------------------------------------- 089 090 /** 091 * Create an empty SimpleRegression instance 092 */ 093 public SimpleRegression() { 094 this(new TDistributionImpl(1.0)); 095 } 096 097 /** 098 * Create an empty SimpleRegression using the given distribution object to 099 * compute inference statistics. 100 * @param t the distribution used to compute inference statistics. 101 * @since 1.2 102 */ 103 public SimpleRegression(TDistribution t) { 104 super(); 105 setDistribution(t); 106 } 107 108 /** 109 * Adds the observation (x,y) to the regression data set. 110 * <p> 111 * Uses updating formulas for means and sums of squares defined in 112 * "Algorithms for Computing the Sample Variance: Analysis and 113 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 114 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in 115 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p> 116 * 117 * 118 * @param x independent variable value 119 * @param y dependent variable value 120 */ 121 public void addData(double x, double y) { 122 if (n == 0) { 123 xbar = x; 124 ybar = y; 125 } else { 126 double dx = x - xbar; 127 double dy = y - ybar; 128 sumXX += dx * dx * (double) n / (n + 1d); 129 sumYY += dy * dy * (double) n / (n + 1d); 130 sumXY += dx * dy * (double) n / (n + 1d); 131 xbar += dx / (n + 1.0); 132 ybar += dy / (n + 1.0); 133 } 134 sumX += x; 135 sumY += y; 136 n++; 137 138 if (n > 2) { 139 distribution.setDegreesOfFreedom(n - 2); 140 } 141 } 142 143 144 /** 145 * Removes the observation (x,y) from the regression data set. 146 * <p> 147 * Mirrors the addData method. This method permits the use of 148 * SimpleRegression instances in streaming mode where the regression 149 * is applied to a sliding "window" of observations, however the caller is 150 * responsible for maintaining the set of observations in the window.</p> 151 * 152 * The method has no effect if there are no points of data (i.e. n=0) 153 * 154 * @param x independent variable value 155 * @param y dependent variable value 156 */ 157 public void removeData(double x, double y) { 158 if (n > 0) { 159 double dx = x - xbar; 160 double dy = y - ybar; 161 sumXX -= dx * dx * (double) n / (n - 1d); 162 sumYY -= dy * dy * (double) n / (n - 1d); 163 sumXY -= dx * dy * (double) n / (n - 1d); 164 xbar -= dx / (n - 1.0); 165 ybar -= dy / (n - 1.0); 166 sumX -= x; 167 sumY -= y; 168 n--; 169 170 if (n > 2) { 171 distribution.setDegreesOfFreedom(n - 2); 172 } 173 } 174 } 175 176 /** 177 * Adds the observations represented by the elements in 178 * <code>data</code>. 179 * <p> 180 * <code>(data[0][0],data[0][1])</code> will be the first observation, then 181 * <code>(data[1][0],data[1][1])</code>, etc.</p> 182 * <p> 183 * This method does not replace data that has already been added. The 184 * observations represented by <code>data</code> are added to the existing 185 * dataset.</p> 186 * <p> 187 * To replace all data, use <code>clear()</code> before adding the new 188 * data.</p> 189 * 190 * @param data array of observations to be added 191 */ 192 public void addData(double[][] data) { 193 for (int i = 0; i < data.length; i++) { 194 addData(data[i][0], data[i][1]); 195 } 196 } 197 198 199 /** 200 * Removes observations represented by the elements in <code>data</code>. 201 * <p> 202 * If the array is larger than the current n, only the first n elements are 203 * processed. This method permits the use of SimpleRegression instances in 204 * streaming mode where the regression is applied to a sliding "window" of 205 * observations, however the caller is responsible for maintaining the set 206 * of observations in the window.</p> 207 * <p> 208 * To remove all data, use <code>clear()</code>.</p> 209 * 210 * @param data array of observations to be removed 211 */ 212 public void removeData(double[][] data) { 213 for (int i = 0; i < data.length && n > 0; i++) { 214 removeData(data[i][0], data[i][1]); 215 } 216 } 217 218 /** 219 * Clears all data from the model. 220 */ 221 public void clear() { 222 sumX = 0d; 223 sumXX = 0d; 224 sumY = 0d; 225 sumYY = 0d; 226 sumXY = 0d; 227 n = 0; 228 } 229 230 /** 231 * Returns the number of observations that have been added to the model. 232 * 233 * @return n number of observations that have been added. 234 */ 235 public long getN() { 236 return n; 237 } 238 239 /** 240 * Returns the "predicted" <code>y</code> value associated with the 241 * supplied <code>x</code> value, based on the data that has been 242 * added to the model when this method is activated. 243 * <p> 244 * <code> predict(x) = intercept + slope * x </code></p> 245 * <p> 246 * <strong>Preconditions</strong>: <ul> 247 * <li>At least two observations (with at least two different x values) 248 * must have been added before invoking this method. If this method is 249 * invoked before a model can be estimated, <code>Double,NaN</code> is 250 * returned. 251 * </li></ul></p> 252 * 253 * @param x input <code>x</code> value 254 * @return predicted <code>y</code> value 255 */ 256 public double predict(double x) { 257 double b1 = getSlope(); 258 return getIntercept(b1) + b1 * x; 259 } 260 261 /** 262 * Returns the intercept of the estimated regression line. 263 * <p> 264 * The least squares estimate of the intercept is computed using the 265 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 266 * The intercept is sometimes denoted b0.</p> 267 * <p> 268 * <strong>Preconditions</strong>: <ul> 269 * <li>At least two observations (with at least two different x values) 270 * must have been added before invoking this method. If this method is 271 * invoked before a model can be estimated, <code>Double,NaN</code> is 272 * returned. 273 * </li></ul></p> 274 * 275 * @return the intercept of the regression line 276 */ 277 public double getIntercept() { 278 return getIntercept(getSlope()); 279 } 280 281 /** 282 * Returns the slope of the estimated regression line. 283 * <p> 284 * The least squares estimate of the slope is computed using the 285 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 286 * The slope is sometimes denoted b1.</p> 287 * <p> 288 * <strong>Preconditions</strong>: <ul> 289 * <li>At least two observations (with at least two different x values) 290 * must have been added before invoking this method. If this method is 291 * invoked before a model can be estimated, <code>Double.NaN</code> is 292 * returned. 293 * </li></ul></p> 294 * 295 * @return the slope of the regression line 296 */ 297 public double getSlope() { 298 if (n < 2) { 299 return Double.NaN; //not enough data 300 } 301 if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) { 302 return Double.NaN; //not enough variation in x 303 } 304 return sumXY / sumXX; 305 } 306 307 /** 308 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> 309 * sum of squared errors</a> (SSE) associated with the regression 310 * model. 311 * <p> 312 * The sum is computed using the computational formula</p> 313 * <p> 314 * <code>SSE = SYY - (SXY * SXY / SXX)</code></p> 315 * <p> 316 * where <code>SYY</code> is the sum of the squared deviations of the y 317 * values about their mean, <code>SXX</code> is similarly defined and 318 * <code>SXY</code> is the sum of the products of x and y mean deviations. 319 * </p><p> 320 * The sums are accumulated using the updating algorithm referenced in 321 * {@link #addData}.</p> 322 * <p> 323 * The return value is constrained to be non-negative - i.e., if due to 324 * rounding errors the computational formula returns a negative result, 325 * 0 is returned.</p> 326 * <p> 327 * <strong>Preconditions</strong>: <ul> 328 * <li>At least two observations (with at least two different x values) 329 * must have been added before invoking this method. If this method is 330 * invoked before a model can be estimated, <code>Double,NaN</code> is 331 * returned. 332 * </li></ul></p> 333 * 334 * @return sum of squared errors associated with the regression model 335 */ 336 public double getSumSquaredErrors() { 337 return Math.max(0d, sumYY - sumXY * sumXY / sumXX); 338 } 339 340 /** 341 * Returns the sum of squared deviations of the y values about their mean. 342 * <p> 343 * This is defined as SSTO 344 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p> 345 * <p> 346 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p> 347 * 348 * @return sum of squared deviations of y values 349 */ 350 public double getTotalSumSquares() { 351 if (n < 2) { 352 return Double.NaN; 353 } 354 return sumYY; 355 } 356 357 /** 358 * Returns the sum of squared deviations of the x values about their mean. 359 * 360 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p> 361 * 362 * @return sum of squared deviations of x values 363 */ 364 public double getXSumSquares() { 365 if (n < 2) { 366 return Double.NaN; 367 } 368 return sumXX; 369 } 370 371 /** 372 * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>. 373 * 374 * @return sum of cross products 375 */ 376 public double getSumOfCrossProducts() { 377 return sumXY; 378 } 379 380 /** 381 * Returns the sum of squared deviations of the predicted y values about 382 * their mean (which equals the mean of y). 383 * <p> 384 * This is usually abbreviated SSR or SSM. It is defined as SSM 385 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p> 386 * <p> 387 * <strong>Preconditions</strong>: <ul> 388 * <li>At least two observations (with at least two different x values) 389 * must have been added before invoking this method. If this method is 390 * invoked before a model can be estimated, <code>Double.NaN</code> is 391 * returned. 392 * </li></ul></p> 393 * 394 * @return sum of squared deviations of predicted y values 395 */ 396 public double getRegressionSumSquares() { 397 return getRegressionSumSquares(getSlope()); 398 } 399 400 /** 401 * Returns the sum of squared errors divided by the degrees of freedom, 402 * usually abbreviated MSE. 403 * <p> 404 * If there are fewer than <strong>three</strong> data pairs in the model, 405 * or if there is no variation in <code>x</code>, this returns 406 * <code>Double.NaN</code>.</p> 407 * 408 * @return sum of squared deviations of y values 409 */ 410 public double getMeanSquareError() { 411 if (n < 3) { 412 return Double.NaN; 413 } 414 return getSumSquaredErrors() / (n - 2); 415 } 416 417 /** 418 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html"> 419 * Pearson's product moment correlation coefficient</a>, 420 * usually denoted r. 421 * <p> 422 * <strong>Preconditions</strong>: <ul> 423 * <li>At least two observations (with at least two different x values) 424 * must have been added before invoking this method. If this method is 425 * invoked before a model can be estimated, <code>Double,NaN</code> is 426 * returned. 427 * </li></ul></p> 428 * 429 * @return Pearson's r 430 */ 431 public double getR() { 432 double b1 = getSlope(); 433 double result = Math.sqrt(getRSquare()); 434 if (b1 < 0) { 435 result = -result; 436 } 437 return result; 438 } 439 440 /** 441 * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 442 * coefficient of determination</a>, 443 * usually denoted r-square. 444 * <p> 445 * <strong>Preconditions</strong>: <ul> 446 * <li>At least two observations (with at least two different x values) 447 * must have been added before invoking this method. If this method is 448 * invoked before a model can be estimated, <code>Double,NaN</code> is 449 * returned. 450 * </li></ul></p> 451 * 452 * @return r-square 453 */ 454 public double getRSquare() { 455 double ssto = getTotalSumSquares(); 456 return (ssto - getSumSquaredErrors()) / ssto; 457 } 458 459 /** 460 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> 461 * standard error of the intercept estimate</a>, 462 * usually denoted s(b0). 463 * <p> 464 * If there are fewer that <strong>three</strong> observations in the 465 * model, or if there is no variation in x, this returns 466 * <code>Double.NaN</code>.</p> 467 * 468 * @return standard error associated with intercept estimate 469 */ 470 public double getInterceptStdErr() { 471 return Math.sqrt( 472 getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX)); 473 } 474 475 /** 476 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard 477 * error of the slope estimate</a>, 478 * usually denoted s(b1). 479 * <p> 480 * If there are fewer that <strong>three</strong> data pairs in the model, 481 * or if there is no variation in x, this returns <code>Double.NaN</code>. 482 * </p> 483 * 484 * @return standard error associated with slope estimate 485 */ 486 public double getSlopeStdErr() { 487 return Math.sqrt(getMeanSquareError() / sumXX); 488 } 489 490 /** 491 * Returns the half-width of a 95% confidence interval for the slope 492 * estimate. 493 * <p> 494 * The 95% confidence interval is</p> 495 * <p> 496 * <code>(getSlope() - getSlopeConfidenceInterval(), 497 * getSlope() + getSlopeConfidenceInterval())</code></p> 498 * <p> 499 * If there are fewer that <strong>three</strong> observations in the 500 * model, or if there is no variation in x, this returns 501 * <code>Double.NaN</code>.</p> 502 * <p> 503 * <strong>Usage Note</strong>:<br> 504 * The validity of this statistic depends on the assumption that the 505 * observations included in the model are drawn from a 506 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 507 * Bivariate Normal Distribution</a>.</p> 508 * 509 * @return half-width of 95% confidence interval for the slope estimate 510 * @throws MathException if the confidence interval can not be computed. 511 */ 512 public double getSlopeConfidenceInterval() throws MathException { 513 return getSlopeConfidenceInterval(0.05d); 514 } 515 516 /** 517 * Returns the half-width of a (100-100*alpha)% confidence interval for 518 * the slope estimate. 519 * <p> 520 * The (100-100*alpha)% confidence interval is </p> 521 * <p> 522 * <code>(getSlope() - getSlopeConfidenceInterval(), 523 * getSlope() + getSlopeConfidenceInterval())</code></p> 524 * <p> 525 * To request, for example, a 99% confidence interval, use 526 * <code>alpha = .01</code></p> 527 * <p> 528 * <strong>Usage Note</strong>:<br> 529 * The validity of this statistic depends on the assumption that the 530 * observations included in the model are drawn from a 531 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 532 * Bivariate Normal Distribution</a>.</p> 533 * <p> 534 * <strong> Preconditions:</strong><ul> 535 * <li>If there are fewer that <strong>three</strong> observations in the 536 * model, or if there is no variation in x, this returns 537 * <code>Double.NaN</code>. 538 * </li> 539 * <li><code>(0 < alpha < 1)</code>; otherwise an 540 * <code>IllegalArgumentException</code> is thrown. 541 * </li></ul></p> 542 * 543 * @param alpha the desired significance level 544 * @return half-width of 95% confidence interval for the slope estimate 545 * @throws MathException if the confidence interval can not be computed. 546 */ 547 public double getSlopeConfidenceInterval(double alpha) 548 throws MathException { 549 if (alpha >= 1 || alpha <= 0) { 550 throw MathRuntimeException.createIllegalArgumentException( 551 "out of bounds significance level {0}, must be between {1} and {2}", 552 alpha, 0.0, 1.0); 553 } 554 return getSlopeStdErr() * 555 distribution.inverseCumulativeProbability(1d - alpha / 2d); 556 } 557 558 /** 559 * Returns the significance level of the slope (equiv) correlation. 560 * <p> 561 * Specifically, the returned value is the smallest <code>alpha</code> 562 * such that the slope confidence interval with significance level 563 * equal to <code>alpha</code> does not include <code>0</code>. 564 * On regression output, this is often denoted <code>Prob(|t| > 0)</code> 565 * </p><p> 566 * <strong>Usage Note</strong>:<br> 567 * The validity of this statistic depends on the assumption that the 568 * observations included in the model are drawn from a 569 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 570 * Bivariate Normal Distribution</a>.</p> 571 * <p> 572 * If there are fewer that <strong>three</strong> observations in the 573 * model, or if there is no variation in x, this returns 574 * <code>Double.NaN</code>.</p> 575 * 576 * @return significance level for slope/correlation 577 * @throws MathException if the significance level can not be computed. 578 */ 579 public double getSignificance() throws MathException { 580 return 2d * (1.0 - distribution.cumulativeProbability( 581 Math.abs(getSlope()) / getSlopeStdErr())); 582 } 583 584 // ---------------------Private methods----------------------------------- 585 586 /** 587 * Returns the intercept of the estimated regression line, given the slope. 588 * <p> 589 * Will return <code>NaN</code> if slope is <code>NaN</code>.</p> 590 * 591 * @param slope current slope 592 * @return the intercept of the regression line 593 */ 594 private double getIntercept(double slope) { 595 return (sumY - slope * sumX) / n; 596 } 597 598 /** 599 * Computes SSR from b1. 600 * 601 * @param slope regression slope estimate 602 * @return sum of squared deviations of predicted y values 603 */ 604 private double getRegressionSumSquares(double slope) { 605 return slope * slope * sumXX; 606 } 607 608 /** 609 * Modify the distribution used to compute inference statistics. 610 * @param value the new distribution 611 * @since 1.2 612 */ 613 public void setDistribution(TDistribution value) { 614 distribution = value; 615 616 // modify degrees of freedom 617 if (n > 2) { 618 distribution.setDegreesOfFreedom(n - 2); 619 } 620 } 621 }