View Javadoc

1   /*
2    * Copyright 2003-2004 The Apache Software Foundation.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.math.random;
18  
19  import java.io.Serializable;
20  import java.io.BufferedReader;
21  import java.io.FileReader;
22  import java.io.File;
23  import java.io.IOException;
24  import java.io.InputStreamReader;
25  import java.net.URL;
26  import java.util.ArrayList;
27  import java.util.List;
28  
29  import org.apache.commons.math.stat.descriptive.SummaryStatistics;
30  import org.apache.commons.math.stat.descriptive.StatisticalSummary;
31  
32  /**
33   * Implements <code>EmpiricalDistribution</code> interface.  This implementation
34   * uses what amounts to the
35   * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
36   * Variable Kernel Method</a> with Gaussian smoothing:<p>
37   * <strong>Digesting the input file</strong>
38   * <ol><li>Pass the file once to compute min and max.</li>
39   * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
40   * <li>Pass the data file again, computing bin counts and univariate
41   *     statistics (mean, std dev.) for each of the bins </li>
42   * <li>Divide the interval (0,1) into subintervals associated with the bins,
43   *     with the length of a bin's subinterval proportional to its count.</li></ol>
44   * <strong>Generating random values from the distribution</strong><ol>
45   * <li>Generate a uniformly distributed value in (0,1) </li>
46   * <li>Select the subinterval to which the value belongs.
47   * <li>Generate a random Gaussian value with mean = mean of the associated
48   *     bin and std dev = std dev of associated bin.</li></ol></p><p>
49   *<strong>USAGE NOTES:</strong><ul>
50   *<li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
51   *    is to set the bin count to approximately the length of the input file divided
52   *    by 10. </li>
53   *<li>The input file <i>must</i> be a plain text file containing one valid numeric
54   *    entry per line.</li>
55   * </ul></p>
56   *
57   * @version $Revision: 348894 $ $Date: 2005-11-24 23:34:47 -0700 (Thu, 24 Nov 2005) $
58   */
59  public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
60  
61      /** Serializable version identifier */
62      private static final long serialVersionUID = -6773236347582113490L;
63  
64      /** List of SummaryStatistics objects characterizing the bins */
65      private ArrayList binStats = null;
66  
67      /** Sample statistics */
68      private SummaryStatistics sampleStats = null;
69  
70      /** number of bins */
71      private int binCount = 1000;
72  
73      /** is the distribution loaded? */
74      private boolean loaded = false;
75  
76      /** upper bounds of subintervals in (0,1) "belonging" to the bins */
77      private double[] upperBounds = null;
78  
79      /** RandomData instance to use in repeated calls to getNext() */
80      private RandomData randomData = new RandomDataImpl();
81  
82      /**
83       * Creates a new EmpiricalDistribution with the default bin count.
84       */
85      public EmpiricalDistributionImpl() {
86          binStats = new ArrayList();
87      }
88  
89      /**
90       * Creates a new EmpiricalDistribution  with the specified bin count.
91       * 
92       * @param binCount number of bins
93       */
94      public EmpiricalDistributionImpl(int binCount) {
95          this.binCount = binCount;
96          binStats = new ArrayList();
97      }
98  
99       /**
100      * Computes the empirical distribution from the provided
101      * array of numbers.
102      * 
103      * @param in the input data array
104      */
105     public void load(double[] in) {
106         DataAdapter da = new ArrayDataAdapter(in);
107         try {
108             da.computeStats();
109             fillBinStats(in);
110         } catch (Exception e) {
111             throw new RuntimeException(e.getMessage());
112         }
113         loaded = true;
114 
115     }
116 
117     /**
118      * Computes the empirical distribution using data read from a URL.
119      * @param url  url of the input file
120      * 
121      * @throws IOException if an IO error occurs
122      */
123     public void load(URL url) throws IOException {
124         BufferedReader in =
125             new BufferedReader(new InputStreamReader(url.openStream()));
126         try {
127             DataAdapter da = new StreamDataAdapter(in);
128             try {
129                 da.computeStats();
130             } catch (Exception e) {
131                 throw new IOException(e.getMessage());
132             }
133             in = new BufferedReader(new InputStreamReader(url.openStream()));
134             fillBinStats(in);
135             loaded = true;
136         } finally {
137            if (in != null) {
138                try {
139                    in.close();
140                } catch (Exception ex) {
141                    // ignore
142                }
143            }
144         }
145     }
146 
147     /**
148      * Computes the empirical distribution from the input file.
149      * 
150      * @param file the input file
151      * @throws IOException if an IO error occurs
152      */
153     public void load(File file) throws IOException {
154         BufferedReader in = new BufferedReader(new FileReader(file));
155         try {
156             DataAdapter da = new StreamDataAdapter(in);
157             try {
158                 da.computeStats();
159             } catch (Exception e) {
160                 throw new IOException(e.getMessage());
161             }
162             in = new BufferedReader(new FileReader(file));
163             fillBinStats(in);
164             loaded = true;
165         } finally {
166             if (in != null) {
167                 try {
168                     in.close();
169                 } catch (Exception ex) {
170                     // ignore
171                 }
172             }
173         }
174     }
175 
176     /**
177      * Provides methods for computing <code>sampleStats</code> and
178      * <code>beanStats</code> abstracting the source of data.
179      */
180     private abstract class DataAdapter{
181         /** 
182          * Compute bin stats.
183          * 
184          * @param min minimum value
185          * @param delta  grid size
186          * @throws Exception  if an error occurs computing bin stats
187          */
188         public abstract void computeBinStats(double min, double delta)
189                 throws Exception;
190         /**
191          * Compute sample statistics.
192          * 
193          * @throws Exception if an error occurs computing sample stats
194          */
195         public abstract void computeStats() throws Exception;
196     }
197     /**
198      * Factory of <code>DataAdapter</code> objects. For every supported source
199      * of data (array of doubles, file, etc.) an instance of the proper object
200      * is returned.
201      */
202     private class DataAdapterFactory{
203         /**
204          * Creates a DataAdapter from a data object
205          * 
206          * @param in object providing access to the data
207          * @return DataAdapter instance
208          */
209         public DataAdapter getAdapter(Object in) {
210             if (in instanceof BufferedReader) {
211                 BufferedReader inputStream = (BufferedReader) in;
212                 return new StreamDataAdapter(inputStream);
213             } else if (in instanceof double[]) {
214                 double[] inputArray = (double[]) in;
215                 return new ArrayDataAdapter(inputArray);
216             } else {
217                 throw new IllegalArgumentException(
218                     "Input data comes from the" + " unsupported source");
219             }
220         }
221     }
222     /**
223      * <code>DataAdapter</code> for data provided through some input stream
224      */
225     private class StreamDataAdapter extends DataAdapter{
226         
227         /** Input stream providng access to the data */
228         private BufferedReader inputStream;
229         
230         /**
231          * Create a StreamDataAdapter from a BufferedReader
232          * 
233          * @param in BufferedReader input stream
234          */
235         public StreamDataAdapter(BufferedReader in){
236             super();
237             inputStream = in;
238         }
239         /**
240          * Computes binStats
241          * 
242          * @param min  minimum value
243          * @param delta  grid size
244          * @throws IOException if an IO error occurs
245          */
246         public void computeBinStats(double min, double delta)
247                 throws IOException {
248             String str = null;
249             double val = 0.0d;
250             while ((str = inputStream.readLine()) != null) {
251                 val = Double.parseDouble(str);
252                 SummaryStatistics stats =
253                     (SummaryStatistics) binStats.get(findBin(min, val, delta));
254                 stats.addValue(val);
255             }
256 
257             inputStream.close();
258             inputStream = null;
259         }
260         /**
261          * Computes sampleStats
262          * 
263          * @throws IOException if an IOError occurs
264          */
265         public void computeStats() throws IOException {
266             String str = null;
267             double val = 0.0;
268             sampleStats = SummaryStatistics.newInstance();
269             while ((str = inputStream.readLine()) != null) {
270                 val = new Double(str).doubleValue();
271                 sampleStats.addValue(val);
272             }
273             inputStream.close();
274             inputStream = null;
275         }
276     }
277 
278     /**
279      * <code>DataAdapter</code> for data provided as array of doubles.
280      */
281     private class ArrayDataAdapter extends DataAdapter{
282         
283         /** Array of input  data values */
284         private double[] inputArray;
285         
286         /**
287          * Construct an ArrayDataAdapter from a double[] array
288          * 
289          * @param in double[] array holding the data
290          */
291         public ArrayDataAdapter(double[] in){
292             super();
293             inputArray = in;
294         }
295         /**
296          * Computes sampleStats
297          * 
298          * @throws IOException if an IO error occurs
299          */
300         public void computeStats() throws IOException {
301             sampleStats = SummaryStatistics.newInstance();
302             for (int i = 0; i < inputArray.length; i++) {
303                 sampleStats.addValue(inputArray[i]);
304             }
305         }
306         /**
307          * Computes binStats
308          * 
309          * @param min  minimum value
310          * @param delta  grid size
311          * @throws IOException  if an IO error occurs
312          */
313         public void computeBinStats(double min, double delta)
314             throws IOException {
315             for (int i = 0; i < inputArray.length; i++) {
316                 SummaryStatistics stats =
317                     (SummaryStatistics) binStats.get(
318                             findBin(min, inputArray[i], delta));
319                 stats.addValue(inputArray[i]);
320             }
321         }
322     }
323 
324     /**
325      * Fills binStats array (second pass through data file).
326      * 
327      * @param in object providing access to the data
328      * @throws IOException  if an IO error occurs
329      */
330     private void fillBinStats(Object in) throws IOException {
331         // Load array of bin upper bounds -- evenly spaced from min - max
332         double min = sampleStats.getMin();
333         double max = sampleStats.getMax();
334         double delta = (max - min)/(new Double(binCount)).doubleValue();
335         double[] binUpperBounds = new double[binCount];
336         binUpperBounds[0] = min + delta;
337         for (int i = 1; i< binCount - 1; i++) {
338             binUpperBounds[i] = binUpperBounds[i-1] + delta;
339         }
340         binUpperBounds[binCount -1] = max;
341 
342         // Initialize binStats ArrayList
343         if (!binStats.isEmpty()) {
344             binStats.clear();
345         }
346         for (int i = 0; i < binCount; i++) {
347             SummaryStatistics stats = SummaryStatistics.newInstance();
348             binStats.add(i,stats);
349         }
350 
351         // Filling data in binStats Array
352         DataAdapterFactory aFactory = new DataAdapterFactory();
353         DataAdapter da = aFactory.getAdapter(in);
354         try {
355             da.computeBinStats(min, delta);
356         } catch (Exception e) {
357             if(e instanceof RuntimeException){
358                 throw new RuntimeException(e.getMessage());
359             }else{
360                 throw new IOException(e.getMessage());
361             }
362         }
363 
364         // Assign upperBounds based on bin counts
365         upperBounds = new double[binCount];
366         upperBounds[0] =
367         ((double)((SummaryStatistics)binStats.get(0)).getN())/
368         (double)sampleStats.getN();
369         for (int i = 1; i < binCount-1; i++) {
370             upperBounds[i] = upperBounds[i-1] +
371             ((double)((SummaryStatistics)binStats.get(i)).getN())/
372             (double)sampleStats.getN();
373         }
374         upperBounds[binCount-1] = 1.0d;
375     }
376     
377     /**
378      * Returns the index of the bin to which the given value belongs
379      * 
380      * @param min  the minimum value
381      * @param value  the value whose bin we are trying to find
382      * @param delta  the grid size
383      * @return the index of the bin containing the value
384      */
385     private int findBin(double min, double value, double delta) {
386         return Math.min(
387                 Math.max((int) Math.ceil((value- min) / delta) - 1, 0), 
388                 binCount - 1);
389         }
390 
391     /**
392      * Generates a random value from this distribution.
393      * 
394      * @return the random value.
395      * @throws IllegalStateException if the distribution has not been loaded
396      */
397     public double getNextValue() throws IllegalStateException {
398 
399         if (!loaded) {
400             throw new IllegalStateException("distribution not loaded");
401         }
402 
403         // Start with a uniformly distributed random number in (0,1)
404         double x = Math.random();
405 
406         // Use this to select the bin and generate a Gaussian within the bin
407         for (int i = 0; i < binCount; i++) {
408            if (x <= upperBounds[i]) {
409                SummaryStatistics stats = (SummaryStatistics)binStats.get(i);
410                if (stats.getN() > 0) {
411                    if (stats.getStandardDeviation() > 0) {  // more than one obs
412                         return randomData.nextGaussian
413                             (stats.getMean(),stats.getStandardDeviation());
414                    } else {
415                        return stats.getMean(); // only one obs in bin
416                    }
417                }
418            }
419         }
420         throw new RuntimeException("No bin selected");
421     }
422 
423     /**
424      * Returns a {@link StatisticalSummary} describing this distribution.
425      * <strong>Preconditions:</strong><ul>
426      * <li>the distribution must be loaded before invoking this method</li></ul>
427      * 
428      * @return the sample statistics
429      * @throws IllegalStateException if the distribution has not been loaded
430      */
431     public StatisticalSummary getSampleStats() {
432         return sampleStats;
433     }
434 
435     /**
436      * Returns the number of bins.
437      * 
438      * @return the number of bins.
439      */
440     public int getBinCount() {
441         return binCount;
442     }
443 
444     /**
445      * Returns an ArrayList of {@link SummaryStatistics} instances containing
446      * statistics describing the values in each of the bins.  The ArrayList is
447      * indexed on the bin number.
448      * 
449      * @return List of bin statistics.
450      */
451     public List getBinStats() {
452         return binStats;
453     }
454 
455     /**
456      * Returns (a fresh copy of) the array of upper bounds for the bins.
457        Bins are: <br/>
458      * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
459      *  (upperBounds[binCount-1],max]
460      * 
461      * @return array of bin upper bounds
462      */
463     public double[] getUpperBounds() {
464         int len = upperBounds.length;
465         double[] out = new double[len];
466         System.arraycopy(upperBounds, 0, out, 0, len);
467         return out;
468     }
469 
470     /**
471      * Property indicating whether or not the distribution has been loaded.
472      * 
473      * @return true if the distribution has been loaded
474      */
475     public boolean isLoaded() {
476         return loaded;
477     }
478 }