View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.math.random;
19  
20  import java.io.BufferedReader;
21  import java.io.File;
22  import java.io.FileReader;
23  import java.io.IOException;
24  import java.io.InputStreamReader;
25  import java.io.Serializable;
26  import java.net.URL;
27  import java.util.ArrayList;
28  import java.util.List;
29  
30  import org.apache.commons.math.MathRuntimeException;
31  import org.apache.commons.math.stat.descriptive.StatisticalSummary;
32  import org.apache.commons.math.stat.descriptive.SummaryStatistics;
33  
34  /**
35   * Implements <code>EmpiricalDistribution</code> interface.  This implementation
36   * uses what amounts to the
37   * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
38   * Variable Kernel Method</a> with Gaussian smoothing:<p>
39   * <strong>Digesting the input file</strong>
40   * <ol><li>Pass the file once to compute min and max.</li>
41   * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
42   * <li>Pass the data file again, computing bin counts and univariate
43   *     statistics (mean, std dev.) for each of the bins </li>
44   * <li>Divide the interval (0,1) into subintervals associated with the bins,
45   *     with the length of a bin's subinterval proportional to its count.</li></ol>
46   * <strong>Generating random values from the distribution</strong><ol>
47   * <li>Generate a uniformly distributed value in (0,1) </li>
48   * <li>Select the subinterval to which the value belongs.
49   * <li>Generate a random Gaussian value with mean = mean of the associated
50   *     bin and std dev = std dev of associated bin.</li></ol></p><p>
51   *<strong>USAGE NOTES:</strong><ul>
52   *<li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
53   *    is to set the bin count to approximately the length of the input file divided
54   *    by 10. </li>
55   *<li>The input file <i>must</i> be a plain text file containing one valid numeric
56   *    entry per line.</li>
57   * </ul></p>
58   *
59   * @version $Revision: 772119 $ $Date: 2009-05-06 05:43:28 -0400 (Wed, 06 May 2009) $
60   */
61  public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
62  
63      /** Serializable version identifier */
64      private static final long serialVersionUID = 5729073523949762654L;
65  
66      /** List of SummaryStatistics objects characterizing the bins */
67      private List<SummaryStatistics> binStats = null;
68  
69      /** Sample statistics */
70      private SummaryStatistics sampleStats = null;
71  
72      /** number of bins */
73      private int binCount = 1000;
74  
75      /** is the distribution loaded? */
76      private boolean loaded = false;
77  
78      /** upper bounds of subintervals in (0,1) "belonging" to the bins */
79      private double[] upperBounds = null;
80  
81      /** RandomData instance to use in repeated calls to getNext() */
82      private RandomData randomData = new RandomDataImpl();
83  
84      /**
85       * Creates a new EmpiricalDistribution with the default bin count.
86       */
87      public EmpiricalDistributionImpl() {
88          binStats = new ArrayList<SummaryStatistics>();
89      }
90  
91      /**
92       * Creates a new EmpiricalDistribution  with the specified bin count.
93       * 
94       * @param binCount number of bins
95       */
96      public EmpiricalDistributionImpl(int binCount) {
97          this.binCount = binCount;
98          binStats = new ArrayList<SummaryStatistics>();
99      }
100 
101      /**
102      * Computes the empirical distribution from the provided
103      * array of numbers.
104      * 
105      * @param in the input data array
106      */
107     public void load(double[] in) {
108         DataAdapter da = new ArrayDataAdapter(in);
109         try {
110             da.computeStats();
111             fillBinStats(in);
112         } catch (Exception e) {
113             throw new MathRuntimeException(e);
114         }
115         loaded = true;
116 
117     }
118 
119     /**
120      * Computes the empirical distribution using data read from a URL.
121      * @param url  url of the input file
122      * 
123      * @throws IOException if an IO error occurs
124      */
125     public void load(URL url) throws IOException {
126         BufferedReader in =
127             new BufferedReader(new InputStreamReader(url.openStream()));
128         try {
129             DataAdapter da = new StreamDataAdapter(in);
130             try {
131                 da.computeStats();
132             } catch (IOException ioe) {
133                 // don't wrap exceptions which are already IOException
134                 throw ioe;
135             } catch (RuntimeException rte) {
136                 // don't wrap RuntimeExceptions
137                 throw rte;
138             } catch (Exception e) {
139                 throw MathRuntimeException.createIOException(e);
140             }
141             if (sampleStats.getN() == 0) {
142                 throw MathRuntimeException.createEOFException("URL {0} contains no data",
143                                                               url);
144             }
145             in = new BufferedReader(new InputStreamReader(url.openStream()));
146             fillBinStats(in);
147             loaded = true;
148         } finally {
149            try {
150                in.close();
151            } catch (IOException ex) {
152                // ignore
153            }
154         }
155     }
156 
157     /**
158      * Computes the empirical distribution from the input file.
159      * 
160      * @param file the input file
161      * @throws IOException if an IO error occurs
162      */
163     public void load(File file) throws IOException {
164         BufferedReader in = new BufferedReader(new FileReader(file));
165         try {
166             DataAdapter da = new StreamDataAdapter(in);
167             try {
168                 da.computeStats();
169             } catch (IOException ioe) {
170                 // don't wrap exceptions which are already IOException
171                 throw ioe;
172             } catch (RuntimeException rte) {
173                 // don't wrap RuntimeExceptions
174                 throw rte;
175             } catch (Exception e) {
176                 throw MathRuntimeException.createIOException(e);
177             }
178             in = new BufferedReader(new FileReader(file));
179             fillBinStats(in);
180             loaded = true;
181         } finally {
182             try {
183                 in.close();
184             } catch (IOException ex) {
185                 // ignore
186             }
187         }
188     }
189 
190     /**
191      * Provides methods for computing <code>sampleStats</code> and
192      * <code>beanStats</code> abstracting the source of data.
193      */
194     private abstract class DataAdapter{
195         /** 
196          * Compute bin stats.
197          * 
198          * @param min minimum value
199          * @param delta  grid size
200          * @throws Exception  if an error occurs computing bin stats
201          */
202         public abstract void computeBinStats(double min, double delta)
203                 throws Exception;
204         /**
205          * Compute sample statistics.
206          * 
207          * @throws Exception if an error occurs computing sample stats
208          */
209         public abstract void computeStats() throws Exception;
210     }
211     /**
212      * Factory of <code>DataAdapter</code> objects. For every supported source
213      * of data (array of doubles, file, etc.) an instance of the proper object
214      * is returned.
215      */
216     private class DataAdapterFactory{
217         /**
218          * Creates a DataAdapter from a data object
219          * 
220          * @param in object providing access to the data
221          * @return DataAdapter instance
222          */
223         public DataAdapter getAdapter(Object in) {
224             if (in instanceof BufferedReader) {
225                 BufferedReader inputStream = (BufferedReader) in;
226                 return new StreamDataAdapter(inputStream);
227             } else if (in instanceof double[]) {
228                 double[] inputArray = (double[]) in;
229                 return new ArrayDataAdapter(inputArray);
230             } else {
231                 throw MathRuntimeException.createIllegalArgumentException(
232                       "input data comes from unsupported datasource: {0}, " +
233                       "supported sources: {1}, {2}",
234                       in.getClass().getName(),
235                       BufferedReader.class.getName(), double[].class.getName());
236             }
237         }
238     }
239     /**
240      * <code>DataAdapter</code> for data provided through some input stream
241      */
242     private class StreamDataAdapter extends DataAdapter{
243         
244         /** Input stream providng access to the data */
245         private BufferedReader inputStream;
246         
247         /**
248          * Create a StreamDataAdapter from a BufferedReader
249          * 
250          * @param in BufferedReader input stream
251          */
252         public StreamDataAdapter(BufferedReader in){
253             super();
254             inputStream = in;
255         }
256         /**
257          * Computes binStats
258          * 
259          * @param min  minimum value
260          * @param delta  grid size
261          * @throws IOException if an IO error occurs
262          */
263         @Override
264         public void computeBinStats(double min, double delta)
265                 throws IOException {
266             String str = null;
267             double val = 0.0d;
268             while ((str = inputStream.readLine()) != null) {
269                 val = Double.parseDouble(str);
270                 SummaryStatistics stats = binStats.get(findBin(min, val, delta));
271                 stats.addValue(val);
272             }
273 
274             inputStream.close();
275             inputStream = null;
276         }
277         /**
278          * Computes sampleStats
279          * 
280          * @throws IOException if an IOError occurs
281          */
282         @Override
283         public void computeStats() throws IOException {
284             String str = null;
285             double val = 0.0;
286             sampleStats = new SummaryStatistics();
287             while ((str = inputStream.readLine()) != null) {
288                 val = Double.valueOf(str).doubleValue();
289                 sampleStats.addValue(val);
290             }
291             inputStream.close();
292             inputStream = null;
293         }
294     }
295 
296     /**
297      * <code>DataAdapter</code> for data provided as array of doubles.
298      */
299     private class ArrayDataAdapter extends DataAdapter{
300         
301         /** Array of input  data values */
302         private double[] inputArray;
303         
304         /**
305          * Construct an ArrayDataAdapter from a double[] array
306          * 
307          * @param in double[] array holding the data
308          */
309         public ArrayDataAdapter(double[] in){
310             super();
311             inputArray = in;
312         }
313         /**
314          * Computes sampleStats
315          * 
316          * @throws IOException if an IO error occurs
317          */
318         @Override
319         public void computeStats() throws IOException {
320             sampleStats = new SummaryStatistics();
321             for (int i = 0; i < inputArray.length; i++) {
322                 sampleStats.addValue(inputArray[i]);
323             }
324         }
325         /**
326          * Computes binStats
327          * 
328          * @param min  minimum value
329          * @param delta  grid size
330          * @throws IOException  if an IO error occurs
331          */
332         @Override
333         public void computeBinStats(double min, double delta)
334             throws IOException {
335             for (int i = 0; i < inputArray.length; i++) {
336                 SummaryStatistics stats =
337                     binStats.get(findBin(min, inputArray[i], delta));
338                 stats.addValue(inputArray[i]);
339             }
340         }
341     }
342 
343     /**
344      * Fills binStats array (second pass through data file).
345      * 
346      * @param in object providing access to the data
347      * @throws IOException  if an IO error occurs
348      */
349     private void fillBinStats(Object in) throws IOException {
350         // Load array of bin upper bounds -- evenly spaced from min - max
351         double min = sampleStats.getMin();
352         double max = sampleStats.getMax();
353         double delta = (max - min)/(Double.valueOf(binCount)).doubleValue();
354         double[] binUpperBounds = new double[binCount];
355         binUpperBounds[0] = min + delta;
356         for (int i = 1; i< binCount - 1; i++) {
357             binUpperBounds[i] = binUpperBounds[i-1] + delta;
358         }
359         binUpperBounds[binCount -1] = max;
360 
361         // Initialize binStats ArrayList
362         if (!binStats.isEmpty()) {
363             binStats.clear();
364         }
365         for (int i = 0; i < binCount; i++) {
366             SummaryStatistics stats = new SummaryStatistics();
367             binStats.add(i,stats);
368         }
369 
370         // Filling data in binStats Array
371         DataAdapterFactory aFactory = new DataAdapterFactory();
372         DataAdapter da = aFactory.getAdapter(in);
373         try {
374             da.computeBinStats(min, delta);
375         } catch (IOException ioe) {
376             // don't wrap exceptions which are already IOException
377             throw ioe;
378         } catch (RuntimeException rte) {
379             // don't wrap RuntimeExceptions
380             throw rte;
381         } catch (Exception e) {
382             throw MathRuntimeException.createIOException(e);
383         }
384 
385         // Assign upperBounds based on bin counts
386         upperBounds = new double[binCount];
387         upperBounds[0] =
388         ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
389         for (int i = 1; i < binCount-1; i++) {
390             upperBounds[i] = upperBounds[i-1] +
391             ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
392         }
393         upperBounds[binCount-1] = 1.0d;
394     }
395     
396     /**
397      * Returns the index of the bin to which the given value belongs
398      * 
399      * @param min  the minimum value
400      * @param value  the value whose bin we are trying to find
401      * @param delta  the grid size
402      * @return the index of the bin containing the value
403      */
404     private int findBin(double min, double value, double delta) {
405         return Math.min(
406                 Math.max((int) Math.ceil((value- min) / delta) - 1, 0), 
407                 binCount - 1);
408         }
409 
410     /**
411      * Generates a random value from this distribution.
412      * 
413      * @return the random value.
414      * @throws IllegalStateException if the distribution has not been loaded
415      */
416     public double getNextValue() throws IllegalStateException {
417 
418         if (!loaded) {
419             throw MathRuntimeException.createIllegalStateException("distribution not loaded");
420         }
421 
422         // Start with a uniformly distributed random number in (0,1)
423         double x = Math.random();
424 
425         // Use this to select the bin and generate a Gaussian within the bin
426         for (int i = 0; i < binCount; i++) {
427            if (x <= upperBounds[i]) {
428                SummaryStatistics stats = binStats.get(i);
429                if (stats.getN() > 0) {
430                    if (stats.getStandardDeviation() > 0) {  // more than one obs
431                         return randomData.nextGaussian
432                             (stats.getMean(),stats.getStandardDeviation());
433                    } else {
434                        return stats.getMean(); // only one obs in bin
435                    }
436                }
437            }
438         }
439         throw new MathRuntimeException("no bin selected");
440     }
441 
442     /**
443      * Returns a {@link StatisticalSummary} describing this distribution.
444      * <strong>Preconditions:</strong><ul>
445      * <li>the distribution must be loaded before invoking this method</li></ul>
446      * 
447      * @return the sample statistics
448      * @throws IllegalStateException if the distribution has not been loaded
449      */
450     public StatisticalSummary getSampleStats() {
451         return sampleStats;
452     }
453 
454     /**
455      * Returns the number of bins.
456      * 
457      * @return the number of bins.
458      */
459     public int getBinCount() {
460         return binCount;
461     }
462 
463     /**
464      * Returns a List of {@link SummaryStatistics} instances containing
465      * statistics describing the values in each of the bins.  The list is
466      * indexed on the bin number.
467      * 
468      * @return List of bin statistics.
469      */
470     public List<SummaryStatistics> getBinStats() {
471         return binStats;
472     }
473 
474     /**
475      * Returns (a fresh copy of) the array of upper bounds for the bins.
476        Bins are: <br/>
477      * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
478      *  (upperBounds[binCount-1],max]
479      * 
480      * @return array of bin upper bounds
481      */
482     public double[] getUpperBounds() {
483         int len = upperBounds.length;
484         double[] out = new double[len];
485         System.arraycopy(upperBounds, 0, out, 0, len);
486         return out;
487     }
488 
489     /**
490      * Property indicating whether or not the distribution has been loaded.
491      * 
492      * @return true if the distribution has been loaded
493      */
494     public boolean isLoaded() {
495         return loaded;
496     }
497 }