001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math.random;
019    
020    import java.io.BufferedReader;
021    import java.io.File;
022    import java.io.FileReader;
023    import java.io.IOException;
024    import java.io.InputStreamReader;
025    import java.io.Serializable;
026    import java.net.URL;
027    import java.util.ArrayList;
028    import java.util.List;
029    
030    import org.apache.commons.math.MathRuntimeException;
031    import org.apache.commons.math.stat.descriptive.StatisticalSummary;
032    import org.apache.commons.math.stat.descriptive.SummaryStatistics;
033    
034    /**
035     * Implements <code>EmpiricalDistribution</code> interface.  This implementation
036     * uses what amounts to the
037     * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
038     * Variable Kernel Method</a> with Gaussian smoothing:<p>
039     * <strong>Digesting the input file</strong>
040     * <ol><li>Pass the file once to compute min and max.</li>
041     * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
042     * <li>Pass the data file again, computing bin counts and univariate
043     *     statistics (mean, std dev.) for each of the bins </li>
044     * <li>Divide the interval (0,1) into subintervals associated with the bins,
045     *     with the length of a bin's subinterval proportional to its count.</li></ol>
046     * <strong>Generating random values from the distribution</strong><ol>
047     * <li>Generate a uniformly distributed value in (0,1) </li>
048     * <li>Select the subinterval to which the value belongs.
049     * <li>Generate a random Gaussian value with mean = mean of the associated
050     *     bin and std dev = std dev of associated bin.</li></ol></p><p>
051     *<strong>USAGE NOTES:</strong><ul>
052     *<li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
053     *    is to set the bin count to approximately the length of the input file divided
054     *    by 10. </li>
055     *<li>The input file <i>must</i> be a plain text file containing one valid numeric
056     *    entry per line.</li>
057     * </ul></p>
058     *
059     * @version $Revision: 772119 $ $Date: 2009-05-06 05:43:28 -0400 (Wed, 06 May 2009) $
060     */
061    public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
062    
063        /** Serializable version identifier */
064        private static final long serialVersionUID = 5729073523949762654L;
065    
066        /** List of SummaryStatistics objects characterizing the bins */
067        private List<SummaryStatistics> binStats = null;
068    
069        /** Sample statistics */
070        private SummaryStatistics sampleStats = null;
071    
072        /** number of bins */
073        private int binCount = 1000;
074    
075        /** is the distribution loaded? */
076        private boolean loaded = false;
077    
078        /** upper bounds of subintervals in (0,1) "belonging" to the bins */
079        private double[] upperBounds = null;
080    
081        /** RandomData instance to use in repeated calls to getNext() */
082        private RandomData randomData = new RandomDataImpl();
083    
084        /**
085         * Creates a new EmpiricalDistribution with the default bin count.
086         */
087        public EmpiricalDistributionImpl() {
088            binStats = new ArrayList<SummaryStatistics>();
089        }
090    
091        /**
092         * Creates a new EmpiricalDistribution  with the specified bin count.
093         * 
094         * @param binCount number of bins
095         */
096        public EmpiricalDistributionImpl(int binCount) {
097            this.binCount = binCount;
098            binStats = new ArrayList<SummaryStatistics>();
099        }
100    
101         /**
102         * Computes the empirical distribution from the provided
103         * array of numbers.
104         * 
105         * @param in the input data array
106         */
107        public void load(double[] in) {
108            DataAdapter da = new ArrayDataAdapter(in);
109            try {
110                da.computeStats();
111                fillBinStats(in);
112            } catch (Exception e) {
113                throw new MathRuntimeException(e);
114            }
115            loaded = true;
116    
117        }
118    
119        /**
120         * Computes the empirical distribution using data read from a URL.
121         * @param url  url of the input file
122         * 
123         * @throws IOException if an IO error occurs
124         */
125        public void load(URL url) throws IOException {
126            BufferedReader in =
127                new BufferedReader(new InputStreamReader(url.openStream()));
128            try {
129                DataAdapter da = new StreamDataAdapter(in);
130                try {
131                    da.computeStats();
132                } catch (IOException ioe) {
133                    // don't wrap exceptions which are already IOException
134                    throw ioe;
135                } catch (RuntimeException rte) {
136                    // don't wrap RuntimeExceptions
137                    throw rte;
138                } catch (Exception e) {
139                    throw MathRuntimeException.createIOException(e);
140                }
141                if (sampleStats.getN() == 0) {
142                    throw MathRuntimeException.createEOFException("URL {0} contains no data",
143                                                                  url);
144                }
145                in = new BufferedReader(new InputStreamReader(url.openStream()));
146                fillBinStats(in);
147                loaded = true;
148            } finally {
149               try {
150                   in.close();
151               } catch (IOException ex) {
152                   // ignore
153               }
154            }
155        }
156    
157        /**
158         * Computes the empirical distribution from the input file.
159         * 
160         * @param file the input file
161         * @throws IOException if an IO error occurs
162         */
163        public void load(File file) throws IOException {
164            BufferedReader in = new BufferedReader(new FileReader(file));
165            try {
166                DataAdapter da = new StreamDataAdapter(in);
167                try {
168                    da.computeStats();
169                } catch (IOException ioe) {
170                    // don't wrap exceptions which are already IOException
171                    throw ioe;
172                } catch (RuntimeException rte) {
173                    // don't wrap RuntimeExceptions
174                    throw rte;
175                } catch (Exception e) {
176                    throw MathRuntimeException.createIOException(e);
177                }
178                in = new BufferedReader(new FileReader(file));
179                fillBinStats(in);
180                loaded = true;
181            } finally {
182                try {
183                    in.close();
184                } catch (IOException ex) {
185                    // ignore
186                }
187            }
188        }
189    
190        /**
191         * Provides methods for computing <code>sampleStats</code> and
192         * <code>beanStats</code> abstracting the source of data.
193         */
194        private abstract class DataAdapter{
195            /** 
196             * Compute bin stats.
197             * 
198             * @param min minimum value
199             * @param delta  grid size
200             * @throws Exception  if an error occurs computing bin stats
201             */
202            public abstract void computeBinStats(double min, double delta)
203                    throws Exception;
204            /**
205             * Compute sample statistics.
206             * 
207             * @throws Exception if an error occurs computing sample stats
208             */
209            public abstract void computeStats() throws Exception;
210        }
211        /**
212         * Factory of <code>DataAdapter</code> objects. For every supported source
213         * of data (array of doubles, file, etc.) an instance of the proper object
214         * is returned.
215         */
216        private class DataAdapterFactory{
217            /**
218             * Creates a DataAdapter from a data object
219             * 
220             * @param in object providing access to the data
221             * @return DataAdapter instance
222             */
223            public DataAdapter getAdapter(Object in) {
224                if (in instanceof BufferedReader) {
225                    BufferedReader inputStream = (BufferedReader) in;
226                    return new StreamDataAdapter(inputStream);
227                } else if (in instanceof double[]) {
228                    double[] inputArray = (double[]) in;
229                    return new ArrayDataAdapter(inputArray);
230                } else {
231                    throw MathRuntimeException.createIllegalArgumentException(
232                          "input data comes from unsupported datasource: {0}, " +
233                          "supported sources: {1}, {2}",
234                          in.getClass().getName(),
235                          BufferedReader.class.getName(), double[].class.getName());
236                }
237            }
238        }
239        /**
240         * <code>DataAdapter</code> for data provided through some input stream
241         */
242        private class StreamDataAdapter extends DataAdapter{
243            
244            /** Input stream providng access to the data */
245            private BufferedReader inputStream;
246            
247            /**
248             * Create a StreamDataAdapter from a BufferedReader
249             * 
250             * @param in BufferedReader input stream
251             */
252            public StreamDataAdapter(BufferedReader in){
253                super();
254                inputStream = in;
255            }
256            /**
257             * Computes binStats
258             * 
259             * @param min  minimum value
260             * @param delta  grid size
261             * @throws IOException if an IO error occurs
262             */
263            @Override
264            public void computeBinStats(double min, double delta)
265                    throws IOException {
266                String str = null;
267                double val = 0.0d;
268                while ((str = inputStream.readLine()) != null) {
269                    val = Double.parseDouble(str);
270                    SummaryStatistics stats = binStats.get(findBin(min, val, delta));
271                    stats.addValue(val);
272                }
273    
274                inputStream.close();
275                inputStream = null;
276            }
277            /**
278             * Computes sampleStats
279             * 
280             * @throws IOException if an IOError occurs
281             */
282            @Override
283            public void computeStats() throws IOException {
284                String str = null;
285                double val = 0.0;
286                sampleStats = new SummaryStatistics();
287                while ((str = inputStream.readLine()) != null) {
288                    val = Double.valueOf(str).doubleValue();
289                    sampleStats.addValue(val);
290                }
291                inputStream.close();
292                inputStream = null;
293            }
294        }
295    
296        /**
297         * <code>DataAdapter</code> for data provided as array of doubles.
298         */
299        private class ArrayDataAdapter extends DataAdapter{
300            
301            /** Array of input  data values */
302            private double[] inputArray;
303            
304            /**
305             * Construct an ArrayDataAdapter from a double[] array
306             * 
307             * @param in double[] array holding the data
308             */
309            public ArrayDataAdapter(double[] in){
310                super();
311                inputArray = in;
312            }
313            /**
314             * Computes sampleStats
315             * 
316             * @throws IOException if an IO error occurs
317             */
318            @Override
319            public void computeStats() throws IOException {
320                sampleStats = new SummaryStatistics();
321                for (int i = 0; i < inputArray.length; i++) {
322                    sampleStats.addValue(inputArray[i]);
323                }
324            }
325            /**
326             * Computes binStats
327             * 
328             * @param min  minimum value
329             * @param delta  grid size
330             * @throws IOException  if an IO error occurs
331             */
332            @Override
333            public void computeBinStats(double min, double delta)
334                throws IOException {
335                for (int i = 0; i < inputArray.length; i++) {
336                    SummaryStatistics stats =
337                        binStats.get(findBin(min, inputArray[i], delta));
338                    stats.addValue(inputArray[i]);
339                }
340            }
341        }
342    
343        /**
344         * Fills binStats array (second pass through data file).
345         * 
346         * @param in object providing access to the data
347         * @throws IOException  if an IO error occurs
348         */
349        private void fillBinStats(Object in) throws IOException {
350            // Load array of bin upper bounds -- evenly spaced from min - max
351            double min = sampleStats.getMin();
352            double max = sampleStats.getMax();
353            double delta = (max - min)/(Double.valueOf(binCount)).doubleValue();
354            double[] binUpperBounds = new double[binCount];
355            binUpperBounds[0] = min + delta;
356            for (int i = 1; i< binCount - 1; i++) {
357                binUpperBounds[i] = binUpperBounds[i-1] + delta;
358            }
359            binUpperBounds[binCount -1] = max;
360    
361            // Initialize binStats ArrayList
362            if (!binStats.isEmpty()) {
363                binStats.clear();
364            }
365            for (int i = 0; i < binCount; i++) {
366                SummaryStatistics stats = new SummaryStatistics();
367                binStats.add(i,stats);
368            }
369    
370            // Filling data in binStats Array
371            DataAdapterFactory aFactory = new DataAdapterFactory();
372            DataAdapter da = aFactory.getAdapter(in);
373            try {
374                da.computeBinStats(min, delta);
375            } catch (IOException ioe) {
376                // don't wrap exceptions which are already IOException
377                throw ioe;
378            } catch (RuntimeException rte) {
379                // don't wrap RuntimeExceptions
380                throw rte;
381            } catch (Exception e) {
382                throw MathRuntimeException.createIOException(e);
383            }
384    
385            // Assign upperBounds based on bin counts
386            upperBounds = new double[binCount];
387            upperBounds[0] =
388            ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
389            for (int i = 1; i < binCount-1; i++) {
390                upperBounds[i] = upperBounds[i-1] +
391                ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
392            }
393            upperBounds[binCount-1] = 1.0d;
394        }
395        
396        /**
397         * Returns the index of the bin to which the given value belongs
398         * 
399         * @param min  the minimum value
400         * @param value  the value whose bin we are trying to find
401         * @param delta  the grid size
402         * @return the index of the bin containing the value
403         */
404        private int findBin(double min, double value, double delta) {
405            return Math.min(
406                    Math.max((int) Math.ceil((value- min) / delta) - 1, 0), 
407                    binCount - 1);
408            }
409    
410        /**
411         * Generates a random value from this distribution.
412         * 
413         * @return the random value.
414         * @throws IllegalStateException if the distribution has not been loaded
415         */
416        public double getNextValue() throws IllegalStateException {
417    
418            if (!loaded) {
419                throw MathRuntimeException.createIllegalStateException("distribution not loaded");
420            }
421    
422            // Start with a uniformly distributed random number in (0,1)
423            double x = Math.random();
424    
425            // Use this to select the bin and generate a Gaussian within the bin
426            for (int i = 0; i < binCount; i++) {
427               if (x <= upperBounds[i]) {
428                   SummaryStatistics stats = binStats.get(i);
429                   if (stats.getN() > 0) {
430                       if (stats.getStandardDeviation() > 0) {  // more than one obs
431                            return randomData.nextGaussian
432                                (stats.getMean(),stats.getStandardDeviation());
433                       } else {
434                           return stats.getMean(); // only one obs in bin
435                       }
436                   }
437               }
438            }
439            throw new MathRuntimeException("no bin selected");
440        }
441    
442        /**
443         * Returns a {@link StatisticalSummary} describing this distribution.
444         * <strong>Preconditions:</strong><ul>
445         * <li>the distribution must be loaded before invoking this method</li></ul>
446         * 
447         * @return the sample statistics
448         * @throws IllegalStateException if the distribution has not been loaded
449         */
450        public StatisticalSummary getSampleStats() {
451            return sampleStats;
452        }
453    
454        /**
455         * Returns the number of bins.
456         * 
457         * @return the number of bins.
458         */
459        public int getBinCount() {
460            return binCount;
461        }
462    
463        /**
464         * Returns a List of {@link SummaryStatistics} instances containing
465         * statistics describing the values in each of the bins.  The list is
466         * indexed on the bin number.
467         * 
468         * @return List of bin statistics.
469         */
470        public List<SummaryStatistics> getBinStats() {
471            return binStats;
472        }
473    
474        /**
475         * Returns (a fresh copy of) the array of upper bounds for the bins.
476           Bins are: <br/>
477         * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
478         *  (upperBounds[binCount-1],max]
479         * 
480         * @return array of bin upper bounds
481         */
482        public double[] getUpperBounds() {
483            int len = upperBounds.length;
484            double[] out = new double[len];
485            System.arraycopy(upperBounds, 0, out, 0, len);
486            return out;
487        }
488    
489        /**
490         * Property indicating whether or not the distribution has been loaded.
491         * 
492         * @return true if the distribution has been loaded
493         */
494        public boolean isLoaded() {
495            return loaded;
496        }
497    }