001// License: GPL. For details, see LICENSE file.
002package org.openstreetmap.josm.tools;
003
004//// Taken from http://www.bmsi.com/java/#diff
005
006// http://www.bmsi.com/java/DiffPrint.java could also be useful
007
008/*
009 * $Log: Diff.java,v $
010 * Revision 1.15  2013/04/01 16:27:31  stuart
011 * Fix DiffPrint unified format with test cases.
012 * Begin porting some diff-2.7 features.
013 *
014 * Revision 1.14  2010/03/03 21:21:25  stuart
015 * Test new direct equivalence API
016 *
017 * Revision 1.13  2009/12/07 17:43:17  stuart
018 * Compute equivMax for int[] ctor
019 *
020 * Revision 1.12  2009/12/07 17:34:46  stuart
021 * Ctor with int[].
022 *
023 * Revision 1.11  2009/11/15 01:11:54  stuart
024 * Diff doesn't need to be generic
025 *
026 * Revision 1.10  2009/11/15 00:54:03  stuart
027 * Update to Java 5 containers
028 *
029 * Revision 1.7  2009/01/19 03:05:26  stuart
030 * Fix StackOverflow bug with heuristic on reported by Jimmy Han.
031 *
032 * Revision 1.6  2003/03/06 22:51:32  stuart
033 * Convert to CVS
034 *
035 * Revision 1.5  2002/07/19  19:14:40  stuart
036 * fix reverseScript, make change ctor public, update docs
037 *
038 * Revision 1.4  2002/04/09  17:53:39  stuart
039 * More flexible interface for diff() function.
040 *
041 * Revision 1.3  2000/03/03 21:58:03  stuart
042 * move discard_confusing_lines and shift_boundaries to class file_data
043 *
044 * Revision 1.2  2000/03/02  16:37:38  stuart
045 * Add GPL and copyright
046 *
047 */
048
049import java.util.HashMap;
050import java.util.Map;
051
052/** A class to compare vectors of objects.  The result of comparison
053    is a list of <code>change</code> objects which form an
054    edit script.  The objects compared are traditionally lines
055    of text from two files.  Comparison options such as "ignore
056    whitespace" are implemented by modifying the <code>equals</code>
057    and <code>hashcode</code> methods for the objects compared.
058<p>
059   The basic algorithm is described in: <br>
060   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
061   Algorithmica Vol. 1 No. 2, 1986, p 251.
062<p>
063   This class outputs different results from GNU diff 1.15 on some
064   inputs.  Our results are actually better (smaller change list, smaller
065   total size of changes), but it would be nice to know why.  Perhaps
066   there is a memory overwrite bug in GNU diff 1.15.
067
068  @author Stuart D. Gathman, translated from GNU diff 1.15
069    Copyright (C) 2000  Business Management Systems, Inc.
070<p>
071    This program is free software; you can redistribute it and/or modify
072    it under the terms of the GNU General Public License as published by
073    the Free Software Foundation; either version 1, or (at your option)
074    any later version.
075<p>
076    This program is distributed in the hope that it will be useful,
077    but WITHOUT ANY WARRANTY; without even the implied warranty of
078    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
079    GNU General Public License for more details.
080<p>
081    You should have received a copy of the <a href=COPYING.txt>
082    GNU General Public License</a>
083    along with this program; if not, write to the Free Software
084    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
085 */
086public class Diff {
087
088    /** Prepare to find differences between two arrays.  Each element of
089      the arrays is translated to an "equivalence number" based on
090      the result of <code>equals</code>.  The original Object arrays
091      are no longer needed for computing the differences.  They will
092      be needed again later to print the results of the comparison as
093      an edit script, if desired.
094     * @param a first array
095     * @param b second array
096     */
097    public Diff(Object[] a, Object[] b) {
098        Map<Object, Integer> h = new HashMap<>(a.length + b.length);
099        filevec = new FileData[] {new FileData(a, h), new FileData(b, h)};
100    }
101
102    /** 1 more than the maximum equivalence value used for this or its
103     sibling file. */
104    private int equivMax = 1;
105
106    /** When set to true, the comparison uses a heuristic to speed it up.
107    With this heuristic, for files with a constant small density
108    of changes, the algorithm is linear in the file size.  */
109    public boolean heuristic;
110
111    /** When set to true, the algorithm returns a guarranteed minimal
112      set of changes.  This makes things slower, sometimes much slower. */
113    public boolean noDiscards;
114
115    private int[] xvec, yvec; /* Vectors being compared. */
116    private int[] fdiag;      /* Vector, indexed by diagonal, containing
117                   the X coordinate of the point furthest
118                   along the given diagonal in the forward
119                   search of the edit matrix. */
120    private int[] bdiag;      /* Vector, indexed by diagonal, containing
121                   the X coordinate of the point furthest
122                   along the given diagonal in the backward
123                   search of the edit matrix. */
124    private int fdiagoff, bdiagoff;
125    private final FileData[] filevec;
126    private int cost;
127    /** Snakes bigger than this are considered "big". */
128    private static final int SNAKE_LIMIT = 20;
129
130    /**
131     * Find the midpoint of the shortest edit script for a specified
132     * portion of the two files.
133     *
134     * We scan from the beginnings of the files, and simultaneously from the ends,
135     * doing a breadth-first search through the space of edit-sequence.
136     * When the two searches meet, we have found the midpoint of the shortest
137     * edit sequence.
138     *
139     * The value returned is the number of the diagonal on which the midpoint lies.
140     * The diagonal number equals the number of inserted lines minus the number
141     * of deleted lines (counting only lines before the midpoint).
142     * The edit cost is stored into COST; this is the total number of
143     * lines inserted or deleted (counting only lines before the midpoint).
144     *
145     * This function assumes that the first lines of the specified portions
146     * of the two files do not match, and likewise that the last lines do not
147     * match.  The caller must trim matching lines from the beginning and end
148     * of the portions it is going to specify.
149     *
150     * Note that if we return the "wrong" diagonal value, or if
151     * the value of bdiag at that diagonal is "wrong",
152     * the worst this can do is cause suboptimal diff output.
153     * It cannot cause incorrect diff output.
154     * @param xoff xoff
155     * @param xlim xlim
156     * @param yoff yoff
157     * @param ylim ylim
158     * @return midpoint of the shortest edit script
159     */
160    private int diag(int xoff, int xlim, int yoff, int ylim) {
161        final int[] fd = fdiag; // Give the compiler a chance.
162        final int[] bd = bdiag; // Additional help for the compiler.
163        final int[] xv = xvec;      // Still more help for the compiler.
164        final int[] yv = yvec;      // And more and more . . .
165        final int dmin = xoff - ylim;   // Minimum valid diagonal.
166        final int dmax = xlim - yoff;   // Maximum valid diagonal.
167        final int fmid = xoff - yoff;   // Center diagonal of top-down search.
168        final int bmid = xlim - ylim;   // Center diagonal of bottom-up search.
169        int fmin = fmid, fmax = fmid;   // Limits of top-down search.
170        int bmin = bmid, bmax = bmid;   // Limits of bottom-up search.
171        // True if southeast corner is on an odd diagonal with respect to the northwest.
172        final boolean odd = (fmid - bmid & 1) != 0;
173
174        fd[fdiagoff + fmid] = xoff;
175        bd[bdiagoff + bmid] = xlim;
176
177        for (int c = 1;; ++c) {
178            int d;          /* Active diagonal. */
179            boolean big_snake = false;
180
181            /* Extend the top-down search by an edit step in each diagonal. */
182            if (fmin > dmin) {
183                fd[fdiagoff + --fmin - 1] = -1;
184            } else {
185                ++fmin;
186            }
187            if (fmax < dmax) {
188                fd[fdiagoff + ++fmax + 1] = -1;
189            } else {
190                --fmax;
191            }
192            for (d = fmax; d >= fmin; d -= 2) {
193                int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
194
195                if (tlo >= thi) {
196                    x = tlo + 1;
197                } else {
198                    x = thi;
199                }
200                oldx = x;
201                y = x - d;
202                while (x < xlim && y < ylim && xv[x] == yv[y]) {
203                    ++x; ++y;
204                }
205                if (x - oldx > SNAKE_LIMIT) {
206                    big_snake = true;
207                }
208                fd[fdiagoff + d] = x;
209                if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
210                    cost = 2 * c - 1;
211                    return d;
212                }
213            }
214
215            /* Similar extend the bottom-up search. */
216            if (bmin > dmin) {
217                bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
218            } else {
219                ++bmin;
220            }
221            if (bmax < dmax) {
222                bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
223            } else {
224                --bmax;
225            }
226            for (d = bmax; d >= bmin; d -= 2) {
227                int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
228
229                if (tlo < thi) {
230                    x = tlo;
231                } else {
232                    x = thi - 1;
233                }
234                oldx = x;
235                y = x - d;
236                while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
237                    --x; --y;
238                }
239                if (oldx - x > SNAKE_LIMIT) {
240                    big_snake = true;
241                }
242                bd[bdiagoff + d] = x;
243                if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
244                    cost = 2 * c;
245                    return d;
246                }
247            }
248
249            /* Heuristic: check occasionally for a diagonal that has made
250       lots of progress compared with the edit distance.
251       If we have any such, find the one that has made the most
252       progress and return it as if it had succeeded.
253
254       With this heuristic, for files with a constant small density
255       of changes, the algorithm is linear in the file size.  */
256
257            if (c > 200 && big_snake && heuristic) {
258                int best = 0;
259                int bestpos = -1;
260
261                for (d = fmax; d >= fmin; d -= 2) {
262                    int dd = d - fmid;
263                    int x = fd[fdiagoff + d];
264                    int y = x - d;
265                    int v = (x - xoff) * 2 - dd;
266                    if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
267                        if (v > best
268                                && xoff + SNAKE_LIMIT <= x && x < xlim
269                                && yoff + SNAKE_LIMIT <= y && y < ylim) {
270                            /* We have a good enough best diagonal.
271                               now insist that it end with a significant snake.  */
272                            int k;
273
274                            for (k = 1; xvec[x - k] == yvec[y - k]; k++) {
275                                if (k == SNAKE_LIMIT) {
276                                    best = v;
277                                    bestpos = d;
278                                    break;
279                                }
280                            }
281                        }
282                    }
283                }
284                if (best > 0) {
285                    cost = 2 * c - 1;
286                    return bestpos;
287                }
288
289                best = 0;
290                for (d = bmax; d >= bmin; d -= 2) {
291                    int dd = d - bmid;
292                    int x = bd[bdiagoff + d];
293                    int y = x - d;
294                    int v = (xlim - x) * 2 + dd;
295                    if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
296                        if (v > best
297                                && xoff < x && x <= xlim - SNAKE_LIMIT
298                                && yoff < y && y <= ylim - SNAKE_LIMIT) {
299                            /* We have a good enough best diagonal.
300                               now insist that it end with a significant snake.  */
301                            int k;
302
303                            for (k = 0; xvec[x + k] == yvec[y + k]; k++) {
304                                if (k == SNAKE_LIMIT) {
305                                    best = v;
306                                    bestpos = d;
307                                    break;
308                                }
309                            }
310                        }
311                    }
312                }
313                if (best > 0) {
314                    cost = 2 * c - 1;
315                    return bestpos;
316                }
317            }
318        }
319    }
320
321    /**
322     * Compare in detail contiguous subsequences of the two files
323     * which are known, as a whole, to match each other.
324     *
325     * The results are recorded in the vectors filevec[N].changed_flag, by
326     * storing a 1 in the element for each line that is an insertion or deletion.
327     *
328     * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
329     *
330     * Note that XLIM, YLIM are exclusive bounds.
331     * All line numbers are origin-0 and discarded lines are not counted.
332     * @param xoff xoff
333     * @param xlim xlim
334     * @param yoff yoff
335     * @param ylim ylim
336     */
337    private void compareseq(int xoff, int xlim, int yoff, int ylim) {
338        /* Slide down the bottom initial diagonal. */
339        while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
340            ++xoff; ++yoff;
341        }
342        /* Slide up the top initial diagonal. */
343        while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
344            --xlim; --ylim;
345        }
346
347        /* Handle simple cases. */
348        if (xoff == xlim) {
349            while (yoff < ylim) {
350                filevec[1].changedFlag[1+filevec[1].realindexes[yoff++]] = true;
351            }
352        } else if (yoff == ylim) {
353            while (xoff < xlim) {
354                filevec[0].changedFlag[1+filevec[0].realindexes[xoff++]] = true;
355            }
356        } else {
357            /* Find a point of correspondence in the middle of the files.  */
358
359            int d = diag(xoff, xlim, yoff, ylim);
360            int c = cost;
361            int b = bdiag[bdiagoff + d];
362
363            if (c == 1)
364                /* This should be impossible, because it implies that
365                   one of the two subsequences is empty,
366                   and that case was handled above without calling `diag'.
367                   Let's verify that this is true.  */
368                throw new IllegalArgumentException("Empty subsequence");
369            else {
370                /* Use that point to split this problem into two subproblems.  */
371                compareseq(xoff, b, yoff, b - d);
372                /* This used to use f instead of b,
373                   but that is incorrect!
374                   It is not necessarily the case that diagonal d
375                   has a snake from b to f.  */
376                compareseq(b, xlim, b - d, ylim);
377            }
378        }
379    }
380
381    /** Discard lines from one file that have no matches in the other file.
382     */
383    private void discard_confusing_lines() {
384        filevec[0].discard_confusing_lines(filevec[1]);
385        filevec[1].discard_confusing_lines(filevec[0]);
386    }
387
388    private boolean inhibit;
389
390    /**
391     * Adjust inserts/deletes of blank lines to join changes as much as possible.
392     */
393    private void shift_boundaries() {
394        if (inhibit)
395            return;
396        filevec[0].shift_boundaries(filevec[1]);
397        filevec[1].shift_boundaries(filevec[0]);
398    }
399
400    /**
401     * Script builder.
402     */
403    public interface ScriptBuilder {
404        /**
405         * Scan the tables of which lines are inserted and deleted, producing an edit script.
406         * @param changed0 true for lines in first file which do not match 2nd
407         * @param len0 number of lines in first file
408         * @param changed1 true for lines in 2nd file which do not match 1st
409         * @param len1 number of lines in 2nd file
410         * @return a linked list of changes - or null
411         */
412        Change build_script(
413                boolean[] changed0, int len0,
414                boolean[] changed1, int len1
415        );
416    }
417
418    /** Scan the tables of which lines are inserted and deleted,
419     producing an edit script in reverse order.  */
420
421    static class ReverseScript implements ScriptBuilder {
422        @Override
423        public  Change build_script(
424                final boolean[] changed0, int len0,
425                final boolean[] changed1, int len1) {
426            Change script = null;
427            int i0 = 0, i1 = 0;
428            while (i0 < len0 || i1 < len1) {
429                if (changed0[1+i0] || changed1[1+i1]) {
430                    int line0 = i0, line1 = i1;
431
432                    /* Find # lines changed here in each file.  */
433                    while (changed0[1+i0]) {
434                        ++i0;
435                    }
436                    while (changed1[1+i1]) {
437                        ++i1;
438                    }
439
440                    /* Record this change.  */
441                    script = new Change(line0, line1, i0 - line0, i1 - line1, script);
442                }
443
444                /* We have reached lines in the two files that match each other.  */
445                i0++; i1++;
446            }
447
448            return script;
449        }
450    }
451
452    static class ForwardScript implements ScriptBuilder {
453        /** Scan the tables of which lines are inserted and deleted,
454            producing an edit script in forward order.  */
455        @Override
456        public Change build_script(
457                final boolean[] changed0, int len0,
458                final boolean[] changed1, int len1) {
459            Change script = null;
460            int i0 = len0, i1 = len1;
461
462            while (i0 >= 0 || i1 >= 0) {
463                if (changed0[i0] || changed1[i1]) {
464                    int line0 = i0, line1 = i1;
465
466                    /* Find # lines changed here in each file.  */
467                    while (changed0[i0]) {
468                        --i0;
469                    }
470                    while (changed1[i1]) {
471                        --i1;
472                    }
473
474                    /* Record this change.  */
475                    script = new Change(i0, i1, line0 - i0, line1 - i1, script);
476                }
477
478                /* We have reached lines in the two files that match each other.  */
479                i0--; i1--;
480            }
481
482            return script;
483        }
484    }
485
486    /** Standard Forward ScriptBuilder. */
487    public static final ScriptBuilder forwardScript = new ForwardScript();
488    /** Standard Reverse ScriptBuilder. */
489    public static final ScriptBuilder reverseScript = new ReverseScript();
490
491    /**
492     * Report the differences of two files. DEPTH is the current directory depth.
493     * @param reverse if {@code true} use {@link #reverseScript} else use {@link #forwardScript}
494     * @return the differences of two files
495     */
496    public final Change diff_2(final boolean reverse) {
497        return diff(reverse ? reverseScript : forwardScript);
498    }
499
500    /**
501     * Get the results of comparison as an edit script.  The script
502     * is described by a list of changes.  The standard ScriptBuilder
503     * implementations provide for forward and reverse edit scripts.
504     * Alternate implementations could, for instance, list common elements
505     * instead of differences.
506     * @param bld an object to build the script from change flags
507     * @return the head of a list of changes
508     */
509    public Change diff(final ScriptBuilder bld) {
510
511        // Some lines are obviously insertions or deletions because they don't match anything.
512        // Detect them now, and avoid even thinking about them in the main comparison algorithm.
513        discard_confusing_lines();
514
515        // Now do the main comparison algorithm, considering just the undiscarded lines.
516        xvec = filevec[0].undiscarded;
517        yvec = filevec[1].undiscarded;
518
519        int diags = filevec[0].nondiscardedLines + filevec[1].nondiscardedLines + 3;
520        fdiag = new int[diags];
521        fdiagoff = filevec[1].nondiscardedLines + 1;
522        bdiag = new int[diags];
523        bdiagoff = filevec[1].nondiscardedLines + 1;
524
525        compareseq(0, filevec[0].nondiscardedLines,
526                   0, filevec[1].nondiscardedLines);
527        fdiag = null;
528        bdiag = null;
529
530        // Modify the results slightly to make them prettier in cases where that can validly be done.
531        shift_boundaries();
532
533        // Get the results of comparison in the form of a chain of `struct change's -- an edit script.
534        return bld.build_script(
535                filevec[0].changedFlag,
536                filevec[0].bufferedLines,
537                filevec[1].changedFlag,
538                filevec[1].bufferedLines
539        );
540    }
541
542    /** The result of comparison is an "edit script": a chain of change objects.
543     Each change represents one place where some lines are deleted
544     and some are inserted.
545
546     LINE0 and LINE1 are the first affected lines in the two files (origin 0).
547     DELETED is the number of lines deleted here from file 0.
548     INSERTED is the number of lines inserted here in file 1.
549
550     If DELETED is 0 then LINE0 is the number of the line before
551     which the insertion was done; vice versa for INSERTED and LINE1.  */
552
553    public static class Change {
554        /** Previous or next edit command. */
555        public Change link;
556        /** # lines of file 1 changed here.  */
557        public final int inserted;
558        /** # lines of file 0 changed here.  */
559        public final int deleted;
560        /** Line number of 1st deleted line.  */
561        public final int line0;
562        /** Line number of 1st inserted line.  */
563        public final int line1;
564
565        /**
566         * Cons an additional entry onto the front of an edit script OLD.
567         * LINE0 and LINE1 are the first affected lines in the two files (origin 0).
568         * DELETED is the number of lines deleted here from file 0.
569         * INSERTED is the number of lines inserted here in file 1.
570         *
571         * If DELETED is 0 then LINE0 is the number of the line before
572         * which the insertion was done; vice versa for INSERTED and LINE1.
573         * @param line0 first affected lines in the two files (origin 0)
574         * @param line1 first affected lines in the two files (origin 0)
575         * @param deleted the number of lines deleted here from file 0
576         * @param inserted the number of lines inserted here in file 1
577         * @param old edit script
578         */
579        public Change(int line0, int line1, int deleted, int inserted, Change old) {
580            this.line0 = line0;
581            this.line1 = line1;
582            this.inserted = inserted;
583            this.deleted = deleted;
584            this.link = old;
585        }
586
587        /**
588         * Returns the number of insertions and deletions of this change as well as
589         * (recursively) the changes linked via {@link #link}.
590         * @return recursive number of insertions and deletions
591         */
592        public int getTotalNumberOfChanges() {
593            return inserted + deleted + (link != null ? link.getTotalNumberOfChanges() : 0);
594        }
595
596        @Override
597        public String toString() {
598            String s = String.format("%d -%d +%d %d", line0, deleted, inserted, line1);
599            return (link != null) ? s = s + '\n' + link : s;
600        }
601    }
602
603    /**
604     * Data on one input file being compared.
605     */
606    class FileData {
607
608        /** Allocate changed array for the results of comparison.  */
609        void clear() {
610            // Allocate a flag for each line of each file, saying whether that line is an insertion or deletion.
611            // Allocate an extra element, always zero, at each end of each vector.
612            changedFlag = new boolean[bufferedLines + 2];
613        }
614
615        /**
616         * Return equiv_count[I] as the number of lines in this file that fall in equivalence class I.
617         * @return the array of equivalence class counts.
618         */
619        int[] equivCount() {
620            int[] equiv_count = new int[equivMax];
621            for (int i = 0; i < bufferedLines; ++i) {
622                ++equiv_count[equivs[i]];
623            }
624            return equiv_count;
625        }
626
627        /**
628         * Discard lines that have no matches in another file.
629         *
630         * A line which is discarded will not be considered by the actual comparison algorithm;
631         * it will be as if that line were not in the file.
632         * The file's `realindexes' table maps virtual line numbers
633         * (which don't count the discarded lines) into real line numbers;
634         * this is how the actual comparison algorithm produces results
635         * that are comprehensible when the discarded lines are counted.
636         * <p>
637         * When we discard a line, we also mark it as a deletion or insertion so that it will be printed in the output.
638         * @param f the other file
639         */
640        void discard_confusing_lines(FileData f) {
641            clear();
642            // Set up table of which lines are going to be discarded.
643            final byte[] discarded = discardable(f.equivCount());
644
645            // Don't really discard the provisional lines except when they occur in a run of discardables,
646            // with nonprovisionals at the beginning and end.
647            filterDiscards(discarded);
648
649            // Actually discard the lines.
650            discard(discarded);
651        }
652
653        /**
654         * Mark to be discarded each line that matches no line of another file.
655         * If a line matches many lines, mark it as provisionally discardable.
656         * @param counts The count of each equivalence number for the other file.
657         * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable for each line
658         * @see #equivCount()
659         */
660        private byte[] discardable(final int[] counts) {
661            final int end = bufferedLines;
662            final byte[] discards = new byte[end];
663            final int[] equivs = this.equivs;
664            int many = 5;
665            int tem = end / 64;
666
667            /* Multiply MANY by approximate square root of number of lines.
668               That is the threshold for provisionally discardable lines.  */
669            while ((tem = tem >> 2) > 0) {
670                many *= 2;
671            }
672
673            for (int i = 0; i < end; i++) {
674                int nmatch;
675                if (equivs[i] == 0) {
676                    continue;
677                }
678                nmatch = counts[equivs[i]];
679                if (nmatch == 0) {
680                    discards[i] = 1;
681                } else if (nmatch > many) {
682                    discards[i] = 2;
683                }
684            }
685            return discards;
686        }
687
688        /**
689         * Don't really discard the provisional lines except when they occur
690         * in a run of discardables, with nonprovisionals at the beginning and end.
691         * @param discards discards
692         */
693        private void filterDiscards(final byte[] discards) {
694            final int end = bufferedLines;
695
696            for (int i = 0; i < end; i++) {
697                /* Cancel provisional discards not in middle of run of discards.  */
698                if (discards[i] == 2) {
699                    discards[i] = 0;
700                } else if (discards[i] != 0) {
701                    /* We have found a nonprovisional discard.  */
702                    int j;
703                    int length;
704                    int provisional = 0;
705
706                    /* Find end of this run of discardable lines.
707                       Count how many are provisionally discardable.  */
708                    for (j = i; j < end; j++) {
709                        if (discards[j] == 0) {
710                            break;
711                        }
712                        if (discards[j] == 2) {
713                            ++provisional;
714                        }
715                    }
716
717                    /* Cancel provisional discards at end, and shrink the run.  */
718                    while (j > i && discards[j - 1] == 2) {
719                        discards[--j] = 0; --provisional;
720                    }
721
722                    /* Now we have the length of a run of discardable lines
723                       whose first and last are not provisional.  */
724                    length = j - i;
725
726                    /* If 1/4 of the lines in the run are provisional,
727                       cancel discarding of all provisional lines in the run.  */
728                    if (provisional * 4 > length) {
729                        while (j > i)
730                            if (discards[--j] == 2) {
731                                discards[j] = 0;
732                            }
733                    } else {
734                        int consec;
735                        int minimum = 1;
736                        int tem = length / 4;
737
738                        /* MINIMUM is approximate square root of LENGTH/4.
739                           A subrun of two or more provisionals can stand
740                           when LENGTH is at least 16.
741                           A subrun of 4 or more can stand when LENGTH >= 64.  */
742                        while ((tem = tem >> 2) > 0) {
743                            minimum *= 2;
744                        }
745                        minimum++;
746
747                        /* Cancel any subrun of MINIMUM or more provisionals
748                           within the larger run.  */
749                        for (j = 0, consec = 0; j < length; j++) {
750                            if (discards[i + j] != 2) {
751                                consec = 0;
752                            } else if (minimum == ++consec) {
753                                /* Back up to start of subrun, to cancel it all.  */
754                                j -= consec;
755                            } else if (minimum < consec) {
756                                discards[i + j] = 0;
757                            }
758                        }
759
760                        /* Scan from beginning of run
761                           until we find 3 or more nonprovisionals in a row
762                           or until the first nonprovisional at least 8 lines in.
763                           Until that point, cancel any provisionals.  */
764                        for (j = 0, consec = 0; j < length; j++) {
765                            if (j >= 8 && discards[i + j] == 1) {
766                                break;
767                            }
768                            if (discards[i + j] == 2) {
769                                consec = 0; discards[i + j] = 0;
770                            } else if (discards[i + j] == 0) {
771                                consec = 0;
772                            } else {
773                                consec++;
774                            }
775                            if (consec == 3) {
776                                break;
777                            }
778                        }
779
780                        /* I advances to the last line of the run.  */
781                        i += length - 1;
782
783                        /* Same thing, from end.  */
784                        for (j = 0, consec = 0; j < length; j++) {
785                            if (j >= 8 && discards[i - j] == 1) {
786                                break;
787                            }
788                            if (discards[i - j] == 2) {
789                                consec = 0; discards[i - j] = 0;
790                            } else if (discards[i - j] == 0) {
791                                consec = 0;
792                            } else {
793                                consec++;
794                            }
795                            if (consec == 3) {
796                                break;
797                            }
798                        }
799                    }
800                }
801            }
802        }
803
804        /** Actually discard the lines.
805            @param discards flags lines to be discarded
806         */
807        private void discard(final byte[] discards) {
808            final int end = bufferedLines;
809            int j = 0;
810            for (int i = 0; i < end; ++i) {
811                if (noDiscards || discards[i] == 0) {
812                    undiscarded[j] = equivs[i];
813                    realindexes[j++] = i;
814                } else {
815                    changedFlag[1+i] = true;
816                }
817            }
818            nondiscardedLines = j;
819        }
820
821        FileData(int length) {
822            bufferedLines = length;
823            equivs = new int[length];
824            undiscarded = new int[bufferedLines];
825            realindexes = new int[bufferedLines];
826        }
827
828        FileData(Object[] data, Map<Object, Integer> h) {
829            this(data.length);
830            // FIXME: diff 2.7 removes common prefix and common suffix
831            for (int i = 0; i < data.length; ++i) {
832                Integer ir = h.get(data[i]);
833                if (ir == null) {
834                    h.put(data[i], equivs[i] = equivMax++);
835                } else {
836                    equivs[i] = ir.intValue();
837                }
838            }
839        }
840
841        /** Adjust inserts/deletes of blank lines to join changes
842       as much as possible.
843
844       We do something when a run of changed lines include a blank
845       line at one end and have an excluded blank line at the other.
846       We are free to choose which blank line is included.
847       `compareseq' always chooses the one at the beginning,
848       but usually it is cleaner to consider the following blank line
849       to be the "change".  The only exception is if the preceding blank line
850       would join this change to other changes.
851      @param f the file being compared against
852         */
853        void shift_boundaries(FileData f) {
854            final boolean[] changed = changedFlag;
855            final boolean[] other_changed = f.changedFlag;
856            int i = 0;
857            int j = 0;
858            int i_end = bufferedLines;
859            int preceding = -1;
860            int other_preceding = -1;
861
862            for (;;) {
863                int start, end, other_start;
864
865                /* Scan forwards to find beginning of another run of changes.
866                   Also keep track of the corresponding point in the other file.  */
867
868                while (i < i_end && !changed[1+i]) {
869                    while (other_changed[1+j++]) {
870                        /* Non-corresponding lines in the other file
871                           will count as the preceding batch of changes.  */
872                        other_preceding = j;
873                    }
874                    i++;
875                }
876
877                if (i == i_end) {
878                    break;
879                }
880
881                start = i;
882                other_start = j;
883
884                for (;;) {
885                    /* Now find the end of this run of changes.  */
886
887                    while (i < i_end && changed[1+i]) {
888                        i++;
889                    }
890                    end = i;
891
892                    /* If the first changed line matches the following unchanged one,
893                       and this run does not follow right after a previous run,
894                       and there are no lines deleted from the other file here,
895                       then classify the first changed line as unchanged
896                       and the following line as changed in its place.  */
897
898                    /* You might ask, how could this run follow right after another?
899                       Only because the previous run was shifted here.  */
900
901                    if (end != i_end && equivs[start] == equivs[end] && !other_changed[1+j]
902                         && !((preceding >= 0 && start == preceding) || (other_preceding >= 0 && other_start == other_preceding))) {
903                        changed[1+end++] = true;
904                        changed[1+start++] = false;
905                        ++i;
906                        /* Since one line-that-matches is now before this run
907                           instead of after, we must advance in the other file
908                           to keep in synch.  */
909                        ++j;
910                    } else {
911                        break;
912                    }
913                }
914
915                preceding = i;
916                other_preceding = j;
917            }
918        }
919
920        /** Number of elements (lines) in this file. */
921        private final int bufferedLines;
922
923        /** Vector, indexed by line number, containing an equivalence code for
924           each line.  It is this vector that is actually compared with that
925           of another file to generate differences. */
926        private final int[]     equivs;
927
928        /** Vector, like the previous one except that
929           the elements for discarded lines have been squeezed out.  */
930        private final int[]    undiscarded;
931
932        /** Vector mapping virtual line numbers (not counting discarded lines)
933           to real ones (counting those lines).  Both are origin-0.  */
934        private final int[]    realindexes;
935
936        /** Total number of nondiscarded lines. */
937        private int         nondiscardedLines;
938
939        /** Array, indexed by real origin-1 line number,
940           containing true for a line that is an insertion or a deletion.
941           The results of comparison are stored here.  */
942        private boolean[]       changedFlag;
943    }
944}