001// License: GPL. 002package org.openstreetmap.josm.tools; 003 004//// Taken from http://www.bmsi.com/java/#diff 005 006// http://www.bmsi.com/java/DiffPrint.java could also be useful 007 008/* 009 * $Log: Diff.java,v $ 010 * Revision 1.15 2013/04/01 16:27:31 stuart 011 * Fix DiffPrint unified format with test cases. 012 * Begin porting some diff-2.7 features. 013 * 014 * Revision 1.14 2010/03/03 21:21:25 stuart 015 * Test new direct equivalence API 016 * 017 * Revision 1.13 2009/12/07 17:43:17 stuart 018 * Compute equiv_max for int[] ctor 019 * 020 * Revision 1.12 2009/12/07 17:34:46 stuart 021 * Ctor with int[]. 022 * 023 * Revision 1.11 2009/11/15 01:11:54 stuart 024 * Diff doesn't need to be generic 025 * 026 * Revision 1.10 2009/11/15 00:54:03 stuart 027 * Update to Java 5 containers 028 * 029 * Revision 1.7 2009/01/19 03:05:26 stuart 030 * Fix StackOverflow bug with heuristic on reported by Jimmy Han. 031 * 032 * Revision 1.6 2003/03/06 22:51:32 stuart 033 * Convert to CVS 034 * 035 * Revision 1.5 2002/07/19 19:14:40 stuart 036 * fix reverseScript, make change ctor public, update docs 037 * 038 * Revision 1.4 2002/04/09 17:53:39 stuart 039 * More flexible interface for diff() function. 040 * 041 * Revision 1.3 2000/03/03 21:58:03 stuart 042 * move discard_confusing_lines and shift_boundaries to class file_data 043 * 044 * Revision 1.2 2000/03/02 16:37:38 stuart 045 * Add GPL and copyright 046 * 047 */ 048 049import java.util.HashMap; 050import java.util.Map; 051 052/** A class to compare vectors of objects. The result of comparison 053 is a list of <code>change</code> objects which form an 054 edit script. The objects compared are traditionally lines 055 of text from two files. Comparison options such as "ignore 056 whitespace" are implemented by modifying the <code>equals</code> 057 and <code>hashcode</code> methods for the objects compared. 058<p> 059 The basic algorithm is described in: </br> 060 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 061 Algorithmica Vol. 1 No. 2, 1986, p 251. 062<p> 063 This class outputs different results from GNU diff 1.15 on some 064 inputs. Our results are actually better (smaller change list, smaller 065 total size of changes), but it would be nice to know why. Perhaps 066 there is a memory overwrite bug in GNU diff 1.15. 067 068 @author Stuart D. Gathman, translated from GNU diff 1.15 069 Copyright (C) 2000 Business Management Systems, Inc. 070<p> 071 This program is free software; you can redistribute it and/or modify 072 it under the terms of the GNU General Public License as published by 073 the Free Software Foundation; either version 1, or (at your option) 074 any later version. 075<p> 076 This program is distributed in the hope that it will be useful, 077 but WITHOUT ANY WARRANTY; without even the implied warranty of 078 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 079 GNU General Public License for more details. 080<p> 081 You should have received a copy of the <a href=COPYING.txt> 082 GNU General Public License</a> 083 along with this program; if not, write to the Free Software 084 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 085 086 */ 087 088public class Diff { 089 090 /** Prepare to find differences between two arrays. Each element of 091 the arrays is translated to an "equivalence number" based on 092 the result of <code>equals</code>. The original Object arrays 093 are no longer needed for computing the differences. They will 094 be needed again later to print the results of the comparison as 095 an edit script, if desired. 096 */ 097 public Diff(Object[] a, Object[] b) { 098 Map<Object,Integer> h = new HashMap<Object,Integer>(a.length + b.length); 099 filevec = new FileData[] { new FileData(a,h),new FileData(b,h) }; 100 } 101 102 /** 1 more than the maximum equivalence value used for this or its 103 sibling file. */ 104 private int equiv_max = 1; 105 106 /** When set to true, the comparison uses a heuristic to speed it up. 107 With this heuristic, for files with a constant small density 108 of changes, the algorithm is linear in the file size. */ 109 public boolean heuristic = false; 110 111 /** When set to true, the algorithm returns a guarranteed minimal 112 set of changes. This makes things slower, sometimes much slower. */ 113 public boolean no_discards = false; 114 115 private int[] xvec, yvec; /* Vectors being compared. */ 116 private int[] fdiag; /* Vector, indexed by diagonal, containing 117 the X coordinate of the point furthest 118 along the given diagonal in the forward 119 search of the edit matrix. */ 120 private int[] bdiag; /* Vector, indexed by diagonal, containing 121 the X coordinate of the point furthest 122 along the given diagonal in the backward 123 search of the edit matrix. */ 124 private int fdiagoff, bdiagoff; 125 private final FileData[] filevec; 126 private int cost; 127 /** Snakes bigger than this are considered "big". */ 128 private static final int SNAKE_LIMIT = 20; 129 130 /** Find the midpoint of the shortest edit script for a specified 131 portion of the two files. 132 133 We scan from the beginnings of the files, and simultaneously from the ends, 134 doing a breadth-first search through the space of edit-sequence. 135 When the two searches meet, we have found the midpoint of the shortest 136 edit sequence. 137 138 The value returned is the number of the diagonal on which the midpoint lies. 139 The diagonal number equals the number of inserted lines minus the number 140 of deleted lines (counting only lines before the midpoint). 141 The edit cost is stored into COST; this is the total number of 142 lines inserted or deleted (counting only lines before the midpoint). 143 144 This function assumes that the first lines of the specified portions 145 of the two files do not match, and likewise that the last lines do not 146 match. The caller must trim matching lines from the beginning and end 147 of the portions it is going to specify. 148 149 Note that if we return the "wrong" diagonal value, or if 150 the value of bdiag at that diagonal is "wrong", 151 the worst this can do is cause suboptimal diff output. 152 It cannot cause incorrect diff output. */ 153 154 private int diag (int xoff, int xlim, int yoff, int ylim) { 155 final int[] fd = fdiag; // Give the compiler a chance. 156 final int[] bd = bdiag; // Additional help for the compiler. 157 final int[] xv = xvec; // Still more help for the compiler. 158 final int[] yv = yvec; // And more and more . . . 159 final int dmin = xoff - ylim; // Minimum valid diagonal. 160 final int dmax = xlim - yoff; // Maximum valid diagonal. 161 final int fmid = xoff - yoff; // Center diagonal of top-down search. 162 final int bmid = xlim - ylim; // Center diagonal of bottom-up search. 163 int fmin = fmid, fmax = fmid; // Limits of top-down search. 164 int bmin = bmid, bmax = bmid; // Limits of bottom-up search. 165 /* True if southeast corner is on an odd 166 diagonal with respect to the northwest. */ 167 final boolean odd = (fmid - bmid & 1) != 0; 168 169 fd[fdiagoff + fmid] = xoff; 170 bd[bdiagoff + bmid] = xlim; 171 172 for (int c = 1;; ++c) 173 { 174 int d; /* Active diagonal. */ 175 boolean big_snake = false; 176 177 /* Extend the top-down search by an edit step in each diagonal. */ 178 if (fmin > dmin) { 179 fd[fdiagoff + --fmin - 1] = -1; 180 } else { 181 ++fmin; 182 } 183 if (fmax < dmax) { 184 fd[fdiagoff + ++fmax + 1] = -1; 185 } else { 186 --fmax; 187 } 188 for (d = fmax; d >= fmin; d -= 2) 189 { 190 int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1]; 191 192 if (tlo >= thi) { 193 x = tlo + 1; 194 } else { 195 x = thi; 196 } 197 oldx = x; 198 y = x - d; 199 while (x < xlim && y < ylim && xv[x] == yv[y]) { 200 ++x; ++y; 201 } 202 if (x - oldx > SNAKE_LIMIT) { 203 big_snake = true; 204 } 205 fd[fdiagoff + d] = x; 206 if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) 207 { 208 cost = 2 * c - 1; 209 return d; 210 } 211 } 212 213 /* Similar extend the bottom-up search. */ 214 if (bmin > dmin) { 215 bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE; 216 } else { 217 ++bmin; 218 } 219 if (bmax < dmax) { 220 bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE; 221 } else { 222 --bmax; 223 } 224 for (d = bmax; d >= bmin; d -= 2) 225 { 226 int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1]; 227 228 if (tlo < thi) { 229 x = tlo; 230 } else { 231 x = thi - 1; 232 } 233 oldx = x; 234 y = x - d; 235 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) { 236 --x; --y; 237 } 238 if (oldx - x > SNAKE_LIMIT) { 239 big_snake = true; 240 } 241 bd[bdiagoff + d] = x; 242 if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) 243 { 244 cost = 2 * c; 245 return d; 246 } 247 } 248 249 /* Heuristic: check occasionally for a diagonal that has made 250 lots of progress compared with the edit distance. 251 If we have any such, find the one that has made the most 252 progress and return it as if it had succeeded. 253 254 With this heuristic, for files with a constant small density 255 of changes, the algorithm is linear in the file size. */ 256 257 if (c > 200 && big_snake && heuristic) 258 { 259 int best = 0; 260 int bestpos = -1; 261 262 for (d = fmax; d >= fmin; d -= 2) 263 { 264 int dd = d - fmid; 265 int x = fd[fdiagoff + d]; 266 int y = x - d; 267 int v = (x - xoff) * 2 - dd; 268 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 269 { 270 if (v > best 271 && xoff + SNAKE_LIMIT <= x && x < xlim 272 && yoff + SNAKE_LIMIT <= y && y < ylim) 273 { 274 /* We have a good enough best diagonal. 275 now insist that it end with a significant snake. */ 276 int k; 277 278 for (k = 1; xvec[x - k] == yvec[y - k]; k++) 279 if (k == SNAKE_LIMIT) 280 { 281 best = v; 282 bestpos = d; 283 break; 284 } 285 } 286 } 287 } 288 if (best > 0) 289 { 290 cost = 2 * c - 1; 291 return bestpos; 292 } 293 294 best = 0; 295 for (d = bmax; d >= bmin; d -= 2) 296 { 297 int dd = d - bmid; 298 int x = bd[bdiagoff + d]; 299 int y = x - d; 300 int v = (xlim - x) * 2 + dd; 301 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 302 { 303 if (v > best 304 && xoff < x && x <= xlim - SNAKE_LIMIT 305 && yoff < y && y <= ylim - SNAKE_LIMIT) 306 { 307 /* We have a good enough best diagonal. 308 now insist that it end with a significant snake. */ 309 int k; 310 311 for (k = 0; xvec[x + k] == yvec[y + k]; k++) 312 if (k == SNAKE_LIMIT) 313 { 314 best = v; 315 bestpos = d; 316 break; 317 } 318 } 319 } 320 } 321 if (best > 0) 322 { 323 cost = 2 * c - 1; 324 return bestpos; 325 } 326 } 327 } 328 } 329 330 /** Compare in detail contiguous subsequences of the two files 331 which are known, as a whole, to match each other. 332 333 The results are recorded in the vectors filevec[N].changed_flag, by 334 storing a 1 in the element for each line that is an insertion or deletion. 335 336 The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. 337 338 Note that XLIM, YLIM are exclusive bounds. 339 All line numbers are origin-0 and discarded lines are not counted. */ 340 341 private void compareseq (int xoff, int xlim, int yoff, int ylim) { 342 /* Slide down the bottom initial diagonal. */ 343 while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) { 344 ++xoff; ++yoff; 345 } 346 /* Slide up the top initial diagonal. */ 347 while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) { 348 --xlim; --ylim; 349 } 350 351 /* Handle simple cases. */ 352 if (xoff == xlim) { 353 while (yoff < ylim) { 354 filevec[1].changed_flag[1+filevec[1].realindexes[yoff++]] = true; 355 } 356 } else if (yoff == ylim) { 357 while (xoff < xlim) { 358 filevec[0].changed_flag[1+filevec[0].realindexes[xoff++]] = true; 359 } 360 } else { 361 /* Find a point of correspondence in the middle of the files. */ 362 363 int d = diag (xoff, xlim, yoff, ylim); 364 int c = cost; 365 int b = bdiag[bdiagoff + d]; 366 367 if (c == 1) 368 /* This should be impossible, because it implies that 369 one of the two subsequences is empty, 370 and that case was handled above without calling `diag'. 371 Let's verify that this is true. */ 372 throw new IllegalArgumentException("Empty subsequence"); 373 else 374 { 375 /* Use that point to split this problem into two subproblems. */ 376 compareseq (xoff, b, yoff, b - d); 377 /* This used to use f instead of b, 378 but that is incorrect! 379 It is not necessarily the case that diagonal d 380 has a snake from b to f. */ 381 compareseq (b, xlim, b - d, ylim); 382 } 383 } 384 } 385 386 /** Discard lines from one file that have no matches in the other file. 387 */ 388 private void discard_confusing_lines() { 389 filevec[0].discard_confusing_lines(filevec[1]); 390 filevec[1].discard_confusing_lines(filevec[0]); 391 } 392 393 private boolean inhibit = false; 394 395 /** Adjust inserts/deletes of blank lines to join changes 396 as much as possible. 397 */ 398 private void shift_boundaries() { 399 if (inhibit) 400 return; 401 filevec[0].shift_boundaries(filevec[1]); 402 filevec[1].shift_boundaries(filevec[0]); 403 } 404 405 public interface ScriptBuilder { 406 /** Scan the tables of which lines are inserted and deleted, 407 producing an edit script. 408 @param changed0 true for lines in first file which do not match 2nd 409 @param len0 number of lines in first file 410 @param changed1 true for lines in 2nd file which do not match 1st 411 @param len1 number of lines in 2nd file 412 @return a linked list of changes - or null 413 */ 414 public Change build_script( 415 boolean[] changed0,int len0, 416 boolean[] changed1,int len1 417 ); 418 } 419 420 /** Scan the tables of which lines are inserted and deleted, 421 producing an edit script in reverse order. */ 422 423 static class ReverseScript implements ScriptBuilder { 424 @Override 425 public Change build_script( 426 final boolean[] changed0,int len0, 427 final boolean[] changed1,int len1) 428 { 429 Change script = null; 430 int i0 = 0, i1 = 0; 431 while (i0 < len0 || i1 < len1) { 432 if (changed0[1+i0] || changed1[1+i1]) { 433 int line0 = i0, line1 = i1; 434 435 /* Find # lines changed here in each file. */ 436 while (changed0[1+i0]) { 437 ++i0; 438 } 439 while (changed1[1+i1]) { 440 ++i1; 441 } 442 443 /* Record this change. */ 444 script = new Change(line0, line1, i0 - line0, i1 - line1, script); 445 } 446 447 /* We have reached lines in the two files that match each other. */ 448 i0++; i1++; 449 } 450 451 return script; 452 } 453 } 454 455 static class ForwardScript implements ScriptBuilder { 456 /** Scan the tables of which lines are inserted and deleted, 457 producing an edit script in forward order. */ 458 @Override 459 public Change build_script( 460 final boolean[] changed0,int len0, 461 final boolean[] changed1,int len1) 462 { 463 Change script = null; 464 int i0 = len0, i1 = len1; 465 466 while (i0 >= 0 || i1 >= 0) 467 { 468 if (changed0[i0] || changed1[i1]) 469 { 470 int line0 = i0, line1 = i1; 471 472 /* Find # lines changed here in each file. */ 473 while (changed0[i0]) { 474 --i0; 475 } 476 while (changed1[i1]) { 477 --i1; 478 } 479 480 /* Record this change. */ 481 script = new Change(i0, i1, line0 - i0, line1 - i1, script); 482 } 483 484 /* We have reached lines in the two files that match each other. */ 485 i0--; i1--; 486 } 487 488 return script; 489 } 490 } 491 492 /** Standard ScriptBuilders. */ 493 public final static ScriptBuilder 494 forwardScript = new ForwardScript(), 495 reverseScript = new ReverseScript(); 496 497 /** Report the differences of two files. DEPTH is the current directory 498 depth. */ 499 public final Change diff_2(final boolean reverse) { 500 return diff(reverse ? reverseScript : forwardScript); 501 } 502 503 /** Get the results of comparison as an edit script. The script 504 is described by a list of changes. The standard ScriptBuilder 505 implementations provide for forward and reverse edit scripts. 506 Alternate implementations could, for instance, list common elements 507 instead of differences. 508 @param bld an object to build the script from change flags 509 @return the head of a list of changes 510 */ 511 public Change diff(final ScriptBuilder bld) { 512 513 /* Some lines are obviously insertions or deletions 514 because they don't match anything. Detect them now, 515 and avoid even thinking about them in the main comparison algorithm. */ 516 517 discard_confusing_lines (); 518 519 /* Now do the main comparison algorithm, considering just the 520 undiscarded lines. */ 521 522 xvec = filevec[0].undiscarded; 523 yvec = filevec[1].undiscarded; 524 525 int diags = 526 filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3; 527 fdiag = new int[diags]; 528 fdiagoff = filevec[1].nondiscarded_lines + 1; 529 bdiag = new int[diags]; 530 bdiagoff = filevec[1].nondiscarded_lines + 1; 531 532 compareseq (0, filevec[0].nondiscarded_lines, 533 0, filevec[1].nondiscarded_lines); 534 fdiag = null; 535 bdiag = null; 536 537 /* Modify the results slightly to make them prettier 538 in cases where that can validly be done. */ 539 540 shift_boundaries (); 541 542 /* Get the results of comparison in the form of a chain 543 of `struct change's -- an edit script. */ 544 return bld.build_script( 545 filevec[0].changed_flag, 546 filevec[0].buffered_lines, 547 filevec[1].changed_flag, 548 filevec[1].buffered_lines 549 ); 550 551 } 552 553 /** The result of comparison is an "edit script": a chain of change objects. 554 Each change represents one place where some lines are deleted 555 and some are inserted. 556 557 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 558 DELETED is the number of lines deleted here from file 0. 559 INSERTED is the number of lines inserted here in file 1. 560 561 If DELETED is 0 then LINE0 is the number of the line before 562 which the insertion was done; vice versa for INSERTED and LINE1. */ 563 564 public static class Change { 565 /** Previous or next edit command. */ 566 public Change link; 567 /** # lines of file 1 changed here. */ 568 public final int inserted; 569 /** # lines of file 0 changed here. */ 570 public final int deleted; 571 /** Line number of 1st deleted line. */ 572 public final int line0; 573 /** Line number of 1st inserted line. */ 574 public final int line1; 575 576 /** Cons an additional entry onto the front of an edit script OLD. 577 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 578 DELETED is the number of lines deleted here from file 0. 579 INSERTED is the number of lines inserted here in file 1. 580 581 If DELETED is 0 then LINE0 is the number of the line before 582 which the insertion was done; vice versa for INSERTED and LINE1. */ 583 public Change(int line0, int line1, int deleted, int inserted, Change old) { 584 this.line0 = line0; 585 this.line1 = line1; 586 this.inserted = inserted; 587 this.deleted = deleted; 588 this.link = old; 589 } 590 591 public String toString() { 592 String s = String.format("%d -%d +%d %d",line0,deleted,inserted,line1); 593 return (link != null) ? s = s + '\n' + link : s; 594 } 595 } 596 597 /** Data on one input file being compared. 598 */ 599 600 class FileData { 601 602 /** Allocate changed array for the results of comparison. */ 603 void clear() { 604 /* Allocate a flag for each line of each file, saying whether that line 605 is an insertion or deletion. 606 Allocate an extra element, always zero, at each end of each vector. 607 */ 608 changed_flag = new boolean[buffered_lines + 2]; 609 } 610 611 /** Return equiv_count[I] as the number of lines in this file 612 that fall in equivalence class I. 613 @return the array of equivalence class counts. 614 */ 615 int[] equivCount() { 616 int[] equiv_count = new int[equiv_max]; 617 for (int i = 0; i < buffered_lines; ++i) { 618 ++equiv_count[equivs[i]]; 619 } 620 return equiv_count; 621 } 622 623 /** Discard lines that have no matches in another file. 624 625 A line which is discarded will not be considered by the actual 626 comparison algorithm; it will be as if that line were not in the file. 627 The file's `realindexes' table maps virtual line numbers 628 (which don't count the discarded lines) into real line numbers; 629 this is how the actual comparison algorithm produces results 630 that are comprehensible when the discarded lines are counted. 631<p> 632 When we discard a line, we also mark it as a deletion or insertion 633 so that it will be printed in the output. 634 @param f the other file 635 */ 636 void discard_confusing_lines(FileData f) { 637 clear(); 638 /* Set up table of which lines are going to be discarded. */ 639 final byte[] discarded = discardable(f.equivCount()); 640 641 /* Don't really discard the provisional lines except when they occur 642 in a run of discardables, with nonprovisionals at the beginning 643 and end. */ 644 filterDiscards(discarded); 645 646 /* Actually discard the lines. */ 647 discard(discarded); 648 } 649 650 /** 651 * Mark to be discarded each line that matches no line of another file. 652 * If a line matches many lines, mark it as provisionally discardable. 653 * @see #equivCount() 654 * @param counts The count of each equivalence number for the other file. 655 * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable 656 * for each line 657 */ 658 private byte[] discardable(final int[] counts) { 659 final int end = buffered_lines; 660 final byte[] discards = new byte[end]; 661 final int[] equivs = this.equivs; 662 int many = 5; 663 int tem = end / 64; 664 665 /* Multiply MANY by approximate square root of number of lines. 666 That is the threshold for provisionally discardable lines. */ 667 while ((tem = tem >> 2) > 0) { 668 many *= 2; 669 } 670 671 for (int i = 0; i < end; i++) 672 { 673 int nmatch; 674 if (equivs[i] == 0) { 675 continue; 676 } 677 nmatch = counts[equivs[i]]; 678 if (nmatch == 0) { 679 discards[i] = 1; 680 } else if (nmatch > many) { 681 discards[i] = 2; 682 } 683 } 684 return discards; 685 } 686 687 /** 688 * Don't really discard the provisional lines except when they occur 689 * in a run of discardables, with nonprovisionals at the beginning 690 * and end. 691 */ 692 private void filterDiscards(final byte[] discards) { 693 final int end = buffered_lines; 694 695 for (int i = 0; i < end; i++) 696 { 697 /* Cancel provisional discards not in middle of run of discards. */ 698 if (discards[i] == 2) { 699 discards[i] = 0; 700 } else if (discards[i] != 0) 701 { 702 /* We have found a nonprovisional discard. */ 703 int j; 704 int length; 705 int provisional = 0; 706 707 /* Find end of this run of discardable lines. 708 Count how many are provisionally discardable. */ 709 for (j = i; j < end; j++) 710 { 711 if (discards[j] == 0) { 712 break; 713 } 714 if (discards[j] == 2) { 715 ++provisional; 716 } 717 } 718 719 /* Cancel provisional discards at end, and shrink the run. */ 720 while (j > i && discards[j - 1] == 2) { 721 discards[--j] = 0; --provisional; 722 } 723 724 /* Now we have the length of a run of discardable lines 725 whose first and last are not provisional. */ 726 length = j - i; 727 728 /* If 1/4 of the lines in the run are provisional, 729 cancel discarding of all provisional lines in the run. */ 730 if (provisional * 4 > length) { 731 while (j > i) 732 if (discards[--j] == 2) { 733 discards[j] = 0; 734 } 735 } else { 736 int consec; 737 int minimum = 1; 738 int tem = length / 4; 739 740 /* MINIMUM is approximate square root of LENGTH/4. 741 A subrun of two or more provisionals can stand 742 when LENGTH is at least 16. 743 A subrun of 4 or more can stand when LENGTH >= 64. */ 744 while ((tem = tem >> 2) > 0) { 745 minimum *= 2; 746 } 747 minimum++; 748 749 /* Cancel any subrun of MINIMUM or more provisionals 750 within the larger run. */ 751 for (j = 0, consec = 0; j < length; j++) 752 if (discards[i + j] != 2) { 753 consec = 0; 754 } else if (minimum == ++consec) { 755 /* Back up to start of subrun, to cancel it all. */ 756 j -= consec; 757 } else if (minimum < consec) { 758 discards[i + j] = 0; 759 } 760 761 /* Scan from beginning of run 762 until we find 3 or more nonprovisionals in a row 763 or until the first nonprovisional at least 8 lines in. 764 Until that point, cancel any provisionals. */ 765 for (j = 0, consec = 0; j < length; j++) 766 { 767 if (j >= 8 && discards[i + j] == 1) { 768 break; 769 } 770 if (discards[i + j] == 2) { 771 consec = 0; discards[i + j] = 0; 772 } 773 else if (discards[i + j] == 0) { 774 consec = 0; 775 } else { 776 consec++; 777 } 778 if (consec == 3) { 779 break; 780 } 781 } 782 783 /* I advances to the last line of the run. */ 784 i += length - 1; 785 786 /* Same thing, from end. */ 787 for (j = 0, consec = 0; j < length; j++) 788 { 789 if (j >= 8 && discards[i - j] == 1) { 790 break; 791 } 792 if (discards[i - j] == 2) { 793 consec = 0; discards[i - j] = 0; 794 } 795 else if (discards[i - j] == 0) { 796 consec = 0; 797 } else { 798 consec++; 799 } 800 if (consec == 3) { 801 break; 802 } 803 } 804 } 805 } 806 } 807 } 808 809 /** Actually discard the lines. 810 @param discards flags lines to be discarded 811 */ 812 private void discard(final byte[] discards) { 813 final int end = buffered_lines; 814 int j = 0; 815 for (int i = 0; i < end; ++i) 816 if (no_discards || discards[i] == 0) 817 { 818 undiscarded[j] = equivs[i]; 819 realindexes[j++] = i; 820 } else { 821 changed_flag[1+i] = true; 822 } 823 nondiscarded_lines = j; 824 } 825 826 FileData(int length) { 827 buffered_lines = length; 828 equivs = new int[length]; 829 undiscarded = new int[buffered_lines]; 830 realindexes = new int[buffered_lines]; 831 } 832 833 FileData(Object[] data, Map<Object,Integer> h) { 834 this(data.length); 835 // FIXME: diff 2.7 removes common prefix and common suffix 836 for (int i = 0; i < data.length; ++i) { 837 Integer ir = h.get(data[i]); 838 if (ir == null) { 839 h.put(data[i],equivs[i] = equiv_max++); 840 } else { 841 equivs[i] = ir.intValue(); 842 } 843 } 844 } 845 846 /** Adjust inserts/deletes of blank lines to join changes 847 as much as possible. 848 849 We do something when a run of changed lines include a blank 850 line at one end and have an excluded blank line at the other. 851 We are free to choose which blank line is included. 852 `compareseq' always chooses the one at the beginning, 853 but usually it is cleaner to consider the following blank line 854 to be the "change". The only exception is if the preceding blank line 855 would join this change to other changes. 856 @param f the file being compared against 857 */ 858 859 void shift_boundaries(FileData f) { 860 final boolean[] changed = changed_flag; 861 final boolean[] other_changed = f.changed_flag; 862 int i = 0; 863 int j = 0; 864 int i_end = buffered_lines; 865 int preceding = -1; 866 int other_preceding = -1; 867 868 for (;;) 869 { 870 int start, end, other_start; 871 872 /* Scan forwards to find beginning of another run of changes. 873 Also keep track of the corresponding point in the other file. */ 874 875 while (i < i_end && !changed[1+i]) 876 { 877 while (other_changed[1+j++]) { 878 /* Non-corresponding lines in the other file 879 will count as the preceding batch of changes. */ 880 other_preceding = j; 881 } 882 i++; 883 } 884 885 if (i == i_end) { 886 break; 887 } 888 889 start = i; 890 other_start = j; 891 892 for (;;) 893 { 894 /* Now find the end of this run of changes. */ 895 896 while (i < i_end && changed[1+i]) { 897 i++; 898 } 899 end = i; 900 901 /* If the first changed line matches the following unchanged one, 902 and this run does not follow right after a previous run, 903 and there are no lines deleted from the other file here, 904 then classify the first changed line as unchanged 905 and the following line as changed in its place. */ 906 907 /* You might ask, how could this run follow right after another? 908 Only because the previous run was shifted here. */ 909 910 if (end != i_end 911 && equivs[start] == equivs[end] 912 && !other_changed[1+j] 913 && end != i_end 914 && !((preceding >= 0 && start == preceding) 915 || (other_preceding >= 0 916 && other_start == other_preceding))) 917 { 918 changed[1+end++] = true; 919 changed[1+start++] = false; 920 ++i; 921 /* Since one line-that-matches is now before this run 922 instead of after, we must advance in the other file 923 to keep in synch. */ 924 ++j; 925 } else { 926 break; 927 } 928 } 929 930 preceding = i; 931 other_preceding = j; 932 } 933 } 934 935 /** Number of elements (lines) in this file. */ 936 final int buffered_lines; 937 938 /** Vector, indexed by line number, containing an equivalence code for 939 each line. It is this vector that is actually compared with that 940 of another file to generate differences. */ 941 private final int[] equivs; 942 943 /** Vector, like the previous one except that 944 the elements for discarded lines have been squeezed out. */ 945 final int[] undiscarded; 946 947 /** Vector mapping virtual line numbers (not counting discarded lines) 948 to real ones (counting those lines). Both are origin-0. */ 949 final int[] realindexes; 950 951 /** Total number of nondiscarded lines. */ 952 int nondiscarded_lines; 953 954 /** Array, indexed by real origin-1 line number, 955 containing true for a line that is an insertion or a deletion. 956 The results of comparison are stored here. */ 957 boolean[] changed_flag; 958 } 959}