001    /*
002     * CDDL HEADER START
003     *
004     * The contents of this file are subject to the terms of the
005     * Common Development and Distribution License, Version 1.0 only
006     * (the "License").  You may not use this file except in compliance
007     * with the License.
008     *
009     * You can obtain a copy of the license at
010     * trunk/opends/resource/legal-notices/OpenDS.LICENSE
011     * or https://OpenDS.dev.java.net/OpenDS.LICENSE.
012     * See the License for the specific language governing permissions
013     * and limitations under the License.
014     *
015     * When distributing Covered Code, include this CDDL HEADER in each
016     * file and include the License file at
017     * trunk/opends/resource/legal-notices/OpenDS.LICENSE.  If applicable,
018     * add the following below this CDDL HEADER, with the fields enclosed
019     * by brackets "[]" replaced with your own identifying information:
020     *      Portions Copyright [yyyy] [name of copyright owner]
021     *
022     * CDDL HEADER END
023     *
024     *
025     *      Copyright 2008 Sun Microsystems, Inc.
026     */
027    package org.opends.server.util;
028    
029    
030    
031    import static org.opends.server.util.Validator.*;
032    
033    
034    
035    /**
036     * This class provides an implementation of the Levenshtein distance algorithm,
037     * which may be used to determine the minimum number of changes required to
038     * transform one string into another.  For the purpose of this algorithm, a
039     * change is defined as replacing one character with another, inserting a new
040     * character, or removing an existing character.
041     * <BR><BR>
042     * The basic algorithm works as follows for a source string S of length X and
043     * a target string T of length Y:
044     * <OL>
045     *   <LI>Create a matrix M with dimensions of X+1, Y+1.</LI>
046     *   <LI>Fill the first row with sequentially-increasing values 0 through
047     *       X.</LI>
048     *   <LI>Fill the first column with sequentially-increasing values 0 through
049     *       Y.</LI>
050     *   <LI>Create a nested loop iterating over the characters in the strings.  In
051     *       the outer loop, iterate through characters in S from 0 to X-1 using an
052     *       iteration counter I.  In the inner loop, iterate through the characters
053     *       in T from 0 to Y-1 using an iterator counter J.  Calculate the
054     *       following three things and place the smallest value in the matrix in
055     *       row I+1 column J+1:
056     *     <UL>
057     *       <LI>One more than the value in the matrix position immediately to the
058     *           left (i.e., 1 + M[I][J+1]).</LI>
059     *       <LI>One more than the value in the matrix position immediately above
060     *           (i.e., 1 + M[I+1][J]).</LI>
061     *       <LI>Define a value V to be zero if S[I] is the same as T[J], or one if
062     *           they are different.  Add that value to the value in the matrix
063     *           position immediately above and to the left (i.e.,
064     *           V + M[I][J]).</LI>
065     *     </UL>
066     *   </LI>
067     *   <LI>The Levenshtein distance value, which is the least number of changes
068     *       needed to transform the source string into the target string, will be
069     *       the value in the bottom right corner of the matrix (i.e.,
070     *       M[X][Y]).</LI>
071     * </OL>
072     * <BR><BR>
073     * Note that this is a completely "clean room" implementation, developed from a
074     * description of the algorithm, rather than copying an existing implementation.
075     * Doing it in this way eliminates copyright and licensing concerns associated
076     * with using an existing implementation.
077     */
078    @org.opends.server.types.PublicAPI(
079         stability=org.opends.server.types.StabilityLevel.UNCOMMITTED,
080         mayInstantiate=false,
081         mayExtend=false,
082         mayInvoke=true)
083    public final class LevenshteinDistance
084    {
085      /**
086       * Calculates the Levenshtein distance between the provided string values.
087       *
088       * @param  source  The source string to compare.  It must not be {@code null}.
089       * @param  target  The target string to compare.  It must not be {@code null}.
090       *
091       * @return  The minimum number of changes required to turn the source string
092       *          into the target string.
093       */
094      public static int calculate(String source, String target)
095      {
096        ensureNotNull(source, target);
097    
098        // sl == source length; tl == target length
099        int sl = source.length();
100        int tl = target.length();
101    
102    
103        // If either of the lengths is zero, then the distance is the length of the
104        // other string.
105        if (sl == 0)
106        {
107          return tl;
108        }
109        else if (tl == 0)
110        {
111          return sl;
112        }
113    
114    
115        // w == matrix width; h == matrix height
116        int w = sl+1;
117        int h = tl+1;
118    
119    
120        // m == matrix array
121        // Create the array and fill it with values 0..sl in the first dimension and
122        // 0..tl in the second dimension.
123        int[][] m = new int[w][h];
124        for (int i=0; i < w; i++)
125        {
126          m[i][0] = i;
127        }
128    
129        for (int i=1; i < h; i++)
130        {
131          m[0][i] = i;
132        }
133    
134        for (int i=0,x=1; i < sl; i++,x++)
135        {
136          char s = source.charAt(i);
137    
138          for (int j=0,y=1; j < tl; j++,y++)
139          {
140            char t = target.charAt(j);
141    
142    
143            // Figure out what to put in the appropriate matrix slot.  It should be
144            // the lowest of:
145            // - One more than the value to the left
146            // - One more than the value to the top
147            // - If the characters are equal, the value to the upper left, otherwise
148            //   one more than the value to the upper left.
149            m[x][y] = Math.min(Math.min((m[i][y] + 1), (m[x][j] + 1)),
150                               (m[i][j] + ((s == t) ? 0 : 1)));
151          }
152        }
153    
154        // The Levenshtein distance should now be the value in the lower right
155        // corner of the matrix.
156        return m[sl][tl];
157      }
158    }
159