001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2017 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.List;
023import java.util.Map;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
028import com.puppycrawl.tools.checkstyle.api.DetailAST;
029import com.puppycrawl.tools.checkstyle.api.TextBlock;
030import com.puppycrawl.tools.checkstyle.api.TokenTypes;
031
032/**
033 * <p>
034 * Restrict using <a href =
035 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
036 * Unicode escapes</a> (such as <code>&#92;u221e</code>).
037 * It is possible to allow using escapes for
038 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
039 * non-printable(control) characters</a>.
040 * Also, this check can be configured to allow using escapes
041 * if trail comment is present. By the option it is possible to
042 * allow using escapes if literal contains only them. By the option it
043 * is possible to allow using escapes for space literals.
044 * </p>
045 * <p>
046 * Examples of using Unicode:</p>
047 * <pre>
048 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
049 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
050 * </pre>
051 * <p>
052 * An example of how to configure the check is:
053 * </p>
054 * <pre>
055 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
056 * </pre>
057 * <p>
058 * An example of non-printable(control) characters.
059 * </p>
060 * <pre>
061 * return '&#92;ufeff' + content; // byte order mark
062 * </pre>
063 * <p>
064 * An example of how to configure the check to allow using escapes
065 * for non-printable(control) characters:
066 * </p>
067 * <pre>
068 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
069 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
070 * &lt;/module&gt;
071 * </pre>
072 * <p>
073 * Example of using escapes with trail comment:
074 * </p>
075 * <pre>
076 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
077 * </pre>
078 * <p>An example of how to configure the check to allow using escapes
079 * if trail comment is present:
080 * </p>
081 * <pre>
082 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
083 *     &lt;property name="allowByTailComment" value="true"/&gt;
084 * &lt;/module&gt;
085 * </pre>
086 * <p>Example of using escapes if literal contains only them:
087 * </p>
088 * <pre>
089 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
090 * </pre>
091 * <p>An example of how to configure the check to allow escapes
092 * if literal contains only them:
093 * </p>
094 * <pre>
095 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
096 *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
097 * &lt;/module&gt;
098 * </pre>
099 * <p>An example of how to configure the check to allow non-printable escapes:
100 * </p>
101 * <pre>
102 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
103 *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
104 * &lt;/module&gt;
105 * </pre>
106 *
107 * @author maxvetrenko
108 *
109 */
110public class AvoidEscapedUnicodeCharactersCheck
111    extends AbstractCheck {
112    /**
113     * A key is pointing to the warning message text in "messages.properties"
114     * file.
115     */
116    public static final String MSG_KEY = "forbid.escaped.unicode.char";
117
118    /** Regular expression for Unicode chars. */
119    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
120
121    /**
122     * Regular expression Unicode control characters.
123     *
124     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
125     *     Appendix:Control characters</a>
126     */
127    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)"
128            + "(00[0-1][0-9A-Fa-f]|00[8-9][0-9A-Fa-f]|00(a|A)(d|D)|034(f|F)|070(f|F)"
129            + "|180(e|E)|200[b-fB-F]|202[a-eA-E]|206[0-4a-fA-F]"
130            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");
131
132    /** Regular expression for all escaped chars. */
133    private static final Pattern ALL_ESCAPED_CHARS =
134            Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
135                    + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");
136
137    /** Regular expression for escaped backslash. */
138    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
139
140    /** Regular expression for non-printable unicode chars. */
141    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
142            + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
143            + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
144            + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
145            + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
146            + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
147            + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
148            + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
149            + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
150            + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
151            + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
152            + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
153            + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
154            + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
155            + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
156            + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
157            + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
158            + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
159            + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
160            + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");
161
162    /** Cpp style comments. */
163    private Map<Integer, TextBlock> singlelineComments;
164    /** C style comments. */
165    private Map<Integer, List<TextBlock>> blockComments;
166
167    /** Allow use escapes for non-printable(control) characters.  */
168    private boolean allowEscapesForControlCharacters;
169
170    /** Allow use escapes if trail comment is present. */
171    private boolean allowByTailComment;
172
173    /** Allow if all characters in literal are escaped. */
174    private boolean allowIfAllCharactersEscaped;
175
176    /** Allow escapes for space literals. */
177    private boolean allowNonPrintableEscapes;
178
179    /**
180     * Set allowIfAllCharactersEscaped.
181     * @param allow user's value.
182     */
183    public final void setAllowEscapesForControlCharacters(boolean allow) {
184        allowEscapesForControlCharacters = allow;
185    }
186
187    /**
188     * Set allowByTailComment.
189     * @param allow user's value.
190     */
191    public final void setAllowByTailComment(boolean allow) {
192        allowByTailComment = allow;
193    }
194
195    /**
196     * Set allowIfAllCharactersEscaped.
197     * @param allow user's value.
198     */
199    public final void setAllowIfAllCharactersEscaped(boolean allow) {
200        allowIfAllCharactersEscaped = allow;
201    }
202
203    /**
204     * Set allowSpaceEscapes.
205     * @param allow user's value.
206     */
207    public final void setAllowNonPrintableEscapes(boolean allow) {
208        allowNonPrintableEscapes = allow;
209    }
210
211    @Override
212    public int[] getDefaultTokens() {
213        return getAcceptableTokens();
214    }
215
216    @Override
217    public int[] getAcceptableTokens() {
218        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
219    }
220
221    @Override
222    public int[] getRequiredTokens() {
223        return getAcceptableTokens();
224    }
225
226    @Override
227    public void beginTree(DetailAST rootAST) {
228        singlelineComments = getFileContents().getSingleLineComments();
229        blockComments = getFileContents().getBlockComments();
230    }
231
232    @Override
233    public void visitToken(DetailAST ast) {
234
235        final String literal = ast.getText();
236
237        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
238                || isAllCharactersEscaped(literal)
239                || allowEscapesForControlCharacters
240                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
241                || allowNonPrintableEscapes
242                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
243            log(ast.getLineNo(), MSG_KEY);
244        }
245    }
246
247    /**
248     * Checks if literal has Unicode chars.
249     * @param literal String literal.
250     * @return true if literal has Unicode chars.
251     */
252    private static boolean hasUnicodeChar(String literal) {
253        final String literalWithoutEscapedBackslashes =
254                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
255        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
256    }
257
258    /**
259     * Check if String literal contains Unicode control chars.
260     * @param literal String literal.
261     * @param pattern RegExp for valid characters.
262     * @return true, if String literal contains Unicode control chars.
263     */
264    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
265        final int unicodeMatchesCounter =
266                countMatches(UNICODE_REGEXP, literal);
267        final int unicodeValidMatchesCounter =
268                countMatches(pattern, literal);
269        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
270    }
271
272    /**
273     * Check if trail comment is present after ast token.
274     * @param ast current token.
275     * @return true if trail comment is present after ast token.
276     */
277    private boolean hasTrailComment(DetailAST ast) {
278        boolean result = false;
279        final int lineNo = ast.getLineNo();
280        if (singlelineComments.containsKey(lineNo)) {
281            result = true;
282        }
283        else {
284            final String line = getLines()[lineNo - 1];
285            final List<TextBlock> commentList = blockComments.get(lineNo);
286            if (commentList != null) {
287                final TextBlock comment = commentList.get(commentList.size() - 1);
288                result = isTrailingBlockComment(comment, line);
289            }
290        }
291        return result;
292    }
293
294    /**
295     * Whether the C style comment is trailing.
296     * @param comment the comment to check.
297     * @param line the line where the comment starts.
298     * @return true if the comment is trailing.
299     */
300    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
301        return comment.getText().length != 1
302            || line.substring(comment.getEndColNo() + 1).trim().isEmpty();
303    }
304
305    /**
306     * Count regexp matches into String literal.
307     * @param pattern pattern.
308     * @param target String literal.
309     * @return count of regexp matches.
310     */
311    private static int countMatches(Pattern pattern, String target) {
312        int matcherCounter = 0;
313        final Matcher matcher = pattern.matcher(target);
314        while (matcher.find()) {
315            matcherCounter++;
316        }
317        return matcherCounter;
318    }
319
320    /**
321     * Checks if all characters in String literal is escaped.
322     * @param literal current literal.
323     * @return true if all characters in String literal is escaped.
324     */
325    private boolean isAllCharactersEscaped(String literal) {
326        return allowIfAllCharactersEscaped
327                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
328                        literal.length() - 1)).find();
329    }
330}