////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2017 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;

/**
 * <p>
 * Restricts using <a href =
 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
 * Unicode escapes</a> (such as <code>&#92;u221e</code>).
 * It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable (control) characters</a>.
 * Also, this check can be configured to allow using escapes
 * if a trailing comment is present. By option it is possible to
 * allow using escapes if the literal contains only them. By option it
 * is possible to allow using escapes for space literals.
 * </p>
 * <p>
 * Examples of using Unicode:</p>
 * <pre>
 * String unitAbbrev = "μs"; // Best: perfectly clear even without a comment.
 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
 * </pre>
 * <p>
 * An example of how to configure the check is:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
 * </pre>
 * <p>
 * An example of non-printable (control) characters.
 * </p>
 * <pre>
 * return '&#92;ufeff' + content; // byte order mark
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * for non-printable (control) characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Example of using escapes with a trailing comment:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * if a trailing comment is present:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowByTailComment" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes if the literal contains only them:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
 * </pre>
 * <p>An example of how to configure the check to allow escapes
 * if the literal contains only them:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>An example of how to configure the check to allow non-printable escapes:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 *
 * @author maxvetrenko
 *
 */
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {
    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /**
     * Regular expression for a Unicode escape: a backslash, one or more
     * {@code u} characters (the language specification allows the marker to
     * be repeated) and exactly four hexadecimal digits.
     */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F0-9]{4}");

    /**
     * Regular expression for escapes of Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\[uU]+"
            + "(?:00[0-1][0-9A-Fa-f]|00[8-9][0-9A-Fa-f]|00[aA][dD]|034[fF]|070[fF]"
            + "|180[eE]|200[b-fB-F]|202[a-eA-E]|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");

    /**
     * Regular expression for a literal body that consists only of escapes
     * (Unicode escapes and the backslash escapes for b, t, n, f, r, backslash,
     * double quote and single quote) and bare quote characters.
     * Intended to be used with {@link Matcher#matches()}.
     * A lone backslash is deliberately not an alternative: otherwise an
     * escaped backslash followed by plain text would be read as an escape.
     */
    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("(?:"
            + UNICODE_REGEXP.pattern()
            + "|\\\\b|\\\\t|\\\\n|\\\\f|\\\\r"
            + "|\\\\\\\\|\\\\\"|\\\\'"
            + "|\"|')+");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /**
     * Regular expression for escapes of non-printable Unicode chars.
     * Every alternative is exactly four hexadecimal digits, so the order of
     * the alternatives does not influence what is matched.
     */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u+(?:"
            + "0000|0009|000[bB]|000[cC]|0020|007[fF]|0085|009[fF]|00[aA]0|00[aA][dD]"
            + "|04[fF]9|05[bB][eE]|05[dD]0|05[eE][aA]|05[fF]3|05[fF]4"
            + "|0600|0604|061[cC]|06[dD]{2}|06[fF]{2}|070[fF]|0750|077[fF]"
            + "|0[eE]00|0[eE]7[fF]|1680|180[eE]|1[eE]00"
            + "|2000|2001|2002|2003|2004|2005|2006|2007|2008|2009|200[aA]|200[fF]"
            + "|2025|2028|2029|202[fF]|205[fF]|2064|2066|2067|2068|2069|206[aA]|206[fF]"
            + "|20[aA][fF]|2100|213[aA]|3000|[dD]800"
            + "|[fF]8[fF]{2}|[fF][bB]50|[fF][dD][fF]{2}|[fF][eE]70|[fF][eE][fF]{2}"
            + "|[fF]{2}0[eE]|[fF]{2}61|[fF]{2}[dD][cC]"
            + "|[fF]{3}9|[fF]{3}[aA]|[fF]{3}[bB]|[fF]{4}"
            + ")");

    /** Cpp style comments, keyed by line number. */
    private Map<Integer, TextBlock> singlelineComments;
    /** C style comments, keyed by line number. */
    private Map<Integer, List<TextBlock>> blockComments;

    /** Allow use escapes for non-printable(control) characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if trail comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow escapes for space literals. */
    private boolean allowNonPrintableEscapes;

    /**
     * Set allowEscapesForControlCharacters.
     * @param allow user's value.
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Set allowByTailComment.
     * @param allow user's value.
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Set allowIfAllCharactersEscaped.
     * @param allow user's value.
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Set allowNonPrintableEscapes.
     * @param allow user's value.
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getAcceptableTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
    }

    @Override
    public int[] getRequiredTokens() {
        return getAcceptableTokens();
    }

    @Override
    public void beginTree(DetailAST rootAST) {
        // Cache the comments of the current file; they are consulted for every literal.
        singlelineComments = getFileContents().getSingleLineComments();
        blockComments = getFileContents().getBlockComments();
    }

    @Override
    public void visitToken(DetailAST ast) {

        final String literal = ast.getText();

        // A literal with a Unicode escape is reported unless one of the
        // enabled options explicitly permits it.
        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
                || isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
            log(ast.getLineNo(), MSG_KEY);
        }
    }

    /**
     * Removes every escaped backslash (a pair of backslashes) from the literal,
     * so that a backslash which is itself escaped cannot be mistaken for the
     * start of a Unicode escape.
     * @param literal String literal.
     * @return the literal without escaped backslashes.
     */
    private static String removeEscapedBackslashes(String literal) {
        return ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
    }

    /**
     * Checks if literal has Unicode chars.
     * @param literal String literal.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        return UNICODE_REGEXP.matcher(removeEscapedBackslashes(literal)).find();
    }

    /**
     * Checks if every Unicode escape in the literal is matched by the given
     * pattern of permitted escapes.
     * @param literal String literal.
     * @param pattern RegExp for valid characters.
     * @return true, if the number of Unicode escapes in the literal equals
     *     the number of escapes matched by {@code pattern}.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        // Count on the same normalized text that hasUnicodeChar inspects; otherwise
        // an escaped backslash followed by "uXXXX" text is counted as an escape.
        final String literalWithoutEscapedBackslashes = removeEscapedBackslashes(literal);
        final int unicodeMatchesCounter =
                countMatches(UNICODE_REGEXP, literalWithoutEscapedBackslashes);
        final int unicodeValidMatchesCounter =
                countMatches(pattern, literalWithoutEscapedBackslashes);
        return unicodeMatchesCounter == unicodeValidMatchesCounter;
    }

    /**
     * Check if trail comment is present after ast token.
     * A single-line comment on the token's line always counts; otherwise the
     * last block comment registered for that line is examined.
     * @param ast current token.
     * @return true if trail comment is present after ast token.
     */
    private boolean hasTrailComment(DetailAST ast) {
        final int lineNo = ast.getLineNo();
        boolean result = singlelineComments.containsKey(lineNo);
        if (!result) {
            final List<TextBlock> commentList = blockComments.get(lineNo);
            if (commentList != null) {
                final TextBlock comment = commentList.get(commentList.size() - 1);
                final String line = getLines()[lineNo - 1];
                result = isTrailingBlockComment(comment, line);
            }
        }
        return result;
    }

    /**
     * Whether the C style comment is trailing.
     * A comment whose text has more than one entry (presumably one entry per
     * source line - confirm against TextBlock) is always treated as trailing;
     * otherwise nothing but whitespace may follow the comment on the line.
     * @param comment the comment to check.
     * @param line the line where the comment starts.
     * @return true if the comment is trailing.
     */
    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
        return comment.getText().length != 1
            || line.substring(comment.getEndColNo() + 1).trim().isEmpty();
    }

    /**
     * Count regexp matches into String literal.
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in String literal are escaped.
     * Always false unless the allowIfAllCharactersEscaped option is enabled.
     * @param literal current literal, including its surrounding quotes.
     * @return true if all characters in String literal are escaped.
     */
    private boolean isAllCharactersEscaped(String literal) {
        // Strip the opening and closing quote before matching the body.
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
                        literal.length() - 1)).matches();
    }
}