1 /**
2 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3 * @author Zev Blut zb@ubit.com
4 * @author Romain PELISSE belaran@gmail.com
5 */
6 package net.sourceforge.pmd.cpd;
7
8 import java.util.List;
9
10 public abstract class AbstractTokenizer implements Tokenizer
11 {
12
13 protected List<String> stringToken;
14 protected List<String> ignorableCharacter;
15
16 protected List<String> ignorableStmt;
17 protected char ONE_LINE_COMMENT_CHAR = '#';
18
19 private List<String> code;
20 private int lineNumber = 0;
21 private String currentLine;
22
23 protected boolean spanMultipleLinesString = true;
24
25 private boolean downcaseString = true;
26
27 public void tokenize(SourceCode tokens, Tokens tokenEntries) {
28 this.code = tokens.getCode();
29
30 for ( this.lineNumber = 0; lineNumber < this.code.size(); lineNumber++ ) {
31 this.currentLine = this.code.get(this.lineNumber);
32 int loc = 0;
33 while ( loc < currentLine.length() ) {
34 StringBuffer token = new StringBuffer();
35 loc = getTokenFromLine(token,loc);
36 if (token.length() > 0 && !isIgnorableString(token.toString())) {
37 if (downcaseString) {
38 token = new StringBuffer(token.toString().toLowerCase());
39 }
40 if ( CPD.debugEnable )
41 System.out.println("Token added:" + token.toString());
42 tokenEntries.add(new TokenEntry(token.toString(),
43 tokens.getFileName(),
44 lineNumber));
45
46 }
47 }
48 }
49 tokenEntries.add(TokenEntry.getEOF());
50 }
51
52 private int getTokenFromLine(StringBuffer token, int loc) {
53 for (int j = loc; j < this.currentLine.length(); j++) {
54 char tok = this.currentLine.charAt(j);
55 if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
56 if (isComment(tok)) {
57 if (token.length() > 0) {
58 return j;
59 } else {
60 return getCommentToken(token, loc);
61 }
62 } else if (isString(tok)) {
63 if (token.length() > 0) {
64 return j;
65 } else {
66
67 return parseString(token, j, tok);
68 }
69 } else {
70 token.append(tok);
71 }
72 } else {
73 if (token.length() > 0) {
74 return j;
75 }
76 }
77 loc = j;
78 }
79 return loc + 1;
80 }
81
82 private int parseString(StringBuffer token, int loc, char stringDelimiter) {
83 boolean escaped = false;
84 boolean done = false;
85 char tok = ' ';
86 while ((loc < currentLine.length()) && ! done) {
87 tok = currentLine.charAt(loc);
88 if (escaped && tok == stringDelimiter)
89 escaped = false;
90 else if (tok == stringDelimiter && (token.length() > 0))
91 done = true;
92 else if (tok == '\\')
93 escaped = true;
94 else
95 escaped = false;
96
97 token.append(tok);
98 loc++;
99 }
100
101 if ( ! done &&
102 loc >= currentLine.length() &&
103 this.spanMultipleLinesString &&
104 ++this.lineNumber < this.code.size()
105 ) {
106
107 this.currentLine = this.code.get(this.lineNumber);
108
109 loc = this.parseString(token, loc, stringDelimiter);
110 }
111 return loc + 1;
112 }
113
114 private boolean ignoreCharacter(char tok)
115 {
116 return this.ignorableCharacter.contains("" + tok);
117 }
118
119 private boolean isString(char tok)
120 {
121 return this.stringToken.contains("" + tok);
122 }
123
124 private boolean isComment(char tok)
125 {
126 return tok == ONE_LINE_COMMENT_CHAR;
127 }
128
129 private int getCommentToken(StringBuffer token, int loc)
130 {
131 while (loc < this.currentLine.length())
132 {
133 token.append(this.currentLine.charAt(loc++));
134 }
135 return loc;
136 }
137
138 private boolean isIgnorableString(String token)
139 {
140 return this.ignorableStmt.contains(token);
141 }
142 }