View Javadoc
1   /**
2    * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3    */
4   package net.sourceforge.pmd.cpd;
5   
6   import java.io.BufferedReader;
7   import java.io.CharArrayReader;
8   import java.io.Closeable;
9   import java.io.IOException;
10  import java.io.PushbackReader;
11  import java.util.Properties;
12  
13  import org.apache.commons.io.IOUtils;
14  import org.apache.commons.lang3.RandomStringUtils;
15  
16  /**
17   * This class does a best-guess try-anything tokenization.
18   *
19   * @author jheintz
20   */
21  public class CsTokenizer implements Tokenizer {
22  
23      private boolean ignoreUsings = false;
24  
25      public void setProperties(Properties properties) {
26          if (properties.containsKey(IGNORE_USINGS)) {
27              ignoreUsings = Boolean.parseBoolean(properties.getProperty(IGNORE_USINGS, "false"));
28          }
29      }
30  
31      @Override
32      public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
33          Tokenizer tokenizer =
34                  new Tokenizer(sourceCode.getCodeBuffer().toString());
35          Token token = tokenizer.getNextToken();
36  
37          while (!token.equals(Token.EOF)) {
38              Token lookAhead = tokenizer.getNextToken();
39  
40              // Ignore using directives
41              // Only using directives should be ignored, because these are used to import namespaces
42              //
43              // Using directive: 'using System.Math;'
44              // Using statement: 'using (Font font1 = new Font(..)) { .. }'
45              if (ignoreUsings &&
46                      "using".equals(token.image) &&
47                      !"(".equals(lookAhead.image)
48              ) {
49                  // We replace the 'using' token by a random token, because it should not be part of
50                  // any duplication block. When we omit it from the token stream, there is a change that
51                  // we get a duplication block that starts before the 'using' directives and ends afterwards.
52                  String randomTokenText =
53                      RandomStringUtils.randomAlphanumeric(20);
54  
55                  token = new Token(randomTokenText, token.lineNumber);
56                  //Skip all other tokens of the using directive to prevent a partial matching
57                  while (!";".equals(lookAhead.image) && !lookAhead.equals(Token.EOF)) {
58                      lookAhead = tokenizer.getNextToken();
59                  }
60              }
61              if (!";".equals(token.image)) {
62                  tokenEntries.add(new TokenEntry(token.image, sourceCode.getFileName(), token.lineNumber));
63              }
64              token = lookAhead;
65          }
66          tokenEntries.add(TokenEntry.getEOF());
67          IOUtils.closeQuietly(tokenizer);
68      }
69  
70      public void setIgnoreUsings(boolean ignoreUsings) {
71          this.ignoreUsings = ignoreUsings;
72      }
73  
74  
75      private static class Tokenizer implements Closeable {
76          private boolean endOfFile;
77          private int line;
78          private final PushbackReader reader;
79  
80          public Tokenizer(String sourceCode) {
81              endOfFile = false;
82              line = 1;
83              reader = new PushbackReader(new BufferedReader(new CharArrayReader(sourceCode.toCharArray())));
84          }
85  
86          public Token getNextToken() {
87              if (endOfFile) {
88                  return Token.EOF;
89              }
90  
91              try {
92                  int ic = reader.read();
93                  char c;
94                  StringBuilder b;
95                  while (ic != -1) {
96                      c = (char) ic;
97                      switch (c) {
98                      // new line
99                      case '\n':
100                         line++;
101                         ic = reader.read();
102                         break;
103 
104                     // white space
105                     case ' ':
106                     case '\t':
107                     case '\r':
108                         ic = reader.read();
109                         break;
110 
111                     case ';':
112                         return new Token(";", line);
113 
114                     // < << <= <<= > >> >= >>=
115                     case '<':
116                     case '>':
117                         ic = reader.read();
118                         if (ic == '=') {
119                             return new Token(c + "=", line);
120                         } else if (ic == c) {
121                             ic = reader.read();
122                             if (ic == '=') {
123                                 return new Token(c +  c + "=", line);
124                             } else {
125                                 reader.unread(ic);
126                                 return new Token(String.valueOf(c) + c, line);
127                             }
128                         } else {
129                             reader.unread(ic);
130                             return new Token(String.valueOf(c), line);
131                         }
132 
133                     // = == & &= && | |= || + += ++ - -= --
134                     case '=':
135                     case '&':
136                     case '|':
137                     case '+':
138                     case '-':
139                         ic = reader.read();
140                         if (ic == '=' || ic == c) {
141                             return new Token(c + String.valueOf((char) ic), line);
142                         } else {
143                             reader.unread(ic);
144                             return new Token(String.valueOf(c), line);
145                         }
146 
147                     // ! != * *= % %= ^ ^= ~ ~=
148                     case '!':
149                     case '*':
150                     case '%':
151                     case '^':
152                     case '~':
153                         ic = reader.read();
154                         if (ic == '=') {
155                             return new Token(c + "=", line);
156                         } else {
157                             reader.unread(ic);
158                             return new Token(String.valueOf(c), line);
159                         }
160 
161                     // strings & chars
162                     case '"':
163                     case '\'':
164                         int beginLine = line;
165                         b = new StringBuilder();
166                         b.append(c);
167                         while ((ic = reader.read()) != c) {
168                             if (ic == -1) {
169                                 break;
170                             }
171                             b.append((char) ic);
172                             if (ic == '\\') {
173                                 int next = reader.read();
174                                 if (next != -1) {
175                                     b.append((char) next);
176 
177                                     if (next == '\n') {
178                                         line++;
179                                     }
180                                 }
181                             } else if (ic == '\n') {
182                                 line++;
183                             }
184                         }
185                         if (ic != -1) {
186                             b.append((char) ic);
187                         }
188                         return new Token(b.toString(), beginLine);
189 
190                     // / /= /*...*/ //...
191                     case '/':
192                         switch (c = (char) (ic = reader.read())) {
193                         case '*':
194                             //int beginLine = line;
195                             int state = 1;
196                             b = new StringBuilder();
197                             b.append("/*");
198 
199                             while ((ic = reader.read()) != -1) {
200                                 c = (char) ic;
201                                 b.append(c);
202 
203                                 if (c == '\n') {
204                                     line++;
205                                 }
206 
207                                 if (state == 1) {
208                                     if (c == '*') {
209                                         state = 2;
210                                     }
211                                 } else {
212                                     if (c == '/') {
213                                         ic = reader.read();
214                                         break;
215                                     } else if (c != '*') {
216                                         state = 1;
217                                     }
218                                 }
219                             }
220                             // ignore the /* comment
221                             // tokenEntries.add(new TokenEntry(b.toString(),
222                             // sourceCode.getFileName(), beginLine));
223                             break;
224 
225                         case '/':
226                             b = new StringBuilder();
227                             b.append("//");
228                             while ((ic = reader.read()) != '\n') {
229                                 if (ic == -1) {
230                                     break;
231                                 }
232                                 b.append((char) ic);
233                             }
234                             // ignore the // comment
235                             // tokenEntries.add(new TokenEntry(b.toString(),
236                             // sourceCode.getFileName(), line));
237                             break;
238 
239                         case '=':
240                             return new Token("/=", line);
241 
242                         default:
243                             reader.unread(ic);
244                             return new Token("/", line);
245                         }
246                         break;
247 
248                     default:
249                         // [a-zA-Z_][a-zA-Z_0-9]*
250                         if (Character.isJavaIdentifierStart(c)) {
251                             b = new StringBuilder();
252                             do {
253                                 b.append(c);
254                                 c = (char) (ic = reader.read());
255                             } while (Character.isJavaIdentifierPart(c));
256                             reader.unread(ic);
257                             return new Token(b.toString(), line);
258                         }
259                         // numbers
260                         else if (Character.isDigit(c) || c == '.') {
261                             b = new StringBuilder();
262                             do {
263                                 b.append(c);
264                                 if (c == 'e' || c == 'E') {
265                                     c = (char) (ic = reader.read());
266                                     if ("1234567890-".indexOf(c) == -1) {
267                                         break;
268                                     }
269                                     b.append(c);
270                                 }
271                                 c = (char) (ic = reader.read());
272                             } while ("1234567890.iIlLfFdDsSuUeExX".indexOf(c) != -1);
273                             reader.unread(ic);
274                             return new Token(b.toString(), line);
275                         }
276                         // anything else
277                         else {
278                             return new Token(String.valueOf(c), line);
279                         }
280                     }
281                 }
282             } catch (IOException e) {
283                 e.printStackTrace();
284             }
285             endOfFile = true;
286             return Token.EOF;
287         }
288 
289         @Override
290         public void close() throws IOException {
291             reader.close();
292         }
293     }
294 
295     private static class Token {
296         public static final Token EOF = new Token("EOF", -1);
297 
298         public final String image;
299         public final int lineNumber;
300 
301         public Token(String image, int lineNumber) {
302             this.image = image;
303             this.lineNumber = lineNumber;
304         }
305     }
306 }