View Javadoc
1   /**
2    * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3    */
4   package net.sourceforge.pmd.cpd;
5   
6   import java.io.BufferedReader;
7   import java.io.CharArrayReader;
8   import java.util.NoSuchElementException;
9   import java.util.StringTokenizer;
10  
11  import org.apache.commons.io.IOUtils;
12  
13  /**
14   * This class does a best-guess try-anything tokenization.
15   *
16   * @author jheintz
17   */
18  public class AnyTokenizer implements Tokenizer {
19      public static final String TOKENS = " \t!#$%^&*(){}-=+<>/\\`~;:";
20  
21      public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
22          StringBuilder sb = sourceCode.getCodeBuffer();
23          BufferedReader reader = new BufferedReader(new CharArrayReader(sb.toString().toCharArray()));
24          try {
25              int lineNumber = 1;
26              String line = reader.readLine();
27              while (line != null) {
28                  StringTokenizer tokenizer = new StringTokenizer(line, TOKENS, true);
29                  try {
30                      String token = tokenizer.nextToken();
31                      while (token != null) {
32                          if (!token.equals(" ") && !token.equals("\t")) {
33                              tokenEntries.add(new TokenEntry(token, sourceCode.getFileName(), lineNumber));
34                          }
35                          token = tokenizer.nextToken();
36                      }
37                  } catch (NoSuchElementException ex) {
38                      // done with tokens
39                  }
40                  // advance iteration variables
41                  line = reader.readLine();
42                  lineNumber++;
43              }
44          } catch (Exception ex) {
45              ex.printStackTrace();
46          } finally {
47              IOUtils.closeQuietly(reader);
48              tokenEntries.add(TokenEntry.getEOF());
49          }
50      }
51  }