View Javadoc
1   /**
2    * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3    */
4   package net.sourceforge.pmd.cpd;
5   
import java.util.List;
import java.util.Locale;
7   
8   /**
9    *
10   * @author Zev Blut zb@ubit.com
11   * @author Romain PELISSE belaran@gmail.com
12   */
13  public abstract class AbstractTokenizer implements Tokenizer {
14  
15      // FIXME depending on subclasses to assign local vars is rather fragile -
16      // better to make private and setup via explicit hook methods
17  
18      protected List<String> stringToken; // List<String>, should be set by sub
19                                          // classes
20      protected List<String> ignorableCharacter; // List<String>, should be set by
21                                                 // sub classes
22                                                 // FIXME:Maybe an array of 'char'
23                                                 // would be better for
24                                                 // performance ?
25      protected List<String> ignorableStmt; // List<String>, should be set by sub
26                                            // classes
27      protected char oneLineCommentChar = '#'; // Most script languages ( shell,
28                                               // ruby, python,...) use this
29                                               // symbol for comment line
30  
31      private List<String> code;
32      private int lineNumber = 0;
33      private String currentLine;
34  
35      protected boolean spanMultipleLinesString = true; // Most languages do, so
36                                                        // default is true
37      protected Character spanMultipleLinesLineContinuationCharacter = null;
38  
39      private boolean downcaseString = true;
40  
41      public void tokenize(SourceCode tokens, Tokens tokenEntries) {
42          code = tokens.getCode();
43  
44          for (lineNumber = 0; lineNumber < code.size(); lineNumber++) {
45              currentLine = code.get(lineNumber);
46              int loc = 0;
47              while (loc < currentLine.length()) {
48                  StringBuilder token = new StringBuilder();
49                  loc = getTokenFromLine(token, loc);
50                  if (token.length() > 0 && !isIgnorableString(token.toString())) {
51                      if (downcaseString) {
52                          token = new StringBuilder(token.toString().toLowerCase());
53                      }
54                      // need to re-think how to link this
55                      // if ( CPD.debugEnable ) {
56                      // System.out.println("Token added:" + token.toString());
57                      // }
58                      tokenEntries.add(new TokenEntry(token.toString(), tokens.getFileName(), lineNumber));
59  
60                  }
61              }
62          }
63          tokenEntries.add(TokenEntry.getEOF());
64      }
65  
66      private int getTokenFromLine(StringBuilder token, int loc) {
67          for (int j = loc; j < currentLine.length(); j++) {
68              char tok = currentLine.charAt(j);
69              if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
70                  if (isComment(tok)) {
71                      if (token.length() > 0) {
72                          return j;
73                      } else {
74                          return getCommentToken(token, loc);
75                      }
76                  } else if (isString(tok)) {
77                      if (token.length() > 0) {
78                          return j; // we need to now parse the string as a
79                                    // separate token.
80                      } else {
81                          // we are at the start of a string
82                          return parseString(token, j, tok);
83                      }
84                  } else {
85                      token.append(tok);
86                  }
87              } else {
88                  if (token.length() > 0) {
89                      return j;
90                  }
91              }
92              loc = j;
93          }
94          return loc + 1;
95      }
96  
97      private int parseString(StringBuilder token, int loc, char stringDelimiter) {
98          boolean escaped = false;
99          boolean done = false;
100         char tok = ' '; // this will be replaced.
101         while (loc < currentLine.length() && !done) {
102             tok = currentLine.charAt(loc);
103             if (escaped && tok == stringDelimiter) { // Found an escaped string
104                 escaped = false;
105             } else if (tok == stringDelimiter && token.length() > 0) {
106                 // We are done, we found the end of the string...
107                 done = true;
108             } else if (tok == '\\') { // Found an escaped char
109                 escaped = true;
110             } else { // Adding char...
111                 escaped = false;
112             }
113             // Adding char to String:" + token.toString());
114             token.append(tok);
115             loc++;
116         }
117         // Handling multiple lines string
118         if (!done && // ... we didn't find the end of the string
119                 loc >= currentLine.length() && // ... we have reach the end of
120                                                // the line ( the String is
121                                                // incomplete, for the moment at
122                                                // least)
123                 spanMultipleLinesString && // ... the language allow multiple
124                                            // line span Strings
125                 lineNumber < code.size() - 1 // ... there is still more lines to
126                                              // parse
127         ) {
128             // removes last character, if it is the line continuation (e.g.
129             // backslash) character
130             if (spanMultipleLinesLineContinuationCharacter != null && token.length() > 0
131                     && token.charAt(token.length() - 1) == spanMultipleLinesLineContinuationCharacter.charValue()) {
132                 token.deleteCharAt(token.length() - 1);
133             }
134             // parsing new line
135             currentLine = code.get(++lineNumber);
136             // Warning : recursive call !
137             loc = parseString(token, 0, stringDelimiter);
138         }
139         return loc + 1;
140     }
141 
142     private boolean ignoreCharacter(char tok) {
143         return ignorableCharacter.contains(String.valueOf(tok));
144     }
145 
146     private boolean isString(char tok) {
147         return stringToken.contains(String.valueOf(tok));
148     }
149 
150     private boolean isComment(char tok) {
151         return tok == oneLineCommentChar;
152     }
153 
154     private int getCommentToken(StringBuilder token, int loc) {
155         while (loc < currentLine.length()) {
156             token.append(currentLine.charAt(loc++));
157         }
158         return loc;
159     }
160 
161     private boolean isIgnorableString(String token) {
162         return ignorableStmt.contains(token);
163     }
164 }