View Javadoc

1   /**
2    * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3    */
4   package net.sourceforge.pmd.cpd;
5   
6   import java.util.List;
7   
8   /**
9    *
10   * @author Zev Blut zb@ubit.com
11   * @author Romain PELISSE belaran@gmail.com
12   */
13  public abstract class AbstractTokenizer implements Tokenizer {
14  
15  	//FIXME depending on subclasses to assign local vars is rather fragile - better to make private and setup via explicit hook methods
16  	
17  	protected List<String> stringToken;		    // List<String>, should be set by sub classes
18  	protected List<String> ignorableCharacter;  // List<String>, should be set by sub classes
19  												// FIXME:Maybe an array of 'char' would be better for performance ?
20  	protected List<String> ignorableStmt; 		// List<String>, should be set by sub classes
21  	protected char oneLineCommentChar = '#'; // Most script languages ( shell, ruby, python,...) use this symbol for comment line
22  
23  	private List<String> code;
24  	private int lineNumber = 0;
25  	private String currentLine;
26  
27  	protected boolean spanMultipleLinesString = true;	// Most languages do, so default is true
28  
29  	private boolean downcaseString = true;
30  
31      public void tokenize(SourceCode tokens, Tokens tokenEntries) {
32          code = tokens.getCode();
33  
34          for ( lineNumber = 0; lineNumber < code.size(); lineNumber++ ) {
35          	currentLine = code.get(lineNumber);
36              int loc = 0;
37              while ( loc < currentLine.length() ) {
38                  StringBuilder token = new StringBuilder();
39                  loc = getTokenFromLine(token,loc);
40                  if (token.length() > 0 && !isIgnorableString(token.toString())) {
41                      if (downcaseString) {
42                          token = new StringBuilder(token.toString().toLowerCase());
43                      }
44  // need to re-think how to link this                    
45  //                    if ( CPD.debugEnable ) {
46  //                    	System.out.println("Token added:" + token.toString());
47  //                    }
48                      tokenEntries.add(new TokenEntry(token.toString(),
49                              tokens.getFileName(),
50                              lineNumber)
51                      		);
52  
53                  }
54              }
55          }
56          tokenEntries.add(TokenEntry.getEOF());
57      }
58  
59      private int getTokenFromLine(StringBuilder token, int loc) {
60          for (int j = loc; j < currentLine.length(); j++) {
61              char tok = currentLine.charAt(j);
62              if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
63                  if (isComment(tok)) {
64                      if (token.length() > 0) {
65                          return j;
66                      } else {
67                          return getCommentToken(token, loc);
68                      }
69                  } else if (isString(tok)) {
70                      if (token.length() > 0) {
71                          return j; // we need to now parse the string as a separate token.
72                      } else {
73                          // we are at the start of a string
74                          return parseString(token, j, tok);
75                      }
76                  } else {
77                      token.append(tok);
78                  }
79              } else {
80                  if (token.length() > 0) {
81                      return j;
82                  }
83              }
84              loc = j;
85          }
86          return loc + 1;
87      }
88  
89      private int parseString(StringBuilder token, int loc, char stringDelimiter) {
90          boolean escaped = false;
91          boolean done = false;
92          char tok = ' '; // this will be replaced.
93          while ((loc < currentLine.length()) && ! done) {
94              tok = currentLine.charAt(loc);
95              if (escaped && tok == stringDelimiter) { // Found an escaped string
96                  escaped = false;
97              } else if (tok == stringDelimiter && (token.length() > 0)) { // We are done, we found the end of the string...
98                  done = true;
99              } else if (tok == '\\') { // Found an escaped char
100                 escaped = true;
101             } else {	// Adding char...
102                 escaped = false;
103             }
104             //Adding char to String:" + token.toString());
105             token.append(tok);
106             loc++;
107         }
108         // Handling multiple lines string
109         if ( 	! done &&	// ... we didn't find the end of the string
110         		loc >= currentLine.length() && // ... we have reach the end of the line ( the String is incomplete, for the moment at least)
111         		spanMultipleLinesString && // ... the language allow multiple line span Strings
112         		lineNumber < code.size() - 1 // ... there is still more lines to parse
113         	) {
114         	// parsing new line
115         	currentLine = code.get(++lineNumber);
116         	// Warning : recursive call !
117         	loc = parseString(token, loc, stringDelimiter);
118         }
119         return loc + 1;
120     }
121 
122     private boolean ignoreCharacter(char tok)
123     {
124     	return ignorableCharacter.contains(String.valueOf(tok));
125     }
126 
127     private boolean isString(char tok)
128     {
129     	return stringToken.contains(String.valueOf(tok));
130     }
131 
132     private boolean isComment(char tok)
133     {
134         return tok == oneLineCommentChar;
135     }
136 
137     private int getCommentToken(StringBuilder token, int loc)
138     {
139         while (loc < currentLine.length())
140         {
141             token.append(currentLine.charAt(loc++));
142         }
143         return loc;
144     }
145 
146     private boolean isIgnorableString(String token)
147     {
148     	return ignorableStmt.contains(token);
149     }
150 }