View Javadoc

1   /**
2    * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3    * @author Zev Blut zb@ubit.com
4    * @author Romain PELISSE belaran@gmail.com
5    */
6   package net.sourceforge.pmd.cpd;
7   
8   import java.util.List;
9   
10  public abstract class AbstractTokenizer implements Tokenizer
11  {
12  
13  	protected List<String> stringToken;			// List<String>, should be setted by children classes
14  	protected List<String> ignorableCharacter; 	// List<String>, should be setted by children classes
15  												// FIXME:Maybe an array of 'char' would be better for perfomance ?
16  	protected List<String> ignorableStmt; 		// List<String>, should be setted by children classes
17  	protected char ONE_LINE_COMMENT_CHAR = '#'; // Most script language ( shell, ruby, python,...) use this symbol for comment line
18  
19  	private List<String> code;
20  	private int lineNumber = 0;
21  	private String currentLine;
22  
23  	protected boolean spanMultipleLinesString = true;	// Most language does, so default is true
24  
25  	private boolean downcaseString = true;
26  
27      public void tokenize(SourceCode tokens, Tokens tokenEntries) {
28          this.code = tokens.getCode();
29  
30          for ( this.lineNumber = 0; lineNumber < this.code.size(); lineNumber++ ) {
31          	this.currentLine = this.code.get(this.lineNumber);
32              int loc = 0;
33              while ( loc < currentLine.length() ) {
34                  StringBuffer token = new StringBuffer();
35                  loc = getTokenFromLine(token,loc);
36                  if (token.length() > 0 && !isIgnorableString(token.toString())) {
37                      if (downcaseString) {
38                          token = new StringBuffer(token.toString().toLowerCase());
39                      }
40                      if ( CPD.debugEnable )
41                      	System.out.println("Token added:" + token.toString());
42                      tokenEntries.add(new TokenEntry(token.toString(),
43                              tokens.getFileName(),
44                              lineNumber));
45  
46                  }
47              }
48          }
49          tokenEntries.add(TokenEntry.getEOF());
50      }
51  
52      private int getTokenFromLine(StringBuffer token, int loc) {
53          for (int j = loc; j < this.currentLine.length(); j++) {
54              char tok = this.currentLine.charAt(j);
55              if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
56                  if (isComment(tok)) {
57                      if (token.length() > 0) {
58                          return j;
59                      } else {
60                          return getCommentToken(token, loc);
61                      }
62                  } else if (isString(tok)) {
63                      if (token.length() > 0) {
64                          return j; // we need to now parse the string as a seperate token.
65                      } else {
66                          // we are at the start of a string
67                          return parseString(token, j, tok);
68                      }
69                  } else {
70                      token.append(tok);
71                  }
72              } else {
73                  if (token.length() > 0) {
74                      return j;
75                  }
76              }
77              loc = j;
78          }
79          return loc + 1;
80      }
81  
82      private int parseString(StringBuffer token, int loc, char stringDelimiter) {
83          boolean escaped = false;
84          boolean done = false;
85          char tok = ' '; // this will be replaced.
86          while ((loc < currentLine.length()) && ! done) {
87              tok = currentLine.charAt(loc);
88              if (escaped && tok == stringDelimiter) // Found an escaped string
89                  escaped = false;
90              else if (tok == stringDelimiter && (token.length() > 0)) // We are done, we found the end of the string...
91                  done = true;
92              else if (tok == '\\') // Found an escaped char
93                  escaped = true;
94              else	// Adding char...
95                  escaped = false;
96              //Adding char to String:" + token.toString());
97              token.append(tok);
98              loc++;
99          }
100         // Handling multiple lines string
101         if ( 	! done &&	// ... we didn't find the end of the string
102         		loc >= currentLine.length() && // ... we have reach the end of the line ( the String is incomplete, for the moment at least)
103         		this.spanMultipleLinesString && // ... the language allow multiple line span Strings
104         		++this.lineNumber < this.code.size() // ... there is still more lines to parse
105         	) {
106         	// parsing new line
107         	this.currentLine = this.code.get(this.lineNumber);
108         	// Warning : recursive call !
109         	loc = this.parseString(token, loc, stringDelimiter);
110         }
111         return loc + 1;
112     }
113 
114     private boolean ignoreCharacter(char tok)
115     {
116     	return this.ignorableCharacter.contains("" + tok);
117     }
118 
119     private boolean isString(char tok)
120     {
121     	return this.stringToken.contains("" + tok);
122     }
123 
124     private boolean isComment(char tok)
125     {
126         return tok == ONE_LINE_COMMENT_CHAR;
127     }
128 
129     private int getCommentToken(StringBuffer token, int loc)
130     {
131         while (loc < this.currentLine.length())
132         {
133             token.append(this.currentLine.charAt(loc++));
134         }
135         return loc;
136     }
137 
138     private boolean isIgnorableString(String token)
139     {
140     	return this.ignorableStmt.contains(token);
141     }
142 }