View Javadoc

1   /**
2    * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3    */
4   package net.sourceforge.pmd.cpd;
5   
6   import java.io.BufferedReader;
7   import java.io.CharArrayReader;
8   import java.io.IOException;
9   
10  import org.apache.commons.io.IOUtils;
11  
12  /**
13   * This class does a best-guess try-anything tokenization.
14   *
15   * @author jheintz
16   */
17  public class CsTokenizer implements Tokenizer {
18  	
19      public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
20      	BufferedReader reader = new BufferedReader(new CharArrayReader(sourceCode.getCodeBuffer().toString().toCharArray()));
21      	try {
22      		int ic = reader.read(), line=1;
23      		char c;
24      		StringBuilder b;
25  			while(ic!=-1)
26  			{
27  				c = (char)ic;
28  				switch(c)
29  				{
30  					// new line
31  				case '\n':
32  					line++;
33  					ic = reader.read();
34  					break;
35  				
36  					// white space
37  				case ' ':
38  				case '\t':
39  				case '\r':
40  					ic = reader.read();
41  					break;
42  
43  					// ignore semicolons
44  				case ';':
45  					ic = reader.read();
46  					break;
47  
48  					// < << <= <<= > >> >= >>=
49  				case '<':
50  				case '>':
51  					ic = reader.read();
52  					if(ic == '=')
53  					{
54  						tokenEntries.add(new TokenEntry(String.valueOf(c)+"=", sourceCode.getFileName(), line));
55  						ic = reader.read();
56  					}
57  					else if(ic == c)
58  					{
59  						ic = reader.read();
60  						if(ic == '=')
61  						{
62  							tokenEntries.add(new TokenEntry(String.valueOf(c)+String.valueOf(c)+"=", sourceCode.getFileName(), line));
63  							ic = reader.read();
64  						}
65  						else
66  						{
67  							tokenEntries.add(new TokenEntry(String.valueOf(c)+String.valueOf(c), sourceCode.getFileName(), line));
68  						}
69  					}
70  					else
71  					{
72  						tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
73  					}
74  					break;
75  	
76  					// = == & &= && | |= || + += ++ - -= --
77  				case '=':
78  				case '&':
79  				case '|':
80  				case '+':
81  				case '-':
82  					ic = reader.read();
83  					if(ic == '=' || ic == c)
84  					{
85  						tokenEntries.add(new TokenEntry(String.valueOf(c)+String.valueOf((char)ic), sourceCode.getFileName(), line));
86  						ic = reader.read();
87  					}
88  					else
89  					{
90  						tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
91  					}
92  					break;
93  				
94  					// ! != * *= % %= ^ ^= ~ ~=
95  				case '!':
96  				case '*':
97  				case '%':
98  				case '^':
99  				case '~':
100 					ic = reader.read();
101 					if(ic == '=')
102 					{
103 						tokenEntries.add(new TokenEntry(String.valueOf(c)+"=", sourceCode.getFileName(), line));
104 						ic = reader.read();
105 					}
106 					else
107 					{
108 						tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
109 					}
110 					break;
111 					
112 					// strings & chars
113 				case '"':
114 				case '\'':
115 					b = new StringBuilder();
116 					b.append(c);
117 					while((ic = reader.read()) != c)
118 					{
119 						if(ic == -1)
120 							break;
121 						b.append((char)ic);
122 						if(ic == '\\') {
123 							int next = reader.read();
124 							if (next != -1) b.append((char)next);
125 						}
126 					}
127 					if (ic != -1) b.append((char)ic);
128 					tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
129 					ic = reader.read();
130 					break;
131 					
132 					// / /= /*...*/ //...
133 				case '/':
134 					switch(c = (char)(ic = reader.read()))
135 					{
136 					case '*':
137 						int state = 1;
138 						b = new StringBuilder();
139 						b.append("/*");
140 						
141 						while((ic = reader.read()) != -1)
142 						{
143 							c = (char)ic;
144 							b.append(c);
145 							
146 							if(state==1)
147 							{
148 								if(c == '*')
149 									state = 2;
150 							}
151 							else
152 							{
153 								if(c == '/') {
154 									ic = reader.read();
155 									break;
156 								} else if(c != '*') {
157 									state = 1;
158 								}
159 							}
160 						}
161 						// ignore the /* comment
162 						//tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
163 						break;
164 						
165 					case '/':
166 						b = new StringBuilder();
167 						b.append("//");
168 						while((ic = reader.read()) != '\n')
169 						{
170 							if(ic==-1)
171 								break;
172 							b.append((char)ic);
173 						}
174 						// ignore the // comment
175 						//tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
176 						break;
177 						
178 					case '=':
179 						tokenEntries.add(new TokenEntry("/=", sourceCode.getFileName(), line));
180 						ic = reader.read();
181 						break;
182 						
183 					default:
184 						tokenEntries.add(new TokenEntry("/", sourceCode.getFileName(), line));
185 						break;
186 					}
187 					break;
188 					
189 					
190 					
191 				default:
192 					// [a-zA-Z_][a-zA-Z_0-9]*
193 					if(Character.isJavaIdentifierStart(c))
194 					{
195 						b = new StringBuilder();
196 						do
197 						{
198 							b.append(c);
199 							c = (char)(ic = reader.read());
200 						} while(Character.isJavaIdentifierPart(c));
201 						tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
202 					}
203 					// numbers
204 					else if(Character.isDigit(c) || c == '.')
205 					{
206 						b = new StringBuilder();
207 						do
208 						{
209 							b.append(c);
210 							if(c == 'e' || c == 'E')
211 							{
212 								c = (char)(ic = reader.read());
213 								if("1234567890-".indexOf(c)==-1)
214 									break;
215 								b.append(c);
216 							}
217 							c = (char)(ic = reader.read());
218 						} while("1234567890.iIlLfFdDsSuUeExX".indexOf(c)!=-1);
219 						
220 						tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));	
221 					}
222 					// anything else
223 					else
224 					{
225 						tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
226 						ic = reader.read();
227 						break;
228 					}
229 				}
230 			}
231 		} catch (IOException e) {
232 			e.printStackTrace();
233 		} finally {
234 		    IOUtils.closeQuietly(reader);
235 		    tokenEntries.add(TokenEntry.getEOF());
236 		}
237     }
238 }