OSDN Git Service

0eedbe78eb1d90632ad84c1013f141d6ff7d83a4
[radegast/radegast.git] / Radegast / Core / Types / StringTokenizer.cs
1 // 
2 // Radegast Metaverse Client
3 // Copyright (c) 2009-2013, Radegast Development Team
4 // All rights reserved.
5 // 
6 // Redistribution and use in source and binary forms, with or without
7 // modification, are permitted provided that the following conditions are met:
8 // 
9 //     * Redistributions of source code must retain the above copyright notice,
10 //       this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above copyright
12 //       notice, this list of conditions and the following disclaimer in the
13 //       documentation and/or other materials provided with the distribution.
14 //     * Neither the name of the application "Radegast", nor the names of its
15 //       contributors may be used to endorse or promote products derived from
16 //       this software without specific prior written permission.
17 // 
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 //
29 // $Id$
30 //
31 /********************************************************8
32  *      Author: Andrew Deren
33  *      Date: July, 2004
34  *      http://www.adersoftware.com
35  * 
36  *      StringTokenizer class. You can use this class in any way you want
37  * as long as this header remains in this file.
38  * 
39  **********************************************************/
40 using System;
41 using System.IO;
42
namespace Radegast
{
    /// <summary>
    /// Categories of tokens produced by <see cref="StringTokenizer"/>.
    /// </summary>
    public enum TokenKind
    {
        /// <summary>A single character that no other rule recognizes.</summary>
        Unknown,
        /// <summary>A run of letters and underscores.</summary>
        Word,
        /// <summary>A run of digits containing at most one '.'.</summary>
        Number,
        /// <summary>Text starting at a double quote; the token value includes the quotes.</summary>
        QuotedString,
        /// <summary>A run of spaces and tabs.</summary>
        WhiteSpace,
        /// <summary>A single character listed in SymbolChars, or a '/' that does not start a comment.</summary>
        Symbol,
        /// <summary>A // comment up to the end of the line, or a /* */ comment.</summary>
        Comment,
        /// <summary>A line break: CR, LF or CR LF.</summary>
        EOL,
        /// <summary>End of the input.</summary>
        EOF
    }

    /// <summary>
    /// Immutable description of one token: its kind, its text and the
    /// 1-based line and column assigned to it by the tokenizer.
    /// </summary>
    public class Token
    {
        readonly int line;
        readonly int column;
        readonly string value;
        readonly TokenKind kind;

        /// <summary>
        /// Creates a token.
        /// </summary>
        /// <param name="kind">Category of the token.</param>
        /// <param name="value">Text of the token.</param>
        /// <param name="line">Line associated with the token.</param>
        /// <param name="column">Column associated with the token.</param>
        public Token(TokenKind kind, string value, int line, int column)
        {
            this.kind = kind;
            this.value = value;
            this.line = line;
            this.column = column;
        }

        /// <summary>Column associated with the token.</summary>
        public int Column
        {
            get { return this.column; }
        }

        /// <summary>Category of the token.</summary>
        public TokenKind Kind
        {
            get { return this.kind; }
        }

        /// <summary>Line associated with the token.</summary>
        public int Line
        {
            get { return this.line; }
        }

        /// <summary>Text of the token.</summary>
        public string Value
        {
            get { return this.value; }
        }
    }

    /// <summary>
    /// StringTokenizer tokenizes a string (or the complete contents of a
    /// TextReader) into tokens. Call <see cref="Next"/> repeatedly until it
    /// returns a token of kind <see cref="TokenKind.EOF"/>.
    /// </summary>
    /// <remarks>
    /// The character (char)0 is used internally as the end-of-input marker,
    /// so a literal NUL character in the input is indistinguishable from the
    /// end of the input.
    /// </remarks>
    public class StringTokenizer
    {
        const char EOF = (char)0;

        int line;
        int column;
        int pos;        // position within data

        readonly string data;

        bool ignoreWhiteSpace;
        char[] symbolChars;

        // start of the token currently being read, captured by StartRead()
        int saveLine;
        int saveCol;
        int savePos;

        /// <summary>
        /// Creates a tokenizer over everything that can be read from <paramref name="reader"/>.
        /// The reader is read to its end immediately; it is not disposed.
        /// </summary>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        public StringTokenizer(TextReader reader)
        {
            if (reader == null)
                throw new ArgumentNullException("reader");

            data = reader.ReadToEnd();

            Reset();
        }

        /// <summary>
        /// Creates a tokenizer over the given string.
        /// </summary>
        /// <param name="data">Text to tokenize.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public StringTokenizer(string data)
        {
            if (data == null)
                throw new ArgumentNullException("data");

            this.data = data;

            Reset();
        }

        /// <summary>
        /// gets or sets which characters are part of TokenKind.Symbol.
        /// A null value is treated as "no symbol characters".
        /// </summary>
        public char[] SymbolChars
        {
            get { return this.symbolChars; }
            set { this.symbolChars = value; }
        }

        /// <summary>
        /// if set to true, white space characters will be ignored,
        /// but EOL and whitespace inside of string will still be tokenized
        /// </summary>
        public bool IgnoreWhiteSpace
        {
            get { return this.ignoreWhiteSpace; }
            set { this.ignoreWhiteSpace = value; }
        }

        /// <summary>
        /// Restores the default options and moves the read position to the
        /// start of the data (line 1, column 1).
        /// </summary>
        private void Reset()
        {
            this.ignoreWhiteSpace = false;
            this.symbolChars = new char[]{'=', '+', '-', '/', ',', '.', '*', '~', '!', '@', '#', '$', '%', '^', '&', '(', ')', '{', '}', '[', ']', ':', ';', '<', '>', '?', '|', '\\'};

            line = 1;
            column = 1;
            pos = 0;
        }

        /// <summary>
        /// Looks ahead without consuming anything.
        /// </summary>
        /// <param name="count">Offset from the current position (0 = current character).</param>
        /// <returns>The character at that offset, or the EOF marker when it lies past the end of the data.</returns>
        protected char LA(int count)
        {
            if (pos + count >= data.Length)
                return EOF;
            else
                return data[pos + count];
        }

        /// <summary>
        /// Returns the current character and advances position and column by one.
        /// Callers must make sure the position is not at the end of the data.
        /// </summary>
        protected char Consume()
        {
            char ret = data[pos];
            pos++;
            column++;

            return ret;
        }

        /// <summary>
        /// Creates a token with an explicit value located at the current line and column.
        /// </summary>
        protected Token CreateToken(TokenKind kind, string value)
        {
            return new Token(kind, value, line, column);
        }

        /// <summary>
        /// Creates a token whose value is the text consumed since the last
        /// StartRead() call, located where that read started.
        /// </summary>
        protected Token CreateToken(TokenKind kind)
        {
            string tokenData = data.Substring(savePos, pos - savePos);
            return new Token(kind, tokenData, saveLine, saveCol);
        }

        /// <summary>
        /// Reads the next token from the data.
        /// </summary>
        /// <returns>
        /// The next token. Once the end of the data is reached every call
        /// returns a token of kind <see cref="TokenKind.EOF"/>.
        /// </returns>
        public Token Next()
        {
            if (this.ignoreWhiteSpace)
            {
                // silently drop spaces and tabs in front of the next token
                while (LA(0) == ' ' || LA(0) == '\t')
                    Consume();
            }

            char ch = LA(0);
            switch (ch)
            {
                case EOF:
                    return CreateToken(TokenKind.EOF, string.Empty);

                case ' ':
                case '\t':
                    return ReadWhitespace();

                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    return ReadNumber();

                case '\r':
                case '\n':
                    StartRead();
                    ConsumeLineBreak();
                    return CreateToken(TokenKind.EOL);

                case '"':
                    return ReadString();

                case '/':
                    if (LA(1) == '/')
                        return ReadComment();
                    if (LA(1) == '*')
                        return ReadStarComment();
                    // a lone '/' is always reported as a symbol
                    return ReadSingleChar(TokenKind.Symbol);

                default:
                    if (Char.IsLetter(ch) || ch == '_')
                        return ReadWord();
                    return ReadSingleChar(IsSymbol(ch) ? TokenKind.Symbol : TokenKind.Unknown);
            }
        }

        /// <summary>
        /// save read point positions so that CreateToken can use those
        /// </summary>
        private void StartRead()
        {
            saveLine = line;
            saveCol = column;
            savePos = pos;
        }

        /// <summary>
        /// Consumes exactly one character and returns it as a token of the given kind.
        /// </summary>
        private Token ReadSingleChar(TokenKind kind)
        {
            StartRead();
            Consume();
            return CreateToken(kind);
        }

        /// <summary>
        /// Consumes one line break (CR, LF or CR LF) at the current position
        /// and moves the line/column counters to the start of the next line.
        /// </summary>
        private void ConsumeLineBreak()
        {
            char first = Consume();
            if (first == '\r' && LA(0) == '\n')
                Consume();      // on DOS/Windows we have \r\n for new line

            line++;
            column = 1;
        }

        /// <summary>
        /// reads all whitespace characters (does not include newline)
        /// </summary>
        /// <returns>A WhiteSpace token covering the run of spaces and tabs.</returns>
        protected Token ReadWhitespace()
        {
            StartRead();

            Consume(); // consume the looked-ahead whitespace char

            while (LA(0) == '\t' || LA(0) == ' ')
                Consume();

            return CreateToken(TokenKind.WhiteSpace);
        }

        /// <summary>
        /// reads number. Number is: DIGIT+ ("." DIGIT*)?
        /// </summary>
        /// <returns>A Number token.</returns>
        protected Token ReadNumber()
        {
            StartRead();

            bool hadDot = false;

            Consume(); // read first digit

            while (true)
            {
                char ch = LA(0);
                if (Char.IsDigit(ch))
                {
                    Consume();
                }
                else if (ch == '.' && !hadDot)
                {
                    // only the first '.' belongs to the number
                    hadDot = true;
                    Consume();
                }
                else
                {
                    break;
                }
            }

            return CreateToken(TokenKind.Number);
        }

        /// <summary>
        /// reads word. Word contains any alpha character or _
        /// </summary>
        /// <returns>A Word token.</returns>
        protected Token ReadWord()
        {
            StartRead();

            Consume(); // consume first character of the word

            while (Char.IsLetter(LA(0)) || LA(0) == '_')
                Consume();

            return CreateToken(TokenKind.Word);
        }

        /// <summary>
        /// Reads the rest of the line in a // comment. The terminating line
        /// break is not part of the token.
        /// </summary>
        /// <returns>A Comment token.</returns>
        protected Token ReadComment()
        {
            StartRead();

            Consume(); // consume first character of the comment

            while (true)
            {
                char ch = LA(0);
                if (ch == EOF || ch == '\n' || ch == '\r')
                    break;

                Consume();
            }

            return CreateToken(TokenKind.Comment);
        }

        /// <summary>
        /// Reads a c-style comment /* */. An unterminated comment extends to
        /// the end of the data. Line breaks inside the comment advance the
        /// line counter so that later tokens report correct positions.
        /// </summary>
        /// <returns>A Comment token.</returns>
        protected Token ReadStarComment()
        {
            StartRead();

            Consume(); // consume the opening '/'

            // Consume the '*' of the opener as well; otherwise it could pair
            // with a directly following '/' and "/*/" would look like a
            // complete comment.
            if (LA(0) == '*')
                Consume();

            while (true)
            {
                char ch = LA(0);
                if (ch == EOF)
                {
                    break;
                }
                else if (ch == '*' && LA(1) == '/')
                {
                    Consume();
                    Consume();
                    break;
                }
                else if (ch == '\r' || ch == '\n')
                {
                    ConsumeLineBreak();
                }
                else
                {
                    Consume();
                }
            }

            return CreateToken(TokenKind.Comment);
        }

        /// <summary>
        /// reads all characters until next " is found.
        /// If "" (2 quotes) are found, then they are consumed as
        /// part of the string
        /// </summary>
        /// <returns>A QuotedString token; its value includes the quote characters.</returns>
        protected Token ReadString()
        {
            StartRead();

            Consume(); // read "

            while (true)
            {
                char ch = LA(0);
                if (ch == EOF)
                {
                    break;  // unterminated string ends at the end of the data
                }
                else if (ch == '\r' || ch == '\n')
                {
                    ConsumeLineBreak(); // new line in quoted string
                }
                else if (ch == '"')
                {
                    Consume();
                    if (LA(0) != '"')
                        break;  // done reading, this quote is not an escaped one

                    Consume(); // consume second ", because first was just an escape
                }
                else
                {
                    Consume();
                }
            }

            return CreateToken(TokenKind.QuotedString);
        }

        /// <summary>
        /// checks whether c is a symbol character.
        /// </summary>
        /// <param name="c">Character to test.</param>
        /// <returns>true when c is listed in SymbolChars; false otherwise, including when SymbolChars is null.</returns>
        protected bool IsSymbol(char c)
        {
            return symbolChars != null && Array.IndexOf(symbolChars, c) >= 0;
        }
    }
}