OSDN Git Service

enable command notify on x64
[yamy/yamy.git] / parser.cpp
1 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\r
2 // parser.cpp\r
3 \r
4 \r
5 #include "misc.h"\r
6 \r
7 #include "errormessage.h"\r
8 #include "parser.h"\r
9 #include <cassert>\r
10 \r
11 \r
12 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\r
13 // Token\r
14 \r
15 \r
16 Token::Token(const Token &i_token)\r
17                 : m_type(i_token.m_type),\r
18                 m_isValueQuoted(i_token.m_isValueQuoted),\r
19                 m_numericValue(i_token.m_numericValue),\r
20                 m_stringValue(i_token.m_stringValue),\r
21                 m_data(i_token.m_data)\r
22 {\r
23 }\r
24 \r
25 Token::Token(int i_value, const tstringi &i_display)\r
26                 : m_type(Type_number),\r
27                 m_isValueQuoted(false),\r
28                 m_numericValue(i_value),\r
29                 m_stringValue(i_display),\r
30                 m_data(NULL)\r
31 {\r
32 }\r
33 \r
34 Token::Token(const tstringi &i_value, bool i_isValueQuoted, bool i_isRegexp)\r
35                 : m_type(i_isRegexp ? Type_regexp : Type_string),\r
36                 m_isValueQuoted(i_isValueQuoted),\r
37                 m_numericValue(0),\r
38                 m_stringValue(i_value),\r
39                 m_data(NULL)\r
40 {\r
41 }\r
42 \r
43 Token::Token(Type i_m_type)\r
44                 : m_type(i_m_type),\r
45                 m_isValueQuoted(false),\r
46                 m_numericValue(0),\r
47                 m_stringValue(_T("")),\r
48                 m_data(NULL)\r
49 {\r
50         ASSERT(m_type == Type_openParen || m_type == Type_closeParen ||\r
51                    m_type == Type_comma);\r
52 }\r
53 \r
54 // get numeric value\r
55 int Token::getNumber() const\r
56 {\r
57         if (m_type == Type_number)\r
58                 return m_numericValue;\r
59         if (m_stringValue.empty())\r
60                 return 0;\r
61         else\r
62                 throw ErrorMessage() << _T("`") << *this << _T("' is not a Type_number.");\r
63 }\r
64 \r
65 // get string value\r
66 tstringi Token::getString() const\r
67 {\r
68         if (m_type == Type_string)\r
69                 return m_stringValue;\r
70         throw ErrorMessage() << _T("`") << *this << _T("' is not a string.");\r
71 }\r
72 \r
73 // get regexp value\r
74 tstringi Token::getRegexp() const\r
75 {\r
76         if (m_type == Type_regexp)\r
77                 return m_stringValue;\r
78         throw ErrorMessage() << _T("`") << *this << _T("' is not a regexp.");\r
79 }\r
80 \r
81 // case insensitive equal\r
82 bool Token::operator==(const _TCHAR *i_str) const\r
83 {\r
84         if (m_type == Type_string)\r
85                 return m_stringValue == i_str;\r
86         return false;\r
87 }\r
88 \r
89 // paren equal\r
90 bool Token::operator==(const _TCHAR i_c) const\r
91 {\r
92         if (i_c == _T('(')) return m_type == Type_openParen;\r
93         if (i_c == _T(')')) return m_type == Type_openParen;\r
94         return false;\r
95 }\r
96 \r
97 // add string\r
98 void Token::add(const tstringi &i_str)\r
99 {\r
100         m_stringValue += i_str;\r
101 }\r
102 \r
103 // stream output\r
104 tostream &operator<<(tostream &i_ost, const Token &i_token)\r
105 {\r
106         switch (i_token.m_type) {\r
107         case Token::Type_string:\r
108                 i_ost << i_token.m_stringValue;\r
109                 break;\r
110         case Token::Type_number:\r
111                 i_ost << i_token.m_stringValue;\r
112                 break;\r
113         case Token::Type_regexp:\r
114                 i_ost << i_token.m_stringValue;\r
115                 break;\r
116         case Token::Type_openParen:\r
117                 i_ost << _T("(");\r
118                 break;\r
119         case Token::Type_closeParen:\r
120                 i_ost << _T(")");\r
121                 break;\r
122         case Token::Type_comma:\r
123                 i_ost << _T(", ");\r
124                 break;\r
125         }\r
126         return i_ost;\r
127 }\r
128 \r
129 \r
130 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\r
131 // Parser\r
132 \r
133 \r
134 Parser::Parser(const _TCHAR *i_str, size_t i_length)\r
135                 : m_lineNumber(1),\r
136                 m_prefixes(NULL),\r
137                 m_internalLineNumber(1),\r
138                 m_ptr(i_str),\r
139                 m_end(i_str + i_length)\r
140 {\r
141 }\r
142 \r
143 // set string that may be prefix of a token.\r
144 // prefix_ is not copied, so it must be preserved after setPrefix()\r
145 void Parser::setPrefixes(const Prefixes *i_prefixes)\r
146 {\r
147         m_prefixes = i_prefixes;\r
148 }\r
149 \r
150 // get a line\r
151 bool Parser::getLine(tstringi *o_line)\r
152 {\r
153         o_line->resize(0);\r
154 \r
155         if (m_ptr == m_end)\r
156                 return false;\r
157 \r
158         const _TCHAR *begin = m_ptr;\r
159         const _TCHAR *end = m_end;\r
160 \r
161         // lines are separated by: "\r\n", "\n", "\x2028" (Unicode Line Separator)\r
162         while (m_ptr != m_end)\r
163                 switch (*m_ptr) {\r
164                 case _T('\n'):\r
165 #ifdef UNICODE\r
166                 case 0x2028:\r
167                         //case _T('\x2028'):    //  (U+2028)\r
168 #endif\r
169                         end = m_ptr;\r
170                         ++ m_ptr;\r
171                         goto got_line_end;\r
172                 case _T('\r'):\r
173                         if (m_ptr + 1 != m_end && m_ptr[1] == _T('\n')) {\r
174                                 end = m_ptr;\r
175                                 m_ptr += 2;\r
176                                 goto got_line_end;\r
177                         }\r
178                         // fall through\r
179                 default:\r
180                         ++ m_ptr;\r
181                         break;\r
182                 }\r
183 got_line_end:\r
184         ++ m_internalLineNumber;\r
185         // o_line->assign(begin, end);          // why bcc cannot link this ?\r
186         o_line->assign(begin, end - begin);             // workarond for bcc\r
187         return true;\r
188 }\r
189 \r
190 // symbol test\r
191 static bool isSymbolChar(_TCHAR i_c)\r
192 {\r
193         if (i_c == _T('\0'))\r
194                 return false;\r
195         if (_istlead(i_c) ||\r
196                         _istalpha(i_c) ||\r
197                         _istdigit(i_c) ||\r
198                         _istlead(i_c))\r
199                 return true;\r
200 \r
201 #ifdef UNICODE\r
202         if (0x80 <= i_c && _istgraph(i_c))\r
203                 return true;\r
204 #endif // UNICODE\r
205 \r
206         if (_istpunct(i_c))\r
207                 return !!_tcschr(_T("-+/?_\\"), i_c);\r
208 \r
209 #ifdef UNICODE\r
210         // check arrows\r
211         if (_tcschr(_T("\x2190\x2191\x2192\x2193"), i_c)) {\r
212                 return true;\r
213         }\r
214 #endif // UNICODE\r
215         return _istgraph(i_c);\r
216 }\r
217 \r
218 \r
219 // get a parsed line.\r
220 // if no more lines exist, returns false\r
221 bool Parser::getLine(std::vector<Token> *o_tokens)\r
222 {\r
223         o_tokens->clear();\r
224         m_lineNumber = m_internalLineNumber;\r
225 \r
226         tstringi line;\r
227         bool isTokenExist = false;\r
228 continue_getLineLoop:\r
229         while (getLine(&line)) {\r
230                 const _TCHAR *t = line.c_str();\r
231 \r
232 continue_getTokenLoop:\r
233                 while (true) {\r
234                         // skip white space\r
235                         while (*t != _T('\0') && _istspace(*t))\r
236                                 t ++;\r
237                         if (*t == _T('\0') || *t == _T('#'))\r
238                                 goto break_getTokenLoop; // no more tokens exist\r
239                         if (*t == _T('\\') && *(t + 1) == _T('\0'))\r
240                                 goto continue_getLineLoop; // continue to next line\r
241 \r
242                         const _TCHAR *tokenStart = t;\r
243 \r
244                         // comma or empty token\r
245                         if (*t == _T(',')) {\r
246                                 if (!isTokenExist)\r
247                                         o_tokens->push_back(Token(_T(""), false));\r
248                                 isTokenExist = false;\r
249                                 o_tokens->push_back(Token(Token::Type_comma));\r
250                                 t ++;\r
251                                 goto continue_getTokenLoop;\r
252                         }\r
253 \r
254                         // paren\r
255                         if (*t == _T('(')) {\r
256                                 o_tokens->push_back(Token(Token::Type_openParen));\r
257                                 isTokenExist = false;\r
258                                 t ++;\r
259                                 goto continue_getTokenLoop;\r
260                         }\r
261                         if (*t == _T(')')) {\r
262                                 if (!isTokenExist)\r
263                                         o_tokens->push_back(Token(_T(""), false));\r
264                                 isTokenExist = true;\r
265                                 o_tokens->push_back(Token(Token::Type_closeParen));\r
266                                 t ++;\r
267                                 goto continue_getTokenLoop;\r
268                         }\r
269 \r
270                         isTokenExist = true;\r
271 \r
272                         // prefix\r
273                         if (m_prefixes)\r
274                                 for (size_t i = 0; i < m_prefixes->size(); i ++)\r
275                                         if (_tcsnicmp(tokenStart, m_prefixes->at(i).c_str(),\r
276                                                                   m_prefixes->at(i).size()) == 0) {\r
277                                                 o_tokens->push_back(Token(m_prefixes->at(i), false));\r
278                                                 t += m_prefixes->at(i).size();\r
279                                                 goto continue_getTokenLoop;\r
280                                         }\r
281 \r
282                         // quoted or regexp\r
283                         if (*t == _T('"') || *t == _T('\'') ||\r
284                                         *t == _T('/') || (*t == _T('\\') && *(t + 1) == _T('m') &&\r
285                                                                           *(t + 2) != _T('\0'))) {\r
286                                 bool isRegexp = !(*t == _T('"') || *t == _T('\''));\r
287                                 _TCHAR q[2] = { *t++, _T('\0') }; // quote character\r
288                                 if (q[0] == _T('\\')) {\r
289                                         t++;\r
290                                         q[0] = *t++;\r
291                                 }\r
292                                 tokenStart = t;\r
293 \r
294                                 while (*t != _T('\0') && *t != q[0]) {\r
295                                         if (*t == _T('\\') && *(t + 1))\r
296                                                 t ++;\r
297                                         if (_istlead(*t) && *(t + 1))\r
298                                                 t ++;\r
299                                         t ++;\r
300                                 }\r
301 \r
302                                 tstring str =\r
303                                         interpretMetaCharacters(tokenStart, t - tokenStart, q, isRegexp);\r
304 #ifdef _MBCS\r
305                                 if (isRegexp)\r
306                                         str = guardRegexpFromMbcs(str.c_str());\r
307 #endif\r
308                                 // concatinate continuous string\r
309                                 if (!isRegexp &&\r
310                                                 0 < o_tokens->size() && o_tokens->back().isString() &&\r
311                                                 o_tokens->back().isQuoted())\r
312                                         o_tokens->back().add(str);\r
313                                 else\r
314                                         o_tokens->push_back(Token(str, true, isRegexp));\r
315                                 if (*t != _T('\0'))\r
316                                         t ++;\r
317                                 goto continue_getTokenLoop;\r
318                         }\r
319 \r
320                         // not quoted\r
321                         {\r
322                                 while (isSymbolChar(*t)) {\r
323                                         if (*t == _T('\\'))\r
324                                                 if (*(t + 1))\r
325                                                         t ++;\r
326                                                 else\r
327                                                         break;\r
328                                         if (_istlead(*t) && *(t + 1))\r
329                                                 t ++;\r
330                                         t ++;\r
331                                 }\r
332                                 if (t == tokenStart) {\r
333                                         ErrorMessage e;\r
334                                         e << _T("invalid character ");\r
335 #ifdef UNICODE\r
336                                         e << _T("U+");\r
337                                         e << std::hex; // << std::setw(4) << std::setfill(_T('0'));\r
338                                         e << (int)(wchar_t)*t;\r
339 #else\r
340                                         e << _T("\\x");\r
341                                         e << std::hex; // << std::setw(2) << std::setfill(_T('0'));\r
342                                         e << (int)(u_char)*t;\r
343 #endif\r
344                                         e << std::dec;\r
345                                         if (_istprint(*t))\r
346                                                 e << _T("(") << *t << _T(")");\r
347                                         throw e;\r
348                                 }\r
349 \r
350                                 _TCHAR *numEnd = NULL;\r
351                                 long value = _tcstol(tokenStart, &numEnd, 0);\r
352                                 if (tokenStart == numEnd) {\r
353                                         tstring str = interpretMetaCharacters(tokenStart, t - tokenStart);\r
354                                         o_tokens->push_back(Token(str, false));\r
355                                 } else {\r
356                                         o_tokens->push_back(\r
357                                                 Token(value, tstringi(tokenStart, numEnd - tokenStart)));\r
358                                         t = numEnd;\r
359                                 }\r
360                                 goto continue_getTokenLoop;\r
361                         }\r
362                 }\r
363 break_getTokenLoop:\r
364                 if (0 < o_tokens->size())\r
365                         break;\r
366                 m_lineNumber = m_internalLineNumber;\r
367                 isTokenExist = false;\r
368         }\r
369 \r
370         return 0 < o_tokens->size();\r
371 }\r