1:
2: // Compiler implementation of the D programming language
3: // Copyright (c) 1999-2011 by Digital Mars
4: // All Rights Reserved
5: // written by Walter Bright
6: // http://www.digitalmars.com
7: // License for redistribution is by either the Artistic License
8: // in artistic.txt, or the GNU General Public License in gnu.txt.
9: // See the included readme.txt for details.
10:
11: /* Lexical Analyzer */
12:
13: #include <stdio.h>
14: #include <string.h>
15: #include <ctype.h>
16: #include <stdarg.h>
17: #include <errno.h>
18: #include <wchar.h>
19: #include <stdlib.h>
20: static char __file__[] = __FILE__; /* for tassert.h */
21: #include "tassert.h"
22: #include <time.h> // for time() and ctime()
23:
24: #include "rmem.h"
25:
26: #include "stringtable.h"
27:
28: #include "lexer.h"
29: #include "utf.h"
30: #include "identifier.h"
31: #include "id.h"
32: #include "module.h"
33:
34: #if _WIN32 && __DMC__
35: // from \dm\src\include\setlocal.h
36: extern "C" char * __cdecl __locale_decpoint;
37: #endif
38:
39: extern int HtmlNamedEntity(unsigned char *p, int length);
40:
41: #define LS 0x2028 // UTF line separator
42: #define PS 0x2029 // UTF paragraph separator
43:
44: void unittest_lexer();
45:
/********************************************
 * Private character classification table.
 *
 * cmtable[] holds one flag byte per 8-bit character value. The flags are
 * set by cmtable_init() and read through isoctal(), ishex() and isidchar().
 */

static unsigned char cmtable[256];

const int CMoctal  = 1 << 0;    // character is an octal digit
const int CMhex    = 1 << 1;    // character is a hexadecimal digit
const int CMidchar = 1 << 2;    // character may appear in an identifier

/* Each predicate returns the matching flag bit (non-zero) or 0. */
inline unsigned char isoctal (unsigned char c)
{
    return static_cast<unsigned char>(cmtable[c] & CMoctal);
}

inline unsigned char ishex   (unsigned char c)
{
    return static_cast<unsigned char>(cmtable[c] & CMhex);
}

inline unsigned char isidchar(unsigned char c)
{
    return static_cast<unsigned char>(cmtable[c] & CMidchar);
}

/* Fill in cmtable[]. Flags are OR-ed in, so the table must start zeroed
 * (it does, having static storage duration).
 */
static void cmtable_init()
{
    const unsigned entries = sizeof(cmtable) / sizeof(cmtable[0]);

    for (unsigned ch = 0; ch != entries; ++ch)
    {
        unsigned char flags = 0;

        if (ch >= '0' && ch <= '7')
            flags |= CMoctal;

        if (isdigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'))
            flags |= CMhex;

        if (isalnum(ch) || ch == '_')
            flags |= CMidchar;

        cmtable[ch] |= flags;
    }
}
72:
73:
74: /************************* Token **********************************************/
75:
76: const char *Token::tochars[TOKMAX];
77:
78: void *Token::operator new(size_t size)
79: { Token *t;
80:
81: if (Lexer::freelist)
82: {
83: t = Lexer::freelist;
84: Lexer::freelist = t->next;
85: return t;
86: }
87:
88: return ::operator new(size);
89: }
90:
#ifdef DEBUG
/* Debug aid: write this token's text and a newline to stdmsg. */
void Token::print()
{
    const char *text = toChars();
    fprintf(stdmsg, "%s\n", text);
}
#endif
97:
98: const char *Token::toChars()
99: { const char *p;
100: static char buffer[3 + 3 * sizeof(value) + 1];
101:
102: p = buffer;
103: switch (value)
104: {
105: case TOKint32v:
106: #if IN_GCC
107: sprintf(buffer,"%d",(d_int32)int64value);
108: #else
109: sprintf(buffer,"%d",int32value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
110: #endif
111: break;
112:
113: case TOKuns32v:
114: case TOKcharv:
115: case TOKwcharv:
116: case TOKdcharv:
117: #if IN_GCC
118: sprintf(buffer,"%uU",(d_uns32)uns64value);
119: #else
120: sprintf(buffer,"%uU",uns32value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
121: #endif
122: break;
123:
124: case TOKint64v:
125: sprintf(buffer,"%jdL",int64value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
warning C6271: Extra argument passed to 'sprintf': parameter '3' is not used by the format string
126: break;
127:
128: case TOKuns64v:
129: sprintf(buffer,"%juUL",uns64value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
warning C6271: Extra argument passed to 'sprintf': parameter '3' is not used by the format string
130: break;
131:
132: #if IN_GCC
133: case TOKfloat32v:
134: case TOKfloat64v:
135: case TOKfloat80v:
136: float80value.format(buffer, sizeof(buffer));
137: break;
138: case TOKimaginary32v:
139: case TOKimaginary64v:
140: case TOKimaginary80v:
141: float80value.format(buffer, sizeof(buffer));
142: // %% buffer
143: strcat(buffer, "i");
144: break;
145: #else
146: case TOKfloat32v:
147: sprintf(buffer,"%Lgf", float80value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
148: break;
149:
150: case TOKfloat64v:
151: sprintf(buffer,"%Lg", float80value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
152: break;
153:
154: case TOKfloat80v:
155: sprintf(buffer,"%LgL", float80value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
156: break;
157:
158: case TOKimaginary32v:
159: sprintf(buffer,"%Lgfi", float80value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
160: break;
161:
162: case TOKimaginary64v:
163: sprintf(buffer,"%Lgi", float80value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
164: break;
165:
166: case TOKimaginary80v:
167: sprintf(buffer,"%LgLi", float80value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
168: break;
169: #endif
170:
171: case TOKstring:
172: #if CSTRINGS
173: p = string;
174: #else
175: { OutBuffer buf;
176:
177: buf.writeByte('"');
178: for (size_t i = 0; i < len; )
179: { unsigned c;
180:
181: utf_decodeChar((unsigned char *)ustring, len, &i, &c);
182: switch (c)
183: {
184: case 0:
185: break;
186:
187: case '"':
188: case '\\':
189: buf.writeByte('\\');
190: default:
191: if (isprint(c))
192: buf.writeByte(c);
193: else if (c <= 0x7F)
194: buf.printf("\\x%02x", c);
195: else if (c <= 0xFFFF)
196: buf.printf("\\u%04x", c);
197: else
198: buf.printf("\\U%08x", c);
199: continue;
200: }
201: break;
202: }
203: buf.writeByte('"');
204: if (postfix)
205: buf.writeByte('"');
206: buf.writeByte(0);
207: p = (char *)buf.extractData();
208: }
209: #endif
210: break;
211:
212: case TOKidentifier:
213: case TOKenum:
214: case TOKstruct:
215: case TOKimport:
216: case BASIC_TYPES:
217: p = ident->toChars();
218: break;
219:
220: default:
221: p = toChars(value);
222: break;
223: }
224: return p;
225: }
226:
227: const char *Token::toChars(enum TOK value)
228: { const char *p;
229: static char buffer[3 + 3 * sizeof(value) + 1];
230:
231: p = tochars[value];
232: if (!p)
233: { sprintf(buffer,"TOK%d",value);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
234: p = buffer;
235: }
236: return p;
237: }
238:
239: /*************************** Lexer ********************************************/
240:
241: Token *Lexer::freelist = NULL;
242: StringTable Lexer::stringtable;
243: OutBuffer Lexer::stringbuffer;
244:
245: Lexer::Lexer(Module *mod,
246: unsigned char *base, unsigned begoffset, unsigned endoffset,
247: int doDocComment, int commentToken)
248: : loc(mod, 1)
249: {
250: //printf("Lexer::Lexer(%p,%d)\n",base,length);
251: //printf("lexer.mod = %p, %p\n", mod, this->loc.mod);
252: memset(&token,0,sizeof(token));
253: this->base = base;
254: this->end = base + endoffset;
255: p = base + begoffset;
256: this->mod = mod;
257: this->doDocComment = doDocComment;
258: this->anyToken = 0;
259: this->commentToken = commentToken;
260: //initKeywords();
261:
262: /* If first line starts with '#!', ignore the line
263: */
264:
265: if (p[0] == '#' && p[1] =='!')
266: {
267: p += 2;
268: while (1)
269: { unsigned char c = *p;
270: switch (c)
271: {
272: case '\n':
273: p++;
274: break;
275:
276: case '\r':
277: p++;
278: if (*p == '\n')
279: p++;
280: break;
281:
282: case 0:
283: case 0x1A:
284: break;
285:
286: default:
287: if (c & 0x80)
288: { unsigned u = decodeUTF();
289: if (u == PS || u == LS)
290: break;
291: }
292: p++;
293: continue;
294: }
295: break;
296: }
297: loc.linnum = 2;
298: }
299: }
300:
301:
302: void Lexer::error(const char *format, ...)
303: {
304: if (mod && !global.gag)
305: {
306: char *p = loc.toChars();
307: if (*p)
308: fprintf(stdmsg, "%s: ", p);
309: mem.free(p);
310:
311: va_list ap;
312: va_start(ap, format);
313: vfprintf(stdmsg, format, ap);
314: va_end(ap);
315:
316: fprintf(stdmsg, "\n");
317: fflush(stdmsg);
318:
319: if (global.errors >= 20) // moderate blizzard of cascading messages
320: fatal();
321: }
322: global.errors++;
323: }
324:
325: void Lexer::error(Loc loc, const char *format, ...)
326: {
327: if (mod && !global.gag)
328: {
329: char *p = loc.toChars();
330: if (*p)
331: fprintf(stdmsg, "%s: ", p);
332: mem.free(p);
333:
334: va_list ap;
335: va_start(ap, format);
336: vfprintf(stdmsg, format, ap);
337: va_end(ap);
338:
339: fprintf(stdmsg, "\n");
340: fflush(stdmsg);
341:
342: if (global.errors >= 20) // moderate blizzard of cascading messages
343: fatal();
344: }
345: global.errors++;
346: }
347:
348: TOK Lexer::nextToken()
349: { Token *t;
350:
351: if (token.next)
352: {
353: t = token.next;
354: memcpy(&token,t,sizeof(Token));
355: t->next = freelist;
356: freelist = t;
357: }
358: else
359: {
360: scan(&token);
361: }
362: //token.print();
363: return token.value;
364: }
365:
366: Token *Lexer::peek(Token *ct)
367: { Token *t;
368:
369: if (ct->next)
370: t = ct->next;
371: else
372: {
373: t = new Token();
374: scan(t);
375: t->next = NULL;
376: ct->next = t;
377: }
378: return t;
379: }
380:
381: /***********************
382: * Look ahead at next token's value.
383: */
384:
385: TOK Lexer::peekNext()
386: {
387: return peek(&token)->value;
388: }
389:
390: /***********************
391: * Look 2 tokens ahead at value.
392: */
393:
394: TOK Lexer::peekNext2()
395: {
396: Token *t = peek(&token);
397: return peek(t)->value;
398: }
399:
400: /*********************************
401: * tk is on the opening (.
402: * Look ahead and return token that is past the closing ).
403: */
404:
405: Token *Lexer::peekPastParen(Token *tk)
406: {
407: //printf("peekPastParen()\n");
408: int parens = 1;
409: int curlynest = 0;
410: while (1)
411: {
412: tk = peek(tk);
413: //tk->print();
414: switch (tk->value)
415: {
416: case TOKlparen:
417: parens++;
418: continue;
419:
420: case TOKrparen:
421: --parens;
422: if (parens)
423: continue;
424: tk = peek(tk);
425: break;
426:
427: case TOKlcurly:
428: curlynest++;
429: continue;
430:
431: case TOKrcurly:
432: if (--curlynest >= 0)
433: continue;
434: break;
435:
436: case TOKsemicolon:
437: if (curlynest)
438: continue;
439: break;
440:
441: case TOKeof:
442: break;
443:
444: default:
445: continue;
446: }
447: return tk;
448: }
449: }
450:
451: /**********************************
452: * Determine if string is a valid Identifier.
453: * Placed here because of commonality with Lexer functionality.
454: * Returns:
455: * 0 invalid
456: */
457:
458: int Lexer::isValidIdentifier(char *p)
459: {
460: size_t len;
461: size_t idx;
462:
463: if (!p || !*p)
464: goto Linvalid;
465:
466: if (*p >= '0' && *p <= '9') // beware of isdigit() on signed chars
467: goto Linvalid;
468:
469: len = strlen(p);
470: idx = 0;
471: while (p[idx])
472: { dchar_t dc;
473:
474: const char *q = utf_decodeChar((unsigned char *)p, len, &idx, &dc);
475: if (q)
476: goto Linvalid;
477:
478: if (!((dc >= 0x80 && isUniAlpha(dc)) || isalnum(dc) || dc == '_'))
479: goto Linvalid;
480: }
481: return 1;
482:
483: Linvalid:
484: return 0;
485: }
486:
487: /****************************
488: * Turn next token in buffer into a token.
489: */
490:
491: void Lexer::scan(Token *t)
492: {
493: unsigned lastLine = loc.linnum;
494: unsigned linnum;
495:
496: t->blockComment = NULL;
497: t->lineComment = NULL;
498: while (1)
499: {
500: t->ptr = p;
501: //printf("p = %p, *p = '%c'\n",p,*p);
502: switch (*p)
503: {
504: case 0:
505: case 0x1A:
506: t->value = TOKeof; // end of file
507: return;
508:
509: case ' ':
510: case '\t':
511: case '\v':
512: case '\f':
513: p++;
514: continue; // skip white space
515:
516: case '\r':
517: p++;
518: if (*p != '\n') // if CR stands by itself
519: loc.linnum++;
520: continue; // skip white space
521:
522: case '\n':
523: p++;
524: loc.linnum++;
525: continue; // skip white space
526:
527: case '0': case '1': case '2': case '3': case '4':
528: case '5': case '6': case '7': case '8': case '9':
529: t->value = number(t);
530: return;
531:
532: #if CSTRINGS
533: case '\'':
534: t->value = charConstant(t, 0);
535: return;
536:
537: case '"':
538: t->value = stringConstant(t,0);
539: return;
540:
541: case 'l':
542: case 'L':
543: if (p[1] == '\'')
544: {
545: p++;
546: t->value = charConstant(t, 1);
547: return;
548: }
549: else if (p[1] == '"')
550: {
551: p++;
552: t->value = stringConstant(t, 1);
553: return;
554: }
555: #else
556: case '\'':
557: t->value = charConstant(t,0);
558: return;
559:
560: case 'r':
561: if (p[1] != '"')
562: goto case_ident;
563: p++;
564: case '`':
565: t->value = wysiwygStringConstant(t, *p);
566: return;
567:
568: case 'x':
569: if (p[1] != '"')
570: goto case_ident;
571: p++;
572: t->value = hexStringConstant(t);
573: return;
574:
575: #if DMDV2
576: case 'q':
577: if (p[1] == '"')
578: {
579: p++;
580: t->value = delimitedStringConstant(t);
581: return;
582: }
583: else if (p[1] == '{')
584: {
585: p++;
586: t->value = tokenStringConstant(t);
587: return;
588: }
589: else
590: goto case_ident;
591: #endif
592:
593: case '"':
594: t->value = escapeStringConstant(t,0);
595: return;
596:
597: #if ! TEXTUAL_ASSEMBLY_OUT
598: case '\\': // escaped string literal
599: { unsigned c;
600: unsigned char *pstart = p;
601:
602: stringbuffer.reset();
603: do
604: {
605: p++;
606: switch (*p)
607: {
608: case 'u':
609: case 'U':
610: case '&':
611: c = escapeSequence();
612: stringbuffer.writeUTF8(c);
613: break;
614:
615: default:
616: c = escapeSequence();
617: stringbuffer.writeByte(c);
618: break;
619: }
620: } while (*p == '\\');
621: t->len = stringbuffer.offset;
622: stringbuffer.writeByte(0);
623: t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
624: memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
625: t->postfix = 0;
626: t->value = TOKstring;
627: #if DMDV2
628: if (!global.params.useDeprecated)
629: error("Escape String literal %.*s is deprecated, use double quoted string literal \"%.*s\" instead", p - pstart, pstart, p - pstart, pstart);
630: #endif
631: return;
632: }
633: #endif
634:
635: case 'l':
636: case 'L':
637: #endif
638: case 'a': case 'b': case 'c': case 'd': case 'e':
639: case 'f': case 'g': case 'h': case 'i': case 'j':
640: case 'k': case 'm': case 'n': case 'o':
641: #if DMDV2
642: case 'p': /*case 'q': case 'r':*/ case 's': case 't':
643: #else
644: case 'p': case 'q': /*case 'r':*/ case 's': case 't':
645: #endif
646: case 'u': case 'v': case 'w': /*case 'x':*/ case 'y':
647: case 'z':
648: case 'A': case 'B': case 'C': case 'D': case 'E':
649: case 'F': case 'G': case 'H': case 'I': case 'J':
650: case 'K': case 'M': case 'N': case 'O':
651: case 'P': case 'Q': case 'R': case 'S': case 'T':
652: case 'U': case 'V': case 'W': case 'X': case 'Y':
653: case 'Z':
654: case '_':
655: case_ident:
656: { unsigned char c;
657:
658: while (1)
659: {
660: c = *++p;
661: if (isidchar(c))
662: continue;
663: else if (c & 0x80)
664: { unsigned char *s = p;
665: unsigned u = decodeUTF();
666: if (isUniAlpha(u))
667: continue;
668: error("char 0x%04x not allowed in identifier", u);
669: p = s;
670: }
671: break;
672: }
673:
674: StringValue *sv = stringtable.update((char *)t->ptr, p - t->ptr);
675: Identifier *id = (Identifier *) sv->ptrvalue;
676: if (!id)
677: { id = new Identifier(sv->lstring.string,TOKidentifier);
678: sv->ptrvalue = id;
679: }
680: t->ident = id;
681: t->value = (enum TOK) id->value;
682: anyToken = 1;
683: if (*t->ptr == '_') // if special identifier token
684: {
685: static char date[11+1];
686: static char time[8+1];
687: static char timestamp[24+1];
688:
689: if (!date[0]) // lazy evaluation
690: { time_t t;
warning C6246: Local declaration of 't' hides declaration of the same name in outer scope. For additional information, see previous declaration at line '491' of 'c:\projects\extern\d\dmd\src\lexer.c': Lines: 491
691: char *p;
692:
693: ::time(&t);
694: p = ctime(&t);
warning C4996: 'ctime': This function or variable may be unsafe. Consider using ctime_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\time.inl(86) : see declaration of 'ctime'
695: assert(p);
696: sprintf(date, "%.6s %.4s", p + 4, p + 20);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
697: sprintf(time, "%.8s", p + 11);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
698: sprintf(timestamp, "%.24s", p);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
699: }
700:
701: #if DMDV1
702: if (mod && id == Id::FILE)
703: {
704: t->ustring = (unsigned char *)(loc.filename ? loc.filename : mod->ident->toChars());
705: goto Lstr;
706: }
707: else if (mod && id == Id::LINE)
708: {
709: t->value = TOKint64v;
710: t->uns64value = loc.linnum;
711: }
712: else
713: #endif
714: if (id == Id::DATE)
715: {
716: t->ustring = (unsigned char *)date;
717: goto Lstr;
718: }
719: else if (id == Id::TIME)
720: {
721: t->ustring = (unsigned char *)time;
722: goto Lstr;
723: }
724: else if (id == Id::VENDOR)
725: {
726: t->ustring = (unsigned char *)"Digital Mars D";
727: goto Lstr;
728: }
729: else if (id == Id::TIMESTAMP)
730: {
731: t->ustring = (unsigned char *)timestamp;
732: Lstr:
733: t->value = TOKstring;
734: Llen:
warning C4102: 'Llen' : unreferenced label
735: t->postfix = 0;
736: t->len = strlen((char *)t->ustring);
737: }
738: else if (id == Id::VERSIONX)
739: { unsigned major = 0;
740: unsigned minor = 0;
741:
742: for (const char *p = global.version + 1; 1; p++)
743: {
744: char c = *p;
warning C6246: Local declaration of 'c' hides declaration of the same name in outer scope. For additional information, see previous declaration at line '656' of 'c:\projects\extern\d\dmd\src\lexer.c': Lines: 656
745: if (isdigit(c))
warning C6328: 'char' passed as parameter '1' when 'unsigned char' is required in call to 'isdigit'
746: minor = minor * 10 + c - '0';
747: else if (c == '.')
748: { major = minor;
749: minor = 0;
750: }
751: else
752: break;
753: }
754: t->value = TOKint64v;
755: t->uns64value = major * 1000 + minor;
756: }
757: #if DMDV2
758: else if (id == Id::EOFX)
759: {
760: t->value = TOKeof;
761: // Advance scanner to end of file
762: while (!(*p == 0 || *p == 0x1A))
763: p++;
764: }
765: #endif
766: }
767: //printf("t->value = %d\n",t->value);
768: return;
769: }
770:
771: case '/':
772: p++;
773: switch (*p)
774: {
775: case '=':
776: p++;
777: t->value = TOKdivass;
778: return;
779:
780: case '*':
781: p++;
782: linnum = loc.linnum;
783: while (1)
784: {
785: while (1)
786: { unsigned char c = *p;
787: switch (c)
788: {
789: case '/':
790: break;
791:
792: case '\n':
793: loc.linnum++;
794: p++;
795: continue;
796:
797: case '\r':
798: p++;
799: if (*p != '\n')
800: loc.linnum++;
801: continue;
802:
803: case 0:
804: case 0x1A:
805: error("unterminated /* */ comment");
806: p = end;
807: t->value = TOKeof;
808: return;
809:
810: default:
811: if (c & 0x80)
812: { unsigned u = decodeUTF();
813: if (u == PS || u == LS)
814: loc.linnum++;
815: }
816: p++;
817: continue;
818: }
819: break;
820: }
821: p++;
822: if (p[-2] == '*' && p - 3 != t->ptr)
823: break;
824: }
825: if (commentToken)
826: {
827: t->value = TOKcomment;
828: return;
829: }
830: else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
831: { // if /** but not /**/
832: getDocComment(t, lastLine == linnum);
833: }
834: continue;
835:
836: case '/': // do // style comments
837: linnum = loc.linnum;
838: while (1)
839: { unsigned char c = *++p;
840: switch (c)
841: {
842: case '\n':
843: break;
844:
845: case '\r':
846: if (p[1] == '\n')
847: p++;
848: break;
849:
850: case 0:
851: case 0x1A:
852: if (commentToken)
853: {
854: p = end;
855: t->value = TOKcomment;
856: return;
857: }
858: if (doDocComment && t->ptr[2] == '/')
859: getDocComment(t, lastLine == linnum);
860: p = end;
861: t->value = TOKeof;
862: return;
863:
864: default:
865: if (c & 0x80)
866: { unsigned u = decodeUTF();
867: if (u == PS || u == LS)
868: break;
869: }
870: continue;
871: }
872: break;
873: }
874:
875: if (commentToken)
876: {
877: p++;
878: loc.linnum++;
879: t->value = TOKcomment;
880: return;
881: }
882: if (doDocComment && t->ptr[2] == '/')
883: getDocComment(t, lastLine == linnum);
884:
885: p++;
886: loc.linnum++;
887: continue;
888:
889: case '+':
890: { int nest;
891:
892: linnum = loc.linnum;
893: p++;
894: nest = 1;
895: while (1)
896: { unsigned char c = *p;
897: switch (c)
898: {
899: case '/':
900: p++;
901: if (*p == '+')
902: {
903: p++;
904: nest++;
905: }
906: continue;
907:
908: case '+':
909: p++;
910: if (*p == '/')
911: {
912: p++;
913: if (--nest == 0)
914: break;
915: }
916: continue;
917:
918: case '\r':
919: p++;
920: if (*p != '\n')
921: loc.linnum++;
922: continue;
923:
924: case '\n':
925: loc.linnum++;
926: p++;
927: continue;
928:
929: case 0:
930: case 0x1A:
931: error("unterminated /+ +/ comment");
932: p = end;
933: t->value = TOKeof;
934: return;
935:
936: default:
937: if (c & 0x80)
938: { unsigned u = decodeUTF();
939: if (u == PS || u == LS)
940: loc.linnum++;
941: }
942: p++;
943: continue;
944: }
945: break;
946: }
947: if (commentToken)
948: {
949: t->value = TOKcomment;
950: return;
951: }
952: if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
953: { // if /++ but not /++/
954: getDocComment(t, lastLine == linnum);
955: }
956: continue;
957: }
958: }
959: t->value = TOKdiv;
960: return;
961:
962: case '.':
963: p++;
964: if (isdigit(*p))
965: { /* Note that we don't allow ._1 and ._ as being
966: * valid floating point numbers.
967: */
968: p--;
969: t->value = inreal(t);
970: }
971: else if (p[0] == '.')
972: {
973: if (p[1] == '.')
974: { p += 2;
975: t->value = TOKdotdotdot;
976: }
977: else
978: { p++;
979: t->value = TOKslice;
980: }
981: }
982: else
983: t->value = TOKdot;
984: return;
985:
986: case '&':
987: p++;
988: if (*p == '=')
989: { p++;
990: t->value = TOKandass;
991: }
992: else if (*p == '&')
993: { p++;
994: t->value = TOKandand;
995: }
996: else
997: t->value = TOKand;
998: return;
999:
1000: case '|':
1001: p++;
1002: if (*p == '=')
1003: { p++;
1004: t->value = TOKorass;
1005: }
1006: else if (*p == '|')
1007: { p++;
1008: t->value = TOKoror;
1009: }
1010: else
1011: t->value = TOKor;
1012: return;
1013:
1014: case '-':
1015: p++;
1016: if (*p == '=')
1017: { p++;
1018: t->value = TOKminass;
1019: }
1020: #if 0
1021: else if (*p == '>')
1022: { p++;
1023: t->value = TOKarrow;
1024: }
1025: #endif
1026: else if (*p == '-')
1027: { p++;
1028: t->value = TOKminusminus;
1029: }
1030: else
1031: t->value = TOKmin;
1032: return;
1033:
1034: case '+':
1035: p++;
1036: if (*p == '=')
1037: { p++;
1038: t->value = TOKaddass;
1039: }
1040: else if (*p == '+')
1041: { p++;
1042: t->value = TOKplusplus;
1043: }
1044: else
1045: t->value = TOKadd;
1046: return;
1047:
1048: case '<':
1049: p++;
1050: if (*p == '=')
1051: { p++;
1052: t->value = TOKle; // <=
1053: }
1054: else if (*p == '<')
1055: { p++;
1056: if (*p == '=')
1057: { p++;
1058: t->value = TOKshlass; // <<=
1059: }
1060: else
1061: t->value = TOKshl; // <<
1062: }
1063: else if (*p == '>')
1064: { p++;
1065: if (*p == '=')
1066: { p++;
1067: t->value = TOKleg; // <>=
1068: }
1069: else
1070: t->value = TOKlg; // <>
1071: }
1072: else
1073: t->value = TOKlt; // <
1074: return;
1075:
1076: case '>':
1077: p++;
1078: if (*p == '=')
1079: { p++;
1080: t->value = TOKge; // >=
1081: }
1082: else if (*p == '>')
1083: { p++;
1084: if (*p == '=')
1085: { p++;
1086: t->value = TOKshrass; // >>=
1087: }
1088: else if (*p == '>')
1089: { p++;
1090: if (*p == '=')
1091: { p++;
1092: t->value = TOKushrass; // >>>=
1093: }
1094: else
1095: t->value = TOKushr; // >>>
1096: }
1097: else
1098: t->value = TOKshr; // >>
1099: }
1100: else
1101: t->value = TOKgt; // >
1102: return;
1103:
1104: case '!':
1105: p++;
1106: if (*p == '=')
1107: { p++;
1108: if (*p == '=' && global.params.Dversion == 1)
1109: { p++;
1110: t->value = TOKnotidentity; // !==
1111: }
1112: else
1113: t->value = TOKnotequal; // !=
1114: }
1115: else if (*p == '<')
1116: { p++;
1117: if (*p == '>')
1118: { p++;
1119: if (*p == '=')
1120: { p++;
1121: t->value = TOKunord; // !<>=
1122: }
1123: else
1124: t->value = TOKue; // !<>
1125: }
1126: else if (*p == '=')
1127: { p++;
1128: t->value = TOKug; // !<=
1129: }
1130: else
1131: t->value = TOKuge; // !<
1132: }
1133: else if (*p == '>')
1134: { p++;
1135: if (*p == '=')
1136: { p++;
1137: t->value = TOKul; // !>=
1138: }
1139: else
1140: t->value = TOKule; // !>
1141: }
1142: else
1143: t->value = TOKnot; // !
1144: return;
1145:
1146: case '=':
1147: p++;
1148: if (*p == '=')
1149: { p++;
1150: if (*p == '=' && global.params.Dversion == 1)
1151: { p++;
1152: t->value = TOKidentity; // ===
1153: }
1154: else
1155: t->value = TOKequal; // ==
1156: }
1157: else
1158: t->value = TOKassign; // =
1159: return;
1160:
1161: case '~':
1162: p++;
1163: if (*p == '=')
1164: { p++;
1165: t->value = TOKcatass; // ~=
1166: }
1167: else
1168: t->value = TOKtilde; // ~
1169: return;
1170:
1171: #if DMDV2
1172: case '^':
1173: p++;
1174: if (*p == '^')
1175: { p++;
1176: if (*p == '=')
1177: { p++;
1178: t->value = TOKpowass; // ^^=
1179: }
1180: else
1181: t->value = TOKpow; // ^^
1182: }
1183: else if (*p == '=')
1184: { p++;
1185: t->value = TOKxorass; // ^=
1186: }
1187: else
1188: t->value = TOKxor; // ^
1189: return;
1190: #endif
1191:
1192: #define SINGLE(c,tok) case c: p++; t->value = tok; return;
1193:
1194: SINGLE('(', TOKlparen)
1195: SINGLE(')', TOKrparen)
1196: SINGLE('[', TOKlbracket)
1197: SINGLE(']', TOKrbracket)
1198: SINGLE('{', TOKlcurly)
1199: SINGLE('}', TOKrcurly)
1200: SINGLE('?', TOKquestion)
1201: SINGLE(',', TOKcomma)
1202: SINGLE(';', TOKsemicolon)
1203: SINGLE(':', TOKcolon)
1204: SINGLE('$', TOKdollar)
1205: #if DMDV2
1206: SINGLE('@', TOKat)
1207: #endif
1208: #undef SINGLE
1209:
1210: #define DOUBLE(c1,tok1,c2,tok2) \
1211: case c1: \
1212: p++; \
1213: if (*p == c2) \
1214: { p++; \
1215: t->value = tok2; \
1216: } \
1217: else \
1218: t->value = tok1; \
1219: return;
1220:
1221: DOUBLE('*', TOKmul, '=', TOKmulass)
1222: DOUBLE('%', TOKmod, '=', TOKmodass)
1223: #if DMDV1
1224: DOUBLE('^', TOKxor, '=', TOKxorass)
1225: #endif
1226: #undef DOUBLE
1227:
1228: case '#':
1229: p++;
1230: pragma();
1231: continue;
1232:
1233: default:
1234: { unsigned c = *p;
1235:
1236: if (c & 0x80)
1237: { c = decodeUTF();
1238:
1239: // Check for start of unicode identifier
1240: if (isUniAlpha(c))
1241: goto case_ident;
1242:
1243: if (c == PS || c == LS)
1244: {
1245: loc.linnum++;
1246: p++;
1247: continue;
1248: }
1249: }
1250: if (c < 0x80 && isprint(c))
1251: error("unsupported char '%c'", c);
1252: else
1253: error("unsupported char 0x%02x", c);
1254: p++;
1255: continue;
1256: }
1257: }
1258: }
1259: }
1260:
1261: /*******************************************
1262: * Parse escape sequence.
1263: */
1264:
1265: unsigned Lexer::escapeSequence()
1266: { unsigned c = *p;
1267:
1268: #ifdef TEXTUAL_ASSEMBLY_OUT
1269: return c;
1270: #endif
1271: int n;
1272: int ndigits;
1273:
1274: switch (c)
1275: {
1276: case '\'':
1277: case '"':
1278: case '?':
1279: case '\\':
1280: Lconsume:
1281: p++;
1282: break;
1283:
1284: case 'a': c = 7; goto Lconsume;
1285: case 'b': c = 8; goto Lconsume;
1286: case 'f': c = 12; goto Lconsume;
1287: case 'n': c = 10; goto Lconsume;
1288: case 'r': c = 13; goto Lconsume;
1289: case 't': c = 9; goto Lconsume;
1290: case 'v': c = 11; goto Lconsume;
1291:
1292: case 'u':
1293: ndigits = 4;
1294: goto Lhex;
1295: case 'U':
1296: ndigits = 8;
1297: goto Lhex;
1298: case 'x':
1299: ndigits = 2;
1300: Lhex:
1301: p++;
1302: c = *p;
1303: if (ishex(c))
1304: { unsigned v;
1305:
1306: n = 0;
1307: v = 0;
1308: while (1)
1309: {
1310: if (isdigit(c))
1311: c -= '0';
1312: else if (islower(c))
1313: c -= 'a' - 10;
1314: else
1315: c -= 'A' - 10;
1316: v = v * 16 + c;
1317: c = *++p;
1318: if (++n == ndigits)
1319: break;
1320: if (!ishex(c))
1321: { error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1322: break;
1323: }
1324: }
1325: if (ndigits != 2 && !utf_isValidDchar(v))
1326: { error("invalid UTF character \\U%08x", v);
1327: v = '?'; // recover with valid UTF character
1328: }
1329: c = v;
1330: }
1331: else
1332: error("undefined escape hex sequence \\%c\n",c);
1333: break;
1334:
1335: case '&': // named character entity
1336: for (unsigned char *idstart = ++p; 1; p++)
1337: {
1338: switch (*p)
1339: {
1340: case ';':
1341: c = HtmlNamedEntity(idstart, p - idstart);
1342: if (c == ~0)
1343: { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
1344: c = ' ';
1345: }
1346: p++;
1347: break;
1348:
1349: default:
1350: if (isalpha(*p) ||
1351: (p != idstart + 1 && isdigit(*p)))
1352: continue;
1353: error("unterminated named entity");
1354: break;
1355: }
1356: break;
1357: }
1358: break;
1359:
1360: case 0:
1361: case 0x1A: // end of file
1362: c = '\\';
1363: break;
1364:
1365: default:
1366: if (isoctal(c))
1367: { unsigned v;
1368:
1369: n = 0;
1370: v = 0;
1371: do
1372: {
1373: v = v * 8 + (c - '0');
1374: c = *++p;
1375: } while (++n < 3 && isoctal(c));
1376: c = v;
1377: if (c > 0xFF)
1378: error("0%03o is larger than a byte", c);
1379: }
1380: else
1381: error("undefined escape sequence \\%c\n",c);
1382: break;
1383: }
1384: return c;
1385: }
1386:
1387: /**************************************
1388: */
1389:
1390: TOK Lexer::wysiwygStringConstant(Token *t, int tc)
1391: { unsigned c;
1392: Loc start = loc;
1393:
1394: p++;
1395: stringbuffer.reset();
1396: while (1)
1397: {
1398: c = *p++;
1399: switch (c)
1400: {
1401: case '\n':
1402: loc.linnum++;
1403: break;
1404:
1405: case '\r':
1406: if (*p == '\n')
1407: continue; // ignore
1408: c = '\n'; // treat EndOfLine as \n character
1409: loc.linnum++;
1410: break;
1411:
1412: case 0:
1413: case 0x1A:
1414: error("unterminated string constant starting at %s", start.toChars());
1415: t->ustring = (unsigned char *)"";
1416: t->len = 0;
1417: t->postfix = 0;
1418: return TOKstring;
1419:
1420: case '"':
1421: case '`':
1422: if (c == tc)
1423: {
1424: t->len = stringbuffer.offset;
1425: stringbuffer.writeByte(0);
1426: t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1427: memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1428: stringPostfix(t);
1429: return TOKstring;
1430: }
1431: break;
1432:
1433: default:
1434: if (c & 0x80)
1435: { p--;
1436: unsigned u = decodeUTF();
1437: p++;
1438: if (u == PS || u == LS)
1439: loc.linnum++;
1440: stringbuffer.writeUTF8(u);
1441: continue;
1442: }
1443: break;
1444: }
1445: stringbuffer.writeByte(c);
1446: }
1447: }
1448:
1449: /**************************************
1450: * Lex hex strings:
1451: * x"0A ae 34FE BD"
1452: */
1453:
1454: TOK Lexer::hexStringConstant(Token *t)
1455: { unsigned c;
1456: Loc start = loc;
1457: unsigned n = 0;
1458: unsigned v;
1459:
1460: p++;
1461: stringbuffer.reset();
1462: while (1)
1463: {
1464: c = *p++;
1465: switch (c)
1466: {
1467: case ' ':
1468: case '\t':
1469: case '\v':
1470: case '\f':
1471: continue; // skip white space
1472:
1473: case '\r':
1474: if (*p == '\n')
1475: continue; // ignore
1476: // Treat isolated '\r' as if it were a '\n'
1477: case '\n':
1478: loc.linnum++;
1479: continue;
1480:
1481: case 0:
1482: case 0x1A:
1483: error("unterminated string constant starting at %s", start.toChars());
1484: t->ustring = (unsigned char *)"";
1485: t->len = 0;
1486: t->postfix = 0;
1487: return TOKstring;
1488:
1489: case '"':
1490: if (n & 1)
1491: { error("odd number (%d) of hex characters in hex string", n);
1492: stringbuffer.writeByte(v);
1493: }
1494: t->len = stringbuffer.offset;
1495: stringbuffer.writeByte(0);
1496: t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1497: memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1498: stringPostfix(t);
1499: return TOKstring;
1500:
1501: default:
1502: if (c >= '0' && c <= '9')
1503: c -= '0';
1504: else if (c >= 'a' && c <= 'f')
1505: c -= 'a' - 10;
1506: else if (c >= 'A' && c <= 'F')
1507: c -= 'A' - 10;
1508: else if (c & 0x80)
1509: { p--;
1510: unsigned u = decodeUTF();
1511: p++;
1512: if (u == PS || u == LS)
1513: loc.linnum++;
1514: else
1515: error("non-hex character \\u%04x", u);
1516: }
1517: else
1518: error("non-hex character '%c'", c);
1519: if (n & 1)
1520: { v = (v << 4) | c;
1521: stringbuffer.writeByte(v);
1522: }
1523: else
1524: v = c;
1525: n++;
1526: break;
1527: }
1528: }
1529: }
1530:
1531:
1532: #if DMDV2
1533: /**************************************
1534: * Lex delimited strings:
1535: * q"(foo(xxx))" // "foo(xxx)"
1536: * q"[foo(]" // "foo("
1537: * q"/foo]/" // "foo]"
1538: * q"HERE
1539: * foo
1540: * HERE" // "foo\n"
1541: * Input:
1542: * p is on the "
1543: */
1544:
1545: TOK Lexer::delimitedStringConstant(Token *t)
1546: { unsigned c;
1547: Loc start = loc;
1548: unsigned delimleft = 0;
1549: unsigned delimright = 0;
1550: unsigned nest = 1;
1551: unsigned nestcount;
1552: Identifier *hereid = NULL;
1553: unsigned blankrol = 0;
1554: unsigned startline = 0;
1555:
1556: p++;
1557: stringbuffer.reset();
1558: while (1)
1559: {
1560: c = *p++;
1561: //printf("c = '%c'\n", c);
1562: switch (c)
1563: {
1564: case '\n':
1565: Lnextline:
1566: loc.linnum++;
1567: startline = 1;
1568: if (blankrol)
1569: { blankrol = 0;
1570: continue;
1571: }
1572: if (hereid)
1573: {
1574: stringbuffer.writeUTF8(c);
1575: continue;
1576: }
1577: break;
1578:
1579: case '\r':
1580: if (*p == '\n')
1581: continue; // ignore
1582: c = '\n'; // treat EndOfLine as \n character
1583: goto Lnextline;
1584:
1585: case 0:
1586: case 0x1A:
1587: goto Lerror;
1588:
1589: default:
1590: if (c & 0x80)
1591: { p--;
1592: c = decodeUTF();
1593: p++;
1594: if (c == PS || c == LS)
1595: goto Lnextline;
1596: }
1597: break;
1598: }
1599: if (delimleft == 0)
1600: { delimleft = c;
1601: nest = 1;
1602: nestcount = 1;
1603: if (c == '(')
1604: delimright = ')';
1605: else if (c == '{')
1606: delimright = '}';
1607: else if (c == '[')
1608: delimright = ']';
1609: else if (c == '<')
1610: delimright = '>';
1611: else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1612: { // Start of identifier; must be a heredoc
1613: Token t;
warning C6246: Local declaration of 't' hides declaration of the same name in outer scope. For additional information, see previous declaration at line '1545' of 'c:\projects\extern\d\dmd\src\lexer.c': Lines: 1545
1614: p--;
1615: scan(&t); // read in heredoc identifier
1616: if (t.value != TOKidentifier)
1617: { error("identifier expected for heredoc, not %s", t.toChars());
1618: delimright = c;
1619: }
1620: else
1621: { hereid = t.ident;
1622: //printf("hereid = '%s'\n", hereid->toChars());
1623: blankrol = 1;
1624: }
1625: nest = 0;
1626: }
1627: else
1628: { delimright = c;
1629: nest = 0;
1630: #if DMDV2
1631: if (isspace(c))
1632: error("delimiter cannot be whitespace");
1633: #endif
1634: }
1635: }
1636: else
1637: {
1638: if (blankrol)
1639: { error("heredoc rest of line should be blank");
1640: blankrol = 0;
1641: continue;
1642: }
1643: if (nest == 1)
1644: {
1645: if (c == delimleft)
1646: nestcount++;
1647: else if (c == delimright)
1648: { nestcount--;
1649: if (nestcount == 0)
1650: goto Ldone;
1651: }
1652: }
1653: else if (c == delimright)
1654: goto Ldone;
1655: if (startline && isalpha(c)
1656: #if DMDV2
1657: && hereid
1658: #endif
1659: )
1660: { Token t;
warning C6246: Local declaration of 't' hides declaration of the same name in outer scope. For additional information, see previous declaration at line '1545' of 'c:\projects\extern\d\dmd\src\lexer.c': Lines: 1545
1661: unsigned char *psave = p;
1662: p--;
1663: scan(&t); // read in possible heredoc identifier
1664: //printf("endid = '%s'\n", t.ident->toChars());
1665: if (t.value == TOKidentifier && t.ident->equals(hereid))
1666: { /* should check that rest of line is blank
1667: */
1668: goto Ldone;
1669: }
1670: p = psave;
1671: }
1672: stringbuffer.writeUTF8(c);
1673: startline = 0;
1674: }
1675: }
1676:
1677: Ldone:
1678: if (*p == '"')
1679: p++;
1680: else
1681: error("delimited string must end in %c\"", delimright);
1682: t->len = stringbuffer.offset;
1683: stringbuffer.writeByte(0);
1684: t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1685: memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1686: stringPostfix(t);
1687: return TOKstring;
1688:
1689: Lerror:
1690: error("unterminated string constant starting at %s", start.toChars());
1691: t->ustring = (unsigned char *)"";
1692: t->len = 0;
1693: t->postfix = 0;
1694: return TOKstring;
1695: }
1696:
1697: /**************************************
1698: * Lex delimited strings:
1699: * q{ foo(xxx) } // " foo(xxx) "
1700: * q{foo(} // "foo("
1701: * q{{foo}"}"} // "{foo}"}""
1702: * Input:
1703: * p is on the q
1704: */
1705:
1706: TOK Lexer::tokenStringConstant(Token *t)
1707: {
1708: unsigned nest = 1;
1709: Loc start = loc;
1710: unsigned char *pstart = ++p;
1711:
1712: while (1)
1713: { Token tok;
1714:
1715: scan(&tok);
1716: switch (tok.value)
1717: {
1718: case TOKlcurly:
1719: nest++;
1720: continue;
1721:
1722: case TOKrcurly:
1723: if (--nest == 0)
1724: goto Ldone;
1725: continue;
1726:
1727: case TOKeof:
1728: goto Lerror;
1729:
1730: default:
1731: continue;
1732: }
1733: }
1734:
1735: Ldone:
1736: t->len = p - 1 - pstart;
1737: t->ustring = (unsigned char *)mem.malloc(t->len + 1);
1738: memcpy(t->ustring, pstart, t->len);
1739: t->ustring[t->len] = 0;
1740: stringPostfix(t);
1741: return TOKstring;
1742:
1743: Lerror:
1744: error("unterminated token string constant starting at %s", start.toChars());
1745: t->ustring = (unsigned char *)"";
1746: t->len = 0;
1747: t->postfix = 0;
1748: return TOKstring;
1749: }
1750:
1751: #endif
1752:
1753:
1754: /**************************************
1755: */
1756:
1757: TOK Lexer::escapeStringConstant(Token *t, int wide)
1758: { unsigned c;
1759: Loc start = loc;
1760:
1761: p++;
1762: stringbuffer.reset();
1763: while (1)
1764: {
1765: c = *p++;
1766: switch (c)
1767: {
1768: #if !( TEXTUAL_ASSEMBLY_OUT )
1769: case '\\':
1770: switch (*p)
1771: {
1772: case 'u':
1773: case 'U':
1774: case '&':
1775: c = escapeSequence();
1776: stringbuffer.writeUTF8(c);
1777: continue;
1778:
1779: default:
1780: c = escapeSequence();
1781: break;
1782: }
1783: break;
1784: #endif
1785: case '\n':
1786: loc.linnum++;
1787: break;
1788:
1789: case '\r':
1790: if (*p == '\n')
1791: continue; // ignore
1792: c = '\n'; // treat EndOfLine as \n character
1793: loc.linnum++;
1794: break;
1795:
1796: case '"':
1797: t->len = stringbuffer.offset;
1798: stringbuffer.writeByte(0);
1799: t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset);
1800: memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1801: stringPostfix(t);
1802: return TOKstring;
1803:
1804: case 0:
1805: case 0x1A:
1806: p--;
1807: error("unterminated string constant starting at %s", start.toChars());
1808: t->ustring = (unsigned char *)"";
1809: t->len = 0;
1810: t->postfix = 0;
1811: return TOKstring;
1812:
1813: default:
1814: if (c & 0x80)
1815: {
1816: p--;
1817: c = decodeUTF();
1818: if (c == LS || c == PS)
1819: { c = '\n';
1820: loc.linnum++;
1821: }
1822: p++;
1823: stringbuffer.writeUTF8(c);
1824: continue;
1825: }
1826: break;
1827: }
1828: stringbuffer.writeByte(c);
1829: }
1830: }
1831:
1832: /**************************************
1833: */
1834:
1835: TOK Lexer::charConstant(Token *t, int wide)
1836: {
1837: unsigned c;
1838: TOK tk = TOKcharv;
1839:
1840: //printf("Lexer::charConstant\n");
1841: p++;
1842: c = *p++;
1843: switch (c)
1844: {
1845: #if ! TEXTUAL_ASSEMBLY_OUT
1846: case '\\':
1847: switch (*p)
1848: {
1849: case 'u':
1850: t->uns64value = escapeSequence();
1851: tk = TOKwcharv;
1852: break;
1853:
1854: case 'U':
1855: case '&':
1856: t->uns64value = escapeSequence();
1857: tk = TOKdcharv;
1858: break;
1859:
1860: default:
1861: t->uns64value = escapeSequence();
1862: break;
1863: }
1864: break;
1865: #endif
1866: case '\n':
1867: L1:
1868: loc.linnum++;
1869: case '\r':
1870: case 0:
1871: case 0x1A:
1872: case '\'':
1873: error("unterminated character constant");
1874: return tk;
1875:
1876: default:
1877: if (c & 0x80)
1878: {
1879: p--;
1880: c = decodeUTF();
1881: p++;
1882: if (c == LS || c == PS)
1883: goto L1;
1884: if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1885: tk = TOKwcharv;
1886: else
1887: tk = TOKdcharv;
1888: }
1889: t->uns64value = c;
1890: break;
1891: }
1892:
1893: if (*p != '\'')
1894: { error("unterminated character constant");
1895: return tk;
1896: }
1897: p++;
1898: return tk;
1899: }
1900:
1901: /***************************************
1902: * Get postfix of string literal.
1903: */
1904:
1905: void Lexer::stringPostfix(Token *t)
1906: {
1907: switch (*p)
1908: {
1909: case 'c':
1910: case 'w':
1911: case 'd':
1912: t->postfix = *p;
1913: p++;
1914: break;
1915:
1916: default:
1917: t->postfix = 0;
1918: break;
1919: }
1920: }
1921:
1922: /***************************************
1923: * Read \u or \U unicode sequence
1924: * Input:
1925: * u 'u' or 'U'
1926: */
1927:
1928: #if 0
1929: unsigned Lexer::wchar(unsigned u)
1930: {
1931: unsigned value;
1932: unsigned n;
1933: unsigned char c;
1934: unsigned nchars;
1935:
1936: nchars = (u == 'U') ? 8 : 4;
1937: value = 0;
1938: for (n = 0; 1; n++)
1939: {
1940: ++p;
1941: if (n == nchars)
1942: break;
1943: c = *p;
1944: if (!ishex(c))
1945: { error("\\%c sequence must be followed by %d hex characters", u, nchars);
1946: break;
1947: }
1948: if (isdigit(c))
1949: c -= '0';
1950: else if (islower(c))
1951: c -= 'a' - 10;
1952: else
1953: c -= 'A' - 10;
1954: value <<= 4;
1955: value |= c;
1956: }
1957: return value;
1958: }
1959: #endif
1960:
1961: /**************************************
1962: * Read in a number.
1963: * If it's an integer, store it in tok.TKutok.Vlong.
1964: * integers can be decimal, octal or hex
1965: * Handle the suffixes U, UL, LU, L, etc.
1966: * If it's double, store it in tok.TKutok.Vdouble.
1967: * Returns:
1968: * TKnum
1969: * TKdouble,...
1970: */
1971:
1972: TOK Lexer::number(Token *t)
1973: {
1974: // We use a state machine to collect numbers
1975: enum STATE { STATE_initial, STATE_0, STATE_decimal, STATE_octal, STATE_octale,
1976: STATE_hex, STATE_binary, STATE_hex0, STATE_binary0,
1977: STATE_hexh, STATE_error };
1978: enum STATE state;
1979:
1980: enum FLAGS
1981: { FLAGS_decimal = 1, // decimal
1982: FLAGS_unsigned = 2, // u or U suffix
1983: FLAGS_long = 4, // l or L suffix
1984: };
1985: enum FLAGS flags = FLAGS_decimal;
1986:
1987: int i;
warning C4101: 'i' : unreferenced local variable
1988: int base;
1989: unsigned c;
1990: unsigned char *start;
1991: TOK result;
1992:
1993: //printf("Lexer::number()\n");
1994: state = STATE_initial;
1995: base = 0;
1996: stringbuffer.reset();
1997: start = p;
1998: while (1)
1999: {
2000: c = *p;
2001: switch (state)
2002: {
2003: case STATE_initial: // opening state
2004: if (c == '0')
2005: state = STATE_0;
2006: else
2007: state = STATE_decimal;
2008: break;
2009:
2010: case STATE_0:
2011: flags = (FLAGS) (flags & ~FLAGS_decimal);
2012: switch (c)
2013: {
2014: #if ZEROH
2015: case 'H': // 0h
2016: case 'h':
2017: goto hexh;
2018: #endif
2019: case 'X':
2020: case 'x':
2021: state = STATE_hex0;
2022: break;
2023:
2024: case '.':
2025: if (p[1] == '.') // .. is a separate token
2026: goto done;
2027: case 'i':
2028: case 'f':
2029: case 'F':
2030: goto real;
2031: #if ZEROH
2032: case 'E':
2033: case 'e':
2034: goto case_hex;
2035: #endif
2036: case 'B':
2037: case 'b':
2038: state = STATE_binary0;
2039: break;
2040:
2041: case '0': case '1': case '2': case '3':
2042: case '4': case '5': case '6': case '7':
2043: state = STATE_octal;
2044: break;
2045:
2046: #if ZEROH
2047: case '8': case '9': case 'A':
2048: case 'C': case 'D': case 'F':
2049: case 'a': case 'c': case 'd': case 'f':
2050: case_hex:
2051: state = STATE_hexh;
2052: break;
2053: #endif
2054: case '_':
2055: state = STATE_octal;
2056: p++;
2057: continue;
2058:
2059: case 'L':
2060: if (p[1] == 'i')
2061: goto real;
2062: goto done;
2063:
2064: default:
2065: goto done;
2066: }
2067: break;
2068:
2069: case STATE_decimal: // reading decimal number
2070: if (!isdigit(c))
2071: {
2072: #if ZEROH
2073: if (ishex(c)
2074: || c == 'H' || c == 'h'
2075: )
2076: goto hexh;
2077: #endif
2078: if (c == '_') // ignore embedded _
2079: { p++;
2080: continue;
2081: }
2082: if (c == '.' && p[1] != '.')
2083: goto real;
2084: else if (c == 'i' || c == 'f' || c == 'F' ||
2085: c == 'e' || c == 'E')
2086: {
2087: real: // It's a real number. Back up and rescan as a real
2088: p = start;
2089: return inreal(t);
2090: }
2091: else if (c == 'L' && p[1] == 'i')
2092: goto real;
2093: goto done;
2094: }
2095: break;
2096:
2097: case STATE_hex0: // reading hex number
2098: case STATE_hex:
2099: if (!ishex(c))
2100: {
2101: if (c == '_') // ignore embedded _
2102: { p++;
2103: continue;
2104: }
2105: if (c == '.' && p[1] != '.')
2106: goto real;
2107: if (c == 'P' || c == 'p' || c == 'i')
2108: goto real;
2109: if (state == STATE_hex0)
2110: error("Hex digit expected, not '%c'", c);
2111: goto done;
2112: }
2113: state = STATE_hex;
2114: break;
2115:
2116: #if ZEROH
2117: hexh:
2118: state = STATE_hexh;
2119: case STATE_hexh: // parse numbers like 0FFh
2120: if (!ishex(c))
2121: {
2122: if (c == 'H' || c == 'h')
2123: {
2124: p++;
2125: base = 16;
2126: goto done;
2127: }
2128: else
2129: {
2130: // Check for something like 1E3 or 0E24
2131: if (memchr((char *)stringbuffer.data, 'E', stringbuffer.offset) ||
2132: memchr((char *)stringbuffer.data, 'e', stringbuffer.offset))
2133: goto real;
2134: error("Hex digit expected, not '%c'", c);
2135: goto done;
2136: }
2137: }
2138: break;
2139: #endif
2140:
2141: case STATE_octal: // reading octal number
2142: case STATE_octale: // reading octal number with non-octal digits
2143: if (!isoctal(c))
2144: {
2145: #if ZEROH
2146: if (ishex(c)
2147: || c == 'H' || c == 'h'
2148: )
2149: goto hexh;
2150: #endif
2151: if (c == '_') // ignore embedded _
2152: { p++;
2153: continue;
2154: }
2155: if (c == '.' && p[1] != '.')
2156: goto real;
2157: if (c == 'i')
2158: goto real;
2159: if (isdigit(c))
2160: {
2161: state = STATE_octale;
2162: }
2163: else
2164: goto done;
2165: }
2166: break;
2167:
2168: case STATE_binary0: // starting binary number
2169: case STATE_binary: // reading binary number
2170: if (c != '0' && c != '1')
2171: {
2172: #if ZEROH
2173: if (ishex(c)
2174: || c == 'H' || c == 'h'
2175: )
2176: goto hexh;
2177: #endif
2178: if (c == '_') // ignore embedded _
2179: { p++;
2180: continue;
2181: }
2182: if (state == STATE_binary0)
2183: { error("binary digit expected");
2184: state = STATE_error;
2185: break;
2186: }
2187: else
2188: goto done;
2189: }
2190: state = STATE_binary;
2191: break;
2192:
2193: case STATE_error: // for error recovery
2194: if (!isdigit(c)) // scan until non-digit
2195: goto done;
2196: break;
2197:
2198: default:
2199: assert(0);
2200: }
2201: stringbuffer.writeByte(c);
2202: p++;
2203: }
2204: done:
2205: stringbuffer.writeByte(0); // terminate string
2206: if (state == STATE_octale)
2207: error("Octal digit expected");
2208:
2209: uinteger_t n; // unsigned >=64 bit integer type
2210:
2211: if (stringbuffer.offset == 2 && (state == STATE_decimal || state == STATE_0))
2212: n = stringbuffer.data[0] - '0';
2213: else
2214: {
2215: // Convert string to integer
2216: #if __DMC__
2217: errno = 0;
2218: n = strtoull((char *)stringbuffer.data,NULL,base);
2219: if (errno == ERANGE)
2220: error("integer overflow");
2221: #else
2222: // Not everybody implements strtoull()
2223: char *p = (char *)stringbuffer.data;
2224: int r = 10, d;
2225:
2226: if (*p == '0')
2227: {
2228: if (p[1] == 'x' || p[1] == 'X')
2229: p += 2, r = 16;
2230: else if (p[1] == 'b' || p[1] == 'B')
2231: p += 2, r = 2;
2232: else if (isdigit(p[1]))
warning C6328: 'char' passed as parameter '1' when 'unsigned char' is required in call to 'isdigit'
2233: p += 1, r = 8;
2234: }
2235:
2236: n = 0;
2237: while (1)
2238: {
2239: if (*p >= '0' && *p <= '9')
2240: d = *p - '0';
2241: else if (*p >= 'a' && *p <= 'z')
2242: d = *p - 'a' + 10;
2243: else if (*p >= 'A' && *p <= 'Z')
2244: d = *p - 'A' + 10;
2245: else
2246: break;
2247: if (d >= r)
2248: break;
2249: uinteger_t n2 = n * r;
2250: //printf("n2 / r = %llx, n = %llx\n", n2/r, n);
2251: if (n2 / r != n || n2 + d < n)
2252: {
2253: error ("integer overflow");
2254: break;
2255: }
2256:
2257: n = n2 + d;
2258: p++;
2259: }
2260: #endif
2261: if (sizeof(n) > 8 &&
2262: n > 0xFFFFFFFFFFFFFFFFULL) // if n needs more than 64 bits
2263: error("integer overflow");
2264: }
2265:
2266: // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2267: const unsigned char *psuffix = p;
2268: while (1)
2269: { unsigned char f;
2270:
2271: switch (*p)
2272: { case 'U':
2273: case 'u':
2274: f = FLAGS_unsigned;
2275: goto L1;
2276:
2277: case 'l':
2278: if (1 || !global.params.useDeprecated)
2279: error("'l' suffix is deprecated, use 'L' instead");
2280: case 'L':
2281: f = FLAGS_long;
2282: L1:
2283: p++;
2284: if (flags & f)
2285: error("unrecognized token");
2286: flags = (FLAGS) (flags | f);
2287: continue;
2288: default:
2289: break;
2290: }
2291: break;
2292: }
2293:
2294: if (state == STATE_octal && n >= 8 && !global.params.useDeprecated)
2295: error("octal literals 0%llo%.*s are deprecated, use std.conv.octal!%llo%.*s instead",
2296: n, p - psuffix, psuffix, n, p - psuffix, psuffix);
2297:
2298: switch (flags)
2299: {
2300: case 0:
2301: /* Octal or Hexadecimal constant.
2302: * First that fits: int, uint, long, ulong
2303: */
2304: if (n & 0x8000000000000000LL)
2305: result = TOKuns64v;
2306: else if (n & 0xFFFFFFFF00000000LL)
2307: result = TOKint64v;
2308: else if (n & 0x80000000)
2309: result = TOKuns32v;
2310: else
2311: result = TOKint32v;
2312: break;
2313:
2314: case FLAGS_decimal:
2315: /* First that fits: int, long, long long
2316: */
2317: if (n & 0x8000000000000000LL)
2318: { error("signed integer overflow");
2319: result = TOKuns64v;
2320: }
2321: else if (n & 0xFFFFFFFF80000000LL)
2322: result = TOKint64v;
2323: else
2324: result = TOKint32v;
2325: break;
2326:
2327: case FLAGS_unsigned:
2328: case FLAGS_decimal | FLAGS_unsigned:
2329: /* First that fits: uint, ulong
2330: */
2331: if (n & 0xFFFFFFFF00000000LL)
2332: result = TOKuns64v;
2333: else
2334: result = TOKuns32v;
2335: break;
2336:
2337: case FLAGS_decimal | FLAGS_long:
2338: if (n & 0x8000000000000000LL)
2339: { error("signed integer overflow");
2340: result = TOKuns64v;
2341: }
2342: else
2343: result = TOKint64v;
2344: break;
2345:
2346: case FLAGS_long:
2347: if (n & 0x8000000000000000LL)
2348: result = TOKuns64v;
2349: else
2350: result = TOKint64v;
2351: break;
2352:
2353: case FLAGS_unsigned | FLAGS_long:
2354: case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
2355: result = TOKuns64v;
2356: break;
2357:
2358: default:
2359: #ifdef DEBUG
2360: printf("%x\n",flags);
2361: #endif
2362: assert(0);
2363: }
2364: t->uns64value = n;
2365: return result;
2366: }
2367:
2368: /**************************************
2369: * Read in characters, converting them to real.
2370: * Bugs:
2371: * Exponent overflow not detected.
2372: * Too much requested precision is not detected.
2373: */
2374:
2375: TOK Lexer::inreal(Token *t)
2376: #ifdef __DMC__
2377: __in
2378: {
2379: assert(*p == '.' || isdigit(*p));
2380: }
2381: __out (result)
2382: {
2383: switch (result)
2384: {
2385: case TOKfloat32v:
2386: case TOKfloat64v:
2387: case TOKfloat80v:
2388: case TOKimaginary32v:
2389: case TOKimaginary64v:
2390: case TOKimaginary80v:
2391: break;
2392:
2393: default:
2394: assert(0);
2395: }
2396: }
2397: __body
2398: #endif /* __DMC__ */
2399: { int dblstate;
2400: unsigned c;
2401: char hex; // is this a hexadecimal-floating-constant?
2402: TOK result;
2403:
2404: //printf("Lexer::inreal()\n");
2405: stringbuffer.reset();
2406: dblstate = 0;
2407: hex = 0;
2408: Lnext:
2409: while (1)
2410: {
2411: // Get next char from input
2412: c = *p++;
2413: //printf("dblstate = %d, c = '%c'\n", dblstate, c);
2414: while (1)
2415: {
2416: switch (dblstate)
2417: {
2418: case 0: // opening state
2419: if (c == '0')
2420: dblstate = 9;
2421: else if (c == '.')
2422: dblstate = 3;
2423: else
2424: dblstate = 1;
2425: break;
2426:
2427: case 9:
2428: dblstate = 1;
2429: if (c == 'X' || c == 'x')
2430: { hex++;
2431: break;
2432: }
2433: case 1: // digits to left of .
2434: case 3: // digits to right of .
2435: case 7: // continuing exponent digits
2436: if (!isdigit(c) && !(hex && isxdigit(c)))
2437: {
2438: if (c == '_')
2439: goto Lnext; // ignore embedded '_'
2440: dblstate++;
2441: continue;
2442: }
2443: break;
2444:
2445: case 2: // no more digits to left of .
2446: if (c == '.')
2447: { dblstate++;
2448: break;
2449: }
2450: case 4: // no more digits to right of .
2451: if ((c == 'E' || c == 'e') ||
2452: hex && (c == 'P' || c == 'p'))
2453: { dblstate = 5;
2454: hex = 0; // exponent is always decimal
2455: break;
2456: }
2457: if (hex)
2458: error("binary-exponent-part required");
2459: goto done;
2460:
2461: case 5: // looking immediately to right of E
2462: dblstate++;
2463: if (c == '-' || c == '+')
2464: break;
2465: case 6: // 1st exponent digit expected
2466: if (!isdigit(c))
2467: error("exponent expected");
2468: dblstate++;
2469: break;
2470:
2471: case 8: // past end of exponent digits
2472: goto done;
2473: }
2474: break;
2475: }
2476: stringbuffer.writeByte(c);
2477: }
2478: done:
2479: p--;
2480:
2481: stringbuffer.writeByte(0);
2482:
2483: #if _WIN32 && __DMC__
2484: char *save = __locale_decpoint;
2485: __locale_decpoint = ".";
2486: #endif
2487: #ifdef IN_GCC
2488: t->float80value = real_t::parse((char *)stringbuffer.data, real_t::LongDouble);
2489: #else
2490: t->float80value = strtold((char *)stringbuffer.data, NULL);
2491: #endif
2492: errno = 0;
2493: switch (*p)
2494: {
2495: case 'F':
2496: case 'f':
2497: #ifdef IN_GCC
2498: real_t::parse((char *)stringbuffer.data, real_t::Float);
2499: #else
2500: { // Only interested in errno return
2501: float f = strtof((char *)stringbuffer.data, NULL);
warning C4244: 'initializing' : conversion from 'double' to 'float', possible loss of data
2502: // Assign to f to keep gcc warnings at bay
2503: }
2504: #endif
2505: result = TOKfloat32v;
2506: p++;
2507: break;
2508:
2509: default:
2510: #ifdef IN_GCC
2511: real_t::parse((char *)stringbuffer.data, real_t::Double);
2512: #else
2513: /* Should do our own strtod(), since dmc and linux gcc
2514: * accept 2.22507e-308, while apple gcc will only take
2515: * 2.22508e-308. Not sure who is right.
2516: */
2517: { // Only interested in errno return
2518: double d = strtod((char *)stringbuffer.data, NULL);
2519: // Assign to d to keep gcc warnings at bay
2520: }
2521: #endif
2522: result = TOKfloat64v;
2523: break;
2524:
2525: case 'l':
2526: if (!global.params.useDeprecated)
2527: error("'l' suffix is deprecated, use 'L' instead");
2528: case 'L':
2529: result = TOKfloat80v;
2530: p++;
2531: break;
2532: }
2533: if (*p == 'i' || *p == 'I')
2534: {
2535: if (!global.params.useDeprecated && *p == 'I')
2536: error("'I' suffix is deprecated, use 'i' instead");
2537: p++;
2538: switch (result)
2539: {
2540: case TOKfloat32v:
2541: result = TOKimaginary32v;
2542: break;
2543: case TOKfloat64v:
2544: result = TOKimaginary64v;
2545: break;
2546: case TOKfloat80v:
2547: result = TOKimaginary80v;
2548: break;
2549: }
2550: }
2551: #if _WIN32 && __DMC__
2552: __locale_decpoint = save;
2553: #endif
2554: if (errno == ERANGE)
2555: error("number is not representable");
2556: return result;
2557: }
2558:
2559: /*********************************************
2560: * Do pragma.
2561: * Currently, the only pragma supported is:
2562: * #line linnum [filespec]
2563: */
2564:
2565: void Lexer::pragma()
2566: {
2567: Token tok;
2568: int linnum;
2569: char *filespec = NULL;
2570: Loc loc = this->loc;
2571:
2572: scan(&tok);
2573: if (tok.value != TOKidentifier || tok.ident != Id::line)
2574: goto Lerr;
2575:
2576: scan(&tok);
2577: if (tok.value == TOKint32v || tok.value == TOKint64v)
2578: linnum = tok.uns64value - 1;
warning C4244: '=' : conversion from 'd_uns64' to 'int', possible loss of data
2579: else
2580: goto Lerr;
2581:
2582: while (1)
2583: {
2584: switch (*p)
2585: {
2586: case 0:
2587: case 0x1A:
2588: case '\n':
2589: Lnewline:
2590: this->loc.linnum = linnum;
2591: if (filespec)
2592: this->loc.filename = filespec;
2593: return;
2594:
2595: case '\r':
2596: p++;
2597: if (*p != '\n')
2598: { p--;
2599: goto Lnewline;
2600: }
2601: continue;
2602:
2603: case ' ':
2604: case '\t':
2605: case '\v':
2606: case '\f':
2607: p++;
2608: continue; // skip white space
2609:
2610: case '_':
2611: if (mod && memcmp(p, "__FILE__", 8) == 0)
2612: {
2613: p += 8;
2614: filespec = mem.strdup(loc.filename ? loc.filename : mod->ident->toChars());
2615: }
2616: continue;
2617:
2618: case '"':
2619: if (filespec)
2620: goto Lerr;
2621: stringbuffer.reset();
2622: p++;
2623: while (1)
2624: { unsigned c;
2625:
2626: c = *p;
2627: switch (c)
2628: {
2629: case '\n':
2630: case '\r':
2631: case 0:
2632: case 0x1A:
2633: goto Lerr;
2634:
2635: case '"':
2636: stringbuffer.writeByte(0);
2637: filespec = mem.strdup((char *)stringbuffer.data);
2638: p++;
2639: break;
2640:
2641: default:
2642: if (c & 0x80)
2643: { unsigned u = decodeUTF();
2644: if (u == PS || u == LS)
2645: goto Lerr;
2646: }
2647: stringbuffer.writeByte(c);
2648: p++;
2649: continue;
2650: }
2651: break;
2652: }
2653: continue;
2654:
2655: default:
2656: if (*p & 0x80)
2657: { unsigned u = decodeUTF();
2658: if (u == PS || u == LS)
2659: goto Lnewline;
2660: }
2661: goto Lerr;
2662: }
2663: }
2664:
2665: Lerr:
2666: error(loc, "#line integer [\"filespec\"]\\n expected");
2667: }
2668:
2669:
2670: /********************************************
2671: * Decode UTF character.
2672: * Issue error messages for invalid sequences.
2673: * Return decoded character, advance p to last character in UTF sequence.
2674: */
2675:
2676: unsigned Lexer::decodeUTF()
2677: {
2678: dchar_t u;
2679: unsigned char c;
2680: unsigned char *s = p;
2681: size_t len;
2682: size_t idx;
2683: const char *msg;
2684:
2685: c = *s;
2686: assert(c & 0x80);
2687:
2688: // Check length of remaining string up to 6 UTF-8 characters
2689: for (len = 1; len < 6 && s[len]; len++)
2690: ;
2691:
2692: idx = 0;
2693: msg = utf_decodeChar(s, len, &idx, &u);
2694: p += idx - 1;
2695: if (msg)
2696: {
2697: error("%s", msg);
2698: }
2699: return u;
2700: }
2701:
2702:
2703: /***************************************************
2704: * Parse doc comment embedded between t->ptr and p.
2705: * Remove trailing blanks and tabs from lines.
2706: * Replace all newlines with \n.
2707: * Remove leading comment character from each line.
2708: * Decide if it's a lineComment or a blockComment.
2709: * Append to previous one for this token.
2710: */
2711:
2712: void Lexer::getDocComment(Token *t, unsigned lineComment)
2713: {
2714: /* ct tells us which kind of comment it is: '/', '*', or '+'
2715: */
2716: unsigned char ct = t->ptr[2];
2717:
2718: /* Start of comment text skips over / * *, / + +, or / / /
2719: */
2720: unsigned char *q = t->ptr + 3; // start of comment text
2721:
2722: unsigned char *qend = p;
2723: if (ct == '*' || ct == '+')
2724: qend -= 2;
2725:
2726: /* Scan over initial row of ****'s or ++++'s or ////'s
2727: */
2728: for (; q < qend; q++)
2729: {
2730: if (*q != ct)
2731: break;
2732: }
2733:
2734: /* Remove trailing row of ****'s or ++++'s
2735: */
2736: if (ct != '/')
2737: {
2738: for (; q < qend; qend--)
2739: {
2740: if (qend[-1] != ct)
2741: break;
2742: }
2743: }
2744:
2745: /* Comment is now [q .. qend].
2746: * Canonicalize it into buf[].
2747: */
2748: OutBuffer buf;
2749: int linestart = 0;
2750:
2751: for (; q < qend; q++)
2752: {
2753: unsigned char c = *q;
2754:
2755: switch (c)
2756: {
2757: case '*':
2758: case '+':
2759: if (linestart && c == ct)
2760: { linestart = 0;
2761: /* Trim preceding whitespace up to preceding \n
2762: */
2763: while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2764: buf.offset--;
2765: continue;
2766: }
2767: break;
2768:
2769: case ' ':
2770: case '\t':
2771: break;
2772:
2773: case '\r':
2774: if (q[1] == '\n')
2775: continue; // skip the \r
2776: goto Lnewline;
2777:
2778: default:
2779: if (c == 226)
2780: {
2781: // If LS or PS
2782: if (q[1] == 128 &&
2783: (q[2] == 168 || q[2] == 169))
2784: {
2785: q += 2;
2786: goto Lnewline;
2787: }
2788: }
2789: linestart = 0;
2790: break;
2791:
2792: Lnewline:
2793: c = '\n'; // replace all newlines with \n
2794: case '\n':
2795: linestart = 1;
2796:
2797: /* Trim trailing whitespace
2798: */
2799: while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2800: buf.offset--;
2801:
2802: break;
2803: }
2804: buf.writeByte(c);
2805: }
2806:
2807: // Always end with a newline
2808: if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2809: buf.writeByte('\n');
2810:
2811: buf.writeByte(0);
2812:
2813: // It's a line comment if the start of the doc comment comes
2814: // after other non-whitespace on the same line.
2815: unsigned char** dc = (lineComment && anyToken)
2816: ? &t->lineComment
2817: : &t->blockComment;
2818:
2819: // Combine with previous doc comment, if any
2820: if (*dc)
2821: *dc = combineComments(*dc, (unsigned char *)buf.data);
2822: else
2823: *dc = (unsigned char *)buf.extractData();
2824: }
2825:
2826: /********************************************
2827: * Combine two document comments into one,
2828: * separated by a newline.
2829: */
2830:
2831: unsigned char *Lexer::combineComments(unsigned char *c1, unsigned char *c2)
2832: {
2833: //printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
2834:
2835: unsigned char *c = c2;
2836:
2837: if (c1)
2838: { c = c1;
2839: if (c2)
2840: { size_t len1 = strlen((char *)c1);
2841: size_t len2 = strlen((char *)c2);
2842:
2843: c = (unsigned char *)mem.malloc(len1 + 1 + len2 + 1);
2844: memcpy(c, c1, len1);
2845: if (len1 && c1[len1 - 1] != '\n')
2846: { c[len1] = '\n';
2847: len1++;
2848: }
2849: memcpy(c + len1, c2, len2);
2850: c[len1 + len2] = 0;
2851: }
2852: }
2853: return c;
2854: }
2855:
2856: /********************************************
2857: * Create an identifier in the string table.
2858: */
2859:
2860: Identifier *Lexer::idPool(const char *s)
2861: {
2862: size_t len = strlen(s);
2863: StringValue *sv = stringtable.update(s, len);
2864: Identifier *id = (Identifier *) sv->ptrvalue;
2865: if (!id)
2866: {
2867: id = new Identifier(sv->lstring.string, TOKidentifier);
2868: sv->ptrvalue = id;
2869: }
2870: return id;
2871: }
2872:
2873: /*********************************************
2874: * Create a unique identifier using the prefix s.
2875: */
2876:
2877: Identifier *Lexer::uniqueId(const char *s, int num)
2878: { char buffer[32];
2879: size_t slen = strlen(s);
2880:
2881: assert(slen + sizeof(num) * 3 + 1 <= sizeof(buffer));
2882: sprintf(buffer, "%s%d", s, num);
warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\stdio.h(371) : see declaration of 'sprintf'
2883: return idPool(buffer);
2884: }
2885:
2886: Identifier *Lexer::uniqueId(const char *s)
2887: {
2888: static int num;
2889: return uniqueId(s, ++num);
2890: }
2891:
2892: /****************************************
2893: */
2894:
2895: struct Keyword
2896: { const char *name;
2897: enum TOK value;
2898: };
2899:
2900: static Keyword keywords[] =
2901: {
2902: // { "", TOK },
2903:
2904: { "this", TOKthis },
2905: { "super", TOKsuper },
2906: { "assert", TOKassert },
2907: { "null", TOKnull },
2908: { "true", TOKtrue },
2909: { "false", TOKfalse },
2910: { "cast", TOKcast },
2911: { "new", TOKnew },
2912: { "delete", TOKdelete },
2913: { "throw", TOKthrow },
2914: { "module", TOKmodule },
2915: { "pragma", TOKpragma },
2916: { "typeof", TOKtypeof },
2917: { "typeid", TOKtypeid },
2918:
2919: { "template", TOKtemplate },
2920:
2921: { "void", TOKvoid },
2922: { "byte", TOKint8 },
2923: { "ubyte", TOKuns8 },
2924: { "short", TOKint16 },
2925: { "ushort", TOKuns16 },
2926: { "int", TOKint32 },
2927: { "uint", TOKuns32 },
2928: { "long", TOKint64 },
2929: { "ulong", TOKuns64 },
2930: { "cent", TOKcent, },
2931: { "ucent", TOKucent, },
2932: { "float", TOKfloat32 },
2933: { "double", TOKfloat64 },
2934: { "real", TOKfloat80 },
2935:
2936: { "bool", TOKbool },
2937: { "char", TOKchar },
2938: { "wchar", TOKwchar },
2939: { "dchar", TOKdchar },
2940:
2941: { "ifloat", TOKimaginary32 },
2942: { "idouble", TOKimaginary64 },
2943: { "ireal", TOKimaginary80 },
2944:
2945: { "cfloat", TOKcomplex32 },
2946: { "cdouble", TOKcomplex64 },
2947: { "creal", TOKcomplex80 },
2948:
2949: { "delegate", TOKdelegate },
2950: { "function", TOKfunction },
2951:
2952: { "is", TOKis },
2953: { "if", TOKif },
2954: { "else", TOKelse },
2955: { "while", TOKwhile },
2956: { "for", TOKfor },
2957: { "do", TOKdo },
2958: { "switch", TOKswitch },
2959: { "case", TOKcase },
2960: { "default", TOKdefault },
2961: { "break", TOKbreak },
2962: { "continue", TOKcontinue },
2963: { "synchronized", TOKsynchronized },
2964: { "return", TOKreturn },
2965: { "goto", TOKgoto },
2966: { "try", TOKtry },
2967: { "catch", TOKcatch },
2968: { "finally", TOKfinally },
2969: { "with", TOKwith },
2970: { "asm", TOKasm },
2971: { "foreach", TOKforeach },
2972: { "foreach_reverse", TOKforeach_reverse },
2973: { "scope", TOKscope },
2974:
2975: { "struct", TOKstruct },
2976: { "class", TOKclass },
2977: { "interface", TOKinterface },
2978: { "union", TOKunion },
2979: { "enum", TOKenum },
2980: { "import", TOKimport },
2981: { "mixin", TOKmixin },
2982: { "static", TOKstatic },
2983: { "final", TOKfinal },
2984: { "const", TOKconst },
2985: { "typedef", TOKtypedef },
2986: { "alias", TOKalias },
2987: { "override", TOKoverride },
2988: { "abstract", TOKabstract },
2989: { "volatile", TOKvolatile },
2990: { "debug", TOKdebug },
2991: { "deprecated", TOKdeprecated },
2992: { "in", TOKin },
2993: { "out", TOKout },
2994: { "inout", TOKinout },
2995: { "lazy", TOKlazy },
2996: { "auto", TOKauto },
2997:
2998: { "align", TOKalign },
2999: { "extern", TOKextern },
3000: { "private", TOKprivate },
3001: { "package", TOKpackage },
3002: { "protected", TOKprotected },
3003: { "public", TOKpublic },
3004: { "export", TOKexport },
3005:
3006: { "body", TOKbody },
3007: { "invariant", TOKinvariant },
3008: { "unittest", TOKunittest },
3009: { "version", TOKversion },
3010: //{ "manifest", TOKmanifest },
3011:
3012: // Added after 1.0
3013: { "__argTypes", TOKargTypes },
3014: { "ref", TOKref },
3015: { "macro", TOKmacro },
3016: #if DMDV2
3017: { "pure", TOKpure },
3018: { "nothrow", TOKnothrow },
3019: { "__thread", TOKtls },
3020: { "__gshared", TOKgshared },
3021: { "__traits", TOKtraits },
3022: { "__overloadset", TOKoverloadset },
3023: { "__FILE__", TOKfile },
3024: { "__LINE__", TOKline },
3025: { "shared", TOKshared },
3026: { "immutable", TOKimmutable },
3027: #endif
3028: };
3029:
3030: int Token::isKeyword()
3031: {
3032: for (unsigned u = 0; u < sizeof(keywords) / sizeof(keywords[0]); u++)
3033: {
3034: if (keywords[u].value == value)
3035: return 1;
3036: }
3037: return 0;
3038: }
3039:
3040: void Lexer::initKeywords()
3041: { StringValue *sv;
3042: unsigned u;
3043: enum TOK v;
3044: unsigned nkeywords = sizeof(keywords) / sizeof(keywords[0]);
3045:
3046: if (global.params.Dversion == 1)
3047: nkeywords -= 2;
3048:
3049: cmtable_init();
3050:
3051: for (u = 0; u < nkeywords; u++)
3052: { const char *s;
3053:
3054: //printf("keyword[%d] = '%s'\n",u, keywords[u].name);
3055: s = keywords[u].name;
3056: v = keywords[u].value;
3057: sv = stringtable.insert(s, strlen(s));
3058: sv->ptrvalue = (void *) new Identifier(sv->lstring.string,v);
3059:
3060: //printf("tochars[%d] = '%s'\n",v, s);
3061: Token::tochars[v] = s;
3062: }
3063:
3064: Token::tochars[TOKeof] = "EOF";
3065: Token::tochars[TOKlcurly] = "{";
3066: Token::tochars[TOKrcurly] = "}";
3067: Token::tochars[TOKlparen] = "(";
3068: Token::tochars[TOKrparen] = ")";
3069: Token::tochars[TOKlbracket] = "[";
3070: Token::tochars[TOKrbracket] = "]";
3071: Token::tochars[TOKsemicolon] = ";";
3072: Token::tochars[TOKcolon] = ":";
3073: Token::tochars[TOKcomma] = ",";
3074: Token::tochars[TOKdot] = ".";
3075: Token::tochars[TOKxor] = "^";
3076: Token::tochars[TOKxorass] = "^=";
3077: Token::tochars[TOKassign] = "=";
3078: Token::tochars[TOKconstruct] = "=";
3079: #if DMDV2
3080: Token::tochars[TOKblit] = "=";
3081: #endif
3082: Token::tochars[TOKlt] = "<";
3083: Token::tochars[TOKgt] = ">";
3084: Token::tochars[TOKle] = "<=";
3085: Token::tochars[TOKge] = ">=";
3086: Token::tochars[TOKequal] = "==";
3087: Token::tochars[TOKnotequal] = "!=";
3088: Token::tochars[TOKnotidentity] = "!is";
3089: Token::tochars[TOKtobool] = "!!";
3090:
3091: Token::tochars[TOKunord] = "!<>=";
3092: Token::tochars[TOKue] = "!<>";
3093: Token::tochars[TOKlg] = "<>";
3094: Token::tochars[TOKleg] = "<>=";
3095: Token::tochars[TOKule] = "!>";
3096: Token::tochars[TOKul] = "!>=";
3097: Token::tochars[TOKuge] = "!<";
3098: Token::tochars[TOKug] = "!<=";
3099:
3100: Token::tochars[TOKnot] = "!";
3101: Token::tochars[TOKtobool] = "!!";
3102: Token::tochars[TOKshl] = "<<";
3103: Token::tochars[TOKshr] = ">>";
3104: Token::tochars[TOKushr] = ">>>";
3105: Token::tochars[TOKadd] = "+";
3106: Token::tochars[TOKmin] = "-";
3107: Token::tochars[TOKmul] = "*";
3108: Token::tochars[TOKdiv] = "/";
3109: Token::tochars[TOKmod] = "%";
3110: Token::tochars[TOKslice] = "..";
3111: Token::tochars[TOKdotdotdot] = "...";
3112: Token::tochars[TOKand] = "&";
3113: Token::tochars[TOKandand] = "&&";
3114: Token::tochars[TOKor] = "|";
3115: Token::tochars[TOKoror] = "||";
3116: Token::tochars[TOKarray] = "[]";
3117: Token::tochars[TOKindex] = "[i]";
3118: Token::tochars[TOKaddress] = "&";
3119: Token::tochars[TOKstar] = "*";
3120: Token::tochars[TOKtilde] = "~";
3121: Token::tochars[TOKdollar] = "$";
3122: Token::tochars[TOKcast] = "cast";
3123: Token::tochars[TOKplusplus] = "++";
3124: Token::tochars[TOKminusminus] = "--";
3125: Token::tochars[TOKpreplusplus] = "++";
3126: Token::tochars[TOKpreminusminus] = "--";
3127: Token::tochars[TOKtype] = "type";
3128: Token::tochars[TOKquestion] = "?";
3129: Token::tochars[TOKneg] = "-";
3130: Token::tochars[TOKuadd] = "+";
3131: Token::tochars[TOKvar] = "var";
3132: Token::tochars[TOKaddass] = "+=";
3133: Token::tochars[TOKminass] = "-=";
3134: Token::tochars[TOKmulass] = "*=";
3135: Token::tochars[TOKdivass] = "/=";
3136: Token::tochars[TOKmodass] = "%=";
3137: Token::tochars[TOKshlass] = "<<=";
3138: Token::tochars[TOKshrass] = ">>=";
3139: Token::tochars[TOKushrass] = ">>>=";
3140: Token::tochars[TOKandass] = "&=";
3141: Token::tochars[TOKorass] = "|=";
3142: Token::tochars[TOKcatass] = "~=";
3143: Token::tochars[TOKcat] = "~";
3144: Token::tochars[TOKcall] = "call";
3145: Token::tochars[TOKidentity] = "is";
3146: Token::tochars[TOKnotidentity] = "!is";
3147:
3148: Token::tochars[TOKorass] = "|=";
3149: Token::tochars[TOKidentifier] = "identifier";
3150: #if DMDV2
3151: Token::tochars[TOKat] = "@";
3152: Token::tochars[TOKpow] = "^^";
3153: Token::tochars[TOKpowass] = "^^=";
3154: #endif
3155:
3156: // For debugging
3157: Token::tochars[TOKerror] = "error";
3158: Token::tochars[TOKdotexp] = "dotexp";
3159: Token::tochars[TOKdotti] = "dotti";
3160: Token::tochars[TOKdotvar] = "dotvar";
3161: Token::tochars[TOKdottype] = "dottype";
3162: Token::tochars[TOKsymoff] = "symoff";
3163: Token::tochars[TOKarraylength] = "arraylength";
3164: Token::tochars[TOKarrayliteral] = "arrayliteral";
3165: Token::tochars[TOKassocarrayliteral] = "assocarrayliteral";
3166: Token::tochars[TOKstructliteral] = "structliteral";
3167: Token::tochars[TOKstring] = "string";
3168: Token::tochars[TOKdsymbol] = "symbol";
3169: Token::tochars[TOKtuple] = "tuple";
3170: Token::tochars[TOKdeclaration] = "declaration";
3171: Token::tochars[TOKdottd] = "dottd";
3172: Token::tochars[TOKon_scope_exit] = "scope(exit)";
3173: Token::tochars[TOKon_scope_success] = "scope(success)";
3174: Token::tochars[TOKon_scope_failure] = "scope(failure)";
3175:
3176: #if UNITTEST
3177: unittest_lexer();
3178: #endif
3179: }
3180:
3181: #if UNITTEST
3182:
3183: void unittest_lexer()
3184: {
3185: //printf("unittest_lexer()\n");
3186:
3187: /* Not much here, just trying things out.
3188: */
3189: const unsigned char text[] = "int";
3190: Lexer lex1(NULL, (unsigned char *)text, 0, sizeof(text), 0, 0);
3191: TOK tok;
3192: tok = lex1.nextToken();
3193: //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOKint32);
3194: assert(tok == TOKint32);
3195: tok = lex1.nextToken();
3196: assert(tok == TOKeof);
3197: tok = lex1.nextToken();
3198: assert(tok == TOKeof);
3199: }
3200:
3201: #endif
3202:
3203: