1:
2: // Copyright (c) 1999-2009 by Digital Mars
3: // All Rights Reserved
4: // written by Walter Bright
5: // http://www.digitalmars.com
6: // License for redistribution is by either the Artistic License
7: // in artistic.txt, or the GNU General Public License in gpl.txt.
8: // See the included readme.txt for details.
9:
10:
11: /* HTML parser
12: */
13:
14: #include <stdio.h>
15: #include <string.h>
16: #include <ctype.h>
17: #include <stdarg.h>
18: #include <errno.h>
19: #include <wchar.h>
20:
21: #include "html.h"
22:
23: #if MARS
24: static char __file__[] = __FILE__; /* for tassert.h */
25: #include "tassert.h"
26: #include "root.h"
27: //#include "../mars/mars.h"
28: #else
29: #include "outbuf.h"
30: #include "msgs2.h"
31:
32: extern void html_err(const char *, unsigned, unsigned, ...);
33:
34: static char __file__[] = __FILE__; /* for tassert.h */
35: #include "tassert.h"
36: #endif
37:
#if __GNUC__
// Prototype for the case-insensitive memory compare used by this file when
// building with GCC.  Only the declaration is compiled; the definition is
// presumably supplied by another translation unit (TODO confirm).
int memicmp(const char *s1, const char *s2, int n);
#if 0
{
    // Reference implementation (compiled out): compare n bytes, folding
    // 'A'..'Z' onto 'a'..'z' only when the raw bytes differ.
    for (int i = 0; i < n; i++)
    {
        char a = s1[i];
        char b = s2[i];

        if (a == b)
            continue;

        if ('A' <= a && a <= 'Z')
            a += 'a' - 'A';
        if ('A' <= b && b <= 'Z')
            b += 'a' - 'A';

        int diff = a - b;
        if (diff)
            return diff;
    }
    return 0;
}
#endif
#endif
64:
65: extern int HtmlNamedEntity(unsigned char *p, int length);
66:
67: static int isLineSeparator(const unsigned char* p);
68:
69: /**********************************
70: * Determine if beginning of tag identifier
71: * or a continuation of a tag identifier.
72: */
73:
inline int istagstart(int c)
{
    // A tag identifier may begin with an underscore or any alphabetic
    // character (as classified by isalpha()).
    if (c == '_')
        return 1;
    return isalpha(c) ? 1 : 0;
}
78:
inline int istag(int c)
{
    // Characters after the first may additionally be digits: accept an
    // underscore or anything isalnum() classifies as alphanumeric.
    if (c == '_')
        return 1;
    return isalnum(c) ? 1 : 0;
}
83:
84: /**********************************************
85: */
86:
87: Html::Html(const char *sourcename, unsigned char *base, unsigned length)
88: {
89: //printf("Html::Html()\n");
90: this->sourcename = sourcename;
91: this->base = base;
92: p = base;
93: end = base + length;
94: linnum = 1;
95: dbuf = NULL;
96: inCode = 0;
97: }
98:
99: /**********************************************
100: * Print error & quit.
101: */
102:
103: void Html::error(const char *format, ...)
104: {
105: printf("%s(%d) : HTML Error: ", sourcename, linnum);
106:
107: va_list ap;
108: va_start(ap, format);
109: vprintf(format, ap);
110: va_end(ap);
111:
112: printf("\n");
113: fflush(stdout);
114:
115: //#if MARS
116: // global.errors++;
117: //#else
118: exit(EXIT_FAILURE);
119: //#endif
120: }
121:
122: /**********************************************
123: * Extract all the code from an HTML file,
124: * concatenate it all together, and store in buf.
125: */
126:
127: #if MARS
128: void Html::extractCode(OutBuffer *buf)
129: #else
130: void Html::extractCode(Outbuffer *buf)
131: #endif
132: {
133: //printf("Html::extractCode()\n");
134: dbuf = buf; // save for other routines
135: buf->reserve(end - p);
136: inCode = 0;
137: while (1)
138: {
139: //printf("p = %p, *p = x%x\n", p, *p);
140: switch (*p)
141: {
142: #if 0 // strings are not recognized outside of tags
143: case '"':
144: case '\'':
145: skipString();
146: continue;
147: #endif
148: case '<':
149: if (p[1] == '!' && isCommentStart())
150: { // Comments start with <!--
151: scanComment();
152: }
153: else if(p[1] == '!' && isCDATAStart())
154: {
155: scanCDATA();
156: }
157: else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
158: skipTag();
159: else if (istagstart(*skipWhite(p + 1)))
160: skipTag();
161: else
162: goto Ldefault;
163: continue;
164:
165: case 0:
166: case 0x1a:
167: break; // end of file
168:
169: case '&':
170: if (inCode)
171: { // Translate character entity into ascii for D parser
172: int c;
173:
174: c = charEntity();
175: #if MARS
176: buf->writeUTF8(c);
177: #else
178: buf->writeByte(c);
179: #endif
180: }
181: else
182: p++;
183: continue;
184:
185: case '\r':
186: if (p[1] == '\n')
187: goto Ldefault;
188: case '\n':
189: linnum++;
190: // Always extract new lines, so that D lexer counts the
191: // lines right.
192: buf->writeByte(*p);
193: p++;
194: continue;
195:
196: default:
197: Ldefault:
198: if (inCode)
199: buf->writeByte(*p);
200: p++;
201: continue;
202: }
203: break;
204: }
205: buf->writeByte(0); // ending sentinel
206: #if SCPP
207: //printf("Code is: '%s'\n", buf->toString() + 3);
208: #endif
209: #if MARS
210: //printf("D code is: '%s'\n", (char *)buf->data);
211: #endif
212: }
213:
214: /***********************************************
215: * Scan to end of <> tag.
216: * Look for <code> and </code> tags to start/stop D processing.
217: * Input:
218: * p is on opening '<' of tag; it's already verified that
219: * it's a tag by lookahead
220: * Output:
221: * p is past closing '>' of tag
222: */
223:
224: void Html::skipTag()
225: {
226: enum TagState // what parsing state we're in
227: {
228: TStagstart, // start of tag name
229: TStag, // in a tag name
230: TSrest, // following tag name
231: };
232: enum TagState state = TStagstart;
233: int inot;
234: unsigned char *tagstart = NULL;
235: int taglen = 0;
236:
237: p++;
238: inot = 0;
239: if (*p == '/')
240: { inot = 1;
241: p++;
242: }
243: while (1)
244: {
245: switch (*p)
246: {
247: case '>': // found end of tag
248: p++;
249: break;
250:
251: case '"':
252: case '\'':
253: state = TSrest;
254: skipString();
255: continue;
256:
257: case '<':
258: if (p[1] == '!' && isCommentStart())
259: { // Comments start with <!--
260: scanComment();
261: }
262: else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
263: { error("nested tag");
264: skipTag();
265: }
266: else if (istagstart(*skipWhite(p + 1)))
267: { error("nested tag");
268: skipTag();
269: }
270: // Treat comments as if they were whitespace
271: state = TSrest;
272: continue;
273:
274: case 0:
275: case 0x1a:
276: error("end of file before end of tag");
277: break; // end of file
278:
279: case '\r':
280: if (p[1] == '\n')
281: goto Ldefault;
282: case '\n':
283: linnum++;
284: // Always extract new lines, so that code lexer counts the
285: // lines right.
286: dbuf->writeByte(*p);
287: state = TSrest; // end of tag
288: p++;
289: continue;
290:
291: case ' ':
292: case '\t':
293: case '\f':
294: case '\v':
295: if (state == TStagstart)
296: { p++;
297: continue;
298: }
299: default:
300: Ldefault:
301: switch (state)
302: {
303: case TStagstart: // start of tag name
304: assert(istagstart(*p));
305: state = TStag;
306: tagstart = p;
307: taglen = 0;
308: break;
309:
310: case TStag:
311: if (istag(*p))
312: { // Continuing tag name
313: taglen++;
314: }
315: else
316: { // End of tag name
317: state = TSrest;
318: }
319: break;
320:
321: case TSrest:
322: break;
323: }
324: p++;
325: continue;
326: }
327: break;
328: }
329:
330: // See if we parsed a <code> or </code> tag
331: if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0
332: && *(p - 2) != '/') // ignore "<code />" (XHTML)
warning C4996: 'memicmp': The POSIX name for this item is deprecated. Instead, use the ISO C++ conformant name: _memicmp. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\string.h(93) : see declaration of 'memicmp'
333: {
334: if (inot)
335: { inCode--;
336: if (inCode < 0)
337: inCode = 0; // ignore extra </code>'s
338: }
339: else
340: inCode++;
341: }
342: }
343:
344: /***********************************************
345: * Scan to end of attribute string.
346: */
347:
348: void Html::skipString()
349: {
350: int tc = *p;
351:
352: while (1)
353: {
354: p++;
355: switch (*p)
356: {
357: case '"':
358: case '\'':
359: if (*p == tc)
360: { p++;
361: break;
362: }
363: continue;
364:
365: case '\r':
366: if (p[1] == '\n')
367: goto Ldefault;
368: case '\n':
369: linnum++;
370: // Always extract new lines, so that D lexer counts the
371: // lines right.
372: dbuf->writeByte(*p);
373: continue;
374:
375: case 0:
376: case 0x1a:
377: Leof:
warning C4102: 'Leof' : unreferenced label
378: error("end of file before closing %c of string", tc);
379: break;
380:
381: default:
382: Ldefault:
383: continue;
384: }
385: break;
386: }
387: }
388:
389: /*********************************
390: * If p points to any white space, skip it
391: * and return pointer just past it.
392: */
393:
394: unsigned char *Html::skipWhite(unsigned char *q)
395: {
396: for (; 1; q++)
397: {
398: switch (*q)
399: {
400: case ' ':
401: case '\t':
402: case '\f':
403: case '\v':
404: case '\r':
405: case '\n':
406: continue;
407:
408: default:
409: break;
410: }
411: break;
412: }
413: return q;
414: }
415:
416: /***************************************************
417: * Scan to end of comment.
418: * Comments are defined any of a number of ways.
419: * IE 5.0: <!-- followed by >
420: * "HTML The Definitive Guide": <!-- text with at least one space in it -->
421: * Netscape: <!-- --> comments nest
422: * w3c: whitespace can appear between -- and > of comment close
423: */
424:
425: void Html::scanComment()
426: {
427: // Most of the complexity is dealing with the case that
428: // an arbitrary amount of whitespace can appear between
429: // the -- and the > of a comment close.
430: int scangt = 0;
431:
432: //printf("scanComment()\n");
433: if (*p == '\n')
434: { linnum++;
435: // Always extract new lines, so that D lexer counts the
436: // lines right.
437: dbuf->writeByte(*p);
438: }
439: while (1)
440: {
441: //scangt = 1; // IE 5.0 compatibility
442: p++;
443: switch (*p)
444: {
445: case '-':
446: if (p[1] == '-')
447: {
448: if (p[2] == '>') // optimize for most common case
449: {
450: p += 3;
451: break;
452: }
453: p++;
454: scangt = 1;
455: }
456: else
457: scangt = 0;
458: continue;
459:
460: case '>':
461: if (scangt)
462: { // found -->
463: p++;
464: break;
465: }
466: continue;
467:
468: case ' ':
469: case '\t':
470: case '\f':
471: case '\v':
472: // skip white space
473: continue;
474:
475: case '\r':
476: if (p[1] == '\n')
477: goto Ldefault;
478: case '\n':
479: linnum++; // remember to count lines
480: // Always extract new lines, so that D lexer counts the
481: // lines right.
482: dbuf->writeByte(*p);
483: continue;
484:
485: case 0:
486: case 0x1a:
487: error("end of file before closing --> of comment");
488: break;
489:
490: default:
491: Ldefault:
492: scangt = 0; // it's not -->
493: continue;
494: }
495: break;
496: }
497: //printf("*p = '%c'\n", *p);
498: }
499:
500: /********************************************
501: * Determine if we are at the start of a comment.
502: * Input:
503: * p is on the opening '<'
504: * Returns:
505: * 0 if not start of a comment
506: * 1 if start of a comment, p is adjusted to point past --
507: */
508:
509: int Html::isCommentStart()
510: #ifdef __DMC__
511: __out(result)
512: {
513: if (result == 0)
514: ;
515: else if (result == 1)
516: {
517: assert(p[-2] == '-' && p[-1] == '-');
518: }
519: else
520: assert(0);
521: }
522: __body
523: #endif /* __DMC__ */
524: { unsigned char *s;
525:
526: if (p[0] == '<' && p[1] == '!')
527: {
528: for (s = p + 2; 1; s++)
529: {
530: switch (*s)
531: {
532: case ' ':
533: case '\t':
534: case '\r':
535: case '\f':
536: case '\v':
537: // skip white space, even though spec says no
538: // white space is allowed
539: continue;
540:
541: case '-':
542: if (s[1] == '-')
543: {
544: p = s + 2;
545: return 1;
546: }
547: goto No;
548:
549: default:
550: goto No;
551: }
552: }
553: }
554: No:
555: return 0;
556: }
557:
558: int Html::isCDATAStart()
559: {
560: const char * CDATA_START_MARKER = "<![CDATA[";
561: size_t len = strlen(CDATA_START_MARKER);
562:
563: if (strncmp((char*)p, CDATA_START_MARKER, len) == 0)
564: {
565: p += len;
566: return 1;
567: }
568: else
569: {
570: return 0;
571: }
572: }
573:
574: void Html::scanCDATA()
575: {
576: while(*p && *p != 0x1A)
577: {
578: int lineSepLength = isLineSeparator(p);
579: if (lineSepLength>0)
580: {
581: /* Always extract new lines, so that D lexer counts the lines
582: * right.
583: */
584: linnum++;
585: dbuf->writeByte('\n');
586: p += lineSepLength;
587: continue;
588: }
589: else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
590: {
591: /* end of CDATA section */
592: p += 3;
593: return;
594: }
595: else if (inCode)
596: {
597: /* this CDATA section contains D code */
598: dbuf->writeByte(*p);
599: }
600:
601: p++;
602: }
603: }
604:
605:
606: /********************************************
607: * Convert an HTML character entity into a character.
608: * Forms are:
609: * &name; named entity
610: * &#ddd; decimal
611: * &#xhhhh; hex
612: * Input:
613: * p is on the &
614: */
615:
616: int Html::charEntity()
617: { int c = 0;
618: int v;
619: int hex;
620: unsigned char *pstart = p;
621:
622: //printf("Html::charEntity('%c')\n", *p);
623: if (p[1] == '#')
624: {
625: p++;
626: if (p[1] == 'x' || p[1] == 'X')
627: { p++;
628: hex = 1;
629: }
630: else
631: hex = 0;
632: if (p[1] == ';')
633: goto Linvalid;
634: while (1)
635: {
636: p++;
637: switch (*p)
638: {
639: case 0:
640: case 0x1a:
641: error("end of file before end of character entity");
642: goto Lignore;
643:
644: case '\n':
645: case '\r':
646: case '<': // tag start
647: // Termination is assumed
648: break;
649:
650: case ';':
651: // Termination is explicit
652: p++;
653: break;
654:
655: case '0': case '1': case '2': case '3': case '4':
656: case '5': case '6': case '7': case '8': case '9':
657: v = *p - '0';
658: goto Lvalue;
659:
660: case 'a': case 'b': case 'c':
661: case 'd': case 'e': case 'f':
662: if (!hex)
663: goto Linvalid;
664: v = (*p - 'a') + 10;
665: goto Lvalue;
666:
667: case 'A': case 'B': case 'C':
668: case 'D': case 'E': case 'F':
669: if (!hex)
670: goto Linvalid;
671: v = (*p - 'A') + 10;
672: goto Lvalue;
673:
674: Lvalue:
675: if (hex)
676: c = (c << 4) + v;
677: else
678: c = (c * 10) + v;
679: if (c > 0x10FFFF)
680: {
681: error("character entity out of range");
682: goto Lignore;
683: }
684: continue;
685:
686: default:
687: Linvalid:
688: error("invalid numeric character reference");
689: goto Lignore;
690: }
691: break;
692: }
693: }
694: else
695: {
696: // It's a named entity; gather all characters until ;
697: unsigned char *idstart = p + 1;
698:
699: while (1)
700: {
701: p++;
702: switch (*p)
703: {
704: case 0:
705: case 0x1a:
706: error("end of file before end of character entity");
707: break;
708:
709: case '\n':
710: case '\r':
711: case '<': // tag start
712: // Termination is assumed
713: c = HtmlNamedEntity(idstart, p - idstart);
714: if (c == -1)
715: goto Lignore;
716: break;
717:
718: case ';':
719: // Termination is explicit
720: c = HtmlNamedEntity(idstart, p - idstart);
721: if (c == -1)
722: goto Lignore;
723: p++;
724: break;
725:
726: default:
727: continue;
728: }
729: break;
730: }
731: }
732:
733: // Kludge to convert non-breaking space to ascii space
734: if (c == 160)
735: c = ' ';
736:
737: return c;
738:
739: Lignore:
740: //printf("Lignore\n");
741: p = pstart + 1;
742: return '&';
743: }
744:
745: /**
746: * identify DOS, Linux, Mac, Next and Unicode line endings
747: * 0 if this is no line separator
748: * >0 the length of the separator
749: * Note: input has to be UTF-8
750: */
static int isLineSeparator(const unsigned char* p)
{
    // Classifies the bytes at p by their first byte and returns the length
    // in bytes of the line separator found there, or 0 if there is none.
    // Bytes after the first are only read once the preceding byte matched.
    switch (p[0])
    {
        case '\n':              // LF (Linux)
            return 1;

        case '\r':              // CR LF (DOS) or a lone CR (Mac)
            return (p[1] == '\n') ? 2 : 1;

        case 0xE2:              // E2 80 A8 / E2 80 A9: Unicode line / paragraph separator
            if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
                return 3;
            return 0;

        case 0xC2:              // C2 85: next line (NEL)
            return (p[1] == 0x85) ? 2 : 0;

        default:
            return 0;
    }
}
771:
772:
773: