1 /* streamio.c -- handles character stream I/O
3 (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
6 Wrapper around Tidy input source and output sink
7 that calls appropriate interfaces, and applies
8 necessary char encoding transformations: to/from
9 ISO-10646 and/or UTF-8.
23 #ifdef TIDY_WIN32_MLANG_SUPPORT
27 /************************
28 ** Forward Declarations
29 ************************/
31 static uint ReadCharFromStream( StreamIn* in );
33 static uint ReadByte( StreamIn* in );
34 static void UngetByte( StreamIn* in, uint byteValue );
36 static void PutByte( uint byteValue, StreamOut* out );
38 static void EncodeWin1252( uint c, StreamOut* out );
39 static void EncodeMacRoman( uint c, StreamOut* out );
40 static void EncodeIbm858( uint c, StreamOut* out );
41 static void EncodeLatin0( uint c, StreamOut* out );
43 static uint DecodeIbm850(uint c);
44 static uint DecodeLatin0(uint c);
46 static uint PopChar( StreamIn *in );
48 /******************************
49 ** Static (duration) Globals
50 ******************************/
52 static StreamOut stderrStreamOut =
57 #ifdef TIDY_WIN32_MLANG_SUPPORT
61 { 0, TY_(filesink_putByte) }
64 static StreamOut stdoutStreamOut =
69 #ifdef TIDY_WIN32_MLANG_SUPPORT
73 { 0, TY_(filesink_putByte) }
76 StreamOut* TY_(StdErrOutput)(void)
78 if ( stderrStreamOut.sink.sinkData == 0 )
79 stderrStreamOut.sink.sinkData = stderr;
80 return &stderrStreamOut;
84 StreamOut* TY_(StdOutOutput)(void)
86 if ( stdoutStreamOut.sink.sinkData == 0 )
87 stdoutStreamOut.sink.sinkData = stdout;
88 return &stdoutStreamOut;
92 void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out )
94 if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
96 if ( out->iotype == FileIO )
97 fclose( (FILE*) out->sink.sinkData );
98 TidyDocFree( doc, out );
102 /************************
104 ************************/
106 static void InitLastPos( StreamIn *in );
108 StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding )
110 StreamIn *in = (StreamIn*) TidyDocAlloc( doc, sizeof(StreamIn) );
112 TidyClearMemory( in, sizeof(StreamIn) );
115 in->encoding = encoding;
116 in->state = FSM_ASCII;
118 in->bufsize = CHARBUF_SIZE;
119 in->allocator = doc->allocator;
120 in->charbuf = (tchar*)TidyDocAlloc(doc, sizeof(tchar) * in->bufsize);
122 #ifdef TIDY_STORE_ORIGINAL_TEXT
130 void TY_(freeStreamIn)(StreamIn* in)
132 #ifdef TIDY_STORE_ORIGINAL_TEXT
134 TidyFree(in->allocator, in->otextbuf);
136 TidyFree(in->allocator, in->charbuf);
137 TidyFree(in->allocator, in);
140 StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding )
142 StreamIn *in = TY_(initStreamIn)( doc, encoding );
143 if ( TY_(initFileSource)( doc->allocator, &in->source, fp ) != 0 )
145 TY_(freeStreamIn)( in );
152 StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
154 StreamIn *in = TY_(initStreamIn)( doc, encoding );
155 tidyInitInputBuffer( &in->source, buf );
156 in->iotype = BufferIO;
160 StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding )
162 StreamIn *in = TY_(initStreamIn)( doc, encoding );
163 memcpy( &in->source, source, sizeof(TidyInputSource) );
168 int TY_(ReadBOMEncoding)(StreamIn *in)
171 #if SUPPORT_UTF16_ENCODINGS
176 if (c == EndOfStream)
180 if (c1 == EndOfStream)
186 /* todo: don't warn about mismatch for auto input encoding */
187 /* todo: let the user override the encoding found here */
189 #if SUPPORT_UTF16_ENCODINGS
192 if ( bom == UNICODE_BOM_BE )
194 /* big-endian UTF-16 */
195 if ( in->encoding != UTF16 && in->encoding != UTF16BE )
196 TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE);
198 return UTF16BE; /* return decoded BOM */
200 else if (bom == UNICODE_BOM_LE)
202 /* little-endian UTF-16 */
203 if (in->encoding != UTF16 && in->encoding != UTF16LE)
204 TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE);
206 return UTF16LE; /* return decoded BOM */
209 #endif /* SUPPORT_UTF16_ENCODINGS */
211 uint c2 = ReadByte(in);
213 if (c2 == EndOfStream)
220 if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
223 if (in->encoding != UTF8)
224 TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8);
238 #ifdef TIDY_STORE_ORIGINAL_TEXT
239 void TY_(AddByteToOriginalText)(StreamIn *in, tmbchar c)
241 if (in->otextlen + 1 >= in->otextsize)
243 size_t size = in->otextsize ? 1 : 2;
244 in->otextbuf = TidyRealloc(in->allocator, in->otextbuf, in->otextsize + size);
245 in->otextsize += size;
247 in->otextbuf[in->otextlen++] = c;
248 in->otextbuf[in->otextlen ] = 0;
251 void TY_(AddCharToOriginalText)(StreamIn *in, tchar c)
253 int i, err, count = 0;
254 tmbchar buf[10] = {0};
256 err = TY_(EncodeCharToUTF8Bytes)(c, buf, NULL, &count);
260 /* replacement character 0xFFFD encoded as UTF-8 */
261 buf[0] = (byte) 0xEF;
262 buf[1] = (byte) 0xBF;
263 buf[2] = (byte) 0xBD;
267 for (i = 0; i < count; ++i)
268 TY_(AddByteToOriginalText)(in, buf[i]);
272 static void InitLastPos( StreamIn *in )
275 in->firstlastpos = 0;
278 static void PopLastPos( StreamIn *in )
280 in->curlastpos = (in->curlastpos+1)%LASTPOS_SIZE;
281 if ( in->curlastpos == in->firstlastpos )
282 in->firstlastpos = (in->firstlastpos+1)%LASTPOS_SIZE;
285 static void SaveLastPos( StreamIn *in )
288 in->lastcols[in->curlastpos] = in->curcol;
291 static void RestoreLastPos( StreamIn *in )
293 if ( in->firstlastpos == in->curlastpos )
297 in->curcol = in->lastcols[in->curlastpos];
298 if ( in->curlastpos == 0 )
299 in->curlastpos = LASTPOS_SIZE;
304 uint TY_(ReadChar)( StreamIn *in )
306 uint c = EndOfStream;
307 uint tabsize = cfg( in->doc, TidyTabSize );
308 #ifdef TIDY_STORE_ORIGINAL_TEXT
313 return PopChar( in );
326 c = ReadCharFromStream(in);
328 if ( EndOfStream == c )
333 #ifdef TIDY_STORE_ORIGINAL_TEXT
335 TY_(AddCharToOriginalText)(in, (tchar)c);
344 #ifdef TIDY_STORE_ORIGINAL_TEXT
346 TY_(AddCharToOriginalText)(in, (tchar)c);
348 in->tabs = tabsize > 0 ?
349 tabsize - ((in->curcol - 1) % tabsize) - 1
356 /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
359 #ifdef TIDY_STORE_ORIGINAL_TEXT
361 TY_(AddCharToOriginalText)(in, (tchar)c);
363 c = ReadCharFromStream(in);
366 TY_(UngetChar)( c, in );
371 #ifdef TIDY_STORE_ORIGINAL_TEXT
372 TY_(AddCharToOriginalText)(in, (tchar)c);
380 #ifndef NO_NATIVE_ISO2022_SUPPORT
381 /* strip control characters, except for Esc */
386 /* Form Feed is allowed in HTML */
387 if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
391 continue; /* discard control char */
393 /* watch out for chars that have already been decoded such as */
394 /* ISO2022, UTF-8 etc, that don't require further decoding */
398 #ifndef NO_NATIVE_ISO2022_SUPPORT
399 || in->encoding == ISO2022
401 || in->encoding == UTF8
403 #if SUPPORT_ASIAN_ENCODINGS
404 || in->encoding == SHIFTJIS /* #431953 - RJ */
405 || in->encoding == BIG5 /* #431953 - RJ */
413 #if SUPPORT_UTF16_ENCODINGS
414 /* handle surrogate pairs */
415 if ( in->encoding == UTF16LE ||
416 in->encoding == UTF16 ||
417 in->encoding == UTF16BE )
419 if ( !TY_(IsValidUTF16FromUCS4)(c) )
421 /* invalid UTF-16 value */
422 TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes);
425 else if ( TY_(IsLowSurrogate)(c) )
428 uint m = ReadCharFromStream( in );
429 if ( m == EndOfStream )
433 if ( TY_(IsHighSurrogate)(m) )
435 n = TY_(CombineSurrogatePair)( m, n );
436 if ( TY_(IsValidCombinedChar)(n) )
439 /* not a valid pair */
441 TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes );
446 /* Do first: acts on range 128 - 255 */
447 switch ( in->encoding )
450 c = TY_(DecodeMacRoman)( c );
453 c = DecodeIbm850( c );
456 c = DecodeLatin0( c );
460 /* produced e.g. as a side-effect of smart quotes in Word */
461 /* but can't happen if using MACROMAN encoding */
462 if ( 127 < c && c < 160 )
464 uint c1 = 0, replMode = DISCARDED_CHAR;
465 Bool isVendorChar = ( in->encoding == WIN1252 ||
466 in->encoding == MACROMAN );
467 Bool isWinChar = ( in->encoding == WIN1252 ||
468 TY_(ReplacementCharEncoding) == WIN1252 );
469 Bool isMacChar = ( in->encoding == MACROMAN ||
470 TY_(ReplacementCharEncoding) == MACROMAN );
472 /* set error position just before offending character */
475 in->doc->lexer->lines = in->curline;
476 in->doc->lexer->columns = in->curcol;
480 c1 = TY_(DecodeWin1252)( c );
481 else if ( isMacChar )
482 c1 = TY_(DecodeMacRoman)( c );
484 replMode = REPLACED_CHAR;
486 if ( c1 == 0 && isVendorChar )
487 TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
488 else if ( ! isVendorChar )
489 TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
495 continue; /* illegal char is discarded */
501 #ifdef TIDY_STORE_ORIGINAL_TEXT
503 TY_(AddCharToOriginalText)(in, (tchar)c);
509 static uint PopChar( StreamIn *in )
511 uint c = EndOfStream;
514 assert( in->bufpos > 0 );
515 c = in->charbuf[ --in->bufpos ];
516 if ( in->bufpos == 0 )
532 void TY_(UngetChar)( uint c, StreamIn *in )
534 if (c == EndOfStream)
536 /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
542 if (in->bufpos + 1 >= in->bufsize)
543 in->charbuf = (tchar*)TidyRealloc(in->allocator, in->charbuf, sizeof(tchar) * ++(in->bufsize));
545 in->charbuf[(in->bufpos)++] = c;
550 RestoreLastPos( in );
555 /************************
557 ************************/
559 static StreamOut* initStreamOut( TidyDocImpl* doc, int encoding, uint nl )
561 StreamOut* out = (StreamOut*) TidyDocAlloc( doc, sizeof(StreamOut) );
562 TidyClearMemory( out, sizeof(StreamOut) );
563 out->encoding = encoding;
564 out->state = FSM_ASCII;
569 StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint nl )
571 StreamOut* out = initStreamOut( doc, encoding, nl );
572 TY_(initFileSink)( &out->sink, fp );
573 out->iotype = FileIO;
576 StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint nl )
578 StreamOut* out = initStreamOut( doc, encoding, nl );
579 tidyInitOutputBuffer( &out->sink, buf );
580 out->iotype = BufferIO;
583 StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint nl )
585 StreamOut* out = initStreamOut( doc, encoding, nl );
586 memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
587 out->iotype = UserIO;
591 void TY_(WriteChar)( uint c, StreamOut* out )
593 /* Translate outgoing newlines */
596 if ( out->nl == TidyCRLF )
597 TY_(WriteChar)( CR, out );
598 else if ( out->nl == TidyCR )
602 if (out->encoding == MACROMAN)
604 EncodeMacRoman( c, out );
606 else if (out->encoding == WIN1252)
608 EncodeWin1252( c, out );
610 else if (out->encoding == IBM858)
612 EncodeIbm858( c, out );
614 else if (out->encoding == LATIN0)
616 EncodeLatin0( c, out );
619 else if (out->encoding == UTF8)
623 TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count );
626 /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
627 /* replacement char 0xFFFD encoded as UTF-8 */
/* U+FFFD is EF BF BD in UTF-8; EF BF BF would encode U+FFFF instead */
PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBD, out);
631 #ifndef NO_NATIVE_ISO2022_SUPPORT
632 else if (out->encoding == ISO2022)
634 if (c == 0x1b) /* ESC */
635 out->state = FSM_ESC;
642 out->state = FSM_ESCD;
644 out->state = FSM_ESCP;
646 out->state = FSM_ASCII;
651 out->state = FSM_ESCDP;
653 out->state = FSM_NONASCII;
657 out->state = FSM_NONASCII;
661 out->state = FSM_ASCII;
675 #endif /* NO_NATIVE_ISO2022_SUPPORT */
677 #if SUPPORT_UTF16_ENCODINGS
678 else if ( out->encoding == UTF16LE ||
679 out->encoding == UTF16BE ||
680 out->encoding == UTF16 )
685 if ( !TY_(IsValidUTF16FromUCS4)(c) )
687 /* invalid UTF-16 value */
688 /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
692 else if ( TY_(IsCombinedChar)(c) )
694 /* output both, unless something goes wrong */
696 if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) )
698 /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
705 /* just put the char out */
709 for (i = 0; i < numChars; i++)
713 if (out->encoding == UTF16LE)
715 uint ch = c & 0xFF; PutByte(ch, out);
716 ch = (c >> 8) & 0xFF; PutByte(ch, out);
719 else if (out->encoding == UTF16BE || out->encoding == UTF16)
721 uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
722 ch = c & 0xFF; PutByte(ch, out);
728 #if SUPPORT_ASIAN_ENCODINGS
729 else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
735 uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
736 ch = c & 0xFF; PutByte(ch, out);
747 /****************************
748 ** Miscellaneous / Helpers
749 ****************************/
751 /* char encoding used when replacing illegal SGML chars,
752 ** regardless of specified encoding. Set at compile time
753 ** to either Windows or Mac.
755 const int TY_(ReplacementCharEncoding) = DFLT_REPL_CHARENC;
758 /* Mapping for Windows Western character set CP 1252
759 ** (chars 128-159/U+0080-U+009F) to Unicode.
761 static const uint Win2Unicode[32] =
763 0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
764 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
765 0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
766 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
769 /* Function for conversion from Windows-1252 to Unicode */
770 uint TY_(DecodeWin1252)(uint c)
772 if (127 < c && c < 160)
773 c = Win2Unicode[c - 128];
778 static void EncodeWin1252( uint c, StreamOut* out )
780 if (c < 128 || (c > 159 && c < 256))
786 for (i = 128; i < 160; i++)
787 if (Win2Unicode[i - 128] == c)
796 John Love-Jensen contributed this table for mapping MacRoman
797 character set to Unicode
800 /* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
801 static const uint Mac2Unicode[128] =
805 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
806 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
808 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
809 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
811 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
812 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
814 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
815 /* =BD U+2126 OHM SIGN */
816 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
818 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
819 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
821 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
822 /* =DB U+00A4 CURRENCY SIGN */
823 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
825 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
826 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
827 /* xF0 = Apple Logo */
828 /* =F0 U+2665 BLACK HEART SUIT */
829 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
830 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
833 /* Function to convert from MacRoman to Unicode */
834 uint TY_(DecodeMacRoman)(uint c)
837 c = Mac2Unicode[c - 128];
841 static void EncodeMacRoman( uint c, StreamOut* out )
847 /* For mac users, map Unicode back to MacRoman. */
849 for (i = 128; i < 256; i++)
851 if (Mac2Unicode[i - 128] == c)
860 /* Mapping for OS/2 Western character set CP 850
861 ** (chars 128-255) to Unicode.
863 static const uint IBM2Unicode[128] =
865 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
866 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
867 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
868 0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
869 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
870 0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
871 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
872 0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
873 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
874 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
875 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
876 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
877 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
878 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
879 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
880 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
883 /* Function for conversion from OS/2-850 to Unicode */
884 static uint DecodeIbm850(uint c)
886 if (127 < c && c < 256)
887 c = IBM2Unicode[c - 128];
892 /* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
893 static void EncodeIbm858( uint c, StreamOut* out )
900 for (i = 128; i < 256; i++)
902 if (IBM2Unicode[i - 128] == c)
912 /* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
913 static uint DecodeLatin0(uint c)
915 if (159 < c && c < 191)
919 case 0xA4: c = 0x20AC; break;
920 case 0xA6: c = 0x0160; break;
921 case 0xA8: c = 0x0161; break;
922 case 0xB4: c = 0x017D; break;
923 case 0xB8: c = 0x017E; break;
924 case 0xBC: c = 0x0152; break;
925 case 0xBD: c = 0x0153; break;
926 case 0xBE: c = 0x0178; break;
932 /* Map Unicode back to ISO-8859-15. */
933 static void EncodeLatin0( uint c, StreamOut* out )
937 case 0x20AC: c = 0xA4; break;
938 case 0x0160: c = 0xA6; break;
939 case 0x0161: c = 0xA8; break;
940 case 0x017D: c = 0xB4; break;
941 case 0x017E: c = 0xB8; break;
942 case 0x0152: c = 0xBC; break;
943 case 0x0153: c = 0xBD; break;
944 case 0x0178: c = 0xBE; break;
950 Table to map symbol font characters to Unicode; undefined
951 characters are mapped to 0x0000 and characters without any
952 Unicode equivalent are mapped to '?'. Is this appropriate?
955 static const uint Symbol2Unicode[] =
957 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
958 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
960 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
961 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
963 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
964 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
966 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
967 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
969 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
970 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
972 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
973 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
975 0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
976 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
978 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
979 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,
981 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
982 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
984 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
985 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
987 0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
988 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
990 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,
991 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,
993 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
994 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
996 0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
997 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
999 0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,
1000 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,
1002 0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,
1003 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F
1007 /* Function to convert from Symbol Font chars to Unicode */
1008 uint DecodeSymbolFont(uint c)
1013 /* todo: add some error message */
1015 return Symbol2Unicode[c];
1020 /* Facilitates user defined source by providing
1021 ** an entry point to marshal pointers-to-functions.
1022 ** Needed by .NET and possibly other language bindings.
1024 Bool TIDY_CALL tidyInitSource( TidyInputSource* source,
1026 TidyGetByteFunc gbFunc,
1027 TidyUngetByteFunc ugbFunc,
1028 TidyEOFFunc endFunc )
1030 Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
1034 source->sourceData = srcData;
1035 source->getByte = gbFunc;
1036 source->ungetByte = ugbFunc;
1037 source->eof = endFunc;
1043 Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
1045 TidyPutByteFunc pbFunc )
1047 Bool status = ( sink && snkData && pbFunc );
1050 sink->sinkData = snkData;
1051 sink->putByte = pbFunc;
1056 /* GetByte must return a byte value in a signed
1057 ** integer so that a negative value can signal EOF
1058 ** without interfering w/ 0-255 legitimate byte values.
1060 uint TIDY_CALL tidyGetByte( TidyInputSource* source )
1062 int bv = source->getByte( source->sourceData );
1065 Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
1067 return source->eof( source->sourceData );
1069 void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
1071 source->ungetByte( source->sourceData, (byte) ch );
1073 void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
1075 sink->putByte( sink->sinkData, (byte) ch );
1078 static uint ReadByte( StreamIn* in )
1080 return tidyGetByte( &in->source );
1082 Bool TY_(IsEOF)( StreamIn* in )
1084 return tidyIsEOF( &in->source );
1086 static void UngetByte( StreamIn* in, uint byteValue )
1088 tidyUngetByte( &in->source, byteValue );
1090 static void PutByte( uint byteValue, StreamOut* out )
1092 tidyPutByte( &out->sink, byteValue );
1096 static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count )
1100 for (i = 0; i < *count; i++)
1102 /* should never get here; testing for 0xFF, a valid char, is not a good idea */
1103 if ( in && TY_(IsEOF)(in) )
1105 /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */
1110 in->source.ungetByte( in->source.sourceData, buf[i] );
1115 Read raw bytes from stream, return <= 0 if EOF; or if
1116 "unget" is true, Unget the bytes to re-synchronize the input stream
1117 Normally UTF-8 successor bytes are read using this routine.
1119 static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count )
1122 for ( ix=0; ix < *count; ++ix )
1124 if ( in->rawPushed )
1126 buf[ix] = in->rawBytebuf[ --in->rawBufpos ];
1127 if ( in->rawBufpos == 0 )
1132 if ( in->source.eof(in->source.sourceData) )
1137 buf[ix] = in->source.getByte( in->source.sourceData );
1143 /* read char from stream */
1144 static uint ReadCharFromStream( StreamIn* in )
1147 #ifdef TIDY_WIN32_MLANG_SUPPORT
1151 if ( TY_(IsEOF)(in) )
1156 if (c == EndOfStream)
1159 #ifndef NO_NATIVE_ISO2022_SUPPORT
1161 A document in ISO-2022 based encoding uses some ESC sequences
1162 called "designator" to switch character sets. The designators
1163 defined and used in ISO-2022-JP are:
1165 "ESC" + "(" + ? for ISO646 variants
1168 "ESC" + "$" + "(" + ? for multibyte character sets
1170 Where ? stands for a single character used to indicate the
1171 character set for multibyte characters.
1173 Tidy handles this by preserving the escape sequence and
1174 setting the top bit of each byte for non-ascii chars. This
1175 bit is then cleared on output. The input stream keeps track
1176 of the state to determine when to set/clear the bit.
1179 if (in->encoding == ISO2022)
1181 if (c == 0x1b) /* ESC */
1183 in->state = FSM_ESC;
1191 in->state = FSM_ESCD;
1193 in->state = FSM_ESCP;
1195 in->state = FSM_ASCII;
1200 in->state = FSM_ESCDP;
1202 in->state = FSM_NONASCII;
1206 in->state = FSM_NONASCII;
1210 in->state = FSM_ASCII;
1223 #endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */
1225 #if SUPPORT_UTF16_ENCODINGS
1226 if ( in->encoding == UTF16LE )
1228 uint c1 = ReadByte( in );
1229 if ( EndOfStream == c1 )
1235 if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
1237 uint c1 = ReadByte( in );
1238 if ( EndOfStream == c1 )
1245 if ( in->encoding == UTF8 )
1247 /* deal with UTF-8 encoded char */
1251 /* first byte "c" is passed in separately */
1252 err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count );
1253 if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1257 /* set error position just before offending character */
1258 in->doc->lexer->lines = in->curline;
1259 in->doc->lexer->columns = in->curcol;
1261 TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no);
1262 n = 0xFFFD; /* replacement char */
1268 #if SUPPORT_ASIAN_ENCODINGS
1270 This section is suitable for any "multibyte" variable-width
1271 character encoding in which a one-byte code is less than
1272 128, and the first byte of a two-byte code is greater or
1273 equal to 128. Note that Big5 and ShiftJIS fit into this
1274 kind, even though their second byte may be less than 128
1276 if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
1280 else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
1283 Rick Cameron pointed out that for Shift_JIS, the values from
1284 0xa1 through 0xdf represent single-byte characters
1285 (U+FF61 to U+FF9F - half-shift Katakana)
1291 uint c1 = ReadByte( in );
1292 if ( EndOfStream == c1 )
1300 #ifdef TIDY_WIN32_MLANG_SUPPORT
1301 else if (in->encoding > WIN32MLANG)
1303 assert( in->mlang != NULL );
1304 return TY_(Win32MLangGetChar)((byte)c, in, &bytesRead);
1314 /* Output a Byte Order Mark if required */
1315 void TY_(outBOM)( StreamOut *out )
1317 if ( out->encoding == UTF8
1318 #if SUPPORT_UTF16_ENCODINGS
1319 || out->encoding == UTF16LE
1320 || out->encoding == UTF16BE
1321 || out->encoding == UTF16
1325 /* this will take care of encoding the BOM correctly */
1326 TY_(WriteChar)( UNICODE_BOM, out );
1330 /* this is an intermediate fix for various problems; in the */
1331 /* long term the code and data in charsets.c should be used */
1332 static struct _enc2iana
1336 ctmbstr tidyOptName;
1337 } const enc2iana[] =
1339 { ASCII, "us-ascii", "ascii" },
1340 { LATIN0, "iso-8859-15", "latin0" },
1341 { LATIN1, "iso-8859-1", "latin1" },
1342 { UTF8, "utf-8", "utf8" },
1343 { MACROMAN, "macintosh", "mac" },
1344 { WIN1252, "windows-1252", "win1252" },
1345 { IBM858, "ibm00858", "ibm858" },
1346 #if SUPPORT_UTF16_ENCODINGS
1347 { UTF16LE, "utf-16", "utf16le" },
1348 { UTF16BE, "utf-16", "utf16be" },
1349 { UTF16, "utf-16", "utf16" },
1351 #if SUPPORT_ASIAN_ENCODINGS
1352 { BIG5, "big5", "big5" },
1353 { SHIFTJIS, "shift_jis", "shiftjis"},
1355 #ifndef NO_NATIVE_ISO2022_SUPPORT
1356 { ISO2022, NULL, "iso2022" },
1358 { RAW, NULL, "raw" }
1361 ctmbstr TY_(GetEncodingNameFromTidyId)(uint id)
1365 for (i = 0; enc2iana[i].name; ++i)
1366 if (enc2iana[i].id == id)
1367 return enc2iana[i].name;
1372 ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id)
1376 for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1377 if (enc2iana[i].id == id)
1378 return enc2iana[i].tidyOptName;
1383 int TY_(GetCharEncodingFromOptName)( ctmbstr charenc )
1387 for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1388 if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 )
1389 return enc2iana[i].id;
1397 * indent-tabs-mode: nil
1399 * eval: (c-set-offset 'substatement-open 0)