third-party/tidy-html5-master/src/streamio.c

   1 /* streamio.c -- handles character stream I/O
   2
   3   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
   4   See tidy.h for the copyright notice.
   5
   6   Wrapper around Tidy input source and output sink
   7   that calls appropriate interfaces, and applies
   8   necessary char encoding transformations: to/from
   9   ISO-10646 and/or UTF-8.
  10
  11 */
  12
  13 #include <stdio.h>
  14 #include <errno.h>
  15
  16 #include "streamio.h"
  17 #include "tidy-int.h"
  18 #include "lexer.h"
  19 #include "message.h"
  20 #include "utf8.h"
  21 #include "tmbstr.h"
  22
  23 #ifdef TIDY_WIN32_MLANG_SUPPORT
  24 #include "win32tc.h"
  25 #endif
  26
  27 /************************
  28 ** Forward Declarations
  29 ************************/
  30
  31 static uint ReadCharFromStream( StreamIn* in );
  32
  33 static uint ReadByte( StreamIn* in );
  34 static void UngetByte( StreamIn* in, uint byteValue );
  35
  36 static void PutByte( uint byteValue, StreamOut* out );
  37
  38 static void EncodeWin1252( uint c, StreamOut* out );
  39 static void EncodeMacRoman( uint c, StreamOut* out );
  40 static void EncodeIbm858( uint c, StreamOut* out );
  41 static void EncodeLatin0( uint c, StreamOut* out );
  42
  43 static uint DecodeIbm850(uint c);
  44 static uint DecodeLatin0(uint c);
  45
  46 static uint PopChar( StreamIn *in );
  47
  48 /******************************
  49 ** Static (duration) Globals
  50 ******************************/
  51
/* Statically allocated output stream for error output.  The sink data
** pointer starts out as 0 and is set to stderr on first use by
** TY_(StdErrOutput).  TY_(ReleaseStreamOut) recognises this object by
** address and never closes or frees it.
*/
static StreamOut stderrStreamOut =
{
    ASCII,                 /* output encoding */
    FSM_ASCII,             /* initial ISO-2022 state machine state */
    DEFAULT_NL_CONFIG,     /* newline convention */
#ifdef TIDY_WIN32_MLANG_SUPPORT
    NULL,                  /* extra member present only in MLANG builds */
#endif
    FileIO,                /* I/O type */
    { 0, TY_(filesink_putByte) }   /* sink: data pointer (set lazily), putByte callback */
};
  63
/* Statically allocated output stream intended for standard output.
** Its only accessor in this file, TY_(StdOutOutput), is compiled out;
** the object is still referenced by TY_(ReleaseStreamOut), which
** refuses to close or free it.
*/
static StreamOut stdoutStreamOut =
{
    ASCII,                 /* output encoding */
    FSM_ASCII,             /* initial ISO-2022 state machine state */
    DEFAULT_NL_CONFIG,     /* newline convention */
#ifdef TIDY_WIN32_MLANG_SUPPORT
    NULL,                  /* extra member present only in MLANG builds */
#endif
    FileIO,                /* I/O type */
    { 0, TY_(filesink_putByte) }   /* sink: data pointer (unset), putByte callback */
};
  75
  76 StreamOut* TY_(StdErrOutput)(void)
  77 {
  78   if ( stderrStreamOut.sink.sinkData == 0 )
  79       stderrStreamOut.sink.sinkData = stderr;
  80   return &stderrStreamOut;
  81 }
  82
#if 0
/* Disabled: would return the shared stdout output stream, binding it
** to the C stdout FILE on first use (mirror of TY_(StdErrOutput)).
*/
StreamOut* TY_(StdOutOutput)(void)
{
  if ( stdoutStreamOut.sink.sinkData == 0 )
      stdoutStreamOut.sink.sinkData = stdout;
  return &stdoutStreamOut;
}
#endif
  91
  92 void  TY_(ReleaseStreamOut)( TidyDocImpl *doc,  StreamOut* out )
  93 {
  94     if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
  95     {
  96         if ( out->iotype == FileIO )
  97             fclose( (FILE*) out->sink.sinkData );
  98         TidyDocFree( doc, out );
  99     }
 100 }
 101
 102 /************************
 103 ** Source
 104 ************************/
 105
 106 static void InitLastPos( StreamIn *in );
 107
 108 StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding )
 109 {
 110     StreamIn *in = (StreamIn*) TidyDocAlloc( doc, sizeof(StreamIn) );
 111
 112     TidyClearMemory( in, sizeof(StreamIn) );
 113     in->curline = 1;
 114     in->curcol = 1;
 115     in->encoding = encoding;
 116     in->state = FSM_ASCII;
 117     in->doc = doc;
 118     in->bufsize = CHARBUF_SIZE;
 119     in->allocator = doc->allocator;
 120     in->charbuf = (tchar*)TidyDocAlloc(doc, sizeof(tchar) * in->bufsize);
 121     InitLastPos( in );
 122 #ifdef TIDY_STORE_ORIGINAL_TEXT
 123     in->otextbuf = NULL;
 124     in->otextlen = 0;
 125     in->otextsize = 0;
 126 #endif
 127     return in;
 128 }
 129
 130 void TY_(freeStreamIn)(StreamIn* in)
 131 {
 132 #ifdef TIDY_STORE_ORIGINAL_TEXT
 133     if (in->otextbuf)
 134         TidyFree(in->allocator, in->otextbuf);
 135 #endif
 136     TidyFree(in->allocator, in->charbuf);
 137     TidyFree(in->allocator, in);
 138 }
 139
 140 StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding )
 141 {
 142     StreamIn *in = TY_(initStreamIn)( doc, encoding );
 143     if ( TY_(initFileSource)( doc->allocator, &in->source, fp ) != 0 )
 144     {
 145         TY_(freeStreamIn)( in );
 146         return NULL;
 147     }
 148     in->iotype = FileIO;
 149     return in;
 150 }
 151
 152 StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
 153 {
 154     StreamIn *in = TY_(initStreamIn)( doc, encoding );
 155     tidyInitInputBuffer( &in->source, buf );
 156     in->iotype = BufferIO;
 157     return in;
 158 }
 159
 160 StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding )
 161 {
 162     StreamIn *in = TY_(initStreamIn)( doc, encoding );
 163     memcpy( &in->source, source, sizeof(TidyInputSource) );
 164     in->iotype = UserIO;
 165     return in;
 166 }
 167
 168 int TY_(ReadBOMEncoding)(StreamIn *in)
 169 {
 170     uint c, c1;
 171 #if SUPPORT_UTF16_ENCODINGS
 172     uint bom;
 173 #endif
 174
 175     c = ReadByte(in);
 176     if (c == EndOfStream)
 177         return -1;
 178
 179     c1 = ReadByte( in );
 180     if (c1 == EndOfStream)
 181     {
 182         UngetByte(in, c);
 183         return -1;
 184     }
 185
 186     /* todo: dont warn about mismatch for auto input encoding */
 187     /* todo: let the user override the encoding found here */
 188
 189 #if SUPPORT_UTF16_ENCODINGS
 190     bom = (c << 8) + c1;
 191
 192     if ( bom == UNICODE_BOM_BE )
 193     {
 194         /* big-endian UTF-16 */
 195         if ( in->encoding != UTF16 && in->encoding != UTF16BE )
 196             TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE);
 197
 198         return UTF16BE; /* return decoded BOM */
 199     }
 200     else if (bom == UNICODE_BOM_LE)
 201     {
 202         /* little-endian UTF-16 */
 203         if (in->encoding != UTF16 && in->encoding != UTF16LE)
 204             TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE);
 205
 206         return UTF16LE; /* return decoded BOM */
 207     }
 208     else
 209 #endif /* SUPPORT_UTF16_ENCODINGS */
 210     {
 211         uint c2 = ReadByte(in);
 212
 213         if (c2 == EndOfStream)
 214         {
 215             UngetByte(in, c1);
 216             UngetByte(in, c);
 217             return -1;
 218         }
 219
 220         if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
 221         {
 222             /* UTF-8 */
 223             if (in->encoding != UTF8)
 224                 TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8);
 225
 226             return UTF8;
 227         }
 228         else
 229             UngetByte( in, c2 );
 230     }
 231
 232     UngetByte(in, c1);
 233     UngetByte(in, c);
 234
 235     return -1;
 236 }
 237
 238 #ifdef TIDY_STORE_ORIGINAL_TEXT
 239 void TY_(AddByteToOriginalText)(StreamIn *in, tmbchar c)
 240 {
 241     if (in->otextlen + 1 >= in->otextsize)
 242     {
 243         size_t size = in->otextsize ? 1 : 2;
 244         in->otextbuf = TidyRealloc(in->allocator, in->otextbuf, in->otextsize + size);
 245         in->otextsize += size;
 246     }
 247     in->otextbuf[in->otextlen++] = c;
 248     in->otextbuf[in->otextlen  ] = 0;
 249 }
 250
 251 void TY_(AddCharToOriginalText)(StreamIn *in, tchar c)
 252 {
 253     int i, err, count = 0;
 254     tmbchar buf[10] = {0};
 255
 256     err = TY_(EncodeCharToUTF8Bytes)(c, buf, NULL, &count);
 257
 258     if (err)
 259     {
 260         /* replacement character 0xFFFD encoded as UTF-8 */
 261         buf[0] = (byte) 0xEF;
 262         buf[1] = (byte) 0xBF;
 263         buf[2] = (byte) 0xBD;
 264         count = 3;
 265     }
 266
 267     for (i = 0; i < count; ++i)
 268         TY_(AddByteToOriginalText)(in, buf[i]);
 269 }
 270 #endif
 271
 272 static void InitLastPos( StreamIn *in )
 273 {
 274     in->curlastpos = 0;
 275     in->firstlastpos = 0;
 276 }
 277
 278 static void PopLastPos( StreamIn *in )
 279 {
 280     in->curlastpos = (in->curlastpos+1)%LASTPOS_SIZE;
 281     if ( in->curlastpos == in->firstlastpos )
 282         in->firstlastpos = (in->firstlastpos+1)%LASTPOS_SIZE;
 283 }
 284
 285 static void SaveLastPos( StreamIn *in )
 286 {
 287     PopLastPos( in );
 288     in->lastcols[in->curlastpos] = in->curcol;
 289 }
 290
 291 static void RestoreLastPos( StreamIn *in )
 292 {
 293     if ( in->firstlastpos == in->curlastpos )
 294         in->curcol = 0;
 295     else
 296     {
 297         in->curcol = in->lastcols[in->curlastpos];
 298         if ( in->curlastpos == 0 )
 299             in->curlastpos = LASTPOS_SIZE;
 300         in->curlastpos--;
 301     }
 302 }
 303
 304 uint TY_(ReadChar)( StreamIn *in )
 305 {
 306     uint c = EndOfStream;
 307     uint tabsize = cfg( in->doc, TidyTabSize );
 308 #ifdef TIDY_STORE_ORIGINAL_TEXT
 309     Bool added = no;
 310 #endif
 311
 312     if ( in->pushed )
 313         return PopChar( in );
 314
 315     SaveLastPos( in );
 316
 317     if ( in->tabs > 0 )
 318     {
 319         in->curcol++;
 320         in->tabs--;
 321         return ' ';
 322     }
 323
 324     for (;;)
 325     {
 326         c = ReadCharFromStream(in);
 327
 328         if ( EndOfStream == c )
 329             return EndOfStream;
 330
 331         if (c == '\n')
 332         {
 333 #ifdef TIDY_STORE_ORIGINAL_TEXT
 334             added = yes;
 335             TY_(AddCharToOriginalText)(in, (tchar)c);
 336 #endif
 337             in->curcol = 1;
 338             in->curline++;
 339             break;
 340         }
 341
 342         if (c == '\t')
 343         {
 344 #ifdef TIDY_STORE_ORIGINAL_TEXT
 345             added = yes;
 346             TY_(AddCharToOriginalText)(in, (tchar)c);
 347 #endif
 348             in->tabs = tabsize > 0 ?
 349                 tabsize - ((in->curcol - 1) % tabsize) - 1
 350                 : 0;
 351             in->curcol++;
 352             c = ' ';
 353             break;
 354         }
 355
 356         /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
 357         if (c == '\r')
 358         {
 359 #ifdef TIDY_STORE_ORIGINAL_TEXT
 360             added = yes;
 361             TY_(AddCharToOriginalText)(in, (tchar)c);
 362 #endif
 363             c = ReadCharFromStream(in);
 364             if (c != '\n')
 365             {
 366                 TY_(UngetChar)( c, in );
 367                 c = '\n';
 368             }
 369             else
 370             {
 371 #ifdef TIDY_STORE_ORIGINAL_TEXT
 372                 TY_(AddCharToOriginalText)(in, (tchar)c);
 373 #endif
 374             }
 375             in->curcol = 1;
 376             in->curline++;
 377             break;
 378         }
 379
 380 #ifndef NO_NATIVE_ISO2022_SUPPORT
 381         /* strip control characters, except for Esc */
 382         if (c == '\033')
 383             break;
 384 #endif
 385
 386         /* Form Feed is allowed in HTML */
 387         if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
 388             break;
 389
 390         if ( c < 32 )
 391             continue; /* discard control char */
 392
 393         /* watch out for chars that have already been decoded such as */
 394         /* IS02022, UTF-8 etc, that don't require further decoding */
 395
 396         if (
 397             in->encoding == RAW
 398 #ifndef NO_NATIVE_ISO2022_SUPPORT
 399          || in->encoding == ISO2022
 400 #endif
 401          || in->encoding == UTF8
 402
 403 #if SUPPORT_ASIAN_ENCODINGS
 404          || in->encoding == SHIFTJIS /* #431953 - RJ */
 405          || in->encoding == BIG5     /* #431953 - RJ */
 406 #endif
 407            )
 408         {
 409             in->curcol++;
 410             break;
 411         }
 412
 413 #if SUPPORT_UTF16_ENCODINGS
 414         /* handle surrogate pairs */
 415         if ( in->encoding == UTF16LE ||
 416              in->encoding == UTF16   ||
 417              in->encoding == UTF16BE )
 418         {
 419             if ( !TY_(IsValidUTF16FromUCS4)(c) )
 420             {
 421                 /* invalid UTF-16 value */
 422                 TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes);
 423                 c = 0;
 424             }
 425             else if ( TY_(IsLowSurrogate)(c) )
 426             {
 427                 uint n = c;
 428                 uint m = ReadCharFromStream( in );
 429                 if ( m == EndOfStream )
 430                    return EndOfStream;
 431
 432                 c = 0;
 433                 if ( TY_(IsHighSurrogate)(m) )
 434                 {
 435                     n = TY_(CombineSurrogatePair)( m, n );
 436                     if ( TY_(IsValidCombinedChar)(n) )
 437                         c = n;
 438                 }
 439                 /* not a valid pair */
 440                 if ( 0 == c )
 441                     TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes );
 442             }
 443         }
 444 #endif
 445
 446         /* Do first: acts on range 128 - 255 */
 447         switch ( in->encoding )
 448         {
 449         case MACROMAN:
 450             c = TY_(DecodeMacRoman)( c );
 451             break;
 452         case IBM858:
 453             c = DecodeIbm850( c );
 454             break;
 455         case LATIN0:
 456             c = DecodeLatin0( c );
 457             break;
 458         }
 459
 460         /* produced e.g. as a side-effect of smart quotes in Word */
 461         /* but can't happen if using MACROMAN encoding */
 462         if ( 127 < c && c < 160 )
 463         {
 464             uint c1 = 0, replMode = DISCARDED_CHAR;
 465             Bool isVendorChar = ( in->encoding == WIN1252 ||
 466                                   in->encoding == MACROMAN );
 467             Bool isWinChar    = ( in->encoding == WIN1252 ||
 468                                   TY_(ReplacementCharEncoding) == WIN1252 );
 469             Bool isMacChar    = ( in->encoding == MACROMAN ||
 470                                   TY_(ReplacementCharEncoding) == MACROMAN );
 471
 472             /* set error position just before offending character */
 473             if (in->doc->lexer)
 474             {
 475                 in->doc->lexer->lines = in->curline;
 476                 in->doc->lexer->columns = in->curcol;
 477             }
 478
 479             if ( isWinChar )
 480                 c1 = TY_(DecodeWin1252)( c );
 481             else if ( isMacChar )
 482                 c1 = TY_(DecodeMacRoman)( c );
 483             if ( c1 )
 484                 replMode = REPLACED_CHAR;
 485
 486             if ( c1 == 0 && isVendorChar )
 487                 TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
 488             else if ( ! isVendorChar )
 489                 TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
 490
 491             c = c1;
 492         }
 493
 494         if ( c == 0 )
 495             continue; /* illegal char is discarded */
 496
 497         in->curcol++;
 498         break;
 499     }
 500
 501 #ifdef TIDY_STORE_ORIGINAL_TEXT
 502     if (!added)
 503         TY_(AddCharToOriginalText)(in, (tchar)c);
 504 #endif
 505
 506     return c;
 507 }
 508
 509 static uint PopChar( StreamIn *in )
 510 {
 511     uint c = EndOfStream;
 512     if ( in->pushed )
 513     {
 514         assert( in->bufpos > 0 );
 515         c = in->charbuf[ --in->bufpos ];
 516         if ( in->bufpos == 0 )
 517             in->pushed = no;
 518
 519         if ( c == '\n' )
 520         {
 521             in->curcol = 1;
 522             in->curline++;
 523             PopLastPos( in );
 524             return c;
 525         }
 526         in->curcol++;
 527         PopLastPos( in );
 528     }
 529     return c;
 530 }
 531
 532 void TY_(UngetChar)( uint c, StreamIn *in )
 533 {
 534     if (c == EndOfStream)
 535     {
 536         /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
 537         return;
 538     }
 539
 540     in->pushed = yes;
 541
 542     if (in->bufpos + 1 >= in->bufsize)
 543         in->charbuf = (tchar*)TidyRealloc(in->allocator, in->charbuf, sizeof(tchar) * ++(in->bufsize));
 544
 545     in->charbuf[(in->bufpos)++] = c;
 546
 547     if (c == '\n')
 548         --(in->curline);
 549
 550     RestoreLastPos( in );
 551 }
 552
 553
 554
 555 /************************
 556 ** Sink
 557 ************************/
 558
 559 static StreamOut* initStreamOut( TidyDocImpl* doc, int encoding, uint nl )
 560 {
 561     StreamOut* out = (StreamOut*) TidyDocAlloc( doc, sizeof(StreamOut) );
 562     TidyClearMemory( out, sizeof(StreamOut) );
 563     out->encoding = encoding;
 564     out->state = FSM_ASCII;
 565     out->nl = nl;
 566     return out;
 567 }
 568
 569 StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint nl )
 570 {
 571     StreamOut* out = initStreamOut( doc, encoding, nl );
 572     TY_(initFileSink)( &out->sink, fp );
 573     out->iotype = FileIO;
 574     return out;
 575 }
 576 StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint nl )
 577 {
 578     StreamOut* out = initStreamOut( doc, encoding, nl );
 579     tidyInitOutputBuffer( &out->sink, buf );
 580     out->iotype = BufferIO;
 581     return out;
 582 }
 583 StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint nl )
 584 {
 585     StreamOut* out = initStreamOut( doc, encoding, nl );
 586     memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
 587     out->iotype = UserIO;
 588     return out;
 589 }
 590
 591 void TY_(WriteChar)( uint c, StreamOut* out )
 592 {
 593     /* Translate outgoing newlines */
 594     if ( LF == c )
 595     {
 596       if ( out->nl == TidyCRLF )
 597           TY_(WriteChar)( CR, out );
 598       else if ( out->nl == TidyCR )
 599           c = CR;
 600     }
 601
 602     if (out->encoding == MACROMAN)
 603     {
 604         EncodeMacRoman( c, out );
 605     }
 606     else if (out->encoding == WIN1252)
 607     {
 608         EncodeWin1252( c, out );
 609     }
 610     else if (out->encoding == IBM858)
 611     {
 612         EncodeIbm858( c, out );
 613     }
 614     else if (out->encoding == LATIN0)
 615     {
 616         EncodeLatin0( c, out );
 617     }
 618
 619     else if (out->encoding == UTF8)
 620     {
 621         int count = 0;
 622
 623         TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count );
 624         if (count <= 0)
 625         {
 626           /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
 627             /* replacement char 0xFFFD encoded as UTF-8 */
 628             PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out);
 629         }
 630     }
 631 #ifndef NO_NATIVE_ISO2022_SUPPORT
 632     else if (out->encoding == ISO2022)
 633     {
 634         if (c == 0x1b)  /* ESC */
 635             out->state = FSM_ESC;
 636         else
 637         {
 638             switch (out->state)
 639             {
 640             case FSM_ESC:
 641                 if (c == '$')
 642                     out->state = FSM_ESCD;
 643                 else if (c == '(')
 644                     out->state = FSM_ESCP;
 645                 else
 646                     out->state = FSM_ASCII;
 647                 break;
 648
 649             case FSM_ESCD:
 650                 if (c == '(')
 651                     out->state = FSM_ESCDP;
 652                 else
 653                     out->state = FSM_NONASCII;
 654                 break;
 655
 656             case FSM_ESCDP:
 657                 out->state = FSM_NONASCII;
 658                 break;
 659
 660             case FSM_ESCP:
 661                 out->state = FSM_ASCII;
 662                 break;
 663
 664             case FSM_NONASCII:
 665                 c &= 0x7F;
 666                 break;
 667
 668             case FSM_ASCII:
 669                 break;
 670             }
 671         }
 672
 673         PutByte(c, out);
 674     }
 675 #endif /* NO_NATIVE_ISO2022_SUPPORT */
 676
 677 #if SUPPORT_UTF16_ENCODINGS
 678     else if ( out->encoding == UTF16LE ||
 679               out->encoding == UTF16BE ||
 680               out->encoding == UTF16 )
 681     {
 682         int i, numChars = 1;
 683         uint theChars[2];
 684
 685         if ( !TY_(IsValidUTF16FromUCS4)(c) )
 686         {
 687             /* invalid UTF-16 value */
 688             /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
 689             c = 0;
 690             numChars = 0;
 691         }
 692         else if ( TY_(IsCombinedChar)(c) )
 693         {
 694             /* output both, unless something goes wrong */
 695             numChars = 2;
 696             if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) )
 697             {
 698                 /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
 699                 c = 0;
 700                 numChars = 0;
 701             }
 702         }
 703         else
 704         {
 705             /* just put the char out */
 706             theChars[0] = c;
 707         }
 708
 709         for (i = 0; i < numChars; i++)
 710         {
 711             c = theChars[i];
 712
 713             if (out->encoding == UTF16LE)
 714             {
 715                 uint ch = c & 0xFF; PutByte(ch, out);
 716                 ch = (c >> 8) & 0xFF; PutByte(ch, out);
 717             }
 718
 719             else if (out->encoding == UTF16BE || out->encoding == UTF16)
 720             {
 721                 uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
 722                 ch = c & 0xFF; PutByte(ch, out);
 723             }
 724         }
 725     }
 726 #endif
 727
 728 #if SUPPORT_ASIAN_ENCODINGS
 729     else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
 730     {
 731         if (c < 128)
 732             PutByte(c, out);
 733         else
 734         {
 735             uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
 736             ch = c & 0xFF; PutByte(ch, out);
 737         }
 738     }
 739 #endif
 740
 741     else
 742         PutByte( c, out );
 743 }
 744
 745
 746
 747 /****************************
 748 ** Miscellaneous / Helpers
 749 ****************************/
 750
/* Character encoding used when replacing illegal SGML chars,
** regardless of the specified input encoding.  Fixed at compile
** time through DFLT_REPL_CHARENC; TY_(ReadChar) compares it against
** WIN1252 and MACROMAN to choose the table used for values 128-159.
*/
const int TY_(ReplacementCharEncoding) = DFLT_REPL_CHARENC;
 756
 757
/* Mapping for Windows Western character set CP 1252
** (chars 128-159/U+0080-U+009F) to Unicode.
** Indexed by (byte - 128).  Entries of 0x0000 mark bytes without a
** mapping; TY_(ReadChar) treats a 0 result as "no replacement".
*/
static const uint Win2Unicode[32] =
{
    0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
    0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
};
 768
 769 /* Function for conversion from Windows-1252 to Unicode */
 770 uint TY_(DecodeWin1252)(uint c)
 771 {
 772     if (127 < c && c < 160)
 773         c = Win2Unicode[c - 128];
 774
 775     return c;
 776 }
 777
 778 static void EncodeWin1252( uint c, StreamOut* out )
 779 {
 780     if (c < 128 || (c > 159 && c < 256))
 781         PutByte(c, out);
 782     else
 783     {
 784         int i;
 785
 786         for (i = 128; i < 160; i++)
 787             if (Win2Unicode[i - 128] == c)
 788             {
 789                 PutByte(i, out);
 790                 break;
 791             }
 792     }
 793 }
 794
/*
   John Love-Jensen contributed this table for mapping MacRoman
   character set to Unicode
*/

/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
/* Indexed by (byte - 128); one entry per byte 0x80-0xFF. */
static const uint Mac2Unicode[128] =
{
    /* x7F = DEL */

    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,

    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,

    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,

    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
                                            /* =BD U+2126 OHM SIGN */
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,

    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,

    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
                            /* =DB U+00A4 CURRENCY SIGN */
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,

    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
    /* xF0 = Apple Logo */
    /* =F0 U+2665 BLACK HEART SUIT */
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
};
 832
 833 /* Function to convert from MacRoman to Unicode */
 834 uint TY_(DecodeMacRoman)(uint c)
 835 {
 836     if (127 < c)
 837         c = Mac2Unicode[c - 128];
 838     return c;
 839 }
 840
 841 static void EncodeMacRoman( uint c, StreamOut* out )
 842 {
 843         if (c < 128)
 844             PutByte(c, out);
 845         else
 846         {
 847             /* For mac users, map Unicode back to MacRoman. */
 848             int i;
 849             for (i = 128; i < 256; i++)
 850             {
 851                 if (Mac2Unicode[i - 128] == c)
 852                 {
 853                     PutByte(i, out);
 854                     break;
 855                 }
 856             }
 857         }
 858 }
 859
/* Mapping for OS/2 Western character set CP 850
** (chars 128-255) to Unicode.
** Indexed by (byte - 128).  The entry for byte 0xD5 is U+20AC (Euro),
** so the same table serves EncodeIbm858 (IBM850+Euro).
*/
static const uint IBM2Unicode[128] =
{
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
    0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
    0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
    0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
    0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
    0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
    0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
    0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
    0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
    0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
    0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
};
 882
 883 /* Function for conversion from OS/2-850 to Unicode */
 884 static uint DecodeIbm850(uint c)
 885 {
 886     if (127 < c && c < 256)
 887         c = IBM2Unicode[c - 128];
 888
 889     return c;
 890 }
 891
 892 /* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
 893 static void EncodeIbm858( uint c, StreamOut* out )
 894 {
 895     if (c < 128)
 896         PutByte(c, out);
 897     else
 898     {
 899         int i;
 900         for (i = 128; i < 256; i++)
 901         {
 902             if (IBM2Unicode[i - 128] == c)
 903             {
 904                 PutByte(i, out);
 905                 break;
 906             }
 907         }
 908     }
 909 }
 910
 911
 912 /* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
 913 static uint DecodeLatin0(uint c)
 914 {
 915     if (159 < c && c < 191)
 916     {
 917         switch (c)
 918         {
 919         case 0xA4: c = 0x20AC; break;
 920         case 0xA6: c = 0x0160; break;
 921         case 0xA8: c = 0x0161; break;
 922         case 0xB4: c = 0x017D; break;
 923         case 0xB8: c = 0x017E; break;
 924         case 0xBC: c = 0x0152; break;
 925         case 0xBD: c = 0x0153; break;
 926         case 0xBE: c = 0x0178; break;
 927         }
 928     }
 929     return c;
 930 }
 931
 932 /* Map Unicode back to ISO-8859-15. */
 933 static void EncodeLatin0( uint c, StreamOut* out )
 934 {
 935     switch (c)
 936     {
 937     case 0x20AC: c = 0xA4; break;
 938     case 0x0160: c = 0xA6; break;
 939     case 0x0161: c = 0xA8; break;
 940     case 0x017D: c = 0xB4; break;
 941     case 0x017E: c = 0xB8; break;
 942     case 0x0152: c = 0xBC; break;
 943     case 0x0153: c = 0xBD; break;
 944     case 0x0178: c = 0xBE; break;
 945     }
 946     PutByte(c, out);
 947 }
 948
/*
   Table to map symbol font characters to Unicode; undefined
   characters are mapped to 0x0000 and characters without any
   Unicode equivalent are mapped to '?'. Is this appropriate?

   Indexed directly by the symbol font character code.  The only
   reader in this file, DecodeSymbolFont, is currently compiled out.
*/

static const uint Symbol2Unicode[] =
{
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,

    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
    0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,

    0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
    0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,

    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,

    0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
    0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,

    0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
    0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,

    0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
    0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,

    0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
    0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,

    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

    0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
    0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,

    0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,
    0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,

    0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
    0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,

    0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
    0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,

    0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,
    0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,

    0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,
    0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F
};
1005
#if 0
/* Function to convert from Symbol Font chars to Unicode.
** Disabled.  Values above 255 are returned unchanged; others are
** looked up in Symbol2Unicode.
*/
uint DecodeSymbolFont(uint c)
{
    if (c > 255)
        return c;

    /* todo: add some error message */

    return Symbol2Unicode[c];
}
#endif
1018
1019
1020 /* Facilitates user defined source by providing
1021 ** an entry point to marshal pointers-to-functions.
1022 ** Needed by .NET and possibly other language bindings.
1023 */
1024 Bool TIDY_CALL tidyInitSource( TidyInputSource*  source,
1025                                void*             srcData,
1026                                TidyGetByteFunc   gbFunc,
1027                                TidyUngetByteFunc ugbFunc,
1028                                TidyEOFFunc       endFunc )
1029 {
1030   Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
1031
1032   if ( status )
1033   {
1034     source->sourceData = srcData;
1035     source->getByte    = gbFunc;
1036     source->ungetByte  = ugbFunc;
1037     source->eof        = endFunc;
1038   }
1039
1040   return status;
1041 }
1042
1043 Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
1044                              void*           snkData,
1045                              TidyPutByteFunc pbFunc )
1046 {
1047   Bool status = ( sink && snkData && pbFunc );
1048   if ( status )
1049   {
1050     sink->sinkData = snkData;
1051     sink->putByte  = pbFunc;
1052   }
1053   return status;
1054 }
1055
1056 /* GetByte must return a byte value in a signed
1057 ** integer so that a negative value can signal EOF
1058 ** without interfering w/ 0-255 legitimate byte values.
1059 */
1060 uint TIDY_CALL tidyGetByte( TidyInputSource* source )
1061 {
1062   int bv = source->getByte( source->sourceData );
1063   return (uint) bv;
1064 }
1065 Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
1066 {
1067   return source->eof( source->sourceData );
1068 }
1069 void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
1070 {
1071     source->ungetByte( source->sourceData, (byte) ch );
1072 }
1073 void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
1074 {
1075     sink->putByte( sink->sinkData, (byte) ch );
1076 }
1077
1078 static uint ReadByte( StreamIn* in )
1079 {
1080     return tidyGetByte( &in->source );
1081 }
1082 Bool TY_(IsEOF)( StreamIn* in )
1083 {
1084     return tidyIsEOF( &in->source );
1085 }
1086 static void UngetByte( StreamIn* in, uint byteValue )
1087 {
1088     tidyUngetByte( &in->source, byteValue );
1089 }
1090 static void PutByte( uint byteValue, StreamOut* out )
1091 {
1092     tidyPutByte( &out->sink, byteValue );
1093 }
1094
1095 #if 0
1096 static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count )
1097 {
1098     int i;
1099
1100     for (i = 0; i < *count; i++)
1101     {
1102         /* should never get here; testing for 0xFF, a valid char, is not a good idea */
1103         if ( in && TY_(IsEOF)(in) )
1104         {
1105             /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */
1106             *count = -i;
1107             return;
1108         }
1109
1110         in->source.ungetByte( in->source.sourceData, buf[i] );
1111     }
1112 }
1113
1114 /*
1115    Read raw bytes from stream, return <= 0 if EOF; or if
1116    "unget" is true, Unget the bytes to re-synchronize the input stream
1117    Normally UTF-8 successor bytes are read using this routine.
1118 */
1119 static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count )
1120 {
1121     int ix;
1122     for ( ix=0; ix < *count; ++ix )
1123     {
1124         if ( in->rawPushed )
1125         {
1126             buf[ix] = in->rawBytebuf[ --in->rawBufpos ];
1127             if ( in->rawBufpos == 0 )
1128                 in->rawPushed = no;
1129         }
1130         else
1131         {
1132             if ( in->source.eof(in->source.sourceData) )
1133             {
1134                 *count = -i;
1135                 break;
1136             }
1137             buf[ix] = in->source.getByte( in->source.sourceData );
1138         }
1139     }
1140 }
1141 #endif /* 0 */
1142
/* Read one character from the stream, interpreting the input bytes
** according to in->encoding.
**
** Returns EndOfStream when the stream is at EOF, when ReadByte
** returns EndOfStream for the first byte, or when the second byte
** of a two-byte unit (UTF-16, Big5, Shift_JIS) is EndOfStream.
**
** Otherwise the result depends on the encoding:
**   ISO2022          - the byte itself, with the top bit set while the
**                      escape-sequence state machine is in FSM_NONASCII;
**   UTF16LE/BE/UTF16 - one 16-bit unit assembled from two bytes (no
**                      surrogate handling is done in this function);
**   UTF8             - the value produced by DecodeUTF8BytesToChar, or
**                      0xFFFD after reporting INVALID_UTF8 on error;
**   BIG5/SHIFTJIS    - a single byte, or (lead << 8) + trail;
**   > WIN32MLANG     - whatever Win32MLangGetChar returns;
**   anything else    - the byte itself.
**
** Note that the trailing "else" chain at the end of the function binds
** to whichever "if" the preprocessor leaves directly in front of it
** (the BIG5/SHIFTJIS test, or the UTF8 test when Asian encodings are
** compiled out). Both of those "if" bodies always return, so the chain
** is only reached when neither matched.
*/
static uint ReadCharFromStream( StreamIn* in )
{
    uint c, n;
#ifdef TIDY_WIN32_MLANG_SUPPORT
    uint bytesRead = 0;
#endif

    /* Ask the source about EOF before trying to read from it. */
    if ( TY_(IsEOF)(in) )
        return EndOfStream;

    c = ReadByte( in );

    if (c == EndOfStream)
        return c;

#ifndef NO_NATIVE_ISO2022_SUPPORT
    /*
       A document in ISO-2022 based encoding uses some ESC sequences
       called "designator" to switch character sets. The designators
       defined and used in ISO-2022-JP are:

        "ESC" + "(" + ?     for ISO646 variants

        "ESC" + "$" + ?     and
        "ESC" + "$" + "(" + ?   for multibyte character sets

       Where ? stands for a single character used to indicate the
       character set for multibyte characters.

       Tidy handles this by preserving the escape sequence and
       setting the top bit of each byte for non-ascii chars. This
       bit is then cleared on output. The input stream keeps track
       of the state to determine when to set/clear the bit.
    */

    if (in->encoding == ISO2022)
    {
        /* ESC always restarts designator parsing, whatever the state. */
        if (c == 0x1b)  /* ESC */
        {
            in->state = FSM_ESC;
            return c;
        }

        switch (in->state)
        {
        case FSM_ESC:
            /* Byte right after ESC: '$' or '(' continues a designator,
               anything else drops back to ASCII. */
            if (c == '$')
                in->state = FSM_ESCD;
            else if (c == '(')
                in->state = FSM_ESCP;
            else
                in->state = FSM_ASCII;
            break;

        case FSM_ESCD:
            /* After "ESC $": '(' means one more designator byte follows,
               otherwise this byte ends the designator. */
            if (c == '(')
                in->state = FSM_ESCDP;
            else
                in->state = FSM_NONASCII;
            break;

        case FSM_ESCDP:
            /* Final byte of "ESC $ ( ?". */
            in->state = FSM_NONASCII;
            break;

        case FSM_ESCP:
            /* Final byte of "ESC ( ?". */
            in->state = FSM_ASCII;
            break;

        case FSM_NONASCII:
            /* Mark the byte as non-ascii by setting its top bit. */
            c |= 0x80;
            break;

        case FSM_ASCII:
            break;
        }

        return c;
    }
#endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */

#if SUPPORT_UTF16_ENCODINGS
    if ( in->encoding == UTF16LE )
    {
        /* Little-endian: the first byte read (c) is the low byte. */
        uint c1 = ReadByte( in );
        if ( EndOfStream == c1 )
            return EndOfStream;
        n = (c1 << 8) + c;
        return n;
    }

    if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
    {
        /* Big-endian: the first byte read (c) is the high byte. */
        uint c1 = ReadByte( in );
        if ( EndOfStream == c1 )
            return EndOfStream;
        n = (c << 8) + c1;
        return n;
    }
#endif

    if ( in->encoding == UTF8 )
    {
        /* deal with UTF-8 encoded char */

        int err, count = 0;

        /* first byte "c" is passed in separately */
        err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count );
        if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
            return EndOfStream;
        else if (err)
        {
            /* set error position just before offending character */
            in->doc->lexer->lines = in->curline;
            in->doc->lexer->columns = in->curcol;

            TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no);
            n = 0xFFFD; /* replacement char */
        }

        return n;
    }

#if SUPPORT_ASIAN_ENCODINGS
    /*
       This section is suitable for any "multibyte" variable-width
       character encoding in which a one-byte code is less than
       128, and the first byte of a two-byte code is greater or
       equal to 128. Note that Big5 and ShiftJIS fit into this
       kind, even though their second byte may be less than 128
    */
    if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
    {
        if (c < 128)
            return c;
        else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
        {
            /*
              Rick Cameron pointed out that for Shift_JIS, the values from
              0xa1 through 0xdf represent single-byte characters
              (U+FF61 to U+FF9F - half-shift Katakana)
            */
            return c;
        }
        else
        {
            /* Two-byte code: c is the lead byte, c1 the trail byte. */
            uint c1 = ReadByte( in );
            if ( EndOfStream == c1 )
                return EndOfStream;
            n = (c << 8) + c1;
            return n;
        }
    }
#endif

#ifdef TIDY_WIN32_MLANG_SUPPORT
    else if (in->encoding > WIN32MLANG)
    {
        /* Encodings above WIN32MLANG are delegated to the MLang helper. */
        assert( in->mlang != NULL );
        return TY_(Win32MLangGetChar)((byte)c, in, &bytesRead);
    }
#endif

    else
        n = c; /* no decoding applied: return the byte as read */

    return n;
}
1313
1314 /* Output a Byte Order Mark if required */
1315 void TY_(outBOM)( StreamOut *out )
1316 {
1317     if ( out->encoding == UTF8
1318 #if SUPPORT_UTF16_ENCODINGS
1319          || out->encoding == UTF16LE
1320          || out->encoding == UTF16BE
1321          || out->encoding == UTF16
1322 #endif
1323        )
1324     {
1325         /* this will take care of encoding the BOM correctly */
1326         TY_(WriteChar)( UNICODE_BOM, out );
1327     }
1328 }
1329
/* This is an intermediate fix for various problems; in the long */
/* term the code and data in charsets.c should be used instead.  */
/*                                                               */
/* Table relating each Tidy encoding id to a charset name and to */
/* the name used for that encoding in Tidy options. It is read   */
/* by the three lookup functions that follow it.                 */
static struct _enc2iana
{
    uint id;              /* Tidy encoding id (ASCII, UTF8, ...) */
    ctmbstr name;         /* charset name (presumably the IANA name, going
                             by the table's name); NULL when the encoding
                             has none here (ISO2022, RAW) */
    ctmbstr tidyOptName;  /* name compared against option values by
                             GetCharEncodingFromOptName */
} const enc2iana[] =
{
  { ASCII,    "us-ascii",     "ascii"   },
  { LATIN0,   "iso-8859-15",  "latin0"  },
  { LATIN1,   "iso-8859-1",   "latin1"  },
  { UTF8,     "utf-8",        "utf8"   },
  { MACROMAN, "macintosh",    "mac"     },
  { WIN1252,  "windows-1252", "win1252" },
  { IBM858,   "ibm00858",     "ibm858"  },
#if SUPPORT_UTF16_ENCODINGS
  /* all three UTF-16 ids share the same charset name */
  { UTF16LE,  "utf-16",       "utf16le" },
  { UTF16BE,  "utf-16",       "utf16be" },
  { UTF16,    "utf-16",       "utf16"   },
#endif
#if SUPPORT_ASIAN_ENCODINGS
  { BIG5,     "big5",         "big5"    },
  { SHIFTJIS, "shift_jis",    "shiftjis"},
#endif
#ifndef NO_NATIVE_ISO2022_SUPPORT
  { ISO2022,  NULL,           "iso2022" },
#endif
  { RAW,      NULL,           "raw"     }
};
1360
1361 ctmbstr TY_(GetEncodingNameFromTidyId)(uint id)
1362 {
1363     uint i;
1364
1365     for (i = 0; enc2iana[i].name; ++i)
1366         if (enc2iana[i].id == id)
1367             return enc2iana[i].name;
1368
1369     return NULL;
1370 }
1371
1372 ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id)
1373 {
1374     uint i;
1375
1376     for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1377         if (enc2iana[i].id == id)
1378             return enc2iana[i].tidyOptName;
1379
1380     return NULL;
1381 }
1382
1383 int TY_(GetCharEncodingFromOptName)( ctmbstr charenc )
1384 {
1385     uint i;
1386
1387     for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1388         if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 )
1389             return enc2iana[i].id;
1390
1391     return -1;
1392 }
1393
1394 /*
1395  * local variables:
1396  * mode: c
1397  * indent-tabs-mode: nil
1398  * c-basic-offset: 4
1399  * eval: (c-set-offset 'substatement-open 0)
1400  * end:
1401  */