Source/WebCore/platform/text/mac/TextCodecMac.cpp

   1 /*
   2  * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved.
   3  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include "config.h"
  28 #include "TextCodecMac.h"
  29
  30 #include "CharsetData.h"
  31 #include "PlatformString.h"
  32 #include "ThreadGlobalData.h"
  33 #include <wtf/Assertions.h>
  34 #include <wtf/PassOwnPtr.h>
  35 #include <wtf/RetainPtr.h>
  36 #include <wtf/Threading.h>
  37 #include <wtf/text/CString.h>
  38 #include <wtf/unicode/CharacterNames.h>
  39
  40 using namespace std;
  41
  42 namespace WebCore {
  43
  44 // We need to keep this because ICU doesn't support some of the encodings that we need:
  45 // <http://bugs.webkit.org/show_bug.cgi?id=4195>.
  46
  47 const size_t ConversionBufferSize = 16384;
  48
  49 static TECConverterWrapper& cachedConverterTEC()
  50 {
  51     return threadGlobalData().cachedConverterTEC();
  52 }
  53
  54 void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar)
  55 {
  56     TECTextEncodingID lastEncoding = invalidEncoding;
  57     const char* lastName = 0;
  58
  59     for (size_t i = 0; CharsetTable[i].name; ++i) {
  60         if (CharsetTable[i].encoding != lastEncoding) {
  61             lastEncoding = CharsetTable[i].encoding;
  62             lastName = CharsetTable[i].name;
  63         }
  64         registrar(CharsetTable[i].name, lastName);
  65     }
  66 }
  67
  68 static PassOwnPtr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData)
  69 {
  70     return new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData));
  71 }
  72
  73 void TextCodecMac::registerCodecs(TextCodecRegistrar registrar)
  74 {
  75     TECTextEncodingID lastEncoding = invalidEncoding;
  76
  77     for (size_t i = 0; CharsetTable[i].name; ++i)
  78         if (CharsetTable[i].encoding != lastEncoding) {
  79             registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding);
  80             lastEncoding = CharsetTable[i].encoding;
  81         }
  82 }
  83
  84 TextCodecMac::TextCodecMac(TECTextEncodingID encoding)
  85     : m_encoding(encoding)
  86     , m_numBufferedBytes(0)
  87     , m_converterTEC(0)
  88 {
  89 }
  90
  91 TextCodecMac::~TextCodecMac()
  92 {
  93     releaseTECConverter();
  94 }
  95
  96 void TextCodecMac::releaseTECConverter() const
  97 {
  98     if (m_converterTEC) {
  99         TECConverterWrapper& cachedConverter = cachedConverterTEC();
 100         if (cachedConverter.converter)
 101             TECDisposeConverter(cachedConverter.converter);
 102         cachedConverter.converter = m_converterTEC;
 103         cachedConverter.encoding = m_encoding;
 104         m_converterTEC = 0;
 105     }
 106 }
 107
 108 OSStatus TextCodecMac::createTECConverter() const
 109 {
 110     TECConverterWrapper& cachedConverter = cachedConverterTEC();
 111
 112     bool cachedEncodingEqual = cachedConverter.encoding == m_encoding;
 113     cachedConverter.encoding = invalidEncoding;
 114
 115     if (cachedEncodingEqual && cachedConverter.converter) {
 116         m_converterTEC = cachedConverter.converter;
 117         cachedConverter.converter = 0;
 118
 119         TECClearConverterContextInfo(m_converterTEC);
 120     } else {
 121         OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding,
 122             CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat));
 123         if (status)
 124             return status;
 125
 126         TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
 127     }
 128
 129     return noErr;
 130 }
 131
 132 OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength,
 133     void *outputBuffer, int outputBufferLength, int& outputLength)
 134 {
 135     OSStatus status;
 136     unsigned long bytesRead = 0;
 137     unsigned long bytesWritten = 0;
 138
 139     if (m_numBufferedBytes != 0) {
 140         // Finish converting a partial character that's in our buffer.
 141
 142         // First, fill the partial character buffer with as many bytes as are available.
 143         ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes));
 144         const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes;
 145         const int bytesToPutInBuffer = min(spaceInBuffer, inputBufferLength);
 146         ASSERT(bytesToPutInBuffer != 0);
 147         memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer);
 148
 149         // Now, do a conversion on the buffer.
 150         status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead,
 151             reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
 152         ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer);
 153
 154         if (status == kTECPartialCharErr && bytesRead == 0) {
 155             // Handle the case where the partial character was not converted.
 156             if (bytesToPutInBuffer >= spaceInBuffer) {
 157                 LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes));
 158                 m_numBufferedBytes = 0;
 159                 status = kTECUnmappableElementErr; // should never happen, but use this error code
 160             } else {
 161                 // Tell the caller we read all the source bytes and keep them in the buffer.
 162                 m_numBufferedBytes += bytesToPutInBuffer;
 163                 bytesRead = bytesToPutInBuffer;
 164                 status = noErr;
 165             }
 166         } else {
 167             // We are done with the partial character buffer.
 168             // Also, we have read some of the bytes from the main buffer.
 169             if (bytesRead > m_numBufferedBytes) {
 170                 bytesRead -= m_numBufferedBytes;
 171             } else {
 172                 LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
 173                 bytesRead = 0;
 174             }
 175             m_numBufferedBytes = 0;
 176             if (status == kTECPartialCharErr) {
 177                 // While there may be a partial character problem in the small buffer,
 178                 // we have to try again and not get confused and think there is a partial
 179                 // character problem in the large buffer.
 180                 status = noErr;
 181             }
 182         }
 183     } else {
 184         status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead,
 185             static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
 186         ASSERT(static_cast<int>(bytesRead) <= inputBufferLength);
 187     }
 188
 189     // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
 190     if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0)
 191         status = kTECOutputBufferFullStatus;
 192
 193     inputLength = bytesRead;
 194     outputLength = bytesWritten;
 195     return status;
 196 }
 197
 198 String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
 199 {
 200     // Get a converter for the passed-in encoding.
 201     if (!m_converterTEC && createTECConverter() != noErr)
 202         return String();
 203
 204     Vector<UChar> result;
 205
 206     const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes);
 207     int sourceLength = length;
 208     bool bufferWasFull = false;
 209     UniChar buffer[ConversionBufferSize];
 210
 211     while ((sourceLength || bufferWasFull) && !sawError) {
 212         int bytesRead = 0;
 213         int bytesWritten = 0;
 214         OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
 215         ASSERT(bytesRead <= sourceLength);
 216         sourcePointer += bytesRead;
 217         sourceLength -= bytesRead;
 218
 219         switch (status) {
 220             case noErr:
 221             case kTECOutputBufferFullStatus:
 222                 break;
 223             case kTextMalformedInputErr:
 224             case kTextUndefinedElementErr:
 225                 // FIXME: Put FFFD character into the output string in this case?
 226                 TECClearConverterContextInfo(m_converterTEC);
 227                 if (stopOnError) {
 228                     sawError = true;
 229                     break;
 230                 }
 231                 if (sourceLength) {
 232                     sourcePointer += 1;
 233                     sourceLength -= 1;
 234                 }
 235                 break;
 236             case kTECPartialCharErr: {
 237                 // Put the partial character into the buffer.
 238                 ASSERT(m_numBufferedBytes == 0);
 239                 const int bufferSize = sizeof(m_numBufferedBytes);
 240                 if (sourceLength < bufferSize) {
 241                     memcpy(m_bufferedBytes, sourcePointer, sourceLength);
 242                     m_numBufferedBytes = sourceLength;
 243                 } else {
 244                     LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
 245                 }
 246                 sourceLength = 0;
 247                 break;
 248             }
 249             default:
 250                 sawError = true;
 251                 return String();
 252         }
 253
 254         ASSERT(!(bytesWritten % sizeof(UChar)));
 255         result.append(buffer, bytesWritten / sizeof(UChar));
 256
 257         bufferWasFull = status == kTECOutputBufferFullStatus;
 258     }
 259
 260     if (flush) {
 261         unsigned long bytesWritten = 0;
 262         TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
 263         ASSERT(!(bytesWritten % sizeof(UChar)));
 264         result.append(buffer, bytesWritten / sizeof(UChar));
 265     }
 266
 267     String resultString = String::adopt(result);
 268
 269     // <rdar://problem/3225472>
 270     // Simplified Chinese pages use the code A3A0 to mean "full-width space".
 271     // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
 272     // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
 273     if (m_encoding == kCFStringEncodingGB_18030_2000)
 274         resultString.replace(0xE5E5, ideographicSpace);
 275
 276     return resultString;
 277 }
 278
 279 CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling)
 280 {
 281     // FIXME: We should really use TEC here instead of CFString for consistency with the other direction.
 282
 283     // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
 284     // Encoding will change the yen sign back into a backslash.
 285     String copy(characters, length);
 286     copy.replace('\\', m_backslashAsCurrencySymbol);
 287     RetainPtr<CFStringRef> cfs(AdoptCF, copy.createCFString());
 288
 289     CFIndex startPos = 0;
 290     CFIndex charactersLeft = CFStringGetLength(cfs.get());
 291     Vector<char> result;
 292     size_t size = 0;
 293     UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0;
 294     while (charactersLeft > 0) {
 295         CFRange range = CFRangeMake(startPos, charactersLeft);
 296         CFIndex bufferLength;
 297         CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength);
 298
 299         result.grow(size + bufferLength);
 300         unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size);
 301         CFIndex charactersConverted = CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength);
 302         size += bufferLength;
 303
 304         if (charactersConverted != charactersLeft) {
 305             unsigned badChar = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
 306             ++charactersConverted;
 307             if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate
 308                 UniChar low = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
 309                 if ((low & 0xFC00) == 0xDC00) { // is low surrogate
 310                     badChar <<= 10;
 311                     badChar += low;
 312                     badChar += 0x10000 - (0xD800 << 10) - 0xDC00;
 313                     ++charactersConverted;
 314                 }
 315             }
 316             UnencodableReplacementArray entity;
 317             int entityLength = getUnencodableReplacement(badChar, handling, entity);
 318             result.grow(size + entityLength);
 319             memcpy(result.data() + size, entity, entityLength);
 320             size += entityLength;
 321         }
 322
 323         startPos += charactersConverted;
 324         charactersLeft -= charactersConverted;
 325     }
 326     return CString(result.data(), size);
 327 }
 328
 329 } // namespace WebCore