lib/Support/JSON.cpp

   1 //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===---------------------------------------------------------------------===//
   9
#include "llvm/Support/JSON.h"
#include "llvm/Support/Format.h"
#include <cctype>
#include <cerrno>
  13
  14 namespace llvm {
  15 namespace json {
  16
  17 Value &Object::operator[](const ObjectKey &K) {
  18   return try_emplace(K, nullptr).first->getSecond();
  19 }
  20 Value &Object::operator[](ObjectKey &&K) {
  21   return try_emplace(std::move(K), nullptr).first->getSecond();
  22 }
  23 Value *Object::get(StringRef K) {
  24   auto I = find(K);
  25   if (I == end())
  26     return nullptr;
  27   return &I->second;
  28 }
  29 const Value *Object::get(StringRef K) const {
  30   auto I = find(K);
  31   if (I == end())
  32     return nullptr;
  33   return &I->second;
  34 }
  35 llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
  36   if (auto *V = get(K))
  37     return V->getAsNull();
  38   return llvm::None;
  39 }
  40 llvm::Optional<bool> Object::getBoolean(StringRef K) const {
  41   if (auto *V = get(K))
  42     return V->getAsBoolean();
  43   return llvm::None;
  44 }
  45 llvm::Optional<double> Object::getNumber(StringRef K) const {
  46   if (auto *V = get(K))
  47     return V->getAsNumber();
  48   return llvm::None;
  49 }
  50 llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
  51   if (auto *V = get(K))
  52     return V->getAsInteger();
  53   return llvm::None;
  54 }
  55 llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
  56   if (auto *V = get(K))
  57     return V->getAsString();
  58   return llvm::None;
  59 }
  60 const json::Object *Object::getObject(StringRef K) const {
  61   if (auto *V = get(K))
  62     return V->getAsObject();
  63   return nullptr;
  64 }
  65 json::Object *Object::getObject(StringRef K) {
  66   if (auto *V = get(K))
  67     return V->getAsObject();
  68   return nullptr;
  69 }
  70 const json::Array *Object::getArray(StringRef K) const {
  71   if (auto *V = get(K))
  72     return V->getAsArray();
  73   return nullptr;
  74 }
  75 json::Array *Object::getArray(StringRef K) {
  76   if (auto *V = get(K))
  77     return V->getAsArray();
  78   return nullptr;
  79 }
  80 bool operator==(const Object &LHS, const Object &RHS) {
  81   if (LHS.size() != RHS.size())
  82     return false;
  83   for (const auto &L : LHS) {
  84     auto R = RHS.find(L.first);
  85     if (R == RHS.end() || L.second != R->second)
  86       return false;
  87   }
  88   return true;
  89 }
  90
  91 Array::Array(std::initializer_list<Value> Elements) {
  92   V.reserve(Elements.size());
  93   for (const Value &V : Elements) {
  94     emplace_back(nullptr);
  95     back().moveFrom(std::move(V));
  96   }
  97 }
  98
// Builds a Value from a braced list of values: the elements are first gathered
// into a json::Array, and construction is then delegated to the Value
// constructor that accepts that array.
Value::Value(std::initializer_list<Value> Elements)
    : Value(json::Array(Elements)) {}
 101
 102 void Value::copyFrom(const Value &M) {
 103   Type = M.Type;
 104   switch (Type) {
 105   case T_Null:
 106   case T_Boolean:
 107   case T_Double:
 108   case T_Integer:
 109     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
 110     break;
 111   case T_StringRef:
 112     create<StringRef>(M.as<StringRef>());
 113     break;
 114   case T_String:
 115     create<std::string>(M.as<std::string>());
 116     break;
 117   case T_Object:
 118     create<json::Object>(M.as<json::Object>());
 119     break;
 120   case T_Array:
 121     create<json::Array>(M.as<json::Array>());
 122     break;
 123   }
 124 }
 125
 126 void Value::moveFrom(const Value &&M) {
 127   Type = M.Type;
 128   switch (Type) {
 129   case T_Null:
 130   case T_Boolean:
 131   case T_Double:
 132   case T_Integer:
 133     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
 134     break;
 135   case T_StringRef:
 136     create<StringRef>(M.as<StringRef>());
 137     break;
 138   case T_String:
 139     create<std::string>(std::move(M.as<std::string>()));
 140     M.Type = T_Null;
 141     break;
 142   case T_Object:
 143     create<json::Object>(std::move(M.as<json::Object>()));
 144     M.Type = T_Null;
 145     break;
 146   case T_Array:
 147     create<json::Array>(std::move(M.as<json::Array>()));
 148     M.Type = T_Null;
 149     break;
 150   }
 151 }
 152
 153 void Value::destroy() {
 154   switch (Type) {
 155   case T_Null:
 156   case T_Boolean:
 157   case T_Double:
 158   case T_Integer:
 159     break;
 160   case T_StringRef:
 161     as<StringRef>().~StringRef();
 162     break;
 163   case T_String:
 164     as<std::string>().~basic_string();
 165     break;
 166   case T_Object:
 167     as<json::Object>().~Object();
 168     break;
 169   case T_Array:
 170     as<json::Array>().~Array();
 171     break;
 172   }
 173 }
 174
 175 bool operator==(const Value &L, const Value &R) {
 176   if (L.kind() != R.kind())
 177     return false;
 178   switch (L.kind()) {
 179   case Value::Null:
 180     return *L.getAsNull() == *R.getAsNull();
 181   case Value::Boolean:
 182     return *L.getAsBoolean() == *R.getAsBoolean();
 183   case Value::Number:
 184     return *L.getAsNumber() == *R.getAsNumber();
 185   case Value::String:
 186     return *L.getAsString() == *R.getAsString();
 187   case Value::Array:
 188     return *L.getAsArray() == *R.getAsArray();
 189   case Value::Object:
 190     return *L.getAsObject() == *R.getAsObject();
 191   }
 192   llvm_unreachable("Unknown value kind");
 193 }
 194
 195 namespace {
 196 // Simple recursive-descent JSON parser.
 197 class Parser {
 198 public:
 199   Parser(StringRef JSON)
 200       : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
 201
 202   bool parseValue(Value &Out);
 203
 204   bool assertEnd() {
 205     eatWhitespace();
 206     if (P == End)
 207       return true;
 208     return parseError("Text after end of document");
 209   }
 210
 211   Error takeError() {
 212     assert(Err);
 213     return std::move(*Err);
 214   }
 215
 216 private:
 217   void eatWhitespace() {
 218     while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
 219       ++P;
 220   }
 221
 222   // On invalid syntax, parseX() functions return false and set Err.
 223   bool parseNumber(char First, Value &Out);
 224   bool parseString(std::string &Out);
 225   bool parseUnicode(std::string &Out);
 226   bool parseError(const char *Msg); // always returns false
 227
 228   char next() { return P == End ? 0 : *P++; }
 229   char peek() { return P == End ? 0 : *P; }
 230   static bool isNumber(char C) {
 231     return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
 232            C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
 233            C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
 234   }
 235
 236   Optional<Error> Err;
 237   const char *Start, *P, *End;
 238 };
 239
 240 bool Parser::parseValue(Value &Out) {
 241   eatWhitespace();
 242   if (P == End)
 243     return parseError("Unexpected EOF");
 244   switch (char C = next()) {
 245   // Bare null/true/false are easy - first char identifies them.
 246   case 'n':
 247     Out = nullptr;
 248     return (next() == 'u' && next() == 'l' && next() == 'l') ||
 249            parseError("Invalid JSON value (null?)");
 250   case 't':
 251     Out = true;
 252     return (next() == 'r' && next() == 'u' && next() == 'e') ||
 253            parseError("Invalid JSON value (true?)");
 254   case 'f':
 255     Out = false;
 256     return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
 257            parseError("Invalid JSON value (false?)");
 258   case '"': {
 259     std::string S;
 260     if (parseString(S)) {
 261       Out = std::move(S);
 262       return true;
 263     }
 264     return false;
 265   }
 266   case '[': {
 267     Out = Array{};
 268     Array &A = *Out.getAsArray();
 269     eatWhitespace();
 270     if (peek() == ']') {
 271       ++P;
 272       return true;
 273     }
 274     for (;;) {
 275       A.emplace_back(nullptr);
 276       if (!parseValue(A.back()))
 277         return false;
 278       eatWhitespace();
 279       switch (next()) {
 280       case ',':
 281         eatWhitespace();
 282         continue;
 283       case ']':
 284         return true;
 285       default:
 286         return parseError("Expected , or ] after array element");
 287       }
 288     }
 289   }
 290   case '{': {
 291     Out = Object{};
 292     Object &O = *Out.getAsObject();
 293     eatWhitespace();
 294     if (peek() == '}') {
 295       ++P;
 296       return true;
 297     }
 298     for (;;) {
 299       if (next() != '"')
 300         return parseError("Expected object key");
 301       std::string K;
 302       if (!parseString(K))
 303         return false;
 304       eatWhitespace();
 305       if (next() != ':')
 306         return parseError("Expected : after object key");
 307       eatWhitespace();
 308       if (!parseValue(O[std::move(K)]))
 309         return false;
 310       eatWhitespace();
 311       switch (next()) {
 312       case ',':
 313         eatWhitespace();
 314         continue;
 315       case '}':
 316         return true;
 317       default:
 318         return parseError("Expected , or } after object property");
 319       }
 320     }
 321   }
 322   default:
 323     if (isNumber(C))
 324       return parseNumber(C, Out);
 325     return parseError("Invalid JSON value");
 326   }
 327 }
 328
 329 bool Parser::parseNumber(char First, Value &Out) {
 330   // Read the number into a string. (Must be null-terminated for strto*).
 331   SmallString<24> S;
 332   S.push_back(First);
 333   while (isNumber(peek()))
 334     S.push_back(next());
 335   char *End;
 336   // Try first to parse as integer, and if so preserve full 64 bits.
 337   // strtoll returns long long >= 64 bits, so check it's in range too.
 338   auto I = std::strtoll(S.c_str(), &End, 10);
 339   if (End == S.end() && I >= std::numeric_limits<int64_t>::min() &&
 340       I <= std::numeric_limits<int64_t>::max()) {
 341     Out = int64_t(I);
 342     return true;
 343   }
 344   // If it's not an integer
 345   Out = std::strtod(S.c_str(), &End);
 346   return End == S.end() || parseError("Invalid JSON value (number?)");
 347 }
 348
 349 bool Parser::parseString(std::string &Out) {
 350   // leading quote was already consumed.
 351   for (char C = next(); C != '"'; C = next()) {
 352     if (LLVM_UNLIKELY(P == End))
 353       return parseError("Unterminated string");
 354     if (LLVM_UNLIKELY((C & 0x1f) == C))
 355       return parseError("Control character in string");
 356     if (LLVM_LIKELY(C != '\\')) {
 357       Out.push_back(C);
 358       continue;
 359     }
 360     // Handle escape sequence.
 361     switch (C = next()) {
 362     case '"':
 363     case '\\':
 364     case '/':
 365       Out.push_back(C);
 366       break;
 367     case 'b':
 368       Out.push_back('\b');
 369       break;
 370     case 'f':
 371       Out.push_back('\f');
 372       break;
 373     case 'n':
 374       Out.push_back('\n');
 375       break;
 376     case 'r':
 377       Out.push_back('\r');
 378       break;
 379     case 't':
 380       Out.push_back('\t');
 381       break;
 382     case 'u':
 383       if (!parseUnicode(Out))
 384         return false;
 385       break;
 386     default:
 387       return parseError("Invalid escape sequence");
 388     }
 389   }
 390   return true;
 391 }
 392
 393 static void encodeUtf8(uint32_t Rune, std::string &Out) {
 394   if (Rune < 0x80) {
 395     Out.push_back(Rune & 0x7F);
 396   } else if (Rune < 0x800) {
 397     uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
 398     uint8_t SecondByte = 0x80 | (Rune & 0x3F);
 399     Out.push_back(FirstByte);
 400     Out.push_back(SecondByte);
 401   } else if (Rune < 0x10000) {
 402     uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
 403     uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
 404     uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
 405     Out.push_back(FirstByte);
 406     Out.push_back(SecondByte);
 407     Out.push_back(ThirdByte);
 408   } else if (Rune < 0x110000) {
 409     uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
 410     uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
 411     uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
 412     uint8_t FourthByte = 0x80 | (Rune & 0x3F);
 413     Out.push_back(FirstByte);
 414     Out.push_back(SecondByte);
 415     Out.push_back(ThirdByte);
 416     Out.push_back(FourthByte);
 417   } else {
 418     llvm_unreachable("Invalid codepoint");
 419   }
 420 }
 421
 422 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
 423 // May parse several sequential escapes to ensure proper surrogate handling.
 424 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
 425 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
 426 bool Parser::parseUnicode(std::string &Out) {
 427   // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD.
 428   auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
 429   // Decodes 4 hex digits from the stream into Out, returns false on error.
 430   auto Parse4Hex = [this](uint16_t &Out) -> bool {
 431     Out = 0;
 432     char Bytes[] = {next(), next(), next(), next()};
 433     for (unsigned char C : Bytes) {
 434       if (!std::isxdigit(C))
 435         return parseError("Invalid \\u escape sequence");
 436       Out <<= 4;
 437       Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
 438     }
 439     return true;
 440   };
 441   uint16_t First; // UTF-16 code unit from the first \u escape.
 442   if (!Parse4Hex(First))
 443     return false;
 444
 445   // We loop to allow proper surrogate-pair error handling.
 446   while (true) {
 447     // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
 448     if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
 449       encodeUtf8(First, Out);
 450       return true;
 451     }
 452
 453     // Case 2: it's an (unpaired) trailing surrogate.
 454     if (LLVM_UNLIKELY(First >= 0xDC00)) {
 455       Invalid();
 456       return true;
 457     }
 458
 459     // Case 3: it's a leading surrogate. We expect a trailing one next.
 460     // Case 3a: there's no trailing \u escape. Don't advance in the stream.
 461     if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) {
 462       Invalid(); // Leading surrogate was unpaired.
 463       return true;
 464     }
 465     P += 2;
 466     uint16_t Second;
 467     if (!Parse4Hex(Second))
 468       return false;
 469     // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
 470     if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
 471       Invalid();      // Leading surrogate was unpaired.
 472       First = Second; // Second escape still needs to be processed.
 473       continue;
 474     }
 475     // Case 3c: a valid surrogate pair encoding an astral codepoint.
 476     encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
 477     return true;
 478   }
 479 }
 480
 481 bool Parser::parseError(const char *Msg) {
 482   int Line = 1;
 483   const char *StartOfLine = Start;
 484   for (const char *X = Start; X < P; ++X) {
 485     if (*X == 0x0A) {
 486       ++Line;
 487       StartOfLine = X + 1;
 488     }
 489   }
 490   Err.emplace(
 491       llvm::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
 492   return false;
 493 }
 494 } // namespace
 495
 496 Expected<Value> parse(StringRef JSON) {
 497   Parser P(JSON);
 498   Value E = nullptr;
 499   if (P.parseValue(E))
 500     if (P.assertEnd())
 501       return std::move(E);
 502   return P.takeError();
 503 }
// Out-of-line definition of ParseError's static ID member.
// NOTE(review): presumably only its address matters, as the class identity for
// LLVM's Error machinery - confirm against the ParseError declaration.
char ParseError::ID = 0;
 505
 506 static std::vector<const Object::value_type *> sortedElements(const Object &O) {
 507   std::vector<const Object::value_type *> Elements;
 508   for (const auto &E : O)
 509     Elements.push_back(&E);
 510   llvm::sort(Elements.begin(), Elements.end(),
 511              [](const Object::value_type *L, const Object::value_type *R) {
 512                return L->first < R->first;
 513              });
 514   return Elements;
 515 }
 516
 517 } // namespace json
 518 } // namespace llvm
 519
 520 static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
 521   OS << '\"';
 522   for (unsigned char C : S) {
 523     if (C == 0x22 || C == 0x5C)
 524       OS << '\\';
 525     if (C >= 0x20) {
 526       OS << C;
 527       continue;
 528     }
 529     OS << '\\';
 530     switch (C) {
 531     // A few characters are common enough to make short escapes worthwhile.
 532     case '\t':
 533       OS << 't';
 534       break;
 535     case '\n':
 536       OS << 'n';
 537       break;
 538     case '\r':
 539       OS << 'r';
 540       break;
 541     default:
 542       OS << 'u';
 543       llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
 544       break;
 545     }
 546   }
 547   OS << '\"';
 548 }
 549
 550 enum IndenterAction {
 551   Indent,
 552   Outdent,
 553   Newline,
 554   Space,
 555 };
 556
 557 // Prints JSON. The indenter can be used to control formatting.
 558 template <typename Indenter>
 559 void llvm::json::Value::print(raw_ostream &OS, const Indenter &I) const {
 560   switch (Type) {
 561   case T_Null:
 562     OS << "null";
 563     break;
 564   case T_Boolean:
 565     OS << (as<bool>() ? "true" : "false");
 566     break;
 567   case T_Double:
 568     OS << format("%.*g", std::numeric_limits<double>::max_digits10,
 569                  as<double>());
 570     break;
 571   case T_Integer:
 572     OS << as<int64_t>();
 573     break;
 574   case T_StringRef:
 575     quote(OS, as<StringRef>());
 576     break;
 577   case T_String:
 578     quote(OS, as<std::string>());
 579     break;
 580   case T_Object: {
 581     bool Comma = false;
 582     OS << '{';
 583     I(Indent);
 584     for (const auto *P : sortedElements(as<json::Object>())) {
 585       if (Comma)
 586         OS << ',';
 587       Comma = true;
 588       I(Newline);
 589       quote(OS, P->first);
 590       OS << ':';
 591       I(Space);
 592       P->second.print(OS, I);
 593     }
 594     I(Outdent);
 595     if (Comma)
 596       I(Newline);
 597     OS << '}';
 598     break;
 599   }
 600   case T_Array: {
 601     bool Comma = false;
 602     OS << '[';
 603     I(Indent);
 604     for (const auto &E : as<json::Array>()) {
 605       if (Comma)
 606         OS << ',';
 607       Comma = true;
 608       I(Newline);
 609       E.print(OS, I);
 610     }
 611     I(Outdent);
 612     if (Comma)
 613       I(Newline);
 614     OS << ']';
 615     break;
 616   }
 617   }
 618 }
 619
 620 void llvm::format_provider<llvm::json::Value>::format(
 621     const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
 622   if (Options.empty()) {
 623     OS << E;
 624     return;
 625   }
 626   unsigned IndentAmount = 0;
 627   if (Options.getAsInteger(/*Radix=*/10, IndentAmount))
 628     llvm_unreachable("json::Value format options should be an integer");
 629   unsigned IndentLevel = 0;
 630   E.print(OS, [&](IndenterAction A) {
 631     switch (A) {
 632     case Newline:
 633       OS << '\n';
 634       OS.indent(IndentLevel);
 635       break;
 636     case Space:
 637       OS << ' ';
 638       break;
 639     case Indent:
 640       IndentLevel += IndentAmount;
 641       break;
 642     case Outdent:
 643       IndentLevel -= IndentAmount;
 644       break;
 645     };
 646   });
 647 }
 648
 649 llvm::raw_ostream &llvm::json::operator<<(raw_ostream &OS, const Value &E) {
 650   E.print(OS, [](IndenterAction A) { /*ignore*/ });
 651   return OS;
 652 }