AAPT2: Iterate over UTF-8 string by codepoints

author Adam Lesinski <adamlesinski@google.com>

Wed, 28 Jun 2017 01:39:07 +0000 (18:39 -0700)

committer Adam Lesinski <adamlesinski@google.com>

Fri, 30 Jun 2017 01:17:35 +0000 (18:17 -0700)
author Adam Lesinski <adamlesinski@google.com>
Wed, 28 Jun 2017 01:39:07 +0000 (18:39 -0700)
committer Adam Lesinski <adamlesinski@google.com>
Fri, 30 Jun 2017 01:17:35 +0000 (18:17 -0700)
diff --git a/tools/aapt2/Android.bp b/tools/aapt2/Android.bp

index d39a8a8..46c0ff0 100644 (file)
--- a/tools/aapt2/Android.bp
+++ b/tools/aapt2/Android.bp
@@ -115,6 +115,7 @@ cc_library_host_static {
          "unflatten/ResChunkPullParser.cpp",
          "util/BigBuffer.cpp",
          "util/Files.cpp",
+        "util/Utf8Iterator.cpp",
          "util/Util.cpp",
          "ConfigDescription.cpp",
          "Debug.cpp",
diff --git a/tools/aapt2/ResourceParser_test.cpp b/tools/aapt2/ResourceParser_test.cpp

index e189144..d47a529 100644 (file)
--- a/tools/aapt2/ResourceParser_test.cpp
+++ b/tools/aapt2/ResourceParser_test.cpp
@@ -95,6 +95,11 @@ TEST_F(ResourceParserTest, ParseEscapedString) {
    ASSERT_THAT(str, NotNull());
    EXPECT_THAT(*str, StrValueEq("?123"));
    EXPECT_THAT(str->untranslatable_sections, IsEmpty());
+
+  ASSERT_TRUE(TestParse(R"(<string name="bar">This isn\’t a bad string</string>)"));
+  str = test::GetValue<String>(&table_, "string/bar");
+  ASSERT_THAT(str, NotNull());
+  EXPECT_THAT(*str, StrValueEq("This isn’t a bad string"));
  }
  
  TEST_F(ResourceParserTest, ParseFormattedString) {
diff --git a/tools/aapt2/readme.md b/tools/aapt2/readme.md

index 8c476fa..01d4a41 100644 (file)
--- a/tools/aapt2/readme.md
+++ b/tools/aapt2/readme.md
@@ -10,6 +10,7 @@
  - Fixed issue where Java classes referenced from fragments and menus were not added to
    the set of Proguard keep rules. (bug 62216174)
  - Automatically version XML `<adaptive-icon>` resources to v26. (bug 62316340)
+- Fixed issue where escaped unicode characters would generate malformed UTF-8. (bug 62839202)
  
  ## Version 2.17
  ### `aapt2 ...`
diff --git a/tools/aapt2/util/Utf8Iterator.cpp b/tools/aapt2/util/Utf8Iterator.cpp

new file mode 100644 (file)

index 0000000..c09eda7
--- /dev/null
+++ b/tools/aapt2/util/Utf8Iterator.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/Utf8Iterator.h"
+
+#include "android-base/logging.h"
+#include "utils/Unicode.h"
+
+using ::android::StringPiece;
+
+namespace aapt {
+
+Utf8Iterator::Utf8Iterator(const StringPiece& str)
+    : str_(str), next_pos_(0), current_codepoint_(0) {
+  DoNext();
+}
+
+void Utf8Iterator::DoNext() {
+  size_t next_pos = 0u;
+  int32_t result = utf32_from_utf8_at(str_.data(), str_.size(), next_pos_, &next_pos);
+  if (result == -1) {
+    current_codepoint_ = 0u;
+  } else {
+    current_codepoint_ = static_cast<char32_t>(result);
+    next_pos_ = next_pos;
+  }
+}
+
+bool Utf8Iterator::HasNext() const {
+  return current_codepoint_ != 0;
+}
+
+void Utf8Iterator::Skip(int amount) {
+  while (amount > 0 && HasNext()) {
+    Next();
+    --amount;
+  }
+}
+
+char32_t Utf8Iterator::Next() {
+  CHECK(HasNext()) << "Next() called after iterator exhausted";
+  char32_t result = current_codepoint_;
+  DoNext();
+  return result;
+}
+
+}  // namespace aapt
diff --git a/tools/aapt2/util/Utf8Iterator.h b/tools/aapt2/util/Utf8Iterator.h

new file mode 100644 (file)

index 0000000..f2507d8
--- /dev/null
+++ b/tools/aapt2/util/Utf8Iterator.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef AAPT_UTIL_UTF8ITERATOR_H
+#define AAPT_UTIL_UTF8ITERATOR_H
+
+#include "android-base/macros.h"
+#include "androidfw/StringPiece.h"
+
+namespace aapt {
+
+class Utf8Iterator {
+ public:
+  explicit Utf8Iterator(const android::StringPiece& str);
+
+  bool HasNext() const;
+
+  void Skip(int amount);
+
+  char32_t Next();
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Utf8Iterator);
+
+  void DoNext();
+
+  android::StringPiece str_;
+  size_t next_pos_;
+  char32_t current_codepoint_;
+};
+
+}  // namespace aapt
+
+#endif  // AAPT_UTIL_UTF8ITERATOR_H
diff --git a/tools/aapt2/util/Utf8Iterator_test.cpp b/tools/aapt2/util/Utf8Iterator_test.cpp

new file mode 100644 (file)

index 0000000..cfebbb0
--- /dev/null
+++ b/tools/aapt2/util/Utf8Iterator_test.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/Utf8Iterator.h"
+
+#include "test/Test.h"
+
+using ::testing::Eq;
+
+namespace aapt {
+
+TEST(Utf8IteratorTest, IteratesOverAscii) {
+  Utf8Iterator iter("hello");
+
+  ASSERT_TRUE(iter.HasNext());
+  EXPECT_THAT(iter.Next(), Eq(U'h'));
+
+  ASSERT_TRUE(iter.HasNext());
+  EXPECT_THAT(iter.Next(), Eq(U'e'));
+
+  ASSERT_TRUE(iter.HasNext());
+  EXPECT_THAT(iter.Next(), Eq(U'l'));
+
+  ASSERT_TRUE(iter.HasNext());
+  EXPECT_THAT(iter.Next(), Eq(U'l'));
+
+  ASSERT_TRUE(iter.HasNext());
+  EXPECT_THAT(iter.Next(), Eq(U'o'));
+
+  EXPECT_FALSE(iter.HasNext());
+}
+
+TEST(Utf8IteratorTest, IteratesOverUnicode) {
+  Utf8Iterator iter("Hi there 華勵蓮🍩");
+  iter.Skip(9);
+
+  ASSERT_TRUE(iter.HasNext());
+  EXPECT_THAT(iter.Next(), Eq(U'華'));
+
+  ASSERT_TRUE(iter.HasNext());
+  EXPECT_THAT(iter.Next(), Eq(U'勵'));
+
+  ASSERT_TRUE(iter.HasNext());
+  EXPECT_THAT(iter.Next(), Eq(U'蓮'));
+
+  ASSERT_TRUE(iter.HasNext());
+  EXPECT_THAT(iter.Next(), Eq(U'🍩'));
+
+  EXPECT_FALSE(iter.HasNext());
+}
+
+}  // namespace aapt
diff --git a/tools/aapt2/util/Util.cpp b/tools/aapt2/util/Util.cpp

index 28e952e..dfa92d7 100644 (file)
--- a/tools/aapt2/util/Util.cpp
+++ b/tools/aapt2/util/Util.cpp
@@ -16,19 +16,20 @@
  
  #include "util/Util.h"
  
-#include <utils/Unicode.h>
  #include <algorithm>
  #include <ostream>
  #include <string>
  #include <vector>
  
  #include "androidfw/StringPiece.h"
+#include "utils/Unicode.h"
  
  #include "util/BigBuffer.h"
  #include "util/Maybe.h"
+#include "util/Utf8Iterator.h"
  
-using android::StringPiece;
-using android::StringPiece16;
+using ::android::StringPiece;
+using ::android::StringPiece16;
  
  namespace aapt {
  namespace util {
@@ -283,33 +284,46 @@ bool VerifyJavaStringFormat(const StringPiece& str) {
    return true;
  }
  
-static Maybe<std::string> ParseUnicodeCodepoint(const char** start,
-                                                const char* end) {
+static bool AppendCodepointToUtf8String(char32_t codepoint, std::string* output) {
+  ssize_t len = utf32_to_utf8_length(&codepoint, 1);
+  if (len < 0) {
+    return false;
+  }
+
+  const size_t start_append_pos = output->size();
+
+  // Make room for the next character.
+  output->resize(output->size() + len);
+
+  char* dst = &*(output->begin() + start_append_pos);
+  utf32_to_utf8(&codepoint, 1, dst, len + 1);
+  return true;
+}
+
+static bool AppendUnicodeCodepoint(Utf8Iterator* iter, std::string* output) {
    char32_t code = 0;
-  for (size_t i = 0; i < 4 && *start != end; i++, (*start)++) {
-    char c = **start;
+  for (size_t i = 0; i < 4 && iter->HasNext(); i++) {
+    char32_t codepoint = iter->Next();
      char32_t a;
-    if (c >= '0' && c <= '9') {
-      a = c - '0';
-    } else if (c >= 'a' && c <= 'f') {
-      a = c - 'a' + 10;
-    } else if (c >= 'A' && c <= 'F') {
-      a = c - 'A' + 10;
+    if (codepoint >= U'0' && codepoint <= U'9') {
+      a = codepoint - U'0';
+    } else if (codepoint >= U'a' && codepoint <= U'f') {
+      a = codepoint - U'a' + 10;
+    } else if (codepoint >= U'A' && codepoint <= U'F') {
+      a = codepoint - U'A' + 10;
      } else {
        return {};
      }
      code = (code << 4) | a;
    }
+  return AppendCodepointToUtf8String(code, output);
+}
  
-  ssize_t len = utf32_to_utf8_length(&code, 1);
-  if (len < 0) {
-    return {};
+static bool IsCodepointSpace(char32_t codepoint) {
+  if (static_cast<uint32_t>(codepoint) & 0xffffff00u) {
+    return false;
    }
-
-  std::string result_utf8;
-  result_utf8.resize(len);
-  utf32_to_utf8(&code, 1, &*result_utf8.begin(), len + 1);
-  return result_utf8;
+  return isspace(static_cast<char>(codepoint));
  }
  
  StringBuilder& StringBuilder::Append(const StringPiece& str) {
@@ -318,57 +332,46 @@ StringBuilder& StringBuilder::Append(const StringPiece& str) {
    }
  
    // Where the new data will be appended to.
-  size_t new_data_index = str_.size();
+  const size_t new_data_index = str_.size();
+
+  Utf8Iterator iter(str);
+  while (iter.HasNext()) {
+    const char32_t codepoint = iter.Next();
  
-  const char* const end = str.end();
-  const char* start = str.begin();
-  const char* current = start;
-  while (current != end) {
      if (last_char_was_escape_) {
-      switch (*current) {
-        case 't':
+      switch (codepoint) {
+        case U't':
            str_ += '\t';
            break;
-        case 'n':
+
+        case U'n':
            str_ += '\n';
            break;
-        case '#':
-          str_ += '#';
-          break;
-        case '@':
-          str_ += '@';
-          break;
-        case '?':
-          str_ += '?';
-          break;
-        case '"':
-          str_ += '"';
-          break;
-        case '\'':
-          str_ += '\'';
-          break;
-        case '\\':
-          str_ += '\\';
+
+        case U'#':
+        case U'@':
+        case U'?':
+        case U'"':
+        case U'\'':
+        case U'\\':
+          str_ += static_cast<char>(codepoint);
            break;
-        case 'u': {
-          current++;
-          Maybe<std::string> c = ParseUnicodeCodepoint(&current, end);
-          if (!c) {
+
+        case U'u':
+          if (!AppendUnicodeCodepoint(&iter, &str_)) {
              error_ = "invalid unicode escape sequence";
              return *this;
            }
-          str_ += c.value();
-          current -= 1;
            break;
-        }
  
          default:
-          // Ignore.
+          // Ignore the escape character and just include the codepoint.
+          AppendCodepointToUtf8String(codepoint, &str_);
            break;
        }
        last_char_was_escape_ = false;
-      start = current + 1;
-    } else if (*current == '"') {
+
+    } else if (codepoint == U'"') {
        if (!quote_ && trailing_space_) {
          // We found an opening quote, and we have
          // trailing space, so we should append that
@@ -383,13 +386,13 @@ StringBuilder& StringBuilder::Append(const StringPiece& str) {
          }
        }
        quote_ = !quote_;
-      str_.append(start, current - start);
-      start = current + 1;
-    } else if (*current == '\'' && !quote_) {
+
+    } else if (codepoint == U'\'' && !quote_) {
        // This should be escaped.
        error_ = "unescaped apostrophe";
        return *this;
-    } else if (*current == '\\') {
+
+    } else if (codepoint == U'\\') {
        // This is an escape sequence, convert to the real value.
        if (!quote_ && trailing_space_) {
          // We had trailing whitespace, so
@@ -399,40 +402,35 @@ StringBuilder& StringBuilder::Append(const StringPiece& str) {
          }
          trailing_space_ = false;
        }
-      str_.append(start, current - start);
-      start = current + 1;
        last_char_was_escape_ = true;
-    } else if (!quote_) {
-      // This is not quoted text, so look for whitespace.
-      if (isspace(*current)) {
-        // We found whitespace, see if we have seen some
-        // before.
-        if (!trailing_space_) {
-          // We didn't see a previous adjacent space,
-          // so mark that we did.
+    } else {
+      if (quote_) {
+        // Quotes mean everything is taken, including whitespace.
+        AppendCodepointToUtf8String(codepoint, &str_);
+      } else {
+        // This is not quoted text, so we will accumulate whitespace and only emit a single
+        // character of whitespace if it is followed by a non-whitespace character.
+        if (IsCodepointSpace(codepoint)) {
+          // We found whitespace.
            trailing_space_ = true;
-          str_.append(start, current - start);
-        }
-
-        // Keep skipping whitespace.
-        start = current + 1;
-      } else if (trailing_space_) {
-        // We saw trailing space before, so replace all
-        // that trailing space with one space.
-        if (!str_.empty()) {
-          str_ += ' ';
+        } else {
+          if (trailing_space_) {
+            // We saw trailing space before, so replace all
+            // that trailing space with one space.
+            if (!str_.empty()) {
+              str_ += ' ';
+            }
+            trailing_space_ = false;
+          }
+          AppendCodepointToUtf8String(codepoint, &str_);
          }
-        trailing_space_ = false;
        }
      }
-    current++;
    }
-  str_.append(start, end - start);
  
    // Accumulate the added string's UTF-16 length.
-  ssize_t len = utf8_to_utf16_length(
-      reinterpret_cast<const uint8_t*>(str_.data()) + new_data_index,
-      str_.size() - new_data_index);
+  ssize_t len = utf8_to_utf16_length(reinterpret_cast<const uint8_t*>(str_.data()) + new_data_index,
+                                     str_.size() - new_data_index);
    if (len < 0) {
      error_ = "invalid unicode code point";
      return *this;
author	Adam Lesinski <adamlesinski@google.com>
	Wed, 28 Jun 2017 01:39:07 +0000 (18:39 -0700)
committer	Adam Lesinski <adamlesinski@google.com>
	Fri, 30 Jun 2017 01:17:35 +0000 (18:17 -0700)
tools/aapt2/Android.bp		patch \| blob \| history
tools/aapt2/ResourceParser_test.cpp		patch \| blob \| history
tools/aapt2/readme.md		patch \| blob \| history
tools/aapt2/util/Utf8Iterator.cpp	[new file with mode: 0644]	patch \| blob
tools/aapt2/util/Utf8Iterator.h	[new file with mode: 0644]	patch \| blob
tools/aapt2/util/Utf8Iterator_test.cpp	[new file with mode: 0644]	patch \| blob
tools/aapt2/util/Util.cpp		patch \| blob \| history