From d3198ce4bcc3ec64068d6508f2f08be97e6be83e Mon Sep 17 00:00:00 2001 From: Yunzhong Gao Date: Sat, 24 Jan 2015 04:23:08 +0000 Subject: [PATCH] If we see UTF-8 BOM sequence at the beginning of a response file, we shall remove these bytes before parsing. Phabricator Revision: http://reviews.llvm.org/D7156 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226988 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/CommandLine.cpp | 12 ++++++++++++ test/Other/Inputs/utf8-bom-response | 1 + test/Other/Inputs/utf8-response | 1 + test/Other/ResponseFile.ll | 5 +++++ 4 files changed, 19 insertions(+) create mode 100644 test/Other/Inputs/utf8-bom-response create mode 100644 test/Other/Inputs/utf8-response diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index a774421b26c..b4e32257a01 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -655,6 +655,13 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver, NewArgv.push_back(nullptr); } +// It is called byte order marker but the UTF-8 BOM is actually not affected +// by the host system's endianness. +static bool hasUTF8ByteOrderMark(ArrayRef<char> S) { + return (S.size() >= 3 && + S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf'); +} + static bool ExpandResponseFile(const char *FName, StringSaver &Saver, TokenizerCallback Tokenizer, SmallVectorImpl<const char *> &NewArgv, @@ -674,6 +681,11 @@ static bool ExpandResponseFile(const char *FName, StringSaver &Saver, return false; Str = StringRef(UTF8Buf); } + // If we see UTF-8 BOM sequence at the beginning of a file, we shall remove + // these bytes before parsing. + // Reference: http://en.wikipedia.org/wiki/UTF-8#Byte_order_mark + else if (hasUTF8ByteOrderMark(BufRef)) + Str = StringRef(BufRef.data() + 3, BufRef.size() - 3); // Tokenize the contents into NewArgv. 
Tokenizer(Str, Saver, NewArgv, MarkEOLs); diff --git a/test/Other/Inputs/utf8-bom-response b/test/Other/Inputs/utf8-bom-response new file mode 100644 index 00000000000..9dae3158ecd --- /dev/null +++ b/test/Other/Inputs/utf8-bom-response @@ -0,0 +1 @@ +-help diff --git a/test/Other/Inputs/utf8-response b/test/Other/Inputs/utf8-response new file mode 100644 index 00000000000..97f455ac44e --- /dev/null +++ b/test/Other/Inputs/utf8-response @@ -0,0 +1 @@ +-help diff --git a/test/Other/ResponseFile.ll b/test/Other/ResponseFile.ll index 914e5480f20..92648b86f5f 100644 --- a/test/Other/ResponseFile.ll +++ b/test/Other/ResponseFile.ll @@ -6,6 +6,11 @@ ; RUN: llvm-as @%t.list2 -o %t.bc ; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s +; When the response file begins with UTF8 BOM sequence, we shall remove them. +; Neither command below should return a "Could not open input file" error. +; RUN: llvm-as @%S/Inputs/utf8-response > /dev/null +; RUN: llvm-as @%S/Inputs/utf8-bom-response > /dev/null + ; CHECK: T foobar define void @foobar() { -- 2.11.0