From 38ac6bee58b7ef19cbc2b0540315c3c299ca77a9 Mon Sep 17 00:00:00 2001
From: John Porto
Date: Fri, 4 Dec 2015 06:51:38 -0800
Subject: [PATCH] Subzero. ARM32. Initial sandboxing code.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
Review URL: https://codereview.chromium.org/1491473002 .
---
 Makefile.standalone                     |   5 +-
 pydir/crosstest.py                      |   4 +-
 pydir/crosstest_generator.py            |   2 +-
 pydir/run-pnacl-sz.py                   |  14 ++-
 pydir/szbuild.py                        |   8 +-
 pydir/targets.py                        |   7 +-
 src/IceTargetLoweringARM32.cpp          | 106 ++++++++++++++---------
 src/IceTargetLoweringARM32.h            | 122 ++++++++++++++++++++++
 tests_lit/assembler/arm32/sandboxing.ll | 149 ++++++++++++++++++++++++++++
 tests_lit/assembler/x86/sandboxing.ll   |   4 +-
 10 files changed, 367 insertions(+), 54 deletions(-)
 create mode 100644 tests_lit/assembler/arm32/sandboxing.ll

diff --git a/Makefile.standalone b/Makefile.standalone
index 27578be02..1f18f006e 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -404,7 +404,10 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime
 	  -e x8664,native,sse2,test_global \
 	  -i arm32,native,neon \
 	  -e arm32,native,neon,test_vector_ops \
-	  -e arm32,native,neon,test_select
+	  -e arm32,native,neon,test_select \
+	  -i arm32,sandbox,neon \
+	  -e arm32,sandbox,neon,test_vector_ops \
+	  -e arm32,sandbox,neon,test_select
 	PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
 	$(LLVM_SRC_PATH)/utils/lit/lit.py -sv crosstest/Output
 endif
diff --git a/pydir/crosstest.py b/pydir/crosstest.py
index bd1c89e38..67333de7a 100755
--- a/pydir/crosstest.py
+++ b/pydir/crosstest.py
@@ -182,7 +182,7 @@ def main():
             ).format(root=nacl_root, sb='sb' if args.sandbox else 'native'))
     pure_c = os.path.splitext(args.driver)[1] == '.c'

-    # TargetX8664 is ilp32, but clang does not currently support such
+    # TargetX8664 is ilp32, but pnacl-clang does not currently support such a
     # configuration. In order to run the crosstests we play nasty, dangerous
     # tricks with the stack pointer.
     needs_stack_hack = (args.target == 'x8664')
@@ -202,7 +202,7 @@ def main():
                 bin=bindir, prefix='pnacl-' if args.sandbox else '',
                 cc='clang' if pure_c else 'clang++')
     sb_native_args = (['-O0', '--pnacl-allow-native',
-                       '-arch', target_info.target,
+                       '-arch', target_info.compiler_arch,
                        '-Wn,-defsym=__Sz_AbsoluteZero=0']
                       if args.sandbox
                       else ['-g', '-target=' + triple,
diff --git a/pydir/crosstest_generator.py b/pydir/crosstest_generator.py
index 2dbd6e0e3..6c2dd73d6 100755
--- a/pydir/crosstest_generator.py
+++ b/pydir/crosstest_generator.py
@@ -67,7 +67,7 @@ def main():
   arch_flags = { 'x8632': [],
                  'x8664': [],
                  # ARM doesn't have an ELF writer yet.
-                 'arm32': ['--filetype=iasm'] }
+                 'arm32': ['--filetype=asm'] }
   # all_keys is only used in the help text.
   all_keys = '; '.join([' '.join(targets), ' '.join(sandboxing),
                         ' '.join(opt_levels), ' '.join(flat_attrs)])
diff --git a/pydir/run-pnacl-sz.py b/pydir/run-pnacl-sz.py
index cefd47542..2de915ad5 100755
--- a/pydir/run-pnacl-sz.py
+++ b/pydir/run-pnacl-sz.py
@@ -11,14 +11,16 @@ import tempfile
 from utils import shellcmd


-def TargetAssemblerFlags(target):
+def TargetAssemblerFlags(target, sandboxed):
   # TODO(stichnot): -triple=i686-nacl should be used for a
   # sandboxing test. This means there should be an args.sandbox
   # argument that also gets passed through to pnacl-sz.
   # TODO(reed kotler). Need to find out exactly what we need to
   # add here for Mips32.
-  flags = { 'x8632': ['-triple=i686'],
-            'arm32': ['-triple=armv7a', '-mcpu=cortex-a9', '-mattr=+neon'],
+  flags = { 'x8632': ['-triple=%s' % ('i686' if not sandboxed else 'i686-nacl')],
+            'arm32': ['-triple=%s' % (
+                          'armv7a' if not sandboxed else 'armv7a-nacl'),
+                      '-mcpu=cortex-a9', '-mattr=+neon'],
             'mips32': ['-triple=mipsel' ] }
   return flags[target]
@@ -89,6 +91,8 @@ def main():
   argparser.add_argument('--args', '-a', nargs=argparse.REMAINDER,
                          default=[],
                          help='Remaining arguments are passed to pnacl-sz')
+  argparser.add_argument('--sandbox', required=False, action='store_true',
+                         help='Sandboxes the generated code.')

   args = argparser.parse_args()
   pnacl_bin_path = args.pnacl_bin_path
@@ -121,6 +125,8 @@ def main():
     cmd += [os.path.join(pnacl_bin_path, 'not')]
   cmd += [args.pnacl_sz]
   cmd += ['--target', args.target]
+  if args.sandbox:
+    cmd += ['-sandbox']
   if args.insts:
     # If the tests are based on '-verbose inst' output, force
     # single-threaded translation because dump output does not get
@@ -147,7 +153,7 @@ def main():
       asm_temp.close()
   if args.assemble and args.filetype != 'obj':
     cmd += (['|', os.path.join(pnacl_bin_path, 'llvm-mc')] +
-            TargetAssemblerFlags(args.target) +
+            TargetAssemblerFlags(args.target, args.sandbox) +
             ['-filetype=obj', '-o', asm_temp.name])
   elif asm_temp:
     cmd += ['-o', asm_temp.name]
diff --git a/pydir/szbuild.py b/pydir/szbuild.py
index 569c8caa6..10ce8c1e5 100755
--- a/pydir/szbuild.py
+++ b/pydir/szbuild.py
@@ -318,10 +318,14 @@ def ProcessPexe(args, pexe, exe):

     # Run the linker regardless of hybrid mode.
     if args.sandbox:
-        assert args.target in ['x8632'], \
+        assert args.target in ('x8632', 'arm32'), \
             '-sandbox is not available for %s' % args.target
+        target_lib_dir = {
+          'arm32': 'arm',
+          'x8632': 'x86-32',
+        }[args.target]
         linklib = ('{root}/toolchain/linux_x86/pnacl_newlib_raw/translator/' +
-                   'x86-32/lib').format(root=nacl_root)
+                   '{target_dir}/lib').format(root=nacl_root, target_dir=target_lib_dir)
         shellcmd((
             '{gold} -nostdlib --no-fix-cortex-a8 --eh-frame-hdr -z text ' +
             '--build-id --entry=__pnacl_start -static ' +
diff --git a/pydir/targets.py b/pydir/targets.py
index 3635e1314..7e2222ddc 100644
--- a/pydir/targets.py
+++ b/pydir/targets.py
@@ -17,22 +17,25 @@ def FindARMCrossInclude():


 TargetInfo = namedtuple('TargetInfo',
-                        ['target', 'triple', 'llc_flags', 'ld_emu',
-                         'cross_headers'])
+                        ['target', 'compiler_arch', 'triple', 'llc_flags',
+                         'ld_emu', 'cross_headers'])

 X8632Target = TargetInfo(target='x8632',
+                         compiler_arch='x8632',
                          triple='i686-none-linux',
                          llc_flags=['-mcpu=pentium4m'],
                          ld_emu='elf_i386_nacl',
                          cross_headers=[])

 X8664Target = TargetInfo(target='x8664',
+                         compiler_arch='x8664',
                          triple='x86_64-none-linux',
                          llc_flags=['-mcpu=x86-64'],
                          ld_emu='elf_x86_64_nacl',
                          cross_headers=[])

 ARM32Target = TargetInfo(target='arm32',
+                         compiler_arch='armv7',
                          triple='armv7a-none-linux-gnueabihf',
                          llc_flags=['-mcpu=cortex-a9',
                                     '-float-abi=hard',
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index d260db0e6..5469db1d5 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -161,7 +161,8 @@ TargetARM32Features::TargetARM32Features(const ClFlags &Flags) {
 }

 TargetARM32::TargetARM32(Cfg *Func)
-    : TargetLowering(Func), CPUFeatures(Func->getContext()->getFlags()) {}
+    : TargetLowering(Func), NeedSandboxing(Ctx->getFlags().getUseSandboxing()),
+      CPUFeatures(Func->getContext()->getFlags()) {}

 void TargetARM32::staticInit() {
   // Limit this size (or do all bitsets need to be the same width)???
@@ -544,8 +545,7 @@ void TargetARM32::genTargetHelperCallFor(Inst *Instr) {
     return;
   }
   case Intrinsics::NaClReadTP: {
-    if (Ctx->getFlags().getUseSandboxing()) {
-      UnimplementedError(Func->getContext()->getFlags());
+    if (NeedSandboxing) {
       return;
     }
     static constexpr SizeT MaxArgs = 0;
@@ -1120,6 +1120,10 @@ void TargetARM32::addProlog(CfgNode *Node) {
       continue;
     }
     if (CalleeSaves[i] && RegsUsed[i]) {
+      if (NeedSandboxing && i == RegARM32::Reg_r9) {
+        // r9 is never updated in sandboxed code.
+        continue;
+      }
       ++NumCallee;
       Variable *PhysicalRegister = getPhysicalRegister(i);
       PreservedRegsSizeBytes +=
@@ -1173,10 +1177,9 @@ void TargetARM32::addProlog(CfgNode *Node) {
     // Use the scratch register if needed to legalize the immediate.
     Operand *SubAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
                                   Legal_Reg | Legal_Flex, getReservedTmpReg());
-    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
-    _sub(SP, SP, SubAmount);
+    AutoSandboxer(this).sub_sp(SubAmount);
     if (FixedAllocaAlignBytes > ARM32_STACK_ALIGNMENT_BYTES) {
-      alignRegisterPow2(SP, FixedAllocaAlignBytes);
+      AutoSandboxer(this).align_sp(FixedAllocaAlignBytes);
     }
   }
@@ -1270,7 +1273,7 @@ void TargetARM32::addEpilog(CfgNode *Node) {
     // use of SP before the assignment of SP=FP keeps previous SP adjustments
     // from being dead-code eliminated.
     Context.insert(InstFakeUse::create(Func, SP));
-    _mov(SP, FP);
+    AutoSandboxer(this).reset_sp(FP);
   } else {
     // add SP, SpillAreaSizeBytes
     if (SpillAreaSizeBytes) {
@@ -1278,7 +1281,7 @@ void TargetARM32::addEpilog(CfgNode *Node) {
       Operand *AddAmount =
           legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
                    Legal_Reg | Legal_Flex, getReservedTmpReg());
-      _add(SP, SP, AddAmount);
+      AutoSandboxer(this).add_sp(AddAmount);
     }
   }
@@ -1302,6 +1305,9 @@ void TargetARM32::addEpilog(CfgNode *Node) {
     }

     if (CalleeSaves[i] && RegsUsed[i]) {
+      if (NeedSandboxing && i == RegARM32::Reg_r9) {
+        continue;
+      }
       GPRsToRestore.push_back(getPhysicalRegister(i));
     }
   }
@@ -1318,16 +1324,13 @@ void TargetARM32::addEpilog(CfgNode *Node) {
   //   bundle_unlock
   // This isn't just aligning to the getBundleAlignLog2Bytes(). It needs to
   // restrict to the lower 1GB as well.
-  Operand *RetMask =
-      legalize(Ctx->getConstantInt32(0xc000000f), Legal_Reg | Legal_Flex);
-  Variable *LR = makeReg(IceType_i32, RegARM32::Reg_lr);
+  Variable *LR = getPhysicalRegister(RegARM32::Reg_lr);
   Variable *RetValue = nullptr;
   if (RI->getSrcSize())
     RetValue = llvm::cast<Variable>(RI->getSrc(0));
-  _bundle_lock();
-  _bic(LR, LR, RetMask);
-  _ret(LR, RetValue);
-  _bundle_unlock();
+
+  AutoSandboxer(this).ret(LR, RetValue);
+
   RI->setDeleted();
@@ -1378,7 +1381,7 @@ Variable *TargetARM32::PostLoweringLegalizer::newBaseRegister(

 OperandARM32Mem *TargetARM32::PostLoweringLegalizer::createMemOperand(
     Type Ty, Variable *Base, int32_t Offset, bool AllowOffsets) {
   assert(!Base->isRematerializable());
-  if (AllowOffsets && Target->isLegalMemOffset(Ty, Offset)) {
+  if (Offset == 0 || (AllowOffsets && Target->isLegalMemOffset(Ty, Offset))) {
     return OperandARM32Mem::create(
         Target->Func, Ty, Base,
         llvm::cast<ConstantInteger32>(Target->Ctx->getConstantInt32(Offset)),
@@ -1451,8 +1454,9 @@ void TargetARM32::PostLoweringLegalizer::legalizeMov(InstARM32Mov *MovInstr) {
       assert(!SrcR->isRematerializable());
       const int32_t Offset = Dest->getStackOffset();
       // This is a _mov(Mem(), Variable), i.e., a store.
-      Target->_str(SrcR, createMemOperand(DestTy, StackOrFrameReg, Offset),
-                   MovInstr->getPredicate());
+      TargetARM32::AutoSandboxer(Target)
+          .str(SrcR, createMemOperand(DestTy, StackOrFrameReg, Offset),
+               MovInstr->getPredicate());
       // _str() does not have a Dest, so we add a fake-def(Dest).
       Target->Context.insert(InstFakeDef::create(Target->Func, Dest));
       Legalized = true;
@@ -1476,8 +1480,9 @@ void TargetARM32::PostLoweringLegalizer::legalizeMov(InstARM32Mov *MovInstr) {
       if (!Var->hasReg()) {
         // This is a _mov(Variable, Mem()), i.e., a load.
         const int32_t Offset = Var->getStackOffset();
-        Target->_ldr(Dest, createMemOperand(DestTy, StackOrFrameReg, Offset),
-                     MovInstr->getPredicate());
+        TargetARM32::AutoSandboxer(Target)
+            .ldr(Dest, createMemOperand(DestTy, StackOrFrameReg, Offset),
+                 MovInstr->getPredicate());
         Legalized = true;
       }
     }
@@ -1542,7 +1547,15 @@ TargetARM32::PostLoweringLegalizer::legalizeMemOperand(OperandARM32Mem *Mem,
     Legalized = true;
   }

-  if (!Legalized) {
+  if (!Legalized && !Target->NeedSandboxing) {
+    return nullptr;
+  }
+
+  if (Target->NeedSandboxing && Base->getRegNum() == RegARM32::Reg_r9) {
+    if (Legalized) {
+      llvm::report_fatal_error("r9-based mem operand should not need to be "
+                               "legalized.");
+    }
     return nullptr;
   }

@@ -1550,6 +1563,7 @@ TargetARM32::PostLoweringLegalizer::legalizeMemOperand(OperandARM32Mem *Mem,
     return createMemOperand(Mem->getType(), Base, Offset, AllowOffsets);
   }

+  assert(!Target->NeedSandboxing);
   assert(MemTraits[Mem->getType()].CanHaveIndex);

   if (Offset != 0) {
@@ -1621,7 +1635,8 @@ void TargetARM32::postLowerLegalization() {
       } else if (auto *LdrInstr = llvm::dyn_cast<InstARM32Ldr>(CurInstr)) {
         if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
                 llvm::cast<OperandARM32Mem>(LdrInstr->getSrc(0)))) {
-          _ldr(CurInstr->getDest(), LegalMem, LdrInstr->getPredicate());
+          AutoSandboxer(this)
+              .ldr(CurInstr->getDest(), LegalMem, LdrInstr->getPredicate());
           CurInstr->setDeleted();
         }
       } else if (auto *LdrexInstr = llvm::dyn_cast<InstARM32Ldrex>(CurInstr)) {
@@ -1629,14 +1644,16 @@ void TargetARM32::postLowerLegalization() {
         if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
                 llvm::cast<OperandARM32Mem>(LdrexInstr->getSrc(0)),
                 DisallowOffsetsBecauseLdrex)) {
-          _ldrex(CurInstr->getDest(), LegalMem, LdrexInstr->getPredicate());
+          AutoSandboxer(this)
+              .ldrex(CurInstr->getDest(), LegalMem, LdrexInstr->getPredicate());
           CurInstr->setDeleted();
         }
       } else if (auto *StrInstr = llvm::dyn_cast<InstARM32Str>(CurInstr)) {
+        AutoSandboxer Bundle(this);
         if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
                 llvm::cast<OperandARM32Mem>(StrInstr->getSrc(1)))) {
-          _str(llvm::cast<Variable>(CurInstr->getSrc(0)), LegalMem,
-               StrInstr->getPredicate());
+          AutoSandboxer(this).str(llvm::cast<Variable>(CurInstr->getSrc(0)),
+                                  LegalMem, StrInstr->getPredicate());
           CurInstr->setDeleted();
         }
       } else if (auto *StrexInstr = llvm::dyn_cast<InstARM32Strex>(CurInstr)) {
@@ -1644,8 +1661,9 @@ void TargetARM32::postLowerLegalization() {
         if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
                 llvm::cast<OperandARM32Mem>(StrexInstr->getSrc(1)),
                 DisallowOffsetsBecauseStrex)) {
-          _strex(CurInstr->getDest(), llvm::cast<Variable>(CurInstr->getSrc(0)),
-                 LegalMem, StrexInstr->getPredicate());
+          AutoSandboxer(this).strex(CurInstr->getDest(),
+                                    llvm::cast<Variable>(CurInstr->getSrc(0)),
+                                    LegalMem, StrexInstr->getPredicate());
           CurInstr->setDeleted();
         }
       }
@@ -1803,7 +1821,7 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
   Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   if (OverAligned) {
-    alignRegisterPow2(SP, Alignment);
+    AutoSandboxer(this).align_sp(Alignment);
   }

   Variable *Dest = Inst->getDest();
@@ -1828,7 +1846,7 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
     // in Dest.
     Operand *SubAmountRF =
         legalize(Ctx->getConstantInt32(Value), Legal_Reg | Legal_Flex);
-    _sub(SP, SP, SubAmountRF);
+    AutoSandboxer(this).sub_sp(SubAmountRF);
   } else {
     // Non-constant sizes need to be adjusted to the next highest multiple of
     // the required alignment at runtime.
@@ -1838,7 +1856,7 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
     Operand *AddAmount = legalize(Ctx->getConstantInt32(Alignment - 1));
     _add(T, T, AddAmount);
     alignRegisterPow2(T, Alignment);
-    _sub(SP, SP, T);
+    AutoSandboxer(this).sub_sp(T);
   }

   // Adds back a few bytes to SP to account for the out args area.
@@ -3249,8 +3267,6 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
       break;
     }
   }
-  // TODO(jvoung): Handle sandboxing. const bool NeedSandboxing =
-  //     Ctx->getFlags().getUseSandboxing();

   // Allow ConstantRelocatable to be left alone as a direct call, but force
   // other constants like ConstantInteger32 to be in a register and make it an
@@ -3271,8 +3287,10 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
     // the call.
     Context.insert(InstFakeUse::create(Func, Reg));
   }
-  Inst *NewCall = InstARM32Call::create(Func, ReturnReg, CallTarget);
-  Context.insert(NewCall);
+
+  InstARM32Call *NewCall = AutoSandboxer(this, InstBundleLock::Opt_AlignToEnd)
+                               .bl(ReturnReg, CallTarget);
+
   if (ReturnRegHi)
     Context.insert(InstFakeDef::create(Func, ReturnRegHi));
@@ -4612,7 +4630,14 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
     llvm::report_fatal_error("memmove should have been prelowered.");
   }
   case Intrinsics::NaClReadTP: {
-    llvm::report_fatal_error("nacl-read-tp should have been prelowered.");
+    if (!NeedSandboxing) {
+      llvm::report_fatal_error("nacl-read-tp should have been prelowered.");
+    }
+    Variable *TP = legalizeToReg(OperandARM32Mem::create(
+        Func, getPointerType(), getPhysicalRegister(RegARM32::Reg_r9),
+        llvm::cast<ConstantInteger32>(Ctx->getConstantZero(IceType_i32))));
+    _mov(Dest, TP);
+    return;
   }
   case Intrinsics::Setjmp: {
     llvm::report_fatal_error("setjmp should have been prelowered.");
@@ -4630,9 +4655,8 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
     return;
   }
   case Intrinsics::Stackrestore: {
-    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
-    Operand *Val = legalize(Instr->getArg(0), Legal_Reg | Legal_Flex);
-    _mov_redefined(SP, Val);
+    Variable *Val = legalizeToReg(Instr->getArg(0));
+    AutoSandboxer(this).reset_sp(Val);
     return;
   }
   case Intrinsics::Trap:
@@ -4987,8 +5011,9 @@ OperandARM32Mem *TargetARM32::formAddressingMode(Type Ty, Cfg *Func,
   (void)MemTraitsSize;
   assert(Ty < MemTraitsSize);
   auto *TypeTraits = &MemTraits[Ty];
-  const bool CanHaveIndex = TypeTraits->CanHaveIndex;
-  const bool CanHaveShiftedIndex = TypeTraits->CanHaveShiftedIndex;
+  const bool CanHaveIndex = !NeedSandboxing && TypeTraits->CanHaveIndex;
+  const bool CanHaveShiftedIndex =
+      !NeedSandboxing && TypeTraits->CanHaveShiftedIndex;
   const bool CanHaveImm = TypeTraits->CanHaveImm;
   const int32_t ValidImmMask = TypeTraits->ValidImmMask;
   (void)ValidImmMask;
@@ -5160,6 +5185,7 @@ void TargetARM32::lowerRet(const InstRet *Inst) {
   // frame removal instructions. addEpilog is responsible for restoring the
   // "lr" register as needed prior to this ret instruction.
   _ret(getPhysicalRegister(RegARM32::Reg_lr), Reg);
+
   // Add a fake use of sp to make sure sp stays alive for the entire function.
   // Otherwise post-call sp adjustments get dead-code eliminated.
   // TODO: Are there more places where the fake use should be inserted? E.g.
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index ddd10f108..a10b575b5 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -162,6 +162,18 @@ public:
         llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(ShAmtImm & 0x1F)));
   }

+  OperandARM32FlexImm *indirectBranchBicMask() const {
+    constexpr uint32_t Imm8 = 0xFC; // 0xC000000F
+    constexpr uint32_t RotateAmt = 2;
+    return OperandARM32FlexImm::create(Func, IceType_i32, Imm8, RotateAmt);
+  }
+
+  OperandARM32FlexImm *memOpBicMask() const {
+    constexpr uint32_t Imm8 = 0x0C; // 0xC0000000
+    constexpr uint32_t RotateAmt = 2;
+    return OperandARM32FlexImm::create(Func, IceType_i32, Imm8, RotateAmt);
+  }
+
   GlobalContext *getCtx() const { return Ctx; }

 protected:
@@ -822,6 +834,115 @@ protected:

   void postLowerLegalization();

+  class AutoSandboxer {
+  public:
+    explicit AutoSandboxer(
+        TargetARM32 *Target,
+        InstBundleLock::Option BundleOption = InstBundleLock::Opt_None)
+        : Target(Target) {
+      if (Target->NeedSandboxing) {
+        Target->_bundle_lock(BundleOption);
+      }
+    }
+
+    void add_sp(Operand *AddAmount) {
+      Variable *SP = Target->getPhysicalRegister(RegARM32::Reg_sp);
+      Target->_add(SP, SP, AddAmount);
+      if (Target->NeedSandboxing) {
+        Target->_bic(SP, SP, Target->memOpBicMask());
+      }
+    }
+
+    void align_sp(size_t Alignment) {
+      Variable *SP = Target->getPhysicalRegister(RegARM32::Reg_sp);
+      Target->alignRegisterPow2(SP, Alignment);
+      if (Target->NeedSandboxing) {
+        Target->_bic(SP, SP, Target->memOpBicMask());
+      }
+    }
+
+    InstARM32Call *bl(Variable *ReturnReg, Operand *CallTarget) {
+      if (Target->NeedSandboxing) {
+        if (auto *CallTargetR = llvm::dyn_cast<Variable>(CallTarget)) {
+          Target->_bic(CallTargetR, CallTargetR,
+                       Target->indirectBranchBicMask());
+        }
+      }
+      auto *Call = InstARM32Call::create(Target->Func, ReturnReg, CallTarget);
+      Target->Context.insert(Call);
+      return Call;
+    }
+
+    void ldr(Variable *Dest, OperandARM32Mem *Mem, CondARM32::Cond Pred) {
+      if (Target->NeedSandboxing) {
+        assert(!Mem->isRegReg());
+        Variable *MemBase = Mem->getBase();
+        Target->_bic(MemBase, MemBase, Target->memOpBicMask(), Pred);
+      }
+      Target->_ldr(Dest, Mem, Pred);
+    }
+
+    void ldrex(Variable *Dest, OperandARM32Mem *Mem, CondARM32::Cond Pred) {
+      if (Target->NeedSandboxing) {
+        assert(!Mem->isRegReg());
+        Variable *MemBase = Mem->getBase();
+        Target->_bic(MemBase, MemBase, Target->memOpBicMask(), Pred);
+      }
+      Target->_ldrex(Dest, Mem, Pred);
+    }
+
+    void reset_sp(Variable *Src) {
+      Variable *SP = Target->getPhysicalRegister(RegARM32::Reg_sp);
+      Target->_mov_redefined(SP, Src);
+      if (Target->NeedSandboxing) {
+        Target->_bic(SP, SP, Target->memOpBicMask());
+      }
+    }
+
+    void ret(Variable *RetAddr, Variable *RetValue) {
+      if (Target->NeedSandboxing) {
+        Target->_bic(RetAddr, RetAddr, Target->indirectBranchBicMask());
+      }
+      Target->_ret(RetAddr, RetValue);
+    }
+
+    void str(Variable *Src, OperandARM32Mem *Mem, CondARM32::Cond Pred) {
+      if (Target->NeedSandboxing) {
+        assert(!Mem->isRegReg());
+        Variable *MemBase = Mem->getBase();
+        Target->_bic(MemBase, MemBase, Target->memOpBicMask(), Pred);
+      }
+      Target->_str(Src, Mem, Pred);
+    }
+
+    void strex(Variable *Dest, Variable *Src, OperandARM32Mem *Mem,
+               CondARM32::Cond Pred) {
+      if (Target->NeedSandboxing) {
+        assert(!Mem->isRegReg());
+        Variable *MemBase = Mem->getBase();
+        Target->_bic(MemBase, MemBase, Target->memOpBicMask(), Pred);
+      }
+      Target->_strex(Dest, Src, Mem, Pred);
+    }
+
+    void sub_sp(Operand *SubAmount) {
+      Variable *SP = Target->getPhysicalRegister(RegARM32::Reg_sp);
+      Target->_sub(SP, SP, SubAmount);
+      if (Target->NeedSandboxing) {
+        Target->_bic(SP, SP, Target->memOpBicMask());
+      }
+    }
+
+    ~AutoSandboxer() {
+      if (Target->NeedSandboxing) {
+        Target->_bundle_unlock();
+      }
+    }
+
+  private:
+    TargetARM32 *Target;
+  };
+
   class PostLoweringLegalizer {
     PostLoweringLegalizer() = delete;
     PostLoweringLegalizer(const PostLoweringLegalizer &) = delete;
@@ -878,6 +999,7 @@ protected:
     int32_t TempBaseOffset = 0;
   };

+  const bool NeedSandboxing;
   TargetARM32Features CPUFeatures;
   bool UsesFramePointer = false;
   bool NeedsStackAlignment = false;
diff --git a/tests_lit/assembler/arm32/sandboxing.ll b/tests_lit/assembler/arm32/sandboxing.ll
new file mode 100644
index 000000000..bf9935129
--- /dev/null
+++ b/tests_lit/assembler/arm32/sandboxing.ll
@@ -0,0 +1,149 @@
+; Tests basics and corner cases of arm32 sandboxing, using -Om1 in
+; the hope that the output will remain stable. When packing bundles,
+; we try to limit to a few instructions with well known sizes and
+; minimal use of registers and stack slots in the lowering sequence.
+
+; RUN: %p2i -i %s --sandbox --filetype=asm --target=arm32 --assemble \
+; RUN:   --disassemble --args -Om1 -allow-externally-defined-symbols \
+; RUN:   -ffunction-sections | FileCheck %s
+
+declare void @call_target()
+@global_short = internal global [2 x i8] zeroinitializer
+
+; A direct call sequence uses the right mask and register-call sequence.
+define internal void @test_direct_call() {
+entry:
+  call void @call_target()
+  ret void
+}
+; CHECK-LABEL: test_direct_call
+; CHECK: nop
+; CHECK: c: {{.*}} bl {{.*}} call_target
+; CHECK-NEXT: 10:
+
+; An indirect call sequence uses the right mask and register-call sequence.
+define internal void @test_indirect_call(i32 %target) {
+entry:
+  %__1 = inttoptr i32 %target to void ()*
+  call void %__1()
+  ret void
+}
+; CHECK-LABEL: test_indirect_call
+; CHECK: ldr [[REG:.*]], [sp,
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK: 18: {{.*}} bic [[REG]], [[REG]], {{.*}} 0xc000000f
+; CHECK-NEXT: blx [[REG]]
+; CHECK-NEXT: 20:
+
+; A return sequence uses the right mask and bx sequence.
+define internal void @test_ret() {
+entry:
+  ret void
+}
+; CHECK-LABEL: test_ret
+; CHECK: 0: {{.*}} bic lr, lr, {{.*}} 0xc000000f
+; CHECK-NEXT: bx lr
+
+; Bundle lock without padding.
+define internal void @bundle_lock_without_padding() {
+entry:
+  %addr_short = bitcast [2 x i8]* @global_short to i16*
+  store i16 0, i16* %addr_short, align 1
+  ret void
+}
+; CHECK-LABEL: bundle_lock_without_padding
+; CHECK: 0: {{.*}} movw
+; CHECK-NEXT: movt
+; CHECK-NEXT: movw
+; CHECK-NEXT: strh
+; CHECK-NEXT: bic lr, lr, {{.*}} 0xc000000f
+; CHECK-NEXT: {{.*}} bx lr
+
+; Bundle lock with padding.
+define internal void @bundle_lock_with_padding() {
+entry:
+  call void @call_target()
+  ; bundle boundary
+  store i16 0, i16* undef, align 1 ; 3 insts
+  store i16 0, i16* undef, align 1 ; 3 insts
+  store i16 0, i16* undef, align 1 ; 3 insts
+                                   ; SP adjustment + pop
+                                   ; nop
+  ; bundle boundary
+  ret void
+}
+; CHECK-LABEL: bundle_lock_with_padding
+; CHECK: 38: {{.*}} pop
+; CHECK-NEXT: nop
+; CHECK-NEXT: bic lr, lr, {{.*}} 0xc000000f
+; CHECK-NEXT: {{.*}} bx lr
+
+; Bundle lock align_to_end without any padding.
+define internal void @bundle_lock_align_to_end_padding_0() { +entry: + call void @call_target() + ; bundle boundary + store i16 0, i16* undef, align 1 + call void @call_target() + ; bundle boundary + ret void +} +; CHECK-LABEL: bundle_lock_align_to_end_padding_0 +; CHECK: c: {{.*}} bl {{.*}} call_target +; CHECK-NEXT: movw +; CHECK-NEXT: movw +; CHECK-NEXT: strh +; CHECK-NEXT: bl {{.*}} call_target +; CHECK-NEXT: add +; CHECK-NEXT: pop +; CHECK-NEXT: bic lr, lr, {{.*}} 0xc000000f +; CHECK-NEXT: {{.*}} bx lr + +; Bundle lock align_to_end with one bunch of padding. +define internal void @bundle_lock_align_to_end_padding_1() { +entry: + call void @call_target() + ; bundle boundary + store i16 0, i16* undef, align 1 + store i16 0, i16* undef, align 1 + ; bundle boundary + call void @call_target() + ; bundle boundary + ret void +} +; CHECK-LABEL: bundle_lock_align_to_end_padding_1 +; CHECK: c: {{.*}} bl {{.*}} call_target +; CHECK-NEXT: movw +; CHECK-NEXT: movw +; CHECK-NEXT: strh +; CHECK-NEXT: movw +; CHECK-NEXT: movw +; CHECK-NEXT: strh +; CHECK-NEXT: nop +; CHECK-NEXT: bl {{.*}} call_target +; CHECK-NEXT: add +; CHECK-NEXT: pop +; CHECK-NEXT: bic lr, lr, {{.*}} 0xc000000f +; CHECK-NEXT: {{.*}} bx lr + +; Bundle lock align_to_end with two bunches of padding. +define internal void @bundle_lock_align_to_end_padding_2(i32 %target) { +entry: + call void @call_target() + ; bundle boundary + %__1 = inttoptr i32 %target to void ()* + store i8 0, i8* undef, align 1 + call void %__1() + ret void +} +; CHECK-LABEL: bundle_lock_align_to_end_padding_2 +; CHECK: c: {{.*}} bl {{.*}} call_target +; CHECK-NEXT: movw +; CHECK-NEXT: movw +; CHECK-NEXT: strb +; CHECK: 20: {{.*}} nop +; CHECK-NEXT: nop +; CHECK-NEXT: bic [[REG:r[0-9]+]], [[REG]], {{.*}} 0xc000000f +; CHECK-NEXT: {{.*}} blx [[REG]] + diff --git a/tests_lit/assembler/x86/sandboxing.ll b/tests_lit/assembler/x86/sandboxing.ll index c03d335d0..3233d574a 100644 --- a/tests_lit/assembler/x86/sandboxing.ll +++ b/tests_lit/assembler/x86/sandboxing.ll @@ -3,9 +3,9 @@ ; we try to limit to a few instructions with well known sizes and ; minimal use of registers and stack slots in the lowering sequence. -; RUN: %p2i -i %s --filetype=obj --disassemble --args -Om1 \ +; RUN: %p2i -i %s --sandbox --filetype=obj --disassemble --args -Om1 \ ; RUN: -allow-externally-defined-symbols \ -; RUN: -ffunction-sections -sandbox | FileCheck %s +; RUN: -ffunction-sections | FileCheck %s declare void @call_target() @global_byte = internal global [1 x i8] zeroinitializer -- 2.11.0
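Editor's note (not part of the patch): the sandboxing scheme above hinges on the two
bic masks defined in indirectBranchBicMask() and memOpBicMask(). The C++ sketch below
is illustrative only; it uses nothing beyond the constants visible in the diff, and the
flexImm helper name is hypothetical. It shows how the ARM flexible immediate (an 8-bit
value rotated right by twice the 4-bit rotate field, the pair passed to
OperandARM32FlexImm::create) reconstructs each 32-bit mask, and what bic does to a
branch target versus a load/store base register.

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // ARM flexible immediate: Imm8 rotated right by 2 * RotateAmt bits.
  static uint32_t flexImm(uint32_t Imm8, uint32_t RotateAmt) {
    const uint32_t R = (2 * RotateAmt) % 32;
    return R == 0 ? Imm8 : ((Imm8 >> R) | (Imm8 << (32 - R)));
  }

  int main() {
    // indirectBranchBicMask(): Imm8 = 0xFC, RotateAmt = 2 -> 0xC000000F.
    // memOpBicMask():          Imm8 = 0x0C, RotateAmt = 2 -> 0xC0000000.
    const uint32_t BranchMask = flexImm(0xFC, 2);
    const uint32_t MemOpMask = flexImm(0x0C, 2);
    assert(BranchMask == 0xC000000Fu);
    assert(MemOpMask == 0xC0000000u);

    // bic Rd, Rn, #Mask computes Rn & ~Mask. For branch targets (blx, bx lr)
    // this clears the top two bits, confining the target to the low 1GB, and
    // the low four bits, forcing 16-byte bundle alignment.
    const uint32_t Addr = 0xDEADBEEFu;
    printf("branch target: 0x%08X -> 0x%08X\n", Addr, Addr & ~BranchMask);

    // For load/store bases (and sp after adjustment) only the top two bits
    // are cleared; data accesses need not be bundle aligned.
    printf("memory base:   0x%08X -> 0x%08X\n", Addr, Addr & ~MemOpMask);
    return 0;
  }

Two design points visible in AutoSandboxer follow from this: each masked operation is
emitted inside a bundle_lock/bundle_unlock pair so a bundle boundary can never separate
the bic from the instruction it guards, and r9 (which holds the thread pointer and is
never written in sandboxed code) is exempt from masking, which is why legalizeMemOperand
leaves r9-based operands alone and NaClReadTP lowers to a plain load from [r9].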