diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 6b209ed2d715..a1e1f9b07aaf 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -144,50 +144,6 @@ class TargetTransformInfoImplBase {
     return TTI::TCC_Expensive;
   }
 
-  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Type *> ParamTys, const User *U) {
-    switch (IID) {
-    default:
-      // Intrinsics rarely (if ever) have normal argument setup constraints.
-      // Model them as having a basic instruction cost.
-      return TTI::TCC_Basic;
-
-    // TODO: other libc intrinsics.
-    case Intrinsic::memcpy:
-      return getMemcpyCost(dyn_cast<Instruction>(U));
-
-    case Intrinsic::annotation:
-    case Intrinsic::assume:
-    case Intrinsic::sideeffect:
-    case Intrinsic::dbg_declare:
-    case Intrinsic::dbg_value:
-    case Intrinsic::dbg_label:
-    case Intrinsic::invariant_start:
-    case Intrinsic::invariant_end:
-    case Intrinsic::launder_invariant_group:
-    case Intrinsic::strip_invariant_group:
-    case Intrinsic::is_constant:
-    case Intrinsic::lifetime_start:
-    case Intrinsic::lifetime_end:
-    case Intrinsic::objectsize:
-    case Intrinsic::ptr_annotation:
-    case Intrinsic::var_annotation:
-    case Intrinsic::experimental_gc_result:
-    case Intrinsic::experimental_gc_relocate:
-    case Intrinsic::coro_alloc:
-    case Intrinsic::coro_begin:
-    case Intrinsic::coro_free:
-    case Intrinsic::coro_end:
-    case Intrinsic::coro_frame:
-    case Intrinsic::coro_size:
-    case Intrinsic::coro_suspend:
-    case Intrinsic::coro_param:
-    case Intrinsic::coro_subfn_addr:
-      // These intrinsics don't actually represent code after lowering.
-      return TTI::TCC_Free;
-    }
-  }
-
   bool hasBranchDivergence() { return false; }
 
   bool isSourceOfDivergence(const Value *V) { return false; }
@@ -786,7 +742,53 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
     return TTI::TCC_Basic;
   }
 
-  using BaseT::getIntrinsicCost;
+  /// Estimate the cost of intrinsic \p IID: TCC_Free for intrinsics that do
+  /// not lower to code, the target's memcpy cost for memcpy, else TCC_Basic.
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Type *> ParamTys, const User *U) {
+    switch (IID) {
+    default:
+      // Intrinsics rarely (if ever) have normal argument setup constraints.
+      // Model them as having a basic instruction cost.
+      return TTI::TCC_Basic;
+
+    // TODO: other libc intrinsics.
+    case Intrinsic::memcpy:
+      // Tolerate a null U; plain dyn_cast<> must not be given a null pointer.
+      return static_cast<T *>(this)->getMemcpyCost(
+          dyn_cast_or_null<Instruction>(U));
+
+    case Intrinsic::annotation:
+    case Intrinsic::assume:
+    case Intrinsic::sideeffect:
+    case Intrinsic::dbg_declare:
+    case Intrinsic::dbg_value:
+    case Intrinsic::dbg_label:
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+    case Intrinsic::launder_invariant_group:
+    case Intrinsic::strip_invariant_group:
+    case Intrinsic::is_constant:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::objectsize:
+    case Intrinsic::ptr_annotation:
+    case Intrinsic::var_annotation:
+    case Intrinsic::experimental_gc_result:
+    case Intrinsic::experimental_gc_relocate:
+    case Intrinsic::coro_alloc:
+    case Intrinsic::coro_begin:
+    case Intrinsic::coro_free:
+    case Intrinsic::coro_end:
+    case Intrinsic::coro_frame:
+    case Intrinsic::coro_size:
+    case Intrinsic::coro_suspend:
+    case Intrinsic::coro_param:
+    case Intrinsic::coro_subfn_addr:
+      // These intrinsics don't actually represent code after lowering.
+      return TTI::TCC_Free;
+    }
+  }
 
   unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<const Value *> Arguments, const User *U) {
diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp
index ea68faa7fc7f..5630a01824dc 100644
--- a/lib/Analysis/MemorySSA.cpp
+++ b/lib/Analysis/MemorySSA.cpp
@@ -665,7 +665,7 @@ template <class AliasAnalysisType> class ClobberWalker {
   struct generic_def_path_iterator
       : public iterator_facade_base<generic_def_path_iterator<T, Walker>,
                                     std::forward_iterator_tag, T *> {
-    generic_def_path_iterator() = default;
+    generic_def_path_iterator() {}
     generic_def_path_iterator(Walker *W, ListIndex N) : W(W), N(N) {}
 
     T &operator*() const { return curNode(); }
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 66d3a281d42d..88f0630f014f 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -486,13 +486,17 @@ getSchedRegions(MachineBasicBlock *MBB,
       MachineInstr &MI = *std::prev(I);
       if (isSchedBoundary(&MI, &*MBB, MF, TII))
         break;
-      if (!MI.isDebugInstr())
+      if (!MI.isDebugInstr()) {
         // MBB::size() uses instr_iterator to count. Here we need a bundle to
         // count as a single instruction.
         ++NumRegionInstrs;
+      }
     }
 
-    Regions.push_back(SchedRegion(I, RegionEnd, NumRegionInstrs));
+    // It's possible we found a scheduling region that only has debug
+    // instructions. Don't bother scheduling these.
+    if (NumRegionInstrs != 0)
+      Regions.push_back(SchedRegion(I, RegionEnd, NumRegionInstrs));
   }
 
   if (RegionsTopDown)
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index 2b89f9d16fae..1ebc820a8b49 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -141,15 +141,25 @@ void IntelJITEventListener::notifyObjectLoaded(
     uint64_t Addr = *AddrOrErr;
     uint64_t Size = P.second;
 
+    auto SecOrErr = Sym.getSection();
+    if (!SecOrErr) {
+      // TODO: Actually report errors helpfully.
+      consumeError(SecOrErr.takeError());
+      continue;
+    }
+    object::section_iterator Sec = *SecOrErr;
+    if (Sec == Obj.section_end())
+      continue;
+    uint64_t Index = Sec->getIndex();
+
     // Record this address in a local vector
     Functions.push_back((void*)Addr);
 
     // Build the function loaded notification message
     iJIT_Method_Load FunctionMessage =
       FunctionDescToIntelJITFormat(*Wrapper, Name->data(), Addr, Size);
-    // TODO: it is neccessary to set proper SectionIndex here.
-    // object::SectionedAddress::UndefSection works for only absolute addresses.
-    DILineInfoTable Lines = Context->getLineInfoForAddressRange({Addr, object::SectionedAddress::UndefSection}, Size);
+    DILineInfoTable Lines =
+      Context->getLineInfoForAddressRange({Addr, Index}, Size);
     DILineInfoTable::iterator Begin = Lines.begin();
     DILineInfoTable::iterator End = Lines.end();
     for (DILineInfoTable::iterator It = Begin; It != End; ++It) {
diff --git a/lib/ObjectYAML/MinidumpYAML.cpp b/lib/ObjectYAML/MinidumpYAML.cpp
index e578e0591f90..bd017c82fb4d 100644
--- a/lib/ObjectYAML/MinidumpYAML.cpp
+++ b/lib/ObjectYAML/MinidumpYAML.cpp
@@ -17,7 +17,7 @@ class BlobAllocator {
 public:
   size_t tell() const { return NextOffset; }
 
-  size_t AllocateCallback(size_t Size,
+  size_t allocateCallback(size_t Size,
                           std::function<void(raw_ostream &)> Callback) {
     size_t Offset = NextOffset;
     NextOffset += Size;
@@ -25,18 +25,18 @@ class BlobAllocator {
     return Offset;
   }
 
-  size_t AllocateBytes(ArrayRef<uint8_t> Data) {
-    return AllocateCallback(
+  size_t allocateBytes(ArrayRef<uint8_t> Data) {
+    return allocateCallback(
         Data.size(), [Data](raw_ostream &OS) { OS << toStringRef(Data); });
   }
 
-  template <typename T> size_t AllocateArray(ArrayRef<T> Data) {
-    return AllocateBytes({reinterpret_cast<const uint8_t *>(Data.data()),
+  template <typename T> size_t allocateArray(ArrayRef<T> Data) {
+    return allocateBytes({reinterpret_cast<const uint8_t *>(Data.data()),
                           sizeof(T) * Data.size()});
   }
 
-  template <typename T> size_t AllocateObject(const T &Data) {
-    return AllocateArray(makeArrayRef(Data));
+  template <typename T> size_t allocateObject(const T &Data) {
+    return allocateArray(makeArrayRef(Data));
   }
 
   void writeTo(raw_ostream &OS) const;
@@ -340,7 +340,7 @@ static Directory layout(BlobAllocator &File, Stream &S) {
   switch (S.Kind) {
   case Stream::StreamKind::RawContent: {
     RawContentStream &Raw = cast<RawContentStream>(S);
-    File.AllocateCallback(Raw.Size, [&Raw](raw_ostream &OS) {
+    File.allocateCallback(Raw.Size, [&Raw](raw_ostream &OS) {
       Raw.Content.writeAsBinary(OS);
       assert(Raw.Content.binary_size() <= Raw.Size);
       OS << std::string(Raw.Size - Raw.Content.binary_size(), '\0');
@@ -348,10 +348,10 @@ static Directory layout(BlobAllocator &File, Stream &S) {
     break;
   }
   case Stream::StreamKind::SystemInfo:
-    File.AllocateObject(cast<SystemInfoStream>(S).Info);
+    File.allocateObject(cast<SystemInfoStream>(S).Info);
     break;
   case Stream::StreamKind::TextContent:
-    File.AllocateArray(arrayRefFromStringRef(cast<TextContentStream>(S).Text));
+    File.allocateArray(arrayRefFromStringRef(cast<TextContentStream>(S).Text));
     break;
   }
   Result.Location.DataSize = File.tell() - Result.Location.RVA;
@@ -360,11 +360,11 @@ static Directory layout(BlobAllocator &File, Stream &S) {
 
 void MinidumpYAML::writeAsBinary(Object &Obj, raw_ostream &OS) {
   BlobAllocator File;
-  File.AllocateObject(Obj.Header);
+  File.allocateObject(Obj.Header);
 
   std::vector<Directory> StreamDirectory(Obj.Streams.size());
   Obj.Header.StreamDirectoryRVA =
-      File.AllocateArray(makeArrayRef(StreamDirectory));
+      File.allocateArray(makeArrayRef(StreamDirectory));
   Obj.Header.NumberOfStreams = StreamDirectory.size();
 
   for (auto &Stream : enumerate(Obj.Streams))
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index dae7d455d826..03c0353390f0 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -200,6 +200,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 82817d3b5ad1..5ab211b791e7 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -129,14 +129,13 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
 
   // We're keeping these builders around because we'll want to add support for
   // floating point to them.
-  auto &LoadStoreBuilder =
-      getActionDefinitionsBuilder({G_LOAD, G_STORE})
-          .legalForTypesWithMemDesc({
-              {s1, p0, 8, 8},
-              {s8, p0, 8, 8},
-              {s16, p0, 16, 8},
-              {s32, p0, 32, 8},
-              {p0, p0, 32, 8}});
+  auto &LoadStoreBuilder = getActionDefinitionsBuilder({G_LOAD, G_STORE})
+                               .legalForTypesWithMemDesc({{s1, p0, 8, 8},
+                                                          {s8, p0, 8, 8},
+                                                          {s16, p0, 16, 8},
+                                                          {s32, p0, 32, 8},
+                                                          {p0, p0, 32, 8}})
+                               .unsupportedIfMemSizeNotPow2();
 
   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
   getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
@@ -155,7 +154,9 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
         {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FCONSTANT, G_FNEG})
         .legalFor({s32, s64});
 
-    LoadStoreBuilder.legalFor({{s64, p0}});
+    LoadStoreBuilder
+        .legalForTypesWithMemDesc({{s64, p0, 64, 32}})
+        .maxScalar(0, s32);
     PhiBuilder.legalFor({s64});
 
     getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct({s1},
diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp
index fd0f7921ba3a..0cee6e732ec0 100644
--- a/lib/Target/Mips/MipsCallLowering.cpp
+++ b/lib/Target/Mips/MipsCallLowering.cpp
@@ -23,10 +23,10 @@ using namespace llvm;
 MipsCallLowering::MipsCallLowering(const MipsTargetLowering &TLI)
     : CallLowering(&TLI) {}
 
-bool MipsCallLowering::MipsHandler::assign(unsigned VReg,
-                                           const CCValAssign &VA) {
+bool MipsCallLowering::MipsHandler::assign(unsigned VReg, const CCValAssign &VA,
+                                           const EVT &VT) {
   if (VA.isRegLoc()) {
-    assignValueToReg(VReg, VA);
+    assignValueToReg(VReg, VA, VT);
   } else if (VA.isMemLoc()) {
     assignValueToAddress(VReg, VA);
   } else {
@@ -37,9 +37,10 @@ bool MipsCallLowering::MipsHandler::assign(unsigned VReg,
 
 bool MipsCallLowering::MipsHandler::assignVRegs(ArrayRef<unsigned> VRegs,
                                                 ArrayRef<CCValAssign> ArgLocs,
-                                                unsigned ArgLocsStartIndex) {
+                                                unsigned ArgLocsStartIndex,
+                                                const EVT &VT) {
   for (unsigned i = 0; i < VRegs.size(); ++i)
-    if (!assign(VRegs[i], ArgLocs[ArgLocsStartIndex + i]))
+    if (!assign(VRegs[i], ArgLocs[ArgLocsStartIndex + i], VT))
       return false;
   return true;
 }
@@ -71,10 +72,10 @@ bool MipsCallLowering::MipsHandler::handle(
       for (unsigned i = 0; i < SplitLength; ++i)
         VRegs.push_back(MRI.createGenericVirtualRegister(LLT{RegisterVT}));
 
-      if (!handleSplit(VRegs, ArgLocs, ArgLocsIndex, Args[ArgsIndex].Reg))
+      if (!handleSplit(VRegs, ArgLocs, ArgLocsIndex, Args[ArgsIndex].Reg, VT))
         return false;
     } else {
-      if (!assign(Args[ArgsIndex].Reg, ArgLocs[ArgLocsIndex]))
+      if (!assign(Args[ArgsIndex].Reg, ArgLocs[ArgLocsIndex], VT))
         return false;
     }
   }
@@ -88,7 +89,8 @@ class IncomingValueHandler : public MipsCallLowering::MipsHandler {
       : MipsHandler(MIRBuilder, MRI) {}
 
 private:
-  void assignValueToReg(unsigned ValVReg, const CCValAssign &VA) override;
+  void assignValueToReg(unsigned ValVReg, const CCValAssign &VA,
+                        const EVT &VT) override;
 
   unsigned getStackAddress(const CCValAssign &VA,
                            MachineMemOperand *&MMO) override;
@@ -97,7 +99,7 @@ class IncomingValueHandler : public MipsCallLowering::MipsHandler {
 
   bool handleSplit(SmallVectorImpl<unsigned> &VRegs,
                    ArrayRef<CCValAssign> ArgLocs, unsigned ArgLocsStartIndex,
-                   unsigned ArgsReg) override;
+                   unsigned ArgsReg, const EVT &VT) override;
 
   virtual void markPhysRegUsed(unsigned PhysReg) {
     MIRBuilder.getMBB().addLiveIn(PhysReg);
@@ -127,21 +129,47 @@ class CallReturnHandler : public IncomingValueHandler {
 } // end anonymous namespace
 
 void IncomingValueHandler::assignValueToReg(unsigned ValVReg,
-                                            const CCValAssign &VA) {
+                                            const CCValAssign &VA,
+                                            const EVT &VT) {
+  const MipsSubtarget &STI =
+      static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
   unsigned PhysReg = VA.getLocReg();
-  switch (VA.getLocInfo()) {
-  case CCValAssign::LocInfo::SExt:
-  case CCValAssign::LocInfo::ZExt:
-  case CCValAssign::LocInfo::AExt: {
-    auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
-    MIRBuilder.buildTrunc(ValVReg, Copy);
-    break;
-  }
-  default:
-    MIRBuilder.buildCopy(ValVReg, PhysReg);
-    break;
+  if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
+    // An f64 arriving in A0-A3 occupies PhysReg and PhysReg + 1. Rebuild the
+    // value from both halves; which register supplies the first operand
+    // depends on the subtarget's endianness.
+    MIRBuilder
+        .buildInstr(STI.isFP64bit() ? Mips::BuildPairF64_64
+                                    : Mips::BuildPairF64)
+        .addDef(ValVReg)
+        .addUse(PhysReg + (STI.isLittle() ? 0 : 1))
+        .addUse(PhysReg + (STI.isLittle() ? 1 : 0))
+        .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+                          *STI.getRegBankInfo());
+    markPhysRegUsed(PhysReg);
+    markPhysRegUsed(PhysReg + 1);
+  } else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
+    MIRBuilder.buildInstr(Mips::MTC1)
+        .addDef(ValVReg)
+        .addUse(PhysReg)
+        .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+                          *STI.getRegBankInfo());
+    markPhysRegUsed(PhysReg);
+  } else {
+    switch (VA.getLocInfo()) {
+    case CCValAssign::LocInfo::SExt:
+    case CCValAssign::LocInfo::ZExt:
+    case CCValAssign::LocInfo::AExt: {
+      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
+      MIRBuilder.buildTrunc(ValVReg, Copy);
+      break;
+    }
+    default:
+      MIRBuilder.buildCopy(ValVReg, PhysReg);
+      break;
+    }
+    markPhysRegUsed(PhysReg);
   }
-  markPhysRegUsed(PhysReg);
 }
 
 unsigned IncomingValueHandler::getStackAddress(const CCValAssign &VA,
@@ -180,8 +208,8 @@ void IncomingValueHandler::assignValueToAddress(unsigned ValVReg,
 bool IncomingValueHandler::handleSplit(SmallVectorImpl<unsigned> &VRegs,
                                        ArrayRef<CCValAssign> ArgLocs,
                                        unsigned ArgLocsStartIndex,
-                                       unsigned ArgsReg) {
-  if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex))
+                                       unsigned ArgsReg, const EVT &VT) {
+  if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT))
     return false;
   setLeastSignificantFirst(VRegs);
   MIRBuilder.buildMerge(ArgsReg, VRegs);
@@ -196,7 +224,8 @@ class OutgoingValueHandler : public MipsCallLowering::MipsHandler {
       : MipsHandler(MIRBuilder, MRI), MIB(MIB) {}
 
 private:
-  void assignValueToReg(unsigned ValVReg, const CCValAssign &VA) override;
+  void assignValueToReg(unsigned ValVReg, const CCValAssign &VA,
+                        const EVT &VT) override;
 
   unsigned getStackAddress(const CCValAssign &VA,
                            MachineMemOperand *&MMO) override;
@@ -205,7 +234,7 @@ class OutgoingValueHandler : public MipsCallLowering::MipsHandler {
 
   bool handleSplit(SmallVectorImpl<unsigned> &VRegs,
                    ArrayRef<CCValAssign> ArgLocs, unsigned ArgLocsStartIndex,
-                   unsigned ArgsReg) override;
+                   unsigned ArgsReg, const EVT &VT) override;
 
   unsigned extendRegister(unsigned ValReg, const CCValAssign &VA);
 
@@ -214,11 +243,40 @@ class OutgoingValueHandler : public MipsCallLowering::MipsHandler {
 } // end anonymous namespace
 
 void OutgoingValueHandler::assignValueToReg(unsigned ValVReg,
-                                            const CCValAssign &VA) {
+                                            const CCValAssign &VA,
+                                            const EVT &VT) {
   unsigned PhysReg = VA.getLocReg();
-  unsigned ExtReg = extendRegister(ValVReg, VA);
-  MIRBuilder.buildCopy(PhysReg, ExtReg);
-  MIB.addUse(PhysReg, RegState::Implicit);
+  const MipsSubtarget &STI =
+      static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
+
+  if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
+    MIRBuilder
+        .buildInstr(STI.isFP64bit() ? Mips::ExtractElementF64_64
+                                    : Mips::ExtractElementF64)
+        .addDef(PhysReg + (STI.isLittle() ? 1 : 0))
+        .addUse(ValVReg)
+        .addImm(1)
+        .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+                          *STI.getRegBankInfo());
+    MIRBuilder
+        .buildInstr(STI.isFP64bit() ? Mips::ExtractElementF64_64
+                                    : Mips::ExtractElementF64)
+        .addDef(PhysReg + (STI.isLittle() ? 0 : 1))
+        .addUse(ValVReg)
+        .addImm(0)
+        .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+                          *STI.getRegBankInfo());
+  } else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
+    MIRBuilder.buildInstr(Mips::MFC1)
+        .addDef(PhysReg)
+        .addUse(ValVReg)
+        .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+                          *STI.getRegBankInfo());
+  } else {
+    unsigned ExtReg = extendRegister(ValVReg, VA);
+    MIRBuilder.buildCopy(PhysReg, ExtReg);
+    MIB.addUse(PhysReg, RegState::Implicit);
+  }
 }
 
 unsigned OutgoingValueHandler::getStackAddress(const CCValAssign &VA,
@@ -286,10 +344,10 @@ unsigned OutgoingValueHandler::extendRegister(unsigned ValReg,
 bool OutgoingValueHandler::handleSplit(SmallVectorImpl<unsigned> &VRegs,
                                        ArrayRef<CCValAssign> ArgLocs,
                                        unsigned ArgLocsStartIndex,
-                                       unsigned ArgsReg) {
+                                       unsigned ArgsReg, const EVT &VT) {
   MIRBuilder.buildUnmerge(VRegs, ArgsReg);
   setLeastSignificantFirst(VRegs);
-  if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex))
+  if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT))
     return false;
 
   return true;
@@ -300,6 +358,8 @@ static bool isSupportedType(Type *T) {
     return true;
   if (T->isPointerTy())
     return true;
+  if (T->isFloatingPointTy())
+    return true;
   return false;
 }
 
diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h
index dc04d83733e7..05c703b60bd6 100644
--- a/lib/Target/Mips/MipsCallLowering.h
+++ b/lib/Target/Mips/MipsCallLowering.h
@@ -35,7 +35,7 @@ class MipsCallLowering : public CallLowering {
 
   protected:
     bool assignVRegs(ArrayRef<unsigned> VRegs, ArrayRef<CCValAssign> ArgLocs,
-                     unsigned Index);
+                     unsigned ArgLocsStartIndex, const EVT &VT);
 
     void setLeastSignificantFirst(SmallVectorImpl<unsigned> &VRegs);
 
@@ -43,19 +43,21 @@ class MipsCallLowering : public CallLowering {
     MachineRegisterInfo &MRI;
 
   private:
-    bool assign(unsigned VReg, const CCValAssign &VA);
+    bool assign(unsigned VReg, const CCValAssign &VA, const EVT &VT);
 
     virtual unsigned getStackAddress(const CCValAssign &VA,
                                      MachineMemOperand *&MMO) = 0;
 
-    virtual void assignValueToReg(unsigned ValVReg, const CCValAssign &VA) = 0;
+    virtual void assignValueToReg(unsigned ValVReg, const CCValAssign &VA,
+                                  const EVT &VT) = 0;
 
     virtual void assignValueToAddress(unsigned ValVReg,
                                       const CCValAssign &VA) = 0;
 
     virtual bool handleSplit(SmallVectorImpl<unsigned> &VRegs,
                              ArrayRef<CCValAssign> ArgLocs,
-                             unsigned ArgLocsStartIndex, unsigned ArgsReg) = 0;
+                             unsigned ArgLocsStartIndex, unsigned ArgsReg,
+                             const EVT &VT) = 0;
   };
 
   MipsCallLowering(const MipsTargetLowering &TLI);
diff --git a/lib/Target/Mips/MipsInstructionSelector.cpp b/lib/Target/Mips/MipsInstructionSelector.cpp
index 36aea2983591..ded8c1c1fbc0 100644
--- a/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -38,6 +38,7 @@ class MipsInstructionSelector : public InstructionSelector {
   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
   bool materialize32BitImm(unsigned DestReg, APInt Imm,
                            MachineIRBuilder &B) const;
+  bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
 
   const MipsTargetMachine &TM;
   const MipsSubtarget &STI;
@@ -75,15 +76,24 @@ MipsInstructionSelector::MipsInstructionSelector(
 {
 }
 
-static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
-                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
-                       const RegisterBankInfo &RBI) {
+bool MipsInstructionSelector::selectCopy(MachineInstr &I,
+                                         MachineRegisterInfo &MRI) const {
   unsigned DstReg = I.getOperand(0).getReg();
   if (TargetRegisterInfo::isPhysicalRegister(DstReg))
     return true;
 
-  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  const RegisterBank *RegBank = RBI.getRegBank(DstReg, MRI, TRI);
+  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
 
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  if (RegBank->getID() == Mips::FPRBRegBankID) {
+    if (DstSize == 32)
+      RC = &Mips::FGR32RegClass;
+    else if (DstSize == 64)
+      RC = STI.isFP64bit() ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+    else
+      llvm_unreachable("Unsupported destination size");
+  }
   if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                       << " operand\n");
@@ -162,7 +172,7 @@ bool MipsInstructionSelector::select(MachineInstr &I,
 
   if (!isPreISelGenericOpcode(I.getOpcode())) {
     if (I.isCopy())
-      return selectCopy(I, TII, MRI, TRI, RBI);
+      return selectCopy(I, MRI);
 
     return true;
   }
diff --git a/lib/Target/Mips/MipsRegisterBankInfo.cpp b/lib/Target/Mips/MipsRegisterBankInfo.cpp
index 214dd106869e..08c33a4119ce 100644
--- a/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -62,6 +62,11 @@ const RegisterBank &MipsRegisterBankInfo::getRegBankFromRegClass(
   case Mips::GPRMM16MovePPairFirst_and_GPRMM16MovePPairSecondRegClassID:
   case Mips::SP32RegClassID:
     return getRegBank(Mips::GPRBRegBankID);
+  case Mips::FGRCCRegClassID:
+  case Mips::FGR64RegClassID:
+  case Mips::AFGR64RegClassID:
+  case Mips::AFGR64_and_OddSPRegClassID:
+    return getRegBank(Mips::FPRBRegBankID);
   default:
     llvm_unreachable("Register class not supported");
   }
diff --git a/lib/Target/Mips/MipsRegisterBanks.td b/lib/Target/Mips/MipsRegisterBanks.td
index b591841dfef2..14a0181f8f11 100644
--- a/lib/Target/Mips/MipsRegisterBanks.td
+++ b/lib/Target/Mips/MipsRegisterBanks.td
@@ -10,3 +10,5 @@
 //===----------------------------------------------------------------------===//
 
 def GPRBRegBank : RegisterBank<"GPRB", [GPR32]>;
+
+def FPRBRegBank : RegisterBank<"FPRB", [FGR64, AFGR64]>;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index bfa0df7db347..e08914c536e3 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3167,14 +3167,14 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
     SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
-    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
     NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     // Update the chain.
-    ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
     // Record the mem-refs
     CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
   } else {
-    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
   }
 
   return NewNode;
@@ -3561,13 +3561,15 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     }
 
     // Emit the smaller op and the shift.
-    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
-    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+    // Even though we shrink the constant, the VT should match the operation VT.
+    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, NVT);
+    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, MVT::i32,
+                                         N0->getOperand(0), NewCst);
     if (ShlVal == 1)
-      CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
+      CurDAG->SelectNodeTo(Node, AddOp, NVT, MVT::i32, SDValue(New, 0),
                            SDValue(New, 0));
     else
-      CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+      CurDAG->SelectNodeTo(Node, ShlOp, NVT, MVT::i32, SDValue(New, 0),
                            getI8Imm(ShlVal, dl));
     return;
   }
@@ -3968,7 +3970,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
           unsigned TrailingZeros = countTrailingZeros(Mask);
           SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
           SDValue Shift =
-            SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64,
+            SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
                                            N0.getOperand(0), Imm), 0);
           MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                        MVT::i32, Shift, Shift);
@@ -3979,7 +3981,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
           unsigned LeadingZeros = countLeadingZeros(Mask);
           SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
           SDValue Shift =
-            SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64,
+            SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
                                            N0.getOperand(0), Imm), 0);
           MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                        MVT::i32, Shift, Shift);
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 4c06b1765432..7aab8f8f377f 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -19,11 +19,6 @@ def GetLo32XForm : SDNodeXForm<imm, [{
   return getI32Imm((uint32_t)N->getZExtValue(), SDLoc(N));
 }]>;
 
-def GetLo8XForm : SDNodeXForm<imm, [{
-  // Transformation function: get the low 8 bits.
-  return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
-}]>;
-
 
 //===----------------------------------------------------------------------===//
 // Random Pseudo Instructions.
@@ -1523,7 +1518,7 @@ def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
             (i64 0),
             (AND32ri8
               (EXTRACT_SUBREG GR64:$src, sub_32bit),
-              (i32 (GetLo8XForm imm:$imm))),
+              (i32 (GetLo32XForm imm:$imm))),
             sub_32bit)>;
 
 def : Pat<(and GR64:$src, i64immZExt32:$imm),
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 99252fc2a7aa..2aa5fa45ce92 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -931,17 +931,6 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{
   return N->isExactlyValue(+0.0);
 }]>;
 
-def I8Imm : SDNodeXForm<imm, [{
-  // Transformation function: get the low 8 bits.
-  return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
-}]>;
-
-// BYTE_imm - Transform bit immediates into byte immediates.
-def BYTE_imm  : SDNodeXForm<imm, [{
-  // Transformation function: imm >> 3
-  return getI32Imm(N->getZExtValue() >> 3, SDLoc(N));
-}]>;
-
 // EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
 // to VEXTRACTF128/VEXTRACTI128 imm.
 def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7ed1183c05f9..983b8fa4af1f 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2922,13 +2922,8 @@ void BoUpSLP::reorderInputsAccordingToOpcode(const InstructionsState &S,
     // Peel the first iteration out of the loop since there's nothing
     // interesting to do anyway and it simplifies the checks in the loop.
     auto *I = cast<Instruction>(VL[0]);
-    Value *VLeft = I->getOperand(0);
-    Value *VRight = I->getOperand(1);
-    if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
-      // Favor having instruction to the right. FIXME: why?
-      std::swap(VLeft, VRight);
-    Left.push_back(VLeft);
-    Right.push_back(VRight);
+    Left.push_back(I->getOperand(0));
+    Right.push_back(I->getOperand(1));
   }
 
   // Keep track if we have instructions with all the same opcode on one side.
diff --git a/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
new file mode 100644
index 000000000000..d60abaf7c27b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
@@ -0,0 +1,115 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck %s
+
+# The sequence of DBG_VALUEs forms a scheduling region with 0 real
+# instructions. The RegPressure tracker would end up skipping over any
+# debug instructions, so it would point to the instruction
+# before/outside of the region, hitting this assert:
+#  assert((BotRPTracker.getPos() == RegionEnd ||
+#          (RegionEnd->isDebugInstr() &&
+#           BotRPTracker.getPos() == priorNonDebug(RegionEnd, RegionBegin))) &&
+#         "Can't find the region bottom");
+
+---
+name:            only_dbg_value_sched_region
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  waveLimiter:     true
+body:             |
+  ; CHECK-LABEL: name: only_dbg_value_sched_region
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $vgpr0
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, implicit $exec
+  ; CHECK:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, implicit $exec
+  ; CHECK:   undef %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
+  ; CHECK:   %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
+  ; CHECK:   [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK:   [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK:   [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK:   undef %11.sub1:vreg_64 = IMPLICIT_DEF
+  ; CHECK:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK:   [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+  ; CHECK:   undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec
+  ; CHECK:   dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec
+  ; CHECK:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, implicit $exec
+  ; CHECK:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK:   undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec
+  ; CHECK:   %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec
+  ; CHECK:   GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, implicit $exec
+  ; CHECK:   %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, 0, implicit $exec
+  ; CHECK:   [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, implicit $exec
+  ; CHECK:   dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, implicit $exec
+  ; CHECK:   dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, 0, implicit $exec
+  ; CHECK:   [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF2]], implicit $exec
+  ; CHECK:   dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, 0, implicit $exec
+  ; CHECK:   S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]]
+  ; CHECK:   GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, 0, implicit $exec
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   S_SETREG_IMM32_B32 0, 1
+  ; CHECK:   DBG_VALUE
+  ; CHECK:   DBG_VALUE
+  ; CHECK:   DBG_VALUE
+  ; CHECK:   S_SETREG_IMM32_B32 0, 1
+  ; CHECK: bb.2:
+  ; CHECK:   S_NOP 0, implicit [[COPY]]
+  ; CHECK:   S_NOP 0, implicit [[DEF8]]
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %1, 8, 0, 0, implicit $exec
+    undef %4.sub1:vreg_64 = V_ADD_U32_e32 %0, %0, implicit $exec
+    %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
+    %5:vreg_64 = COPY %2
+    undef %6.sub0:vreg_64 = V_ADD_F32_e32 %1.sub0, %5.sub0, implicit $exec
+    %6.sub1:vreg_64 = V_ADD_F32_e32 %1.sub1, %5.sub0, implicit $exec
+    %7:vgpr_32 = GLOBAL_LOAD_DWORD %5, 0, 0, 0, implicit $exec
+    %8:vreg_64 = IMPLICIT_DEF
+    %9:vreg_64 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    undef %11.sub1:vreg_64 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vreg_64 = IMPLICIT_DEF
+    %15:vreg_64 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %17:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    undef %19.sub0:vreg_64 = V_ADD_F32_e32 %7, %2.sub0, implicit $exec
+    %19.sub1:vreg_64 = V_ADD_F32_e32 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, implicit $exec
+    %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD %9, 0, 0, 0, implicit $exec
+    %8.sub0:vreg_64 = GLOBAL_LOAD_DWORD %10, 0, 0, 0, implicit $exec
+    %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, implicit $exec
+    %21:vgpr_32 = GLOBAL_LOAD_DWORD %14, 0, 0, 0, implicit $exec
+    %22:vgpr_32 = GLOBAL_LOAD_DWORD %15, 0, 0, 0, implicit $exec
+    %23:vreg_64 = V_LSHLREV_B64 2, %8, implicit $exec
+    S_NOP 0, implicit %13, implicit %23.sub0, implicit %12, implicit %17
+    GLOBAL_STORE_DWORD %15, %18, 0, 0, 0, implicit $exec
+
+  bb.1:
+    S_SETREG_IMM32_B32 0, 1
+    DBG_VALUE
+    DBG_VALUE
+    DBG_VALUE
+    S_SETREG_IMM32_B32 0, 1
+
+  bb.2:
+    S_NOP 0, implicit %0
+    S_NOP 0, implicit %16
+    S_ENDPGM 0
+
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir
index 34ed8b843075..a7d83d549b8a 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir
@@ -1,10 +1,16 @@
 # RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s
-# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple thumbv7-- -run-pass=legalizer %s -o - | FileCheck %s
 --- |
   define void @test_legal_loads_stores() { ret void }
   define void @test_load_from_stack() { ret void }
 
+  define void @test_load_store_64_vfp() #0 { ret void }
+  define void @test_load_store_64_novfp() #1 { ret void }
+
   define void @test_gep() { ret void }
+
+  attributes #0 = { "target-features"="+vfp2" }
+  attributes #1 = { "target-features"="-vfp2" }
 ...
 ---
 name:            test_legal_loads_stores
@@ -81,6 +87,88 @@ body:             |
     BX_RET 14, $noreg, implicit $r0
 ...
 ---
+name:            test_load_store_64_vfp
+# CHECK-LABEL: name: test_load_store_64_vfp
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    ; Can't use the VFP support for unaligned operations; we need to use 32-bit
+    ; operations instead.
+    ; CHECK: [[ADDR1:%[0-9]+]]:_(p0) = COPY $r0
+    ; CHECK-NEXT: [[V1:%[0-9]+]]:_(s32) = G_LOAD [[ADDR1]](p0) :: (load 4, align 1)
+    ; CHECK-NEXT: [[OFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[OFFCOPY:%[0-9]+]]:_(s32) = COPY [[OFF]]
+    ; CHECK-NEXT: [[ADDR2:%[0-9]+]]:_(p0) = G_GEP [[ADDR1]], [[OFFCOPY]]
+    ; CHECK-NEXT: [[V2:%[0-9]+]]:_(s32) = G_LOAD [[ADDR2]](p0) :: (load 4, align 1)
+    ; CHECK-NEXT: G_STORE [[V1]](s32), [[ADDR1]](p0) :: (store 4, align 1)
+    ; CHECK-NEXT: [[ADDR2:%[0-9]+]]:_(p0) = G_GEP [[ADDR1]], [[OFF]]
+    ; CHECK-NEXT: G_STORE [[V2]](s32), [[ADDR2]](p0) :: (store 4, align 1)
+    %0(p0) = COPY $r0
+    %1(s64) = G_LOAD %0(p0) :: (load 8, align 1)
+    G_STORE %1(s64), %0(p0) :: (store 8, align 1)
+
+    ; For word-aligned accesses we can use VFP operations.
+    ; CHECK: [[V:%[0-9]+]]:_(s64) = G_LOAD %0(p0) :: (load 8, align 4)
+    ; CHECK: G_STORE [[V]](s64), %0(p0) :: (store 8, align 4)
+    %2(s64) = G_LOAD %0(p0) :: (load 8, align 4)
+    G_STORE %2(s64), %0(p0) :: (store 8, align 4)
+
+    BX_RET 14, $noreg
+...
+---
+name:            test_load_store_64_novfp
+# CHECK-LABEL: name: test_load_store_64_novfp
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    ; When we don't have VFP support, we need to use 32-bit operations.
+    ; CHECK: [[ADDR1:%[0-9]+]]:_(p0) = COPY $r0
+    ; CHECK-NEXT: [[V1:%[0-9]+]]:_(s32) = G_LOAD [[ADDR1]](p0) :: (load 4, align 1)
+    ; CHECK-NEXT: [[OFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[OFFCOPY:%[0-9]+]]:_(s32) = COPY [[OFF]]
+    ; CHECK-NEXT: [[ADDR2:%[0-9]+]]:_(p0) = G_GEP [[ADDR1]], [[OFFCOPY]]
+    ; CHECK-NEXT: [[V2:%[0-9]+]]:_(s32) = G_LOAD [[ADDR2]](p0) :: (load 4, align 1)
+    ; CHECK-NEXT: G_STORE [[V1]](s32), [[ADDR1]](p0) :: (store 4, align 1)
+    ; CHECK-NEXT: [[OFFCOPY:%[0-9]+]]:_(s32) = COPY [[OFF]]
+    ; CHECK-NEXT: [[ADDR2:%[0-9]+]]:_(p0) = G_GEP [[ADDR1]], [[OFFCOPY]]
+    ; CHECK-NEXT: G_STORE [[V2]](s32), [[ADDR2]](p0) :: (store 4, align 1)
+    %0(p0) = COPY $r0
+    %1(s64) = G_LOAD %0(p0) :: (load 8, align 1)
+    G_STORE %1(s64), %0(p0) :: (store 8, align 1)
+
+    ; CHECK: [[V1:%[0-9]+]]:_(s32) = G_LOAD [[ADDR1]](p0) :: (load 4)
+    ; CHECK-NEXT: [[OFFCOPY:%[0-9]+]]:_(s32) = COPY [[OFF]]
+    ; CHECK-NEXT: [[ADDR2:%[0-9]+]]:_(p0) = G_GEP [[ADDR1]], [[OFFCOPY]]
+    ; CHECK-NEXT: [[V2:%[0-9]+]]:_(s32) = G_LOAD [[ADDR2]](p0) :: (load 4)
+    ; CHECK-NEXT: G_STORE [[V1]](s32), [[ADDR1]](p0) :: (store 4)
+    ; CHECK-NEXT: [[ADDR2:%[0-9]+]]:_(p0) = G_GEP [[ADDR1]], [[OFF]]
+    ; CHECK-NEXT: G_STORE [[V2]](s32), [[ADDR2]](p0) :: (store 4)
+    %2(s64) = G_LOAD %0(p0) :: (load 8, align 4)
+    G_STORE %2(s64), %0(p0) :: (store 8, align 4)
+
+    BX_RET 14, $noreg
+...
+---
 name:            test_gep
 # CHECK-LABEL: name: test_gep
 legalized:       false
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
index ae4e94904ec9..f4408adce960 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
@@ -1,7 +1,5 @@
 # RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s
 --- |
-  define void @test_load_store_64() #0 { ret void }
-
   define void @test_constants_s64() { ret void }
 
   define void @test_phi_s64() #0 { ret void }
@@ -9,34 +7,6 @@
   attributes #0 = { "target-features"="+vfp2" }
 ...
 ---
-name:            test_load_store_64
-# CHECK-LABEL: name: test_load_store_64
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0
-
-    ; These are legal, so we should find them unchanged in the output
-    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s64), %0(p0)
-    ; CHECK-DAG: {{%[0-9]+}}:_(s64) = G_LOAD %0(p0)
-    %0(p0) = COPY $r0
-    %1(s64) = G_LOAD %0(p0) :: (load 8)
-    G_STORE %1(s64), %0(p0) :: (store 8)
-    BX_RET 14, $noreg
-...
----
 name:            test_constants_s64
 # CHECK-LABEL: name: test_constants_s64
 legalized:       false
diff --git a/test/CodeGen/Mips/GlobalISel/instruction-select/float_args.mir b/test/CodeGen/Mips/GlobalISel/instruction-select/float_args.mir
new file mode 100644
index 000000000000..a81888ab49b3
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/instruction-select/float_args.mir
@@ -0,0 +1,303 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=FP32
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -mattr=+fp64,+mips32r2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=FP64
+--- |
+
+  define void @float_in_fpr() {entry: ret void}
+  define void @double_in_fpr() {entry: ret void}
+  define void @float_in_gpr() {entry: ret void}
+  define void @double_in_gpr() {entry: ret void}
+  define void @call_float_in_fpr() {entry: ret void}
+  define void @call_double_in_fpr() {entry: ret void}
+  define void @call_float_in_gpr() {entry: ret void}
+  define void @call_double_in_gpr() {entry: ret void}
+
+...
+---
+name:            float_in_fpr
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $f12, $f14
+
+    ; FP32-LABEL: name: float_in_fpr
+    ; FP32: liveins: $f12, $f14
+    ; FP32: [[COPY:%[0-9]+]]:fgr32 = COPY $f14
+    ; FP32: $f0 = COPY [[COPY]]
+    ; FP32: RetRA implicit $f0
+    ; FP64-LABEL: name: float_in_fpr
+    ; FP64: liveins: $f12, $f14
+    ; FP64: [[COPY:%[0-9]+]]:fgr32 = COPY $f14
+    ; FP64: $f0 = COPY [[COPY]]
+    ; FP64: RetRA implicit $f0
+    %1:fprb(s32) = COPY $f14
+    $f0 = COPY %1(s32)
+    RetRA implicit $f0
+
+...
+---
+name:            double_in_fpr
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $d6, $d7
+
+    ; FP32-LABEL: name: double_in_fpr
+    ; FP32: liveins: $d6, $d7
+    ; FP32: [[COPY:%[0-9]+]]:afgr64 = COPY $d7
+    ; FP32: $d0 = COPY [[COPY]]
+    ; FP32: RetRA implicit $d0
+    ; FP64-LABEL: name: double_in_fpr
+    ; FP64: liveins: $d6, $d7
+    ; FP64: [[COPY:%[0-9]+]]:fgr64 = COPY $d7
+    ; FP64: $d0 = COPY [[COPY]]
+    ; FP64: RetRA implicit $d0
+    %1:fprb(s64) = COPY $d7
+    $d0 = COPY %1(s64)
+    RetRA implicit $d0
+
+...
+---
+name:            float_in_gpr
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; FP32-LABEL: name: float_in_gpr
+    ; FP32: liveins: $a0, $a1
+    ; FP32: [[MTC1_:%[0-9]+]]:fgr32 = MTC1 $a1
+    ; FP32: $f0 = COPY [[MTC1_]]
+    ; FP32: RetRA implicit $f0
+    ; FP64-LABEL: name: float_in_gpr
+    ; FP64: liveins: $a0, $a1
+    ; FP64: [[MTC1_:%[0-9]+]]:fgr32 = MTC1 $a1
+    ; FP64: $f0 = COPY [[MTC1_]]
+    ; FP64: RetRA implicit $f0
+    %1:fgr32(s32) = MTC1 $a1
+    $f0 = COPY %1(s32)
+    RetRA implicit $f0
+
+...
+---
+name:            double_in_gpr
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a2, $a3
+
+    ; FP32-LABEL: name: double_in_gpr
+    ; FP32: liveins: $a0, $a2, $a3
+    ; FP32: [[BuildPairF64_:%[0-9]+]]:afgr64 = BuildPairF64 $a2, $a3
+    ; FP32: $d0 = COPY [[BuildPairF64_]]
+    ; FP32: RetRA implicit $d0
+    ; FP64-LABEL: name: double_in_gpr
+    ; FP64: liveins: $a0, $a2, $a3
+    ; FP64: [[BuildPairF64_:%[0-9]+]]:afgr64 = BuildPairF64 $a2, $a3
+    ; FP64: $d0 = COPY [[BuildPairF64_]]
+    ; FP64: RetRA implicit $d0
+    %1:afgr64(s64) = BuildPairF64 $a2, $a3
+    $d0 = COPY %1(s64)
+    RetRA implicit $d0
+
+...
+---
+name:            call_float_in_fpr
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $f12, $f14
+
+    ; FP32-LABEL: name: call_float_in_fpr
+    ; FP32: liveins: $f12, $f14
+    ; FP32: [[COPY:%[0-9]+]]:fgr32 = COPY $f12
+    ; FP32: [[COPY1:%[0-9]+]]:fgr32 = COPY $f14
+    ; FP32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $f12 = COPY [[COPY]]
+    ; FP32: $f14 = COPY [[COPY1]]
+    ; FP32: JAL @float_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $f12, implicit $f14, implicit-def $f0
+    ; FP32: [[COPY2:%[0-9]+]]:fgr32 = COPY $f0
+    ; FP32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $f0 = COPY [[COPY2]]
+    ; FP32: RetRA implicit $f0
+    ; FP64-LABEL: name: call_float_in_fpr
+    ; FP64: liveins: $f12, $f14
+    ; FP64: [[COPY:%[0-9]+]]:fgr32 = COPY $f12
+    ; FP64: [[COPY1:%[0-9]+]]:fgr32 = COPY $f14
+    ; FP64: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $f12 = COPY [[COPY]]
+    ; FP64: $f14 = COPY [[COPY1]]
+    ; FP64: JAL @float_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $f12, implicit $f14, implicit-def $f0
+    ; FP64: [[COPY2:%[0-9]+]]:fgr32 = COPY $f0
+    ; FP64: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $f0 = COPY [[COPY2]]
+    ; FP64: RetRA implicit $f0
+    %0:fprb(s32) = COPY $f12
+    %1:fprb(s32) = COPY $f14
+    ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    $f12 = COPY %0(s32)
+    $f14 = COPY %1(s32)
+    JAL @float_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $f12, implicit $f14, implicit-def $f0
+    %2:fprb(s32) = COPY $f0
+    ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    $f0 = COPY %2(s32)
+    RetRA implicit $f0
+
+...
+---
+name:            call_double_in_fpr
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $d6, $d7
+
+    ; FP32-LABEL: name: call_double_in_fpr
+    ; FP32: liveins: $d6, $d7
+    ; FP32: [[COPY:%[0-9]+]]:afgr64 = COPY $d6
+    ; FP32: [[COPY1:%[0-9]+]]:afgr64 = COPY $d7
+    ; FP32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $d6 = COPY [[COPY]]
+    ; FP32: $d7 = COPY [[COPY1]]
+    ; FP32: JAL @double_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $d6, implicit $d7, implicit-def $d0
+    ; FP32: [[COPY2:%[0-9]+]]:afgr64 = COPY $d0
+    ; FP32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $d0 = COPY [[COPY2]]
+    ; FP32: RetRA implicit $d0
+    ; FP64-LABEL: name: call_double_in_fpr
+    ; FP64: liveins: $d6, $d7
+    ; FP64: [[COPY:%[0-9]+]]:fgr64 = COPY $d6
+    ; FP64: [[COPY1:%[0-9]+]]:fgr64 = COPY $d7
+    ; FP64: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $d6 = COPY [[COPY]]
+    ; FP64: $d7 = COPY [[COPY1]]
+    ; FP64: JAL @double_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $d6, implicit $d7, implicit-def $d0
+    ; FP64: [[COPY2:%[0-9]+]]:fgr64 = COPY $d0
+    ; FP64: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $d0 = COPY [[COPY2]]
+    ; FP64: RetRA implicit $d0
+    %0:fprb(s64) = COPY $d6
+    %1:fprb(s64) = COPY $d7
+    ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    $d6 = COPY %0(s64)
+    $d7 = COPY %1(s64)
+    JAL @double_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $d6, implicit $d7, implicit-def $d0
+    %2:fprb(s64) = COPY $d0
+    ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    $d0 = COPY %2(s64)
+    RetRA implicit $d0
+
+...
+---
+name:            call_float_in_gpr
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; FP32-LABEL: name: call_float_in_gpr
+    ; FP32: liveins: $a0, $a1
+    ; FP32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+    ; FP32: [[MTC1_:%[0-9]+]]:fgr32 = MTC1 $a1
+    ; FP32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $a0 = COPY [[COPY]]
+    ; FP32: $a1 = MFC1 [[MTC1_]]
+    ; FP32: JAL @float_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $f0
+    ; FP32: [[COPY1:%[0-9]+]]:fgr32 = COPY $f0
+    ; FP32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $f0 = COPY [[COPY1]]
+    ; FP32: RetRA implicit $f0
+    ; FP64-LABEL: name: call_float_in_gpr
+    ; FP64: liveins: $a0, $a1
+    ; FP64: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+    ; FP64: [[MTC1_:%[0-9]+]]:fgr32 = MTC1 $a1
+    ; FP64: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $a0 = COPY [[COPY]]
+    ; FP64: $a1 = MFC1 [[MTC1_]]
+    ; FP64: JAL @float_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $f0
+    ; FP64: [[COPY1:%[0-9]+]]:fgr32 = COPY $f0
+    ; FP64: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $f0 = COPY [[COPY1]]
+    ; FP64: RetRA implicit $f0
+    %0:gprb(s32) = COPY $a0
+    %1:fgr32(s32) = MTC1 $a1
+    ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    $a0 = COPY %0(s32)
+    $a1 = MFC1 %1(s32)
+    JAL @float_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $f0
+    %2:fprb(s32) = COPY $f0
+    ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    $f0 = COPY %2(s32)
+    RetRA implicit $f0
+
+...
+---
+name:            call_double_in_gpr
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a2, $a3
+
+    ; FP32-LABEL: name: call_double_in_gpr
+    ; FP32: liveins: $a0, $a2, $a3
+    ; FP32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+    ; FP32: [[BuildPairF64_:%[0-9]+]]:afgr64 = BuildPairF64 $a2, $a3
+    ; FP32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $a0 = COPY [[COPY]]
+    ; FP32: $a3 = ExtractElementF64 [[BuildPairF64_]], 1
+    ; FP32: $a2 = ExtractElementF64 [[BuildPairF64_]], 0
+    ; FP32: JAL @double_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $d0
+    ; FP32: [[COPY1:%[0-9]+]]:afgr64 = COPY $d0
+    ; FP32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $d0 = COPY [[COPY1]]
+    ; FP32: RetRA implicit $d0
+    ; FP64-LABEL: name: call_double_in_gpr
+    ; FP64: liveins: $a0, $a2, $a3
+    ; FP64: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+    ; FP64: [[BuildPairF64_:%[0-9]+]]:afgr64 = BuildPairF64 $a2, $a3
+    ; FP64: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $a0 = COPY [[COPY]]
+    ; FP64: $a3 = ExtractElementF64 [[BuildPairF64_]], 1
+    ; FP64: $a2 = ExtractElementF64 [[BuildPairF64_]], 0
+    ; FP64: JAL @double_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $d0
+    ; FP64: [[COPY1:%[0-9]+]]:fgr64 = COPY $d0
+    ; FP64: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $d0 = COPY [[COPY1]]
+    ; FP64: RetRA implicit $d0
+    %0:gprb(s32) = COPY $a0
+    %1:afgr64(s64) = BuildPairF64 $a2, $a3
+    ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    $a0 = COPY %0(s32)
+    $a3 = ExtractElementF64 %1(s64), 1
+    $a2 = ExtractElementF64 %1(s64), 0
+    JAL @double_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $d0
+    %2:fprb(s64) = COPY $d0
+    ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    $d0 = COPY %2(s64)
+    RetRA implicit $d0
+
+...
+
diff --git a/test/CodeGen/Mips/GlobalISel/irtranslator/float_args.ll b/test/CodeGen/Mips/GlobalISel/irtranslator/float_args.ll
new file mode 100644
index 000000000000..24cfcd895a78
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/irtranslator/float_args.ll
@@ -0,0 +1,211 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+
+; RUN: llc -O0 -mtriple=mipsel-linux-gnu -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=FP32
+; RUN: llc -O0 -mtriple=mipsel-linux-gnu -mattr=+fp64,+mips32r2 -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=FP64
+
+define float @float_in_fpr(float %a, float %b) {
+  ; FP32-LABEL: name: float_in_fpr
+  ; FP32: bb.1.entry:
+  ; FP32:   liveins: $f12, $f14
+  ; FP32:   [[COPY:%[0-9]+]]:_(s32) = COPY $f12
+  ; FP32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $f14
+  ; FP32:   $f0 = COPY [[COPY1]](s32)
+  ; FP32:   RetRA implicit $f0
+  ; FP64-LABEL: name: float_in_fpr
+  ; FP64: bb.1.entry:
+  ; FP64:   liveins: $f12, $f14
+  ; FP64:   [[COPY:%[0-9]+]]:_(s32) = COPY $f12
+  ; FP64:   [[COPY1:%[0-9]+]]:_(s32) = COPY $f14
+  ; FP64:   $f0 = COPY [[COPY1]](s32)
+  ; FP64:   RetRA implicit $f0
+entry:
+  ret float %b
+}
+
+define double @double_in_fpr(double %a, double %b) {
+  ; FP32-LABEL: name: double_in_fpr
+  ; FP32: bb.1.entry:
+  ; FP32:   liveins: $d6, $d7
+  ; FP32:   [[COPY:%[0-9]+]]:_(s64) = COPY $d6
+  ; FP32:   [[COPY1:%[0-9]+]]:_(s64) = COPY $d7
+  ; FP32:   $d0 = COPY [[COPY1]](s64)
+  ; FP32:   RetRA implicit $d0
+  ; FP64-LABEL: name: double_in_fpr
+  ; FP64: bb.1.entry:
+  ; FP64:   liveins: $d12_64, $d14_64
+  ; FP64:   [[COPY:%[0-9]+]]:_(s64) = COPY $d12_64
+  ; FP64:   [[COPY1:%[0-9]+]]:_(s64) = COPY $d14_64
+  ; FP64:   $d0_64 = COPY [[COPY1]](s64)
+  ; FP64:   RetRA implicit $d0_64
+entry:
+  ret double %b
+}
+
+define float @float_in_gpr(i32 %a, float %b) {
+  ; FP32-LABEL: name: float_in_gpr
+  ; FP32: bb.1.entry:
+  ; FP32:   liveins: $a0, $a1
+  ; FP32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+  ; FP32:   [[MTC1_:%[0-9]+]]:fgr32(s32) = MTC1 $a1
+  ; FP32:   $f0 = COPY [[MTC1_]](s32)
+  ; FP32:   RetRA implicit $f0
+  ; FP64-LABEL: name: float_in_gpr
+  ; FP64: bb.1.entry:
+  ; FP64:   liveins: $a0, $a1
+  ; FP64:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+  ; FP64:   [[MTC1_:%[0-9]+]]:fgr32(s32) = MTC1 $a1
+  ; FP64:   $f0 = COPY [[MTC1_]](s32)
+  ; FP64:   RetRA implicit $f0
+entry:
+  ret float %b
+}
+
+define double @double_in_gpr(i32 %a, double %b) {
+  ; FP32-LABEL: name: double_in_gpr
+  ; FP32: bb.1.entry:
+  ; FP32:   liveins: $a0, $a2, $a3
+  ; FP32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+  ; FP32:   [[BuildPairF64_:%[0-9]+]]:afgr64(s64) = BuildPairF64 $a2, $a3
+  ; FP32:   $d0 = COPY [[BuildPairF64_]](s64)
+  ; FP32:   RetRA implicit $d0
+  ; FP64-LABEL: name: double_in_gpr
+  ; FP64: bb.1.entry:
+  ; FP64:   liveins: $a0, $a2, $a3
+  ; FP64:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+  ; FP64:   [[BuildPairF64_64_:%[0-9]+]]:fgr64(s64) = BuildPairF64_64 $a2, $a3
+  ; FP64:   $d0_64 = COPY [[BuildPairF64_64_]](s64)
+  ; FP64:   RetRA implicit $d0_64
+entry:
+  ret double %b
+}
+
+define float @call_float_in_fpr(float %a, float %b) {
+  ; FP32-LABEL: name: call_float_in_fpr
+  ; FP32: bb.1.entry:
+  ; FP32:   liveins: $f12, $f14
+  ; FP32:   [[COPY:%[0-9]+]]:_(s32) = COPY $f12
+  ; FP32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $f14
+  ; FP32:   ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+  ; FP32:   $f12 = COPY [[COPY]](s32)
+  ; FP32:   $f14 = COPY [[COPY1]](s32)
+  ; FP32:   JAL @float_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $f12, implicit $f14, implicit-def $f0
+  ; FP32:   [[COPY2:%[0-9]+]]:_(s32) = COPY $f0
+  ; FP32:   ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+  ; FP32:   $f0 = COPY [[COPY2]](s32)
+  ; FP32:   RetRA implicit $f0
+  ; FP64-LABEL: name: call_float_in_fpr
+  ; FP64: bb.1.entry:
+  ; FP64:   liveins: $f12, $f14
+  ; FP64:   [[COPY:%[0-9]+]]:_(s32) = COPY $f12
+  ; FP64:   [[COPY1:%[0-9]+]]:_(s32) = COPY $f14
+  ; FP64:   ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+  ; FP64:   $f12 = COPY [[COPY]](s32)
+  ; FP64:   $f14 = COPY [[COPY1]](s32)
+  ; FP64:   JAL @float_in_fpr, csr_o32_fp64, implicit-def $ra, implicit-def $sp, implicit $f12, implicit $f14, implicit-def $f0
+  ; FP64:   [[COPY2:%[0-9]+]]:_(s32) = COPY $f0
+  ; FP64:   ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+  ; FP64:   $f0 = COPY [[COPY2]](s32)
+  ; FP64:   RetRA implicit $f0
+entry:
+  %call = call float @float_in_fpr(float %a, float %b)
+  ret float %call
+}
+
+define double @call_double_in_fpr(double %a, double %b) {
+  ; FP32-LABEL: name: call_double_in_fpr
+  ; FP32: bb.1.entry:
+  ; FP32:   liveins: $d6, $d7
+  ; FP32:   [[COPY:%[0-9]+]]:_(s64) = COPY $d6
+  ; FP32:   [[COPY1:%[0-9]+]]:_(s64) = COPY $d7
+  ; FP32:   ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+  ; FP32:   $d6 = COPY [[COPY]](s64)
+  ; FP32:   $d7 = COPY [[COPY1]](s64)
+  ; FP32:   JAL @double_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $d6, implicit $d7, implicit-def $d0
+  ; FP32:   [[COPY2:%[0-9]+]]:_(s64) = COPY $d0
+  ; FP32:   ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+  ; FP32:   $d0 = COPY [[COPY2]](s64)
+  ; FP32:   RetRA implicit $d0
+  ; FP64-LABEL: name: call_double_in_fpr
+  ; FP64: bb.1.entry:
+  ; FP64:   liveins: $d12_64, $d14_64
+  ; FP64:   [[COPY:%[0-9]+]]:_(s64) = COPY $d12_64
+  ; FP64:   [[COPY1:%[0-9]+]]:_(s64) = COPY $d14_64
+  ; FP64:   ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+  ; FP64:   $d12_64 = COPY [[COPY]](s64)
+  ; FP64:   $d14_64 = COPY [[COPY1]](s64)
+  ; FP64:   JAL @double_in_fpr, csr_o32_fp64, implicit-def $ra, implicit-def $sp, implicit $d12_64, implicit $d14_64, implicit-def $d0_64
+  ; FP64:   [[COPY2:%[0-9]+]]:_(s64) = COPY $d0_64
+  ; FP64:   ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+  ; FP64:   $d0_64 = COPY [[COPY2]](s64)
+  ; FP64:   RetRA implicit $d0_64
+entry:
+  %call = call double @double_in_fpr(double %a, double %b)
+  ret double %call
+}
+
+define float @call_float_in_gpr(i32 %a, float %b) {
+  ; FP32-LABEL: name: call_float_in_gpr
+  ; FP32: bb.1.entry:
+  ; FP32:   liveins: $a0, $a1
+  ; FP32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+  ; FP32:   [[MTC1_:%[0-9]+]]:fgr32(s32) = MTC1 $a1
+  ; FP32:   ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+  ; FP32:   $a0 = COPY [[COPY]](s32)
+  ; FP32:   $a1 = MFC1 [[MTC1_]](s32)
+  ; FP32:   JAL @float_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $f0
+  ; FP32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $f0
+  ; FP32:   ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+  ; FP32:   $f0 = COPY [[COPY1]](s32)
+  ; FP32:   RetRA implicit $f0
+  ; FP64-LABEL: name: call_float_in_gpr
+  ; FP64: bb.1.entry:
+  ; FP64:   liveins: $a0, $a1
+  ; FP64:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+  ; FP64:   [[MTC1_:%[0-9]+]]:fgr32(s32) = MTC1 $a1
+  ; FP64:   ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+  ; FP64:   $a0 = COPY [[COPY]](s32)
+  ; FP64:   $a1 = MFC1 [[MTC1_]](s32)
+  ; FP64:   JAL @float_in_gpr, csr_o32_fp64, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $f0
+  ; FP64:   [[COPY1:%[0-9]+]]:_(s32) = COPY $f0
+  ; FP64:   ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+  ; FP64:   $f0 = COPY [[COPY1]](s32)
+  ; FP64:   RetRA implicit $f0
+entry:
+  %call = call float @float_in_gpr(i32 %a, float %b)
+  ret float %call
+}
+
+
+define double @call_double_in_gpr(i32 %a, double %b) {
+  ; FP32-LABEL: name: call_double_in_gpr
+  ; FP32: bb.1.entry:
+  ; FP32:   liveins: $a0, $a2, $a3
+  ; FP32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+  ; FP32:   [[BuildPairF64_:%[0-9]+]]:afgr64(s64) = BuildPairF64 $a2, $a3
+  ; FP32:   ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+  ; FP32:   $a0 = COPY [[COPY]](s32)
+  ; FP32:   $a3 = ExtractElementF64 [[BuildPairF64_]](s64), 1
+  ; FP32:   $a2 = ExtractElementF64 [[BuildPairF64_]](s64), 0
+  ; FP32:   JAL @double_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $d0
+  ; FP32:   [[COPY1:%[0-9]+]]:_(s64) = COPY $d0
+  ; FP32:   ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+  ; FP32:   $d0 = COPY [[COPY1]](s64)
+  ; FP32:   RetRA implicit $d0
+  ; FP64-LABEL: name: call_double_in_gpr
+  ; FP64: bb.1.entry:
+  ; FP64:   liveins: $a0, $a2, $a3
+  ; FP64:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+  ; FP64:   [[BuildPairF64_64_:%[0-9]+]]:fgr64(s64) = BuildPairF64_64 $a2, $a3
+  ; FP64:   ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+  ; FP64:   $a0 = COPY [[COPY]](s32)
+  ; FP64:   $a3 = ExtractElementF64_64 [[BuildPairF64_64_]](s64), 1
+  ; FP64:   $a2 = ExtractElementF64_64 [[BuildPairF64_64_]](s64), 0
+  ; FP64:   JAL @double_in_gpr, csr_o32_fp64, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $d0_64
+  ; FP64:   [[COPY1:%[0-9]+]]:_(s64) = COPY $d0_64
+  ; FP64:   ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+  ; FP64:   $d0_64 = COPY [[COPY1]](s64)
+  ; FP64:   RetRA implicit $d0_64
+entry:
+  %call = call double @double_in_gpr(i32 %a, double %b)
+  ret double %call
+}
diff --git a/test/CodeGen/Mips/GlobalISel/llvm-ir/float_args.ll b/test/CodeGen/Mips/GlobalISel/llvm-ir/float_args.ll
new file mode 100644
index 000000000000..e46b7e64acd2
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/llvm-ir/float_args.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc  -O0 -mtriple=mipsel-linux-gnu -global-isel -verify-machineinstrs %s -o -| FileCheck %s -check-prefixes=MIPS32,FP32
+; RUN: llc  -O0 -mtriple=mipsel-linux-gnu -mattr=+fp64,+mips32r2 -global-isel -verify-machineinstrs %s -o -| FileCheck %s -check-prefixes=MIPS32,FP64
+
+define float @float_in_fpr(float %a, float %b) {
+; MIPS32-LABEL: float_in_fpr:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    mov.s $f0, $f14
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret float %b
+}
+
+define double @double_in_fpr(double %a, double %b) {
+; MIPS32-LABEL: double_in_fpr:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    mov.d $f0, $f14
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret double %b
+}
+
+define float @float_in_gpr(i32 %a, float %b) {
+; MIPS32-LABEL: float_in_gpr:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    mtc1 $5, $f0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret float %b
+}
+
+define double @double_in_gpr(i32 %a, double %b) {
+; FP32-LABEL: double_in_gpr:
+; FP32:       # %bb.0: # %entry
+; FP32-NEXT:    mtc1 $6, $f0
+; FP32-NEXT:    mtc1 $7, $f1
+; FP32-NEXT:    jr $ra
+; FP32-NEXT:    nop
+;
+; FP64-LABEL: double_in_gpr:
+; FP64:       # %bb.0: # %entry
+; FP64-NEXT:    mtc1 $6, $f0
+; FP64-NEXT:    mthc1 $7, $f0
+; FP64-NEXT:    jr $ra
+; FP64-NEXT:    nop
+entry:
+  ret double %b
+}
+
+define float @call_float_in_fpr(float %a, float %b) {
+; MIPS32-LABEL: call_float_in_fpr:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    jal float_in_fpr
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %call = call float @float_in_fpr(float %a, float %b)
+  ret float %call
+}
+
+define double @call_double_in_fpr(double %a, double %b) {
+; MIPS32-LABEL: call_double_in_fpr:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    jal double_in_fpr
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %call = call double @double_in_fpr(double %a, double %b)
+  ret double %call
+}
+
+define float @call_float_in_gpr(i32 %a, float %b) {
+; MIPS32-LABEL: call_float_in_gpr:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    mtc1 $5, $f0
+; MIPS32-NEXT:    mfc1 $5, $f0
+; MIPS32-NEXT:    jal float_in_gpr
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %call = call float @float_in_gpr(i32 %a, float %b)
+  ret float %call
+}
+
+
+define double @call_double_in_gpr(i32 %a, double %b) {
+; FP32-LABEL: call_double_in_gpr:
+; FP32:       # %bb.0: # %entry
+; FP32-NEXT:    addiu $sp, $sp, -24
+; FP32-NEXT:    .cfi_def_cfa_offset 24
+; FP32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; FP32-NEXT:    .cfi_offset 31, -4
+; FP32-NEXT:    mtc1 $6, $f0
+; FP32-NEXT:    mtc1 $7, $f1
+; FP32-NEXT:    mfc1 $7, $f1
+; FP32-NEXT:    mfc1 $6, $f0
+; FP32-NEXT:    jal double_in_gpr
+; FP32-NEXT:    nop
+; FP32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; FP32-NEXT:    addiu $sp, $sp, 24
+; FP32-NEXT:    jr $ra
+; FP32-NEXT:    nop
+;
+; FP64-LABEL: call_double_in_gpr:
+; FP64:       # %bb.0: # %entry
+; FP64-NEXT:    addiu $sp, $sp, -24
+; FP64-NEXT:    .cfi_def_cfa_offset 24
+; FP64-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; FP64-NEXT:    .cfi_offset 31, -4
+; FP64-NEXT:    mtc1 $6, $f0
+; FP64-NEXT:    mthc1 $7, $f0
+; FP64-NEXT:    mfhc1 $7, $f0
+; FP64-NEXT:    mfc1 $6, $f0
+; FP64-NEXT:    jal double_in_gpr
+; FP64-NEXT:    nop
+; FP64-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; FP64-NEXT:    addiu $sp, $sp, 24
+; FP64-NEXT:    jr $ra
+; FP64-NEXT:    nop
+entry:
+  %call = call double @double_in_gpr(i32 %a, double %b)
+  ret double %call
+}
diff --git a/test/CodeGen/Mips/GlobalISel/regbankselect/float_args.mir b/test/CodeGen/Mips/GlobalISel/regbankselect/float_args.mir
new file mode 100644
index 000000000000..ba4d28ca53ac
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/regbankselect/float_args.mir
@@ -0,0 +1,296 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=regbankselect -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=FP32
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -mattr=+fp64,+mips32r2 -run-pass=regbankselect -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=FP64
+
+--- |
+
+  define void @float_in_fpr() {entry: ret void}
+  define void @double_in_fpr() {entry: ret void}
+  define void @float_in_gpr() {entry: ret void}
+  define void @double_in_gpr() {entry: ret void}
+  define void @call_float_in_fpr() {entry: ret void}
+  define void @call_double_in_fpr() {entry: ret void}
+  define void @call_float_in_gpr() {entry: ret void}
+  define void @call_double_in_gpr() {entry: ret void}
+
+...
+---
+name:            float_in_fpr
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $f12, $f14
+
+    ; FP32-LABEL: name: float_in_fpr
+    ; FP32: liveins: $f12, $f14
+    ; FP32: [[COPY:%[0-9]+]]:fprb(s32) = COPY $f14
+    ; FP32: $f0 = COPY [[COPY]](s32)
+    ; FP32: RetRA implicit $f0
+    ; FP64-LABEL: name: float_in_fpr
+    ; FP64: liveins: $f12, $f14
+    ; FP64: [[COPY:%[0-9]+]]:fprb(s32) = COPY $f14
+    ; FP64: $f0 = COPY [[COPY]](s32)
+    ; FP64: RetRA implicit $f0
+    %1:_(s32) = COPY $f14
+    $f0 = COPY %1(s32)
+    RetRA implicit $f0
+
+...
+---
+name:            double_in_fpr
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $d6, $d7
+
+    ; FP32-LABEL: name: double_in_fpr
+    ; FP32: liveins: $d6, $d7
+    ; FP32: [[COPY:%[0-9]+]]:fprb(s64) = COPY $d7
+    ; FP32: $d0 = COPY [[COPY]](s64)
+    ; FP32: RetRA implicit $d0
+    ; FP64-LABEL: name: double_in_fpr
+    ; FP64: liveins: $d6, $d7
+    ; FP64: [[COPY:%[0-9]+]]:fprb(s64) = COPY $d7
+    ; FP64: $d0 = COPY [[COPY]](s64)
+    ; FP64: RetRA implicit $d0
+    %1:_(s64) = COPY $d7
+    $d0 = COPY %1(s64)
+    RetRA implicit $d0
+
+...
+---
+name:            float_in_gpr
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; FP32-LABEL: name: float_in_gpr
+    ; FP32: liveins: $a0, $a1
+    ; FP32: [[MTC1_:%[0-9]+]]:fgr32(s32) = MTC1 $a1
+    ; FP32: $f0 = COPY [[MTC1_]](s32)
+    ; FP32: RetRA implicit $f0
+    ; FP64-LABEL: name: float_in_gpr
+    ; FP64: liveins: $a0, $a1
+    ; FP64: [[MTC1_:%[0-9]+]]:fgr32(s32) = MTC1 $a1
+    ; FP64: $f0 = COPY [[MTC1_]](s32)
+    ; FP64: RetRA implicit $f0
+    %1:fgr32(s32) = MTC1 $a1
+    $f0 = COPY %1(s32)
+    RetRA implicit $f0
+
+...
+---
+name:            double_in_gpr
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a2, $a3
+
+    ; FP32-LABEL: name: double_in_gpr
+    ; FP32: liveins: $a0, $a2, $a3
+    ; FP32: [[BuildPairF64_:%[0-9]+]]:afgr64(s64) = BuildPairF64 $a2, $a3
+    ; FP32: $d0 = COPY [[BuildPairF64_]](s64)
+    ; FP32: RetRA implicit $d0
+    ; FP64-LABEL: name: double_in_gpr
+    ; FP64: liveins: $a0, $a2, $a3
+    ; FP64: [[BuildPairF64_:%[0-9]+]]:afgr64(s64) = BuildPairF64 $a2, $a3
+    ; FP64: $d0 = COPY [[BuildPairF64_]](s64)
+    ; FP64: RetRA implicit $d0
+    %1:afgr64(s64) = BuildPairF64 $a2, $a3
+    $d0 = COPY %1(s64)
+    RetRA implicit $d0
+
+...
+---
+name:            call_float_in_fpr
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $f12, $f14
+
+    ; FP32-LABEL: name: call_float_in_fpr
+    ; FP32: liveins: $f12, $f14
+    ; FP32: [[COPY:%[0-9]+]]:fprb(s32) = COPY $f12
+    ; FP32: [[COPY1:%[0-9]+]]:fprb(s32) = COPY $f14
+    ; FP32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $f12 = COPY [[COPY]](s32)
+    ; FP32: $f14 = COPY [[COPY1]](s32)
+    ; FP32: JAL @float_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $f12, implicit $f14, implicit-def $f0
+    ; FP32: [[COPY2:%[0-9]+]]:fprb(s32) = COPY $f0
+    ; FP32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $f0 = COPY [[COPY2]](s32)
+    ; FP32: RetRA implicit $f0
+    ; FP64-LABEL: name: call_float_in_fpr
+    ; FP64: liveins: $f12, $f14
+    ; FP64: [[COPY:%[0-9]+]]:fprb(s32) = COPY $f12
+    ; FP64: [[COPY1:%[0-9]+]]:fprb(s32) = COPY $f14
+    ; FP64: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $f12 = COPY [[COPY]](s32)
+    ; FP64: $f14 = COPY [[COPY1]](s32)
+    ; FP64: JAL @float_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $f12, implicit $f14, implicit-def $f0
+    ; FP64: [[COPY2:%[0-9]+]]:fprb(s32) = COPY $f0
+    ; FP64: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $f0 = COPY [[COPY2]](s32)
+    ; FP64: RetRA implicit $f0
+    %0:_(s32) = COPY $f12
+    %1:_(s32) = COPY $f14
+    ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    $f12 = COPY %0(s32)
+    $f14 = COPY %1(s32)
+    JAL @float_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $f12, implicit $f14, implicit-def $f0
+    %2:_(s32) = COPY $f0
+    ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    $f0 = COPY %2(s32)
+    RetRA implicit $f0
+
+...
+---
+name:            call_double_in_fpr
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $d6, $d7
+
+    ; FP32-LABEL: name: call_double_in_fpr
+    ; FP32: liveins: $d6, $d7
+    ; FP32: [[COPY:%[0-9]+]]:fprb(s64) = COPY $d6
+    ; FP32: [[COPY1:%[0-9]+]]:fprb(s64) = COPY $d7
+    ; FP32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $d6 = COPY [[COPY]](s64)
+    ; FP32: $d7 = COPY [[COPY1]](s64)
+    ; FP32: JAL @double_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $d6, implicit $d7, implicit-def $d0
+    ; FP32: [[COPY2:%[0-9]+]]:fprb(s64) = COPY $d0
+    ; FP32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $d0 = COPY [[COPY2]](s64)
+    ; FP32: RetRA implicit $d0
+    ; FP64-LABEL: name: call_double_in_fpr
+    ; FP64: liveins: $d6, $d7
+    ; FP64: [[COPY:%[0-9]+]]:fprb(s64) = COPY $d6
+    ; FP64: [[COPY1:%[0-9]+]]:fprb(s64) = COPY $d7
+    ; FP64: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $d6 = COPY [[COPY]](s64)
+    ; FP64: $d7 = COPY [[COPY1]](s64)
+    ; FP64: JAL @double_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $d6, implicit $d7, implicit-def $d0
+    ; FP64: [[COPY2:%[0-9]+]]:fprb(s64) = COPY $d0
+    ; FP64: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $d0 = COPY [[COPY2]](s64)
+    ; FP64: RetRA implicit $d0
+    %0:_(s64) = COPY $d6
+    %1:_(s64) = COPY $d7
+    ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    $d6 = COPY %0(s64)
+    $d7 = COPY %1(s64)
+    JAL @double_in_fpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $d6, implicit $d7, implicit-def $d0
+    %2:_(s64) = COPY $d0
+    ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    $d0 = COPY %2(s64)
+    RetRA implicit $d0
+
+...
+---
+name:            call_float_in_gpr
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; FP32-LABEL: name: call_float_in_gpr
+    ; FP32: liveins: $a0, $a1
+    ; FP32: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
+    ; FP32: [[MTC1_:%[0-9]+]]:fgr32(s32) = MTC1 $a1
+    ; FP32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $a0 = COPY [[COPY]](s32)
+    ; FP32: $a1 = MFC1 [[MTC1_]](s32)
+    ; FP32: JAL @float_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $f0
+    ; FP32: [[COPY1:%[0-9]+]]:fprb(s32) = COPY $f0
+    ; FP32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $f0 = COPY [[COPY1]](s32)
+    ; FP32: RetRA implicit $f0
+    ; FP64-LABEL: name: call_float_in_gpr
+    ; FP64: liveins: $a0, $a1
+    ; FP64: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
+    ; FP64: [[MTC1_:%[0-9]+]]:fgr32(s32) = MTC1 $a1
+    ; FP64: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $a0 = COPY [[COPY]](s32)
+    ; FP64: $a1 = MFC1 [[MTC1_]](s32)
+    ; FP64: JAL @float_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $f0
+    ; FP64: [[COPY1:%[0-9]+]]:fprb(s32) = COPY $f0
+    ; FP64: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $f0 = COPY [[COPY1]](s32)
+    ; FP64: RetRA implicit $f0
+    %0:_(s32) = COPY $a0
+    %1:fgr32(s32) = MTC1 $a1
+    ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    $a0 = COPY %0(s32)
+    $a1 = MFC1 %1(s32)
+    JAL @float_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $f0
+    %2:_(s32) = COPY $f0
+    ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    $f0 = COPY %2(s32)
+    RetRA implicit $f0
+
+...
+---
+name:            call_double_in_gpr
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a2, $a3
+
+    ; FP32-LABEL: name: call_double_in_gpr
+    ; FP32: liveins: $a0, $a2, $a3
+    ; FP32: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
+    ; FP32: [[BuildPairF64_:%[0-9]+]]:afgr64(s64) = BuildPairF64 $a2, $a3
+    ; FP32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $a0 = COPY [[COPY]](s32)
+    ; FP32: $a3 = ExtractElementF64 [[BuildPairF64_]](s64), 1
+    ; FP32: $a2 = ExtractElementF64 [[BuildPairF64_]](s64), 0
+    ; FP32: JAL @double_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $d0
+    ; FP32: [[COPY1:%[0-9]+]]:fprb(s64) = COPY $d0
+    ; FP32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP32: $d0 = COPY [[COPY1]](s64)
+    ; FP32: RetRA implicit $d0
+    ; FP64-LABEL: name: call_double_in_gpr
+    ; FP64: liveins: $a0, $a2, $a3
+    ; FP64: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
+    ; FP64: [[BuildPairF64_:%[0-9]+]]:afgr64(s64) = BuildPairF64 $a2, $a3
+    ; FP64: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $a0 = COPY [[COPY]](s32)
+    ; FP64: $a3 = ExtractElementF64 [[BuildPairF64_]](s64), 1
+    ; FP64: $a2 = ExtractElementF64 [[BuildPairF64_]](s64), 0
+    ; FP64: JAL @double_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $d0
+    ; FP64: [[COPY1:%[0-9]+]]:fprb(s64) = COPY $d0
+    ; FP64: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; FP64: $d0 = COPY [[COPY1]](s64)
+    ; FP64: RetRA implicit $d0
+    %0:_(s32) = COPY $a0
+    %1:afgr64(s64) = BuildPairF64 $a2, $a3
+    ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    $a0 = COPY %0(s32)
+    $a3 = ExtractElementF64 %1(s64), 1
+    $a2 = ExtractElementF64 %1(s64), 0
+    JAL @double_in_gpr, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit-def $d0
+    %2:_(s64) = COPY $d0
+    ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    $d0 = COPY %2(s64)
+    RetRA implicit $d0
+
+...
+
diff --git a/test/CodeGen/X86/regalloc-copy-hints.mir b/test/CodeGen/X86/regalloc-copy-hints.mir
index 6287066e64fe..3a5cd41b852f 100644
--- a/test/CodeGen/X86/regalloc-copy-hints.mir
+++ b/test/CodeGen/X86/regalloc-copy-hints.mir
@@ -3,310 +3,12 @@
 # REQUIRES: asserts
 
 --- |
-  %0 = type { %1 }
-  %1 = type { %2, %23, %23*, %27*, %28*, %29, %33*, %34, %42, i8, i32, i32, i32 }
-  %2 = type { %3, %6, %14, %14, i8, i8*, i8*, %16 }
-  %3 = type { i32 (...)**, %4*, %5* }
-  %4 = type { i32 (...)**, %3* }
-  %5 = type { i32 (...)** }
-  %6 = type { %7 }
-  %7 = type { %8, i32, %12 }
-  %8 = type { %9**, %9**, %9**, %10 }
-  %9 = type { i32, i32, i32, i8* }
-  %10 = type { %11 }
-  %11 = type { %9** }
-  %12 = type { %13 }
-  %13 = type { i32 }
-  %14 = type { i32, %15* }
-  %15 = type { i32, i32, i8* }
-  %16 = type { %17 }
-  %17 = type { %18*, %20, %22 }
-  %18 = type { %19* }
-  %19 = type <{ %18, %19*, %18*, i8, [3 x i8] }>
-  %20 = type { %21 }
-  %21 = type { %18 }
-  %22 = type { %13 }
-  %23 = type { %24 }
-  %24 = type { %18*, %25, %26 }
-  %25 = type { %21 }
-  %26 = type { %13 }
-  %27 = type { i32 (...)** }
-  %28 = type { i32 (...)** }
-  %29 = type { %30 }
-  %30 = type { %18*, %31, %32 }
-  %31 = type { %21 }
-  %32 = type { %13 }
-  %33 = type { i32 (...)** }
-  %34 = type { %35 }
-  %35 = type { %36 }
-  %36 = type { %37, i32, %41 }
-  %37 = type { %38**, %38**, %38**, %39 }
-  %38 = type { %42, i32 }
-  %39 = type { %40 }
-  %40 = type { %38** }
-  %41 = type { %13 }
-  %42 = type { %43 }
-  %43 = type { %18*, %44, %45 }
-  %44 = type { %21 }
-  %45 = type { %13 }
-  %46 = type { %47, %48 }
-  %47 = type <{ %18, %19*, %18*, i8 }>
-  %48 = type { %49 }
-  %49 = type { i32, %50 }
-  %50 = type { { i32, i32 }, { i32, i32 }, { i32, i32 }, { i32, i32 }, { i32, i32 }, { i32, i32 } }
-  
-  define void @fun(%0* %arg) local_unnamed_addr #0 align 2 personality i32 (...)* @__gxx_personality_v0 {
-  bb:
-    %tmp = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 1
-    %tmp1 = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0
-    br i1 undef, label %bb5, label %bb6
-  
-  bb5:                                              ; preds = %bb
-    unreachable
-  
-  bb6:                                              ; preds = %bb
-    %tmp8 = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 8, i32 0, i32 1, i32 0, i32 0
-    br i1 undef, label %bb10, label %bb9
-  
-  bb9:                                              ; preds = %bb6
-    unreachable
-  
-  bb10:                                             ; preds = %bb6
-    store %18* %tmp8, %18** undef
-    br i1 undef, label %bb14, label %bb13
-  
-  bb13:                                             ; preds = %bb10
-    unreachable
-  
-  bb14:                                             ; preds = %bb10
-    br i1 undef, label %bb17, label %bb18
-  
-  bb17:                                             ; preds = %bb14
-    unreachable
-  
-  bb18:                                             ; preds = %bb14
-    br i1 undef, label %bb20, label %bb19
-  
-  bb19:                                             ; preds = %bb18
-    unreachable
-  
-  bb20:                                             ; preds = %bb18
-    br i1 undef, label %bb25, label %bb24
-  
-  bb24:                                             ; preds = %bb20
-    unreachable
-  
-  bb25:                                             ; preds = %bb20
-    br i1 undef, label %bb29, label %bb30
-  
-  bb29:                                             ; preds = %bb25
-    unreachable
-  
-  bb30:                                             ; preds = %bb25
-    br i1 undef, label %bb38, label %bb31
-  
-  bb31:                                             ; preds = %bb30
-    %tmp32 = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0
-    br i1 undef, label %bb34, label %bb35
-  
-  bb34:                                             ; preds = %bb31
-    unreachable
-  
-  bb35:                                             ; preds = %bb31
-    br i1 undef, label %bb40, label %bb36
-  
-  bb36:                                             ; preds = %bb35
-    unreachable
-  
-  bb38:                                             ; preds = %bb30
-    %tmp391 = bitcast %18* %tmp1 to %19**
-    br label %bb40
-  
-  bb40:                                             ; preds = %bb35, %bb38
-    %tmp41 = phi %18* [ %tmp1, %bb38 ], [ null, %bb35 ]
-    %tmp42 = phi %19** [ %tmp391, %bb38 ], [ %tmp32, %bb35 ]
-    br i1 undef, label %bb43, label %bb48
-  
-  bb43:                                             ; preds = %bb40
-    %tmp44 = tail call i8* @_Znwj()
-    store %18* %tmp41, %18** undef
-    %tmp46 = bitcast %19** %tmp42 to i8**
-    store i8* %tmp44, i8** %tmp46
-    %0 = bitcast i8* %tmp44 to %46*
-    tail call void @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_()
-    br label %bb48
-  
-  bb48:                                             ; preds = %bb43, %bb40
-    %tmp49 = phi %46* [ %0, %bb43 ], [ undef, %bb40 ]
-    %tmp50 = getelementptr inbounds %46, %46* %tmp49, i32 0, i32 1, i32 0, i32 1, i32 4, i32 0
-    store i32 ptrtoint (i1 (%0*)* @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private15_preEnd__authorEv to i32), i32* %tmp50
-    br i1 undef, label %bb52, label %bb53
-  
-  bb52:                                             ; preds = %bb48
-    unreachable
-  
-  bb53:                                             ; preds = %bb48
-    br i1 undef, label %bb55, label %bb54
-  
-  bb54:                                             ; preds = %bb53
-    unreachable
-  
-  bb55:                                             ; preds = %bb53
-    br i1 undef, label %bb59, label %bb58
-  
-  bb58:                                             ; preds = %bb55
-    unreachable
-  
-  bb59:                                             ; preds = %bb55
-    br i1 undef, label %bb62, label %bb61
-  
-  bb61:                                             ; preds = %bb59
-    unreachable
-  
-  bb62:                                             ; preds = %bb59
-    br i1 undef, label %bb64, label %bb65
-  
-  bb64:                                             ; preds = %bb62
-    unreachable
-  
-  bb65:                                             ; preds = %bb62
-    %tmp66 = icmp eq %46* null, null
-    br i1 %tmp66, label %bb72, label %bb67
-  
-  bb67:                                             ; preds = %bb65
-    %tmp68 = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0
-    br i1 undef, label %bb70, label %bb74
-  
-  bb70:                                             ; preds = %bb67
-    unreachable
-  
-  bb72:                                             ; preds = %bb65
-    %tmp732 = bitcast %18* %tmp1 to %19**
-    br label %bb74
-  
-  bb74:                                             ; preds = %bb67, %bb72
-    %tmp75 = phi %18* [ %tmp1, %bb72 ], [ null, %bb67 ]
-    %tmp76 = phi %19** [ %tmp732, %bb72 ], [ %tmp68, %bb67 ]
-    %tmp77 = tail call i8* @_Znwj()
-    store %18* %tmp75, %18** undef
-    %tmp79 = bitcast %19** %tmp76 to i8**
-    store i8* %tmp77, i8** %tmp79
-    %1 = bitcast i8* %tmp77 to %46*
-    tail call void @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_()
-    %tmp81 = getelementptr inbounds %46, %46* %1, i32 0, i32 1, i32 0, i32 1, i32 2, i32 0
-    store i32 ptrtoint (i1 (%0*)* @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private14_end__commentsEv to i32), i32* %tmp81
-    store %18* %tmp8, %18** undef
-    %2 = bitcast %0* %arg to i8*
-    %sunkaddr = getelementptr i8, i8* %2, i32 140
-    %3 = bitcast i8* %sunkaddr to %18**
-    %tmp85 = load %18*, %18** %3
-    %tmp864 = bitcast %18* %tmp85 to %19**
-    %tmp87 = load %19*, %19** %tmp864
-    %tmp88 = icmp eq %19* %tmp87, null
-    br i1 %tmp88, label %bb90, label %bb89
-  
-  bb89:                                             ; preds = %bb74
-    unreachable
-  
-  bb90:                                             ; preds = %bb74
-    br i1 undef, label %bb94, label %bb92
-  
-  bb92:                                             ; preds = %bb90
-    br i1 undef, label %bb96, label %bb97
-  
-  bb94:                                             ; preds = %bb90
-    unreachable
-  
-  bb96:                                             ; preds = %bb92
-    unreachable
-  
-  bb97:                                             ; preds = %bb92
-    br i1 undef, label %bb101, label %bb102
-  
-  bb101:                                            ; preds = %bb97
-    unreachable
-  
-  bb102:                                            ; preds = %bb97
-    br i1 undef, label %bb104, label %bb103
-  
-  bb103:                                            ; preds = %bb102
-    unreachable
-  
-  bb104:                                            ; preds = %bb102
-    br i1 undef, label %bb109, label %bb108
-  
-  bb108:                                            ; preds = %bb104
-    unreachable
-  
-  bb109:                                            ; preds = %bb104
-    br i1 undef, label %bb111, label %bb112
-  
-  bb111:                                            ; preds = %bb109
-    unreachable
-  
-  bb112:                                            ; preds = %bb109
-    br i1 undef, label %bb118, label %bb117
-  
-  bb117:                                            ; preds = %bb112
-    unreachable
-  
-  bb118:                                            ; preds = %bb112
-    br i1 undef, label %bb120, label %bb121
-  
-  bb120:                                            ; preds = %bb118
-    unreachable
-  
-  bb121:                                            ; preds = %bb118
-    br i1 undef, label %bb124, label %bb125
-  
-  bb124:                                            ; preds = %bb121
-    unreachable
-  
-  bb125:                                            ; preds = %bb121
-    %4 = bitcast %18* %tmp1 to %46**
-    %tmp126 = load %46*, %46** %4
-    %tmp127 = icmp eq %46* %tmp126, null
-    br i1 %tmp127, label %bb135, label %bb128
-  
-  bb128:                                            ; preds = %bb125
-    br label %bb129
-  
-  bb129:                                            ; preds = %bb131, %bb128
-    %tmp130 = icmp ugt i32 undef, 95406324
-    br i1 %tmp130, label %bb131, label %bb133
-  
-  bb131:                                            ; preds = %bb129
-    br label %bb129
-  
-  bb133:                                            ; preds = %bb129
-    unreachable
-  
-  bb135:                                            ; preds = %bb125
-    br i1 undef, label %bb137, label %bb138
-  
-  bb137:                                            ; preds = %bb135
-    unreachable
-  
-  bb138:                                            ; preds = %bb135
-    unreachable
-  }
-  
-  declare zeroext i1 @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private15_preEnd__authorEv(%0*) #0
-  
-  declare zeroext i1 @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private14_end__commentsEv(%0*) #0 align 2
-  
-  declare i32 @__gxx_personality_v0(...) #0
-  
-  declare noalias nonnull i8* @_Znwj() local_unnamed_addr #0
-  
-  declare void @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_() local_unnamed_addr #0
-  
-  ; Function Attrs: nounwind
-  declare void @llvm.stackprotector(i8*, i8**) #1
-  
-  attributes #0 = { "target-cpu"="i486" }
-  attributes #1 = { nounwind }
+  define void @fun() { ret void }
 
+  declare noalias nonnull i8* @_Znwj()
+  declare void @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_()
+  declare zeroext i1 @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private14_end__commentsEv()
+  declare zeroext i1 @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private15_preEnd__authorEv()
 ...
 ---
 # A physreg should always only be hinted once per getRegAllocationHints() query.
@@ -405,7 +107,7 @@ frameInfo:
 fixedStack:      
   - { id: 0, size: 4, alignment: 4, stack-id: 0, isImmutable: true }
 body:             |
-  bb.0.bb:
+  bb.0:
     successors: %bb.1(0x00000001), %bb.2(0x7fffffff)
   
     %13:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -413,11 +115,11 @@ body:             |
     JNE_1 %bb.2, implicit killed $eflags
     JMP_1 %bb.1
   
-  bb.1.bb5:
+  bb.1:
     successors: 
   
   
-  bb.2.bb6:
+  bb.2:
     successors: %bb.4(0x7fffffff), %bb.3(0x00000001)
   
     %15:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -425,26 +127,26 @@ body:             |
     JNE_1 %bb.4, implicit killed $eflags
     JMP_1 %bb.3
   
-  bb.3.bb9:
+  bb.3:
     successors: 
   
   
-  bb.4.bb10:
+  bb.4:
     successors: %bb.6(0x7fffffff), %bb.5(0x00000001)
   
-    %12:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0)
+    %12:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg
     %1:gr32 = LEA32r %12, 1, $noreg, 144, $noreg
-    MOV32mr undef %17:gr32, 1, $noreg, 0, $noreg, %1 :: (store 4 into `%18** undef`)
+    MOV32mr undef %17:gr32, 1, $noreg, 0, $noreg, %1
     %18:gr32_abcd = MOV32r0 implicit-def dead $eflags
     TEST8rr %18.sub_8bit, %18.sub_8bit, implicit-def $eflags
     JNE_1 %bb.6, implicit killed $eflags
     JMP_1 %bb.5
   
-  bb.5.bb13:
+  bb.5:
     successors: 
   
   
-  bb.6.bb14:
+  bb.6:
     successors: %bb.7(0x00000001), %bb.8(0x7fffffff)
   
     %20:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -452,11 +154,11 @@ body:             |
     JNE_1 %bb.8, implicit killed $eflags
     JMP_1 %bb.7
   
-  bb.7.bb17:
+  bb.7:
     successors: 
   
   
-  bb.8.bb18:
+  bb.8:
     successors: %bb.10(0x7fffffff), %bb.9(0x00000001)
   
     %22:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -464,11 +166,11 @@ body:             |
     JNE_1 %bb.10, implicit killed $eflags
     JMP_1 %bb.9
   
-  bb.9.bb19:
+  bb.9:
     successors: 
   
   
-  bb.10.bb20:
+  bb.10:
     successors: %bb.12(0x7fffffff), %bb.11(0x00000001)
   
     %24:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -476,11 +178,11 @@ body:             |
     JNE_1 %bb.12, implicit killed $eflags
     JMP_1 %bb.11
   
-  bb.11.bb24:
+  bb.11:
     successors: 
   
   
-  bb.12.bb25:
+  bb.12:
     successors: %bb.13(0x00000001), %bb.14(0x7fffffff)
   
     %26:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -488,18 +190,18 @@ body:             |
     JNE_1 %bb.14, implicit killed $eflags
     JMP_1 %bb.13
   
-  bb.13.bb29:
+  bb.13:
     successors: 
   
   
-  bb.14.bb30:
+  bb.14:
     %0:gr32 = LEA32r %12, 1, $noreg, 80, $noreg
     %28:gr32_abcd = MOV32r0 implicit-def dead $eflags
     TEST8rr %28.sub_8bit, %28.sub_8bit, implicit-def $eflags
     JNE_1 %bb.20, implicit killed $eflags
     JMP_1 %bb.15
   
-  bb.15.bb31:
+  bb.15:
     successors: %bb.16(0x00000001), %bb.17(0x7fffffff)
   
     %78:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -507,11 +209,11 @@ body:             |
     JNE_1 %bb.17, implicit killed $eflags
     JMP_1 %bb.16
   
-  bb.16.bb34:
+  bb.16:
     successors: 
   
   
-  bb.17.bb35:
+  bb.17:
     successors: %bb.18(0x7fffffff), %bb.19(0x00000001)
   
     TEST8rr %78.sub_8bit, %78.sub_8bit, implicit-def $eflags
@@ -521,15 +223,15 @@ body:             |
     %79:gr32 = LEA32r %12, 1, $noreg, 80, $noreg
     JMP_1 %bb.21
   
-  bb.19.bb36:
+  bb.19:
     successors: 
   
   
-  bb.20.bb38:
+  bb.20:
     %78:gr32_abcd = COPY %0
     %79:gr32 = COPY %0
   
-  bb.21.bb40:
+  bb.21:
     successors: %bb.22, %bb.23
   
     %35:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -538,31 +240,31 @@ body:             |
     JNE_1 %bb.23, implicit killed $eflags
     JMP_1 %bb.22
   
-  bb.22.bb43:
+  bb.22:
     ADJCALLSTACKDOWN32 0, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
     CALLpcrel32 @_Znwj, csr_32, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp, implicit-def $eax
     ADJCALLSTACKUP32 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
     %80:gr32 = COPY killed $eax
-    MOV32mr undef %38:gr32, 1, $noreg, 0, $noreg, %78 :: (store 4 into `%18** undef`)
-    MOV32mr %79, 1, $noreg, 0, $noreg, %80 :: (store 4 into %ir.tmp46)
+    MOV32mr undef %38:gr32, 1, $noreg, 0, $noreg, %78
+    MOV32mr %79, 1, $noreg, 0, $noreg, %80
     ADJCALLSTACKDOWN32 0, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
     CALLpcrel32 @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_, csr_32, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp
     ADJCALLSTACKUP32 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
   
-  bb.23.bb48:
+  bb.23:
     successors: %bb.24(0x00000001), %bb.25(0x7fffffff)
   
-    MOV32mi %80, 1, $noreg, 52, $noreg, @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private15_preEnd__authorEv :: (store 4 into %ir.tmp50)
+    MOV32mi %80, 1, $noreg, 52, $noreg, @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private15_preEnd__authorEv
     %39:gr32_abcd = MOV32r0 implicit-def dead $eflags
     TEST8rr %39.sub_8bit, %39.sub_8bit, implicit-def $eflags
     JNE_1 %bb.25, implicit killed $eflags
     JMP_1 %bb.24
   
-  bb.24.bb52:
+  bb.24:
     successors: 
   
   
-  bb.25.bb53:
+  bb.25:
     successors: %bb.27(0x7fffffff), %bb.26(0x00000001)
   
     %41:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -570,11 +272,11 @@ body:             |
     JNE_1 %bb.27, implicit killed $eflags
     JMP_1 %bb.26
   
-  bb.26.bb54:
+  bb.26:
     successors: 
   
   
-  bb.27.bb55:
+  bb.27:
     successors: %bb.29(0x7fffffff), %bb.28(0x00000001)
   
     %43:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -582,11 +284,11 @@ body:             |
     JNE_1 %bb.29, implicit killed $eflags
     JMP_1 %bb.28
   
-  bb.28.bb58:
+  bb.28:
     successors: 
   
   
-  bb.29.bb59:
+  bb.29:
     successors: %bb.31(0x7fffffff), %bb.30(0x00000001)
   
     %45:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -594,11 +296,11 @@ body:             |
     JNE_1 %bb.31, implicit killed $eflags
     JMP_1 %bb.30
   
-  bb.30.bb61:
+  bb.30:
     successors: 
   
   
-  bb.31.bb62:
+  bb.31:
     successors: %bb.32(0x00000001), %bb.33(0x7fffffff)
   
     %47:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -606,11 +308,11 @@ body:             |
     JNE_1 %bb.33, implicit killed $eflags
     JMP_1 %bb.32
   
-  bb.32.bb64:
+  bb.32:
     successors: 
   
   
-  bb.33.bb65:
+  bb.33:
     successors: %bb.37(0x30000000), %bb.34(0x50000000)
   
     %49:gr8 = MOV8ri 1
@@ -618,7 +320,7 @@ body:             |
     JNE_1 %bb.37, implicit killed $eflags
     JMP_1 %bb.34
   
-  bb.34.bb67:
+  bb.34:
     successors: %bb.36(0x00000001), %bb.35(0x7fffffff)
   
     %81:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -629,38 +331,38 @@ body:             |
     %82:gr32 = LEA32r %12, 1, $noreg, 80, $noreg
     JMP_1 %bb.38
   
-  bb.36.bb70:
+  bb.36:
     successors: 
   
   
-  bb.37.bb72:
+  bb.37:
     %81:gr32_abcd = COPY %0
     %82:gr32 = COPY %0
   
-  bb.38.bb74:
+  bb.38:
     successors: %bb.40(0x7fffffff), %bb.39(0x00000001)
   
     ADJCALLSTACKDOWN32 0, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
     CALLpcrel32 @_Znwj, csr_32, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp, implicit-def $eax
     ADJCALLSTACKUP32 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
     %52:gr32 = COPY killed $eax
-    MOV32mr undef %53:gr32, 1, $noreg, 0, $noreg, %81 :: (store 4 into `%18** undef`)
-    MOV32mr %82, 1, $noreg, 0, $noreg, %52 :: (store 4 into %ir.tmp79)
+    MOV32mr undef %53:gr32, 1, $noreg, 0, $noreg, %81
+    MOV32mr %82, 1, $noreg, 0, $noreg, %52
     ADJCALLSTACKDOWN32 0, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
     CALLpcrel32 @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_, csr_32, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp
     ADJCALLSTACKUP32 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
-    MOV32mi %52, 1, $noreg, 36, $noreg, @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private14_end__commentsEv :: (store 4 into %ir.tmp81)
-    MOV32mr undef %54:gr32, 1, $noreg, 0, $noreg, %1 :: (store 4 into `%18** undef`)
-    %55:gr32 = MOV32rm %12, 1, $noreg, 140, $noreg :: (load 4 from %ir.3)
-    CMP32mi8 %55, 1, $noreg, 0, $noreg, 0, implicit-def $eflags :: (load 4 from %ir.tmp864)
+    MOV32mi %52, 1, $noreg, 36, $noreg, @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private14_end__commentsEv
+    MOV32mr undef %54:gr32, 1, $noreg, 0, $noreg, %1
+    %55:gr32 = MOV32rm %12, 1, $noreg, 140, $noreg
+    CMP32mi8 %55, 1, $noreg, 0, $noreg, 0, implicit-def $eflags
     JE_1 %bb.40, implicit killed $eflags
     JMP_1 %bb.39
   
-  bb.39.bb89:
+  bb.39:
     successors: 
   
   
-  bb.40.bb90:
+  bb.40:
     successors: %bb.42(0x00000001), %bb.41(0x7fffffff)
   
     %56:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -668,7 +370,7 @@ body:             |
     JNE_1 %bb.42, implicit killed $eflags
     JMP_1 %bb.41
   
-  bb.41.bb92:
+  bb.41:
     successors: %bb.43(0x00000001), %bb.44(0x7fffffff)
   
     %58:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -676,15 +378,15 @@ body:             |
     JNE_1 %bb.43, implicit killed $eflags
     JMP_1 %bb.44
   
-  bb.42.bb94:
+  bb.42:
     successors: 
   
   
-  bb.43.bb96:
+  bb.43:
     successors: 
   
   
-  bb.44.bb97:
+  bb.44:
     successors: %bb.45(0x00000001), %bb.46(0x7fffffff)
   
     %60:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -692,11 +394,11 @@ body:             |
     JNE_1 %bb.46, implicit killed $eflags
     JMP_1 %bb.45
   
-  bb.45.bb101:
+  bb.45:
     successors: 
   
   
-  bb.46.bb102:
+  bb.46:
     successors: %bb.48(0x7fffffff), %bb.47(0x00000001)
   
     %62:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -704,11 +406,11 @@ body:             |
     JNE_1 %bb.48, implicit killed $eflags
     JMP_1 %bb.47
   
-  bb.47.bb103:
+  bb.47:
     successors: 
   
   
-  bb.48.bb104:
+  bb.48:
     successors: %bb.50(0x7fffffff), %bb.49(0x00000001)
   
     %64:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -716,11 +418,11 @@ body:             |
     JNE_1 %bb.50, implicit killed $eflags
     JMP_1 %bb.49
   
-  bb.49.bb108:
+  bb.49:
     successors: 
   
   
-  bb.50.bb109:
+  bb.50:
     successors: %bb.51(0x00000001), %bb.52(0x7fffffff)
   
     %66:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -728,11 +430,11 @@ body:             |
     JNE_1 %bb.52, implicit killed $eflags
     JMP_1 %bb.51
   
-  bb.51.bb111:
+  bb.51:
     successors: 
   
   
-  bb.52.bb112:
+  bb.52:
     successors: %bb.54(0x7fffffff), %bb.53(0x00000001)
   
     %68:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -740,11 +442,11 @@ body:             |
     JNE_1 %bb.54, implicit killed $eflags
     JMP_1 %bb.53
   
-  bb.53.bb117:
+  bb.53:
     successors: 
   
   
-  bb.54.bb118:
+  bb.54:
     successors: %bb.55(0x00000001), %bb.56(0x7fffffff)
   
     %70:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -752,11 +454,11 @@ body:             |
     JNE_1 %bb.56, implicit killed $eflags
     JMP_1 %bb.55
   
-  bb.55.bb120:
+  bb.55:
     successors: 
   
   
-  bb.56.bb121:
+  bb.56:
     successors: %bb.57(0x00000001), %bb.58(0x7fffffff)
   
     %72:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -764,31 +466,31 @@ body:             |
     JNE_1 %bb.58, implicit killed $eflags
     JMP_1 %bb.57
   
-  bb.57.bb124:
+  bb.57:
     successors: 
   
   
-  bb.58.bb125:
+  bb.58:
     successors: %bb.62(0x00000001), %bb.59(0x7fffffff)
   
-    CMP32mi8 %0, 1, $noreg, 0, $noreg, 0, implicit-def $eflags :: (load 4 from %ir.4)
+    CMP32mi8 %0, 1, $noreg, 0, $noreg, 0, implicit-def $eflags
     JE_1 %bb.62, implicit killed $eflags
     JMP_1 %bb.59
   
-  bb.59.bb128:
+  bb.59:
   
-  bb.60.bb129:
+  bb.60:
     successors: %bb.60(0x7fffffff), %bb.61(0x00000001)
   
     CMP32ri undef %75:gr32, 95406325, implicit-def $eflags
     JB_1 %bb.61, implicit killed $eflags
     JMP_1 %bb.60
   
-  bb.61.bb133:
+  bb.61:
     successors: 
   
   
-  bb.62.bb135:
+  bb.62:
     successors: %bb.63, %bb.64
   
     %76:gr32_abcd = MOV32r0 implicit-def dead $eflags
@@ -796,10 +498,10 @@ body:             |
     JNE_1 %bb.64, implicit killed $eflags
     JMP_1 %bb.63
   
-  bb.63.bb137:
+  bb.63:
     successors: 
   
   
-  bb.64.bb138:
+  bb.64:
 
 ...
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index 4e436f61e833..c21fc6a73a88 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -2563,3 +2563,160 @@ entry:
   %e = zext <8 x i6> %d to <8 x i64>
   ret <8 x i64> %e
 }
+
+define <4 x i64> @splatshuf_zext_v4i64(<4 x i32> %x) {
+; SSE2-LABEL: splatshuf_zext_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: splatshuf_zext_v4i64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: splatshuf_zext_v4i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatshuf_zext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatshuf_zext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatshuf_zext_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    retq
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %ext = zext <4 x i32> %shuf to <4 x i64>
+  ret <4 x i64> %ext
+}
+
+define <8 x i32> @splatshuf_zext_v8i32(<8 x i16> %x) {
+; SSE2-LABEL: splatshuf_zext_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,5,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,5,4]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: splatshuf_zext_v8i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: splatshuf_zext_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatshuf_zext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatshuf_zext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatshuf_zext_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    retq
+  %shuf = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 7, i32 0, i32 undef, i32 3, i32 7>
+  %ext = zext <8 x i16> %shuf to <8 x i32>
+  ret <8 x i32> %ext
+}
+
+define <16 x i16> @splatshuf_zext_v16i16(<16 x i8> %x) {
+; SSE2-LABEL: splatshuf_zext_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: splatshuf_zext_v16i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: splatshuf_zext_v16i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatshuf_zext_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatshuf_zext_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatshuf_zext_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT:    retq
+  %shuf = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14>
+  %ext = zext <16 x i8> %shuf to <16 x i16>
+  ret <16 x i16> %ext
+}
diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index ac535096466c..709e69fbb1da 100644
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -2246,84 +2246,84 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; O3DEFAULT-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; O3DEFAULT-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
 ; O3DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
-; O3DEFAULT-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; O3DEFAULT-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP1]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
 ; O3DEFAULT-NEXT:    [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX_4]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; O3DEFAULT-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP7]]
+; O3DEFAULT-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX2_4]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
 ; O3DEFAULT-NEXT:    [[TMP10:%.*]] = bitcast i32* [[ARRAYIDX_8]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; O3DEFAULT-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP11]]
+; O3DEFAULT-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP13:%.*]] = bitcast i32* [[ARRAYIDX2_8]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
 ; O3DEFAULT-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX_12]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3DEFAULT-NEXT:    [[TMP16:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP15]]
+; O3DEFAULT-NEXT:    [[TMP16:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX2_12]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
 ; O3DEFAULT-NEXT:    [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX_16]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP19]]
+; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[TMP19]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX2_16]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP21]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
 ; O3DEFAULT-NEXT:    [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX_20]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
-; O3DEFAULT-NEXT:    [[TMP24:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP23]]
+; O3DEFAULT-NEXT:    [[TMP24:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX2_20]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
 ; O3DEFAULT-NEXT:    [[TMP26:%.*]] = bitcast i32* [[ARRAYIDX_24]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP27:%.*]] = load <4 x i32>, <4 x i32>* [[TMP26]], align 4
-; O3DEFAULT-NEXT:    [[TMP28:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP27]]
+; O3DEFAULT-NEXT:    [[TMP28:%.*]] = add nsw <4 x i32> [[TMP27]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP29:%.*]] = bitcast i32* [[ARRAYIDX2_24]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
 ; O3DEFAULT-NEXT:    [[TMP30:%.*]] = bitcast i32* [[ARRAYIDX_28]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP31:%.*]] = load <4 x i32>, <4 x i32>* [[TMP30]], align 4
-; O3DEFAULT-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP31]]
+; O3DEFAULT-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[TMP31]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP33:%.*]] = bitcast i32* [[ARRAYIDX2_28]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP32]], <4 x i32>* [[TMP33]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
 ; O3DEFAULT-NEXT:    [[TMP34:%.*]] = bitcast i32* [[ARRAYIDX_32]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP35:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3DEFAULT-NEXT:    [[TMP36:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP35]]
+; O3DEFAULT-NEXT:    [[TMP36:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX2_32]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP36]], <4 x i32>* [[TMP37]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
 ; O3DEFAULT-NEXT:    [[TMP38:%.*]] = bitcast i32* [[ARRAYIDX_36]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP39:%.*]] = load <4 x i32>, <4 x i32>* [[TMP38]], align 4
-; O3DEFAULT-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP39]]
+; O3DEFAULT-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP41:%.*]] = bitcast i32* [[ARRAYIDX2_36]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP41]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
 ; O3DEFAULT-NEXT:    [[TMP42:%.*]] = bitcast i32* [[ARRAYIDX_40]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP43:%.*]] = load <4 x i32>, <4 x i32>* [[TMP42]], align 4
-; O3DEFAULT-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP43]]
+; O3DEFAULT-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[TMP43]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP45:%.*]] = bitcast i32* [[ARRAYIDX2_40]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP44]], <4 x i32>* [[TMP45]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
 ; O3DEFAULT-NEXT:    [[TMP46:%.*]] = bitcast i32* [[ARRAYIDX_44]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP47:%.*]] = load <4 x i32>, <4 x i32>* [[TMP46]], align 4
-; O3DEFAULT-NEXT:    [[TMP48:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP47]]
+; O3DEFAULT-NEXT:    [[TMP48:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP49:%.*]] = bitcast i32* [[ARRAYIDX2_44]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP48]], <4 x i32>* [[TMP49]], align 4
 ; O3DEFAULT-NEXT:    [[TMP50:%.*]] = load i32, i32* [[A]], align 4
diff --git a/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index fb3d12d88ba5..8e36a921c758 100644
--- a/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -66,7 +66,7 @@ define void @test2(<4 x i16> %a, <4 x i16> %b, i64 %c0, i64 %c1, i64 %c2, i64 %c
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP4]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]]
diff --git a/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index 401776aa270d..14a6d0eb72c8 100644
--- a/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -21,7 +21,7 @@ define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 65537, i32 65537, i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]])
 ; CHECK-NEXT:    ret i32 [[TMP10]]
diff --git a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index db02f55dcc77..d3bbf3df8582 100644
--- a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -65,7 +65,7 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
 ; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP10]]
@@ -86,10 +86,10 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[ADD11]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[T12]], i32 1
-; CHECK-NEXT:    [[TMP21]] = add nsw <2 x i32> [[TMP18]], [[TMP20]]
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[ADD11]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP22]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
@@ -184,7 +184,7 @@ define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP9]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP12]]
@@ -194,10 +194,10 @@ define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[ADD11]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[T12]], i32 1
-; CHECK-NEXT:    [[TMP18]] = add nsw <2 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[ADD11]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP18]] = add nsw <2 x i32> [[TMP16]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP19]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
diff --git a/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll b/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
index 7038b0f8e276..6c474705abcf 100644
--- a/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
+++ b/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
@@ -16,8 +16,8 @@ define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %a
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x half> <half 0xH5380, half 0xH5380>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x half> <half 0xH57F0, half 0xH57F0>, [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x half> [[TMP2]], <half 0xH5380, half 0xH5380>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x half> [[TMP3]], <half 0xH57F0, half 0xH57F0>
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>*
 ; CHECK-NEXT:    store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8
diff --git a/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
index e8a83fa4b7a6..712ff040a918 100644
--- a/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -14,7 +14,7 @@ define void @test() #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP0]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> <i64 3, i64 2, i64 1, i64 0>, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP4]], <i64 3, i64 2, i64 1, i64 0>
 ; CHECK-NEXT:    [[TMP6]] = extractelement <4 x i64> [[TMP5]], i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32
diff --git a/test/Transforms/SLPVectorizer/X86/PR35777.ll b/test/Transforms/SLPVectorizer/X86/PR35777.ll
index adfe77f89f52..4a403e7b9a3f 100644
--- a/test/Transforms/SLPVectorizer/X86/PR35777.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35777.ll
@@ -10,7 +10,7 @@ define { i64, i64 } @patatino(double %arg) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
diff --git a/test/Transforms/SLPVectorizer/X86/PR39774.ll b/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 67717a54659c..ae4a6b88bd31 100644
--- a/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -10,7 +10,7 @@ define void @Test(i32) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>, [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
 ; CHECK-NEXT:    [[VAL_1:%.*]] = and i32 [[TMP2]], undef
 ; CHECK-NEXT:    [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
 ; CHECK-NEXT:    [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
@@ -100,7 +100,7 @@ define void @Test(i32) {
 ; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
 ; FORCE_REDUCTION-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
-; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> <i32 0, i32 55, i32 285, i32 1240>, [[SHUFFLE]]
+; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
 ; FORCE_REDUCTION-NEXT:    [[VAL_1:%.*]] = and i32 [[TMP2]], undef
 ; FORCE_REDUCTION-NEXT:    [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
 ; FORCE_REDUCTION-NEXT:    [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
diff --git a/test/Transforms/SLPVectorizer/X86/PR40310.ll b/test/Transforms/SLPVectorizer/X86/PR40310.ll
index ad1434146a5b..2a0b66ee2817 100644
--- a/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -12,7 +12,7 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15
 ; CHECK-NEXT:    store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>, [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>
 ; CHECK-NEXT:    [[V14:%.*]] = and i32 [[TMP2]], undef
 ; CHECK-NEXT:    [[V16:%.*]] = and i32 undef, [[V14]]
 ; CHECK-NEXT:    [[V18:%.*]] = and i32 undef, [[V16]]
diff --git a/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index 2a49864ca1e4..a04beed1a45b 100644
--- a/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -536,12 +536,12 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; CHECK-LABEL: @add_sub_v8i32_splat(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[R7]]
 ;
diff --git a/test/Transforms/SLPVectorizer/X86/barriercall.ll b/test/Transforms/SLPVectorizer/X86/barriercall.ll
index 7378b8bcb1c9..2ea29ed95c77 100644
--- a/test/Transforms/SLPVectorizer/X86/barriercall.ll
+++ b/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -15,7 +15,7 @@ define i32 @foo(i32* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 9, i32 9, i32 9, i32 9>, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 9, i32 9, i32 9, i32 9>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret i32 undef
diff --git a/test/Transforms/SLPVectorizer/X86/commutativity.ll b/test/Transforms/SLPVectorizer/X86/commutativity.ll
index 9af59efd3453..ad566cb3411e 100644
--- a/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -96,7 +96,7 @@ define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
 ; CHECK-NEXT:    ret void
 ;
diff --git a/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
index ec29f8413ace..c16ac5385598 100644
--- a/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
+++ b/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@@ -20,8 +20,8 @@ define void @reduce_compare(double* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> <double 7.000000e+00, double 4.000000e+00>, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> <double 5.000000e+00, double 9.000000e+00>, [[TMP6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 7.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 5.000000e+00, double 9.000000e+00>
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
 ; CHECK-NEXT:    [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]]
diff --git a/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
index 47e89df5ab6c..550b8314d606 100644
--- a/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
+++ b/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
@@ -68,12 +68,12 @@ define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) {
 ; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; AVX-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
 ; AVX-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = fmul <2 x float> zeroinitializer, [[TMP0]]
+; AVX-NEXT:    [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
 ; AVX-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]]
 ; AVX-NEXT:    [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], <float 1.000000e+00, float 1.000000e+00>
 ; AVX-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
 ; AVX-NEXT:    [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], <float -1.000000e+00, float -1.000000e+00>
-; AVX-NEXT:    [[TMP14:%.*]] = fmul <2 x float> zeroinitializer, [[TMP12]]
+; AVX-NEXT:    [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
 ; AVX-NEXT:    [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP14]]
 ; AVX-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
 ; AVX-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
diff --git a/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
index f12de2ad199c..8f57a820197e 100644
--- a/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
+++ b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
@@ -99,7 +99,7 @@ define void @zot(%struct.hoge* %arg) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> undef, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], undef
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
diff --git a/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
index e2d36376f5ea..5c753091f95c 100644
--- a/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -31,10 +31,10 @@ define void @main() #0 {
 ; CHECK:       cond.false66.us:
 ; CHECK-NEXT:    [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double 0xBFA5CC2D1960285F, i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> <double 0.000000e+00, double undef>, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> <double 1.400000e+02, double 1.400000e+02>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> <double 5.000000e+01, double 5.200000e+01>, [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double 0xBFA5CC2D1960285F>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 1.400000e+02, double 1.400000e+02>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], <double 5.000000e+01, double 5.200000e+01>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8
diff --git a/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll b/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
index 98db3edd90ea..a3d98e39ce1b 100644
--- a/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
+++ b/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
@@ -22,7 +22,7 @@ define i32 @foo(double* nocapture %A, float* nocapture %B, i32 %g) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> <float 5.000000e+00, float 8.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 5.000000e+00, float 8.000000e+00>
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[G:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
diff --git a/test/Transforms/SLPVectorizer/X86/cse.ll b/test/Transforms/SLPVectorizer/X86/cse.ll
index 5860a24906be..d2512dcd615f 100644
--- a/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -18,20 +18,21 @@ define i32 @test(double* nocapture %G) {
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> <double 4.000000e+00, double 3.000000e+00>, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> <double 1.000000e+00, double 6.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 4.000000e+00, double 3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+00, double 6.000000e+00>
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT:    [[ADD8:%.*]] = fadd double [[TMP5]], 7.000000e+00
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
-; CHECK-NEXT:    store double [[ADD8]], double* [[ARRAYIDX9]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00
-; CHECK-NEXT:    [[ADD12:%.*]] = fadd double [[MUL11]], 8.000000e+00
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], <double 7.000000e+00, double 8.000000e+00>
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
-; CHECK-NEXT:    store double [[ADD12]], double* [[ARRAYIDX13]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
@@ -72,13 +73,13 @@ define i32 @foo(double* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A]] to <4 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> <double 7.900000e+00, double 7.700000e+00, double 7.600000e+00, double 7.400000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], <double 7.900000e+00, double 7.700000e+00, double 7.600000e+00, double 7.400000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> undef, double [[CONV]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CONV]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[CONV]], i32 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[CONV]], i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> <double 6.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00>, [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP7]], <double 6.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[A]] to <4 x double>*
 ; CHECK-NEXT:    store <4 x double> [[TMP8]], <4 x double>* [[TMP9]], align 8
 ; CHECK-NEXT:    ret i32 undef
@@ -135,7 +136,7 @@ define i32 @test2(double* nocapture %G, i32 %k) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul double [[TMP7]], 3.000000e+00
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> <double 1.000000e+00, double 6.000000e+00>, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 1.000000e+00, double 6.000000e+00>
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[G]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP13]], align 8
@@ -146,7 +147,7 @@ define i32 @test2(double* nocapture %G, i32 %k) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul double [[TMP17]], 3.000000e+00
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = fadd <2 x double> <double 7.000000e+00, double 8.000000e+00>, [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], <double 7.000000e+00, double 8.000000e+00>
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast double* [[TMP15]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP21]], <2 x double>* [[TMP23]], align 8
@@ -203,13 +204,13 @@ define i32 @foo4(double* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A]] to <4 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> <double 7.900000e+00, double 7.900000e+00, double 7.900000e+00, double 7.900000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], <double 7.900000e+00, double 7.900000e+00, double 7.900000e+00, double 7.900000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> undef, double [[CONV]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CONV]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[CONV]], i32 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[CONV]], i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> <double 6.000000e+00, double 6.000000e+00, double 6.000000e+00, double 6.000000e+00>, [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP7]], <double 6.000000e+00, double 6.000000e+00, double 6.000000e+00, double 6.000000e+00>
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[A]] to <4 x double>*
 ; CHECK-NEXT:    store <4 x double> [[TMP8]], <4 x double>* [[TMP9]], align 8
 ; CHECK-NEXT:    ret i32 undef
diff --git a/test/Transforms/SLPVectorizer/X86/cycle_dup.ll b/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
index ac6933304780..2ba0a15fed22 100644
--- a/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
+++ b/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
@@ -24,7 +24,7 @@ define i32 @foo(i32* nocapture %A) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i32> [ [[TMP4:%.*]], [[FOR_BODY]] ], [ [[TMP1]], [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP4]] = mul nsw <4 x i32> <i32 18, i32 19, i32 12, i32 9>, [[TMP3]]
+; CHECK-NEXT:    [[TMP4]] = mul nsw <4 x i32> [[TMP3]], <i32 18, i32 19, i32 12, i32 9>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_029]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]]
diff --git a/test/Transforms/SLPVectorizer/X86/external_user.ll b/test/Transforms/SLPVectorizer/X86/external_user.ll
index 8ee644f939ba..1e47f7a51fd1 100644
--- a/test/Transforms/SLPVectorizer/X86/external_user.ll
+++ b/test/Transforms/SLPVectorizer/X86/external_user.ll
@@ -32,9 +32,9 @@ define double @ext_user(double* noalias nocapture %B, double* noalias nocapture
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_020:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> <double 1.000000e+01, double 1.000000e+01>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 4.000000e+00, double 4.000000e+00>, [[TMP3]]
-; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> <double 4.000000e+00, double 4.000000e+00>, [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_020]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
diff --git a/test/Transforms/SLPVectorizer/X86/extract.ll b/test/Transforms/SLPVectorizer/X86/extract.ll
index 24cf83ca405d..9a741cbb4cfd 100644
--- a/test/Transforms/SLPVectorizer/X86/extract.ll
+++ b/test/Transforms/SLPVectorizer/X86/extract.ll
@@ -8,7 +8,7 @@ define void @fextr(double* %ptr) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* undef
 ; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> <double 0.000000e+00, double 1.100000e+00>, [[LD]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> [[LD]], <double 0.000000e+00, double 1.100000e+00>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
@@ -32,7 +32,7 @@ define void @fextr1(double* %ptr) {
 ; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* undef
 ; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x double> [[LD]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> <double 3.400000e+00, double 1.200000e+00>, [[REORDER_SHUFFLE]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> [[REORDER_SHUFFLE]], <double 3.400000e+00, double 1.200000e+00>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P1]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
@@ -59,7 +59,7 @@ define void @fextr2(double* %ptr) {
 ; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> <double 5.500000e+00, double 6.600000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 5.500000e+00, double 6.600000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/test/Transforms/SLPVectorizer/X86/extractcost.ll b/test/Transforms/SLPVectorizer/X86/extractcost.ll
index c9fae4460e57..834f5a089b50 100644
--- a/test/Transforms/SLPVectorizer/X86/extractcost.ll
+++ b/test/Transforms/SLPVectorizer/X86/extractcost.ll
@@ -14,7 +14,7 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %m) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 9, i32 9, i32 9, i32 9>, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 9, i32 9, i32 9, i32 9>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
diff --git a/test/Transforms/SLPVectorizer/X86/hoist.ll b/test/Transforms/SLPVectorizer/X86/hoist.ll
index 885d11acfa15..5abf85f319b9 100644
--- a/test/Transforms/SLPVectorizer/X86/hoist.ll
+++ b/test/Transforms/SLPVectorizer/X86/hoist.ll
@@ -25,7 +25,7 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_024]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[SHUFFLE]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[SHUFFLE]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    [[ADD10]] = add nsw i32 [[I_024]], 4
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll
index 986da9fa52b9..311d8a476c71 100644
--- a/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -36,7 +36,7 @@ define i32 @add_red(float* %A, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
 ; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float undef, undef
 ; CHECK-NEXT:    [[ADD11:%.*]] = fadd fast float [[ADD6]], undef
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -76,7 +76,7 @@ define i32 @add_red(float* %A, i32 %n) {
 ; STORE-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
 ; STORE-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
 ; STORE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; STORE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>, [[TMP2]]
+; STORE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
 ; STORE-NEXT:    [[ADD6:%.*]] = fadd fast float undef, undef
 ; STORE-NEXT:    [[ADD11:%.*]] = fadd fast float [[ADD6]], undef
 ; STORE-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
diff --git a/test/Transforms/SLPVectorizer/X86/in-tree-user.ll b/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
index 493e09a1d609..7e0cfb77c6b7 100644
--- a/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
+++ b/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
@@ -21,8 +21,8 @@ define void @in_tree_user(double* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> <double 7.000000e+00, double 4.000000e+00>, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> <double 5.000000e+00, double 9.000000e+00>, [[TMP6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 7.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 5.000000e+00, double 9.000000e+00>
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[INTREEUSER:%.*]] = fadd double [[TMP8]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
diff --git a/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
index 429ad84f8a6a..2a4d457f1063 100644
--- a/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
@@ -132,7 +132,7 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp ult <16 x i8> [[TMP17]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x i8> [[TMP23]], <16 x i8> [[TMP21]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32>
-; CHECK-NEXT:    [[TMP27:%.*]] = mul <16 x i32> [[TMP15]], [[TMP26]]
+; CHECK-NEXT:    [[TMP27:%.*]] = mul <16 x i32> [[TMP26]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
 ; CHECK-NEXT:    [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
 ; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
@@ -413,52 +413,52 @@ define i32 @foo1() local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([64 x i32]* @ib to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i32> [[TMP0]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([64 x i32]* @ia to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 4) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 4) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 8) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 8) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 12) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP6]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 12) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 16) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 16) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 20) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <4 x i32> [[TMP10]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 20) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 24) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 24) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 28) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i32> [[TMP14]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 28) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP16:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 32) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP17:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = xor <4 x i32> [[TMP16]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 32) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 36) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP19:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = xor <4 x i32> [[TMP18]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP19]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 36) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 40) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP21:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = xor <4 x i32> [[TMP20]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP21]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 40) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 44) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = xor <4 x i32> [[TMP22]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP23]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 44) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP24:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 48) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP25:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = xor <4 x i32> [[TMP24]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 48) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 52) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP27:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP26]]
+; CHECK-NEXT:    [[TMP27:%.*]] = xor <4 x i32> [[TMP26]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP27]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 52) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP28:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 56) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP29:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP28]]
+; CHECK-NEXT:    [[TMP29:%.*]] = xor <4 x i32> [[TMP28]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP29]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 56) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP30:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 60) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP31:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP30]]
+; CHECK-NEXT:    [[TMP31:%.*]] = xor <4 x i32> [[TMP30]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP31]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 60) to <4 x i32>*), align 16
 ; CHECK-NEXT:    br label [[FOR_BODY5:%.*]]
 ; CHECK:       for.cond3:
diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index e26eeec63087..5f6e8f143f4e 100644
--- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -627,7 +627,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
@@ -645,7 +645,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 ; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
 ; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
 ; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
-; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
+; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
 ; ZEROTHRESH-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
 ; ZEROTHRESH-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
 ; ZEROTHRESH-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
diff --git a/test/Transforms/SLPVectorizer/X86/long_chains.ll b/test/Transforms/SLPVectorizer/X86/long_chains.ll
index 99b340addb92..ffbdd9f1d148 100644
--- a/test/Transforms/SLPVectorizer/X86/long_chains.ll
+++ b/test/Transforms/SLPVectorizer/X86/long_chains.ll
@@ -11,22 +11,22 @@ define i32 @test(double* nocapture %A, i8* nocapture %B) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <2 x i8>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i8> <i8 3, i8 3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i8> [[TMP1]], <i8 3, i8 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i8> undef, i8 [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i8> [[TMP4]], i8 [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <2 x i8> [[TMP6]] to <2 x double>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP13]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x double> [[TMP14]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP16:%.*]] = fmul <2 x double> [[TMP15]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x double> [[TMP16]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP17]], <2 x double>* [[TMP18]], align 8
 ; CHECK-NEXT:    ret i32 undef
diff --git a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
index 1b19aeae0377..020b50d54632 100644
--- a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
+++ b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
@@ -36,7 +36,7 @@ define i32 @foo(i32* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[N]], i32 5
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[N]], i32 6
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[N]], i32 7
-; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <8 x i32> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <8 x i32> [[TMP8]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 8
diff --git a/test/Transforms/SLPVectorizer/X86/multi_block.ll b/test/Transforms/SLPVectorizer/X86/multi_block.ll
index d0216103d42a..f785926cc412 100644
--- a/test/Transforms/SLPVectorizer/X86/multi_block.ll
+++ b/test/Transforms/SLPVectorizer/X86/multi_block.ll
@@ -26,10 +26,10 @@ define i32 @bar(double* nocapture %A, i32 %d) {
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[TMP7:%.*]], label [[TMP5:%.*]]
 ; CHECK:         [[TMP6:%.*]] = tail call i32 (...) @foo()
 ; CHECK-NEXT:    br label [[TMP7]]
-; CHECK:         [[TMP8:%.*]] = fadd <2 x float> <float 4.000000e+00, float 5.000000e+00>, [[TMP3]]
+; CHECK:         [[TMP8:%.*]] = fadd <2 x float> [[TMP3]], <float 4.000000e+00, float 5.000000e+00>
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> <double 9.000000e+00, double 5.000000e+00>, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 9.000000e+00, double 5.000000e+00>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[TMP9]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
 ; CHECK-NEXT:    ret i32 undef
diff --git a/test/Transforms/SLPVectorizer/X86/multi_user.ll b/test/Transforms/SLPVectorizer/X86/multi_user.ll
index ce8594ea84d7..9268adf9481c 100644
--- a/test/Transforms/SLPVectorizer/X86/multi_user.ll
+++ b/test/Transforms/SLPVectorizer/X86/multi_user.ll
@@ -19,7 +19,7 @@ define i32 @foo(i32* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1]], i32 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> <i32 7, i32 8, i32 9, i32 10>, [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 7, i32 8, i32 9, i32 10>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
diff --git a/test/Transforms/SLPVectorizer/X86/operandorder.ll b/test/Transforms/SLPVectorizer/X86/operandorder.ll
index 2354ebd29879..1b959f19883a 100644
--- a/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -14,7 +14,7 @@ define void @shuffle_operands1(double * noalias %from, double * noalias %to,
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[V1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/test/Transforms/SLPVectorizer/X86/phi.ll b/test/Transforms/SLPVectorizer/X86/phi.ll
index a0a13b2b5aac..fe604e2652d2 100644
--- a/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -81,9 +81,9 @@ define i32 @foo2(double* noalias nocapture %B, double* noalias nocapture %A, i32
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_019:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> <double 1.000000e+01, double 1.000000e+01>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 4.000000e+00, double 4.000000e+00>, [[TMP3]]
-; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> <double 4.000000e+00, double 4.000000e+00>, [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_019]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
@@ -150,9 +150,9 @@ define float @foo3(float* nocapture readonly %A) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
 ; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2
@@ -163,27 +163,26 @@ define float @foo3(float* nocapture readonly %A) #0 {
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> <float 1.100000e+01, float 1.000000e+01, float 9.000000e+00, float undef>, float [[TMP4]], i32 3
-; CHECK-NEXT:    [[TMP12]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> undef, float [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float 8.000000e+00, i32 3
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul <4 x float> [[TMP11]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121
+; CHECK-NEXT:    [[TMP11]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], <float 1.100000e+01, float 1.000000e+01, float 9.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:    [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP19]], i32 3
-; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP19]], i32 2
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP19]], i32 1
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 0
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
+; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2
+; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]]
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
 entry:
@@ -255,7 +254,7 @@ define float @sort_phi_type(float* nocapture readonly %A) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP5]], i32 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP9]] = fmul <4 x float> <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>, [[TMP8]]
+; CHECK-NEXT:    [[TMP9]] = fmul <4 x float> [[TMP8]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
diff --git a/test/Transforms/SLPVectorizer/X86/pr35497.ll b/test/Transforms/SLPVectorizer/X86/pr35497.ll
index c6989c384e01..bdb37b28d58c 100644
--- a/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -55,7 +55,7 @@ define void @pr35497() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> <i64 20, i64 20>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
@@ -63,7 +63,7 @@ define void @pr35497() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i64> <i64 20, i64 20>, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
 ; CHECK-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
 ; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
diff --git a/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll b/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
index 7cc0194c7302..380f58fe5dc8 100644
--- a/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
+++ b/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
@@ -88,7 +88,7 @@ define void @nsw(i32* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@@ -124,7 +124,7 @@ define void @not_nsw(i32* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@@ -160,7 +160,7 @@ define void @nuw(i32* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nuw <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@@ -196,7 +196,7 @@ define void @not_nuw(i32* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@@ -232,7 +232,7 @@ define void @nnan(float* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd nnan <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd nnan <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@@ -268,7 +268,7 @@ define void @not_nnan(float* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@@ -304,7 +304,7 @@ define void @only_fast(float* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@@ -340,7 +340,7 @@ define void @only_arcp(float* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd arcp <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd arcp <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/test/Transforms/SLPVectorizer/X86/reduction.ll b/test/Transforms/SLPVectorizer/X86/reduction.ll
index 03b7f67ae4ca..e9f8e7f7c884 100644
--- a/test/Transforms/SLPVectorizer/X86/reduction.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction.ll
@@ -23,7 +23,7 @@ define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) {
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i32 [[MUL]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> <double 7.000000e+00, double 7.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 7.000000e+00, double 7.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[ADD5:%.*]] = fadd double [[TMP3]], [[TMP4]]
diff --git a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
index 47a6a44611d8..0f0bbf9a2ad2 100644
--- a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -14,10 +14,10 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 undef, [[SUM]]
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 undef, [[ADD]]
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 undef, [[ADD_1]]
@@ -32,11 +32,11 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[BIN_EXTRA]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]]
 ; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 undef, [[ADD_6]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 [[BIN_EXTRA]]
+; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
diff --git a/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
index d1c7e6e851f5..13884efd98dd 100644
--- a/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -18,7 +18,7 @@ define void @hoge() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> <i32 63, i32 undef>, [[REORDER_SHUFFLE]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef
 ; CHECK-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> <i32 undef, i32 15, i32 31, i32 47>, [[SHUFFLE8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE8]], <i32 undef, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i32 undef, undef
 ; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], undef
@@ -37,7 +37,7 @@ define void @hoge() {
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> <i32 -49, i32 -33, i32 -33, i32 -17>, [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -49, i32 -33, i32 -33, i32 -17>
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp sgt i32 undef, undef
 ; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP27]], undef
diff --git a/test/Transforms/SLPVectorizer/X86/resched.ll b/test/Transforms/SLPVectorizer/X86/resched.ll
index b8b1ff00db41..28bc95e2f4ca 100644
--- a/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -72,7 +72,7 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
 ; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
 ; CHECK-NEXT:    [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
-; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, [[TMP43]]
+; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
 ; CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 ; CHECK-NEXT:    store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
diff --git a/test/Transforms/SLPVectorizer/X86/saxpy.ll b/test/Transforms/SLPVectorizer/X86/saxpy.ll
index f2f858e3c7dd..7e9109a4ef00 100644
--- a/test/Transforms/SLPVectorizer/X86/saxpy.ll
+++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll
@@ -15,7 +15,7 @@ define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a,
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP9]], [[TMP11]]
diff --git a/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
index 3abde37048fd..bff947e28cae 100644
--- a/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
+++ b/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
@@ -12,7 +12,7 @@ define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31>
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
 ; CHECK-NEXT:    [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
diff --git a/test/Transforms/SLPVectorizer/X86/simple-loop.ll b/test/Transforms/SLPVectorizer/X86/simple-loop.ll
index 975a1af7576a..59b94cad17e4 100644
--- a/test/Transforms/SLPVectorizer/X86/simple-loop.ll
+++ b/test/Transforms/SLPVectorizer/X86/simple-loop.ll
@@ -14,8 +14,8 @@ define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i6
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i32> <i32 7, i32 7, i32 7, i32 7>, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> <i32 7, i32 14, i32 21, i32 28>, [[TMP6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP5]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], <i32 7, i32 14, i32 21, i32 28>
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP9]], align 4
diff --git a/test/Transforms/SLPVectorizer/X86/value-bug.ll b/test/Transforms/SLPVectorizer/X86/value-bug.ll
index c2f4b981af9e..78df5a1d68dd 100644
--- a/test/Transforms/SLPVectorizer/X86/value-bug.ll
+++ b/test/Transforms/SLPVectorizer/X86/value-bug.ll
@@ -33,9 +33,9 @@ define void @test() {
 ; CHECK-NEXT:    br i1 undef, label [[BB32_I]], label [[BB21_I]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> <double undef, double 0.000000e+00>, [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double undef, double 0.000000e+00>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> undef, [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef
 ; CHECK-NEXT:    [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float>
 ; CHECK-NEXT:    br label [[BB283]]
 ;
diff --git a/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 2b593b78652f..8bf3f362f03c 100644
--- a/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -12,7 +12,7 @@ define void @add0(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 1, i32 1, i32 2, i32 3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 1, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -136,7 +136,7 @@ define void @sub1(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 4, i32 -1, i32 -2, i32 -3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 4, i32 -1, i32 -2, i32 -3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -174,7 +174,7 @@ define void @sub2(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 -1, i32 -1, i32 -2, i32 -3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 -2, i32 -3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -422,7 +422,7 @@ define void @add0f(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -546,7 +546,7 @@ define void @sub1f(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -584,7 +584,7 @@ define void @sub2f(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -751,7 +751,7 @@ define void @add0fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -875,7 +875,7 @@ define void @sub1fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -913,7 +913,7 @@ define void @sub2fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
index fd23381949d7..889bba80b7a4 100644
--- a/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -17,7 +17,7 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[TMP9]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 undef, undef
 ; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
@@ -91,7 +91,7 @@ define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[TMP9]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 undef, undef
 ; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
@@ -169,7 +169,7 @@ define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[TMP9]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 undef, undef
 ; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
diff --git a/test/tools/llvm-objcopy/ELF/preserve-segment-contents-ehdr-phdrs.test b/test/tools/llvm-objcopy/ELF/preserve-segment-contents-ehdr-phdrs.test
new file mode 100644
index 000000000000..9dc63d753f60
--- /dev/null
+++ b/test/tools/llvm-objcopy/ELF/preserve-segment-contents-ehdr-phdrs.test
@@ -0,0 +1,41 @@
+## Show that llvm-objcopy correctly updates the ELF header and program header
+## table when they are within a segment.
+
+# RUN: yaml2obj %s -o %t.in
+## Validate that the properties are different before the removal.
+# RUN: llvm-readobj --file-headers --program-headers %t.in | FileCheck %s --check-prefix=BEFORE
+# RUN: llvm-objcopy %t.in %t.out -R .remove_me
+# RUN: llvm-readobj --file-headers --program-headers %t.out | FileCheck %s --check-prefix=AFTER
+
+# BEFORE: SectionHeaderCount: 6
+# BEFORE:      Type: PT_LOAD
+# BEFORE-NEXT: Offset: 0x0
+# BEFORE:      Type: PT_LOAD
+# BEFORE-NEXT: Offset: 0x240
+
+# AFTER:      SectionHeaderCount: 5
+# AFTER:      Type: PT_LOAD
+# AFTER-NEXT: Offset: 0x0
+# AFTER:      Type: PT_LOAD
+# AFTER-NEXT: Offset: 0xB0
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name: .remove_me
+    Type: SHT_PROGBITS
+    Size: 0x10
+  - Name: .keep_me
+    Type: SHT_PROGBITS
+    Size: 0x10
+ProgramHeaders:
+  - Type:     PT_LOAD
+    Offset:   0
+    FileSize: 176 # sizeof(Elf64_Ehdr) + 2 * sizeof(Elf64_Phdr)
+  - Type:     PT_LOAD
+    Sections:
+      - Section: .keep_me
diff --git a/test/tools/llvm-objcopy/ELF/preserve-segment-contents.test b/test/tools/llvm-objcopy/ELF/preserve-segment-contents.test
new file mode 100644
index 000000000000..4de3ac9700e5
--- /dev/null
+++ b/test/tools/llvm-objcopy/ELF/preserve-segment-contents.test
@@ -0,0 +1,639 @@
+# We want to preserve areas in segments that are not covered by section headers.
+# This test shows that we do this for areas at the start of a segment, between
+# sections in a segment, and after all sections in a segment.
+# To create inputs with arbitrary data in segments, not covered by sections, we
+# use yaml2obj to create segments with sections covering all areas, then remove
+# some sections in those segments, and finally write over the areas of the
+# removed sections using Python.
+
+# blob* sections are removed to create unlabelled areas, which are then
+# overwritten with data to show that we preserve that data.
+
+# RUN: yaml2obj %s -o %t.base
+# RUN: llvm-objcopy %t.base %t.stripped --regex -R blob.*
+# Show that the removal leaves the bytes as zeroes, as desired, for all our
+# test cases.
+# RUN: od -t x1 -j 0x2000 -N 24 %t.stripped | FileCheck %s --check-prefix=CHECK1 -DPATTERN="00 00 00 00"
+# RUN: od -t x1 -j 0x2100 -N 12 %t.stripped | FileCheck %s --check-prefix=CHECK2 -DPATTERN="00 00 00 00"
+# RUN: od -t x1 -j 0x2200 -N 4  %t.stripped | FileCheck %s --check-prefix=CHECK3 -DPATTERN="00 00 00 00"
+# RUN: od -t x1 -j 0x2300 -N 12 %t.stripped | FileCheck %s --check-prefix=CHECK4 -DPATTERN="00 00 00 00"
+# RUN: od -t x1 -j 0x3000 -N 68 %t.stripped | FileCheck %s --check-prefix=CHECK5 -DPATTERN="00 00 00 00"
+# RUN: od -t x1 -j 0x4000 -N 60 %t.stripped | FileCheck %s --check-prefix=CHECK6 -DPATTERN="00 00 00 00"
+# RUN: od -t x1 -j 0x5000 -N 60 %t.stripped | FileCheck %s --check-prefix=CHECK7 -DPATTERN="00 00 00 00"
+
+# RUN: cp %t.stripped %t.in
+# Binary mode and bytearray.fromhex make the script write 4 raw bytes per offset on Python 2 and 3.
+# RUN: echo "with open('%/t.in', 'rb+') as input:"                                 > %t.py
+# RUN: echo "  for offset in [0x2000, 0x2008, 0x200C, 0x2014, 0x2104, 0x2300,"    >> %t.py
+# RUN: echo "   0x3008, 0x3010, 0x3018, 0x3020, 0x3028, 0x302C, 0x3034, 0x303C,"  >> %t.py
+# RUN: echo "   0x4000, 0x4008, 0x4010, 0x4014, 0x401C, 0x4024, 0x4034,"          >> %t.py
+# RUN: echo "   0x5000, 0x5008, 0x5010, 0x501C, 0x5024, 0x502C, 0x5030, 0x5038]:" >> %t.py
+# RUN: echo "    input.seek(offset)"                                              >> %t.py
+# RUN: echo "    input.write(bytearray.fromhex('DEADBEEF'))"                      >> %t.py
+# RUN: %python %t.py
+# RUN: llvm-objcopy %t.in %t.out
+# RUN: od -t x1 -j 0x2000 -N 24 %t.out | FileCheck %s --check-prefix=CHECK1 -DPATTERN="de ad be ef"
+# RUN: od -t x1 -j 0x2100 -N 12 %t.out | FileCheck %s --check-prefix=CHECK2 -DPATTERN="de ad be ef"
+# RUN: od -t x1 -j 0x2200 -N 4  %t.out | FileCheck %s --check-prefix=CHECK3 -DPATTERN="de ad be ef"
+# RUN: od -t x1 -j 0x2300 -N 12 %t.out | FileCheck %s --check-prefix=CHECK4 -DPATTERN="de ad be ef"
+# RUN: od -t x1 -j 0x3000 -N 68 %t.out | FileCheck %s --check-prefix=CHECK5 -DPATTERN="de ad be ef"
+# RUN: od -t x1 -j 0x4000 -N 60 %t.out | FileCheck %s --check-prefix=CHECK6 -DPATTERN="de ad be ef"
+# RUN: od -t x1 -j 0x5000 -N 60 %t.out | FileCheck %s --check-prefix=CHECK7 -DPATTERN="de ad be ef"
+
+# CHECK1:      [[PATTERN]] 11 22 33 44 [[PATTERN]] [[PATTERN]]
+# CHECK1-NEXT: 55 66 77 88 [[PATTERN]]
+# CHECK2:      99 00 aa bb [[PATTERN]] cc dd ee ff
+# CHECK3:      fe fe fe fe
+# CHECK4:      [[PATTERN]] 00 00 00 00 00 00 00 00
+# CHECK5:      ff ff ee ee dd dd cc cc [[PATTERN]] bb bb aa aa
+# CHECK5-NEXT: [[PATTERN]] 00 00 99 99 [[PATTERN]] 88 88 77 77
+# CHECK5-NEXT: [[PATTERN]] 66 66 55 55 [[PATTERN]] [[PATTERN]]
+# CHECK5-NEXT: 44 44 33 33 [[PATTERN]] 22 22 11 11 [[PATTERN]]
+# CHECK5-NEXT: 00 11 22 33
+# CHECK6:      [[PATTERN]] 44 55 66 77 [[PATTERN]] 88 99 aa bb
+# CHECK6-NEXT: [[PATTERN]] [[PATTERN]] cc dd ee ff [[PATTERN]]
+# CHECK6-NEXT: ff ee dd cc [[PATTERN]] bb aa 99 88 77 66 55 44
+# CHECK6-NEXT: 33 22 11 00 [[PATTERN]] 11 11 11 11
+# CHECK7:      [[PATTERN]] 12 34 56 78 [[PATTERN]] 90 ab cd ef
+# CHECK7-NEXT: [[PATTERN]] fe dc ba 09 87 65 43 21 [[PATTERN]]
+# CHECK7-NEXT: 22 22 22 22 [[PATTERN]] 33 33 33 33 [[PATTERN]]
+# CHECK7-NEXT: [[PATTERN]] 44 44 44 44 [[PATTERN]]
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name: blob1
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x2000
+    AddressAlign: 0x2000
+  - Name: section1
+    Type: SHT_PROGBITS
+    Address: 0x2004
+    Content: '11223344'
+  - Name: blob2
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x2008
+  - Name: section2
+    Type: SHT_NOBITS
+    Size: 4
+    Address: 0x200C
+  - Name: blob3
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x2010
+  - Name: section3
+    Type: SHT_PROGBITS
+    Content: '55667788'
+    Address: 0x2014
+  - Name: blob4
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x2018
+  - Name: section4
+    Type: SHT_PROGBITS
+    Content: '9900aabb'
+    Address: 0x2100
+    AddressAlign: 0x100
+  - Name: blob5
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x2104
+  - Name: section5
+    Type: SHT_PROGBITS
+    Address: 0x2108
+    Content: 'ccddeeff'
+  - Name: section6
+    Type: SHT_PROGBITS
+    Content: 'fefefefe'
+    Address: 0x2200
+    AddressAlign: 0x100
+  - Name: blob6
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x2300
+    AddressAlign: 0x100
+  - Name: sectionA
+    Type: SHT_PROGBITS
+    Content: 'ffffeeee'
+    Address: 0x3000
+    AddressAlign: 0x1000
+  - Name: sectionB
+    Type: SHT_PROGBITS
+    Content: 'ddddcccc'
+    Address: 0x3004
+  - Name: blobA
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x3008
+  - Name: sectionC
+    Type: SHT_PROGBITS
+    Content: 'bbbbaaaa'
+    Address: 0x300C
+  - Name: blobB
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x3010
+  - Name: sectionD
+    Type: SHT_PROGBITS
+    Content: '00009999'
+    Address: 0x3014
+  - Name: blobC
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x3018
+  - Name: sectionE
+    Type: SHT_PROGBITS
+    Content: '88887777'
+    Address: 0x301C
+  - Name: blobD
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x3020
+  - Name: sectionF
+    Type: SHT_PROGBITS
+    Content: '66665555'
+    Address: 0x3024
+  - Name: blobE
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x3028
+  - Name: blobF
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x302C
+  - Name: sectionG
+    Type: SHT_PROGBITS
+    Content: '44443333'
+    Address: 0x3030
+  - Name: blobG
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x3034
+  - Name: sectionH
+    Type: SHT_PROGBITS
+    Content: '22221111'
+    Address: 0x3038
+  - Name: blobH
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x303C
+  - Name: sectionI
+    Type: SHT_PROGBITS
+    Content: '00112233'
+    Address: 0x3040
+  - Name: blobz
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x4000
+    AddressAlign: 0x1000
+  - Name: sectionz
+    Type: SHT_PROGBITS
+    Content: '44556677'
+    Address: 0x4004
+  - Name: bloby
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x4008
+  - Name: sectiony
+    Type: SHT_PROGBITS
+    Content: '8899aabb'
+    Address: 0x400C
+  - Name: blobx
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x4010
+  - Name: blobw
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x4014
+  - Name: sectionx
+    Type: SHT_PROGBITS
+    Content: 'ccddeeff'
+    Address: 0x4018
+  - Name: blobv
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x401C
+  - Name: sectionw
+    Type: SHT_PROGBITS
+    Content: 'ffeeddcc'
+    Address: 0x4020
+  - Name: blobu
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x4024
+  - Name: sectionv
+    Type: SHT_PROGBITS
+    Content: 'bbaa9988'
+    Address: 0x4028
+  - Name: sectionu
+    Type: SHT_PROGBITS
+    Content: '77665544'
+    Address: 0x402C
+  - Name: sectiont
+    Type: SHT_PROGBITS
+    Content: '33221100'
+    Address: 0x4030
+  - Name: blobt
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x4034
+  - Name: sections
+    Type: SHT_PROGBITS
+    Content: '11111111'
+    Address: 0x4038
+  - Name: bloba
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x5000
+    AddressAlign: 0x1000
+  - Name: sectiona
+    Type: SHT_PROGBITS
+    Content: '12345678'
+    Address: 0x5004
+  - Name: blobb
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x5008
+  - Name: sectionb
+    Type: SHT_PROGBITS
+    Content: '90abcdef'
+    Address: 0x500C
+  - Name: blobc
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x5010
+  - Name: sectionc
+    Type: SHT_PROGBITS
+    Content: 'fedcba09'
+    Address: 0x5014
+  - Name: sectiond
+    Type: SHT_PROGBITS
+    Content: '87654321'
+    Address: 0x5018
+  - Name: blobd
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x501C
+  - Name: sectione
+    Type: SHT_PROGBITS
+    Content: '22222222'
+    Address: 0x5020
+  - Name: blobe
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x5024
+  - Name: sectionf
+    Type: SHT_PROGBITS
+    Content: '33333333'
+    Address: 0x5028
+  - Name: blobf
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x502C
+  - Name: blobg
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x5030
+  - Name: sectiong
+    Type: SHT_PROGBITS
+    Content: '44444444'
+    Address: 0x5034
+  - Name: blobh
+    Type: SHT_PROGBITS
+    Content: 'abbababa'
+    Address: 0x5038
+ProgramHeaders:
+  # First segment has unlabelled space at start and end.
+  - Type:  0x6ABCDEF0 # Non-specific segment type.
+    VAddr: 0x2000
+    PAddr: 0x2000
+    Align: 0x2000
+    Sections:
+      - Section: blob1
+      - Section: section1
+      - Section: blob2
+      - Section: section2 # nobits
+      - Section: blob3
+      - Section: section3
+      - Section: blob4
+  # Second segment has sections at start and end.
+  - Type:  0x6ABCDEF0
+    VAddr: 0x2100
+    PAddr: 0x2100
+    Align: 0x100
+    Sections:
+      - Section: section4
+      - Section: blob5
+      - Section: section5
+  # Third segment is all covered by a section.
+  - Type:  0x6ABCDEF0
+    VAddr: 0x2200
+    PAddr: 0x2200
+    Align: 0x100
+    Sections:
+      - Section: section6
+  # Fourth segment has no sections (after removing blob headers).
+  - Type:  0x6ABCDEF0
+    VAddr: 0x2300
+    PAddr: 0x2300
+    Align: 0x100
+    Sections:
+      - Section: blob6
+  # Fifth segment is empty.
+  - Type:   0x6ABCDEF0
+    VAddr:  0x2308
+    PAddr:  0x2308
+    Offset: 0x2308
+
+  # The next few segments test behaviour of fully nested segments.
+  # Sixth segment is the "parent" segment.
+  - Type:  0x6ABCDEF0
+    VAddr: 0x3000
+    PAddr: 0x3000
+    Align: 0x1000
+    Sections:
+      - Section: sectionA
+      - Section: sectionB
+      - Section: blobA
+      - Section: sectionC
+      - Section: blobB
+      - Section: sectionD
+      - Section: blobC
+      - Section: sectionE
+      - Section: blobD
+      - Section: sectionF
+      - Section: blobE
+      - Section: blobF
+      - Section: sectionG
+      - Section: blobG
+      - Section: sectionH
+      - Section: blobH
+      - Section: sectionI
+  # Seventh segment is empty and nested.
+  - Type:   0x6ABCDEF0
+    VAddr:  0x3002
+    PAddr:  0x3002
+    Offset: 0x3002
+  # Eighth segment contains only a section and is nested.
+  - Type:  0x6ABCDEF0
+    VAddr: 0x3004
+    PAddr: 0x3004
+    Sections:
+      - Section: sectionB
+  # Ninth segment contains only unlabelled space and is nested.
+  - Type:  0x6ABCDEF0
+    VAddr: 0x3008
+    PAddr: 0x3008
+    Sections:
+      - Section: blobA
+  # Tenth segment contains two sections with space between and is nested.
+  - Type:  0x6ABCDEF0
+    VAddr: 0x300C
+    PAddr: 0x300C
+    Sections:
+      - Section: sectionC
+      - Section: blobB
+      - Section: sectionD
+  # Eleventh segment contains two sections with space between and at ends and is nested.
+  - Type:  0x6ABCDEF0
+    VAddr: 0x3018
+    PAddr: 0x3018
+    Sections:
+      - Section: blobC
+      - Section: sectionE
+      - Section: blobD
+      - Section: sectionF
+      - Section: blobE
+  # Twelfth segment contains one section with space at ends adjacent to space in parent segment.
+  - Type:     0x6ABCDEF0
+    VAddr:    0x302E
+    PAddr:    0x302E
+    Offset:   0x302E
+    FileSize: 8
+    Sections:
+      - Section: sectionG
+  # Thirteenth segment overlaps sections at either end in parent segment.
+  - Type:     0x6ABCDEF0
+    VAddr:    0x303A
+    PAddr:    0x303A
+    Offset:   0x303A
+    FileSize: 0x8
+    Sections:
+      - Section: blobH
+
+  # The next batch of segments are segments that only partially overlap other segments.
+
+  # Segment14: |-unlabelled-|-Sec-|
+  # Segment15:           |--|-Sec-|-unlabelled-|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x4000
+    PAddr: 0x4000
+    Sections:
+      - Section: blobz
+      - Section: sectionz
+  - Type:   0x6ABCDEF0
+    VAddr:  0x4002
+    PAddr:  0x4002
+    Offset: 0x4002
+    Sections:
+      - Section: sectionz
+      - Section: bloby
+
+  # Segment16: |-Sec-|--|
+  # Segment17:    |--|----unlabelled---|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x400C
+    PAddr: 0x400C
+    FileSize: 6
+    Sections:
+      - Section: sectiony
+  - Type:   0x6ABCDEF0
+    VAddr:  0x400E
+    PAddr:  0x400E
+    Offset: 0x400E
+    Sections:
+      - Section: blobx
+
+  # Segment18: |-unlabelled-|-Sec-|
+  # Segment19:              |-Sec-|-unlabelled-|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x4014
+    PAddr: 0x4014
+    Sections:
+      - Section: blobw
+      - Section: sectionx
+  - Type:  0x6ABCDEF0
+    VAddr: 0x4018
+    PAddr: 0x4018
+    Sections:
+      - Section: sectionx
+      - Section: blobv
+
+  # Segment20: |-Sec-|
+  # Segment21:    |--|-unlabelled-|-Sec-|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x4020
+    PAddr: 0x4020
+    Sections:
+      - Section: sectionw
+  - Type:   0x6ABCDEF0
+    VAddr:  0x4022
+    PAddr:  0x4022
+    Offset: 0x4022
+    Sections:
+      - Section: blobu
+      - Section: sectionv
+
+  # Segment22: |-Sec-|
+  # Segment23:    |--|-Sec-|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x402C
+    PAddr: 0x402C
+    Sections:
+      - Section: sectionu
+  - Type:   0x6ABCDEF0
+    VAddr:  0x402E
+    PAddr:  0x402E
+    Offset: 0x402E
+    Sections:
+      - Section: sectiont
+
+  # Segment24: |-unlabelled-|--|
+  # Segment25:              |--Sec--|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x4034
+    PAddr: 0x4034
+    FileSize: 6
+    Sections:
+      - Section: blobt
+  - Type:  0x6ABCDEF0
+    VAddr: 0x4038
+    PAddr: 0x4038
+    Sections:
+      - Section: sections
+
+  # The next batch of segments represent groups of three nested/overlapping segments,
+  # with one parent segment containing two overlapping segments.
+
+  # Segment26: |-unlabelled-|-Sec-|-unlabelled-|
+  # Segment27: |------------|--|
+  # Segment28:              |-Sec-|------------|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5000
+    PAddr: 0x5000
+    Align: 0x1000
+    Sections:
+      - Section: bloba
+      - Section: sectiona
+      - Section: blobb
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5000
+    PAddr: 0x5000
+    FileSize: 6
+    Sections:
+      - Section: bloba
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5004
+    PAddr: 0x5004
+    Sections:
+      - Section: sectiona
+      - Section: blobb
+
+  # Segment29: |-Sec-|-unlabelled-|-Sec-|
+  # Segment30: |-Sec-|--------|
+  # Segment31:          |---------|-Sec-|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x500C
+    PAddr: 0x500C
+    Sections:
+      - Section: sectionb
+      - Section: blobc
+      - Section: sectionc
+  - Type:  0x6ABCDEF0
+    VAddr: 0x500C
+    PAddr: 0x500C
+    FileSize: 7
+    Sections:
+      - Section: sectionb
+  - Type:   0x6ABCDEF0
+    VAddr:  0x5011
+    PAddr:  0x5011
+    Offset: 0x5011
+    Sections:
+      - Section: sectionc
+
+  # Segment32: |-Sec-|-unlabelled-|-Sec-|
+  # Segment33: |-Sec-|------------|
+  # Segment34:       |------------|-Sec-|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5018
+    PAddr: 0x5018
+    Sections:
+      - Section: sectiond
+      - Section: blobd
+      - Section: sectione
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5018
+    PAddr: 0x5018
+    Sections:
+      - Section: sectiond
+      - Section: blobd
+  - Type:  0x6ABCDEF0
+    VAddr: 0x501C
+    PAddr: 0x501C
+    Sections:
+      - Section: blobd
+      - Section: sectione
+
+  # Segment35: |-unlabelled-|-Sec-|-unlabelled-|
+  # Segment36: |------------|-Sec-|
+  # Segment37:              |-Sec-|------------|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5024
+    PAddr: 0x5024
+    Sections:
+      - Section: blobe
+      - Section: sectionf
+      - Section: blobf
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5024
+    PAddr: 0x5024
+    Sections:
+      - Section: blobe
+      - Section: sectionf
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5028
+    PAddr: 0x5028
+    Sections:
+      - Section: sectionf
+      - Section: blobf
+
+  # Segment38: |-unlabelled-|-Sec-|-unlabelled-|
+  # Segment39: |------------|---|
+  # Segment40:                |---|------------|
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5030
+    PAddr: 0x5030
+    Sections:
+      - Section: blobg
+      - Section: sectiong
+      - Section: blobh
+  - Type:  0x6ABCDEF0
+    VAddr: 0x5030
+    PAddr: 0x5030
+    FileSize: 7
+    Sections:
+      - Section: blobg
+  - Type:   0x6ABCDEF0
+    VAddr:  0x5035
+    PAddr:  0x5035
+    Offset: 0x5035
+    Sections:
+      - Section: blobh
diff --git a/test/tools/llvm-readobj/elf-versioninfo.test b/test/tools/llvm-readobj/elf-versioninfo.test
index 82029cf82fd3..7ef599e1370b 100644
--- a/test/tools/llvm-readobj/elf-versioninfo.test
+++ b/test/tools/llvm-readobj/elf-versioninfo.test
@@ -1,106 +1,119 @@
 // Test that llvm-readobj dumps version info tags correctly.
 
-RUN: llvm-readobj -dynamic-table -V %p/Inputs/verdef.elf-x86-64 | FileCheck %s
+RUN: llvm-readobj -dynamic-table -V %p/Inputs/verdef.elf-x86-64 | FileCheck %s --check-prefix=LLVM-VERDEF
+RUN: llvm-readelf -dynamic-table -V %p/Inputs/verdef.elf-x86-64 | FileCheck %s --check-prefix=GNU-VERDEF
 
-CHECK: 0x000000006FFFFFF0 VERSYM               0x24C
-CHECK: 0x000000006FFFFFFC VERDEF               0x25C
-CHECK: 0x000000006FFFFFFD VERDEFNUM            3
+LLVM-VERDEF: 0x000000006FFFFFF0 VERSYM               0x24C
+LLVM-VERDEF: 0x000000006FFFFFFC VERDEF               0x25C
+LLVM-VERDEF: 0x000000006FFFFFFD VERDEFNUM            3
 
-CHECK: Version symbols {
-CHECK-NEXT:   Section Name: .gnu.version (20)
-CHECK-NEXT:   Address: 0x24C
-CHECK-NEXT:   Offset: 0x24C
-CHECK-NEXT:   Link: 1
-CHECK-NEXT:   Symbols [
-CHECK-NEXT:     Symbol {
-CHECK-NEXT:       Version: 0
-CHECK-NEXT:       Name: {{$}}
-CHECK-NEXT:     }
-CHECK-NEXT:     Symbol {
-CHECK-NEXT:       Version: 1
-CHECK-NEXT:       Name: _end{{$}}
-CHECK-NEXT:     }
-CHECK-NEXT:     Symbol {
-CHECK-NEXT:       Version: 1
-CHECK-NEXT:       Name: _edata{{$}}
-CHECK-NEXT:     }
-CHECK-NEXT:     Symbol {
-CHECK-NEXT:       Version: 3
-CHECK-NEXT:       Name: goo@@VERSION2
-CHECK-NEXT:     }
-CHECK-NEXT:     Symbol {
-CHECK-NEXT:       Version: 1
-CHECK-NEXT:       Name: __bss_start{{$}}
-CHECK-NEXT:     }
-CHECK-NEXT:     Symbol {
-CHECK-NEXT:       Version: 2
-CHECK-NEXT:       Name: foo@@VERSION1
-CHECK-NEXT:     }
-CHECK-NEXT:     Symbol {
-CHECK-NEXT:       Version: 2
-CHECK-NEXT:       Name: VERSION1@@VERSION1
-CHECK-NEXT:     }
-CHECK-NEXT:     Symbol {
-CHECK-NEXT:       Version: 3
-CHECK-NEXT:       Name: VERSION2@@VERSION2
-CHECK-NEXT:     }
-CHECK-NEXT:   ]
-CHECK-NEXT: }
+LLVM-VERDEF: Version symbols {
+LLVM-VERDEF-NEXT:   Section Name: .gnu.version (20)
+LLVM-VERDEF-NEXT:   Address: 0x24C
+LLVM-VERDEF-NEXT:   Offset: 0x24C
+LLVM-VERDEF-NEXT:   Link: 1
+LLVM-VERDEF-NEXT:   Symbols [
+LLVM-VERDEF-NEXT:     Symbol {
+LLVM-VERDEF-NEXT:       Version: 0
+LLVM-VERDEF-NEXT:       Name: {{$}}
+LLVM-VERDEF-NEXT:     }
+LLVM-VERDEF-NEXT:     Symbol {
+LLVM-VERDEF-NEXT:       Version: 1
+LLVM-VERDEF-NEXT:       Name: _end{{$}}
+LLVM-VERDEF-NEXT:     }
+LLVM-VERDEF-NEXT:     Symbol {
+LLVM-VERDEF-NEXT:       Version: 1
+LLVM-VERDEF-NEXT:       Name: _edata{{$}}
+LLVM-VERDEF-NEXT:     }
+LLVM-VERDEF-NEXT:     Symbol {
+LLVM-VERDEF-NEXT:       Version: 3
+LLVM-VERDEF-NEXT:       Name: goo@@VERSION2
+LLVM-VERDEF-NEXT:     }
+LLVM-VERDEF-NEXT:     Symbol {
+LLVM-VERDEF-NEXT:       Version: 1
+LLVM-VERDEF-NEXT:       Name: __bss_start{{$}}
+LLVM-VERDEF-NEXT:     }
+LLVM-VERDEF-NEXT:     Symbol {
+LLVM-VERDEF-NEXT:       Version: 2
+LLVM-VERDEF-NEXT:       Name: foo@@VERSION1
+LLVM-VERDEF-NEXT:     }
+LLVM-VERDEF-NEXT:     Symbol {
+LLVM-VERDEF-NEXT:       Version: 2
+LLVM-VERDEF-NEXT:       Name: VERSION1@@VERSION1
+LLVM-VERDEF-NEXT:     }
+LLVM-VERDEF-NEXT:     Symbol {
+LLVM-VERDEF-NEXT:       Version: 3
+LLVM-VERDEF-NEXT:       Name: VERSION2@@VERSION2
+LLVM-VERDEF-NEXT:     }
+LLVM-VERDEF-NEXT:   ]
+LLVM-VERDEF-NEXT: }
 
-CHECK:      SHT_GNU_verdef {
-CHECK-NEXT:   Definition {
-CHECK-NEXT:     Version: 1
-CHECK-NEXT:     Flags: Base (0x1)
-CHECK-NEXT:     Index: 1
-CHECK-NEXT:     Hash: 430712
-CHECK-NEXT:     Name: blah
-CHECK-NEXT:   }
-CHECK-NEXT:   Definition {
-CHECK-NEXT:     Version: 1
-CHECK-NEXT:     Flags: 0x0
-CHECK-NEXT:     Index: 2
-CHECK-NEXT:     Hash: 175630257
-CHECK-NEXT:     Name: VERSION1
-CHECK-NEXT:   }
-CHECK-NEXT:   Definition {
-CHECK-NEXT:     Version: 1
-CHECK-NEXT:     Flags: 0x0
-CHECK-NEXT:     Index: 3
-CHECK-NEXT:     Hash: 175630258
-CHECK-NEXT:     Name: VERSION2
-CHECK-NEXT:     Predecessor: VERSION1
-CHECK-NEXT:   }
-CHECK-NEXT: }
+LLVM-VERDEF:      SHT_GNU_verdef {
+LLVM-VERDEF-NEXT:   Definition {
+LLVM-VERDEF-NEXT:     Version: 1
+LLVM-VERDEF-NEXT:     Flags: Base (0x1)
+LLVM-VERDEF-NEXT:     Index: 1
+LLVM-VERDEF-NEXT:     Hash: 430712
+LLVM-VERDEF-NEXT:     Name: blah
+LLVM-VERDEF-NEXT:   }
+LLVM-VERDEF-NEXT:   Definition {
+LLVM-VERDEF-NEXT:     Version: 1
+LLVM-VERDEF-NEXT:     Flags: 0x0
+LLVM-VERDEF-NEXT:     Index: 2
+LLVM-VERDEF-NEXT:     Hash: 175630257
+LLVM-VERDEF-NEXT:     Name: VERSION1
+LLVM-VERDEF-NEXT:   }
+LLVM-VERDEF-NEXT:   Definition {
+LLVM-VERDEF-NEXT:     Version: 1
+LLVM-VERDEF-NEXT:     Flags: 0x0
+LLVM-VERDEF-NEXT:     Index: 3
+LLVM-VERDEF-NEXT:     Hash: 175630258
+LLVM-VERDEF-NEXT:     Name: VERSION2
+LLVM-VERDEF-NEXT:     Predecessor: VERSION1
+LLVM-VERDEF-NEXT:   }
+LLVM-VERDEF-NEXT: }
 
-RUN: llvm-readobj -V %p/Inputs/verneed.elf-x86-64 | FileCheck %s --check-prefix=VERNEED
+GNU-VERDEF: 0x000000006ffffff0 VERSYM               0x24c
+GNU-VERDEF: 0x000000006ffffffc VERDEF               0x25c
+GNU-VERDEF: 0x000000006ffffffd VERDEFNUM            3
+
+GNU-VERDEF: Dumper for .gnu.version is not implemented
+GNU-VERDEF: Dumper for .gnu.version_d is not implemented
+
+RUN: llvm-readobj -V %p/Inputs/verneed.elf-x86-64 | FileCheck %s --check-prefix=LLVM-VERNEED
+RUN: llvm-readelf -V %p/Inputs/verneed.elf-x86-64 | FileCheck %s --check-prefix=GNU-VERNEED
+
+LLVM-VERNEED:       SHT_GNU_verneed {
+LLVM-VERNEED-NEXT:   Dependency {
+LLVM-VERNEED-NEXT:     Version: 1
+LLVM-VERNEED-NEXT:     Count: 2
+LLVM-VERNEED-NEXT:     FileName: verneed1.so.0
+LLVM-VERNEED-NEXT:     Entry {
+LLVM-VERNEED-NEXT:       Hash: 1938
+LLVM-VERNEED-NEXT:       Flags: 0x0
+LLVM-VERNEED-NEXT:       Index: 3
+LLVM-VERNEED-NEXT:       Name: v2
+LLVM-VERNEED-NEXT:     }
+LLVM-VERNEED-NEXT:     Entry {
+LLVM-VERNEED-NEXT:       Hash: 1939
+LLVM-VERNEED-NEXT:       Flags: 0x0
+LLVM-VERNEED-NEXT:       Index: 2
+LLVM-VERNEED-NEXT:       Name: v3
+LLVM-VERNEED-NEXT:     }
+LLVM-VERNEED-NEXT:   }
+LLVM-VERNEED-NEXT:   Dependency {
+LLVM-VERNEED-NEXT:     Version: 1
+LLVM-VERNEED-NEXT:     Count: 1
+LLVM-VERNEED-NEXT:     FileName: verneed2.so.0
+LLVM-VERNEED-NEXT:     Entry {
+LLVM-VERNEED-NEXT:       Hash: 1937
+LLVM-VERNEED-NEXT:       Flags: 0x0
+LLVM-VERNEED-NEXT:       Index: 4
+LLVM-VERNEED-NEXT:       Name: v1
+LLVM-VERNEED-NEXT:     }
+LLVM-VERNEED-NEXT:   }
+LLVM-VERNEED-NEXT: }
+
+GNU-VERNEED: Dumper for .gnu.version is not implemented
+GNU-VERNEED: Dumper for .gnu.version_r is not implemented
 
-VERNEED:       SHT_GNU_verneed {
-VERNEED-NEXT:   Dependency {
-VERNEED-NEXT:     Version: 1
-VERNEED-NEXT:     Count: 2
-VERNEED-NEXT:     FileName: verneed1.so.0
-VERNEED-NEXT:     Entry {
-VERNEED-NEXT:       Hash: 1938
-VERNEED-NEXT:       Flags: 0x0
-VERNEED-NEXT:       Index: 3
-VERNEED-NEXT:       Name: v2
-VERNEED-NEXT:     }
-VERNEED-NEXT:     Entry {
-VERNEED-NEXT:       Hash: 1939
-VERNEED-NEXT:       Flags: 0x0
-VERNEED-NEXT:       Index: 2
-VERNEED-NEXT:       Name: v3
-VERNEED-NEXT:     }
-VERNEED-NEXT:   }
-VERNEED-NEXT:   Dependency {
-VERNEED-NEXT:     Version: 1
-VERNEED-NEXT:     Count: 1
-VERNEED-NEXT:     FileName: verneed2.so.0
-VERNEED-NEXT:     Entry {
-VERNEED-NEXT:       Hash: 1937
-VERNEED-NEXT:       Flags: 0x0
-VERNEED-NEXT:       Index: 4
-VERNEED-NEXT:       Name: v1
-VERNEED-NEXT:     }
-VERNEED-NEXT:   }
-VERNEED-NEXT: }
diff --git a/test/tools/yaml2obj/verdef-section.yaml b/test/tools/yaml2obj/verdef-section.yaml
index f81bcf196f6f..deac6e736c0b 100644
--- a/test/tools/yaml2obj/verdef-section.yaml
+++ b/test/tools/yaml2obj/verdef-section.yaml
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-readelf -V %t | FileCheck %s
+# RUN: llvm-readobj -V %t | FileCheck %s
 
 # Check we are able to handle the SHT_GNU_verdef sections.
 
diff --git a/test/tools/yaml2obj/verneed-section.yaml b/test/tools/yaml2obj/verneed-section.yaml
index 436e54ba0893..2fc58ad64f0f 100644
--- a/test/tools/yaml2obj/verneed-section.yaml
+++ b/test/tools/yaml2obj/verneed-section.yaml
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-readelf -V %t | FileCheck %s
+# RUN: llvm-readobj -V %t | FileCheck %s
 
 # Check we are able to handle the SHT_GNU_verneed sections.
 
diff --git a/test/tools/yaml2obj/versym-section.yaml b/test/tools/yaml2obj/versym-section.yaml
index 31dfecfa297c..3c08ddd63d2e 100644
--- a/test/tools/yaml2obj/versym-section.yaml
+++ b/test/tools/yaml2obj/versym-section.yaml
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-readelf -V %t | FileCheck %s
+# RUN: llvm-readobj -V %t | FileCheck %s
 
 ## Check we are able to produce a valid SHT_GNU_versym
 ## section from its description.
diff --git a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
index fc1ac38ec872..1ae802ff14b3 100644
--- a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
+++ b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -298,110 +298,94 @@ static bool isUnneededSymbol(const Symbol &Sym) {
          Sym.Type != STT_FILE && Sym.Type != STT_SECTION;
 }
 
-// This function handles the high level operations of GNU objcopy including
-// handling command line options. It's important to outline certain properties
-// we expect to hold of the command line operations. Any operation that "keeps"
-// should keep regardless of a remove. Additionally any removal should respect
-// any previous removals. Lastly whether or not something is removed shouldn't
-// depend a) on the order the options occur in or b) on some opaque priority
-// system. The only priority is that keeps/copies overrule removes.
-static Error handleArgs(const CopyConfig &Config, Object &Obj,
-                        const Reader &Reader, ElfType OutputElfType) {
-
-  if (!Config.SplitDWO.empty())
-    if (Error E =
-            splitDWOToFile(Config, Reader, Config.SplitDWO, OutputElfType))
-      return E;
-
-  if (Config.OutputArch) {
-    Obj.Machine = Config.OutputArch.getValue().EMachine;
-    Obj.OSABI = Config.OutputArch.getValue().OSABI;
-  }
-
+static Error updateAndRemoveSymbols(const CopyConfig &Config, Object &Obj) {
   // TODO: update or remove symbols only if there is an option that affects
   // them.
-  if (Obj.SymbolTable) {
-    Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
-      // Common and undefined symbols don't make sense as local symbols, and can
-      // even cause crashes if we localize those, so skip them.
-      if (!Sym.isCommon() && Sym.getShndx() != SHN_UNDEF &&
-          ((Config.LocalizeHidden &&
-            (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
-           is_contained(Config.SymbolsToLocalize, Sym.Name)))
-        Sym.Binding = STB_LOCAL;
-
-      // Note: these two globalize flags have very similar names but different
-      // meanings:
-      //
-      // --globalize-symbol: promote a symbol to global
-      // --keep-global-symbol: all symbols except for these should be made local
-      //
-      // If --globalize-symbol is specified for a given symbol, it will be
-      // global in the output file even if it is not included via
-      // --keep-global-symbol. Because of that, make sure to check
-      // --globalize-symbol second.
-      if (!Config.SymbolsToKeepGlobal.empty() &&
-          !is_contained(Config.SymbolsToKeepGlobal, Sym.Name) &&
-          Sym.getShndx() != SHN_UNDEF)
-        Sym.Binding = STB_LOCAL;
-
-      if (is_contained(Config.SymbolsToGlobalize, Sym.Name) &&
-          Sym.getShndx() != SHN_UNDEF)
-        Sym.Binding = STB_GLOBAL;
-
-      if (is_contained(Config.SymbolsToWeaken, Sym.Name) &&
-          Sym.Binding == STB_GLOBAL)
-        Sym.Binding = STB_WEAK;
-
-      if (Config.Weaken && Sym.Binding == STB_GLOBAL &&
-          Sym.getShndx() != SHN_UNDEF)
-        Sym.Binding = STB_WEAK;
-
-      const auto I = Config.SymbolsToRename.find(Sym.Name);
-      if (I != Config.SymbolsToRename.end())
-        Sym.Name = I->getValue();
-
-      if (!Config.SymbolsPrefix.empty() && Sym.Type != STT_SECTION)
-        Sym.Name = (Config.SymbolsPrefix + Sym.Name).str();
-    });
-
-    // The purpose of this loop is to mark symbols referenced by sections
-    // (like GroupSection or RelocationSection). This way, we know which
-    // symbols are still 'needed' and which are not.
-    if (Config.StripUnneeded || !Config.UnneededSymbolsToRemove.empty()) {
-      for (auto &Section : Obj.sections())
-        Section.markSymbols();
-    }
+  if (!Obj.SymbolTable)
+    return Error::success();
+
+  Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
+    // Common and undefined symbols don't make sense as local symbols, and can
+    // even cause crashes if we localize those, so skip them.
+    if (!Sym.isCommon() && Sym.getShndx() != SHN_UNDEF &&
+        ((Config.LocalizeHidden &&
+          (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
+         is_contained(Config.SymbolsToLocalize, Sym.Name)))
+      Sym.Binding = STB_LOCAL;
+
+    // Note: these two globalize flags have very similar names but different
+    // meanings:
+    //
+    // --globalize-symbol: promote a symbol to global
+    // --keep-global-symbol: all symbols except for these should be made local
+    //
+    // If --globalize-symbol is specified for a given symbol, it will be
+    // global in the output file even if it is not included via
+    // --keep-global-symbol. Because of that, make sure to check
+    // --globalize-symbol second.
+    if (!Config.SymbolsToKeepGlobal.empty() &&
+        !is_contained(Config.SymbolsToKeepGlobal, Sym.Name) &&
+        Sym.getShndx() != SHN_UNDEF)
+      Sym.Binding = STB_LOCAL;
+
+    if (is_contained(Config.SymbolsToGlobalize, Sym.Name) &&
+        Sym.getShndx() != SHN_UNDEF)
+      Sym.Binding = STB_GLOBAL;
+
+    if (is_contained(Config.SymbolsToWeaken, Sym.Name) &&
+        Sym.Binding == STB_GLOBAL)
+      Sym.Binding = STB_WEAK;
+
+    if (Config.Weaken && Sym.Binding == STB_GLOBAL &&
+        Sym.getShndx() != SHN_UNDEF)
+      Sym.Binding = STB_WEAK;
+
+    const auto I = Config.SymbolsToRename.find(Sym.Name);
+    if (I != Config.SymbolsToRename.end())
+      Sym.Name = I->getValue();
+
+    if (!Config.SymbolsPrefix.empty() && Sym.Type != STT_SECTION)
+      Sym.Name = (Config.SymbolsPrefix + Sym.Name).str();
+  });
+
+  // The purpose of this loop is to mark symbols referenced by sections
+  // (like GroupSection or RelocationSection). This way, we know which
+  // symbols are still 'needed' and which are not.
+  if (Config.StripUnneeded || !Config.UnneededSymbolsToRemove.empty()) {
+    for (auto &Section : Obj.sections())
+      Section.markSymbols();
+  }
 
-    auto RemoveSymbolsPred = [&](const Symbol &Sym) {
-      if (is_contained(Config.SymbolsToKeep, Sym.Name) ||
-          (Config.KeepFileSymbols && Sym.Type == STT_FILE))
-        return false;
+  auto RemoveSymbolsPred = [&](const Symbol &Sym) {
+    if (is_contained(Config.SymbolsToKeep, Sym.Name) ||
+        (Config.KeepFileSymbols && Sym.Type == STT_FILE))
+      return false;
 
-      if ((Config.DiscardMode == DiscardType::All ||
-           (Config.DiscardMode == DiscardType::Locals &&
-            StringRef(Sym.Name).startswith(".L"))) &&
-          Sym.Binding == STB_LOCAL && Sym.getShndx() != SHN_UNDEF &&
-          Sym.Type != STT_FILE && Sym.Type != STT_SECTION)
-        return true;
+    if ((Config.DiscardMode == DiscardType::All ||
+         (Config.DiscardMode == DiscardType::Locals &&
+          StringRef(Sym.Name).startswith(".L"))) &&
+        Sym.Binding == STB_LOCAL && Sym.getShndx() != SHN_UNDEF &&
+        Sym.Type != STT_FILE && Sym.Type != STT_SECTION)
+      return true;
 
-      if (Config.StripAll || Config.StripAllGNU)
-        return true;
+    if (Config.StripAll || Config.StripAllGNU)
+      return true;
 
-      if (is_contained(Config.SymbolsToRemove, Sym.Name))
-        return true;
+    if (is_contained(Config.SymbolsToRemove, Sym.Name))
+      return true;
 
-      if ((Config.StripUnneeded ||
-           is_contained(Config.UnneededSymbolsToRemove, Sym.Name)) &&
-          isUnneededSymbol(Sym))
-        return true;
+    if ((Config.StripUnneeded ||
+         is_contained(Config.UnneededSymbolsToRemove, Sym.Name)) &&
+        isUnneededSymbol(Sym))
+      return true;
 
-      return false;
-    };
-    if (Error E = Obj.removeSymbols(RemoveSymbolsPred))
-      return E;
-  }
+    return false;
+  };
+
+  return Obj.removeSymbols(RemoveSymbolsPred);
+}
 
+static Error replaceAndRemoveSections(const CopyConfig &Config, Object &Obj) {
   SectionPred RemovePred = [](const SectionBase &) { return false; };
 
   // Removes:
@@ -535,7 +519,33 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj,
           return &Obj.addSection<DecompressedSection>(*CS);
         });
 
-  if (Error E = Obj.removeSections(RemovePred))
+  return Obj.removeSections(RemovePred);
+}
+
+// This function handles the high level operations of GNU objcopy including
+// handling command line options. It's important to outline certain properties
+// we expect to hold of the command line operations. Any operation that "keeps"
+// should keep regardless of a remove. Additionally any removal should respect
+// any previous removals. Lastly whether or not something is removed shouldn't
+// depend a) on the order the options occur in or b) on some opaque priority
+// system. The only priority is that keeps/copies overrule removes.
+static Error handleArgs(const CopyConfig &Config, Object &Obj,
+                        const Reader &Reader, ElfType OutputElfType) {
+
+  if (!Config.SplitDWO.empty())
+    if (Error E =
+            splitDWOToFile(Config, Reader, Config.SplitDWO, OutputElfType))
+      return E;
+
+  if (Config.OutputArch) {
+    Obj.Machine = Config.OutputArch.getValue().EMachine;
+    Obj.OSABI = Config.OutputArch.getValue().OSABI;
+  }
+
+  if (Error E = updateAndRemoveSymbols(Config, Obj))
+    return E;
+
+  if (Error E = replaceAndRemoveSections(Config, Obj))
     return E;
 
   if (!Config.SectionsToRename.empty()) {
diff --git a/tools/llvm-objcopy/ELF/Object.cpp b/tools/llvm-objcopy/ELF/Object.cpp
index 4639d9053943..7cceb70ca63b 100644
--- a/tools/llvm-objcopy/ELF/Object.cpp
+++ b/tools/llvm-objcopy/ELF/Object.cpp
@@ -906,7 +906,9 @@ template <class ELFT> void ELFBuilder<ELFT>::setParentSegment(Segment &Child) {
 template <class ELFT> void ELFBuilder<ELFT>::readProgramHeaders() {
   uint32_t Index = 0;
   for (const auto &Phdr : unwrapOrError(ElfFile.program_headers())) {
-    Segment &Seg = Obj.addSegment();
+    ArrayRef<uint8_t> Data{ElfFile.base() + Phdr.p_offset,
+                           (size_t)Phdr.p_filesz};
+    Segment &Seg = Obj.addSegment(Data);
     Seg.Type = Phdr.p_type;
     Seg.Flags = Phdr.p_flags;
     Seg.OriginalOffset = Phdr.p_offset;
@@ -1350,7 +1352,31 @@ template <class ELFT> void ELFWriter<ELFT>::writeShdrs() {
 
 template <class ELFT> void ELFWriter<ELFT>::writeSectionData() {
   for (auto &Sec : Obj.sections())
-    Sec.accept(*SecWriter);
+    // Segments are responsible for writing their contents, so only write the
+    // section data if the section is not in a segment. Note that this renders
+    // sections in segments effectively immutable.
+    if (Sec.ParentSegment == nullptr)
+      Sec.accept(*SecWriter);
+}
+
+template <class ELFT> void ELFWriter<ELFT>::writeSegmentData() {
+  for (Segment &Seg : Obj.segments()) {
+    uint8_t *B = Buf.getBufferStart() + Seg.Offset;
+    assert(Seg.FileSize == Seg.getContents().size() &&
+           "Segment size must match contents size");
+    std::memcpy(B, Seg.getContents().data(), Seg.FileSize);
+  }
+
+  // Zero the bytes that removed sections occupied inside their parent segment.
+  for (auto &Sec : Obj.removedSections()) {
+    Segment *Parent = Sec.ParentSegment;
+    if (Parent == nullptr || Sec.Type == SHT_NOBITS || Sec.Size == 0)
+      continue;
+    uint64_t Offset =
+        Sec.OriginalOffset - Parent->OriginalOffset + Parent->Offset;
+    uint8_t *B = Buf.getBufferStart();
+    std::memset(B + Offset, 0, Sec.Size);
+  }
 }
 
 Error Object::removeSections(
@@ -1396,7 +1422,10 @@ Error Object::removeSections(
       return E;
   }
 
-  // Now finally get rid of them all togethor.
+  // Transfer removed sections into the Object RemovedSections container for use
+  // later.
+  std::move(Iter, Sections.end(), std::back_inserter(RemovedSections));
+  // Now finally get rid of them all together.
   Sections.erase(Iter, std::end(Sections));
   return Error::success();
 }
@@ -1542,6 +1571,9 @@ template <class ELFT> size_t ELFWriter<ELFT>::totalSize() const {
 }
 
 template <class ELFT> Error ELFWriter<ELFT>::write() {
+  // Segment data must be written first, so that the ELF header and program
+  // header tables can overwrite it, if covered by a segment.
+  writeSegmentData();
   writeEhdr();
   writePhdrs();
   writeSectionData();
diff --git a/tools/llvm-objcopy/ELF/Object.h b/tools/llvm-objcopy/ELF/Object.h
index e892d066a6cd..26d6a122c468 100644
--- a/tools/llvm-objcopy/ELF/Object.h
+++ b/tools/llvm-objcopy/ELF/Object.h
@@ -215,6 +215,7 @@ template <class ELFT> class ELFWriter : public Writer {
   void writePhdrs();
   void writeShdrs();
   void writeSectionData();
+  void writeSegmentData();
 
   void assignOffsets();
 
@@ -312,6 +313,10 @@ class Segment {
   uint32_t Index;
   uint64_t OriginalOffset;
   Segment *ParentSegment = nullptr;
+  ArrayRef<uint8_t> Contents;
+
+  explicit Segment(ArrayRef<uint8_t> Data) : Contents(Data) {}
+  Segment() {}
 
   const SectionBase *firstSection() const {
     if (!Sections.empty())
@@ -321,6 +326,8 @@ class Segment {
 
   void removeSection(const SectionBase *Sec) { Sections.erase(Sec); }
   void addSection(const SectionBase *Sec) { Sections.insert(Sec); }
+
+  ArrayRef<uint8_t> getContents() const { return Contents; }
 };
 
 class Section : public SectionBase {
@@ -773,6 +780,7 @@ class Object {
 
   std::vector<SecPtr> Sections;
   std::vector<SegPtr> Segments;
+  std::vector<SecPtr> RemovedSections;
 
 public:
   template <class T>
@@ -815,6 +823,8 @@ class Object {
         find_if(Sections, [&](const SecPtr &Sec) { return Sec->Name == Name; });
     return SecIt == Sections.end() ? nullptr : SecIt->get();
   }
+  SectionTableRef removedSections() { return SectionTableRef(RemovedSections); }
+
   Range<Segment> segments() { return make_pointee_range(Segments); }
   ConstRange<Segment> segments() const { return make_pointee_range(Segments); }
 
@@ -827,8 +837,8 @@ class Object {
     Ptr->Index = Sections.size();
     return *Ptr;
   }
-  Segment &addSegment() {
-    Segments.emplace_back(llvm::make_unique<Segment>());
+  Segment &addSegment(ArrayRef<uint8_t> Data) {
+    Segments.emplace_back(llvm::make_unique<Segment>(Data));
     return *Segments.back();
   }
 };
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 8c1c6fef89ec..091456329f91 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -342,6 +342,12 @@ template <typename ELFT> class DumpStyle {
   virtual void printProgramHeaders(const ELFFile<ELFT> *Obj,
                                    bool PrintProgramHeaders,
                                    cl::boolOrDefault PrintSectionMapping) = 0;
+  virtual void printVersionSymbolSection(const ELFFile<ELFT> *Obj,
+                                         const Elf_Shdr *Sec) = 0;
+  virtual void printVersionDefinitionSection(const ELFFile<ELFT> *Obj,
+                                             const Elf_Shdr *Sec) = 0;
+  virtual void printVersionDependencySection(const ELFFile<ELFT> *Obj,
+                                             const Elf_Shdr *Sec) = 0;
   virtual void printHashHistogram(const ELFFile<ELFT> *Obj) = 0;
   virtual void printCGProfile(const ELFFile<ELFT> *Obj) = 0;
   virtual void printAddrsig(const ELFFile<ELFT> *Obj) = 0;
@@ -376,6 +382,12 @@ template <typename ELFT> class GNUStyle : public DumpStyle<ELFT> {
                           size_t Offset) override;
   void printProgramHeaders(const ELFO *Obj, bool PrintProgramHeaders,
                            cl::boolOrDefault PrintSectionMapping) override;
+  void printVersionSymbolSection(const ELFFile<ELFT> *Obj,
+                                 const Elf_Shdr *Sec) override;
+  void printVersionDefinitionSection(const ELFFile<ELFT> *Obj,
+                                     const Elf_Shdr *Sec) override;
+  void printVersionDependencySection(const ELFFile<ELFT> *Obj,
+                                     const Elf_Shdr *Sec) override;
   void printHashHistogram(const ELFFile<ELFT> *Obj) override;
   void printCGProfile(const ELFFile<ELFT> *Obj) override;
   void printAddrsig(const ELFFile<ELFT> *Obj) override;
@@ -470,6 +482,12 @@ template <typename ELFT> class LLVMStyle : public DumpStyle<ELFT> {
   void printDynamicRelocations(const ELFO *Obj) override;
   void printProgramHeaders(const ELFO *Obj, bool PrintProgramHeaders,
                            cl::boolOrDefault PrintSectionMapping) override;
+  void printVersionSymbolSection(const ELFFile<ELFT> *Obj,
+                                 const Elf_Shdr *Sec) override;
+  void printVersionDefinitionSection(const ELFFile<ELFT> *Obj,
+                                     const Elf_Shdr *Sec) override;
+  void printVersionDependencySection(const ELFFile<ELFT> *Obj,
+                                     const Elf_Shdr *Sec) override;
   void printHashHistogram(const ELFFile<ELFT> *Obj) override;
   void printCGProfile(const ELFFile<ELFT> *Obj) override;
   void printAddrsig(const ELFFile<ELFT> *Obj) override;
@@ -607,143 +625,6 @@ template <class ELFT> void ELFDumper<ELFT>::LoadVersionMap() const {
     LoadVersionNeeds(dot_gnu_version_r_sec);
 }
 
-template <typename ELFO, class ELFT>
-static void printVersionSymbolSection(ELFDumper<ELFT> *Dumper, const ELFO *Obj,
-                                      const typename ELFO::Elf_Shdr *Sec,
-                                      ScopedPrinter &W) {
-  DictScope SS(W, "Version symbols");
-  if (!Sec)
-    return;
-  StringRef Name = unwrapOrError(Obj->getSectionName(Sec));
-  W.printNumber("Section Name", Name, Sec->sh_name);
-  W.printHex("Address", Sec->sh_addr);
-  W.printHex("Offset", Sec->sh_offset);
-  W.printNumber("Link", Sec->sh_link);
-
-  const uint8_t *P = (const uint8_t *)Obj->base() + Sec->sh_offset;
-  StringRef StrTable = Dumper->getDynamicStringTable();
-
-  // Same number of entries in the dynamic symbol table (DT_SYMTAB).
-  ListScope Syms(W, "Symbols");
-  for (const typename ELFO::Elf_Sym &Sym : Dumper->dynamic_symbols()) {
-    DictScope S(W, "Symbol");
-    std::string FullSymbolName =
-        Dumper->getFullSymbolName(&Sym, StrTable, true /* IsDynamic */);
-    W.printNumber("Version", *P);
-    W.printString("Name", FullSymbolName);
-    P += sizeof(typename ELFO::Elf_Half);
-  }
-}
-
-static const EnumEntry<unsigned> SymVersionFlags[] = {
-    {"Base", "BASE", VER_FLG_BASE},
-    {"Weak", "WEAK", VER_FLG_WEAK},
-    {"Info", "INFO", VER_FLG_INFO}};
-
-template <typename ELFO, class ELFT>
-static void printVersionDefinitionSection(ELFDumper<ELFT> *Dumper,
-                                          const ELFO *Obj,
-                                          const typename ELFO::Elf_Shdr *Sec,
-                                          ScopedPrinter &W) {
-  using VerDef = typename ELFO::Elf_Verdef;
-  using VerdAux = typename ELFO::Elf_Verdaux;
-
-  DictScope SD(W, "SHT_GNU_verdef");
-  if (!Sec)
-    return;
-
-  const uint8_t *SecStartAddress =
-      (const uint8_t *)Obj->base() + Sec->sh_offset;
-  const uint8_t *SecEndAddress = SecStartAddress + Sec->sh_size;
-  const uint8_t *P = SecStartAddress;
-  const typename ELFO::Elf_Shdr *StrTab =
-      unwrapOrError(Obj->getSection(Sec->sh_link));
-
-  unsigned VerDefsNum = Sec->sh_info;
-  while (VerDefsNum--) {
-    if (P + sizeof(VerDef) > SecEndAddress)
-      report_fatal_error("invalid offset in the section");
-
-    auto *VD = reinterpret_cast<const VerDef *>(P);
-    DictScope Def(W, "Definition");
-    W.printNumber("Version", VD->vd_version);
-    W.printEnum("Flags", VD->vd_flags, makeArrayRef(SymVersionFlags));
-    W.printNumber("Index", VD->vd_ndx);
-    W.printNumber("Hash", VD->vd_hash);
-    W.printString("Name",
-                  StringRef((const char *)(Obj->base() + StrTab->sh_offset +
-                                           VD->getAux()->vda_name)));
-    if (!VD->vd_cnt)
-      report_fatal_error("at least one definition string must exist");
-    if (VD->vd_cnt > 2)
-      report_fatal_error("more than one predecessor is not expected");
-
-    if (VD->vd_cnt == 2) {
-      const uint8_t *PAux = P + VD->vd_aux + VD->getAux()->vda_next;
-      const VerdAux *Aux = reinterpret_cast<const VerdAux *>(PAux);
-      W.printString("Predecessor",
-                    StringRef((const char *)(Obj->base() + StrTab->sh_offset +
-                                             Aux->vda_name)));
-    }
-
-    P += VD->vd_next;
-  }
-}
-
-template <typename ELFO, class ELFT>
-static void printVersionDependencySection(ELFDumper<ELFT> *Dumper,
-                                          const ELFO *Obj,
-                                          const typename ELFO::Elf_Shdr *Sec,
-                                          ScopedPrinter &W) {
-  using VerNeed = typename ELFO::Elf_Verneed;
-  using VernAux = typename ELFO::Elf_Vernaux;
-
-  DictScope SD(W, "SHT_GNU_verneed");
-  if (!Sec)
-    return;
-
-  const uint8_t *SecData = (const uint8_t *)Obj->base() + Sec->sh_offset;
-  const typename ELFO::Elf_Shdr *StrTab =
-      unwrapOrError(Obj->getSection(Sec->sh_link));
-
-  const uint8_t *P = SecData;
-  unsigned VerNeedNum = Sec->sh_info;
-  for (unsigned I = 0; I < VerNeedNum; ++I) {
-    const VerNeed *Need = reinterpret_cast<const VerNeed *>(P);
-    DictScope Entry(W, "Dependency");
-    W.printNumber("Version", Need->vn_version);
-    W.printNumber("Count", Need->vn_cnt);
-    W.printString("FileName",
-                  StringRef((const char *)(Obj->base() + StrTab->sh_offset +
-                                           Need->vn_file)));
-
-    const uint8_t *PAux = P + Need->vn_aux;
-    for (unsigned J = 0; J < Need->vn_cnt; ++J) {
-      const VernAux *Aux = reinterpret_cast<const VernAux *>(PAux);
-      DictScope Entry(W, "Entry");
-      W.printNumber("Hash", Aux->vna_hash);
-      W.printEnum("Flags", Aux->vna_flags, makeArrayRef(SymVersionFlags));
-      W.printNumber("Index", Aux->vna_other);
-      W.printString("Name",
-                    StringRef((const char *)(Obj->base() + StrTab->sh_offset +
-                                             Aux->vna_name)));
-      PAux += Aux->vna_next;
-    }
-    P += Need->vn_next;
-  }
-}
-
-template <typename ELFT> void ELFDumper<ELFT>::printVersionInfo() {
-  // Dump version symbol section.
-  printVersionSymbolSection(this, ObjF->getELFFile(), dot_gnu_version_sec, W);
-
-  // Dump version definition section.
-  printVersionDefinitionSection(this, ObjF->getELFFile(), dot_gnu_version_d_sec, W);
-
-  // Dump version dependency section.
-  printVersionDependencySection(this, ObjF->getELFFile(), dot_gnu_version_r_sec, W);
-}
-
 template <typename ELFT>
 StringRef ELFDumper<ELFT>::getSymbolVersion(StringRef StrTab,
                                             const Elf_Sym *symb,
@@ -925,6 +806,11 @@ static const EnumEntry<unsigned> ElfOSABI[] = {
   {"Standalone",   "Standalone App",       ELF::ELFOSABI_STANDALONE}
 };
 
+static const EnumEntry<unsigned> SymVersionFlags[] = { // VER_FLG_* bit names
+    {"Base", "BASE", VER_FLG_BASE},
+    {"Weak", "WEAK", VER_FLG_WEAK},
+    {"Info", "INFO", VER_FLG_INFO}};
+
 static const EnumEntry<unsigned> AMDGPUElfOSABI[] = {
   {"AMDGPU_HSA",    "AMDGPU - HSA",    ELF::ELFOSABI_AMDGPU_HSA},
   {"AMDGPU_PAL",    "AMDGPU - PAL",    ELF::ELFOSABI_AMDGPU_PAL},
@@ -1616,6 +1502,20 @@ void ELFDumper<ELFT>::printProgramHeaders(
                                       PrintSectionMapping);
 }
 
+// Prints the version symbol, version definition and version dependency
+// sections, in that order, by delegating to ELFDumperStyle.
+template <typename ELFT> void ELFDumper<ELFT>::printVersionInfo() {
+  const auto *Obj = ObjF->getELFFile();
+  // Version symbol section.
+  ELFDumperStyle->printVersionSymbolSection(Obj, dot_gnu_version_sec);
+
+  // Version definition section.
+  ELFDumperStyle->printVersionDefinitionSection(Obj, dot_gnu_version_d_sec);
+
+  // Version dependency section.
+  ELFDumperStyle->printVersionDependencySection(Obj, dot_gnu_version_r_sec);
+}
+
 template <class ELFT> void ELFDumper<ELFT>::printDynamicRelocations() {
   ELFDumperStyle->printDynamicRelocations(ObjF->getELFFile());
 }
@@ -3460,6 +3360,36 @@ void GNUStyle<ELFT>::printDynamicRelocations(const ELFO *Obj) {
   }
 }
 
+template <class ELFT>
+void GNUStyle<ELFT>::printVersionSymbolSection(const ELFFile<ELFT> *Obj,
+                                               const Elf_Shdr *Sec) {
+  if (!Sec)
+    return;
+  // No GNU-style dumper exists for this section yet; print a notice instead.
+  StringRef SecName = unwrapOrError(Obj->getSectionName(Sec));
+  OS << "Dumper for " << SecName << " is not implemented\n";
+}
+
+template <class ELFT>
+void GNUStyle<ELFT>::printVersionDefinitionSection(const ELFFile<ELFT> *Obj,
+                                                   const Elf_Shdr *Sec) {
+  if (!Sec)
+    return;
+  // No GNU-style dumper exists for this section yet; print a notice instead.
+  StringRef SecName = unwrapOrError(Obj->getSectionName(Sec));
+  OS << "Dumper for " << SecName << " is not implemented\n";
+}
+
+template <class ELFT>
+void GNUStyle<ELFT>::printVersionDependencySection(const ELFFile<ELFT> *Obj,
+                                                   const Elf_Shdr *Sec) {
+  if (!Sec)
+    return;
+  // No GNU-style dumper exists for this section yet; print a notice instead.
+  StringRef SecName = unwrapOrError(Obj->getSectionName(Sec));
+  OS << "Dumper for " << SecName << " is not implemented\n";
+}
+
 // Hash histogram shows  statistics of how efficient the hash was for the
 // dynamic symbol table. The table shows number of hash buckets for different
 // lengths of chains as absolute number and percentage of the total buckets.
@@ -4538,6 +4468,122 @@ void LLVMStyle<ELFT>::printProgramHeaders(const ELFO *Obj) {
   }
 }
 
+// Prints Sec's header fields, then a version index for each dynamic symbol.
+template <class ELFT>
+void LLVMStyle<ELFT>::printVersionSymbolSection(const ELFFile<ELFT> *Obj,
+                                                const Elf_Shdr *Sec) {
+  DictScope SS(W, "Version symbols");
+  if (!Sec)
+    return;
+
+  StringRef SecName = unwrapOrError(Obj->getSectionName(Sec));
+  W.printNumber("Section Name", SecName, Sec->sh_name);
+  W.printHex("Address", Sec->sh_addr);
+  W.printHex("Offset", Sec->sh_offset);
+  W.printNumber("Link", Sec->sh_link);
+
+  const auto *Versym = reinterpret_cast<const Elf_Versym *>(
+      (const uint8_t *)Obj->base() + Sec->sh_offset);
+  const ELFDumper<ELFT> *Dumper = this->dumper();
+  StringRef DynStrTab = Dumper->getDynamicStringTable();
+  // One Elf_Versym is consumed per dynamic symbol table (DT_SYMTAB) entry.
+  ListScope Syms(W, "Symbols");
+  for (const Elf_Sym &Sym : Dumper->dynamic_symbols()) {
+    DictScope S(W, "Symbol");
+    std::string SymName =
+        Dumper->getFullSymbolName(&Sym, DynStrTab, true /* IsDynamic */);
+    W.printNumber("Version", Versym->vs_index & VERSYM_VERSION);
+    W.printString("Name", SymName);
+    ++Versym;
+  }
+}
+
+// Prints the Sec->sh_info Elf_Verdef entries of the version definition
+// section. A null Sec yields only the empty "SHT_GNU_verdef" scope.
+template <class ELFT>
+void LLVMStyle<ELFT>::printVersionDefinitionSection(const ELFFile<ELFT> *Obj,
+                                                    const Elf_Shdr *Sec) {
+  DictScope SD(W, "SHT_GNU_verdef");
+  if (!Sec)
+    return;
+
+  const uint8_t *VerdefBuf = (const uint8_t *)Obj->base() + Sec->sh_offset;
+  const uint8_t *SecEndAddress = VerdefBuf + Sec->sh_size;
+  const Elf_Shdr *StrTab = unwrapOrError(Obj->getSection(Sec->sh_link));
+
+  for (unsigned I = 0, NumDefs = Sec->sh_info; I < NumDefs; ++I) {
+    if (VerdefBuf + sizeof(Elf_Verdef) > SecEndAddress)
+      // FIXME: report_fatal_error is not a good way to report error. We should
+      // emit a parsing error here and below.
+      report_fatal_error("invalid offset in the section");
+
+    const Elf_Verdef *Verdef = reinterpret_cast<const Elf_Verdef *>(VerdefBuf);
+    DictScope Def(W, "Definition");
+    W.printNumber("Version", Verdef->vd_version);
+    W.printEnum("Flags", Verdef->vd_flags, makeArrayRef(SymVersionFlags));
+    W.printNumber("Index", Verdef->vd_ndx);
+    W.printNumber("Hash", Verdef->vd_hash);
+    // Validate vd_cnt before getAux() is dereferenced for the name below.
+    if (!Verdef->vd_cnt)
+      report_fatal_error("at least one definition string must exist");
+    if (Verdef->vd_cnt > 2)
+      report_fatal_error("more than one predecessor is not expected");
+    W.printString("Name",
+                  StringRef((const char *)(Obj->base() + StrTab->sh_offset +
+                                           Verdef->getAux()->vda_name)));
+
+    if (Verdef->vd_cnt == 2) {
+      const uint8_t *VerdauxBuf =
+          VerdefBuf + Verdef->vd_aux + Verdef->getAux()->vda_next;
+      const Elf_Verdaux *Verdaux =
+          reinterpret_cast<const Elf_Verdaux *>(VerdauxBuf);
+      W.printString("Predecessor",
+                    StringRef((const char *)(Obj->base() + StrTab->sh_offset +
+                                             Verdaux->vda_name)));
+    }
+    VerdefBuf += Verdef->vd_next;
+  }
+}
+
+// Prints the Sec->sh_info Elf_Verneed entries of the version dependency
+// section, each followed by its vn_cnt Elf_Vernaux records. A null Sec yields
+// only the empty "SHT_GNU_verneed" scope.
+template <class ELFT>
+void LLVMStyle<ELFT>::printVersionDependencySection(const ELFFile<ELFT> *Obj,
+                                                    const Elf_Shdr *Sec) {
+  DictScope SD(W, "SHT_GNU_verneed");
+  if (!Sec)
+    return;
+
+  const uint8_t *VerneedBuf = (const uint8_t *)Obj->base() + Sec->sh_offset;
+  const Elf_Shdr *StrTab = unwrapOrError(Obj->getSection(Sec->sh_link));
+
+  for (unsigned I = 0, NumNeeds = Sec->sh_info; I < NumNeeds; ++I) {
+    const auto *Verneed = reinterpret_cast<const Elf_Verneed *>(VerneedBuf);
+    DictScope NeedScope(W, "Dependency");
+    W.printNumber("Version", Verneed->vn_version);
+    W.printNumber("Count", Verneed->vn_cnt);
+    W.printString("FileName",
+                  StringRef((const char *)(Obj->base() + StrTab->sh_offset +
+                                           Verneed->vn_file)));
+
+    // vn_aux is the byte offset from this entry to its first Elf_Vernaux.
+    const uint8_t *VernauxBuf = VerneedBuf + Verneed->vn_aux;
+    for (unsigned J = 0; J < Verneed->vn_cnt; ++J) {
+      const auto *Vernaux = reinterpret_cast<const Elf_Vernaux *>(VernauxBuf);
+      DictScope AuxScope(W, "Entry");
+      W.printNumber("Hash", Vernaux->vna_hash);
+      W.printEnum("Flags", Vernaux->vna_flags, makeArrayRef(SymVersionFlags));
+      W.printNumber("Index", Vernaux->vna_other);
+      W.printString("Name",
+                    StringRef((const char *)(Obj->base() + StrTab->sh_offset +
+                                             Vernaux->vna_name)));
+      VernauxBuf += Vernaux->vna_next;
+    }
+    VerneedBuf += Verneed->vn_next;
+  }
+}
+
 template <class ELFT>
 void LLVMStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
   W.startLine() << "Hash Histogram not implemented!\n";
diff --git a/utils/gn/README.rst b/utils/gn/README.rst
index 3c40846689c2..7ffa144aafb3 100644
--- a/utils/gn/README.rst
+++ b/utils/gn/README.rst
@@ -22,16 +22,10 @@ build.
 creates ninja files, but it can create some IDE projects (MSVC, Xcode, ...)
 which then shell out to ninja for the actual build.
 
-Its main features are that GN is very fast (it currently produces ninja files
-for LLVM's build in 35ms on the author's laptop, compared to 66s for CMake) --
-a 2000x difference), and since it's so fast it doesn't aggressively cache,
-making it possible to switch e.g. between release and debug builds in one build
-directory.
-
 The main motivation behind the GN build is that some people find it more
 convenient for day-to-day hacking on LLVM than CMake. Distribution, building
-just parts of LLVM, and embedding the LLVM GN build from other builds are a
-non-goal for the GN build.
+just parts of LLVM, and embedding the LLVM GN build from other builds are
+non-goals for the GN build.
 
 This is a `good overview of GN <https://docs.google.com/presentation/d/15Zwb53JcncHfEwHpnG_PoIbbzQ3GQi_cpujYwbpcbZo/edit#slide=id.g119d702868_0_12>`_.
 
@@ -42,39 +36,42 @@ Quick start
 
 GN only works in the monorepo layout.
 
-#. Obtain a gn binary. If gn is not already on your PATH, run
-   `llvm/utils/gn/get.py` to download a prebuilt gn binary if you're on a 64-bit
-   X86 system running Linux, macOS, or Windows, or `build gn yourself
+#. ``git clone https://github.com/llvm/llvm-project.git; cd llvm-project`` if
+   you don't have a monorepo checkout yet.
+
+#. ``llvm/utils/gn/get.py`` to download a prebuilt gn binary if you're on a
+   64-bit X86 system running Linux, macOS, or Windows. `Build gn yourself
    <https://gn.googlesource.com/gn/#getting-started>`_ if you're on a different
    platform or don't want to trust prebuilt binaries.
 
-#. In the root of the monorepo, run `llvm/utils/gn/gn.py gen out/gn`.
-   `out/gn` is the build directory, it can have any name, and you can have as
-   many as you want, each with different build settings.  (The `gn.py` script
-   adds `--dotfile=llvm/utils/gn/.gn --root=.` and just runs regular `gn`;
+#. ``llvm/utils/gn/gn.py gen out/gn`` to run GN and create build files.
+   ``out/gn`` is the build directory; it can have any name, and you can have as
+   many as you want, each with different build settings.  (The ``gn.py`` script
+   adds ``--dotfile=llvm/utils/gn/.gn --root=.`` and just runs regular ``gn``;
    you can manually pass these parameters and not use the wrapper if you
    prefer.)
 
-#. Run e.g. `ninja -C out/gn check-lld` to build all prerequisites for and
-   run the LLD tests.
+#. ``ninja -C out/gn check-lld`` to build all prerequisites for and run the LLD
+   tests.
 
 By default, you get a release build with assertions enabled that targets
-the host arch. You can set various build options by editing `out/gn/args.gn`,
-for example putting `is_debug = true` in there gives you a debug build. Run
-`llvm/utils/gn/gn.py args --list out/gn` to see a list of all possible
-options. After touching `out/gn/args.gn`, just run ninja, it will re-invoke gn
+the host arch. You can set build options by editing ``out/gn/args.gn``; for
+example, putting ``is_debug = true`` in there gives you a debug build. Run
+``llvm/utils/gn/gn.py args --list out/gn`` to see a list of all possible
+options. After touching ``out/gn/args.gn`` just run ninja: it will re-invoke gn
 before starting the build.
 
-GN has extensive built-in help; try e.g. `gn help gen` to see the help
-for the `gen` command. The full GN reference is also `available online
-<https://gn.googlesource.com/gn/+/master/docs/reference.md>`_.
+GN has extensive built-in help; try e.g. ``llvm/utils/gn/gn.py help gen`` to see
+the help for the ``gen`` command. The full GN reference is also `available
+online <https://gn.googlesource.com/gn/+/master/docs/reference.md>`_.
 
-GN has an autoformatter: `git ls-files '*.gn' '*.gni' | xargs -n 1 gn format`
+GN has an autoformatter:
+``git ls-files '*.gn' '*.gni' | xargs llvm/utils/gn/gn.py format``
 after making GN build changes is your friend.
 
-To not put `BUILD.gn` into the main tree, they are all below
-`utils/gn/secondary`.  For example, the build file for `llvm/lib/Support` is in
-`utils/gn/secondary/llvm/lib/Support`.
+To keep ``BUILD.gn`` files out of the main tree, they all live below
+``utils/gn/secondary``.  For example, the build file for ``llvm/lib/Support``
+is in ``utils/gn/secondary/llvm/lib/Support``.
 
 .. _Syncing GN files from CMake files:
 
@@ -83,15 +80,15 @@ Syncing GN files from CMake files
 
 Sometimes after pulling in the latest changes, the GN build doesn't work.
 Most of the time this is due to someone adding a file to CMakeLists.txt file.
-Run `llvm/utils/gn/build/sync_source_lists_from_cmake.py` to print a report
-of which files need to be added to or removed from `BUILD.gn` files to
-match the corresponding `CMakeLists.txt`. You have to manually read the output
+Run ``llvm/utils/gn/build/sync_source_lists_from_cmake.py`` to print a report
+of which files need to be added to or removed from ``BUILD.gn`` files to
+match the corresponding ``CMakeLists.txt``. You have to manually read the output
 of the script and implement its suggestions.
 
-If new `CMakeLists.txt` files have been added, you have to manually create
-a new corresponding `BUILD.gn` file below `llvm/utils/gn/secondary/`.
+If new ``CMakeLists.txt`` files have been added, you have to manually create
+a new corresponding ``BUILD.gn`` file below ``llvm/utils/gn/secondary/``.
 
-If the dependencies in a `CMakeLists.txt` file have been changed, you have to
+If the dependencies in a ``CMakeLists.txt`` file have been changed, you have to
 manually analyze and fix.
 
 .. _Philosophy:
@@ -133,9 +130,9 @@ configure is used for three classes of feature checks:
   config.h in a build step).
 
 For the last two points, it would be nice if LLVM didn't have a single
-`config.h` header, but one header per toggle. That way, when e.g.
-`llvm_enable_terminfo` is toggled, only the 3 files caring about that setting
-would need to be rebuilt, instead of everything including `config.h`.
+``config.h`` header, but one header per toggle. That way, when e.g.
+``llvm_enable_terminfo`` is toggled, only the 3 files caring about that setting
+would need to be rebuilt, instead of everything including ``config.h``.
 
 GN doesn't believe in users setting arbitrary cflags from an environment
 variable, it wants the build to be controlled by .gn files.
diff --git a/utils/gn/get.py b/utils/gn/get.py
index 4015d5986c9d..c39649df78a4 100755
--- a/utils/gn/get.py
+++ b/utils/gn/get.py
@@ -3,27 +3,20 @@
 
 from __future__ import print_function
 
+import io
 import os
 import urllib2
 import sys
-import tempfile
 import zipfile
 
 
-def download_url(url, output_file):
-    """Download url into output_file."""
+def download_and_unpack(url, output_dir, gn):
+    """Download an archive from url and extract gn from it into output_dir."""
     print('downloading %s ...' % url, end='')
     sys.stdout.flush()
-    output_file.write(urllib2.urlopen(url).read())
+    data = urllib2.urlopen(url).read()
     print(' done')
-
-
-def download_and_unpack(url, output_dir, gn):
-    """Download an archive from url and extract gn from it into output_dir."""
-    with tempfile.TemporaryFile() as f:
-        download_url(url, f)
-        f.seek(0)
-        zipfile.ZipFile(f).extract(gn, path=output_dir)
+    zipfile.ZipFile(io.BytesIO(data)).extract(gn, path=output_dir)
 
 
 def set_executable_bit(path):
diff --git a/utils/gn/secondary/clang-tools-extra/clang-tidy/BUILD.gn b/utils/gn/secondary/clang-tools-extra/clang-tidy/BUILD.gn
index 2b0bb2b7c05d..415e0fc7f29d 100644
--- a/utils/gn/secondary/clang-tools-extra/clang-tidy/BUILD.gn
+++ b/utils/gn/secondary/clang-tools-extra/clang-tidy/BUILD.gn
@@ -28,6 +28,7 @@ static_library("clang-tidy") {
 
   sources = [
     "ClangTidy.cpp",
+    "ClangTidyCheck.cpp",
     "ClangTidyDiagnosticConsumer.cpp",
     "ClangTidyModule.cpp",
     "ClangTidyOptions.cpp",
diff --git a/utils/release/merge-request.sh b/utils/release/merge-request.sh
index 6a4ee16d788a..0a2bf7661fac 100755
--- a/utils/release/merge-request.sh
+++ b/utils/release/merge-request.sh
@@ -101,7 +101,7 @@ case $stable_version in
     release_metabug="39106"
     ;;
   8.0)
-    release_metabug="40331"
+    release_metabug="41221"
     ;;
   *)
     echo "error: invalid stable version"