Merge branch 'devel-stable' into for-next
author Russell King <rmk+kernel@arm.linux.org.uk>
Fri, 22 Jul 2011 22:09:07 +0000 (23:09 +0100)
committer Russell King <rmk+kernel@arm.linux.org.uk>
Fri, 22 Jul 2011 22:09:07 +0000 (23:09 +0100)
Conflicts:
arch/arm/kernel/entry-armv.S

13 files changed:
arch/arm/Kconfig
arch/arm/kernel/entry-armv.S
arch/arm/kernel/entry-header.S
arch/arm/kernel/perf_event.c
arch/arm/kernel/setup.c
arch/arm/mach-davinci/board-dm365-evm.c
arch/arm/mm/init.c
arch/arm/mm/proc-arm6_7.S
arch/arm/mm/proc-sa1100.S
arch/arm/mm/tlb-fa.S
arch/arm/mm/tlb-v6.S
arch/arm/mm/tlb-v7.S
arch/arm/vfp/vfpmodule.c

diff --combined arch/arm/Kconfig
@@@ -10,7 -10,7 +10,7 @@@ config AR
        select GENERIC_ATOMIC64 if (CPU_V6 || !CPU_32v6K || !AEABI)
        select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
        select HAVE_ARCH_KGDB
-       select HAVE_KPROBES if (!XIP_KERNEL && !THUMB2_KERNEL)
+       select HAVE_KPROBES if !XIP_KERNEL
        select HAVE_KRETPROBES if (HAVE_KPROBES)
        select HAVE_FUNCTION_TRACER if (!XIP_KERNEL)
        select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
@@@ -37,9 -37,6 +37,9 @@@
          Europe.  There is an ARM Linux project with a web page at
          <http://www.arm.linux.org.uk/>.
  
 +config ARM_HAS_SG_CHAIN
 +      bool
 +
  config HAVE_PWM
        bool
  
@@@ -1349,6 -1346,7 +1349,6 @@@ config SMP_ON_U
  
  config HAVE_ARM_SCU
        bool
 -      depends on SMP
        help
          This option enables support for the ARM system coherency unit
  
@@@ -1717,34 -1715,17 +1717,34 @@@ config ZBOOT_RO
          Say Y here if you intend to execute your compressed kernel image
          (zImage) directly from ROM or flash.  If unsure, say N.
  
 +choice
 +      prompt "Include SD/MMC loader in zImage (EXPERIMENTAL)"
 +      depends on ZBOOT_ROM && ARCH_SH7372 && EXPERIMENTAL
 +      default ZBOOT_ROM_NONE
 +      help
 +        Include experimental SD/MMC loading code in the ROM-able zImage.
 +        With this enabled it is possible to write the ROM-able zImage
 +        kernel image to an MMC or SD card and boot the kernel straight
 +        from the reset vector. At reset the processor Mask ROM will load
 +        the first part of the ROM-able zImage which in turn loads the
 +        rest of the kernel image to RAM.
 +
 +config ZBOOT_ROM_NONE
 +      bool "No SD/MMC loader in zImage (EXPERIMENTAL)"
 +      help
 +        Do not load image from SD or MMC
 +
  config ZBOOT_ROM_MMCIF
        bool "Include MMCIF loader in zImage (EXPERIMENTAL)"
 -      depends on ZBOOT_ROM && ARCH_SH7372 && EXPERIMENTAL
        help
 -        Say Y here to include experimental MMCIF loading code in the
 -        ROM-able zImage. With this enabled it is possible to write the
 -        the ROM-able zImage kernel image to an MMC card and boot the
 -        kernel straight from the reset vector. At reset the processor
 -        Mask ROM will load the first part of the the ROM-able zImage
 -        which in turn loads the rest the kernel image to RAM using the
 -        MMCIF hardware block.
 +        Load image from MMCIF hardware block.
 +
 +config ZBOOT_ROM_SH_MOBILE_SDHI
 +      bool "Include SuperH Mobile SDHI loader in zImage (EXPERIMENTAL)"
 +      help
 +        Load image from SDHI hardware block
 +
 +endchoice
  
  config CMDLINE
        string "Default kernel command string"
diff --combined arch/arm/kernel/entry-armv.S
  #include <asm/entry-macro-multi.S>
  
  /*
 - * Interrupt handling.  Preserves r7, r8, r9
 + * Interrupt handling.
   */
        .macro  irq_handler
  #ifdef CONFIG_MULTI_IRQ_HANDLER
 -      ldr     r5, =handle_arch_irq
 +      ldr     r1, =handle_arch_irq
        mov     r0, sp
 -      ldr     r5, [r5]
 +      ldr     r1, [r1]
        adr     lr, BSYM(9997f)
 -      teq     r5, #0
 -      movne   pc, r5
 +      teq     r1, #0
 +      movne   pc, r1
  #endif
        arch_irq_handler_default
  9997:
        .endm
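
      The MULTI_IRQ_HANDLER path above indirects through the handle_arch_irq function
      pointer, which setup_arch() copies from the machine descriptor (see the setup.c hunk
      later in this merge). A minimal C-side sketch follows; the board and handler names are
      purely illustrative and not part of this merge:

          /* Hypothetical board file, for illustration only. */
          #include <asm/mach/arch.h>
          #include <asm/ptrace.h>

          static void example_handle_irq(struct pt_regs *regs)
          {
                  /* read the interrupt controller, then dispatch, e.g. via asm_do_IRQ() */
          }

          MACHINE_START(EXAMPLE, "Example board")
                  /* .map_io, .init_irq, .timer, .init_machine etc. omitted */
                  .handle_irq     = example_handle_irq,   /* becomes handle_arch_irq in setup_arch() */
          MACHINE_END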
  
 +      .macro  pabt_helper
 +      @ PABORT handler takes pt_regs in r2, fault address in r4 and psr in r5
 +#ifdef MULTI_PABORT
 +      ldr     ip, .LCprocfns
 +      mov     lr, pc
 +      ldr     pc, [ip, #PROCESSOR_PABT_FUNC]
 +#else
 +      bl      CPU_PABORT_HANDLER
 +#endif
 +      .endm
 +
 +      .macro  dabt_helper
 +
 +      @
 +      @ Call the processor-specific abort handler:
 +      @
 +      @  r2 - pt_regs
 +      @  r4 - aborted context pc
 +      @  r5 - aborted context psr
 +      @
 +      @ The abort handler must return the aborted address in r0, and
 +      @ the fault status register in r1.  r9 must be preserved.
 +      @
 +#ifdef MULTI_DABORT
 +      ldr     ip, .LCprocfns
 +      mov     lr, pc
 +      ldr     pc, [ip, #PROCESSOR_DABT_FUNC]
 +#else
 +      bl      CPU_DABORT_HANDLER
 +#endif
 +      .endm
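
      Both helpers funnel into C once the CPU-specific routine has produced the fault
      address and status; for reference, the handlers they ultimately reach live in
      arch/arm/mm/fault.c and are declared roughly as follows, with r0/r1/r2 matching the
      register convention documented above (shown for reference, not changed by this merge):

          asmlinkage void do_DataAbort(unsigned long addr, unsigned int fsr,
                                       struct pt_regs *regs);
          asmlinkage void do_PrefetchAbort(unsigned long addr, unsigned int ifsr,
                                           struct pt_regs *regs);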
 +
  #ifdef CONFIG_KPROBES
        .section        .kprobes.text,"ax",%progbits
  #else
@@@ -158,74 -126,106 +158,74 @@@ ENDPROC(__und_invalid
   SPFIX(       subeq   sp, sp, #4      )
        stmia   sp, {r1 - r12}
  
 -      ldmia   r0, {r1 - r3}
 -      add     r5, sp, #S_SP - 4       @ here for interlock avoidance
 -      mov     r4, #-1                 @  ""  ""      ""       ""
 -      add     r0, sp, #(S_FRAME_SIZE + \stack_hole - 4)
 - SPFIX(       addeq   r0, r0, #4      )
 -      str     r1, [sp, #-4]!          @ save the "real" r0 copied
 +      ldmia   r0, {r3 - r5}
 +      add     r7, sp, #S_SP - 4       @ here for interlock avoidance
 +      mov     r6, #-1                 @  ""  ""      ""       ""
 +      add     r2, sp, #(S_FRAME_SIZE + \stack_hole - 4)
 + SPFIX(       addeq   r2, r2, #4      )
 +      str     r3, [sp, #-4]!          @ save the "real" r0 copied
                                        @ from the exception stack
  
 -      mov     r1, lr
 +      mov     r3, lr
  
        @
        @ We are now ready to fill in the remaining blanks on the stack:
        @
 -      @  r0 - sp_svc
 -      @  r1 - lr_svc
 -      @  r2 - lr_<exception>, already fixed up for correct return/restart
 -      @  r3 - spsr_<exception>
 -      @  r4 - orig_r0 (see pt_regs definition in ptrace.h)
 +      @  r2 - sp_svc
 +      @  r3 - lr_svc
 +      @  r4 - lr_<exception>, already fixed up for correct return/restart
 +      @  r5 - spsr_<exception>
 +      @  r6 - orig_r0 (see pt_regs definition in ptrace.h)
        @
 -      stmia   r5, {r0 - r4}
 +      stmia   r7, {r2 - r6}
 +
 +#ifdef CONFIG_TRACE_IRQFLAGS
 +      bl      trace_hardirqs_off
 +#endif
        .endm
  
        .align  5
  __dabt_svc:
        svc_entry
 -
 -      @
 -      @ get ready to re-enable interrupts if appropriate
 -      @
 -      mrs     r9, cpsr
 -      tst     r3, #PSR_I_BIT
 -      biceq   r9, r9, #PSR_I_BIT
 -
 -      @
 -      @ Call the processor-specific abort handler:
 -      @
 -      @  r2 - aborted context pc
 -      @  r3 - aborted context cpsr
 -      @
 -      @ The abort handler must return the aborted address in r0, and
 -      @ the fault status register in r1.  r9 must be preserved.
 -      @
 -#ifdef MULTI_DABORT
 -      ldr     r4, .LCprocfns
 -      mov     lr, pc
 -      ldr     pc, [r4, #PROCESSOR_DABT_FUNC]
 -#else
 -      bl      CPU_DABORT_HANDLER
 -#endif
 -
 -      @
 -      @ set desired IRQ state, then call main handler
 -      @
 -      debug_entry r1
 -      msr     cpsr_c, r9
        mov     r2, sp
 -      bl      do_DataAbort
 +      dabt_helper
  
        @
        @ IRQs off again before pulling preserved data off the stack
        @
        disable_irq_notrace
  
 -      @
 -      @ restore SPSR and restart the instruction
 -      @
 -      ldr     r2, [sp, #S_PSR]
 -      svc_exit r2                             @ return from exception
 +#ifdef CONFIG_TRACE_IRQFLAGS
 +      tst     r5, #PSR_I_BIT
 +      bleq    trace_hardirqs_on
 +      tst     r5, #PSR_I_BIT
 +      blne    trace_hardirqs_off
 +#endif
 +      svc_exit r5                             @ return from exception
   UNWIND(.fnend                )
  ENDPROC(__dabt_svc)
  
        .align  5
  __irq_svc:
        svc_entry
 +      irq_handler
  
 -#ifdef CONFIG_TRACE_IRQFLAGS
 -      bl      trace_hardirqs_off
 -#endif
  #ifdef CONFIG_PREEMPT
        get_thread_info tsk
        ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
 -      add     r7, r8, #1                      @ increment it
 -      str     r7, [tsk, #TI_PREEMPT]
 -#endif
 -
 -      irq_handler
 -#ifdef CONFIG_PREEMPT
 -      str     r8, [tsk, #TI_PREEMPT]          @ restore preempt count
        ldr     r0, [tsk, #TI_FLAGS]            @ get flags
        teq     r8, #0                          @ if preempt count != 0
        movne   r0, #0                          @ force flags to 0
        tst     r0, #_TIF_NEED_RESCHED
        blne    svc_preempt
  #endif
 -      ldr     r4, [sp, #S_PSR]                @ irqs are already disabled
 +
  #ifdef CONFIG_TRACE_IRQFLAGS
 -      tst     r4, #PSR_I_BIT
 -      bleq    trace_hardirqs_on
 +      @ The parent context IRQs must have been enabled to get here in
 +      @ the first place, so there's no point checking the PSR I bit.
 +      bl      trace_hardirqs_on
  #endif
 -      svc_exit r4                             @ return from exception
 +      svc_exit r5                             @ return from exception
   UNWIND(.fnend                )
  ENDPROC(__irq_svc)
  
@@@ -251,6 -251,7 +251,6 @@@ __und_svc
  #else
        svc_entry
  #endif
 -
        @
        @ call emulation code, which returns using r9 if it has emulated
        @ the instruction, or the more conventional lr if we are to treat
        @  r0 - instruction
        @
  #ifndef       CONFIG_THUMB2_KERNEL
 -      ldr     r0, [r2, #-4]
 +      ldr     r0, [r4, #-4]
  #else
 -      ldrh    r0, [r2, #-2]                   @ Thumb instruction at LR - 2
 +      ldrh    r0, [r4, #-2]                   @ Thumb instruction at LR - 2
        and     r9, r0, #0xf800
        cmp     r9, #0xe800                     @ 32-bit instruction if xx >= 0
 -      ldrhhs  r9, [r2]                        @ bottom 16 bits
 +      ldrhhs  r9, [r4]                        @ bottom 16 bits
        orrhs   r0, r9, r0, lsl #16
  #endif
        adr     r9, BSYM(1f)
 +      mov     r2, r4
        bl      call_fpe
  
        mov     r0, sp                          @ struct pt_regs *regs
        @
        @ restore SPSR and restart the instruction
        @
 -      ldr     r2, [sp, #S_PSR]                @ Get SVC cpsr
 -      svc_exit r2                             @ return from exception
 +      ldr     r5, [sp, #S_PSR]                @ Get SVC cpsr
 +#ifdef CONFIG_TRACE_IRQFLAGS
 +      tst     r5, #PSR_I_BIT
 +      bleq    trace_hardirqs_on
 +      tst     r5, #PSR_I_BIT
 +      blne    trace_hardirqs_off
 +#endif
 +      svc_exit r5                             @ return from exception
   UNWIND(.fnend                )
  ENDPROC(__und_svc)
  
        .align  5
  __pabt_svc:
        svc_entry
 -
 -      @
 -      @ re-enable interrupts if appropriate
 -      @
 -      mrs     r9, cpsr
 -      tst     r3, #PSR_I_BIT
 -      biceq   r9, r9, #PSR_I_BIT
 -
 -      mov     r0, r2                  @ pass address of aborted instruction.
 -#ifdef MULTI_PABORT
 -      ldr     r4, .LCprocfns
 -      mov     lr, pc
 -      ldr     pc, [r4, #PROCESSOR_PABT_FUNC]
 -#else
 -      bl      CPU_PABORT_HANDLER
 -#endif
 -      debug_entry r1
 -      msr     cpsr_c, r9                      @ Maybe enable interrupts
        mov     r2, sp                          @ regs
 -      bl      do_PrefetchAbort                @ call abort handler
 +      pabt_helper
  
        @
        @ IRQs off again before pulling preserved data off the stack
        @
        disable_irq_notrace
  
 -      @
 -      @ restore SPSR and restart the instruction
 -      @
 -      ldr     r2, [sp, #S_PSR]
 -      svc_exit r2                             @ return from exception
 +#ifdef CONFIG_TRACE_IRQFLAGS
 +      tst     r5, #PSR_I_BIT
 +      bleq    trace_hardirqs_on
 +      tst     r5, #PSR_I_BIT
 +      blne    trace_hardirqs_off
 +#endif
 +      svc_exit r5                             @ return from exception
   UNWIND(.fnend                )
  ENDPROC(__pabt_svc)
  
   ARM( stmib   sp, {r1 - r12}  )
   THUMB(       stmia   sp, {r0 - r12}  )
  
 -      ldmia   r0, {r1 - r3}
 +      ldmia   r0, {r3 - r5}
        add     r0, sp, #S_PC           @ here for interlock avoidance
 -      mov     r4, #-1                 @  ""  ""     ""        ""
 +      mov     r6, #-1                 @  ""  ""     ""        ""
  
 -      str     r1, [sp]                @ save the "real" r0 copied
 +      str     r3, [sp]                @ save the "real" r0 copied
                                        @ from the exception stack
  
        @
        @ We are now ready to fill in the remaining blanks on the stack:
        @
 -      @  r2 - lr_<exception>, already fixed up for correct return/restart
 -      @  r3 - spsr_<exception>
 -      @  r4 - orig_r0 (see pt_regs definition in ptrace.h)
 +      @  r4 - lr_<exception>, already fixed up for correct return/restart
 +      @  r5 - spsr_<exception>
 +      @  r6 - orig_r0 (see pt_regs definition in ptrace.h)
        @
        @ Also, separately save sp_usr and lr_usr
        @
 -      stmia   r0, {r2 - r4}
 +      stmia   r0, {r4 - r6}
   ARM( stmdb   r0, {sp, lr}^                   )
   THUMB(       store_user_sp_lr r0, r1, S_SP - S_PC    )
  
        @ Clear FP to mark the first stack frame
        @
        zero_fp
 +
 +#ifdef CONFIG_IRQSOFF_TRACER
 +      bl      trace_hardirqs_off
 +#endif
        .endm
  
        .macro  kuser_cmpxchg_check
- #if __LINUX_ARM_ARCH__ < 6 && !defined(CONFIG_NEEDS_SYSCALL_FOR_CMPXCHG)
+ #if !defined(CONFIG_CPU_32v6K) && !defined(CONFIG_NEEDS_SYSCALL_FOR_CMPXCHG)
  #ifndef CONFIG_MMU
  #warning "NPTL on non MMU needs fixing"
  #else
        @ if it was interrupted in a critical region.  Here we
        @ perform a quick test inline since it should be false
        @ 99.9999% of the time.  The rest is done out of line.
 -      cmp     r2, #TASK_SIZE
 +      cmp     r4, #TASK_SIZE
-       blhs    kuser_cmpxchg_fixup
+       blhs    kuser_cmpxchg64_fixup
  #endif
  #endif
        .endm
  __dabt_usr:
        usr_entry
        kuser_cmpxchg_check
 -
 -      @
 -      @ Call the processor-specific abort handler:
 -      @
 -      @  r2 - aborted context pc
 -      @  r3 - aborted context cpsr
 -      @
 -      @ The abort handler must return the aborted address in r0, and
 -      @ the fault status register in r1.
 -      @
 -#ifdef MULTI_DABORT
 -      ldr     r4, .LCprocfns
 -      mov     lr, pc
 -      ldr     pc, [r4, #PROCESSOR_DABT_FUNC]
 -#else
 -      bl      CPU_DABORT_HANDLER
 -#endif
 -
 -      @
 -      @ IRQs on, then call the main handler
 -      @
 -      debug_entry r1
 -      enable_irq
        mov     r2, sp
 -      adr     lr, BSYM(ret_from_exception)
 -      b       do_DataAbort
 +      dabt_helper
 +      b       ret_from_exception
   UNWIND(.fnend                )
  ENDPROC(__dabt_usr)
  
  __irq_usr:
        usr_entry
        kuser_cmpxchg_check
 -
 -#ifdef CONFIG_IRQSOFF_TRACER
 -      bl      trace_hardirqs_off
 -#endif
 -
 -      get_thread_info tsk
 -#ifdef CONFIG_PREEMPT
 -      ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
 -      add     r7, r8, #1                      @ increment it
 -      str     r7, [tsk, #TI_PREEMPT]
 -#endif
 -
        irq_handler
 -#ifdef CONFIG_PREEMPT
 -      ldr     r0, [tsk, #TI_PREEMPT]
 -      str     r8, [tsk, #TI_PREEMPT]
 -      teq     r0, r7
 - ARM( strne   r0, [r0, -r0]   )
 - THUMB(       movne   r0, #0          )
 - THUMB(       strne   r0, [r0]        )
 -#endif
 -
 +      get_thread_info tsk
        mov     why, #0
        b       ret_to_user_from_irq
   UNWIND(.fnend                )
@@@ -418,9 -467,6 +418,9 @@@ ENDPROC(__irq_usr
  __und_usr:
        usr_entry
  
 +      mov     r2, r4
 +      mov     r3, r5
 +
        @
        @ fall through to the emulation code, which returns using r9 if
        @ it has emulated the instruction, or the more conventional lr
@@@ -636,8 -682,19 +636,8 @@@ ENDPROC(__und_usr_unknown
        .align  5
  __pabt_usr:
        usr_entry
 -
 -      mov     r0, r2                  @ pass address of aborted instruction.
 -#ifdef MULTI_PABORT
 -      ldr     r4, .LCprocfns
 -      mov     lr, pc
 -      ldr     pc, [r4, #PROCESSOR_PABT_FUNC]
 -#else
 -      bl      CPU_PABORT_HANDLER
 -#endif
 -      debug_entry r1
 -      enable_irq                              @ Enable interrupts
        mov     r2, sp                          @ regs
 -      bl      do_PrefetchAbort                @ call abort handler
 +      pabt_helper
   UNWIND(.fnend                )
        /* fall through */
  /*
@@@ -701,31 -758,12 +701,12 @@@ ENDPROC(__switch_to
  /*
   * User helpers.
   *
-  * These are segment of kernel provided user code reachable from user space
-  * at a fixed address in kernel memory.  This is used to provide user space
-  * with some operations which require kernel help because of unimplemented
-  * native feature and/or instructions in many ARM CPUs. The idea is for
-  * this code to be executed directly in user mode for best efficiency but
-  * which is too intimate with the kernel counter part to be left to user
-  * libraries.  In fact this code might even differ from one CPU to another
-  * depending on the available  instruction set and restrictions like on
-  * SMP systems.  In other words, the kernel reserves the right to change
-  * this code as needed without warning. Only the entry points and their
-  * results are guaranteed to be stable.
-  *
   * Each segment is 32-byte aligned and will be moved to the top of the high
   * vector page.  New segments (if ever needed) must be added in front of
   * existing ones.  This mechanism should be used only for things that are
   * really small and justified, and not be abused freely.
   *
-  * User space is expected to implement those things inline when optimizing
-  * for a processor that has the necessary native support, but only if such
-  * resulting binaries are already to be incompatible with earlier ARM
-  * processors due to the use of unsupported instructions other than what
-  * is provided here.  In other words don't make binaries unable to run on
-  * earlier processors just for the sake of not using these kernel helpers
-  * if your compiled code is not going to use the new instructions for other
-  * purpose.
+  * See Documentation/arm/kernel_user_helpers.txt for formal definitions.
   */
   THUMB(       .arm    )
  
  __kuser_helper_start:
  
  /*
-  * Reference prototype:
-  *
-  *    void __kernel_memory_barrier(void)
-  *
-  * Input:
-  *
-  *    lr = return address
-  *
-  * Output:
-  *
-  *    none
-  *
-  * Clobbered:
-  *
-  *    none
-  *
-  * Definition and user space usage example:
-  *
-  *    typedef void (__kernel_dmb_t)(void);
-  *    #define __kernel_dmb (*(__kernel_dmb_t *)0xffff0fa0)
-  *
-  * Apply any needed memory barrier to preserve consistency with data modified
-  * manually and __kuser_cmpxchg usage.
-  *
-  * This could be used as follows:
-  *
-  * #define __kernel_dmb() \
-  *         asm volatile ( "mov r0, #0xffff0fff; mov lr, pc; sub pc, r0, #95" \
-  *            : : : "r0", "lr","cc" )
+  * Due to the length of some sequences, __kuser_cmpxchg64 spans 2 regular
+  * kuser "slots", therefore 0xffff0f80 is not used as a valid entry point.
   */
  
- __kuser_memory_barrier:                               @ 0xffff0fa0
+ __kuser_cmpxchg64:                            @ 0xffff0f60
+ #if defined(CONFIG_NEEDS_SYSCALL_FOR_CMPXCHG)
+       /*
+        * Poor you.  No fast solution possible...
+        * The kernel itself must perform the operation.
+        * A special ghost syscall is used for that (see traps.c).
+        */
+       stmfd   sp!, {r7, lr}
+       ldr     r7, 1f                  @ it's 20 bits
+       swi     __ARM_NR_cmpxchg64
+       ldmfd   sp!, {r7, pc}
+ 1:    .word   __ARM_NR_cmpxchg64
+ #elif defined(CONFIG_CPU_32v6K)
+       stmfd   sp!, {r4, r5, r6, r7}
+       ldrd    r4, r5, [r0]                    @ load old val
+       ldrd    r6, r7, [r1]                    @ load new val
+       smp_dmb arm
+ 1:    ldrexd  r0, r1, [r2]                    @ load current val
+       eors    r3, r0, r4                      @ compare with oldval (1)
+       eoreqs  r3, r1, r5                      @ compare with oldval (2)
+       strexdeq r3, r6, r7, [r2]               @ store newval if eq
+       teqeq   r3, #1                          @ success?
+       beq     1b                              @ if no then retry
        smp_dmb arm
 -      @ r2 = address of interrupted insn (must be preserved).
+       rsbs    r0, r3, #0                      @ set returned val and C flag
+       ldmfd   sp!, {r4, r5, r6, r7}
+       bx      lr
+ #elif !defined(CONFIG_SMP)
+ #ifdef CONFIG_MMU
+       /*
+        * The only thing that can break atomicity in this cmpxchg64
+        * implementation is either an IRQ or a data abort exception
+        * causing another process/thread to be scheduled in the middle of
+        * the critical sequence.  The same strategy as for cmpxchg is used.
+        */
+       stmfd   sp!, {r4, r5, r6, lr}
+       ldmia   r0, {r4, r5}                    @ load old val
+       ldmia   r1, {r6, lr}                    @ load new val
+ 1:    ldmia   r2, {r0, r1}                    @ load current val
+       eors    r3, r0, r4                      @ compare with oldval (1)
+       eoreqs  r3, r1, r5                      @ compare with oldval (2)
+ 2:    stmeqia r2, {r6, lr}                    @ store newval if eq
+       rsbs    r0, r3, #0                      @ set return val and C flag
+       ldmfd   sp!, {r4, r5, r6, pc}
+       .text
+ kuser_cmpxchg64_fixup:
+       @ Called from kuser_cmpxchg_fixup.
 -      @ If r2 >= 1b and r2 <= 2b then saved pc_usr is set to 1b.
++      @ r4 = address of interrupted insn (must be preserved).
+       @ sp = saved regs. r7 and r8 are clobbered.
+       @ 1b = first critical insn, 2b = last critical insn.
 -      subs    r8, r2, r7
++      @ If r4 >= 1b and r4 <= 2b then saved pc_usr is set to 1b.
+       mov     r7, #0xffff0fff
+       sub     r7, r7, #(0xffff0fff - (0xffff0f60 + (1b - __kuser_cmpxchg64)))
++      subs    r8, r4, r7
+       rsbcss  r8, r8, #(2b - 1b)
+       strcs   r7, [sp, #S_PC]
+ #if __LINUX_ARM_ARCH__ < 6
+       bcc     kuser_cmpxchg32_fixup
+ #endif
+       mov     pc, lr
+       .previous
+ #else
+ #warning "NPTL on non MMU needs fixing"
+       mov     r0, #-1
+       adds    r0, r0, #0
        usr_ret lr
+ #endif
+ #else
+ #error "incoherent kernel configuration"
+ #endif
+       /* pad to next slot */
+       .rept   (16 - (. - __kuser_cmpxchg64)/4)
+       .word   0
+       .endr
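
      For the user-space side of this new helper, the interface follows the fixed-address
      convention of the kernel_user_helpers.txt document referenced earlier; a sketch only,
      with the wrapper name being illustrative:

          #include <stdint.h>

          typedef int (__kernel_cmpxchg64_t)(const int64_t *oldval,
                                             const int64_t *newval,
                                             volatile int64_t *ptr);
          #define __kernel_cmpxchg64 (*(__kernel_cmpxchg64_t *)0xffff0f60)
          #define __kernel_helper_version (*(const int32_t *)0xffff0ffc)

          /* Returns 1 if *ptr was atomically changed from oldval to newval,
           * 0 if the exchange failed or the helper is absent. */
          static int try_cmpxchg64(volatile int64_t *ptr, int64_t oldval, int64_t newval)
          {
                  if (__kernel_helper_version < 5)   /* cmpxchg64 requires helper version >= 5 */
                          return 0;
                  return __kernel_cmpxchg64(&oldval, &newval, ptr) == 0;
          }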
  
        .align  5
  
- /*
-  * Reference prototype:
-  *
-  *    int __kernel_cmpxchg(int oldval, int newval, int *ptr)
-  *
-  * Input:
-  *
-  *    r0 = oldval
-  *    r1 = newval
-  *    r2 = ptr
-  *    lr = return address
-  *
-  * Output:
-  *
-  *    r0 = returned value (zero or non-zero)
-  *    C flag = set if r0 == 0, clear if r0 != 0
-  *
-  * Clobbered:
-  *
-  *    r3, ip, flags
-  *
-  * Definition and user space usage example:
-  *
-  *    typedef int (__kernel_cmpxchg_t)(int oldval, int newval, int *ptr);
-  *    #define __kernel_cmpxchg (*(__kernel_cmpxchg_t *)0xffff0fc0)
-  *
-  * Atomically store newval in *ptr if *ptr is equal to oldval for user space.
-  * Return zero if *ptr was changed or non-zero if no exchange happened.
-  * The C flag is also set if *ptr was changed to allow for assembly
-  * optimization in the calling code.
-  *
-  * Notes:
-  *
-  *    - This routine already includes memory barriers as needed.
-  *
-  * For example, a user space atomic_add implementation could look like this:
-  *
-  * #define atomic_add(ptr, val) \
-  *    ({ register unsigned int *__ptr asm("r2") = (ptr); \
-  *       register unsigned int __result asm("r1"); \
-  *       asm volatile ( \
-  *           "1: @ atomic_add\n\t" \
-  *           "ldr     r0, [r2]\n\t" \
-  *           "mov     r3, #0xffff0fff\n\t" \
-  *           "add     lr, pc, #4\n\t" \
-  *           "add     r1, r0, %2\n\t" \
-  *           "add     pc, r3, #(0xffff0fc0 - 0xffff0fff)\n\t" \
-  *           "bcc     1b" \
-  *           : "=&r" (__result) \
-  *           : "r" (__ptr), "rIL" (val) \
-  *           : "r0","r3","ip","lr","cc","memory" ); \
-  *       __result; })
-  */
+ __kuser_memory_barrier:                               @ 0xffff0fa0
+       smp_dmb arm
+       usr_ret lr
+       .align  5
  
  __kuser_cmpxchg:                              @ 0xffff0fc0
  
        usr_ret lr
  
        .text
- kuser_cmpxchg_fixup:
+ kuser_cmpxchg32_fixup:
        @ Called from kuser_cmpxchg_check macro.
 -      @ r2 = address of interrupted insn (must be preserved).
 +      @ r4 = address of interrupted insn (must be preserved).
        @ sp = saved regs. r7 and r8 are clobbered.
        @ 1b = first critical insn, 2b = last critical insn.
 -      @ If r2 >= 1b and r2 <= 2b then saved pc_usr is set to 1b.
 +      @ If r4 >= 1b and r4 <= 2b then saved pc_usr is set to 1b.
        mov     r7, #0xffff0fff
        sub     r7, r7, #(0xffff0fff - (0xffff0fc0 + (1b - __kuser_cmpxchg)))
 -      subs    r8, r2, r7
 +      subs    r8, r4, r7
        rsbcss  r8, r8, #(2b - 1b)
        strcs   r7, [sp, #S_PC]
        mov     pc, lr
  
        .align  5
  
- /*
-  * Reference prototype:
-  *
-  *    int __kernel_get_tls(void)
-  *
-  * Input:
-  *
-  *    lr = return address
-  *
-  * Output:
-  *
-  *    r0 = TLS value
-  *
-  * Clobbered:
-  *
-  *    none
-  *
-  * Definition and user space usage example:
-  *
-  *    typedef int (__kernel_get_tls_t)(void);
-  *    #define __kernel_get_tls (*(__kernel_get_tls_t *)0xffff0fe0)
-  *
-  * Get the TLS value as previously set via the __ARM_NR_set_tls syscall.
-  *
-  * This could be used as follows:
-  *
-  * #define __kernel_get_tls() \
-  *    ({ register unsigned int __val asm("r0"); \
-  *         asm( "mov r0, #0xffff0fff; mov lr, pc; sub pc, r0, #31" \
-  *            : "=r" (__val) : : "lr","cc" ); \
-  *       __val; })
-  */
  __kuser_get_tls:                              @ 0xffff0fe0
        ldr     r0, [pc, #(16 - 8)]     @ read TLS, set in kuser_get_tls_init
        usr_ret lr
        .word   0                       @ 0xffff0ff0 software TLS value, then
        .endr                           @ pad up to __kuser_helper_version
  
- /*
-  * Reference declaration:
-  *
-  *    extern unsigned int __kernel_helper_version;
-  *
-  * Definition and user space usage example:
-  *
-  *    #define __kernel_helper_version (*(unsigned int *)0xffff0ffc)
-  *
-  * User space may read this to determine the curent number of helpers
-  * available.
-  */
  __kuser_helper_version:                               @ 0xffff0ffc
        .word   ((__kuser_helper_end - __kuser_helper_start) >> 5)
  
        .endm
  #else /* CONFIG_THUMB2_KERNEL */
        .macro  svc_exit, rpsr
+       ldr     lr, [sp, #S_SP]                 @ top of the stack
+       ldrd    r0, r1, [sp, #S_LR]             @ calling lr and pc
        clrex                                   @ clear the exclusive monitor
-       ldr     r0, [sp, #S_SP]                 @ top of the stack
-       ldr     r1, [sp, #S_PC]                 @ return address
-       tst     r0, #4                          @ orig stack 8-byte aligned?
-       stmdb   r0, {r1, \rpsr}                 @ rfe context
+       stmdb   lr!, {r0, r1, \rpsr}            @ calling lr and rfe context
        ldmia   sp, {r0 - r12}
-       ldr     lr, [sp, #S_LR]
-       addeq   sp, sp, #S_FRAME_SIZE - 8       @ aligned
-       addne   sp, sp, #S_FRAME_SIZE - 4       @ not aligned
+       mov     sp, lr
+       ldr     lr, [sp], #4
        rfeia   sp!
        .endm
  
        .endm
  #endif        /* !CONFIG_THUMB2_KERNEL */
  
 -      @
 -      @ Debug exceptions are taken as prefetch or data aborts.
 -      @ We must disable preemption during the handler so that
 -      @ we can access the debug registers safely.
 -      @
 -      .macro  debug_entry, fsr
 -#if defined(CONFIG_HAVE_HW_BREAKPOINT) && defined(CONFIG_PREEMPT)
 -      ldr     r4, =0x40f              @ mask out fsr.fs
 -      and     r5, r4, \fsr
 -      cmp     r5, #2                  @ debug exception
 -      bne     1f
 -      get_thread_info r10
 -      ldr     r6, [r10, #TI_PREEMPT]  @ get preempt count
 -      add     r11, r6, #1             @ increment it
 -      str     r11, [r10, #TI_PREEMPT]
 -1:
 -#endif
 -      .endm
 -
  /*
   * These are the registers used in the syscall handler, and allow us to
   * have in theory up to 7 arguments to a function - r0 to r6.
diff --combined arch/arm/kernel/perf_event.c
@@@ -435,7 -435,7 +435,7 @@@ armpmu_reserve_hardware(void
                        if (irq >= 0)
                                free_irq(irq, NULL);
                }
 -              release_pmu(pmu_device);
 +              release_pmu(ARM_PMU_DEVICE_CPU);
                pmu_device = NULL;
        }
  
@@@ -454,7 -454,7 +454,7 @@@ armpmu_release_hardware(void
        }
        armpmu->stop();
  
 -      release_pmu(pmu_device);
 +      release_pmu(ARM_PMU_DEVICE_CPU);
        pmu_device = NULL;
  }
  
@@@ -662,6 -662,12 +662,12 @@@ init_hw_perf_events(void
                case 0xC090:    /* Cortex-A9 */
                        armpmu = armv7_a9_pmu_init();
                        break;
+               case 0xC050:    /* Cortex-A5 */
+                       armpmu = armv7_a5_pmu_init();
+                       break;
+               case 0xC0F0:    /* Cortex-A15 */
+                       armpmu = armv7_a15_pmu_init();
+                       break;
                }
        /* Intel CPUs [xscale]. */
        } else if (0x69 == implementor) {
diff --combined arch/arm/kernel/setup.c
@@@ -343,6 -343,54 +343,6 @@@ static void __init feat_v6_fixup(void
                elf_hwcap &= ~HWCAP_TLS;
  }
  
 -static void __init setup_processor(void)
 -{
 -      struct proc_info_list *list;
 -
 -      /*
 -       * locate processor in the list of supported processor
 -       * types.  The linker builds this table for us from the
 -       * entries in arch/arm/mm/proc-*.S
 -       */
 -      list = lookup_processor_type(read_cpuid_id());
 -      if (!list) {
 -              printk("CPU configuration botched (ID %08x), unable "
 -                     "to continue.\n", read_cpuid_id());
 -              while (1);
 -      }
 -
 -      cpu_name = list->cpu_name;
 -
 -#ifdef MULTI_CPU
 -      processor = *list->proc;
 -#endif
 -#ifdef MULTI_TLB
 -      cpu_tlb = *list->tlb;
 -#endif
 -#ifdef MULTI_USER
 -      cpu_user = *list->user;
 -#endif
 -#ifdef MULTI_CACHE
 -      cpu_cache = *list->cache;
 -#endif
 -
 -      printk("CPU: %s [%08x] revision %d (ARMv%s), cr=%08lx\n",
 -             cpu_name, read_cpuid_id(), read_cpuid_id() & 15,
 -             proc_arch[cpu_architecture()], cr_alignment);
 -
 -      sprintf(init_utsname()->machine, "%s%c", list->arch_name, ENDIANNESS);
 -      sprintf(elf_platform, "%s%c", list->elf_name, ENDIANNESS);
 -      elf_hwcap = list->elf_hwcap;
 -#ifndef CONFIG_ARM_THUMB
 -      elf_hwcap &= ~HWCAP_THUMB;
 -#endif
 -
 -      feat_v6_fixup();
 -
 -      cacheid_init();
 -      cpu_proc_init();
 -}
 -
  /*
   * cpu_init - initialise one CPU.
   *
@@@ -358,8 -406,6 +358,8 @@@ void cpu_init(void
                BUG();
        }
  
 +      cpu_proc_init();
 +
        /*
         * Define the placement constraint for the inline asm directive below.
         * In Thumb-2, msr with an immediate value is not allowed.
            : "r14");
  }
  
 +static void __init setup_processor(void)
 +{
 +      struct proc_info_list *list;
 +
 +      /*
 +       * locate processor in the list of supported processor
 +       * types.  The linker builds this table for us from the
 +       * entries in arch/arm/mm/proc-*.S
 +       */
 +      list = lookup_processor_type(read_cpuid_id());
 +      if (!list) {
 +              printk("CPU configuration botched (ID %08x), unable "
 +                     "to continue.\n", read_cpuid_id());
 +              while (1);
 +      }
 +
 +      cpu_name = list->cpu_name;
 +
 +#ifdef MULTI_CPU
 +      processor = *list->proc;
 +#endif
 +#ifdef MULTI_TLB
 +      cpu_tlb = *list->tlb;
 +#endif
 +#ifdef MULTI_USER
 +      cpu_user = *list->user;
 +#endif
 +#ifdef MULTI_CACHE
 +      cpu_cache = *list->cache;
 +#endif
 +
 +      printk("CPU: %s [%08x] revision %d (ARMv%s), cr=%08lx\n",
 +             cpu_name, read_cpuid_id(), read_cpuid_id() & 15,
 +             proc_arch[cpu_architecture()], cr_alignment);
 +
 +      sprintf(init_utsname()->machine, "%s%c", list->arch_name, ENDIANNESS);
 +      sprintf(elf_platform, "%s%c", list->elf_name, ENDIANNESS);
 +      elf_hwcap = list->elf_hwcap;
 +#ifndef CONFIG_ARM_THUMB
 +      elf_hwcap &= ~HWCAP_THUMB;
 +#endif
 +
 +      feat_v6_fixup();
 +
 +      cacheid_init();
 +      cpu_init();
 +}
 +
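
      Each entry that setup_processor() dereferences above (list->proc, list->tlb and so on)
      comes from the linker-built .proc.info.init table; its layout is struct proc_info_list
      from <asm/procinfo.h>, approximately:

          struct proc_info_list {
                  unsigned int            cpu_val;
                  unsigned int            cpu_mask;
                  unsigned long           __cpu_mm_mmu_flags;     /* used by head.S */
                  unsigned long           __cpu_io_mmu_flags;     /* used by head.S */
                  unsigned long           __cpu_flush;            /* used by head.S */
                  const char              *arch_name;
                  const char              *elf_name;
                  unsigned int            elf_hwcap;
                  const char              *cpu_name;
                  struct processor        *proc;
                  struct cpu_tlb_fns      *tlb;
                  struct cpu_user_fns     *user;
                  struct cpu_cache_fns    *cache;
          };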
  void __init dump_machine_table(void)
  {
        struct machine_desc *p;
@@@ -917,8 -915,15 +917,14 @@@ void __init setup_arch(char **cmdline_p
  #endif
        reserve_crashkernel();
  
 -      cpu_init();
        tcm_init();
  
+ #ifdef CONFIG_ZONE_DMA
+       if (mdesc->dma_zone_size) {
+               extern unsigned long arm_dma_zone_size;
+               arm_dma_zone_size = mdesc->dma_zone_size;
+       }
+ #endif
  #ifdef CONFIG_MULTI_IRQ_HANDLER
        handle_arch_irq = mdesc->handle_irq;
  #endif
@@@ -980,6 -985,10 +986,10 @@@ static const char *hwcap_str[] = 
        "neon",
        "vfpv3",
        "vfpv3d16",
+       "tls",
+       "vfpv4",
+       "idiva",
+       "idivt",
        NULL
  };
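
      The strings appended here feed the "Features:" line that setup.c prints in
      /proc/cpuinfo, one string per elf_hwcap bit. A user-space check for the new flags can
      simply scan that line; a sketch, not part of this merge:

          #include <stdio.h>
          #include <string.h>

          /* Returns 1 if the /proc/cpuinfo "Features" line mentions the given flag. */
          static int cpu_has_feature(const char *feat)
          {
                  char line[512];
                  FILE *f = fopen("/proc/cpuinfo", "r");
                  int found = 0;

                  if (!f)
                          return 0;
                  while (fgets(line, sizeof(line), f))
                          if (!strncmp(line, "Features", 8) && strstr(line, feat)) {
                                  found = 1;
                                  break;
                          }
                  fclose(f);
                  return found;
          }

          /* e.g. cpu_has_feature(" idiva") for ARM-mode hardware integer divide */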
  
diff --combined arch/arm/mach-davinci/board-dm365-evm.c
@@@ -520,7 -520,7 +520,7 @@@ fail
         */
        if (have_imager()) {
                label = "HD imager";
 -              mux |= 1;
 +              mux |= 2;
  
                /* externally mux MMC1/ENET/AIC33 to imager */
                mux |= BIT(6) | BIT(5) | BIT(3);
                resets &= ~BIT(1);
  
                if (have_tvp7002()) {
 -                      mux |= 2;
 +                      mux |= 1;
                        resets &= ~BIT(2);
                        label = "tvp7002 HD";
                } else {
@@@ -617,5 -617,6 +617,6 @@@ MACHINE_START(DAVINCI_DM365_EVM, "DaVin
        .init_irq       = davinci_irq_init,
        .timer          = &davinci_timer,
        .init_machine   = dm365_evm_init,
+       .dma_zone_size  = SZ_128M,
  MACHINE_END
  
diff --combined arch/arm/mm/init.c
@@@ -212,6 -212,10 +212,10 @@@ static void __init arm_bootmem_init(uns
  }
  
  #ifdef CONFIG_ZONE_DMA
+ unsigned long arm_dma_zone_size __read_mostly;
+ EXPORT_SYMBOL(arm_dma_zone_size);
  /*
   * The DMA mask corresponding to the maximum bus address allocatable
   * using GFP_DMA.  The default here places no restriction on DMA
@@@ -275,19 -279,17 +279,17 @@@ static void __init arm_bootmem_free(uns
  #endif
        }
  
- #ifdef ARM_DMA_ZONE_SIZE
- #ifndef CONFIG_ZONE_DMA
- #error ARM_DMA_ZONE_SIZE set but no DMA zone to limit allocations
- #endif
+ #ifdef CONFIG_ZONE_DMA
        /*
         * Adjust the sizes according to any special requirements for
         * this machine type.
         */
-       arm_adjust_dma_zone(zone_size, zhole_size,
-               ARM_DMA_ZONE_SIZE >> PAGE_SHIFT);
-       arm_dma_limit = PHYS_OFFSET + ARM_DMA_ZONE_SIZE - 1;
+       if (arm_dma_zone_size) {
+               arm_adjust_dma_zone(zone_size, zhole_size,
+                       arm_dma_zone_size >> PAGE_SHIFT);
+               arm_dma_limit = PHYS_OFFSET + arm_dma_zone_size - 1;
+       } else
+               arm_dma_limit = 0xffffffff;
  #endif
  
        free_area_init_node(0, zone_size, min, zhole_size);
@@@ -432,17 -434,6 +434,17 @@@ static inline int free_area(unsigned lo
        return pages;
  }
  
 +/*
 + * Poison init memory with an undefined instruction (ARM) or a branch to an
 + * undefined instruction (Thumb).
 + */
 +static inline void poison_init_mem(void *s, size_t count)
 +{
 +      u32 *p = (u32 *)s;
 +      while ((count = count - 4))
 +              *p++ = 0xe7fddef0;
 +}
 +
  static inline void
  free_memmap(unsigned long start_pfn, unsigned long end_pfn)
  {
@@@ -660,8 -651,8 +662,8 @@@ void __init mem_init(void
                        "    pkmap   : 0x%08lx - 0x%08lx   (%4ld MB)\n"
  #endif
                        "    modules : 0x%08lx - 0x%08lx   (%4ld MB)\n"
 -                      "      .init : 0x%p" " - 0x%p" "   (%4d kB)\n"
                        "      .text : 0x%p" " - 0x%p" "   (%4d kB)\n"
 +                      "      .init : 0x%p" " - 0x%p" "   (%4d kB)\n"
                        "      .data : 0x%p" " - 0x%p" "   (%4d kB)\n"
                        "       .bss : 0x%p" " - 0x%p" "   (%4d kB)\n",
  
  #endif
                        MLM(MODULES_VADDR, MODULES_END),
  
 -                      MLK_ROUNDUP(__init_begin, __init_end),
                        MLK_ROUNDUP(_text, _etext),
 +                      MLK_ROUNDUP(__init_begin, __init_end),
                        MLK_ROUNDUP(_sdata, _edata),
                        MLK_ROUNDUP(__bss_start, __bss_stop));
  
@@@ -725,13 -716,11 +727,13 @@@ void free_initmem(void
  #ifdef CONFIG_HAVE_TCM
        extern char __tcm_start, __tcm_end;
  
 +      poison_init_mem(&__tcm_start, &__tcm_end - &__tcm_start);
        totalram_pages += free_area(__phys_to_pfn(__pa(&__tcm_start)),
                                    __phys_to_pfn(__pa(&__tcm_end)),
                                    "TCM link");
  #endif
  
 +      poison_init_mem(__init_begin, __init_end - __init_begin);
        if (!machine_is_integrator() && !machine_is_cintegrator())
                totalram_pages += free_area(__phys_to_pfn(__pa(__init_begin)),
                                            __phys_to_pfn(__pa(__init_end)),
@@@ -744,12 -733,10 +746,12 @@@ static int keep_initrd
  
  void free_initrd_mem(unsigned long start, unsigned long end)
  {
 -      if (!keep_initrd)
 +      if (!keep_initrd) {
 +              poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
                totalram_pages += free_area(__phys_to_pfn(__pa(start)),
                                            __phys_to_pfn(__pa(end)),
                                            "initrd");
 +      }
  }
  
  static int __init keepinitrd_setup(char *__unused)
diff --combined arch/arm/mm/proc-arm6_7.S
@@@ -29,19 -29,19 +29,19 @@@ ENTRY(cpu_arm7_dcache_clean_area
  /*
   * Function: arm6_7_data_abort ()
   *
 - * Params  : r2 = address of aborted instruction
 - *       : sp = pointer to registers
 + * Params  : r2 = pt_regs
 + *       : r4 = aborted context pc
 + *       : r5 = aborted context psr
   *
   * Purpose : obtain information about current aborted instruction
   *
 - * Returns : r0 = address of abort
 - *       : r1 = FSR
 + * Returns : r4-r5, r10-r11, r13 preserved
   */
  
  ENTRY(cpu_arm7_data_abort)
        mrc     p15, 0, r1, c5, c0, 0           @ get FSR
        mrc     p15, 0, r0, c6, c0, 0           @ get FAR
 -      ldr     r8, [r2]                        @ read arm instruction
 +      ldr     r8, [r4]                        @ read arm instruction
        tst     r8, #1 << 20                    @ L = 0 -> write?
        orreq   r1, r1, #1 << 11                @ yes.
        and     r7, r8, #15 << 24
@@@ -49,7 -49,7 +49,7 @@@
        nop
  
  /* 0 */       b       .data_unknown
 -/* 1 */       mov     pc, lr                          @ swp
 +/* 1 */       b       do_DataAbort                    @ swp
  /* 2 */       b       .data_unknown
  /* 3 */       b       .data_unknown
  /* 4 */       b       .data_arm_lateldrpostconst      @ ldr   rd, [rn], #m
  /* 9 */       b       .data_arm_ldmstm                @ ldm*b rn, <rlist>
  /* a */       b       .data_unknown
  /* b */       b       .data_unknown
 -/* c */       mov     pc, lr                          @ ldc   rd, [rn], #m    @ Same as ldr   rd, [rn], #m
 -/* d */       mov     pc, lr                          @ ldc   rd, [rn, #m]
 +/* c */       b       do_DataAbort                    @ ldc   rd, [rn], #m    @ Same as ldr   rd, [rn], #m
 +/* d */       b       do_DataAbort                    @ ldc   rd, [rn, #m]
  /* e */       b       .data_unknown
  /* f */
  .data_unknown:        @ Part of jumptable
 -      mov     r0, r2
 +      mov     r0, r4
        mov     r1, r8
 -      mov     r2, sp
 -      bl      baddataabort
 -      b       ret_from_exception
 +      b       baddataabort
  
  ENTRY(cpu_arm6_data_abort)
        mrc     p15, 0, r1, c5, c0, 0           @ get FSR
        mrc     p15, 0, r0, c6, c0, 0           @ get FAR
 -      ldr     r8, [r2]                        @ read arm instruction
 +      ldr     r8, [r4]                        @ read arm instruction
        tst     r8, #1 << 20                    @ L = 0 -> write?
        orreq   r1, r1, #1 << 11                @ yes.
        and     r7, r8, #14 << 24
        teq     r7, #8 << 24                    @ was it ldm/stm
 -      movne   pc, lr
 +      bne     do_DataAbort
  
  .data_arm_ldmstm:
        tst     r8, #1 << 21                    @ check writeback bit
 -      moveq   pc, lr                          @ no writeback -> no fixup
 +      beq     do_DataAbort                    @ no writeback -> no fixup
        mov     r7, #0x11
        orr     r7, r7, #0x1100
        and     r6, r8, r7
 -      and     r2, r8, r7, lsl #1
 -      add     r6, r6, r2, lsr #1
 -      and     r2, r8, r7, lsl #2
 -      add     r6, r6, r2, lsr #2
 -      and     r2, r8, r7, lsl #3
 -      add     r6, r6, r2, lsr #3
 +      and     r9, r8, r7, lsl #1
 +      add     r6, r6, r9, lsr #1
 +      and     r9, r8, r7, lsl #2
 +      add     r6, r6, r9, lsr #2
 +      and     r9, r8, r7, lsl #3
 +      add     r6, r6, r9, lsr #3
        add     r6, r6, r6, lsr #8
        add     r6, r6, r6, lsr #4
        and     r6, r6, #15                     @ r6 = no. of registers to transfer.
 -      and     r5, r8, #15 << 16               @ Extract 'n' from instruction
 -      ldr     r7, [sp, r5, lsr #14]           @ Get register 'Rn'
 +      and     r9, r8, #15 << 16               @ Extract 'n' from instruction
 +      ldr     r7, [r2, r9, lsr #14]           @ Get register 'Rn'
        tst     r8, #1 << 23                    @ Check U bit
        subne   r7, r7, r6, lsl #2              @ Undo increment
        addeq   r7, r7, r6, lsl #2              @ Undo decrement
 -      str     r7, [sp, r5, lsr #14]           @ Put register 'Rn'
 -      mov     pc, lr
 +      str     r7, [r2, r9, lsr #14]           @ Put register 'Rn'
 +      b       do_DataAbort
  
  .data_arm_apply_r6_and_rn:
 -      and     r5, r8, #15 << 16               @ Extract 'n' from instruction
 -      ldr     r7, [sp, r5, lsr #14]           @ Get register 'Rn'
 +      and     r9, r8, #15 << 16               @ Extract 'n' from instruction
 +      ldr     r7, [r2, r9, lsr #14]           @ Get register 'Rn'
        tst     r8, #1 << 23                    @ Check U bit
       subne   r7, r7, r6                      @ Undo increment
        addeq   r7, r7, r6                      @ Undo decrement
 -      str     r7, [sp, r5, lsr #14]           @ Put register 'Rn'
 -      mov     pc, lr
 +      str     r7, [r2, r9, lsr #14]           @ Put register 'Rn'
 +      b       do_DataAbort
  
  .data_arm_lateldrpreconst:
        tst     r8, #1 << 21                    @ check writeback bit
 -      moveq   pc, lr                          @ no writeback -> no fixup
 +      beq     do_DataAbort                    @ no writeback -> no fixup
  .data_arm_lateldrpostconst:
 -      movs    r2, r8, lsl #20                 @ Get offset
 -      moveq   pc, lr                          @ zero -> no fixup
 -      and     r5, r8, #15 << 16               @ Extract 'n' from instruction
 -      ldr     r7, [sp, r5, lsr #14]           @ Get register 'Rn'
 +      movs    r6, r8, lsl #20                 @ Get offset
 +      beq     do_DataAbort                    @ zero -> no fixup
 +      and     r9, r8, #15 << 16               @ Extract 'n' from instruction
 +      ldr     r7, [r2, r9, lsr #14]           @ Get register 'Rn'
        tst     r8, #1 << 23                    @ Check U bit
 -      subne   r7, r7, r2, lsr #20             @ Undo increment
 -      addeq   r7, r7, r2, lsr #20             @ Undo decrement
 -      str     r7, [sp, r5, lsr #14]           @ Put register 'Rn'
 -      mov     pc, lr
 +      subne   r7, r7, r6, lsr #20             @ Undo increment
 +      addeq   r7, r7, r6, lsr #20             @ Undo decrement
 +      str     r7, [r2, r9, lsr #14]           @ Put register 'Rn'
 +      b       do_DataAbort
  
  .data_arm_lateldrprereg:
        tst     r8, #1 << 21                    @ check writeback bit
 -      moveq   pc, lr                          @ no writeback -> no fixup
 +      beq     do_DataAbort                    @ no writeback -> no fixup
  .data_arm_lateldrpostreg:
        and     r7, r8, #15                     @ Extract 'm' from instruction
 -      ldr     r6, [sp, r7, lsl #2]            @ Get register 'Rm'
 -      mov     r5, r8, lsr #7                  @ get shift count
 -      ands    r5, r5, #31
 +      ldr     r6, [r2, r7, lsl #2]            @ Get register 'Rm'
 +      mov     r9, r8, lsr #7                  @ get shift count
 +      ands    r9, r9, #31
        and     r7, r8, #0x70                   @ get shift type
        orreq   r7, r7, #8                      @ shift count = 0
        add     pc, pc, r7
        nop
  
 -      mov     r6, r6, lsl r5                  @ 0: LSL #!0
 +      mov     r6, r6, lsl r9                  @ 0: LSL #!0
        b       .data_arm_apply_r6_and_rn
        b       .data_arm_apply_r6_and_rn       @ 1: LSL #0
        nop
        nop
        b       .data_unknown                   @ 3: MUL?
        nop
 -      mov     r6, r6, lsr r5                  @ 4: LSR #!0
 +      mov     r6, r6, lsr r9                  @ 4: LSR #!0
        b       .data_arm_apply_r6_and_rn
        mov     r6, r6, lsr #32                 @ 5: LSR #32
        b       .data_arm_apply_r6_and_rn
        nop
        b       .data_unknown                   @ 7: MUL?
        nop
 -      mov     r6, r6, asr r5                  @ 8: ASR #!0
 +      mov     r6, r6, asr r9                  @ 8: ASR #!0
        b       .data_arm_apply_r6_and_rn
        mov     r6, r6, asr #32                 @ 9: ASR #32
        b       .data_arm_apply_r6_and_rn
        nop
        b       .data_unknown                   @ B: MUL?
        nop
 -      mov     r6, r6, ror r5                  @ C: ROR #!0
 +      mov     r6, r6, ror r9                  @ C: ROR #!0
        b       .data_arm_apply_r6_and_rn
        mov     r6, r6, rrx                     @ D: RRX
        b       .data_arm_apply_r6_and_rn
@@@ -267,159 -269,57 +267,57 @@@ __arm7_setup:  mov     r0, #
  
                __INITDATA
  
- /*
-  * Purpose : Function pointers used to access above functions - all calls
-  *         come through these
-  */
-               .type   arm6_processor_functions, #object
- ENTRY(arm6_processor_functions)
-               .word   cpu_arm6_data_abort
-               .word   legacy_pabort
-               .word   cpu_arm6_proc_init
-               .word   cpu_arm6_proc_fin
-               .word   cpu_arm6_reset
-               .word   cpu_arm6_do_idle
-               .word   cpu_arm6_dcache_clean_area
-               .word   cpu_arm6_switch_mm
-               .word   cpu_arm6_set_pte_ext
-               .word   0
-               .word   0
-               .word   0
-               .size   arm6_processor_functions, . - arm6_processor_functions
- /*
-  * Purpose : Function pointers used to access above functions - all calls
-  *         come through these
-  */
-               .type   arm7_processor_functions, #object
- ENTRY(arm7_processor_functions)
-               .word   cpu_arm7_data_abort
-               .word   legacy_pabort
-               .word   cpu_arm7_proc_init
-               .word   cpu_arm7_proc_fin
-               .word   cpu_arm7_reset
-               .word   cpu_arm7_do_idle
-               .word   cpu_arm7_dcache_clean_area
-               .word   cpu_arm7_switch_mm
-               .word   cpu_arm7_set_pte_ext
-               .word   0
-               .word   0
-               .word   0
-               .size   arm7_processor_functions, . - arm7_processor_functions
+               @ define struct processor (see <asm/proc-fns.h> and proc-macros.S)
+               define_processor_functions arm6, dabort=cpu_arm6_data_abort, pabort=legacy_pabort
+               define_processor_functions arm7, dabort=cpu_arm7_data_abort, pabort=legacy_pabort
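
      define_processor_functions emits the same slot layout that the hand-written arm6/arm7
      tables above spelled out with .word directives; those slots correspond to struct
      processor from <asm/proc-fns.h>, roughly:

          struct processor {
                  void            (*_data_abort)(unsigned long pc);
                  unsigned long   (*_prefetch_abort)(unsigned long lr);
                  void            (*_proc_init)(void);
                  void            (*_proc_fin)(void);
                  void            (*reset)(unsigned long addr) __attribute__((noreturn));
                  int             (*_do_idle)(void);
                  void            (*dcache_clean_area)(void *addr, int size);
                  void            (*switch_mm)(unsigned long pgd_phys, struct mm_struct *mm);
                  void            (*set_pte_ext)(pte_t *ptep, pte_t pte, unsigned int ext);
                  /* the three trailing zero words in the old tables: */
                  unsigned int    suspend_size;
                  void            (*do_suspend)(void *);
                  void            (*do_resume)(void *);
          };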
  
                .section ".rodata"
  
-               .type   cpu_arch_name, #object
- cpu_arch_name:        .asciz  "armv3"
-               .size   cpu_arch_name, . - cpu_arch_name
-               .type   cpu_elf_name, #object
- cpu_elf_name: .asciz  "v3"
-               .size   cpu_elf_name, . - cpu_elf_name
-               .type   cpu_arm6_name, #object
- cpu_arm6_name:        .asciz  "ARM6"
-               .size   cpu_arm6_name, . - cpu_arm6_name
-               .type   cpu_arm610_name, #object
- cpu_arm610_name:
-               .asciz  "ARM610"
-               .size   cpu_arm610_name, . - cpu_arm610_name
-               .type   cpu_arm7_name, #object
- cpu_arm7_name:        .asciz  "ARM7"
-               .size   cpu_arm7_name, . - cpu_arm7_name
-               .type   cpu_arm710_name, #object
- cpu_arm710_name:
-               .asciz  "ARM710"
-               .size   cpu_arm710_name, . - cpu_arm710_name
+               string  cpu_arch_name, "armv3"
+               string  cpu_elf_name, "v3"
+               string  cpu_arm6_name, "ARM6"
+               string  cpu_arm610_name, "ARM610"
+               string  cpu_arm7_name, "ARM7"
+               string  cpu_arm710_name, "ARM710"
  
                .align
  
                .section ".proc.info.init", #alloc, #execinstr
  
-               .type   __arm6_proc_info, #object
- __arm6_proc_info:
-               .long   0x41560600
-               .long   0xfffffff0
-               .long   0x00000c1e
+ .macro arm67_proc_info name:req, cpu_val:req, cpu_mask:req, cpu_name:req, \
+       cpu_mm_mmu_flags:req, cpu_flush:req, cpu_proc_funcs:req
+               .type   __\name\()_proc_info, #object
+ __\name\()_proc_info:
+               .long   \cpu_val
+               .long   \cpu_mask
+               .long   \cpu_mm_mmu_flags
                .long   PMD_TYPE_SECT | \
                        PMD_BIT4 | \
                        PMD_SECT_AP_WRITE | \
                        PMD_SECT_AP_READ
-               b       __arm6_setup
+               b       \cpu_flush
                .long   cpu_arch_name
                .long   cpu_elf_name
                .long   HWCAP_SWP | HWCAP_26BIT
-               .long   cpu_arm6_name
-               .long   arm6_processor_functions
+               .long   \cpu_name
+               .long   \cpu_proc_funcs
                .long   v3_tlb_fns
                .long   v3_user_fns
                .long   v3_cache_fns
-               .size   __arm6_proc_info, . - __arm6_proc_info
-               .type   __arm610_proc_info, #object
- __arm610_proc_info:
-               .long   0x41560610
-               .long   0xfffffff0
-               .long   0x00000c1e
-               .long   PMD_TYPE_SECT | \
-                       PMD_BIT4 | \
-                       PMD_SECT_AP_WRITE | \
-                       PMD_SECT_AP_READ
-               b       __arm6_setup
-               .long   cpu_arch_name
-               .long   cpu_elf_name
-               .long   HWCAP_SWP | HWCAP_26BIT
-               .long   cpu_arm610_name
-               .long   arm6_processor_functions
-               .long   v3_tlb_fns
-               .long   v3_user_fns
-               .long   v3_cache_fns
-               .size   __arm610_proc_info, . - __arm610_proc_info
-               .type   __arm7_proc_info, #object
- __arm7_proc_info:
-               .long   0x41007000
-               .long   0xffffff00
-               .long   0x00000c1e
-               .long   PMD_TYPE_SECT | \
-                       PMD_BIT4 | \
-                       PMD_SECT_AP_WRITE | \
-                       PMD_SECT_AP_READ
-               b       __arm7_setup
-               .long   cpu_arch_name
-               .long   cpu_elf_name
-               .long   HWCAP_SWP | HWCAP_26BIT
-               .long   cpu_arm7_name
-               .long   arm7_processor_functions
-               .long   v3_tlb_fns
-               .long   v3_user_fns
-               .long   v3_cache_fns
-               .size   __arm7_proc_info, . - __arm7_proc_info
-               .type   __arm710_proc_info, #object
- __arm710_proc_info:
-               .long   0x41007100
-               .long   0xfff8ff00
-               .long   PMD_TYPE_SECT | \
+               .size   __\name\()_proc_info, . - __\name\()_proc_info
+ .endm
+       arm67_proc_info arm6,   0x41560600, 0xfffffff0, cpu_arm6_name, \
+               0x00000c1e, __arm6_setup, arm6_processor_functions
+       arm67_proc_info arm610, 0x41560610, 0xfffffff0, cpu_arm610_name, \
+               0x00000c1e, __arm6_setup, arm6_processor_functions
+       arm67_proc_info arm7,   0x41007000, 0xffffff00, cpu_arm7_name, \
+               0x00000c1e, __arm7_setup, arm7_processor_functions
+       arm67_proc_info arm710, 0x41007100, 0xfff8ff00, cpu_arm710_name, \
+                       PMD_TYPE_SECT | \
                        PMD_SECT_BUFFERABLE | \
                        PMD_SECT_CACHEABLE | \
                        PMD_BIT4 | \
                        PMD_SECT_AP_WRITE | \
-                       PMD_SECT_AP_READ
-               .long   PMD_TYPE_SECT | \
-                       PMD_BIT4 | \
-                       PMD_SECT_AP_WRITE | \
-                       PMD_SECT_AP_READ
-               b       __arm7_setup
-               .long   cpu_arch_name
-               .long   cpu_elf_name
-               .long   HWCAP_SWP | HWCAP_26BIT
-               .long   cpu_arm710_name
-               .long   arm7_processor_functions
-               .long   v3_tlb_fns
-               .long   v3_user_fns
-               .long   v3_cache_fns
-               .size   __arm710_proc_info, . - __arm710_proc_info
+                       PMD_SECT_AP_READ, \
+               __arm7_setup, arm7_processor_functions
diff --combined arch/arm/mm/proc-sa1100.S
@@@ -34,7 -34,7 +34,7 @@@
   */
  #define DCACHELINESIZE        32
  
 -      __INIT
 +      .section .text
  
  /*
   * cpu_sa1100_proc_init()
@@@ -45,6 -45,8 +45,6 @@@ ENTRY(cpu_sa1100_proc_init
        mcr     p15, 0, r0, c9, c0, 5           @ Allow read-buffer operations from userland
        mov     pc, lr
  
 -      .section .text
 -
  /*
   * cpu_sa1100_proc_fin()
   *
@@@ -198,9 -200,6 +198,6 @@@ ENTRY(cpu_sa1100_do_resume
                     PMD_SECT_CACHEABLE | PMD_SECT_AP_WRITE
        b       cpu_resume_mmu
  ENDPROC(cpu_sa1100_do_resume)
- #else
- #define cpu_sa1100_do_suspend 0
- #define cpu_sa1100_do_resume  0
  #endif
  
        __CPUINIT
@@@ -234,59 -233,28 +231,28 @@@ sa1100_crval
        __INITDATA
  
  /*
-  * Purpose : Function pointers used to access above functions - all calls
-  *         come through these
-  */
- /*
   * SA1100 and SA1110 share the same function calls
   */
-       .type   sa1100_processor_functions, #object
- ENTRY(sa1100_processor_functions)
-       .word   v4_early_abort
-       .word   legacy_pabort
-       .word   cpu_sa1100_proc_init
-       .word   cpu_sa1100_proc_fin
-       .word   cpu_sa1100_reset
-       .word   cpu_sa1100_do_idle
-       .word   cpu_sa1100_dcache_clean_area
-       .word   cpu_sa1100_switch_mm
-       .word   cpu_sa1100_set_pte_ext
-       .word   cpu_sa1100_suspend_size
-       .word   cpu_sa1100_do_suspend
-       .word   cpu_sa1100_do_resume
-       .size   sa1100_processor_functions, . - sa1100_processor_functions
-       .section ".rodata"
  
-       .type   cpu_arch_name, #object
- cpu_arch_name:
-       .asciz  "armv4"
-       .size   cpu_arch_name, . - cpu_arch_name
+       @ define struct processor (see <asm/proc-fns.h> and proc-macros.S)
+       define_processor_functions sa1100, dabort=v4_early_abort, pabort=legacy_pabort, suspend=1
  
-       .type   cpu_elf_name, #object
- cpu_elf_name:
-       .asciz  "v4"
-       .size   cpu_elf_name, . - cpu_elf_name
-       .type   cpu_sa1100_name, #object
- cpu_sa1100_name:
-       .asciz  "StrongARM-1100"
-       .size   cpu_sa1100_name, . - cpu_sa1100_name
+       .section ".rodata"
  
-       .type   cpu_sa1110_name, #object
- cpu_sa1110_name:
-       .asciz  "StrongARM-1110"
-       .size   cpu_sa1110_name, . - cpu_sa1110_name
+       string  cpu_arch_name, "armv4"
+       string  cpu_elf_name, "v4"
+       string  cpu_sa1100_name, "StrongARM-1100"
+       string  cpu_sa1110_name, "StrongARM-1110"
  
        .align
  
        .section ".proc.info.init", #alloc, #execinstr
  
-       .type   __sa1100_proc_info,#object
- __sa1100_proc_info:
-       .long   0x4401a110
-       .long   0xfffffff0
+ .macro sa1100_proc_info name:req, cpu_val:req, cpu_mask:req, cpu_name:req
+       .type   __\name\()_proc_info,#object
+ __\name\()_proc_info:
+       .long   \cpu_val
+       .long   \cpu_mask
        .long   PMD_TYPE_SECT | \
                PMD_SECT_BUFFERABLE | \
                PMD_SECT_CACHEABLE | \
        .long   cpu_arch_name
        .long   cpu_elf_name
        .long   HWCAP_SWP | HWCAP_HALF | HWCAP_26BIT | HWCAP_FAST_MULT
-       .long   cpu_sa1100_name
+       .long   \cpu_name
        .long   sa1100_processor_functions
        .long   v4wb_tlb_fns
        .long   v4_mc_user_fns
        .long   v4wb_cache_fns
-       .size   __sa1100_proc_info, . - __sa1100_proc_info
+       .size   __\name\()_proc_info, . - __\name\()_proc_info
+ .endm
  
-       .type   __sa1110_proc_info,#object
- __sa1110_proc_info:
-       .long   0x6901b110
-       .long   0xfffffff0
-       .long   PMD_TYPE_SECT | \
-               PMD_SECT_BUFFERABLE | \
-               PMD_SECT_CACHEABLE | \
-               PMD_SECT_AP_WRITE | \
-               PMD_SECT_AP_READ
-       .long   PMD_TYPE_SECT | \
-               PMD_SECT_AP_WRITE | \
-               PMD_SECT_AP_READ
-       b       __sa1100_setup
-       .long   cpu_arch_name
-       .long   cpu_elf_name
-       .long   HWCAP_SWP | HWCAP_HALF | HWCAP_26BIT | HWCAP_FAST_MULT
-       .long   cpu_sa1110_name
-       .long   sa1100_processor_functions
-       .long   v4wb_tlb_fns
-       .long   v4_mc_user_fns
-       .long   v4wb_cache_fns
-       .size   __sa1110_proc_info, . - __sa1110_proc_info
+       sa1100_proc_info sa1100, 0x4401a110, 0xfffffff0, cpu_sa1100_name
+       sa1100_proc_info sa1110, 0x6901b110, 0xfffffff0, cpu_sa1110_name
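
The two sa1100_proc_info invocations emit the same __sa1100_proc_info and __sa1110_proc_info records as the open-coded versions they replace; at boot these records are matched against the main ID register. A rough C sketch of that match follows (the real lookup lives in assembly; the symbol and field names are taken from <asm/procinfo.h> and the linker script and are stated here as assumptions, not as part of this patch):

	#include <asm/procinfo.h>

	/* Sketch only: walk the .proc.info.init records emitted above and pick
	 * the one whose masked main ID register value matches the running CPU. */
	extern struct proc_info_list __proc_info_begin, __proc_info_end;

	static struct proc_info_list *find_proc_info(unsigned int cpuid)
	{
		struct proc_info_list *p;

		for (p = &__proc_info_begin; p < &__proc_info_end; p++)
			if ((cpuid & p->cpu_mask) == p->cpu_val)
				return p;	/* e.g. 0x4401a110 matches an SA-1100 */
		return NULL;
	}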
diff --combined arch/arm/mm/tlb-fa.S
@@@ -46,6 -46,7 +46,6 @@@ ENTRY(fa_flush_user_tlb_range
        add     r0, r0, #PAGE_SZ
        cmp     r0, r1
        blo     1b
 -      mcr     p15, 0, r3, c7, c5, 6           @ invalidate BTB
        mcr     p15, 0, r3, c7, c10, 4          @ data write barrier
        mov     pc, lr
  
@@@ -59,15 -60,12 +59,11 @@@ ENTRY(fa_flush_kern_tlb_range
        add     r0, r0, #PAGE_SZ
        cmp     r0, r1
        blo     1b
 -      mcr     p15, 0, r3, c7, c5, 6           @ invalidate BTB
        mcr     p15, 0, r3, c7, c10, 4          @ data write barrier
 -      mcr     p15, 0, r3, c7, c5, 4           @ prefetch flush
 +      mcr     p15, 0, r3, c7, c5, 4           @ prefetch flush (isb)
        mov     pc, lr
  
        __INITDATA
  
-       .type   fa_tlb_fns, #object
- ENTRY(fa_tlb_fns)
-       .long   fa_flush_user_tlb_range
-       .long   fa_flush_kern_tlb_range
-       .long   fa_tlb_flags
-       .size   fa_tlb_fns, . - fa_tlb_fns
+       /* define struct cpu_tlb_fns (see <asm/tlbflush.h> and proc-macros.S) */
+       define_tlb_functions fa, fa_tlb_flags
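
The removed open-coded table makes the intent of define_tlb_functions plain: it emits a struct cpu_tlb_fns holding the two range-flush entry points and the flags word. A hedged C-level equivalent, with field names per <asm/tlbflush.h> and the entry points being the assembly routines above:

	#include <asm/tlbflush.h>

	/* Sketch: what the generated fa_tlb_fns table amounts to at the C level. */
	struct cpu_tlb_fns fa_tlb_fns = {
		.flush_user_range = fa_flush_user_tlb_range,
		.flush_kern_range = fa_flush_kern_tlb_range,
		.tlb_flags        = fa_tlb_flags,
	};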
diff --combined arch/arm/mm/tlb-v6.S
@@@ -54,6 -54,7 +54,6 @@@ ENTRY(v6wbi_flush_user_tlb_range
        add     r0, r0, #PAGE_SZ
        cmp     r0, r1
        blo     1b
 -      mcr     p15, 0, ip, c7, c5, 6           @ flush BTAC/BTB
        mcr     p15, 0, ip, c7, c10, 4          @ data synchronization barrier
        mov     pc, lr
  
@@@ -82,15 -83,12 +82,11 @@@ ENTRY(v6wbi_flush_kern_tlb_range
        add     r0, r0, #PAGE_SZ
        cmp     r0, r1
        blo     1b
 -      mcr     p15, 0, r2, c7, c5, 6           @ flush BTAC/BTB
        mcr     p15, 0, r2, c7, c10, 4          @ data synchronization barrier
 -      mcr     p15, 0, r2, c7, c5, 4           @ prefetch flush
 +      mcr     p15, 0, r2, c7, c5, 4           @ prefetch flush (isb)
        mov     pc, lr
  
        __INIT
  
-       .type   v6wbi_tlb_fns, #object
- ENTRY(v6wbi_tlb_fns)
-       .long   v6wbi_flush_user_tlb_range
-       .long   v6wbi_flush_kern_tlb_range
-       .long   v6wbi_tlb_flags
-       .size   v6wbi_tlb_fns, . - v6wbi_tlb_fns
+       /* define struct cpu_tlb_fns (see <asm/tlbflush.h> and proc-macros.S) */
+       define_tlb_functions v6wbi, v6wbi_tlb_flags
diff --combined arch/arm/mm/tlb-v7.S
@@@ -48,6 -48,9 +48,6 @@@ ENTRY(v7wbi_flush_user_tlb_range
        add     r0, r0, #PAGE_SZ
        cmp     r0, r1
        blo     1b
 -      mov     ip, #0
 -      ALT_SMP(mcr     p15, 0, ip, c7, c1, 6)  @ flush BTAC/BTB Inner Shareable
 -      ALT_UP(mcr      p15, 0, ip, c7, c5, 6)  @ flush BTAC/BTB
        dsb
        mov     pc, lr
  ENDPROC(v7wbi_flush_user_tlb_range)
@@@ -72,6 -75,9 +72,6 @@@ ENTRY(v7wbi_flush_kern_tlb_range
        add     r0, r0, #PAGE_SZ
        cmp     r0, r1
        blo     1b
 -      mov     r2, #0
 -      ALT_SMP(mcr     p15, 0, r2, c7, c1, 6)  @ flush BTAC/BTB Inner Shareable
 -      ALT_UP(mcr      p15, 0, r2, c7, c5, 6)  @ flush BTAC/BTB
        dsb
        isb
        mov     pc, lr
@@@ -79,10 -85,5 +79,5 @@@ ENDPROC(v7wbi_flush_kern_tlb_range
  
        __INIT
  
-       .type   v7wbi_tlb_fns, #object
- ENTRY(v7wbi_tlb_fns)
-       .long   v7wbi_flush_user_tlb_range
-       .long   v7wbi_flush_kern_tlb_range
-       ALT_SMP(.long   v7wbi_tlb_flags_smp)
-       ALT_UP(.long    v7wbi_tlb_flags_up)
-       .size   v7wbi_tlb_fns, . - v7wbi_tlb_fns
+       /* define struct cpu_tlb_fns (see <asm/tlbflush.h> and proc-macros.S) */
+       define_tlb_functions v7wbi, v7wbi_tlb_flags_up, flags_smp=v7wbi_tlb_flags_smp
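
For v7 the flags entry differs between SMP and UP builds (TLB maintenance must be inner-shareable on SMP), so define_tlb_functions takes an optional flags_smp argument and emits ALT_SMP/ALT_UP data words that the SMP_ON_UP fixup patches at boot. A hedged C rendering of the net effect, reusing the symbol names from the removed lines above:

	/* Sketch: after the boot-time fixup, the table holds whichever flags
	 * word matches how the kernel is actually running. */
	v7wbi_tlb_fns.tlb_flags = is_smp() ? v7wbi_tlb_flags_smp
					   : v7wbi_tlb_flags_up;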
diff --combined arch/arm/vfp/vfpmodule.c
@@@ -33,6 -33,7 +33,6 @@@ void vfp_support_entry(void)
  void vfp_null_entry(void);
  
  void (*vfp_vector)(void) = vfp_null_entry;
 -union vfp_state *last_VFP_context[NR_CPUS];
  
  /*
   * Dual-use variable.
  unsigned int VFP_arch;
  
  /*
 + * The pointer to the vfpstate structure of the thread which currently
 + * owns the context held in the VFP hardware, or NULL if the hardware
 + * context is invalid.
 + *
 + * For UP, this is sufficient to tell which thread owns the VFP context.
 + * However, for SMP, we also need to check the CPU number stored in the
 + * saved state to catch migrations.
 + */
 +union vfp_state *vfp_current_hw_state[NR_CPUS];
 +
 +/*
 + * Is the thread's most up-to-date state stored in this CPU's hardware?
 + * Must be called from non-preemptible context.
 + */
 +static bool vfp_state_in_hw(unsigned int cpu, struct thread_info *thread)
 +{
 +#ifdef CONFIG_SMP
 +      if (thread->vfpstate.hard.cpu != cpu)
 +              return false;
 +#endif
 +      return vfp_current_hw_state[cpu] == &thread->vfpstate;
 +}
 +
 +/*
 + * Force a reload of the VFP context from the thread structure.  We do
 + * this by ensuring that access to the VFP hardware is disabled, and
 + * clearing vfp_current_hw_state.  Must be called from non-preemptible context.
 + */
 +static void vfp_force_reload(unsigned int cpu, struct thread_info *thread)
 +{
 +      if (vfp_state_in_hw(cpu, thread)) {
 +              fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN);
 +              vfp_current_hw_state[cpu] = NULL;
 +      }
 +#ifdef CONFIG_SMP
 +      thread->vfpstate.hard.cpu = NR_CPUS;
 +#endif
 +}
 +
 +/*
   * Per-thread VFP initialization.
   */
  static void vfp_thread_flush(struct thread_info *thread)
        union vfp_state *vfp = &thread->vfpstate;
        unsigned int cpu;
  
 -      memset(vfp, 0, sizeof(union vfp_state));
 -
 -      vfp->hard.fpexc = FPEXC_EN;
 -      vfp->hard.fpscr = FPSCR_ROUND_NEAREST;
 -
        /*
         * Disable VFP to ensure we initialize it first.  We must ensure
 -       * that the modification of last_VFP_context[] and hardware disable
 -       * are done for the same CPU and without preemption.
 +       * that the modification of vfp_current_hw_state[] and hardware
 +       * disable are done for the same CPU and without preemption.
 +       *
 +       * Do this first so that a preemption cannot save stale hardware
 +       * state over the values below, should VFP access still be enabled here.
         */
        cpu = get_cpu();
 -      if (last_VFP_context[cpu] == vfp)
 -              last_VFP_context[cpu] = NULL;
 +      if (vfp_current_hw_state[cpu] == vfp)
 +              vfp_current_hw_state[cpu] = NULL;
        fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN);
        put_cpu();
 +
 +      memset(vfp, 0, sizeof(union vfp_state));
 +
 +      vfp->hard.fpexc = FPEXC_EN;
 +      vfp->hard.fpscr = FPSCR_ROUND_NEAREST;
 +#ifdef CONFIG_SMP
 +      vfp->hard.cpu = NR_CPUS;
 +#endif
  }
  
  static void vfp_thread_exit(struct thread_info *thread)
        union vfp_state *vfp = &thread->vfpstate;
        unsigned int cpu = get_cpu();
  
 -      if (last_VFP_context[cpu] == vfp)
 -              last_VFP_context[cpu] = NULL;
 +      if (vfp_current_hw_state[cpu] == vfp)
 +              vfp_current_hw_state[cpu] = NULL;
        put_cpu();
  }
  
@@@ -129,9 -84,6 +129,9 @@@ static void vfp_thread_copy(struct thre
  
        vfp_sync_hwstate(parent);
        thread->vfpstate = parent->vfpstate;
 +#ifdef CONFIG_SMP
 +      thread->vfpstate.hard.cpu = NR_CPUS;
 +#endif
  }
  
  /*
@@@ -177,8 -129,17 +177,8 @@@ static int vfp_notifier(struct notifier
                 * case the thread migrates to a different CPU. The
                 * restoring is done lazily.
                 */
 -              if ((fpexc & FPEXC_EN) && last_VFP_context[cpu]) {
 -                      vfp_save_state(last_VFP_context[cpu], fpexc);
 -                      last_VFP_context[cpu]->hard.cpu = cpu;
 -              }
 -              /*
 -               * Thread migration, just force the reloading of the
 -               * state on the new CPU in case the VFP registers
 -               * contain stale data.
 -               */
 -              if (thread->vfpstate.hard.cpu != cpu)
 -                      last_VFP_context[cpu] = NULL;
 +              if ((fpexc & FPEXC_EN) && vfp_current_hw_state[cpu])
 +                      vfp_save_state(vfp_current_hw_state[cpu], fpexc);
  #endif
  
                /*
@@@ -454,7 -415,7 +454,7 @@@ static int vfp_pm_suspend(void
        }
  
        /* clear any information we had about last context state */
 -      memset(last_VFP_context, 0, sizeof(last_VFP_context));
 +      memset(vfp_current_hw_state, 0, sizeof(vfp_current_hw_state));
  
        return 0;
  }
@@@ -482,15 -443,15 +482,15 @@@ static void vfp_pm_init(void
  static inline void vfp_pm_init(void) { }
  #endif /* CONFIG_PM */
  
 +/*
 + * Ensure that the VFP state stored in 'thread->vfpstate' is up to date
 + * with the hardware state.
 + */
  void vfp_sync_hwstate(struct thread_info *thread)
  {
        unsigned int cpu = get_cpu();
  
 -      /*
 -       * If the thread we're interested in is the current owner of the
 -       * hardware VFP state, then we need to save its state.
 -       */
 -      if (last_VFP_context[cpu] == &thread->vfpstate) {
 +      if (vfp_state_in_hw(cpu, thread)) {
                u32 fpexc = fmrx(FPEXC);
  
                /*
        put_cpu();
  }
  
 +/* Ensure that the thread reloads the hardware VFP state on the next use. */
  void vfp_flush_hwstate(struct thread_info *thread)
  {
        unsigned int cpu = get_cpu();
  
 -      /*
 -       * If the thread we're interested in is the current owner of the
 -       * hardware VFP state, then we need to save its state.
 -       */
 -      if (last_VFP_context[cpu] == &thread->vfpstate) {
 -              u32 fpexc = fmrx(FPEXC);
 -
 -              fmxr(FPEXC, fpexc & ~FPEXC_EN);
 -
 -              /*
 -               * Set the context to NULL to force a reload the next time
 -               * the thread uses the VFP.
 -               */
 -              last_VFP_context[cpu] = NULL;
 -      }
 +      vfp_force_reload(cpu, thread);
  
 -#ifdef CONFIG_SMP
 -      /*
 -       * For SMP we still have to take care of the case where the thread
 -       * migrates to another CPU and then back to the original CPU on which
 -       * the last VFP user is still the same thread. Mark the thread VFP
 -       * state as belonging to a non-existent CPU so that the saved one will
 -       * be reloaded in the above case.
 -       */
 -      thread->vfpstate.hard.cpu = NR_CPUS;
 -#endif
        put_cpu();
  }
  
@@@ -529,7 -513,8 +529,7 @@@ static int vfp_hotplug(struct notifier_
        void *hcpu)
  {
        if (action == CPU_DYING || action == CPU_DYING_FROZEN) {
 -              unsigned int cpu = (long)hcpu;
 -              last_VFP_context[cpu] = NULL;
 +              vfp_force_reload((long)hcpu, current_thread_info());
        } else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN)
                vfp_enable(NULL);
        return NOTIFY_OK;
@@@ -597,7 -582,6 +597,6 @@@ static int __init vfp_init(void
                                elf_hwcap |= HWCAP_VFPv3D16;
                }
  #endif
- #ifdef CONFIG_NEON
                /*
                 * Check for the presence of the Advanced SIMD
                 * load/store instructions, integer and single
                 * for NEON if the hardware has the MVFR registers.
                 */
                if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
+ #ifdef CONFIG_NEON
                        if ((fmrx(MVFR1) & 0x000fff00) == 0x00011100)
                                elf_hwcap |= HWCAP_NEON;
-               }
  #endif
+                       if ((fmrx(MVFR1) & 0xf0000000) == 0x10000000)
+                               elf_hwcap |= HWCAP_VFPv4;
+               }
        }
        return 0;
  }
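
The reworked tail of vfp_init() keys both the NEON hwcap and the new VFPv4 hwcap off MVFR1, and only when the main ID register's architecture field (bits [19:16]) reads 0xf, i.e. the hardware implements the MVFR registers. As a hedged restatement of the two masks, with the field interpretation taken from the ARM architecture manual rather than from this patch:

	/* Sketch: the same checks as above with the MVFR1 fields spelled out. */
	u32 mvfr1 = fmrx(MVFR1);

	/* Bits [19:8]: A_SIMD load/store, integer and single-precision fields;
	 * all three reading 1 means the full NEON instruction set is present. */
	bool has_neon  = (mvfr1 & 0x000fff00) == 0x00011100;

	/* Bits [31:28]: fused multiply-accumulate field; 1 indicates VFPv4. */
	bool has_vfpv4 = (mvfr1 & 0xf0000000) == 0x10000000;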