}
}
-static inline void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask)
+static void spitfire_xcall_deliver(struct trap_per_cpu *tb, int cnt)
{
+ u64 *mondo, data0, data1, data2;
+ u16 *cpu_list;
u64 pstate;
int i;
__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
- for_each_cpu_mask_nr(i, *mask)
- spitfire_xcall_helper(data0, data1, data2, pstate, i);
+ cpu_list = __va(tb->cpu_list_pa);
+ mondo = __va(tb->cpu_mondo_block_pa);
+ data0 = mondo[0];
+ data1 = mondo[1];
+ data2 = mondo[2];
+ for (i = 0; i < cnt; i++)
+ spitfire_xcall_helper(data0, data1, data2, pstate, cpu_list[i]);
}
/* Cheetah now allows to send the whole 64-bytes of data in the interrupt
* packet, but we have no use for that. However we do take advantage of
* the new pipelining feature (ie. dispatch to multiple cpus simultaneously).
*/
-static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask_p)
+static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt)
{
- u64 pstate, ver, busy_mask;
int nack_busy_id, is_jbus, need_more;
- cpumask_t mask;
-
- if (cpus_empty(*mask_p))
- return;
+ u64 *mondo, pstate, ver, busy_mask;
+ u16 *cpu_list;
- mask = *mask_p;
+ cpu_list = __va(tb->cpu_list_pa);
+ mondo = __va(tb->cpu_mondo_block_pa);
/* Unfortunately, someone at Sun had the brilliant idea to make the
* busy/nack fields hard-coded by ITID number for this Ultra-III
"stxa %2, [%5] %6\n\t"
"membar #Sync\n\t"
: /* no outputs */
- : "r" (data0), "r" (data1), "r" (data2),
+ : "r" (mondo[0]), "r" (mondo[1]), "r" (mondo[2]),
"r" (0x40), "r" (0x50), "r" (0x60),
"i" (ASI_INTR_W));
{
int i;
- for_each_cpu_mask_nr(i, mask) {
- u64 target = (i << 14) | 0x70;
+ for (i = 0; i < cnt; i++) {
+ u64 target, nr;
+
+ nr = cpu_list[i];
+ if (nr == 0xffff)
+ continue;
+ target = (nr << 14) | 0x70;
if (is_jbus) {
- busy_mask |= (0x1UL << (i * 2));
+ busy_mask |= (0x1UL << (nr * 2));
} else {
target |= (nack_busy_id << 24);
busy_mask |= (0x1UL <<
__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
: : "r" (pstate));
if (unlikely(need_more)) {
- int i, cnt = 0;
- for_each_cpu_mask_nr(i, mask) {
- cpu_clear(i, mask);
- cnt++;
- if (cnt == 32)
+ int i, this_cnt = 0;
+ for (i = 0; i < cnt; i++) {
+ if (cpu_list[i] == 0xffff)
+ continue;
+ cpu_list[i] = 0xffff;
+ this_cnt++;
+ if (this_cnt == 32)
break;
}
goto retry;
/* Clear out the mask bits for cpus which did not
* NACK us.
*/
- for_each_cpu_mask_nr(i, mask) {
- u64 check_mask;
+ for (i = 0; i < cnt; i++) {
+ u64 check_mask, nr;
+
+ nr = cpu_list[i];
+ if (nr == 0xffff)
+ continue;
if (is_jbus)
- check_mask = (0x2UL << (2*i));
+ check_mask = (0x2UL << (2*nr));
else
check_mask = (0x2UL <<
this_busy_nack);
if ((dispatch_stat & check_mask) == 0)
- cpu_clear(i, mask);
+ cpu_list[i] = 0xffff;
this_busy_nack += 2;
if (this_busy_nack == 64)
break;
}
/* Multi-cpu list version. */
-static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask)
+static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
{
- int cnt, retries, this_cpu, prev_sent, i;
+ int retries, this_cpu, prev_sent, i;
unsigned long status;
cpumask_t error_mask;
- struct trap_per_cpu *tb;
u16 *cpu_list;
- u64 *mondo;
-
- if (cpus_empty(*mask))
- return;
this_cpu = smp_processor_id();
- tb = &trap_block[this_cpu];
-
- mondo = __va(tb->cpu_mondo_block_pa);
- mondo[0] = data0;
- mondo[1] = data1;
- mondo[2] = data2;
- wmb();
cpu_list = __va(tb->cpu_list_pa);
- /* Setup the initial cpu list. */
- cnt = 0;
- for_each_cpu_mask_nr(i, *mask)
- cpu_list[cnt++] = i;
-
cpus_clear(error_mask);
retries = 0;
prev_sent = 0;
printk("]\n");
}
-static void (*xcall_deliver_impl)(u64, u64, u64, const cpumask_t *);
+static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask)
{
+ struct trap_per_cpu *tb;
+ int this_cpu, i, cnt;
unsigned long flags;
+ u16 *cpu_list;
+ u64 *mondo;
/* We have to do this whole thing with interrupts fully disabled.
* Otherwise if we send an xcall from interrupt context it will
* Fortunately, udelay() uses %stick/%tick so we can use that.
*/
local_irq_save(flags);
- xcall_deliver_impl(data0, data1, data2, mask);
+
+ this_cpu = smp_processor_id();
+ tb = &trap_block[this_cpu];
+
+ mondo = __va(tb->cpu_mondo_block_pa);
+ mondo[0] = data0;
+ mondo[1] = data1;
+ mondo[2] = data2;
+ wmb();
+
+ cpu_list = __va(tb->cpu_list_pa);
+
+ /* Setup the initial cpu list. */
+ cnt = 0;
+ for_each_cpu_mask_nr(i, *mask) {
+ if (i == this_cpu || !cpu_online(i))
+ continue;
+ cpu_list[cnt++] = i;
+ }
+
+ if (cnt)
+ xcall_deliver_impl(tb, cnt);
+
local_irq_restore(flags);
}