[Click] Multithreading bug.

Joonwoo Park joonwpark81 at gmail.com
Fri Sep 21 23:47:16 EDT 2007


Hi Norbert,
Would you like to try this patch?
Please let me know whether it improves things.
(Please forgive my crude, coarse-grained spinlock.)

---
 linuxmodule/skbmgr.cc |   26 ++++++++++++++------------
 1 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/linuxmodule/skbmgr.cc b/linuxmodule/skbmgr.cc
index 1b78d9e..560b308 100644
--- a/linuxmodule/skbmgr.cc
+++ b/linuxmodule/skbmgr.cc
@@ -22,6 +22,7 @@
 #include <click/glue.hh>
 #include <click/atomic.hh>
 #include <click/skbmgr.hh>
+#include <click/sync.hh>

 #include <click/cxxprotect.h>
 CLICK_CXX_PROTECT
@@ -73,7 +74,7 @@ class RecycledSkbPool { public:
  private:

   RecycledSkbBucket _buckets[NBUCKETS];
-  volatile unsigned long _lock;
+  Spinlock* _lock;
 #if __MTCLICK__
   int _last_producer;
   atomic_uint32_t _consumers;
@@ -152,23 +153,20 @@ RecycledSkbBucket::deq()
 inline void
 RecycledSkbPool::lock()
 {
-  while (test_and_set_bit(0, &_lock)) {
-    while (_lock)
-      /* nothing */;
-  }
+  _lock->acquire();
 }

 inline void
 RecycledSkbPool::unlock()
 {
-  clear_bit(0, &_lock);
+  _lock->release();
 }


 void
 RecycledSkbPool::initialize()
 {
-  _lock = 0;
+  _lock = new Spinlock;
   for (int i = 0; i < NBUCKETS; i++)
     _buckets[i].initialize();
 #if __MTCLICK__
@@ -199,6 +197,9 @@ RecycledSkbPool::cleanup()
 	    _freed, _recycle_freed, _allocated, _recycle_allocated);
 #endif
   unlock();
+
+  delete _lock;
+  _lock = NULL;
 }

 inline RecycledSkbBucket &
@@ -286,7 +287,6 @@ RecycledSkbPool::recycle(struct sk_buff *skbs)

     // try to put in that bucket
     if (bucket >= 0) {
-      lock();
       int tail = _buckets[bucket]._tail;
       int next = _buckets[bucket].next_i(tail);
       if (next != _buckets[bucket]._head) {
@@ -300,7 +300,6 @@ RecycledSkbPool::recycle(struct sk_buff *skbs)
         _recycle_freed++;
 #endif
       }
-      unlock();
     }

     // if not taken care of, then free it
@@ -323,7 +322,6 @@ RecycledSkbPool::allocate(unsigned headroom, unsigned size, int want, int *store
   int got = 0;

   if (bucket >= 0) {
-    lock();
     RecycledSkbBucket &buck = _buckets[bucket];
     while (got < want && !buck.empty()) {
       struct sk_buff *skb = _buckets[bucket].deq();
@@ -335,7 +333,6 @@ RecycledSkbPool::allocate(unsigned headroom, unsigned size, int want, int *store
       prev = &skb->next;
       got++;
     }
-    unlock();
   }

   size = size_to_higher_bucket_size(headroom + size);
@@ -394,6 +391,7 @@ skbmgr_allocate_skbs(unsigned headroom, unsigned size, int *want)
   int bucket = RecycledSkbPool::size_to_higher_bucket(headroom + size);

   int w = *want;
+  pool[producer].lock();
   if (pool[producer].bucket(bucket).size() < w) {
     if (pool[cpu]._last_producer < 0 ||
 	pool[pool[cpu]._last_producer].bucket(bucket).size() < w)
@@ -401,7 +399,9 @@ skbmgr_allocate_skbs(unsigned headroom, unsigned size, int *want)
     if (pool[cpu]._last_producer >= 0)
       producer = pool[cpu]._last_producer;
   }
-  return pool[producer].allocate(headroom, size, w, want);
+  sk_buff* skb = pool[producer].allocate(headroom, size, w, want);
+  pool[producer].unlock();
+  return skb;
 #else
   return pool.allocate(headroom, size, *want, want);
 #endif
@@ -412,7 +412,9 @@ skbmgr_recycle_skbs(struct sk_buff *skbs)
 {
 #if __MTCLICK__
   int cpu = click_current_processor();
+  pool[cpu].lock();
   pool[cpu].recycle(skbs);
+  pool[cpu].unlock();
 #else
   pool.recycle(skbs);
 #endif
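
For reference, here is a small stand-alone user-space sketch of the same locking discipline: one coarse spinlock per pool, taken by the callers around the whole recycle/allocate operation, as in the hunks above. It is not Click code. MiniSpinlock is just a stand-in for Click's Spinlock (acquire()/release()), and the pool is reduced to a counter, so it compiles on its own with g++ -pthread.

/* Stand-alone sketch only; not part of the patch.  It mimics the
 * discipline above: each pool has one coarse spinlock, and the callers
 * (recycler/allocator threads) hold it around the whole operation.
 * MiniSpinlock is a hypothetical stand-in for Click's Spinlock. */
#include <pthread.h>
#include <cstdio>

class MiniSpinlock {
    pthread_spinlock_t _s;
  public:
    MiniSpinlock()   { pthread_spin_init(&_s, PTHREAD_PROCESS_PRIVATE); }
    ~MiniSpinlock()  { pthread_spin_destroy(&_s); }
    void acquire()   { pthread_spin_lock(&_s); }
    void release()   { pthread_spin_unlock(&_s); }
};

struct MiniPool {
    MiniSpinlock _lock;
    long _items;
    MiniPool() : _items(0) {}
    void lock()      { _lock.acquire(); }
    void unlock()    { _lock.release(); }
    void recycle()   { ++_items; }      // stands in for RecycledSkbPool::recycle()
    bool allocate()  { if (_items > 0) { --_items; return true; } return false; }
};

static MiniPool pool;
static const int ROUNDS = 1000000;

static void *recycler(void *)           // like skbmgr_recycle_skbs()
{
    for (int i = 0; i < ROUNDS; i++) {
        pool.lock();                    // caller takes the pool lock ...
        pool.recycle();                 // ... around the whole operation
        pool.unlock();
    }
    return 0;
}

static void *allocator(void *)          // like skbmgr_allocate_skbs()
{
    long got = 0;
    for (int i = 0; i < ROUNDS; i++) {
        pool.lock();
        if (pool.allocate())
            got++;
        pool.unlock();
    }
    printf("allocator got %ld\n", got);
    return 0;
}

int main()
{
    pthread_t a, b;
    pthread_create(&a, 0, recycler, 0);
    pthread_create(&b, 0, allocator, 0);
    pthread_join(a, 0);
    pthread_join(b, 0);
    printf("left in pool: %ld (got + left == %d)\n", pool._items, ROUNDS);
    return 0;
}
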
-

Joonwoo Park (Jason Park)


2007/9/21, Egi, Norbert <n.egi at lancaster.ac.uk>:
> Hi,
>
> We've been testing how CPU affinity affects the performance of packet forwarding in Click, using a single forwarding path with two interfaces and two threads on a 4-core machine (two dual-core Xeon 5150 CPUs). We are using the recent 1.6.0 release with the 2.6.19.2 kernel.
>
> With StaticThreadSched we assigned PollDevice to one thread and ToDevice to the other, and used taskset to tie the threads to cores. When the threads were tied to cores on the same CPU things went quite well; at least we didn't experience any problems. However, when the threads were tied to cores on separate CPUs, the system crashed almost every time after forwarding a few million packets, with the following report in dmesg:
>
> BUG: soft lockup detected on CPU#3!
>  [<c0133b03>] softlockup_tick+0xa6/0xb3
>  [<c0120dd0>] update_process_times+0x49/0x75
>  [<c010ce48>] smp_apic_timer_interrupt+0x98/0xb4
>  [<c0103677>] apic_timer_interrupt+0x1f/0x24
>  [<f8e5fb1c>] _ZN15RecycledSkbPool7recycleEP7sk_buff+0x78/0xf0 [click]
>  [<f8e5fbc1>] _Z19skbmgr_recycle_skbsP7sk_buff+0x2d/0x32 [click]
>  [<f8e0b74c>] _ZN8ToDevice8run_taskEP4Task+0x2b4/0x3ec [click]
>  [<f8d8872b>] _ZN12RouterThread6driverEv+0x245/0x36a [click]
>  [<f8e5c5ff>] _Z11click_schedPv+0xd9/0x1d6 [click]
>  [<f8e5c526>] _Z11click_schedPv+0x0/0x1d6 [click]
>  [<c01037a3>] kernel_thread_helper+0x7/0x10
>  =======================
> BUG: soft lockup detected on CPU#1!
>  [<c0133b03>] softlockup_tick+0xa6/0xb3
>  [<c0120dd0>] update_process_times+0x49/0x75
>  [<c02f3537>] dev_watchdog+0x0/0xc0
>  [<c010ce48>] smp_apic_timer_interrupt+0x98/0xb4
>  [<c0103677>] apic_timer_interrupt+0x1f/0x24
>  [<c02f3537>] dev_watchdog+0x0/0xc0
>  [<c036851b>] _spin_lock+0xa/0xf
>  [<c02f354a>] dev_watchdog+0x13/0xc0
>  [<c0120d24>] run_timer_softirq+0x101/0x164
>  [<c011cb76>] __do_softirq+0x5e/0xc0
>  [<c011cc0e>] do_softirq+0x36/0x3a
>  [<c010ce4d>] smp_apic_timer_interrupt+0x9d/0xb4
>  [<c0103677>] apic_timer_interrupt+0x1f/0x24
>  [<f8e5f99d>] _ZN15RecycledSkbPool8allocateEjjiPi+0xa9/0x1b0 [click]
>  [<f885b898>] e1000_watchdog_1+0x11e/0x6d9 [e1000]
>  [<f8e5fea6>] _Z20skbmgr_allocate_skbsjjPi+0x1c6/0x230 [click]
>  [<f8e0876f>] _ZN10PollDevice8run_taskEP4Task+0x63/0x2be [click]
>  [<f8d8872b>] _ZN12RouterThread6driverEv+0x245/0x36a [click]
>  [<f8e69656>] _ZN6VectorIiE7reserveEi+0x30/0x88 [click]
>  [<f8e5c5ff>] _Z11click_schedPv+0xd9/0x1d6 [click]
>  [<f8e5c526>] _Z11click_schedPv+0x0/0x1d6 [click]
>  [<c01037a3>] kernel_thread_helper+0x7/0x10
>  =======================
>
>
> We thought there might be some locking issues, so we put the second while loop in the 'struct sk_buff *RecycledSkbPool::allocate()' function (linuxmodule/skbmgr.cc) under the lock as well. That was a fairly random act, but it seemed to help a bit; however, we still experience crashes with the same bug report, just a bit less frequently.
>
> The diff of the skbmgr.cc is simply this:
>
> 342d341
> <   lock();
> 357d355
> <   unlock();
>
>
> The script we used to test the setup looks as follows, with traffic arriving only on eth2 and leaving on eth4 (the upper forwarding path):
>
> pd0 :: PollDevice(eth2,PROMISC true) -> MSQueue -> td0 :: ToDevice(eth4);
>
> pd1 :: PollDevice(eth4,PROMISC true) -> MSQueue -> td1 :: ToDevice(eth2);
>
> StaticThreadSched(pd0 0,td0 1,pd1 1,td1 0);
>
>
> Unfortunately, we don't know much about how the SMP stuff is implemented in Click, so we are hoping to get responses on where to look or how to fix this bug.
>
> By the way, here is the dmesg bug report from an earlier Click CVS version on kernel 2.6.16.13:
>
> BUG: soft lockup detected on CPU#2!
> CPU 2:
> Modules linked in: click proclikefs e1000
> Pid: 8881, comm: kclick Not tainted 2.6.16.13 #1
> RIP: 0010:[<ffffffff880c1df4>] <ffffffff880c1df4>{:click:_ZN15RecycledSkbPool7recycleEP7sk_buff+84}
> RSP: 0000:ffff81007e535e28  EFLAGS: 00000202
> RAX: 0000000000000001 RBX: 0000000000000002 RCX: ffff81007dc4f980
> RDX: 0000000000000001 RSI: ffff81007f1c6ec0 RDI: ffff81007f1c6ec0
> RBP: ffffffff881ff6b0 R08: ffff81007bda9000 R09: ffff81007f1c6ec0
> R10: 0000000000000002 R11: ffffffff881ff6b0 R12: ffff81007dc4f980
> R13: 0000000000001d60 R14: 000000000000003c R15: ffffc2000000c000
> FS:  0000000000000000(0000) GS:ffff810002f72240(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> CR2: 00002ab7767939e0 CR3: 0000000000101000 CR4: 00000000000006e0
>
> Call Trace: <ffffffff880895ed>{:click:_ZN8ToDevice8run_taskEP4Task+237}
>       <ffffffff88047756>{:click:_ZN12RouterThread6driverEv+742}
>       <ffffffff880bed66>{:click:_Z11click_schedPv+310} <ffffffff8010b5b6>{child_rip+8}
>       <ffffffff880ca330>{:click:_ZNK16BaseErrorHandler7nerrorsEv+0}
>       <ffffffff880bec30>{:click:_Z11click_schedPv+0} <ffffffff8010b5ae>{child_rip+0}
> BUG: soft lockup detected on CPU#3!
> CPU 3:
> Modules linked in: click proclikefs e1000
> Pid: 8880, comm: kclick Not tainted 2.6.16.13 #1
> RIP: 0010:[<ffffffff8038aca4>] <ffffffff8038aca4>{.text.lock.spinlock+5}
> RSP: 0018:ffff810002f9bf00  EFLAGS: 00000286
> RAX: ffff81007e533fd8 RBX: ffff810002f9be58 RCX: ffff81007bda9228
> RDX: ffff810002f9bf18 RSI: ffff810002c2a758 RDI: ffff81007bda9200
> RBP: ffffffff8010ac64 R08: 0000000000000003 R09: 0000000000000040
> R10: 0000000000000040 R11: 0000000000000002 R12: ffffffff80317c06
> R13: ffff81007bda9000 R14: 0000000000000003 R15: ffffffff8010ca6b
> FS:  0000000000000000(0000) GS:ffff810002f52b40(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> CR2: 00002b0c0829da90 CR3: 0000000000101000 CR4: 00000000000006e0
>
> Call Trace: <IRQ> <ffffffff80317c16>{dev_watchdog+16}
>       <ffffffff80130840>{run_timer_softirq+355} <ffffffff80122f36>{scheduler_tick+137}
>       <ffffffff8012d0ec>{__do_softirq+80} <ffffffff8010b906>{call_softirq+30}
>       <ffffffff8010caa9>{do_softirq+47} <ffffffff8010b2aa>{apic_timer_interrupt+98} <EOI>
>       <ffffffff880c1f6b>{:click:_ZN15RecycledSkbPool8allocateEjjiPi+187}
>       <ffffffff88086e1f>{:click:_ZN10PollDevice8run_taskEP4Task+111}
>       <ffffffff88047756>{:click:_ZN12RouterThread6driverEv+742}
>       <ffffffff880bed66>{:click:_Z11click_schedPv+310} <ffffffff8010b5b6>{child_rip+8}
>       <ffffffff880ca330>{:click:_ZNK16BaseErrorHandler7nerrorsEv+0}
>       <ffffffff880bec30>{:click:_Z11click_schedPv+0} <ffffffff8010b5ae>{child_rip+0}
>
>
> It looks even nastier than the previous one, but it might help to identify the bug.
>
>
>
> Regards,
>
> Norbert
>
>
> _______________________________________________
> click mailing list
> click at amsterdam.lcs.mit.edu
> https://amsterdam.lcs.mit.edu/mailman/listinfo/click
>

