Linux kernel panic at run_posix_cpu_timers+0xa4

最近一台主机突然内核崩溃后, 系统通过 kdump 服务捕获了 vmcore 等文件信息, 如下所示从堆栈的信息来看, 系统在进行 xfs 相关操作的时候进行了计时器中断(apic_timer_interrupt), 中断程序在更新进程的使用时间(update_process_times)的过程中, 出现了空指针(RIP: run_posix_cpu_timers+164)而造成内核崩溃:

crash> sys
      KERNEL: /usr/lib/debug/lib/modules/3.10.0-327.18.2.el7.x86_64/vmlinux
    DUMPFILE: vmcore  [PARTIAL DUMP]
        CPUS: 40
    NODENAME: czhost
     MACHINE: x86_64  (2197 Mhz)
      MEMORY: 127.9 GB
       PANIC: "BUG: unable to handle kernel NULL pointer dereference at 0000000000000140"
crash> 
crash> bt
PID: 80589  TASK: ffff8807deb08000  CPU: 0   COMMAND: "commserver"
 #0 [ffff88103f203a38] machine_kexec at ffffffff81051beb
 #1 [ffff88103f203a98] crash_kexec at ffffffff810f2782
 #2 [ffff88103f203b68] oops_end at ffffffff8163ea48
 #3 [ffff88103f203b90] no_context at ffffffff8162eb28
 #4 [ffff88103f203be0] __bad_area_nosemaphore at ffffffff8162ebbe
 #5 [ffff88103f203c28] bad_area_nosemaphore at ffffffff8162ed28
 #6 [ffff88103f203c38] __do_page_fault at ffffffff8164184e
 #7 [ffff88103f203c98] do_page_fault at ffffffff816419e3
 #8 [ffff88103f203cc0] page_fault at ffffffff8163dc48
    [exception RIP: run_posix_cpu_timers+164]
    RIP: ffffffff810a86f4  RSP: ffff88103f203d70  RFLAGS: 00010046
    RAX: 0000000000000000  RBX: ffffffff81a684e0  RCX: ffffffff81a68a70
    RDX: ffff88103f203db8  RSI: ffff88103f203da0  RDI: ffffffff81a684e0
    RBP: ffff88103f203e00   R8: ffffffff81a68a78   R9: ffff88103f203d38
    R10: 0000000000000000  R11: 0000000000000005  R12: ffff88103f203da8
    R13: ffffffff819a5f80  R14: ffff88103f20dfa0  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #9 [ffff88103f203e08] update_process_times at ffffffff8108e8ee
#10 [ffff88103f203e68] update_process_times at ffffffff8108e8c7
#11 [ffff88103f203e90] tick_sched_handle at ffffffff810e0825
#12 [ffff88103f203eb0] tick_sched_timer at ffffffff810e08a1
#13 [ffff88103f203ed8] __hrtimer_run_queues at ffffffff810a9d42
#14 [ffff88103f203f30] hrtimer_interrupt at ffffffff810aa2e0
#15 [ffff88103f203f80] local_apic_timer_interrupt at ffffffff81049537
#16 [ffff88103f203f98] smp_apic_timer_interrupt at ffffffff8164874f
#17 [ffff88103f203fb0] apic_timer_interrupt at ffffffff81646e1d
--- <IRQ stack> ---
#18 [ffff88046102fad8] apic_timer_interrupt at ffffffff81646e1d
    [exception RIP: xfs_log_reserve+53]
    RIP: ffffffffa01e10f5  RSP: ffff88046102fb88  RFLAGS: 00000246
    RAX: 00000000fffffffb  RBX: ffffea0004066b40  RCX: ffff880002f35980
    RDX: 0000000000000000  RSI: 0000000000000198  RDI: ffff882025dd9000
    RBP: ffff88046102fbb0   R8: 0000000000000069   R9: 0000000000000000
    R10: ffff88046102ffd8  R11: 0000000000000293  R12: ffff88046102fb28
    R13: 0000000000000246  R14: ffff88000ee3d820  R15: ffff88000ee3d808
    ORIG_RAX: ffffffffffffff10  CS: 0010  SS: 0018
#19 [ffff88046102fbb8] xfs_trans_reserve at ffffffffa01dbad5 [xfs]
#20 [ffff88046102fc00] xfs_vn_update_time at ffffffffa01cd416 [xfs]
#21 [ffff88046102fc40] update_time at ffffffff811f9c55
#22 [ffff88046102fc70] file_update_time at ffffffff811f9f00
#23 [ffff88046102fcb0] xfs_file_aio_write_checks at ffffffffa01c492d [xfs]
#24 [ffff88046102fd18] xfs_file_buffered_aio_write at ffffffffa01c4a23 [xfs]
#25 [ffff88046102fdc8] xfs_file_aio_write at ffffffffa01c4cc0 [xfs]
#26 [ffff88046102fe20] do_sync_write at ffffffff811de02d
#27 [ffff88046102fef8] vfs_write at ffffffff811de84d
#28 [ffff88046102ff38] sys_write at ffffffff811df2ef
#29 [ffff88046102ff80] system_call_fastpath at ffffffff816461c9
    RIP: 00007fd7480bf51d  RSP: 00007fd74511dac8  RFLAGS: 00010297
    RAX: 0000000000000001  RBX: ffffffff816461c9  RCX: 00000000fffffffc
    RDX: 0000000000000059  RSI: 00007fd749415000  RDI: 0000000000000169
    RBP: 00007fd749415000   R8: 00007fd7403c53d0   R9: 00007fd74511e700
    R10: 3030302d30303030  R11: 0000000000000293  R12: 0000000021e4f8cc
    R13: 0000000000000059  R14: 00007fd7403c52f0  R15: 0000000000000059
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b


crash> log | tail -n 61
[16758125.174987] BUG: unable to handle kernel NULL pointer dereference at 0000000000000140
[16758125.175089] IP: [<ffffffff810a86f4>] run_posix_cpu_timers+0xa4/0x840
[16758125.175165] PGD 2d6359067 PUD 861340067 PMD 0 
[16758125.175211] Oops: 0000 [#1] SMP 
......

从函数 run_posix_cpu_timers 的执行过程来看, 最后终止在 kernel/posix-cpu-timers.c 程序的 1292 行代码, 函数中的 tsk 进程信息则由函数 update_process_times 传递, 如下所示,

crash> dis -rl run_posix_cpu_timers+164
....
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/posix-cpu-timers.c: 1291
0xffffffff810a86ed <run_posix_cpu_timers+157>:  mov    0x768(%rbx),%r15
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/posix-cpu-timers.c: 1292
0xffffffff810a86f4 <run_posix_cpu_timers+164>:  mov    0x140(%r15),%ecx
crash> gdb list 1290
1273 static inline int fastpath_timer_check(struct task_struct *tsk)
1274 {
1275         struct signal_struct *sig;
1276         cputime_t utime, stime;
1277 
1278         task_cputime(tsk, &utime, &stime);
1279 
1280         if (!task_cputime_zero(&tsk->cputime_expires)) {
1281                 struct task_cputime task_sample = {
1282                         .utime = utime,
1283                         .stime = stime,
1284                         .sum_exec_runtime = tsk->se.sum_exec_runtime
1285                    };
1286    
1287                    if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1288                            return 1;
1289            }
1290    
1291            sig = tsk->signal;
1292            if (sig->cputimer.running) {
1293                    struct task_cputime group_sample;
1294    

crash> dis -rl ffffffff8108e8c7
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1370
......
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/arch/x86/include/asm/current.h: 14
0xffffffff8108e89c <update_process_times+28>:   mov    %gs:0xb7c0,%rbx

//source/kernel/timer.h
1369 void update_process_times(int user_tick)
1370 {
1371         struct task_struct *p = current;
1372         int cpu = smp_processor_id();
1373 
1374         /* Note: this timer irq context must be accounted for as well. */
1375         account_process_tick(p, user_tick);
1376         run_local_timers();
1377         rcu_check_callbacks(cpu, user_tick);
1378 #ifdef CONFIG_IRQ_WORK
1379         if (in_irq())
1380                 irq_work_tick();
1381 #endif
1382         scheduler_tick();
1383         run_posix_cpu_timers(p);
1384 }

//source/arch/x86/include/asm/current.h
  8 struct task_struct;
  9 
 10 DECLARE_PER_CPU(struct task_struct *, current_task);
 11 
 12 static __always_inline struct task_struct *get_current(void)
 13 {
 14         return this_cpu_read_stable(current_task);
 15 }

crash> dis -rl ffffffff8108e8ee
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1370
...
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/arch/x86/include/asm/current.h: 14
0xffffffff8108e89c <update_process_times+28>:   mov    %gs:0xb7c0,%rbx
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1375
0xffffffff8108e8a5 <update_process_times+37>:   mov    %rbx,%rdi
0xffffffff8108e8a8 <update_process_times+40>:   callq  0xffffffff810bbec0 <account_process_tick>
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1402
0xffffffff8108e8ad <update_process_times+45>:   callq  0xffffffff810aa670 <hrtimer_run_queues>
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1403
0xffffffff8108e8b2 <update_process_times+50>:   mov    $0x1,%edi
0xffffffff8108e8b7 <update_process_times+55>:   callq  0xffffffff81084f30 <raise_softirq>
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1377
0xffffffff8108e8bc <update_process_times+60>:   mov    %r12d,%esi
0xffffffff8108e8bf <update_process_times+63>:   mov    %r13d,%edi
0xffffffff8108e8c2 <update_process_times+66>:   callq  0xffffffff81126690 <rcu_check_callbacks>
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/arch/x86/include/asm/thread_info.h: 211
0xffffffff8108e8c7 <update_process_times+71>:   mov    %gs:0xb7b8,%rax
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1379
0xffffffff8108e8d0 <update_process_times+80>:   testl  $0x3ff0000,-0x3fbc(%rax)
0xffffffff8108e8da <update_process_times+90>:   je     0xffffffff8108e8e1 <update_process_times+97>
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1380
0xffffffff8108e8dc <update_process_times+92>:   callq  0xffffffff81156070 <irq_work_tick>
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1382
0xffffffff8108e8e1 <update_process_times+97>:   callq  0xffffffff810b75c0 <scheduler_tick>
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1383
0xffffffff8108e8e6 <update_process_times+102>:  mov    %rbx,%rdi     // task_struct variable
0xffffffff8108e8e9 <update_process_times+105>:  callq  0xffffffff810a8650 <run_posix_cpu_timers>
/usr/src/debug/kernel-3.10.0-327.18.2.el7/linux-3.10.0-327.18.2.el7.x86_64/kernel/timer.c: 1384
0xffffffff8108e8ee <update_process_times+110>:  pop    %rbx

从上述的信息来看, update_process_times 将进程的信息传给了 run_posix_cpu_timers 函数. run_posix_cpu_timers 函数的汇编代码中, 寄存器 R15RDI 对应的值相同, 并且 RDI 寄存器为 run_posix_cpu_timers 函数的第一个参数, 上述的汇编代码中 mov %gs:0xb7c0,%rbx, 0xb7c0 即为 update_process_times 函数的task_struct 结构体的 p 指针变量. 从下述的分析可以看到 p 变量的 signal_struct 变量正常, cputimer.running 为 0:

crash> kmem -o | grep 'CPU 0'
  CPU 0: ffff88103f200000
crash> eval ffff88103f200000 + 0xb7c0
hexadecimal: ffff88103f20b7c0  
    decimal: 18446612202092804032  (-131871616747584)
      octal: 1777774201007710133700
     binary: 1111111111111111100010000001000000111111001000001011011111000000
crash> rd ffff88103f20b7c0
ffff88103f20b7c0:  ffff8807deb08000                    ........
crash> struct task_struct ffff8807deb08000 
struct task_struct {
  state = 0, 
  stack = 0xffff88046102c000, 
...
  signal = 0xffff88108bea8d80,
  pid = 80589, //commserver task
  tgid = 80586,

crash> struct signal_struct 0xffff88108bea8d80
struct signal_struct {
  sigcnt = {
    counter = 77
...
  cputimer = {
    cputime = {
      utime = 0, 
      stime = 0, 
      sum_exec_runtime = 0
    }, 
    running = 0, 
    lock = {

不过我们从 1292 行代码即可看到, 程序在引用 signal_struct 结构体的 cuptimer 成员的时候却成为了空值,导致执行 sig->cputimer.running 代码的时候出现内核崩溃. 对比上述的汇编代码来看 rbx 寄存器的值在传给 rdi 之前经过了以下函数的处理:

account_process_tick
hrtimer_run_queues
raise_softirq
rcu_check_callbacks
irq_work_tick
scheduler_tick

看起来有可能是上述的哪个函数 pop 了错误的 rbx, 导致传给 run_posix_cpu_timers 的变量为空值.

触发条件

目前并未找到引起空指针的原因, 红帽知识库, dell 和 intel 相关的站点也未找到关于上述堆栈的错误信息. 不过从下面两个角度来看最好不做主机的 BIOS 和系统内核的升级:

1. 中断相关的代码在最新版 3.10.0-957 内核中未见变更;
2. 主机 BIOS 版本为 2.0.1, 到最新的 2.9.1 版本的 changelog 中未见到相关 bug 说明;

在第二点中, 由于堆栈里显示的是计时器中断没有获取到对应 cpu 0 的进程信息才引起了空指针,如果触发条件和 cpu 无关, 升级 BIOS 其实没什么作用.

参考

stack-frame-layout-on-x86_64
intel-intro
dell-R430-BIOS-driver