/*
 * Event handler for periodic ticks.
 *
 * Invoked from the clock event device's interrupt path. Performs the
 * per-CPU periodic tick work and, for devices that lack a true periodic
 * mode (running in ONESHOT), manually programs the next tick event.
 */
void tick_handle_periodic(struct clock_event_device *dev)
{
int cpu = smp_processor_id();
ktime_t next;
/* Per-CPU periodic tick work (jiffies update, process accounting, ...). */
tick_periodic(cpu);
#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
/*
 * The cpu might have transitioned to HIGHRES or NOHZ mode via
 * update_process_times() -> run_local_timers() ->
 * hrtimer_run_queues(). If so, another handler owns this device
 * now and we must not reprogram it.
 */
if (dev->event_handler != tick_handle_periodic)
return;
#endif
/* A device in true periodic mode re-fires by itself; nothing to do. */
if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
return;
/*
 * Setup the next period for devices, which do not have
 * periodic mode:
 */
next = ktime_add(dev->next_event, tick_period);
for (;;) {
/* Programming succeeded -> the next tick is armed, we are done. */
if (!clockevents_program_event(dev, next, false))
return;
/*
 * Have to be careful here. If we're in oneshot mode,
 * before we call tick_periodic() in a loop, we need
 * to be sure we're using a real hardware clocksource.
 * Otherwise we could get trapped in an infinite
 * loop, as tick_periodic() increments jiffies, which
 * then will increment time, possibly causing
 * the loop to trigger again and again.
 */
if (timekeeping_valid_for_hres())
tick_periodic(cpu);
next = ktime_add(next, tick_period);
}
}
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process. user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
struct task_struct *p = current;
int cpu = smp_processor_id();
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
/* Kick the per-CPU timer machinery so expired timers get serviced. */
run_local_timers();
/* Let RCU observe this tick on this cpu. */
rcu_check_callbacks(cpu, user_tick);
#ifdef CONFIG_IRQ_WORK
/* Run pending irq_work only when we are really in hard-irq context. */
if (in_irq())
irq_work_tick();
#endif
/* Periodic scheduler bookkeeping for this cpu. */
scheduler_tick();
/* Check CPU-time based POSIX timers armed on the current task. */
run_posix_cpu_timers(p);
}
/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 *
 * Dispatches one jiffy of cpu time to the proper bucket: user time,
 * system time, or idle/iowait time. The early returns below hand the
 * tick to more precise accounting schemes when they are active, so
 * their order matters.
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
/* Full vtime accounting is active -> ticks are accounted elsewhere. */
if (vtime_accounting_enabled())
return;
/* Fine-grained irq-time accounting takes over tick accounting. */
if (sched_clock_irqtime) {
irqtime_account_process_tick(p, user_tick, rq);
return;
}
/* Tick consumed by the hypervisor; accounted as steal time instead. */
if (steal_account_process_tick())
return;
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
/*
 * System tick: either a non-idle task was interrupted, or the
 * idle task was interrupted inside nested irq/softirq context
 * (irq_count() above the bare timer-irq offset).
 */
account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
one_jiffy_scaled);
else
/* Idle task, only the timer irq itself active: idle/iowait. */
account_idle_time(cputime_one_jiffy);
}
2.4 account_idle_time()
account_idle_time()的主要功能是更新当前CPU的空闲时间或等待I/O的时间。
这里也可以看出来等待I/O的时间本身也属于CPU idle时间的一部分。
如果CPU当前是空闲状态，并且有进程正在等待I/O，时间将计入iowait。
如果CPU当前是空闲状态，并且没有任何进程等待I/O，时间将计入idle。
/*
 * Account for idle time.
 * @cputime: the cpu time spent in idle wait
 *
 * Idle time is charged to CPUTIME_IOWAIT when at least one task on
 * this runqueue is currently blocked waiting for I/O, otherwise it is
 * charged to CPUTIME_IDLE.
 */
void account_idle_time(cputime_t cputime)
{
	struct rq *rq = this_rq();
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	int idx;

	/* Pick the target bucket once, then do a single accumulation. */
	idx = (atomic_read(&rq->nr_iowait) > 0) ? CPUTIME_IOWAIT
						: CPUTIME_IDLE;
	cpustat[idx] += (__force u64) cputime;
}
Position 1 - user: Time executing processes in user mode, excluding nice time. This represents normal application workload processing.
Position 2 - nice: Time executing niced processes (positive nice values indicating lower priority). This time is separate from regular user time.
Position 3 - system: Time executing in kernel/system mode, including system calls, kernel functions, and device drivers serving user processes.
Position 4 - idle: Time spent in idle state when the CPU has no work to perform. This is the primary metric for calculating utilization.
Position 5 - iowait (Linux 2.5.41+): Time waiting for I/O operations to complete. Critical note: This measurement is unreliable on multi-core systems because the CPU doesn't actually wait for I/O - only individual processes wait while other processes can continue running.
Position 6 - irq (Linux 2.6.0+): Time servicing hardware interrupts with high priority response to hardware events.
Position 7 - softirq (Linux 2.6.0+): Time servicing software interrupts that handle work queued by hardware interrupts at lower priority.
Position 8 - steal (Linux 2.6.11+): Time stolen by the hypervisor in virtualized environments, indicating CPU resources allocated to other virtual machines.
Position 9 - guest (Linux 2.6.24+): Time spent running virtual CPU for guest operating systems. Critical implementation detail: This time is also included in user time, requiring adjustment to avoid double-counting.
Position 10 - guest_nice (Linux 2.6.33+): Time spent running niced guest processes. Also included in nice time, requiring similar adjustment.
# 假设上次idle列统计值为x
prev_idle = x
# 假设上次iowait列统计值为y
prev_iowait = y
# 因为I/O不可用,所有进程处于uninterruptible sleep状态(即D状态),所有空闲时间会计入iowait中,idle列变化基本为0
curr_idle = x + 0
# 因为I/O不可用,所有进程处于uninterruptible sleep状态(即D状态),所有空闲时间会计入iowait中,假设iowait列变化为z
curr_iowait = y + z
prev_idle = prev_idle + prev_iowait = x + y
curr_idle = curr_idle + curr_iowait = x + y + z
# 假设上次采集的user nice system irq softirq steal数值分别为a b c d e f
prev_non_idle = prev_user + prev_nice + prev_system + prev_irq + prev_softirq + prev_steal
= a + b + c + d + e + f
# user nice system irq softirq steal变化量基本为0
curr_non_idle = curr_user + curr_nice + curr_system + curr_irq + curr_softirq + curr_steal
= (a+0) + (b+0) + (c+0) + (d+0) + (e+0) + (f+0)
= a + b + c + d + e + f
prev_total = prev_idle + prev_non_idle = (x + y) + (a + b + c + d + e + f)
curr_total = curr_idle + curr_non_idle = (x + y + z) + (a + b + c + d + e + f)
total_delta = curr_total - prev_total
= ((x + y + z) + (a + b + c + d + e + f)) - ((x + y) + (a + b + c + d + e + f))
= z
idle_delta = curr_idle - prev_idle
= (x + y + z) - (x + y)
= z
# cpu.busy
cpu_usage = (total_delta - idle_delta) / total_delta * 100
= (z - z)/z * 100
= 0
根据正确的逻辑, cpu.busy 的计算结果应该是0%,符合预期。
cpu.iowait正确计算逻辑如下:
iowait_delta = z
total_delta = curr_total - prev_total
= ((x + y + z) + (a + b + c + d + e + f)) - ((x + y) + (a + b + c + d + e + f))
= z
# cpu.iowait
cpu_iowait = iowait_delta / total_delta * 100
= z / z * 100
= 100